From 6fe7ccfd77415a6ba250c10c580eb3f9acf79753 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Tue, 25 Aug 2015 11:17:51 +0200 Subject: netfilter: ipset: Out of bound access in hash:net* types fixed Dave Jones reported that KASan detected out of bounds access in hash:net* types: [ 23.139532] ================================================================== [ 23.146130] BUG: KASan: out of bounds access in hash_net4_add_cidr+0x1db/0x220 at addr ffff8800d4844b58 [ 23.152937] Write of size 4 by task ipset/457 [ 23.159742] ============================================================================= [ 23.166672] BUG kmalloc-512 (Not tainted): kasan: bad access detected [ 23.173641] ----------------------------------------------------------------------------- [ 23.194668] INFO: Allocated in hash_net_create+0x16a/0x470 age=7 cpu=1 pid=456 [ 23.201836] __slab_alloc.constprop.66+0x554/0x620 [ 23.208994] __kmalloc+0x2f2/0x360 [ 23.216105] hash_net_create+0x16a/0x470 [ 23.223238] ip_set_create+0x3e6/0x740 [ 23.230343] nfnetlink_rcv_msg+0x599/0x640 [ 23.237454] netlink_rcv_skb+0x14f/0x190 [ 23.244533] nfnetlink_rcv+0x3f6/0x790 [ 23.251579] netlink_unicast+0x272/0x390 [ 23.258573] netlink_sendmsg+0x5a1/0xa50 [ 23.265485] SYSC_sendto+0x1da/0x2c0 [ 23.272364] SyS_sendto+0xe/0x10 [ 23.279168] entry_SYSCALL_64_fastpath+0x12/0x6f The bug is fixed in the patch and the testsuite is extended in ipset to check cidr handling more thoroughly. Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_hash_gen.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index afe905c208af..691b54fcaf2a 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -152,9 +152,13 @@ htable_bits(u32 hashsize) #define SET_HOST_MASK(family) (family == AF_INET ? 32 : 128) #ifdef IP_SET_HASH_WITH_NET0 +/* cidr from 0 to SET_HOST_MASK() value and c = cidr + 1 */ #define NLEN(family) (SET_HOST_MASK(family) + 1) +#define CIDR_POS(c) ((c) - 1) #else +/* cidr from 1 to SET_HOST_MASK() value and c = cidr + 1 */ #define NLEN(family) SET_HOST_MASK(family) +#define CIDR_POS(c) ((c) - 2) #endif #else @@ -305,7 +309,7 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) } else if (h->nets[i].cidr[n] < cidr) { j = i; } else if (h->nets[i].cidr[n] == cidr) { - h->nets[cidr - 1].nets[n]++; + h->nets[CIDR_POS(cidr)].nets[n]++; return; } } @@ -314,7 +318,7 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) h->nets[i].cidr[n] = h->nets[i - 1].cidr[n]; } h->nets[i].cidr[n] = cidr; - h->nets[cidr - 1].nets[n] = 1; + h->nets[CIDR_POS(cidr)].nets[n] = 1; } static void @@ -325,8 +329,8 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) for (i = 0; i < nets_length; i++) { if (h->nets[i].cidr[n] != cidr) continue; - h->nets[cidr - 1].nets[n]--; - if (h->nets[cidr - 1].nets[n] > 0) + h->nets[CIDR_POS(cidr)].nets[n]--; + if (h->nets[CIDR_POS(cidr)].nets[n] > 0) return; for (j = i; j < net_end && h->nets[j].cidr[n]; j++) h->nets[j].cidr[n] = h->nets[j + 1].cidr[n]; -- cgit v1.2.3-59-g8ed1b From 96be5f2806cd65a2ebced3bfcdf7df0116e6c4a6 Mon Sep 17 00:00:00 2001 From: Elad Raz Date: Sat, 22 Aug 2015 08:44:11 +0300 Subject: netfilter: ipset: Fixing unnamed union init In continue to proposed Vinson Lee's post [1], this patch fixes compilation issues founded at gcc 4.4.7. 
The initialization of .cidr field of unnamed unions causes compilation error in gcc 4.4.x. References Visible links [1] https://lkml.org/lkml/2015/7/5/74 Signed-off-by: Elad Raz Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_netnet.c | 20 ++++++++++++++++++-- net/netfilter/ipset/ip_set_hash_netportnet.c | 20 ++++++++++++++++++-- 2 files changed, 36 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 3c862c0a76d1..a93dfebffa81 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -131,6 +131,13 @@ hash_netnet4_data_next(struct hash_netnet4_elem *next, #define HOST_MASK 32 #include "ip_set_hash_gen.h" +static void +hash_netnet4_init(struct hash_netnet4_elem *e) +{ + e->cidr[0] = HOST_MASK; + e->cidr[1] = HOST_MASK; +} + static int hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -160,7 +167,7 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], { const struct hash_netnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netnet4_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, }; + struct hash_netnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, last; u32 ip2 = 0, ip2_from = 0, ip2_to = 0, last2; @@ -169,6 +176,7 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + hash_netnet4_init(&e); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; @@ -357,6 +365,13 @@ hash_netnet6_data_next(struct hash_netnet4_elem *next, #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" +static void +hash_netnet6_init(struct hash_netnet6_elem *e) +{ + e->cidr[0] = HOST_MASK; + e->cidr[1] = HOST_MASK; +} + static int hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -385,13 +400,14 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netnet6_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, }; + struct hash_netnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + hash_netnet6_init(&e); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 0c68734f5cc4..9a14c237830f 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -142,6 +142,13 @@ hash_netportnet4_data_next(struct hash_netportnet4_elem *next, #define HOST_MASK 32 #include "ip_set_hash_gen.h" +static void +hash_netportnet4_init(struct hash_netportnet4_elem *e) +{ + e->cidr[0] = HOST_MASK; + e->cidr[1] = HOST_MASK; +} + static int hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -175,7 +182,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], { const struct hash_netportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netportnet4_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, }; + 
struct hash_netportnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, ip_last, p = 0, port, port_to; u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2; @@ -185,6 +192,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + hash_netportnet4_init(&e); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || @@ -412,6 +420,13 @@ hash_netportnet6_data_next(struct hash_netportnet4_elem *next, #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" +static void +hash_netportnet6_init(struct hash_netportnet6_elem *e) +{ + e->cidr[0] = HOST_MASK; + e->cidr[1] = HOST_MASK; +} + static int hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -445,7 +460,7 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], { const struct hash_netportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netportnet6_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, }; + struct hash_netportnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; bool with_ports = false; @@ -454,6 +469,7 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + hash_netportnet6_init(&e); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || -- cgit v1.2.3-59-g8ed1b From a9de9777d613500b089a7416f936bf3ae5f070d2 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 28 Aug 2015 21:01:43 +0200 Subject: netfilter: nfnetlink: work around wrong endianess in res_id field The convention in nfnetlink is to use network byte order in every header field as well as in the attribute payload. The initial version of the batching infrastructure assumes that res_id comes in host byte order though. The only client of the batching infrastructure is nf_tables, so let's add a workaround to address this inconsistency. We currently have 11 nfnetlink subsystems according to NFNL_SUBSYS_COUNT, so we can assume that the subsystem 2560, ie. htons(10), will not be allocated anytime soon, so it can be an alias of nf_tables from the nfnetlink batching path when interpreting the res_id field. Based on original patch from Florian Westphal. 
Reported-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 0c0e8ecf02ab..70277b11f742 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -444,6 +444,7 @@ done: static void nfnetlink_rcv(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); + u_int16_t res_id; int msglen; if (nlh->nlmsg_len < NLMSG_HDRLEN || @@ -468,7 +469,12 @@ static void nfnetlink_rcv(struct sk_buff *skb) nfgenmsg = nlmsg_data(nlh); skb_pull(skb, msglen); - nfnetlink_rcv_batch(skb, nlh, nfgenmsg->res_id); + /* Work around old nft using host byte order */ + if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES) + res_id = NFNL_SUBSYS_NFTABLES; + else + res_id = ntohs(nfgenmsg->res_id); + nfnetlink_rcv_batch(skb, nlh, res_id); } else { netlink_rcv_skb(skb, &nfnetlink_rcv_msg); } -- cgit v1.2.3-59-g8ed1b From 9cf94eab8b309e8bcc78b41dd1561c75b537dd0b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 31 Aug 2015 19:11:02 +0200 Subject: netfilter: conntrack: use nf_ct_tmpl_free in CT/synproxy error paths Commit 0838aa7fcfcd ("netfilter: fix netns dependencies with conntrack templates") migrated templates to the new allocator api, but forgot to update error paths for them in CT and synproxy to use nf_ct_tmpl_free() instead of nf_conntrack_free(). Due to that, memory is being freed into the wrong kmemcache, but also we drop the per net reference count of ct objects causing an imbalance. In Brad's case, this leads to a wrap-around of net->ct.count and thus lets __nf_conntrack_alloc() refuse to create a new ct object: [ 10.340913] xt_addrtype: ipv6 does not support BROADCAST matching [ 10.810168] nf_conntrack: table full, dropping packet [ 11.917416] r8169 0000:07:00.0 eth0: link up [ 11.917438] IPv6: ADDRCONF(NETDEV_CHANGE): eth0: link becomes ready [ 12.815902] nf_conntrack: table full, dropping packet [ 15.688561] nf_conntrack: table full, dropping packet [ 15.689365] nf_conntrack: table full, dropping packet [ 15.690169] nf_conntrack: table full, dropping packet [ 15.690967] nf_conntrack: table full, dropping packet [...] With slab debugging, it also reports the wrong kmemcache (kmalloc-512 vs. nf_conntrack_ffffffff81ce75c0) and reports poison overwrites, etc. Thus, to fix the problem, export and use nf_ct_tmpl_free() instead. 
Fixes: 0838aa7fcfcd ("netfilter: fix netns dependencies with conntrack templates") Reported-by: Brad Jackson Signed-off-by: Daniel Borkmann Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 1 + net/netfilter/nf_conntrack_core.c | 3 ++- net/netfilter/nf_synproxy_core.c | 2 +- net/netfilter/xt_CT.c | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 37cd3911d5c5..4023c4ce260f 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -292,6 +292,7 @@ extern unsigned int nf_conntrack_hash_rnd; void init_nf_conntrack_hash_rnd(void); struct nf_conn *nf_ct_tmpl_alloc(struct net *net, u16 zone, gfp_t flags); +void nf_ct_tmpl_free(struct nf_conn *tmpl); #define NF_CT_STAT_INC(net, count) __this_cpu_inc((net)->ct.stat->count) #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 3c20d02aee73..0625a42df108 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -320,12 +320,13 @@ out_free: } EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); -static void nf_ct_tmpl_free(struct nf_conn *tmpl) +void nf_ct_tmpl_free(struct nf_conn *tmpl) { nf_ct_ext_destroy(tmpl); nf_ct_ext_free(tmpl); kfree(tmpl); } +EXPORT_SYMBOL_GPL(nf_ct_tmpl_free); static void destroy_conntrack(struct nf_conntrack *nfct) diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index d7f168527903..d6ee8f8b19b6 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -378,7 +378,7 @@ static int __net_init synproxy_net_init(struct net *net) err3: free_percpu(snet->stats); err2: - nf_conntrack_free(ct); + nf_ct_tmpl_free(ct); err1: return err; } diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 43ddeee404e9..f3377ce1ff18 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -233,7 +233,7 @@ out: return 0; err3: - nf_conntrack_free(ct); + nf_ct_tmpl_free(ct); err2: nf_ct_l3proto_module_put(par->family); err1: -- cgit v1.2.3-59-g8ed1b From d82f0f1fc8a4f214a50c9dfc64e3896f9894afb7 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 2 Sep 2015 16:20:21 -0300 Subject: sctp: fix dst leak Commit 0ca50d12fe46 failed to release the reference to dst entries that it decided to skip. Fixes: 0ca50d12fe46 ("sctp: fix src address selection if using secondary addresses") Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. 
Miller --- net/sctp/protocol.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 4345790ad326..4abf94d4cce7 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -511,8 +511,10 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, */ odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr, false); - if (!odev || odev->ifindex != fl4->flowi4_oif) + if (!odev || odev->ifindex != fl4->flowi4_oif) { + dst_release(&rt->dst); continue; + } dst = &rt->dst; break; -- cgit v1.2.3-59-g8ed1b From 410f03831c0768f2b1850d28ba697b167ddcb89b Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 2 Sep 2015 16:20:22 -0300 Subject: sctp: add routing output fallback Commit 0ca50d12fe46 added a restriction that the address must belong to the output interface, so that sctp will use the right interface even when using secondary addresses. But it breaks IPVS setups, on which people is used to attach VIP addresses to loopback interface on real servers. It's preferred to attach to the interface actually in use, but it's a very common setup and that used to work. This patch then saves the first routing good result, even if it would be going out through an interface that doesn't have that address. If no better hit found, it's then used. This effectively restores the original behavior if no better interface could be found. Fixes: 0ca50d12fe46 ("sctp: fix src address selection if using secondary addresses") Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/protocol.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 4abf94d4cce7..b7143337e4fa 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -506,16 +506,22 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, if (IS_ERR(rt)) continue; + if (!dst) + dst = &rt->dst; + /* Ensure the src address belongs to the output * interface. */ odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr, false); if (!odev || odev->ifindex != fl4->flowi4_oif) { - dst_release(&rt->dst); + if (&rt->dst != dst) + dst_release(&rt->dst); continue; } + if (dst != &rt->dst) + dst_release(dst); dst = &rt->dst; break; } -- cgit v1.2.3-59-g8ed1b From 98a1f8282b8c37378c1b947d661a58942331ca90 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 26 Aug 2015 12:22:14 +0200 Subject: mac80211: Do not use sizeof() on pointer type The rate_control_cap_mask() function takes a parameter mcs_mask, which GCC will take to be u8 * even though it was declared with a fixed size. This causes the following warning: net/mac80211/rate.c: In function 'rate_control_cap_mask': net/mac80211/rate.c:719:25: warning: 'sizeof' on array function parameter 'mcs_mask' will return size of 'u8 * {aka unsigned char *}' [-Wsizeof-array-argument] for (i = 0; i < sizeof(mcs_mask); i++) ^ net/mac80211/rate.c:684:10: note: declared here u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN], ^ This can be easily fixed by using the IEEE80211_HT_MCS_MASK_LEN directly within the loop condition. 
Signed-off-by: Thierry Reding Signed-off-by: Johannes Berg --- net/mac80211/rate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 9857693b91ec..9ce8883d5f44 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -716,7 +716,7 @@ static bool rate_control_cap_mask(struct ieee80211_sub_if_data *sdata, /* Filter out rates that the STA does not support */ *mask &= sta->supp_rates[sband->band]; - for (i = 0; i < sizeof(mcs_mask); i++) + for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) mcs_mask[i] &= sta->ht_cap.mcs.rx_mask[i]; sta_vht_cap = sta->vht_cap.vht_mcs.rx_mcs_map; -- cgit v1.2.3-59-g8ed1b From 22f66895e60cfc55b92f6fa93f05bb3fbdbd0bed Mon Sep 17 00:00:00 2001 From: Avri Altman Date: Tue, 18 Aug 2015 16:52:07 +0300 Subject: mac80211: protect non-HT BSS when HT TDLS traffic exists HT TDLS traffic should be protected in a non-HT BSS to avoid collisions. Therefore, when TDLS peers join/leave, check if protection is (now) needed and set the ht_operation_mode of the virtual interface according to the HT capabilities of the TDLS peer(s). This works because a non-HT BSS connection never sets (or otherwise uses) the ht_operation_mode; it just means that drivers must be aware that this field applies to all HT traffic for this virtual interface, not just the traffic within the BSS. Document that. Signed-off-by: Avri Altman Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 ++- net/mac80211/tdls.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 70 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index e3314e516681..bfc569498bfa 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -477,7 +477,9 @@ struct ieee80211_event { * @chandef: Channel definition for this BSS -- the hardware might be * configured a higher bandwidth than this BSS uses, for example. * @ht_operation_mode: HT operation mode like in &struct ieee80211_ht_operation. - * This field is only valid when the channel type is one of the HT types. + * This field is only valid when the channel is a wide HT/VHT channel. + * Note that with TDLS this can be the case (channel is HT, protection must + * be used from this field) even when the BSS association isn't using HT. 
* @cqm_rssi_thold: Connection quality monitor RSSI threshold, a zero value * implies disabled * @cqm_rssi_hyst: Connection quality monitor RSSI hysteresis diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index aee701a5649e..4e202d0679b2 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -1249,6 +1249,58 @@ static void iee80211_tdls_recalc_chanctx(struct ieee80211_sub_if_data *sdata) mutex_unlock(&local->chanctx_mtx); } +static int iee80211_tdls_have_ht_peers(struct ieee80211_sub_if_data *sdata) +{ + struct sta_info *sta; + bool result = false; + + rcu_read_lock(); + list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) { + if (!sta->sta.tdls || sta->sdata != sdata || !sta->uploaded || + !test_sta_flag(sta, WLAN_STA_AUTHORIZED) || + !test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH) || + !sta->sta.ht_cap.ht_supported) + continue; + result = true; + break; + } + rcu_read_unlock(); + + return result; +} + +static void +iee80211_tdls_recalc_ht_protection(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta) +{ + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + bool tdls_ht; + u16 protection = IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED | + IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT | + IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT; + u16 opmode; + + /* Nothing to do if the BSS connection uses HT */ + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) + return; + + tdls_ht = (sta && sta->sta.ht_cap.ht_supported) || + iee80211_tdls_have_ht_peers(sdata); + + opmode = sdata->vif.bss_conf.ht_operation_mode; + + if (tdls_ht) + opmode |= protection; + else + opmode &= ~protection; + + if (opmode == sdata->vif.bss_conf.ht_operation_mode) + return; + + sdata->vif.bss_conf.ht_operation_mode = opmode; + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_HT); +} + int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, const u8 *peer, enum nl80211_tdls_operation oper) { @@ -1274,6 +1326,10 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, return -ENOTSUPP; } + /* protect possible bss_conf changes and avoid concurrency in + * ieee80211_bss_info_change_notify() + */ + sdata_lock(sdata); mutex_lock(&local->mtx); tdls_dbg(sdata, "TDLS oper %d peer %pM\n", oper, peer); @@ -1287,16 +1343,18 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, iee80211_tdls_recalc_chanctx(sdata); - rcu_read_lock(); + mutex_lock(&local->sta_mtx); sta = sta_info_get(sdata, peer); if (!sta) { - rcu_read_unlock(); + mutex_unlock(&local->sta_mtx); ret = -ENOLINK; break; } + iee80211_tdls_recalc_ht_protection(sdata, sta); + set_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH); - rcu_read_unlock(); + mutex_unlock(&local->sta_mtx); WARN_ON_ONCE(is_zero_ether_addr(sdata->u.mgd.tdls_peer) || !ether_addr_equal(sdata->u.mgd.tdls_peer, peer)); @@ -1318,6 +1376,11 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, ieee80211_flush_queues(local, sdata, false); ret = sta_info_destroy_addr(sdata, peer); + + mutex_lock(&local->sta_mtx); + iee80211_tdls_recalc_ht_protection(sdata, NULL); + mutex_unlock(&local->sta_mtx); + iee80211_tdls_recalc_chanctx(sdata); break; default: @@ -1335,6 +1398,7 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, &sdata->u.mgd.request_smps_work); mutex_unlock(&local->mtx); + sdata_unlock(sdata); return ret; } -- cgit v1.2.3-59-g8ed1b From 4c0778933a3d7c35a94e8c35847acd9bb59a257d Mon Sep 17 00:00:00 2001 From: João Paulo Rechi Vita Date: Tue, 25 Aug 2015 08:56:43 -0400 Subject: rfkill: Copy "all" global state to other 
types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When switching the state of all RFKill switches of type all we need to replicate the RFKILL_TYPE_ALL global state to all the other types global state, so it is used to initialize persistent RFKill switches on register. Signed-off-by: João Paulo Rechi Vita Acked-by: Marcel Holtmann Signed-off-by: Johannes Berg --- net/rfkill/core.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rfkill/core.c b/net/rfkill/core.c index f12149a29cb1..b41e9ea2ffff 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -341,7 +341,15 @@ static void __rfkill_switch_all(const enum rfkill_type type, bool blocked) { struct rfkill *rfkill; - rfkill_global_states[type].cur = blocked; + if (type == RFKILL_TYPE_ALL) { + int i; + + for (i = 0; i < NUM_RFKILL_TYPES; i++) + rfkill_global_states[i].cur = blocked; + } else { + rfkill_global_states[type].cur = blocked; + } + list_for_each_entry(rfkill, &rfkill_list, node) { if (rfkill->type != type && type != RFKILL_TYPE_ALL) continue; -- cgit v1.2.3-59-g8ed1b From 549cc1c560128d583698ba9a73af283fe87dbab8 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Wed, 2 Sep 2015 19:00:31 +0200 Subject: cfg80211: regulatory: restore proper user alpha2 restore_regulatory_settings() should restore alpha2 as computed in restore_alpha2(), not raw user_alpha2 to behave as described in the comment just above that code. This fixes endless loop of calling CRDA for "00" and "97" countries after resume from suspend on my laptop. Looks like others had the same problem, too: http://ath9k-devel.ath9k.narkive.com/knY5W6St/ath9k-and-crda-messages-in-logs https://bugs.launchpad.net/ubuntu/+source/linux/+bug/899335 https://forum.porteus.org/viewtopic.php?t=4975&p=36436 https://forums.opensuse.org/showthread.php/483356-Authentication-Regulatory-Domain-issues-ath5k-12-2 Signed-off-by: Maciej Szmigiero Signed-off-by: Johannes Berg --- net/wireless/reg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index b144485946f2..2510b231451e 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -2625,7 +2625,7 @@ static void restore_regulatory_settings(bool reset_user) * settings, user regulatory settings takes precedence. */ if (is_an_alpha2(alpha2)) - regulatory_hint_user(user_alpha2, NL80211_USER_REG_HINT_USER); + regulatory_hint_user(alpha2, NL80211_USER_REG_HINT_USER); spin_lock(®_requests_lock); list_splice_tail_init(&tmp_reg_req_list, ®_requests_list); -- cgit v1.2.3-59-g8ed1b From 52a45f38ca5998db0394e782d137595a82a08b43 Mon Sep 17 00:00:00 2001 From: Arik Nemtsov Date: Sat, 15 Aug 2015 22:39:53 +0300 Subject: mac80211: avoid VHT usage with no 80MHz chans allowed Currently if 80MHz channels are not allowed for use, the VHT IE is not included in the probe request for an AP. This is not good enough if the AP is configured with the wrong regulatory and supports VHT even where prohibited or in TDLS scenarios. Mark the ifmgd with the DISABLE_VHT flag for the misbehaving-AP case, and unset VHT support from the peer-station entry for the TDLS case. 
Signed-off-by: Arik Nemtsov Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 16 ++++++++++++++++ net/mac80211/vht.c | 15 +++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 705ef1d040ed..cd7e55e08a23 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -4267,6 +4267,8 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband; struct cfg80211_chan_def chandef; int ret; + u32 i; + bool have_80mhz; sband = local->hw.wiphy->bands[cbss->channel->band]; @@ -4317,6 +4319,20 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, } } + /* Allow VHT if at least one channel on the sband supports 80 MHz */ + have_80mhz = false; + for (i = 0; i < sband->n_channels; i++) { + if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED | + IEEE80211_CHAN_NO_80MHZ)) + continue; + + have_80mhz = true; + break; + } + + if (!have_80mhz) + ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; + ifmgd->flags |= ieee80211_determine_chantype(sdata, sband, cbss->channel, ht_cap, ht_oper, vht_oper, diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index 834ccdbc74be..ff1c798921a6 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -120,6 +120,7 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.vht_cap; struct ieee80211_sta_vht_cap own_cap; u32 cap_info, i; + bool have_80mhz; memset(vht_cap, 0, sizeof(*vht_cap)); @@ -129,6 +130,20 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, if (!vht_cap_ie || !sband->vht_cap.vht_supported) return; + /* Allow VHT if at least one channel on the sband supports 80 MHz */ + have_80mhz = false; + for (i = 0; i < sband->n_channels; i++) { + if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED | + IEEE80211_CHAN_NO_80MHZ)) + continue; + + have_80mhz = true; + break; + } + + if (!have_80mhz) + return; + /* * A VHT STA must support 40 MHz, but if we verify that here * then we break a few things - some APs (e.g. Netgear R6300v2 -- cgit v1.2.3-59-g8ed1b From ef9be10c8c999e00b239eec24cf01952a308f8e7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 28 Aug 2015 10:44:20 +0200 Subject: mac80211: reject software RSSI CQM with beacon filtering When beacon filtering is enabled the mac80211 software implementation for RSSI CQM cannot work as beacons will not be available. Rather than accepting such a configuration without proper effect, reject it. 
Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 685ec13ed7c2..17b1fe961c5d 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2468,6 +2468,10 @@ static int ieee80211_set_cqm_rssi_config(struct wiphy *wiphy, rssi_hyst == bss_conf->cqm_rssi_hyst) return 0; + if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER && + !(sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI)) + return -EOPNOTSUPP; + bss_conf->cqm_rssi_thold = rssi_thold; bss_conf->cqm_rssi_hyst = rssi_hyst; -- cgit v1.2.3-59-g8ed1b From 25b4a44c19c83d98e8c0807a7ede07c1f28eab8b Mon Sep 17 00:00:00 2001 From: Richard Laing Date: Thu, 3 Sep 2015 13:52:31 +1200 Subject: net/ipv6: Correct PIM6 mrt_lock handling In the IPv6 multicast routing code the mrt_lock was not being released correctly in the MFC iterator, as a result adding or deleting a MIF would cause a hang because the mrt_lock could not be acquired. This fix is a copy of the code for the IPv4 case and ensures that the lock is released correctly. Signed-off-by: Richard Laing Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/ipv6/ip6mr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 74ceb73c1c9a..5f36266b1f5e 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -550,7 +550,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) if (it->cache == &mrt->mfc6_unres_queue) spin_unlock_bh(&mfc_unres_lock); - else if (it->cache == mrt->mfc6_cache_array) + else if (it->cache == &mrt->mfc6_cache_array[it->ct]) read_unlock(&mrt_lock); } -- cgit v1.2.3-59-g8ed1b From 0890cf6cb6ab1af650025670b1a839671a9a3fcb Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Sep 2015 14:04:17 +0200 Subject: switchdev: fix return value of switchdev_port_fdb_dump in case of error switchdev_port_fdb_dump is used as .ndo_fdb_dump. Its return value is idx, so we cannot return errval. Fixes: 45d4122ca7cd ("switchdev: add support for fdb add/del/dump via switchdev_port_obj ops.") Signed-off-by: Jiri Pirko Acked-by: Sridhar Samudrala Acked-by: Scott Feldman Signed-off-by: David S. Miller --- net/switchdev/switchdev.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net') diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 16c1c43980a1..fda38f830a10 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -853,12 +853,8 @@ int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, .cb = cb, .idx = idx, }; - int err; - - err = switchdev_port_obj_dump(dev, &dump.obj); - if (err) - return err; + switchdev_port_obj_dump(dev, &dump.obj); return dump.idx; } EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump); -- cgit v1.2.3-59-g8ed1b From 8f384c0177a03640312b9cb3638c998b32243b63 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 3 Sep 2015 16:24:52 -0400 Subject: RDS: rds_conn_lookup() should factor in the struct net for a match Only return a conn if the rds_conn_net(conn) matches the struct net passed to rds_conn_lookup(). Fixes: 467fa15356ac ("RDS-TCP: Support multiple RDS-TCP listen endpoints, one per netns.") Signed-off-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Signed-off-by: David S. 
Miller --- net/rds/connection.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index a50e652eb269..9b2de5e67d79 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -70,7 +70,8 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) } while (0) /* rcu read lock must be held or the connection spinlock */ -static struct rds_connection *rds_conn_lookup(struct hlist_head *head, +static struct rds_connection *rds_conn_lookup(struct net *net, + struct hlist_head *head, __be32 laddr, __be32 faddr, struct rds_transport *trans) { @@ -78,7 +79,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head, hlist_for_each_entry_rcu(conn, head, c_hash_node) { if (conn->c_faddr == faddr && conn->c_laddr == laddr && - conn->c_trans == trans) { + conn->c_trans == trans && net == rds_conn_net(conn)) { ret = conn; break; } @@ -132,7 +133,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP) goto new_conn; rcu_read_lock(); - conn = rds_conn_lookup(head, laddr, faddr, trans); + conn = rds_conn_lookup(net, head, laddr, faddr, trans); if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && laddr == faddr && !is_outgoing) { /* This is a looped back IB connection, and we're @@ -239,7 +240,7 @@ new_conn: if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP) found = NULL; else - found = rds_conn_lookup(head, laddr, faddr, trans); + found = rds_conn_lookup(net, head, laddr, faddr, trans); if (found) { trans->conn_free(conn->c_transport_data); kmem_cache_free(rds_conn_slab, conn); -- cgit v1.2.3-59-g8ed1b From f88f69dd17f150e2abcc7e2d95f895f2546fa381 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Fri, 4 Sep 2015 13:07:40 -0700 Subject: openvswitch: Remove conntrack Kconfig option. There's no particular desire to have conntrack action support in Open vSwitch as an independently configurable bit, rather just to ensure there is not a hard dependency. This exposed option doesn't accurately reflect the conntrack dependency when enabled, so simplify this by removing the option. Compile the support if NF_CONNTRACK is enabled. Fixes: 7f8a436eaa2c ("openvswitch: Add conntrack action") Signed-off-by: Joe Stringer Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/Kconfig | 12 +----------- net/openvswitch/Makefile | 4 +++- net/openvswitch/conntrack.h | 4 ++-- 3 files changed, 6 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index af7cdef42066..2a071f470d57 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -5,6 +5,7 @@ config OPENVSWITCH tristate "Open vSwitch" depends on INET + depends on (!NF_CONNTRACK || NF_CONNTRACK) select LIBCRC32C select MPLS select NET_MPLS_GSO @@ -31,17 +32,6 @@ config OPENVSWITCH If unsure, say N. -config OPENVSWITCH_CONNTRACK - bool "Open vSwitch conntrack action support" - depends on OPENVSWITCH - depends on NF_CONNTRACK - default OPENVSWITCH - ---help--- - If you say Y here, then Open vSwitch module will be able to pass - packets through conntrack. - - Say N to exclude this support and reduce the binary size. 
- config OPENVSWITCH_GRE tristate "Open vSwitch GRE tunneling support" depends on OPENVSWITCH diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 5b5913b06f54..60f809085b92 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -15,7 +15,9 @@ openvswitch-y := \ vport-internal_dev.o \ vport-netdev.o -openvswitch-$(CONFIG_OPENVSWITCH_CONNTRACK) += conntrack.o +ifneq ($(CONFIG_NF_CONNTRACK),) +openvswitch-y += conntrack.o +endif obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h index 3cb30667a7dc..43f5dd7a5577 100644 --- a/net/openvswitch/conntrack.h +++ b/net/openvswitch/conntrack.h @@ -19,7 +19,7 @@ struct ovs_conntrack_info; enum ovs_key_attr; -#if defined(CONFIG_OPENVSWITCH_CONNTRACK) +#if IS_ENABLED(CONFIG_NF_CONNTRACK) void ovs_ct_init(struct net *); void ovs_ct_exit(struct net *); bool ovs_ct_verify(struct net *, enum ovs_key_attr attr); @@ -82,5 +82,5 @@ static inline int ovs_ct_put_key(const struct sw_flow_key *key, } static inline void ovs_ct_free_action(const struct nlattr *a) { } -#endif +#endif /* CONFIG_NF_CONNTRACK */ #endif /* ovs_conntrack.h */ -- cgit v1.2.3-59-g8ed1b From bf361ad38165939049a2649b1a0078f3268d4bd1 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Sat, 5 Sep 2015 21:27:57 -0400 Subject: net: bridge: check __vlan_vid_del for error Since __vlan_del can return an error code, change its inner function __vlan_vid_del to return an eventual error from switchdev_port_obj_del. Signed-off-by: Vivien Didelot Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/bridge/br_vlan.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 3cd8cc9e804b..5f5a02b49a99 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -117,10 +117,11 @@ out_filt: return err; } -static void __vlan_vid_del(struct net_device *dev, struct net_bridge *br, - u16 vid) +static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br, + u16 vid) { const struct net_device_ops *ops = dev->netdev_ops; + int err = 0; /* If driver uses VLAN ndo ops, use 8021q to delete vid * on device, otherwise try switchdev ops to delete vid. @@ -137,8 +138,12 @@ static void __vlan_vid_del(struct net_device *dev, struct net_bridge *br, }, }; - switchdev_port_obj_del(dev, &vlan_obj); + err = switchdev_port_obj_del(dev, &vlan_obj); + if (err == -EOPNOTSUPP) + err = 0; } + + return err; } static int __vlan_del(struct net_port_vlans *v, u16 vid) @@ -151,7 +156,11 @@ static int __vlan_del(struct net_port_vlans *v, u16 vid) if (v->port_idx) { struct net_bridge_port *p = v->parent.port; - __vlan_vid_del(p->dev, p->br, vid); + int err; + + err = __vlan_vid_del(p->dev, p->br, vid); + if (err) + return err; } clear_bit(vid, v->vlan_bitmap); -- cgit v1.2.3-59-g8ed1b From 7a577f013d6745c800a11a2911ddc9a3214e7f09 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Sat, 5 Sep 2015 21:49:41 -0400 Subject: net: bridge: remove unnecessary switchdev include Remove the unnecessary switchdev.h include from br_netlink.c. Signed-off-by: Vivien Didelot Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/bridge/br_netlink.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index af5e187553fd..ea748c93a07f 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include "br_private.h" -- cgit v1.2.3-59-g8ed1b From 7845989cb4b3da1db903918c844fccb9817d34a0 Mon Sep 17 00:00:00 2001 From: Kolmakov Dmitriy Date: Mon, 7 Sep 2015 09:05:48 +0000 Subject: net: tipc: fix stall during bclink wakeup procedure If an attempt to wake up users of broadcast link is made when there is no enough place in send queue than it may hang up inside the tipc_sk_rcv() function since the loop breaks only after the wake up queue becomes empty. This can lead to complete CPU stall with the following message generated by RCU: INFO: rcu_sched self-detected stall on CPU { 0} (t=2101 jiffies g=54225 c=54224 q=11465) Task dump for CPU 0: tpch R running task 0 39949 39948 0x0000000a ffffffff818536c0 ffff88181fa037a0 ffffffff8106a4be 0000000000000000 ffffffff818536c0 ffff88181fa037c0 ffffffff8106d8a8 ffff88181fa03800 0000000000000001 ffff88181fa037f0 ffffffff81094a50 ffff88181fa15680 Call Trace: [] sched_show_task+0xae/0x120 [] dump_cpu_task+0x38/0x40 [] rcu_dump_cpu_stacks+0x90/0xd0 [] rcu_check_callbacks+0x3eb/0x6e0 [] ? account_system_time+0x7f/0x170 [] update_process_times+0x34/0x60 [] tick_sched_handle.isra.18+0x31/0x40 [] tick_sched_timer+0x3c/0x70 [] __run_hrtimer.isra.34+0x3d/0xc0 [] hrtimer_interrupt+0xc5/0x1e0 [] ? native_smp_send_reschedule+0x42/0x60 [] local_apic_timer_interrupt+0x34/0x60 [] smp_apic_timer_interrupt+0x3c/0x60 [] apic_timer_interrupt+0x6b/0x70 [] ? _raw_spin_unlock_irqrestore+0x9/0x10 [] __wake_up_sync_key+0x4f/0x60 [] tipc_write_space+0x31/0x40 [tipc] [] filter_rcv+0x31f/0x520 [tipc] [] ? tipc_sk_lookup+0xc9/0x110 [tipc] [] ? _raw_spin_lock_bh+0x19/0x30 [] tipc_sk_rcv+0x2dc/0x3e0 [tipc] [] tipc_bclink_wakeup_users+0x2f/0x40 [tipc] [] tipc_node_unlock+0x186/0x190 [tipc] [] ? kfree_skb+0x2c/0x40 [] tipc_rcv+0x2ac/0x8c0 [tipc] [] tipc_l2_rcv_msg+0x38/0x50 [tipc] [] __netif_receive_skb_core+0x5a3/0x950 [] __netif_receive_skb+0x13/0x60 [] netif_receive_skb_internal+0x1e/0x90 [] napi_gro_receive+0x78/0xa0 [] tg3_poll_work+0xc54/0xf40 [tg3] [] ? consume_skb+0x2c/0x40 [] tg3_poll_msix+0x41/0x160 [tg3] [] net_rx_action+0xe2/0x290 [] __do_softirq+0xda/0x1f0 [] irq_exit+0x76/0xa0 [] do_IRQ+0x55/0xf0 [] common_interrupt+0x6b/0x6b The issue occurs only when tipc_sk_rcv() is used to wake up postponed senders: tipc_bclink_wakeup_users() // wakeupq - is a queue which consists of special // messages with SOCK_WAKEUP type. tipc_sk_rcv(wakeupq) ... while (skb_queue_len(inputq)) { filter_rcv(skb) // Here the type of message is checked // and if it is SOCK_WAKEUP then // it tries to wake up a sender. tipc_write_space(sk) wake_up_interruptible_sync_poll() } After the sender thread is woke up it can gather control and perform an attempt to send a message. But if there is no enough place in send queue it will call link_schedule_user() function which puts a message of type SOCK_WAKEUP to the wakeup queue and put the sender to sleep. Thus the size of the queue actually is not changed and the while() loop never exits. The approach I proposed is to wake up only senders for which there is enough place in send queue so the described issue can't occur. Moreover the same approach is already used to wake up senders on unicast links. 
I have got into the issue on our product code but to reproduce the issue I changed a benchmark test application (from tipcutils/demos/benchmark) to perform the following scenario: 1. Run 64 instances of test application (nodes). It can be done on the one physical machine. 2. Each application connects to all other using TIPC sockets in RDM mode. 3. When setup is done all nodes start simultaneously send broadcast messages. 4. Everything hangs up. The issue is reproducible only when a congestion on broadcast link occurs. For example, when there are only 8 nodes it works fine since congestion doesn't occur. Send queue limit is 40 in my case (I use a critical importance level) and when 64 nodes send a message at the same moment a congestion occurs every time. Signed-off-by: Dmitry S Kolmakov Reviewed-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/bcast.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 8b010c976b2f..41042de3ae9b 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -169,6 +169,30 @@ static void bclink_retransmit_pkt(struct tipc_net *tn, u32 after, u32 to) } } +/** + * bclink_prepare_wakeup - prepare users for wakeup after congestion + * @bcl: broadcast link + * @resultq: queue for users which can be woken up + * Move a number of waiting users, as permitted by available space in + * the send queue, from link wait queue to specified queue for wakeup + */ +static void bclink_prepare_wakeup(struct tipc_link *bcl, struct sk_buff_head *resultq) +{ + int pnd[TIPC_SYSTEM_IMPORTANCE + 1] = {0,}; + int imp, lim; + struct sk_buff *skb, *tmp; + + skb_queue_walk_safe(&bcl->wakeupq, skb, tmp) { + imp = TIPC_SKB_CB(skb)->chain_imp; + lim = bcl->window + bcl->backlog[imp].limit; + pnd[imp] += TIPC_SKB_CB(skb)->chain_sz; + if ((pnd[imp] + bcl->backlog[imp].len) >= lim) + continue; + skb_unlink(skb, &bcl->wakeupq); + skb_queue_tail(resultq, skb); + } +} + /** * tipc_bclink_wakeup_users - wake up pending users * @@ -177,8 +201,12 @@ static void bclink_retransmit_pkt(struct tipc_net *tn, u32 after, u32 to) void tipc_bclink_wakeup_users(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_link *bcl = tn->bcl; + struct sk_buff_head resultq; - tipc_sk_rcv(net, &tn->bclink->link.wakeupq); + skb_queue_head_init(&resultq); + bclink_prepare_wakeup(bcl, &resultq); + tipc_sk_rcv(net, &resultq); } /** -- cgit v1.2.3-59-g8ed1b From 74e98eb085889b0d2d4908f59f6e00026063014f Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 8 Sep 2015 10:53:40 -0400 Subject: RDS: verify the underlying transport exists before creating a connection There was no verification that an underlying transport exists when creating a connection, this would cause dereferencing a NULL ptr. 
It might happen on sockets that weren't properly bound before attempting to send a message, which will cause a NULL ptr deref: [135546.047719] kasan: GPF could be caused by NULL-ptr deref or user memory accessgeneral protection fault: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC KASAN [135546.051270] Modules linked in: [135546.051781] CPU: 4 PID: 15650 Comm: trinity-c4 Not tainted 4.2.0-next-20150902-sasha-00041-gbaa1222-dirty #2527 [135546.053217] task: ffff8800835bc000 ti: ffff8800bc708000 task.ti: ffff8800bc708000 [135546.054291] RIP: __rds_conn_create (net/rds/connection.c:194) [135546.055666] RSP: 0018:ffff8800bc70fab0 EFLAGS: 00010202 [135546.056457] RAX: dffffc0000000000 RBX: 0000000000000f2c RCX: ffff8800835bc000 [135546.057494] RDX: 0000000000000007 RSI: ffff8800835bccd8 RDI: 0000000000000038 [135546.058530] RBP: ffff8800bc70fb18 R08: 0000000000000001 R09: 0000000000000000 [135546.059556] R10: ffffed014d7a3a23 R11: ffffed014d7a3a21 R12: 0000000000000000 [135546.060614] R13: 0000000000000001 R14: ffff8801ec3d0000 R15: 0000000000000000 [135546.061668] FS: 00007faad4ffb700(0000) GS:ffff880252000000(0000) knlGS:0000000000000000 [135546.062836] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [135546.063682] CR2: 000000000000846a CR3: 000000009d137000 CR4: 00000000000006a0 [135546.064723] Stack: [135546.065048] ffffffffafe2055c ffffffffafe23fc1 ffffed00493097bf ffff8801ec3d0008 [135546.066247] 0000000000000000 00000000000000d0 0000000000000000 ac194a24c0586342 [135546.067438] 1ffff100178e1f78 ffff880320581b00 ffff8800bc70fdd0 ffff880320581b00 [135546.068629] Call Trace: [135546.069028] ? __rds_conn_create (include/linux/rcupdate.h:856 net/rds/connection.c:134) [135546.069989] ? rds_message_copy_from_user (net/rds/message.c:298) [135546.071021] rds_conn_create_outgoing (net/rds/connection.c:278) [135546.071981] rds_sendmsg (net/rds/send.c:1058) [135546.072858] ? perf_trace_lock (include/trace/events/lock.h:38) [135546.073744] ? lockdep_init (kernel/locking/lockdep.c:3298) [135546.074577] ? rds_send_drop_to (net/rds/send.c:976) [135546.075508] ? __might_fault (./arch/x86/include/asm/current.h:14 mm/memory.c:3795) [135546.076349] ? __might_fault (mm/memory.c:3795) [135546.077179] ? rds_send_drop_to (net/rds/send.c:976) [135546.078114] sock_sendmsg (net/socket.c:611 net/socket.c:620) [135546.078856] SYSC_sendto (net/socket.c:1657) [135546.079596] ? SYSC_connect (net/socket.c:1628) [135546.080510] ? trace_dump_stack (kernel/trace/trace.c:1926) [135546.081397] ? ring_buffer_unlock_commit (kernel/trace/ring_buffer.c:2479 kernel/trace/ring_buffer.c:2558 kernel/trace/ring_buffer.c:2674) [135546.082390] ? trace_buffer_unlock_commit (kernel/trace/trace.c:1749) [135546.083410] ? trace_event_raw_event_sys_enter (include/trace/events/syscalls.h:16) [135546.084481] ? do_audit_syscall_entry (include/trace/events/syscalls.h:16) [135546.085438] ? trace_buffer_unlock_commit (kernel/trace/trace.c:1749) [135546.085515] rds_ib_laddr_check(): addr 36.74.25.172 ret -99 node type -1 Acked-by: Santosh Shilimkar Signed-off-by: Sasha Levin Signed-off-by: David S. 
Miller --- net/rds/connection.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index 9b2de5e67d79..49adeef8090c 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -190,6 +190,12 @@ new_conn: } } + if (trans == NULL) { + kmem_cache_free(rds_conn_slab, conn); + conn = ERR_PTR(-ENODEV); + goto out; + } + conn->c_trans = trans; ret = trans->conn_alloc(conn, gfp); -- cgit v1.2.3-59-g8ed1b From 6b9ea5a64ed5eeb3f68f2e6fcce0ed1179801d1e Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 8 Sep 2015 10:53:04 -0700 Subject: ipv6: fix multipath route replace error recovery Problem: The ecmp route replace support for ipv6 in the kernel, deletes the existing ecmp route too early, ie when it installs the first nexthop. If there is an error in installing the subsequent nexthops, its too late to recover the already deleted existing route leaving the fib in an inconsistent state. This patch reduces the possibility of this by doing the following: a) Changes the existing multipath route add code to a two stage process: build rt6_infos + insert them ip6_route_add rt6_info creation code is moved into ip6_route_info_create. b) This ensures that most errors are caught during building rt6_infos and we fail early c) Separates multipath add and del code. Because add needs the special two stage mode in a) and delete essentially does not care. d) In any event if the code fails during inserting a route again, a warning is printed (This should be unlikely) Before the patch: $ip -6 route show 3000:1000:1000:1000::2 via fe80::202:ff:fe00:b dev swp49s0 metric 1024 3000:1000:1000:1000::2 via fe80::202:ff:fe00:d dev swp49s1 metric 1024 3000:1000:1000:1000::2 via fe80::202:ff:fe00:f dev swp49s2 metric 1024 /* Try replacing the route with a duplicate nexthop */ $ip -6 route change 3000:1000:1000:1000::2/128 nexthop via fe80::202:ff:fe00:b dev swp49s0 nexthop via fe80::202:ff:fe00:d dev swp49s1 nexthop via fe80::202:ff:fe00:d dev swp49s1 RTNETLINK answers: File exists $ip -6 route show /* previously added ecmp route 3000:1000:1000:1000::2 dissappears from * kernel */ After the patch: $ip -6 route show 3000:1000:1000:1000::2 via fe80::202:ff:fe00:b dev swp49s0 metric 1024 3000:1000:1000:1000::2 via fe80::202:ff:fe00:d dev swp49s1 metric 1024 3000:1000:1000:1000::2 via fe80::202:ff:fe00:f dev swp49s2 metric 1024 /* Try replacing the route with a duplicate nexthop */ $ip -6 route change 3000:1000:1000:1000::2/128 nexthop via fe80::202:ff:fe00:b dev swp49s0 nexthop via fe80::202:ff:fe00:d dev swp49s1 nexthop via fe80::202:ff:fe00:d dev swp49s1 RTNETLINK answers: File exists $ip -6 route show 3000:1000:1000:1000::2 via fe80::202:ff:fe00:b dev swp49s0 metric 1024 3000:1000:1000:1000::2 via fe80::202:ff:fe00:d dev swp49s1 metric 1024 3000:1000:1000:1000::2 via fe80::202:ff:fe00:f dev swp49s2 metric 1024 Fixes: 27596472473a ("ipv6: fix ECMP route replacement") Signed-off-by: Roopa Prabhu Reviewed-by: Nikolay Aleksandrov Acked-by: Nicolas Dichtel Signed-off-by: David S. 
Miller --- net/ipv6/route.c | 201 ++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 175 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f45cac6f8356..34539d3b843f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1748,7 +1748,7 @@ static int ip6_convert_metrics(struct mx6_config *mxc, return -EINVAL; } -int ip6_route_add(struct fib6_config *cfg) +int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret) { int err; struct net *net = cfg->fc_nlinfo.nl_net; @@ -1756,7 +1756,6 @@ int ip6_route_add(struct fib6_config *cfg) struct net_device *dev = NULL; struct inet6_dev *idev = NULL; struct fib6_table *table; - struct mx6_config mxc = { .mx = NULL, }; int addr_type; if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128) @@ -1981,6 +1980,32 @@ install_route: cfg->fc_nlinfo.nl_net = dev_net(dev); + *rt_ret = rt; + + return 0; +out: + if (dev) + dev_put(dev); + if (idev) + in6_dev_put(idev); + if (rt) + dst_free(&rt->dst); + + *rt_ret = NULL; + + return err; +} + +int ip6_route_add(struct fib6_config *cfg) +{ + struct mx6_config mxc = { .mx = NULL, }; + struct rt6_info *rt = NULL; + int err; + + err = ip6_route_info_create(cfg, &rt); + if (err) + goto out; + err = ip6_convert_metrics(&mxc, cfg); if (err) goto out; @@ -1988,14 +2013,12 @@ install_route: err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc); kfree(mxc.mx); + return err; out: - if (dev) - dev_put(dev); - if (idev) - in6_dev_put(idev); if (rt) dst_free(&rt->dst); + return err; } @@ -2776,19 +2799,78 @@ errout: return err; } -static int ip6_route_multipath(struct fib6_config *cfg, int add) +struct rt6_nh { + struct rt6_info *rt6_info; + struct fib6_config r_cfg; + struct mx6_config mxc; + struct list_head next; +}; + +static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) +{ + struct rt6_nh *nh; + + list_for_each_entry(nh, rt6_nh_list, next) { + pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n", + &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, + nh->r_cfg.fc_ifindex); + } +} + +static int ip6_route_info_append(struct list_head *rt6_nh_list, + struct rt6_info *rt, struct fib6_config *r_cfg) +{ + struct rt6_nh *nh; + struct rt6_info *rtnh; + int err = -EEXIST; + + list_for_each_entry(nh, rt6_nh_list, next) { + /* check if rt6_info already exists */ + rtnh = nh->rt6_info; + + if (rtnh->dst.dev == rt->dst.dev && + rtnh->rt6i_idev == rt->rt6i_idev && + ipv6_addr_equal(&rtnh->rt6i_gateway, + &rt->rt6i_gateway)) + return err; + } + + nh = kzalloc(sizeof(*nh), GFP_KERNEL); + if (!nh) + return -ENOMEM; + nh->rt6_info = rt; + err = ip6_convert_metrics(&nh->mxc, r_cfg); + if (err) { + kfree(nh); + return err; + } + memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); + list_add_tail(&nh->next, rt6_nh_list); + + return 0; +} + +static int ip6_route_multipath_add(struct fib6_config *cfg) { struct fib6_config r_cfg; struct rtnexthop *rtnh; + struct rt6_info *rt; + struct rt6_nh *err_nh; + struct rt6_nh *nh, *nh_safe; int remaining; int attrlen; - int err = 0, last_err = 0; + int err = 1; + int nhn = 0; + int replace = (cfg->fc_nlinfo.nlh && + (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); + LIST_HEAD(rt6_nh_list); remaining = cfg->fc_mp_len; -beginning: rtnh = (struct rtnexthop *)cfg->fc_mp; - /* Parse a Multipath Entry */ + /* Parse a Multipath Entry and build a list (rt6_nh_list) of + * rt6_info structs per nexthop + */ while (rtnh_ok(rtnh, remaining)) { memcpy(&r_cfg, cfg, 
sizeof(*cfg)); if (rtnh->rtnh_ifindex) @@ -2808,22 +2890,32 @@ beginning: if (nla) r_cfg.fc_encap_type = nla_get_u16(nla); } - err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg); + + err = ip6_route_info_create(&r_cfg, &rt); + if (err) + goto cleanup; + + err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { - last_err = err; - /* If we are trying to remove a route, do not stop the - * loop when ip6_route_del() fails (because next hop is - * already gone), we should try to remove all next hops. - */ - if (add) { - /* If add fails, we should try to delete all - * next hops that have been already added. - */ - add = 0; - remaining = cfg->fc_mp_len - remaining; - goto beginning; - } + dst_free(&rt->dst); + goto cleanup; + } + + rtnh = rtnh_next(rtnh, &remaining); + } + + err_nh = NULL; + list_for_each_entry(nh, &rt6_nh_list, next) { + err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc); + /* nh->rt6_info is used or freed at this point, reset to NULL*/ + nh->rt6_info = NULL; + if (err) { + if (replace && nhn) + ip6_print_replace_route_err(&rt6_nh_list); + err_nh = nh; + goto add_errout; } + /* Because each route is added like a single route we remove * these flags after the first nexthop: if there is a collision, * we have already failed to add the first nexthop: @@ -2833,6 +2925,63 @@ beginning: */ cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_REPLACE); + nhn++; + } + + goto cleanup; + +add_errout: + /* Delete routes that were already added */ + list_for_each_entry(nh, &rt6_nh_list, next) { + if (err_nh == nh) + break; + ip6_route_del(&nh->r_cfg); + } + +cleanup: + list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { + if (nh->rt6_info) + dst_free(&nh->rt6_info->dst); + if (nh->mxc.mx) + kfree(nh->mxc.mx); + list_del(&nh->next); + kfree(nh); + } + + return err; +} + +static int ip6_route_multipath_del(struct fib6_config *cfg) +{ + struct fib6_config r_cfg; + struct rtnexthop *rtnh; + int remaining; + int attrlen; + int err = 1, last_err = 0; + + remaining = cfg->fc_mp_len; + rtnh = (struct rtnexthop *)cfg->fc_mp; + + /* Parse a Multipath Entry */ + while (rtnh_ok(rtnh, remaining)) { + memcpy(&r_cfg, cfg, sizeof(*cfg)); + if (rtnh->rtnh_ifindex) + r_cfg.fc_ifindex = rtnh->rtnh_ifindex; + + attrlen = rtnh_attrlen(rtnh); + if (attrlen > 0) { + struct nlattr *nla, *attrs = rtnh_attrs(rtnh); + + nla = nla_find(attrs, attrlen, RTA_GATEWAY); + if (nla) { + nla_memcpy(&r_cfg.fc_gateway, nla, 16); + r_cfg.fc_flags |= RTF_GATEWAY; + } + } + err = ip6_route_del(&r_cfg); + if (err) + last_err = err; + rtnh = rtnh_next(rtnh, &remaining); } @@ -2849,7 +2998,7 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) return err; if (cfg.fc_mp) - return ip6_route_multipath(&cfg, 0); + return ip6_route_multipath_del(&cfg); else return ip6_route_del(&cfg); } @@ -2864,7 +3013,7 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) return err; if (cfg.fc_mp) - return ip6_route_multipath(&cfg, 1); + return ip6_route_multipath_add(&cfg); else return ip6_route_add(&cfg); } -- cgit v1.2.3-59-g8ed1b From f53de1e9a4aaf8cbe08845da6f7ff26a078ac507 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 9 Sep 2015 14:20:56 +0200 Subject: net: ipv6: use common fib_default_rule_pref This switches IPv6 policy routing to use the shared fib_default_rule_pref() function of IPv4 and DECnet. It is also used in multicast routing for IPv4 as well as IPv6. 
The motivation for this patch is a complaint about iproute2 behaving inconsistent between IPv4 and IPv6 when adding policy rules: Formerly, IPv6 rules were assigned a fixed priority of 0x3FFF whereas for IPv4 the assigned priority value was decreased with each rule added. Since then all users of the default_pref field have been converted to assign the generic function fib_default_rule_pref(), fib_nl_newrule() may just use it directly instead. Therefore get rid of the function pointer altogether and make fib_default_rule_pref() static, as it's not used outside fib_rules.c anymore. Signed-off-by: Phil Sutter Signed-off-by: David S. Miller --- include/net/fib_rules.h | 2 -- net/core/fib_rules.c | 10 +++------- net/decnet/dn_rules.c | 1 - net/ipv4/fib_rules.c | 1 - net/ipv4/ipmr.c | 1 - net/ipv6/fib6_rules.c | 6 ------ net/ipv6/ip6mr.c | 1 - 7 files changed, 3 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 4e8f804f4589..59160de702b6 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -66,7 +66,6 @@ struct fib_rules_ops { struct nlattr **); int (*fill)(struct fib_rule *, struct sk_buff *, struct fib_rule_hdr *); - u32 (*default_pref)(struct fib_rules_ops *ops); size_t (*nlmsg_payload)(struct fib_rule *); /* Called after modifications to the rules set, must flush @@ -118,5 +117,4 @@ int fib_rules_lookup(struct fib_rules_ops *, struct flowi *, int flags, struct fib_lookup_arg *); int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table, u32 flags); -u32 fib_default_rule_pref(struct fib_rules_ops *ops); #endif diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index ae8306e7c56f..bf77e3639ce0 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -44,7 +44,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, } EXPORT_SYMBOL(fib_default_rule_add); -u32 fib_default_rule_pref(struct fib_rules_ops *ops) +static u32 fib_default_rule_pref(struct fib_rules_ops *ops) { struct list_head *pos; struct fib_rule *rule; @@ -60,7 +60,6 @@ u32 fib_default_rule_pref(struct fib_rules_ops *ops) return 0; } -EXPORT_SYMBOL(fib_default_rule_pref); static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, @@ -299,8 +298,8 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) } rule->fr_net = net; - if (tb[FRA_PRIORITY]) - rule->pref = nla_get_u32(tb[FRA_PRIORITY]); + rule->pref = tb[FRA_PRIORITY] ? 
nla_get_u32(tb[FRA_PRIORITY]) + : fib_default_rule_pref(ops); if (tb[FRA_IIFNAME]) { struct net_device *dev; @@ -350,9 +349,6 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) else rule->suppress_ifgroup = -1; - if (!tb[FRA_PRIORITY] && ops->default_pref) - rule->pref = ops->default_pref(ops); - err = -EINVAL; if (tb[FRA_GOTO]) { if (rule->action != FR_ACT_GOTO) diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c index 9d66a0f72f90..295bbd6a56f2 100644 --- a/net/decnet/dn_rules.c +++ b/net/decnet/dn_rules.c @@ -229,7 +229,6 @@ static const struct fib_rules_ops __net_initconst dn_fib_rules_ops_template = { .configure = dn_fib_rule_configure, .compare = dn_fib_rule_compare, .fill = dn_fib_rule_fill, - .default_pref = fib_default_rule_pref, .flush_cache = dn_fib_rule_flush_cache, .nlgroup = RTNLGRP_DECnet_RULE, .policy = dn_fib_rule_policy, diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 18123d50f576..f2bda9e89c61 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -318,7 +318,6 @@ static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = { .delete = fib4_rule_delete, .compare = fib4_rule_compare, .fill = fib4_rule_fill, - .default_pref = fib_default_rule_pref, .nlmsg_payload = fib4_rule_nlmsg_payload, .flush_cache = fib4_rule_flush_cache, .nlgroup = RTNLGRP_IPV4_RULE, diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 3a2c0162c3ba..866ee89f5254 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -233,7 +233,6 @@ static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = { .match = ipmr_rule_match, .configure = ipmr_rule_configure, .compare = ipmr_rule_compare, - .default_pref = fib_default_rule_pref, .fill = ipmr_rule_fill, .nlgroup = RTNLGRP_IPV4_RULE, .policy = ipmr_rule_policy, diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 2367a16eae58..9f777ec59a59 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -258,11 +258,6 @@ nla_put_failure: return -ENOBUFS; } -static u32 fib6_rule_default_pref(struct fib_rules_ops *ops) -{ - return 0x3FFF; -} - static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule) { return nla_total_size(16) /* dst */ @@ -279,7 +274,6 @@ static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = { .configure = fib6_rule_configure, .compare = fib6_rule_compare, .fill = fib6_rule_fill, - .default_pref = fib6_rule_default_pref, .nlmsg_payload = fib6_rule_nlmsg_payload, .nlgroup = RTNLGRP_IPV6_RULE, .policy = fib6_rule_policy, diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 5f36266b1f5e..0e004cc42a22 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -217,7 +217,6 @@ static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = { .match = ip6mr_rule_match, .configure = ip6mr_rule_configure, .compare = ip6mr_rule_compare, - .default_pref = fib_default_rule_pref, .fill = ip6mr_rule_fill, .nlgroup = RTNLGRP_IPV6_RULE, .policy = ip6mr_rule_policy, -- cgit v1.2.3-59-g8ed1b From 52fe51f8523751da0e79c85350c47eb3bb94da5b Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 10 Sep 2015 06:57:12 +0800 Subject: ipv6: fix ifnullfree.cocci warnings net/ipv6/route.c:2946:3-8: WARNING: NULL check before freeing functions like kfree, debugfs_remove, debugfs_remove_recursive or usb_free_urb is not needed. Maybe consider reorganizing relevant code to avoid passing NULL values. NULL check before some freeing functions is not needed. 
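The guard being removed is harmless but redundant: kfree(), like userspace free(), simply returns when handed a NULL pointer. A tiny standalone illustration using free() as a stand-in (the actual kernel change is in the diff below):

#include <stdlib.h>
#include <stdio.h>

int main(void)
{
	char *p = NULL;

	/* The pattern ifnullfree.cocci flags: the check repeats what the
	 * freeing function already does internally.
	 */
	if (p)
		free(p);

	/* Equivalent and shorter: free(NULL) - like kfree(NULL) - is a no-op. */
	free(p);

	puts("both forms are safe");
	return 0;
}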
Based on checkpatch warning "kfree(NULL) is safe this check is probably not required" and kfreeaddr.cocci by Julia Lawall. Generated by: scripts/coccinelle/free/ifnullfree.cocci CC: Roopa Prabhu Signed-off-by: Fengguang Wu Signed-off-by: David S. Miller --- net/ipv6/route.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 34539d3b843f..53617d715188 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2942,8 +2942,7 @@ cleanup: list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { if (nh->rt6_info) dst_free(&nh->rt6_info->dst); - if (nh->mxc.mx) - kfree(nh->mxc.mx); + kfree(nh->mxc.mx); list_del(&nh->next); kfree(nh); } -- cgit v1.2.3-59-g8ed1b From a66e36568e30ed3714c0e3a12bd3b64696343ff5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 10 Sep 2015 01:20:46 +0200 Subject: netlink, mmap: don't walk rx ring on poll if receive queue non-empty In case of netlink mmap, there can be situations where received frames have to be placed into the normal receive queue. The ring buffer indicates this through NL_MMAP_STATUS_COPY, so the user is asked to pick them up via recvmsg(2) syscall, and to put the slot back to NL_MMAP_STATUS_UNUSED. Commit 0ef707700f1c ("netlink: rx mmap: fix POLLIN condition") changed polling, so that we walk in the worst case the whole ring through the new netlink_has_valid_frame(), for example, when the ring would have no NL_MMAP_STATUS_VALID, but at least one NL_MMAP_STATUS_COPY frame. Since we do a datagram_poll() already earlier to pick up a mask that could possibly contain POLLIN | POLLRDNORM already (due to NL_MMAP_STATUS_COPY), we can skip checking the rx ring entirely. In case the kernel is compiled with !CONFIG_NETLINK_MMAP, then all this is irrelevant anyway as netlink_poll() is just defined as datagram_poll(). Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 50889be1517d..173817a5dfad 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -674,12 +674,19 @@ static unsigned int netlink_poll(struct file *file, struct socket *sock, mask = datagram_poll(file, sock, wait); - spin_lock_bh(&sk->sk_receive_queue.lock); - if (nlk->rx_ring.pg_vec) { - if (netlink_has_valid_frame(&nlk->rx_ring)) - mask |= POLLIN | POLLRDNORM; + /* We could already have received frames in the normal receive + * queue, that will show up as NL_MMAP_STATUS_COPY in the ring, + * so if mask contains pollin/etc already, there's no point + * walking the ring. 
+ */ + if ((mask & (POLLIN | POLLRDNORM)) != (POLLIN | POLLRDNORM)) { + spin_lock_bh(&sk->sk_receive_queue.lock); + if (nlk->rx_ring.pg_vec) { + if (netlink_has_valid_frame(&nlk->rx_ring)) + mask |= POLLIN | POLLRDNORM; + } + spin_unlock_bh(&sk->sk_receive_queue.lock); } - spin_unlock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_write_queue.lock); if (nlk->tx_ring.pg_vec) { -- cgit v1.2.3-59-g8ed1b From 6bb0fef489f667cf701853054f44579754f00a06 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 10 Sep 2015 02:10:57 +0200 Subject: netlink, mmap: fix edge-case leakages in nf queue zero-copy When netlink mmap on receive side is the consumer of nf queue data, it can happen that in some edge cases, we write skb shared info into the user space mmap buffer: Assume a possible rx ring frame size of only 4096, and the network skb, which is being zero-copied into the netlink skb, contains page frags with an overall skb->len larger than the linear part of the netlink skb. skb_zerocopy(), which is generic and thus not aware of the fact that shared info cannot be accessed for such skbs then tries to write and fill frags, thus leaking kernel data/pointers and in some corner cases possibly writing out of bounds of the mmap area (when filling the last slot in the ring buffer this way). I.e. the ring buffer slot is then of status NL_MMAP_STATUS_VALID, has an advertised length larger than 4096, where the linear part is visible at the slot beginning, and the leaked sizeof(struct skb_shared_info) has been written to the beginning of the next slot (also corrupting the struct nl_mmap_hdr slot header incl. status etc), since skb->end points to skb->data + ring->frame_size - NL_MMAP_HDRLEN. The fix adds and lets __netlink_alloc_skb() take the actual needed linear room for the network skb + meta data into account. It's completely irrelevant for non-mmaped netlink sockets, but in case mmap sockets are used, it can be decided whether the available skb_tailroom() is really large enough for the buffer, or whether it needs to internally fallback to a normal alloc_skb(). >From nf queue side, the information whether the destination port is an mmap RX ring is not really available without extra port-to-socket lookup, thus it can only be determined in lower layers i.e. when __netlink_alloc_skb() is called that checks internally for this. I chose to add the extra ldiff parameter as mmap will then still work: We have data_len and hlen in nfqnl_build_packet_message(), data_len is the full length (capped at queue->copy_range) for skb_zerocopy() and hlen some possible part of data_len that needs to be copied; the rem_len variable indicates the needed remaining linear mmap space. The only other workaround in nf queue internally would be after allocation time by f.e. cap'ing the data_len to the skb_tailroom() iff we deal with an mmap skb, but that would 1) expose the fact that we use a mmap skb to upper layers, and 2) trim the skb where we otherwise could just have moved the full skb into the normal receive queue. After the patch, in my test case the ring slot doesn't fit and therefore shows NL_MMAP_STATUS_COPY, where a full skb carries all the data and thus needs to be picked up via recv(). Fixes: 3ab1f683bf8b ("nfnetlink: add support for memory mapped netlink") Signed-off-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/linux/netlink.h | 13 +++++++++++-- net/netfilter/nfnetlink_queue_core.c | 5 +++-- net/netlink/af_netlink.c | 18 ++++++++++++------ 3 files changed, 26 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 9120edb650a0..639e9b8b0e4d 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -68,8 +68,17 @@ extern int netlink_change_ngroups(struct sock *sk, unsigned int groups); extern void __netlink_clear_multicast_users(struct sock *sk, unsigned int group); extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err); extern int netlink_has_listeners(struct sock *sk, unsigned int group); -extern struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, - u32 dst_portid, gfp_t gfp_mask); + +extern struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size, + unsigned int ldiff, u32 dst_portid, + gfp_t gfp_mask); +static inline struct sk_buff * +netlink_alloc_skb(struct sock *ssk, unsigned int size, u32 dst_portid, + gfp_t gfp_mask) +{ + return __netlink_alloc_skb(ssk, size, 0, dst_portid, gfp_mask); +} + extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock); extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation); diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 685cc6a17163..a5cd6d90b78b 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -301,7 +301,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, __be32 **packet_id_ptr) { size_t size; - size_t data_len = 0, cap_len = 0; + size_t data_len = 0, cap_len = 0, rem_len = 0; unsigned int hlen = 0; struct sk_buff *skb; struct nlattr *nla; @@ -360,6 +360,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, hlen = min_t(unsigned int, hlen, data_len); size += sizeof(struct nlattr) + hlen; cap_len = entskb->len; + rem_len = data_len - hlen; break; } @@ -377,7 +378,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, size += nla_total_size(seclen); } - skb = nfnetlink_alloc_skb(net, size, queue->peer_portid, + skb = __netlink_alloc_skb(net->nfnl, size, rem_len, queue->peer_portid, GFP_ATOMIC); if (!skb) { skb_tx_error(entskb); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 173817a5dfad..7f86d3b55060 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1844,15 +1844,16 @@ retry: } EXPORT_SYMBOL(netlink_unicast); -struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, - u32 dst_portid, gfp_t gfp_mask) +struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size, + unsigned int ldiff, u32 dst_portid, + gfp_t gfp_mask) { #ifdef CONFIG_NETLINK_MMAP + unsigned int maxlen, linear_size; struct sock *sk = NULL; struct sk_buff *skb; struct netlink_ring *ring; struct nl_mmap_hdr *hdr; - unsigned int maxlen; sk = netlink_getsockbyportid(ssk, dst_portid); if (IS_ERR(sk)) @@ -1863,7 +1864,11 @@ struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, if (ring->pg_vec == NULL) goto out_put; - if (ring->frame_size - NL_MMAP_HDRLEN < size) + /* We need to account the full linear size needed as a ring + * slot cannot have non-linear parts. 
+ */ + linear_size = size + ldiff; + if (ring->frame_size - NL_MMAP_HDRLEN < linear_size) goto out_put; skb = alloc_skb_head(gfp_mask); @@ -1877,13 +1882,14 @@ struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, /* check again under lock */ maxlen = ring->frame_size - NL_MMAP_HDRLEN; - if (maxlen < size) + if (maxlen < linear_size) goto out_free; netlink_forward_ring(ring); hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); if (hdr == NULL) goto err2; + netlink_ring_setup_skb(skb, sk, ring, hdr); netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED); atomic_inc(&ring->pending); @@ -1909,7 +1915,7 @@ out: #endif return alloc_skb(size, gfp_mask); } -EXPORT_SYMBOL_GPL(netlink_alloc_skb); +EXPORT_SYMBOL_GPL(__netlink_alloc_skb); int netlink_has_listeners(struct sock *sk, unsigned int group) { -- cgit v1.2.3-59-g8ed1b From 05c5a46d71f621df620fbabbd7758ee1b44575ad Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Wed, 9 Sep 2015 21:54:37 -0700 Subject: tcp: generate CA_EVENT_TX_START on data frames Issuing a CC TX_START event on control frames like pure ACK is a waste of time, as a CC should not care. Following patch needs this change, as we want CUBIC to properly track idle time at a low cost, with a single TX_START being generated. Yuchung might slightly refine the condition triggering TX_START on a followup patch. Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Cc: Jana Iyengar Cc: Stephen Hemminger Cc: Sangtae Ha Cc: Lawrence Brakmo Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1188e4fcf23b..f9a8a12b62ee 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -164,6 +164,9 @@ static void tcp_event_data_sent(struct tcp_sock *tp, struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_time_stamp; + if (tcp_packets_in_flight(tp) == 0) + tcp_ca_event(sk, CA_EVENT_TX_START); + tp->lsndtime = now; /* If it is a reply for ato after last received @@ -940,9 +943,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, &md5); tcp_header_size = tcp_options_size + sizeof(struct tcphdr); - if (tcp_packets_in_flight(tp) == 0) - tcp_ca_event(sk, CA_EVENT_TX_START); - /* if no packet is in qdisc/device queue, then allow XPS to select * another queue. We can be called from tcp_tsq_handler() * which holds one reference to sk_wmem_alloc. -- cgit v1.2.3-59-g8ed1b From 30927520dbae297182990bb21d08762bcc35ce1d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Sep 2015 21:55:07 -0700 Subject: tcp_cubic: better follow cubic curve after idle period Jana Iyengar found an interesting issue on CUBIC : The epoch is only updated/reset initially and when experiencing losses. The delta "t" of now - epoch_start can be arbitrary large after app idle as well as the bic_target. Consequentially the slope (inverse of ca->cnt) would be really large, and eventually ca->cnt would be lower-bounded in the end to 2 to have delayed-ACK slow-start behavior. This particularly shows up when slow_start_after_idle is disabled as a dangerous cwnd inflation (1.5 x RTT) after few seconds of idle time. Jana initial fix was to reset epoch_start if app limited, but Neal pointed out it would ask the CUBIC algorithm to recalculate the curve so that we again start growing steeply upward from where cwnd is now (as CUBIC does just after a loss). 
Ideally we'd want the cwnd growth curve to be the same shape, just shifted later in time by the amount of the idle period. Reported-by: Jana Iyengar Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Cc: Stephen Hemminger Cc: Sangtae Ha Cc: Lawrence Brakmo Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 28011fb1f4a2..c6ded6b2a79f 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -151,6 +151,21 @@ static void bictcp_init(struct sock *sk) tcp_sk(sk)->snd_ssthresh = initial_ssthresh; } +static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + if (event == CA_EVENT_TX_START) { + s32 delta = tcp_time_stamp - tcp_sk(sk)->lsndtime; + struct bictcp *ca = inet_csk_ca(sk); + + /* We were application limited (idle) for a while. + * Shift epoch_start to keep cwnd growth to cubic curve. + */ + if (ca->epoch_start && delta > 0) + ca->epoch_start += delta; + return; + } +} + /* calculate the cubic root of x using a table lookup followed by one * Newton-Raphson iteration. * Avg err ~= 0.195% @@ -450,6 +465,7 @@ static struct tcp_congestion_ops cubictcp __read_mostly = { .cong_avoid = bictcp_cong_avoid, .set_state = bictcp_state, .undo_cwnd = bictcp_undo_cwnd, + .cwnd_event = bictcp_cwnd_event, .pkts_acked = bictcp_acked, .owner = THIS_MODULE, .name = "cubic", -- cgit v1.2.3-59-g8ed1b
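To make the effect of the shift concrete, here is a small userspace sketch of the textbook CUBIC window function (RFC-style constants C = 0.4, beta = 0.7, floating point instead of the kernel's fixed-point math, so an illustration only). With Wmax = 100 packets and 2 s of growth before a 10 s idle period, a stale epoch makes the next target jump to roughly 290 packets, while shifting epoch_start by the idle time keeps the target just under Wmax, i.e. the curve resumes where it left off:

#include <math.h>
#include <stdio.h>

/* Textbook CUBIC target: W(t) = C * (t - K)^3 + Wmax,
 * with K = cbrt(Wmax * (1 - beta) / C), C = 0.4, beta = 0.7.
 */
static double cubic_target(double t, double wmax)
{
	const double C = 0.4, beta = 0.7;
	double K = cbrt(wmax * (1.0 - beta) / C);

	return C * pow(t - K, 3.0) + wmax;
}

int main(void)
{
	double wmax = 100.0;   /* cwnd at the last loss, in packets   */
	double grown = 2.0;    /* seconds of growth before going idle */
	double idle = 10.0;    /* seconds with nothing to send        */

	/* Stale epoch_start: the idle time counts as elapsed curve time. */
	printf("stale epoch  : %.1f packets\n", cubic_target(grown + idle, wmax));

	/* Patched behaviour: epoch_start += idle on CA_EVENT_TX_START,
	 * so the idle period is excluded and growth resumes on-curve.
	 */
	printf("shifted epoch: %.1f packets\n", cubic_target(grown, wmax));
	return 0;
}

Build with "cc -o cubic_shift cubic_shift.c -lm"; in this made-up scenario the two targets differ by roughly a factor of three, which is the kind of post-idle cwnd burst the patch avoids.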