From 18a912e9a832dcfc7db9e055c7e41701ff5f9e69 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Wed, 22 Jul 2015 10:43:22 +0300
Subject: ipv4: fib_select_default should match the prefix

fib_trie starting from 4.1 can link fib aliases from
different prefixes in same list. Make sure the alternative
gateways are in same table and for same prefix (0) by
checking tb_id and fa_slen.

Fixes: 79e5ad2ceb00 ("fib_trie: Remove leaf_info")
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c7358ea4ae93..e1079583b8b7 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1207,12 +1207,17 @@ void fib_select_default(struct fib_result *res)
 	struct fib_info *fi = NULL, *last_resort = NULL;
 	struct hlist_head *fa_head = res->fa_head;
 	struct fib_table *tb = res->table;
+	u8 slen = 32 - res->prefixlen;
 	int order = -1, last_idx = -1;
 	struct fib_alias *fa;
 
 	hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
 		struct fib_info *next_fi = fa->fa_info;
 
+		if (fa->fa_slen != slen)
+			continue;
+		if (fa->tb_id != tb->tb_id)
+			continue;
 		if (next_fi->fib_scope != res->scope ||
 		    fa->fa_type != RTN_UNICAST)
 			continue;
-- 
cgit v1.2.3-59-g8ed1b


From 2392debc2be721a7d5b907cbcbc0ebb858dead01 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Wed, 22 Jul 2015 10:43:23 +0300
Subject: ipv4: consider TOS in fib_select_default

fib_select_default considers alternative routes only when
res->fi is for the first alias in res->fa_head. In the
common case this can happen only when the initial lookup
matches the first alias with highest TOS value. This
prevents the alternative routes to require specific TOS.

This patch solves the problem as follows:

- routes that require specific TOS should be returned by
fib_select_default only when TOS matches, as already done
in fib_table_lookup. This rule implies that depending on the
TOS we can have many different lists of alternative gateways
and we have to keep the last used gateway (fa_default) in first
alias for the TOS instead of using single tb_default value.

- as the aliases are ordered by many keys (TOS desc,
fib_priority asc), we restrict the possible results to
routes with matching TOS and lowest metric (fib_priority)
and routes that match any TOS, again with lowest metric.

For example, packet with TOS 8 can not use gw3 (not lowest
metric), gw4 (different TOS) and gw6 (not lowest metric),
all other gateways can be used:

tos 8 via gw1 metric 2 <--- res->fa_head and res->fi
tos 8 via gw2 metric 2
tos 8 via gw3 metric 3
tos 4 via gw4
tos 0 via gw5
tos 0 via gw6 metric 1

Reported-by: Hagen Paul Pfeifer <hagen@jauu.net>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h     |  3 +--
 net/ipv4/fib_lookup.h    |  1 +
 net/ipv4/fib_semantics.c | 36 +++++++++++++++++++++++++-----------
 net/ipv4/fib_trie.c      |  3 ++-
 net/ipv4/route.c         |  2 +-
 5 files changed, 30 insertions(+), 15 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 49c142bdf01e..5fa643b4e891 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -183,7 +183,6 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh);
 struct fib_table {
 	struct hlist_node	tb_hlist;
 	u32			tb_id;
-	int			tb_default;
 	int			tb_num_default;
 	struct rcu_head		rcu;
 	unsigned long 		*tb_data;
@@ -290,7 +289,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb);
 int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 			u8 tos, int oif, struct net_device *dev,
 			struct in_device *idev, u32 *itag);
-void fib_select_default(struct fib_result *res);
+void fib_select_default(const struct flowi4 *flp, struct fib_result *res);
 #ifdef CONFIG_IP_ROUTE_CLASSID
 static inline int fib_num_tclassid_users(struct net *net)
 {
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index c6211ed60b03..9c02920725db 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -13,6 +13,7 @@ struct fib_alias {
 	u8			fa_state;
 	u8			fa_slen;
 	u32			tb_id;
+	s16			fa_default;
 	struct rcu_head		rcu;
 };
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e1079583b8b7..3a06586b170c 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1202,28 +1202,40 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event)
 }
 
 /* Must be invoked inside of an RCU protected region.  */
-void fib_select_default(struct fib_result *res)
+void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
 {
 	struct fib_info *fi = NULL, *last_resort = NULL;
 	struct hlist_head *fa_head = res->fa_head;
 	struct fib_table *tb = res->table;
 	u8 slen = 32 - res->prefixlen;
 	int order = -1, last_idx = -1;
-	struct fib_alias *fa;
+	struct fib_alias *fa, *fa1 = NULL;
+	u32 last_prio = res->fi->fib_priority;
+	u8 last_tos = 0;
 
 	hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
 		struct fib_info *next_fi = fa->fa_info;
 
 		if (fa->fa_slen != slen)
 			continue;
+		if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+			continue;
 		if (fa->tb_id != tb->tb_id)
 			continue;
+		if (next_fi->fib_priority > last_prio &&
+		    fa->fa_tos == last_tos) {
+			if (last_tos)
+				continue;
+			break;
+		}
+		if (next_fi->fib_flags & RTNH_F_DEAD)
+			continue;
+		last_tos = fa->fa_tos;
+		last_prio = next_fi->fib_priority;
+
 		if (next_fi->fib_scope != res->scope ||
 		    fa->fa_type != RTN_UNICAST)
 			continue;
-
-		if (next_fi->fib_priority > res->fi->fib_priority)
-			break;
 		if (!next_fi->fib_nh[0].nh_gw ||
 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
 			continue;
@@ -1233,10 +1245,11 @@ void fib_select_default(struct fib_result *res)
 		if (!fi) {
 			if (next_fi != res->fi)
 				break;
+			fa1 = fa;
 		} else if (!fib_detect_death(fi, order, &last_resort,
-					     &last_idx, tb->tb_default)) {
+					     &last_idx, fa1->fa_default)) {
 			fib_result_assign(res, fi);
-			tb->tb_default = order;
+			fa1->fa_default = order;
 			goto out;
 		}
 		fi = next_fi;
@@ -1244,20 +1257,21 @@ void fib_select_default(struct fib_result *res)
 	}
 
 	if (order <= 0 || !fi) {
-		tb->tb_default = -1;
+		if (fa1)
+			fa1->fa_default = -1;
 		goto out;
 	}
 
 	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
-				tb->tb_default)) {
+			      fa1->fa_default)) {
 		fib_result_assign(res, fi);
-		tb->tb_default = order;
+		fa1->fa_default = order;
 		goto out;
 	}
 
 	if (last_idx >= 0)
 		fib_result_assign(res, last_resort);
-	tb->tb_default = last_idx;
+	fa1->fa_default = last_idx;
 out:
 	return;
 }
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 15d32612e3c6..81797e065b21 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1171,6 +1171,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 			new_fa->fa_state = state & ~FA_S_ACCESSED;
 			new_fa->fa_slen = fa->fa_slen;
 			new_fa->tb_id = tb->tb_id;
+			new_fa->fa_default = -1;
 
 			err = switchdev_fib_ipv4_add(key, plen, fi,
 						     new_fa->fa_tos,
@@ -1222,6 +1223,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 	new_fa->fa_state = 0;
 	new_fa->fa_slen = slen;
 	new_fa->tb_id = tb->tb_id;
+	new_fa->fa_default = -1;
 
 	/* (Optionally) offload fib entry to switch hardware. */
 	err = switchdev_fib_ipv4_add(key, plen, fi, tos, cfg->fc_type,
@@ -1990,7 +1992,6 @@ struct fib_table *fib_trie_table(u32 id, struct fib_table *alias)
 		return NULL;
 
 	tb->tb_id = id;
-	tb->tb_default = -1;
 	tb->tb_num_default = 0;
 	tb->tb_data = (alias ? alias->__data : tb->__data);
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d0362a2de3d3..e681b852ced1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2176,7 +2176,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 	if (!res.prefixlen &&
 	    res.table->tb_num_default > 1 &&
 	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
-		fib_select_default(&res);
+		fib_select_default(fl4, &res);
 
 	if (!fl4->saddr)
 		fl4->saddr = FIB_RES_PREFSRC(net, res);
-- 
cgit v1.2.3-59-g8ed1b


From d1fe19444d82e399e38c1594c71b850eca8e9de0 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 23 Jul 2015 12:05:37 +0200
Subject: inet: frag: don't re-use chainlist for evictor

commit 65ba1f1ec0eff ("inet: frags: fix a race between inet_evict_bucket
and inet_frag_kill") describes the bug, but the fix doesn't work reliably.

Problem is that ->flags member can be set on other cpu without chainlock
being held by that task, i.e. the RMW-Cycle can clear INET_FRAG_EVICTED
bit after we put the element on the evictor private list.

We can crash when walking the 'private' evictor list since an element can
be deleted from list underneath the evictor.

Join work with Nikolay Alexandrov.

Fixes: b13d3cbfb8e8 ("inet: frag: move eviction of queues to work queue")
Reported-by: Johan Schuijt <johan@transip.nl>
Tested-by: Frank Schreuder <fschreuder@transip.nl>
Signed-off-by: Nikolay Alexandrov <nikolay@cumulusnetworks.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h  | 2 ++
 net/ipv4/inet_fragment.c | 8 +++-----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index e1300b3dd597..56a3a5685f76 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -45,6 +45,7 @@ enum {
  * @flags: fragment queue flags
  * @max_size: maximum received fragment size
  * @net: namespace that this frag belongs to
+ * @list_evictor: list of queues to forcefully evict (e.g. due to low memory)
  */
 struct inet_frag_queue {
 	spinlock_t		lock;
@@ -59,6 +60,7 @@ struct inet_frag_queue {
 	__u8			flags;
 	u16			max_size;
 	struct netns_frags	*net;
+	struct hlist_node	list_evictor;
 };
 
 #define INETFRAGS_HASHSZ	1024
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 5e346a082e5f..172234864fec 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -151,14 +151,13 @@ evict_again:
 		}
 
 		fq->flags |= INET_FRAG_EVICTED;
-		hlist_del(&fq->list);
-		hlist_add_head(&fq->list, &expired);
+		hlist_add_head(&fq->list_evictor, &expired);
 		++evicted;
 	}
 
 	spin_unlock(&hb->chain_lock);
 
-	hlist_for_each_entry_safe(fq, n, &expired, list)
+	hlist_for_each_entry_safe(fq, n, &expired, list_evictor)
 		f->frag_expire((unsigned long) fq);
 
 	return evicted;
@@ -284,8 +283,7 @@ static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
 	struct inet_frag_bucket *hb;
 
 	hb = get_frag_bucket_locked(fq, f);
-	if (!(fq->flags & INET_FRAG_EVICTED))
-		hlist_del(&fq->list);
+	hlist_del(&fq->list);
 	spin_unlock(&hb->chain_lock);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 0e60d245a0be7fdbb723607f1d6621007916b252 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 23 Jul 2015 12:05:38 +0200
Subject: inet: frag: change *_frag_mem_limit functions to take netns_frags as
 argument

Followup patch will call it after inet_frag_queue was freed, so q->net
doesn't work anymore (but netf = q->net; free(q); mem_limit(netf) would).

Tested-by: Frank Schreuder <fschreuder@transip.nl>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h                 |  8 ++++----
 net/ieee802154/6lowpan/reassembly.c     |  6 +++---
 net/ipv4/inet_fragment.c                |  4 ++--
 net/ipv4/ip_fragment.c                  | 10 +++++-----
 net/ipv6/netfilter/nf_conntrack_reasm.c |  6 +++---
 net/ipv6/reassembly.c                   |  6 +++---
 6 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 56a3a5685f76..e71ca17024f2 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -141,14 +141,14 @@ static inline int frag_mem_limit(struct netns_frags *nf)
 	return percpu_counter_read(&nf->mem);
 }
 
-static inline void sub_frag_mem_limit(struct inet_frag_queue *q, int i)
+static inline void sub_frag_mem_limit(struct netns_frags *nf, int i)
 {
-	__percpu_counter_add(&q->net->mem, -i, frag_percpu_counter_batch);
+	__percpu_counter_add(&nf->mem, -i, frag_percpu_counter_batch);
 }
 
-static inline void add_frag_mem_limit(struct inet_frag_queue *q, int i)
+static inline void add_frag_mem_limit(struct netns_frags *nf, int i)
 {
-	__percpu_counter_add(&q->net->mem, i, frag_percpu_counter_batch);
+	__percpu_counter_add(&nf->mem, i, frag_percpu_counter_batch);
 }
 
 static inline void init_frag_mem_limit(struct netns_frags *nf)
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index f46e4d1306f2..214d44aef35b 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -207,7 +207,7 @@ found:
 	} else {
 		fq->q.meat += skb->len;
 	}
-	add_frag_mem_limit(&fq->q, skb->truesize);
+	add_frag_mem_limit(fq->q.net, skb->truesize);
 
 	if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
 	    fq->q.meat == fq->q.len) {
@@ -287,7 +287,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev,
 		clone->data_len = clone->len;
 		head->data_len -= clone->len;
 		head->len -= clone->len;
-		add_frag_mem_limit(&fq->q, clone->truesize);
+		add_frag_mem_limit(fq->q.net, clone->truesize);
 	}
 
 	WARN_ON(head == NULL);
@@ -310,7 +310,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev,
 		}
 		fp = next;
 	}
-	sub_frag_mem_limit(&fq->q, sum_truesize);
+	sub_frag_mem_limit(fq->q.net, sum_truesize);
 
 	head->next = NULL;
 	head->dev = dev;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 172234864fec..4473232e4e88 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -328,7 +328,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
 		fp = xp;
 	}
 	sum = sum_truesize + f->qsize;
-	sub_frag_mem_limit(q, sum);
+	sub_frag_mem_limit(q->net, sum);
 
 	if (f->destructor)
 		f->destructor(q);
@@ -388,7 +388,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 
 	q->net = nf;
 	f->constructor(q, arg);
-	add_frag_mem_limit(q, f->qsize);
+	add_frag_mem_limit(nf, f->qsize);
 
 	setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
 	spin_lock_init(&q->lock);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 31f71b15cfba..b4a77d021c0d 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -309,7 +309,7 @@ static int ip_frag_reinit(struct ipq *qp)
 		kfree_skb(fp);
 		fp = xp;
 	} while (fp);
-	sub_frag_mem_limit(&qp->q, sum_truesize);
+	sub_frag_mem_limit(qp->q.net, sum_truesize);
 
 	qp->q.flags = 0;
 	qp->q.len = 0;
@@ -455,7 +455,7 @@ found:
 				qp->q.fragments = next;
 
 			qp->q.meat -= free_it->len;
-			sub_frag_mem_limit(&qp->q, free_it->truesize);
+			sub_frag_mem_limit(qp->q.net, free_it->truesize);
 			kfree_skb(free_it);
 		}
 	}
@@ -479,7 +479,7 @@ found:
 	qp->q.stamp = skb->tstamp;
 	qp->q.meat += skb->len;
 	qp->ecn |= ecn;
-	add_frag_mem_limit(&qp->q, skb->truesize);
+	add_frag_mem_limit(qp->q.net, skb->truesize);
 	if (offset == 0)
 		qp->q.flags |= INET_FRAG_FIRST_IN;
 
@@ -587,7 +587,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 		head->len -= clone->len;
 		clone->csum = 0;
 		clone->ip_summed = head->ip_summed;
-		add_frag_mem_limit(&qp->q, clone->truesize);
+		add_frag_mem_limit(qp->q.net, clone->truesize);
 	}
 
 	skb_push(head, head->data - skb_network_header(head));
@@ -615,7 +615,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 		}
 		fp = next;
 	}
-	sub_frag_mem_limit(&qp->q, sum_truesize);
+	sub_frag_mem_limit(qp->q.net, sum_truesize);
 
 	head->next = NULL;
 	head->dev = dev;
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 6f187c8d8a1b..6d02498172c1 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -348,7 +348,7 @@ found:
 	fq->ecn |= ecn;
 	if (payload_len > fq->q.max_size)
 		fq->q.max_size = payload_len;
-	add_frag_mem_limit(&fq->q, skb->truesize);
+	add_frag_mem_limit(fq->q.net, skb->truesize);
 
 	/* The first fragment.
 	 * nhoffset is obtained from the first fragment, of course.
@@ -430,7 +430,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
 		clone->ip_summed = head->ip_summed;
 
 		NFCT_FRAG6_CB(clone)->orig = NULL;
-		add_frag_mem_limit(&fq->q, clone->truesize);
+		add_frag_mem_limit(fq->q.net, clone->truesize);
 	}
 
 	/* We have to remove fragment header from datagram and to relocate
@@ -454,7 +454,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
 			head->csum = csum_add(head->csum, fp->csum);
 		head->truesize += fp->truesize;
 	}
-	sub_frag_mem_limit(&fq->q, head->truesize);
+	sub_frag_mem_limit(fq->q.net, head->truesize);
 
 	head->ignore_df = 1;
 	head->next = NULL;
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 8ffa2c8cce77..5c3bbca6a150 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -330,7 +330,7 @@ found:
 	fq->q.stamp = skb->tstamp;
 	fq->q.meat += skb->len;
 	fq->ecn |= ecn;
-	add_frag_mem_limit(&fq->q, skb->truesize);
+	add_frag_mem_limit(fq->q.net, skb->truesize);
 
 	/* The first fragment.
 	 * nhoffset is obtained from the first fragment, of course.
@@ -443,7 +443,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 		head->len -= clone->len;
 		clone->csum = 0;
 		clone->ip_summed = head->ip_summed;
-		add_frag_mem_limit(&fq->q, clone->truesize);
+		add_frag_mem_limit(fq->q.net, clone->truesize);
 	}
 
 	/* We have to remove fragment header from datagram and to relocate
@@ -481,7 +481,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 		}
 		fp = next;
 	}
-	sub_frag_mem_limit(&fq->q, sum_truesize);
+	sub_frag_mem_limit(fq->q.net, sum_truesize);
 
 	head->next = NULL;
 	head->dev = dev;
-- 
cgit v1.2.3-59-g8ed1b


From 5719b296fb81502d0dbbb4e87b3235e5bdcdfc6b Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 23 Jul 2015 12:05:39 +0200
Subject: inet: frag: don't wait for timer deletion when evicting

Frank reports 'NMI watchdog: BUG: soft lockup' errors when
load is high.  Instead of (potentially) unbounded restarts of the
eviction process, just skip to the next entry.

One caveat is that, when a netns is exiting, a timer may still be running
by the time inet_evict_bucket returns.

We use the frag memory accounting to wait for outstanding timers,
so that when we free the percpu counter we can be sure no running
timer will trip over it.

Reported-and-tested-by: Frank Schreuder <fschreuder@transip.nl>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_fragment.c | 29 +++++++++++------------------
 1 file changed, 11 insertions(+), 18 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 4473232e4e88..a00ca4c00c35 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -131,24 +131,14 @@ inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
 	unsigned int evicted = 0;
 	HLIST_HEAD(expired);
 
-evict_again:
 	spin_lock(&hb->chain_lock);
 
 	hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
 		if (!inet_fragq_should_evict(fq))
 			continue;
 
-		if (!del_timer(&fq->timer)) {
-			/* q expiring right now thus increment its refcount so
-			 * it won't be freed under us and wait until the timer
-			 * has finished executing then destroy it
-			 */
-			atomic_inc(&fq->refcnt);
-			spin_unlock(&hb->chain_lock);
-			del_timer_sync(&fq->timer);
-			inet_frag_put(fq, f);
-			goto evict_again;
-		}
+		if (!del_timer(&fq->timer))
+			continue;
 
 		fq->flags |= INET_FRAG_EVICTED;
 		hlist_add_head(&fq->list_evictor, &expired);
@@ -239,18 +229,20 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
 	int i;
 
 	nf->low_thresh = 0;
-	local_bh_disable();
 
 evict_again:
+	local_bh_disable();
 	seq = read_seqbegin(&f->rnd_seqlock);
 
 	for (i = 0; i < INETFRAGS_HASHSZ ; i++)
 		inet_evict_bucket(f, &f->hash[i]);
 
-	if (read_seqretry(&f->rnd_seqlock, seq))
-		goto evict_again;
-
 	local_bh_enable();
+	cond_resched();
+
+	if (read_seqretry(&f->rnd_seqlock, seq) ||
+	    percpu_counter_sum(&nf->mem))
+		goto evict_again;
 
 	percpu_counter_destroy(&nf->mem);
 }
@@ -284,6 +276,7 @@ static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
 
 	hb = get_frag_bucket_locked(fq, f);
 	hlist_del(&fq->list);
+	fq->flags |= INET_FRAG_COMPLETE;
 	spin_unlock(&hb->chain_lock);
 }
 
@@ -295,7 +288,6 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
 	if (!(fq->flags & INET_FRAG_COMPLETE)) {
 		fq_unlink(fq, f);
 		atomic_dec(&fq->refcnt);
-		fq->flags |= INET_FRAG_COMPLETE;
 	}
 }
 EXPORT_SYMBOL(inet_frag_kill);
@@ -328,11 +320,12 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
 		fp = xp;
 	}
 	sum = sum_truesize + f->qsize;
-	sub_frag_mem_limit(q->net, sum);
 
 	if (f->destructor)
 		f->destructor(q);
 	kmem_cache_free(f->frags_cachep, q);
+
+	sub_frag_mem_limit(nf, sum);
 }
 EXPORT_SYMBOL(inet_frag_destroy);
 
-- 
cgit v1.2.3-59-g8ed1b


From caaecdd3d3f8ec0ea9906c54b1dd8ec8316d26b9 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Thu, 23 Jul 2015 12:05:40 +0200
Subject: inet: frags: remove INET_FRAG_EVICTED and use list_evictor for the
 test

We can simply remove the INET_FRAG_EVICTED flag to avoid all the flags
race conditions with the evictor and use a participation test for the
evictor list, when we're at that point (after inet_frag_kill) in the
timer there're 2 possible cases:

1. The evictor added the entry to its evictor list while the timer was
waiting for the chainlock
or
2. The timer unchained the entry and the evictor won't see it

In both cases we should be able to see list_evictor correctly due
to the sync on the chainlock.

Joint work with Florian Westphal.

Tested-by: Frank Schreuder <fschreuder@transip.nl>
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h  | 7 +++++--
 net/ipv4/inet_fragment.c | 1 -
 net/ipv4/ip_fragment.c   | 2 +-
 net/ipv6/reassembly.c    | 2 +-
 4 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index e71ca17024f2..53eead2da743 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -21,13 +21,11 @@ struct netns_frags {
  * @INET_FRAG_FIRST_IN: first fragment has arrived
  * @INET_FRAG_LAST_IN: final fragment has arrived
  * @INET_FRAG_COMPLETE: frag queue has been processed and is due for destruction
- * @INET_FRAG_EVICTED: frag queue is being evicted
  */
 enum {
 	INET_FRAG_FIRST_IN	= BIT(0),
 	INET_FRAG_LAST_IN	= BIT(1),
 	INET_FRAG_COMPLETE	= BIT(2),
-	INET_FRAG_EVICTED	= BIT(3)
 };
 
 /**
@@ -127,6 +125,11 @@ static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f
 		inet_frag_destroy(q, f);
 }
 
+static inline bool inet_frag_evicting(struct inet_frag_queue *q)
+{
+	return !hlist_unhashed(&q->list_evictor);
+}
+
 /* Memory Tracking Functions. */
 
 /* The default percpu_counter batch size is not big enough to scale to
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index a00ca4c00c35..d0a7c0319e3d 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -140,7 +140,6 @@ inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
 		if (!del_timer(&fq->timer))
 			continue;
 
-		fq->flags |= INET_FRAG_EVICTED;
 		hlist_add_head(&fq->list_evictor, &expired);
 		++evicted;
 	}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b4a77d021c0d..921138f6c97c 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -202,7 +202,7 @@ static void ip_expire(unsigned long arg)
 	ipq_kill(qp);
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
 
-	if (!(qp->q.flags & INET_FRAG_EVICTED)) {
+	if (!inet_frag_evicting(&qp->q)) {
 		struct sk_buff *head = qp->q.fragments;
 		const struct iphdr *iph;
 		int err;
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 5c3bbca6a150..f1159bb76e0a 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -144,7 +144,7 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
 
 	IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
 
-	if (fq->q.flags & INET_FRAG_EVICTED)
+	if (inet_frag_evicting(&fq->q))
 		goto out_rcu_unlock;
 
 	IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
-- 
cgit v1.2.3-59-g8ed1b


From dfbafc995304ebb9a9b03f65083e6e9cea143b20 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Fri, 24 Jul 2015 18:19:25 +0200
Subject: tcp: fix recv with flags MSG_WAITALL | MSG_PEEK

Currently, tcp_recvmsg enters a busy loop in sk_wait_data if called
with flags = MSG_WAITALL | MSG_PEEK.

sk_wait_data waits for sk_receive_queue not empty, but in this case,
the receive queue is not empty, but does not contain any skb that we
can use.

Add a "last skb seen on receive queue" argument to sk_wait_data, so
that it sleeps until the receive queue has new skbs.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=99461
Link: https://sourceware.org/bugzilla/show_bug.cgi?id=18493
Link: https://bugzilla.redhat.com/show_bug.cgi?id=1205258
Reported-by: Enrico Scholz <rh-bugzilla@ensc.de>
Reported-by: Dan Searle <dan@censornet.com>
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h |  2 +-
 net/core/sock.c    |  5 +++--
 net/dccp/proto.c   |  2 +-
 net/ipv4/tcp.c     | 11 +++++++----
 net/llc/af_llc.c   |  4 ++--
 5 files changed, 14 insertions(+), 10 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/sock.h b/include/net/sock.h
index 05a8c1aea251..f21f0708ec59 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -902,7 +902,7 @@ void sk_stream_kill_queues(struct sock *sk);
 void sk_set_memalloc(struct sock *sk);
 void sk_clear_memalloc(struct sock *sk);
 
-int sk_wait_data(struct sock *sk, long *timeo);
+int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb);
 
 struct request_sock_ops;
 struct timewait_sock_ops;
diff --git a/net/core/sock.c b/net/core/sock.c
index 08f16db46070..8a14f1285fc4 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1967,20 +1967,21 @@ static void __release_sock(struct sock *sk)
  * sk_wait_data - wait for data to arrive at sk_receive_queue
  * @sk:    sock to wait on
  * @timeo: for how long
+ * @skb:   last skb seen on sk_receive_queue
  *
  * Now socket state including sk->sk_err is changed only under lock,
  * hence we may omit checks after joining wait queue.
  * We check receive queue before schedule() only as optimization;
  * it is very likely that release_sock() added new data.
  */
-int sk_wait_data(struct sock *sk, long *timeo)
+int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
 {
 	int rc;
 	DEFINE_WAIT(wait);
 
 	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
-	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
+	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
 	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
 	finish_wait(sk_sleep(sk), &wait);
 	return rc;
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 52a94016526d..b5cf13a28009 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -886,7 +886,7 @@ verify_sock_status:
 			break;
 		}
 
-		sk_wait_data(sk, &timeo);
+		sk_wait_data(sk, &timeo, NULL);
 		continue;
 	found_ok_skb:
 		if (len > skb->len)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7f4056785acc..45534a5ab430 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -780,7 +780,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
 				ret = -EAGAIN;
 				break;
 			}
-			sk_wait_data(sk, &timeo);
+			sk_wait_data(sk, &timeo, NULL);
 			if (signal_pending(current)) {
 				ret = sock_intr_errno(timeo);
 				break;
@@ -1575,7 +1575,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 	int target;		/* Read at least this many bytes */
 	long timeo;
 	struct task_struct *user_recv = NULL;
-	struct sk_buff *skb;
+	struct sk_buff *skb, *last;
 	u32 urg_hole = 0;
 
 	if (unlikely(flags & MSG_ERRQUEUE))
@@ -1635,7 +1635,9 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 
 		/* Next get a buffer. */
 
+		last = skb_peek_tail(&sk->sk_receive_queue);
 		skb_queue_walk(&sk->sk_receive_queue, skb) {
+			last = skb;
 			/* Now that we have two receive queues this
 			 * shouldn't happen.
 			 */
@@ -1754,8 +1756,9 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 			/* Do not sleep, just process backlog. */
 			release_sock(sk);
 			lock_sock(sk);
-		} else
-			sk_wait_data(sk, &timeo);
+		} else {
+			sk_wait_data(sk, &timeo, last);
+		}
 
 		if (user_recv) {
 			int chunk;
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 8fd9febaa5ba..8dab4e569571 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -613,7 +613,7 @@ static int llc_wait_data(struct sock *sk, long timeo)
 		if (signal_pending(current))
 			break;
 		rc = 0;
-		if (sk_wait_data(sk, &timeo))
+		if (sk_wait_data(sk, &timeo, NULL))
 			break;
 	}
 	return rc;
@@ -802,7 +802,7 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 			release_sock(sk);
 			lock_sock(sk);
 		} else
-			sk_wait_data(sk, &timeo);
+			sk_wait_data(sk, &timeo, NULL);
 
 		if ((flags & MSG_PEEK) && peek_seq != llc->copied_seq) {
 			net_dbg_ratelimited("LLC(%s:%d): Application bug, race in MSG_PEEK\n",
-- 
cgit v1.2.3-59-g8ed1b


From 1513069edcf8dd86cfd8d5daef482b97d6b93df6 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Mon, 27 Jul 2015 13:08:06 -0700
Subject: fib_trie: Drop unnecessary calls to leaf_pull_suffix

It was reported that update_suffix was taking a long time on systems where
a large number of leaves were attached to a single node.  As it turns out
fib_table_flush was calling update_suffix for each leaf that didn't have all
of the aliases stripped from it.  As a result, on this large node removing
one leaf would result in us calling update_suffix for every other leaf on
the node.

The fix is to just remove the calls to leaf_pull_suffix since they are
redundant as we already have a call in resize that will go through and
update the suffix length for the node before we exit out of
fib_table_flush or fib_table_flush_external.

Reported-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Tested-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 81797e065b21..37c4bb89a708 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1793,8 +1793,6 @@ void fib_table_flush_external(struct fib_table *tb)
 		if (hlist_empty(&n->leaf)) {
 			put_child_root(pn, n->key, NULL);
 			node_free(n);
-		} else {
-			leaf_pull_suffix(pn, n);
 		}
 	}
 }
@@ -1864,8 +1862,6 @@ int fib_table_flush(struct fib_table *tb)
 		if (hlist_empty(&n->leaf)) {
 			put_child_root(pn, n->key, NULL);
 			node_free(n);
-		} else {
-			leaf_pull_suffix(pn, n);
 		}
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 865b804244f228e80fb62abe464296399253cce8 Mon Sep 17 00:00:00 2001
From: David Ward <david.ward@ll.mit.edu>
Date: Sun, 26 Jul 2015 12:18:58 -0400
Subject: net/ipv4: suppress NETDEV_UP notification on address lifetime update

This notification causes the FIB to be updated, which is not needed
because the address already exists, and more importantly it may undo
intentional changes that were made to the FIB after the address was
originally added. (As a point of comparison, when an address becomes
deprecated because its preferred lifetime expired, a notification on
this chain is not generated.)

The motivation for this commit is fixing an incompatibility between
DHCP clients which set and update the address lifetime according to
the lease, and a commercial VPN client which replaces kernel routes
in a way that outbound traffic is sent only through the tunnel (and
disconnects if any further route changes are detected via netlink).

Signed-off-by: David Ward <david.ward@ll.mit.edu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/devinet.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index e813196c91c7..2d9cb1748f81 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -882,7 +882,6 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
 		queue_delayed_work(system_power_efficient_wq,
 				&check_lifetime_work, 0);
 		rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid);
-		blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
 	}
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 11c91ef98f37cd743098de26160fffd7f9bd40e1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 27 Jul 2015 11:33:50 +0200
Subject: arp: filter NOARP neighbours for SIOCGARP

When arp is off on a device, and ioctl(SIOCGARP) is queried,
a buggy answer is given with MAC address of the device, instead
of the mac address of the destination/gateway.

We filter out NUD_NOARP neighbours for /proc/net/arp,
we must do the same for SIOCGARP ioctl.

Tested:

lpaa23:~# ./arp 10.246.7.190
MAC=00:01:e8:22:cb:1d      // correct answer

lpaa23:~# ip link set dev eth0 arp off
lpaa23:~# cat /proc/net/arp   # check arp table is now 'empty'
IP address       HW type     Flags       HW address    Mask     Device
lpaa23:~# ./arp 10.246.7.190
MAC=00:1a:11:c3:0d:7f   // buggy answer before patch (this is eth0 mac)

After patch :

lpaa23:~# ip link set dev eth0 arp off
lpaa23:~# ./arp 10.246.7.190
ioctl(SIOCGARP) failed: No such device or address

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Vytautas Valancius <valas@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/arp.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 933a92820d26..6c8b1fbafce8 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1017,14 +1017,16 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev)
 
 	neigh = neigh_lookup(&arp_tbl, &ip, dev);
 	if (neigh) {
-		read_lock_bh(&neigh->lock);
-		memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
-		r->arp_flags = arp_state_to_flags(neigh);
-		read_unlock_bh(&neigh->lock);
-		r->arp_ha.sa_family = dev->type;
-		strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
+		if (!(neigh->nud_state & NUD_NOARP)) {
+			read_lock_bh(&neigh->lock);
+			memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
+			r->arp_flags = arp_state_to_flags(neigh);
+			read_unlock_bh(&neigh->lock);
+			r->arp_ha.sa_family = dev->type;
+			strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
+			err = 0;
+		}
 		neigh_release(neigh);
-		err = 0;
 	}
 	return err;
 }
-- 
cgit v1.2.3-59-g8ed1b