From 9aee40006190a3cda9a4d2dbae71e92617c8c362 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Mon, 23 Jul 2018 17:49:39 -0700
Subject: tcp: ack immediately when a cwr packet arrives

We observed high 99 and 99.9% latencies when doing RPCs with DCTCP. The
problem is triggered when the last packet of a request arrives CE
marked. The reply will carry the ECE mark causing TCP to shrink its cwnd
to 1 (because there are no packets in flight). When the 1st packet of
the next request arrives, the ACK was sometimes delayed even though it
is CWR marked, adding up to 40ms to the RPC latency.

This patch insures that CWR marked data packets arriving will be acked
immediately.

Packetdrill script to reproduce the problem:

0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
0.000 setsockopt(3, SOL_TCP, TCP_CONGESTION, "dctcp", 5) = 0
0.000 bind(3, ..., ...) = 0
0.000 listen(3, 1) = 0

0.100 < [ect0] SEW 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
0.100 > SE. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
0.110 < [ect0] . 1:1(0) ack 1 win 257
0.200 accept(3, ..., ...) = 4

0.200 < [ect0] . 1:1001(1000) ack 1 win 257
0.200 > [ect01] . 1:1(0) ack 1001

0.200 write(4, ..., 1) = 1
0.200 > [ect01] P. 1:2(1) ack 1001

0.200 < [ect0] . 1001:2001(1000) ack 2 win 257
0.200 write(4, ..., 1) = 1
0.200 > [ect01] P. 2:3(1) ack 2001

0.200 < [ect0] . 2001:3001(1000) ack 3 win 257
0.200 < [ect0] . 3001:4001(1000) ack 3 win 257
0.200 > [ect01] . 3:3(0) ack 4001

0.210 < [ce] P. 4001:4501(500) ack 3 win 257

+0.001 read(4, ..., 4500) = 4500
+0 write(4, ..., 1) = 1
+0 > [ect01] PE. 3:4(1) ack 4501

+0.010 < [ect0] W. 4501:5501(1000) ack 4 win 257
// Previously the ACK sequence below would be 4501, causing a long RTO
+0.040~+0.045 > [ect01] . 4:4(0) ack 5501   // delayed ack

+0.311 < [ect0] . 5501:6501(1000) ack 4 win 257  // More data
+0 > [ect01] . 4:4(0) ack 6501     // now acks everything

+0.500 < F. 9501:9501(0) ack 4 win 257

Modified based on comments by Neal Cardwell <ncardwell@google.com>

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3bcd30a2ba06..f9dcb29be12d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -246,8 +246,15 @@ static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
 
 static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
 {
-	if (tcp_hdr(skb)->cwr)
+	if (tcp_hdr(skb)->cwr) {
 		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+
+		/* If the sender is telling us it has entered CWR, then its
+		 * cwnd may be very low (even just 1 packet), so we should ACK
+		 * immediately.
+		 */
+		tcp_enter_quickack_mode((struct sock *)tp, 2);
+	}
 }
 
 static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
-- 
cgit v1.2.3-59-g8ed1b


From b87bac1012c483462e7776c7b7320b659dbb3295 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Wed, 25 Jul 2018 06:06:13 +0000
Subject: net: igmp: make function __ip_mc_inc_group() static

Fixes the following sparse warnings:

net/ipv4/igmp.c:1391:6: warning:
 symbol '__ip_mc_inc_group' was not declared. Should it be static?

Fixes: 6e2059b53f98 ("ipv4/igmp: init group mode as INCLUDE when join source group")
Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/igmp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 28fef7d15959..75151be21413 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1387,7 +1387,8 @@ static void ip_mc_hash_remove(struct in_device *in_dev,
 /*
  *	A socket has joined a multicast group on device dev.
  */
-void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, unsigned int mode)
+static void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr,
+			      unsigned int mode)
 {
 	struct ip_mc_list *im;
 #ifdef CONFIG_IP_MULTICAST
-- 
cgit v1.2.3-59-g8ed1b


From 9fc12023d6f51551d6ca9ed7e02ecc19d79caf17 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Fri, 27 Jul 2018 18:15:46 +0200
Subject: ipv4: remove BUG_ON() from fib_compute_spec_dst

Remove BUG_ON() from fib_compute_spec_dst routine and check
in_dev pointer during flowi4 data structure initialization.
fib_compute_spec_dst routine can be run concurrently with device removal
where ip_ptr net_device pointer is set to NULL. This can happen
if userspace enables pkt info on UDP rx socket and the device
is removed while traffic is flowing

Fixes: 35ebf65e851c ("ipv4: Create and use fib_compute_spec_dst() helper")
Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index e46cdd310e5f..2998b0e47d4b 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -292,19 +292,19 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
 		return ip_hdr(skb)->daddr;
 
 	in_dev = __in_dev_get_rcu(dev);
-	BUG_ON(!in_dev);
 
 	net = dev_net(dev);
 
 	scope = RT_SCOPE_UNIVERSE;
 	if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
+		bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev);
 		struct flowi4 fl4 = {
 			.flowi4_iif = LOOPBACK_IFINDEX,
 			.flowi4_oif = l3mdev_master_ifindex_rcu(dev),
 			.daddr = ip_hdr(skb)->saddr,
 			.flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
 			.flowi4_scope = scope,
-			.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0,
+			.flowi4_mark = vmark ? skb->mark : 0,
 		};
 		if (!fib_lookup(net, &fl4, &res, 0))
 			return FIB_RES_PREFSRC(net, res);
-- 
cgit v1.2.3-59-g8ed1b


From 383d470936c05554219094a4d364d964cb324827 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Fri, 27 Jul 2018 17:19:12 -0400
Subject: tcp_bbr: fix bw probing to raise in-flight data for very small BDPs

For some very small BDPs (with just a few packets) there was a
quantization effect where the target number of packets in flight
during the super-unity-gain (1.25x) phase of gain cycling was
implicitly truncated to a number of packets no larger than the normal
unity-gain (1.0x) phase of gain cycling. This meant that in multi-flow
scenarios some flows could get stuck with a lower bandwidth, because
they did not push enough packets inflight to discover that there was
more bandwidth available. This was really only an issue in multi-flow
LAN scenarios, where RTTs and BDPs are low enough for this to be an
issue.

This fix ensures that gain cycling can raise inflight for small BDPs
by ensuring that in PROBE_BW mode target inflight values with a
super-unity gain are always greater than inflight values with a gain
<= 1. Importantly, this applies whether the inflight value is
calculated for use as a cwnd value, or as a target inflight value for
the end of the super-unity phase in bbr_is_next_cycle_phase() (both
need to be bigger to ensure we can probe with more packets in flight
reliably).

This is a candidate fix for stable releases.

Fixes: 0f8782ea1497 ("tcp_bbr: add BBR congestion control")
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Priyaranjan Jha <priyarjha@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_bbr.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 58e2f479ffb4..4bfff3c87e8e 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -354,6 +354,10 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
 	/* Reduce delayed ACKs by rounding up cwnd to the next even number. */
 	cwnd = (cwnd + 1) & ~1U;
 
+	/* Ensure gain cycling gets inflight above BDP even for small BDPs. */
+	if (bbr->mode == BBR_PROBE_BW && gain > BBR_UNIT)
+		cwnd += 2;
+
 	return cwnd;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 56e2c94f055d328f5f6b0a5c1721cca2f2d4e0a1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 30 Jul 2018 20:09:11 -0700
Subject: inet: frag: enforce memory limits earlier

We currently check current frags memory usage only when
a new frag queue is created. This allows attackers to first
consume the memory budget (default : 4 MB) creating thousands
of frag queues, then sending tiny skbs to exceed high_thresh
limit by 2 to 3 order of magnitude.

Note that before commit 648700f76b03 ("inet: frags: use rhashtables
for reassembly units"), work queue could be starved under DOS,
getting no cpu cycles.
After commit 648700f76b03, only the per frag queue timer can eventually
remove an incomplete frag queue and its skbs.

Fixes: b13d3cbfb8e8 ("inet: frag: move eviction of queues to work queue")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Jann Horn <jannh@google.com>
Cc: Florian Westphal <fw@strlen.de>
Cc: Peter Oskolkov <posk@google.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_fragment.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 1e4cf3ab560f..0d70608cc2e1 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -157,9 +157,6 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 {
 	struct inet_frag_queue *q;
 
-	if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
-		return NULL;
-
 	q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
 	if (!q)
 		return NULL;
@@ -204,6 +201,9 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
 {
 	struct inet_frag_queue *fq;
 
+	if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
+		return NULL;
+
 	rcu_read_lock();
 
 	fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
-- 
cgit v1.2.3-59-g8ed1b


From 4672694bd4f1aebdab0ad763ae4716e89cb15221 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 30 Jul 2018 21:50:29 -0700
Subject: ipv4: frags: handle possible skb truesize change

ip_frag_queue() might call pskb_pull() on one skb that
is already in the fragment queue.

We need to take care of possible truesize change, or we
might have an imbalance of the netns frags memory usage.

IPv6 is immune to this bug, because RFC5722, Section 4,
amended by Errata ID 3089 states :

  When reassembling an IPv6 datagram, if
  one or more its constituent fragments is determined to be an
  overlapping fragment, the entire datagram (and any constituent
  fragments) MUST be silently discarded.

Fixes: 158f323b9868 ("net: adjust skb->truesize in pskb_expand_head()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_fragment.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8e9528ebaa8e..d14d741fb05e 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -383,11 +383,16 @@ found:
 		int i = end - next->ip_defrag_offset; /* overlap is 'i' bytes */
 
 		if (i < next->len) {
+			int delta = -next->truesize;
+
 			/* Eat head of the next overlapped fragment
 			 * and leave the loop. The next ones cannot overlap.
 			 */
 			if (!pskb_pull(next, i))
 				goto err;
+			delta += next->truesize;
+			if (delta)
+				add_frag_mem_limit(qp->q.net, delta);
 			next->ip_defrag_offset += i;
 			qp->q.meat -= i;
 			if (next->ip_summed != CHECKSUM_UNNECESSARY)
-- 
cgit v1.2.3-59-g8ed1b