From 15ecbe94a45ef88491ca459b26efdd02f91edb6d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 Jun 2018 08:47:21 -0700 Subject: tcp: add one more quick ack after after ECN events Larry Brakmo proposal ( https://patchwork.ozlabs.org/patch/935233/ tcp: force cwnd at least 2 in tcp_cwnd_reduction) made us rethink about our recent patch removing ~16 quick acks after ECN events. tcp_enter_quickack_mode(sk, 1) makes sure one immediate ack is sent, but in the case the sender cwnd was lowered to 1, we do not want to have a delayed ack for the next packet we will receive. Fixes: 522040ea5fdd ("tcp: do not aggressively quick ack after ECN events") Signed-off-by: Eric Dumazet Reported-by: Neal Cardwell Cc: Lawrence Brakmo Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 355d3dffd021..045d930d01a9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -265,7 +265,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) * it is probably a retransmit. */ if (tp->ecn_flags & TCP_ECN_SEEN) - tcp_enter_quickack_mode(sk, 1); + tcp_enter_quickack_mode(sk, 2); break; case INET_ECN_CE: if (tcp_ca_needs_ecn(sk)) @@ -273,7 +273,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { /* Better not delay acks, sender can have a very low cwnd */ - tcp_enter_quickack_mode(sk, 1); + tcp_enter_quickack_mode(sk, 2); tp->ecn_flags |= TCP_ECN_DEMAND_CWR; } tp->ecn_flags |= TCP_ECN_SEEN; -- cgit v1.2.3-59-g8ed1b From c860e997e9170a6d68f9d1e6e2cf61f572191aaf Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 27 Jun 2018 16:04:48 -0700 Subject: tcp: fix Fast Open key endianness Fast Open key could be stored in different endian based on the CPU. Previously hosts in different endianness in a server farm using the same key config (sysctl value) would produce different cookies. This patch fixes it by always storing it as little endian to keep same API for LE hosts. Reported-by: Daniele Iamartino Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d06247ba08b2..af0a857d8352 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -265,8 +265,9 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write, ipv4.sysctl_tcp_fastopen); struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) }; struct tcp_fastopen_context *ctxt; - int ret; u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */ + __le32 key[4]; + int ret, i; tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL); if (!tbl.data) @@ -275,11 +276,14 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write, rcu_read_lock(); ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); if (ctxt) - memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); + memcpy(key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); else - memset(user_key, 0, sizeof(user_key)); + memset(key, 0, sizeof(key)); rcu_read_unlock(); + for (i = 0; i < ARRAY_SIZE(key); i++) + user_key[i] = le32_to_cpu(key[i]); + snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x", user_key[0], user_key[1], user_key[2], user_key[3]); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); @@ -290,13 +294,17 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write, ret = -EINVAL; goto bad_key; } - tcp_fastopen_reset_cipher(net, NULL, user_key, + + for (i = 0; i < ARRAY_SIZE(user_key); i++) + key[i] = cpu_to_le32(user_key[i]); + + tcp_fastopen_reset_cipher(net, NULL, key, TCP_FASTOPEN_KEY_LENGTH); } bad_key: pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n", - user_key[0], user_key[1], user_key[2], user_key[3], + user_key[0], user_key[1], user_key[2], user_key[3], (char *)tbl.data, ret); kfree(tbl.data); return ret; -- cgit v1.2.3-59-g8ed1b From 1236f22fbae15df3736ab4a984c64c0c6ee6254c Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Fri, 29 Jun 2018 13:07:53 +0300 Subject: tcp: prevent bogus FRTO undos with non-SACK flows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If SACK is not enabled and the first cumulative ACK after the RTO retransmission covers more than the retransmitted skb, a spurious FRTO undo will trigger (assuming FRTO is enabled for that RTO). The reason is that any non-retransmitted segment acknowledged will set FLAG_ORIG_SACK_ACKED in tcp_clean_rtx_queue even if there is no indication that it would have been delivered for real (the scoreboard is not kept with TCPCB_SACKED_ACKED bits in the non-SACK case so the check for that bit won't help like it does with SACK). Having FLAG_ORIG_SACK_ACKED set results in the spurious FRTO undo in tcp_process_loss. We need to use more strict condition for non-SACK case and check that none of the cumulatively ACKed segments were retransmitted to prove that progress is due to original transmissions. Only then keep FLAG_ORIG_SACK_ACKED set, allowing FRTO undo to proceed in non-SACK case. (FLAG_ORIG_SACK_ACKED is planned to be renamed to FLAG_ORIG_PROGRESS to better indicate its purpose but to keep this change minimal, it will be done in another patch). Besides burstiness and congestion control violations, this problem can result in RTO loop: When the loss recovery is prematurely undoed, only new data will be transmitted (if available) and the next retransmission can occur only after a new RTO which in case of multiple losses (that are not for consecutive packets) requires one RTO per loss to recover. Signed-off-by: Ilpo Järvinen Tested-by: Neal Cardwell Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 045d930d01a9..8e5522c6833a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3181,6 +3181,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (tcp_is_reno(tp)) { tcp_remove_reno_sacks(sk, pkts_acked); + + /* If any of the cumulatively ACKed segments was + * retransmitted, non-SACK case cannot confirm that + * progress was due to original transmission due to + * lack of TCPCB_SACKED_ACKED bits even if some of + * the packets may have been never retransmitted. + */ + if (flag & FLAG_RETRANS_DATA_ACKED) + flag &= ~FLAG_ORIG_SACK_ACKED; } else { int delta; -- cgit v1.2.3-59-g8ed1b From 603d4cf8fe095b1ee78f423d514427be507fb513 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Sat, 30 Jun 2018 17:38:55 +0200 Subject: net: fix use-after-free in GRO with ESP Since the addition of GRO for ESP, gro_receive can consume the skb and return -EINPROGRESS. In that case, the lower layer GRO handler cannot touch the skb anymore. Commit 5f114163f2f5 ("net: Add a skb_gro_flush_final helper.") converted some of the gro_receive handlers that can lead to ESP's gro_receive so that they wouldn't access the skb when -EINPROGRESS is returned, but missed other spots, mainly in tunneling protocols. This patch finishes the conversion to using skb_gro_flush_final(), and adds a new helper, skb_gro_flush_final_remcsum(), used in VXLAN and GUE. Fixes: 5f114163f2f5 ("net: Add a skb_gro_flush_final helper.") Signed-off-by: Sabrina Dubroca Reviewed-by: Stefano Brivio Signed-off-by: David S. Miller --- drivers/net/geneve.c | 2 +- drivers/net/vxlan.c | 4 +--- include/linux/netdevice.h | 20 ++++++++++++++++++++ net/8021q/vlan.c | 2 +- net/ipv4/fou.c | 4 +--- net/ipv4/gre_offload.c | 2 +- net/ipv4/udp_offload.c | 2 +- 7 files changed, 26 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 750eaa53bf0c..ada33c2d9ac2 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -476,7 +476,7 @@ static struct sk_buff **geneve_gro_receive(struct sock *sk, out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_flush_final(skb, pp, flush); return pp; } diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index aee0e60471f1..f6bb1d54d4bd 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -623,9 +623,7 @@ static struct sk_buff **vxlan_gro_receive(struct sock *sk, flush = 0; out: - skb_gro_remcsum_cleanup(skb, &grc); - skb->remcsum_offload = 0; - NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_flush_final_remcsum(skb, pp, flush, &grc); return pp; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ec9850c7936..3d0cc0b5cec2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2789,11 +2789,31 @@ static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp, if (PTR_ERR(pp) != -EINPROGRESS) NAPI_GRO_CB(skb)->flush |= flush; } +static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb, + struct sk_buff **pp, + int flush, + struct gro_remcsum *grc) +{ + if (PTR_ERR(pp) != -EINPROGRESS) { + NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_remcsum_cleanup(skb, grc); + skb->remcsum_offload = 0; + } +} #else static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp, int flush) { NAPI_GRO_CB(skb)->flush |= flush; } +static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb, + struct sk_buff **pp, + int flush, + struct gro_remcsum *grc) +{ + NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_remcsum_cleanup(skb, grc); + skb->remcsum_offload = 0; +} #endif static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 73a65789271b..8ccee3d01822 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -693,7 +693,7 @@ static struct sk_buff **vlan_gro_receive(struct sk_buff **head, out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_flush_final(skb, pp, flush); return pp; } diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 1540db65241a..c9ec1603666b 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -448,9 +448,7 @@ next_proto: out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush |= flush; - skb_gro_remcsum_cleanup(skb, &grc); - skb->remcsum_offload = 0; + skb_gro_flush_final_remcsum(skb, pp, flush, &grc); return pp; } diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 1859c473b21a..6a7d980105f6 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -223,7 +223,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_flush_final(skb, pp, flush); return pp; } diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 92dc9e5a7ff3..69c54540d5b4 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -394,7 +394,7 @@ unflush: out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_flush_final(skb, pp, flush); return pp; } EXPORT_SYMBOL(udp_gro_receive); -- cgit v1.2.3-59-g8ed1b