From 044a832a7779c0638bea2d0fea901c055b995f4a Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 12 Jan 2015 13:38:49 +0100 Subject: xfrm: Fix local error reporting crash with interfamily tunnels We set the outer mode protocol too early. As a result, the local error handler might dispatch to the wrong address family and report the error to a wrong socket type. We fix this by setting the outer protocol to the skb after we accessed the inner mode for the last time, right before we do the actual encapsulation where we switch finally to the outer mode. Reported-by: Chris Ruehl Tested-by: Chris Ruehl Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index d5f6bd9a210a..dab73813cb92 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -63,6 +63,7 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb) return err; IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; + skb->protocol = htons(ETH_P_IP); return x->outer_mode->output2(x, skb); } @@ -71,7 +72,6 @@ EXPORT_SYMBOL(xfrm4_prepare_output); int xfrm4_output_finish(struct sk_buff *skb) { memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - skb->protocol = htons(ETH_P_IP); #ifdef CONFIG_NETFILTER IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; -- cgit v1.2.3-59-g8ed1b From 9949afa42be0b76f5832db112ce51bb6b35b2abb Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Tue, 10 Mar 2015 17:17:03 -0400 Subject: tcp: fix tcp_cong_avoid_ai() credit accumulation bug with decreases in w The recent change to tcp_cong_avoid_ai() to handle stretch ACKs introduced a bug where snd_cwnd_cnt could accumulate a very large value while w was large, and then if w was reduced snd_cwnd could be incremented by a large delta, leading to a large burst and high packet loss. This was tickled when CUBIC's bictcp_update() sets "ca->cnt = 100 * cwnd". 
This bug crept in while preparing the upstream version of 814d488c6126. Testing: This patch has been tested in datacenter netperf transfers and live youtube.com and google.com servers. Fixes: 814d488c6126 ("tcp: fix the timid additive increase on stretch ACKs") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_cong.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index d694088214cd..62856e185a93 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -378,6 +378,12 @@ EXPORT_SYMBOL_GPL(tcp_slow_start); */ void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked) { + /* If credits accumulated at a higher w, apply them gently now. */ + if (tp->snd_cwnd_cnt >= w) { + tp->snd_cwnd_cnt = 0; + tp->snd_cwnd++; + } + tp->snd_cwnd_cnt += acked; if (tp->snd_cwnd_cnt >= w) { u32 delta = tp->snd_cwnd_cnt / w; -- cgit v1.2.3-59-g8ed1b From d578e18ce93f5d33a7120fd57c453e22a4c0fc37 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Tue, 10 Mar 2015 17:17:04 -0400 Subject: tcp: restore 1.5x per RTT limit to CUBIC cwnd growth in congestion avoidance Commit 814d488c6126 ("tcp: fix the timid additive increase on stretch ACKs") fixed a bug where tcp_cong_avoid_ai() would either credit a connection with an increase of snd_cwnd_cnt, or increase snd_cwnd, but not both, resulting in cwnd increasing by 1 packet on at most every alternate invocation of tcp_cong_avoid_ai(). Although the commit correctly implemented the CUBIC algorithm, which can increase cwnd by as much as 1 packet per 1 packet ACKed (2x per RTT), in practice that could be too aggressive: in tests on network paths with small buffers, YouTube server retransmission rates nearly doubled. This commit restores CUBIC to a maximum cwnd growth rate of 1 packet per 2 packets ACKed (1.5x per RTT). In YouTube tests this restored retransmit rates to low levels. 
Testing: This patch has been tested in datacenter netperf transfers and live youtube.com and google.com servers. Fixes: 9cd981dcf174 ("tcp: fix stretch ACK bugs in CUBIC") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 4b276d1ed980..06d3d665a9fd 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -306,8 +306,10 @@ tcp_friendliness: } } - if (ca->cnt == 0) /* cannot be zero */ - ca->cnt = 1; + /* The maximum rate of cwnd increase CUBIC allows is 1 packet per + * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT. + */ + ca->cnt = max(ca->cnt, 2U); } static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) -- cgit v1.2.3-59-g8ed1b From c29390c6dfeee0944ac6b5610ebbe403944378fc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 11 Mar 2015 18:42:02 -0700 Subject: xps: must clear sender_cpu before forwarding John reported that my previous commit added a regression on his router. This is because sender_cpu & napi_id share a common location, so get_xps_queue() can see garbage and perform an out of bound access. We need to make sure sender_cpu is cleared before doing the transmit, otherwise any NIC busy poll enabled (skb_mark_napi_id()) can trigger this bug. Signed-off-by: Eric Dumazet Reported-by: John Bisected-by: John Fixes: 2bd82484bb4c ("xps: fix xps for stacked devices") Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 7 +++++++ net/core/skbuff.c | 2 +- net/ipv4/ip_forward.c | 1 + net/ipv6/ip6_output.c | 1 + 4 files changed, 10 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 30007afe70b3..f54d6659713a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -948,6 +948,13 @@ static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from) to->l4_hash = from->l4_hash; }; +static inline void skb_sender_cpu_clear(struct sk_buff *skb) +{ +#ifdef CONFIG_XPS + skb->sender_cpu = 0; +#endif +} + #ifdef NET_SKBUFF_DATA_USES_OFFSET static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f80507823531..434e78e5254d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4173,7 +4173,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->ignore_df = 0; skb_dst_drop(skb); skb->mark = 0; - skb->sender_cpu = 0; + skb_sender_cpu_clear(skb); skb_init_secmark(skb); secpath_reset(skb); nf_reset(skb); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 787b3c294ce6..d9bc28ac5d1b 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -67,6 +67,7 @@ static int ip_forward_finish(struct sk_buff *skb) if (unlikely(opt->optlen)) ip_forward_options(skb); + skb_sender_cpu_clear(skb); return dst_output(skb); } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 0a04a37305d5..7e80b61b51ff 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -318,6 +318,7 @@ static int ip6_forward_proxy_check(struct sk_buff *skb) static inline int ip6_forward_finish(struct sk_buff *skb) { + skb_sender_cpu_clear(skb); return dst_output(skb); } -- cgit v1.2.3-59-g8ed1b From c8e2c80d7ec00d020320f905822bf49c5ad85250 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 Mar 2015 09:49:59 -0700 Subject: inet_diag: fix possible overflow in 
inet_diag_dump_one_icsk() inet_diag_dump_one_icsk() allocates too small an skb. Add inet_sk_attr_size() helper right before inet_sk_diag_fill() so that it can be updated if/when new attributes are added. iproute2/ss currently does not use this dump_one() interface, this might explain why nobody noticed this problem yet. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 81751f12645f..592aff37366b 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -71,6 +71,20 @@ static inline void inet_diag_unlock_handler( mutex_unlock(&inet_diag_table_mutex); } +static size_t inet_sk_attr_size(void) +{ + return nla_total_size(sizeof(struct tcp_info)) + + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ + + nla_total_size(1) /* INET_DIAG_TOS */ + + nla_total_size(1) /* INET_DIAG_TCLASS */ + + nla_total_size(sizeof(struct inet_diag_meminfo)) + + nla_total_size(sizeof(struct inet_diag_msg)) + + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) + + nla_total_size(TCP_CA_NAME_MAX) + + nla_total_size(sizeof(struct tcpvegas_info)) + + 64; +} + int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, struct inet_diag_req_v2 *req, struct user_namespace *user_ns, @@ -326,9 +340,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s if (err) goto out; - rep = nlmsg_new(sizeof(struct inet_diag_msg) + - sizeof(struct inet_diag_meminfo) + - sizeof(struct tcp_info) + 64, GFP_KERNEL); + rep = nlmsg_new(inet_sk_attr_size(), GFP_KERNEL); if (!rep) { err = -ENOMEM; goto out; -- cgit v1.2.3-59-g8ed1b From cb7cf8a33ff73cf638481d1edf883d8968f934f8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Mar 2015 12:19:24 -0700 Subject: inet: Clean up inet_csk_wait_for_connect() vs. 
might_sleep() I got the following trace with current net-next kernel : [14723.885290] WARNING: CPU: 26 PID: 22658 at kernel/sched/core.c:7285 __might_sleep+0x89/0xa0() [14723.885325] do not call blocking ops when !TASK_RUNNING; state=1 set at [] prepare_to_wait_exclusive+0x34/0xa0 [14723.885355] CPU: 26 PID: 22658 Comm: netserver Not tainted 4.0.0-dbg-DEV #1379 [14723.885359] ffffffff81a223a8 ffff881fae9e7ca8 ffffffff81650b5d 0000000000000001 [14723.885364] ffff881fae9e7cf8 ffff881fae9e7ce8 ffffffff810a72e7 0000000000000000 [14723.885367] ffffffff81a57620 000000000000093a 0000000000000000 ffff881fae9e7e64 [14723.885371] Call Trace: [14723.885377] [] dump_stack+0x4c/0x65 [14723.885382] [] warn_slowpath_common+0x97/0xe0 [14723.885386] [] warn_slowpath_fmt+0x46/0x50 [14723.885390] [] ? trace_hardirqs_on_caller+0x10d/0x1d0 [14723.885393] [] ? prepare_to_wait_exclusive+0x34/0xa0 [14723.885396] [] ? prepare_to_wait_exclusive+0x34/0xa0 [14723.885399] [] __might_sleep+0x89/0xa0 [14723.885403] [] lock_sock_nested+0x36/0xb0 [14723.885406] [] ? release_sock+0x173/0x1c0 [14723.885411] [] inet_csk_accept+0x157/0x2a0 [14723.885415] [] ? abort_exclusive_wait+0xc0/0xc0 [14723.885419] [] inet_accept+0x2d/0x150 [14723.885424] [] SYSC_accept4+0xff/0x210 [14723.885428] [] ? retint_swapgs+0xe/0x44 [14723.885431] [] ? trace_hardirqs_on_caller+0x10d/0x1d0 [14723.885437] [] ? trace_hardirqs_on_thunk+0x3a/0x3f [14723.885441] [] SyS_accept+0x10/0x20 [14723.885444] [] system_call_fastpath+0x12/0x17 [14723.885447] ---[ end trace ff74cd83355b1873 ]--- In commit 26cabd31259ba43f68026ce3f62b78094124333f Peter added a sched_annotate_sleep() in sk_wait_event() Is the following patch needed as well ? Alternative would be to use sk_wait_event() from inet_csk_wait_for_connect() Signed-off-by: Eric Dumazet Acked-by: Peter Zijlstra (Intel) Signed-off-by: David S. 
Miller --- net/ipv4/inet_connection_sock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 14d02ea905b6..3e44b9b0b78e 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -268,6 +268,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) release_sock(sk); if (reqsk_queue_empty(&icsk->icsk_accept_queue)) timeo = schedule_timeout(timeo); + sched_annotate_sleep(); lock_sock(sk); err = 0; if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) -- cgit v1.2.3-59-g8ed1b From d22e1537181188e5dc8cbc51451832625035bdc2 Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Thu, 19 Mar 2015 19:19:30 -0400 Subject: tcp: fix tcp fin memory accounting tcp_send_fin() does not account for the memory it allocates properly, so sk_forward_alloc can be negative in cases where we've sent a FIN: ss example output (ss -amn | grep -B1 f4294): tcp FIN-WAIT-1 0 1 192.168.0.1:45520 192.0.2.1:8080 skmem:(r0,rb87380,t0,tb87380,f4294966016,w1280,o0,bl0) Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a2a796c5536b..1db253e36045 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2773,15 +2773,11 @@ void tcp_send_fin(struct sock *sk) } else { /* Socket is locked, keep trying until memory is available. */ for (;;) { - skb = alloc_skb_fclone(MAX_TCP_HEADER, - sk->sk_allocation); + skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); if (skb) break; yield(); } - - /* Reserve space for headers and prepare control bits. */ - skb_reserve(skb, MAX_TCP_HEADER); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ tcp_init_nondata_skb(skb, tp->write_seq, TCPHDR_ACK | TCPHDR_FIN); -- cgit v1.2.3-59-g8ed1b