From b3de7559afbb7a8a35b4be975a6adf6c5e3cdca0 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Fri, 24 Sep 2010 13:22:06 +0000 Subject: tcp: fix TSO FACK loss marking in tcp_mark_head_lost MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When TCP uses FACK algorithm to mark lost packets in tcp_mark_head_lost(), if the number of packets in the (TSO) skb is greater than the number of packets that should be marked lost, TCP incorrectly exits the loop and marks no packets lost in the skb. This underestimates tp->lost_out and affects the recovery/retransmission. This patch fargments the skb and marks the correct amount of packets lost. Signed-off-by: Yuchung Cheng Acked-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 149e79ac2891..b55f60f6fcbe 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2545,7 +2545,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets) cnt += tcp_skb_pcount(skb); if (cnt > packets) { - if (tcp_is_sack(tp) || (oldcnt >= packets)) + if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) || + (oldcnt >= packets)) break; mss = skb_shinfo(skb)->gso_size; -- cgit v1.2.3-59-g8ed1b From 7e1b33e5ea392dfc984fc63b76ca75acbf249dcd Mon Sep 17 00:00:00 2001 From: Ulrich Weber Date: Mon, 27 Sep 2010 15:02:18 -0700 Subject: ipv6: add IPv6 to neighbour table overflow warning IPv4 and IPv6 have separate neighbour tables, so the warning messages should be distinguishable. [ Add a suitable message prefix on the ipv4 side as well -DaveM ] Signed-off-by: Ulrich Weber Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- net/ipv6/route.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6298f75d5e93..ac6559cb54f9 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1231,7 +1231,7 @@ restart: } if (net_ratelimit()) - printk(KERN_WARNING "Neighbour table overflow.\n"); + printk(KERN_WARNING "ipv4: Neighbour table overflow.\n"); rt_drop(rt); return -ENOBUFS; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d126365ac046..8323136bdc54 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -670,7 +670,7 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *dad if (net_ratelimit()) printk(KERN_WARNING - "Neighbour table overflow.\n"); + "ipv6: Neighbour table overflow.\n"); dst_free(&rt->dst); return NULL; } -- cgit v1.2.3-59-g8ed1b From 01db403cf99f739f86903314a489fb420e0e254f Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 27 Sep 2010 20:24:54 -0700 Subject: tcp: Fix >4GB writes on 64-bit. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes kernel bugzilla #16603 tcp_sendmsg() truncates iov_len to an 'int' which a 4GB write to write zero bytes, for example. There is also the problem higher up of how verify_iovec() works. It wants to prevent the total length from looking like an error return value. However it does this using 'int', but syscalls return 'long' (and thus signed 64-bit on 64-bit machines). So it could trigger false-positives on 64-bit as written. So fix it to use 'long'. Reported-by: Olaf Bonorden Reported-by: Daniel Büse Reported-by: Andrew Morton Signed-off-by: David S. Miller --- include/linux/socket.h | 2 +- net/core/iovec.c | 5 +++-- net/ipv4/tcp.c | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/socket.h b/include/linux/socket.h index a2fada9becb6..a8f56e1ec760 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -322,7 +322,7 @@ extern int csum_partial_copy_fromiovecend(unsigned char *kdata, int offset, unsigned int len, __wsum *csump); -extern int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode); +extern long verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode); extern int memcpy_toiovec(struct iovec *v, unsigned char *kdata, int len); extern int memcpy_toiovecend(const struct iovec *v, unsigned char *kdata, int offset, int len); diff --git a/net/core/iovec.c b/net/core/iovec.c index 1cd98df412df..e6b133b77ccb 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -35,9 +35,10 @@ * in any case. */ -int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode) +long verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode) { - int size, err, ct; + int size, ct; + long err; if (m->msg_namelen) { if (mode == VERIFY_READ) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 95d75d443927..f115ea68a4ef 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -943,7 +943,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, sg = sk->sk_route_caps & NETIF_F_SG; while (--iovlen >= 0) { - int seglen = iov->iov_len; + size_t seglen = iov->iov_len; unsigned char __user *from = iov->iov_base; iov++; -- cgit v1.2.3-59-g8ed1b From 4d22f7d372f5769c6c0149e427ed6353e2dcfe61 Mon Sep 17 00:00:00 2001 From: Damian Lukowski Date: Tue, 28 Sep 2010 13:08:32 -0700 Subject: net-2.6: SYN retransmits: Add new parameter to retransmits_timed_out() Fixes kernel Bugzilla Bug 18952 This patch adds a syn_set parameter to the retransmits_timed_out() routine and updates its callers. If not set, TCP_RTO_MIN is taken as the calculation basis as before. If set, TCP_TIMEOUT_INIT is used instead, so that sysctl_syn_retries represents the actual amount of SYN retransmissions in case no SYNACKs are received when establishing a new connection. Signed-off-by: Damian Lukowski Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index c35b469e851c..74c54b30600f 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -135,13 +135,16 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) /* This function calculates a "timeout" which is equivalent to the timeout of a * TCP connection after "boundary" unsuccessful, exponentially backed-off - * retransmissions with an initial RTO of TCP_RTO_MIN. + * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if + * syn_set flag is set. */ static bool retransmits_timed_out(struct sock *sk, - unsigned int boundary) + unsigned int boundary, + bool syn_set) { unsigned int timeout, linear_backoff_thresh; unsigned int start_ts; + unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN; if (!inet_csk(sk)->icsk_retransmits) return false; @@ -151,12 +154,12 @@ static bool retransmits_timed_out(struct sock *sk, else start_ts = tcp_sk(sk)->retrans_stamp; - linear_backoff_thresh = ilog2(TCP_RTO_MAX/TCP_RTO_MIN); + linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); if (boundary <= linear_backoff_thresh) - timeout = ((2 << boundary) - 1) * TCP_RTO_MIN; + timeout = ((2 << boundary) - 1) * rto_base; else - timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN + + timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + (boundary - linear_backoff_thresh) * TCP_RTO_MAX; return (tcp_time_stamp - start_ts) >= timeout; @@ -167,14 +170,15 @@ static int tcp_write_timeout(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); int retry_until; - bool do_reset; + bool do_reset, syn_set = 0; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) dst_negative_advice(sk); retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; + syn_set = 1; } else { - if (retransmits_timed_out(sk, sysctl_tcp_retries1)) { + if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0)) { /* Black hole detection */ tcp_mtu_probing(icsk, sk); @@ -187,14 +191,14 @@ static int tcp_write_timeout(struct sock *sk) retry_until = tcp_orphan_retries(sk, alive); do_reset = alive || - !retransmits_timed_out(sk, retry_until); + !retransmits_timed_out(sk, retry_until, 0); if (tcp_out_of_resources(sk, do_reset)) return 1; } } - if (retransmits_timed_out(sk, retry_until)) { + if (retransmits_timed_out(sk, retry_until, syn_set)) { /* Has it gone just too far? */ tcp_write_err(sk); return 1; @@ -436,7 +440,7 @@ out_reset_timer: icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); - if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1)) + if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0)) __sk_dst_reset(sk); out:; -- cgit v1.2.3-59-g8ed1b From 68c1f3a96c32a4fe15ebadae45c8145a5e5a66d2 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 28 Sep 2010 22:37:56 -0700 Subject: ip_gre: Fix dependencies wrt. ipv6. The GRE tunnel driver needs to invoke icmpv6 helpers in the ipv6 stack when ipv6 support is enabled. Therefore if IPV6 is enabled, we have to enforce that GRE's enabling (modular or static) matches that of ipv6. Reported-by: Patrick McHardy Reported-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 571f8950ed06..72380a30d1c8 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -217,6 +217,7 @@ config NET_IPIP config NET_IPGRE tristate "IP: GRE tunnels over IP" + depends on IPV6 || IPV6=n help Tunneling means encapsulating data of one protocol type within another protocol and sending it over a channel that understands the -- cgit v1.2.3-59-g8ed1b From c5d3557103f8bef81d7a150ab9cc970099cd58a2 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sun, 3 Oct 2010 15:37:42 +0000 Subject: Revert "ipv4: Make INET_LRO a bool instead of tristate." This reverts commit e81963b180ac502fda0326edf059b1e29cdef1a2. LRO is now deprecated in favour of GRO, and only a few drivers use it, so it is desirable to build it as a module in distribution kernels. The original change to prevent building it as a module was made in an attempt to avoid the case where some dependents are set to y and some to m, and INET_LRO can be set to m rather than y. However, the Kconfig system will reliably set INET_LRO=y in this case. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 72380a30d1c8..7cd7760144f7 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -413,7 +413,7 @@ config INET_XFRM_MODE_BEET If unsure, say Y. config INET_LRO - bool "Large Receive Offload (ipv4/tcp)" + tristate "Large Receive Offload (ipv4/tcp)" default y ---help--- Support for Large Receive Offload (ipv4/tcp). -- cgit v1.2.3-59-g8ed1b From 5b7c84066733c5dfb0e4016d939757b38de189e4 Mon Sep 17 00:00:00 2001 From: David Stevens Date: Thu, 30 Sep 2010 14:29:40 +0000 Subject: ipv4: correct IGMP behavior on v3 query during v2-compatibility mode A recent patch to allow IGMPv2 responses to IGMPv3 queries bypasses length checks for valid query lengths, incorrectly resets the v2_seen timer, and does not support IGMPv1. The following patch responds with a v2 report as required by IGMPv2 while correcting the other problems introduced by the patch. Signed-Off-By: David L Stevens Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 1fdcacd36ce7..2a4bb76f2132 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -834,7 +834,7 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, int mark = 0; - if (len == 8 || IGMP_V2_SEEN(in_dev)) { + if (len == 8) { if (ih->code == 0) { /* Alas, old v1 router presents here. */ @@ -856,6 +856,18 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, igmpv3_clear_delrec(in_dev); } else if (len < 12) { return; /* ignore bogus packet; freed by caller */ + } else if (IGMP_V1_SEEN(in_dev)) { + /* This is a v3 query with v1 queriers present */ + max_delay = IGMP_Query_Response_Interval; + group = 0; + } else if (IGMP_V2_SEEN(in_dev)) { + /* this is a v3 query with v2 queriers present; + * Interpretation of the max_delay code is problematic here. + * A real v2 host would use ih_code directly, while v3 has a + * different encoding. We use the v3 encoding as more likely + * to be intended in a v3 query. + */ + max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE); } else { /* v3 */ if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) return; -- cgit v1.2.3-59-g8ed1b