From 08dcdbf6a7b9d14c2302c5bd0c5390ddf122f664 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 21 Feb 2013 12:18:52 +0000
Subject: ipv6: use a stronger hash for tcp

It looks like it's possible to open thousands of TCP IPv6
sessions on a server, all landing in a single slot of the TCP hash
table. Incoming packets then have to look up sockets in a very
long list.

We should hash all bits from foreign IPv6 addresses, using
a salt and hash mix, not a simple XOR.

inet6_ehashfn() can also separately use the ports, instead
of xoring them.

Reported-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet6_hashtables.h |  8 ++++----
 include/net/inet_sock.h        |  1 +
 include/net/ipv6.h             | 12 ++++++++++++
 3 files changed, 17 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 7ca75cbbf75e..fd4ee016ba5c 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -28,16 +28,16 @@
 
 struct inet_hashinfo;
 
-/* I have no idea if this is a good hash for v6 or not. -DaveM */
 static inline unsigned int inet6_ehashfn(struct net *net,
 				const struct in6_addr *laddr, const u16 lport,
 				const struct in6_addr *faddr, const __be16 fport)
 {
-	u32 ports = (lport ^ (__force u16)fport);
+	u32 ports = (((u32)lport) << 16) | (__force u32)fport;
 
 	return jhash_3words((__force u32)laddr->s6_addr32[3],
-			    (__force u32)faddr->s6_addr32[3],
-			    ports, inet_ehash_secret + net_hash_mix(net));
+			    ipv6_addr_jhash(faddr),
+			    ports,
+			    inet_ehash_secret + net_hash_mix(net));
 }
 
 static inline int inet6_sk_ehashfn(const struct sock *sk)
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index a4196cbc84ec..7235ae73a1e8 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -203,6 +203,7 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to,
 extern int inet_sk_rebuild_header(struct sock *sk);
 
 extern u32 inet_ehash_secret;
+extern u32 ipv6_hash_secret;
 extern void build_ehash_secret(void);
 
 static inline unsigned int inet_ehashfn(struct net *net,
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 851d5412a299..64d12e77719a 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -15,6 +15,7 @@
 
 #include <linux/ipv6.h>
 #include <linux/hardirq.h>
+#include <linux/jhash.h>
 #include <net/if_inet6.h>
 #include <net/ndisc.h>
 #include <net/flow.h>
@@ -514,6 +515,17 @@ static inline u32 ipv6_addr_hash(const struct in6_addr *a)
 #endif
 }
 
+/* more secure version of ipv6_addr_hash() */
+static inline u32 ipv6_addr_jhash(const struct in6_addr *a)
+{
+	u32 v = (__force u32)a->s6_addr32[0] ^ (__force u32)a->s6_addr32[1];
+
+	return jhash_3words(v,
+			    (__force u32)a->s6_addr32[2],
+			    (__force u32)a->s6_addr32[3],
+			    ipv6_hash_secret);
+}
+
 static inline bool ipv6_addr_loopback(const struct in6_addr *a)
 {
 #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
-- 
cgit v1.2.3-59-g8ed1b


From 5b0520425e5ea81ba95ec486dd6bbb59a09fff0e Mon Sep 17 00:00:00 2001
From: Li Wei <lw@cn.fujitsu.com>
Date: Thu, 21 Feb 2013 22:18:44 +0000
Subject: ipv4: fix error handling in icmp_protocol.

ICMP errors are now handled in each transport protocol's err_handler;
for the ICMP protocol, that is ping_err. Since this handler only cares
about ICMP errors triggered by echo requests, errors triggered
by echo replies (which are sent by the kernel) are silently ignored.

So wrap ping_err() with icmp_err() to deal with those icmp errors.

Signed-off-by: Li Wei <lw@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/icmp.h |  1 +
 net/ipv4/af_inet.c |  2 +-
 net/ipv4/icmp.c    | 23 +++++++++++++++++++++++
 3 files changed, 25 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/icmp.h b/include/net/icmp.h
index 9ac2524d1402..081439fd070e 100644
--- a/include/net/icmp.h
+++ b/include/net/icmp.h
@@ -41,6 +41,7 @@ struct net;
 
 extern void	icmp_send(struct sk_buff *skb_in,  int type, int code, __be32 info);
 extern int	icmp_rcv(struct sk_buff *skb);
+extern void	icmp_err(struct sk_buff *, u32 info);
 extern int	icmp_init(void);
 extern void	icmp_out_count(struct net *net, unsigned char type);
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 9cbcb94a4c6d..15847e19b7dd 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1577,7 +1577,7 @@ static const struct net_offload udp_offload = {
 
 static const struct net_protocol icmp_protocol = {
 	.handler =	icmp_rcv,
-	.err_handler =	ping_err,
+	.err_handler =	icmp_err,
 	.no_policy =	1,
 	.netns_ok =	1,
 };
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 17ff9fd7cdda..3ac5dff79627 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -934,6 +934,29 @@ error:
 	goto drop;
 }
 
+void icmp_err(struct sk_buff *skb, u32 info)
+{
+	struct iphdr *iph = (struct iphdr *)skb->data;
+	struct icmphdr *icmph = (struct icmphdr *)(skb->data+(iph->ihl<<2));
+	int type = icmp_hdr(skb)->type;
+	int code = icmp_hdr(skb)->code;
+	struct net *net = dev_net(skb->dev);
+
+	/*
+	 * Use ping_err to handle all icmp errors except those
+	 * triggered by ICMP_ECHOREPLY, which is sent by the kernel.
+	 */
+	if (icmph->type != ICMP_ECHOREPLY) {
+		ping_err(skb, info);
+		return;
+	}
+
+	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ICMP, 0);
+	else if (type == ICMP_REDIRECT)
+		ipv4_redirect(skb, net, 0, 0, IPPROTO_ICMP, 0);
+}
+
 /*
  *	This table is the definition of how we handle ICMP.
  */
-- 
cgit v1.2.3-59-g8ed1b


From 4cfb04854d053e4d6391d7f84495f48082342362 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 22 Feb 2013 07:43:35 +0000
Subject: net: fix possible deadlock in sum_frag_mem_limit

Dave Jones reported a lockdep splat occurring in IP defrag code.

commit 6d7b857d541ecd1d (net: use lib/percpu_counter API for
fragmentation mem accounting) added a possible deadlock.

Because percpu_counter_sum_positive() needs to acquire
a lock that can be used from softirq, we need to disable BH
in sum_frag_mem_limit()

Reported-by: Dave Jones <davej@redhat.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 3f237db0a426..76c3fe5ecc2e 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -114,7 +114,13 @@ static inline void init_frag_mem_limit(struct netns_frags *nf)
 
 static inline int sum_frag_mem_limit(struct netns_frags *nf)
 {
-	return percpu_counter_sum_positive(&nf->mem);
+	int res;
+
+	local_bh_disable();
+	res = percpu_counter_sum_positive(&nf->mem);
+	local_bh_enable();
+
+	return res;
 }
 
 static inline void inet_frag_lru_move(struct inet_frag_queue *q)
-- 
cgit v1.2.3-59-g8ed1b


From 490ab08127cebc25e3a260a74556b56ce5f47c0f Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Fri, 22 Feb 2013 07:30:30 +0000
Subject: IP_GRE: Fix IP-Identification.

GRE-GSO generates ip fragments with id 0,2,3,4... for every
GSO packet, which is not correct. The following patch fixes it
by setting the ip-header id to a unique id if fragments are allowed.
As Eric Dumazet suggested, it is optimized by using the inner
ip-header id whenever the inner packet is ipv4.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipip.h | 17 +++++++++++++++++
 net/ipv4/af_inet.c |  6 ++++--
 net/ipv4/ip_gre.c  |  3 ++-
 3 files changed, 23 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ipip.h b/include/net/ipip.h
index 21947cf4fa46..fd19625ff99d 100644
--- a/include/net/ipip.h
+++ b/include/net/ipip.h
@@ -71,4 +71,21 @@ static inline void iptunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 }
 
+static inline void tunnel_ip_select_ident(struct sk_buff *skb,
+					  const struct iphdr  *old_iph,
+					  struct dst_entry *dst)
+{
+	struct iphdr *iph = ip_hdr(skb);
+
+	if (iph->frag_off & htons(IP_DF))
+		iph->id	= 0;
+	else {
+		/* Use inner packet iph-id if possible. */
+		if (skb->protocol == htons(ETH_P_IP) && old_iph->id)
+			iph->id	= old_iph->id;
+		else
+			__ip_select_ident(iph, dst,
+					  (skb_shinfo(skb)->gso_segs ?: 1) - 1);
+	}
+}
 #endif
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 15847e19b7dd..68f6a94f7661 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1332,8 +1332,10 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 			if (skb->next != NULL)
 				iph->frag_off |= htons(IP_MF);
 			offset += (skb->len - skb->mac_len - iph->ihl * 4);
-		} else
-			iph->id = htons(id++);
+		} else  {
+			if (!(iph->frag_off & htons(IP_DF)))
+				iph->id = htons(id++);
+		}
 		iph->tot_len = htons(skb->len - skb->mac_len);
 		iph->check = 0;
 		iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 5ef4da780ac1..b8bada00d516 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -970,7 +970,8 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 	iph->daddr		=	fl4.daddr;
 	iph->saddr		=	fl4.saddr;
 	iph->ttl		=	ttl;
-	iph->id			=	0;
+
+	tunnel_ip_select_ident(skb, old_iph, &rt->dst);
 
 	if (ttl == 0) {
 		if (skb->protocol == htons(ETH_P_IP))
-- 
cgit v1.2.3-59-g8ed1b