Merge branch 'ipv4-multipath-hash'

Peter Nørlund says: ==================== ipv4: Hash-based multipath routing When the routing cache was removed in 3.6, the IPv4 multipath algorithm changed from more or less being destination-based into being quasi-random per-packet scheduling. This increases the risk of out-of-order packets and makes it impossible to use multipath together with anycast services. This patch series replaces the old implementation with flow-based load balancing based on a hash over the source and destination addresses. Distribution of the hash is done with thresholds as described in RFC 2992. This reduces the disruption when a path is added/remove when having more than two paths. To futher the chance of successful usage in conjuction with anycast, ICMP error packets are hashed over the inner IP addresses. This ensures that PMTU will work together with anycast or load-balancers such as IPVS. Port numbers are not considered since fragments could cause problems with anycast and IPVS. Relying on the DF-flag for TCP packets is also insufficient, since ICMP inspection effectively extracts information from the opposite flow which might have a different state of the DF-flag. This is also why the RSS hash is not used. These are typically based on the NDIS RSS spec which mandates TCP support. Measurements of the additional overhead of a two-path multipath (p_mkroute_input excl. __mkroute_input) on a Xeon X3550 (4 cores, 2.66GHz): Original per-packet: ~394 cycles/packet L3 hash: ~76 cycles/packet Changes in v5: - Fixed compilation error Changes in v4: - Functions take hash directly instead of func ptr - Added inline hash function - Added dummy macros to minimize ifdefs - Use upper 31 bits of hash instead of lower Changes in v3: - Multipath algorithm is no longer configurable (always L3) - Added random seed to hash - Moved ICMP inspection to isolated function - Ignore source quench packets (deprecated as per RFC 6633) Changes in v2: - Replaced 8-bit xor hash with 31-bit jenkins hash - Don't scale weights (since 31-bit) - Avoided unnecesary renaming of variables - Rely on DF-bit instead of fragment offset when checking for fragmentation - upper_bound is now inclusive to avoid overflow - Use a callback to postpone extracting flow information until necessary - Skipped ICMP inspection entirely with L4 hashing - Handle newly added sysctl ignore_routes_with_linkdown ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
author: David S. Miller <davem@davemloft.net> 2015-10-05 03:00:26 -0700
committer: David S. Miller <davem@davemloft.net> 2015-10-05 03:00:26 -0700
commit: 07355737a8badd951e6b72aa8609a2d6eed0a7e7 (patch)
tree: 16eb6cd39f9475f87223cbca93da02f1853570bd /net/ipv4/route.c
parent: Merge branch 'tcp-listener-fixes-and-improvement' (diff)
parent: ipv4: ICMP packet inspection for multipath (diff)
download: linux-dev-07355737a8badd951e6b72aa8609a2d6eed0a7e7.tar.xz
linux-dev-07355737a8badd951e6b72aa8609a2d6eed0a7e7.zip
1 files changed, 59 insertions, 6 deletions
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 76ca4e75f785..54297d3a0559 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1651,6 +1651,48 @@ out:
 	return err;
 }
 
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+/* To make ICMP packets follow the right flow, the multipath hash is
+ * calculated from the inner IP addresses in reverse order.
+ */
+static int ip_multipath_icmp_hash(struct sk_buff *skb)
+{
+	const struct iphdr *outer_iph = ip_hdr(skb);
+	struct icmphdr _icmph;
+	const struct icmphdr *icmph;
+	struct iphdr _inner_iph;
+	const struct iphdr *inner_iph;
+
+	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
+		goto standard_hash;
+
+	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
+				   &_icmph);
+	if (!icmph)
+		goto standard_hash;
+
+	if (icmph->type != ICMP_DEST_UNREACH &&
+	    icmph->type != ICMP_REDIRECT &&
+	    icmph->type != ICMP_TIME_EXCEEDED &&
+	    icmph->type != ICMP_PARAMETERPROB) {
+		goto standard_hash;
+	}
+
+	inner_iph = skb_header_pointer(skb,
+				       outer_iph->ihl * 4 + sizeof(_icmph),
+				       sizeof(_inner_iph), &_inner_iph);
+	if (!inner_iph)
+		goto standard_hash;
+
+	return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
+
+standard_hash:
+	return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
+}
+
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+
 static int ip_mkroute_input(struct sk_buff *skb,
 			    struct fib_result *res,
 			    const struct flowi4 *fl4,
@@ -1658,8 +1700,15 @@ static int ip_mkroute_input(struct sk_buff *skb,
 			    __be32 daddr, __be32 saddr, u32 tos)
 {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-	if (res->fi && res->fi->fib_nhs > 1)
-		fib_select_multipath(res);
+	if (res->fi && res->fi->fib_nhs > 1) {
+		int h;
+
+		if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
+			h = ip_multipath_icmp_hash(skb);
+		else
+			h = fib_multipath_hash(saddr, daddr);
+		fib_select_multipath(res, h);
+	}
 #endif
 
 	/* create a routing cache entry */
@@ -2026,7 +2075,8 @@ add:
  * Major route resolver routine.
  */
 
-struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
+struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
+					  int mp_hash)
 {
 	struct net_device *dev_out = NULL;
 	__u8 tos = RT_FL_TOS(fl4);
@@ -2189,8 +2239,11 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 	}
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
-		fib_select_multipath(&res);
+	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) {
+		if (mp_hash < 0)
+			mp_hash = fib_multipath_hash(fl4->saddr, fl4->daddr);
+		fib_select_multipath(&res, mp_hash);
+	}
 	else
 #endif
 	if (!res.prefixlen &&
@@ -2212,7 +2265,7 @@ out:
 	rcu_read_unlock();
 	return rth;
 }
-EXPORT_SYMBOL_GPL(__ip_route_output_key);
+EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
 
 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
 {
author	David S. Miller <davem@davemloft.net>	2015-10-05 03:00:26 -0700
committer	David S. Miller <davem@davemloft.net>	2015-10-05 03:00:26 -0700
commit	07355737a8badd951e6b72aa8609a2d6eed0a7e7 (patch)
tree	16eb6cd39f9475f87223cbca93da02f1853570bd /net/ipv4/route.c
parent	Merge branch 'tcp-listener-fixes-and-improvement' (diff)
parent	ipv4: ICMP packet inspection for multipath (diff)
download	linux-dev-07355737a8badd951e6b72aa8609a2d6eed0a7e7.tar.xz linux-dev-07355737a8badd951e6b72aa8609a2d6eed0a7e7.zip