From 3fb5a991916091a908d53608a5899240039fb51e Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 2 Sep 2010 15:42:43 +0000
Subject: cls_cgroup: Fix rcu lockdep warning

Dave reported an rcu lockdep warning on 2.6.35.4 kernel

task->cgroups and task->cgroups->subsys[i] are protected by RCU.
So we avoid accessing invalid pointers here. This might happen,
for example, when you are deref-ing those pointers while someone
move @task from one cgroup to another.

Reported-by: Dave Jones <davej@redhat.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/cls_cgroup.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h
index 726cc3536409..ef6c24a529e1 100644
--- a/include/net/cls_cgroup.h
+++ b/include/net/cls_cgroup.h
@@ -27,11 +27,17 @@ struct cgroup_cls_state
 #ifdef CONFIG_NET_CLS_CGROUP
 static inline u32 task_cls_classid(struct task_struct *p)
 {
+	int classid;
+
 	if (in_interrupt())
 		return 0;
 
-	return container_of(task_subsys_state(p, net_cls_subsys_id),
-			    struct cgroup_cls_state, css)->classid;
+	rcu_read_lock();
+	classid = container_of(task_subsys_state(p, net_cls_subsys_id),
+			       struct cgroup_cls_state, css)->classid;
+	rcu_read_unlock();
+
+	return classid;
 }
 #else
 extern int net_cls_subsys_id;
-- 
cgit v1.2.3-59-g8ed1b


From 6523ce1525e88c598c75a1a6b8c4edddfa9defe8 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sun, 5 Sep 2010 18:02:29 +0000
Subject: ipvs: fix active FTP

- Do not create expectation when forwarding the PORT
  command to avoid blocking the connection. The problem is that
  nf_conntrack_ftp.c:help() tries to create the same expectation later in
  POST_ROUTING and drops the packet with "dropping packet" message after
  failure in nf_ct_expect_related.

- Change ip_vs_update_conntrack to alter the conntrack
  for related connections from real server. If we do not alter the reply in
  this direction the next packet from client sent to vport 20 comes as NEW
  connection. We alter it but may be some collision happens for both
  conntracks and the second conntrack gets destroyed immediately. The
  connection stucks too.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_vs.h             |  3 +++
 net/netfilter/ipvs/ip_vs_core.c |  1 +
 net/netfilter/ipvs/ip_vs_ftp.c  |  6 ------
 net/netfilter/ipvs/ip_vs_xmit.c | 18 ++++++++++++------
 4 files changed, 16 insertions(+), 12 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index a4747a0f7303..f976885f686f 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -955,6 +955,9 @@ static inline __wsum ip_vs_check_diff2(__be16 old, __be16 new, __wsum oldsum)
 	return csum_partial(diff, sizeof(diff), oldsum);
 }
 
+extern void ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp,
+				   int outin);
+
 #endif /* __KERNEL__ */
 
 #endif	/* _NET_IP_VS_H */
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 4f8ddba48011..4c2f89df5cce 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -924,6 +924,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 
 	ip_vs_out_stats(cp, skb);
 	ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
+	ip_vs_update_conntrack(skb, cp, 0);
 	ip_vs_conn_put(cp);
 
 	skb->ipvs_property = 1;
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 33b329bfc2d2..7e9af5b76d9e 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -410,7 +410,6 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	union nf_inet_addr to;
 	__be16 port;
 	struct ip_vs_conn *n_cp;
-	struct nf_conn *ct;
 
 #ifdef CONFIG_IP_VS_IPV6
 	/* This application helper doesn't work with IPv6 yet,
@@ -497,11 +496,6 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 		ip_vs_control_add(n_cp, cp);
 	}
 
-	ct = (struct nf_conn *)skb->nfct;
-	if (ct && ct != &nf_conntrack_untracked)
-		ip_vs_expect_related(skb, ct, n_cp,
-				     IPPROTO_TCP, &n_cp->dport, 1);
-
 	/*
 	 *	Move tunnel to listen state
 	 */
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 21e1a5e9b9d3..49df6bea6a2d 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -349,8 +349,8 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 }
 #endif
 
-static void
-ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp)
+void
+ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
 {
 	struct nf_conn *ct = (struct nf_conn *)skb->nfct;
 	struct nf_conntrack_tuple new_tuple;
@@ -365,11 +365,17 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp)
 	 * real-server we will see RIP->DIP.
 	 */
 	new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
-	new_tuple.src.u3 = cp->daddr;
+	if (outin)
+		new_tuple.src.u3 = cp->daddr;
+	else
+		new_tuple.dst.u3 = cp->vaddr;
 	/*
 	 * This will also take care of UDP and other protocols.
 	 */
-	new_tuple.src.u.tcp.port = cp->dport;
+	if (outin)
+		new_tuple.src.u.tcp.port = cp->dport;
+	else
+		new_tuple.dst.u.tcp.port = cp->vport;
 	nf_conntrack_alter_reply(ct, &new_tuple);
 }
 
@@ -428,7 +434,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
 
-	ip_vs_update_conntrack(skb, cp);
+	ip_vs_update_conntrack(skb, cp, 1);
 
 	/* FIXME: when application helper enlarges the packet and the length
 	   is larger than the MTU of outgoing device, there will be still
@@ -506,7 +512,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
 
-	ip_vs_update_conntrack(skb, cp);
+	ip_vs_update_conntrack(skb, cp, 1);
 
 	/* FIXME: when application helper enlarges the packet and the length
 	   is larger than the MTU of outgoing device, there will be still
-- 
cgit v1.2.3-59-g8ed1b


From 719f835853a92f6090258114a72ffe41f09155cd Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 8 Sep 2010 05:08:44 +0000
Subject: udp: add rehash on connect()

commit 30fff923 introduced in linux-2.6.33 (udp: bind() optimisation)
added a secondary hash on UDP, hashed on (local addr, local port).

Problem is that following sequence :

fd = socket(...)
connect(fd, &remote, ...)

not only selects remote end point (address and port), but also sets
local address, while UDP stack stored in secondary hash table the socket
while its local address was INADDR_ANY (or ipv6 equivalent)

Sequence is :
 - autobind() : choose a random local port, insert socket in hash tables
              [while local address is INADDR_ANY]
 - connect() : set remote address and port, change local address to IP
              given by a route lookup.

When an incoming UDP frame comes, if more than 10 sockets are found in
primary hash table, we switch to secondary table, and fail to find
socket because its local address changed.

One solution to this problem is to rehash datagram socket if needed.

We add a new rehash(struct socket *) method in "struct proto", and
implement this method for UDP v4 & v6, using a common helper.

This rehashing only takes care of secondary hash table, since primary
hash (based on local port only) is not changed.

Reported-by: Krzysztof Piotr Oledzki <ole@ans.pl>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Tested-by: Krzysztof Piotr Oledzki <ole@ans.pl>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h  |  1 +
 include/net/udp.h   |  1 +
 net/ipv4/datagram.c |  5 ++++-
 net/ipv4/udp.c      | 44 ++++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/datagram.c |  7 ++++++-
 net/ipv6/udp.c      | 10 ++++++++++
 6 files changed, 66 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index ac53bfbdfe16..adab9dc58183 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -752,6 +752,7 @@ struct proto {
 	/* Keeping track of sk's, looking them up, and port selection methods. */
 	void			(*hash)(struct sock *sk);
 	void			(*unhash)(struct sock *sk);
+	void			(*rehash)(struct sock *sk);
 	int			(*get_port)(struct sock *sk, unsigned short snum);
 
 	/* Keeping track of sockets in use */
diff --git a/include/net/udp.h b/include/net/udp.h
index 7abdf305da50..a184d3496b13 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -151,6 +151,7 @@ static inline void udp_lib_hash(struct sock *sk)
 }
 
 extern void udp_lib_unhash(struct sock *sk);
+extern void udp_lib_rehash(struct sock *sk, u16 new_hash);
 
 static inline void udp_lib_close(struct sock *sk, long timeout)
 {
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index f0550941df7b..721a8a37b45c 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -62,8 +62,11 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	}
 	if (!inet->inet_saddr)
 		inet->inet_saddr = rt->rt_src;	/* Update source address */
-	if (!inet->inet_rcv_saddr)
+	if (!inet->inet_rcv_saddr) {
 		inet->inet_rcv_saddr = rt->rt_src;
+		if (sk->sk_prot->rehash)
+			sk->sk_prot->rehash(sk);
+	}
 	inet->inet_daddr = rt->rt_dst;
 	inet->inet_dport = usin->sin_port;
 	sk->sk_state = TCP_ESTABLISHED;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 32e0bef60d0a..fb23c2e63b52 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1260,6 +1260,49 @@ void udp_lib_unhash(struct sock *sk)
 }
 EXPORT_SYMBOL(udp_lib_unhash);
 
+/*
+ * inet_rcv_saddr was changed, we must rehash secondary hash
+ */
+void udp_lib_rehash(struct sock *sk, u16 newhash)
+{
+	if (sk_hashed(sk)) {
+		struct udp_table *udptable = sk->sk_prot->h.udp_table;
+		struct udp_hslot *hslot, *hslot2, *nhslot2;
+
+		hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+		nhslot2 = udp_hashslot2(udptable, newhash);
+		udp_sk(sk)->udp_portaddr_hash = newhash;
+		if (hslot2 != nhslot2) {
+			hslot = udp_hashslot(udptable, sock_net(sk),
+					     udp_sk(sk)->udp_port_hash);
+			/* we must lock primary chain too */
+			spin_lock_bh(&hslot->lock);
+
+			spin_lock(&hslot2->lock);
+			hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+			hslot2->count--;
+			spin_unlock(&hslot2->lock);
+
+			spin_lock(&nhslot2->lock);
+			hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+						 &nhslot2->head);
+			nhslot2->count++;
+			spin_unlock(&nhslot2->lock);
+
+			spin_unlock_bh(&hslot->lock);
+		}
+	}
+}
+EXPORT_SYMBOL(udp_lib_rehash);
+
+static void udp_v4_rehash(struct sock *sk)
+{
+	u16 new_hash = udp4_portaddr_hash(sock_net(sk),
+					  inet_sk(sk)->inet_rcv_saddr,
+					  inet_sk(sk)->inet_num);
+	udp_lib_rehash(sk, new_hash);
+}
+
 static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
 	int rc;
@@ -1843,6 +1886,7 @@ struct proto udp_prot = {
 	.backlog_rcv	   = __udp_queue_rcv_skb,
 	.hash		   = udp_lib_hash,
 	.unhash		   = udp_lib_unhash,
+	.rehash		   = udp_v4_rehash,
 	.get_port	   = udp_v4_get_port,
 	.memory_allocated  = &udp_memory_allocated,
 	.sysctl_mem	   = sysctl_udp_mem,
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 7d929a22cbc2..ef371aa01ac5 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -105,9 +105,12 @@ ipv4_connected:
 		if (ipv6_addr_any(&np->saddr))
 			ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr);
 
-		if (ipv6_addr_any(&np->rcv_saddr))
+		if (ipv6_addr_any(&np->rcv_saddr)) {
 			ipv6_addr_set_v4mapped(inet->inet_rcv_saddr,
 					       &np->rcv_saddr);
+			if (sk->sk_prot->rehash)
+				sk->sk_prot->rehash(sk);
+		}
 
 		goto out;
 	}
@@ -181,6 +184,8 @@ ipv4_connected:
 	if (ipv6_addr_any(&np->rcv_saddr)) {
 		ipv6_addr_copy(&np->rcv_saddr, &fl.fl6_src);
 		inet->inet_rcv_saddr = LOOPBACK4_IPV6;
+		if (sk->sk_prot->rehash)
+			sk->sk_prot->rehash(sk);
 	}
 
 	ip6_dst_store(sk, dst,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 1dd1affdead2..5acb3560ff15 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -111,6 +111,15 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum)
 	return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal, hash2_nulladdr);
 }
 
+static void udp_v6_rehash(struct sock *sk)
+{
+	u16 new_hash = udp6_portaddr_hash(sock_net(sk),
+					  &inet6_sk(sk)->rcv_saddr,
+					  inet_sk(sk)->inet_num);
+
+	udp_lib_rehash(sk, new_hash);
+}
+
 static inline int compute_score(struct sock *sk, struct net *net,
 				unsigned short hnum,
 				struct in6_addr *saddr, __be16 sport,
@@ -1447,6 +1456,7 @@ struct proto udpv6_prot = {
 	.backlog_rcv	   = udpv6_queue_rcv_skb,
 	.hash		   = udp_lib_hash,
 	.unhash		   = udp_lib_unhash,
+	.rehash		   = udp_v6_rehash,
 	.get_port	   = udp_v6_get_port,
 	.memory_allocated  = &udp_memory_allocated,
 	.sysctl_mem	   = sysctl_udp_mem,
-- 
cgit v1.2.3-59-g8ed1b


From 01f83d69844d307be2aa6fea88b0e8fe5cbdb2f4 Mon Sep 17 00:00:00 2001
From: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Date: Wed, 15 Sep 2010 10:27:52 -0700
Subject: tcp: Prevent overzealous packetization by SWS logic.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If peer uses tiny MSS (say, 75 bytes) and similarly tiny advertised
window, the SWS logic will packetize to half the MSS unnecessarily.

This causes problems with some embedded devices.

However for large MSS devices we do want to half-MSS packetize
otherwise we never get enough packets into the pipe for things
like fast retransmit and recovery to work.

Be careful also to handle the case where MSS > window, otherwise
we'll never send until the probe timer.

Reported-by: ツ Leandro Melo de Sales <leandroal@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index eaa9582779d0..3e4b33e36602 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -475,8 +475,22 @@ extern unsigned int tcp_current_mss(struct sock *sk);
 /* Bound MSS / TSO packet size with the half of the window */
 static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
 {
-	if (tp->max_window && pktsize > (tp->max_window >> 1))
-		return max(tp->max_window >> 1, 68U - tp->tcp_header_len);
+	int cutoff;
+
+	/* When peer uses tiny windows, there is no use in packetizing
+	 * to sub-MSS pieces for the sake of SWS or making sure there
+	 * are enough packets in the pipe for fast recovery.
+	 *
+	 * On the other hand, for extremely large MSS devices, handling
+	 * smaller than MSS windows in this way does make sense.
+	 */
+	if (tp->max_window >= 512)
+		cutoff = (tp->max_window >> 1);
+	else
+		cutoff = tp->max_window;
+
+	if (cutoff && pktsize > cutoff)
+		return max_t(int, cutoff, 68U - tp->tcp_header_len);
 	else
 		return pktsize;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 8444cf712c5f71845cba9dc30d8f530ff0d5ff83 Mon Sep 17 00:00:00 2001
From: Thomas Egerer <thomas.egerer@secunet.com>
Date: Mon, 20 Sep 2010 11:11:38 -0700
Subject: xfrm: Allow different selector family in temporary state

The family parameter xfrm_state_find is used to find a state matching a
certain policy. This value is set to the template's family
(encap_family) right before xfrm_state_find is called.
The family parameter is however also used to construct a temporary state
in xfrm_state_find itself which is wrong for inter-family scenarios
because it produces a selector for the wrong family. Since this selector
is included in the xfrm_user_acquire structure, user space programs
misinterpret IPv6 addresses as IPv4 and vice versa.
This patch splits up the original init_tempsel function into a part that
initializes the selector respectively the props and id of the temporary
state, to allow for differing ip address families whithin the state.

Signed-off-by: Thomas Egerer <thomas.egerer@secunet.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h     |  4 ++--
 net/ipv4/xfrm4_state.c | 33 +++++++++++++++++++--------------
 net/ipv6/xfrm6_state.c | 33 +++++++++++++++++++--------------
 net/xfrm/xfrm_policy.c |  5 ++---
 net/xfrm/xfrm_state.c  | 45 +++++++++++++++++++++++++++------------------
 5 files changed, 69 insertions(+), 51 deletions(-)

(limited to 'include/net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index fc8f36dd0f5c..4f53532d4c2f 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -298,8 +298,8 @@ struct xfrm_state_afinfo {
 	const struct xfrm_type	*type_map[IPPROTO_MAX];
 	struct xfrm_mode	*mode_map[XFRM_MODE_MAX];
 	int			(*init_flags)(struct xfrm_state *x);
-	void			(*init_tempsel)(struct xfrm_state *x, struct flowi *fl,
-						struct xfrm_tmpl *tmpl,
+	void			(*init_tempsel)(struct xfrm_selector *sel, struct flowi *fl);
+	void			(*init_temprop)(struct xfrm_state *x, struct xfrm_tmpl *tmpl,
 						xfrm_address_t *daddr, xfrm_address_t *saddr);
 	int			(*tmpl_sort)(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n);
 	int			(*state_sort)(struct xfrm_state **dst, struct xfrm_state **src, int n);
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 1ef1366a0a03..47947624eccc 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -21,21 +21,25 @@ static int xfrm4_init_flags(struct xfrm_state *x)
 }
 
 static void
-__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl,
-		     struct xfrm_tmpl *tmpl,
-		     xfrm_address_t *daddr, xfrm_address_t *saddr)
+__xfrm4_init_tempsel(struct xfrm_selector *sel, struct flowi *fl)
+{
+	sel->daddr.a4 = fl->fl4_dst;
+	sel->saddr.a4 = fl->fl4_src;
+	sel->dport = xfrm_flowi_dport(fl);
+	sel->dport_mask = htons(0xffff);
+	sel->sport = xfrm_flowi_sport(fl);
+	sel->sport_mask = htons(0xffff);
+	sel->family = AF_INET;
+	sel->prefixlen_d = 32;
+	sel->prefixlen_s = 32;
+	sel->proto = fl->proto;
+	sel->ifindex = fl->oif;
+}
+
+static void
+xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl,
+		   xfrm_address_t *daddr, xfrm_address_t *saddr)
 {
-	x->sel.daddr.a4 = fl->fl4_dst;
-	x->sel.saddr.a4 = fl->fl4_src;
-	x->sel.dport = xfrm_flowi_dport(fl);
-	x->sel.dport_mask = htons(0xffff);
-	x->sel.sport = xfrm_flowi_sport(fl);
-	x->sel.sport_mask = htons(0xffff);
-	x->sel.family = AF_INET;
-	x->sel.prefixlen_d = 32;
-	x->sel.prefixlen_s = 32;
-	x->sel.proto = fl->proto;
-	x->sel.ifindex = fl->oif;
 	x->id = tmpl->id;
 	if (x->id.daddr.a4 == 0)
 		x->id.daddr.a4 = daddr->a4;
@@ -70,6 +74,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = {
 	.owner			= THIS_MODULE,
 	.init_flags		= xfrm4_init_flags,
 	.init_tempsel		= __xfrm4_init_tempsel,
+	.init_temprop		= xfrm4_init_temprop,
 	.output			= xfrm4_output,
 	.extract_input		= xfrm4_extract_input,
 	.extract_output		= xfrm4_extract_output,
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index f417b77fa0e1..a67575d472a3 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -20,23 +20,27 @@
 #include <net/addrconf.h>
 
 static void
-__xfrm6_init_tempsel(struct xfrm_state *x, struct flowi *fl,
-		     struct xfrm_tmpl *tmpl,
-		     xfrm_address_t *daddr, xfrm_address_t *saddr)
+__xfrm6_init_tempsel(struct xfrm_selector *sel, struct flowi *fl)
 {
 	/* Initialize temporary selector matching only
 	 * to current session. */
-	ipv6_addr_copy((struct in6_addr *)&x->sel.daddr, &fl->fl6_dst);
-	ipv6_addr_copy((struct in6_addr *)&x->sel.saddr, &fl->fl6_src);
-	x->sel.dport = xfrm_flowi_dport(fl);
-	x->sel.dport_mask = htons(0xffff);
-	x->sel.sport = xfrm_flowi_sport(fl);
-	x->sel.sport_mask = htons(0xffff);
-	x->sel.family = AF_INET6;
-	x->sel.prefixlen_d = 128;
-	x->sel.prefixlen_s = 128;
-	x->sel.proto = fl->proto;
-	x->sel.ifindex = fl->oif;
+	ipv6_addr_copy((struct in6_addr *)&sel->daddr, &fl->fl6_dst);
+	ipv6_addr_copy((struct in6_addr *)&sel->saddr, &fl->fl6_src);
+	sel->dport = xfrm_flowi_dport(fl);
+	sel->dport_mask = htons(0xffff);
+	sel->sport = xfrm_flowi_sport(fl);
+	sel->sport_mask = htons(0xffff);
+	sel->family = AF_INET6;
+	sel->prefixlen_d = 128;
+	sel->prefixlen_s = 128;
+	sel->proto = fl->proto;
+	sel->ifindex = fl->oif;
+}
+
+static void
+xfrm6_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl,
+		   xfrm_address_t *daddr, xfrm_address_t *saddr)
+{
 	x->id = tmpl->id;
 	if (ipv6_addr_any((struct in6_addr*)&x->id.daddr))
 		memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr));
@@ -168,6 +172,7 @@ static struct xfrm_state_afinfo xfrm6_state_afinfo = {
 	.eth_proto		= htons(ETH_P_IPV6),
 	.owner			= THIS_MODULE,
 	.init_tempsel		= __xfrm6_init_tempsel,
+	.init_temprop		= xfrm6_init_temprop,
 	.tmpl_sort		= __xfrm6_tmpl_sort,
 	.state_sort		= __xfrm6_state_sort,
 	.output			= xfrm6_output,
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 2b3ed7ad4933..cbab6e1a8c9c 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1175,9 +1175,8 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, struct flowi *fl,
 		    tmpl->mode == XFRM_MODE_BEET) {
 			remote = &tmpl->id.daddr;
 			local = &tmpl->saddr;
-			family = tmpl->encap_family;
-			if (xfrm_addr_any(local, family)) {
-				error = xfrm_get_saddr(net, &tmp, remote, family);
+			if (xfrm_addr_any(local, tmpl->encap_family)) {
+				error = xfrm_get_saddr(net, &tmp, remote, tmpl->encap_family);
 				if (error)
 					goto fail;
 				local = &tmp;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 5208b12fbfb4..eb96ce52f178 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -656,15 +656,23 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si)
 EXPORT_SYMBOL(xfrm_sad_getinfo);
 
 static int
-xfrm_init_tempsel(struct xfrm_state *x, struct flowi *fl,
-		  struct xfrm_tmpl *tmpl,
-		  xfrm_address_t *daddr, xfrm_address_t *saddr,
-		  unsigned short family)
+xfrm_init_tempstate(struct xfrm_state *x, struct flowi *fl,
+		    struct xfrm_tmpl *tmpl,
+		    xfrm_address_t *daddr, xfrm_address_t *saddr,
+		    unsigned short family)
 {
 	struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
 	if (!afinfo)
 		return -1;
-	afinfo->init_tempsel(x, fl, tmpl, daddr, saddr);
+	afinfo->init_tempsel(&x->sel, fl);
+
+	if (family != tmpl->encap_family) {
+		xfrm_state_put_afinfo(afinfo);
+		afinfo = xfrm_state_get_afinfo(tmpl->encap_family);
+		if (!afinfo)
+			return -1;
+	}
+	afinfo->init_temprop(x, tmpl, daddr, saddr);
 	xfrm_state_put_afinfo(afinfo);
 	return 0;
 }
@@ -790,37 +798,38 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
 	int error = 0;
 	struct xfrm_state *best = NULL;
 	u32 mark = pol->mark.v & pol->mark.m;
+	unsigned short encap_family = tmpl->encap_family;
 
 	to_put = NULL;
 
 	spin_lock_bh(&xfrm_state_lock);
-	h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, family);
+	h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family);
 	hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) {
-		if (x->props.family == family &&
+		if (x->props.family == encap_family &&
 		    x->props.reqid == tmpl->reqid &&
 		    (mark & x->mark.m) == x->mark.v &&
 		    !(x->props.flags & XFRM_STATE_WILDRECV) &&
-		    xfrm_state_addr_check(x, daddr, saddr, family) &&
+		    xfrm_state_addr_check(x, daddr, saddr, encap_family) &&
 		    tmpl->mode == x->props.mode &&
 		    tmpl->id.proto == x->id.proto &&
 		    (tmpl->id.spi == x->id.spi || !tmpl->id.spi))
-			xfrm_state_look_at(pol, x, fl, family, daddr, saddr,
+			xfrm_state_look_at(pol, x, fl, encap_family, daddr, saddr,
 					   &best, &acquire_in_progress, &error);
 	}
 	if (best)
 		goto found;
 
-	h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, family);
+	h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, encap_family);
 	hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h_wildcard, bydst) {
-		if (x->props.family == family &&
+		if (x->props.family == encap_family &&
 		    x->props.reqid == tmpl->reqid &&
 		    (mark & x->mark.m) == x->mark.v &&
 		    !(x->props.flags & XFRM_STATE_WILDRECV) &&
-		    xfrm_state_addr_check(x, daddr, saddr, family) &&
+		    xfrm_state_addr_check(x, daddr, saddr, encap_family) &&
 		    tmpl->mode == x->props.mode &&
 		    tmpl->id.proto == x->id.proto &&
 		    (tmpl->id.spi == x->id.spi || !tmpl->id.spi))
-			xfrm_state_look_at(pol, x, fl, family, daddr, saddr,
+			xfrm_state_look_at(pol, x, fl, encap_family, daddr, saddr,
 					   &best, &acquire_in_progress, &error);
 	}
 
@@ -829,7 +838,7 @@ found:
 	if (!x && !error && !acquire_in_progress) {
 		if (tmpl->id.spi &&
 		    (x0 = __xfrm_state_lookup(net, mark, daddr, tmpl->id.spi,
-					      tmpl->id.proto, family)) != NULL) {
+					      tmpl->id.proto, encap_family)) != NULL) {
 			to_put = x0;
 			error = -EEXIST;
 			goto out;
@@ -839,9 +848,9 @@ found:
 			error = -ENOMEM;
 			goto out;
 		}
-		/* Initialize temporary selector matching only
+		/* Initialize temporary state matching only
 		 * to current session. */
-		xfrm_init_tempsel(x, fl, tmpl, daddr, saddr, family);
+		xfrm_init_tempstate(x, fl, tmpl, daddr, saddr, family);
 		memcpy(&x->mark, &pol->mark, sizeof(x->mark));
 
 		error = security_xfrm_state_alloc_acquire(x, pol->security, fl->secid);
@@ -856,10 +865,10 @@ found:
 			x->km.state = XFRM_STATE_ACQ;
 			list_add(&x->km.all, &net->xfrm.state_all);
 			hlist_add_head(&x->bydst, net->xfrm.state_bydst+h);
-			h = xfrm_src_hash(net, daddr, saddr, family);
+			h = xfrm_src_hash(net, daddr, saddr, encap_family);
 			hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h);
 			if (x->id.spi) {
-				h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, family);
+				h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family);
 				hlist_add_head(&x->byspi, net->xfrm.state_byspi+h);
 			}
 			x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
-- 
cgit v1.2.3-59-g8ed1b


From 693019e90ca45d881109d32c0c6d29adf03f6447 Mon Sep 17 00:00:00 2001
From: Tom Herbert <therbert@google.com>
Date: Thu, 23 Sep 2010 11:19:54 +0000
Subject: net: reset skb queue mapping when rx'ing over tunnel

Reset queue mapping when an skb is reentering the stack via a tunnel.
On second pass, the queue mapping from the original device is no
longer valid.

Signed-off-by: Tom Herbert <therbert@google.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/dst.h b/include/net/dst.h
index 81d1413a8701..02386505033d 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -242,6 +242,7 @@ static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev)
 	dev->stats.rx_packets++;
 	dev->stats.rx_bytes += skb->len;
 	skb->rxhash = 0;
+	skb_set_queue_mapping(skb, 0);
 	skb_dst_drop(skb);
 	nf_reset(skb);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 2cc6d2bf3d6195fabcf0febc192c01f99519a8f3 Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Fri, 24 Sep 2010 09:55:52 +0000
Subject: ipv6: add a missing unregister_pernet_subsys call

Clean up a missing exit path in the ipv6 module init routines.  In
addrconf_init we call ipv6_addr_label_init which calls register_pernet_subsys
for the ipv6_addr_label_ops structure.  But if module loading fails, or if the
ipv6 module is removed, there is no corresponding unregister_pernet_subsys call,
which leaves a now-bogus address on the pernet_list, leading to oopses in
subsequent registrations.  This patch cleans up both the failed load path and
the unload path.  Tested by myself with good results.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>

 include/net/addrconf.h |    1 +
 net/ipv6/addrconf.c    |   11 ++++++++---
 net/ipv6/addrlabel.c   |    5 +++++
 3 files changed, 14 insertions(+), 3 deletions(-)
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/addrconf.h |  1 +
 net/ipv6/addrconf.c    | 11 ++++++++---
 net/ipv6/addrlabel.c   |  5 +++++
 3 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 45375b41a2a0..4d40c4d0230b 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -121,6 +121,7 @@ static inline int addrconf_finite_timeout(unsigned long timeout)
  *	IPv6 Address Label subsystem (addrlabel.c)
  */
 extern int			ipv6_addr_label_init(void);
+extern void			ipv6_addr_label_cleanup(void);
 extern void			ipv6_addr_label_rtnl_register(void);
 extern u32			ipv6_addr_label(struct net *net,
 						const struct in6_addr *addr,
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index ab70a3fbcafa..324fac3b6c16 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4637,10 +4637,12 @@ int __init addrconf_init(void)
 	if (err < 0) {
 		printk(KERN_CRIT "IPv6 Addrconf:"
 		       " cannot initialize default policy table: %d.\n", err);
-		return err;
+		goto out;
 	}
 
-	register_pernet_subsys(&addrconf_ops);
+	err = register_pernet_subsys(&addrconf_ops);
+	if (err < 0)
+		goto out_addrlabel;
 
 	/* The addrconf netdev notifier requires that loopback_dev
 	 * has it's ipv6 private information allocated and setup
@@ -4692,7 +4694,9 @@ errout:
 	unregister_netdevice_notifier(&ipv6_dev_notf);
 errlo:
 	unregister_pernet_subsys(&addrconf_ops);
-
+out_addrlabel:
+	ipv6_addr_label_cleanup();
+out:
 	return err;
 }
 
@@ -4703,6 +4707,7 @@ void addrconf_cleanup(void)
 
 	unregister_netdevice_notifier(&ipv6_dev_notf);
 	unregister_pernet_subsys(&addrconf_ops);
+	ipv6_addr_label_cleanup();
 
 	rtnl_lock();
 
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index f0e774cea386..8175f802651b 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -393,6 +393,11 @@ int __init ipv6_addr_label_init(void)
 	return register_pernet_subsys(&ipv6_addr_label_ops);
 }
 
+void ipv6_addr_label_cleanup(void)
+{
+	unregister_pernet_subsys(&ipv6_addr_label_ops);
+}
+
 static const struct nla_policy ifal_policy[IFAL_MAX+1] = {
 	[IFAL_ADDRESS]		= { .len = sizeof(struct in6_addr), },
 	[IFAL_LABEL]		= { .len = sizeof(u32), },
-- 
cgit v1.2.3-59-g8ed1b


From fb0c5f0bc8b69b40549449ee7fc65f3706f12062 Mon Sep 17 00:00:00 2001
From: Ulrich Weber <uweber@astaro.com>
Date: Mon, 27 Sep 2010 03:31:00 +0000
Subject: tproxy: check for transparent flag in ip_route_newports

as done in ip_route_connect()

Signed-off-by: Ulrich Weber <uweber@astaro.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/route.h b/include/net/route.h
index bd732d62e1c3..7e5e73bfa4de 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -199,6 +199,8 @@ static inline int ip_route_newports(struct rtable **rp, u8 protocol,
 		fl.fl_ip_sport = sport;
 		fl.fl_ip_dport = dport;
 		fl.proto = protocol;
+		if (inet_sk(sk)->transparent)
+			fl.flags |= FLOWI_FLAG_ANYSRC;
 		ip_rt_put(*rp);
 		*rp = NULL;
 		security_sk_classify_flow(sk, &fl);
-- 
cgit v1.2.3-59-g8ed1b