From 576ebd74928fd60ae112b33c42b89602015fadbd Mon Sep 17 00:00:00 2001
From: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Date: Tue, 21 May 2013 16:08:22 +0200
Subject: kernel: Fix s390 absolute memory access for /dev/mem

On s390 the prefix page and absolute zero pages are not correctly
returned when reading /dev/mem. The reason is that the s390 asm/io.h
file includes the asm-generic/io.h file which then defines
xlate_dev_mem_ptr() and therefore overwrites the s390 specific
version that does the correct swap operation for prefix and absolute
zero pages. The problem is a regression that was introduced with git
commit cd248341 (s390/pci: base support).

To fix the problem add "#ifndef xlate_dev_mem_ptr" in asm-generic/io.h
and "#define xlate_dev_mem_ptr" in asm/io.h. This ensures that the
s390 version is used. For completeness also add the "#ifndef"
construct for xlate_dev_kmem_ptr().

Signed-off-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 include/asm-generic/io.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index ac9da00e9f2c..d5afe96adba6 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -343,8 +343,12 @@ extern void ioport_unmap(void __iomem *p);
 #endif /* CONFIG_GENERIC_IOMAP */
 #endif /* CONFIG_HAS_IOPORT */
 
+#ifndef xlate_dev_kmem_ptr
 #define xlate_dev_kmem_ptr(p)	p
+#endif
+#ifndef xlate_dev_mem_ptr
 #define xlate_dev_mem_ptr(p)	__va(p)
+#endif
 
 #ifdef CONFIG_VIRT_TO_BUS
 #ifndef virt_to_bus
-- 
cgit v1.3-8-gc7d7


From 2a7851bffb008ff4882eee673da74718997b4265 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 17 May 2013 03:56:10 +0000
Subject: netfilter: add nf_ipv6_ops hook to fix xt_addrtype with IPv6

Quoting https://bugzilla.netfilter.org/show_bug.cgi?id=812:

[ ip6tables -m addrtype ]
When I tried to use in the nat/PREROUTING it messes up the
routing cache even if the rule didn't matched at all.
[..]
If I remove the --limit-iface-in from the non-working scenario, so just
use the -m addrtype --dst-type LOCAL it works!

This happens when LOCAL type matching is requested with --limit-iface-in,
and the default ipv6 route is via the interface the packet we test
arrived on.

Because xt_addrtype uses ip6_route_output, the ipv6 routing implementation
creates an unwanted cached entry, and the packet won't make it to the
real/expected destination.

Silently ignoring --limit-iface-in makes the routing work but it breaks
rule matching (--dst-type LOCAL with limit-iface-in is supposed to only
match if the dst address is configured on the incoming interface;
without --limit-iface-in it will match if the address is reachable
via lo).

The test should call ipv6_chk_addr() instead.  However, this would add
a link-time dependency on ipv6.

There are two possible solutions:

1) Revert the commit that moved ipt_addrtype to xt_addrtype,
   and put ipv6 specific code into ip6t_addrtype.
2) add new "nf_ipv6_ops" struct to register pointers to ipv6 functions.

While the former might seem preferable, Pablo pointed out that there
are more xt modules with link-time dependeny issues regarding ipv6,
so lets go for 2).

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_ipv6.h | 16 ++++++++++++++++
 include/net/addrconf.h         |  2 +-
 net/ipv6/addrconf.c            |  2 +-
 net/ipv6/netfilter.c           |  7 +++++++
 net/netfilter/core.c           |  2 ++
 net/netfilter/xt_addrtype.c    | 27 ++++++++++++++++-----------
 6 files changed, 43 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 98ffb54988b6..2d4df6ce043e 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -17,6 +17,22 @@ extern __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
 
 extern int ipv6_netfilter_init(void);
 extern void ipv6_netfilter_fini(void);
+
+/*
+ * Hook functions for ipv6 to allow xt_* modules to be built-in even
+ * if IPv6 is a module.
+ */
+struct nf_ipv6_ops {
+	int (*chk_addr)(struct net *net, const struct in6_addr *addr,
+			const struct net_device *dev, int strict);
+};
+
+extern const struct nf_ipv6_ops __rcu *nf_ipv6_ops;
+static inline const struct nf_ipv6_ops *nf_get_ipv6_ops(void)
+{
+	return rcu_dereference(nf_ipv6_ops);
+}
+
 #else /* CONFIG_NETFILTER */
 static inline int ipv6_netfilter_init(void) { return 0; }
 static inline void ipv6_netfilter_fini(void) { return; }
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 84a6440f1f19..21f702704f24 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -65,7 +65,7 @@ extern int			addrconf_set_dstaddr(struct net *net,
 
 extern int			ipv6_chk_addr(struct net *net,
 					      const struct in6_addr *addr,
-					      struct net_device *dev,
+					      const struct net_device *dev,
 					      int strict);
 
 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index d1ab6ab29a55..d1b2d8034b54 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1487,7 +1487,7 @@ static int ipv6_count_addresses(struct inet6_dev *idev)
 }
 
 int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
-		  struct net_device *dev, int strict)
+		  const struct net_device *dev, int strict)
 {
 	struct inet6_ifaddr *ifp;
 	unsigned int hash = inet6_addr_hash(addr);
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 72836f40b730..95f3f1da0d7f 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -10,6 +10,7 @@
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv6.h>
 #include <linux/export.h>
+#include <net/addrconf.h>
 #include <net/dst.h>
 #include <net/ipv6.h>
 #include <net/ip6_route.h>
@@ -186,6 +187,10 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook,
 	return csum;
 };
 
+static const struct nf_ipv6_ops ipv6ops = {
+	.chk_addr	= ipv6_chk_addr,
+};
+
 static const struct nf_afinfo nf_ip6_afinfo = {
 	.family			= AF_INET6,
 	.checksum		= nf_ip6_checksum,
@@ -198,6 +203,7 @@ static const struct nf_afinfo nf_ip6_afinfo = {
 
 int __init ipv6_netfilter_init(void)
 {
+	RCU_INIT_POINTER(nf_ipv6_ops, &ipv6ops);
 	return nf_register_afinfo(&nf_ip6_afinfo);
 }
 
@@ -206,5 +212,6 @@ int __init ipv6_netfilter_init(void)
  */
 void ipv6_netfilter_fini(void)
 {
+	RCU_INIT_POINTER(nf_ipv6_ops, NULL);
 	nf_unregister_afinfo(&nf_ip6_afinfo);
 }
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 07c865a31a3d..857ca9f35177 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -30,6 +30,8 @@ static DEFINE_MUTEX(afinfo_mutex);
 
 const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
 EXPORT_SYMBOL(nf_afinfo);
+const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ipv6_ops);
 
 int nf_register_afinfo(const struct nf_afinfo *afinfo)
 {
diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c
index 49c5ff7f6dd6..68ff29f60867 100644
--- a/net/netfilter/xt_addrtype.c
+++ b/net/netfilter/xt_addrtype.c
@@ -22,6 +22,7 @@
 #include <net/ip6_fib.h>
 #endif
 
+#include <linux/netfilter_ipv6.h>
 #include <linux/netfilter/xt_addrtype.h>
 #include <linux/netfilter/x_tables.h>
 
@@ -33,12 +34,12 @@ MODULE_ALIAS("ip6t_addrtype");
 
 #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
 static u32 match_lookup_rt6(struct net *net, const struct net_device *dev,
-			    const struct in6_addr *addr)
+			    const struct in6_addr *addr, u16 mask)
 {
 	const struct nf_afinfo *afinfo;
 	struct flowi6 flow;
 	struct rt6_info *rt;
-	u32 ret;
+	u32 ret = 0;
 	int route_err;
 
 	memset(&flow, 0, sizeof(flow));
@@ -49,12 +50,19 @@ static u32 match_lookup_rt6(struct net *net, const struct net_device *dev,
 	rcu_read_lock();
 
 	afinfo = nf_get_afinfo(NFPROTO_IPV6);
-	if (afinfo != NULL)
+	if (afinfo != NULL) {
+		const struct nf_ipv6_ops *v6ops;
+
+		if (dev && (mask & XT_ADDRTYPE_LOCAL)) {
+			v6ops = nf_get_ipv6_ops();
+			if (v6ops && v6ops->chk_addr(net, addr, dev, true))
+				ret = XT_ADDRTYPE_LOCAL;
+		}
 		route_err = afinfo->route(net, (struct dst_entry **)&rt,
-					flowi6_to_flowi(&flow), !!dev);
-	else
+					  flowi6_to_flowi(&flow), false);
+	} else {
 		route_err = 1;
-
+	}
 	rcu_read_unlock();
 
 	if (route_err)
@@ -62,15 +70,12 @@ static u32 match_lookup_rt6(struct net *net, const struct net_device *dev,
 
 	if (rt->rt6i_flags & RTF_REJECT)
 		ret = XT_ADDRTYPE_UNREACHABLE;
-	else
-		ret = 0;
 
-	if (rt->rt6i_flags & RTF_LOCAL)
+	if (dev == NULL && rt->rt6i_flags & RTF_LOCAL)
 		ret |= XT_ADDRTYPE_LOCAL;
 	if (rt->rt6i_flags & RTF_ANYCAST)
 		ret |= XT_ADDRTYPE_ANYCAST;
 
-
 	dst_release(&rt->dst);
 	return ret;
 }
@@ -90,7 +95,7 @@ static bool match_type6(struct net *net, const struct net_device *dev,
 
 	if ((XT_ADDRTYPE_LOCAL | XT_ADDRTYPE_ANYCAST |
 	     XT_ADDRTYPE_UNREACHABLE) & mask)
-		return !!(mask & match_lookup_rt6(net, dev, addr));
+		return !!(mask & match_lookup_rt6(net, dev, addr, mask));
 	return true;
 }
 
-- 
cgit v1.3-8-gc7d7


From 7805d000db30a3787a4c969bab6ae4d8a5fd8ce6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 24 May 2013 10:50:24 +0900
Subject: cgroup: fix a subtle bug in descendant pre-order walk

When cgroup_next_descendant_pre() initiates a walk, it checks whether
the subtree root doesn't have any children and if not returns NULL.
Later code assumes that the subtree isn't empty.  This is broken
because the subtree may become empty inbetween, which can lead to the
traversal escaping the subtree by walking to the sibling of the
subtree root.

There's no reason to have the early exit path.  Remove it along with
the later assumption that the subtree isn't empty.  This simplifies
the code a bit and fixes the subtle bug.

While at it, fix the comment of cgroup_for_each_descendant_pre() which
was incorrectly referring to ->css_offline() instead of
->css_online().

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Cc: stable@vger.kernel.org
---
 include/linux/cgroup.h | 2 +-
 kernel/cgroup.c        | 9 +++------
 2 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 5047355b9a0f..8bda1294c035 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -707,7 +707,7 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos);
  *
  * If a subsystem synchronizes against the parent in its ->css_online() and
  * before starting iterating, and synchronizes against @pos on each
- * iteration, any descendant cgroup which finished ->css_offline() is
+ * iteration, any descendant cgroup which finished ->css_online() is
  * guaranteed to be visible in the future iterations.
  *
  * In other words, the following guarantees that a descendant can't escape
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 38b136553044..31e9ef319070 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2954,11 +2954,8 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
 	WARN_ON_ONCE(!rcu_read_lock_held());
 
 	/* if first iteration, pretend we just visited @cgroup */
-	if (!pos) {
-		if (list_empty(&cgroup->children))
-			return NULL;
+	if (!pos)
 		pos = cgroup;
-	}
 
 	/* visit the first child if exists */
 	next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
@@ -2966,14 +2963,14 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
 		return next;
 
 	/* no child, visit my or the closest ancestor's next sibling */
-	do {
+	while (pos != cgroup) {
 		next = list_entry_rcu(pos->sibling.next, struct cgroup,
 				      sibling);
 		if (&next->sibling != &pos->parent->children)
 			return next;
 
 		pos = pos->parent;
-	} while (pos != cgroup);
+	}
 
 	return NULL;
 }
-- 
cgit v1.3-8-gc7d7


From 12bcbe66d7b3cc9f9f86cd02f925666eaa3c2107 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 28 May 2013 14:38:42 -0400
Subject: rcu: Add _notrace variation of rcu_dereference_raw() and
 hlist_for_each_entry_rcu()

As rcu_dereference_raw() under RCU debug config options can add quite a
bit of checks, and that tracing uses rcu_dereference_raw(), these checks
happen with the function tracer. The function tracer also happens to trace
these debug checks too. This added overhead can livelock the system.

Add a new interface to RCU for both rcu_dereference_raw_notrace() as well
as hlist_for_each_entry_rcu_notrace() as the hlist iterator uses the
rcu_dereference_raw() as well, and is used a bit with the function tracer.

Link: http://lkml.kernel.org/r/20130528184209.304356745@goodmis.org

Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/rculist.h  | 20 ++++++++++++++++++++
 include/linux/rcupdate.h |  9 +++++++++
 2 files changed, 29 insertions(+)

(limited to 'include')

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 8089e35d47ac..f4b1001a4676 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -460,6 +460,26 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
 		pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
 			&(pos)->member)), typeof(*(pos)), member))
 
+/**
+ * hlist_for_each_entry_rcu_notrace - iterate over rcu list of given type (for tracing)
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the hlist_node within the struct.
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as hlist_add_head_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
+ *
+ * This is the same as hlist_for_each_entry_rcu() except that it does
+ * not do any RCU debugging or tracing.
+ */
+#define hlist_for_each_entry_rcu_notrace(pos, head, member)			\
+	for (pos = hlist_entry_safe (rcu_dereference_raw_notrace(hlist_first_rcu(head)),\
+			typeof(*(pos)), member);			\
+		pos;							\
+		pos = hlist_entry_safe(rcu_dereference_raw_notrace(hlist_next_rcu(\
+			&(pos)->member)), typeof(*(pos)), member))
+
 /**
  * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type
  * @pos:	the type * to use as a loop cursor.
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 4ccd68e49b00..ddcc7826d907 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -640,6 +640,15 @@ static inline void rcu_preempt_sleep_check(void)
 
 #define rcu_dereference_raw(p) rcu_dereference_check(p, 1) /*@@@ needed? @@@*/
 
+/*
+ * The tracing infrastructure traces RCU (we want that), but unfortunately
+ * some of the RCU checks causes tracing to lock up the system.
+ *
+ * The tracing version of rcu_dereference_raw() must not call
+ * rcu_read_lock_held().
+ */
+#define rcu_dereference_raw_notrace(p) __rcu_dereference_check((p), 1, __rcu)
+
 /**
  * rcu_access_index() - fetch RCU index with no dereferencing
  * @p: The index to read
-- 
cgit v1.3-8-gc7d7


From 1e2bd517c108816220f262d7954b697af03b5f9c Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Thu, 30 May 2013 06:45:27 +0000
Subject: udp6: Fix udp fragmentation for tunnel traffic.

udp6 over GRE tunnel does not work after to GRE tso changes. GRE
tso handler passes inner packet but keeps track of outer header
start in SKB_GSO_CB(skb)->mac_offset.  udp6 fragment need to
take care of outer header, which start at the mac_offset, while
adding fragment header.
This bug is introduced by commit 68c3316311 (GRE: Add TCP
segmentation offload for GRE).

Reported-by: Dmitry Kravkov <dkravkov@gmail.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Tested-by: Dmitry Kravkov <dmitry@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 15 +++++++++++++++
 net/ipv6/udp_offload.c | 20 ++++++++++++--------
 2 files changed, 27 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2e0ced1af3b1..9c676eae3968 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2852,6 +2852,21 @@ static inline int skb_tnl_header_len(const struct sk_buff *inner_skb)
 		SKB_GSO_CB(inner_skb)->mac_offset;
 }
 
+static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra)
+{
+	int new_headroom, headroom;
+	int ret;
+
+	headroom = skb_headroom(skb);
+	ret = pskb_expand_head(skb, extra, 0, GFP_ATOMIC);
+	if (ret)
+		return ret;
+
+	new_headroom = skb_headroom(skb);
+	SKB_GSO_CB(skb)->mac_offset += (new_headroom - headroom);
+	return 0;
+}
+
 static inline bool skb_is_gso(const struct sk_buff *skb)
 {
 	return skb_shinfo(skb)->gso_size;
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 3bb3a891a424..d3cfaf9c7a08 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -46,11 +46,12 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 	unsigned int mss;
 	unsigned int unfrag_ip6hlen, unfrag_len;
 	struct frag_hdr *fptr;
-	u8 *mac_start, *prevhdr;
+	u8 *packet_start, *prevhdr;
 	u8 nexthdr;
 	u8 frag_hdr_sz = sizeof(struct frag_hdr);
 	int offset;
 	__wsum csum;
+	int tnl_hlen;
 
 	mss = skb_shinfo(skb)->gso_size;
 	if (unlikely(skb->len <= mss))
@@ -83,9 +84,11 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 	skb->ip_summed = CHECKSUM_NONE;
 
 	/* Check if there is enough headroom to insert fragment header. */
-	if ((skb_mac_header(skb) < skb->head + frag_hdr_sz) &&
-	    pskb_expand_head(skb, frag_hdr_sz, 0, GFP_ATOMIC))
-		goto out;
+	tnl_hlen = skb_tnl_header_len(skb);
+	if (skb_headroom(skb) < (tnl_hlen + frag_hdr_sz)) {
+		if (gso_pskb_expand_head(skb, tnl_hlen + frag_hdr_sz))
+			goto out;
+	}
 
 	/* Find the unfragmentable header and shift it left by frag_hdr_sz
 	 * bytes to insert fragment header.
@@ -93,11 +96,12 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 	unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr);
 	nexthdr = *prevhdr;
 	*prevhdr = NEXTHDR_FRAGMENT;
-	unfrag_len = skb_network_header(skb) - skb_mac_header(skb) +
-		     unfrag_ip6hlen;
-	mac_start = skb_mac_header(skb);
-	memmove(mac_start-frag_hdr_sz, mac_start, unfrag_len);
+	unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) +
+		     unfrag_ip6hlen + tnl_hlen;
+	packet_start = (u8 *) skb->head + SKB_GSO_CB(skb)->mac_offset;
+	memmove(packet_start-frag_hdr_sz, packet_start, unfrag_len);
 
+	SKB_GSO_CB(skb)->mac_offset -= frag_hdr_sz;
 	skb->mac_header -= frag_hdr_sz;
 	skb->network_header -= frag_hdr_sz;
 
-- 
cgit v1.3-8-gc7d7


From e4c1721642bbd42d8142f4811cde0588c28db51d Mon Sep 17 00:00:00 2001
From: Paul Moore <pmoore@redhat.com>
Date: Wed, 29 May 2013 07:36:25 +0000
Subject: xfrm: force a garbage collection after deleting a policy

In some cases after deleting a policy from the SPD the policy would
remain in the dst/flow/route cache for an extended period of time
which caused problems for SELinux as its dynamic network access
controls key off of the number of XFRM policy and state entries.
This patch corrects this problem by forcing a XFRM garbage collection
whenever a policy is sucessfully removed.

Reported-by: Ondrej Moris <omoris@redhat.com>
Signed-off-by: Paul Moore <pmoore@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h     | 5 +++++
 net/key/af_key.c       | 4 ++++
 net/xfrm/xfrm_policy.c | 3 ++-
 net/xfrm/xfrm_user.c   | 2 ++
 4 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index ae16531d0d35..94ce082b29dc 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1160,6 +1160,8 @@ static inline void xfrm_sk_free_policy(struct sock *sk)
 	}
 }
 
+extern void xfrm_garbage_collect(struct net *net);
+
 #else
 
 static inline void xfrm_sk_free_policy(struct sock *sk) {}
@@ -1194,6 +1196,9 @@ static inline int xfrm6_policy_check_reverse(struct sock *sk, int dir,
 {
 	return 1;
 }
+static inline void xfrm_garbage_collect(struct net *net)
+{
+}
 #endif
 
 static __inline__
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 5b1e5af25713..c5fbd7589681 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -2366,6 +2366,8 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sa
 
 out:
 	xfrm_pol_put(xp);
+	if (err == 0)
+		xfrm_garbage_collect(net);
 	return err;
 }
 
@@ -2615,6 +2617,8 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, const struct sadb_
 
 out:
 	xfrm_pol_put(xp);
+	if (delete && err == 0)
+		xfrm_garbage_collect(net);
 	return err;
 }
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 23cea0f74336..ea970b8002a2 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2557,11 +2557,12 @@ static void __xfrm_garbage_collect(struct net *net)
 	}
 }
 
-static void xfrm_garbage_collect(struct net *net)
+void xfrm_garbage_collect(struct net *net)
 {
 	flow_cache_flush();
 	__xfrm_garbage_collect(net);
 }
+EXPORT_SYMBOL(xfrm_garbage_collect);
 
 static void xfrm_garbage_collect_deferred(struct net *net)
 {
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index aa778748c565..3f565e495ac6 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1681,6 +1681,8 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 out:
 	xfrm_pol_put(xp);
+	if (delete && err == 0)
+		xfrm_garbage_collect(net);
 	return err;
 }
 
-- 
cgit v1.3-8-gc7d7


From 6d7581e62f8be462440d7b22c6361f7c9fa4902b Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Wed, 29 May 2013 05:02:56 +0000
Subject: list: introduce list_first_entry_or_null

non-rcu variant of list_first_or_null_rcu

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/list.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/linux/list.h b/include/linux/list.h
index 6a1f8df9144b..b83e5657365a 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -361,6 +361,17 @@ static inline void list_splice_tail_init(struct list_head *list,
 #define list_first_entry(ptr, type, member) \
 	list_entry((ptr)->next, type, member)
 
+/**
+ * list_first_entry_or_null - get the first element from a list
+ * @ptr:	the list head to take the element from.
+ * @type:	the type of the struct this is embedded in.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Note that if the list is empty, it returns NULL.
+ */
+#define list_first_entry_or_null(ptr, type, member) \
+	(!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL)
+
 /**
  * list_for_each	-	iterate over a list
  * @pos:	the &struct list_head to use as a loop cursor.
-- 
cgit v1.3-8-gc7d7


From c87a124a5d5e8cf8e21c4363c3372bcaf53ea190 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 29 May 2013 09:06:27 +0000
Subject: net: force a reload of first item in hlist_nulls_for_each_entry_rcu

Roman Gushchin discovered that udp4_lib_lookup2() was not reloading
first item in the rcu protected list, in case the loop was restarted.

This produced soft lockups as in https://lkml.org/lkml/2013/4/16/37

rcu_dereference(X)/ACCESS_ONCE(X) seem to not work as intended if X is
ptr->field :

In some cases, gcc caches the value or ptr->field in a register.

Use a barrier() to disallow such caching, as documented in
Documentation/atomic_ops.txt line 114

Thanks a lot to Roman for providing analysis and numerous patches.

Diagnosed-by: Roman Gushchin <klamm@yandex-team.ru>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Boris Zhmurov <zhmurov@yandex-team.ru>
Signed-off-by: Roman Gushchin <klamm@yandex-team.ru>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rculist_nulls.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index 2ae13714828b..1c33dd7da4a7 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -105,9 +105,14 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
  * @head:	the head for your list.
  * @member:	the name of the hlist_nulls_node within the struct.
  *
+ * The barrier() is needed to make sure compiler doesn't cache first element [1],
+ * as this loop can be restarted [2]
+ * [1] Documentation/atomic_ops.txt around line 114
+ * [2] Documentation/RCU/rculist_nulls.txt around line 146
  */
 #define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member)			\
-	for (pos = rcu_dereference_raw(hlist_nulls_first_rcu(head));		\
+	for (({barrier();}),							\
+	     pos = rcu_dereference_raw(hlist_nulls_first_rcu(head));		\
 		(!is_a_nulls(pos)) &&						\
 		({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \
 		pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)))
-- 
cgit v1.3-8-gc7d7


From 01cb71d2d47b78354358e4bb938bb06323e17498 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 2 Jun 2013 13:55:05 +0000
Subject: net_sched: restore "overhead xxx" handling

commit 56b765b79 ("htb: improved accuracy at high rates")
broke the "overhead xxx" handling, as well as the "linklayer atm"
attribute.

tc class add ... htb rate X ceil Y linklayer atm overhead 10

This patch restores the "overhead xxx" handling, for htb, tbf
and act_police

The "linklayer atm" thing needs a separate fix.

Reported-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Vimalkumar <j.vimal@gmail.com>
Cc: Jiri Pirko <jpirko@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 18 +++++++++++-------
 net/sched/act_police.c    |  8 ++++----
 net/sched/sch_generic.c   |  8 +++++---
 net/sched/sch_htb.c       |  8 ++++----
 net/sched/sch_tbf.c       |  8 ++++----
 5 files changed, 28 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index f10818fc8804..e7f4e21cc3e1 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -679,22 +679,26 @@ static inline struct sk_buff *skb_act_clone(struct sk_buff *skb, gfp_t gfp_mask,
 #endif
 
 struct psched_ratecfg {
-	u64 rate_bps;
-	u32 mult;
-	u32 shift;
+	u64	rate_bps;
+	u32	mult;
+	u16	overhead;
+	u8	shift;
 };
 
 static inline u64 psched_l2t_ns(const struct psched_ratecfg *r,
 				unsigned int len)
 {
-	return ((u64)len * r->mult) >> r->shift;
+	return ((u64)(len + r->overhead) * r->mult) >> r->shift;
 }
 
-extern void psched_ratecfg_precompute(struct psched_ratecfg *r, u32 rate);
+extern void psched_ratecfg_precompute(struct psched_ratecfg *r, const struct tc_ratespec *conf);
 
-static inline u32 psched_ratecfg_getrate(const struct psched_ratecfg *r)
+static inline void psched_ratecfg_getrate(struct tc_ratespec *res,
+					  const struct psched_ratecfg *r)
 {
-	return r->rate_bps >> 3;
+	memset(res, 0, sizeof(*res));
+	res->rate = r->rate_bps >> 3;
+	res->overhead = r->overhead;
 }
 
 #endif
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 823463adbd21..189e3c5b3d09 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -231,14 +231,14 @@ override:
 	}
 	if (R_tab) {
 		police->rate_present = true;
-		psched_ratecfg_precompute(&police->rate, R_tab->rate.rate);
+		psched_ratecfg_precompute(&police->rate, &R_tab->rate);
 		qdisc_put_rtab(R_tab);
 	} else {
 		police->rate_present = false;
 	}
 	if (P_tab) {
 		police->peak_present = true;
-		psched_ratecfg_precompute(&police->peak, P_tab->rate.rate);
+		psched_ratecfg_precompute(&police->peak, &P_tab->rate);
 		qdisc_put_rtab(P_tab);
 	} else {
 		police->peak_present = false;
@@ -376,9 +376,9 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
 	};
 
 	if (police->rate_present)
-		opt.rate.rate = psched_ratecfg_getrate(&police->rate);
+		psched_ratecfg_getrate(&opt.rate, &police->rate);
 	if (police->peak_present)
-		opt.peakrate.rate = psched_ratecfg_getrate(&police->peak);
+		psched_ratecfg_getrate(&opt.peakrate, &police->peak);
 	if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt))
 		goto nla_put_failure;
 	if (police->tcfp_result &&
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index eac7e0ee23c1..20224086cc28 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -898,14 +898,16 @@ void dev_shutdown(struct net_device *dev)
 	WARN_ON(timer_pending(&dev->watchdog_timer));
 }
 
-void psched_ratecfg_precompute(struct psched_ratecfg *r, u32 rate)
+void psched_ratecfg_precompute(struct psched_ratecfg *r,
+			       const struct tc_ratespec *conf)
 {
 	u64 factor;
 	u64 mult;
 	int shift;
 
-	r->rate_bps = (u64)rate << 3;
-	r->shift = 0;
+	memset(r, 0, sizeof(*r));
+	r->overhead = conf->overhead;
+	r->rate_bps = (u64)conf->rate << 3;
 	r->mult = 1;
 	/*
 	 * Calibrate mult, shift so that token counting is accurate
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 79b1876b6cd2..f87fb850b7ef 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1090,9 +1090,9 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
 
 	memset(&opt, 0, sizeof(opt));
 
-	opt.rate.rate = psched_ratecfg_getrate(&cl->rate);
+	psched_ratecfg_getrate(&opt.rate, &cl->rate);
 	opt.buffer = PSCHED_NS2TICKS(cl->buffer);
-	opt.ceil.rate = psched_ratecfg_getrate(&cl->ceil);
+	psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
 	opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
 	opt.quantum = cl->quantum;
 	opt.prio = cl->prio;
@@ -1459,8 +1459,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 			cl->prio = TC_HTB_NUMPRIO - 1;
 	}
 
-	psched_ratecfg_precompute(&cl->rate, hopt->rate.rate);
-	psched_ratecfg_precompute(&cl->ceil, hopt->ceil.rate);
+	psched_ratecfg_precompute(&cl->rate, &hopt->rate);
+	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil);
 
 	cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
 	cl->cbuffer = PSCHED_TICKS2NS(hopt->buffer);
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index c8388f3c3426..e478d316602b 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -298,9 +298,9 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
 	q->tokens = q->buffer;
 	q->ptokens = q->mtu;
 
-	psched_ratecfg_precompute(&q->rate, rtab->rate.rate);
+	psched_ratecfg_precompute(&q->rate, &rtab->rate);
 	if (ptab) {
-		psched_ratecfg_precompute(&q->peak, ptab->rate.rate);
+		psched_ratecfg_precompute(&q->peak, &ptab->rate);
 		q->peak_present = true;
 	} else {
 		q->peak_present = false;
@@ -350,9 +350,9 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
 		goto nla_put_failure;
 
 	opt.limit = q->limit;
-	opt.rate.rate = psched_ratecfg_getrate(&q->rate);
+	psched_ratecfg_getrate(&opt.rate, &q->rate);
 	if (q->peak_present)
-		opt.peakrate.rate = psched_ratecfg_getrate(&q->peak);
+		psched_ratecfg_getrate(&opt.peakrate, &q->peak);
 	else
 		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
 	opt.mtu = PSCHED_NS2TICKS(q->mtu);
-- 
cgit v1.3-8-gc7d7


From 29eb77825cc7da8d45b642de2de3d423dc8a363f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 5 Jun 2013 12:26:50 +0200
Subject: arch, mm: Remove tlb_fast_mode()

Since the introduction of preemptible mmu_gather TLB fast mode has been
broken. TLB fast mode relies on there being absolutely no concurrency;
it frees pages first and invalidates TLBs later.

However now we can get concurrency and stuff goes *bang*.

This patch removes all tlb_fast_mode() code; it was found the better
option vs trying to patch the hole by entangling tlb invalidation with
the scheduler.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Tony Luck <tony.luck@intel.com>
Reported-by: Max Filippov <jcmvbkbc@gmail.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/include/asm/tlb.h  | 27 ++++-----------------------
 arch/ia64/include/asm/tlb.h | 41 ++++++++---------------------------------
 include/asm-generic/tlb.h   | 17 +----------------
 mm/memory.c                 |  9 ---------
 4 files changed, 13 insertions(+), 81 deletions(-)

(limited to 'include')

diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index 99a19512ee26..bdf2b8458ec1 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -33,18 +33,6 @@
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 
-/*
- * We need to delay page freeing for SMP as other CPUs can access pages
- * which have been removed but not yet had their TLB entries invalidated.
- * Also, as ARMv7 speculative prefetch can drag new entries into the TLB,
- * we need to apply this same delaying tactic to ensure correct operation.
- */
-#if defined(CONFIG_SMP) || defined(CONFIG_CPU_32v7)
-#define tlb_fast_mode(tlb)	0
-#else
-#define tlb_fast_mode(tlb)	1
-#endif
-
 #define MMU_GATHER_BUNDLE	8
 
 /*
@@ -112,12 +100,10 @@ static inline void __tlb_alloc_page(struct mmu_gather *tlb)
 static inline void tlb_flush_mmu(struct mmu_gather *tlb)
 {
 	tlb_flush(tlb);
-	if (!tlb_fast_mode(tlb)) {
-		free_pages_and_swap_cache(tlb->pages, tlb->nr);
-		tlb->nr = 0;
-		if (tlb->pages == tlb->local)
-			__tlb_alloc_page(tlb);
-	}
+	free_pages_and_swap_cache(tlb->pages, tlb->nr);
+	tlb->nr = 0;
+	if (tlb->pages == tlb->local)
+		__tlb_alloc_page(tlb);
 }
 
 static inline void
@@ -178,11 +164,6 @@ tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
 
 static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
 {
-	if (tlb_fast_mode(tlb)) {
-		free_page_and_swap_cache(page);
-		return 1; /* avoid calling tlb_flush_mmu */
-	}
-
 	tlb->pages[tlb->nr++] = page;
 	VM_BUG_ON(tlb->nr > tlb->max);
 	return tlb->max - tlb->nr;
diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h
index c3ffe3e54edc..ef3a9de01954 100644
--- a/arch/ia64/include/asm/tlb.h
+++ b/arch/ia64/include/asm/tlb.h
@@ -46,12 +46,6 @@
 #include <asm/tlbflush.h>
 #include <asm/machvec.h>
 
-#ifdef CONFIG_SMP
-# define tlb_fast_mode(tlb)	((tlb)->nr == ~0U)
-#else
-# define tlb_fast_mode(tlb)	(1)
-#endif
-
 /*
  * If we can't allocate a page to make a big batch of page pointers
  * to work on, then just handle a few from the on-stack structure.
@@ -60,7 +54,7 @@
 
 struct mmu_gather {
 	struct mm_struct	*mm;
-	unsigned int		nr;		/* == ~0U => fast mode */
+	unsigned int		nr;
 	unsigned int		max;
 	unsigned char		fullmm;		/* non-zero means full mm flush */
 	unsigned char		need_flush;	/* really unmapped some PTEs? */
@@ -103,6 +97,7 @@ extern struct ia64_tr_entry *ia64_idtrs[NR_CPUS];
 static inline void
 ia64_tlb_flush_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
+	unsigned long i;
 	unsigned int nr;
 
 	if (!tlb->need_flush)
@@ -141,13 +136,11 @@ ia64_tlb_flush_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long e
 
 	/* lastly, release the freed pages */
 	nr = tlb->nr;
-	if (!tlb_fast_mode(tlb)) {
-		unsigned long i;
-		tlb->nr = 0;
-		tlb->start_addr = ~0UL;
-		for (i = 0; i < nr; ++i)
-			free_page_and_swap_cache(tlb->pages[i]);
-	}
+
+	tlb->nr = 0;
+	tlb->start_addr = ~0UL;
+	for (i = 0; i < nr; ++i)
+		free_page_and_swap_cache(tlb->pages[i]);
 }
 
 static inline void __tlb_alloc_page(struct mmu_gather *tlb)
@@ -167,20 +160,7 @@ tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned int full_m
 	tlb->mm = mm;
 	tlb->max = ARRAY_SIZE(tlb->local);
 	tlb->pages = tlb->local;
-	/*
-	 * Use fast mode if only 1 CPU is online.
-	 *
-	 * It would be tempting to turn on fast-mode for full_mm_flush as well.  But this
-	 * doesn't work because of speculative accesses and software prefetching: the page
-	 * table of "mm" may (and usually is) the currently active page table and even
-	 * though the kernel won't do any user-space accesses during the TLB shoot down, a
-	 * compiler might use speculation or lfetch.fault on what happens to be a valid
-	 * user-space address.  This in turn could trigger a TLB miss fault (or a VHPT
-	 * walk) and re-insert a TLB entry we just removed.  Slow mode avoids such
-	 * problems.  (We could make fast-mode work by switching the current task to a
-	 * different "mm" during the shootdown.) --davidm 08/02/2002
-	 */
-	tlb->nr = (num_online_cpus() == 1) ? ~0U : 0;
+	tlb->nr = 0;
 	tlb->fullmm = full_mm_flush;
 	tlb->start_addr = ~0UL;
 }
@@ -214,11 +194,6 @@ static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
 {
 	tlb->need_flush = 1;
 
-	if (tlb_fast_mode(tlb)) {
-		free_page_and_swap_cache(page);
-		return 1; /* avoid calling tlb_flush_mmu */
-	}
-
 	if (!tlb->nr && tlb->pages == tlb->local)
 		__tlb_alloc_page(tlb);
 
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index b1b1fa6ffffe..13821c339a41 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -97,11 +97,9 @@ struct mmu_gather {
 	unsigned long		start;
 	unsigned long		end;
 	unsigned int		need_flush : 1,	/* Did free PTEs */
-				fast_mode  : 1; /* No batching   */
-
 	/* we are in the middle of an operation to clear
 	 * a full mm and can make some optimizations */
-	unsigned int		fullmm : 1,
+				fullmm : 1,
 	/* we have performed an operation which
 	 * requires a complete flush of the tlb */
 				need_flush_all : 1;
@@ -114,19 +112,6 @@ struct mmu_gather {
 
 #define HAVE_GENERIC_MMU_GATHER
 
-static inline int tlb_fast_mode(struct mmu_gather *tlb)
-{
-#ifdef CONFIG_SMP
-	return tlb->fast_mode;
-#else
-	/*
-	 * For UP we don't need to worry about TLB flush
-	 * and page free order so much..
-	 */
-	return 1;
-#endif
-}
-
 void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm);
 void tlb_flush_mmu(struct mmu_gather *tlb);
 void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start,
diff --git a/mm/memory.c b/mm/memory.c
index 6dc1882fbd72..61a262b08e53 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -220,7 +220,6 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
 	tlb->start	= -1UL;
 	tlb->end	= 0;
 	tlb->need_flush = 0;
-	tlb->fast_mode  = (num_possible_cpus() == 1);
 	tlb->local.next = NULL;
 	tlb->local.nr   = 0;
 	tlb->local.max  = ARRAY_SIZE(tlb->__pages);
@@ -244,9 +243,6 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
 	tlb_table_flush(tlb);
 #endif
 
-	if (tlb_fast_mode(tlb))
-		return;
-
 	for (batch = &tlb->local; batch; batch = batch->next) {
 		free_pages_and_swap_cache(batch->pages, batch->nr);
 		batch->nr = 0;
@@ -288,11 +284,6 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
 
 	VM_BUG_ON(!tlb->need_flush);
 
-	if (tlb_fast_mode(tlb)) {
-		free_page_and_swap_cache(page);
-		return 1; /* avoid calling tlb_flush_mmu() */
-	}
-
 	batch = tlb->active;
 	batch->pages[batch->nr++] = page;
 	if (batch->nr == batch->max) {
-- 
cgit v1.3-8-gc7d7


From a7526eb5d06b0084ef12d7b168d008fcf516caab Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@amacapital.net>
Date: Wed, 5 Jun 2013 19:38:26 +0000
Subject: net: Unbreak compat_sys_{send,recv}msg

I broke them in this commit:

    commit 1be374a0518a288147c6a7398792583200a67261
    Author: Andy Lutomirski <luto@amacapital.net>
    Date:   Wed May 22 14:07:44 2013 -0700

        net: Block MSG_CMSG_COMPAT in send(m)msg and recv(m)msg

This patch adds __sys_sendmsg and __sys_sendmsg as common helpers that accept
MSG_CMSG_COMPAT and blocks MSG_CMSG_COMPAT at the syscall entrypoints.  It
also reverts some unnecessary checks in sys_socketcall.

Apparently I was suffering from underscore blindness the first time around.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Tested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/socket.h |  3 +++
 net/compat.c           | 13 +++++++--
 net/socket.c           | 72 +++++++++++++++++++++++---------------------------
 3 files changed, 47 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 33bf2dfab19d..b10ce4b341ea 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -320,6 +320,9 @@ extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
 
 struct timespec;
 
+/* The __sys_...msg variants allow MSG_CMSG_COMPAT */
+extern long __sys_recvmsg(int fd, struct msghdr __user *msg, unsigned flags);
+extern long __sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags);
 extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
 			  unsigned int flags, struct timespec *timeout);
 extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg,
diff --git a/net/compat.c b/net/compat.c
index 79ae88485001..f0a1ba6c8086 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -734,19 +734,25 @@ static unsigned char nas[21] = {
 
 asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, unsigned int flags)
 {
-	return sys_sendmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT);
+	if (flags & MSG_CMSG_COMPAT)
+		return -EINVAL;
+	return __sys_sendmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT);
 }
 
 asmlinkage long compat_sys_sendmmsg(int fd, struct compat_mmsghdr __user *mmsg,
 				    unsigned int vlen, unsigned int flags)
 {
+	if (flags & MSG_CMSG_COMPAT)
+		return -EINVAL;
 	return __sys_sendmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
 			      flags | MSG_CMSG_COMPAT);
 }
 
 asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, unsigned int flags)
 {
-	return sys_recvmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT);
+	if (flags & MSG_CMSG_COMPAT)
+		return -EINVAL;
+	return __sys_recvmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT);
 }
 
 asmlinkage long compat_sys_recv(int fd, void __user *buf, size_t len, unsigned int flags)
@@ -768,6 +774,9 @@ asmlinkage long compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg,
 	int datagrams;
 	struct timespec ktspec;
 
+	if (flags & MSG_CMSG_COMPAT)
+		return -EINVAL;
+
 	if (COMPAT_USE_64BIT_TIME)
 		return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
 				      flags | MSG_CMSG_COMPAT,
diff --git a/net/socket.c b/net/socket.c
index 9ff6366fee13..4ca1526db756 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1956,7 +1956,7 @@ struct used_address {
 	unsigned int name_len;
 };
 
-static int __sys_sendmsg(struct socket *sock, struct msghdr __user *msg,
+static int ___sys_sendmsg(struct socket *sock, struct msghdr __user *msg,
 			 struct msghdr *msg_sys, unsigned int flags,
 			 struct used_address *used_address)
 {
@@ -2071,26 +2071,30 @@ out:
  *	BSD sendmsg interface
  */
 
-SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned int, flags)
+long __sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags)
 {
 	int fput_needed, err;
 	struct msghdr msg_sys;
 	struct socket *sock;
 
-	if (flags & MSG_CMSG_COMPAT)
-		return -EINVAL;
-
 	sock = sockfd_lookup_light(fd, &err, &fput_needed);
 	if (!sock)
 		goto out;
 
-	err = __sys_sendmsg(sock, msg, &msg_sys, flags, NULL);
+	err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL);
 
 	fput_light(sock->file, fput_needed);
 out:
 	return err;
 }
 
+SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned int, flags)
+{
+	if (flags & MSG_CMSG_COMPAT)
+		return -EINVAL;
+	return __sys_sendmsg(fd, msg, flags);
+}
+
 /*
  *	Linux sendmmsg interface
  */
@@ -2121,15 +2125,16 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
 
 	while (datagrams < vlen) {
 		if (MSG_CMSG_COMPAT & flags) {
-			err = __sys_sendmsg(sock, (struct msghdr __user *)compat_entry,
-					    &msg_sys, flags, &used_address);
+			err = ___sys_sendmsg(sock, (struct msghdr __user *)compat_entry,
+					     &msg_sys, flags, &used_address);
 			if (err < 0)
 				break;
 			err = __put_user(err, &compat_entry->msg_len);
 			++compat_entry;
 		} else {
-			err = __sys_sendmsg(sock, (struct msghdr __user *)entry,
-					    &msg_sys, flags, &used_address);
+			err = ___sys_sendmsg(sock,
+					     (struct msghdr __user *)entry,
+					     &msg_sys, flags, &used_address);
 			if (err < 0)
 				break;
 			err = put_user(err, &entry->msg_len);
@@ -2158,7 +2163,7 @@ SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg,
 	return __sys_sendmmsg(fd, mmsg, vlen, flags);
 }
 
-static int __sys_recvmsg(struct socket *sock, struct msghdr __user *msg,
+static int ___sys_recvmsg(struct socket *sock, struct msghdr __user *msg,
 			 struct msghdr *msg_sys, unsigned int flags, int nosec)
 {
 	struct compat_msghdr __user *msg_compat =
@@ -2250,27 +2255,31 @@ out:
  *	BSD recvmsg interface
  */
 
-SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
-		unsigned int, flags)
+long __sys_recvmsg(int fd, struct msghdr __user *msg, unsigned flags)
 {
 	int fput_needed, err;
 	struct msghdr msg_sys;
 	struct socket *sock;
 
-	if (flags & MSG_CMSG_COMPAT)
-		return -EINVAL;
-
 	sock = sockfd_lookup_light(fd, &err, &fput_needed);
 	if (!sock)
 		goto out;
 
-	err = __sys_recvmsg(sock, msg, &msg_sys, flags, 0);
+	err = ___sys_recvmsg(sock, msg, &msg_sys, flags, 0);
 
 	fput_light(sock->file, fput_needed);
 out:
 	return err;
 }
 
+SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
+		unsigned int, flags)
+{
+	if (flags & MSG_CMSG_COMPAT)
+		return -EINVAL;
+	return __sys_recvmsg(fd, msg, flags);
+}
+
 /*
  *     Linux recvmmsg interface
  */
@@ -2308,17 +2317,18 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
 		 * No need to ask LSM for more than the first datagram.
 		 */
 		if (MSG_CMSG_COMPAT & flags) {
-			err = __sys_recvmsg(sock, (struct msghdr __user *)compat_entry,
-					    &msg_sys, flags & ~MSG_WAITFORONE,
-					    datagrams);
+			err = ___sys_recvmsg(sock, (struct msghdr __user *)compat_entry,
+					     &msg_sys, flags & ~MSG_WAITFORONE,
+					     datagrams);
 			if (err < 0)
 				break;
 			err = __put_user(err, &compat_entry->msg_len);
 			++compat_entry;
 		} else {
-			err = __sys_recvmsg(sock, (struct msghdr __user *)entry,
-					    &msg_sys, flags & ~MSG_WAITFORONE,
-					    datagrams);
+			err = ___sys_recvmsg(sock,
+					     (struct msghdr __user *)entry,
+					     &msg_sys, flags & ~MSG_WAITFORONE,
+					     datagrams);
 			if (err < 0)
 				break;
 			err = put_user(err, &entry->msg_len);
@@ -2505,31 +2515,15 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
 				   (int __user *)a[4]);
 		break;
 	case SYS_SENDMSG:
-		if (a[2] & MSG_CMSG_COMPAT) {
-			err = -EINVAL;
-			break;
-		}
 		err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]);
 		break;
 	case SYS_SENDMMSG:
-		if (a[3] & MSG_CMSG_COMPAT) {
-			err = -EINVAL;
-			break;
-		}
 		err = sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3]);
 		break;
 	case SYS_RECVMSG:
-		if (a[2] & MSG_CMSG_COMPAT) {
-			err = -EINVAL;
-			break;
-		}
 		err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]);
 		break;
 	case SYS_RECVMMSG:
-		if (a[3] & MSG_CMSG_COMPAT) {
-			err = -EINVAL;
-			break;
-		}
 		err = sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3],
 				   (struct timespec __user *)a[4]);
 		break;
-- 
cgit v1.3-8-gc7d7