From 38516ab59fbc5b3bb278cf5e1fe2867c70cff32e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 20 Apr 2010 17:04:50 -0400
Subject: tracing: Let tracepoints have data passed to tracepoint callbacks

This patch adds data to be passed to tracepoint callbacks.

The created functions from DECLARE_TRACE() now need a mandatory data
parameter. For example:

DECLARE_TRACE(mytracepoint, int value, value)

Will create the register function:

int register_trace_mytracepoint((void(*)(void *data, int value))probe,
                                void *data);

As the first argument, all callbacks (probes) must take a (void *data)
parameter. So a callback for the above tracepoint will look like:

void myprobe(void *data, int value)
{
}

The callback may choose to ignore the data parameter.

This change allows callbacks to register a private data pointer along
with the function probe.

	void mycallback(void *data, int value);

	register_trace_mytracepoint(mycallback, mydata);

Then the mycallback() will receive the "mydata" as the first parameter
before the args.

A more detailed example:

  DECLARE_TRACE(mytracepoint, TP_PROTO(int status), TP_ARGS(status));

  /* In the C file */

  DEFINE_TRACE(mytracepoint, TP_PROTO(int status), TP_ARGS(status));

  [...]

       trace_mytracepoint(status);

  /* In a file registering this tracepoint */

  int my_callback(void *data, int status)
  {
	struct my_struct my_data = data;
	[...]
  }

  [...]
	my_data = kmalloc(sizeof(*my_data), GFP_KERNEL);
	init_my_data(my_data);
	register_trace_mytracepoint(my_callback, my_data);

The same callback can also be registered to the same tracepoint as long
as the data registered is different. Note, the data must also be used
to unregister the callback:

	unregister_trace_mytracepoint(my_callback, my_data);

Because of the data parameter, tracepoints declared this way can not have
no args. That is:

  DECLARE_TRACE(mytracepoint, TP_PROTO(void), TP_ARGS());

will cause an error.

If no arguments are needed, a new macro can be used instead:

  DECLARE_TRACE_NOARGS(mytracepoint);

Since there are no arguments, the proto and args fields are left out.

This is part of a series to make the tracepoint footprint smaller:

   text	   data	    bss	    dec	    hex	filename
4913961	1088356	 861512	6863829	 68bbd5	vmlinux.orig
4914025	1088868	 861512	6864405	 68be15	vmlinux.class
4918492	1084612	 861512	6864616	 68bee8	vmlinux.tracepoint

Again, this patch also increases the size of the kernel, but
lays the ground work for decreasing it.

 v5: Fixed net/core/drop_monitor.c to handle these updates.

 v4: Moved the DECLARE_TRACE() DECLARE_TRACE_NOARGS out of the
     #ifdef CONFIG_TRACE_POINTS, since the two are the same in both
     cases. The __DECLARE_TRACE() is what changes.
     Thanks to Frederic Weisbecker for pointing this out.

 v3: Made all register_* functions require data to be passed and
     all callbacks to take a void * parameter as its first argument.
     This makes the calling functions comply with C standards.

     Also added more comments to the modifications of DECLARE_TRACE().

 v2: Made the DECLARE_TRACE() have the ability to pass arguments
     and added a new DECLARE_TRACE_NOARGS() for tracepoints that
     do not need any arguments.

Acked-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 net/core/drop_monitor.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'net/core')

diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index cf208d8042b1..ad41529fb60f 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -172,12 +172,12 @@ out:
 	return;
 }
 
-static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
+static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location)
 {
 	trace_drop_common(skb, location);
 }
 
-static void trace_napi_poll_hit(struct napi_struct *napi)
+static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi)
 {
 	struct dm_hw_stat_delta *new_stat;
 
@@ -225,12 +225,12 @@ static int set_all_monitor_traces(int state)
 
 	switch (state) {
 	case TRACE_ON:
-		rc |= register_trace_kfree_skb(trace_kfree_skb_hit);
-		rc |= register_trace_napi_poll(trace_napi_poll_hit);
+		rc |= register_trace_kfree_skb(trace_kfree_skb_hit, NULL);
+		rc |= register_trace_napi_poll(trace_napi_poll_hit, NULL);
 		break;
 	case TRACE_OFF:
-		rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit);
-		rc |= unregister_trace_napi_poll(trace_napi_poll_hit);
+		rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit, NULL);
+		rc |= unregister_trace_napi_poll(trace_napi_poll_hit, NULL);
 
 		tracepoint_synchronize_unregister();
 
-- 
cgit v1.3-8-gc7d7


From 622e0ca1cd4d459f5af4f2c65f4dc0dd823cb4c3 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 20 May 2010 23:07:56 -0700
Subject: gro: Fix bogus gso_size on the first fraglist entry

When GRO produces fraglist entries, and the resulting skb hits
an interface that is incapable of TSO but capable of FRAGLIST,
we end up producing a bogus packet with gso_size non-zero.

This was reported in the field with older versions of KVM that
did not set the TSO bits on tuntap.

This patch fixes that.

Reported-by: Igor Zhang <yugzhang@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c543dd252433..4c11000a96aa 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2718,6 +2718,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 	*NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p);
 	skb_shinfo(nskb)->frag_list = p;
 	skb_shinfo(nskb)->gso_size = pinfo->gso_size;
+	pinfo->gso_size = 0;
 	skb_header_release(p);
 	nskb->prev = p;
 
-- 
cgit v1.3-8-gc7d7


From 76cc8b13a6e41b537fd262b600da1571314add62 Mon Sep 17 00:00:00 2001
From: Tom Herbert <therbert@google.com>
Date: Thu, 20 May 2010 18:37:59 +0000
Subject: net: fix problem in dequeuing from input_pkt_queue

Fix some issues introduced in batch skb dequeuing for input_pkt_queue.
The primary issue it that the queue head must be incremented only
after a packet has been processed, that is only after
__netif_receive_skb has been called.  This is needed for the mechanism
to prevent OOO packet in RFS.  Also when flushing the input_pkt_queue
and process_queue, the process queue should be done first to prevent
OOO packets.

Because the input_pkt_queue has been effectively split into two queues,
the calculation of the tail ptr is no longer correct.  The correct value
would be head+input_pkt_queue->len+process_queue->len.  To avoid
this calculation we added an explict input_queue_tail in softnet_data.
The tail value is simply incremented when queuing to input_pkt_queue.

Signed-off-by: Tom Herbert <therbert@google.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 14 +++++++++++---
 net/core/dev.c            | 28 +++++++++++++++-------------
 2 files changed, 26 insertions(+), 16 deletions(-)

(limited to 'net/core')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a1bff6518166..256419583d9d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1407,17 +1407,25 @@ struct softnet_data {
 	struct softnet_data	*rps_ipi_next;
 	unsigned int		cpu;
 	unsigned int		input_queue_head;
+	unsigned int		input_queue_tail;
 #endif
 	unsigned		dropped;
 	struct sk_buff_head	input_pkt_queue;
 	struct napi_struct	backlog;
 };
 
-static inline void input_queue_head_add(struct softnet_data *sd,
-					unsigned int len)
+static inline void input_queue_head_incr(struct softnet_data *sd)
 {
 #ifdef CONFIG_RPS
-	sd->input_queue_head += len;
+	sd->input_queue_head++;
+#endif
+}
+
+static inline void input_queue_tail_incr_save(struct softnet_data *sd,
+					      unsigned int *qtail)
+{
+#ifdef CONFIG_RPS
+	*qtail = ++sd->input_queue_tail;
 #endif
 }
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 6c820650b80f..0aab66d68b19 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2426,10 +2426,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 		if (skb_queue_len(&sd->input_pkt_queue)) {
 enqueue:
 			__skb_queue_tail(&sd->input_pkt_queue, skb);
-#ifdef CONFIG_RPS
-			*qtail = sd->input_queue_head +
-					skb_queue_len(&sd->input_pkt_queue);
-#endif
+			input_queue_tail_incr_save(sd, qtail);
 			rps_unlock(sd);
 			local_irq_restore(flags);
 			return NET_RX_SUCCESS;
@@ -2964,7 +2961,7 @@ static void flush_backlog(void *arg)
 		if (skb->dev == dev) {
 			__skb_unlink(skb, &sd->input_pkt_queue);
 			kfree_skb(skb);
-			input_queue_head_add(sd, 1);
+			input_queue_head_incr(sd);
 		}
 	}
 	rps_unlock(sd);
@@ -2973,6 +2970,7 @@ static void flush_backlog(void *arg)
 		if (skb->dev == dev) {
 			__skb_unlink(skb, &sd->process_queue);
 			kfree_skb(skb);
+			input_queue_head_incr(sd);
 		}
 	}
 }
@@ -3328,18 +3326,20 @@ static int process_backlog(struct napi_struct *napi, int quota)
 		while ((skb = __skb_dequeue(&sd->process_queue))) {
 			local_irq_enable();
 			__netif_receive_skb(skb);
-			if (++work >= quota)
-				return work;
 			local_irq_disable();
+			input_queue_head_incr(sd);
+			if (++work >= quota) {
+				local_irq_enable();
+				return work;
+			}
 		}
 
 		rps_lock(sd);
 		qlen = skb_queue_len(&sd->input_pkt_queue);
-		if (qlen) {
-			input_queue_head_add(sd, qlen);
+		if (qlen)
 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
 						   &sd->process_queue);
-		}
+
 		if (qlen < quota - work) {
 			/*
 			 * Inline a custom version of __napi_complete().
@@ -5679,12 +5679,14 @@ static int dev_cpu_callback(struct notifier_block *nfb,
 	local_irq_enable();
 
 	/* Process offline CPU's input_pkt_queue */
-	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
+	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
 		netif_rx(skb);
-		input_queue_head_add(oldsd, 1);
+		input_queue_head_incr(oldsd);
 	}
-	while ((skb = __skb_dequeue(&oldsd->process_queue)))
+	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
 		netif_rx(skb);
+		input_queue_head_incr(oldsd);
+	}
 
 	return NOTIFY_OK;
 }
-- 
cgit v1.3-8-gc7d7


From 253683bbfb6bc5864417c8c35cb6ef13b5e259e6 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 21 May 2010 02:25:27 +0000
Subject: rtnetlink: Fix error handling in do_setlink()

Commit c02db8c6290bb992442fec1407643c94cc414375:

	Author:  Chris Wright <chrisw@sous-sol.org>
	Date:    Sun May 16 01:05:45 2010 -0700
	Subject: rtnetlink: make SR-IOV VF interface symmetric

adds broken error handling to do_setlink() in net/core/rtnetlink.c.  The
problem is the following chunk of code:

	if (tb[IFLA_VFINFO_LIST]) {
		struct nlattr *attr;
		int rem;
		nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) {
			if (nla_type(attr) != IFLA_VF_INFO)
  ---->				goto errout;
			err = do_setvfinfo(dev, attr);
			if (err < 0)
				goto errout;
			modified = 1;
		}
	}

which can get to errout without setting err, resulting in the following error:

net/core/rtnetlink.c: In function 'do_setlink':
net/core/rtnetlink.c:904: warning: 'err' may be used uninitialized in this function

Change the code to return -EINVAL in this case.  Note that this might not be
the appropriate error though.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Chris Wright <chrisw@sous-sol.org>
cc: David S. Miller <davem@davemloft.net>
Acked-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e4b9870e4706..7ab86f3a1ea4 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1199,8 +1199,10 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
 		struct nlattr *attr;
 		int rem;
 		nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) {
-			if (nla_type(attr) != IFLA_VF_INFO)
+			if (nla_type(attr) != IFLA_VF_INFO) {
+				err = -EINVAL;
 				goto errout;
+			}
 			err = do_setvfinfo(dev, attr);
 			if (err < 0)
 				goto errout;
-- 
cgit v1.3-8-gc7d7


From 8ce6cebc2f126f3ecf2d80746ea54245adf18057 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@free.fr>
Date: Wed, 19 May 2010 10:12:19 +0000
Subject: net-2.6 : V2 - fix dev_get_valid_name

the commit:

commit d90310243fd750240755e217c5faa13e24f41536
Author: Octavian Purdila <opurdila@ixiacom.com>
Date:   Wed Nov 18 02:36:59 2009 +0000

    net: device name allocation cleanups

introduced a bug when there is a hash collision making impossible
to rename a device with eth%d. This bug is very hard to reproduce
and appears rarely.

The problem is coming from we don't pass a temporary buffer to
__dev_alloc_name but 'dev->name' which is modified by the function.

A detailed explanation is here:

http://marc.info/?l=linux-netdev&m=127417784011987&w=2

Changelog:
 V2 : replaced strings comparison by pointers comparison

Signed-off-by: Daniel Lezcano <daniel.lezcano@free.fr>
Reviewed-by: Octavian Purdila <opurdila@ixiacom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 0aab66d68b19..07a48e2bf7db 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -954,18 +954,22 @@ int dev_alloc_name(struct net_device *dev, const char *name)
 }
 EXPORT_SYMBOL(dev_alloc_name);
 
-static int dev_get_valid_name(struct net *net, const char *name, char *buf,
-			      bool fmt)
+static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
 {
+	struct net *net;
+
+	BUG_ON(!dev_net(dev));
+	net = dev_net(dev);
+
 	if (!dev_valid_name(name))
 		return -EINVAL;
 
 	if (fmt && strchr(name, '%'))
-		return __dev_alloc_name(net, name, buf);
+		return dev_alloc_name(dev, name);
 	else if (__dev_get_by_name(net, name))
 		return -EEXIST;
-	else if (buf != name)
-		strlcpy(buf, name, IFNAMSIZ);
+	else if (dev->name != name)
+		strlcpy(dev->name, name, IFNAMSIZ);
 
 	return 0;
 }
@@ -997,7 +1001,7 @@ int dev_change_name(struct net_device *dev, const char *newname)
 
 	memcpy(oldname, dev->name, IFNAMSIZ);
 
-	err = dev_get_valid_name(net, newname, dev->name, 1);
+	err = dev_get_valid_name(dev, newname, 1);
 	if (err < 0)
 		return err;
 
@@ -4965,7 +4969,7 @@ int register_netdevice(struct net_device *dev)
 		}
 	}
 
-	ret = dev_get_valid_name(net, dev->name, dev->name, 0);
+	ret = dev_get_valid_name(dev, dev->name, 0);
 	if (ret)
 		goto err_uninit;
 
@@ -5574,7 +5578,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 		/* We get here if we can't use the current device name */
 		if (!pat)
 			goto out;
-		if (dev_get_valid_name(net, pat, dev->name, 1))
+		if (dev_get_valid_name(dev, pat, 1))
 			goto out;
 	}
 
-- 
cgit v1.3-8-gc7d7


From f845172531fb7410c7fb7780b1a6e51ee6df7d52 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 24 May 2010 00:12:34 -0700
Subject: cls_cgroup: Store classid in struct sock

Up until now cls_cgroup has relied on fetching the classid out of
the current executing thread.  This runs into trouble when a packet
processing is delayed in which case it may execute out of another
thread's context.

Furthermore, even when a packet is not delayed we may fail to
classify it if soft IRQs have been disabled, because this scenario
is indistinguishable from one where a packet unrelated to the
current thread is processed by a real soft IRQ.

In fact, the current semantics is inherently broken, as a single
skb may be constructed out of the writes of two different tasks.
A different manifestation of this problem is when the TCP stack
transmits in response of an incoming ACK.  This is currently
unclassified.

As we already have a concept of packet ownership for accounting
purposes in the skb->sk pointer, this is a natural place to store
the classid in a persistent manner.

This patch adds the cls_cgroup classid in struct sock, filling up
an existing hole on 64-bit :)

The value is set at socket creation time.  So all sockets created
via socket(2) automatically gains the ID of the thread creating it.
Whenever another process touches the socket by either reading or
writing to it, we will change the socket classid to that of the
process if it has a valid (non-zero) classid.

For sockets created on inbound connections through accept(2), we
inherit the classid of the original listening socket through
sk_clone, possibly preceding the actual accept(2) call.

In order to minimise risks, I have not made this the authoritative
classid.  For now it is only used as a backup when we execute
with soft IRQs disabled.  Once we're completely happy with its
semantics we can use it as the sole classid.

Footnote: I have rearranged the error path on cls_group module
creation.  If we didn't do this, then there is a window where
someone could create a tc rule using cls_group before the cgroup
subsystem has been registered.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/cls_cgroup.h | 63 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/net/sock.h       | 10 +++++++-
 net/core/sock.c          | 18 ++++++++++++++
 net/sched/cls_cgroup.c   | 50 ++++++++++++++++++++++++++------------
 net/socket.c             |  9 +++++++
 5 files changed, 133 insertions(+), 17 deletions(-)
 create mode 100644 include/net/cls_cgroup.h

(limited to 'net/core')

diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h
new file mode 100644
index 000000000000..ef2df1475b51
--- /dev/null
+++ b/include/net/cls_cgroup.h
@@ -0,0 +1,63 @@
+/*
+ * cls_cgroup.h			Control Group Classifier
+ *
+ * Authors:	Thomas Graf <tgraf@suug.ch>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#ifndef _NET_CLS_CGROUP_H
+#define _NET_CLS_CGROUP_H
+
+#include <linux/cgroup.h>
+#include <linux/hardirq.h>
+#include <linux/rcupdate.h>
+
+#ifdef CONFIG_CGROUPS
+struct cgroup_cls_state
+{
+	struct cgroup_subsys_state css;
+	u32 classid;
+};
+
+#ifdef CONFIG_NET_CLS_CGROUP
+static inline u32 task_cls_classid(struct task_struct *p)
+{
+	if (in_interrupt())
+		return 0;
+
+	return container_of(task_subsys_state(p, net_cls_subsys_id),
+			    struct cgroup_cls_state, css).classid;
+}
+#else
+extern int net_cls_subsys_id;
+
+static inline u32 task_cls_classid(struct task_struct *p)
+{
+	int id;
+	u32 classid;
+
+	if (in_interrupt())
+		return 0;
+
+	rcu_read_lock();
+	id = rcu_dereference(net_cls_subsys_id);
+	if (id >= 0)
+		classid = container_of(task_subsys_state(p, id),
+				       struct cgroup_cls_state, css)->classid;
+	rcu_read_unlock();
+
+	return classid;
+}
+#endif
+#else
+static inline u32 task_cls_classid(struct task_struct *p)
+{
+	return 0;
+}
+#endif
+#endif  /* _NET_CLS_CGROUP_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index 5697caf8cc76..d24f382cb712 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -312,7 +312,7 @@ struct sock {
 	void			*sk_security;
 #endif
 	__u32			sk_mark;
-	/* XXX 4 bytes hole on 64 bit */
+	u32			sk_classid;
 	void			(*sk_state_change)(struct sock *sk);
 	void			(*sk_data_ready)(struct sock *sk, int bytes);
 	void			(*sk_write_space)(struct sock *sk);
@@ -1074,6 +1074,14 @@ extern void *sock_kmalloc(struct sock *sk, int size,
 extern void sock_kfree_s(struct sock *sk, void *mem, int size);
 extern void sk_send_sigurg(struct sock *sk);
 
+#ifdef CONFIG_CGROUPS
+extern void sock_update_classid(struct sock *sk);
+#else
+static inline void sock_update_classid(struct sock *sk)
+{
+}
+#endif
+
 /*
  * Functions to fill in entries in struct proto_ops when a protocol
  * does not implement a particular function.
diff --git a/net/core/sock.c b/net/core/sock.c
index bf88a167c8f2..a05ae7f9771e 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -123,6 +123,7 @@
 #include <linux/net_tstamp.h>
 #include <net/xfrm.h>
 #include <linux/ipsec.h>
+#include <net/cls_cgroup.h>
 
 #include <linux/filter.h>
 
@@ -217,6 +218,11 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 EXPORT_SYMBOL(sysctl_optmem_max);
 
+#if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP)
+int net_cls_subsys_id = -1;
+EXPORT_SYMBOL_GPL(net_cls_subsys_id);
+#endif
+
 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 {
 	struct timeval tv;
@@ -1050,6 +1056,16 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
 	module_put(owner);
 }
 
+#ifdef CONFIG_CGROUPS
+void sock_update_classid(struct sock *sk)
+{
+	u32 classid = task_cls_classid(current);
+
+	if (classid && classid != sk->sk_classid)
+		sk->sk_classid = classid;
+}
+#endif
+
 /**
  *	sk_alloc - All socket objects are allocated here
  *	@net: the applicable net namespace
@@ -1073,6 +1089,8 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		sock_lock_init(sk);
 		sock_net_set(sk, get_net(net));
 		atomic_set(&sk->sk_wmem_alloc, 1);
+
+		sock_update_classid(sk);
 	}
 
 	return sk;
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 221180384fd7..78ef2c5e130b 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -16,14 +16,11 @@
 #include <linux/errno.h>
 #include <linux/skbuff.h>
 #include <linux/cgroup.h>
+#include <linux/rcupdate.h>
 #include <net/rtnetlink.h>
 #include <net/pkt_cls.h>
-
-struct cgroup_cls_state
-{
-	struct cgroup_subsys_state css;
-	u32 classid;
-};
+#include <net/sock.h>
+#include <net/cls_cgroup.h>
 
 static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
 					       struct cgroup *cgrp);
@@ -112,6 +109,10 @@ static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp,
 	struct cls_cgroup_head *head = tp->root;
 	u32 classid;
 
+	rcu_read_lock();
+	classid = task_cls_state(current)->classid;
+	rcu_read_unlock();
+
 	/*
 	 * Due to the nature of the classifier it is required to ignore all
 	 * packets originating from softirq context as accessing `current'
@@ -122,12 +123,12 @@ static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp,
 	 * calls by looking at the number of nested bh disable calls because
 	 * softirqs always disables bh.
 	 */
-	if (softirq_count() != SOFTIRQ_OFFSET)
-		return -1;
-
-	rcu_read_lock();
-	classid = task_cls_state(current)->classid;
-	rcu_read_unlock();
+	if (softirq_count() != SOFTIRQ_OFFSET) {
+		/* If there is an sk_classid we'll use that. */
+		if (!skb->sk)
+			return -1;
+		classid = skb->sk->sk_classid;
+	}
 
 	if (!classid)
 		return -1;
@@ -289,18 +290,35 @@ static struct tcf_proto_ops cls_cgroup_ops __read_mostly = {
 
 static int __init init_cgroup_cls(void)
 {
-	int ret = register_tcf_proto_ops(&cls_cgroup_ops);
-	if (ret)
-		return ret;
+	int ret;
+
 	ret = cgroup_load_subsys(&net_cls_subsys);
 	if (ret)
-		unregister_tcf_proto_ops(&cls_cgroup_ops);
+		goto out;
+
+#ifndef CONFIG_NET_CLS_CGROUP
+	/* We can't use rcu_assign_pointer because this is an int. */
+	smp_wmb();
+	net_cls_subsys_id = net_cls_subsys.subsys_id;
+#endif
+
+	ret = register_tcf_proto_ops(&cls_cgroup_ops);
+	if (ret)
+		cgroup_unload_subsys(&net_cls_subsys);
+
+out:
 	return ret;
 }
 
 static void __exit exit_cgroup_cls(void)
 {
 	unregister_tcf_proto_ops(&cls_cgroup_ops);
+
+#ifndef CONFIG_NET_CLS_CGROUP
+	net_cls_subsys_id = -1;
+	synchronize_rcu();
+#endif
+
 	cgroup_unload_subsys(&net_cls_subsys);
 }
 
diff --git a/net/socket.c b/net/socket.c
index f9f7d0872cac..367d5477d00f 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -94,6 +94,7 @@
 
 #include <net/compat.h>
 #include <net/wext.h>
+#include <net/cls_cgroup.h>
 
 #include <net/sock.h>
 #include <linux/netfilter.h>
@@ -558,6 +559,8 @@ static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock,
 	struct sock_iocb *si = kiocb_to_siocb(iocb);
 	int err;
 
+	sock_update_classid(sock->sk);
+
 	si->sock = sock;
 	si->scm = NULL;
 	si->msg = msg;
@@ -684,6 +687,8 @@ static inline int __sock_recvmsg_nosec(struct kiocb *iocb, struct socket *sock,
 {
 	struct sock_iocb *si = kiocb_to_siocb(iocb);
 
+	sock_update_classid(sock->sk);
+
 	si->sock = sock;
 	si->scm = NULL;
 	si->msg = msg;
@@ -777,6 +782,8 @@ static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
 	if (unlikely(!sock->ops->splice_read))
 		return -EINVAL;
 
+	sock_update_classid(sock->sk);
+
 	return sock->ops->splice_read(sock, ppos, pipe, len, flags);
 }
 
@@ -3069,6 +3076,8 @@ int kernel_setsockopt(struct socket *sock, int level, int optname,
 int kernel_sendpage(struct socket *sock, struct page *page, int offset,
 		    size_t size, int flags)
 {
+	sock_update_classid(sock->sk);
+
 	if (sock->ops->sendpage)
 		return sock->ops->sendpage(sock, page, offset, size, flags);
 
-- 
cgit v1.3-8-gc7d7


From 8286274284e15b11b0f531b6ceeef21fbe00a8dd Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 24 May 2010 00:14:10 -0700
Subject: tun: Update classid on packet injection

This patch makes tun update its socket classid every time we
inject a packet into the network stack.  This is so that any
updates made by the admin to the process writing packets to
tun is effected.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c | 2 ++
 net/core/sock.c   | 1 +
 2 files changed, 3 insertions(+)

(limited to 'net/core')

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 97b25533e5fb..8793c2bf7f15 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -526,6 +526,8 @@ static inline struct sk_buff *tun_alloc_skb(struct tun_struct *tun,
 	struct sk_buff *skb;
 	int err;
 
+	sock_update_classid(sk);
+
 	/* Under a page?  Don't bother with paged skb. */
 	if (prepad + len < PAGE_SIZE || !linear)
 		linear = len;
diff --git a/net/core/sock.c b/net/core/sock.c
index a05ae7f9771e..37fe9b6adade 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1064,6 +1064,7 @@ void sock_update_classid(struct sock *sk)
 	if (classid && classid != sk->sk_classid)
 		sk->sk_classid = classid;
 }
+EXPORT_SYMBOL(sock_update_classid);
 #endif
 
 /**
-- 
cgit v1.3-8-gc7d7


From 8a74ad60a546b13bd1096b2a61a7a5c6fd9ae17c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 26 May 2010 19:20:18 +0000
Subject: net: fix lock_sock_bh/unlock_sock_bh

This new sock lock primitive was introduced to speedup some user context
socket manipulation. But it is unsafe to protect two threads, one using
regular lock_sock/release_sock, one using lock_sock_bh/unlock_sock_bh

This patch changes lock_sock_bh to be careful against 'owned' state.
If owned is found to be set, we must take the slow path.
lock_sock_bh() now returns a boolean to say if the slow path was taken,
and this boolean is used at unlock_sock_bh time to call the appropriate
unlock function.

After this change, BH are either disabled or enabled during the
lock_sock_bh/unlock_sock_bh protected section. This might be misleading,
so we rename these functions to lock_sock_fast()/unlock_sock_fast().

Reported-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Tested-by: Anton Blanchard <anton@samba.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h  | 20 ++++++++++++++------
 net/core/datagram.c |  6 ++++--
 net/core/sock.c     | 33 +++++++++++++++++++++++++++++++++
 net/ipv4/udp.c      | 14 ++++++++------
 net/ipv6/udp.c      |  5 +++--
 5 files changed, 62 insertions(+), 16 deletions(-)

(limited to 'net/core')

diff --git a/include/net/sock.h b/include/net/sock.h
index d2a71b04a5ae..ca241ea14875 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1026,15 +1026,23 @@ extern void release_sock(struct sock *sk);
 				SINGLE_DEPTH_NESTING)
 #define bh_unlock_sock(__sk)	spin_unlock(&((__sk)->sk_lock.slock))
 
-static inline void lock_sock_bh(struct sock *sk)
+extern bool lock_sock_fast(struct sock *sk);
+/**
+ * unlock_sock_fast - complement of lock_sock_fast
+ * @sk: socket
+ * @slow: slow mode
+ *
+ * fast unlock socket for user context.
+ * If slow mode is on, we call regular release_sock()
+ */
+static inline void unlock_sock_fast(struct sock *sk, bool slow)
 {
-	spin_lock_bh(&sk->sk_lock.slock);
+	if (slow)
+		release_sock(sk);
+	else
+		spin_unlock_bh(&sk->sk_lock.slock);
 }
 
-static inline void unlock_sock_bh(struct sock *sk)
-{
-	spin_unlock_bh(&sk->sk_lock.slock);
-}
 
 extern struct sock		*sk_alloc(struct net *net, int family,
 					  gfp_t priority,
diff --git a/net/core/datagram.c b/net/core/datagram.c
index e0097531417a..f5b6f43a4c2e 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -229,15 +229,17 @@ EXPORT_SYMBOL(skb_free_datagram);
 
 void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
 {
+	bool slow;
+
 	if (likely(atomic_read(&skb->users) == 1))
 		smp_rmb();
 	else if (likely(!atomic_dec_and_test(&skb->users)))
 		return;
 
-	lock_sock_bh(sk);
+	slow = lock_sock_fast(sk);
 	skb_orphan(skb);
 	sk_mem_reclaim_partial(sk);
-	unlock_sock_bh(sk);
+	unlock_sock_fast(sk, slow);
 
 	/* skb is now orphaned, can be freed outside of locked section */
 	__kfree_skb(skb);
diff --git a/net/core/sock.c b/net/core/sock.c
index 37fe9b6adade..2cf7f9f7e775 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2007,6 +2007,39 @@ void release_sock(struct sock *sk)
 }
 EXPORT_SYMBOL(release_sock);
 
+/**
+ * lock_sock_fast - fast version of lock_sock
+ * @sk: socket
+ *
+ * This version should be used for very small section, where process wont block
+ * return false if fast path is taken
+ *   sk_lock.slock locked, owned = 0, BH disabled
+ * return true if slow path is taken
+ *   sk_lock.slock unlocked, owned = 1, BH enabled
+ */
+bool lock_sock_fast(struct sock *sk)
+{
+	might_sleep();
+	spin_lock_bh(&sk->sk_lock.slock);
+
+	if (!sk->sk_lock.owned)
+		/*
+		 * Note : We must disable BH
+		 */
+		return false;
+
+	__lock_sock(sk);
+	sk->sk_lock.owned = 1;
+	spin_unlock(&sk->sk_lock.slock);
+	/*
+	 * The sk_lock has mutex_lock() semantics here:
+	 */
+	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
+	local_bh_enable();
+	return true;
+}
+EXPORT_SYMBOL(lock_sock_fast);
+
 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
 {
 	struct timeval tv;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 9de6a698f91d..b9d0d409516f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1063,10 +1063,11 @@ static unsigned int first_packet_length(struct sock *sk)
 	spin_unlock_bh(&rcvq->lock);
 
 	if (!skb_queue_empty(&list_kill)) {
-		lock_sock_bh(sk);
+		bool slow = lock_sock_fast(sk);
+
 		__skb_queue_purge(&list_kill);
 		sk_mem_reclaim_partial(sk);
-		unlock_sock_bh(sk);
+		unlock_sock_fast(sk, slow);
 	}
 	return res;
 }
@@ -1123,6 +1124,7 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	int peeked;
 	int err;
 	int is_udplite = IS_UDPLITE(sk);
+	bool slow;
 
 	/*
 	 *	Check any passed addresses
@@ -1197,10 +1199,10 @@ out:
 	return err;
 
 csum_copy_err:
-	lock_sock_bh(sk);
+	slow = lock_sock_fast(sk);
 	if (!skb_kill_datagram(sk, skb, flags))
 		UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
-	unlock_sock_bh(sk);
+	unlock_sock_fast(sk, slow);
 
 	if (noblock)
 		return -EAGAIN;
@@ -1625,9 +1627,9 @@ int udp_rcv(struct sk_buff *skb)
 
 void udp_destroy_sock(struct sock *sk)
 {
-	lock_sock_bh(sk);
+	bool slow = lock_sock_fast(sk);
 	udp_flush_pending_frames(sk);
-	unlock_sock_bh(sk);
+	unlock_sock_fast(sk, slow);
 }
 
 /*
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 3d7a2c0b836a..87be58673b55 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -328,6 +328,7 @@ int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk,
 	int err;
 	int is_udplite = IS_UDPLITE(sk);
 	int is_udp4;
+	bool slow;
 
 	if (addr_len)
 		*addr_len=sizeof(struct sockaddr_in6);
@@ -424,7 +425,7 @@ out:
 	return err;
 
 csum_copy_err:
-	lock_sock_bh(sk);
+	slow = lock_sock_fast(sk);
 	if (!skb_kill_datagram(sk, skb, flags)) {
 		if (is_udp4)
 			UDP_INC_STATS_USER(sock_net(sk),
@@ -433,7 +434,7 @@ csum_copy_err:
 			UDP6_INC_STATS_USER(sock_net(sk),
 					UDP_MIB_INERRORS, is_udplite);
 	}
-	unlock_sock_bh(sk);
+	unlock_sock_fast(sk, slow);
 
 	if (flags & MSG_DONTWAIT)
 		return -EAGAIN;
-- 
cgit v1.3-8-gc7d7


From a47311380e094bb201be8a818370c73c3f52122c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 27 May 2010 16:09:39 -0700
Subject: net: fix __neigh_event_send()

commit 7fee226ad23 (net: add a noref bit on skb dst) missed one spot
where an skb is enqueued, with a possibly not refcounted dst entry.

__neigh_event_send() inserts skb into arp_queue, so we must make sure
dst entry is refcounted, or dst entry can be freed by garbage collector
after caller exits from rcu protected section.

Reported-by: Ingo Molnar <mingo@elte.hu>
Tested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/neighbour.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index bff37908bd55..6ba1c0eece03 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -934,6 +934,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
 				kfree_skb(buff);
 				NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
 			}
+			skb_dst_force(skb);
 			__skb_queue_tail(&neigh->arp_queue, skb);
 		}
 		rc = 1;
-- 
cgit v1.3-8-gc7d7


From 8ca9418350eccd5dd2659931807c1901224dd638 Mon Sep 17 00:00:00 2001
From: Scott Feldman <scofeldm@cisco.com>
Date: Fri, 28 May 2010 03:42:18 -0700
Subject: netlink: bug fix: don't overrun skbs on vf_port dump

Noticed by Patrick McHardy: was continuing to fill skb after a
nla_put_failure, ignoring the size calculated by upper layer.  Now,
return -EMSGSIZE on any overruns, but also allow netdev to
fail ndo_get_vf_port with error other than -EMSGSIZE, thus unwinding
nest.

Signed-off-by: Scott Feldman <scofeldm@cisco.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 7ab86f3a1ea4..7331bb2f6b9c 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -722,14 +722,13 @@ static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)
 
 	for (vf = 0; vf < dev_num_vf(dev->dev.parent); vf++) {
 		vf_port = nla_nest_start(skb, IFLA_VF_PORT);
-		if (!vf_port) {
-			nla_nest_cancel(skb, vf_ports);
-			return -EMSGSIZE;
-		}
+		if (!vf_port)
+			goto nla_put_failure;
 		NLA_PUT_U32(skb, IFLA_PORT_VF, vf);
 		err = dev->netdev_ops->ndo_get_vf_port(dev, vf, skb);
+		if (err == -EMSGSIZE)
+			goto nla_put_failure;
 		if (err) {
-nla_put_failure:
 			nla_nest_cancel(skb, vf_port);
 			continue;
 		}
@@ -739,6 +738,10 @@ nla_put_failure:
 	nla_nest_end(skb, vf_ports);
 
 	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, vf_ports);
+	return -EMSGSIZE;
 }
 
 static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev)
@@ -753,7 +756,7 @@ static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev)
 	err = dev->netdev_ops->ndo_get_vf_port(dev, PORT_SELF_VF, skb);
 	if (err) {
 		nla_nest_cancel(skb, port_self);
-		return err;
+		return (err == -EMSGSIZE) ? err : 0;
 	}
 
 	nla_nest_end(skb, port_self);
-- 
cgit v1.3-8-gc7d7


From 045de01a174d9f0734f657eb4b3313d89b4fd5ad Mon Sep 17 00:00:00 2001
From: Scott Feldman <scofeldm@cisco.com>
Date: Fri, 28 May 2010 03:42:43 -0700
Subject: netlink: bug fix: wrong size was calculated for vfinfo list blob

The wrong size was being calculated for vfinfo.  In one case, it was over-
calculating using nlmsg_total_size on attrs, in another case, it was
under-calculating by assuming ifla_vf_* structs are packed together, but
each struct is it's own attr w/ hdr (and padding).

Signed-off-by: Scott Feldman <scofeldm@cisco.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 7331bb2f6b9c..1a2af24e9e3d 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -650,11 +650,12 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev)
 	if (dev->dev.parent && dev_is_pci(dev->dev.parent)) {
 
 		int num_vfs = dev_num_vf(dev->dev.parent);
-		size_t size = nlmsg_total_size(sizeof(struct nlattr));
-		size += nlmsg_total_size(num_vfs * sizeof(struct nlattr));
-		size += num_vfs * (sizeof(struct ifla_vf_mac) +
-				  sizeof(struct ifla_vf_vlan) +
-				  sizeof(struct ifla_vf_tx_rate));
+		size_t size = nla_total_size(sizeof(struct nlattr));
+		size += nla_total_size(num_vfs * sizeof(struct nlattr));
+		size += num_vfs *
+			(nla_total_size(sizeof(struct ifla_vf_mac)) +
+			 nla_total_size(sizeof(struct ifla_vf_vlan)) +
+			 nla_total_size(sizeof(struct ifla_vf_tx_rate)));
 		return size;
 	} else
 		return 0;
-- 
cgit v1.3-8-gc7d7