From 38516ab59fbc5b3bb278cf5e1fe2867c70cff32e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 20 Apr 2010 17:04:50 -0400 Subject: tracing: Let tracepoints have data passed to tracepoint callbacks This patch adds data to be passed to tracepoint callbacks. The created functions from DECLARE_TRACE() now need a mandatory data parameter. For example: DECLARE_TRACE(mytracepoint, int value, value) Will create the register function: int register_trace_mytracepoint((void(*)(void *data, int value))probe, void *data); As the first argument, all callbacks (probes) must take a (void *data) parameter. So a callback for the above tracepoint will look like: void myprobe(void *data, int value) { } The callback may choose to ignore the data parameter. This change allows callbacks to register a private data pointer along with the function probe. void mycallback(void *data, int value); register_trace_mytracepoint(mycallback, mydata); Then the mycallback() will receive the "mydata" as the first parameter before the args. A more detailed example: DECLARE_TRACE(mytracepoint, TP_PROTO(int status), TP_ARGS(status)); /* In the C file */ DEFINE_TRACE(mytracepoint, TP_PROTO(int status), TP_ARGS(status)); [...] trace_mytracepoint(status); /* In a file registering this tracepoint */ int my_callback(void *data, int status) { struct my_struct my_data = data; [...] } [...] my_data = kmalloc(sizeof(*my_data), GFP_KERNEL); init_my_data(my_data); register_trace_mytracepoint(my_callback, my_data); The same callback can also be registered to the same tracepoint as long as the data registered is different. Note, the data must also be used to unregister the callback: unregister_trace_mytracepoint(my_callback, my_data); Because of the data parameter, tracepoints declared this way can not have no args. That is: DECLARE_TRACE(mytracepoint, TP_PROTO(void), TP_ARGS()); will cause an error. If no arguments are needed, a new macro can be used instead: DECLARE_TRACE_NOARGS(mytracepoint); Since there are no arguments, the proto and args fields are left out. This is part of a series to make the tracepoint footprint smaller: text data bss dec hex filename 4913961 1088356 861512 6863829 68bbd5 vmlinux.orig 4914025 1088868 861512 6864405 68be15 vmlinux.class 4918492 1084612 861512 6864616 68bee8 vmlinux.tracepoint Again, this patch also increases the size of the kernel, but lays the ground work for decreasing it. v5: Fixed net/core/drop_monitor.c to handle these updates. v4: Moved the DECLARE_TRACE() DECLARE_TRACE_NOARGS out of the #ifdef CONFIG_TRACE_POINTS, since the two are the same in both cases. The __DECLARE_TRACE() is what changes. Thanks to Frederic Weisbecker for pointing this out. v3: Made all register_* functions require data to be passed and all callbacks to take a void * parameter as its first argument. This makes the calling functions comply with C standards. Also added more comments to the modifications of DECLARE_TRACE(). v2: Made the DECLARE_TRACE() have the ability to pass arguments and added a new DECLARE_TRACE_NOARGS() for tracepoints that do not need any arguments. Acked-by: Mathieu Desnoyers Acked-by: Masami Hiramatsu Acked-by: Frederic Weisbecker Cc: Neil Horman Cc: David S. Miller Signed-off-by: Steven Rostedt --- net/core/drop_monitor.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index cf208d8042b1..ad41529fb60f 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -172,12 +172,12 @@ out: return; } -static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) +static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location) { trace_drop_common(skb, location); } -static void trace_napi_poll_hit(struct napi_struct *napi) +static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi) { struct dm_hw_stat_delta *new_stat; @@ -225,12 +225,12 @@ static int set_all_monitor_traces(int state) switch (state) { case TRACE_ON: - rc |= register_trace_kfree_skb(trace_kfree_skb_hit); - rc |= register_trace_napi_poll(trace_napi_poll_hit); + rc |= register_trace_kfree_skb(trace_kfree_skb_hit, NULL); + rc |= register_trace_napi_poll(trace_napi_poll_hit, NULL); break; case TRACE_OFF: - rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); - rc |= unregister_trace_napi_poll(trace_napi_poll_hit); + rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit, NULL); + rc |= unregister_trace_napi_poll(trace_napi_poll_hit, NULL); tracepoint_synchronize_unregister(); -- cgit v1.3-8-gc7d7 From 622e0ca1cd4d459f5af4f2c65f4dc0dd823cb4c3 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 20 May 2010 23:07:56 -0700 Subject: gro: Fix bogus gso_size on the first fraglist entry When GRO produces fraglist entries, and the resulting skb hits an interface that is incapable of TSO but capable of FRAGLIST, we end up producing a bogus packet with gso_size non-zero. This was reported in the field with older versions of KVM that did not set the TSO bits on tuntap. This patch fixes that. Reported-by: Igor Zhang Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/skbuff.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c543dd252433..4c11000a96aa 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2718,6 +2718,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) *NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p); skb_shinfo(nskb)->frag_list = p; skb_shinfo(nskb)->gso_size = pinfo->gso_size; + pinfo->gso_size = 0; skb_header_release(p); nskb->prev = p; -- cgit v1.3-8-gc7d7 From 76cc8b13a6e41b537fd262b600da1571314add62 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 20 May 2010 18:37:59 +0000 Subject: net: fix problem in dequeuing from input_pkt_queue Fix some issues introduced in batch skb dequeuing for input_pkt_queue. The primary issue it that the queue head must be incremented only after a packet has been processed, that is only after __netif_receive_skb has been called. This is needed for the mechanism to prevent OOO packet in RFS. Also when flushing the input_pkt_queue and process_queue, the process queue should be done first to prevent OOO packets. Because the input_pkt_queue has been effectively split into two queues, the calculation of the tail ptr is no longer correct. The correct value would be head+input_pkt_queue->len+process_queue->len. To avoid this calculation we added an explict input_queue_tail in softnet_data. The tail value is simply incremented when queuing to input_pkt_queue. Signed-off-by: Tom Herbert Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 14 +++++++++++--- net/core/dev.c | 28 +++++++++++++++------------- 2 files changed, 26 insertions(+), 16 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a1bff6518166..256419583d9d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1407,17 +1407,25 @@ struct softnet_data { struct softnet_data *rps_ipi_next; unsigned int cpu; unsigned int input_queue_head; + unsigned int input_queue_tail; #endif unsigned dropped; struct sk_buff_head input_pkt_queue; struct napi_struct backlog; }; -static inline void input_queue_head_add(struct softnet_data *sd, - unsigned int len) +static inline void input_queue_head_incr(struct softnet_data *sd) { #ifdef CONFIG_RPS - sd->input_queue_head += len; + sd->input_queue_head++; +#endif +} + +static inline void input_queue_tail_incr_save(struct softnet_data *sd, + unsigned int *qtail) +{ +#ifdef CONFIG_RPS + *qtail = ++sd->input_queue_tail; #endif } diff --git a/net/core/dev.c b/net/core/dev.c index 6c820650b80f..0aab66d68b19 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2426,10 +2426,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, if (skb_queue_len(&sd->input_pkt_queue)) { enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); -#ifdef CONFIG_RPS - *qtail = sd->input_queue_head + - skb_queue_len(&sd->input_pkt_queue); -#endif + input_queue_tail_incr_save(sd, qtail); rps_unlock(sd); local_irq_restore(flags); return NET_RX_SUCCESS; @@ -2964,7 +2961,7 @@ static void flush_backlog(void *arg) if (skb->dev == dev) { __skb_unlink(skb, &sd->input_pkt_queue); kfree_skb(skb); - input_queue_head_add(sd, 1); + input_queue_head_incr(sd); } } rps_unlock(sd); @@ -2973,6 +2970,7 @@ static void flush_backlog(void *arg) if (skb->dev == dev) { __skb_unlink(skb, &sd->process_queue); kfree_skb(skb); + input_queue_head_incr(sd); } } } @@ -3328,18 +3326,20 @@ static int process_backlog(struct napi_struct *napi, int quota) while ((skb = __skb_dequeue(&sd->process_queue))) { local_irq_enable(); __netif_receive_skb(skb); - if (++work >= quota) - return work; local_irq_disable(); + input_queue_head_incr(sd); + if (++work >= quota) { + local_irq_enable(); + return work; + } } rps_lock(sd); qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen) { - input_queue_head_add(sd, qlen); + if (qlen) skb_queue_splice_tail_init(&sd->input_pkt_queue, &sd->process_queue); - } + if (qlen < quota - work) { /* * Inline a custom version of __napi_complete(). @@ -5679,12 +5679,14 @@ static int dev_cpu_callback(struct notifier_block *nfb, local_irq_enable(); /* Process offline CPU's input_pkt_queue */ - while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { + while ((skb = __skb_dequeue(&oldsd->process_queue))) { netif_rx(skb); - input_queue_head_add(oldsd, 1); + input_queue_head_incr(oldsd); } - while ((skb = __skb_dequeue(&oldsd->process_queue))) + while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { netif_rx(skb); + input_queue_head_incr(oldsd); + } return NOTIFY_OK; } -- cgit v1.3-8-gc7d7 From 253683bbfb6bc5864417c8c35cb6ef13b5e259e6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 21 May 2010 02:25:27 +0000 Subject: rtnetlink: Fix error handling in do_setlink() Commit c02db8c6290bb992442fec1407643c94cc414375: Author: Chris Wright Date: Sun May 16 01:05:45 2010 -0700 Subject: rtnetlink: make SR-IOV VF interface symmetric adds broken error handling to do_setlink() in net/core/rtnetlink.c. The problem is the following chunk of code: if (tb[IFLA_VFINFO_LIST]) { struct nlattr *attr; int rem; nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) { if (nla_type(attr) != IFLA_VF_INFO) ----> goto errout; err = do_setvfinfo(dev, attr); if (err < 0) goto errout; modified = 1; } } which can get to errout without setting err, resulting in the following error: net/core/rtnetlink.c: In function 'do_setlink': net/core/rtnetlink.c:904: warning: 'err' may be used uninitialized in this function Change the code to return -EINVAL in this case. Note that this might not be the appropriate error though. Signed-off-by: David Howells cc: Chris Wright cc: David S. Miller Acked-by: Chris Wright Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e4b9870e4706..7ab86f3a1ea4 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1199,8 +1199,10 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, struct nlattr *attr; int rem; nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) { - if (nla_type(attr) != IFLA_VF_INFO) + if (nla_type(attr) != IFLA_VF_INFO) { + err = -EINVAL; goto errout; + } err = do_setvfinfo(dev, attr); if (err < 0) goto errout; -- cgit v1.3-8-gc7d7 From 8ce6cebc2f126f3ecf2d80746ea54245adf18057 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Wed, 19 May 2010 10:12:19 +0000 Subject: net-2.6 : V2 - fix dev_get_valid_name the commit: commit d90310243fd750240755e217c5faa13e24f41536 Author: Octavian Purdila Date: Wed Nov 18 02:36:59 2009 +0000 net: device name allocation cleanups introduced a bug when there is a hash collision making impossible to rename a device with eth%d. This bug is very hard to reproduce and appears rarely. The problem is coming from we don't pass a temporary buffer to __dev_alloc_name but 'dev->name' which is modified by the function. A detailed explanation is here: http://marc.info/?l=linux-netdev&m=127417784011987&w=2 Changelog: V2 : replaced strings comparison by pointers comparison Signed-off-by: Daniel Lezcano Reviewed-by: Octavian Purdila Signed-off-by: David S. Miller --- net/core/dev.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 0aab66d68b19..07a48e2bf7db 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -954,18 +954,22 @@ int dev_alloc_name(struct net_device *dev, const char *name) } EXPORT_SYMBOL(dev_alloc_name); -static int dev_get_valid_name(struct net *net, const char *name, char *buf, - bool fmt) +static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt) { + struct net *net; + + BUG_ON(!dev_net(dev)); + net = dev_net(dev); + if (!dev_valid_name(name)) return -EINVAL; if (fmt && strchr(name, '%')) - return __dev_alloc_name(net, name, buf); + return dev_alloc_name(dev, name); else if (__dev_get_by_name(net, name)) return -EEXIST; - else if (buf != name) - strlcpy(buf, name, IFNAMSIZ); + else if (dev->name != name) + strlcpy(dev->name, name, IFNAMSIZ); return 0; } @@ -997,7 +1001,7 @@ int dev_change_name(struct net_device *dev, const char *newname) memcpy(oldname, dev->name, IFNAMSIZ); - err = dev_get_valid_name(net, newname, dev->name, 1); + err = dev_get_valid_name(dev, newname, 1); if (err < 0) return err; @@ -4965,7 +4969,7 @@ int register_netdevice(struct net_device *dev) } } - ret = dev_get_valid_name(net, dev->name, dev->name, 0); + ret = dev_get_valid_name(dev, dev->name, 0); if (ret) goto err_uninit; @@ -5574,7 +5578,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* We get here if we can't use the current device name */ if (!pat) goto out; - if (dev_get_valid_name(net, pat, dev->name, 1)) + if (dev_get_valid_name(dev, pat, 1)) goto out; } -- cgit v1.3-8-gc7d7 From f845172531fb7410c7fb7780b1a6e51ee6df7d52 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 24 May 2010 00:12:34 -0700 Subject: cls_cgroup: Store classid in struct sock Up until now cls_cgroup has relied on fetching the classid out of the current executing thread. This runs into trouble when a packet processing is delayed in which case it may execute out of another thread's context. Furthermore, even when a packet is not delayed we may fail to classify it if soft IRQs have been disabled, because this scenario is indistinguishable from one where a packet unrelated to the current thread is processed by a real soft IRQ. In fact, the current semantics is inherently broken, as a single skb may be constructed out of the writes of two different tasks. A different manifestation of this problem is when the TCP stack transmits in response of an incoming ACK. This is currently unclassified. As we already have a concept of packet ownership for accounting purposes in the skb->sk pointer, this is a natural place to store the classid in a persistent manner. This patch adds the cls_cgroup classid in struct sock, filling up an existing hole on 64-bit :) The value is set at socket creation time. So all sockets created via socket(2) automatically gains the ID of the thread creating it. Whenever another process touches the socket by either reading or writing to it, we will change the socket classid to that of the process if it has a valid (non-zero) classid. For sockets created on inbound connections through accept(2), we inherit the classid of the original listening socket through sk_clone, possibly preceding the actual accept(2) call. In order to minimise risks, I have not made this the authoritative classid. For now it is only used as a backup when we execute with soft IRQs disabled. Once we're completely happy with its semantics we can use it as the sole classid. Footnote: I have rearranged the error path on cls_group module creation. If we didn't do this, then there is a window where someone could create a tc rule using cls_group before the cgroup subsystem has been registered. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/net/cls_cgroup.h | 63 ++++++++++++++++++++++++++++++++++++++++++++++++ include/net/sock.h | 10 +++++++- net/core/sock.c | 18 ++++++++++++++ net/sched/cls_cgroup.c | 50 ++++++++++++++++++++++++++------------ net/socket.c | 9 +++++++ 5 files changed, 133 insertions(+), 17 deletions(-) create mode 100644 include/net/cls_cgroup.h (limited to 'net/core') diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h new file mode 100644 index 000000000000..ef2df1475b51 --- /dev/null +++ b/include/net/cls_cgroup.h @@ -0,0 +1,63 @@ +/* + * cls_cgroup.h Control Group Classifier + * + * Authors: Thomas Graf + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#ifndef _NET_CLS_CGROUP_H +#define _NET_CLS_CGROUP_H + +#include +#include +#include + +#ifdef CONFIG_CGROUPS +struct cgroup_cls_state +{ + struct cgroup_subsys_state css; + u32 classid; +}; + +#ifdef CONFIG_NET_CLS_CGROUP +static inline u32 task_cls_classid(struct task_struct *p) +{ + if (in_interrupt()) + return 0; + + return container_of(task_subsys_state(p, net_cls_subsys_id), + struct cgroup_cls_state, css).classid; +} +#else +extern int net_cls_subsys_id; + +static inline u32 task_cls_classid(struct task_struct *p) +{ + int id; + u32 classid; + + if (in_interrupt()) + return 0; + + rcu_read_lock(); + id = rcu_dereference(net_cls_subsys_id); + if (id >= 0) + classid = container_of(task_subsys_state(p, id), + struct cgroup_cls_state, css)->classid; + rcu_read_unlock(); + + return classid; +} +#endif +#else +static inline u32 task_cls_classid(struct task_struct *p) +{ + return 0; +} +#endif +#endif /* _NET_CLS_CGROUP_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 5697caf8cc76..d24f382cb712 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -312,7 +312,7 @@ struct sock { void *sk_security; #endif __u32 sk_mark; - /* XXX 4 bytes hole on 64 bit */ + u32 sk_classid; void (*sk_state_change)(struct sock *sk); void (*sk_data_ready)(struct sock *sk, int bytes); void (*sk_write_space)(struct sock *sk); @@ -1074,6 +1074,14 @@ extern void *sock_kmalloc(struct sock *sk, int size, extern void sock_kfree_s(struct sock *sk, void *mem, int size); extern void sk_send_sigurg(struct sock *sk); +#ifdef CONFIG_CGROUPS +extern void sock_update_classid(struct sock *sk); +#else +static inline void sock_update_classid(struct sock *sk) +{ +} +#endif + /* * Functions to fill in entries in struct proto_ops when a protocol * does not implement a particular function. diff --git a/net/core/sock.c b/net/core/sock.c index bf88a167c8f2..a05ae7f9771e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -123,6 +123,7 @@ #include #include #include +#include #include @@ -217,6 +218,11 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); EXPORT_SYMBOL(sysctl_optmem_max); +#if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP) +int net_cls_subsys_id = -1; +EXPORT_SYMBOL_GPL(net_cls_subsys_id); +#endif + static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) { struct timeval tv; @@ -1050,6 +1056,16 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) module_put(owner); } +#ifdef CONFIG_CGROUPS +void sock_update_classid(struct sock *sk) +{ + u32 classid = task_cls_classid(current); + + if (classid && classid != sk->sk_classid) + sk->sk_classid = classid; +} +#endif + /** * sk_alloc - All socket objects are allocated here * @net: the applicable net namespace @@ -1073,6 +1089,8 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sock_lock_init(sk); sock_net_set(sk, get_net(net)); atomic_set(&sk->sk_wmem_alloc, 1); + + sock_update_classid(sk); } return sk; diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 221180384fd7..78ef2c5e130b 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -16,14 +16,11 @@ #include #include #include +#include #include #include - -struct cgroup_cls_state -{ - struct cgroup_subsys_state css; - u32 classid; -}; +#include +#include static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, struct cgroup *cgrp); @@ -112,6 +109,10 @@ static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp, struct cls_cgroup_head *head = tp->root; u32 classid; + rcu_read_lock(); + classid = task_cls_state(current)->classid; + rcu_read_unlock(); + /* * Due to the nature of the classifier it is required to ignore all * packets originating from softirq context as accessing `current' @@ -122,12 +123,12 @@ static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp, * calls by looking at the number of nested bh disable calls because * softirqs always disables bh. */ - if (softirq_count() != SOFTIRQ_OFFSET) - return -1; - - rcu_read_lock(); - classid = task_cls_state(current)->classid; - rcu_read_unlock(); + if (softirq_count() != SOFTIRQ_OFFSET) { + /* If there is an sk_classid we'll use that. */ + if (!skb->sk) + return -1; + classid = skb->sk->sk_classid; + } if (!classid) return -1; @@ -289,18 +290,35 @@ static struct tcf_proto_ops cls_cgroup_ops __read_mostly = { static int __init init_cgroup_cls(void) { - int ret = register_tcf_proto_ops(&cls_cgroup_ops); - if (ret) - return ret; + int ret; + ret = cgroup_load_subsys(&net_cls_subsys); if (ret) - unregister_tcf_proto_ops(&cls_cgroup_ops); + goto out; + +#ifndef CONFIG_NET_CLS_CGROUP + /* We can't use rcu_assign_pointer because this is an int. */ + smp_wmb(); + net_cls_subsys_id = net_cls_subsys.subsys_id; +#endif + + ret = register_tcf_proto_ops(&cls_cgroup_ops); + if (ret) + cgroup_unload_subsys(&net_cls_subsys); + +out: return ret; } static void __exit exit_cgroup_cls(void) { unregister_tcf_proto_ops(&cls_cgroup_ops); + +#ifndef CONFIG_NET_CLS_CGROUP + net_cls_subsys_id = -1; + synchronize_rcu(); +#endif + cgroup_unload_subsys(&net_cls_subsys); } diff --git a/net/socket.c b/net/socket.c index f9f7d0872cac..367d5477d00f 100644 --- a/net/socket.c +++ b/net/socket.c @@ -94,6 +94,7 @@ #include #include +#include #include #include @@ -558,6 +559,8 @@ static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct sock_iocb *si = kiocb_to_siocb(iocb); int err; + sock_update_classid(sock->sk); + si->sock = sock; si->scm = NULL; si->msg = msg; @@ -684,6 +687,8 @@ static inline int __sock_recvmsg_nosec(struct kiocb *iocb, struct socket *sock, { struct sock_iocb *si = kiocb_to_siocb(iocb); + sock_update_classid(sock->sk); + si->sock = sock; si->scm = NULL; si->msg = msg; @@ -777,6 +782,8 @@ static ssize_t sock_splice_read(struct file *file, loff_t *ppos, if (unlikely(!sock->ops->splice_read)) return -EINVAL; + sock_update_classid(sock->sk); + return sock->ops->splice_read(sock, ppos, pipe, len, flags); } @@ -3069,6 +3076,8 @@ int kernel_setsockopt(struct socket *sock, int level, int optname, int kernel_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) { + sock_update_classid(sock->sk); + if (sock->ops->sendpage) return sock->ops->sendpage(sock, page, offset, size, flags); -- cgit v1.3-8-gc7d7 From 8286274284e15b11b0f531b6ceeef21fbe00a8dd Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 24 May 2010 00:14:10 -0700 Subject: tun: Update classid on packet injection This patch makes tun update its socket classid every time we inject a packet into the network stack. This is so that any updates made by the admin to the process writing packets to tun is effected. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- drivers/net/tun.c | 2 ++ net/core/sock.c | 1 + 2 files changed, 3 insertions(+) (limited to 'net/core') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 97b25533e5fb..8793c2bf7f15 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -526,6 +526,8 @@ static inline struct sk_buff *tun_alloc_skb(struct tun_struct *tun, struct sk_buff *skb; int err; + sock_update_classid(sk); + /* Under a page? Don't bother with paged skb. */ if (prepad + len < PAGE_SIZE || !linear) linear = len; diff --git a/net/core/sock.c b/net/core/sock.c index a05ae7f9771e..37fe9b6adade 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1064,6 +1064,7 @@ void sock_update_classid(struct sock *sk) if (classid && classid != sk->sk_classid) sk->sk_classid = classid; } +EXPORT_SYMBOL(sock_update_classid); #endif /** -- cgit v1.3-8-gc7d7 From 8a74ad60a546b13bd1096b2a61a7a5c6fd9ae17c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 26 May 2010 19:20:18 +0000 Subject: net: fix lock_sock_bh/unlock_sock_bh This new sock lock primitive was introduced to speedup some user context socket manipulation. But it is unsafe to protect two threads, one using regular lock_sock/release_sock, one using lock_sock_bh/unlock_sock_bh This patch changes lock_sock_bh to be careful against 'owned' state. If owned is found to be set, we must take the slow path. lock_sock_bh() now returns a boolean to say if the slow path was taken, and this boolean is used at unlock_sock_bh time to call the appropriate unlock function. After this change, BH are either disabled or enabled during the lock_sock_bh/unlock_sock_bh protected section. This might be misleading, so we rename these functions to lock_sock_fast()/unlock_sock_fast(). Reported-by: Anton Blanchard Signed-off-by: Eric Dumazet Tested-by: Anton Blanchard Signed-off-by: David S. Miller --- include/net/sock.h | 20 ++++++++++++++------ net/core/datagram.c | 6 ++++-- net/core/sock.c | 33 +++++++++++++++++++++++++++++++++ net/ipv4/udp.c | 14 ++++++++------ net/ipv6/udp.c | 5 +++-- 5 files changed, 62 insertions(+), 16 deletions(-) (limited to 'net/core') diff --git a/include/net/sock.h b/include/net/sock.h index d2a71b04a5ae..ca241ea14875 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1026,15 +1026,23 @@ extern void release_sock(struct sock *sk); SINGLE_DEPTH_NESTING) #define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.slock)) -static inline void lock_sock_bh(struct sock *sk) +extern bool lock_sock_fast(struct sock *sk); +/** + * unlock_sock_fast - complement of lock_sock_fast + * @sk: socket + * @slow: slow mode + * + * fast unlock socket for user context. + * If slow mode is on, we call regular release_sock() + */ +static inline void unlock_sock_fast(struct sock *sk, bool slow) { - spin_lock_bh(&sk->sk_lock.slock); + if (slow) + release_sock(sk); + else + spin_unlock_bh(&sk->sk_lock.slock); } -static inline void unlock_sock_bh(struct sock *sk) -{ - spin_unlock_bh(&sk->sk_lock.slock); -} extern struct sock *sk_alloc(struct net *net, int family, gfp_t priority, diff --git a/net/core/datagram.c b/net/core/datagram.c index e0097531417a..f5b6f43a4c2e 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -229,15 +229,17 @@ EXPORT_SYMBOL(skb_free_datagram); void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb) { + bool slow; + if (likely(atomic_read(&skb->users) == 1)) smp_rmb(); else if (likely(!atomic_dec_and_test(&skb->users))) return; - lock_sock_bh(sk); + slow = lock_sock_fast(sk); skb_orphan(skb); sk_mem_reclaim_partial(sk); - unlock_sock_bh(sk); + unlock_sock_fast(sk, slow); /* skb is now orphaned, can be freed outside of locked section */ __kfree_skb(skb); diff --git a/net/core/sock.c b/net/core/sock.c index 37fe9b6adade..2cf7f9f7e775 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2007,6 +2007,39 @@ void release_sock(struct sock *sk) } EXPORT_SYMBOL(release_sock); +/** + * lock_sock_fast - fast version of lock_sock + * @sk: socket + * + * This version should be used for very small section, where process wont block + * return false if fast path is taken + * sk_lock.slock locked, owned = 0, BH disabled + * return true if slow path is taken + * sk_lock.slock unlocked, owned = 1, BH enabled + */ +bool lock_sock_fast(struct sock *sk) +{ + might_sleep(); + spin_lock_bh(&sk->sk_lock.slock); + + if (!sk->sk_lock.owned) + /* + * Note : We must disable BH + */ + return false; + + __lock_sock(sk); + sk->sk_lock.owned = 1; + spin_unlock(&sk->sk_lock.slock); + /* + * The sk_lock has mutex_lock() semantics here: + */ + mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); + local_bh_enable(); + return true; +} +EXPORT_SYMBOL(lock_sock_fast); + int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) { struct timeval tv; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 9de6a698f91d..b9d0d409516f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1063,10 +1063,11 @@ static unsigned int first_packet_length(struct sock *sk) spin_unlock_bh(&rcvq->lock); if (!skb_queue_empty(&list_kill)) { - lock_sock_bh(sk); + bool slow = lock_sock_fast(sk); + __skb_queue_purge(&list_kill); sk_mem_reclaim_partial(sk); - unlock_sock_bh(sk); + unlock_sock_fast(sk, slow); } return res; } @@ -1123,6 +1124,7 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, int peeked; int err; int is_udplite = IS_UDPLITE(sk); + bool slow; /* * Check any passed addresses @@ -1197,10 +1199,10 @@ out: return err; csum_copy_err: - lock_sock_bh(sk); + slow = lock_sock_fast(sk); if (!skb_kill_datagram(sk, skb, flags)) UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - unlock_sock_bh(sk); + unlock_sock_fast(sk, slow); if (noblock) return -EAGAIN; @@ -1625,9 +1627,9 @@ int udp_rcv(struct sk_buff *skb) void udp_destroy_sock(struct sock *sk) { - lock_sock_bh(sk); + bool slow = lock_sock_fast(sk); udp_flush_pending_frames(sk); - unlock_sock_bh(sk); + unlock_sock_fast(sk, slow); } /* diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 3d7a2c0b836a..87be58673b55 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -328,6 +328,7 @@ int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk, int err; int is_udplite = IS_UDPLITE(sk); int is_udp4; + bool slow; if (addr_len) *addr_len=sizeof(struct sockaddr_in6); @@ -424,7 +425,7 @@ out: return err; csum_copy_err: - lock_sock_bh(sk); + slow = lock_sock_fast(sk); if (!skb_kill_datagram(sk, skb, flags)) { if (is_udp4) UDP_INC_STATS_USER(sock_net(sk), @@ -433,7 +434,7 @@ csum_copy_err: UDP6_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } - unlock_sock_bh(sk); + unlock_sock_fast(sk, slow); if (flags & MSG_DONTWAIT) return -EAGAIN; -- cgit v1.3-8-gc7d7 From a47311380e094bb201be8a818370c73c3f52122c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 27 May 2010 16:09:39 -0700 Subject: net: fix __neigh_event_send() commit 7fee226ad23 (net: add a noref bit on skb dst) missed one spot where an skb is enqueued, with a possibly not refcounted dst entry. __neigh_event_send() inserts skb into arp_queue, so we must make sure dst entry is refcounted, or dst entry can be freed by garbage collector after caller exits from rcu protected section. Reported-by: Ingo Molnar Tested-by: Ingo Molnar Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/neighbour.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index bff37908bd55..6ba1c0eece03 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -934,6 +934,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) kfree_skb(buff); NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards); } + skb_dst_force(skb); __skb_queue_tail(&neigh->arp_queue, skb); } rc = 1; -- cgit v1.3-8-gc7d7 From 8ca9418350eccd5dd2659931807c1901224dd638 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Fri, 28 May 2010 03:42:18 -0700 Subject: netlink: bug fix: don't overrun skbs on vf_port dump Noticed by Patrick McHardy: was continuing to fill skb after a nla_put_failure, ignoring the size calculated by upper layer. Now, return -EMSGSIZE on any overruns, but also allow netdev to fail ndo_get_vf_port with error other than -EMSGSIZE, thus unwinding nest. Signed-off-by: Scott Feldman Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 7ab86f3a1ea4..7331bb2f6b9c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -722,14 +722,13 @@ static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) for (vf = 0; vf < dev_num_vf(dev->dev.parent); vf++) { vf_port = nla_nest_start(skb, IFLA_VF_PORT); - if (!vf_port) { - nla_nest_cancel(skb, vf_ports); - return -EMSGSIZE; - } + if (!vf_port) + goto nla_put_failure; NLA_PUT_U32(skb, IFLA_PORT_VF, vf); err = dev->netdev_ops->ndo_get_vf_port(dev, vf, skb); + if (err == -EMSGSIZE) + goto nla_put_failure; if (err) { -nla_put_failure: nla_nest_cancel(skb, vf_port); continue; } @@ -739,6 +738,10 @@ nla_put_failure: nla_nest_end(skb, vf_ports); return 0; + +nla_put_failure: + nla_nest_cancel(skb, vf_ports); + return -EMSGSIZE; } static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev) @@ -753,7 +756,7 @@ static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev) err = dev->netdev_ops->ndo_get_vf_port(dev, PORT_SELF_VF, skb); if (err) { nla_nest_cancel(skb, port_self); - return err; + return (err == -EMSGSIZE) ? err : 0; } nla_nest_end(skb, port_self); -- cgit v1.3-8-gc7d7 From 045de01a174d9f0734f657eb4b3313d89b4fd5ad Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Fri, 28 May 2010 03:42:43 -0700 Subject: netlink: bug fix: wrong size was calculated for vfinfo list blob The wrong size was being calculated for vfinfo. In one case, it was over- calculating using nlmsg_total_size on attrs, in another case, it was under-calculating by assuming ifla_vf_* structs are packed together, but each struct is it's own attr w/ hdr (and padding). Signed-off-by: Scott Feldman Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 7331bb2f6b9c..1a2af24e9e3d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -650,11 +650,12 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev) if (dev->dev.parent && dev_is_pci(dev->dev.parent)) { int num_vfs = dev_num_vf(dev->dev.parent); - size_t size = nlmsg_total_size(sizeof(struct nlattr)); - size += nlmsg_total_size(num_vfs * sizeof(struct nlattr)); - size += num_vfs * (sizeof(struct ifla_vf_mac) + - sizeof(struct ifla_vf_vlan) + - sizeof(struct ifla_vf_tx_rate)); + size_t size = nla_total_size(sizeof(struct nlattr)); + size += nla_total_size(num_vfs * sizeof(struct nlattr)); + size += num_vfs * + (nla_total_size(sizeof(struct ifla_vf_mac)) + + nla_total_size(sizeof(struct ifla_vf_vlan)) + + nla_total_size(sizeof(struct ifla_vf_tx_rate))); return size; } else return 0; -- cgit v1.3-8-gc7d7