From cde2e9a651b76d8db36ae94cd0febc82b637e5dd Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Fri, 27 Apr 2012 10:11:48 +0000
Subject: drop_monitor: fix sleeping in invalid context warning

Eric Dumazet pointed out this warning in the drop_monitor protocol to me:

[   38.352571] BUG: sleeping function called from invalid context at kernel/mutex.c:85
[   38.352576] in_atomic(): 1, irqs_disabled(): 0, pid: 4415, name: dropwatch
[   38.352580] Pid: 4415, comm: dropwatch Not tainted 3.4.0-rc2+ #71
[   38.352582] Call Trace:
[   38.352592]  [<ffffffff8153aaf0>] ? trace_napi_poll_hit+0xd0/0xd0
[   38.352599]  [<ffffffff81063f2a>] __might_sleep+0xca/0xf0
[   38.352606]  [<ffffffff81655b16>] mutex_lock+0x26/0x50
[   38.352610]  [<ffffffff8153aaf0>] ? trace_napi_poll_hit+0xd0/0xd0
[   38.352616]  [<ffffffff810b72d9>] tracepoint_probe_register+0x29/0x90
[   38.352621]  [<ffffffff8153a585>] set_all_monitor_traces+0x105/0x170
[   38.352625]  [<ffffffff8153a8ca>] net_dm_cmd_trace+0x2a/0x40
[   38.352630]  [<ffffffff8154a81a>] genl_rcv_msg+0x21a/0x2b0
[   38.352636]  [<ffffffff810f8029>] ? zone_statistics+0x99/0xc0
[   38.352640]  [<ffffffff8154a600>] ? genl_rcv+0x30/0x30
[   38.352645]  [<ffffffff8154a059>] netlink_rcv_skb+0xa9/0xd0
[   38.352649]  [<ffffffff8154a5f0>] genl_rcv+0x20/0x30
[   38.352653]  [<ffffffff81549a7e>] netlink_unicast+0x1ae/0x1f0
[   38.352658]  [<ffffffff81549d76>] netlink_sendmsg+0x2b6/0x310
[   38.352663]  [<ffffffff8150824f>] sock_sendmsg+0x10f/0x130
[   38.352668]  [<ffffffff8150abe0>] ? move_addr_to_kernel+0x60/0xb0
[   38.352673]  [<ffffffff81515f04>] ? verify_iovec+0x64/0xe0
[   38.352677]  [<ffffffff81509c46>] __sys_sendmsg+0x386/0x390
[   38.352682]  [<ffffffff810ffaf9>] ? handle_mm_fault+0x139/0x210
[   38.352687]  [<ffffffff8165b5bc>] ? do_page_fault+0x1ec/0x4f0
[   38.352693]  [<ffffffff8106ba4d>] ? set_next_entity+0x9d/0xb0
[   38.352699]  [<ffffffff81310b49>] ? tty_ldisc_deref+0x9/0x10
[   38.352703]  [<ffffffff8106d363>] ? pick_next_task_fair+0x63/0x140
[   38.352708]  [<ffffffff8150b8d4>] sys_sendmsg+0x44/0x80
[   38.352713]  [<ffffffff8165f8e2>] system_call_fastpath+0x16/0x1b

It stems from holding a spinlock (trace_state_lock) while attempting to register
or unregister tracepoint hooks, making in_atomic() true in this context, leading
to the warning when the tracepoint calls might_sleep() while its taking a mutex.
Since we only use the trace_state_lock to prevent trace protocol state races, as
well as hardware stat list updates on an rcu write side, we can just convert the
spinlock to a mutex to avoid this problem.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: David Miller <davem@davemloft.net>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/drop_monitor.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'net/core')
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 5c3c81a609e5..a221a5bbecf7 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -42,7 +42,7 @@ static void send_dm_alert(struct work_struct *unused);
  * netlink alerts
  */
 static int trace_state = TRACE_OFF;
-static DEFINE_SPINLOCK(trace_state_lock);
+static DEFINE_MUTEX(trace_state_mutex);
 
 struct per_cpu_dm_data {
 	struct work_struct dm_alert_work;
@@ -214,7 +214,7 @@ static int set_all_monitor_traces(int state)
 	struct dm_hw_stat_delta *new_stat = NULL;
 	struct dm_hw_stat_delta *temp;
 
-	spin_lock(&trace_state_lock);
+	mutex_lock(&trace_state_mutex);
 
 	if (state == trace_state) {
 		rc = -EAGAIN;
@@ -253,7 +253,7 @@ static int set_all_monitor_traces(int state)
 		rc = -EINPROGRESS;
 
 out_unlock:
-	spin_unlock(&trace_state_lock);
+	mutex_unlock(&trace_state_mutex);
 
 	return rc;
 }
@@ -296,12 +296,12 @@ static int dropmon_net_event(struct notifier_block *ev_block,
 
 		new_stat->dev = dev;
 		new_stat->last_rx = jiffies;
-		spin_lock(&trace_state_lock);
+		mutex_lock(&trace_state_mutex);
 		list_add_rcu(&new_stat->list, &hw_stats_list);
-		spin_unlock(&trace_state_lock);
+		mutex_unlock(&trace_state_mutex);
 		break;
 	case NETDEV_UNREGISTER:
-		spin_lock(&trace_state_lock);
+		mutex_lock(&trace_state_mutex);
 		list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) {
 			if (new_stat->dev == dev) {
 				new_stat->dev = NULL;
@@ -312,7 +312,7 @@ static int dropmon_net_event(struct notifier_block *ev_block,
 				}
 			}
 		}
-		spin_unlock(&trace_state_lock);
+		mutex_unlock(&trace_state_mutex);
 		break;
 	}
 out:
-- 
cgit v1.2.3-59-g8ed1b


From 3885ca785a3618593226687ced84f3f336dc3860 Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Fri, 27 Apr 2012 10:11:49 +0000
Subject: drop_monitor: Make updating data->skb smp safe

Eric Dumazet pointed out to me that the drop_monitor protocol has some holes in
its smp protections.  Specifically, its possible to replace data->skb while its
being written.  This patch corrects that by making data->skb an rcu protected
variable.  That will prevent it from being overwritten while a tracepoint is
modifying it.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: David Miller <davem@davemloft.net>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/drop_monitor.c | 70 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 54 insertions(+), 16 deletions(-)

(limited to 'net/core')

diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index a221a5bbecf7..7592943513e3 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -46,7 +46,7 @@ static DEFINE_MUTEX(trace_state_mutex);
 
 struct per_cpu_dm_data {
 	struct work_struct dm_alert_work;
-	struct sk_buff *skb;
+	struct sk_buff __rcu *skb;
 	atomic_t dm_hit_count;
 	struct timer_list send_timer;
 };
@@ -73,35 +73,58 @@ static int dm_hit_limit = 64;
 static int dm_delay = 1;
 static unsigned long dm_hw_check_delta = 2*HZ;
 static LIST_HEAD(hw_stats_list);
+static int initialized = 0;
 
 static void reset_per_cpu_data(struct per_cpu_dm_data *data)
 {
 	size_t al;
 	struct net_dm_alert_msg *msg;
 	struct nlattr *nla;
+	struct sk_buff *skb;
+	struct sk_buff *oskb = rcu_dereference_protected(data->skb, 1);
 
 	al = sizeof(struct net_dm_alert_msg);
 	al += dm_hit_limit * sizeof(struct net_dm_drop_point);
 	al += sizeof(struct nlattr);
 
-	data->skb = genlmsg_new(al, GFP_KERNEL);
-	genlmsg_put(data->skb, 0, 0, &net_drop_monitor_family,
-			0, NET_DM_CMD_ALERT);
-	nla = nla_reserve(data->skb, NLA_UNSPEC, sizeof(struct net_dm_alert_msg));
-	msg = nla_data(nla);
-	memset(msg, 0, al);
-	atomic_set(&data->dm_hit_count, dm_hit_limit);
+	skb = genlmsg_new(al, GFP_KERNEL);
+
+	if (skb) {
+		genlmsg_put(skb, 0, 0, &net_drop_monitor_family,
+				0, NET_DM_CMD_ALERT);
+		nla = nla_reserve(skb, NLA_UNSPEC,
+				  sizeof(struct net_dm_alert_msg));
+		msg = nla_data(nla);
+		memset(msg, 0, al);
+	} else if (initialized)
+		schedule_work_on(smp_processor_id(), &data->dm_alert_work);
+
+	/*
+	 * Don't need to lock this, since we are guaranteed to only
+	 * run this on a single cpu at a time.
+	 * Note also that we only update data->skb if the old and new skb
+	 * pointers don't match.  This ensures that we don't continually call
+	 * synchornize_rcu if we repeatedly fail to alloc a new netlink message.
+	 */
+	if (skb != oskb) {
+		rcu_assign_pointer(data->skb, skb);
+
+		synchronize_rcu();
+
+		atomic_set(&data->dm_hit_count, dm_hit_limit);
+	}
+
 }
 
 static void send_dm_alert(struct work_struct *unused)
 {
 	struct sk_buff *skb;
-	struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data);
+	struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data);
 
 	/*
 	 * Grab the skb we're about to send
 	 */
-	skb = data->skb;
+	skb = rcu_dereference_protected(data->skb, 1);
 
 	/*
 	 * Replace it with a new one
@@ -111,8 +134,10 @@ static void send_dm_alert(struct work_struct *unused)
 	/*
 	 * Ship it!
 	 */
-	genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL);
+	if (skb)
+		genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL);
 
+	put_cpu_var(dm_cpu_data);
 }
 
 /*
@@ -123,9 +148,11 @@ static void send_dm_alert(struct work_struct *unused)
  */
 static void sched_send_work(unsigned long unused)
 {
-	struct per_cpu_dm_data *data =  &__get_cpu_var(dm_cpu_data);
+	struct per_cpu_dm_data *data =  &get_cpu_var(dm_cpu_data);
+
+	schedule_work_on(smp_processor_id(), &data->dm_alert_work);
 
-	schedule_work(&data->dm_alert_work);
+	put_cpu_var(dm_cpu_data);
 }
 
 static void trace_drop_common(struct sk_buff *skb, void *location)
@@ -134,9 +161,16 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
 	struct nlmsghdr *nlh;
 	struct nlattr *nla;
 	int i;
-	struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data);
+	struct sk_buff *dskb;
+	struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data);
 
 
+	rcu_read_lock();
+	dskb = rcu_dereference(data->skb);
+
+	if (!dskb)
+		goto out;
+
 	if (!atomic_add_unless(&data->dm_hit_count, -1, 0)) {
 		/*
 		 * we're already at zero, discard this hit
@@ -144,7 +178,7 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
 		goto out;
 	}
 
-	nlh = (struct nlmsghdr *)data->skb->data;
+	nlh = (struct nlmsghdr *)dskb->data;
 	nla = genlmsg_data(nlmsg_data(nlh));
 	msg = nla_data(nla);
 	for (i = 0; i < msg->entries; i++) {
@@ -158,7 +192,7 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
 	/*
 	 * We need to create a new entry
 	 */
-	__nla_reserve_nohdr(data->skb, sizeof(struct net_dm_drop_point));
+	__nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point));
 	nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point));
 	memcpy(msg->points[msg->entries].pc, &location, sizeof(void *));
 	msg->points[msg->entries].count = 1;
@@ -170,6 +204,8 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
 	}
 
 out:
+	rcu_read_unlock();
+	put_cpu_var(dm_cpu_data);
 	return;
 }
 
@@ -375,6 +411,8 @@ static int __init init_net_drop_monitor(void)
 		data->send_timer.function = sched_send_work;
 	}
 
+	initialized = 1;
+
 	goto out;
 
 out_unreg:
-- 
cgit v1.2.3-59-g8ed1b


From 4fdcfa12843bca38d0c9deff70c8720e4e8f515f Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Tue, 1 May 2012 08:18:02 +0000
Subject: drop_monitor: prevent init path from scheduling on the wrong cpu

I just noticed after some recent updates, that the init path for the drop
monitor protocol has a minor error.  drop monitor maintains a per cpu structure,
that gets initalized from a single cpu.  Normally this is fine, as the protocol
isn't in use yet, but I recently made a change that causes a failed skb
allocation to reschedule itself .  Given the current code, the implication is
that this workqueue reschedule will take place on the wrong cpu.  If drop
monitor is used early during the boot process, its possible that two cpus will
access a single per-cpu structure in parallel, possibly leading to data
corruption.

This patch fixes the situation, by storing the cpu number that a given instance
of this per-cpu data should be accessed from.  In the case of a need for a
reschedule, the cpu stored in the struct is assigned the rescheule, rather than
the currently executing cpu

Tested successfully by myself.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: David Miller <davem@davemloft.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/drop_monitor.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'net/core')

diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 7592943513e3..a7cad741df01 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -49,6 +49,7 @@ struct per_cpu_dm_data {
 	struct sk_buff __rcu *skb;
 	atomic_t dm_hit_count;
 	struct timer_list send_timer;
+	int cpu;
 };
 
 struct dm_hw_stat_delta {
@@ -73,7 +74,6 @@ static int dm_hit_limit = 64;
 static int dm_delay = 1;
 static unsigned long dm_hw_check_delta = 2*HZ;
 static LIST_HEAD(hw_stats_list);
-static int initialized = 0;
 
 static void reset_per_cpu_data(struct per_cpu_dm_data *data)
 {
@@ -96,8 +96,8 @@ static void reset_per_cpu_data(struct per_cpu_dm_data *data)
 				  sizeof(struct net_dm_alert_msg));
 		msg = nla_data(nla);
 		memset(msg, 0, al);
-	} else if (initialized)
-		schedule_work_on(smp_processor_id(), &data->dm_alert_work);
+	} else
+		schedule_work_on(data->cpu, &data->dm_alert_work);
 
 	/*
 	 * Don't need to lock this, since we are guaranteed to only
@@ -121,6 +121,8 @@ static void send_dm_alert(struct work_struct *unused)
 	struct sk_buff *skb;
 	struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data);
 
+	WARN_ON_ONCE(data->cpu != smp_processor_id());
+
 	/*
 	 * Grab the skb we're about to send
 	 */
@@ -404,14 +406,14 @@ static int __init init_net_drop_monitor(void)
 
 	for_each_present_cpu(cpu) {
 		data = &per_cpu(dm_cpu_data, cpu);
-		reset_per_cpu_data(data);
+		data->cpu = cpu;
 		INIT_WORK(&data->dm_alert_work, send_dm_alert);
 		init_timer(&data->send_timer);
 		data->send_timer.data = cpu;
 		data->send_timer.function = sched_send_work;
+		reset_per_cpu_data(data);
 	}
 
-	initialized = 1;
 
 	goto out;
 
-- 
cgit v1.2.3-59-g8ed1b