From 5f286e113fa377e50bd18fc45e5a0d4d83f6950c Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov
Date: Sat, 28 Apr 2007 20:57:37 -0700
Subject: [NETPOLL]: Fix TX queue overflow in trapped mode.

CONFIG_NETPOLL_TRAP causes the TX queue controls to be completely bypassed
in netpoll's "trapped" mode, which easily causes overflows in drivers with
short TX queues (most notably, in 8139too with its 4-deep queue). So, make
this option more sensible by having it bypass only the TX softirq wakeup.

Signed-off-by: Sergei Shtylyov
Acked-by: Jeff Garzik
Acked-by: Tom Rini
Acked-by: Matt Mackall
Signed-off-by: David S. Miller
---
 include/linux/netdevice.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e027a3750a77..24cef42f1e0f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -654,8 +654,10 @@ static inline void netif_start_queue(struct net_device *dev)
 static inline void netif_wake_queue(struct net_device *dev)
 {
 #ifdef CONFIG_NETPOLL_TRAP
-	if (netpoll_trap())
+	if (netpoll_trap()) {
+		clear_bit(__LINK_STATE_XOFF, &dev->state);
 		return;
+	}
 #endif
 	if (test_and_clear_bit(__LINK_STATE_XOFF, &dev->state))
 		__netif_schedule(dev);
@@ -663,10 +665,6 @@ static inline void netif_wake_queue(struct net_device *dev)
 
 static inline void netif_stop_queue(struct net_device *dev)
 {
-#ifdef CONFIG_NETPOLL_TRAP
-	if (netpoll_trap())
-		return;
-#endif
 	set_bit(__LINK_STATE_XOFF, &dev->state);
 }
 
--
cgit v1.2.3-59-g8ed1b
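For context, the queue controls at issue are driven from the driver's transmit
path. A minimal sketch of that pattern, assuming a hypothetical driver with a
short TX ring; the example_* names and ring size are illustrative, not from
the patch:

	/* Hypothetical driver TX routine. netif_stop_queue() is the call
	 * that the old CONFIG_NETPOLL_TRAP code turned into a no-op in
	 * trapped mode, letting a short ring (4 entries in 8139too)
	 * overflow. example_priv, tx_pending and EXAMPLE_TX_RING_SIZE
	 * are assumptions for illustration.
	 */
	static int example_start_xmit(struct sk_buff *skb, struct net_device *dev)
	{
		struct example_priv *priv = netdev_priv(dev);

		example_queue_to_ring(priv, skb);	/* hand skb to hardware */
		if (++priv->tx_pending == EXAMPLE_TX_RING_SIZE)
			netif_stop_queue(dev);		/* must not be bypassed */
		return 0;
	}

With the patch, netif_stop_queue() always marks the queue stopped; trapped
mode only skips the softirq wakeup in netif_wake_queue().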
From 5a1b5898ee9e0bf68a86609ecb9775457b1857a5 Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Sat, 28 Apr 2007 21:04:03 -0700
Subject: [NET]: Remove NETIF_F_INTERNAL_STATS, default to internal stats.

Herbert Xu convinced me that a new flag was overkill; every driver
currently overrides get_stats, so we might as well make the internal
one the default. If someone did fail to set get_stats, they would now
get all-zero stats instead of "No statistics available".

Signed-off-by: Rusty Russell
Acked-by: Herbert Xu
Signed-off-by: David S. Miller
---
 arch/s390/appldata/appldata_net_sum.c |  3 --
 drivers/net/bonding/bond_main.c       | 59 +++++++++++++++--------------------
 drivers/parisc/led.c                  |  2 --
 include/linux/netdevice.h             |  1 -
 net/core/dev.c                        | 45 ++++++++++++--------------
 5 files changed, 45 insertions(+), 65 deletions(-)

(limited to 'include')

diff --git a/arch/s390/appldata/appldata_net_sum.c b/arch/s390/appldata/appldata_net_sum.c
index 516b3ac9a9b5..a43f3488fecf 100644
--- a/arch/s390/appldata/appldata_net_sum.c
+++ b/arch/s390/appldata/appldata_net_sum.c
@@ -109,9 +109,6 @@ static void appldata_get_net_sum_data(void *data)
 	read_lock(&dev_base_lock);
 	for (dev = dev_base; dev != NULL; dev = dev->next) {
 		stats = dev->get_stats(dev);
-		if (stats == NULL) {
-			continue;
-		}
 		rx_packets += stats->rx_packets;
 		tx_packets += stats->tx_packets;
 		rx_bytes += stats->rx_bytes;
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index cea3783c92c5..724bce51f936 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1360,13 +1360,6 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 		goto err_undo_flags;
 	}
 
-	if (slave_dev->get_stats == NULL) {
-		printk(KERN_NOTICE DRV_NAME
-			": %s: the driver for slave device %s does not provide "
-			"get_stats function, network statistics will be "
-			"inaccurate.\n", bond_dev->name, slave_dev->name);
-	}
-
 	new_slave = kzalloc(sizeof(struct slave), GFP_KERNEL);
 	if (!new_slave) {
 		res = -ENOMEM;
@@ -3641,33 +3634,31 @@ static struct net_device_stats *bond_get_stats(struct net_device *bond_dev)
 
 	bond_for_each_slave(bond, slave, i) {
 		sstats = slave->dev->get_stats(slave->dev);
-		if (sstats) {
-			stats->rx_packets += sstats->rx_packets;
-			stats->rx_bytes += sstats->rx_bytes;
-			stats->rx_errors += sstats->rx_errors;
-			stats->rx_dropped += sstats->rx_dropped;
-
-			stats->tx_packets += sstats->tx_packets;
-			stats->tx_bytes += sstats->tx_bytes;
-			stats->tx_errors += sstats->tx_errors;
-			stats->tx_dropped += sstats->tx_dropped;
-
-			stats->multicast += sstats->multicast;
-			stats->collisions += sstats->collisions;
-
-			stats->rx_length_errors += sstats->rx_length_errors;
-			stats->rx_over_errors += sstats->rx_over_errors;
-			stats->rx_crc_errors += sstats->rx_crc_errors;
-			stats->rx_frame_errors += sstats->rx_frame_errors;
-			stats->rx_fifo_errors += sstats->rx_fifo_errors;
-			stats->rx_missed_errors += sstats->rx_missed_errors;
-
-			stats->tx_aborted_errors += sstats->tx_aborted_errors;
-			stats->tx_carrier_errors += sstats->tx_carrier_errors;
-			stats->tx_fifo_errors += sstats->tx_fifo_errors;
-			stats->tx_heartbeat_errors += sstats->tx_heartbeat_errors;
-			stats->tx_window_errors += sstats->tx_window_errors;
-		}
+		stats->rx_packets += sstats->rx_packets;
+		stats->rx_bytes += sstats->rx_bytes;
+		stats->rx_errors += sstats->rx_errors;
+		stats->rx_dropped += sstats->rx_dropped;
+
+		stats->tx_packets += sstats->tx_packets;
+		stats->tx_bytes += sstats->tx_bytes;
+		stats->tx_errors += sstats->tx_errors;
+		stats->tx_dropped += sstats->tx_dropped;
+
+		stats->multicast += sstats->multicast;
+		stats->collisions += sstats->collisions;
+
+		stats->rx_length_errors += sstats->rx_length_errors;
+		stats->rx_over_errors += sstats->rx_over_errors;
+		stats->rx_crc_errors += sstats->rx_crc_errors;
+		stats->rx_frame_errors += sstats->rx_frame_errors;
+		stats->rx_fifo_errors += sstats->rx_fifo_errors;
+		stats->rx_missed_errors += sstats->rx_missed_errors;
+
+		stats->tx_aborted_errors += sstats->tx_aborted_errors;
+		stats->tx_carrier_errors += sstats->tx_carrier_errors;
+		stats->tx_fifo_errors += sstats->tx_fifo_errors;
+		stats->tx_heartbeat_errors += sstats->tx_heartbeat_errors;
+		stats->tx_window_errors += sstats->tx_window_errors;
 	}
 
 	read_unlock_bh(&bond->lock);
diff --git a/drivers/parisc/led.c b/drivers/parisc/led.c
index 453e6829756c..3df82fe9ce8c 100644
--- a/drivers/parisc/led.c
+++ b/drivers/parisc/led.c
@@ -373,8 +373,6 @@ static __inline__ int led_get_net_activity(void)
 		if (LOOPBACK(in_dev->ifa_list->ifa_local))
 			continue;
 		stats = dev->get_stats(dev);
-		if (!stats)
-			continue;
 		rx_total += stats->rx_packets;
 		tx_total += stats->tx_packets;
 	}
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 24cef42f1e0f..ac0c92b1e002 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -325,7 +325,6 @@ struct net_device
 #define NETIF_F_VLAN_CHALLENGED	1024	/* Device cannot handle VLAN packets */
 #define NETIF_F_GSO		2048	/* Enable software GSO. */
 #define NETIF_F_LLTX		4096	/* LockLess TX */
-#define NETIF_F_INTERNAL_STATS	8192	/* Use stats structure in net_device */
 
 /* Segmentation offload features */
 #define NETIF_F_GSO_SHIFT	16
diff --git a/net/core/dev.c b/net/core/dev.c
index d5e42d13bd67..eb999003bbb7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2101,26 +2101,23 @@ static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
 {
 	struct net_device_stats *stats = dev->get_stats(dev);
 
-	if (stats) {
-		seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
-			   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
-			   dev->name, stats->rx_bytes, stats->rx_packets,
-			   stats->rx_errors,
-			   stats->rx_dropped + stats->rx_missed_errors,
-			   stats->rx_fifo_errors,
-			   stats->rx_length_errors + stats->rx_over_errors +
-			    stats->rx_crc_errors + stats->rx_frame_errors,
-			   stats->rx_compressed, stats->multicast,
-			   stats->tx_bytes, stats->tx_packets,
-			   stats->tx_errors, stats->tx_dropped,
-			   stats->tx_fifo_errors, stats->collisions,
-			   stats->tx_carrier_errors +
-			    stats->tx_aborted_errors +
-			    stats->tx_window_errors +
-			    stats->tx_heartbeat_errors,
-			   stats->tx_compressed);
-	} else
-		seq_printf(seq, "%6s: No statistics available.\n", dev->name);
+	seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
+		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
+		   dev->name, stats->rx_bytes, stats->rx_packets,
+		   stats->rx_errors,
+		   stats->rx_dropped + stats->rx_missed_errors,
+		   stats->rx_fifo_errors,
+		   stats->rx_length_errors + stats->rx_over_errors +
+		    stats->rx_crc_errors + stats->rx_frame_errors,
+		   stats->rx_compressed, stats->multicast,
+		   stats->tx_bytes, stats->tx_packets,
+		   stats->tx_errors, stats->tx_dropped,
+		   stats->tx_fifo_errors, stats->collisions,
+		   stats->tx_carrier_errors +
+		    stats->tx_aborted_errors +
+		    stats->tx_window_errors +
+		    stats->tx_heartbeat_errors,
+		   stats->tx_compressed);
 }
 
 /*
@@ -3257,11 +3254,9 @@ out:
 	mutex_unlock(&net_todo_run_mutex);
 }
 
-static struct net_device_stats *maybe_internal_stats(struct net_device *dev)
+static struct net_device_stats *internal_stats(struct net_device *dev)
 {
-	if (dev->features & NETIF_F_INTERNAL_STATS)
-		return &dev->stats;
-	return NULL;
+	return &dev->stats;
 }
 
 /**
@@ -3299,7 +3294,7 @@ struct net_device *alloc_netdev(int sizeof_priv, const char *name,
 	if (sizeof_priv)
 		dev->priv = netdev_priv(dev);
 
-	dev->get_stats = maybe_internal_stats;
+	dev->get_stats = internal_stats;
 	setup(dev);
 	strcpy(dev->name, name);
 	return dev;
--
cgit v1.2.3-59-g8ed1b
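The practical effect for drivers: with internal_stats as the default, a driver
that accounts into the net_device's own stats structure needs no get_stats
method at all. A minimal sketch; dev->stats is from the patched kernel, the
surrounding function is an assumption for illustration:

	/* Illustrative RX handler relying on the new default get_stats. */
	static void example_receive(struct net_device *dev, struct sk_buff *skb)
	{
		dev->stats.rx_packets++;
		dev->stats.rx_bytes += skb->len;
		netif_rx(skb);		/* no get_stats override required */
	}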
From ecfd6b183780c6d9e85873693b3ce6c5f4d08b58 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim
Date: Sat, 28 Apr 2007 21:20:32 -0700
Subject: [XFRM]: Export SPD info

With this patch you can use iproute2 in user space to efficiently see
how many policies exist in different directions.

Signed-off-by: Jamal Hadi Salim
Signed-off-by: David S. Miller
---
 include/linux/xfrm.h   | 35 +++++++++++++++++++++++
 include/net/xfrm.h     | 13 +++++++++
 net/xfrm/xfrm_policy.c | 16 ++++++++++-
 net/xfrm/xfrm_user.c   | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 140 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index 9c656a5cf842..a5d53e0fe152 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -185,6 +185,11 @@ enum {
 #define XFRM_MSG_NEWSADINFO XFRM_MSG_NEWSADINFO
 	XFRM_MSG_GETSADINFO,
 #define XFRM_MSG_GETSADINFO XFRM_MSG_GETSADINFO
+
+	XFRM_MSG_NEWSPDINFO,
+#define XFRM_MSG_NEWSPDINFO XFRM_MSG_NEWSPDINFO
+	XFRM_MSG_GETSPDINFO,
+#define XFRM_MSG_GETSPDINFO XFRM_MSG_GETSPDINFO
 	__XFRM_MSG_MAX
 };
 #define XFRM_MSG_MAX (__XFRM_MSG_MAX - 1)
@@ -290,6 +295,36 @@ enum xfrm_sadattr_type_t {
 #define XFRMA_SAD_MAX (__XFRMA_SAD_MAX - 1)
 };
 
+/* SPD Table filter flags */
+enum xfrm_spd_ftype_t {
+	XFRM_SPD_UNSPEC,
+	XFRM_SPD_HMASK=1,
+	XFRM_SPD_HMAX=2,
+	XFRM_SPD_ICNT=4,
+	XFRM_SPD_OCNT=8,
+	XFRM_SPD_FCNT=16,
+	XFRM_SPD_ISCNT=32,
+	XFRM_SPD_OSCNT=64,
+	XFRM_SPD_FSCNT=128,
+	__XFRM_SPD_MAX
+
+#define XFRM_SPD_MAX (__XFRM_SPD_MAX - 1)
+};
+
+enum xfrm_spdattr_type_t {
+	XFRMA_SPD_UNSPEC,
+	XFRMA_SPDHMASK,
+	XFRMA_SPDHMAX,
+	XFRMA_SPDICNT,
+	XFRMA_SPDOCNT,
+	XFRMA_SPDFCNT,
+	XFRMA_SPDISCNT,
+	XFRMA_SPDOSCNT,
+	XFRMA_SPDFSCNT,
+	__XFRMA_SPD_MAX
+
+#define XFRMA_SPD_MAX (__XFRMA_SPD_MAX - 1)
+};
+
 struct xfrm_usersa_info {
 	struct xfrm_selector		sel;
 	struct xfrm_id			id;
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 8287081d77f2..9561bf817b02 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -423,6 +423,18 @@ struct xfrm_sadinfo
 	u32	sadhmcnt; /* max allowed hash bkts */
 	u32	sadcnt;   /* current running count */
 };
+
+struct xfrm_spdinfo
+{
+	u32 incnt;
+	u32 outcnt;
+	u32 fwdcnt;
+	u32 inscnt;
+	u32 outscnt;
+	u32 fwdscnt;
+	u32 spdhcnt;
+	u32 spdhmcnt;
+};
 #ifdef CONFIG_AUDITSYSCALL
 extern void xfrm_audit_log(uid_t auid, u32 secid, int type, int result,
 		    struct xfrm_policy *xp, struct xfrm_state *x);
@@ -946,6 +958,7 @@ extern struct xfrm_state *xfrm_find_acq_byseq(u32 seq);
 extern int xfrm_state_delete(struct xfrm_state *x);
 extern void xfrm_state_flush(u8 proto, struct xfrm_audit *audit_info);
 extern void xfrm_sad_getinfo(struct xfrm_sadinfo *si);
+extern void xfrm_spd_getinfo(struct xfrm_spdinfo *si);
 extern int xfrm_replay_check(struct xfrm_state *x, __be32 seq);
 extern void xfrm_replay_advance(struct xfrm_state *x, __be32 seq);
 extern void xfrm_replay_notify(struct xfrm_state *x, int event);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 762926009c04..dbf9d96a2f0b 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -579,8 +579,22 @@ static inline int xfrm_byidx_should_resize(int total)
 	return 0;
 }
 
-static DEFINE_MUTEX(hash_resize_mutex);
+void xfrm_spd_getinfo(struct xfrm_spdinfo *si)
+{
+	read_lock_bh(&xfrm_policy_lock);
+	si->incnt = xfrm_policy_count[XFRM_POLICY_IN];
+	si->outcnt = xfrm_policy_count[XFRM_POLICY_OUT];
+	si->fwdcnt = xfrm_policy_count[XFRM_POLICY_FWD];
+	si->inscnt = xfrm_policy_count[XFRM_POLICY_IN+XFRM_POLICY_MAX];
+	si->outscnt = xfrm_policy_count[XFRM_POLICY_OUT+XFRM_POLICY_MAX];
+	si->fwdscnt = xfrm_policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX];
+	si->spdhcnt = xfrm_idx_hmask;
+	si->spdhmcnt = xfrm_policy_hashmax;
+	read_unlock_bh(&xfrm_policy_lock);
+}
+EXPORT_SYMBOL(xfrm_spd_getinfo);
 
+static DEFINE_MUTEX(hash_resize_mutex);
 static void xfrm_hash_resize(struct work_struct *__unused)
 {
 	int dir, total;
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 69110fed64b6..4210d91624cd 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -672,6 +672,81 @@ static struct sk_buff *xfrm_state_netlink(struct sk_buff *in_skb,
 	return skb;
 }
 
+static int build_spdinfo(struct sk_buff *skb, u32 pid, u32 seq, u32 flags)
+{
+	struct xfrm_spdinfo si;
+	struct nlmsghdr *nlh;
+	u32 *f;
+
+	nlh = nlmsg_put(skb, pid, seq, XFRM_MSG_NEWSPDINFO, sizeof(u32), 0);
+	if (nlh == NULL) /* shouldnt really happen ... */
+		return -EMSGSIZE;
+
+	f = nlmsg_data(nlh);
+	*f = flags;
+	xfrm_spd_getinfo(&si);
+
+	if (flags & XFRM_SPD_HMASK)
+		NLA_PUT_U32(skb, XFRMA_SPDHMASK, si.spdhcnt);
+	if (flags & XFRM_SPD_HMAX)
+		NLA_PUT_U32(skb, XFRMA_SPDHMAX, si.spdhmcnt);
+	if (flags & XFRM_SPD_ICNT)
+		NLA_PUT_U32(skb, XFRMA_SPDICNT, si.incnt);
+	if (flags & XFRM_SPD_OCNT)
+		NLA_PUT_U32(skb, XFRMA_SPDOCNT, si.outcnt);
+	if (flags & XFRM_SPD_FCNT)
+		NLA_PUT_U32(skb, XFRMA_SPDFCNT, si.fwdcnt);
+	if (flags & XFRM_SPD_ISCNT)
+		NLA_PUT_U32(skb, XFRMA_SPDISCNT, si.inscnt);
+	if (flags & XFRM_SPD_OSCNT)
+		NLA_PUT_U32(skb, XFRMA_SPDOSCNT, si.outscnt);
+	if (flags & XFRM_SPD_FSCNT)
+		NLA_PUT_U32(skb, XFRMA_SPDFSCNT, si.fwdscnt);
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+static int xfrm_get_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh,
+		struct rtattr **xfrma)
+{
+	struct sk_buff *r_skb;
+	u32 *flags = NLMSG_DATA(nlh);
+	u32 spid = NETLINK_CB(skb).pid;
+	u32 seq = nlh->nlmsg_seq;
+	int len = NLMSG_LENGTH(sizeof(u32));
+
+	if (*flags & XFRM_SPD_HMASK)
+		len += RTA_SPACE(sizeof(u32));
+	if (*flags & XFRM_SPD_HMAX)
+		len += RTA_SPACE(sizeof(u32));
+	if (*flags & XFRM_SPD_ICNT)
+		len += RTA_SPACE(sizeof(u32));
+	if (*flags & XFRM_SPD_OCNT)
+		len += RTA_SPACE(sizeof(u32));
+	if (*flags & XFRM_SPD_FCNT)
+		len += RTA_SPACE(sizeof(u32));
+	if (*flags & XFRM_SPD_ISCNT)
+		len += RTA_SPACE(sizeof(u32));
+	if (*flags & XFRM_SPD_OSCNT)
+		len += RTA_SPACE(sizeof(u32));
+	if (*flags & XFRM_SPD_FSCNT)
+		len += RTA_SPACE(sizeof(u32));
+
+	r_skb = alloc_skb(len, GFP_ATOMIC);
+	if (r_skb == NULL)
+		return -ENOMEM;
+
+	if (build_spdinfo(r_skb, spid, seq, *flags) < 0)
+		BUG();
+
+	return nlmsg_unicast(xfrm_nl, r_skb, spid);
+}
+
 static int build_sadinfo(struct sk_buff *skb, u32 pid, u32 seq, u32 flags)
 {
 	struct xfrm_sadinfo si;
@@ -1879,6 +1954,7 @@ static const int xfrm_msg_min[XFRM_NR_MSGTYPES] = {
 	[XFRM_MSG_REPORT      - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_report),
 	[XFRM_MSG_MIGRATE     - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id),
 	[XFRM_MSG_GETSADINFO  - XFRM_MSG_BASE] = NLMSG_LENGTH(sizeof(u32)),
+	[XFRM_MSG_GETSPDINFO  - XFRM_MSG_BASE] = NLMSG_LENGTH(sizeof(u32)),
 };
 
 #undef XMSGSIZE
 
@@ -1907,6 +1983,7 @@ static struct xfrm_link {
 	[XFRM_MSG_GETAE       - XFRM_MSG_BASE] = { .doit = xfrm_get_ae  },
 	[XFRM_MSG_MIGRATE     - XFRM_MSG_BASE] = { .doit = xfrm_do_migrate    },
 	[XFRM_MSG_GETSADINFO  - XFRM_MSG_BASE] = { .doit = xfrm_get_sadinfo   },
+	[XFRM_MSG_GETSPDINFO  - XFRM_MSG_BASE] = { .doit = xfrm_get_spdinfo   },
 };
 
 static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
--
cgit v1.2.3-59-g8ed1b
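A hedged user-space sketch of how such a query could be issued directly over
netlink, mirroring the u32 flags payload that xfrm_get_spdinfo() reads via
NLMSG_DATA; error handling is trimmed and iproute2 is the intended consumer:

	#include <string.h>
	#include <unistd.h>
	#include <sys/socket.h>
	#include <linux/netlink.h>
	#include <linux/xfrm.h>

	int main(void)
	{
		struct {
			struct nlmsghdr n;
			__u32 flags;	/* matches NLMSG_DATA(nlh) in xfrm_get_spdinfo */
		} req;
		int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_XFRM);

		memset(&req, 0, sizeof(req));
		req.n.nlmsg_len = NLMSG_LENGTH(sizeof(__u32));
		req.n.nlmsg_type = XFRM_MSG_GETSPDINFO;
		req.n.nlmsg_flags = NLM_F_REQUEST;
		req.flags = XFRM_SPD_ICNT | XFRM_SPD_OCNT | XFRM_SPD_FCNT;
		send(fd, &req, req.n.nlmsg_len, 0);
		/* the XFRM_MSG_NEWSPDINFO reply carries XFRMA_SPD* attributes */
		close(fd);
		return 0;
	}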
From 04b090d50c88ac8e5ec9c2e985bb65bd153893aa Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky
Date: Sat, 28 Apr 2007 23:03:59 -0700
Subject: [AF_IUCV/IUCV]: smp_call_function deadlock

Calling smp_call_function can lead to a deadlock if it is called from
tasklet context. Fixing this deadlock requires moving the
smp_call_function from the tasklet context to a work queue. To do that,
queue the path pending interrupts to a separate list and move the path
cleanup out of iucv_path_sever to iucv_path_connect and
iucv_path_pending. This creates a new requirement for
iucv_path_connect: it may not be called from tasklet context anymore.

Also fixed a compile problem for CONFIG_HOTPLUG_CPU=n and another one
when walking the cpu_online mask. When doing this, we must disable cpu
hotplug.

Signed-off-by: Frank Pavlic
Signed-off-by: Martin Schwidefsky
Signed-off-by: David S. Miller
---
 include/net/iucv/iucv.h |   2 +-
 net/iucv/iucv.c         | 205 +++++++++++++++++++++++++++++++-----------------
 2 files changed, 133 insertions(+), 74 deletions(-)

(limited to 'include')

diff --git a/include/net/iucv/iucv.h b/include/net/iucv/iucv.h
index 746e7416261e..fd70adbb3566 100644
--- a/include/net/iucv/iucv.h
+++ b/include/net/iucv/iucv.h
@@ -16,7 +16,7 @@
  *	completed a register, it can exploit the other functions.
  *	For furthur reference on all IUCV functionality, refer to the
  *	CP Programming Services book, also available on the web thru
- *	www.ibm.com/s390/vm/pubs, manual # SC24-5760
+ *	www.vm.ibm.com/pubs, manual # SC24-6084
  *
  *	Definition of Return Codes
  *	- All positive return codes including zero are reflected back
diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c
index 60f293842a39..903bdb6eaaa1 100644
--- a/net/iucv/iucv.c
+++ b/net/iucv/iucv.c
@@ -90,20 +90,43 @@ struct iucv_irq_data {
 	u32 res2[8];
 };
 
-struct iucv_work {
+struct iucv_irq_list {
 	struct list_head list;
 	struct iucv_irq_data data;
 };
 
-static LIST_HEAD(iucv_work_queue);
-static DEFINE_SPINLOCK(iucv_work_lock);
-
 static struct iucv_irq_data *iucv_irq_data;
 static cpumask_t iucv_buffer_cpumask = CPU_MASK_NONE;
 static cpumask_t iucv_irq_cpumask = CPU_MASK_NONE;
 
-static void iucv_tasklet_handler(unsigned long);
-static DECLARE_TASKLET(iucv_tasklet, iucv_tasklet_handler,0);
+/*
+ * Queue of interrupt buffers lock for delivery via the tasklet
+ * (fast but can't call smp_call_function).
+ */
+static LIST_HEAD(iucv_task_queue);
+
+/*
+ * The tasklet for fast delivery of iucv interrupts.
+ */
+static void iucv_tasklet_fn(unsigned long);
+static DECLARE_TASKLET(iucv_tasklet, iucv_tasklet_fn,0);
+
+/*
+ * Queue of interrupt buffers for delivery via a work queue
+ * (slower but can call smp_call_function).
+ */
+static LIST_HEAD(iucv_work_queue);
+
+/*
+ * The work element to deliver path pending interrupts.
+ */
+static void iucv_work_fn(struct work_struct *work);
+static DECLARE_WORK(iucv_work, iucv_work_fn);
+
+/*
+ * Spinlock protecting task and work queue.
+ */
+static DEFINE_SPINLOCK(iucv_queue_lock);
 
 enum iucv_command_codes {
 	IUCV_QUERY = 0,
@@ -147,10 +170,10 @@ static unsigned long iucv_max_pathid;
 static DEFINE_SPINLOCK(iucv_table_lock);
 
 /*
- * iucv_tasklet_cpu: contains the number of the cpu executing the tasklet.
- * Needed for iucv_path_sever called from tasklet.
+ * iucv_active_cpu: contains the number of the cpu executing the tasklet
+ * or the work handler. Needed for iucv_path_sever called from tasklet.
  */
-static int iucv_tasklet_cpu = -1;
+static int iucv_active_cpu = -1;
 
 /*
  * Mutex and wait queue for iucv_register/iucv_unregister.
@@ -449,17 +472,19 @@ static void iucv_setmask_mp(void)
 {
 	int cpu;
 
+	preempt_disable();
 	for_each_online_cpu(cpu)
 		/* Enable all cpus with a declared buffer. */
 		if (cpu_isset(cpu, iucv_buffer_cpumask) &&
 		    !cpu_isset(cpu, iucv_irq_cpumask))
 			smp_call_function_on(iucv_allow_cpu, NULL, 0, 1, cpu);
+	preempt_enable();
 }
 
 /**
  * iucv_setmask_up
  *
- * Allow iucv interrupts on a single cpus.
+ * Allow iucv interrupts on a single cpu.
  */
 static void iucv_setmask_up(void)
 {
@@ -493,8 +518,10 @@ static int iucv_enable(void)
 		goto out;
 	/* Declare per cpu buffers. */
 	rc = -EIO;
+	preempt_disable();
 	for_each_online_cpu(cpu)
 		smp_call_function_on(iucv_declare_cpu, NULL, 0, 1, cpu);
+	preempt_enable();
 	if (cpus_empty(iucv_buffer_cpumask))
 		/* No cpu could declare an iucv buffer. */
 		goto out_path;
@@ -584,48 +611,49 @@ static int iucv_sever_pathid(u16 pathid, u8 userdata[16])
 	return iucv_call_b2f0(IUCV_SEVER, parm);
 }
 
+#ifdef CONFIG_SMP
 /**
- * __iucv_cleanup_pathid
+ * __iucv_cleanup_queue
 * @dummy: unused dummy argument
 *
 * Nop function called via smp_call_function to force work items from
 * pending external iucv interrupts to the work queue.
 */
-static void __iucv_cleanup_pathid(void *dummy)
+static void __iucv_cleanup_queue(void *dummy)
 {
 }
+#endif
 
 /**
- * iucv_cleanup_pathid
- * @pathid: 16 bit pathid
+ * iucv_cleanup_queue
 *
 * Function called after a path has been severed to find all remaining
 * work items for the now stale pathid. The caller needs to hold the
 * iucv_table_lock.
 */
-static void iucv_cleanup_pathid(u16 pathid)
+static void iucv_cleanup_queue(void)
 {
-	struct iucv_work *p, *n;
+	struct iucv_irq_list *p, *n;
 
 	/*
-	 * Path is severed, the pathid can be reused immediatly on
-	 * a iucv connect or a connection pending interrupt.
-	 * iucv_path_connect and connection pending interrupt will
-	 * wait until the iucv_table_lock is released before the
-	 * recycled pathid enters the system.
-	 * Force remaining interrupts to the work queue, then
-	 * scan the work queue for items of this path.
+	 * When a path is severed, the pathid can be reused immediatly
+	 * on a iucv connect or a connection pending interrupt. Remove
+	 * all entries from the task queue that refer to a stale pathid
+	 * (iucv_path_table[ix] == NULL). Only then do the iucv connect
+	 * or deliver the connection pending interrupt. To get all the
+	 * pending interrupts force them to the work queue by calling
+	 * an empty function on all cpus.
	 */
-	smp_call_function(__iucv_cleanup_pathid, NULL, 0, 1);
-	spin_lock_irq(&iucv_work_lock);
-	list_for_each_entry_safe(p, n, &iucv_work_queue, list) {
-		/* Remove work items for pathid except connection pending */
-		if (p->data.ippathid == pathid && p->data.iptype != 0x01) {
+	smp_call_function(__iucv_cleanup_queue, NULL, 0, 1);
+	spin_lock_irq(&iucv_queue_lock);
+	list_for_each_entry_safe(p, n, &iucv_task_queue, list) {
+		/* Remove stale work items from the task queue. */
+		if (iucv_path_table[p->data.ippathid] == NULL) {
 			list_del(&p->list);
 			kfree(p);
 		}
 	}
-	spin_unlock_irq(&iucv_work_lock);
+	spin_unlock_irq(&iucv_queue_lock);
 }
 
 /**
@@ -684,7 +712,6 @@ void iucv_unregister(struct iucv_handler *handler, int smp)
 		iucv_sever_pathid(p->pathid, NULL);
 		iucv_path_table[p->pathid] = NULL;
 		list_del(&p->list);
-		iucv_cleanup_pathid(p->pathid);
 		iucv_path_free(p);
 	}
 	spin_unlock_bh(&iucv_table_lock);
@@ -757,9 +784,9 @@ int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler,
 	union iucv_param *parm;
 	int rc;
 
-	preempt_disable();
-	if (iucv_tasklet_cpu != smp_processor_id())
-		spin_lock_bh(&iucv_table_lock);
+	BUG_ON(in_atomic());
+	spin_lock_bh(&iucv_table_lock);
+	iucv_cleanup_queue();
 	parm = percpu_ptr(iucv_param, smp_processor_id());
 	memset(parm, 0, sizeof(union iucv_param));
 	parm->ctrl.ipmsglim = path->msglim;
@@ -794,9 +821,7 @@ int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler,
 			rc = -EIO;
 		}
 	}
-	if (iucv_tasklet_cpu != smp_processor_id())
-		spin_unlock_bh(&iucv_table_lock);
-	preempt_enable();
+	spin_unlock_bh(&iucv_table_lock);
 	return rc;
 }
 
@@ -867,15 +892,14 @@ int iucv_path_sever(struct iucv_path *path, u8 userdata[16])
 
 	preempt_disable();
-	if (iucv_tasklet_cpu != smp_processor_id())
+	if (iucv_active_cpu != smp_processor_id())
 		spin_lock_bh(&iucv_table_lock);
 	rc = iucv_sever_pathid(path->pathid, userdata);
 	if (!rc) {
 		iucv_path_table[path->pathid] = NULL;
 		list_del_init(&path->list);
-		iucv_cleanup_pathid(path->pathid);
 	}
-	if (iucv_tasklet_cpu != smp_processor_id())
+	if (iucv_active_cpu != smp_processor_id())
 		spin_unlock_bh(&iucv_table_lock);
 	preempt_enable();
 	return rc;
@@ -1244,8 +1268,7 @@ static void iucv_path_complete(struct iucv_irq_data *data)
 	struct iucv_path_complete *ipc = (void *) data;
 	struct iucv_path *path = iucv_path_table[ipc->ippathid];
 
-	BUG_ON(!path || !path->handler);
-	if (path->handler->path_complete)
+	if (path && path->handler && path->handler->path_complete)
 		path->handler->path_complete(path, ipc->ipuser);
 }
 
@@ -1273,14 +1296,14 @@ static void iucv_path_severed(struct iucv_irq_data *data)
 	struct iucv_path_severed *ips = (void *) data;
 	struct iucv_path *path = iucv_path_table[ips->ippathid];
 
-	BUG_ON(!path || !path->handler);
+	if (!path || !path->handler)	/* Already severed */
+		return;
 	if (path->handler->path_severed)
 		path->handler->path_severed(path, ips->ipuser);
 	else {
 		iucv_sever_pathid(path->pathid, NULL);
 		iucv_path_table[path->pathid] = NULL;
 		list_del_init(&path->list);
-		iucv_cleanup_pathid(path->pathid);
 		iucv_path_free(path);
 	}
 }
@@ -1309,8 +1332,7 @@ static void iucv_path_quiesced(struct iucv_irq_data *data)
 	struct iucv_path_quiesced *ipq = (void *) data;
 	struct iucv_path *path = iucv_path_table[ipq->ippathid];
 
-	BUG_ON(!path || !path->handler);
-	if (path->handler->path_quiesced)
+	if (path && path->handler && path->handler->path_quiesced)
 		path->handler->path_quiesced(path, ipq->ipuser);
 }
 
@@ -1338,8 +1360,7 @@ static void iucv_path_resumed(struct iucv_irq_data *data)
 	struct iucv_path_resumed *ipr = (void *) data;
 	struct iucv_path *path = iucv_path_table[ipr->ippathid];
 
-	BUG_ON(!path || !path->handler);
-	if (path->handler->path_resumed)
+	if (path && path->handler && path->handler->path_resumed)
 		path->handler->path_resumed(path, ipr->ipuser);
 }
 
@@ -1371,8 +1392,7 @@ static void iucv_message_complete(struct iucv_irq_data *data)
 	struct iucv_path *path = iucv_path_table[imc->ippathid];
 	struct iucv_message msg;
 
-	BUG_ON(!path || !path->handler);
-	if (path->handler->message_complete) {
+	if (path && path->handler && path->handler->message_complete) {
 		msg.flags = imc->ipflags1;
 		msg.id = imc->ipmsgid;
 		msg.audit = imc->ipaudit;
@@ -1417,8 +1437,7 @@ static void iucv_message_pending(struct iucv_irq_data *data)
 	struct iucv_path *path = iucv_path_table[imp->ippathid];
 	struct iucv_message msg;
 
-	BUG_ON(!path || !path->handler);
-	if (path->handler->message_pending) {
+	if (path && path->handler && path->handler->message_pending) {
 		msg.flags = imp->ipflags1;
 		msg.id = imp->ipmsgid;
 		msg.class = imp->iptrgcls;
@@ -1433,17 +1452,16 @@ static void iucv_message_pending(struct iucv_irq_data *data)
 }
 
 /**
- * iucv_tasklet_handler:
+ * iucv_tasklet_fn:
 *
 * This tasklet loops over the queue of irq buffers created by
 * iucv_external_interrupt, calls the appropriate action handler
 * and then frees the buffer.
 */
-static void iucv_tasklet_handler(unsigned long ignored)
+static void iucv_tasklet_fn(unsigned long ignored)
 {
 	typedef void iucv_irq_fn(struct iucv_irq_data *);
 	static iucv_irq_fn *irq_fn[] = {
-		[0x01] = iucv_path_pending,
 		[0x02] = iucv_path_complete,
 		[0x03] = iucv_path_severed,
 		[0x04] = iucv_path_quiesced,
@@ -1453,38 +1471,70 @@ static void iucv_tasklet_handler(unsigned long ignored)
 		[0x08] = iucv_message_pending,
 		[0x09] = iucv_message_pending,
 	};
-	struct iucv_work *p;
+	struct list_head task_queue = LIST_HEAD_INIT(task_queue);
+	struct iucv_irq_list *p, *n;
 
 	/* Serialize tasklet, iucv_path_sever and iucv_path_connect. */
 	spin_lock(&iucv_table_lock);
-	iucv_tasklet_cpu = smp_processor_id();
+	iucv_active_cpu = smp_processor_id();
 
-	spin_lock_irq(&iucv_work_lock);
-	while (!list_empty(&iucv_work_queue)) {
-		p = list_entry(iucv_work_queue.next, struct iucv_work, list);
+	spin_lock_irq(&iucv_queue_lock);
+	list_splice_init(&iucv_task_queue, &task_queue);
+	spin_unlock_irq(&iucv_queue_lock);
+
+	list_for_each_entry_safe(p, n, &task_queue, list) {
 		list_del_init(&p->list);
-		spin_unlock_irq(&iucv_work_lock);
 		irq_fn[p->data.iptype](&p->data);
 		kfree(p);
-		spin_lock_irq(&iucv_work_lock);
 	}
-	spin_unlock_irq(&iucv_work_lock);
 
-	iucv_tasklet_cpu = -1;
+	iucv_active_cpu = -1;
 	spin_unlock(&iucv_table_lock);
 }
 
+/**
+ * iucv_work_fn:
+ *
+ * This work function loops over the queue of path pending irq blocks
+ * created by iucv_external_interrupt, calls the appropriate action
+ * handler and then frees the buffer.
+ */
+static void iucv_work_fn(struct work_struct *work)
+{
+	typedef void iucv_irq_fn(struct iucv_irq_data *);
+	struct list_head work_queue = LIST_HEAD_INIT(work_queue);
+	struct iucv_irq_list *p, *n;
+
+	/* Serialize tasklet, iucv_path_sever and iucv_path_connect. */
+	spin_lock_bh(&iucv_table_lock);
+	iucv_active_cpu = smp_processor_id();
+
+	spin_lock_irq(&iucv_queue_lock);
+	list_splice_init(&iucv_work_queue, &work_queue);
+	spin_unlock_irq(&iucv_queue_lock);
+
+	iucv_cleanup_queue();
+	list_for_each_entry_safe(p, n, &work_queue, list) {
+		list_del_init(&p->list);
+		iucv_path_pending(&p->data);
+		kfree(p);
+	}
+
+	iucv_active_cpu = -1;
+	spin_unlock_bh(&iucv_table_lock);
+}
+
 /**
 * iucv_external_interrupt
 * @code: irq code
 *
 * Handles external interrupts coming in from CP.
- * Places the interrupt buffer on a queue and schedules iucv_tasklet_handler().
+ * Places the interrupt buffer on a queue and schedules iucv_tasklet_fn().
 */
 static void iucv_external_interrupt(u16 code)
 {
 	struct iucv_irq_data *p;
-	struct iucv_work *work;
+	struct iucv_irq_list *work;
 
 	p = percpu_ptr(iucv_irq_data, smp_processor_id());
 	if (p->ippathid >= iucv_max_pathid) {
@@ -1498,16 +1548,23 @@ static void iucv_external_interrupt(u16 code)
 		printk(KERN_ERR "iucv_do_int: unknown iucv interrupt\n");
 		return;
 	}
-	work = kmalloc(sizeof(struct iucv_work), GFP_ATOMIC);
+	work = kmalloc(sizeof(struct iucv_irq_list), GFP_ATOMIC);
 	if (!work) {
 		printk(KERN_WARNING "iucv_external_interrupt: out of memory\n");
 		return;
 	}
 	memcpy(&work->data, p, sizeof(work->data));
-	spin_lock(&iucv_work_lock);
-	list_add_tail(&work->list, &iucv_work_queue);
-	spin_unlock(&iucv_work_lock);
-	tasklet_schedule(&iucv_tasklet);
+	spin_lock(&iucv_queue_lock);
+	if (p->iptype == 0x01) {
+		/* Path pending interrupt. */
+		list_add_tail(&work->list, &iucv_work_queue);
+		schedule_work(&iucv_work);
+	} else {
+		/* The other interrupts. */
+		list_add_tail(&work->list, &iucv_task_queue);
+		tasklet_schedule(&iucv_tasklet);
+	}
+	spin_unlock(&iucv_queue_lock);
 }
 
 /**
@@ -1577,12 +1634,14 @@ out:
 */
 static void iucv_exit(void)
 {
-	struct iucv_work *p, *n;
+	struct iucv_irq_list *p, *n;
 
-	spin_lock_irq(&iucv_work_lock);
+	spin_lock_irq(&iucv_queue_lock);
+	list_for_each_entry_safe(p, n, &iucv_task_queue, list)
+		kfree(p);
 	list_for_each_entry_safe(p, n, &iucv_work_queue, list)
 		kfree(p);
-	spin_unlock_irq(&iucv_work_lock);
+	spin_unlock_irq(&iucv_queue_lock);
 	unregister_hotcpu_notifier(&iucv_cpu_notifier);
 	percpu_free(iucv_param);
 	percpu_free(iucv_irq_data);
--
cgit v1.2.3-59-g8ed1b
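The core of the fix is a dispatch split worth seeing on its own: events whose
handling may need smp_call_function go to a work queue (process context),
everything else stays in the tasklet. A condensed sketch of the pattern; all
example_* names are illustrative, not part of the IUCV API:

	/* Illustrative interrupt dispatch split between tasklet and
	 * workqueue, mirroring what iucv_external_interrupt does above.
	 */
	static void example_irq_dispatch(struct example_event *ev)
	{
		spin_lock(&example_queue_lock);
		if (ev->type == EXAMPLE_NEEDS_PROCESS_CTX) {
			/* handler may call smp_call_function later */
			list_add_tail(&ev->list, &example_work_queue);
			schedule_work(&example_work);
		} else {
			/* fast path, stays in softirq context */
			list_add_tail(&ev->list, &example_task_queue);
			tasklet_schedule(&example_tasklet);
		}
		spin_unlock(&example_queue_lock);
	}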
From 46f8914e53c28d0716c586e08a7c819d8ebb9d54 Mon Sep 17 00:00:00 2001
From: James Chapman
Date: Mon, 30 Apr 2007 00:07:31 -0700
Subject: [SKB]: Introduce skb_queue_walk_safe()

This patch provides a method for walking skb lists while inserting or
removing skbs from the list.

Signed-off-by: James Chapman
Signed-off-by: David S. Miller
---
 include/linux/skbuff.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2694cb3ca763..253a2b9be9d6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1471,6 +1471,11 @@ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len)
 		prefetch(skb->next), (skb != (struct sk_buff *)(queue));	\
 		skb = skb->next)
 
+#define skb_queue_walk_safe(queue, skb, tmp)					\
+		for (skb = (queue)->next, tmp = skb->next;			\
+		     skb != (struct sk_buff *)(queue);				\
+		     skb = tmp, tmp = skb->next)
+
 #define skb_queue_reverse_walk(queue, skb) \
 		for (skb = (queue)->prev;					\
 		     prefetch(skb->prev), (skb != (struct sk_buff *)(queue));	\
--
cgit v1.2.3-59-g8ed1b
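A usage sketch for the new macro: unlike skb_queue_walk, the _safe variant
keeps a lookahead pointer, so the current skb may be unlinked mid-walk. The
helper below is illustrative, not from the patch (callers of __skb_unlink
must hold the queue lock):

	/* Drop all skbs shorter than min_len from a locked queue. */
	static void example_purge_short(struct sk_buff_head *list,
					unsigned int min_len)
	{
		struct sk_buff *skb, *tmp;

		skb_queue_walk_safe(list, skb, tmp) {
			if (skb->len < min_len) {
				__skb_unlink(skb, list);
				kfree_skb(skb);
			}
		}
	}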
From 34588b4c046c34773e5a1a962da7b78b05c4d1bd Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen
Date: Mon, 30 Apr 2007 00:57:33 -0700
Subject: [TCP]: Catch skb with S+L bugs earlier
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SACKED_ACKED and LOST are mutually exclusive with SACK, thus having
their sum larger than packets_out is a bug with SACK. Eventually these
bugs trigger traps in tcp_clean_rtx_queue with SACK, but it is much
more informative to catch them here.

Non-SACK TCP, however, could get more than packets_out duplicate ACKs,
each of which increments sacked_out, so it makes sense to do this kind
of limiting for non-SACK TCP but not for a SACK enabled one. Perhaps
the author had the opposite in mind but got the logic accidentally the
wrong way around? Anyway, the sacked_out incrementer code for non-SACK
already deals with this issue before calling sync_left_out, so this
trapping can be done unconditionally.

Signed-off-by: Ilpo Järvinen
Signed-off-by: David S. Miller
---
 include/net/tcp.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index a385797f160a..c6ecd455edab 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -736,9 +736,7 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk)
 
 static inline void tcp_sync_left_out(struct tcp_sock *tp)
 {
-	if (tp->rx_opt.sack_ok &&
-	    (tp->sacked_out >= tp->packets_out - tp->lost_out))
-		tp->sacked_out = tp->packets_out - tp->lost_out;
+	BUG_ON(tp->sacked_out + tp->lost_out > tp->packets_out);
 	tp->left_out = tp->sacked_out + tp->lost_out;
 }
 
--
cgit v1.2.3-59-g8ed1b
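The invariant the new BUG_ON enforces can be stated compactly: with SACK, a
packet is counted at most once as SACKed or lost, so sacked_out + lost_out can
never exceed packets_out. An equivalent non-fatal form of the same check,
purely as an illustration and not part of the patch:

	/* Illustrative, non-fatal variant of the S+L invariant check. */
	static inline void example_check_left_out(const struct tcp_sock *tp)
	{
		WARN_ON(tp->sacked_out + tp->lost_out > tp->packets_out);
	}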
From 157bfc25020f7eb731f94140e099307ade47299e Mon Sep 17 00:00:00 2001
From: Masahide NAKAMURA
Date: Mon, 30 Apr 2007 00:33:35 -0700
Subject: [XFRM]: Restrict upper layer information by bundle.

On MIPv6 usage, XFRM sub policy is enabled. When main (IPsec) and sub
(MIPv6) policy selectors have the same address set but different upper
layer information (i.e. protocol number and its ports or type/code),
multiple bundles should be created. Currently, however, the bundle
created for the first flow is wrongly reused by all later flows covered
by the case. It is useful for the bundle to keep the upper layer
information so that it can be restructured correctly when that
information does not match the flow.

1. Bundle was created by two policies
   Selector from another policy is added to xfrm_dst. If the flow does
   not match the selector, it goes to the slow path to restructure a
   new bundle by single policy.

2. Bundle was created by one policy
   Flow cache is added to xfrm_dst as the originating one. If the flow
   does not match the cache, it goes to the slow path to try searching
   for another policy.

Signed-off-by: Masahide NAKAMURA
Signed-off-by: David S. Miller
---
 include/net/flow.h     |  6 ++++++
 include/net/xfrm.h     | 10 +++++++++
 net/xfrm/xfrm_policy.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 71 insertions(+)

(limited to 'include')

diff --git a/include/net/flow.h b/include/net/flow.h
index ce4b10d8b412..f3cc1f812619 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -97,4 +97,10 @@ extern void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
 extern void flow_cache_flush(void);
 extern atomic_t flow_cache_genid;
 
+static inline int flow_cache_uli_match(struct flowi *fl1, struct flowi *fl2)
+{
+	return (fl1->proto == fl2->proto &&
+		!memcmp(&fl1->uli_u, &fl2->uli_u, sizeof(fl1->uli_u)));
+}
+
 #endif
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 9561bf817b02..66c2d3eec03c 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -603,6 +603,10 @@ struct xfrm_dst
 		struct rt6_info		rt6;
 	} u;
 	struct dst_entry *route;
+#ifdef CONFIG_XFRM_SUB_POLICY
+	struct flowi *origin;
+	struct xfrm_selector *partner;
+#endif
 	u32 genid;
 	u32 route_mtu_cached;
 	u32 child_mtu_cached;
@@ -615,6 +619,12 @@ static inline void xfrm_dst_destroy(struct xfrm_dst *xdst)
 	dst_release(xdst->route);
 	if (likely(xdst->u.dst.xfrm))
 		xfrm_state_put(xdst->u.dst.xfrm);
+#ifdef CONFIG_XFRM_SUB_POLICY
+	kfree(xdst->origin);
+	xdst->origin = NULL;
+	kfree(xdst->partner);
+	xdst->partner = NULL;
+#endif
 }
 
 extern void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index dbf9d96a2f0b..263e34e45265 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1344,6 +1344,40 @@ xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx,
 	return err;
 }
 
+static int inline
+xfrm_dst_alloc_copy(void **target, void *src, int size)
+{
+	if (!*target) {
+		*target = kmalloc(size, GFP_ATOMIC);
+		if (!*target)
+			return -ENOMEM;
+	}
+	memcpy(*target, src, size);
+	return 0;
+}
+
+static int inline
+xfrm_dst_update_parent(struct dst_entry *dst, struct xfrm_selector *sel)
+{
+#ifdef CONFIG_XFRM_SUB_POLICY
+	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+	return xfrm_dst_alloc_copy((void **)&(xdst->partner),
+				   sel, sizeof(*sel));
+#else
+	return 0;
+#endif
+}
+
+static int inline
+xfrm_dst_update_origin(struct dst_entry *dst, struct flowi *fl)
+{
+#ifdef CONFIG_XFRM_SUB_POLICY
+	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+	return xfrm_dst_alloc_copy((void **)&(xdst->origin), fl, sizeof(*fl));
+#else
+	return 0;
+#endif
+}
 
 static int stale_bundle(struct dst_entry *dst);
 
@@ -1532,6 +1566,18 @@ restart:
 				err = -EHOSTUNREACH;
 				goto error;
 			}
+
+			if (npols > 1)
+				err = xfrm_dst_update_parent(dst, &pols[1]->selector);
+			else
+				err = xfrm_dst_update_origin(dst, fl);
+			if (unlikely(err)) {
+				write_unlock_bh(&policy->lock);
+				if (dst)
+					dst_free(dst);
+				goto error;
+			}
+
 			dst->next = policy->bundles;
 			policy->bundles = dst;
 			dst_hold(dst);
@@ -1947,6 +1993,15 @@ int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first,
 		if (!dst_check(dst->path, ((struct xfrm_dst *)dst)->path_cookie) ||
 		    (dst->dev && !netif_running(dst->dev)))
 			return 0;
+#ifdef CONFIG_XFRM_SUB_POLICY
+		if (fl) {
+			if (first->origin && !flow_cache_uli_match(first->origin, fl))
+				return 0;
+			if (first->partner &&
+			    !xfrm_selector_match(first->partner, fl, family))
+				return 0;
+		}
+#endif
 
 		last = NULL;
--
cgit v1.2.3-59-g8ed1b
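Read together, the two new xfrm_dst fields act as a reuse guard. A condensed
restatement of the check that xfrm_bundle_ok now performs; the helper itself
is illustrative and assumes CONFIG_XFRM_SUB_POLICY, it is not part of the
patch:

	/* May this cached bundle serve flow fl?  Mirrors the xfrm_bundle_ok
	 * logic above (origin/partner exist only under CONFIG_XFRM_SUB_POLICY).
	 */
	static int example_bundle_matches(struct xfrm_dst *first,
					  struct flowi *fl, unsigned short family)
	{
		if (first->origin && !flow_cache_uli_match(first->origin, fl))
			return 0;	/* upper layer info differs */
		if (first->partner &&
		    !xfrm_selector_match(first->partner, fl, family))
			return 0;	/* second policy's selector differs */
		return 1;		/* safe to reuse the bundle */
	}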
From d551e4541dd60ae53459f77a971f2d6043431f5f Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen
Date: Mon, 30 Apr 2007 00:42:20 -0700
Subject: [TCP] FRTO: RFC4138 allows Nagle override when new data must be sent
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is a corner case where less than MSS-sized new data is waiting
in the send queue. For F-RTO to work correctly, a new data segment
must be sent at a certain point, or F-RTO cannot be used at all.
RFC4138 allows overriding Nagle at that point.

The implementation uses frto_counter states 2 and 3 to distinguish
when the Nagle override is needed.

Signed-off-by: Ilpo Järvinen
Signed-off-by: David S. Miller
---
 include/net/tcp.h     |  5 +++++
 net/ipv4/tcp_input.c  | 13 ++++++++-----
 net/ipv4/tcp_output.c |  6 ++++--
 3 files changed, 17 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index c6ecd455edab..ef8f9d4dae85 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1199,9 +1199,14 @@ static inline struct sk_buff *tcp_send_head(struct sock *sk)
 
 static inline void tcp_advance_send_head(struct sock *sk, struct sk_buff *skb)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+
 	sk->sk_send_head = skb->next;
 	if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
 		sk->sk_send_head = NULL;
+	/* Don't override Nagle indefinately with F-RTO */
+	if (tp->frto_counter == 2)
+		tp->frto_counter = 3;
 }
 
 static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 6b669898b197..7641b2761a14 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2637,7 +2637,9 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag)
 * algorithm is not part of the F-RTO detection algorithm
 * given in RFC4138 but can be selected separately).
 * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss
- * and TCP falls back to conventional RTO recovery.
+ * and TCP falls back to conventional RTO recovery. F-RTO allows overriding
+ * of Nagle, this is done using frto_counter states 2 and 3, when a new data
+ * segment of any size sent during F-RTO, state 2 is upgraded to 3.
 *
 * Rationale: if the RTO was spurious, new ACKs should arrive from the
 * original window even after we transmit two new data segments.
@@ -2666,7 +2668,7 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag)
 	inet_csk(sk)->icsk_retransmits = 0;
 
 	if (!before(tp->snd_una, tp->frto_highmark)) {
-		tcp_enter_frto_loss(sk, tp->frto_counter + 1, flag);
+		tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag);
 		return 1;
 	}
 
@@ -2692,7 +2694,7 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag)
 		return 1;
 	}
 
-	if ((tp->frto_counter == 2) &&
+	if ((tp->frto_counter >= 2) &&
 	    (!(flag&FLAG_FORWARD_PROGRESS) ||
 	     ((flag&FLAG_DATA_SACKED) && !(flag&FLAG_ONLY_ORIG_SACKED)))) {
 		/* RFC4138 shortcoming (see comment above) */
@@ -2709,14 +2711,15 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag)
 		if (!tcp_send_head(sk) ||
 		    after(TCP_SKB_CB(tcp_send_head(sk))->end_seq,
 			  tp->snd_una + tp->snd_wnd)) {
-			tcp_enter_frto_loss(sk, tp->frto_counter + 1, flag);
+			tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3),
+					    flag);
 			return 1;
 		}
 
 		tp->snd_cwnd = tcp_packets_in_flight(tp) + 2;
 		tp->frto_counter = 2;
 		return 1;
-	} else /* frto_counter == 2 */ {
+	} else {
 		switch (sysctl_tcp_frto_response) {
 		case 2:
 			tcp_undo_spur_to_response(sk, flag);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b5fa3c19afee..0faacf9c419d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1035,8 +1035,10 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
 	if (nonagle & TCP_NAGLE_PUSH)
 		return 1;
 
-	/* Don't use the nagle rule for urgent data (or for the final FIN). */
-	if (tp->urg_mode ||
+	/* Don't use the nagle rule for urgent data (or for the final FIN).
+	 * Nagle can be ignored during F-RTO too (see RFC4138).
+	 */
+	if (tp->urg_mode || (tp->frto_counter == 2) ||
 	    (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
 		return 1;
 
--
cgit v1.2.3-59-g8ed1b
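The frto_counter state machine used here is small enough to spell out: 1
right after the RTO, 2 once the first new segment goes out (Nagle may be
overridden), 3 after further new data is sent (normal Nagle rules resume).
As a sketch, with the helper name being an assumption:

	/* Illustrative predicate matching the tcp_nagle_test change above:
	 * only in state 2 may Nagle be ignored (RFC4138).
	 */
	static inline int example_frto_overrides_nagle(const struct tcp_sock *tp)
	{
		return tp->frto_counter == 2;
	}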
From 71ff6c0a857d11e70aec0c8f1e0d4ae9a45dd468 Mon Sep 17 00:00:00 2001
From: Mitsuru Chinen
Date: Mon, 30 Apr 2007 00:45:02 -0700
Subject: [SNMP]: Add definitions for {In,Out}BcastPkts

The updated IP-MIB RFC (RFC4293) specifies new objects, InBcastPkts
and OutBcastPkts. This adds definitions for them.

Signed-off-by: Mitsuru Chinen
Signed-off-by: David S. Miller
---
 include/linux/snmp.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/snmp.h b/include/linux/snmp.h
index 854aa6b543f1..802b3a38b041 100644
--- a/include/linux/snmp.h
+++ b/include/linux/snmp.h
@@ -40,6 +40,8 @@ enum
 	IPSTATS_MIB_FRAGCREATES,		/* FragCreates */
 	IPSTATS_MIB_INMCASTPKTS,		/* InMcastPkts */
 	IPSTATS_MIB_OUTMCASTPKTS,		/* OutMcastPkts */
+	IPSTATS_MIB_INBCASTPKTS,		/* InBcastPkts */
+	IPSTATS_MIB_OUTBCASTPKTS,		/* OutBcastPkts */
 	__IPSTATS_MIB_MAX
 };
 
--
cgit v1.2.3-59-g8ed1b
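A hedged sketch of how the stack might bump the new counter when classifying
received frames; IPSTATS_MIB_INBCASTPKTS comes from the patch, while the
surrounding function and the exact call site are assumptions modeled on how
the existing multicast counters are used:

	/* Illustrative receive-side accounting for InBcastPkts. */
	static void example_count_bcast_rx(struct sk_buff *skb)
	{
		if (skb->pkt_type == PACKET_BROADCAST)
			IP_INC_STATS_BH(IPSTATS_MIB_INBCASTPKTS);
	}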