From 9a0785b0da561e1e9c6617df85e93ae107a42f18 Mon Sep 17 00:00:00 2001
From: Divyesh Shah
Date: Thu, 1 Apr 2010 15:01:04 -0700
Subject: blkio: Remove per-cfqq nr_sectors as we'll be passing that info at
 request dispatch with other stats now.

This patch removes the existing support for accounting sectors for a
blkio_group. This will be added back differently in the next two patches.

Signed-off-by: Divyesh Shah
Signed-off-by: Jens Axboe
---
 block/blk-cgroup.c  |  3 +--
 block/blk-cgroup.h  |  6 ++----
 block/cfq-iosched.c | 10 ++--------
 3 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 4b686ad08eaa..5be39813fc9b 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -56,10 +56,9 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
 EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

 void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
-			unsigned long time, unsigned long sectors)
+			unsigned long time)
 {
 	blkg->time += time;
-	blkg->sectors += sectors;
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats);

diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 8ccc20464dae..fe445178f586 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -106,7 +106,7 @@ extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
 extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
 						void *key);
 void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
-			unsigned long time, unsigned long sectors);
+			unsigned long time);
 #else
 struct cgroup;
 static inline struct blkio_cgroup *
@@ -123,8 +123,6 @@ blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
 static inline struct blkio_group *
 blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
 static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
-			unsigned long time, unsigned long sectors)
-{
-}
+			unsigned long time) {}
 #endif
 #endif /* _BLK_CGROUP_H */

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 2c7a0f4f3cd7..7471d36bce89 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -142,8 +142,6 @@ struct cfq_queue {
 	struct cfq_queue *new_cfqq;
 	struct cfq_group *cfqg;
 	struct cfq_group *orig_cfqg;
-	/* Sectors dispatched in current dispatch round */
-	unsigned long nr_sectors;
 };

 /*
@@ -883,8 +881,7 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
 		slice_used = cfqq->allocated_slice;
 	}

-	cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used,
-			cfqq->nr_sectors);
+	cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u", slice_used);
 	return slice_used;
 }

@@ -918,8 +915,7 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,

 	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
 			st->min_vdisktime);
-	blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl,
-			cfqq->nr_sectors);
+	blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl);
 }

 #ifdef CONFIG_CFQ_GROUP_IOSCHED
@@ -1525,7 +1521,6 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
 	cfqq->allocated_slice = 0;
 	cfqq->slice_end = 0;
 	cfqq->slice_dispatch = 0;
-	cfqq->nr_sectors = 0;

 	cfq_clear_cfqq_wait_request(cfqq);
 	cfq_clear_cfqq_must_dispatch(cfqq);
@@ -1870,7 +1865,6 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
 	elv_dispatch_sort(q, rq);

 	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
-	cfqq->nr_sectors += blk_rq_sectors(rq);
 }

 /*
-- 
cgit v1.2.3-59-g8ed1b
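The net effect of this first patch is easiest to see in miniature: the
group-level update now takes only the used timeslice, and sector accounting
is deferred to request dispatch. Below is a minimal userspace sketch of that
split; names are simplified and illustrative only, not kernel code.

/*
 * Model of the accounting split this patch sets up: the per-queue
 * sector counter goes away, the group update carries only time, and
 * the follow-up patches account sectors at request dispatch.
 */
#include <stdint.h>
#include <stdio.h>

struct blkio_group_model {
	uint64_t time;    /* disk time used by the group, in jiffies */
	uint64_t sectors; /* filled in at dispatch, not per-cfqq */
};

/* After this patch: timeslice accounting only. */
static void update_group_stats(struct blkio_group_model *g,
			       unsigned long time)
{
	g->time += time;
}

/* The next patches account sectors when a request is dispatched. */
static void on_dispatch(struct blkio_group_model *g,
			unsigned long nr_sectors)
{
	g->sectors += nr_sectors;
}

int main(void)
{
	struct blkio_group_model g = { 0, 0 };

	update_group_stats(&g, 8); /* queue used an 8-jiffy slice */
	on_dispatch(&g, 128);      /* a 64KiB request (128 sectors) */
	printf("time=%llu sectors=%llu\n",
	       (unsigned long long)g.time, (unsigned long long)g.sectors);
	return 0;
}

From 303a3acb2362f16c7e7f4c53b40c2f4b396dc8d5 Mon Sep 17 00:00:00 2001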
From: Divyesh Shah Date: Thu, 1 Apr 2010 15:01:24 -0700 Subject: blkio: Add io controller stats like - io_service_time - io_wait_time - io_serviced - io_service_bytes These stats are accumulated per operation type helping us to distinguish between read and write, and sync and async IO. This patch does not increment any of these stats. Signed-off-by: Divyesh Shah Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 178 +++++++++++++++++++++++++++++++++++++++++++++++----- block/blk-cgroup.h | 39 +++++++++--- block/cfq-iosched.c | 2 +- 3 files changed, 194 insertions(+), 25 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 5be39813fc9b..ad6843f2e0ab 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -55,12 +55,15 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) } EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); -void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, - unsigned long time) +void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) { - blkg->time += time; + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + blkg->stats.time += time; + spin_unlock_irqrestore(&blkg->stats_lock, flags); } -EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats); +EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev) @@ -170,13 +173,121 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) return 0; } -#define SHOW_FUNCTION_PER_GROUP(__VAR) \ +static int +blkiocg_reset_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) +{ + struct blkio_cgroup *blkcg; + struct blkio_group *blkg; + struct hlist_node *n; + struct blkio_group_stats *stats; + + blkcg = cgroup_to_blkio_cgroup(cgroup); + spin_lock_irq(&blkcg->lock); + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + spin_lock(&blkg->stats_lock); + stats = &blkg->stats; + memset(stats, 0, sizeof(struct blkio_group_stats)); + spin_unlock(&blkg->stats_lock); + } + spin_unlock_irq(&blkcg->lock); + return 0; +} + +void get_key_name(int type, char *disk_id, char *str, int chars_left) +{ + strlcpy(str, disk_id, chars_left); + chars_left -= strlen(str); + if (chars_left <= 0) { + printk(KERN_WARNING + "Possibly incorrect cgroup stat display format"); + return; + } + switch (type) { + case IO_READ: + strlcat(str, " Read", chars_left); + break; + case IO_WRITE: + strlcat(str, " Write", chars_left); + break; + case IO_SYNC: + strlcat(str, " Sync", chars_left); + break; + case IO_ASYNC: + strlcat(str, " Async", chars_left); + break; + case IO_TYPE_MAX: + strlcat(str, " Total", chars_left); + break; + default: + strlcat(str, " Invalid", chars_left); + } +} + +typedef uint64_t (get_var) (struct blkio_group *, int); + +#define MAX_KEY_LEN 100 +uint64_t get_typed_stat(struct blkio_group *blkg, struct cgroup_map_cb *cb, + get_var *getvar, char *disk_id) +{ + uint64_t disk_total; + char key_str[MAX_KEY_LEN]; + int type; + + for (type = 0; type < IO_TYPE_MAX; type++) { + get_key_name(type, disk_id, key_str, MAX_KEY_LEN); + cb->fill(cb, key_str, getvar(blkg, type)); + } + disk_total = getvar(blkg, IO_READ) + getvar(blkg, IO_WRITE); + get_key_name(IO_TYPE_MAX, disk_id, key_str, MAX_KEY_LEN); + cb->fill(cb, key_str, disk_total); + return disk_total; +} + +uint64_t get_stat(struct blkio_group *blkg, struct cgroup_map_cb *cb, + get_var *getvar, char *disk_id) +{ + uint64_t var = getvar(blkg, 0); + cb->fill(cb, disk_id, var); + return var; 
+} + +#define GET_STAT_INDEXED(__VAR) \ +uint64_t get_##__VAR##_stat(struct blkio_group *blkg, int type) \ +{ \ + return blkg->stats.__VAR[type]; \ +} \ + +GET_STAT_INDEXED(io_service_bytes); +GET_STAT_INDEXED(io_serviced); +GET_STAT_INDEXED(io_service_time); +GET_STAT_INDEXED(io_wait_time); +#undef GET_STAT_INDEXED + +#define GET_STAT(__VAR, __CONV) \ +uint64_t get_##__VAR##_stat(struct blkio_group *blkg, int dummy) \ +{ \ + uint64_t data = blkg->stats.__VAR; \ + if (__CONV) \ + data = (uint64_t)jiffies_to_msecs(data) * NSEC_PER_MSEC;\ + return data; \ +} + +GET_STAT(time, 1); +GET_STAT(sectors, 0); +#ifdef CONFIG_DEBUG_BLK_CGROUP +GET_STAT(dequeue, 0); +#endif +#undef GET_STAT + +#define SHOW_FUNCTION_PER_GROUP(__VAR, get_stats, getvar, show_total) \ static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ - struct cftype *cftype, struct seq_file *m) \ + struct cftype *cftype, struct cgroup_map_cb *cb) \ { \ struct blkio_cgroup *blkcg; \ struct blkio_group *blkg; \ struct hlist_node *n; \ + uint64_t cgroup_total = 0; \ + char disk_id[10]; \ \ if (!cgroup_lock_live_group(cgroup)) \ return -ENODEV; \ @@ -184,19 +295,32 @@ static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ blkcg = cgroup_to_blkio_cgroup(cgroup); \ rcu_read_lock(); \ hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ - if (blkg->dev) \ - seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev), \ - MINOR(blkg->dev), blkg->__VAR); \ + if (blkg->dev) { \ + spin_lock_irq(&blkg->stats_lock); \ + snprintf(disk_id, 10, "%u:%u", MAJOR(blkg->dev),\ + MINOR(blkg->dev)); \ + cgroup_total += get_stats(blkg, cb, getvar, \ + disk_id); \ + spin_unlock_irq(&blkg->stats_lock); \ + } \ } \ + if (show_total) \ + cb->fill(cb, "Total", cgroup_total); \ rcu_read_unlock(); \ cgroup_unlock(); \ return 0; \ } -SHOW_FUNCTION_PER_GROUP(time); -SHOW_FUNCTION_PER_GROUP(sectors); +SHOW_FUNCTION_PER_GROUP(time, get_stat, get_time_stat, 0); +SHOW_FUNCTION_PER_GROUP(sectors, get_stat, get_sectors_stat, 0); +SHOW_FUNCTION_PER_GROUP(io_service_bytes, get_typed_stat, + get_io_service_bytes_stat, 1); +SHOW_FUNCTION_PER_GROUP(io_serviced, get_typed_stat, get_io_serviced_stat, 1); +SHOW_FUNCTION_PER_GROUP(io_service_time, get_typed_stat, + get_io_service_time_stat, 1); +SHOW_FUNCTION_PER_GROUP(io_wait_time, get_typed_stat, get_io_wait_time_stat, 1); #ifdef CONFIG_DEBUG_BLK_CGROUP -SHOW_FUNCTION_PER_GROUP(dequeue); +SHOW_FUNCTION_PER_GROUP(dequeue, get_stat, get_dequeue_stat, 0); #endif #undef SHOW_FUNCTION_PER_GROUP @@ -204,7 +328,7 @@ SHOW_FUNCTION_PER_GROUP(dequeue); void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg, unsigned long dequeue) { - blkg->dequeue += dequeue; + blkg->stats.dequeue += dequeue; } EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats); #endif @@ -217,16 +341,38 @@ struct cftype blkio_files[] = { }, { .name = "time", - .read_seq_string = blkiocg_time_read, + .read_map = blkiocg_time_read, + .write_u64 = blkiocg_reset_write, }, { .name = "sectors", - .read_seq_string = blkiocg_sectors_read, + .read_map = blkiocg_sectors_read, + .write_u64 = blkiocg_reset_write, + }, + { + .name = "io_service_bytes", + .read_map = blkiocg_io_service_bytes_read, + .write_u64 = blkiocg_reset_write, + }, + { + .name = "io_serviced", + .read_map = blkiocg_io_serviced_read, + .write_u64 = blkiocg_reset_write, + }, + { + .name = "io_service_time", + .read_map = blkiocg_io_service_time_read, + .write_u64 = blkiocg_reset_write, + }, + { + .name = "io_wait_time", + .read_map = blkiocg_io_wait_time_read, + .write_u64 = 
blkiocg_reset_write, }, #ifdef CONFIG_DEBUG_BLK_CGROUP { .name = "dequeue", - .read_seq_string = blkiocg_dequeue_read, + .read_map = blkiocg_dequeue_read, }, #endif }; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index fe445178f586..5c5e5294b506 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -23,6 +23,14 @@ extern struct cgroup_subsys blkio_subsys; #define blkio_subsys_id blkio_subsys.subsys_id #endif +enum io_type { + IO_READ = 0, + IO_WRITE, + IO_SYNC, + IO_ASYNC, + IO_TYPE_MAX +}; + struct blkio_cgroup { struct cgroup_subsys_state css; unsigned int weight; @@ -30,6 +38,23 @@ struct blkio_cgroup { struct hlist_head blkg_list; }; +struct blkio_group_stats { + /* total disk time and nr sectors dispatched by this group */ + uint64_t time; + uint64_t sectors; + /* Total disk time used by IOs in ns */ + uint64_t io_service_time[IO_TYPE_MAX]; + uint64_t io_service_bytes[IO_TYPE_MAX]; /* Total bytes transferred */ + /* Total IOs serviced, post merge */ + uint64_t io_serviced[IO_TYPE_MAX]; + /* Total time spent waiting in scheduler queue in ns */ + uint64_t io_wait_time[IO_TYPE_MAX]; +#ifdef CONFIG_DEBUG_BLK_CGROUP + /* How many times this group has been removed from service tree */ + unsigned long dequeue; +#endif +}; + struct blkio_group { /* An rcu protected unique identifier for the group */ void *key; @@ -38,15 +63,13 @@ struct blkio_group { #ifdef CONFIG_DEBUG_BLK_CGROUP /* Store cgroup path */ char path[128]; - /* How many times this group has been removed from service tree */ - unsigned long dequeue; #endif /* The device MKDEV(major, minor), this group has been created for */ dev_t dev; - /* total disk time and nr sectors dispatched by this group */ - unsigned long time; - unsigned long sectors; + /* Need to serialize the stats in the case of reset/update */ + spinlock_t stats_lock; + struct blkio_group_stats stats; }; typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); @@ -105,8 +128,8 @@ extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, extern int blkiocg_del_blkio_group(struct blkio_group *blkg); extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key); -void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, - unsigned long time); +void blkiocg_update_timeslice_used(struct blkio_group *blkg, + unsigned long time); #else struct cgroup; static inline struct blkio_cgroup * @@ -122,7 +145,7 @@ blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } static inline struct blkio_group * blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } -static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, +static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) {} #endif #endif /* _BLK_CGROUP_H */ diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 7471d36bce89..c5161bbf2fe9 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -915,7 +915,7 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, st->min_vdisktime); - blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl); + blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); } #ifdef CONFIG_CFQ_GROUP_IOSCHED -- cgit v1.2.3-59-g8ed1b From 9195291e5f05e01d67f9a09c756b8aca8f009089 Mon Sep 17 00:00:00 2001 From: Divyesh Shah Date: Thu, 1 Apr 2010 15:01:41 -0700 Subject: blkio: Increment the blkio cgroup stats for real now We also add start_time_ns and 
io_start_time_ns fields to struct request here to record the time when a request is created and when it is dispatched to device. We use ns uints here as ms and jiffies are not very useful for non-rotational media. Signed-off-by: Divyesh Shah Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++-- block/blk-cgroup.h | 14 +++++++++--- block/blk-core.c | 6 +++-- block/cfq-iosched.c | 4 +++- include/linux/blkdev.h | 20 ++++++++++++++++- 5 files changed, 95 insertions(+), 9 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index ad6843f2e0ab..9af7257f429c 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "blk-cgroup.h" static DEFINE_SPINLOCK(blkio_list_lock); @@ -55,6 +56,26 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) } EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); +/* + * Add to the appropriate stat variable depending on the request type. + * This should be called with the blkg->stats_lock held. + */ +void io_add_stat(uint64_t *stat, uint64_t add, unsigned int flags) +{ + if (flags & REQ_RW) + stat[IO_WRITE] += add; + else + stat[IO_READ] += add; + /* + * Everywhere in the block layer, an IO is treated as sync if it is a + * read or a SYNC write. We follow the same norm. + */ + if (!(flags & REQ_RW) || flags & REQ_RW_SYNC) + stat[IO_SYNC] += add; + else + stat[IO_ASYNC] += add; +} + void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) { unsigned long flags; @@ -65,6 +86,41 @@ void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) } EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); +void blkiocg_update_request_dispatch_stats(struct blkio_group *blkg, + struct request *rq) +{ + struct blkio_group_stats *stats; + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + stats->sectors += blk_rq_sectors(rq); + io_add_stat(stats->io_serviced, 1, rq->cmd_flags); + io_add_stat(stats->io_service_bytes, blk_rq_sectors(rq) << 9, + rq->cmd_flags); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} + +void blkiocg_update_request_completion_stats(struct blkio_group *blkg, + struct request *rq) +{ + struct blkio_group_stats *stats; + unsigned long flags; + unsigned long long now = sched_clock(); + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + if (time_after64(now, rq->io_start_time_ns)) + io_add_stat(stats->io_service_time, now - rq->io_start_time_ns, + rq->cmd_flags); + if (time_after64(rq->io_start_time_ns, rq->start_time_ns)) + io_add_stat(stats->io_wait_time, + rq->io_start_time_ns - rq->start_time_ns, + rq->cmd_flags); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_request_completion_stats); + void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev) { @@ -325,12 +381,12 @@ SHOW_FUNCTION_PER_GROUP(dequeue, get_stat, get_dequeue_stat, 0); #undef SHOW_FUNCTION_PER_GROUP #ifdef CONFIG_DEBUG_BLK_CGROUP -void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg, +void blkiocg_update_dequeue_stats(struct blkio_group *blkg, unsigned long dequeue) { blkg->stats.dequeue += dequeue; } -EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats); +EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats); #endif struct cftype blkio_files[] = { diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 5c5e5294b506..80010ef64ab0 100644 --- 
a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -112,12 +112,12 @@ static inline char *blkg_path(struct blkio_group *blkg) { return blkg->path; } -void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg, +void blkiocg_update_dequeue_stats(struct blkio_group *blkg, unsigned long dequeue); #else static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } -static inline void blkiocg_update_blkio_group_dequeue_stats( - struct blkio_group *blkg, unsigned long dequeue) {} +static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue) {} #endif #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) @@ -130,6 +130,10 @@ extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key); void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time); +void blkiocg_update_request_dispatch_stats(struct blkio_group *blkg, + struct request *rq); +void blkiocg_update_request_completion_stats(struct blkio_group *blkg, + struct request *rq); #else struct cgroup; static inline struct blkio_cgroup * @@ -147,5 +151,9 @@ static inline struct blkio_group * blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) {} +static inline void blkiocg_update_request_dispatch_stats( + struct blkio_group *blkg, struct request *rq) {} +static inline void blkiocg_update_request_completion_stats( + struct blkio_group *blkg, struct request *rq) {} #endif #endif /* _BLK_CGROUP_H */ diff --git a/block/blk-core.c b/block/blk-core.c index 9fe174dc74d1..1d94f15d7f0d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -127,6 +127,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->tag = -1; rq->ref_count = 1; rq->start_time = jiffies; + set_start_time_ns(rq); } EXPORT_SYMBOL(blk_rq_init); @@ -1855,8 +1856,10 @@ void blk_dequeue_request(struct request *rq) * and to it is freed is accounted as io that is in progress at * the driver side. 
*/ - if (blk_account_rq(rq)) + if (blk_account_rq(rq)) { q->in_flight[rq_is_sync(rq)]++; + set_io_start_time_ns(rq); + } } /** @@ -2517,4 +2520,3 @@ int __init blk_dev_init(void) return 0; } - diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index c5161bbf2fe9..42028e7128a7 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -855,7 +855,7 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) if (!RB_EMPTY_NODE(&cfqg->rb_node)) cfq_rb_erase(&cfqg->rb_node, st); cfqg->saved_workload_slice = 0; - blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1); + blkiocg_update_dequeue_stats(&cfqg->blkg, 1); } static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) @@ -1865,6 +1865,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) elv_dispatch_sort(q, rq); cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; + blkiocg_update_request_dispatch_stats(&cfqq->cfqg->blkg, rq); } /* @@ -3285,6 +3286,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) WARN_ON(!cfqq->dispatched); cfqd->rq_in_driver--; cfqq->dispatched--; + blkiocg_update_request_completion_stats(&cfqq->cfqg->blkg, rq); cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6690e8bae7bb..f3fff8bf85ee 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -194,7 +194,10 @@ struct request { struct gendisk *rq_disk; unsigned long start_time; - +#ifdef CONFIG_BLK_CGROUP + unsigned long long start_time_ns; + unsigned long long io_start_time_ns; /* when passed to hardware */ +#endif /* Number of scatter-gather DMA addr+len pairs after * physical address coalescing is performed. */ @@ -1196,6 +1199,21 @@ static inline void put_dev_sector(Sector p) struct work_struct; int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); +#ifdef CONFIG_BLK_CGROUP +static inline void set_start_time_ns(struct request *req) +{ + req->start_time_ns = sched_clock(); +} + +static inline void set_io_start_time_ns(struct request *req) +{ + req->io_start_time_ns = sched_clock(); +} +#else +static inline void set_start_time_ns(struct request *req) {} +static inline void set_io_start_time_ns(struct request *req) {} +#endif + #define MODULE_ALIAS_BLOCKDEV(major,minor) \ MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ -- cgit v1.2.3-59-g8ed1b From 31373d09da5b7fe21fe6f781e92bd534a3495f00 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Tue, 6 Apr 2010 14:25:14 +0200 Subject: laptop-mode: Make flushes per-device One of the features of laptop-mode is that it forces a writeout of dirty pages if something else triggers a physical read or write from a device. The current implementation flushes pages on all devices, rather than only the one that triggered the flush. This patch alters the behaviour so that only the recently accessed block device is flushed, preventing other disks being spun up for no terribly good reason. 
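The core of the change is a writeback deadline kept per backing device
instead of one global timer. Here is a standalone sketch of that idea with
hypothetical, simplified names; the actual patch below hangs a timer_list
off backing_dev_info and arms it from laptop_io_completion().

/*
 * Per-device laptop-mode model: I/O on one disk pushes back only that
 * disk's flush deadline, so other disks are not spun up. A global sync
 * cancels all pending deadlines. Userspace sketch, not kernel code.
 */
#include <stdio.h>

#define NDEV 2

struct bdev_model {
	const char *name;
	unsigned long wb_deadline; /* 0 = no flush scheduled */
};

/* On I/O completion: reschedule the flush for this device only. */
static void laptop_io_completion(struct bdev_model *d, unsigned long now,
				 unsigned long laptop_mode)
{
	d->wb_deadline = now + laptop_mode;
}

/* After a global sync there is nothing left to write: cancel all. */
static void laptop_sync_completion(struct bdev_model *devs, int n)
{
	int i;

	for (i = 0; i < n; i++)
		devs[i].wb_deadline = 0;
}

int main(void)
{
	struct bdev_model devs[NDEV] = { { "sda", 0 }, { "sdb", 0 } };
	int i;

	laptop_io_completion(&devs[0], 1000, 600); /* I/O on sda only */
	for (i = 0; i < NDEV; i++)
		printf("%s deadline=%lu\n", devs[i].name,
		       devs[i].wb_deadline);
	/* sdb stays at 0: it is not spun up by sda's activity */
	laptop_sync_completion(devs, NDEV);
	return 0;
}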
Signed-off-by: Matthew Garrett Signed-off-by: Jens Axboe --- block/blk-core.c | 5 ++++- include/linux/backing-dev.h | 3 +++ include/linux/writeback.h | 4 +++- mm/page-writeback.c | 39 ++++++++++++++++++++------------------- 4 files changed, 30 insertions(+), 21 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 1d94f15d7f0d..4b1b29ef2cb0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -451,6 +451,7 @@ void blk_cleanup_queue(struct request_queue *q) */ blk_sync_queue(q); + del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); mutex_lock(&q->sysfs_lock); queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); mutex_unlock(&q->sysfs_lock); @@ -511,6 +512,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) return NULL; } + setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, + laptop_mode_timer_fn, (unsigned long) q); init_timer(&q->unplug_timer); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); INIT_LIST_HEAD(&q->timeout_list); @@ -2101,7 +2104,7 @@ static void blk_finish_request(struct request *req, int error) BUG_ON(blk_queued_rq(req)); if (unlikely(laptop_mode) && blk_fs_request(req)) - laptop_io_completion(); + laptop_io_completion(&req->q->backing_dev_info); blk_delete_timer(req); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index fcbc26af00e4..2742e1adfc30 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -88,6 +89,8 @@ struct backing_dev_info { struct device *dev; + struct timer_list laptop_mode_wb_timer; + #ifdef CONFIG_DEBUG_FS struct dentry *debug_dir; struct dentry *debug_stats; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 36520ded3e06..eb38a2c645f6 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -96,8 +96,10 @@ static inline void inode_sync_wait(struct inode *inode) /* * mm/page-writeback.c */ -void laptop_io_completion(void); +void laptop_io_completion(struct backing_dev_info *info); void laptop_sync_completion(void); +void laptop_mode_sync(struct work_struct *work); +void laptop_mode_timer_fn(unsigned long data); void throttle_vm_writeout(gfp_t gfp_mask); /* These are exported to sysctl. 
*/ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0b19943ecf8b..d0f2b3765f8d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -683,10 +683,6 @@ void throttle_vm_writeout(gfp_t gfp_mask) } } -static void laptop_timer_fn(unsigned long unused); - -static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); - /* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ @@ -697,21 +693,19 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write, return 0; } -static void do_laptop_sync(struct work_struct *work) +void laptop_mode_timer_fn(unsigned long data) { - wakeup_flusher_threads(0); - kfree(work); -} + struct request_queue *q = (struct request_queue *)data; + int nr_pages = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); -static void laptop_timer_fn(unsigned long unused) -{ - struct work_struct *work; + /* + * We want to write everything out, not just down to the dirty + * threshold + */ - work = kmalloc(sizeof(*work), GFP_ATOMIC); - if (work) { - INIT_WORK(work, do_laptop_sync); - schedule_work(work); - } + if (bdi_has_dirty_io(&q->backing_dev_info)) + bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages); } /* @@ -719,9 +713,9 @@ static void laptop_timer_fn(unsigned long unused) * of all dirty data a few seconds from now. If the flush is already scheduled * then push it back - the user is still using the disk. */ -void laptop_io_completion(void) +void laptop_io_completion(struct backing_dev_info *info) { - mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode); + mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode); } /* @@ -731,7 +725,14 @@ void laptop_io_completion(void) */ void laptop_sync_completion(void) { - del_timer(&laptop_mode_wb_timer); + struct backing_dev_info *bdi; + + rcu_read_lock(); + + list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) + del_timer(&bdi->laptop_mode_wb_timer); + + rcu_read_unlock(); } /* -- cgit v1.2.3-59-g8ed1b From 84c124da9ff50bd71fab9c939ee5b7cd8bef2bd9 Mon Sep 17 00:00:00 2001 From: Divyesh Shah Date: Fri, 9 Apr 2010 08:31:19 +0200 Subject: blkio: Changes to IO controller additional stats patches that include some minor fixes and addresses all comments. Changelog: (most based on Vivek Goyal's comments) o renamed blkiocg_reset_write to blkiocg_reset_stats o more clarification in the documentation on io_service_time and io_wait_time o Initialize blkg->stats_lock o rename io_add_stat to blkio_add_stat and declare it static o use bool for direction and sync o derive direction and sync info from existing rq methods o use 12 for major:minor string length o define io_service_time better to cover the NCQ case o add a separate reset_stats interface o make the indexed stats a 2d array to simplify macro and function pointer code o blkio.time now exports in jiffies as before o Added stats description in patch description and Documentation/cgroup/blkio-controller.txt o Prefix all stats functions with blkio and make them static as applicable o replace IO_TYPE_MAX with IO_TYPE_TOTAL o Moved #define constant to top of blk-cgroup.c o Pass dev_t around instead of char * o Add note to documentation file about resetting stats o use BLK_CGROUP_MODULE in addition to BLK_CGROUP config option in #ifdef statements o Avoid struct request specific knowledge in blk-cgroup. blk-cgroup.h now has rq_direction() and rq_sync() functions which are used by CFQ and when using io-controller at a higher level, bio_* functions can be added. 
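Two of these items, the 2-D stat array and the start/io_start timestamps,
fix the shape of the accounting code that follows. The sketch below models
both with simplified, hypothetical names; it mirrors the blkio_add_stat and
completion-stat logic of the patch but is plain userspace C, not the kernel
implementation.

/*
 * 2-D stats layout: one row per stat type, one column per request
 * class. Every sample lands in exactly two columns of one row:
 * direction picks READ/WRITE, sync picks SYNC/ASYNC.
 */
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

enum stat_type { STAT_SERVICE_TIME, STAT_SERVICE_BYTES, STAT_SERVICED,
		 STAT_WAIT_TIME, STAT_NR };
enum stat_sub_type { STAT_READ, STAT_WRITE, STAT_SYNC, STAT_ASYNC,
		     STAT_TOTAL };

static uint64_t stat_arr[STAT_NR][STAT_TOTAL];

static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
			   bool sync)
{
	stat[direction ? STAT_WRITE : STAT_READ] += add;
	stat[sync ? STAT_SYNC : STAT_ASYNC] += add;
}

/* Completion bookkeeping: dispatch-to-completion is service time,
 * queue-to-dispatch is wait time, guarded against clock skew. */
static void update_completion_stats(uint64_t now, uint64_t start_time,
				    uint64_t io_start_time, bool direction,
				    bool sync)
{
	if (now > io_start_time)
		blkio_add_stat(stat_arr[STAT_SERVICE_TIME],
			       now - io_start_time, direction, sync);
	if (io_start_time > start_time)
		blkio_add_stat(stat_arr[STAT_WAIT_TIME],
			       io_start_time - start_time, direction, sync);
}

int main(void)
{
	/* sync write queued at t=100ns, dispatched at 400ns, done at 900ns */
	update_completion_stats(900, 100, 400, true, true);
	printf("service=%llu wait=%llu (write column)\n",
	       (unsigned long long)stat_arr[STAT_SERVICE_TIME][STAT_WRITE],
	       (unsigned long long)stat_arr[STAT_WAIT_TIME][STAT_WRITE]);
	return 0;
}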
Signed-off-by: Divyesh Shah Signed-off-by: Jens Axboe --- Documentation/cgroups/blkio-controller.txt | 48 +++++++- block/blk-cgroup.c | 190 +++++++++++++---------------- block/blk-cgroup.h | 64 ++++++---- block/cfq-iosched.c | 8 +- include/linux/blkdev.h | 18 +++ 5 files changed, 198 insertions(+), 130 deletions(-) diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt index 630879cd9a42..ed04fe9cce1a 100644 --- a/Documentation/cgroups/blkio-controller.txt +++ b/Documentation/cgroups/blkio-controller.txt @@ -77,7 +77,6 @@ Details of cgroup files ======================= - blkio.weight - Specifies per cgroup weight. - Currently allowed range of weights is from 100 to 1000. - blkio.time @@ -92,6 +91,49 @@ Details of cgroup files third field specifies the number of sectors transferred by the group to/from the device. +- blkio.io_service_bytes + - Number of bytes transferred to/from the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of bytes. + +- blkio.io_serviced + - Number of IOs completed to/from the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of IOs. + +- blkio.io_service_time + - Total amount of time between request dispatch and request completion + for the IOs done by this cgroup. This is in nanoseconds to make it + meaningful for flash devices too. For devices with queue depth of 1, + this time represents the actual service time. When queue_depth > 1, + that is no longer true as requests may be served out of order. This + may cause the service time for a given IO to include the service time + of multiple IOs when served out of order which may result in total + io_service_time > actual time elapsed. This time is further divided by + the type of operation - read or write, sync or async. First two fields + specify the major and minor number of the device, third field + specifies the operation type and the fourth field specifies the + io_service_time in ns. + +- blkio.io_wait_time + - Total amount of time the IOs for this cgroup spent waiting in the + scheduler queues for service. This can be greater than the total time + elapsed since it is cumulative io_wait_time for all IOs. It is not a + measure of total time the cgroup spent waiting but rather a measure of + the wait_time for its individual IOs. For devices with queue_depth > 1 + this metric does not include the time spent waiting for service once + the IO is dispatched to the device but till it actually gets serviced + (there might be a time lag here due to re-ordering of requests by the + device). This is in nanoseconds to make it meaningful for flash + devices too. This time is further divided by the type of operation - + read or write, sync or async. First two fields specify the major and + minor number of the device, third field specifies the operation type + and the fourth field specifies the io_wait_time in ns. + - blkio.dequeue - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. 
This gives the statistics about how many a times a group was dequeued @@ -99,6 +141,10 @@ Details of cgroup files and minor number of the device and third field specifies the number of times a group was dequeued from a particular device. +- blkio.reset_stats + - Writing an int to this file will result in resetting all the stats + for that cgroup. + CFQ sysfs tunable ================= /sys/block//queue/iosched/group_isolation diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 9af7257f429c..6797df508821 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -18,6 +18,8 @@ #include #include "blk-cgroup.h" +#define MAX_KEY_LEN 100 + static DEFINE_SPINLOCK(blkio_list_lock); static LIST_HEAD(blkio_list); @@ -56,24 +58,27 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) } EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); +void blkio_group_init(struct blkio_group *blkg) +{ + spin_lock_init(&blkg->stats_lock); +} +EXPORT_SYMBOL_GPL(blkio_group_init); + /* * Add to the appropriate stat variable depending on the request type. * This should be called with the blkg->stats_lock held. */ -void io_add_stat(uint64_t *stat, uint64_t add, unsigned int flags) +static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction, + bool sync) { - if (flags & REQ_RW) - stat[IO_WRITE] += add; + if (direction) + stat[BLKIO_STAT_WRITE] += add; else - stat[IO_READ] += add; - /* - * Everywhere in the block layer, an IO is treated as sync if it is a - * read or a SYNC write. We follow the same norm. - */ - if (!(flags & REQ_RW) || flags & REQ_RW_SYNC) - stat[IO_SYNC] += add; + stat[BLKIO_STAT_READ] += add; + if (sync) + stat[BLKIO_STAT_SYNC] += add; else - stat[IO_ASYNC] += add; + stat[BLKIO_STAT_ASYNC] += add; } void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) @@ -86,23 +91,25 @@ void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) } EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); -void blkiocg_update_request_dispatch_stats(struct blkio_group *blkg, - struct request *rq) +void blkiocg_update_dispatch_stats(struct blkio_group *blkg, + uint64_t bytes, bool direction, bool sync) { struct blkio_group_stats *stats; unsigned long flags; spin_lock_irqsave(&blkg->stats_lock, flags); stats = &blkg->stats; - stats->sectors += blk_rq_sectors(rq); - io_add_stat(stats->io_serviced, 1, rq->cmd_flags); - io_add_stat(stats->io_service_bytes, blk_rq_sectors(rq) << 9, - rq->cmd_flags); + stats->sectors += bytes >> 9; + blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction, + sync); + blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes, + direction, sync); spin_unlock_irqrestore(&blkg->stats_lock, flags); } +EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); -void blkiocg_update_request_completion_stats(struct blkio_group *blkg, - struct request *rq) +void blkiocg_update_completion_stats(struct blkio_group *blkg, + uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) { struct blkio_group_stats *stats; unsigned long flags; @@ -110,16 +117,15 @@ void blkiocg_update_request_completion_stats(struct blkio_group *blkg, spin_lock_irqsave(&blkg->stats_lock, flags); stats = &blkg->stats; - if (time_after64(now, rq->io_start_time_ns)) - io_add_stat(stats->io_service_time, now - rq->io_start_time_ns, - rq->cmd_flags); - if (time_after64(rq->io_start_time_ns, rq->start_time_ns)) - io_add_stat(stats->io_wait_time, - rq->io_start_time_ns - rq->start_time_ns, - rq->cmd_flags); + if (time_after64(now, io_start_time)) + 
blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME], + now - io_start_time, direction, sync); + if (time_after64(io_start_time, start_time)) + blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME], + io_start_time - start_time, direction, sync); spin_unlock_irqrestore(&blkg->stats_lock, flags); } -EXPORT_SYMBOL_GPL(blkiocg_update_request_completion_stats); +EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev) @@ -230,7 +236,7 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) } static int -blkiocg_reset_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) +blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) { struct blkio_cgroup *blkcg; struct blkio_group *blkg; @@ -249,29 +255,32 @@ blkiocg_reset_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) return 0; } -void get_key_name(int type, char *disk_id, char *str, int chars_left) +static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str, + int chars_left, bool diskname_only) { - strlcpy(str, disk_id, chars_left); + snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev)); chars_left -= strlen(str); if (chars_left <= 0) { printk(KERN_WARNING "Possibly incorrect cgroup stat display format"); return; } + if (diskname_only) + return; switch (type) { - case IO_READ: + case BLKIO_STAT_READ: strlcat(str, " Read", chars_left); break; - case IO_WRITE: + case BLKIO_STAT_WRITE: strlcat(str, " Write", chars_left); break; - case IO_SYNC: + case BLKIO_STAT_SYNC: strlcat(str, " Sync", chars_left); break; - case IO_ASYNC: + case BLKIO_STAT_ASYNC: strlcat(str, " Async", chars_left); break; - case IO_TYPE_MAX: + case BLKIO_STAT_TOTAL: strlcat(str, " Total", chars_left); break; default: @@ -279,63 +288,47 @@ void get_key_name(int type, char *disk_id, char *str, int chars_left) } } -typedef uint64_t (get_var) (struct blkio_group *, int); +static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val, + struct cgroup_map_cb *cb, dev_t dev) +{ + blkio_get_key_name(0, dev, str, chars_left, true); + cb->fill(cb, str, val); + return val; +} -#define MAX_KEY_LEN 100 -uint64_t get_typed_stat(struct blkio_group *blkg, struct cgroup_map_cb *cb, - get_var *getvar, char *disk_id) +/* This should be called with blkg->stats_lock held */ +static uint64_t blkio_get_stat(struct blkio_group *blkg, + struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) { uint64_t disk_total; char key_str[MAX_KEY_LEN]; - int type; + enum stat_sub_type sub_type; + + if (type == BLKIO_STAT_TIME) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.time, cb, dev); + if (type == BLKIO_STAT_SECTORS) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.sectors, cb, dev); +#ifdef CONFIG_DEBUG_BLK_CGROUP + if (type == BLKIO_STAT_DEQUEUE) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.dequeue, cb, dev); +#endif - for (type = 0; type < IO_TYPE_MAX; type++) { - get_key_name(type, disk_id, key_str, MAX_KEY_LEN); - cb->fill(cb, key_str, getvar(blkg, type)); + for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; + sub_type++) { + blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); + cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]); } - disk_total = getvar(blkg, IO_READ) + getvar(blkg, IO_WRITE); - get_key_name(IO_TYPE_MAX, disk_id, key_str, MAX_KEY_LEN); + disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] + + 
blkg->stats.stat_arr[type][BLKIO_STAT_WRITE]; + blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); cb->fill(cb, key_str, disk_total); return disk_total; } -uint64_t get_stat(struct blkio_group *blkg, struct cgroup_map_cb *cb, - get_var *getvar, char *disk_id) -{ - uint64_t var = getvar(blkg, 0); - cb->fill(cb, disk_id, var); - return var; -} - -#define GET_STAT_INDEXED(__VAR) \ -uint64_t get_##__VAR##_stat(struct blkio_group *blkg, int type) \ -{ \ - return blkg->stats.__VAR[type]; \ -} \ - -GET_STAT_INDEXED(io_service_bytes); -GET_STAT_INDEXED(io_serviced); -GET_STAT_INDEXED(io_service_time); -GET_STAT_INDEXED(io_wait_time); -#undef GET_STAT_INDEXED - -#define GET_STAT(__VAR, __CONV) \ -uint64_t get_##__VAR##_stat(struct blkio_group *blkg, int dummy) \ -{ \ - uint64_t data = blkg->stats.__VAR; \ - if (__CONV) \ - data = (uint64_t)jiffies_to_msecs(data) * NSEC_PER_MSEC;\ - return data; \ -} - -GET_STAT(time, 1); -GET_STAT(sectors, 0); -#ifdef CONFIG_DEBUG_BLK_CGROUP -GET_STAT(dequeue, 0); -#endif -#undef GET_STAT - -#define SHOW_FUNCTION_PER_GROUP(__VAR, get_stats, getvar, show_total) \ +#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total) \ static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ struct cftype *cftype, struct cgroup_map_cb *cb) \ { \ @@ -343,7 +336,6 @@ static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ struct blkio_group *blkg; \ struct hlist_node *n; \ uint64_t cgroup_total = 0; \ - char disk_id[10]; \ \ if (!cgroup_lock_live_group(cgroup)) \ return -ENODEV; \ @@ -353,10 +345,8 @@ static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ if (blkg->dev) { \ spin_lock_irq(&blkg->stats_lock); \ - snprintf(disk_id, 10, "%u:%u", MAJOR(blkg->dev),\ - MINOR(blkg->dev)); \ - cgroup_total += get_stats(blkg, cb, getvar, \ - disk_id); \ + cgroup_total += blkio_get_stat(blkg, cb, \ + blkg->dev, type); \ spin_unlock_irq(&blkg->stats_lock); \ } \ } \ @@ -367,16 +357,14 @@ static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ return 0; \ } -SHOW_FUNCTION_PER_GROUP(time, get_stat, get_time_stat, 0); -SHOW_FUNCTION_PER_GROUP(sectors, get_stat, get_sectors_stat, 0); -SHOW_FUNCTION_PER_GROUP(io_service_bytes, get_typed_stat, - get_io_service_bytes_stat, 1); -SHOW_FUNCTION_PER_GROUP(io_serviced, get_typed_stat, get_io_serviced_stat, 1); -SHOW_FUNCTION_PER_GROUP(io_service_time, get_typed_stat, - get_io_service_time_stat, 1); -SHOW_FUNCTION_PER_GROUP(io_wait_time, get_typed_stat, get_io_wait_time_stat, 1); +SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0); +SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0); +SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1); +SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1); +SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1); +SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1); #ifdef CONFIG_DEBUG_BLK_CGROUP -SHOW_FUNCTION_PER_GROUP(dequeue, get_stat, get_dequeue_stat, 0); +SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0); #endif #undef SHOW_FUNCTION_PER_GROUP @@ -398,32 +386,30 @@ struct cftype blkio_files[] = { { .name = "time", .read_map = blkiocg_time_read, - .write_u64 = blkiocg_reset_write, }, { .name = "sectors", .read_map = blkiocg_sectors_read, - .write_u64 = blkiocg_reset_write, }, { .name = "io_service_bytes", .read_map = blkiocg_io_service_bytes_read, - .write_u64 = blkiocg_reset_write, }, { .name = "io_serviced", .read_map = blkiocg_io_serviced_read, - 
.write_u64 = blkiocg_reset_write, }, { .name = "io_service_time", .read_map = blkiocg_io_service_time_read, - .write_u64 = blkiocg_reset_write, }, { .name = "io_wait_time", .read_map = blkiocg_io_wait_time_read, - .write_u64 = blkiocg_reset_write, + }, + { + .name = "reset_stats", + .write_u64 = blkiocg_reset_stats, }, #ifdef CONFIG_DEBUG_BLK_CGROUP { diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 80010ef64ab0..b22e55390a4f 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -23,12 +23,31 @@ extern struct cgroup_subsys blkio_subsys; #define blkio_subsys_id blkio_subsys.subsys_id #endif -enum io_type { - IO_READ = 0, - IO_WRITE, - IO_SYNC, - IO_ASYNC, - IO_TYPE_MAX +enum stat_type { + /* Total time spent (in ns) between request dispatch to the driver and + * request completion for IOs doen by this cgroup. This may not be + * accurate when NCQ is turned on. */ + BLKIO_STAT_SERVICE_TIME = 0, + /* Total bytes transferred */ + BLKIO_STAT_SERVICE_BYTES, + /* Total IOs serviced, post merge */ + BLKIO_STAT_SERVICED, + /* Total time spent waiting in scheduler queue in ns */ + BLKIO_STAT_WAIT_TIME, + /* All the single valued stats go below this */ + BLKIO_STAT_TIME, + BLKIO_STAT_SECTORS, +#ifdef CONFIG_DEBUG_BLK_CGROUP + BLKIO_STAT_DEQUEUE +#endif +}; + +enum stat_sub_type { + BLKIO_STAT_READ = 0, + BLKIO_STAT_WRITE, + BLKIO_STAT_SYNC, + BLKIO_STAT_ASYNC, + BLKIO_STAT_TOTAL }; struct blkio_cgroup { @@ -42,13 +61,7 @@ struct blkio_group_stats { /* total disk time and nr sectors dispatched by this group */ uint64_t time; uint64_t sectors; - /* Total disk time used by IOs in ns */ - uint64_t io_service_time[IO_TYPE_MAX]; - uint64_t io_service_bytes[IO_TYPE_MAX]; /* Total bytes transferred */ - /* Total IOs serviced, post merge */ - uint64_t io_serviced[IO_TYPE_MAX]; - /* Total time spent waiting in scheduler queue in ns */ - uint64_t io_wait_time[IO_TYPE_MAX]; + uint64_t stat_arr[BLKIO_STAT_WAIT_TIME + 1][BLKIO_STAT_TOTAL]; #ifdef CONFIG_DEBUG_BLK_CGROUP /* How many times this group has been removed from service tree */ unsigned long dequeue; @@ -65,7 +78,7 @@ struct blkio_group { char path[128]; #endif /* The device MKDEV(major, minor), this group has been created for */ - dev_t dev; + dev_t dev; /* Need to serialize the stats in the case of reset/update */ spinlock_t stats_lock; @@ -128,21 +141,21 @@ extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, extern int blkiocg_del_blkio_group(struct blkio_group *blkg); extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key); +void blkio_group_init(struct blkio_group *blkg); void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time); -void blkiocg_update_request_dispatch_stats(struct blkio_group *blkg, - struct request *rq); -void blkiocg_update_request_completion_stats(struct blkio_group *blkg, - struct request *rq); +void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, + bool direction, bool sync); +void blkiocg_update_completion_stats(struct blkio_group *blkg, + uint64_t start_time, uint64_t io_start_time, bool direction, bool sync); #else struct cgroup; static inline struct blkio_cgroup * cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } +static inline void blkio_group_init(struct blkio_group *blkg) {} static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev) -{ -} + struct blkio_group *blkg, void *key, dev_t dev) {} static inline int blkiocg_del_blkio_group(struct 
blkio_group *blkg) { return 0; } @@ -151,9 +164,10 @@ static inline struct blkio_group * blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) {} -static inline void blkiocg_update_request_dispatch_stats( - struct blkio_group *blkg, struct request *rq) {} -static inline void blkiocg_update_request_completion_stats( - struct blkio_group *blkg, struct request *rq) {} +static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg, + uint64_t bytes, bool direction, bool sync) {} +static inline void blkiocg_update_completion_stats(struct blkio_group *blkg, + uint64_t start_time, uint64_t io_start_time, bool direction, + bool sync) {} #endif #endif /* _BLK_CGROUP_H */ diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 42028e7128a7..5617ae030b15 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -955,6 +955,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) for_each_cfqg_st(cfqg, i, j, st) *st = CFQ_RB_ROOT; RB_CLEAR_NODE(&cfqg->rb_node); + blkio_group_init(&cfqg->blkg); /* * Take the initial reference that will be released on destroy @@ -1865,7 +1866,8 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) elv_dispatch_sort(q, rq); cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; - blkiocg_update_request_dispatch_stats(&cfqq->cfqg->blkg, rq); + blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq), + rq_data_dir(rq), rq_is_sync(rq)); } /* @@ -3286,7 +3288,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) WARN_ON(!cfqq->dispatched); cfqd->rq_in_driver--; cfqq->dispatched--; - blkiocg_update_request_completion_stats(&cfqq->cfqg->blkg, rq); + blkiocg_update_completion_stats(&cfqq->cfqg->blkg, rq_start_time_ns(rq), + rq_io_start_time_ns(rq), rq_data_dir(rq), + rq_is_sync(rq)); cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f3fff8bf85ee..d483c494672a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1209,9 +1209,27 @@ static inline void set_io_start_time_ns(struct request *req) { req->io_start_time_ns = sched_clock(); } + +static inline uint64_t rq_start_time_ns(struct request *req) +{ + return req->start_time_ns; +} + +static inline uint64_t rq_io_start_time_ns(struct request *req) +{ + return req->io_start_time_ns; +} #else static inline void set_start_time_ns(struct request *req) {} static inline void set_io_start_time_ns(struct request *req) {} +static inline uint64_t rq_start_time_ns(struct request *req) +{ + return 0; +} +static inline uint64_t rq_io_start_time_ns(struct request *req) +{ + return 0; +} #endif #define MODULE_ALIAS_BLOCKDEV(major,minor) \ -- cgit v1.2.3-59-g8ed1b From 812d402648f4fc1ab1091b2172a46fc1b367c724 Mon Sep 17 00:00:00 2001 From: Divyesh Shah Date: Thu, 8 Apr 2010 21:14:23 -0700 Subject: blkio: Add io_merged stat This includes both the number of bios merged into requests belonging to this cgroup as well as the number of requests merged together. In the past, we've observed different merging behavior across upstream kernels, some by design some actual bugs. This stat helps a lot in debugging such problems when applications report decreased throughput with a new kernel version. 
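A sketch of how such a merge event can be counted without teaching the
elevator core about cgroups: the block core invokes an optional callback and
the I/O scheduler does the accounting. Names here are simplified and
hypothetical; the actual hook the patch adds below is elevator_bio_merged_fn,
called from __make_request() and implemented by CFQ.

/*
 * Optional-callback pattern: the core fires a notification when a bio
 * is merged into an existing request; a scheduler that cares (CFQ)
 * registers a handler and bumps its per-cgroup io_merged counter.
 */
#include <stdio.h>

struct request; /* opaque in this sketch */
struct bio;

struct elevator_ops_model {
	/* optional: called when bio was merged into rq */
	void (*bio_merged_fn)(struct request *rq, struct bio *bio);
};

static void elv_bio_merged(const struct elevator_ops_model *ops,
			   struct request *rq, struct bio *bio)
{
	if (ops->bio_merged_fn) /* hook is optional */
		ops->bio_merged_fn(rq, bio);
}

static unsigned long io_merged;

static void cfq_bio_merged(struct request *rq, struct bio *bio)
{
	io_merged++; /* per-cgroup counter in the real code */
}

int main(void)
{
	struct elevator_ops_model ops = { .bio_merged_fn = cfq_bio_merged };

	elv_bio_merged(&ops, NULL, NULL);
	printf("io_merged=%lu\n", io_merged);
	return 0;
}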
This needed adding an extra elevator function to capture bios being merged as I did not want to pollute elevator code with blkiocg knowledge and hence needed the accounting invocation to come from CFQ. Signed-off-by: Divyesh Shah Signed-off-by: Jens Axboe --- Documentation/cgroups/blkio-controller.txt | 5 +++++ block/blk-cgroup.c | 17 +++++++++++++++++ block/blk-cgroup.h | 8 +++++++- block/blk-core.c | 2 ++ block/cfq-iosched.c | 11 +++++++++++ block/elevator.c | 9 +++++++++ include/linux/elevator.h | 6 ++++++ 7 files changed, 57 insertions(+), 1 deletion(-) diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt index ed04fe9cce1a..810e30171a54 100644 --- a/Documentation/cgroups/blkio-controller.txt +++ b/Documentation/cgroups/blkio-controller.txt @@ -134,6 +134,11 @@ Details of cgroup files minor number of the device, third field specifies the operation type and the fourth field specifies the io_wait_time in ns. +- blkio.io_merged + - Total number of bios/requests merged into requests belonging to this + cgroup. This is further divided by the type of operation - read or + write, sync or async. + - blkio.dequeue - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This gives the statistics about how many a times a group was dequeued diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 6797df508821..d23b538858ce 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -127,6 +127,18 @@ void blkiocg_update_completion_stats(struct blkio_group *blkg, } EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); +void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, + bool sync) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction, + sync); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); + void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev) { @@ -363,6 +375,7 @@ SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1); SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1); SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1); SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1); +SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1); #ifdef CONFIG_DEBUG_BLK_CGROUP SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0); #endif @@ -407,6 +420,10 @@ struct cftype blkio_files[] = { .name = "io_wait_time", .read_map = blkiocg_io_wait_time_read, }, + { + .name = "io_merged", + .read_map = blkiocg_io_merged_read, + }, { .name = "reset_stats", .write_u64 = blkiocg_reset_stats, diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index b22e55390a4f..470a29db6bec 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -34,6 +34,8 @@ enum stat_type { BLKIO_STAT_SERVICED, /* Total time spent waiting in scheduler queue in ns */ BLKIO_STAT_WAIT_TIME, + /* Number of IOs merged */ + BLKIO_STAT_MERGED, /* All the single valued stats go below this */ BLKIO_STAT_TIME, BLKIO_STAT_SECTORS, @@ -61,7 +63,7 @@ struct blkio_group_stats { /* total disk time and nr sectors dispatched by this group */ uint64_t time; uint64_t sectors; - uint64_t stat_arr[BLKIO_STAT_WAIT_TIME + 1][BLKIO_STAT_TOTAL]; + uint64_t stat_arr[BLKIO_STAT_MERGED + 1][BLKIO_STAT_TOTAL]; #ifdef CONFIG_DEBUG_BLK_CGROUP /* How many times this group has been removed from service tree */ unsigned long dequeue; @@ -148,6 
+150,8 @@ void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, bool direction, bool sync); void blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync); +void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, + bool sync); #else struct cgroup; static inline struct blkio_cgroup * @@ -169,5 +173,7 @@ static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg, static inline void blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) {} +static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg, + bool direction, bool sync) {} #endif #endif /* _BLK_CGROUP_H */ diff --git a/block/blk-core.c b/block/blk-core.c index 4b1b29ef2cb0..e9a5ae25db8c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1202,6 +1202,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) if (!blk_rq_cpu_valid(req)) req->cpu = bio->bi_comp_cpu; drive_stat_acct(req, 0); + elv_bio_merged(q, req, bio); if (!attempt_back_merge(q, req)) elv_merged_request(q, req, el_ret); goto out; @@ -1235,6 +1236,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) if (!blk_rq_cpu_valid(req)) req->cpu = bio->bi_comp_cpu; drive_stat_acct(req, 0); + elv_bio_merged(q, req, bio); if (!attempt_front_merge(q, req)) elv_merged_request(q, req, el_ret); goto out; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5617ae030b15..4eb1906cf6c6 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1467,6 +1467,14 @@ static void cfq_merged_request(struct request_queue *q, struct request *req, } } +static void cfq_bio_merged(struct request_queue *q, struct request *req, + struct bio *bio) +{ + struct cfq_queue *cfqq = RQ_CFQQ(req); + blkiocg_update_io_merged_stats(&cfqq->cfqg->blkg, bio_data_dir(bio), + cfq_bio_sync(bio)); +} + static void cfq_merged_requests(struct request_queue *q, struct request *rq, struct request *next) @@ -1484,6 +1492,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, if (cfqq->next_rq == next) cfqq->next_rq = rq; cfq_remove_request(next); + blkiocg_update_io_merged_stats(&cfqq->cfqg->blkg, rq_data_dir(next), + rq_is_sync(next)); } static int cfq_allow_merge(struct request_queue *q, struct request *rq, @@ -3861,6 +3871,7 @@ static struct elevator_type iosched_cfq = { .elevator_merged_fn = cfq_merged_request, .elevator_merge_req_fn = cfq_merged_requests, .elevator_allow_merge_fn = cfq_allow_merge, + .elevator_bio_merged_fn = cfq_bio_merged, .elevator_dispatch_fn = cfq_dispatch_requests, .elevator_add_req_fn = cfq_insert_request, .elevator_activate_req_fn = cfq_activate_request, diff --git a/block/elevator.c b/block/elevator.c index 76e3702d5381..5e734592bb40 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -539,6 +539,15 @@ void elv_merge_requests(struct request_queue *q, struct request *rq, q->last_merge = rq; } +void elv_bio_merged(struct request_queue *q, struct request *rq, + struct bio *bio) +{ + struct elevator_queue *e = q->elevator; + + if (e->ops->elevator_bio_merged_fn) + e->ops->elevator_bio_merged_fn(q, rq, bio); +} + void elv_requeue_request(struct request_queue *q, struct request *rq) { /* diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 1cb3372e65d8..2c958f4fce1e 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -14,6 +14,9 @@ typedef void (elevator_merged_fn) (struct 
request_queue *, struct request *, int typedef int (elevator_allow_merge_fn) (struct request_queue *, struct request *, struct bio *); +typedef void (elevator_bio_merged_fn) (struct request_queue *, + struct request *, struct bio *); + typedef int (elevator_dispatch_fn) (struct request_queue *, int); typedef void (elevator_add_req_fn) (struct request_queue *, struct request *); @@ -36,6 +39,7 @@ struct elevator_ops elevator_merged_fn *elevator_merged_fn; elevator_merge_req_fn *elevator_merge_req_fn; elevator_allow_merge_fn *elevator_allow_merge_fn; + elevator_bio_merged_fn *elevator_bio_merged_fn; elevator_dispatch_fn *elevator_dispatch_fn; elevator_add_req_fn *elevator_add_req_fn; @@ -103,6 +107,8 @@ extern int elv_merge(struct request_queue *, struct request **, struct bio *); extern void elv_merge_requests(struct request_queue *, struct request *, struct request *); extern void elv_merged_request(struct request_queue *, struct request *, int); +extern void elv_bio_merged(struct request_queue *q, struct request *, + struct bio *); extern void elv_requeue_request(struct request_queue *, struct request *); extern int elv_queue_empty(struct request_queue *); extern struct request *elv_former_request(struct request_queue *, struct request *); -- cgit v1.2.3-59-g8ed1b From cdc1184cf4a7bd99f5473a91244197accc49146b Mon Sep 17 00:00:00 2001 From: Divyesh Shah Date: Thu, 8 Apr 2010 21:15:10 -0700 Subject: blkio: Add io_queued and avg_queue_size stats These stats are useful for getting a feel for the queue depth of the cgroup, i.e., how filled up its queues are at a given instant and over the existence of the cgroup. This ability is useful when debugging problems in the wild as it helps understand the application's IO pattern w/o having to read through the userspace code (coz its tedious or just not available) or w/o the ability to run blktrace (since you may not have root access and/or not want to disturb performance). Signed-off-by: Divyesh Shah Signed-off-by: Jens Axboe --- Documentation/cgroups/blkio-controller.txt | 11 ++++ block/blk-cgroup.c | 98 ++++++++++++++++++++++++++++-- block/blk-cgroup.h | 20 +++++- block/cfq-iosched.c | 11 ++++ 4 files changed, 134 insertions(+), 6 deletions(-) diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt index 810e30171a54..6e52e7c512a4 100644 --- a/Documentation/cgroups/blkio-controller.txt +++ b/Documentation/cgroups/blkio-controller.txt @@ -139,6 +139,17 @@ Details of cgroup files cgroup. This is further divided by the type of operation - read or write, sync or async. +- blkio.io_queued + - Total number of requests queued up at any given instant for this + cgroup. This is further divided by the type of operation - read or + write, sync or async. + +- blkio.avg_queue_size + - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. + The average queue size for this cgroup over the entire time of this + cgroup's existence. Queue size samples are taken each time one of the + queues of this cgroup gets a timeslice. + - blkio.dequeue - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. 
This gives the statistics about how many times a group was dequeued diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index d23b538858ce..1e0c4970b35d 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -81,6 +81,71 @@ static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction, stat[BLKIO_STAT_ASYNC] += add; } +/* + * Decrements the appropriate stat variable if non-zero depending on the + * request type. Panics on value being zero. + * This should be called with the blkg->stats_lock held. + */ +static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync) +{ + if (direction) { + BUG_ON(stat[BLKIO_STAT_WRITE] == 0); + stat[BLKIO_STAT_WRITE]--; + } else { + BUG_ON(stat[BLKIO_STAT_READ] == 0); + stat[BLKIO_STAT_READ]--; + } + if (sync) { + BUG_ON(stat[BLKIO_STAT_SYNC] == 0); + stat[BLKIO_STAT_SYNC]--; + } else { + BUG_ON(stat[BLKIO_STAT_ASYNC] == 0); + stat[BLKIO_STAT_ASYNC]--; + } +} + +#ifdef CONFIG_DEBUG_BLK_CGROUP +void blkiocg_update_set_active_queue_stats(struct blkio_group *blkg) +{ + unsigned long flags; + struct blkio_group_stats *stats; + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + stats->avg_queue_size_sum += + stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] + + stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]; + stats->avg_queue_size_samples++; + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_set_active_queue_stats); +#endif + +void blkiocg_update_request_add_stats(struct blkio_group *blkg, + struct blkio_group *curr_blkg, bool direction, + bool sync) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction, + sync); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_request_add_stats); + +void blkiocg_update_request_remove_stats(struct blkio_group *blkg, + bool direction, bool sync) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], + direction, sync); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_request_remove_stats); + void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) { unsigned long flags; @@ -253,14 +318,18 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) struct blkio_cgroup *blkcg; struct blkio_group *blkg; struct hlist_node *n; - struct blkio_group_stats *stats; + uint64_t queued[BLKIO_STAT_TOTAL]; + int i; blkcg = cgroup_to_blkio_cgroup(cgroup); spin_lock_irq(&blkcg->lock); hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { spin_lock(&blkg->stats_lock); - stats = &blkg->stats; - memset(stats, 0, sizeof(struct blkio_group_stats)); + for (i = 0; i < BLKIO_STAT_TOTAL; i++) + queued[i] = blkg->stats.stat_arr[BLKIO_STAT_QUEUED][i]; + memset(&blkg->stats, 0, sizeof(struct blkio_group_stats)); + for (i = 0; i < BLKIO_STAT_TOTAL; i++) + blkg->stats.stat_arr[BLKIO_STAT_QUEUED][i] = queued[i]; spin_unlock(&blkg->stats_lock); } spin_unlock_irq(&blkcg->lock); @@ -323,6 +392,15 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg, return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, blkg->stats.sectors, cb, dev); #ifdef CONFIG_DEBUG_BLK_CGROUP + if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { + uint64_t sum = blkg->stats.avg_queue_size_sum; + uint64_t samples = blkg->stats.avg_queue_size_samples; + if (samples) + do_div(sum, samples); + else +
sum = 0; + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev); + } if (type == BLKIO_STAT_DEQUEUE) return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, blkg->stats.dequeue, cb, dev); @@ -376,8 +454,10 @@ SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1); SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1); SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1); SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1); +SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1); #ifdef CONFIG_DEBUG_BLK_CGROUP SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0); +SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0); #endif #undef SHOW_FUNCTION_PER_GROUP @@ -424,15 +504,23 @@ struct cftype blkio_files[] = { .name = "io_merged", .read_map = blkiocg_io_merged_read, }, + { + .name = "io_queued", + .read_map = blkiocg_io_queued_read, + }, { .name = "reset_stats", .write_u64 = blkiocg_reset_stats, }, #ifdef CONFIG_DEBUG_BLK_CGROUP - { + { + .name = "avg_queue_size", + .read_map = blkiocg_avg_queue_size_read, + }, + { .name = "dequeue", .read_map = blkiocg_dequeue_read, - }, + }, #endif }; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 470a29db6bec..bea7f3b9a88e 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -36,10 +36,13 @@ enum stat_type { BLKIO_STAT_WAIT_TIME, /* Number of IOs merged */ BLKIO_STAT_MERGED, + /* Number of IOs queued up */ + BLKIO_STAT_QUEUED, /* All the single valued stats go below this */ BLKIO_STAT_TIME, BLKIO_STAT_SECTORS, #ifdef CONFIG_DEBUG_BLK_CGROUP + BLKIO_STAT_AVG_QUEUE_SIZE, BLKIO_STAT_DEQUEUE #endif }; @@ -63,8 +66,12 @@ struct blkio_group_stats { /* total disk time and nr sectors dispatched by this group */ uint64_t time; uint64_t sectors; - uint64_t stat_arr[BLKIO_STAT_MERGED + 1][BLKIO_STAT_TOTAL]; + uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; #ifdef CONFIG_DEBUG_BLK_CGROUP + /* Sum of number of IOs queued across all samples */ + uint64_t avg_queue_size_sum; + /* Count of samples taken for average */ + uint64_t avg_queue_size_samples; /* How many times this group has been removed from service tree */ unsigned long dequeue; #endif @@ -127,10 +134,13 @@ static inline char *blkg_path(struct blkio_group *blkg) { return blkg->path; } +void blkiocg_update_set_active_queue_stats(struct blkio_group *blkg); void blkiocg_update_dequeue_stats(struct blkio_group *blkg, unsigned long dequeue); #else static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } +static inline void blkiocg_update_set_active_queue_stats( + struct blkio_group *blkg) {} static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg, unsigned long dequeue) {} #endif @@ -152,6 +162,10 @@ void blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync); void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, bool sync); +void blkiocg_update_request_add_stats(struct blkio_group *blkg, + struct blkio_group *curr_blkg, bool direction, bool sync); +void blkiocg_update_request_remove_stats(struct blkio_group *blkg, + bool direction, bool sync); #else struct cgroup; static inline struct blkio_cgroup * @@ -175,5 +189,9 @@ static inline void blkiocg_update_completion_stats(struct blkio_group *blkg, bool sync) {} static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, bool sync) {} +static inline void blkiocg_update_request_add_stats(struct blkio_group *blkg, + 
struct blkio_group *curr_blkg, bool direction, bool sync) {} +static inline void blkiocg_update_request_remove_stats(struct blkio_group *blkg, + bool direction, bool sync) {} #endif #endif /* _BLK_CGROUP_H */ diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 4eb1906cf6c6..8e0b86a9111a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1380,7 +1380,12 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) { elv_rb_del(&cfqq->sort_list, rq); cfqq->queued[rq_is_sync(rq)]--; + blkiocg_update_request_remove_stats(&cfqq->cfqg->blkg, rq_data_dir(rq), + rq_is_sync(rq)); cfq_add_rq_rb(rq); + blkiocg_update_request_add_stats( + &cfqq->cfqg->blkg, &cfqq->cfqd->serving_group->blkg, + rq_data_dir(rq), rq_is_sync(rq)); } static struct request * @@ -1436,6 +1441,8 @@ static void cfq_remove_request(struct request *rq) cfq_del_rq_rb(rq); cfqq->cfqd->rq_queued--; + blkiocg_update_request_remove_stats(&cfqq->cfqg->blkg, rq_data_dir(rq), + rq_is_sync(rq)); if (rq_is_meta(rq)) { WARN_ON(!cfqq->meta_pending); cfqq->meta_pending--; @@ -1527,6 +1534,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, if (cfqq) { cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", cfqd->serving_prio, cfqd->serving_type); + blkiocg_update_set_active_queue_stats(&cfqq->cfqg->blkg); cfqq->slice_start = 0; cfqq->dispatch_start = jiffies; cfqq->allocated_slice = 0; @@ -3213,6 +3221,9 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) list_add_tail(&rq->queuelist, &cfqq->fifo); cfq_add_rq_rb(rq); + blkiocg_update_request_add_stats(&cfqq->cfqg->blkg, + &cfqd->serving_group->blkg, rq_data_dir(rq), + rq_is_sync(rq)); cfq_rq_enqueued(cfqd, cfqq, rq); } -- cgit v1.2.3-59-g8ed1b From 812df48d127365ffd0869aa139738f572a86759c Mon Sep 17 00:00:00 2001 From: Divyesh Shah Date: Thu, 8 Apr 2010 21:15:35 -0700 Subject: blkio: Add more debug-only per-cgroup stats 1) group_wait_time - This is the amount of time the cgroup had to wait to get a timeslice for one of its queues from when it became busy, i.e., went from 0 to 1 request queued. This is different from the io_wait_time which is the cumulative total of the amount of time spent by each IO in that cgroup waiting in the scheduler queue. This stat is a great way to find any jobs in the fleet that are being starved or waiting for longer than what is expected (due to an IO controller bug or any other issue). 2) empty_time - This is the amount of time a cgroup spends w/o any pending requests. This stat is useful when a job does not seem to be able to use its assigned disk share, by helping check whether that is happening due to an IO controller bug or because the job is not submitting enough IOs. 3) idle_time - This is the amount of time spent by the IO scheduler idling for a given cgroup in anticipation of a better request than the existing ones from other queues/cgroups. All these stats are recorded using start and stop events. When reading these stats, we do not add the delta between the current time and the last start time if we're between the start and stop events. We avoid doing this to make sure that these numbers are always monotonically increasing when read. Since we're using sched_clock() which may use the tsc as its source, it may induce some inconsistency (due to tsc resync across cpus) if we included the current delta.
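To make the start/stop pattern concrete, here is a condensed, illustrative sketch of the stop-event update for group_wait_time, modeled on the helper added in the diff below (the flag helpers and field names are the patch's; the sketch itself is not part of the patch, and empty_time and idle_time follow the same shape):

	/* Illustrative sketch only; caller is assumed to hold blkg->stats_lock. */
	static void sketch_update_group_wait_time(struct blkio_group_stats *stats)
	{
		unsigned long long now;

		/* No start event recorded: the group was not waiting. */
		if (!blkio_blkg_waiting(stats))
			return;

		now = sched_clock();
		/* Guard against sched_clock() appearing to go backwards (tsc resync). */
		if (time_after64(now, stats->start_group_wait_time))
			stats->group_wait_time += now - stats->start_group_wait_time;
		blkio_clear_blkg_waiting(stats);

		/*
		 * Readers report only the accumulated group_wait_time; the
		 * in-flight delta is folded in solely at this stop event,
		 * which is what keeps successive reads monotonic.
		 */
	}
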
Signed-off-by: Divyesh Shah Signed-off-by: Jens Axboe --- Documentation/cgroups/blkio-controller.txt | 29 ++++++ block/blk-cgroup.c | 159 ++++++++++++++++++++++++++++- block/blk-cgroup.h | 54 ++++++++++ block/cfq-iosched.c | 50 +++++---- 4 files changed, 271 insertions(+), 21 deletions(-) diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt index 6e52e7c512a4..db054ea3e7fb 100644 --- a/Documentation/cgroups/blkio-controller.txt +++ b/Documentation/cgroups/blkio-controller.txt @@ -150,6 +150,35 @@ Details of cgroup files cgroup's existence. Queue size samples are taken each time one of the queues of this cgroup gets a timeslice. +- blkio.group_wait_time + - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. + This is the amount of time the cgroup had to wait since it became busy + (i.e., went from 0 to 1 request queued) to get a timeslice for one of + its queues. This is different from the io_wait_time which is the + cumulative total of the amount of time spent by each IO in that cgroup + waiting in the scheduler queue. This is in nanoseconds. If this is + read when the cgroup is in a waiting (for timeslice) state, the stat + will only report the group_wait_time accumulated till the last time it + got a timeslice and will not include the current delta. + +- blkio.empty_time + - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. + This is the amount of time a cgroup spends without any pending + requests when not being served, i.e., it does not include any time + spent idling for one of the queues of the cgroup. This is in + nanoseconds. If this is read when the cgroup is in an empty state, + the stat will only report the empty_time accumulated till the last + time it had a pending request and will not include the current delta. + +- blkio.idle_time + - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. + This is the amount of time spent by the IO scheduler idling for a + given cgroup in anticipation of a better request than the existing ones + from other queues/cgroups. This is in nanoseconds. If this is read + when the cgroup is in an idling state, the stat will only report the + idle_time accumulated till the last idle period and will not include + the current delta. + - blkio.dequeue - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This gives the statistics about how many times a group was dequeued diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 1e0c4970b35d..1ecff7a39f2c 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -105,6 +105,76 @@ static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync) } #ifdef CONFIG_DEBUG_BLK_CGROUP +/* This should be called with the blkg->stats_lock held. */ +static void blkio_set_start_group_wait_time(struct blkio_group *blkg, + struct blkio_group *curr_blkg) +{ + if (blkio_blkg_waiting(&blkg->stats)) + return; + if (blkg == curr_blkg) + return; + blkg->stats.start_group_wait_time = sched_clock(); + blkio_mark_blkg_waiting(&blkg->stats); +} + +/* This should be called with the blkg->stats_lock held. */ +static void blkio_update_group_wait_time(struct blkio_group_stats *stats) +{ + unsigned long long now; + + if (!blkio_blkg_waiting(stats)) + return; + + now = sched_clock(); + if (time_after64(now, stats->start_group_wait_time)) + stats->group_wait_time += now - stats->start_group_wait_time; + blkio_clear_blkg_waiting(stats); +} + +/* This should be called with the blkg->stats_lock held.
*/ +static void blkio_end_empty_time(struct blkio_group_stats *stats) +{ + unsigned long long now; + + if (!blkio_blkg_empty(stats)) + return; + + now = sched_clock(); + if (time_after64(now, stats->start_empty_time)) + stats->empty_time += now - stats->start_empty_time; + blkio_clear_blkg_empty(stats); +} + +void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + BUG_ON(blkio_blkg_idling(&blkg->stats)); + blkg->stats.start_idle_time = sched_clock(); + blkio_mark_blkg_idling(&blkg->stats); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats); + +void blkiocg_update_idle_time_stats(struct blkio_group *blkg) +{ + unsigned long flags; + unsigned long long now; + struct blkio_group_stats *stats; + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + if (blkio_blkg_idling(stats)) { + now = sched_clock(); + if (time_after64(now, stats->start_idle_time)) + stats->idle_time += now - stats->start_idle_time; + blkio_clear_blkg_idling(stats); + } + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats); + void blkiocg_update_set_active_queue_stats(struct blkio_group *blkg) { unsigned long flags; @@ -116,9 +186,14 @@ void blkiocg_update_set_active_queue_stats(struct blkio_group *blkg) stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] + stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]; stats->avg_queue_size_samples++; + blkio_update_group_wait_time(stats); spin_unlock_irqrestore(&blkg->stats_lock, flags); } EXPORT_SYMBOL_GPL(blkiocg_update_set_active_queue_stats); +#else +static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg, + struct blkio_group *curr_blkg) {} +static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {} #endif void blkiocg_update_request_add_stats(struct blkio_group *blkg, @@ -130,6 +205,8 @@ void blkiocg_update_request_add_stats(struct blkio_group *blkg, spin_lock_irqsave(&blkg->stats_lock, flags); blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction, sync); + blkio_end_empty_time(&blkg->stats); + blkio_set_start_group_wait_time(blkg, curr_blkg); spin_unlock_irqrestore(&blkg->stats_lock, flags); } EXPORT_SYMBOL_GPL(blkiocg_update_request_add_stats); @@ -156,6 +233,33 @@ void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) } EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); +void blkiocg_set_start_empty_time(struct blkio_group *blkg, bool ignore) +{ + unsigned long flags; + struct blkio_group_stats *stats; + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + + if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] || + stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) { + spin_unlock_irqrestore(&blkg->stats_lock, flags); + return; + } + + /* + * If ignore is set, we do not panic on the empty flag being set + * already. This is to avoid cases where there are superfluous timeslice + * complete events (for eg., forced_dispatch in CFQ) when no IOs are + * served which could result in triggering the empty check incorrectly. 
+ */ + BUG_ON(!ignore && blkio_blkg_empty(stats)); + stats->start_empty_time = sched_clock(); + blkio_mark_blkg_empty(stats); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time); + void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, bool direction, bool sync) { @@ -317,19 +421,44 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) { struct blkio_cgroup *blkcg; struct blkio_group *blkg; + struct blkio_group_stats *stats; struct hlist_node *n; uint64_t queued[BLKIO_STAT_TOTAL]; int i; +#ifdef CONFIG_DEBUG_BLK_CGROUP + bool idling, waiting, empty; + unsigned long long now = sched_clock(); +#endif blkcg = cgroup_to_blkio_cgroup(cgroup); spin_lock_irq(&blkcg->lock); hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { spin_lock(&blkg->stats_lock); + stats = &blkg->stats; +#ifdef CONFIG_DEBUG_BLK_CGROUP + idling = blkio_blkg_idling(stats); + waiting = blkio_blkg_waiting(stats); + empty = blkio_blkg_empty(stats); +#endif for (i = 0; i < BLKIO_STAT_TOTAL; i++) - queued[i] = blkg->stats.stat_arr[BLKIO_STAT_QUEUED][i]; - memset(&blkg->stats, 0, sizeof(struct blkio_group_stats)); + queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i]; + memset(stats, 0, sizeof(struct blkio_group_stats)); for (i = 0; i < BLKIO_STAT_TOTAL; i++) - blkg->stats.stat_arr[BLKIO_STAT_QUEUED][i] = queued[i]; + stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i]; +#ifdef CONFIG_DEBUG_BLK_CGROUP + if (idling) { + blkio_mark_blkg_idling(stats); + stats->start_idle_time = now; + } + if (waiting) { + blkio_mark_blkg_waiting(stats); + stats->start_group_wait_time = now; + } + if (empty) { + blkio_mark_blkg_empty(stats); + stats->start_empty_time = now; + } +#endif spin_unlock(&blkg->stats_lock); } spin_unlock_irq(&blkcg->lock); @@ -401,6 +530,15 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg, sum = 0; return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev); } + if (type == BLKIO_STAT_GROUP_WAIT_TIME) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.group_wait_time, cb, dev); + if (type == BLKIO_STAT_IDLE_TIME) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.idle_time, cb, dev); + if (type == BLKIO_STAT_EMPTY_TIME) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.empty_time, cb, dev); if (type == BLKIO_STAT_DEQUEUE) return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, blkg->stats.dequeue, cb, dev); @@ -458,6 +596,9 @@ SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1); #ifdef CONFIG_DEBUG_BLK_CGROUP SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0); SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0); +SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0); +SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0); +SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0); #endif #undef SHOW_FUNCTION_PER_GROUP @@ -517,6 +658,18 @@ struct cftype blkio_files[] = { .name = "avg_queue_size", .read_map = blkiocg_avg_queue_size_read, }, + { + .name = "group_wait_time", + .read_map = blkiocg_group_wait_time_read, + }, + { + .name = "idle_time", + .read_map = blkiocg_idle_time_read, + }, + { + .name = "empty_time", + .read_map = blkiocg_empty_time_read, + }, { .name = "dequeue", .read_map = blkiocg_dequeue_read, diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index bea7f3b9a88e..bfce085b1962 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -43,6 +43,9 @@ enum stat_type { BLKIO_STAT_SECTORS, #ifdef 
CONFIG_DEBUG_BLK_CGROUP BLKIO_STAT_AVG_QUEUE_SIZE, + BLKIO_STAT_IDLE_TIME, + BLKIO_STAT_EMPTY_TIME, + BLKIO_STAT_GROUP_WAIT_TIME, BLKIO_STAT_DEQUEUE #endif }; @@ -55,6 +58,13 @@ enum stat_sub_type { BLKIO_STAT_TOTAL }; +/* blkg state flags */ +enum blkg_state_flags { + BLKG_waiting = 0, + BLKG_idling, + BLKG_empty, +}; + struct blkio_cgroup { struct cgroup_subsys_state css; unsigned int weight; @@ -74,6 +84,21 @@ struct blkio_group_stats { uint64_t avg_queue_size_samples; /* How many times this group has been removed from service tree */ unsigned long dequeue; + + /* Total time spent waiting for it to be assigned a timeslice. */ + uint64_t group_wait_time; + uint64_t start_group_wait_time; + + /* Time spent idling for this blkio_group */ + uint64_t idle_time; + uint64_t start_idle_time; + /* + * Total time when we have requests queued and do not contain the + * current active queue. + */ + uint64_t empty_time; + uint64_t start_empty_time; + uint16_t flags; #endif }; @@ -137,12 +162,41 @@ static inline char *blkg_path(struct blkio_group *blkg) void blkiocg_update_set_active_queue_stats(struct blkio_group *blkg); void blkiocg_update_dequeue_stats(struct blkio_group *blkg, unsigned long dequeue); +void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg); +void blkiocg_update_idle_time_stats(struct blkio_group *blkg); +void blkiocg_set_start_empty_time(struct blkio_group *blkg, bool ignore); + +#define BLKG_FLAG_FNS(name) \ +static inline void blkio_mark_blkg_##name( \ + struct blkio_group_stats *stats) \ +{ \ + stats->flags |= (1 << BLKG_##name); \ +} \ +static inline void blkio_clear_blkg_##name( \ + struct blkio_group_stats *stats) \ +{ \ + stats->flags &= ~(1 << BLKG_##name); \ +} \ +static inline int blkio_blkg_##name(struct blkio_group_stats *stats) \ +{ \ + return (stats->flags & (1 << BLKG_##name)) != 0; \ +} \ + +BLKG_FLAG_FNS(waiting) +BLKG_FLAG_FNS(idling) +BLKG_FLAG_FNS(empty) +#undef BLKG_FLAG_FNS #else static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } static inline void blkiocg_update_set_active_queue_stats( struct blkio_group *blkg) {} static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg, unsigned long dequeue) {} +static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) +{} +static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {} +static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg, + bool ignore) {} #endif #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 8e0b86a9111a..b6e095c7ef5e 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -886,7 +886,7 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) } static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, - struct cfq_queue *cfqq) + struct cfq_queue *cfqq, bool forced) { struct cfq_rb_root *st = &cfqd->grp_service_tree; unsigned int used_sl, charge_sl; @@ -916,6 +916,7 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, st->min_vdisktime); blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); + blkiocg_set_start_empty_time(&cfqg->blkg, forced); } #ifdef CONFIG_CFQ_GROUP_IOSCHED @@ -1528,6 +1529,12 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, return cfqq == RQ_CFQQ(rq); } +static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + 
del_timer(&cfqd->idle_slice_timer); + blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg); +} + static void __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { @@ -1547,7 +1554,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, cfq_clear_cfqq_fifo_expire(cfqq); cfq_mark_cfqq_slice_new(cfqq); - del_timer(&cfqd->idle_slice_timer); + cfq_del_timer(cfqd, cfqq); } cfqd->active_queue = cfqq; @@ -1558,12 +1565,12 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, */ static void __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, - bool timed_out) + bool timed_out, bool forced) { cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); if (cfq_cfqq_wait_request(cfqq)) - del_timer(&cfqd->idle_slice_timer); + cfq_del_timer(cfqd, cfqq); cfq_clear_cfqq_wait_request(cfqq); cfq_clear_cfqq_wait_busy(cfqq); @@ -1585,7 +1592,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); } - cfq_group_served(cfqd, cfqq->cfqg, cfqq); + cfq_group_served(cfqd, cfqq->cfqg, cfqq, forced); if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) cfq_del_cfqq_rr(cfqd, cfqq); @@ -1604,12 +1611,13 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, } } -static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out) +static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out, + bool forced) { struct cfq_queue *cfqq = cfqd->active_queue; if (cfqq) - __cfq_slice_expired(cfqd, cfqq, timed_out); + __cfq_slice_expired(cfqd, cfqq, timed_out, forced); } /* @@ -1865,6 +1873,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) sl = cfqd->cfq_slice_idle; mod_timer(&cfqd->idle_slice_timer, jiffies + sl); + blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg); cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); } @@ -2176,7 +2185,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) } expire: - cfq_slice_expired(cfqd, 0); + cfq_slice_expired(cfqd, 0, false); new_queue: /* * Current queue expired. Check if we have to switch to a new @@ -2202,7 +2211,7 @@ static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) BUG_ON(!list_empty(&cfqq->fifo)); /* By default cfqq is not expired if it is empty. Do it explicitly */ - __cfq_slice_expired(cfqq->cfqd, cfqq, 0); + __cfq_slice_expired(cfqq->cfqd, cfqq, 0, true); return dispatched; } @@ -2218,7 +2227,7 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) dispatched += __cfq_forced_dispatch_cfqq(cfqq); - cfq_slice_expired(cfqd, 0); + cfq_slice_expired(cfqd, 0, true); BUG_ON(cfqd->busy_queues); cfq_log(cfqd, "forced_dispatch=%d", dispatched); @@ -2382,10 +2391,15 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) || cfq_class_idle(cfqq))) { cfqq->slice_end = jiffies + 1; - cfq_slice_expired(cfqd, 0); + cfq_slice_expired(cfqd, 0, false); } cfq_log_cfqq(cfqd, cfqq, "dispatched a request"); + /* + * This is needed since we don't exactly match the mod_timer() and + * del_timer() calls in CFQ. 
+ */ + blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg); return 1; } @@ -2413,7 +2427,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq) orig_cfqg = cfqq->orig_cfqg; if (unlikely(cfqd->active_queue == cfqq)) { - __cfq_slice_expired(cfqd, cfqq, 0); + __cfq_slice_expired(cfqd, cfqq, 0, false); cfq_schedule_dispatch(cfqd); } @@ -2514,7 +2528,7 @@ static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) struct cfq_queue *__cfqq, *next; if (unlikely(cfqq == cfqd->active_queue)) { - __cfq_slice_expired(cfqd, cfqq, 0); + __cfq_slice_expired(cfqd, cfqq, 0, false); cfq_schedule_dispatch(cfqd); } @@ -3143,7 +3157,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { cfq_log_cfqq(cfqd, cfqq, "preempt"); - cfq_slice_expired(cfqd, 1); + cfq_slice_expired(cfqd, 1, false); /* * Put the new queue at the front of the of the current list, @@ -3191,7 +3205,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (cfq_cfqq_wait_request(cfqq)) { if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || cfqd->busy_queues > 1) { - del_timer(&cfqd->idle_slice_timer); + cfq_del_timer(cfqd, cfqq); cfq_clear_cfqq_wait_request(cfqq); __blk_run_queue(cfqd->queue); } else @@ -3352,7 +3366,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) * - when there is a close cooperator */ if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) - cfq_slice_expired(cfqd, 1); + cfq_slice_expired(cfqd, 1, false); else if (sync && cfqq_empty && !cfq_close_cooperator(cfqd, cfqq)) { cfqd->noidle_tree_requires_idle |= !rq_noidle(rq); @@ -3612,7 +3626,7 @@ static void cfq_idle_slice_timer(unsigned long data) cfq_clear_cfqq_deep(cfqq); } expire: - cfq_slice_expired(cfqd, timed_out); + cfq_slice_expired(cfqd, timed_out, false); out_kick: cfq_schedule_dispatch(cfqd); out_cont: @@ -3655,7 +3669,7 @@ static void cfq_exit_queue(struct elevator_queue *e) spin_lock_irq(q->queue_lock); if (cfqd->active_queue) - __cfq_slice_expired(cfqd, cfqd->active_queue, 0); + __cfq_slice_expired(cfqd, cfqd->active_queue, 0, false); while (!list_empty(&cfqd->cic_list)) { struct cfq_io_context *cic = list_entry(cfqd->cic_list.next, -- cgit v1.2.3-59-g8ed1b From 34d0f179d6dd711d3fc13c0820a456c59aae8048 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Tue, 13 Apr 2010 16:05:49 +0800 Subject: io-controller: Add a new interface "weight_device" for IO-Controller Currently, IO Controller makes use of blkio.weight to assign weight for all devices. Here a new user interface "blkio.weight_device" is introduced to assign different weights for different devices. blkio.weight becomes the default value for devices which are not configured by "blkio.weight_device". You can use the following format to assign a specific weight for a given device: #echo "major:minor weight" > blkio.weight_device major:minor represents device number.
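As an illustration only (a simplified sketch, not part of the patch; the authoritative version is blkio_policy_parse_and_set() in the diff below, which does more validation), the rule string can be parsed into a dev_t and a weight roughly like this:

	/* Hypothetical helper: parse "major:minor weight" into dev/weight. */
	static int sketch_parse_weight_rule(char *buf, dev_t *dev,
					    unsigned int *weight)
	{
		unsigned long major, minor, w;
		char *major_s, *minor_s;

		major_s = strsep(&buf, ":");	/* "major" */
		minor_s = strsep(&buf, " ");	/* "minor", weight left in buf */
		if (!major_s || !minor_s || !buf)
			return -EINVAL;
		if (strict_strtoul(major_s, 10, &major) ||
		    strict_strtoul(minor_s, 10, &minor) ||
		    strict_strtoul(buf, 10, &w))
			return -EINVAL;
		/* weight == 0 removes the rule; otherwise it must be in range */
		if (w && (w < BLKIO_WEIGHT_MIN || w > BLKIO_WEIGHT_MAX))
			return -EINVAL;
		*dev = MKDEV(major, minor);	/* pack major:minor into a dev_t */
		*weight = w;
		return 0;
	}
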
And you can remove the weight for a given device as follows: #echo "major:minor 0" > blkio.weight_device V1->V2 changes: - use user interface "weight_device" instead of "policy" suggested by Vivek - rename some struct suggested by Vivek - rebase to 2.6-block "for-linus" branch - remove a useless list_empty check pointed out by Li Zefan - some trivial typo fix V2->V3 changes: - Move policy_*_node() functions up to get rid of forward declarations - rename related functions by adding prefix "blkio_" Signed-off-by: Gui Jianfeng Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 236 ++++++++++++++++++++++++++++++++++++++++++++++++++++ block/blk-cgroup.h | 10 +++ block/cfq-iosched.c | 2 +- 3 files changed, 247 insertions(+), 1 deletion(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 1ecff7a39f2c..649b05d7f291 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -17,6 +17,7 @@ #include #include #include "blk-cgroup.h" +#include #define MAX_KEY_LEN 100 @@ -51,6 +52,32 @@ struct cgroup_subsys blkio_subsys = { }; EXPORT_SYMBOL_GPL(blkio_subsys); +static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg, + struct blkio_policy_node *pn) +{ + list_add(&pn->node, &blkcg->policy_list); +} + +/* Must be called with blkcg->lock held */ +static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) +{ + list_del(&pn->node); +} + +/* Must be called with blkcg->lock held */ +static struct blkio_policy_node * +blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + + list_for_each_entry(pn, &blkcg->policy_list, node) { + if (pn->dev == dev) + return pn; + } + + return NULL; +} + struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), @@ -398,6 +425,7 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) struct blkio_group *blkg; struct hlist_node *n; struct blkio_policy_type *blkiop; + struct blkio_policy_node *pn; if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) return -EINVAL; @@ -406,7 +434,13 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) spin_lock(&blkio_list_lock); spin_lock_irq(&blkcg->lock); blkcg->weight = (unsigned int)val; + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + pn = blkio_policy_search_node(blkcg, blkg->dev); + + if (pn) + continue; + list_for_each_entry(blkiop, &blkio_list, list) blkiop->ops.blkio_update_group_weight_fn(blkg, blkcg->weight); @@ -611,7 +645,202 @@ void blkiocg_update_dequeue_stats(struct blkio_group *blkg, EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats); #endif +static int blkio_check_dev_num(dev_t dev) +{ + int part = 0; + struct gendisk *disk; + + disk = get_gendisk(dev, &part); + if (!disk || part) + return -ENODEV; + + return 0; +} + +static int blkio_policy_parse_and_set(char *buf, + struct blkio_policy_node *newpn) +{ + char *s[4], *p, *major_s = NULL, *minor_s = NULL; + int ret; + unsigned long major, minor, temp; + int i = 0; + dev_t dev; + + memset(s, 0, sizeof(s)); + + while ((p = strsep(&buf, " ")) != NULL) { + if (!*p) + continue; + + s[i++] = p; + + /* Prevent inputting too many things */ + if (i == 3) + break; + } + + if (i != 2) + return -EINVAL; + + p = strsep(&s[0], ":"); + if (p != NULL) + major_s = p; + else + return -EINVAL; + + minor_s = s[0]; + if (!minor_s) + return -EINVAL; + + ret = strict_strtoul(major_s, 10, &major); + if (ret) + return -EINVAL; + + ret =
strict_strtoul(minor_s, 10, &minor); + if (ret) + return -EINVAL; + + dev = MKDEV(major, minor); + + ret = blkio_check_dev_num(dev); + if (ret) + return ret; + + newpn->dev = dev; + + if (s[1] == NULL) + return -EINVAL; + + ret = strict_strtoul(s[1], 10, &temp); + if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || + temp > BLKIO_WEIGHT_MAX) + return -EINVAL; + + newpn->weight = temp; + + return 0; +} + +unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, + dev_t dev) +{ + struct blkio_policy_node *pn; + + pn = blkio_policy_search_node(blkcg, dev); + if (pn) + return pn->weight; + else + return blkcg->weight; +} +EXPORT_SYMBOL_GPL(blkcg_get_weight); + + +static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, + const char *buffer) +{ + int ret = 0; + char *buf; + struct blkio_policy_node *newpn, *pn; + struct blkio_cgroup *blkcg; + struct blkio_group *blkg; + int keep_newpn = 0; + struct hlist_node *n; + struct blkio_policy_type *blkiop; + + buf = kstrdup(buffer, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + newpn = kzalloc(sizeof(*newpn), GFP_KERNEL); + if (!newpn) { + ret = -ENOMEM; + goto free_buf; + } + + ret = blkio_policy_parse_and_set(buf, newpn); + if (ret) + goto free_newpn; + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + spin_lock_irq(&blkcg->lock); + + pn = blkio_policy_search_node(blkcg, newpn->dev); + if (!pn) { + if (newpn->weight != 0) { + blkio_policy_insert_node(blkcg, newpn); + keep_newpn = 1; + } + spin_unlock_irq(&blkcg->lock); + goto update_io_group; + } + + if (newpn->weight == 0) { + /* weight == 0 means deleting a specific weight */ + blkio_policy_delete_node(pn); + spin_unlock_irq(&blkcg->lock); + goto update_io_group; + } + spin_unlock_irq(&blkcg->lock); + + pn->weight = newpn->weight; + +update_io_group: + /* update weight for each cfqg */ + spin_lock(&blkio_list_lock); + spin_lock_irq(&blkcg->lock); + + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + if (newpn->dev == blkg->dev) { + list_for_each_entry(blkiop, &blkio_list, list) + blkiop->ops.blkio_update_group_weight_fn(blkg, + newpn->weight ?
+ newpn->weight : + blkcg->weight); + } + } + + spin_unlock_irq(&blkcg->lock); + spin_unlock(&blkio_list_lock); + +free_newpn: + if (!keep_newpn) + kfree(newpn); +free_buf: + kfree(buf); + return ret; +} + +static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *m) +{ + struct blkio_cgroup *blkcg; + struct blkio_policy_node *pn; + + seq_printf(m, "dev\tweight\n"); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + if (list_empty(&blkcg->policy_list)) + goto out; + + spin_lock_irq(&blkcg->lock); + list_for_each_entry(pn, &blkcg->policy_list, node) { + seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->weight); + } + spin_unlock_irq(&blkcg->lock); + +out: + return 0; +} + struct cftype blkio_files[] = { + { + .name = "weight_device", + .read_seq_string = blkiocg_weight_device_read, + .write_string = blkiocg_weight_device_write, + .max_write_len = 256, + }, { .name = "weight", .read_u64 = blkiocg_weight_read, @@ -690,6 +919,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) struct blkio_group *blkg; void *key; struct blkio_policy_type *blkiop; + struct blkio_policy_node *pn, *pntmp; rcu_read_lock(); remove_entry: @@ -720,7 +950,12 @@ remove_entry: blkiop->ops.blkio_unlink_group_fn(key, blkg); spin_unlock(&blkio_list_lock); goto remove_entry; + done: + list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) { + blkio_policy_delete_node(pn); + kfree(pn); + } free_css_id(&blkio_subsys, &blkcg->css); rcu_read_unlock(); if (blkcg != &blkio_root_cgroup) @@ -751,6 +986,7 @@ done: spin_lock_init(&blkcg->lock); INIT_HLIST_HEAD(&blkcg->blkg_list); + INIT_LIST_HEAD(&blkcg->policy_list); return &blkcg->css; } diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index bfce085b1962..3c27bdfc97b9 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -70,6 +70,7 @@ struct blkio_cgroup { unsigned int weight; spinlock_t lock; struct hlist_head blkg_list; + struct list_head policy_list; /* list of blkio_policy_node */ }; struct blkio_group_stats { @@ -119,6 +120,15 @@ struct blkio_group { struct blkio_group_stats stats; }; +struct blkio_policy_node { + struct list_head node; + dev_t dev; + unsigned int weight; +}; + +extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, + dev_t dev); + typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, unsigned int weight); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b6e095c7ef5e..91af2f2e59ce 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -952,7 +952,6 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) if (!cfqg) goto done; - cfqg->weight = blkcg->weight; for_each_cfqg_st(cfqg, i, j, st) *st = CFQ_RB_ROOT; RB_CLEAR_NODE(&cfqg->rb_node); @@ -970,6 +969,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, MKDEV(major, minor)); + cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); /* Add group on cfqd list */ hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); -- cgit v1.2.3-59-g8ed1b From da69da184c06f365b335a0e013dc6360a82abe85 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Tue, 13 Apr 2010 16:07:50 +0800 Subject: io-controller: Document for blkio.weight_device Here is the document for blkio.weight_device Signed-off-by: Gui Jianfeng Acked-by: Vivek Goyal Signed-off-by: 
Jens Axboe --- Documentation/cgroups/blkio-controller.txt | 31 +++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt index db054ea3e7fb..d422b410a995 100644 --- a/Documentation/cgroups/blkio-controller.txt +++ b/Documentation/cgroups/blkio-controller.txt @@ -76,9 +76,38 @@ CONFIG_DEBUG_BLK_CGROUP Details of cgroup files ======================= - blkio.weight - - Specifies per cgroup weight. + - Specifies per cgroup weight. This is the default weight of the group + on all the devices until and unless overridden by a per device rule. + (See blkio.weight_device). Currently allowed range of weights is from 100 to 1000. +- blkio.weight_device + - One can specify per cgroup per device rules using this interface. + These rules override the default value of group weight as specified + by blkio.weight. + + The following is the format. + + #echo dev_maj:dev_minor weight > /path/to/cgroup/blkio.weight_device + Configure weight=300 on /dev/sdb (8:16) in this cgroup + # echo 8:16 300 > blkio.weight_device + # cat blkio.weight_device + dev weight + 8:16 300 + + Configure weight=500 on /dev/sda (8:0) in this cgroup + # echo 8:0 500 > blkio.weight_device + # cat blkio.weight_device + dev weight + 8:0 500 + 8:16 300 + + Remove specific weight for /dev/sda in this cgroup + # echo 8:0 0 > blkio.weight_device + # cat blkio.weight_device + dev weight + 8:16 300 + - blkio.time - disk time allocated to cgroup per device in milliseconds. First two fields specify the major and minor number of the device and -- cgit v1.2.3-59-g8ed1b From a11cdaa7af56423a921a8bdad8f5a5f4ddca918a Mon Sep 17 00:00:00 2001 From: Divyesh Shah Date: Tue, 13 Apr 2010 19:59:17 +0200 Subject: block: Update to io-controller stats Changelog from v1: o Call blkiocg_update_idle_time_stats() at cfq_rq_enqueued() instead of at dispatch time.
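For reference, a condensed excerpt (assuming the surrounding CFQ code; the full cfq-iosched.c diff below is the authoritative version) of where the idle-time hook ends up in cfq_rq_enqueued() after this change:

	if (cfq_cfqq_wait_request(cfqq)) {
		if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
		    cfqd->busy_queues > 1) {
			cfq_del_timer(cfqd, cfqq);	/* also updates idle stats */
			cfq_clear_cfqq_wait_request(cfqq);
			__blk_run_queue(cfqd->queue);
		} else {
			/* idle stat now updated at enqueue, not at dispatch */
			blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
			cfq_mark_cfqq_must_dispatch(cfqq);
		}
	}
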
Changelog from original patchset: (in response to Vivek Goyal's comments) o group blkiocg_update_blkio_group_dequeue_stats() with other DEBUG functions o rename blkiocg_update_set_active_queue_stats() to blkiocg_update_avg_queue_size_stats() o s/request/io/ in blkiocg_update_request_add_stats() and blkiocg_update_request_remove_stats() o Call cfq_del_timer() at request dispatch() instead of blkiocg_update_idle_time_stats() Signed-off-by: Divyesh Shah Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 28 +++++++++++++--------------- block/blk-cgroup.h | 12 ++++++------ block/cfq-iosched.c | 20 +++++++++----------- 3 files changed, 28 insertions(+), 32 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 649b05d7f291..25cc7514d817 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -202,7 +202,7 @@ void blkiocg_update_idle_time_stats(struct blkio_group *blkg) } EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats); -void blkiocg_update_set_active_queue_stats(struct blkio_group *blkg) +void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) { unsigned long flags; struct blkio_group_stats *stats; @@ -216,14 +216,21 @@ void blkiocg_update_set_active_queue_stats(struct blkio_group *blkg) blkio_update_group_wait_time(stats); spin_unlock_irqrestore(&blkg->stats_lock, flags); } -EXPORT_SYMBOL_GPL(blkiocg_update_set_active_queue_stats); +EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats); + +void blkiocg_update_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue) +{ + blkg->stats.dequeue += dequeue; +} +EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats); #else static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg, struct blkio_group *curr_blkg) {} static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {} #endif -void blkiocg_update_request_add_stats(struct blkio_group *blkg, +void blkiocg_update_io_add_stats(struct blkio_group *blkg, struct blkio_group *curr_blkg, bool direction, bool sync) { @@ -236,9 +243,9 @@ void blkiocg_update_request_add_stats(struct blkio_group *blkg, blkio_set_start_group_wait_time(blkg, curr_blkg); spin_unlock_irqrestore(&blkg->stats_lock, flags); } -EXPORT_SYMBOL_GPL(blkiocg_update_request_add_stats); +EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats); -void blkiocg_update_request_remove_stats(struct blkio_group *blkg, +void blkiocg_update_io_remove_stats(struct blkio_group *blkg, bool direction, bool sync) { unsigned long flags; @@ -248,7 +255,7 @@ void blkiocg_update_request_remove_stats(struct blkio_group *blkg, direction, sync); spin_unlock_irqrestore(&blkg->stats_lock, flags); } -EXPORT_SYMBOL_GPL(blkiocg_update_request_remove_stats); +EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) { @@ -636,15 +643,6 @@ SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0); #endif #undef SHOW_FUNCTION_PER_GROUP -#ifdef CONFIG_DEBUG_BLK_CGROUP -void blkiocg_update_dequeue_stats(struct blkio_group *blkg, - unsigned long dequeue) -{ - blkg->stats.dequeue += dequeue; -} -EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats); -#endif - static int blkio_check_dev_num(dev_t dev) { int part = 0; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 3c27bdfc97b9..1d409ad9c6e8 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -169,7 +169,7 @@ static inline char *blkg_path(struct blkio_group *blkg) { return blkg->path; } -void blkiocg_update_set_active_queue_stats(struct blkio_group *blkg); 
+void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg); void blkiocg_update_dequeue_stats(struct blkio_group *blkg, unsigned long dequeue); void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg); @@ -198,7 +198,7 @@ BLKG_FLAG_FNS(empty) #undef BLKG_FLAG_FNS #else static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } -static inline void blkiocg_update_set_active_queue_stats( +static inline void blkiocg_update_avg_queue_size_stats( struct blkio_group *blkg) {} static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg, unsigned long dequeue) {} @@ -226,9 +226,9 @@ void blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync); void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, bool sync); -void blkiocg_update_request_add_stats(struct blkio_group *blkg, +void blkiocg_update_io_add_stats(struct blkio_group *blkg, struct blkio_group *curr_blkg, bool direction, bool sync); -void blkiocg_update_request_remove_stats(struct blkio_group *blkg, +void blkiocg_update_io_remove_stats(struct blkio_group *blkg, bool direction, bool sync); #else struct cgroup; @@ -253,9 +253,9 @@ static inline void blkiocg_update_completion_stats(struct blkio_group *blkg, bool sync) {} static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, bool sync) {} -static inline void blkiocg_update_request_add_stats(struct blkio_group *blkg, +static inline void blkiocg_update_io_add_stats(struct blkio_group *blkg, struct blkio_group *curr_blkg, bool direction, bool sync) {} -static inline void blkiocg_update_request_remove_stats(struct blkio_group *blkg, +static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg, bool direction, bool sync) {} #endif #endif /* _BLK_CGROUP_H */ diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 91af2f2e59ce..42be3b68d356 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1381,10 +1381,10 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) { elv_rb_del(&cfqq->sort_list, rq); cfqq->queued[rq_is_sync(rq)]--; - blkiocg_update_request_remove_stats(&cfqq->cfqg->blkg, rq_data_dir(rq), + blkiocg_update_io_remove_stats(&cfqq->cfqg->blkg, rq_data_dir(rq), rq_is_sync(rq)); cfq_add_rq_rb(rq); - blkiocg_update_request_add_stats( + blkiocg_update_io_add_stats( &cfqq->cfqg->blkg, &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq), rq_is_sync(rq)); } @@ -1442,7 +1442,7 @@ static void cfq_remove_request(struct request *rq) cfq_del_rq_rb(rq); cfqq->cfqd->rq_queued--; - blkiocg_update_request_remove_stats(&cfqq->cfqg->blkg, rq_data_dir(rq), + blkiocg_update_io_remove_stats(&cfqq->cfqg->blkg, rq_data_dir(rq), rq_is_sync(rq)); if (rq_is_meta(rq)) { WARN_ON(!cfqq->meta_pending); @@ -1541,7 +1541,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, if (cfqq) { cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", cfqd->serving_prio, cfqd->serving_type); - blkiocg_update_set_active_queue_stats(&cfqq->cfqg->blkg); + blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg); cfqq->slice_start = 0; cfqq->dispatch_start = jiffies; cfqq->allocated_slice = 0; @@ -2395,11 +2395,6 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) } cfq_log_cfqq(cfqd, cfqq, "dispatched a request"); - /* - * This is needed since we don't exactly match the mod_timer() and - * del_timer() calls in CFQ. 
- */ - blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg); return 1; } @@ -3208,8 +3203,11 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_del_timer(cfqd, cfqq); cfq_clear_cfqq_wait_request(cfqq); __blk_run_queue(cfqd->queue); - } else + } else { + blkiocg_update_idle_time_stats( + &cfqq->cfqg->blkg); cfq_mark_cfqq_must_dispatch(cfqq); + } } } else if (cfq_should_preempt(cfqd, cfqq, rq)) { /* @@ -3235,7 +3233,7 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) list_add_tail(&rq->queuelist, &cfqq->fifo); cfq_add_rq_rb(rq); - blkiocg_update_request_add_stats(&cfqq->cfqg->blkg, + blkiocg_update_io_add_stats(&cfqq->cfqg->blkg, &cfqd->serving_group->blkg, rq_data_dir(rq), rq_is_sync(rq)); cfq_rq_enqueued(cfqd, cfqq, rq); -- cgit v1.2.3-59-g8ed1b From 28baf44299e0480d66ebb3093de5d51deff04e9f Mon Sep 17 00:00:00 2001 From: Divyesh Shah Date: Wed, 14 Apr 2010 11:22:38 +0200 Subject: blkio: Fix compile errors Fixes compile errors in blk-cgroup code for empty_time stat and a merge fix in CFQ. The first error was when CONFIG_DEBUG_CFQ_IOSCHED is not set. Signed-off-by: Divyesh Shah Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 54 ++++++++++++++++++++++++++--------------------------- block/cfq-iosched.c | 2 +- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index aa97cd455cef..80c1261a7d38 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -219,6 +219,33 @@ void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) } EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats); +void blkiocg_set_start_empty_time(struct blkio_group *blkg, bool ignore) +{ + unsigned long flags; + struct blkio_group_stats *stats; + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + + if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] || + stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) { + spin_unlock_irqrestore(&blkg->stats_lock, flags); + return; + } + + /* + * If ignore is set, we do not panic on the empty flag being set + * already. This is to avoid cases where there are superfluous timeslice + * complete events (for eg., forced_dispatch in CFQ) when no IOs are + * served which could result in triggering the empty check incorrectly. + */ + BUG_ON(!ignore && blkio_blkg_empty(stats)); + stats->start_empty_time = sched_clock(); + blkio_mark_blkg_empty(stats); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time); + void blkiocg_update_dequeue_stats(struct blkio_group *blkg, unsigned long dequeue) { @@ -268,33 +295,6 @@ void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) } EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); -void blkiocg_set_start_empty_time(struct blkio_group *blkg, bool ignore) -{ - unsigned long flags; - struct blkio_group_stats *stats; - - spin_lock_irqsave(&blkg->stats_lock, flags); - stats = &blkg->stats; - - if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] || - stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) { - spin_unlock_irqrestore(&blkg->stats_lock, flags); - return; - } - - /* - * If ignore is set, we do not panic on the empty flag being set - * already. This is to avoid cases where there are superfluous timeslice - * complete events (for eg., forced_dispatch in CFQ) when no IOs are - * served which could result in triggering the empty check incorrectly. 
- */ - BUG_ON(!ignore && blkio_blkg_empty(stats)); - stats->start_empty_time = sched_clock(); - blkio_mark_blkg_empty(stats); - spin_unlock_irqrestore(&blkg->stats_lock, flags); -} -EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time); - void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, bool direction, bool sync) { diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 9e0df2bdcf21..01771098355d 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2231,7 +2231,7 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) int dispatched = 0; /* Expire the timeslice of the current active queue first */ - cfq_slice_expired(cfqd, 0); + cfq_slice_expired(cfqd, 0, true); while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) { __cfq_set_active_queue(cfqd, cfqq); dispatched += __cfq_forced_dispatch_cfqq(cfqq); -- cgit v1.2.3-59-g8ed1b From c0d97e9ca2cfa66bdfd1ed8ecb5dcd230924d675 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Wed, 14 Apr 2010 20:53:37 +0200 Subject: block: ensure jiffies wrap is handled correctly in blk_rq_timed_out_timer blk_rq_timed_out_timer() relied on blk_add_timer() never returning a timer value of zero, but commit 7838c15b8dd18e78a523513749e5b54bda07b0cb removed the code that bumped this value when it was zero. Therefore when jiffies is near wrap we could get unlucky & not set the timeout value correctly. This patch uses a flag to indicate that the timeout value was set and so handles jiffies wrap correctly, and it keeps all the logic in one function so should be easier to maintain in the future. Signed-off-by: Richard Kennedy Signed-off-by: Jens Axboe --- block/blk-timeout.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 1ba7e0aca878..4f0c06c7a338 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -109,6 +109,7 @@ void blk_rq_timed_out_timer(unsigned long data) struct request_queue *q = (struct request_queue *) data; unsigned long flags, next = 0; struct request *rq, *tmp; + int next_set = 0; spin_lock_irqsave(q->queue_lock, flags); @@ -122,16 +123,13 @@ void blk_rq_timed_out_timer(unsigned long data) if (blk_mark_rq_complete(rq)) continue; blk_rq_timed_out(rq); - } else if (!next || time_after(next, rq->deadline)) + } else if (!next_set || time_after(next, rq->deadline)) { next = rq->deadline; + next_set = 1; + } } - /* - * next can never be 0 here with the list non-empty, since we always - * bump ->deadline to 1 so we can detect if the timer was ever added - * or not. See comment in blk_add_timer() - */ - if (next) + if (next_set) mod_timer(&q->timeout, round_jiffies_up(next)); spin_unlock_irqrestore(q->queue_lock, flags); -- cgit v1.2.3-59-g8ed1b From b6ac23af2c66e114d3a87ef28d56f1ceec283007 Mon Sep 17 00:00:00 2001 From: Divyesh Shah Date: Thu, 15 Apr 2010 08:54:59 +0200 Subject: blkio: fix for modular blk-cgroup build After merging the block tree, 20100414's linux-next build (x86_64 allmodconfig) failed like this: ERROR: "get_gendisk" [block/blk-cgroup.ko] undefined! ERROR: "sched_clock" [block/blk-cgroup.ko] undefined! This happens because the two symbols aren't exported and hence not available when blk-cgroup code is built as a module. I've tried to stay consistent with the use of EXPORT_SYMBOL or EXPORT_SYMBOL_GPL with the other symbols in the respective files. 
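As a hypothetical minimal example (not part of the patch), any module calling these two functions hits the same undefined-symbol failure at module link/load time until the exports below are in place:

	#include <linux/module.h>
	#include <linux/genhd.h>
	#include <linux/sched.h>

	static int __init export_demo_init(void)
	{
		int part;
		/* both calls resolve only once the symbols are exported */
		struct gendisk *disk = get_gendisk(MKDEV(8, 0), &part);
		unsigned long long now = sched_clock();

		pr_info("export_demo: disk=%p now=%llu\n", disk, now);
		return 0;
	}

	static void __exit export_demo_exit(void)
	{
	}

	module_init(export_demo_init);
	module_exit(export_demo_exit);
	MODULE_LICENSE("GPL");
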
Signed-off-by: Divyesh Shah Acked-by: Gui Jianfeng Signed-off-by: Jens Axboe --- block/genhd.c | 1 + kernel/sched_clock.c | 1 + 2 files changed, 2 insertions(+) diff --git a/block/genhd.c b/block/genhd.c index d13ba76a169c..154b5f80b3ab 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -596,6 +596,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) return disk; } +EXPORT_SYMBOL(get_gendisk); /** * bdget_disk - do bdget() by gendisk and partition number diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 5b496132c28a..906a0f718cb3 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -41,6 +41,7 @@ unsigned long long __attribute__((weak)) sched_clock(void) return (unsigned long long)(jiffies - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ); } +EXPORT_SYMBOL_GPL(sched_clock); static __read_mostly int sched_clock_running; -- cgit v1.2.3-59-g8ed1b From 8d2a91f8960b230b8bbcc4d97ed2015f5271c87d Mon Sep 17 00:00:00 2001 From: Divyesh Shah Date: Fri, 16 Apr 2010 08:10:51 +0200 Subject: blkio: Initialize blkg->stats_lock for the root cfqg too This fixes the lockdep warning reported by Gui Jianfeng. Signed-off-by: Divyesh Shah Reviewed-by: Gui Jianfeng Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 7 +------ block/blk-cgroup.h | 2 -- block/cfq-iosched.c | 1 - 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 80c1261a7d38..83930f65016a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -86,12 +86,6 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) } EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); -void blkio_group_init(struct blkio_group *blkg) -{ - spin_lock_init(&blkg->stats_lock); -} -EXPORT_SYMBOL_GPL(blkio_group_init); - /* * Add to the appropriate stat variable depending on the request type. * This should be called with the blkg->stats_lock held. 
@@ -349,6 +343,7 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, unsigned long flags; spin_lock_irqsave(&blkcg->lock, flags); + spin_lock_init(&blkg->stats_lock); rcu_assign_pointer(blkg->key, key); blkg->blkcg_id = css_id(&blkcg->css); hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 1d409ad9c6e8..2c956a06339a 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -217,7 +217,6 @@ extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, extern int blkiocg_del_blkio_group(struct blkio_group *blkg); extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key); -void blkio_group_init(struct blkio_group *blkg); void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time); void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, @@ -235,7 +234,6 @@ struct cgroup; static inline struct blkio_cgroup * cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } -static inline void blkio_group_init(struct blkio_group *blkg) {} static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev) {} diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 01771098355d..62defd05518f 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -961,7 +961,6 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) for_each_cfqg_st(cfqg, i, j, st) *st = CFQ_RB_ROOT; RB_CLEAR_NODE(&cfqg->rb_node); - blkio_group_init(&cfqg->blkg); /* * Take the initial reference that will be released on destroy -- cgit v1.2.3-59-g8ed1b From 7f1dc8a2d2f45fc557b27fd56115338b1d34fc24 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 21 Apr 2010 17:44:16 +0200 Subject: blkio: Fix blkio crash during rq stat update blkio + cfq was crashing even when two sequential readers were put in two separate cgroups (group_isolation=0). The reason is that a cfqq can migrate across groups depending on whether it is sync-noidle or not, so it can happen that at request insertion time the cfqq belonged to one cfqg while at request dispatch time it belonged to the root group. In this case per-cgroup request stats can go wrong, and it also runs into a BUG_ON(). This patch stashes a cfq group pointer in the request instead of relying on the cfqq->cfqg pointer alone for rq stat accounting. [ 65.163523] ------------[ cut here ]------------ [ 65.164301] kernel BUG at block/blk-cgroup.c:117!
[ 65.164301] invalid opcode: 0000 [#1] SMP [ 65.164301] last sysfs file: /sys/devices/pci0000:00/0000:00:05.0/0000:60:00.1/host9/rport-9:0-0/target9:0:0/9:0:0:2/block/sde/stat [ 65.164301] CPU 1 [ 65.164301] Modules linked in: dm_round_robin dm_multipath qla2xxx scsi_transport_fc dm_zero dm_mirror dm_region_hash dm_log dm_mod [last unloaded: scsi_wait_scan] [ 65.164301] [ 65.164301] Pid: 4505, comm: fio Not tainted 2.6.34-rc4-blk-for-35 #34 0A98h/HP xw8600 Workstation [ 65.164301] RIP: 0010:[] [] blkiocg_update_io_remove_stats+0x5b/0xaf [ 65.164301] RSP: 0018:ffff8800ba5a79e8 EFLAGS: 00010046 [ 65.164301] RAX: 0000000000000096 RBX: ffff8800bb268d60 RCX: 0000000000000000 [ 65.164301] RDX: ffff8800bb268eb8 RSI: 0000000000000000 RDI: ffff8800bb268e00 [ 65.164301] RBP: ffff8800ba5a7a08 R08: 0000000000000064 R09: 0000000000000001 [ 65.164301] R10: 0000000000079640 R11: ffff8800a0bd5bf0 R12: ffff8800bab4af01 [ 65.164301] R13: ffff8800bab4af00 R14: ffff8800bb1d8928 R15: 0000000000000000 [ 65.164301] FS: 00007f18f75056f0(0000) GS:ffff880001e40000(0000) knlGS:0000000000000000 [ 65.164301] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 65.164301] CR2: 000000000040e7f0 CR3: 00000000ba52b000 CR4: 00000000000006e0 [ 65.164301] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 65.164301] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 65.164301] Process fio (pid: 4505, threadinfo ffff8800ba5a6000, task ffff8800ba45ae80) [ 65.164301] Stack: [ 65.164301] ffff8800ba5a7a08 ffff8800ba722540 ffff8800bab4af68 ffff8800bab4af68 [ 65.164301] <0> ffff8800ba5a7a38 ffffffff8121d814 ffff8800ba722540 ffff8800bab4af68 [ 65.164301] <0> ffff8800ba722540 ffff8800a08f6800 ffff8800ba5a7a68 ffffffff8121d8ca [ 65.164301] Call Trace: [ 65.164301] [] cfq_remove_request+0xe4/0x116 [ 65.164301] [] cfq_dispatch_insert+0x84/0xe1 [ 65.164301] [] cfq_dispatch_requests+0x767/0x8e8 [ 65.164301] [] ? submit_bio+0xc3/0xcc [ 65.164301] [] ? sync_page_killable+0x0/0x35 [ 65.164301] [] blk_peek_request+0x191/0x1a7 [ 65.164301] [] ? dm_get_live_table+0x44/0x4f [dm_mod] [ 65.164301] [] dm_request_fn+0x38/0x14c [dm_mod] [ 65.164301] [] ? sync_page_killable+0x0/0x35 [ 65.164301] [] __generic_unplug_device+0x32/0x37 [ 65.164301] [] generic_unplug_device+0x2e/0x3c [ 65.164301] [] dm_unplug_all+0x42/0x5b [dm_mod] [ 65.164301] [] blk_unplug+0x29/0x2d [ 65.164301] [] blk_backing_dev_unplug+0x12/0x14 [ 65.164301] [] block_sync_page+0x35/0x39 [ 65.164301] [] sync_page+0x41/0x4a [ 65.164301] [] sync_page_killable+0xe/0x35 [ 65.164301] [] __wait_on_bit_lock+0x46/0x8f [ 65.164301] [] __lock_page_killable+0x66/0x6d [ 65.164301] [] ? wake_bit_function+0x0/0x33 [ 65.164301] [] lock_page_killable+0x2c/0x2e [ 65.164301] [] generic_file_aio_read+0x361/0x4f0 [ 65.164301] [] do_sync_read+0xcb/0x108 [ 65.164301] [] ? 
security_file_permission+0x16/0x18 [ 65.164301] [] vfs_read+0xab/0x108 [ 65.164301] [] sys_read+0x4a/0x6e [ 65.164301] [] system_call_fastpath+0x16/0x1b [ 65.164301] Code: 00 74 1c 48 8b 8b 60 01 00 00 48 85 c9 75 04 0f 0b eb fe 48 ff c9 48 89 8b 60 01 00 00 eb 1a 48 8b 8b 58 01 00 00 48 85 c9 75 04 <0f> 0b eb fe 48 ff c9 48 89 8b 58 01 00 00 45 84 e4 74 16 48 8b [ 65.164301] RIP [] blkiocg_update_io_remove_stats+0x5b/0xaf [ 65.164301] RSP [ 65.164301] ---[ end trace 1b2b828753032e68 ]--- Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 36 ++++++++++++++++++++++++++---------- include/linux/blkdev.h | 3 ++- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 62defd05518f..d5927b53020e 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -55,6 +55,7 @@ static const int cfq_hist_divisor = 4; #define RQ_CIC(rq) \ ((struct cfq_io_context *) (rq)->elevator_private) #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) +#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3) static struct kmem_cache *cfq_pool; static struct kmem_cache *cfq_ioc_pool; @@ -1001,6 +1002,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) return cfqg; } +static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) +{ + atomic_inc(&cfqg->ref); + return cfqg; +} + static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { /* Currently, all async queues are mapped to root group */ @@ -1084,6 +1091,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) { return &cfqd->root_group; } + +static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) +{ + return NULL; +} + static inline void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { cfqq->cfqg = cfqg; @@ -1386,12 +1399,12 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) { elv_rb_del(&cfqq->sort_list, rq); cfqq->queued[rq_is_sync(rq)]--; - blkiocg_update_io_remove_stats(&cfqq->cfqg->blkg, rq_data_dir(rq), + blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq), rq_is_sync(rq)); cfq_add_rq_rb(rq); - blkiocg_update_io_add_stats( - &cfqq->cfqg->blkg, &cfqq->cfqd->serving_group->blkg, - rq_data_dir(rq), rq_is_sync(rq)); + blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, + &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq), + rq_is_sync(rq)); } static struct request * @@ -1447,7 +1460,7 @@ static void cfq_remove_request(struct request *rq) cfq_del_rq_rb(rq); cfqq->cfqd->rq_queued--; - blkiocg_update_io_remove_stats(&cfqq->cfqg->blkg, rq_data_dir(rq), + blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq), rq_is_sync(rq)); if (rq_is_meta(rq)) { WARN_ON(!cfqq->meta_pending); @@ -1483,8 +1496,7 @@ static void cfq_merged_request(struct request_queue *q, struct request *req, static void cfq_bio_merged(struct request_queue *q, struct request *req, struct bio *bio) { - struct cfq_queue *cfqq = RQ_CFQQ(req); - blkiocg_update_io_merged_stats(&cfqq->cfqg->blkg, bio_data_dir(bio), + blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, bio_data_dir(bio), cfq_bio_sync(bio)); } @@ -1505,7 +1517,7 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, if (cfqq->next_rq == next) cfqq->next_rq = rq; cfq_remove_request(next); - blkiocg_update_io_merged_stats(&cfqq->cfqg->blkg, rq_data_dir(next), + blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(next), 
rq_is_sync(next)); } @@ -3240,8 +3252,7 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); list_add_tail(&rq->queuelist, &cfqq->fifo); cfq_add_rq_rb(rq); - - blkiocg_update_io_add_stats(&cfqq->cfqg->blkg, + blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, &cfqd->serving_group->blkg, rq_data_dir(rq), rq_is_sync(rq)); cfq_rq_enqueued(cfqd, cfqq, rq); @@ -3472,6 +3483,10 @@ static void cfq_put_request(struct request *rq) rq->elevator_private = NULL; rq->elevator_private2 = NULL; + /* Put down rq reference on cfqg */ + cfq_put_cfqg(RQ_CFQG(rq)); + rq->elevator_private3 = NULL; + cfq_put_queue(cfqq); } } @@ -3560,6 +3575,7 @@ new_queue: rq->elevator_private = cic; rq->elevator_private2 = cfqq; + rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg); return 0; queue_fail: diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d483c494672a..5cf17a49ce38 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -186,11 +186,12 @@ struct request { }; /* - * two pointers are available for the IO schedulers, if they need + * Three pointers are available for the IO schedulers, if they need * more they have to dynamically allocate it. */ void *elevator_private; void *elevator_private2; + void *elevator_private3; struct gendisk *rq_disk; unsigned long start_time; -- cgit v1.2.3-59-g8ed1b From e5ff082e8a68d9a6874990597497c7e6a96ad752 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 26 Apr 2010 19:25:11 +0200 Subject: blkio: Fix another BUG_ON() crash due to cfqq movement across groups o Once in a while, I was hitting a BUG_ON() in blkio code. empty_time assumed that upon slice expiry a group can't already be marked empty (except during forced dispatch). But this assumption is broken if a cfqq can move across groups (group_isolation=0) after receiving a request. Most likely, in this case we got a request in a cfqq and accounted the rq in one group; later, while adding the cfqq to the tree, we moved the queue to a different group that was already marked empty, and after dispatch from the slice we found the group already marked empty and raised the alarm. This patch does not error out if the group is already marked empty. This can introduce some empty_time stat error, but only in the group_isolation=0 case. This is better than crashing. With group_isolation=1 we should still get the same stats as before this patch. [ 222.308546] ------------[ cut here ]------------ [ 222.309311] kernel BUG at block/blk-cgroup.c:236!
[ 222.309311] invalid opcode: 0000 [#1] SMP [ 222.309311] last sysfs file: /sys/devices/virtual/block/dm-3/queue/scheduler [ 222.309311] CPU 1 [ 222.309311] Modules linked in: dm_round_robin dm_multipath qla2xxx scsi_transport_fc dm_zero dm_mirror dm_region_hash dm_log dm_mod [last unloaded: scsi_wait_scan] [ 222.309311] [ 222.309311] Pid: 4780, comm: fio Not tainted 2.6.34-rc4-blkio-config #68 0A98h/HP xw8600 Workstation [ 222.309311] RIP: 0010:[] [] blkiocg_set_start_empty_time+0x50/0x83 [ 222.309311] RSP: 0018:ffff8800ba6e79f8 EFLAGS: 00010002 [ 222.309311] RAX: 0000000000000082 RBX: ffff8800a13b7990 RCX: ffff8800a13b7808 [ 222.309311] RDX: 0000000000002121 RSI: 0000000000000082 RDI: ffff8800a13b7a30 [ 222.309311] RBP: ffff8800ba6e7a18 R08: 0000000000000000 R09: 0000000000000001 [ 222.309311] R10: 000000000002f8c8 R11: ffff8800ba6e7ad8 R12: ffff8800a13b78ff [ 222.309311] R13: ffff8800a13b7990 R14: 0000000000000001 R15: ffff8800a13b7808 [ 222.309311] FS: 00007f3beec476f0(0000) GS:ffff880001e40000(0000) knlGS:0000000000000000 [ 222.309311] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 222.309311] CR2: 000000000040e7f0 CR3: 00000000a12d5000 CR4: 00000000000006e0 [ 222.309311] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 222.309311] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 222.309311] Process fio (pid: 4780, threadinfo ffff8800ba6e6000, task ffff8800b3d6bf00) [ 222.309311] Stack: [ 222.309311] 0000000000000001 ffff8800bab17a48 ffff8800bab17a48 ffff8800a13b7800 [ 222.309311] <0> ffff8800ba6e7a68 ffffffff8121da35 ffff880000000001 00ff8800ba5c5698 [ 222.309311] <0> ffff8800ba6e7a68 ffff8800a13b7800 0000000000000000 ffff8800bab17a48 [ 222.309311] Call Trace: [ 222.309311] [] __cfq_slice_expired+0x2af/0x3ec [ 222.309311] [] cfq_dispatch_requests+0x2c8/0x8e8 [ 222.309311] [] ? spin_unlock_irqrestore+0xe/0x10 [ 222.309311] [] ? blk_insert_cloned_request+0x70/0x7b [ 222.309311] [] blk_peek_request+0x191/0x1a7 [ 222.309311] [] dm_request_fn+0x38/0x14c [dm_mod] [ 222.309311] [] ? sync_page_killable+0x0/0x35 [ 222.309311] [] __generic_unplug_device+0x32/0x37 [ 222.309311] [] generic_unplug_device+0x2e/0x3c [ 222.309311] [] dm_unplug_all+0x42/0x5b [dm_mod] [ 222.309311] [] blk_unplug+0x29/0x2d [ 222.309311] [] blk_backing_dev_unplug+0x12/0x14 [ 222.309311] [] block_sync_page+0x35/0x39 [ 222.309311] [] sync_page+0x41/0x4a [ 222.309311] [] sync_page_killable+0xe/0x35 [ 222.309311] [] __wait_on_bit_lock+0x46/0x8f [ 222.309311] [] __lock_page_killable+0x66/0x6d [ 222.309311] [] ? wake_bit_function+0x0/0x33 [ 222.309311] [] lock_page_killable+0x2c/0x2e [ 222.309311] [] generic_file_aio_read+0x361/0x4f0 [ 222.309311] [] do_sync_read+0xcb/0x108 [ 222.309311] [] ? 
security_file_permission+0x16/0x18 [ 222.309311] [] vfs_read+0xab/0x108 [ 222.309311] [] sys_read+0x4a/0x6e [ 222.309311] [] system_call_fastpath+0x16/0x1b [ 222.309311] Code: 58 01 00 00 00 48 89 c6 75 0a 48 83 bb 60 01 00 00 00 74 09 48 8d bb a0 00 00 00 eb 35 41 fe cc 74 0d f6 83 c0 01 00 00 04 74 04 <0f> 0b eb fe 48 89 75 e8 e8 be e0 de ff 66 83 8b c0 01 00 00 04 [ 222.309311] RIP [] blkiocg_set_start_empty_time+0x50/0x83 [ 222.309311] RSP [ 222.309311] ---[ end trace 32b4f71dffc15712 ]--- Signed-off-by: Vivek Goyal Acked-by: Divyesh Shah Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 15 +++++++++------ block/blk-cgroup.h | 5 ++--- block/cfq-iosched.c | 33 ++++++++++++++++----------------- 3 files changed, 27 insertions(+), 26 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 83930f65016a..af42efbb0c1d 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -213,7 +213,7 @@ void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) } EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats); -void blkiocg_set_start_empty_time(struct blkio_group *blkg, bool ignore) +void blkiocg_set_start_empty_time(struct blkio_group *blkg) { unsigned long flags; struct blkio_group_stats *stats; @@ -228,12 +228,15 @@ void blkiocg_set_start_empty_time(struct blkio_group *blkg, bool ignore) } /* - * If ignore is set, we do not panic on the empty flag being set - * already. This is to avoid cases where there are superfluous timeslice - * complete events (for eg., forced_dispatch in CFQ) when no IOs are - * served which could result in triggering the empty check incorrectly. + * group is already marked empty. This can happen if cfqq got new + * request in parent group and moved to this group while being added + * to service tree. Just ignore the event and move on. 
*/ - BUG_ON(!ignore && blkio_blkg_empty(stats)); + if(blkio_blkg_empty(stats)) { + spin_unlock_irqrestore(&blkg->stats_lock, flags); + return; + } + stats->start_empty_time = sched_clock(); blkio_mark_blkg_empty(stats); spin_unlock_irqrestore(&blkg->stats_lock, flags); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 2c956a06339a..a491a6d56ecf 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -174,7 +174,7 @@ void blkiocg_update_dequeue_stats(struct blkio_group *blkg, unsigned long dequeue); void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg); void blkiocg_update_idle_time_stats(struct blkio_group *blkg); -void blkiocg_set_start_empty_time(struct blkio_group *blkg, bool ignore); +void blkiocg_set_start_empty_time(struct blkio_group *blkg); #define BLKG_FLAG_FNS(name) \ static inline void blkio_mark_blkg_##name( \ @@ -205,8 +205,7 @@ static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg, static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) {} static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {} -static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg, - bool ignore) {} +static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {} #endif #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index d5927b53020e..002a5b621653 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -888,7 +888,7 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) } static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, - struct cfq_queue *cfqq, bool forced) + struct cfq_queue *cfqq) { struct cfq_rb_root *st = &cfqd->grp_service_tree; unsigned int used_sl, charge_sl; @@ -918,7 +918,7 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, st->min_vdisktime); blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); - blkiocg_set_start_empty_time(&cfqg->blkg, forced); + blkiocg_set_start_empty_time(&cfqg->blkg); } #ifdef CONFIG_CFQ_GROUP_IOSCHED @@ -1582,7 +1582,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, */ static void __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, - bool timed_out, bool forced) + bool timed_out) { cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); @@ -1609,7 +1609,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); } - cfq_group_served(cfqd, cfqq->cfqg, cfqq, forced); + cfq_group_served(cfqd, cfqq->cfqg, cfqq); if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) cfq_del_cfqq_rr(cfqd, cfqq); @@ -1628,13 +1628,12 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, } } -static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out, - bool forced) +static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out) { struct cfq_queue *cfqq = cfqd->active_queue; if (cfqq) - __cfq_slice_expired(cfqd, cfqq, timed_out, forced); + __cfq_slice_expired(cfqd, cfqq, timed_out); } /* @@ -2202,7 +2201,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) } expire: - cfq_slice_expired(cfqd, 0, false); + cfq_slice_expired(cfqd, 0); new_queue: /* * Current queue expired. 
Check if we have to switch to a new @@ -2228,7 +2227,7 @@ static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) BUG_ON(!list_empty(&cfqq->fifo)); /* By default cfqq is not expired if it is empty. Do it explicitly */ - __cfq_slice_expired(cfqq->cfqd, cfqq, 0, true); + __cfq_slice_expired(cfqq->cfqd, cfqq, 0); return dispatched; } @@ -2242,7 +2241,7 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) int dispatched = 0; /* Expire the timeslice of the current active queue first */ - cfq_slice_expired(cfqd, 0, true); + cfq_slice_expired(cfqd, 0); while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) { __cfq_set_active_queue(cfqd, cfqq); dispatched += __cfq_forced_dispatch_cfqq(cfqq); @@ -2411,7 +2410,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) || cfq_class_idle(cfqq))) { cfqq->slice_end = jiffies + 1; - cfq_slice_expired(cfqd, 0, false); + cfq_slice_expired(cfqd, 0); } cfq_log_cfqq(cfqd, cfqq, "dispatched a request"); @@ -2442,7 +2441,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq) orig_cfqg = cfqq->orig_cfqg; if (unlikely(cfqd->active_queue == cfqq)) { - __cfq_slice_expired(cfqd, cfqq, 0, false); + __cfq_slice_expired(cfqd, cfqq, 0); cfq_schedule_dispatch(cfqd); } @@ -2543,7 +2542,7 @@ static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) struct cfq_queue *__cfqq, *next; if (unlikely(cfqq == cfqd->active_queue)) { - __cfq_slice_expired(cfqd, cfqq, 0, false); + __cfq_slice_expired(cfqd, cfqq, 0); cfq_schedule_dispatch(cfqd); } @@ -3172,7 +3171,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { cfq_log_cfqq(cfqd, cfqq, "preempt"); - cfq_slice_expired(cfqd, 1, false); + cfq_slice_expired(cfqd, 1); /* * Put the new queue at the front of the of the current list, @@ -3383,7 +3382,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) * - when there is a close cooperator */ if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) - cfq_slice_expired(cfqd, 1, false); + cfq_slice_expired(cfqd, 1); else if (sync && cfqq_empty && !cfq_close_cooperator(cfqd, cfqq)) { cfqd->noidle_tree_requires_idle |= !rq_noidle(rq); @@ -3648,7 +3647,7 @@ static void cfq_idle_slice_timer(unsigned long data) cfq_clear_cfqq_deep(cfqq); } expire: - cfq_slice_expired(cfqd, timed_out, false); + cfq_slice_expired(cfqd, timed_out); out_kick: cfq_schedule_dispatch(cfqd); out_cont: @@ -3691,7 +3690,7 @@ static void cfq_exit_queue(struct elevator_queue *e) spin_lock_irq(q->queue_lock); if (cfqd->active_queue) - __cfq_slice_expired(cfqd, cfqd->active_queue, 0, false); + __cfq_slice_expired(cfqd, cfqd->active_queue, 0); while (!list_empty(&cfqd->cic_list)) { struct cfq_io_context *cic = list_entry(cfqd->cic_list.next, -- cgit v1.2.3-59-g8ed1b From afc24d49c1e5dbeef745c1c1246f5ae6ebd97c71 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 26 Apr 2010 19:27:56 +0200 Subject: blk-cgroup: config options re-arrangement This patch fixes a few usability and configurability issues. o All the cgroup-based controller options are configurable from the "General Setup/Control Group Support/" menu; blkio is the only exception. Hence, make this option visible in the above menu and configurable from there, to bring it in line with the rest of the cgroup-based controllers. o Get rid of CONFIG_DEBUG_CFQ_IOSCHED. This option currently does two things.
- Enable printing of cgroup paths in blktrace - Enables CONFIG_DEBUG_BLK_CGROUP, which in turn displays additional stat files in cgroup. If we are using group scheduling, blktrace data is not of much use if cgroup information is not present. To get this data, one currently has to also enable CONFIG_DEBUG_CFQ_IOSCHED, which in turn brings in the overhead of all the additional debug stat files, which is not desired. Hence, this patch moves printing of cgroup paths under CONFIG_CFQ_GROUP_IOSCHED. This allows us to get rid of CONFIG_DEBUG_CFQ_IOSCHED completely. Now all the debug stat files are controlled only by CONFIG_DEBUG_BLK_CGROUP, which can be enabled through the config menu. Signed-off-by: Vivek Goyal Acked-by: Divyesh Shah Reviewed-by: Gui Jianfeng Signed-off-by: Jens Axboe --- Documentation/cgroups/blkio-controller.txt | 35 +++++++++++++----------------- block/Kconfig | 23 -------------------- block/Kconfig.iosched | 16 +++++--------- block/blk-cgroup.c | 2 -- block/blk-cgroup.h | 14 ++++++------ block/cfq-iosched.c | 2 +- init/Kconfig | 27 +++++++++++++++++++++++ 7 files changed, 55 insertions(+), 64 deletions(-) diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt index d422b410a995..48e0b21b0059 100644 --- a/Documentation/cgroups/blkio-controller.txt +++ b/Documentation/cgroups/blkio-controller.txt @@ -17,6 +17,9 @@ HOWTO You can do a very simple testing of running two dd threads in two different cgroups. Here is what you can do. +- Enable Block IO controller + CONFIG_BLK_CGROUP=y + - Enable group scheduling in CFQ CONFIG_CFQ_GROUP_IOSCHED=y @@ -54,24 +57,16 @@ cgroups. Here is what you can do. Various user visible config options =================================== -CONFIG_CFQ_GROUP_IOSCHED - - Enables group scheduling in CFQ. Currently only 1 level of group - creation is allowed. - -CONFIG_DEBUG_CFQ_IOSCHED - - Enables some debugging messages in blktrace. Also creates extra - cgroup file blkio.dequeue. - -Config options selected automatically ===================================== -These config options are not user visible and are selected/deselected -automatically based on IO scheduler configuration. - CONFIG_BLK_CGROUP - - Block IO controller. Selected by CONFIG_CFQ_GROUP_IOSCHED. + - Block IO controller. CONFIG_DEBUG_BLK_CGROUP - - Debug help. Selected by CONFIG_DEBUG_CFQ_IOSCHED. + - Debug help. Right now some additional stats file show up in cgroup + if this option is enabled. + +CONFIG_CFQ_GROUP_IOSCHED + - Enables group scheduling in CFQ. Currently only 1 level of group + creation is allowed. Details of cgroup files ======================= @@ -174,13 +169,13 @@ Details of cgroup files write, sync or async. - blkio.avg_queue_size - - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. + - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. The average queue size for this cgroup over the entire time of this cgroup's existence. Queue size samples are taken each time one of the queues of this cgroup gets a timeslice. - blkio.group_wait_time - - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. + - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This is the amount of time the cgroup had to wait since it became busy (i.e., went from 0 to 1 request queued) to get a timeslice for one of its queues. This is different from the io_wait_time which is the @@ -191,7 +186,7 @@ Details of cgroup files got a timeslice and will not include the current delta.
- blkio.empty_time - - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. + - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This is the amount of time a cgroup spends without any pending requests when not being served, i.e., it does not include any time spent idling for one of the queues of the cgroup. This is in @@ -200,7 +195,7 @@ Details of cgroup files time it had a pending request and will not include the current delta. - blkio.idle_time - - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. + - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This is the amount of time spent by the IO scheduler idling for a given cgroup in anticipation of a better request than the exising ones from other queues/cgroups. This is in nanoseconds. If this is read @@ -209,7 +204,7 @@ Details of cgroup files the current delta. - blkio.dequeue - - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This + - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This gives the statistics about how many a times a group was dequeued from service tree of the device. First two fields specify the major and minor number of the device and third field specifies the number diff --git a/block/Kconfig b/block/Kconfig index f9e89f4d94bb..9be0b56eaee1 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -77,29 +77,6 @@ config BLK_DEV_INTEGRITY T10/SCSI Data Integrity Field or the T13/ATA External Path Protection. If in doubt, say N. -config BLK_CGROUP - tristate "Block cgroup support" - depends on CGROUPS - depends on CFQ_GROUP_IOSCHED - default n - ---help--- - Generic block IO controller cgroup interface. This is the common - cgroup interface which should be used by various IO controlling - policies. - - Currently, CFQ IO scheduler uses it to recognize task groups and - control disk bandwidth allocation (proportional time slice allocation) - to such task groups. - -config DEBUG_BLK_CGROUP - bool - depends on BLK_CGROUP - default n - ---help--- - Enable some debugging help. Currently it stores the cgroup path - in the blk group which can be used by cfq for tracing various - group related activity. - endif # BLOCK config BLOCK_COMPAT diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index fc71cf071fb2..3199b76f795d 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -23,7 +23,8 @@ config IOSCHED_DEADLINE config IOSCHED_CFQ tristate "CFQ I/O scheduler" - select BLK_CGROUP if CFQ_GROUP_IOSCHED + # If BLK_CGROUP is a module, CFQ has to be built as module. + depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y default y ---help--- The CFQ I/O scheduler tries to distribute bandwidth equally @@ -33,22 +34,15 @@ config IOSCHED_CFQ This is the default I/O scheduler. + Note: If BLK_CGROUP=m, then CFQ can be built only as module. + config CFQ_GROUP_IOSCHED bool "CFQ Group Scheduling support" - depends on IOSCHED_CFQ && CGROUPS + depends on IOSCHED_CFQ && BLK_CGROUP default n ---help--- Enable group IO scheduling in CFQ. -config DEBUG_CFQ_IOSCHED - bool "Debug CFQ Scheduling" - depends on CFQ_GROUP_IOSCHED - select DEBUG_BLK_CGROUP - default n - ---help--- - Enable CFQ IO scheduling debugging in CFQ. Currently it makes - blktrace output more verbose. 
- choice prompt "Default I/O scheduler" default DEFAULT_CFQ diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index af42efbb0c1d..d02bbf88de13 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -351,10 +351,8 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, blkg->blkcg_id = css_id(&blkcg->css); hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); spin_unlock_irqrestore(&blkcg->lock, flags); -#ifdef CONFIG_DEBUG_BLK_CGROUP /* Need to take css reference ? */ cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); -#endif blkg->dev = dev; } EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index a491a6d56ecf..2b866ec1dcea 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -108,10 +108,8 @@ struct blkio_group { void *key; struct hlist_node blkcg_node; unsigned short blkcg_id; -#ifdef CONFIG_DEBUG_BLK_CGROUP /* Store cgroup path */ char path[128]; -#endif /* The device MKDEV(major, minor), this group has been created for */ dev_t dev; @@ -147,6 +145,11 @@ struct blkio_policy_type { extern void blkio_policy_register(struct blkio_policy_type *); extern void blkio_policy_unregister(struct blkio_policy_type *); +static inline char *blkg_path(struct blkio_group *blkg) +{ + return blkg->path; +} + #else struct blkio_group { @@ -158,6 +161,8 @@ struct blkio_policy_type { static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { } static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } +static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } + #endif #define BLKIO_WEIGHT_MIN 100 @@ -165,10 +170,6 @@ static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } #define BLKIO_WEIGHT_DEFAULT 500 #ifdef CONFIG_DEBUG_BLK_CGROUP -static inline char *blkg_path(struct blkio_group *blkg) -{ - return blkg->path; -} void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg); void blkiocg_update_dequeue_stats(struct blkio_group *blkg, unsigned long dequeue); @@ -197,7 +198,6 @@ BLKG_FLAG_FNS(idling) BLKG_FLAG_FNS(empty) #undef BLKG_FLAG_FNS #else -static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } static inline void blkiocg_update_avg_queue_size_stats( struct blkio_group *blkg) {} static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg, diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 002a5b621653..286008cf889e 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -345,7 +345,7 @@ CFQ_CFQQ_FNS(deep); CFQ_CFQQ_FNS(wait_busy); #undef CFQ_CFQQ_FNS -#ifdef CONFIG_DEBUG_CFQ_IOSCHED +#ifdef CONFIG_CFQ_GROUP_IOSCHED #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ diff --git a/init/Kconfig b/init/Kconfig index eb77e8ccde1c..087c14f3c595 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -612,6 +612,33 @@ config RT_GROUP_SCHED endif #CGROUP_SCHED +config BLK_CGROUP + tristate "Block IO controller" + depends on CGROUPS && BLOCK + default n + ---help--- + Generic block IO controller cgroup interface. This is the common + cgroup interface which should be used by various IO controlling + policies. + + Currently, CFQ IO scheduler uses it to recognize task groups and + control disk bandwidth allocation (proportional time slice allocation) + to such task groups. + + This option only enables generic Block IO controller infrastructure. 
+ One needs to also enable actual IO controlling logic in CFQ for it + to take effect. (CONFIG_CFQ_GROUP_IOSCHED=y). + + See Documentation/cgroups/blkio-controller.txt for more information. + +config DEBUG_BLK_CGROUP + bool "Enable Block IO controller debugging" + depends on BLK_CGROUP + default n + ---help--- + Enable some debugging help. Currently it exports additional stat + files in a cgroup which can be useful for debugging. + endif # CGROUPS config MM_OWNER -- cgit v1.2.3-59-g8ed1b From 1a3cbbc5a5e8a66934aa0947896a4aca6fd77298 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 7 Apr 2010 18:52:29 +0900 Subject: block: factor out bd_may_claim() Factor out bd_may_claim() from bd_claim(), add comments and apply a couple of cosmetic edits. This is to prepare for further updates to claim path. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/block_dev.c | 65 ++++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 18 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 2a6d0193f139..e59440c7e1cf 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -661,41 +661,70 @@ void bd_forget(struct inode *inode) iput(bdev->bd_inode); } -int bd_claim(struct block_device *bdev, void *holder) +/** + * bd_may_claim - test whether a block device can be claimed + * @bdev: block device of interest + * @whole: whole block device containing @bdev, may equal @bdev + * @holder: holder trying to claim @bdev + * + * Test whther @bdev can be claimed by @holder. + * + * CONTEXT: + * spin_lock(&bdev_lock). + * + * RETURNS: + * %true if @bdev can be claimed, %false otherwise. + */ +static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, + void *holder) { - int res; - spin_lock(&bdev_lock); - - /* first decide result */ if (bdev->bd_holder == holder) - res = 0; /* already a holder */ + return true; /* already a holder */ else if (bdev->bd_holder != NULL) - res = -EBUSY; /* held by someone else */ + return false; /* held by someone else */ else if (bdev->bd_contains == bdev) - res = 0; /* is a whole device which isn't held */ + return true; /* is a whole device which isn't held */ - else if (bdev->bd_contains->bd_holder == bd_claim) - res = 0; /* is a partition of a device that is being partitioned */ - else if (bdev->bd_contains->bd_holder != NULL) - res = -EBUSY; /* is a partition of a held device */ + else if (whole->bd_holder == bd_claim) + return true; /* is a partition of a device that is being partitioned */ + else if (whole->bd_holder != NULL) + return false; /* is a partition of a held device */ else - res = 0; /* is a partition of an un-held device */ + return true; /* is a partition of an un-held device */ +} - /* now impose change */ - if (res==0) { +/** + * bd_claim - claim a block device + * @bdev: block device to claim + * @holder: holder trying to claim @bdev + * + * Try to claim @bdev. + * + * RETURNS: + * 0 if successful, -EBUSY if @bdev is already claimed. 
+ */ +int bd_claim(struct block_device *bdev, void *holder) +{ + struct block_device *whole = bdev->bd_contains; + int res = -EBUSY; + + spin_lock(&bdev_lock); + + if (bd_may_claim(bdev, whole, holder)) { /* note that for a whole device bd_holders * will be incremented twice, and bd_holder will * be set to bd_claim before being set to holder */ - bdev->bd_contains->bd_holders ++; - bdev->bd_contains->bd_holder = bd_claim; + whole->bd_holders++; + whole->bd_holder = bd_claim; bdev->bd_holders++; bdev->bd_holder = holder; + res = 0; } + spin_unlock(&bdev_lock); return res; } - EXPORT_SYMBOL(bd_claim); void bd_release(struct block_device *bdev) -- cgit v1.2.3-59-g8ed1b From 6b4517a7913a09d3259bb1d21c9cb300f12294bd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 7 Apr 2010 18:53:59 +0900 Subject: block: implement bd_claiming and claiming block Currently, device claiming for exclusive open is done after low level open - disk->fops->open() - has completed successfully. This means that exclusive open attempts while a device is already exclusively open will fail only after disk->fops->open() is called. The cdrom driver issues commands during open(), which means that an O_EXCL open attempt can unintentionally inject commands into an in-progress command stream for burning, thus disturbing the burning process. In most cases this doesn't cause problems, because the first command to be issued is TUR, which most devices can process in the middle of burning. However, depending on how a device replies to TUR during burning, the cdrom driver may end up issuing further commands. This can't be resolved trivially by moving bd_claim() before the actual open(), because then an open attempt which will end up failing could interfere with other legitimate O_EXCL open attempts, i.e. unconfirmed open attempts can fail others. This patch resolves the problem by introducing a claiming block, which is started by bd_start_claiming() and terminated either by bd_claim() or bd_abort_claiming(). bd_claim() from inside a claiming block is guaranteed to succeed, and once a claiming block is started, other bd_start_claiming() or bd_claim() attempts block until the current claiming block is terminated. bd_claim() can still be used standalone, although it now always synchronizes against claiming blocks, so the existing users will keep working without any change. blkdev_open() and open_bdev_exclusive() are converted to use claiming blocks so that exclusive open attempts from these functions don't interfere with an existing exclusive open. This problem was discovered while investigating bko#15403. https://bugzilla.kernel.org/show_bug.cgi?id=15403 The burning problem itself can be resolved by updating userspace probing tools to always open with O_EXCL. Signed-off-by: Tejun Heo Reported-by: Matthias-Christian Ott Cc: Kay Sievers Signed-off-by: Jens Axboe --- fs/block_dev.c | 198 ++++++++++++++++++++++++++++++++++++++++++++++------- include/linux/fs.h | 1 + 2 files changed, 175 insertions(+), 24 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index e59440c7e1cf..ea8385ea58ab 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -693,12 +693,145 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, return true; /* is a partition of an un-held device */ } +/** + * bd_prepare_to_claim - prepare to claim a block device + * @bdev: block device of interest + * @whole: the whole device containing @bdev, may equal @bdev + * @holder: holder trying to claim @bdev + * + * Prepare to claim @bdev.
This function fails if @bdev is already + * claimed by another holder and waits if another claiming is in + * progress. This function doesn't actually claim. On successful + * return, the caller has ownership of bd_claiming and bd_holder[s]. + * + * CONTEXT: + * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab + * it multiple times. + * + * RETURNS: + * 0 if @bdev can be claimed, -EBUSY otherwise. + */ +static int bd_prepare_to_claim(struct block_device *bdev, + struct block_device *whole, void *holder) +{ +retry: + /* if someone else claimed, fail */ + if (!bd_may_claim(bdev, whole, holder)) + return -EBUSY; + + /* if someone else is claiming, wait for it to finish */ + if (whole->bd_claiming && whole->bd_claiming != holder) { + wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0); + DEFINE_WAIT(wait); + + prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&bdev_lock); + schedule(); + finish_wait(wq, &wait); + spin_lock(&bdev_lock); + goto retry; + } + + /* yay, all mine */ + return 0; +} + +/** + * bd_start_claiming - start claiming a block device + * @bdev: block device of interest + * @holder: holder trying to claim @bdev + * + * @bdev is about to be opened exclusively. Check @bdev can be opened + * exclusively and mark that an exclusive open is in progress. Each + * successful call to this function must be matched with a call to + * either bd_claim() or bd_abort_claiming(). If this function + * succeeds, the matching bd_claim() is guaranteed to succeed. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Pointer to the block device containing @bdev on success, ERR_PTR() + * value on failure. + */ +static struct block_device *bd_start_claiming(struct block_device *bdev, + void *holder) +{ + struct gendisk *disk; + struct block_device *whole; + int partno, err; + + might_sleep(); + + /* + * @bdev might not have been initialized properly yet, look up + * and grab the outer block device the hard way. + */ + disk = get_gendisk(bdev->bd_dev, &partno); + if (!disk) + return ERR_PTR(-ENXIO); + + whole = bdget_disk(disk, 0); + put_disk(disk); + if (!whole) + return ERR_PTR(-ENOMEM); + + /* prepare to claim, if successful, mark claiming in progress */ + spin_lock(&bdev_lock); + + err = bd_prepare_to_claim(bdev, whole, holder); + if (err == 0) { + whole->bd_claiming = holder; + spin_unlock(&bdev_lock); + return whole; + } else { + spin_unlock(&bdev_lock); + bdput(whole); + return ERR_PTR(err); + } +} + +/* releases bdev_lock */ +static void __bd_abort_claiming(struct block_device *whole, void *holder) +{ + BUG_ON(whole->bd_claiming != holder); + whole->bd_claiming = NULL; + wake_up_bit(&whole->bd_claiming, 0); + + spin_unlock(&bdev_lock); + bdput(whole); +} + +/** + * bd_abort_claiming - abort claiming a block device + * @whole: whole block device returned by bd_start_claiming() + * @holder: holder trying to claim @bdev + * + * Abort a claiming block started by bd_start_claiming(). Note that + * @whole is not the block device to be claimed but the whole device + * returned by bd_start_claiming(). + * + * CONTEXT: + * Grabs and releases bdev_lock. + */ +static void bd_abort_claiming(struct block_device *whole, void *holder) +{ + spin_lock(&bdev_lock); + __bd_abort_claiming(whole, holder); /* releases bdev_lock */ +} + /** * bd_claim - claim a block device * @bdev: block device to claim * @holder: holder trying to claim @bdev * - * Try to claim @bdev. + * Try to claim @bdev which must have been opened successfully. 
This + * function may be called with or without preceding + * blk_start_claiming(). In the former case, this function is always + * successful and terminates the claiming block. + * + * CONTEXT: + * Might sleep. * * RETURNS: * 0 if successful, -EBUSY if @bdev is already claimed. @@ -706,11 +839,14 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, int bd_claim(struct block_device *bdev, void *holder) { struct block_device *whole = bdev->bd_contains; - int res = -EBUSY; + int res; + + might_sleep(); spin_lock(&bdev_lock); - if (bd_may_claim(bdev, whole, holder)) { + res = bd_prepare_to_claim(bdev, whole, holder); + if (res == 0) { /* note that for a whole device bd_holders * will be incremented twice, and bd_holder will * be set to bd_claim before being set to holder @@ -719,10 +855,13 @@ int bd_claim(struct block_device *bdev, void *holder) whole->bd_holder = bd_claim; bdev->bd_holders++; bdev->bd_holder = holder; - res = 0; } - spin_unlock(&bdev_lock); + if (whole->bd_claiming) + __bd_abort_claiming(whole, holder); /* releases bdev_lock */ + else + spin_unlock(&bdev_lock); + return res; } EXPORT_SYMBOL(bd_claim); @@ -1338,6 +1477,7 @@ EXPORT_SYMBOL(blkdev_get); static int blkdev_open(struct inode * inode, struct file * filp) { + struct block_device *whole = NULL; struct block_device *bdev; int res; @@ -1360,22 +1500,25 @@ static int blkdev_open(struct inode * inode, struct file * filp) if (bdev == NULL) return -ENOMEM; + if (filp->f_mode & FMODE_EXCL) { + whole = bd_start_claiming(bdev, filp); + if (IS_ERR(whole)) { + bdput(bdev); + return PTR_ERR(whole); + } + } + filp->f_mapping = bdev->bd_inode->i_mapping; res = blkdev_get(bdev, filp->f_mode); - if (res) - return res; - if (filp->f_mode & FMODE_EXCL) { - res = bd_claim(bdev, filp); - if (res) - goto out_blkdev_put; + if (whole) { + if (res == 0) + BUG_ON(bd_claim(bdev, filp) != 0); + else + bd_abort_claiming(whole, filp); } - return 0; - - out_blkdev_put: - blkdev_put(bdev, filp->f_mode); return res; } @@ -1586,27 +1729,34 @@ EXPORT_SYMBOL(lookup_bdev); */ struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) { - struct block_device *bdev; - int error = 0; + struct block_device *bdev, *whole; + int error; bdev = lookup_bdev(path); if (IS_ERR(bdev)) return bdev; + whole = bd_start_claiming(bdev, holder); + if (IS_ERR(whole)) { + bdput(bdev); + return whole; + } + error = blkdev_get(bdev, mode); if (error) - return ERR_PTR(error); + goto out_abort_claiming; + error = -EACCES; if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) - goto blkdev_put; - error = bd_claim(bdev, holder); - if (error) - goto blkdev_put; + goto out_blkdev_put; + BUG_ON(bd_claim(bdev, holder) != 0); return bdev; - -blkdev_put: + +out_blkdev_put: blkdev_put(bdev, mode); +out_abort_claiming: + bd_abort_claiming(whole, holder); return ERR_PTR(error); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 39d57bc6cc71..31ee31be51e9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -651,6 +651,7 @@ struct block_device { int bd_openers; struct mutex bd_mutex; /* open/close mutex */ struct list_head bd_inodes; + void * bd_claiming; void * bd_holder; int bd_holders; #ifdef CONFIG_SYSFS -- cgit v1.2.3-59-g8ed1b From fbd9b09a177a481eda256447c881f014f29034fe Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 28 Apr 2010 17:55:06 +0400 Subject: blkdev: generalize flags for blkdev_issue_fn functions The patch just convert all blkdev_issue_xxx function to common set of flags. 
Wait/allocation semantics preserved. Signed-off-by: Dmitry Monakhov Signed-off-by: Jens Axboe --- block/blk-barrier.c | 20 +++++++++++--------- block/ioctl.c | 2 +- drivers/block/drbd/drbd_int.h | 3 ++- drivers/block/drbd/drbd_receiver.c | 3 ++- fs/block_dev.c | 3 ++- fs/btrfs/extent-tree.c | 2 +- fs/ext3/fsync.c | 3 ++- fs/ext4/fsync.c | 6 ++++-- fs/gfs2/rgrp.c | 5 +++-- fs/jbd2/checkpoint.c | 3 ++- fs/jbd2/commit.c | 6 ++++-- fs/reiserfs/file.c | 3 ++- fs/xfs/linux-2.6/xfs_super.c | 3 ++- include/linux/blkdev.h | 18 +++++++++++------- mm/swapfile.c | 9 ++++++--- 15 files changed, 55 insertions(+), 34 deletions(-) diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 6d88544b677f..cf14311b98fc 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -293,19 +293,22 @@ static void bio_end_empty_barrier(struct bio *bio, int err) /** * blkdev_issue_flush - queue a flush * @bdev: blockdev to issue flush for + * @gfp_mask: memory allocation flags (for bio_alloc) * @error_sector: error sector + * @flags: BLKDEV_IFL_* flags to control behaviour * * Description: * Issue a flush for the block device in question. Caller can supply * room for storing the error offset in case of a flush error, if they * wish to. */ -int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) +int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, + sector_t *error_sector, unsigned long flags) { DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q; struct bio *bio; - int ret; + int ret = 0; if (bdev->bd_disk == NULL) return -ENXIO; @@ -314,7 +317,7 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) if (!q) return -ENXIO; - bio = bio_alloc(GFP_KERNEL, 0); + bio = bio_alloc(gfp_mask, 0); bio->bi_end_io = bio_end_empty_barrier; bio->bi_private = &wait; bio->bi_bdev = bdev; @@ -330,7 +333,6 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) if (error_sector) *error_sector = bio->bi_sector; - ret = 0; if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; else if (!bio_flagged(bio, BIO_UPTODATE)) @@ -362,17 +364,17 @@ static void blkdev_discard_end_io(struct bio *bio, int err) * @sector: start sector * @nr_sects: number of sectors to discard * @gfp_mask: memory allocation flags (for bio_alloc) - * @flags: DISCARD_FL_* flags to control behaviour + * @flags: BLKDEV_IFL_* flags to control behaviour * * Description: * Issue a discard request for the sectors in question. */ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, int flags) + sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) { DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q = bdev_get_queue(bdev); - int type = flags & DISCARD_FL_BARRIER ? + int type = flags & BLKDEV_IFL_BARRIER ? 
DISCARD_BARRIER : DISCARD_NOBARRIER; struct bio *bio; struct page *page; @@ -395,7 +397,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, bio->bi_sector = sector; bio->bi_end_io = blkdev_discard_end_io; bio->bi_bdev = bdev; - if (flags & DISCARD_FL_WAIT) + if (flags & BLKDEV_IFL_WAIT) bio->bi_private = &wait; /* @@ -426,7 +428,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, bio_get(bio); submit_bio(type, bio); - if (flags & DISCARD_FL_WAIT) + if (flags & BLKDEV_IFL_WAIT) wait_for_completion(&wait); if (bio_flagged(bio, BIO_EOPNOTSUPP)) diff --git a/block/ioctl.c b/block/ioctl.c index 8905d2a2a717..e8eb679f2f9b 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -126,7 +126,7 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, if (start + len > (bdev->bd_inode->i_size >> 9)) return -EINVAL; return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, - DISCARD_FL_WAIT); + BLKDEV_IFL_WAIT); } static int put_ushort(unsigned long arg, unsigned short val) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index e5e86a781820..d6f1ae342b1d 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -2251,7 +2251,8 @@ static inline void drbd_md_flush(struct drbd_conf *mdev) if (test_bit(MD_NO_BARRIER, &mdev->flags)) return; - r = blkdev_issue_flush(mdev->ldev->md_bdev, NULL); + r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); if (r) { set_bit(MD_NO_BARRIER, &mdev->flags); dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index ed9f1de24a71..54f56ea8a786 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -945,7 +945,8 @@ static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct d int rv; if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) { - rv = blkdev_issue_flush(mdev->ldev->backing_bdev, NULL); + rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL, + NULL, BLKDEV_IFL_WAIT); if (rv) { dev_err(DEV, "local disk flush failed with status %d\n", rv); /* would rather check on EOPNOTSUPP, but that is not reliable. 
diff --git a/fs/block_dev.c b/fs/block_dev.c index ea8385ea58ab..dd769304382e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -413,7 +413,8 @@ int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync) if (error) return error; - error = blkdev_issue_flush(bdev, NULL); + error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, + (BLKDEV_IFL_WAIT)); if (error == -EOPNOTSUPP) error = 0; return error; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b34d32fdaaec..c6a4f459ad76 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1589,7 +1589,7 @@ static void btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len) { blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, - DISCARD_FL_BARRIER); + BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); } static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index 8209f266e9ad..9492f6003ef9 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -91,7 +91,8 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) * storage */ if (test_opt(inode->i_sb, BARRIER)) - blkdev_issue_flush(inode->i_sb->s_bdev, NULL); + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); out: return ret; } diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 0d0c3239c1cd..ef3d980e67cb 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -100,9 +100,11 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) if (ext4_should_writeback_data(inode) && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) - blkdev_issue_flush(inode->i_sb->s_bdev, NULL); + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, + NULL, BLKDEV_IFL_WAIT); jbd2_log_wait_commit(journal, commit_tid); } else if (journal->j_flags & JBD2_BARRIER) - blkdev_issue_flush(inode->i_sb->s_bdev, NULL); + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); return ret; } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 503b842f3ba2..bf011dc63471 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -854,7 +854,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, if ((start + nr_sects) != blk) { rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, - DISCARD_FL_BARRIER); + BLKDEV_IFL_WAIT | + BLKDEV_IFL_BARRIER); if (rv) goto fail; nr_sects = 0; @@ -869,7 +870,7 @@ start_new_extent: } if (nr_sects) { rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, - DISCARD_FL_BARRIER); + BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); if (rv) goto fail; } diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 30beb11ef928..076d1cc44f95 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -530,7 +530,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal) */ if ((journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) - blkdev_issue_flush(journal->j_fs_dev, NULL); + blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); if (!(journal->j_flags & JBD2_ABORT)) jbd2_journal_update_superblock(journal, 1); return 0; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 671da7fb7ffd..75716d3d2be0 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -717,7 +717,8 @@ start_journal_io: if (commit_transaction->t_flushed_data_blocks && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) - blkdev_issue_flush(journal->j_fs_dev, NULL); + blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); /* 
Done it all: now write the commit record asynchronously. */ if (JBD2_HAS_INCOMPAT_FEATURE(journal, @@ -727,7 +728,8 @@ start_journal_io: if (err) __jbd2_journal_abort_hard(journal); if (journal->j_flags & JBD2_BARRIER) - blkdev_issue_flush(journal->j_dev, NULL); + blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); } err = journal_finish_inode_data_buffers(journal, commit_transaction); diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 1d9c12714c5c..9977df9f3a54 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -147,7 +147,8 @@ static int reiserfs_sync_file(struct file *filp, barrier_done = reiserfs_commit_for_inode(inode); reiserfs_write_unlock(inode->i_sb); if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) - blkdev_issue_flush(inode->i_sb->s_bdev, NULL); + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); if (barrier_done < 0) return barrier_done; return (err < 0) ? -EIO : 0; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 52e06b487ced..2b177c778ba7 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -725,7 +725,8 @@ void xfs_blkdev_issue_flush( xfs_buftarg_t *buftarg) { - blkdev_issue_flush(buftarg->bt_bdev, NULL); + blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); } STATIC void diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5cf17a49ce38..59b9aed0ee7d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -998,12 +998,16 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, return NULL; return bqt->tag_index[tag]; } - -extern int blkdev_issue_flush(struct block_device *, sector_t *); -#define DISCARD_FL_WAIT 0x01 /* wait for completion */ -#define DISCARD_FL_BARRIER 0x02 /* issue DISCARD_BARRIER request */ -extern int blkdev_issue_discard(struct block_device *, sector_t sector, - sector_t nr_sects, gfp_t, int flags); +enum{ + BLKDEV_WAIT, /* wait for completion */ + BLKDEV_BARRIER, /*issue request with barrier */ +}; +#define BLKDEV_IFL_WAIT (1 << BLKDEV_WAIT) +#define BLKDEV_IFL_BARRIER (1 << BLKDEV_BARRIER) +extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *, + unsigned long); +extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); static inline int sb_issue_discard(struct super_block *sb, sector_t block, sector_t nr_blocks) @@ -1011,7 +1015,7 @@ static inline int sb_issue_discard(struct super_block *sb, block <<= (sb->s_blocksize_bits - 9); nr_blocks <<= (sb->s_blocksize_bits - 9); return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_KERNEL, - DISCARD_FL_BARRIER); + BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); } extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); diff --git a/mm/swapfile.c b/mm/swapfile.c index 6cd0a8f90dc7..eb086e0f4dcc 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -139,7 +139,8 @@ static int discard_swap(struct swap_info_struct *si) nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); if (nr_blocks) { err = blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); + nr_blocks, GFP_KERNEL, + BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); if (err) return err; cond_resched(); @@ -150,7 +151,8 @@ static int discard_swap(struct swap_info_struct *si) nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); err = blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_KERNEL, 
DISCARD_FL_BARRIER); + nr_blocks, GFP_KERNEL, + BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); if (err) break; @@ -189,7 +191,8 @@ static void discard_swap_cluster(struct swap_info_struct *si, start_block <<= PAGE_SHIFT - 9; nr_blocks <<= PAGE_SHIFT - 9; if (blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER)) + nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT | + BLKDEV_IFL_BARRIER)) break; } -- cgit v1.2.3-59-g8ed1b From f17e232e9237c231daf9f0f4b177c61218bcb2e4 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 28 Apr 2010 17:55:07 +0400 Subject: blkdev: allow async blkdev_issue_flush requests In some places caller don't want to wait a request to complete. Signed-off-by: Dmitry Monakhov Signed-off-by: Jens Axboe --- block/blk-barrier.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/block/blk-barrier.c b/block/blk-barrier.c index cf14311b98fc..f11eec9669e4 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -286,8 +286,9 @@ static void bio_end_empty_barrier(struct bio *bio, int err) set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); clear_bit(BIO_UPTODATE, &bio->bi_flags); } - - complete(bio->bi_private); + if (bio->bi_private) + complete(bio->bi_private); + bio_put(bio); } /** @@ -300,7 +301,8 @@ static void bio_end_empty_barrier(struct bio *bio, int err) * Description: * Issue a flush for the block device in question. Caller can supply * room for storing the error offset in case of a flush error, if they - * wish to. + * wish to. If WAIT flag is not passed then caller may check only what + * request was pushed in some internal queue for later handling. */ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, sector_t *error_sector, unsigned long flags) @@ -319,19 +321,22 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, bio = bio_alloc(gfp_mask, 0); bio->bi_end_io = bio_end_empty_barrier; - bio->bi_private = &wait; bio->bi_bdev = bdev; - submit_bio(WRITE_BARRIER, bio); - - wait_for_completion(&wait); + if (test_bit(BLKDEV_WAIT, &flags)) + bio->bi_private = &wait; - /* - * The driver must store the error location in ->bi_sector, if - * it supports it. For non-stacked drivers, this should be copied - * from blk_rq_pos(rq). - */ - if (error_sector) - *error_sector = bio->bi_sector; + bio_get(bio); + submit_bio(WRITE_BARRIER, bio); + if (test_bit(BLKDEV_WAIT, &flags)) { + wait_for_completion(&wait); + /* + * The driver must store the error location in ->bi_sector, if + * it supports it. For non-stacked drivers, this should be + * copied from blk_rq_pos(rq). + */ + if (error_sector) + *error_sector = bio->bi_sector; + } if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; -- cgit v1.2.3-59-g8ed1b From f31e7e4022841c43c53b847b86b1bf97a08b2c94 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 28 Apr 2010 17:55:08 +0400 Subject: blkdev: move blkdev_issue helper functions to separate file Move blkdev_issue_discard from blk-barrier.c because it is not barrier related. Later the file will be populated by other helpers. 
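For illustration, a caller under the reworked interface chooses between the two modes roughly like this (hypothetical snippet, not a hunk from this series; the function name is invented):

	int example_flush(struct block_device *bdev)
	{
		int err;

		/* Synchronous: wait for the empty barrier to complete. */
		err = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
		if (err && err != -EOPNOTSUPP)
			return err;

		/*
		 * Asynchronous: without BLKDEV_IFL_WAIT the flush is merely
		 * queued, so no completion may be assumed when this returns.
		 */
		return blkdev_issue_flush(bdev, GFP_KERNEL, NULL, 0);
	}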
Signed-off-by: Dmitry Monakhov Signed-off-by: Jens Axboe --- block/Makefile | 2 +- block/blk-barrier.c | 104 ----------------------------------------------- block/blk-lib.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 115 insertions(+), 105 deletions(-) create mode 100644 block/blk-lib.c diff --git a/block/Makefile b/block/Makefile index cb2d515ebd6e..0bb499a739cd 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o + blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o diff --git a/block/blk-barrier.c b/block/blk-barrier.c index f11eec9669e4..0d710c9d403b 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -347,107 +347,3 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, return ret; } EXPORT_SYMBOL(blkdev_issue_flush); - -static void blkdev_discard_end_io(struct bio *bio, int err) -{ - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - clear_bit(BIO_UPTODATE, &bio->bi_flags); - } - - if (bio->bi_private) - complete(bio->bi_private); - __free_page(bio_page(bio)); - - bio_put(bio); -} - -/** - * blkdev_issue_discard - queue a discard - * @bdev: blockdev to issue discard for - * @sector: start sector - * @nr_sects: number of sectors to discard - * @gfp_mask: memory allocation flags (for bio_alloc) - * @flags: BLKDEV_IFL_* flags to control behaviour - * - * Description: - * Issue a discard request for the sectors in question. - */ -int blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) -{ - DECLARE_COMPLETION_ONSTACK(wait); - struct request_queue *q = bdev_get_queue(bdev); - int type = flags & BLKDEV_IFL_BARRIER ? - DISCARD_BARRIER : DISCARD_NOBARRIER; - struct bio *bio; - struct page *page; - int ret = 0; - - if (!q) - return -ENXIO; - - if (!blk_queue_discard(q)) - return -EOPNOTSUPP; - - while (nr_sects && !ret) { - unsigned int sector_size = q->limits.logical_block_size; - unsigned int max_discard_sectors = - min(q->limits.max_discard_sectors, UINT_MAX >> 9); - - bio = bio_alloc(gfp_mask, 1); - if (!bio) - goto out; - bio->bi_sector = sector; - bio->bi_end_io = blkdev_discard_end_io; - bio->bi_bdev = bdev; - if (flags & BLKDEV_IFL_WAIT) - bio->bi_private = &wait; - - /* - * Add a zeroed one-sector payload as that's what - * our current implementations need. If we'll ever need - * more the interface will need revisiting. - */ - page = alloc_page(gfp_mask | __GFP_ZERO); - if (!page) - goto out_free_bio; - if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size) - goto out_free_page; - - /* - * And override the bio size - the way discard works we - * touch many more blocks on disk than the actual payload - * length. 
- */ - if (nr_sects > max_discard_sectors) { - bio->bi_size = max_discard_sectors << 9; - nr_sects -= max_discard_sectors; - sector += max_discard_sectors; - } else { - bio->bi_size = nr_sects << 9; - nr_sects = 0; - } - - bio_get(bio); - submit_bio(type, bio); - - if (flags & BLKDEV_IFL_WAIT) - wait_for_completion(&wait); - - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - else if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - bio_put(bio); - } - return ret; -out_free_page: - __free_page(page); -out_free_bio: - bio_put(bio); -out: - return -ENOMEM; -} -EXPORT_SYMBOL(blkdev_issue_discard); diff --git a/block/blk-lib.c b/block/blk-lib.c new file mode 100644 index 000000000000..0dc438812d81 --- /dev/null +++ b/block/blk-lib.c @@ -0,0 +1,114 @@ +/* + * Functions related to generic helpers functions + */ +#include +#include +#include +#include +#include + +#include "blk.h" + +static void blkdev_discard_end_io(struct bio *bio, int err) +{ + if (err) { + if (err == -EOPNOTSUPP) + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + clear_bit(BIO_UPTODATE, &bio->bi_flags); + } + + if (bio->bi_private) + complete(bio->bi_private); + __free_page(bio_page(bio)); + + bio_put(bio); +} + +/** + * blkdev_issue_discard - queue a discard + * @bdev: blockdev to issue discard for + * @sector: start sector + * @nr_sects: number of sectors to discard + * @gfp_mask: memory allocation flags (for bio_alloc) + * @flags: BLKDEV_IFL_* flags to control behaviour + * + * Description: + * Issue a discard request for the sectors in question. + */ +int blkdev_issue_discard(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) +{ + DECLARE_COMPLETION_ONSTACK(wait); + struct request_queue *q = bdev_get_queue(bdev); + int type = flags & BLKDEV_IFL_BARRIER ? + DISCARD_BARRIER : DISCARD_NOBARRIER; + struct bio *bio; + struct page *page; + int ret = 0; + + if (!q) + return -ENXIO; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + while (nr_sects && !ret) { + unsigned int sector_size = q->limits.logical_block_size; + unsigned int max_discard_sectors = + min(q->limits.max_discard_sectors, UINT_MAX >> 9); + + bio = bio_alloc(gfp_mask, 1); + if (!bio) + goto out; + bio->bi_sector = sector; + bio->bi_end_io = blkdev_discard_end_io; + bio->bi_bdev = bdev; + if (flags & BLKDEV_IFL_WAIT) + bio->bi_private = &wait; + + /* + * Add a zeroed one-sector payload as that's what + * our current implementations need. If we'll ever need + * more the interface will need revisiting. + */ + page = alloc_page(gfp_mask | __GFP_ZERO); + if (!page) + goto out_free_bio; + if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size) + goto out_free_page; + + /* + * And override the bio size - the way discard works we + * touch many more blocks on disk than the actual payload + * length. 
+ */ + if (nr_sects > max_discard_sectors) { + bio->bi_size = max_discard_sectors << 9; + nr_sects -= max_discard_sectors; + sector += max_discard_sectors; + } else { + bio->bi_size = nr_sects << 9; + nr_sects = 0; + } + + bio_get(bio); + submit_bio(type, bio); + + if (flags & BLKDEV_IFL_WAIT) + wait_for_completion(&wait); + + if (bio_flagged(bio, BIO_EOPNOTSUPP)) + ret = -EOPNOTSUPP; + else if (!bio_flagged(bio, BIO_UPTODATE)) + ret = -EIO; + bio_put(bio); + } + return ret; +out_free_page: + __free_page(page); +out_free_bio: + bio_put(bio); +out: + return -ENOMEM; +} +EXPORT_SYMBOL(blkdev_issue_discard); -- cgit v1.2.3-59-g8ed1b From 3f14d792f9a8fede64ce918dbb517f934497a4f8 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 28 Apr 2010 17:55:09 +0400 Subject: blkdev: add blkdev_issue_zeroout helper function - Add bio_batch helper primitive. This is rather generic primitive for submitting/waiting a complex request which consists of several bios. - blkdev_issue_zeroout() generate number of zero filed write bios. Signed-off-by: Dmitry Monakhov Signed-off-by: Jens Axboe --- block/blk-lib.c | 118 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/blkdev.h | 3 +- 2 files changed, 120 insertions(+), 1 deletion(-) diff --git a/block/blk-lib.c b/block/blk-lib.c index 0dc438812d81..886c3f9e1be4 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -112,3 +112,121 @@ out: return -ENOMEM; } EXPORT_SYMBOL(blkdev_issue_discard); + +struct bio_batch +{ + atomic_t done; + unsigned long flags; + struct completion *wait; + bio_end_io_t *end_io; +}; + +static void bio_batch_end_io(struct bio *bio, int err) +{ + struct bio_batch *bb = bio->bi_private; + if (err) { + if (err == -EOPNOTSUPP) + set_bit(BIO_EOPNOTSUPP, &bb->flags); + else + clear_bit(BIO_UPTODATE, &bb->flags); + } + if (bb) { + if (bb->end_io) + bb->end_io(bio, err); + atomic_inc(&bb->done); + complete(bb->wait); + } + bio_put(bio); +} + +/** + * blkdev_issue_zeroout generate number of zero filed write bios + * @bdev: blockdev to issue + * @sector: start sector + * @nr_sects: number of sectors to write + * @gfp_mask: memory allocation flags (for bio_alloc) + * @flags: BLKDEV_IFL_* flags to control behaviour + * + * Description: + * Generate and issue number of bios with zerofiled pages. + * Send barrier at the beginning and at the end if requested. This guarantie + * correct request ordering. Empty barrier allow us to avoid post queue flush. 
+ */ + +int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) +{ + int ret = 0; + struct bio *bio; + struct bio_batch bb; + unsigned int sz, issued = 0; + DECLARE_COMPLETION_ONSTACK(wait); + + atomic_set(&bb.done, 0); + bb.flags = 1 << BIO_UPTODATE; + bb.wait = &wait; + bb.end_io = NULL; + + if (flags & BLKDEV_IFL_BARRIER) { + /* issue async barrier before the data */ + ret = blkdev_issue_flush(bdev, gfp_mask, NULL, 0); + if (ret) + return ret; + } +submit: + while (nr_sects != 0) { + bio = bio_alloc(gfp_mask, + min(nr_sects, (sector_t)BIO_MAX_PAGES)); + if (!bio) + break; + + bio->bi_sector = sector; + bio->bi_bdev = bdev; + bio->bi_end_io = bio_batch_end_io; + if (flags & BLKDEV_IFL_WAIT) + bio->bi_private = &bb; + + while(nr_sects != 0) { + sz = min(PAGE_SIZE >> 9 , nr_sects); + if (sz == 0) + /* bio has maximum size possible */ + break; + ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); + nr_sects -= ret >> 9; + sector += ret >> 9; + if (ret < (sz << 9)) + break; + } + issued++; + submit_bio(WRITE, bio); + } + /* + * When all data bios are in flight. Send final barrier if requeted. + */ + if (nr_sects == 0 && flags & BLKDEV_IFL_BARRIER) + ret = blkdev_issue_flush(bdev, gfp_mask, NULL, + flags & BLKDEV_IFL_WAIT); + + + if (flags & BLKDEV_IFL_WAIT) + /* Wait for bios in-flight */ + while ( issued != atomic_read(&bb.done)) + wait_for_completion(&wait); + + if (!test_bit(BIO_UPTODATE, &bb.flags)) + /* One of bios in the batch was completed with error.*/ + ret = -EIO; + + if (ret) + goto out; + + if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) { + ret = -EOPNOTSUPP; + goto out; + } + if (nr_sects != 0) + goto submit; +out: + return ret; +} +EXPORT_SYMBOL(blkdev_issue_zeroout); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 59b9aed0ee7d..3ac2bd2fc485 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1008,7 +1008,8 @@ extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *, unsigned long); extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); - +extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); static inline int sb_issue_discard(struct super_block *sb, sector_t block, sector_t nr_blocks) { -- cgit v1.2.3-59-g8ed1b From 50eaeb323a170e231263ccb433bb2f99bd9e27ac Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 28 Apr 2010 19:50:33 +0200 Subject: cfq-iosched: fix broken cfq_ref_get_cfqf() for CONFIG_BLK_CGROUP=y && CFQ_GROUP_IOSCHED=n We should return the cfq_group for this case, not NULL. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 286008cf889e..0f3eb70f9ce1 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1094,7 +1094,7 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) { - return NULL; + return cfqg; } static inline void -- cgit v1.2.3-59-g8ed1b From 0341aafb7f3313bcedc6811a098500be85f3fc77 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 29 Apr 2010 09:28:21 +0200 Subject: block: fix bad use of min() on different types Just cast the page size to sector_t, that will always fit. 
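To spell the problem out (sketch only, not a hunk from this patch; the helper name is invented): the kernel's min() macro typechecks its arguments, so mixing an unsigned long expression with a sector_t triggers a "comparison of distinct pointer types" warning on configurations where the two types differ. Casting the page-size side to sector_t makes both operands the same type everywhere:

	static unsigned int zeroout_chunk_sectors(sector_t nr_sects)
	{
		/* sz = min(PAGE_SIZE >> 9, nr_sects);  -- mixed types, warns */
		return min((sector_t)PAGE_SIZE >> 9, nr_sects);
	}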
Signed-off-by: Jens Axboe --- block/blk-lib.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/block/blk-lib.c b/block/blk-lib.c index 886c3f9e1be4..d0216b9f22d4 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -124,6 +124,7 @@ struct bio_batch static void bio_batch_end_io(struct bio *bio, int err) { struct bio_batch *bb = bio->bi_private; + if (err) { if (err == -EOPNOTSUPP) set_bit(BIO_EOPNOTSUPP, &bb->flags); @@ -186,8 +187,8 @@ submit: if (flags & BLKDEV_IFL_WAIT) bio->bi_private = &bb; - while(nr_sects != 0) { - sz = min(PAGE_SIZE >> 9 , nr_sects); + while (nr_sects != 0) { + sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); if (sz == 0) /* bio has maximum size possible */ break; -- cgit v1.2.3-59-g8ed1b From 6a47dc1418682c83d603b491df1d048f73aa973e Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 29 Apr 2010 09:32:00 +0200 Subject: nilfs: fix breakage caused by barrier flag changes After merging the block tree, today's linux-next build (powerpc ppc64_defconfig) failed like this: fs/nilfs2/the_nilfs.c: In function 'nilfs_discard_segments': fs/nilfs2/the_nilfs.c:673: error: 'DISCARD_FL_BARRIER' undeclared (first use in this function) Caused by commit fbd9b09a177a481eda256447c881f014f29034fe ("blkdev: generalize flags for blkdev_issue_fn functions") interacting with commit e902ec9906e844f4613fa6190c6fa65f162dc86e ("nilfs2: issue discard request after cleaning segments") (which netered Linus' tree on about March 4 - before v2.6.34-rc1). Signed-off-by: Stephen Rothwell Signed-off-by: Jens Axboe --- fs/nilfs2/the_nilfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 33871f7e4f01..7ffcf2b8b1f4 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -670,7 +670,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, start * sects_per_block, nblocks * sects_per_block, GFP_NOFS, - DISCARD_FL_BARRIER); + BLKDEV_IFL_BARRIER); if (ret < 0) return ret; nblocks = 0; @@ -680,7 +680,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, - GFP_NOFS, DISCARD_FL_BARRIER); + GFP_NOFS, BLKDEV_IFL_BARRIER); return ret; } -- cgit v1.2.3-59-g8ed1b From 0f3942a39ed768c967cb71ea0e9be7fc94112713 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 3 May 2010 14:28:55 +0200 Subject: block: kill some useless goto's in blk-cgroup.c goto has its place, but lets cut back on some of the more frivolous uses of it. 
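The shape of the transformation, reduced to a skeleton (an illustrative condensation of the hunks below, not compilable as-is): a backwards goto that re-runs a locked section becomes an explicit loop, and the forward goto past it becomes a break:

	do {
		spin_lock_irqsave(&blkcg->lock, flags);
		if (hlist_empty(&blkcg->blkg_list)) {
			spin_unlock_irqrestore(&blkcg->lock, flags);
			break;				/* was: goto done */
		}
		/* ... unlink one group, drop the lock, notify policies ... */
		spin_unlock_irqrestore(&blkcg->lock, flags);
	} while (1);					/* was: goto remove_entry */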
Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 84 ++++++++++++++++++++++++++---------------------------- 1 file changed, 40 insertions(+), 44 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index d02bbf88de13..60bb049b6106 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -376,17 +376,16 @@ int blkiocg_del_blkio_group(struct blkio_group *blkg) rcu_read_lock(); css = css_lookup(&blkio_subsys, blkg->blkcg_id); - if (!css) - goto out; - - blkcg = container_of(css, struct blkio_cgroup, css); - spin_lock_irqsave(&blkcg->lock, flags); - if (!hlist_unhashed(&blkg->blkcg_node)) { - __blkiocg_del_blkio_group(blkg); - ret = 0; + if (css) { + blkcg = container_of(css, struct blkio_cgroup, css); + spin_lock_irqsave(&blkcg->lock, flags); + if (!hlist_unhashed(&blkg->blkcg_node)) { + __blkiocg_del_blkio_group(blkg); + ret = 0; + } + spin_unlock_irqrestore(&blkcg->lock, flags); } - spin_unlock_irqrestore(&blkcg->lock, flags); -out: + rcu_read_unlock(); return ret; } @@ -815,17 +814,15 @@ static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, seq_printf(m, "dev\tweight\n"); blkcg = cgroup_to_blkio_cgroup(cgrp); - if (list_empty(&blkcg->policy_list)) - goto out; - - spin_lock_irq(&blkcg->lock); - list_for_each_entry(pn, &blkcg->policy_list, node) { - seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), - MINOR(pn->dev), pn->weight); + if (!list_empty(&blkcg->policy_list)) { + spin_lock_irq(&blkcg->lock); + list_for_each_entry(pn, &blkcg->policy_list, node) { + seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->weight); + } + spin_unlock_irq(&blkcg->lock); } - spin_unlock_irq(&blkcg->lock); -out: return 0; } @@ -917,40 +914,39 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) struct blkio_policy_node *pn, *pntmp; rcu_read_lock(); -remove_entry: - spin_lock_irqsave(&blkcg->lock, flags); + do { + spin_lock_irqsave(&blkcg->lock, flags); - if (hlist_empty(&blkcg->blkg_list)) { - spin_unlock_irqrestore(&blkcg->lock, flags); - goto done; - } + if (hlist_empty(&blkcg->blkg_list)) { + spin_unlock_irqrestore(&blkcg->lock, flags); + break; + } - blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group, - blkcg_node); - key = rcu_dereference(blkg->key); - __blkiocg_del_blkio_group(blkg); + blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group, + blkcg_node); + key = rcu_dereference(blkg->key); + __blkiocg_del_blkio_group(blkg); - spin_unlock_irqrestore(&blkcg->lock, flags); + spin_unlock_irqrestore(&blkcg->lock, flags); - /* - * This blkio_group is being unlinked as associated cgroup is going - * away. Let all the IO controlling policies know about this event. - * - * Currently this is static call to one io controlling policy. Once - * we have more policies in place, we need some dynamic registration - * of callback function. - */ - spin_lock(&blkio_list_lock); - list_for_each_entry(blkiop, &blkio_list, list) - blkiop->ops.blkio_unlink_group_fn(key, blkg); - spin_unlock(&blkio_list_lock); - goto remove_entry; + /* + * This blkio_group is being unlinked as associated cgroup is + * going away. Let all the IO controlling policies know about + * this event. Currently this is static call to one io + * controlling policy. Once we have more policies in place, we + * need some dynamic registration of callback function. 
+ */ + spin_lock(&blkio_list_lock); + list_for_each_entry(blkiop, &blkio_list, list) + blkiop->ops.blkio_unlink_group_fn(key, blkg); + spin_unlock(&blkio_list_lock); + } while (1); -done: list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) { blkio_policy_delete_node(pn); kfree(pn); } + free_css_id(&blkio_subsys, &blkcg->css); rcu_read_unlock(); if (blkcg != &blkio_root_cgroup) -- cgit v1.2.3-59-g8ed1b From 01effb0dc1451fad55925873ffbfb88fa4eadce0 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 11 May 2010 08:57:42 +0200 Subject: block: allow initialization of previously allocated request_queue blk_init_queue() allocates the request_queue structure and then initializes it as needed (request_fn, elevator, etc). Split initialization out to blk_init_allocated_queue_node. Introduce blk_init_allocated_queue wrapper function to model existing blk_init_queue and blk_init_queue_node interfaces. Export elv_register_queue to allow a newly added elevator to be registered with sysfs. Export elv_unregister_queue for symmetry. These changes allow DM to initialize a device's request_queue with more precision. In particular, DM no longer unconditionally initializes a full request_queue (elevator et al). It only does so for a request-based DM device. Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-core.c | 18 +++++++++++++++++- block/elevator.c | 2 ++ include/linux/blkdev.h | 5 +++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index e9a5ae25db8c..3bc5579d6f54 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -572,6 +572,22 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) { struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id); + return blk_init_allocated_queue_node(q, rfn, lock, node_id); +} +EXPORT_SYMBOL(blk_init_queue_node); + +struct request_queue * +blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, + spinlock_t *lock) +{ + return blk_init_allocated_queue_node(q, rfn, lock, -1); +} +EXPORT_SYMBOL(blk_init_allocated_queue); + +struct request_queue * +blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn, + spinlock_t *lock, int node_id) +{ if (!q) return NULL; @@ -605,7 +621,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) blk_put_queue(q); return NULL; } -EXPORT_SYMBOL(blk_init_queue_node); +EXPORT_SYMBOL(blk_init_allocated_queue_node); int blk_get_queue(struct request_queue *q) { diff --git a/block/elevator.c b/block/elevator.c index 5e734592bb40..6df2b5056b51 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -930,6 +930,7 @@ int elv_register_queue(struct request_queue *q) } return error; } +EXPORT_SYMBOL(elv_register_queue); static void __elv_unregister_queue(struct elevator_queue *e) { @@ -942,6 +943,7 @@ void elv_unregister_queue(struct request_queue *q) if (q) __elv_unregister_queue(q->elevator); } +EXPORT_SYMBOL(elv_unregister_queue); void elv_register(struct elevator_type *e) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3ac2bd2fc485..346fd4856733 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -921,7 +921,12 @@ extern void blk_abort_queue(struct request_queue *); */ extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id); +extern struct request_queue *blk_init_allocated_queue_node(struct request_queue *, + request_fn_proc *, + spinlock_t *, int node_id); extern struct request_queue 
*blk_init_queue(request_fn_proc *, spinlock_t *); +extern struct request_queue *blk_init_allocated_queue(struct request_queue *, + request_fn_proc *, spinlock_t *); extern void blk_cleanup_queue(struct request_queue *); extern void blk_queue_make_request(struct request_queue *, make_request_fn *); extern void blk_queue_bounce_limit(struct request_queue *, u64); -- cgit v1.2.3-59-g8ed1b From 2395e463fefd4aa8b784787e926e9b84e216d14f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 11 May 2010 09:02:55 +0200 Subject: paride: fix menu indentation Make the PARIDE menu be displayed correctly, with proper/expected indentation, by moving the GDROM kconfig symbol, which was splitting the PARIDE kconfig symbol from its dependent symbols. Signed-off-by: Randy Dunlap Signed-off-by: Jens Axboe --- drivers/block/Kconfig | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 77bfce52e9ca..de277689da61 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -76,6 +76,17 @@ config BLK_DEV_XD It's pretty unlikely that you have one of these: say N. +config GDROM + tristate "SEGA Dreamcast GD-ROM drive" + depends on SH_DREAMCAST + help + A standard SEGA Dreamcast comes with a modified CD ROM drive called a + "GD-ROM" by SEGA to signify it is capable of reading special disks + with up to 1 GB of data. This drive will also read standard CD ROM + disks. Select this option to access any disks in your GD ROM drive. + Most users will want to say "Y" here. + You can also build this as a module which will be called gdrom. + config PARIDE tristate "Parallel port IDE device support" depends on PARPORT_PC @@ -103,17 +114,6 @@ config PARIDE "MicroSolutions backpack protocol", "DataStor Commuter protocol" etc.). -config GDROM - tristate "SEGA Dreamcast GD-ROM drive" - depends on SH_DREAMCAST - help - A standard SEGA Dreamcast comes with a modified CD ROM drive called a - "GD-ROM" by SEGA to signify it is capable of reading special disks - with up to 1 GB of data. This drive will also read standard CD ROM - disks. Select this option to access any disks in your GD ROM drive. - Most users will want to say "Y" here. - You can also build this as a module which will be called gdrom. - source "drivers/block/paride/Kconfig" config BLK_CPQ_DA -- cgit v1.2.3-59-g8ed1b From 69b62d01ec44fe0d505d89917392347732135a4d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 17 May 2010 12:51:03 +0200 Subject: writeback: disable periodic old data writeback for !dirty_writeback_centisecs Prior to 2.6.32, setting /proc/sys/vm/dirty_writeback_centisecs disabled periodic dirty writeback from kupdate. This got broken and now causes excessive sys CPU usage if set to zero, as we'll keep beating on schedule(). 
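The fix has two halves, sketched here as a condensation of the diff below: wb_check_old_data_flush() bails out early when the interval is zero, and the flusher task then parks in schedule() instead of polling with a zero-length timeout. The bare schedule() avoids lost wakeups only on the assumption that the loop has already put the task into TASK_INTERRUPTIBLE before checking for work:

	/* wb_check_old_data_flush(): periodic (kupdate) writeback disabled */
	if (!dirty_writeback_interval)
		return 0;

	/* bdi_writeback_task(): sleep until explicitly woken for new work */
	if (dirty_writeback_interval) {
		wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
		schedule_timeout_interruptible(wait_jiffies);
	} else
		schedule();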
Cc: stable@kernel.org Reported-by: Justin Maggard Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 4b37f7cea4dd..760dc8d0b4ff 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -852,6 +852,12 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) unsigned long expired; long nr_pages; + /* + * When set to zero, disable periodic writeback + */ + if (!dirty_writeback_interval) + return 0; + expired = wb->last_old_flush + msecs_to_jiffies(dirty_writeback_interval * 10); if (time_before(jiffies, expired)) @@ -947,8 +953,12 @@ int bdi_writeback_task(struct bdi_writeback *wb) break; } - wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); - schedule_timeout_interruptible(wait_jiffies); + if (dirty_writeback_interval) { + wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); + schedule_timeout_interruptible(wait_jiffies); + } else + schedule(); + try_to_freeze(); } -- cgit v1.2.3-59-g8ed1b From e913fc825dc685a444cb4c1d0f9d32f372f59861 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 17 May 2010 12:55:07 +0200 Subject: writeback: fix WB_SYNC_NONE writeback from umount When umount calls sync_filesystem(), we first do a WB_SYNC_NONE writeback to kick off writeback of pending dirty inodes, then follow that up with a WB_SYNC_ALL to wait for it. Since umount already holds the sb s_umount mutex, WB_SYNC_NONE ends up doing nothing and all writeback happens as WB_SYNC_ALL. This can greatly slow down umount, since WB_SYNC_ALL writeback is a data integrity operation and thus a bigger hammer than simple WB_SYNC_NONE. For barrier aware file systems it's a lot slower. Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 48 ++++++++++++++++++++++++++++++++++----------- fs/sync.c | 2 +- include/linux/backing-dev.h | 2 +- include/linux/writeback.h | 10 ++++++++++ mm/page-writeback.c | 4 ++-- 5 files changed, 51 insertions(+), 15 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 760dc8d0b4ff..67db89786e7d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -45,6 +45,7 @@ struct wb_writeback_args { int for_kupdate:1; int range_cyclic:1; int for_background:1; + int sb_pinned:1; }; /* @@ -230,6 +231,11 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi, .sync_mode = WB_SYNC_ALL, .nr_pages = LONG_MAX, .range_cyclic = 0, + /* + * Setting sb_pinned is not necessary for WB_SYNC_ALL, but + * lets make it explicitly clear. + */ + .sb_pinned = 1, }; struct bdi_work work; @@ -245,21 +251,23 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi, * @bdi: the backing device to write from * @sb: write inodes from this super_block * @nr_pages: the number of pages to write + * @sb_locked: caller already holds sb umount sem. * * Description: * This does WB_SYNC_NONE opportunistic writeback. The IO is only * started when this function returns, we make no guarentees on - * completion. Caller need not hold sb s_umount semaphore. + * completion. Caller specifies whether sb umount sem is held already or not. 
* */ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, - long nr_pages) + long nr_pages, int sb_locked) { struct wb_writeback_args args = { .sb = sb, .sync_mode = WB_SYNC_NONE, .nr_pages = nr_pages, .range_cyclic = 1, + .sb_pinned = sb_locked, }; /* @@ -577,7 +585,7 @@ static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc, /* * Caller must already hold the ref for this */ - if (wbc->sync_mode == WB_SYNC_ALL) { + if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) { WARN_ON(!rwsem_is_locked(&sb->s_umount)); return SB_NOT_PINNED; } @@ -751,6 +759,7 @@ static long wb_writeback(struct bdi_writeback *wb, .for_kupdate = args->for_kupdate, .for_background = args->for_background, .range_cyclic = args->range_cyclic, + .sb_pinned = args->sb_pinned, }; unsigned long oldest_jif; long wrote = 0; @@ -1193,6 +1202,18 @@ static void wait_sb_inodes(struct super_block *sb) iput(old_inode); } +static void __writeback_inodes_sb(struct super_block *sb, int sb_locked) +{ + unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); + unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); + long nr_to_write; + + nr_to_write = nr_dirty + nr_unstable + + (inodes_stat.nr_inodes - inodes_stat.nr_unused); + + bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked); +} + /** * writeback_inodes_sb - writeback dirty inodes from given super_block * @sb: the superblock @@ -1204,17 +1225,22 @@ static void wait_sb_inodes(struct super_block *sb) */ void writeback_inodes_sb(struct super_block *sb) { - unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); - unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); - long nr_to_write; - - nr_to_write = nr_dirty + nr_unstable + - (inodes_stat.nr_inodes - inodes_stat.nr_unused); - - bdi_start_writeback(sb->s_bdi, sb, nr_to_write); + __writeback_inodes_sb(sb, 0); } EXPORT_SYMBOL(writeback_inodes_sb); +/** + * writeback_inodes_sb_locked - writeback dirty inodes from given super_block + * @sb: the superblock + * + * Like writeback_inodes_sb(), except the caller already holds the + * sb umount sem. 
+ */ +void writeback_inodes_sb_locked(struct super_block *sb) +{ + __writeback_inodes_sb(sb, 1); +} + /** * writeback_inodes_sb_if_idle - start writeback if none underway * @sb: the superblock diff --git a/fs/sync.c b/fs/sync.c index 92b228176f7c..de6a44192832 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -42,7 +42,7 @@ static int __sync_filesystem(struct super_block *sb, int wait) if (wait) sync_inodes_sb(sb); else - writeback_inodes_sb(sb); + writeback_inodes_sb_locked(sb); if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, wait); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 7534979d83bd..ff8bac63213f 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -106,7 +106,7 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); void bdi_unregister(struct backing_dev_info *bdi); int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, - long nr_pages); + long nr_pages, int sb_locked); int bdi_writeback_task(struct bdi_writeback *wb); int bdi_has_dirty_io(struct backing_dev_info *bdi); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index eb38a2c645f6..47e1c686cb02 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -65,6 +65,15 @@ struct writeback_control { * so we use a single control to update them */ unsigned no_nrwrite_index_update:1; + + /* + * For WB_SYNC_ALL, the sb must always be pinned. For WB_SYNC_NONE, + * the writeback code will pin the sb for the caller. However, + * for eg umount, the caller does WB_SYNC_NONE but already has + * the sb pinned. If the below is set, caller already has the + * sb pinned. + */ + unsigned sb_pinned:1; }; /* @@ -73,6 +82,7 @@ struct writeback_control { struct bdi_writeback; int inode_wait(void *); void writeback_inodes_sb(struct super_block *); +void writeback_inodes_sb_locked(struct super_block *); int writeback_inodes_sb_if_idle(struct super_block *); void sync_inodes_sb(struct super_block *); void writeback_inodes_wbc(struct writeback_control *wbc); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d0f2b3765f8d..53b2fcf2d283 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -597,7 +597,7 @@ static void balance_dirty_pages(struct address_space *mapping, (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS)) > background_thresh))) - bdi_start_writeback(bdi, NULL, 0); + bdi_start_writeback(bdi, NULL, 0, 0); } void set_page_dirty_balance(struct page *page, int page_mkwrite) @@ -705,7 +705,7 @@ void laptop_mode_timer_fn(unsigned long data) */ if (bdi_has_dirty_io(&q->backing_dev_info)) - bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages); + bdi_start_writeback(&q->backing_dev_info, NULL, 0, nr_pages); } /* -- cgit v1.2.3-59-g8ed1b From 5547e8aac6f71505d621a612de2fca0dd988b439 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Fri, 7 May 2010 13:35:44 +0400 Subject: writeback: Update dirty flags in two steps Filesystems with delalloc support may dirty inode during writepages. As result inode will have dirty metadata flags even after write_inode. In fact we have two dedicated functions for proper data and metadata writeback. It is reasonable to separate flags updates in two stages. 
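Condensed, the new ordering in writeback_single_inode() reads as follows (a sketch assembled from the hunks below, with locals and error handling elided):

	spin_lock(&inode_lock);
	inode->i_state |= I_SYNC;
	inode->i_state &= ~I_DIRTY_PAGES;	/* stage 1: data flag only */
	spin_unlock(&inode_lock);

	ret = do_writepages(mapping, wbc);	/* delalloc may redirty here */

	spin_lock(&inode_lock);
	dirty = inode->i_state & I_DIRTY;	/* re-sample after writeback */
	inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);	/* stage 2 */
	spin_unlock(&inode_lock);

	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC))
		err = write_inode(inode, wbc);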
https://bugzilla.kernel.org/show_bug.cgi?id=15906 Signed-off-by: Dmitry Monakhov Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 67db89786e7d..0f629571234f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -460,11 +460,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) BUG_ON(inode->i_state & I_SYNC); - /* Set I_SYNC, reset I_DIRTY */ - dirty = inode->i_state & I_DIRTY; + /* Set I_SYNC, reset I_DIRTY_PAGES */ inode->i_state |= I_SYNC; - inode->i_state &= ~I_DIRTY; - + inode->i_state &= ~I_DIRTY_PAGES; spin_unlock(&inode_lock); ret = do_writepages(mapping, wbc); @@ -480,6 +478,15 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) ret = err; } + /* + * Some filesystems may redirty the inode during the writeback + * due to delalloc, clear dirty metadata flags right before + * write_inode() + */ + spin_lock(&inode_lock); + dirty = inode->i_state & I_DIRTY; + inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); + spin_unlock(&inode_lock); /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { int err = write_inode(inode, wbc); -- cgit v1.2.3-59-g8ed1b From 8c484ee4910b36c9ac273ad1150261c6ebfc1ef7 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 11 Mar 2010 16:47:58 +0100 Subject: drbd: use proc_create_data with explicit NULL argument To document that we know about deprecation of proc_create, even though we are not affected, as we don't use the ->data member, open code proc_create_data(..., NULL); Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 93d1f9b469d4..db7a07a9a2cf 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3129,7 +3129,7 @@ int __init drbd_init(void) if (err) goto Enomem; - drbd_proc = proc_create("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops); + drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL); if (!drbd_proc) { printk(KERN_ERR "drbd: unable to register proc file\n"); goto Enomem; -- cgit v1.2.3-59-g8ed1b From e4f925e12ea5daaa9baf2dd5af9c4951721dae95 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 17 Mar 2010 14:18:41 +0100 Subject: drbd: Do not upgrade state to Outdated if already Inconsistent [Bugz 277] There was a race condition: In a situation with a SyncSource+Primary and a SyncTarget+Secondary node, and a resync dependency to some other device. After both nodes decided to do the resync, the other device finishes its resync process. At that time SyncSource already sent the P_SYNC_UUID packet, and already updated its peer disk state to Inconsistent. The SyncTarget node waits for the P_SYNC_UUID and sends a state packet to report the resync dependency change. That packet still carries a disk state of Outdated. Impact: If application writes come in, during that time on the Primary node, those do not get replicated, and the out-of-sync counter gets increased. => The completion of resync is not detected on the primary node. => stalled. Those blocks get resync'ed with the next resync, since the are get marked as out-of-sync in the bitmap. In order to fix this, we filter out that wrong state change in the sanitize_state() function. 
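The filter works because drbd's disk states form an ordered enum, running roughly from D_DISKLESS at the bottom through D_INCONSISTENT and D_OUTDATED up to D_UP_TO_DATE, so "no better than Outdated" is expressible as a one-line clamp (as in the hunk below):

	/* never report the peer disk better than it was; states already
	 * below Outdated are left alone */
	ns.pdsk = os.pdsk > D_OUTDATED ? D_OUTDATED : os.pdsk;

As the next patch in this series shows, this first version clamps too much and has to be refined, since it also pins a Diskless peer and thereby breaks attach-while-connected.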
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index db7a07a9a2cf..8f84a9f58c99 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -840,7 +840,7 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state break; case C_WF_BITMAP_S: case C_PAUSED_SYNC_S: - ns.pdsk = D_OUTDATED; + ns.pdsk = os.pdsk > D_OUTDATED ? D_OUTDATED : os.pdsk; break; case C_SYNC_SOURCE: ns.pdsk = D_INCONSISTENT; -- cgit v1.2.3-59-g8ed1b From e0f83012dc510b0be92ee2d59227a573a36777b8 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 1 Apr 2010 15:13:19 +0200 Subject: drbd: fix regression: attach while connected failed commit e4f925e12ea5daaa9baf2dd5af9c4951721dae95 Author: Philipp Reisner Date: Wed Mar 17 14:18:41 2010 +0100 drbd: Do not upgrade state to Outdated if already Inconsistent prevented the necessary state transition for attaching while connected (Diskless -> Consistent respectively Outdated). This is the fix for the fix. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 8f84a9f58c99..b1ce5dc7c603 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -840,7 +840,12 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state break; case C_WF_BITMAP_S: case C_PAUSED_SYNC_S: - ns.pdsk = os.pdsk > D_OUTDATED ? D_OUTDATED : os.pdsk; + /* remap any consistent state to D_OUTDATED, + * but disallow "upgrade" of not even consistent states. + */ + ns.pdsk = + (D_DISKLESS < os.pdsk && os.pdsk < D_OUTDATED) + ? os.pdsk : D_OUTDATED; break; case C_SYNC_SOURCE: ns.pdsk = D_INCONSISTENT; -- cgit v1.2.3-59-g8ed1b From 8d1894ebe441093cfd967affcbc56b764960575e Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 1 Apr 2010 16:55:18 +0200 Subject: drbd: remove bogus ASSERT block_id may be ID_SYNCER, as well as checksum based resync request magic, or online verify magic. Let's just drop that ASSERT. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_receiver.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index c786023001d2..93106fb92be8 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -4233,7 +4233,6 @@ static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h) sector = be64_to_cpu(p->sector); size = be32_to_cpu(p->blksize); - D_ASSERT(p->block_id == ID_SYNCER); update_peer_seq(mdev, be32_to_cpu(p->seq_num)); -- cgit v1.2.3-59-g8ed1b From c3470cde57ea34d9b4bd34891ec040e46b9fb3bf Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 1 Apr 2010 16:57:19 +0200 Subject: drbd: fix potential protocol error Don't forget to drain the digest in case we cannot satisfy a checksum based resync or online-verify request. It would additionally cause a protocoll error, dropping the connection. 
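To make the failure mode concrete (an informal sketch, not a hunk): a checksum-based resync or online-verify request carries its digest as payload after the packet header. Replying with a negative ack while leaving that payload unread means the next header gets parsed out of digest bytes, which is exactly the protocol error this patch prevents. So the payload is consumed even when the request cannot be served:

	/* cannot serve the request locally: still drain the remaining
	 * payload (the digest) so the receive stream stays aligned with
	 * the packet framing */
	drbd_send_ack_rp(mdev, P_NEG_RS_DREPLY, p);
	return drbd_drain_block(mdev, h->length - brps);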
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_receiver.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 93106fb92be8..c7285e16b667 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1331,6 +1331,9 @@ static int drbd_drain_block(struct drbd_conf *mdev, int data_size) int rr, rv = 1; void *data; + if (!data_size) + return TRUE; + page = drbd_pp_alloc(mdev, 1); data = kmap(page); @@ -1946,7 +1949,7 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) "no local data.\n"); drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY : P_NEG_RS_DREPLY , p); - return TRUE; + return drbd_drain_block(mdev, h->length - brps); } /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD -- cgit v1.2.3-59-g8ed1b From 8d4ce82b3ccd755c8ba401469ced5286b1e02284 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 1 Apr 2010 16:59:32 +0200 Subject: drbd: don't start a resync without access to up-to-date Data In case both nodes are "inconsistent", invalidate would have started a resync anyways, without a chance to ever succeed, just filling the logs with warning messages. Simply disallow that state change, re-using the SS_NO_UP_TO_DATE_DISK return value. This also changes the corresponding error string to "Need access to UpToDate Data" -- I found the "Refusing to be Primary without at least one UpToDate disk" answer misleading in some situations anyways. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 3 +++ drivers/block/drbd/drbd_strings.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index b1ce5dc7c603..e181e405e244 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -684,6 +684,9 @@ static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns) else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT) rv = SS_NO_REMOTE_DISK; + else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) + rv = SS_NO_UP_TO_DATE_DISK; + else if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S || ns.conn == C_SYNC_SOURCE || diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c index 76863e3f05be..85179e1fb50a 100644 --- a/drivers/block/drbd/drbd_strings.c +++ b/drivers/block/drbd/drbd_strings.c @@ -70,7 +70,7 @@ static const char *drbd_disk_s_names[] = { static const char *drbd_state_sw_errors[] = { [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config", - [-SS_NO_UP_TO_DATE_DISK] = "Refusing to be Primary without at least one UpToDate disk", + [-SS_NO_UP_TO_DATE_DISK] = "Need access to UpToDate data", [-SS_NO_LOCAL_DISK] = "Can not resync without local disk", [-SS_NO_REMOTE_DISK] = "Can not resync without remote disk", [-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected", -- cgit v1.2.3-59-g8ed1b From c3fe30b0e7cd67e0207097f5f39ce9626644879e Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 1 Apr 2010 09:57:40 +0200 Subject: drbd: cleanup: This code path to trigger a resync is no longer needed Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_receiver.c | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c 
index c7285e16b667..c3504ddd59c1 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2853,7 +2853,6 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) unsigned int max_seg_s; sector_t p_size, p_usize, my_usize; int ldsc = 0; /* local disk size changed */ - enum drbd_conns nconn; ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; if (drbd_recv(mdev, h->payload, h->length) != h->length) @@ -2920,22 +2919,6 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) drbd_set_my_capacity(mdev, p_size); } - if (mdev->p_uuid && mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) { - nconn = drbd_sync_handshake(mdev, - mdev->state.peer, mdev->state.pdsk); - put_ldev(mdev); - - if (nconn == C_MASK) { - drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); - return FALSE; - } - - if (drbd_request_state(mdev, NS(conn, nconn)) < SS_SUCCESS) { - drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); - return FALSE; - } - } - if (get_ldev(mdev)) { if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); -- cgit v1.2.3-59-g8ed1b From 6666032ade5a758aa05380ab92f416ab8ef25005 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 6 Apr 2010 12:15:04 +0200 Subject: drbd: check for corrupt or malicous sector addresses when receiving data Even if it should never happen if the peer does behave, we need to double check, and not even attempt access beyond end of device. It usually would be caught by lower layers, resulting in "IO error", but may also end up in the internal meta data area. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_receiver.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index c3504ddd59c1..3a36bc814e77 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1262,6 +1262,7 @@ static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h) static struct drbd_epoch_entry * read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local) { + const sector_t capacity = drbd_get_capacity(mdev->this_bdev); struct drbd_epoch_entry *e; struct bio_vec *bvec; struct page *page; @@ -1287,6 +1288,15 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __ ERR_IF(data_size & 0x1ff) return NULL; ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return NULL; + /* even though we trust out peer, + * we sometimes have to double check. */ + if (sector + (data_size>>9) > capacity) { + dev_err(DEV, "capacity: %llus < sector: %llus + size: %u\n", + (unsigned long long)capacity, + (unsigned long long)sector, data_size); + return NULL; + } + /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD * "criss-cross" setup, that might cause write-out on some other DRBD, * which in turn might block on the other node at this very place. */ -- cgit v1.2.3-59-g8ed1b From 979f5c7f1f6c8a532b943defb790d43b999934eb Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 6 Apr 2010 14:15:06 +0200 Subject: drbd: fail_requests_early: remove incorrect and unnecessary optimization The condition does not fit the commend (I may well be Primary, even if I lost the disk earlier and now the connection). And this is catched below anyways, where it also gets logged. 
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_req.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index de81ab7b4627..d8d9bbfca3b8 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -962,11 +962,6 @@ fail_and_free_req: */ static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write) { - /* Unconfigured */ - if (mdev->state.conn == C_DISCONNECTING && - mdev->state.disk == D_DISKLESS) - return 1; - if (mdev->state.role != R_PRIMARY && (!allow_oos || is_write)) { if (__ratelimit(&drbd_ratelimit_state)) { -- cgit v1.2.3-59-g8ed1b From 3a11a4878939e0e3c355bf3f52ef642a4cb6ba84 Mon Sep 17 00:00:00 2001 From: Adam Gandelman Date: Thu, 8 Apr 2010 16:48:23 -0700 Subject: drbd: New handler: initial-split-brain Some wish to be notified of all instances of split brain, not just those that go unresolved. The initial-split-brain handler is called to notify someone upon detection of all split brain conditions even if auto-recovery policies are configured. Signed-off-by: Adam Gandelman Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_receiver.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 3a36bc814e77..6876041fc3d8 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2487,6 +2487,9 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol hg > 0 ? "source" : "target"); } + if (abs(hg) == 100) + drbd_khelper(mdev, "initial-split-brain"); + if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) { int pcount = (mdev->state.role == R_PRIMARY) + (peer_role == R_PRIMARY); @@ -2532,7 +2535,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol * after an attempted attach on a diskless node. * We just refuse to attach -- well, we drop the "connection" * to that disk, in a way... 
*/ - dev_alert(DEV, "Split-Brain detected, dropping connection!\n"); + dev_alert(DEV, "Split-Brain detected but unresolved, dropping connection!\n"); drbd_khelper(mdev, "split-brain"); return C_MASK; } -- cgit v1.2.3-59-g8ed1b From d845030f21859dd11bcecc7e1b8575fb845eb425 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 24 Mar 2010 15:51:26 +0100 Subject: drbd: made determin_dev_size's parameter an flag enum Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 6 +++++- drivers/block/drbd/drbd_nl.c | 6 +++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index d6f1ae342b1d..f5c56f4fd2a3 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1382,8 +1382,12 @@ extern void drbd_suspend_io(struct drbd_conf *mdev); extern void drbd_resume_io(struct drbd_conf *mdev); extern char *ppsize(char *buf, unsigned long long size); extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int); +enum dds_flags { + DDSF_FORCED = 1, + DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */ +}; enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; -extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, int force) __must_hold(local); +extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local); extern void resync_after_online_grow(struct drbd_conf *); extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local); extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 6429d2b19e06..97abbc2acc5c 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -510,7 +510,7 @@ void drbd_resume_io(struct drbd_conf *mdev) * Returns 0 on success, negative return values indicate errors. * You should call drbd_md_sync() after calling this function. */ -enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, int force) __must_hold(local) +enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) { sector_t prev_first_sect, prev_size; /* previous meta location */ sector_t la_size; @@ -541,7 +541,7 @@ enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, int force /* TODO: should only be some assert here, not (re)init... */ drbd_md_set_sector_offsets(mdev, mdev->ldev); - size = drbd_new_dev_size(mdev, mdev->ldev, force); + size = drbd_new_dev_size(mdev, mdev->ldev, flags & DDSF_FORCED); if (drbd_get_capacity(mdev->this_bdev) != size || drbd_bm_capacity(mdev) != size) { @@ -1508,7 +1508,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, } mdev->ldev->dc.disk_size = (sector_t)rs.resize_size; - dd = drbd_determin_dev_size(mdev, rs.resize_force); + dd = drbd_determin_dev_size(mdev, rs.resize_force ? 
DDSF_FORCED : 0); drbd_md_sync(mdev); put_ldev(mdev); if (dd == dev_size_error) { -- cgit v1.2.3-59-g8ed1b From 02d9a94bbb0d4e0fec8db6735bdc4ccfaac8f0ce Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 24 Mar 2010 16:23:03 +0100 Subject: drbd: Implemented the set_new_bits parameter for drbd_bm_resize() Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_bitmap.c | 10 +++++++--- drivers/block/drbd/drbd_int.h | 2 +- drivers/block/drbd/drbd_main.c | 2 +- drivers/block/drbd/drbd_nl.c | 2 +- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 3390716898d5..695fb64cba00 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -441,7 +441,7 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len) * In case this is actually a resize, we copy the old bitmap into the new one. * Otherwise, the bitmap is initialized to all bits set. */ -int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity) +int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits) { struct drbd_bitmap *b = mdev->bitmap; unsigned long bits, words, owords, obits, *p_addr, *bm; @@ -526,8 +526,12 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity) b->bm_dev_capacity = capacity; if (growing) { - bm_memset(b, owords, 0xff, words-owords); - b->bm_set += bits - obits; + if (set_new_bits) { + bm_memset(b, owords, 0xff, words-owords); + b->bm_set += bits - obits; + } else + bm_memset(b, owords, 0x00, words-owords); + } if (want < have) { diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index f5c56f4fd2a3..37a25a6084dd 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1311,7 +1311,7 @@ struct bm_extent { #define APP_R_HSIZE 15 extern int drbd_bm_init(struct drbd_conf *mdev); -extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors); +extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new_bits); extern void drbd_bm_cleanup(struct drbd_conf *mdev); extern void drbd_bm_set_all(struct drbd_conf *mdev); extern void drbd_bm_clear_all(struct drbd_conf *mdev); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index e181e405e244..65c2a65d3d64 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2688,7 +2688,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev) drbd_set_my_capacity(mdev, 0); if (mdev->bitmap) { /* maybe never allocated. */ - drbd_bm_resize(mdev, 0); + drbd_bm_resize(mdev, 0, 1); drbd_bm_cleanup(mdev); } diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 97abbc2acc5c..360e506426b0 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -546,7 +546,7 @@ enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, enum dds_ if (drbd_get_capacity(mdev->this_bdev) != size || drbd_bm_capacity(mdev) != size) { int err; - err = drbd_bm_resize(mdev, size); + err = drbd_bm_resize(mdev, size, !(flags & DDSF_NO_RESYNC)); if (unlikely(err)) { /* currently there is only one error: ENOMEM! 
*/ size = drbd_bm_capacity(mdev)>>1; -- cgit v1.2.3-59-g8ed1b From e89b591c3aba0af87f5248b15f56ce7a4f439c16 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 24 Mar 2010 17:11:33 +0100 Subject: drbd: Implemented flags for the resize packet Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 14 ++++++++------ drivers/block/drbd/drbd_main.c | 8 ++++---- drivers/block/drbd/drbd_nl.c | 2 +- drivers/block/drbd/drbd_receiver.c | 19 ++++++++++++------- 4 files changed, 25 insertions(+), 18 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 37a25a6084dd..e09132483980 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -481,7 +481,8 @@ struct p_sizes { u64 u_size; /* user requested size */ u64 c_size; /* current exported size */ u32 max_segment_size; /* Maximal size of a BIO */ - u32 queue_order_type; + u16 queue_order_type; /* not yet implemented in DRBD*/ + u16 dds_flags; /* use enum dds_flags here. */ } __packed; struct p_state { @@ -1081,6 +1082,11 @@ enum chg_state_flags { CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE, }; +enum dds_flags { + DDSF_FORCED = 1, + DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */ +}; + extern void drbd_init_set_defaults(struct drbd_conf *mdev); extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, union drbd_state mask, union drbd_state val); @@ -1113,7 +1119,7 @@ extern int drbd_send_protocol(struct drbd_conf *mdev); extern int drbd_send_uuids(struct drbd_conf *mdev); extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val); -extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply); +extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags); extern int _drbd_send_state(struct drbd_conf *mdev); extern int drbd_send_state(struct drbd_conf *mdev); extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, @@ -1382,10 +1388,6 @@ extern void drbd_suspend_io(struct drbd_conf *mdev); extern void drbd_resume_io(struct drbd_conf *mdev); extern char *ppsize(char *buf, unsigned long long size); extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int); -enum dds_flags { - DDSF_FORCED = 1, - DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */ -}; enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local); extern void resync_after_online_grow(struct drbd_conf *); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 65c2a65d3d64..a478dad82dae 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1240,7 +1240,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */ mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */ - drbd_send_sizes(mdev, 0); /* to start sync... */ + drbd_send_sizes(mdev, 0, 0); /* to start sync... 
*/ drbd_send_uuids(mdev); drbd_send_state(mdev); } @@ -1763,7 +1763,7 @@ int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val) (struct p_header *)&p, sizeof(p)); } -int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply) +int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags) { struct p_sizes p; sector_t d_size, u_size; @@ -1775,7 +1775,6 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply) d_size = drbd_get_max_capacity(mdev->ldev); u_size = mdev->ldev->dc.disk_size; q_order_type = drbd_queue_order_type(mdev); - p.queue_order_type = cpu_to_be32(drbd_queue_order_type(mdev)); put_ldev(mdev); } else { d_size = 0; @@ -1787,7 +1786,8 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply) p.u_size = cpu_to_be64(u_size); p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue)); - p.queue_order_type = cpu_to_be32(q_order_type); + p.queue_order_type = cpu_to_be16(q_order_type); + p.dds_flags = cpu_to_be16(flags); ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES, (struct p_header *)&p, sizeof(p)); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 360e506426b0..6f7933376a11 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1521,7 +1521,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, set_bit(RESIZE_PENDING, &mdev->flags); drbd_send_uuids(mdev); - drbd_send_sizes(mdev, 1); + drbd_send_sizes(mdev, 1, 0); } fail: diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 6876041fc3d8..11b1bafde28b 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -902,7 +902,7 @@ retry: if (!drbd_send_protocol(mdev)) return -1; drbd_send_sync_param(mdev, &mdev->sync_conf); - drbd_send_sizes(mdev, 0); + drbd_send_sizes(mdev, 0, 0); drbd_send_uuids(mdev); drbd_send_state(mdev); clear_bit(USE_DEGR_WFC_T, &mdev->flags); @@ -2866,6 +2866,7 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) unsigned int max_seg_s; sector_t p_size, p_usize, my_usize; int ldsc = 0; /* local disk size changed */ + enum dds_flags ddsf; ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; if (drbd_recv(mdev, h->payload, h->length) != h->length) @@ -2921,8 +2922,9 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) } #undef min_not_zero + ddsf = be16_to_cpu(p->dds_flags); if (get_ldev(mdev)) { - dd = drbd_determin_dev_size(mdev, 0); + dd = drbd_determin_dev_size(mdev, ddsf); put_ldev(mdev); if (dd == dev_size_error) return FALSE; @@ -2942,7 +2944,7 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) if (max_seg_s != queue_max_segment_size(mdev->rq_queue)) drbd_setup_queue_param(mdev, max_seg_s); - drbd_setup_order_type(mdev, be32_to_cpu(p->queue_order_type)); + drbd_setup_order_type(mdev, be16_to_cpu(p->queue_order_type)); put_ldev(mdev); } @@ -2951,14 +2953,17 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) drbd_get_capacity(mdev->this_bdev) || ldsc) { /* we have different sizes, probably peer * needs to know my new size... 
*/ - drbd_send_sizes(mdev, 0); + drbd_send_sizes(mdev, 0, ddsf); } if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) || (dd == grew && mdev->state.conn == C_CONNECTED)) { if (mdev->state.pdsk >= D_INCONSISTENT && - mdev->state.disk >= D_INCONSISTENT) - resync_after_online_grow(mdev); - else + mdev->state.disk >= D_INCONSISTENT) { + if (ddsf & DDSF_NO_RESYNC) + dev_info(DEV, "Resync of new storage suppressed with --assume-clean\n"); + else + resync_after_online_grow(mdev); + } else set_bit(RESYNC_AFTER_NEG, &mdev->flags); } } -- cgit v1.2.3-59-g8ed1b From fd76438c2421324fa2fb9303e760ec5332ff0b58 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 1 Apr 2010 09:57:40 +0200 Subject: drbd: Make sure to resync all of the new storage upon online resize Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_bitmap.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 695fb64cba00..178cf1642b2d 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -84,6 +84,9 @@ struct drbd_bitmap { #define BM_MD_IO_ERROR 1 #define BM_P_VMALLOCED 2 +int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, + unsigned long e, int val, const enum km_type km); + static int bm_is_locked(struct drbd_bitmap *b) { return test_bit(BM_LOCKED, &b->bm_flags); @@ -529,6 +532,9 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits) if (set_new_bits) { bm_memset(b, owords, 0xff, words-owords); b->bm_set += bits - obits; + __bm_change_bits_to(mdev, obits, + ALIGN(obits, BITS_PER_LONG), + 1, KM_IRQ1); } else bm_memset(b, owords, 0x00, words-owords); -- cgit v1.2.3-59-g8ed1b From b4ee79dac3bddc468e21cae0deb00b80ec4ac051 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 1 Apr 2010 09:57:40 +0200 Subject: drbd: Added some missing statics Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_bitmap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 178cf1642b2d..aa7e23cf7e22 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -84,7 +84,7 @@ struct drbd_bitmap { #define BM_MD_IO_ERROR 1 #define BM_P_VMALLOCED 2 -int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, +static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, unsigned long e, int val, const enum km_type km); static int bm_is_locked(struct drbd_bitmap *b) @@ -783,7 +783,7 @@ static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int /* nothing to do, on disk == in memory */ # define bm_cpu_to_lel(x) ((void)0) # else -void bm_cpu_to_lel(struct drbd_bitmap *b) +static void bm_cpu_to_lel(struct drbd_bitmap *b) { /* need to cpu_to_lel all the pages ... * this may be optimized by using @@ -1025,7 +1025,7 @@ unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_f * wants bitnr, not sector. * expected to be called for only a few bits (e - s about BITS_PER_LONG). * Must hold bitmap lock already. 
*/ -int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, +static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, unsigned long e, int val, const enum km_type km) { struct drbd_bitmap *b = mdev->bitmap; @@ -1063,7 +1063,7 @@ int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, * for val != 0, we change 0 -> 1, return code positive * for val == 0, we change 1 -> 0, return code negative * wants bitnr, not sector */ -int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, +static int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, const unsigned long e, int val) { unsigned long flags; -- cgit v1.2.3-59-g8ed1b From 6495d2c6d04f4c45411fdb1b40527c24015f39d6 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 24 Mar 2010 16:07:04 +0100 Subject: drbd: Implemented the --assume-clean option for drbdsetup resize Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_nl.c | 11 +++++++++-- include/linux/drbd.h | 3 ++- include/linux/drbd_nl.h | 1 + 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 6f7933376a11..19b9a2851e7b 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1479,6 +1479,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, int retcode = NO_ERROR; int ldsc = 0; /* local disk size changed */ enum determine_dev_size dd; + enum dds_flags ddsf; memset(&rs, 0, sizeof(struct resize)); if (!resize_from_tags(mdev, nlp->tag_list, &rs)) { @@ -1502,13 +1503,19 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, goto fail; } + if (rs.no_resync && mdev->agreed_pro_version < 93) { + retcode = ERR_NEED_APV_93; + goto fail; + } + if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); ldsc = 1; } mdev->ldev->dc.disk_size = (sector_t)rs.resize_size; - dd = drbd_determin_dev_size(mdev, rs.resize_force ? DDSF_FORCED : 0); + ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? 
DDSF_NO_RESYNC : 0); + dd = drbd_determin_dev_size(mdev, ddsf); drbd_md_sync(mdev); put_ldev(mdev); if (dd == dev_size_error) {
@@ -1521,7 +1528,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, set_bit(RESIZE_PENDING, &mdev->flags); drbd_send_uuids(mdev); - drbd_send_sizes(mdev, 1, 0); + drbd_send_sizes(mdev, 1, ddsf); } fail:
diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 4341b1a97a34..7944dd36ee3e 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h
@@ -56,7 +56,7 @@ extern const char *drbd_buildtag(void); #define REL_VERSION "8.3.7" #define API_VERSION 88 #define PRO_VERSION_MIN 86 -#define PRO_VERSION_MAX 92 +#define PRO_VERSION_MAX 93 enum drbd_io_error_p {
@@ -139,6 +139,7 @@ enum drbd_ret_codes { ERR_DATA_NOT_CURRENT = 150, ERR_CONNECTED = 151, /* DRBD 8.3 only */ ERR_PERM = 152, + ERR_NEED_APV_93 = 153, /* insert new ones above this line */ AFTER_LAST_ERR_CODE
diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h index f7431a4ca608..de055c07c2ba 100644 --- a/include/linux/drbd_nl.h +++ b/include/linux/drbd_nl.h
@@ -71,6 +71,7 @@ NL_PACKET(disconnect, 6, ) NL_PACKET(resize, 7, NL_INT64( 29, T_MAY_IGNORE, resize_size) NL_BIT( 68, T_MAY_IGNORE, resize_force) + NL_BIT( 69, T_MANDATORY, no_resync) ) NL_PACKET(syncer_conf, 8,
-- cgit v1.2.3-59-g8ed1b
From 087c24925cf4209be1a91f8ede9241e17e9734c7 Mon Sep 17 00:00:00 2001
From: Philipp Reisner
Date: Fri, 26 Mar 2010 13:49:56 +0100
Subject: drbd: bugfix: Make resize work if remote's size was limiting and increased in the meantime
Signed-off-by: Philipp Reisner
Signed-off-by: Lars Ellenberg
--- drivers/block/drbd/drbd_nl.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 19b9a2851e7b..6cb703686b28 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c
@@ -1477,7 +1477,6 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, { struct resize rs; int retcode = NO_ERROR; - int ldsc = 0; /* local disk size changed */ enum determine_dev_size dd; enum dds_flags ddsf;
@@ -1508,10 +1507,8 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, goto fail; } - if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { + if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); - ldsc = 1; - } mdev->ldev->dc.disk_size = (sector_t)rs.resize_size; ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ?
DDSF_NO_RESYNC : 0); @@ -1523,7 +1520,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, goto fail; } - if (mdev->state.conn == C_CONNECTED && (dd != unchanged || ldsc)) { + if (mdev->state.conn == C_CONNECTED) { if (dd == grew) set_bit(RESIZE_PENDING, &mdev->flags); -- cgit v1.2.3-59-g8ed1b From 6b4388ac1f282515db3a651707238cad00b50e80 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 26 Apr 2010 14:11:45 +0200 Subject: drbd: Added transmission faults to the fault injection code Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 1 + drivers/block/drbd/drbd_main.c | 3 ++- drivers/block/drbd/drbd_receiver.c | 8 +++++++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index e09132483980..2409de12f013 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -132,6 +132,7 @@ enum { DRBD_FAULT_DT_RA = 6, /* data read ahead */ DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */ DRBD_FAULT_AL_EE = 8, /* alloc ee */ + DRBD_FAULT_RECEIVE = 9, /* Changes some bytes upon receiving a [rs]data block */ DRBD_FAULT_MAX, }; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index a478dad82dae..7468d2ce7347 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3668,7 +3668,8 @@ _drbd_fault_str(unsigned int type) { [DRBD_FAULT_DT_RD] = "Data read", [DRBD_FAULT_DT_RA] = "Data read ahead", [DRBD_FAULT_BM_ALLOC] = "BM allocation", - [DRBD_FAULT_AL_EE] = "EE allocation" + [DRBD_FAULT_AL_EE] = "EE allocation", + [DRBD_FAULT_RECEIVE] = "receive data corruption", }; return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**"; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 11b1bafde28b..b27f4dd36f89 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1270,6 +1270,7 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __ int dgs, ds, i, rr; void *dig_in = mdev->int_dig_in; void *dig_vv = mdev->int_dig_vv; + unsigned long *data; dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; @@ -1307,7 +1308,12 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __ ds = data_size; bio_for_each_segment(bvec, bio, i) { page = bvec->bv_page; - rr = drbd_recv(mdev, kmap(page), min_t(int, ds, PAGE_SIZE)); + data = kmap(page); + rr = drbd_recv(mdev, data, min_t(int, ds, PAGE_SIZE)); + if (FAULT_ACTIVE(mdev, DRBD_FAULT_RECEIVE)) { + dev_err(DEV, "Fault injection: Corrupting data on receive\n"); + data[0] = data[0] ^ (unsigned long)-1; + } kunmap(page); if (rr != min_t(int, ds, PAGE_SIZE)) { drbd_free_ee(mdev, e); -- cgit v1.2.3-59-g8ed1b From 5223671bb0315d83f9ad7becbbb9e703aa735bbe Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 28 Apr 2010 14:46:57 +0200 Subject: drbd: Fixed bitmap in case of online-grow without resync The "surplus" bits of the old (smaller) bitmap must be clean in case of online-grow without resync. Note: Reverted 67ae8b80d4a116ab3b7094eb3723506b20c06dff as well, since the lines added by this patch are redundant. The bits get set by the bm_set_surplus(b) call before that. 
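For illustration, the grow path of drbd_bm_resize() then behaves as in the following condensed sketch (a simplification of the patched function, not a literal quote; installing the new pages and updating the word/bit counts is elided as a comment):

	/* old surplus bits are only set when the new space will be
	 * marked out-of-sync anyway */
	if (opages && growing && set_new_bits)
		bm_set_surplus(b);

	/* ... new pages are installed, word/bit counts updated ... */

	if (growing) {
		if (set_new_bits) {
			/* online grow with resync: new space is out-of-sync */
			bm_memset(b, owords, 0xff, words - owords);
			b->bm_set += bits - obits;
		} else {
			/* online grow with --assume-clean (DDSF_NO_RESYNC):
			 * new space starts in-sync, surplus bits stay clean */
			bm_memset(b, owords, 0x00, words - owords);
		}
	}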
Signed-off-by: Philipp Reisner
Signed-off-by: Lars Ellenberg
--- drivers/block/drbd/drbd_bitmap.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index aa7e23cf7e22..e3f88d6e1412 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c
@@ -519,7 +519,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits) obits = b->bm_bits; growing = bits > obits; - if (opages) + if (opages && growing && set_new_bits) bm_set_surplus(b); b->bm_pages = npages;
@@ -532,9 +532,6 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits) if (set_new_bits) { bm_memset(b, owords, 0xff, words-owords); b->bm_set += bits - obits; - __bm_change_bits_to(mdev, obits, - ALIGN(obits, BITS_PER_LONG), - 1, KM_IRQ1); } else bm_memset(b, owords, 0x00, words-owords);
-- cgit v1.2.3-59-g8ed1b
From 0ced55a3bed25b0e30dcb3c7dce9634ce3c60cf2 Mon Sep 17 00:00:00 2001
From: Philipp Reisner
Date: Fri, 30 Apr 2010 15:26:20 +0200
Subject: drbd: Receiving of delay_probes
Delay_probes are new packets in the DRBD protocol that allow DRBD to measure the delay that packets currently experience on the data socket, relative to the meta-data socket.
Signed-off-by: Philipp Reisner
Signed-off-by: Lars Ellenberg
--- drivers/block/drbd/drbd_int.h | 20 +++++++- drivers/block/drbd/drbd_main.c | 3 ++ drivers/block/drbd/drbd_receiver.c | 96 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 118 insertions(+), 1 deletion(-)
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 2409de12f013..fd7615f1e526 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h
@@ -209,8 +209,11 @@ enum drbd_packets { P_RS_IS_IN_SYNC = 0x22, /* meta socket */ P_SYNC_PARAM89 = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */ P_COMPRESSED_BITMAP = 0x24, /* compressed or otherwise encoded bitmap transfer */ + /* P_CKPT_FENCE_REQ = 0x25, * currently reserved for protocol D */ + /* P_CKPT_DISABLE_REQ = 0x26, * currently reserved for protocol D */ + P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */ - P_MAX_CMD = 0x25, + P_MAX_CMD = 0x28, P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ P_MAX_OPT_CMD = 0x101,
@@ -540,6 +543,18 @@ struct p_compressed_bm { u8 code[0]; } __packed; +struct p_delay_probe { + struct p_header head; + u32 seq_num; /* sequence number to match the two probe packets */ + u32 offset; /* usecs the probe got sent after the reference time point */ +} __packed; + +struct delay_probe { + struct list_head list; + int seq_num; + struct timeval time; +}; + /* DCBP: Drbd Compressed Bitmap Packet ... */ static inline enum drbd_bitmap_code DCBP_get_code(struct p_compressed_bm *p)
@@ -1028,6 +1043,9 @@ struct drbd_conf { u64 ed_uuid; /* UUID of the exposed data */ struct mutex state_mutex; char congestion_reason; /* Why we where congested...
*/ + struct list_head delay_probes; /* protected by peer_seq_lock */ + int data_delay; /* Delay of packets on the data-sock behind meta-sock */ + atomic_t delay_seq; /* To generate sequence numbers of delay probes */ }; static inline struct drbd_conf *minor_to_mdev(unsigned int minor) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 7468d2ce7347..3d5fe307a19d 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2608,6 +2608,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) atomic_set(&mdev->net_cnt, 0); atomic_set(&mdev->packet_seq, 0); atomic_set(&mdev->pp_in_use, 0); + atomic_set(&mdev->delay_seq, 0); mutex_init(&mdev->md_io_mutex); mutex_init(&mdev->data.mutex); @@ -2636,6 +2637,8 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) INIT_LIST_HEAD(&mdev->unplug_work.list); INIT_LIST_HEAD(&mdev->md_sync_work.list); INIT_LIST_HEAD(&mdev->bm_io_work.w.list); + INIT_LIST_HEAD(&mdev->delay_probes); + mdev->resync_work.cb = w_resync_inactive; mdev->unplug_work.cb = w_send_write_hint; mdev->md_sync_work.cb = w_md_sync; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index b27f4dd36f89..fee0d249adf7 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3501,6 +3501,92 @@ static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h) return TRUE; } +static void timeval_sub_us(struct timeval* tv, unsigned int us) +{ + tv->tv_sec -= us / 1000000; + us = us % 1000000; + if (tv->tv_usec > us) { + tv->tv_usec += 1000000; + tv->tv_sec--; + } + tv->tv_usec -= us; +} + +static void got_delay_probe(struct drbd_conf *mdev, int from, struct p_delay_probe *p) +{ + struct delay_probe *dp; + struct list_head *le; + struct timeval now; + int seq_num; + int offset; + int data_delay; + + seq_num = be32_to_cpu(p->seq_num); + offset = be32_to_cpu(p->offset); + + spin_lock(&mdev->peer_seq_lock); + if (!list_empty(&mdev->delay_probes)) { + if (from == USE_DATA_SOCKET) + le = mdev->delay_probes.next; + else + le = mdev->delay_probes.prev; + + dp = list_entry(le, struct delay_probe, list); + + if (dp->seq_num == seq_num) { + list_del(le); + spin_unlock(&mdev->peer_seq_lock); + do_gettimeofday(&now); + timeval_sub_us(&now, offset); + data_delay = + now.tv_usec - dp->time.tv_usec + + (now.tv_sec - dp->time.tv_sec) * 1000000; + + if (data_delay > 0) + mdev->data_delay = data_delay; + + kfree(dp); + return; + } + + if (dp->seq_num > seq_num) { + spin_unlock(&mdev->peer_seq_lock); + dev_warn(DEV, "Previous allocation failure of struct delay_probe?\n"); + return; /* Do not alloca a struct delay_probe.... 
*/ + } + } + spin_unlock(&mdev->peer_seq_lock); + + dp = kmalloc(sizeof(struct delay_probe), GFP_NOIO); + if (!dp) { + dev_warn(DEV, "Failed to allocate a struct delay_probe, do not worry.\n"); + return; + } + + dp->seq_num = seq_num; + do_gettimeofday(&dp->time); + timeval_sub_us(&dp->time, offset); + + spin_lock(&mdev->peer_seq_lock); + if (from == USE_DATA_SOCKET) + list_add(&dp->list, &mdev->delay_probes); + else + list_add_tail(&dp->list, &mdev->delay_probes); + spin_unlock(&mdev->peer_seq_lock); +} + +static int receive_delay_probe(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_delay_probe *p = (struct p_delay_probe *)h; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + got_delay_probe(mdev, USE_DATA_SOCKET, p); + return TRUE; +} + typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *); static drbd_cmd_handler_f drbd_default_handler[] = { @@ -3524,6 +3610,7 @@ static drbd_cmd_handler_f drbd_default_handler[] = { [P_OV_REQUEST] = receive_DataRequest, [P_OV_REPLY] = receive_DataRequest, [P_CSUM_RS_REQUEST] = receive_DataRequest, + [P_DELAY_PROBE] = receive_delay_probe, /* anything missing from this table is in * the asender_tbl, see get_asender_cmd */ [P_MAX_CMD] = NULL, @@ -4300,6 +4387,14 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header *h) return TRUE; } +static int got_delay_probe_m(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_delay_probe *p = (struct p_delay_probe *)h; + + got_delay_probe(mdev, USE_META_SOCKET, p); + return TRUE; +} + struct asender_cmd { size_t pkt_size; int (*process)(struct drbd_conf *mdev, struct p_header *h); @@ -4324,6 +4419,7 @@ static struct asender_cmd *get_asender_cmd(int cmd) [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, + [P_DELAY_PROBE] = { sizeof(struct p_delay_probe), got_delay_probe_m }, [P_MAX_CMD] = { 0, NULL }, }; if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL) -- cgit v1.2.3-59-g8ed1b From 7237bc430f49de1145d761c4b39f2ebae58842d5 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 3 May 2010 15:10:47 +0200 Subject: drbd: Sending of delay_probes Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 1 + drivers/block/drbd/drbd_main.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index fd7615f1e526..3e4d8b574fef 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1046,6 +1046,7 @@ struct drbd_conf { struct list_head delay_probes; /* protected by peer_seq_lock */ int data_delay; /* Delay of packets on the data-sock behind meta-sock */ atomic_t delay_seq; /* To generate sequence numbers of delay probes */ + struct timeval dps_time; /* delay-probes-start-time */ }; static inline struct drbd_conf *minor_to_mdev(unsigned int minor) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 3d5fe307a19d..710bfebb7b63 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2188,6 +2188,39 @@ int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size) return ok; } +static int drbd_send_delay_probe(struct drbd_conf *mdev, struct drbd_socket *ds) +{ + struct p_delay_probe dp; + int offset, ok = 
0; + struct timeval now; + + mutex_lock(&ds->mutex); + if (likely(ds->socket)) { + do_gettimeofday(&now); + offset = now.tv_usec - mdev->dps_time.tv_usec + + (now.tv_sec - mdev->dps_time.tv_sec) * 1000000; + dp.seq_num = cpu_to_be32(atomic_read(&mdev->delay_seq)); + dp.offset = cpu_to_be32(offset); + + ok = _drbd_send_cmd(mdev, ds->socket, P_DELAY_PROBE, + (struct p_header *)&dp, sizeof(dp), 0); + } + mutex_unlock(&ds->mutex); + + return ok; +} + +static int drbd_send_dalay_probes(struct drbd_conf *mdev) +{ + int ok; + atomic_inc(&mdev->delay_seq); + do_gettimeofday(&mdev->dps_time); + ok = drbd_send_delay_probe(mdev, &mdev->meta); + ok = ok && drbd_send_delay_probe(mdev, &mdev->data); + + return ok; +} + /* called on sndtimeo * returns FALSE if we should retry, * TRUE if we think connection is dead -- cgit v1.2.3-59-g8ed1b From 67c7ddd055c794f0d8e9466ca2d6b5cc0b73d4df Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 4 May 2010 11:12:00 +0200 Subject: drbd: Four new configuration settings for resync speed control To reasonably control resync speed over drbd-proxy connections, drbd has to measure the current delay of packets transmitted over the (possibly congested) data socket vs the meta-data socket. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_nl.c | 4 ++++ include/linux/drbd_limits.h | 16 ++++++++++++++++ include/linux/drbd_nl.h | 4 ++++ 3 files changed, 24 insertions(+) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 6cb703686b28..93d150661f4b 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1555,6 +1555,10 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n sc.rate = DRBD_RATE_DEF; sc.after = DRBD_AFTER_DEF; sc.al_extents = DRBD_AL_EXTENTS_DEF; + sc.dp_volume = DRBD_DP_VOLUME_DEF; + sc.dp_interval = DRBD_DP_INTERVAL_DEF; + sc.throttle_th = DRBD_RS_THROTTLE_TH_DEF; + sc.hold_off_th = DRBD_RS_HOLD_OFF_TH_DEF; } else memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf)); diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index 51f47a586ad8..440b42e38e89 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -133,5 +133,21 @@ #define DRBD_MAX_BIO_BVECS_MAX 128 #define DRBD_MAX_BIO_BVECS_DEF 0 +#define DRBD_DP_VOLUME_MIN 4 +#define DRBD_DP_VOLUME_MAX 1048576 +#define DRBD_DP_VOLUME_DEF 16384 + +#define DRBD_DP_INTERVAL_MIN 1 +#define DRBD_DP_INTERVAL_MAX 600 +#define DRBD_DP_INTERVAL_DEF 5 + +#define DRBD_RS_THROTTLE_TH_MIN 1 +#define DRBD_RS_THROTTLE_TH_MAX 600 +#define DRBD_RS_THROTTLE_TH_DEF 20 + +#define DRBD_RS_HOLD_OFF_TH_MIN 1 +#define DRBD_RS_HOLD_OFF_TH_MAX 6000 +#define DRBD_RS_HOLD_OFF_TH_DEF 100 + #undef RANGE #endif diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h index de055c07c2ba..ce77a746fc9d 100644 --- a/include/linux/drbd_nl.h +++ b/include/linux/drbd_nl.h @@ -78,6 +78,10 @@ NL_PACKET(syncer_conf, 8, NL_INTEGER( 30, T_MAY_IGNORE, rate) NL_INTEGER( 31, T_MAY_IGNORE, after) NL_INTEGER( 32, T_MAY_IGNORE, al_extents) + NL_INTEGER( 71, T_MAY_IGNORE, dp_volume) + NL_INTEGER( 72, T_MAY_IGNORE, dp_interval) + NL_INTEGER( 73, T_MAY_IGNORE, throttle_th) + NL_INTEGER( 74, T_MAY_IGNORE, hold_off_th) NL_STRING( 52, T_MAY_IGNORE, verify_alg, SHARED_SECRET_MAX) NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32) NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX) -- cgit v1.2.3-59-g8ed1b From bd26bfc5b4253425d17aa49648ae1f3e976041c4 Mon Sep 17 00:00:00 2001 From: Philipp Reisner 
Date: Tue, 4 May 2010 12:33:58 +0200 Subject: drbd: Actually send delay probes Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 6 +++++- drivers/block/drbd/drbd_main.c | 43 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 3e4d8b574fef..210870ed8a79 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -925,9 +925,11 @@ struct drbd_conf { unsigned int ko_count; struct drbd_work resync_work, unplug_work, - md_sync_work; + md_sync_work, + delay_probe_work; struct timer_list resync_timer; struct timer_list md_sync_timer; + struct timer_list delay_probe_timer; /* Used after attach while negotiating new disk state. */ union drbd_state new_state_tmp; @@ -1047,6 +1049,8 @@ struct drbd_conf { int data_delay; /* Delay of packets on the data-sock behind meta-sock */ atomic_t delay_seq; /* To generate sequence numbers of delay probes */ struct timeval dps_time; /* delay-probes-start-time */ + int dp_volume_last; /* send_cnt of last delay probe */ + int c_sync_rate; /* current resync rate after delay_probe magic */ }; static inline struct drbd_conf *minor_to_mdev(unsigned int minor) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 710bfebb7b63..98785d08cf70 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2207,10 +2207,13 @@ static int drbd_send_delay_probe(struct drbd_conf *mdev, struct drbd_socket *ds) } mutex_unlock(&ds->mutex); + mdev->dp_volume_last = mdev->send_cnt; + mod_timer(&mdev->delay_probe_timer, jiffies + mdev->sync_conf.dp_interval * HZ / 10); + return ok; } -static int drbd_send_dalay_probes(struct drbd_conf *mdev) +static int drbd_send_delay_probes(struct drbd_conf *mdev) { int ok; atomic_inc(&mdev->delay_seq); @@ -2350,6 +2353,30 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) return 1; } +static void consider_delay_probes(struct drbd_conf *mdev) +{ + if (mdev->state.conn != C_SYNC_SOURCE) + return; + + if (mdev->dp_volume_last + mdev->sync_conf.dp_volume * 2 < mdev->send_cnt) + drbd_send_delay_probes(mdev); +} + +static int w_delay_probes(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + if (!cancel && mdev->state.conn == C_SYNC_SOURCE) + drbd_send_delay_probes(mdev); + + return 1; +} + +static void delay_probe_timer_fn(unsigned long data) +{ + struct drbd_conf *mdev = (struct drbd_conf *) data; + + drbd_queue_work(&mdev->data.work, &mdev->delay_probe_work); +} + /* Used to send write requests * R_PRIMARY -> Peer (P_DATA) */ @@ -2412,6 +2439,10 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) } drbd_put_data_sock(mdev); + + if (ok) + consider_delay_probes(mdev); + return ok; } @@ -2457,6 +2488,10 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, ok = _drbd_send_zc_bio(mdev, e->private_bio); drbd_put_data_sock(mdev); + + if (ok) + consider_delay_probes(mdev); + return ok; } @@ -2671,17 +2706,23 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) INIT_LIST_HEAD(&mdev->md_sync_work.list); INIT_LIST_HEAD(&mdev->bm_io_work.w.list); INIT_LIST_HEAD(&mdev->delay_probes); + INIT_LIST_HEAD(&mdev->delay_probe_work.list); mdev->resync_work.cb = w_resync_inactive; mdev->unplug_work.cb = w_send_write_hint; mdev->md_sync_work.cb = w_md_sync; mdev->bm_io_work.w.cb = w_bitmap_io; + mdev->delay_probe_work.cb = w_delay_probes; 
init_timer(&mdev->resync_timer); init_timer(&mdev->md_sync_timer); + init_timer(&mdev->delay_probe_timer); mdev->resync_timer.function = resync_timer_fn; mdev->resync_timer.data = (unsigned long) mdev; mdev->md_sync_timer.function = md_sync_timer_fn; mdev->md_sync_timer.data = (unsigned long) mdev; + mdev->delay_probe_timer.function = delay_probe_timer_fn; + mdev->delay_probe_timer.data = (unsigned long) mdev; + init_waitqueue_head(&mdev->misc_wait); init_waitqueue_head(&mdev->state_wait);
-- cgit v1.2.3-59-g8ed1b
From cdd67a74603d0453ddffc24c572aed2ddd1795b8 Mon Sep 17 00:00:00 2001
From: Philipp Reisner
Date: Tue, 4 May 2010 16:57:18 +0200
Subject: drbd: Control the actual resync rate based on the queuing delay of data packets
In a setup with a high-bandwidth, high-latency network, possibly involving deep queues in routers, it is beneficial to fill those queues with resync data only up to a limited extent. Between the throttle threshold and the hold-off threshold the effective resync rate is scaled down linearly with the measured data delay (for example, with a configured rate of 10000 K/sec, a throttle threshold of 2 s and a hold-off threshold of 10 s, a measured delay of 6 s halves the rate); beyond the hold-off threshold, resync requests are suspended entirely.
Signed-off-by: Philipp Reisner
Signed-off-by: Lars Ellenberg
--- drivers/block/drbd/drbd_worker.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-)
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 44bf6d11197e..0bbecf45b485 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c
@@ -414,6 +414,18 @@ void resync_timer_fn(unsigned long data) drbd_queue_work(&mdev->data.work, &mdev->resync_work); } +static int calc_resync_rate(struct drbd_conf *mdev) +{ + int d = mdev->data_delay / 1000; /* us -> ms */ + int td = mdev->sync_conf.throttle_th * 100; /* 0.1s -> ms */ + int hd = mdev->sync_conf.hold_off_th * 100; /* 0.1s -> ms */ + int cr = mdev->sync_conf.rate; + + return d <= td ? cr : + d >= hd ? 0 : + cr + (cr * (td - d) / (hd - td)); +} + int w_make_resync_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel) {
@@ -446,7 +458,8 @@ int w_make_resync_request(struct drbd_conf *mdev, return 1; } - number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ); + mdev->c_sync_rate = calc_resync_rate(mdev); + number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ); pe = atomic_read(&mdev->rs_pending_cnt); mutex_lock(&mdev->data.mutex);
-- cgit v1.2.3-59-g8ed1b
From eedf386ae9d9e80a5669107e960090951e62f3a3 Mon Sep 17 00:00:00 2001
From: Philipp Reisner
Date: Tue, 4 May 2010 16:31:03 +0200
Subject: drbd: Proc bits of new resync speed stuff
Signed-off-by: Philipp Reisner
Signed-off-by: Lars Ellenberg
--- drivers/block/drbd/drbd_proc.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-)
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index be3374b68460..81dea0a85933 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c
@@ -73,14 +73,22 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq) seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10); /* if more than 1 GB display in MB */ if (mdev->rs_total > 0x100000L) - seq_printf(seq, "(%lu/%lu)M\n\t", + seq_printf(seq, "(%lu/%lu)M", (unsigned long) Bit2KB(rs_left >> 10), (unsigned long) Bit2KB(mdev->rs_total >> 10)); else - seq_printf(seq, "(%lu/%lu)K\n\t", + seq_printf(seq, "(%lu/%lu)K", (unsigned long) Bit2KB(rs_left), (unsigned long) Bit2KB(mdev->rs_total)); + if (mdev->state.conn == C_SYNC_TARGET) + seq_printf(seq, " queue_delay: %d.%d ms\n\t", + mdev->data_delay / 1000, + (mdev->data_delay % 1000) / 100); + else if (mdev->state.conn == C_SYNC_SOURCE) + seq_printf(seq, " delay_probe: %d\n\t",
atomic_read(&mdev->delay_seq)); + /* see drivers/md/md.c * We do not want to overflow, so the order of operands and * the * 100 / 100 trick are important. We do a +1 to be
@@ -128,6 +136,14 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq) else seq_printf(seq, " (%ld)", dbdt); + if (mdev->state.conn == C_SYNC_TARGET) { + if (mdev->c_sync_rate > 1000) + seq_printf(seq, " want: %d,%03d", + mdev->c_sync_rate / 1000, mdev->c_sync_rate % 1000); + else + seq_printf(seq, " want: %d", mdev->c_sync_rate); + } + seq_printf(seq, " K/sec\n"); }
-- cgit v1.2.3-59-g8ed1b
From a8cdfd8d3bf0b6d2bbe792f5e74f54ccc6bc1d4f Mon Sep 17 00:00:00 2001
From: Philipp Reisner
Date: Wed, 5 May 2010 20:53:33 +0200
Subject: drbd: A few fixes to the new resync speed code
* Mention P_DELAY_PROBE in the packet naming array
* Do not corrupt the mdev->data.work list in case the timer goes off before delay_probe_work got handled by the worker
* Do not mod_timer() twice for a single delay_probe pair
Signed-off-by: Philipp Reisner
Signed-off-by: Lars Ellenberg
--- drivers/block/drbd/drbd_int.h | 1 + drivers/block/drbd/drbd_main.c | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 210870ed8a79..37380d2c869d 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h
@@ -268,6 +268,7 @@ static inline const char *cmdname(enum drbd_packets cmd) [P_CSUM_RS_REQUEST] = "CsumRSRequest", [P_RS_IS_IN_SYNC] = "CsumRSIsInSync", [P_COMPRESSED_BITMAP] = "CBitmap", + [P_DELAY_PROBE] = "DelayProbe", [P_MAX_CMD] = NULL, };
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 98785d08cf70..44cc7b415ed4 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c
@@ -2207,9 +2207,6 @@ static int drbd_send_delay_probe(struct drbd_conf *mdev, struct drbd_socket *ds) } mutex_unlock(&ds->mutex); - mdev->dp_volume_last = mdev->send_cnt; - mod_timer(&mdev->delay_probe_timer, jiffies + mdev->sync_conf.dp_interval * HZ / 10); - return ok; }
@@ -2221,6 +2218,9 @@ static int drbd_send_delay_probes(struct drbd_conf *mdev) ok = drbd_send_delay_probe(mdev, &mdev->meta); ok = ok && drbd_send_delay_probe(mdev, &mdev->data); + mdev->dp_volume_last = mdev->send_cnt; + mod_timer(&mdev->delay_probe_timer, jiffies + mdev->sync_conf.dp_interval * HZ / 10); + return ok; }
@@ -2374,7 +2374,8 @@ static void delay_probe_timer_fn(unsigned long data) { struct drbd_conf *mdev = (struct drbd_conf *) data; - drbd_queue_work(&mdev->data.work, &mdev->delay_probe_work); + if (list_empty(&mdev->delay_probe_work.list)) + drbd_queue_work(&mdev->data.work, &mdev->delay_probe_work); } /* Used to send write requests
-- cgit v1.2.3-59-g8ed1b
From 162f3ec7f026784ff2e216f19147d67e2f8ccd56 Mon Sep 17 00:00:00 2001
From: Philipp Reisner
Date: Thu, 6 May 2010 15:19:30 +0200
Subject: drbd: Fixes to the new delay_probes code
* Only send delay_probes with protocol 93 or newer
* drbd_send_delay_probes() is called only from worker context, no atomic_t needed for delay_seq
Signed-off-by: Philipp Reisner
Signed-off-by: Lars Ellenberg
--- drivers/block/drbd/drbd_int.h | 6 +++--- drivers/block/drbd/drbd_main.c | 8 ++++---- drivers/block/drbd/drbd_proc.c | 3 +-- 3 files changed, 8 insertions(+), 9 deletions(-)
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 37380d2c869d..45d9a4534c40 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h
@@ -552,7 +552,7 @@
struct p_delay_probe { struct delay_probe { struct list_head list; - int seq_num; + unsigned int seq_num; struct timeval time; };
@@ -1048,9 +1048,9 @@ struct drbd_conf { char congestion_reason; /* Why we where congested... */ struct list_head delay_probes; /* protected by peer_seq_lock */ int data_delay; /* Delay of packets on the data-sock behind meta-sock */ - atomic_t delay_seq; /* To generate sequence numbers of delay probes */ + unsigned int delay_seq; /* To generate sequence numbers of delay probes */ struct timeval dps_time; /* delay-probes-start-time */ - int dp_volume_last; /* send_cnt of last delay probe */ + unsigned int dp_volume_last; /* send_cnt of last delay probe */ int c_sync_rate; /* current resync rate after delay_probe magic */ };
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 44cc7b415ed4..3aa0add1c230 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c
@@ -2199,7 +2199,7 @@ static int drbd_send_delay_probe(struct drbd_conf *mdev, struct drbd_socket *ds) do_gettimeofday(&now); offset = now.tv_usec - mdev->dps_time.tv_usec + (now.tv_sec - mdev->dps_time.tv_sec) * 1000000; - dp.seq_num = cpu_to_be32(atomic_read(&mdev->delay_seq)); + dp.seq_num = cpu_to_be32(mdev->delay_seq); dp.offset = cpu_to_be32(offset); ok = _drbd_send_cmd(mdev, ds->socket, P_DELAY_PROBE,
@@ -2213,7 +2213,8 @@ static int drbd_send_delay_probe(struct drbd_conf *mdev, struct drbd_socket *ds) static int drbd_send_delay_probes(struct drbd_conf *mdev) { int ok; - atomic_inc(&mdev->delay_seq); + + mdev->delay_seq++; do_gettimeofday(&mdev->dps_time); ok = drbd_send_delay_probe(mdev, &mdev->meta); ok = ok && drbd_send_delay_probe(mdev, &mdev->data);
@@ -2355,7 +2356,7 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) static void consider_delay_probes(struct drbd_conf *mdev) { - if (mdev->state.conn != C_SYNC_SOURCE) + if (mdev->state.conn != C_SYNC_SOURCE || mdev->agreed_pro_version < 93) return; if (mdev->dp_volume_last + mdev->sync_conf.dp_volume * 2 < mdev->send_cnt)
@@ -2677,7 +2678,6 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) atomic_set(&mdev->net_cnt, 0); atomic_set(&mdev->packet_seq, 0); atomic_set(&mdev->pp_in_use, 0); - atomic_set(&mdev->delay_seq, 0); mutex_init(&mdev->md_io_mutex); mutex_init(&mdev->data.mutex);
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 81dea0a85933..d0f1767ea4c3 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c
@@ -86,8 +86,7 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq) mdev->data_delay / 1000, (mdev->data_delay % 1000) / 100); else if (mdev->state.conn == C_SYNC_SOURCE) - seq_printf(seq, " delay_probe: %d\n\t", - atomic_read(&mdev->delay_seq)); + seq_printf(seq, " delay_probe: %u\n\t", mdev->delay_seq); /* see drivers/md/md.c * We do not want to overflow, so the order of operands and
-- cgit v1.2.3-59-g8ed1b
From 708d740ed8242b84eefc63df144313a7308c7de5 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg
Date: Mon, 3 May 2010 10:38:57 +0200
Subject: drbd: reduce sizeof struct drbd_epoch_entry by 8 bytes by aligning members
Signed-off-by: Philipp Reisner
Signed-off-by: Lars Ellenberg
--- drivers/block/drbd/drbd_int.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-)
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 45d9a4534c40..1bc86ddac38b 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h
@@ -747,12 +747,8 @@
struct drbd_epoch_entry { struct hlist_node colision; sector_t sector; unsigned int size; - struct drbd_epoch *epoch; - - /* up to here, the struct layout is identical to drbd_request; - * we might be able to use that to our advantage... */ - unsigned int flags; + struct drbd_epoch *epoch; u64 block_id; }; -- cgit v1.2.3-59-g8ed1b From 45bb912bd5ea4d2b3a270a93cbdf767a0e2df6f5 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 14 May 2010 17:10:48 +0200 Subject: drbd: Allow drbd_epoch_entries to use multiple bios. This should allow for better performance if the lower level IO stack of the peers differs in limits exposed either via the queue, or via some merge_bvec_fn. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 90 +++++-- drivers/block/drbd/drbd_main.c | 19 +- drivers/block/drbd/drbd_nl.c | 18 +- drivers/block/drbd/drbd_receiver.c | 483 +++++++++++++++++++++---------------- drivers/block/drbd/drbd_worker.c | 178 ++++++++------ drivers/block/drbd/drbd_wrappers.h | 16 +- 6 files changed, 480 insertions(+), 324 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 1bc86ddac38b..4b97f30bb7c6 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -740,18 +740,6 @@ enum epoch_event { EV_CLEANUP = 32, /* used as flag */ }; -struct drbd_epoch_entry { - struct drbd_work w; - struct drbd_conf *mdev; - struct bio *private_bio; - struct hlist_node colision; - sector_t sector; - unsigned int size; - unsigned int flags; - struct drbd_epoch *epoch; - u64 block_id; -}; - struct drbd_wq_barrier { struct drbd_work w; struct completion done; @@ -762,17 +750,49 @@ struct digest_info { void *digest; }; -/* ee flag bits */ +struct drbd_epoch_entry { + struct drbd_work w; + struct hlist_node colision; + struct drbd_epoch *epoch; + struct drbd_conf *mdev; + struct page *pages; + atomic_t pending_bios; + unsigned int size; + /* see comments on ee flag bits below */ + unsigned long flags; + sector_t sector; + u64 block_id; +}; + +/* ee flag bits. + * While corresponding bios are in flight, the only modification will be + * set_bit WAS_ERROR, which has to be atomic. + * If no bios are in flight yet, or all have been completed, + * non-atomic modification to ee->flags is ok. + */ enum { __EE_CALL_AL_COMPLETE_IO, - __EE_CONFLICT_PENDING, __EE_MAY_SET_IN_SYNC, + + /* This epoch entry closes an epoch using a barrier. + * On sucessful completion, the epoch is released, + * and the P_BARRIER_ACK send. */ __EE_IS_BARRIER, + + /* In case a barrier failed, + * we need to resubmit without the barrier flag. */ + __EE_RESUBMITTED, + + /* we may have several bios per epoch entry. 
+ * if any of those fail, we set this flag atomically + * from the endio callback */ + __EE_WAS_ERROR, }; #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) -#define EE_CONFLICT_PENDING (1<<__EE_CONFLICT_PENDING) #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) #define EE_IS_BARRIER (1<<__EE_IS_BARRIER) +#define EE_RESUBMITTED (1<<__EE_RESUBMITTED) +#define EE_WAS_ERROR (1<<__EE_WAS_ERROR) /* global flag bits */ enum { @@ -1441,7 +1461,8 @@ static inline void ov_oos_print(struct drbd_conf *mdev) } -extern void drbd_csum(struct drbd_conf *, struct crypto_hash *, struct bio *, void *); +extern void drbd_csum_bio(struct drbd_conf *, struct crypto_hash *, struct bio *, void *); +extern void drbd_csum_ee(struct drbd_conf *, struct crypto_hash *, struct drbd_epoch_entry *, void *); /* worker callbacks */ extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int); extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int); @@ -1465,6 +1486,8 @@ extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int); extern void resync_timer_fn(unsigned long data); /* drbd_receiver.c */ +extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, + const unsigned rw, const int fault_type); extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list); extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, u64 id, @@ -1620,6 +1643,41 @@ void drbd_bcast_ee(struct drbd_conf *mdev, * inline helper functions *************************/ +/* see also page_chain_add and friends in drbd_receiver.c */ +static inline struct page *page_chain_next(struct page *page) +{ + return (struct page *)page_private(page); +} +#define page_chain_for_each(page) \ + for (; page && ({ prefetch(page_chain_next(page)); 1; }); \ + page = page_chain_next(page)) +#define page_chain_for_each_safe(page, n) \ + for (; page && ({ n = page_chain_next(page); 1; }); page = n) + +static inline int drbd_bio_has_active_page(struct bio *bio) +{ + struct bio_vec *bvec; + int i; + + __bio_for_each_segment(bvec, bio, i, 0) { + if (page_count(bvec->bv_page) > 1) + return 1; + } + + return 0; +} + +static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e) +{ + struct page *page = e->pages; + page_chain_for_each(page) { + if (page_count(page) > 1) + return 1; + } + return 0; +} + + static inline void drbd_state_lock(struct drbd_conf *mdev) { wait_event(mdev->misc_wait, diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 3aa0add1c230..d0fabace1452 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2354,6 +2354,19 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) return 1; } +static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) +{ + struct page *page = e->pages; + unsigned len = e->size; + page_chain_for_each(page) { + unsigned l = min_t(unsigned, len, PAGE_SIZE); + if (!_drbd_send_page(mdev, page, 0, l)) + return 0; + len -= l; + } + return 1; +} + static void consider_delay_probes(struct drbd_conf *mdev) { if (mdev->state.conn != C_SYNC_SOURCE || mdev->agreed_pro_version < 93) @@ -2430,7 +2443,7 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE)); if (ok && dgs) { dgb = mdev->int_dig_out; - drbd_csum(mdev, mdev->integrity_w_tfm, req->master_bio, dgb); + drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb); ok = drbd_send(mdev, 
mdev->data.socket, dgb, dgs, MSG_MORE); } if (ok) { @@ -2483,11 +2496,11 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, sizeof(p), MSG_MORE); if (ok && dgs) { dgb = mdev->int_dig_out; - drbd_csum(mdev, mdev->integrity_w_tfm, e->private_bio, dgb); + drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb); ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); } if (ok) - ok = _drbd_send_zc_bio(mdev, e->private_bio); + ok = _drbd_send_zc_ee(mdev, e); drbd_put_data_sock(mdev); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 93d150661f4b..28ef76bd5230 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -2215,9 +2215,9 @@ void drbd_bcast_ee(struct drbd_conf *mdev, { struct cn_msg *cn_reply; struct drbd_nl_cfg_reply *reply; - struct bio_vec *bvec; unsigned short *tl; - int i; + struct page *page; + unsigned len; if (!e) return; @@ -2255,11 +2255,15 @@ void drbd_bcast_ee(struct drbd_conf *mdev, put_unaligned(T_ee_data, tl++); put_unaligned(e->size, tl++); - __bio_for_each_segment(bvec, e->private_bio, i, 0) { - void *d = kmap(bvec->bv_page); - memcpy(tl, d + bvec->bv_offset, bvec->bv_len); - kunmap(bvec->bv_page); - tl=(unsigned short*)((char*)tl + bvec->bv_len); + len = e->size; + page = e->pages; + page_chain_for_each(page) { + void *d = kmap_atomic(page, KM_USER0); + unsigned l = min_t(unsigned, len, PAGE_SIZE); + memcpy(tl, d, l); + kunmap_atomic(d, KM_USER0); + tl = (unsigned short*)((char*)tl + l); + len -= l; } put_unaligned(TT_END, tl++); /* Close the tag list */ diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index fee0d249adf7..388a3e8bb0d0 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -80,30 +80,124 @@ static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epo #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) -static struct page *drbd_pp_first_page_or_try_alloc(struct drbd_conf *mdev) +/* + * some helper functions to deal with single linked page lists, + * page->private being our "next" pointer. + */ + +/* If at least n pages are linked at head, get n pages off. + * Otherwise, don't modify head, and return NULL. + * Locking is the responsibility of the caller. + */ +static struct page *page_chain_del(struct page **head, int n) +{ + struct page *page; + struct page *tmp; + + BUG_ON(!n); + BUG_ON(!head); + + page = *head; + while (page) { + tmp = page_chain_next(page); + if (--n == 0) + break; /* found sufficient pages */ + if (tmp == NULL) + /* insufficient pages, don't use any of them. */ + return NULL; + page = tmp; + } + + /* add end of list marker for the returned list */ + set_page_private(page, 0); + /* actual return value, and adjustment of head */ + page = *head; + *head = tmp; + return page; +} + +/* may be used outside of locks to find the tail of a (usually short) + * "private" page chain, before adding it back to a global chain head + * with page_chain_add() under a spinlock. 
*/ +static struct page *page_chain_tail(struct page *page, int *len) +{ + struct page *tmp; + int i = 1; + while ((tmp = page_chain_next(page))) + ++i, page = tmp; + if (len) + *len = i; + return page; +} + +static int page_chain_free(struct page *page) +{ + struct page *tmp; + int i = 0; + page_chain_for_each_safe(page, tmp) { + put_page(page); + ++i; + } + return i; +} + +static void page_chain_add(struct page **head, + struct page *chain_first, struct page *chain_last) +{ +#if 1 + struct page *tmp; + tmp = page_chain_tail(chain_first, NULL); + BUG_ON(tmp != chain_last); +#endif + + /* add chain to head */ + set_page_private(chain_last, (unsigned long)*head); + *head = chain_first; +} + +static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int number) { struct page *page = NULL; + struct page *tmp = NULL; + int i = 0; /* Yes, testing drbd_pp_vacant outside the lock is racy. * So what. It saves a spin_lock. */ - if (drbd_pp_vacant > 0) { + if (drbd_pp_vacant >= number) { spin_lock(&drbd_pp_lock); - page = drbd_pp_pool; - if (page) { - drbd_pp_pool = (struct page *)page_private(page); - set_page_private(page, 0); /* just to be polite */ - drbd_pp_vacant--; - } + page = page_chain_del(&drbd_pp_pool, number); + if (page) + drbd_pp_vacant -= number; spin_unlock(&drbd_pp_lock); + if (page) + return page; } + /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD * "criss-cross" setup, that might cause write-out on some other DRBD, * which in turn might block on the other node at this very place. */ - if (!page) - page = alloc_page(GFP_TRY); - if (page) - atomic_inc(&mdev->pp_in_use); - return page; + for (i = 0; i < number; i++) { + tmp = alloc_page(GFP_TRY); + if (!tmp) + break; + set_page_private(tmp, (unsigned long)page); + page = tmp; + } + + if (i == number) + return page; + + /* Not enough pages immediately available this time. + * No need to jump around here, drbd_pp_alloc will retry this + * function "soon". */ + if (page) { + tmp = page_chain_tail(page, NULL); + spin_lock(&drbd_pp_lock); + page_chain_add(&drbd_pp_pool, page, tmp); + drbd_pp_vacant += i; + spin_unlock(&drbd_pp_lock); + } + return NULL; } /* kick lower level device, if we have more than (arbitrary number) @@ -127,7 +221,7 @@ static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed list_for_each_safe(le, tle, &mdev->net_ee) { e = list_entry(le, struct drbd_epoch_entry, w.list); - if (drbd_bio_has_active_page(e->private_bio)) + if (drbd_ee_has_active_page(e)) break; list_move(le, to_be_freed); } @@ -148,32 +242,34 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev) } /** - * drbd_pp_alloc() - Returns a page, fails only if a signal comes in + * drbd_pp_alloc() - Returns @number pages, retries forever (or until signalled) * @mdev: DRBD device. - * @retry: whether or not to retry allocation forever (or until signalled) + * @number: number of pages requested + * @retry: whether to retry, if not enough pages are available right now + * + * Tries to allocate number pages, first from our own page pool, then from + * the kernel, unless this allocation would exceed the max_buffers setting. + * Possibly retry until DRBD frees sufficient pages somewhere else. * - * Tries to allocate a page, first from our own page pool, then from the - * kernel, unless this allocation would exceed the max_buffers setting. - * If @retry is non-zero, retry until DRBD frees a page somewhere else. + * Returns a page chain linked via page->private. 
*/ -static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry) +static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool retry) { struct page *page = NULL; DEFINE_WAIT(wait); - if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { - page = drbd_pp_first_page_or_try_alloc(mdev); - if (page) - return page; - } + /* Yes, we may run up to @number over max_buffers. If we + * follow it strictly, the admin will get it wrong anyways. */ + if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) + page = drbd_pp_first_pages_or_try_alloc(mdev, number); - for (;;) { + while (page == NULL) { prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); drbd_kick_lo_and_reclaim_net(mdev); if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { - page = drbd_pp_first_page_or_try_alloc(mdev); + page = drbd_pp_first_pages_or_try_alloc(mdev, number); if (page) break; } @@ -190,62 +286,32 @@ static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry) } finish_wait(&drbd_pp_wait, &wait); + if (page) + atomic_add(number, &mdev->pp_in_use); return page; } /* Must not be used from irq, as that may deadlock: see drbd_pp_alloc. - * Is also used from inside an other spin_lock_irq(&mdev->req_lock) */ + * Is also used from inside an other spin_lock_irq(&mdev->req_lock); + * Either links the page chain back to the global pool, + * or returns all pages to the system. */ static void drbd_pp_free(struct drbd_conf *mdev, struct page *page) { - int free_it; - - spin_lock(&drbd_pp_lock); - if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) { - free_it = 1; - } else { - set_page_private(page, (unsigned long)drbd_pp_pool); - drbd_pp_pool = page; - drbd_pp_vacant++; - free_it = 0; - } - spin_unlock(&drbd_pp_lock); - - atomic_dec(&mdev->pp_in_use); - - if (free_it) - __free_page(page); - - wake_up(&drbd_pp_wait); -} - -static void drbd_pp_free_bio_pages(struct drbd_conf *mdev, struct bio *bio) -{ - struct page *p_to_be_freed = NULL; - struct page *page; - struct bio_vec *bvec; int i; - - spin_lock(&drbd_pp_lock); - __bio_for_each_segment(bvec, bio, i, 0) { - if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) { - set_page_private(bvec->bv_page, (unsigned long)p_to_be_freed); - p_to_be_freed = bvec->bv_page; - } else { - set_page_private(bvec->bv_page, (unsigned long)drbd_pp_pool); - drbd_pp_pool = bvec->bv_page; - drbd_pp_vacant++; - } - } - spin_unlock(&drbd_pp_lock); - atomic_sub(bio->bi_vcnt, &mdev->pp_in_use); - - while (p_to_be_freed) { - page = p_to_be_freed; - p_to_be_freed = (struct page *)page_private(page); - set_page_private(page, 0); /* just to be polite */ - put_page(page); + if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) + i = page_chain_free(page); + else { + struct page *tmp; + tmp = page_chain_tail(page, &i); + spin_lock(&drbd_pp_lock); + page_chain_add(&drbd_pp_pool, page, tmp); + drbd_pp_vacant += i; + spin_unlock(&drbd_pp_lock); } - + atomic_sub(i, &mdev->pp_in_use); + i = atomic_read(&mdev->pp_in_use); + if (i < 0) + dev_warn(DEV, "ASSERTION FAILED: pp_in_use: %d < 0\n", i); wake_up(&drbd_pp_wait); } @@ -270,11 +336,9 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, unsigned int data_size, gfp_t gfp_mask) __must_hold(local) { - struct request_queue *q; struct drbd_epoch_entry *e; struct page *page; - struct bio *bio; - unsigned int ds; + unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE)) return NULL; @@ -286,84 
+350,32 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, return NULL; } - bio = bio_alloc(gfp_mask & ~__GFP_HIGHMEM, div_ceil(data_size, PAGE_SIZE)); - if (!bio) { - if (!(gfp_mask & __GFP_NOWARN)) - dev_err(DEV, "alloc_ee: Allocation of a bio failed\n"); - goto fail1; - } - - bio->bi_bdev = mdev->ldev->backing_bdev; - bio->bi_sector = sector; - - ds = data_size; - while (ds) { - page = drbd_pp_alloc(mdev, (gfp_mask & __GFP_WAIT)); - if (!page) { - if (!(gfp_mask & __GFP_NOWARN)) - dev_err(DEV, "alloc_ee: Allocation of a page failed\n"); - goto fail2; - } - if (!bio_add_page(bio, page, min_t(int, ds, PAGE_SIZE), 0)) { - drbd_pp_free(mdev, page); - dev_err(DEV, "alloc_ee: bio_add_page(s=%llu," - "data_size=%u,ds=%u) failed\n", - (unsigned long long)sector, data_size, ds); - - q = bdev_get_queue(bio->bi_bdev); - if (q->merge_bvec_fn) { - struct bvec_merge_data bvm = { - .bi_bdev = bio->bi_bdev, - .bi_sector = bio->bi_sector, - .bi_size = bio->bi_size, - .bi_rw = bio->bi_rw, - }; - int l = q->merge_bvec_fn(q, &bvm, - &bio->bi_io_vec[bio->bi_vcnt]); - dev_err(DEV, "merge_bvec_fn() = %d\n", l); - } - - /* dump more of the bio. */ - dev_err(DEV, "bio->bi_max_vecs = %d\n", bio->bi_max_vecs); - dev_err(DEV, "bio->bi_vcnt = %d\n", bio->bi_vcnt); - dev_err(DEV, "bio->bi_size = %d\n", bio->bi_size); - dev_err(DEV, "bio->bi_phys_segments = %d\n", bio->bi_phys_segments); - - goto fail2; - break; - } - ds -= min_t(int, ds, PAGE_SIZE); - } - - D_ASSERT(data_size == bio->bi_size); - - bio->bi_private = e; - e->mdev = mdev; - e->sector = sector; - e->size = bio->bi_size; + page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT)); + if (!page) + goto fail; - e->private_bio = bio; - e->block_id = id; INIT_HLIST_NODE(&e->colision); e->epoch = NULL; + e->mdev = mdev; + e->pages = page; + atomic_set(&e->pending_bios, 0); + e->size = data_size; e->flags = 0; + e->sector = sector; + e->sector = sector; + e->block_id = id; return e; - fail2: - drbd_pp_free_bio_pages(mdev, bio); - bio_put(bio); - fail1: + fail: mempool_free(e, drbd_ee_mempool); - return NULL; } void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) { - struct bio *bio = e->private_bio; - drbd_pp_free_bio_pages(mdev, bio); - bio_put(bio); + drbd_pp_free(mdev, e->pages); + D_ASSERT(atomic_read(&e->pending_bios) == 0); D_ASSERT(hlist_unhashed(&e->colision)); mempool_free(e, drbd_ee_mempool); } @@ -1120,6 +1132,90 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]); } +/** + * drbd_submit_ee() + * @mdev: DRBD device. + * @e: epoch entry + * @rw: flag field, see bio->bi_rw + */ +/* TODO allocate from our own bio_set. */ +int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, + const unsigned rw, const int fault_type) +{ + struct bio *bios = NULL; + struct bio *bio; + struct page *page = e->pages; + sector_t sector = e->sector; + unsigned ds = e->size; + unsigned n_bios = 0; + unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT; + + /* In most cases, we will only need one bio. But in case the lower + * level restrictions happen to be different at this offset on this + * side than those of the sending peer, we may need to submit the + * request in more than one bio. 
*/ +next_bio: + bio = bio_alloc(GFP_NOIO, nr_pages); + if (!bio) { + dev_err(DEV, "submit_ee: Allocation of a bio failed\n"); + goto fail; + } + /* > e->sector, unless this is the first bio */ + bio->bi_sector = sector; + bio->bi_bdev = mdev->ldev->backing_bdev; + /* we special case some flags in the multi-bio case, see below + * (BIO_RW_UNPLUG, BIO_RW_BARRIER) */ + bio->bi_rw = rw; + bio->bi_private = e; + bio->bi_end_io = drbd_endio_sec; + + bio->bi_next = bios; + bios = bio; + ++n_bios; + + page_chain_for_each(page) { + unsigned len = min_t(unsigned, ds, PAGE_SIZE); + if (!bio_add_page(bio, page, len, 0)) { + /* a single page must always be possible! */ + BUG_ON(bio->bi_vcnt == 0); + goto next_bio; + } + ds -= len; + sector += len >> 9; + --nr_pages; + } + D_ASSERT(page == NULL); + D_ASSERT(ds == 0); + + atomic_set(&e->pending_bios, n_bios); + do { + bio = bios; + bios = bios->bi_next; + bio->bi_next = NULL; + + /* strip off BIO_RW_UNPLUG unless it is the last bio */ + if (bios) + bio->bi_rw &= ~(1<<BIO_RW_UNPLUG); + + drbd_generic_make_request(mdev, fault_type, bio); + + /* strip off BIO_RW_BARRIER, unless it is the first or last bio */ + if (bios && bios->bi_next) + bios->bi_rw &= ~(1<<BIO_RW_BARRIER); + } while (bios); + + maybe_kick_lo(mdev); + return 0; + +fail: + while (bios) { + bio = bios; + bios = bios->bi_next; + bio_put(bio); + } + return -ENOMEM; +} + /** * w_e_reissue() - Worker callback; Resubmit a bio, without BIO_RW_BARRIER set * @mdev: DRBD device. @@ -1129,8 +1225,6 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local) { struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; - struct bio *bio = e->private_bio; - /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place, (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch) so that we can finish that epoch in drbd_may_finish_epoch(). @@ -1144,33 +1238,17 @@ int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __relea if (previous_epoch(mdev, e->epoch)) dev_warn(DEV, "Write ordering was not enforced (one time event)\n"); - /* prepare bio for re-submit, * re-init volatile members */ /* we still have a local reference, * get_ldev was done in receive_Data. */ - bio->bi_bdev = mdev->ldev->backing_bdev; - bio->bi_sector = e->sector; - bio->bi_size = e->size; - bio->bi_idx = 0; - - bio->bi_flags &= ~(BIO_POOL_MASK - 1); - bio->bi_flags |= 1 << BIO_UPTODATE; - - /* don't know whether this is necessary: */ - bio->bi_phys_segments = 0; - bio->bi_next = NULL; - - /* these should be unchanged: */ - /* bio->bi_end_io = drbd_endio_write_sec; */ - /* bio->bi_vcnt = whatever; */ e->w.cb = e_end_block; - - /* This is no longer a barrier request. */ - bio->bi_rw &= ~(1UL << BIO_RW_BARRIER); - - drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, bio); - + if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_DT_WR) != 0) { + /* drbd_submit_ee fails for one reason only: + * if it was not able to allocate sufficient bios. + * requeue, try again later. 
*/ + e->w.cb = w_e_reissue; + drbd_queue_work(&mdev->data.work, &e->w); + } return 1; } @@ -1264,10 +1342,8 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __ { const sector_t capacity = drbd_get_capacity(mdev->this_bdev); struct drbd_epoch_entry *e; - struct bio_vec *bvec; struct page *page; - struct bio *bio; - int dgs, ds, i, rr; + int dgs, ds, rr; void *dig_in = mdev->int_dig_in; void *dig_vv = mdev->int_dig_vv; unsigned long *data; @@ -1304,28 +1380,29 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __ e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO); if (!e) return NULL; - bio = e->private_bio; + ds = data_size; - bio_for_each_segment(bvec, bio, i) { - page = bvec->bv_page; + page = e->pages; + page_chain_for_each(page) { + unsigned len = min_t(int, ds, PAGE_SIZE); data = kmap(page); - rr = drbd_recv(mdev, data, min_t(int, ds, PAGE_SIZE)); + rr = drbd_recv(mdev, data, len); if (FAULT_ACTIVE(mdev, DRBD_FAULT_RECEIVE)) { dev_err(DEV, "Fault injection: Corrupting data on receive\n"); data[0] = data[0] ^ (unsigned long)-1; } kunmap(page); - if (rr != min_t(int, ds, PAGE_SIZE)) { + if (rr != len) { drbd_free_ee(mdev, e); dev_warn(DEV, "short read receiving data: read %d expected %d\n", - rr, min_t(int, ds, PAGE_SIZE)); + rr, len); return NULL; } ds -= rr; } if (dgs) { - drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv); + drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv); if (memcmp(dig_in, dig_vv, dgs)) { dev_err(DEV, "Digest integrity check FAILED.\n"); drbd_bcast_ee(mdev, "digest failed", @@ -1350,7 +1427,7 @@ static int drbd_drain_block(struct drbd_conf *mdev, int data_size) if (!data_size) return TRUE; - page = drbd_pp_alloc(mdev, 1); + page = drbd_pp_alloc(mdev, 1, 1); data = kmap(page); while (data_size) { @@ -1414,7 +1491,7 @@ static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, } if (dgs) { - drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv); + drbd_csum_bio(mdev, mdev->integrity_r_tfm, bio, dig_vv); if (memcmp(dig_in, dig_vv, dgs)) { dev_err(DEV, "Digest integrity check FAILED. 
Broken NICs?\n"); return 0; @@ -1435,7 +1512,7 @@ static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int u D_ASSERT(hlist_unhashed(&e->colision)); - if (likely(drbd_bio_uptodate(e->private_bio))) { + if (likely((e->flags & EE_WAS_ERROR) == 0)) { drbd_set_in_sync(mdev, sector, e->size); ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e); } else { @@ -1454,30 +1531,28 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si struct drbd_epoch_entry *e; e = read_in_block(mdev, ID_SYNCER, sector, data_size); - if (!e) { - put_ldev(mdev); - return FALSE; - } + if (!e) + goto fail; dec_rs_pending(mdev); - e->private_bio->bi_end_io = drbd_endio_write_sec; - e->private_bio->bi_rw = WRITE; - e->w.cb = e_end_resync_block; - inc_unacked(mdev); /* corresponding dec_unacked() in e_end_resync_block() * respective _drbd_clear_done_ee */ + e->w.cb = e_end_resync_block; + spin_lock_irq(&mdev->req_lock); list_add(&e->w.list, &mdev->sync_ee); spin_unlock_irq(&mdev->req_lock); - drbd_generic_make_request(mdev, DRBD_FAULT_RS_WR, e->private_bio); - /* accounting done in endio */ + if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0) + return TRUE; - maybe_kick_lo(mdev); - return TRUE; + drbd_free_ee(mdev, e); +fail: + put_ldev(mdev); + return FALSE; } static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h) @@ -1572,7 +1647,7 @@ static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel) } if (mdev->net_conf->wire_protocol == DRBD_PROT_C) { - if (likely(drbd_bio_uptodate(e->private_bio))) { + if (likely((e->flags & EE_WAS_ERROR) == 0)) { pcmd = (mdev->state.conn >= C_SYNC_SOURCE && mdev->state.conn <= C_PAUSED_SYNC_T && e->flags & EE_MAY_SET_IN_SYNC) ? @@ -1718,7 +1793,6 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h) return FALSE; } - e->private_bio->bi_end_io = drbd_endio_write_sec; e->w.cb = e_end_block; spin_lock(&mdev->epoch_lock); @@ -1914,12 +1988,8 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h) drbd_al_begin_io(mdev, e->sector); } - e->private_bio->bi_rw = rw; - drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, e->private_bio); - /* accounting done in endio */ - - maybe_kick_lo(mdev); - return TRUE; + if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0) + return TRUE; out_interrupted: /* yes, the epoch_size now is imbalanced. 
@@ -1977,9 +2047,6 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) return FALSE; } - e->private_bio->bi_rw = READ; - e->private_bio->bi_end_io = drbd_endio_read_sec; - switch (h->command) { case P_DATA_REQUEST: e->w.cb = w_e_end_data_req; @@ -2073,10 +2140,8 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) inc_unacked(mdev); - drbd_generic_make_request(mdev, fault_type, e->private_bio); - maybe_kick_lo(mdev); - - return TRUE; + if (drbd_submit_ee(mdev, e, READ, fault_type) == 0) + return TRUE; out_free_e: kfree(di); @@ -3837,7 +3902,7 @@ static void drbd_disconnect(struct drbd_conf *mdev) dev_info(DEV, "net_ee not empty, killed %u entries\n", i); i = atomic_read(&mdev->pp_in_use); if (i) - dev_info(DEV, "pp_in_use = %u, expected 0\n", i); + dev_info(DEV, "pp_in_use = %d, expected 0\n", i); D_ASSERT(list_empty(&mdev->read_ee)); D_ASSERT(list_empty(&mdev->active_ee)); diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 0bbecf45b485..d771b1e0424b 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -47,8 +47,7 @@ static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int ca /* defined here: drbd_md_io_complete - drbd_endio_write_sec - drbd_endio_read_sec + drbd_endio_sec drbd_endio_pri * more endio handlers: @@ -85,27 +84,10 @@ void drbd_md_io_complete(struct bio *bio, int error) /* reads on behalf of the partner, * "submitted" by the receiver */ -void drbd_endio_read_sec(struct bio *bio, int error) __releases(local) +void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local) { unsigned long flags = 0; - struct drbd_epoch_entry *e = NULL; - struct drbd_conf *mdev; - int uptodate = bio_flagged(bio, BIO_UPTODATE); - - e = bio->bi_private; - mdev = e->mdev; - - if (error) - dev_warn(DEV, "read: error=%d s=%llus\n", error, - (unsigned long long)e->sector); - if (!error && !uptodate) { - dev_warn(DEV, "read: setting error to -EIO s=%llus\n", - (unsigned long long)e->sector); - /* strange behavior of some lower level drivers... - * fail the request by clearing the uptodate flag, - * but do not return any error?! */ - error = -EIO; - } + struct drbd_conf *mdev = e->mdev; D_ASSERT(e->block_id != ID_VACANT); @@ -114,49 +96,38 @@ void drbd_endio_read_sec(struct bio *bio, int error) __releases(local) list_del(&e->w.list); if (list_empty(&mdev->read_ee)) wake_up(&mdev->ee_wait); + if (test_bit(__EE_WAS_ERROR, &e->flags)) + __drbd_chk_io_error(mdev, FALSE); spin_unlock_irqrestore(&mdev->req_lock, flags); - drbd_chk_io_error(mdev, error, FALSE); drbd_queue_work(&mdev->data.work, &e->w); put_ldev(mdev); } +static int is_failed_barrier(int ee_flags) +{ + return (ee_flags & (EE_IS_BARRIER|EE_WAS_ERROR|EE_RESUBMITTED)) + == (EE_IS_BARRIER|EE_WAS_ERROR); +} + /* writes on behalf of the partner, or resync writes, - * "submitted" by the receiver. - */ -void drbd_endio_write_sec(struct bio *bio, int error) __releases(local) + * "submitted" by the receiver, final stage. 
*/ +static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local) { unsigned long flags = 0; - struct drbd_epoch_entry *e = NULL; - struct drbd_conf *mdev; + struct drbd_conf *mdev = e->mdev; sector_t e_sector; int do_wake; int is_syncer_req; int do_al_complete_io; - int uptodate = bio_flagged(bio, BIO_UPTODATE); - int is_barrier = bio_rw_flagged(bio, BIO_RW_BARRIER); - e = bio->bi_private; - mdev = e->mdev; - - if (error) - dev_warn(DEV, "write: error=%d s=%llus\n", error, - (unsigned long long)e->sector); - if (!error && !uptodate) { - dev_warn(DEV, "write: setting error to -EIO s=%llus\n", - (unsigned long long)e->sector); - /* strange behavior of some lower level drivers... - * fail the request by clearing the uptodate flag, - * but do not return any error?! */ - error = -EIO; - } - - /* error == -ENOTSUPP would be a better test, - * alas it is not reliable */ - if (error && is_barrier && e->flags & EE_IS_BARRIER) { + /* if this is a failed barrier request, disable use of barriers, + * and schedule for resubmission */ + if (is_failed_barrier(e->flags)) { drbd_bump_write_ordering(mdev, WO_bdev_flush); spin_lock_irqsave(&mdev->req_lock, flags); list_del(&e->w.list); + e->flags |= EE_RESUBMITTED; e->w.cb = w_e_reissue; /* put_ldev actually happens below, once we come here again. */ __release(local); @@ -167,17 +138,16 @@ void drbd_endio_write_sec(struct bio *bio, int error) __releases(local) D_ASSERT(e->block_id != ID_VACANT); - spin_lock_irqsave(&mdev->req_lock, flags); - mdev->writ_cnt += e->size >> 9; - is_syncer_req = is_syncer_block_id(e->block_id); - /* after we moved e to done_ee, * we may no longer access it, * it may be freed/reused already! * (as soon as we release the req_lock) */ e_sector = e->sector; do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO; + is_syncer_req = is_syncer_block_id(e->block_id); + spin_lock_irqsave(&mdev->req_lock, flags); + mdev->writ_cnt += e->size >> 9; list_del(&e->w.list); /* has been on active_ee or sync_ee */ list_add_tail(&e->w.list, &mdev->done_ee); @@ -190,7 +160,7 @@ void drbd_endio_write_sec(struct bio *bio, int error) __releases(local) ? list_empty(&mdev->sync_ee) : list_empty(&mdev->active_ee); - if (error) + if (test_bit(__EE_WAS_ERROR, &e->flags)) __drbd_chk_io_error(mdev, FALSE); spin_unlock_irqrestore(&mdev->req_lock, flags); @@ -205,7 +175,42 @@ void drbd_endio_write_sec(struct bio *bio, int error) __releases(local) wake_asender(mdev); put_ldev(mdev); +} +/* writes on behalf of the partner, or resync writes, + * "submitted" by the receiver. + */ +void drbd_endio_sec(struct bio *bio, int error) +{ + struct drbd_epoch_entry *e = bio->bi_private; + struct drbd_conf *mdev = e->mdev; + int uptodate = bio_flagged(bio, BIO_UPTODATE); + int is_write = bio_data_dir(bio) == WRITE; + + if (error) + dev_warn(DEV, "%s: error=%d s=%llus\n", + is_write ? "write" : "read", error, + (unsigned long long)e->sector); + if (!error && !uptodate) { + dev_warn(DEV, "%s: setting error to -EIO s=%llus\n", + is_write ? "write" : "read", + (unsigned long long)e->sector); + /* strange behavior of some lower level drivers... + * fail the request by clearing the uptodate flag, + * but do not return any error?! 
*/ + error = -EIO; + } + + if (error) + set_bit(__EE_WAS_ERROR, &e->flags); + + bio_put(bio); /* no need for the bio anymore */ + if (atomic_dec_and_test(&e->pending_bios)) { + if (is_write) + drbd_endio_write_sec_final(e); + else + drbd_endio_read_sec_final(e); + } } /* read, readA or write requests on R_PRIMARY coming from drbd_make_request @@ -295,7 +300,34 @@ int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel) return 1; /* Simply ignore this! */ } -void drbd_csum(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest) +void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest) +{ + struct hash_desc desc; + struct scatterlist sg; + struct page *page = e->pages; + struct page *tmp; + unsigned len; + + desc.tfm = tfm; + desc.flags = 0; + + sg_init_table(&sg, 1); + crypto_hash_init(&desc); + + while ((tmp = page_chain_next(page))) { + /* all but the last page will be fully used */ + sg_set_page(&sg, page, PAGE_SIZE, 0); + crypto_hash_update(&desc, &sg, sg.length); + page = tmp; + } + /* and now the last, possibly only partially used page */ + len = e->size & (PAGE_SIZE - 1); + sg_set_page(&sg, page, len ?: PAGE_SIZE, 0); + crypto_hash_update(&desc, &sg, sg.length); + crypto_hash_final(&desc, digest); +} + +void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest) { struct hash_desc desc; struct scatterlist sg; @@ -329,11 +361,11 @@ static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel return 1; } - if (likely(drbd_bio_uptodate(e->private_bio))) { + if (likely((e->flags & EE_WAS_ERROR) == 0)) { digest_size = crypto_hash_digestsize(mdev->csums_tfm); digest = kmalloc(digest_size, GFP_NOIO); if (digest) { - drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest); + drbd_csum_ee(mdev, mdev->csums_tfm, e, digest); inc_rs_pending(mdev); ok = drbd_send_drequest_csum(mdev, @@ -369,23 +401,21 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) /* GFP_TRY, because if there is no memory available right now, this may * be rescheduled for later. It is "only" background resync, after all. 
*/ e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY); - if (!e) { - put_ldev(mdev); - return 2; - } + if (!e) + goto fail; spin_lock_irq(&mdev->req_lock); list_add(&e->w.list, &mdev->read_ee); spin_unlock_irq(&mdev->req_lock); - e->private_bio->bi_end_io = drbd_endio_read_sec; - e->private_bio->bi_rw = READ; e->w.cb = w_e_send_csum; + if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0) + return 1; - mdev->read_cnt += size >> 9; - drbd_generic_make_request(mdev, DRBD_FAULT_RS_RD, e->private_bio); - - return 1; + drbd_free_ee(mdev, e); +fail: + put_ldev(mdev); + return 2; } void resync_timer_fn(unsigned long data) @@ -819,7 +849,7 @@ out: /* helper */ static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e) { - if (drbd_bio_has_active_page(e->private_bio)) { + if (drbd_ee_has_active_page(e)) { /* This might happen if sendpage() has not finished */ spin_lock_irq(&mdev->req_lock); list_add_tail(&e->w.list, &mdev->net_ee); @@ -845,7 +875,7 @@ int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) return 1; } - if (likely(drbd_bio_uptodate(e->private_bio))) { + if (likely((e->flags & EE_WAS_ERROR) == 0)) { ok = drbd_send_block(mdev, P_DATA_REPLY, e); } else { if (__ratelimit(&drbd_ratelimit_state)) @@ -886,7 +916,7 @@ int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) put_ldev(mdev); } - if (likely(drbd_bio_uptodate(e->private_bio))) { + if (likely((e->flags & EE_WAS_ERROR) == 0)) { if (likely(mdev->state.pdsk >= D_INCONSISTENT)) { inc_rs_pending(mdev); ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); @@ -934,7 +964,7 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) di = (struct digest_info *)(unsigned long)e->block_id; - if (likely(drbd_bio_uptodate(e->private_bio))) { + if (likely((e->flags & EE_WAS_ERROR) == 0)) { /* quick hack to try to avoid a race against reconfiguration. * a real fix would be much more involved, * introducing more locking mechanisms */ @@ -944,7 +974,7 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) digest = kmalloc(digest_size, GFP_NOIO); } if (digest) { - drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest); + drbd_csum_ee(mdev, mdev->csums_tfm, e, digest); eq = !memcmp(digest, di->digest, digest_size); kfree(digest); } @@ -986,14 +1016,14 @@ int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) if (unlikely(cancel)) goto out; - if (unlikely(!drbd_bio_uptodate(e->private_bio))) + if (unlikely((e->flags & EE_WAS_ERROR) != 0)) goto out; digest_size = crypto_hash_digestsize(mdev->verify_tfm); /* FIXME if this allocation fails, online verify will not terminate! 
*/ digest = kmalloc(digest_size, GFP_NOIO); if (digest) { - drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest); + drbd_csum_ee(mdev, mdev->verify_tfm, e, digest); inc_rs_pending(mdev); ok = drbd_send_drequest_csum(mdev, e->sector, e->size, digest, digest_size, P_OV_REPLY); @@ -1042,11 +1072,11 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) di = (struct digest_info *)(unsigned long)e->block_id; - if (likely(drbd_bio_uptodate(e->private_bio))) { + if (likely((e->flags & EE_WAS_ERROR) == 0)) { digest_size = crypto_hash_digestsize(mdev->verify_tfm); digest = kmalloc(digest_size, GFP_NOIO); if (digest) { - drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest); + drbd_csum_ee(mdev, mdev->verify_tfm, e, digest); D_ASSERT(digest_size == di->digest_size); eq = !memcmp(digest, di->digest, digest_size); diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h index f93fa111ce50..defdb5013ea3 100644 --- a/drivers/block/drbd/drbd_wrappers.h +++ b/drivers/block/drbd/drbd_wrappers.h @@ -18,23 +18,9 @@ static inline void drbd_set_my_capacity(struct drbd_conf *mdev, #define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE) -static inline int drbd_bio_has_active_page(struct bio *bio) -{ - struct bio_vec *bvec; - int i; - - __bio_for_each_segment(bvec, bio, i, 0) { - if (page_count(bvec->bv_page) > 1) - return 1; - } - - return 0; -} - /* bi_end_io handlers */ extern void drbd_md_io_complete(struct bio *bio, int error); -extern void drbd_endio_read_sec(struct bio *bio, int error); -extern void drbd_endio_write_sec(struct bio *bio, int error); +extern void drbd_endio_sec(struct bio *bio, int error); extern void drbd_endio_pri(struct bio *bio, int error); /* -- cgit v1.2.3-59-g8ed1b From bb3d000cb99aa0924b78c1ae5f5943484527868a Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 14 May 2010 19:08:55 +0200 Subject: drbd: allow resync requests to be larger than max_segment_size this should allow for better background resync performance. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_worker.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index d771b1e0424b..91085c1ab52f 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -462,7 +462,7 @@ int w_make_resync_request(struct drbd_conf *mdev, unsigned long bit; sector_t sector; const sector_t capacity = drbd_get_capacity(mdev->this_bdev); - int max_segment_size = queue_max_segment_size(mdev->rq_queue); + int max_segment_size; int number, i, size, pe, mx; int align, queued, sndbuf; @@ -488,6 +488,11 @@ int w_make_resync_request(struct drbd_conf *mdev, return 1; } + /* starting with drbd 8.3.8, we can handle multi-bio EEs, + * if it should be necessary */ + max_segment_size = mdev->agreed_pro_version < 94 ? + queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE; + mdev->c_sync_rate = calc_resync_rate(mdev); number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ); pe = atomic_read(&mdev->rs_pending_cnt); @@ -552,12 +557,6 @@ next_sector: * * Additionally always align bigger requests, in order to * be prepared for all stripe sizes of software RAIDs. - * - * we _do_ care about the agreed-upon q->max_segment_size - * here, as splitting up the requests on the other side is more - * difficult. 
the consequence is, that on lvm and md and other - "indirect" devices, this is dead code, since - q->max_segment_size will be PAGE_SIZE. */ align = 1; for (;;) { -- cgit v1.2.3-59-g8ed1b From a1c88d0d7aa2ef427f78834c9a3b0a673a19dca6 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 14 May 2010 19:16:41 +0200 Subject: drbd: always use_bmbv, ignore setting Now that the peer may handle multi-bio EEs, we can ignore the peer's limit, and concentrate on the limits of the local IO stack. This is safe across drbd protocol versions, as our queue_max_sectors() will be adjusted accordingly. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_nl.c | 3 --- drivers/block/drbd/drbd_receiver.c | 6 +++++- drivers/block/drbd/drbd_req.c | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 28ef76bd5230..f20336bc59c8 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -704,9 +704,6 @@ void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __mu struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; int max_segments = mdev->ldev->dc.max_bio_bvecs; - if (b->merge_bvec_fn && !mdev->ldev->dc.use_bmbv) - max_seg_s = PAGE_SIZE; - max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s); blk_queue_max_hw_sectors(q, max_seg_s >> 9); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 388a3e8bb0d0..a04ec01ab3ce 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3011,7 +3011,11 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) ldsc = 1; } - max_seg_s = be32_to_cpu(p->max_segment_size); + if (mdev->agreed_pro_version < 94) + max_seg_s = be32_to_cpu(p->max_segment_size); + else /* drbd 8.3.8 onwards */ + max_seg_s = DRBD_MAX_SEGMENT_SIZE; + if (max_seg_s != queue_max_segment_size(mdev->rq_queue)) drbd_setup_queue_param(mdev, max_seg_s); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index d8d9bbfca3b8..343e0e6dd532 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1110,7 +1110,7 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct } else if (limit && get_ldev(mdev)) { struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; - if (b->merge_bvec_fn && mdev->ldev->dc.use_bmbv) { + if (b->merge_bvec_fn) { backing_limit = b->merge_bvec_fn(b, bvm, bvec); limit = min(limit, backing_limit); } -- cgit v1.2.3-59-g8ed1b From 9a25a04c8079725c1b1ab756694a8e0757844b40 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 10 May 2010 16:42:23 +0200 Subject: drbd: If we detect late that IO got frozen, retry after we thawed. If we detect late (= after grabbing mdev->req_lock) that IO got frozen, we return 1 to generic_make_request(), which simply will retry to make a request for that bio. In the subsequent call of generic_make_request() into drbd_make_request_26() we sleep in inc_ap_bio().
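To make that retry contract concrete, here is a minimal sketch of the pattern (illustrative only, not the actual DRBD code; example_dev, its lock and its io_frozen flag are made-up names): in this kernel generation, a make_request_fn that returns nonzero tells generic_make_request() to resubmit the same bio, so a driver that only notices the frozen state after taking its spinlock can back out and ask for a retry instead of failing the bio.

static int example_make_request(struct request_queue *q, struct bio *bio)
{
	struct example_dev *dev = q->queuedata;	/* hypothetical driver state */

	spin_lock_irq(&dev->lock);
	if (dev->io_frozen) {
		/* Detected too late to sleep with the lock held: back out
		 * and let generic_make_request() resubmit this bio; on the
		 * next call we block earlier, before taking any references. */
		spin_unlock_irq(&dev->lock);
		return 1;	/* nonzero return: the block layer retries */
	}
	/* ... normal submission path ... */
	spin_unlock_irq(&dev->lock);
	return 0;
}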
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 4 ++-- drivers/block/drbd/drbd_req.c | 33 ++++++++++++++++++++++++++------- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 4b97f30bb7c6..c194348a46ed 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -2223,7 +2223,7 @@ static inline int __inc_ap_bio_cond(struct drbd_conf *mdev) /* I'd like to use wait_event_lock_irq, * but I'm not sure when it got introduced, * and not sure when it has 3 or 4 arguments */ -static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two) +static inline void inc_ap_bio(struct drbd_conf *mdev, int count) { /* compare with after_state_ch, * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */ @@ -2245,7 +2245,7 @@ static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two) finish_wait(&mdev->misc_wait, &wait); spin_lock_irq(&mdev->req_lock); } - atomic_add(one_or_two, &mdev->ap_bio_cnt); + atomic_add(count, &mdev->ap_bio_cnt); spin_unlock_irq(&mdev->req_lock); } diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 343e0e6dd532..3397f11d0ba9 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -722,6 +722,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio) struct drbd_request *req; int local, remote; int err = -EIO; + int ret = 0; /* allocate outside of all locks; */ req = drbd_req_new(mdev, bio); @@ -784,7 +785,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio) (mdev->state.pdsk == D_INCONSISTENT && mdev->state.conn >= C_CONNECTED)); - if (!(local || remote)) { + if (!(local || remote) && !mdev->state.susp) { dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); goto fail_free_complete; } @@ -810,6 +811,16 @@ allocate_barrier: /* GOOD, everything prepared, grab the spin_lock */ spin_lock_irq(&mdev->req_lock); + if (mdev->state.susp) { + /* If we got suspended, use the retry mechanism of + generic_make_request() to restart processing of this + bio. In the next call to drbd_make_request_26 + we sleep in inc_ap_bio() */ + ret = 1; + spin_unlock_irq(&mdev->req_lock); + goto fail_free_complete; + } + if (remote) { remote = (mdev->state.pdsk == D_UP_TO_DATE || (mdev->state.pdsk == D_INCONSISTENT && @@ -947,12 +958,14 @@ fail_and_free_req: req->private_bio = NULL; put_ldev(mdev); } - bio_endio(bio, err); + if (!ret) + bio_endio(bio, err); + drbd_req_free(req); dec_ap_bio(mdev); kfree(b); - return 0; + return ret; } /* helper function for drbd_make_request @@ -1065,15 +1078,21 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio) /* we need to get a "reference count" (ap_bio_cnt) * to avoid races with the disconnect/reconnect/suspend code. - * In case we need to split the bio here, we need to get two references + * In case we need to split the bio here, we need to get three references * atomically, otherwise we might deadlock when trying to submit the * second one! 
*/ - inc_ap_bio(mdev, 2); + inc_ap_bio(mdev, 3); D_ASSERT(e_enr == s_enr + 1); - drbd_make_request_common(mdev, &bp->bio1); - drbd_make_request_common(mdev, &bp->bio2); + while (drbd_make_request_common(mdev, &bp->bio1)) + inc_ap_bio(mdev, 1); + + while (drbd_make_request_common(mdev, &bp->bio2)) + inc_ap_bio(mdev, 1); + + dec_ap_bio(mdev); + bio_pair_release(bp); } return 0; -- cgit v1.2.3-59-g8ed1b From 0c3f34516e8c5a1a0ba3585a7777d32bbbdf4ecb Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 17 May 2010 16:10:43 +0200 Subject: drbd: Create new current UUID as late as possible The choice was to either delay creation of the new UUID until IO got thawed or to delay it until the first IO request. Both are correct, the latter is more friendly to users of dual-primary setups that actually only write on one side. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 9 ++++++++- drivers/block/drbd/drbd_main.c | 25 ++++++++++++++++++++----- drivers/block/drbd/drbd_receiver.c | 11 +++++++++++ 3 files changed, 39 insertions(+), 6 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index c194348a46ed..e9654c8d5b62 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -943,7 +943,8 @@ struct drbd_conf { struct drbd_work resync_work, unplug_work, md_sync_work, - delay_probe_work; + delay_probe_work, + uuid_work; struct timer_list resync_timer; struct timer_list md_sync_timer; struct timer_list delay_probe_timer; @@ -1068,6 +1069,7 @@ struct drbd_conf { struct timeval dps_time; /* delay-probes-start-time */ unsigned int dp_volume_last; /* send_cnt of last delay probe */ int c_sync_rate; /* current resync rate after delay_probe magic */ + atomic_t new_c_uuid; }; static inline struct drbd_conf *minor_to_mdev(unsigned int minor) @@ -2217,6 +2219,8 @@ static inline int __inc_ap_bio_cond(struct drbd_conf *mdev) return 0; if (test_bit(BITMAP_IO, &mdev->flags)) return 0; + if (atomic_read(&mdev->new_c_uuid)) + return 0; return 1; } @@ -2237,6 +2241,9 @@ static inline void inc_ap_bio(struct drbd_conf *mdev, int count) * to avoid races with the reconnect code, * we need to atomic_inc within the spinlock. */ + if (atomic_read(&mdev->new_c_uuid) && atomic_add_unless(&mdev->new_c_uuid, -1, 1)) + drbd_queue_work_front(&mdev->data.work, &mdev->uuid_work); + spin_lock_irq(&mdev->req_lock); while (!__inc_ap_bio_cond(mdev)) { prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index d0fabace1452..c144509011b8 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1217,17 +1217,16 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, mdev->p_uuid = NULL; if (get_ldev(mdev)) { if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && - mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { - drbd_uuid_new_current(mdev); - drbd_send_uuids(mdev); - } + mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) + atomic_set(&mdev->new_c_uuid, 2); put_ldev(mdev); } } if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { + /* Diskless peer becomes primary or got connected to diskless, primary peer. 
*/ if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) - drbd_uuid_new_current(mdev); + atomic_set(&mdev->new_c_uuid, 2); /* D_DISKLESS Peer becomes secondary */ if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) @@ -1351,6 +1350,19 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, drbd_md_sync(mdev); } +static int w_new_current_uuid(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + if (get_ldev(mdev)) { + drbd_uuid_new_current(mdev); + drbd_send_uuids(mdev); + drbd_md_sync(mdev); + put_ldev(mdev); + } + atomic_dec(&mdev->new_c_uuid); + wake_up(&mdev->misc_wait); + + return 1; +} static int drbd_thread_setup(void *arg) { @@ -2691,6 +2703,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) atomic_set(&mdev->net_cnt, 0); atomic_set(&mdev->packet_seq, 0); atomic_set(&mdev->pp_in_use, 0); + atomic_set(&mdev->new_c_uuid, 0); mutex_init(&mdev->md_io_mutex); mutex_init(&mdev->data.mutex); @@ -2721,12 +2734,14 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) INIT_LIST_HEAD(&mdev->bm_io_work.w.list); INIT_LIST_HEAD(&mdev->delay_probes); INIT_LIST_HEAD(&mdev->delay_probe_work.list); + INIT_LIST_HEAD(&mdev->uuid_work.list); mdev->resync_work.cb = w_resync_inactive; mdev->unplug_work.cb = w_send_write_hint; mdev->md_sync_work.cb = w_md_sync; mdev->bm_io_work.w.cb = w_bitmap_io; mdev->delay_probe_work.cb = w_delay_probes; + mdev->uuid_work.cb = w_new_current_uuid; init_timer(&mdev->resync_timer); init_timer(&mdev->md_sync_timer); init_timer(&mdev->delay_probe_timer); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index a04ec01ab3ce..461d9872d4d3 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1150,6 +1150,17 @@ int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, unsigned n_bios = 0; unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT; + if (atomic_read(&mdev->new_c_uuid)) { + if (atomic_add_unless(&mdev->new_c_uuid, -1, 1)) { + drbd_uuid_new_current(mdev); + drbd_md_sync(mdev); + + atomic_dec(&mdev->new_c_uuid); + wake_up(&mdev->misc_wait); + } + wait_event(mdev->misc_wait, !atomic_read(&mdev->new_c_uuid)); + } + /* In most cases, we will only need one bio. But in case the lower * level restrictions happen to be different at this offset on this * side than those of the sending peer, we may need to submit the -- cgit v1.2.3-59-g8ed1b From 2db4e42eaceabec42f738f3895300632cd375e67 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 13 May 2010 22:02:21 +0200 Subject: drivers/block/drbd: Use kzalloc Use kzalloc rather than the combination of kmalloc and memset. 
The semantic patch that makes this change is as follows: (http://coccinelle.lip6.fr/) // @@ expression x,size,flags; statement S; @@ -x = kmalloc(size,flags); +x = kzalloc(size,flags); if (x == NULL) S -memset(x, 0, size); // Signed-off-by: Julia Lawall Signed-off-by: Philipp Reisner Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_nl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index f20336bc59c8..632e3245d1bb 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1196,13 +1196,12 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, } /* allocation not in the IO path, cqueue thread context */ - new_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL); + new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); if (!new_conf) { retcode = ERR_NOMEM; goto fail; } - memset(new_conf, 0, sizeof(struct net_conf)); new_conf->timeout = DRBD_TIMEOUT_DEF; new_conf->try_connect_int = DRBD_CONNECT_INT_DEF; new_conf->ping_int = DRBD_PING_INT_DEF; -- cgit v1.2.3-59-g8ed1b From 7c8a3554c683f512dbcee26faedb42e4c05f12fa Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 18 May 2010 14:29:29 +0200 Subject: writeback: ensure that WB_SYNC_NONE writeback with sb pinned is sync Even if the writeout itself isn't a data integrity operation, we need to ensure that the caller doesn't drop the sb umount sem before we have actually done the writeback. This is a fixup for commit e913fc82. Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 0f629571234f..76f546d56a64 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -193,7 +193,8 @@ static void bdi_wait_on_work_clear(struct bdi_work *work) } static void bdi_alloc_queue_work(struct backing_dev_info *bdi, - struct wb_writeback_args *args) + struct wb_writeback_args *args, + int wait) { struct bdi_work *work; @@ -205,6 +206,8 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi, if (work) { bdi_work_init(work, args); bdi_queue_work(bdi, work); + if (wait) + bdi_wait_on_work_clear(work); } else { struct bdi_writeback *wb = &bdi->wb; @@ -279,7 +282,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, args.for_background = 1; } - bdi_alloc_queue_work(bdi, &args); + bdi_alloc_queue_work(bdi, &args, sb_locked); } /* @@ -909,6 +912,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) while ((work = get_next_work_item(bdi, wb)) != NULL) { struct wb_writeback_args args = work->args; + int post_clear; /* * Override sync mode, in case we must wait for completion @@ -916,11 +920,13 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) if (force_wait) work->args.sync_mode = args.sync_mode = WB_SYNC_ALL; + post_clear = WB_SYNC_ALL || args.sb_pinned; + /* * If this isn't a data integrity operation, just notify * that we have seen this work and we are now starting it. */ - if (args.sync_mode == WB_SYNC_NONE) + if (!post_clear) wb_clear_pending(wb, work); wrote += wb_writeback(wb, &args); @@ -929,7 +935,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) * This is a data integrity writeback, so only do the * notification when we have completed the work. 
*/ - if (args.sync_mode == WB_SYNC_ALL) + if (post_clear) wb_clear_pending(wb, work); } @@ -1000,7 +1006,7 @@ static void bdi_writeback_all(struct super_block *sb, long nr_pages) if (!bdi_has_dirty_io(bdi)) continue; - bdi_alloc_queue_work(bdi, &args); + bdi_alloc_queue_work(bdi, &args, 0); } rcu_read_unlock(); -- cgit v1.2.3-59-g8ed1b From f9eadbbd424c083b8005c7b738f644611b9ef489 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 18 May 2010 14:31:45 +0200 Subject: writeback: bdi_writeback_task() must set task state before calling schedule() Calling schedule without setting the task state to non-running will return immediately, so ensure that we set it properly and check our sleep conditions after doing so. This is a fixup for commit 69b62d01. Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 76f546d56a64..437a7431b4ea 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -978,8 +978,13 @@ int bdi_writeback_task(struct bdi_writeback *wb) if (dirty_writeback_interval) { wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); schedule_timeout_interruptible(wait_jiffies); - } else - schedule(); + } else { + set_current_state(TASK_INTERRUPTIBLE); + if (list_empty_careful(&wb->bdi->work_list) && + !kthread_should_stop()) + schedule(); + __set_current_state(TASK_RUNNING); + } try_to_freeze(); } -- cgit v1.2.3-59-g8ed1b From 6423104b6a1e6f0c18be60e8c33f02d263331d5e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 21 May 2010 20:00:35 +0200 Subject: writeback: fixups for !dirty_writeback_centisecs Commit 69b62d01 fixed up most of the places where we would enter busy schedule() spins when disabling the periodic background writeback. This fixes up the sb timer so that it doesn't get hammered on with the delay disabled, and ensures that it gets rearmed if needed when /proc/sys/vm/dirty_writeback_centisecs gets modified. bdi_forker_task() also needs to check for !dirty_writeback_centisecs and use schedule() appropriately, fix that up too. 
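Both writeback fixups above lean on the same sleep idiom; here is a minimal sketch (illustrative only; example_writeback_thread and work_available() are made-up names): the task state must be set to TASK_INTERRUPTIBLE before the wakeup condition is tested, otherwise a wakeup arriving between the test and schedule() is lost, and a schedule() entered while still TASK_RUNNING returns immediately, turning the loop into a busy spin. Likewise, when the periodic interval is 0, the thread must block indefinitely rather than spin on a zero timeout.

static int example_writeback_thread(void *data)
{
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!work_available(data) && !kthread_should_stop()) {
			if (dirty_writeback_interval)
				/* periodic mode: bounded, interruptible sleep */
				schedule_timeout(msecs_to_jiffies(
						dirty_writeback_interval * 10));
			else
				/* periodic writeback disabled: sleep until woken */
				schedule();
		}
		__set_current_state(TASK_RUNNING);
		/* ... process whatever work is now pending ... */
		try_to_freeze();
	}
	return 0;
}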
Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 1 + mm/backing-dev.c | 15 ++++++++++----- mm/page-writeback.c | 1 + 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index ff8bac63213f..e6e0cb5437e6 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -109,6 +109,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, long nr_pages, int sb_locked); int bdi_writeback_task(struct bdi_writeback *wb); int bdi_has_dirty_io(struct backing_dev_info *bdi); +void bdi_arm_supers_timer(void); extern spinlock_t bdi_lock; extern struct list_head bdi_list; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 707d0dc6da0f..660a87a22511 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -48,7 +48,6 @@ static struct timer_list sync_supers_timer; static int bdi_sync_supers(void *); static void sync_supers_timer_fn(unsigned long); -static void arm_supers_timer(void); static void bdi_add_default_flusher_task(struct backing_dev_info *bdi); @@ -252,7 +251,7 @@ static int __init default_bdi_init(void) init_timer(&sync_supers_timer); setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); - arm_supers_timer(); + bdi_arm_supers_timer(); err = bdi_init(&default_backing_dev_info); if (!err) @@ -374,10 +373,13 @@ static int bdi_sync_supers(void *unused) return 0; } -static void arm_supers_timer(void) +void bdi_arm_supers_timer(void) { unsigned long next; + if (!dirty_writeback_interval) + return; + next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies; mod_timer(&sync_supers_timer, round_jiffies_up(next)); } @@ -385,7 +387,7 @@ static void arm_supers_timer(void) static void sync_supers_timer_fn(unsigned long unused) { wake_up_process(sync_supers_tsk); - arm_supers_timer(); + bdi_arm_supers_timer(); } static int bdi_forker_task(void *ptr) @@ -428,7 +430,10 @@ static int bdi_forker_task(void *ptr) spin_unlock_bh(&bdi_lock); wait = msecs_to_jiffies(dirty_writeback_interval * 10); - schedule_timeout(wait); + if (wait) + schedule_timeout(wait); + else + schedule(); try_to_freeze(); continue; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 53b2fcf2d283..0d7bbe859550 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -690,6 +690,7 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, buffer, length, ppos); + bdi_arm_supers_timer(); return 0; } -- cgit v1.2.3-59-g8ed1b From f1ac2502e19c59e996242d406fcc60e4c563e8ce Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 19 May 2010 08:27:30 +0200 Subject: block: remove all rcu head initializations Remove all rcu head inits. We don't care about the RCU head state before passing it to call_rcu() anyway. Only leave the "on_stack" variants so debugobjects can keep track of objects on stack. Signed-off-by: Mathieu Desnoyers Signed-off-by: Paul E. McKenney Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 1 - block/genhd.c | 1 - 2 files changed, 2 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 0f3eb70f9ce1..b1472bc2d49c 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3788,7 +3788,6 @@ static void *cfq_init_queue(struct request_queue *q) * second, in order to have larger depth for async operations. 
*/ cfqd->last_delayed_sync = jiffies - HZ; - INIT_RCU_HEAD(&cfqd->rcu); return cfqd; } diff --git a/block/genhd.c b/block/genhd.c index 154b5f80b3ab..59a2db6fecef 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -988,7 +988,6 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) if (!new_ptbl) return -ENOMEM; - INIT_RCU_HEAD(&new_ptbl->rcu_head); new_ptbl->len = target; for (i = 0; i < len; i++) -- cgit v1.2.3-59-g8ed1b From fa4b9074cd8428958c2adf9dc0c831f46e27c193 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 15 May 2010 20:09:27 +0200 Subject: buffer: make invalidate_bdev() drain all percpu LRU add caches invalidate_bdev() should release all page cache pages which are clean and not being used; however, if some pages are still in the percpu LRU add caches on other cpus, those pages are considered in use and don't get released. Fix it by calling lru_add_drain_all() before trying to invalidate pages. This problem was discovered while testing block automatic native capacity unlocking. Null pages which were read before automatic unlocking didn't get released by invalidate_bdev() and ended up interfering with partition scan after unlocking. Signed-off-by: Tejun Heo Acked-by: David S. Miller Signed-off-by: Jens Axboe --- fs/buffer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/buffer.c b/fs/buffer.c index c9c266db0624..08e422d56996 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -275,6 +275,7 @@ void invalidate_bdev(struct block_device *bdev) return; invalidate_bh_lrus(); + lru_add_drain_all(); /* make sure all lru add caches are flushed */ invalidate_mapping_pages(mapping, 0, -1); } EXPORT_SYMBOL(invalidate_bdev); -- cgit v1.2.3-59-g8ed1b From 56bca01738733709bef076e2e97bbd01e5659f24 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 15 May 2010 20:09:28 +0200 Subject: block: restart partition scan after resizing a device Device resize via ->set_capacity() can reveal new partitions (e.g. in chained partition table formats such as dos extended parts). Restart partition scan from the beginning after resizing a device. This change also makes libata always revalidate the disk after resize which makes lower layer native capacity unlocking implementation simpler and more robust as resize can be handled in the usual path. Signed-off-by: Tejun Heo Reported-by: Ben Hutchings Acked-by: David S.
Miller Signed-off-by: Jens Axboe --- fs/partitions/check.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/partitions/check.c b/fs/partitions/check.c index e238ab23a9e7..8f01df354f04 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -544,7 +544,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) struct hd_struct *part; struct parsed_partitions *state; int p, highest, res; - +rescan: if (bdev->bd_part_count) return -EBUSY; res = invalidate_partition(disk, 0); @@ -581,7 +581,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) /* add partitions */ for (p = 1; p < state->limit; p++) { sector_t size, from; -try_scan: + size = state->parts[p].size; if (!size) continue; @@ -596,7 +596,6 @@ try_scan: if (from + size > get_capacity(disk)) { const struct block_device_operations *bdops = disk->fops; - unsigned long long capacity; printk(KERN_WARNING "%s: p%d size %llu exceeds device capacity, ", @@ -605,14 +604,11 @@ try_scan: if (bdops->set_capacity && (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) { printk(KERN_CONT "enabling native capacity\n"); - capacity = bdops->set_capacity(disk, ~0ULL); + bdops->set_capacity(disk, ~0ULL); disk->flags |= GENHD_FL_NATIVE_CAPACITY; - if (capacity > get_capacity(disk)) { - set_capacity(disk, capacity); - check_disk_size_change(disk, bdev); - bdev->bd_invalidated = 0; - } - goto try_scan; + /* free state and restart */ + kfree(state); + goto rescan; } else { /* * we can not ignore partitions of broken tables -- cgit v1.2.3-59-g8ed1b From c3e33e043f5e9c583aa59d5591a614b2a8243d3a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 15 May 2010 20:09:29 +0200 Subject: block,ide: simplify bdops->set_capacity() to ->unlock_native_capacity() bdops->set_capacity() is unnecessarily generic. All that's required is a simple one way notification to lower level driver telling it to try to unlock native capacity. There's no reason to pass in target capacity or return the new capacity. The former is always the inherent native capacity and the latter can be handled via the usual device resize / revalidation path. In fact, the current API is always used that way. Replace ->set_capacity() with ->unlock_native_capacity() which take only @disk and doesn't return anything. IDE which is the only current user of the API is converted accordingly. Signed-off-by: Tejun Heo Cc: Ben Hutchings Cc: Bartlomiej Zolnierkiewicz Acked-by: David S. 
Miller Signed-off-by: Jens Axboe --- drivers/ide/ide-disk.c | 40 ++++++++++++++++------------------------ drivers/ide/ide-gd.c | 11 ++++------- fs/partitions/check.c | 4 ++-- include/linux/blkdev.h | 3 +-- include/linux/ide.h | 2 +- 5 files changed, 24 insertions(+), 36 deletions(-) diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 3b128dce9c3a..33d65039cce9 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -407,32 +407,24 @@ static int ide_disk_get_capacity(ide_drive_t *drive) return 0; } -static u64 ide_disk_set_capacity(ide_drive_t *drive, u64 capacity) +static void ide_disk_unlock_native_capacity(ide_drive_t *drive) { - u64 set = min(capacity, drive->probed_capacity); u16 *id = drive->id; int lba48 = ata_id_lba48_enabled(id); if ((drive->dev_flags & IDE_DFLAG_LBA) == 0 || ata_id_hpa_enabled(id) == 0) - goto out; + return; /* * according to the spec the SET MAX ADDRESS command shall be * immediately preceded by a READ NATIVE MAX ADDRESS command */ - capacity = ide_disk_hpa_get_native_capacity(drive, lba48); - if (capacity == 0) - goto out; - - set = ide_disk_hpa_set_capacity(drive, set, lba48); - if (set) { - /* needed for ->resume to disable HPA */ - drive->dev_flags |= IDE_DFLAG_NOHPA; - return set; - } -out: - return drive->capacity64; + if (!ide_disk_hpa_get_native_capacity(drive, lba48)) + return; + + if (ide_disk_hpa_set_capacity(drive, drive->probed_capacity, lba48)) + drive->dev_flags |= IDE_DFLAG_NOHPA; /* disable HPA on resume */ } static void idedisk_prepare_flush(struct request_queue *q, struct request *rq) @@ -783,13 +775,13 @@ static int ide_disk_set_doorlock(ide_drive_t *drive, struct gendisk *disk, } const struct ide_disk_ops ide_ata_disk_ops = { - .check = ide_disk_check, - .set_capacity = ide_disk_set_capacity, - .get_capacity = ide_disk_get_capacity, - .setup = ide_disk_setup, - .flush = ide_disk_flush, - .init_media = ide_disk_init_media, - .set_doorlock = ide_disk_set_doorlock, - .do_request = ide_do_rw_disk, - .ioctl = ide_disk_ioctl, + .check = ide_disk_check, + .unlock_native_capacity = ide_disk_unlock_native_capacity, + .get_capacity = ide_disk_get_capacity, + .setup = ide_disk_setup, + .flush = ide_disk_flush, + .init_media = ide_disk_init_media, + .set_doorlock = ide_disk_set_doorlock, + .do_request = ide_do_rw_disk, + .ioctl = ide_disk_ioctl, }; diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c index c32d83996ae1..c102d23d9b38 100644 --- a/drivers/ide/ide-gd.c +++ b/drivers/ide/ide-gd.c @@ -288,17 +288,14 @@ static int ide_gd_media_changed(struct gendisk *disk) return ret; } -static unsigned long long ide_gd_set_capacity(struct gendisk *disk, - unsigned long long capacity) +static void ide_gd_unlock_native_capacity(struct gendisk *disk) { struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj); ide_drive_t *drive = idkp->drive; const struct ide_disk_ops *disk_ops = drive->disk_ops; - if (disk_ops->set_capacity) - return disk_ops->set_capacity(drive, capacity); - - return drive->capacity64; + if (disk_ops->unlock_native_capacity) + disk_ops->unlock_native_capacity(drive); } static int ide_gd_revalidate_disk(struct gendisk *disk) @@ -329,7 +326,7 @@ static const struct block_device_operations ide_gd_ops = { .locked_ioctl = ide_gd_ioctl, .getgeo = ide_gd_getgeo, .media_changed = ide_gd_media_changed, - .set_capacity = ide_gd_set_capacity, + .unlock_native_capacity = ide_gd_unlock_native_capacity, .revalidate_disk = ide_gd_revalidate_disk }; diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 
8f01df354f04..4f1fee0355ad 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -601,10 +601,10 @@ rescan: "%s: p%d size %llu exceeds device capacity, ", disk->disk_name, p, (unsigned long long) size); - if (bdops->set_capacity && + if (bdops->unlock_native_capacity && (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) { printk(KERN_CONT "enabling native capacity\n"); - bdops->set_capacity(disk, ~0ULL); + bdops->unlock_native_capacity(disk); disk->flags |= GENHD_FL_NATIVE_CAPACITY; /* free state and restart */ kfree(state); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 346fd4856733..be411c12ebbe 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1330,8 +1330,7 @@ struct block_device_operations { int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *); int (*media_changed) (struct gendisk *); - unsigned long long (*set_capacity) (struct gendisk *, - unsigned long long); + void (*unlock_native_capacity) (struct gendisk *); int (*revalidate_disk) (struct gendisk *); int (*getgeo)(struct block_device *, struct hd_geometry *); struct module *owner; diff --git a/include/linux/ide.h b/include/linux/ide.h index 3239d1c10acb..b6d448048ae2 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -362,7 +362,7 @@ struct ide_drive_s; struct ide_disk_ops { int (*check)(struct ide_drive_s *, const char *); int (*get_capacity)(struct ide_drive_s *); - u64 (*set_capacity)(struct ide_drive_s *, u64); + void (*unlock_native_capacity)(struct ide_drive_s *); void (*setup)(struct ide_drive_s *); void (*flush)(struct ide_drive_s *); int (*init_media)(struct ide_drive_s *, struct gendisk *); -- cgit v1.2.3-59-g8ed1b From 1493bf217f7f59a5d9e2095a7dbcec00fb36ca8b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 15 May 2010 20:09:30 +0200 Subject: block: use struct parsed_partitions *state universally in partition check code Make the following changes to partition check code. * Add ->bdev to struct parsed_partitions. * Introduce read_part_sector(), a simple wrapper around read_dev_sector() that takes struct parsed_partitions *state instead of @bdev. * For functions which used to take @state and @bdev, drop @bdev. For functions which used to take @bdev, replace it with @state. * While updating, drop superfluous checks on NULL state/bdev in ldm.c. This cleans up the API a bit and enables better handling of IO errors during partition check, as the generic partition check code now has much better visibility into what went wrong in the low-level code paths. Signed-off-by: Tejun Heo Cc: Ben Hutchings Acked-by: David S.
Miller Signed-off-by: Jens Axboe --- fs/partitions/acorn.c | 68 ++++++++++++++++++------------------- fs/partitions/acorn.h | 10 +++--- fs/partitions/amiga.c | 13 ++++---- fs/partitions/amiga.h | 2 +- fs/partitions/atari.c | 8 ++--- fs/partitions/atari.h | 2 +- fs/partitions/check.c | 5 +-- fs/partitions/check.h | 7 ++++ fs/partitions/efi.c | 91 ++++++++++++++++++++++++-------------------------- fs/partitions/efi.h | 2 +- fs/partitions/ibm.c | 21 ++++++------ fs/partitions/ibm.h | 2 +- fs/partitions/karma.c | 4 +-- fs/partitions/karma.h | 2 +- fs/partitions/ldm.c | 89 ++++++++++++++++++++++++------------------------ fs/partitions/ldm.h | 2 +- fs/partitions/mac.c | 11 +++--- fs/partitions/mac.h | 2 +- fs/partitions/msdos.c | 85 ++++++++++++++++++++-------------------------- fs/partitions/msdos.h | 2 +- fs/partitions/osf.c | 4 +-- fs/partitions/osf.h | 2 +- fs/partitions/sgi.c | 6 ++-- fs/partitions/sgi.h | 2 +- fs/partitions/sun.c | 6 ++-- fs/partitions/sun.h | 2 +- fs/partitions/sysv68.c | 6 ++-- fs/partitions/sysv68.h | 2 +- fs/partitions/ultrix.c | 4 +-- fs/partitions/ultrix.h | 2 +- 30 files changed, 225 insertions(+), 239 deletions(-) diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c index a97b477ac0fc..6921e7890be6 100644 --- a/fs/partitions/acorn.c +++ b/fs/partitions/acorn.c @@ -70,14 +70,14 @@ struct riscix_record { #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ defined(CONFIG_ACORN_PARTITION_ADFS) -static int -riscix_partition(struct parsed_partitions *state, struct block_device *bdev, - unsigned long first_sect, int slot, unsigned long nr_sects) +static int riscix_partition(struct parsed_partitions *state, + unsigned long first_sect, int slot, + unsigned long nr_sects) { Sector sect; struct riscix_record *rr; - rr = (struct riscix_record *)read_dev_sector(bdev, first_sect, §); + rr = read_part_sector(state, first_sect, §); if (!rr) return -1; @@ -123,9 +123,9 @@ struct linux_part { #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ defined(CONFIG_ACORN_PARTITION_ADFS) -static int -linux_partition(struct parsed_partitions *state, struct block_device *bdev, - unsigned long first_sect, int slot, unsigned long nr_sects) +static int linux_partition(struct parsed_partitions *state, + unsigned long first_sect, int slot, + unsigned long nr_sects) { Sector sect; struct linux_part *linuxp; @@ -135,7 +135,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev, put_partition(state, slot++, first_sect, size); - linuxp = (struct linux_part *)read_dev_sector(bdev, first_sect, §); + linuxp = read_part_sector(state, first_sect, §); if (!linuxp) return -1; @@ -157,8 +157,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev, #endif #ifdef CONFIG_ACORN_PARTITION_CUMANA -int -adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev) +int adfspart_check_CUMANA(struct parsed_partitions *state) { unsigned long first_sector = 0; unsigned int start_blk = 0; @@ -185,7 +184,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev struct adfs_discrecord *dr; unsigned int nr_sects; - data = read_dev_sector(bdev, start_blk * 2 + 6, §); + data = read_part_sector(state, start_blk * 2 + 6, §); if (!data) return -1; @@ -217,14 +216,14 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev #ifdef CONFIG_ACORN_PARTITION_RISCIX case PARTITION_RISCIX_SCSI: /* RISCiX - we don't know how to find the next one. 
*/ - slot = riscix_partition(state, bdev, first_sector, - slot, nr_sects); + slot = riscix_partition(state, first_sector, slot, + nr_sects); break; #endif case PARTITION_LINUX: - slot = linux_partition(state, bdev, first_sector, - slot, nr_sects); + slot = linux_partition(state, first_sector, slot, + nr_sects); break; } put_dev_sector(sect); @@ -249,8 +248,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev * hda1 = ADFS partition on first drive. * hda2 = non-ADFS partition. */ -int -adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev) +int adfspart_check_ADFS(struct parsed_partitions *state) { unsigned long start_sect, nr_sects, sectscyl, heads; Sector sect; @@ -259,7 +257,7 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev) unsigned char id; int slot = 1; - data = read_dev_sector(bdev, 6, §); + data = read_part_sector(state, 6, §); if (!data) return -1; @@ -278,21 +276,21 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev) /* * Work out start of non-adfs partition. */ - nr_sects = (bdev->bd_inode->i_size >> 9) - start_sect; + nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect; if (start_sect) { switch (id) { #ifdef CONFIG_ACORN_PARTITION_RISCIX case PARTITION_RISCIX_SCSI: case PARTITION_RISCIX_MFM: - slot = riscix_partition(state, bdev, start_sect, - slot, nr_sects); + slot = riscix_partition(state, start_sect, slot, + nr_sects); break; #endif case PARTITION_LINUX: - slot = linux_partition(state, bdev, start_sect, - slot, nr_sects); + slot = linux_partition(state, start_sect, slot, + nr_sects); break; } } @@ -308,10 +306,11 @@ struct ics_part { __le32 size; }; -static int adfspart_check_ICSLinux(struct block_device *bdev, unsigned long block) +static int adfspart_check_ICSLinux(struct parsed_partitions *state, + unsigned long block) { Sector sect; - unsigned char *data = read_dev_sector(bdev, block, §); + unsigned char *data = read_part_sector(state, block, §); int result = 0; if (data) { @@ -349,8 +348,7 @@ static inline int valid_ics_sector(const unsigned char *data) * hda2 = ADFS partition 1 on first drive. * ..etc.. */ -int -adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev) +int adfspart_check_ICS(struct parsed_partitions *state) { const unsigned char *data; const struct ics_part *p; @@ -360,7 +358,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev) /* * Try ICS style partitions - sector 0 contains partition info. */ - data = read_dev_sector(bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) return -1; @@ -392,7 +390,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev) * partition is. We must not make this visible * to the filesystem. */ - if (size > 1 && adfspart_check_ICSLinux(bdev, start)) { + if (size > 1 && adfspart_check_ICSLinux(state, start)) { start += 1; size -= 1; } @@ -446,8 +444,7 @@ static inline int valid_ptec_sector(const unsigned char *data) * hda2 = ADFS partition 1 on first drive. * ..etc.. 
*/ -int -adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev) +int adfspart_check_POWERTEC(struct parsed_partitions *state) { Sector sect; const unsigned char *data; @@ -455,7 +452,7 @@ adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bd int slot = 1; int i; - data = read_dev_sector(bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) return -1; @@ -508,8 +505,7 @@ static const char eesox_name[] = { * 1. The individual ADFS boot block entries that are placed on the disk. * 2. The start address of the next entry. */ -int -adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev) +int adfspart_check_EESOX(struct parsed_partitions *state) { Sector sect; const unsigned char *data; @@ -518,7 +514,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev) sector_t start = 0; int i, slot = 1; - data = read_dev_sector(bdev, 7, §); + data = read_part_sector(state, 7, §); if (!data) return -1; @@ -545,7 +541,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev) if (i != 0) { sector_t size; - size = get_capacity(bdev->bd_disk); + size = get_capacity(state->bdev->bd_disk); put_partition(state, slot++, start, size - start); printk("\n"); } diff --git a/fs/partitions/acorn.h b/fs/partitions/acorn.h index 81fd50ecc080..ede828529692 100644 --- a/fs/partitions/acorn.h +++ b/fs/partitions/acorn.h @@ -7,8 +7,8 @@ * format, and everyone stick to it? */ -int adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev); -int adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev); -int adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev); -int adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev); -int adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev); +int adfspart_check_CUMANA(struct parsed_partitions *state); +int adfspart_check_ADFS(struct parsed_partitions *state); +int adfspart_check_ICS(struct parsed_partitions *state); +int adfspart_check_POWERTEC(struct parsed_partitions *state); +int adfspart_check_EESOX(struct parsed_partitions *state); diff --git a/fs/partitions/amiga.c b/fs/partitions/amiga.c index 9917a8c360f2..ba443d4229f8 100644 --- a/fs/partitions/amiga.c +++ b/fs/partitions/amiga.c @@ -23,8 +23,7 @@ checksum_block(__be32 *m, int size) return sum; } -int -amiga_partition(struct parsed_partitions *state, struct block_device *bdev) +int amiga_partition(struct parsed_partitions *state) { Sector sect; unsigned char *data; @@ -38,11 +37,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev) for (blk = 0; ; blk++, put_dev_sector(sect)) { if (blk == RDB_ALLOCATION_LIMIT) goto rdb_done; - data = read_dev_sector(bdev, blk, §); + data = read_part_sector(state, blk, §); if (!data) { if (warn_no_part) printk("Dev %s: unable to read RDB block %d\n", - bdevname(bdev, b), blk); + bdevname(state->bdev, b), blk); res = -1; goto rdb_done; } @@ -64,7 +63,7 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev) } printk("Dev %s: RDB in block %d has bad checksum\n", - bdevname(bdev, b), blk); + bdevname(state->bdev, b), blk); } /* blksize is blocks per 512 byte standard block */ @@ -75,11 +74,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev) put_dev_sector(sect); for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) { blk *= blksize; /* Read in 
terms partition table understands */ - data = read_dev_sector(bdev, blk, §); + data = read_part_sector(state, blk, §); if (!data) { if (warn_no_part) printk("Dev %s: unable to read partition block %d\n", - bdevname(bdev, b), blk); + bdevname(state->bdev, b), blk); res = -1; goto rdb_done; } diff --git a/fs/partitions/amiga.h b/fs/partitions/amiga.h index 2f3e9ce22d53..d094585cadaa 100644 --- a/fs/partitions/amiga.h +++ b/fs/partitions/amiga.h @@ -2,5 +2,5 @@ * fs/partitions/amiga.h */ -int amiga_partition(struct parsed_partitions *state, struct block_device *bdev); +int amiga_partition(struct parsed_partitions *state); diff --git a/fs/partitions/atari.c b/fs/partitions/atari.c index 1f3572d5b755..4439ff1b6cec 100644 --- a/fs/partitions/atari.c +++ b/fs/partitions/atari.c @@ -30,7 +30,7 @@ static inline int OK_id(char *s) memcmp (s, "RAW", 3) == 0 ; } -int atari_partition(struct parsed_partitions *state, struct block_device *bdev) +int atari_partition(struct parsed_partitions *state) { Sector sect; struct rootsector *rs; @@ -42,12 +42,12 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev) int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */ #endif - rs = (struct rootsector *) read_dev_sector(bdev, 0, §); + rs = read_part_sector(state, 0, §); if (!rs) return -1; /* Verify this is an Atari rootsector: */ - hd_size = bdev->bd_inode->i_size >> 9; + hd_size = state->bdev->bd_inode->i_size >> 9; if (!VALID_PARTITION(&rs->part[0], hd_size) && !VALID_PARTITION(&rs->part[1], hd_size) && !VALID_PARTITION(&rs->part[2], hd_size) && @@ -84,7 +84,7 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev) printk(" XGM<"); partsect = extensect = be32_to_cpu(pi->st); while (1) { - xrs = (struct rootsector *)read_dev_sector(bdev, partsect, §2); + xrs = read_part_sector(state, partsect, §2); if (!xrs) { printk (" block %ld read failed\n", partsect); put_dev_sector(sect); diff --git a/fs/partitions/atari.h b/fs/partitions/atari.h index 63186b00e135..fe2d32a89f36 100644 --- a/fs/partitions/atari.h +++ b/fs/partitions/atari.h @@ -31,4 +31,4 @@ struct rootsector u16 checksum; /* checksum for bootable disks */ } __attribute__((__packed__)); -int atari_partition(struct parsed_partitions *state, struct block_device *bdev); +int atari_partition(struct parsed_partitions *state); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 4f1fee0355ad..a19995c6f6af 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -45,7 +45,7 @@ extern void md_autodetect_dev(dev_t dev); int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/ -static int (*check_part[])(struct parsed_partitions *, struct block_device *) = { +static int (*check_part[])(struct parsed_partitions *) = { /* * Probe partition formats with tables at disk address 0 * that also have an ADFS boot block at 0xdc0. @@ -165,6 +165,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev) if (!state) return NULL; + state->bdev = bdev; disk_name(hd, 0, state->name); printk(KERN_INFO " %s:", state->name); if (isdigit(state->name[strlen(state->name)-1])) @@ -174,7 +175,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev) i = res = err = 0; while (!res && check_part[i]) { memset(&state->parts, 0, sizeof(state->parts)); - res = check_part[i++](state, bdev); + res = check_part[i++](state); if (res < 0) { /* We have hit an I/O error which we don't report now. * But record it, and let the others do their job. 
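The conversion pattern is easiest to see end to end. Below is a minimal sketch of a detector written against the new state-based API; it is illustrative only, with an invented name, magic check and partition geometry, and it assumes just the read_part_sector()/put_partition()/put_dev_sector() helpers this series describes:

static int example_partition(struct parsed_partitions *state)
{
	Sector sect;
	unsigned char *data;

	/* all sector reads now funnel through @state */
	data = read_part_sector(state, 0, &sect);
	if (!data)
		return -1;	/* I/O error; check_partition() records it */

	/* hypothetical magic at the end of sector 0 */
	if (data[510] != 0x55 || data[511] != 0xaa) {
		put_dev_sector(sect);
		return 0;	/* not this format, let the next parser try */
	}

	/* hypothetical layout: slot 1, starting at LBA 1, 2048 sectors */
	put_partition(state, 1, 1, 2048);
	put_dev_sector(sect);
	return 1;
}

Because every read goes through the wrapper, the generic code can attribute I/O failures (and, in the following patch, beyond-EOD accesses) to the scan as a whole rather than losing them inside individual parsers.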
diff --git a/fs/partitions/check.h b/fs/partitions/check.h index 98dbe1a84528..4b31a97775be 100644 --- a/fs/partitions/check.h +++ b/fs/partitions/check.h @@ -6,6 +6,7 @@ * description. */ struct parsed_partitions { + struct block_device *bdev; char name[BDEVNAME_SIZE]; struct { sector_t from; @@ -16,6 +17,12 @@ struct parsed_partitions { int limit; }; +static inline void *read_part_sector(struct parsed_partitions *state, + sector_t n, Sector *p) +{ + return read_dev_sector(state->bdev, n, p); +} + static inline void put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) { diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c index 91babdae7587..9e346c19bbba 100644 --- a/fs/partitions/efi.c +++ b/fs/partitions/efi.c @@ -140,8 +140,7 @@ efi_crc32(const void *buf, unsigned long len) * the part[0] entry for this disk, and is the number of * physical sectors available on the disk. */ -static u64 -last_lba(struct block_device *bdev) +static u64 last_lba(struct block_device *bdev) { if (!bdev || !bdev->bd_inode) return 0; @@ -181,27 +180,28 @@ is_pmbr_valid(legacy_mbr *mbr) /** * read_lba(): Read bytes from disk, starting at given LBA - * @bdev + * @state * @lba * @buffer * @size_t * - * Description: Reads @count bytes from @bdev into @buffer. + * Description: Reads @count bytes from @state->bdev into @buffer. * Returns number of bytes read on success, 0 on error. */ -static size_t -read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count) +static size_t read_lba(struct parsed_partitions *state, + u64 lba, u8 *buffer, size_t count) { size_t totalreadcount = 0; + struct block_device *bdev = state->bdev; sector_t n = lba * (bdev_logical_block_size(bdev) / 512); - if (!bdev || !buffer || lba > last_lba(bdev)) + if (!buffer || lba > last_lba(bdev)) return 0; while (count) { int copied = 512; Sector sect; - unsigned char *data = read_dev_sector(bdev, n++, §); + unsigned char *data = read_part_sector(state, n++, §); if (!data) break; if (copied > count) @@ -217,19 +217,20 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count) /** * alloc_read_gpt_entries(): reads partition entries from disk - * @bdev + * @state * @gpt - GPT header * * Description: Returns ptes on success, NULL on error. * Allocates space for PTEs based on information found in @gpt. * Notes: remember to free pte when you're done! */ -static gpt_entry * -alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt) +static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, + gpt_header *gpt) { size_t count; gpt_entry *pte; - if (!bdev || !gpt) + + if (!gpt) return NULL; count = le32_to_cpu(gpt->num_partition_entries) * @@ -240,7 +241,7 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt) if (!pte) return NULL; - if (read_lba(bdev, le64_to_cpu(gpt->partition_entry_lba), + if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba), (u8 *) pte, count) < count) { kfree(pte); @@ -252,27 +253,24 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt) /** * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk - * @bdev + * @state * @lba is the Logical Block Address of the partition table * * Description: returns GPT header on success, NULL on error. Allocates - * and fills a GPT header starting at @ from @bdev. + * and fills a GPT header starting at @ from @state->bdev. * Note: remember to free gpt when finished with it. 
*/ -static gpt_header * -alloc_read_gpt_header(struct block_device *bdev, u64 lba) +static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state, + u64 lba) { gpt_header *gpt; - unsigned ssz = bdev_logical_block_size(bdev); - - if (!bdev) - return NULL; + unsigned ssz = bdev_logical_block_size(state->bdev); gpt = kzalloc(ssz, GFP_KERNEL); if (!gpt) return NULL; - if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) { + if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) { kfree(gpt); gpt=NULL; return NULL; @@ -283,7 +281,7 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba) /** * is_gpt_valid() - tests one GPT header and PTEs for validity - * @bdev + * @state * @lba is the logical block address of the GPT header to test * @gpt is a GPT header ptr, filled on return. * @ptes is a PTEs ptr, filled on return. @@ -291,16 +289,15 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba) * Description: returns 1 if valid, 0 on error. * If valid, returns pointers to newly allocated GPT header and PTEs. */ -static int -is_gpt_valid(struct block_device *bdev, u64 lba, - gpt_header **gpt, gpt_entry **ptes) +static int is_gpt_valid(struct parsed_partitions *state, u64 lba, + gpt_header **gpt, gpt_entry **ptes) { u32 crc, origcrc; u64 lastlba; - if (!bdev || !gpt || !ptes) + if (!ptes) return 0; - if (!(*gpt = alloc_read_gpt_header(bdev, lba))) + if (!(*gpt = alloc_read_gpt_header(state, lba))) return 0; /* Check the GUID Partition Table signature */ @@ -336,7 +333,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba, /* Check the first_usable_lba and last_usable_lba are * within the disk. */ - lastlba = last_lba(bdev); + lastlba = last_lba(state->bdev); if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), @@ -350,7 +347,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba, goto fail; } - if (!(*ptes = alloc_read_gpt_entries(bdev, *gpt))) + if (!(*ptes = alloc_read_gpt_entries(state, *gpt))) goto fail; /* Check the GUID Partition Entry Array CRC */ @@ -495,7 +492,7 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) /** * find_valid_gpt() - Search disk for valid GPT headers and PTEs - * @bdev + * @state * @gpt is a GPT header ptr, filled on return. * @ptes is a PTEs ptr, filled on return. * Description: Returns 1 if valid, 0 on error. @@ -508,24 +505,25 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) * This protects against devices which misreport their size, and forces * the user to decide to use the Alternate GPT. */ -static int -find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes) +static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, + gpt_entry **ptes) { int good_pgpt = 0, good_agpt = 0, good_pmbr = 0; gpt_header *pgpt = NULL, *agpt = NULL; gpt_entry *pptes = NULL, *aptes = NULL; legacy_mbr *legacymbr; u64 lastlba; - if (!bdev || !gpt || !ptes) + + if (!ptes) return 0; - lastlba = last_lba(bdev); + lastlba = last_lba(state->bdev); if (!force_gpt) { /* This will be added to the EFI Spec. per Intel after v1.02. 
*/ legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL); if (legacymbr) { - read_lba(bdev, 0, (u8 *) legacymbr, - sizeof (*legacymbr)); + read_lba(state, 0, (u8 *) legacymbr, + sizeof (*legacymbr)); good_pmbr = is_pmbr_valid(legacymbr); kfree(legacymbr); } @@ -533,15 +531,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes) goto fail; } - good_pgpt = is_gpt_valid(bdev, GPT_PRIMARY_PARTITION_TABLE_LBA, + good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA, &pgpt, &pptes); if (good_pgpt) - good_agpt = is_gpt_valid(bdev, + good_agpt = is_gpt_valid(state, le64_to_cpu(pgpt->alternate_lba), &agpt, &aptes); if (!good_agpt && force_gpt) - good_agpt = is_gpt_valid(bdev, lastlba, - &agpt, &aptes); + good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes); /* The obviously unsuccessful case */ if (!good_pgpt && !good_agpt) @@ -583,9 +580,8 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes) } /** - * efi_partition(struct parsed_partitions *state, struct block_device *bdev) + * efi_partition(struct parsed_partitions *state) * @state - * @bdev * * Description: called from check.c, if the disk contains GPT * partitions, sets up partition entries in the kernel. @@ -602,15 +598,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes) * 1 if successful * */ -int -efi_partition(struct parsed_partitions *state, struct block_device *bdev) +int efi_partition(struct parsed_partitions *state) { gpt_header *gpt = NULL; gpt_entry *ptes = NULL; u32 i; - unsigned ssz = bdev_logical_block_size(bdev) / 512; + unsigned ssz = bdev_logical_block_size(state->bdev) / 512; - if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) { + if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { kfree(gpt); kfree(ptes); return 0; @@ -623,7 +618,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev) u64 size = le64_to_cpu(ptes[i].ending_lba) - le64_to_cpu(ptes[i].starting_lba) + 1ULL; - if (!is_pte_valid(&ptes[i], last_lba(bdev))) + if (!is_pte_valid(&ptes[i], last_lba(state->bdev))) continue; put_partition(state, i+1, start * ssz, size * ssz); diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h index 6998b589abf9..b69ab729558f 100644 --- a/fs/partitions/efi.h +++ b/fs/partitions/efi.h @@ -110,7 +110,7 @@ typedef struct _legacy_mbr { } __attribute__ ((packed)) legacy_mbr; /* Functions */ -extern int efi_partition(struct parsed_partitions *state, struct block_device *bdev); +extern int efi_partition(struct parsed_partitions *state); #endif diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c index fc71aab08460..3e73de5967ff 100644 --- a/fs/partitions/ibm.c +++ b/fs/partitions/ibm.c @@ -58,9 +58,9 @@ cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) { /* */ -int -ibm_partition(struct parsed_partitions *state, struct block_device *bdev) +int ibm_partition(struct parsed_partitions *state) { + struct block_device *bdev = state->bdev; int blocksize, res; loff_t i_size, offset, size, fmt_size; dasd_information2_t *info; @@ -100,7 +100,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) /* * Get volume label, extract name and type. 
*/ - data = read_dev_sector(bdev, info->label_block*(blocksize/512), §); + data = read_part_sector(state, info->label_block*(blocksize/512), + §); if (data == NULL) goto out_readerr; @@ -193,8 +194,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) */ blk = cchhb2blk(&label->vol.vtoc, geo) + 1; counter = 0; - data = read_dev_sector(bdev, blk * (blocksize/512), - §); + data = read_part_sector(state, blk * (blocksize/512), + §); while (data != NULL) { struct vtoc_format1_label f1; @@ -208,9 +209,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) || f1.DS1FMTID == _ascebc['7'] || f1.DS1FMTID == _ascebc['9']) { blk++; - data = read_dev_sector(bdev, blk * - (blocksize/512), - §); + data = read_part_sector(state, + blk * (blocksize/512), §); continue; } @@ -230,9 +230,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) size * (blocksize >> 9)); counter++; blk++; - data = read_dev_sector(bdev, - blk * (blocksize/512), - §); + data = read_part_sector(state, + blk * (blocksize/512), §); } if (!data) diff --git a/fs/partitions/ibm.h b/fs/partitions/ibm.h index 31f85a6ac459..08fb0804a812 100644 --- a/fs/partitions/ibm.h +++ b/fs/partitions/ibm.h @@ -1 +1 @@ -int ibm_partition(struct parsed_partitions *, struct block_device *); +int ibm_partition(struct parsed_partitions *); diff --git a/fs/partitions/karma.c b/fs/partitions/karma.c index 176d89bcf123..1cc928bb762f 100644 --- a/fs/partitions/karma.c +++ b/fs/partitions/karma.c @@ -9,7 +9,7 @@ #include "check.h" #include "karma.h" -int karma_partition(struct parsed_partitions *state, struct block_device *bdev) +int karma_partition(struct parsed_partitions *state) { int i; int slot = 1; @@ -29,7 +29,7 @@ int karma_partition(struct parsed_partitions *state, struct block_device *bdev) } __attribute__((packed)) *label; struct d_partition *p; - data = read_dev_sector(bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) return -1; diff --git a/fs/partitions/karma.h b/fs/partitions/karma.h index ecf7d3f2a3d8..c764b2e9df21 100644 --- a/fs/partitions/karma.h +++ b/fs/partitions/karma.h @@ -4,5 +4,5 @@ #define KARMA_LABEL_MAGIC 0xAB56 -int karma_partition(struct parsed_partitions *state, struct block_device *bdev); +int karma_partition(struct parsed_partitions *state); diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c index 8652fb99e962..3ceca05b668c 100644 --- a/fs/partitions/ldm.c +++ b/fs/partitions/ldm.c @@ -309,7 +309,7 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1, /** * ldm_validate_privheads - Compare the primary privhead with its backups - * @bdev: Device holding the LDM Database + * @state: Partition check state including device holding the LDM Database * @ph1: Memory struct to fill with ph contents * * Read and compare all three privheads from disk. 
@@ -321,8 +321,8 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1, * Return: 'true' Success * 'false' Error */ -static bool ldm_validate_privheads (struct block_device *bdev, - struct privhead *ph1) +static bool ldm_validate_privheads(struct parsed_partitions *state, + struct privhead *ph1) { static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 }; struct privhead *ph[3] = { ph1 }; @@ -332,7 +332,7 @@ static bool ldm_validate_privheads (struct block_device *bdev, long num_sects; int i; - BUG_ON (!bdev || !ph1); + BUG_ON (!state || !ph1); ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL); ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL); @@ -346,8 +346,8 @@ static bool ldm_validate_privheads (struct block_device *bdev, /* Read and parse privheads */ for (i = 0; i < 3; i++) { - data = read_dev_sector (bdev, - ph[0]->config_start + off[i], §); + data = read_part_sector(state, ph[0]->config_start + off[i], + §); if (!data) { ldm_crit ("Disk read failed."); goto out; @@ -363,7 +363,7 @@ static bool ldm_validate_privheads (struct block_device *bdev, } } - num_sects = bdev->bd_inode->i_size >> 9; + num_sects = state->bdev->bd_inode->i_size >> 9; if ((ph[0]->config_start > num_sects) || ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { @@ -397,20 +397,20 @@ out: /** * ldm_validate_tocblocks - Validate the table of contents and its backups - * @bdev: Device holding the LDM Database - * @base: Offset, into @bdev, of the database + * @state: Partition check state including device holding the LDM Database + * @base: Offset, into @state->bdev, of the database * @ldb: Cache of the database structures * * Find and compare the four tables of contents of the LDM Database stored on - * @bdev and return the parsed information into @toc1. + * @state->bdev and return the parsed information into @toc1. * * The offsets and sizes of the configs are range-checked against a privhead. * * Return: 'true' @toc1 contains validated TOCBLOCK info * 'false' @toc1 contents are undefined */ -static bool ldm_validate_tocblocks(struct block_device *bdev, - unsigned long base, struct ldmdb *ldb) +static bool ldm_validate_tocblocks(struct parsed_partitions *state, + unsigned long base, struct ldmdb *ldb) { static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4}; struct tocblock *tb[4]; @@ -420,7 +420,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev, int i, nr_tbs; bool result = false; - BUG_ON(!bdev || !ldb); + BUG_ON(!state || !ldb); ph = &ldb->ph; tb[0] = &ldb->toc; tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL); @@ -437,7 +437,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev, * skip any that fail as long as we get at least one valid TOCBLOCK. 
*/ for (nr_tbs = i = 0; i < 4; i++) { - data = read_dev_sector(bdev, base + off[i], §); + data = read_part_sector(state, base + off[i], §); if (!data) { ldm_error("Disk read failed for TOCBLOCK %d.", i); continue; @@ -473,7 +473,7 @@ err: /** * ldm_validate_vmdb - Read the VMDB and validate it - * @bdev: Device holding the LDM Database + * @state: Partition check state including device holding the LDM Database * @base: Offset, into @bdev, of the database * @ldb: Cache of the database structures * @@ -483,8 +483,8 @@ err: * Return: 'true' @ldb contains validated VBDB info * 'false' @ldb contents are undefined */ -static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base, - struct ldmdb *ldb) +static bool ldm_validate_vmdb(struct parsed_partitions *state, + unsigned long base, struct ldmdb *ldb) { Sector sect; u8 *data; @@ -492,12 +492,12 @@ static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base, struct vmdb *vm; struct tocblock *toc; - BUG_ON (!bdev || !ldb); + BUG_ON (!state || !ldb); vm = &ldb->vm; toc = &ldb->toc; - data = read_dev_sector (bdev, base + OFF_VMDB, §); + data = read_part_sector(state, base + OFF_VMDB, §); if (!data) { ldm_crit ("Disk read failed."); return false; @@ -534,21 +534,21 @@ out: /** * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk - * @bdev: Device holding the LDM Database + * @state: Partition check state including device holding the LDM Database * * This function provides a weak test to decide whether the device is a dynamic * disk or not. It looks for an MS-DOS-style partition table containing at * least one partition of type 0x42 (formerly SFS, now used by Windows for * dynamic disks). * - * N.B. The only possible error can come from the read_dev_sector and that is + * N.B. The only possible error can come from the read_part_sector and that is * only likely to happen if the underlying device is strange. If that IS * the case we should return zero to let someone else try. 
* - * Return: 'true' @bdev is a dynamic disk - * 'false' @bdev is not a dynamic disk, or an error occurred + * Return: 'true' @state->bdev is a dynamic disk + * 'false' @state->bdev is not a dynamic disk, or an error occurred */ -static bool ldm_validate_partition_table (struct block_device *bdev) +static bool ldm_validate_partition_table(struct parsed_partitions *state) { Sector sect; u8 *data; @@ -556,9 +556,9 @@ static bool ldm_validate_partition_table (struct block_device *bdev) int i; bool result = false; - BUG_ON (!bdev); + BUG_ON(!state); - data = read_dev_sector (bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) { ldm_crit ("Disk read failed."); return false; @@ -1391,8 +1391,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb) /** * ldm_get_vblks - Read the on-disk database of VBLKs into memory - * @bdev: Device holding the LDM Database - * @base: Offset, into @bdev, of the database + * @state: Partition check state including device holding the LDM Database + * @base: Offset, into @state->bdev, of the database * @ldb: Cache of the database structures * * To use the information from the VBLKs, they need to be read from the disk, @@ -1401,8 +1401,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb) * Return: 'true' All the VBLKs were read successfully * 'false' An error occurred */ -static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, - struct ldmdb *ldb) +static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base, + struct ldmdb *ldb) { int size, perbuf, skip, finish, s, v, recs; u8 *data = NULL; @@ -1410,7 +1410,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, bool result = false; LIST_HEAD (frags); - BUG_ON (!bdev || !ldb); + BUG_ON(!state || !ldb); size = ldb->vm.vblk_size; perbuf = 512 / size; @@ -1418,7 +1418,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, finish = (size * ldb->vm.last_vblk_seq) >> 9; for (s = skip; s < finish; s++) { /* For each sector */ - data = read_dev_sector (bdev, base + OFF_VMDB + s, §); + data = read_part_sector(state, base + OFF_VMDB + s, §); if (!data) { ldm_crit ("Disk read failed."); goto out; @@ -1474,8 +1474,7 @@ static void ldm_free_vblks (struct list_head *lh) /** * ldm_partition - Find out whether a device is a dynamic disk and handle it - * @pp: List of the partitions parsed so far - * @bdev: Device holding the LDM Database + * @state: Partition check state including device holding the LDM Database * * This determines whether the device @bdev is a dynamic disk and if so creates * the partitions necessary in the gendisk structure pointed to by @hd. @@ -1485,21 +1484,21 @@ static void ldm_free_vblks (struct list_head *lh) * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, * and so on: the actual data containing partitions. 
* - * Return: 1 Success, @bdev is a dynamic disk and we handled it - * 0 Success, @bdev is not a dynamic disk + * Return: 1 Success, @state->bdev is a dynamic disk and we handled it + * 0 Success, @state->bdev is not a dynamic disk * -1 An error occurred before enough information had been read - * Or @bdev is a dynamic disk, but it may be corrupted + * Or @state->bdev is a dynamic disk, but it may be corrupted */ -int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev) +int ldm_partition(struct parsed_partitions *state) { struct ldmdb *ldb; unsigned long base; int result = -1; - BUG_ON (!pp || !bdev); + BUG_ON(!state); /* Look for signs of a Dynamic Disk */ - if (!ldm_validate_partition_table (bdev)) + if (!ldm_validate_partition_table(state)) return 0; ldb = kmalloc (sizeof (*ldb), GFP_KERNEL); @@ -1509,15 +1508,15 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev) } /* Parse and check privheads. */ - if (!ldm_validate_privheads (bdev, &ldb->ph)) + if (!ldm_validate_privheads(state, &ldb->ph)) goto out; /* Already logged */ /* All further references are relative to base (database start). */ base = ldb->ph.config_start; /* Parse and check tocs and vmdb. */ - if (!ldm_validate_tocblocks (bdev, base, ldb) || - !ldm_validate_vmdb (bdev, base, ldb)) + if (!ldm_validate_tocblocks(state, base, ldb) || + !ldm_validate_vmdb(state, base, ldb)) goto out; /* Already logged */ /* Initialize vblk lists in ldmdb struct */ @@ -1527,13 +1526,13 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev) INIT_LIST_HEAD (&ldb->v_comp); INIT_LIST_HEAD (&ldb->v_part); - if (!ldm_get_vblks (bdev, base, ldb)) { + if (!ldm_get_vblks(state, base, ldb)) { ldm_crit ("Failed to read the VBLKs from the database."); goto cleanup; } /* Finally, create the data partition devices. */ - if (ldm_create_data_partitions (pp, ldb)) { + if (ldm_create_data_partitions(state, ldb)) { ldm_debug ("Parsed LDM database successfully."); result = 1; } diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h index 30e08e809c1d..d1fb50b28d86 100644 --- a/fs/partitions/ldm.h +++ b/fs/partitions/ldm.h @@ -209,7 +209,7 @@ struct ldmdb { /* Cache of the database */ struct list_head v_part; }; -int ldm_partition (struct parsed_partitions *state, struct block_device *bdev); +int ldm_partition(struct parsed_partitions *state); #endif /* _FS_PT_LDM_H_ */ diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c index d4a0fad3563b..13e27b0082f2 100644 --- a/fs/partitions/mac.c +++ b/fs/partitions/mac.c @@ -27,7 +27,7 @@ static inline void mac_fix_string(char *stg, int len) stg[i] = 0; } -int mac_partition(struct parsed_partitions *state, struct block_device *bdev) +int mac_partition(struct parsed_partitions *state) { int slot = 1; Sector sect; @@ -42,7 +42,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev) struct mac_driver_desc *md; /* Get 0th block and look at the first partition map entry. 
*/ - md = (struct mac_driver_desc *) read_dev_sector(bdev, 0, §); + md = read_part_sector(state, 0, §); if (!md) return -1; if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) { @@ -51,7 +51,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev) } secsize = be16_to_cpu(md->block_size); put_dev_sector(sect); - data = read_dev_sector(bdev, secsize/512, §); + data = read_part_sector(state, secsize/512, §); if (!data) return -1; part = (struct mac_partition *) (data + secsize%512); @@ -64,7 +64,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev) for (blk = 1; blk <= blocks_in_map; ++blk) { int pos = blk * secsize; put_dev_sector(sect); - data = read_dev_sector(bdev, pos/512, §); + data = read_part_sector(state, pos/512, §); if (!data) return -1; part = (struct mac_partition *) (data + pos%512); @@ -123,7 +123,8 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev) } #ifdef CONFIG_PPC_PMAC if (found_root_goodness) - note_bootable_part(bdev->bd_dev, found_root, found_root_goodness); + note_bootable_part(state->bdev->bd_dev, found_root, + found_root_goodness); #endif put_dev_sector(sect); diff --git a/fs/partitions/mac.h b/fs/partitions/mac.h index bbf26e1386fa..3c7d98436380 100644 --- a/fs/partitions/mac.h +++ b/fs/partitions/mac.h @@ -41,4 +41,4 @@ struct mac_driver_desc { /* ... more stuff */ }; -int mac_partition(struct parsed_partitions *state, struct block_device *bdev); +int mac_partition(struct parsed_partitions *state); diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c index 90be97f1f5a8..645a68d8c055 100644 --- a/fs/partitions/msdos.c +++ b/fs/partitions/msdos.c @@ -64,7 +64,7 @@ msdos_magic_present(unsigned char *p) #define AIX_LABEL_MAGIC2 0xC2 #define AIX_LABEL_MAGIC3 0xD4 #define AIX_LABEL_MAGIC4 0xC1 -static int aix_magic_present(unsigned char *p, struct block_device *bdev) +static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) { struct partition *pt = (struct partition *) (p + 0x1be); Sector sect; @@ -85,7 +85,7 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev) is_extended_partition(pt)) return 0; } - d = read_dev_sector(bdev, 7, §); + d = read_part_sector(state, 7, §); if (d) { if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') ret = 1; @@ -105,15 +105,14 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev) * only for the actual data partitions. */ -static void -parse_extended(struct parsed_partitions *state, struct block_device *bdev, - sector_t first_sector, sector_t first_size) +static void parse_extended(struct parsed_partitions *state, + sector_t first_sector, sector_t first_size) { struct partition *p; Sector sect; unsigned char *data; sector_t this_sector, this_size; - sector_t sector_size = bdev_logical_block_size(bdev) / 512; + sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; int loopct = 0; /* number of links followed without finding a data partition */ int i; @@ -126,7 +125,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev, return; if (state->next == state->limit) return; - data = read_dev_sector(bdev, this_sector, §); + data = read_part_sector(state, this_sector, §); if (!data) return; @@ -198,9 +197,8 @@ done: /* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also indicates linux swap. Be careful before believing this is Solaris. 
*/ -static void -parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin) +static void parse_solaris_x86(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) { #ifdef CONFIG_SOLARIS_X86_PARTITION Sector sect; @@ -208,7 +206,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, int i; short max_nparts; - v = (struct solaris_x86_vtoc *)read_dev_sector(bdev, offset+1, §); + v = read_part_sector(state, offset + 1, §); if (!v) return; if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) { @@ -245,16 +243,15 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, * Create devices for BSD partitions listed in a disklabel, under a * dos-like partition. See parse_extended() for more information. */ -static void -parse_bsd(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin, char *flavour, - int max_partitions) +static void parse_bsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin, char *flavour, + int max_partitions) { Sector sect; struct bsd_disklabel *l; struct bsd_partition *p; - l = (struct bsd_disklabel *)read_dev_sector(bdev, offset+1, §); + l = read_part_sector(state, offset + 1, §); if (!l) return; if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) { @@ -291,33 +288,28 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev, } #endif -static void -parse_freebsd(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin) +static void parse_freebsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) { #ifdef CONFIG_BSD_DISKLABEL - parse_bsd(state, bdev, offset, size, origin, - "bsd", BSD_MAXPARTITIONS); + parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS); #endif } -static void -parse_netbsd(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin) +static void parse_netbsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) { #ifdef CONFIG_BSD_DISKLABEL - parse_bsd(state, bdev, offset, size, origin, - "netbsd", BSD_MAXPARTITIONS); + parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS); #endif } -static void -parse_openbsd(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin) +static void parse_openbsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) { #ifdef CONFIG_BSD_DISKLABEL - parse_bsd(state, bdev, offset, size, origin, - "openbsd", OPENBSD_MAXPARTITIONS); + parse_bsd(state, offset, size, origin, "openbsd", + OPENBSD_MAXPARTITIONS); #endif } @@ -325,16 +317,15 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev, * Create devices for Unixware partitions listed in a disklabel, under a * dos-like partition. See parse_extended() for more information. 
*/ -static void -parse_unixware(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin) +static void parse_unixware(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) { #ifdef CONFIG_UNIXWARE_DISKLABEL Sector sect; struct unixware_disklabel *l; struct unixware_slice *p; - l = (struct unixware_disklabel *)read_dev_sector(bdev, offset+29, §); + l = read_part_sector(state, offset + 29, §); if (!l) return; if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC || @@ -365,9 +356,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev, * Anand Krishnamurthy * Rajeev V. Pillai */ -static void -parse_minix(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin) +static void parse_minix(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) { #ifdef CONFIG_MINIX_SUBPARTITION Sector sect; @@ -375,7 +365,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev, struct partition *p; int i; - data = read_dev_sector(bdev, offset, §); + data = read_part_sector(state, offset, §); if (!data) return; @@ -404,8 +394,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev, static struct { unsigned char id; - void (*parse)(struct parsed_partitions *, struct block_device *, - sector_t, sector_t, int); + void (*parse)(struct parsed_partitions *, sector_t, sector_t, int); } subtypes[] = { {FREEBSD_PARTITION, parse_freebsd}, {NETBSD_PARTITION, parse_netbsd}, @@ -417,16 +406,16 @@ static struct { {0, NULL}, }; -int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) +int msdos_partition(struct parsed_partitions *state) { - sector_t sector_size = bdev_logical_block_size(bdev) / 512; + sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; Sector sect; unsigned char *data; struct partition *p; struct fat_boot_sector *fb; int slot; - data = read_dev_sector(bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) return -1; if (!msdos_magic_present(data + 510)) { @@ -434,7 +423,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) return 0; } - if (aix_magic_present(data, bdev)) { + if (aix_magic_present(state, data)) { put_dev_sector(sect); printk( " [AIX]"); return 0; @@ -503,7 +492,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) put_partition(state, slot, start, n); printk(" <"); - parse_extended(state, bdev, start, size); + parse_extended(state, start, size); printk(" >"); continue; } @@ -532,8 +521,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) if (!subtypes[n].parse) continue; - subtypes[n].parse(state, bdev, start_sect(p)*sector_size, - nr_sects(p)*sector_size, slot); + subtypes[n].parse(state, start_sect(p) * sector_size, + nr_sects(p) * sector_size, slot); } put_dev_sector(sect); return 1; diff --git a/fs/partitions/msdos.h b/fs/partitions/msdos.h index 01e5e0b6902d..38c781c490b3 100644 --- a/fs/partitions/msdos.h +++ b/fs/partitions/msdos.h @@ -4,5 +4,5 @@ #define MSDOS_LABEL_MAGIC 0xAA55 -int msdos_partition(struct parsed_partitions *state, struct block_device *bdev); +int msdos_partition(struct parsed_partitions *state); diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c index c05c17bc5df3..fc22b85d436a 100644 --- a/fs/partitions/osf.c +++ b/fs/partitions/osf.c @@ -10,7 +10,7 @@ #include "check.h" #include "osf.h" -int osf_partition(struct 
parsed_partitions *state, struct block_device *bdev) +int osf_partition(struct parsed_partitions *state) { int i; int slot = 1; @@ -49,7 +49,7 @@ int osf_partition(struct parsed_partitions *state, struct block_device *bdev) } * label; struct d_partition * partition; - data = read_dev_sector(bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) return -1; diff --git a/fs/partitions/osf.h b/fs/partitions/osf.h index 427b8eab314b..20ed2315ec16 100644 --- a/fs/partitions/osf.h +++ b/fs/partitions/osf.h @@ -4,4 +4,4 @@ #define DISKLABELMAGIC (0x82564557UL) -int osf_partition(struct parsed_partitions *state, struct block_device *bdev); +int osf_partition(struct parsed_partitions *state); diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c index ed5ac83fe83a..43b1df9aa16c 100644 --- a/fs/partitions/sgi.c +++ b/fs/partitions/sgi.c @@ -27,7 +27,7 @@ struct sgi_disklabel { __be32 _unused1; /* Padding */ }; -int sgi_partition(struct parsed_partitions *state, struct block_device *bdev) +int sgi_partition(struct parsed_partitions *state) { int i, csum; __be32 magic; @@ -39,7 +39,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev) struct sgi_partition *p; char b[BDEVNAME_SIZE]; - label = (struct sgi_disklabel *) read_dev_sector(bdev, 0, §); + label = read_part_sector(state, 0, §); if (!label) return -1; p = &label->partitions[0]; @@ -57,7 +57,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev) } if(csum) { printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", - bdevname(bdev, b)); + bdevname(state->bdev, b)); put_dev_sector(sect); return 0; } diff --git a/fs/partitions/sgi.h b/fs/partitions/sgi.h index 5d5595c09928..b9553ebdd5a9 100644 --- a/fs/partitions/sgi.h +++ b/fs/partitions/sgi.h @@ -2,7 +2,7 @@ * fs/partitions/sgi.h */ -extern int sgi_partition(struct parsed_partitions *state, struct block_device *bdev); +extern int sgi_partition(struct parsed_partitions *state); #define SGI_LABEL_MAGIC 0x0be5a941 diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c index c95e6a62c01d..a32660e25f7f 100644 --- a/fs/partitions/sun.c +++ b/fs/partitions/sun.c @@ -10,7 +10,7 @@ #include "check.h" #include "sun.h" -int sun_partition(struct parsed_partitions *state, struct block_device *bdev) +int sun_partition(struct parsed_partitions *state) { int i; __be16 csum; @@ -61,7 +61,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev) int use_vtoc; int nparts; - label = (struct sun_disklabel *)read_dev_sector(bdev, 0, §); + label = read_part_sector(state, 0, §); if (!label) return -1; @@ -78,7 +78,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev) csum ^= *ush--; if (csum) { printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", - bdevname(bdev, b)); + bdevname(state->bdev, b)); put_dev_sector(sect); return 0; } diff --git a/fs/partitions/sun.h b/fs/partitions/sun.h index 7f864d1f86d4..2424baa8319f 100644 --- a/fs/partitions/sun.h +++ b/fs/partitions/sun.h @@ -5,4 +5,4 @@ #define SUN_LABEL_MAGIC 0xDABE #define SUN_VTOC_SANITY 0x600DDEEE -int sun_partition(struct parsed_partitions *state, struct block_device *bdev); +int sun_partition(struct parsed_partitions *state); diff --git a/fs/partitions/sysv68.c b/fs/partitions/sysv68.c index 4eba27b78643..9030c864428e 100644 --- a/fs/partitions/sysv68.c +++ b/fs/partitions/sysv68.c @@ -46,7 +46,7 @@ struct slice { }; -int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev) +int 
sysv68_partition(struct parsed_partitions *state) { int i, slices; int slot = 1; @@ -55,7 +55,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev) struct dkblk0 *b; struct slice *slice; - data = read_dev_sector(bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) return -1; @@ -68,7 +68,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev) i = be32_to_cpu(b->dk_ios.ios_slcblk); put_dev_sector(sect); - data = read_dev_sector(bdev, i, §); + data = read_part_sector(state, i, §); if (!data) return -1; diff --git a/fs/partitions/sysv68.h b/fs/partitions/sysv68.h index fa733f68431b..bf2f5ffa97ac 100644 --- a/fs/partitions/sysv68.h +++ b/fs/partitions/sysv68.h @@ -1 +1 @@ -extern int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev); +extern int sysv68_partition(struct parsed_partitions *state); diff --git a/fs/partitions/ultrix.c b/fs/partitions/ultrix.c index ec852c11dce4..db9eef260364 100644 --- a/fs/partitions/ultrix.c +++ b/fs/partitions/ultrix.c @@ -9,7 +9,7 @@ #include "check.h" #include "ultrix.h" -int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev) +int ultrix_partition(struct parsed_partitions *state) { int i; Sector sect; @@ -26,7 +26,7 @@ int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev) #define PT_MAGIC 0x032957 /* Partition magic number */ #define PT_VALID 1 /* Indicates if struct is valid */ - data = read_dev_sector(bdev, (16384 - sizeof(*label))/512, §); + data = read_part_sector(state, (16384 - sizeof(*label))/512, §); if (!data) return -1; diff --git a/fs/partitions/ultrix.h b/fs/partitions/ultrix.h index a74bf8e2d370..a3cc00b2bded 100644 --- a/fs/partitions/ultrix.h +++ b/fs/partitions/ultrix.h @@ -2,4 +2,4 @@ * fs/partitions/ultrix.h */ -int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev); +int ultrix_partition(struct parsed_partitions *state); -- cgit v1.2.3-59-g8ed1b From b403a98e260f3a8c7c33f58a07c7ae549852170f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 15 May 2010 20:09:31 +0200 Subject: block: improve automatic native capacity unlocking Currently, native capacity unlocking is initiated only when a recognized partition extends beyond the end of the disk. However, there are several other unhandled cases where truncated capacity can lead to misdetection of partitions. * Partition table is fully beyond EOD. * Partition table is partially beyond EOD (daisy chained ones). * Recognized partition starts beyond EOD. This patch updates generic partition check code such that all the above three cases are handled too. For the first two, @state tracks whether low level partition check code tried to read beyond EOD during partition scan and triggers native capacity unlocking accordingly. The third is now handled similarly to the original unlocking case. Signed-off-by: Tejun Heo Cc: Ben Hutchings Acked-by: David S. 
Miller Signed-off-by: Jens Axboe --- fs/partitions/check.c | 69 ++++++++++++++++++++++++++++++++++++++++----------- fs/partitions/check.h | 5 ++++ 2 files changed, 60 insertions(+), 14 deletions(-) diff --git a/fs/partitions/check.c b/fs/partitions/check.c index a19995c6f6af..5dcd4b0c5533 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -161,7 +161,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev) struct parsed_partitions *state; int i, res, err; - state = kmalloc(sizeof(struct parsed_partitions), GFP_KERNEL); + state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL); if (!state) return NULL; @@ -187,6 +187,8 @@ check_partition(struct gendisk *hd, struct block_device *bdev) } if (res > 0) return state; + if (state->access_beyond_eod) + err = -ENOSPC; if (err) /* The partition is unrecognized. So report I/O errors if there were any */ res = err; @@ -539,13 +541,34 @@ exit: disk_part_iter_exit(&piter); } +static bool disk_unlock_native_capacity(struct gendisk *disk) +{ + const struct block_device_operations *bdops = disk->fops; + + if (bdops->unlock_native_capacity && + !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) { + printk(KERN_CONT "enabling native capacity\n"); + bdops->unlock_native_capacity(disk); + disk->flags |= GENHD_FL_NATIVE_CAPACITY; + return true; + } else { + printk(KERN_CONT "truncated\n"); + return false; + } +} + int rescan_partitions(struct gendisk *disk, struct block_device *bdev) { + struct parsed_partitions *state = NULL; struct disk_part_iter piter; struct hd_struct *part; - struct parsed_partitions *state; int p, highest, res; rescan: + if (state && !IS_ERR(state)) { + kfree(state); + state = NULL; + } + if (bdev->bd_part_count) return -EBUSY; res = invalidate_partition(disk, 0); @@ -563,8 +586,32 @@ rescan: bdev->bd_invalidated = 0; if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) return 0; - if (IS_ERR(state)) /* I/O error reading the partition table */ + if (IS_ERR(state)) { + /* + * I/O error reading the partition table. If any + * partition code tried to read beyond EOD, retry + * after unlocking native capacity. + */ + if (PTR_ERR(state) == -ENOSPC) { + printk(KERN_WARNING "%s: partition table beyond EOD, ", + disk->disk_name); + if (disk_unlock_native_capacity(disk)) + goto rescan; + } return -EIO; + } + /* + * If any partition code tried to read beyond EOD, try + * unlocking native capacity even if partition table is + * successfully read as we could be missing some partitions.
+ */ + if (state->access_beyond_eod) { + printk(KERN_WARNING + "%s: partition table partially beyond EOD, ", + disk->disk_name); + if (disk_unlock_native_capacity(disk)) + goto rescan; + } /* tell userspace that the media / partition table may have changed */ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); @@ -590,25 +637,20 @@ rescan: from = state->parts[p].from; if (from >= get_capacity(disk)) { printk(KERN_WARNING - "%s: p%d ignored, start %llu is behind the end of the disk\n", + "%s: p%d start %llu is beyond EOD, ", disk->disk_name, p, (unsigned long long) from); + if (disk_unlock_native_capacity(disk)) + goto rescan; continue; } if (from + size > get_capacity(disk)) { - const struct block_device_operations *bdops = disk->fops; - printk(KERN_WARNING - "%s: p%d size %llu exceeds device capacity, ", + "%s: p%d size %llu extends beyond EOD, ", disk->disk_name, p, (unsigned long long) size); - if (bdops->unlock_native_capacity && - (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) { - printk(KERN_CONT "enabling native capacity\n"); - bdops->unlock_native_capacity(disk); - disk->flags |= GENHD_FL_NATIVE_CAPACITY; + if (disk_unlock_native_capacity(disk)) { /* free state and restart */ - kfree(state); goto rescan; } else { /* @@ -617,7 +659,6 @@ rescan: * we limit them to the end of the disk to avoid * creating invalid block devices */ - printk(KERN_CONT "limited to end of disk\n"); size = get_capacity(disk) - from; } } diff --git a/fs/partitions/check.h b/fs/partitions/check.h index 4b31a97775be..52f8bd399396 100644 --- a/fs/partitions/check.h +++ b/fs/partitions/check.h @@ -15,11 +15,16 @@ struct parsed_partitions { } parts[DISK_MAX_PARTS]; int next; int limit; + bool access_beyond_eod; }; static inline void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p) { + if (n >= get_capacity(state->bdev->bd_disk)) { + state->access_beyond_eod = true; + return NULL; + } return read_dev_sector(state->bdev, n, p); } -- cgit v1.2.3-59-g8ed1b From c2c4986eddaa7dc3d036cb2bfa5c8c5f1f2492a0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 20 May 2010 09:18:47 +0200 Subject: writeback: fix problem with !CONFIG_BLOCK compilation When CONFIG_BLOCK isn't enabled: mm/page-writeback.c: In function 'laptop_mode_timer_fn': mm/page-writeback.c:708: error: dereferencing pointer to incomplete type mm/page-writeback.c:709: error: dereferencing pointer to incomplete type Fix this by essentially eliminating the laptop sync handlers when CONFIG_BLOCK isn't set, as most are only used from the block layer code. The exception is laptop_sync_completion(), which is used from sys_sync(); make that an empty declaration in that case.
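The fix relies on the standard kernel header idiom for compiling a feature away: declare the real function when the config symbol is set, and supply an empty static inline otherwise, so call sites build unchanged either way. A minimal sketch of the idiom, using CONFIG_FOO and foo_completion() as hypothetical stand-ins for the real names:

/* include/linux/foo.h -- hypothetical illustration of the stub idiom */
#ifdef CONFIG_FOO
void foo_completion(void);			/* real definition lives in foo.c */
#else
static inline void foo_completion(void) { }	/* compiles away to nothing */
#endif

Callers simply invoke foo_completion(); with CONFIG_FOO=n the inline stub is optimized out entirely and no #ifdef is needed at the call site.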
Reported-by: Randy Dunlap Signed-off-by: Jens Axboe --- fs/super.c | 1 + include/linux/writeback.h | 4 ++++ mm/page-writeback.c | 2 ++ 3 files changed, 7 insertions(+) diff --git a/fs/super.c b/fs/super.c index dc72491a19f9..1527e6a0ee35 100644 --- a/fs/super.c +++ b/fs/super.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include "internal.h" diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 47e1c686cb02..cc97d6caf2b3 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -106,10 +106,14 @@ static inline void inode_sync_wait(struct inode *inode) /* * mm/page-writeback.c */ +#ifdef CONFIG_BLOCK void laptop_io_completion(struct backing_dev_info *info); void laptop_sync_completion(void); void laptop_mode_sync(struct work_struct *work); void laptop_mode_timer_fn(unsigned long data); +#else +static inline void laptop_sync_completion(void) { } +#endif void throttle_vm_writeout(gfp_t gfp_mask); /* These are exported to sysctl. */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0d7bbe859550..9886424e1864 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -694,6 +694,7 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write, return 0; } +#ifdef CONFIG_BLOCK void laptop_mode_timer_fn(unsigned long data) { struct request_queue *q = (struct request_queue *)data; @@ -735,6 +736,7 @@ void laptop_sync_completion(void) rcu_read_unlock(); } +#endif /* * If ratelimit_pages is too high then we can get into dirty-data overload -- cgit v1.2.3-59-g8ed1b From df96e96f76571c30d903829a7b2ab2b421028790 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 21 May 2010 20:01:54 +0200 Subject: writeback: fix mixed up arguments to bdi_start_writeback() The laptop mode timer had the nr_pages and sb_locked arguments mixed up. 
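Mix-ups like this are easy to introduce because both trailing parameters are plain integers, so the compiler cannot object to a swap. A sketch of the failure mode, assuming the signature in this tree is bdi_start_writeback(bdi, sb, nr_pages, sb_locked):

	bdi_start_writeback(&q->backing_dev_info, NULL, 0, nr_pages);	/* buggy: requests 0 pages  */
	bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages, 0);	/* fixed: writes nr_pages   */

The buggy form passed nr_pages where the sb_locked flag belongs and asked for zero pages of writeback.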
Signed-off-by: Jens Axboe --- mm/page-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 9886424e1864..b289310e2c89 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -707,7 +707,7 @@ void laptop_mode_timer_fn(unsigned long data) */ if (bdi_has_dirty_io(&q->backing_dev_info)) - bdi_start_writeback(&q->backing_dev_info, NULL, 0, nr_pages); + bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages, 0); } /* -- cgit v1.2.3-59-g8ed1b From 31a31dccdd308d5ec7f9a1197a7875a246a348dc Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 19 May 2010 16:28:28 +0200 Subject: drbd: Do not Oops when C_STANDALONE when uuid gets generated Got introduced with commit 0c3f34516e8c5a1a0ba3585a7777d32bbbdf4ecb Author: Philipp Reisner Date: Mon May 17 16:10:43 2010 +0200 drbd: Create new current UUID as late as possible Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index c144509011b8..7e057b074bbd 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1354,7 +1354,10 @@ static int w_new_current_uuid(struct drbd_conf *mdev, struct drbd_work *w, int c { if (get_ldev(mdev)) { drbd_uuid_new_current(mdev); - drbd_send_uuids(mdev); + if (get_net_conf(mdev)) { + drbd_send_uuids(mdev); + put_net_conf(mdev); + } drbd_md_sync(mdev); put_ldev(mdev); } -- cgit v1.2.3-59-g8ed1b From 4604d6366859f781ad16c07a6c65b16fc96e26c5 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 19 May 2010 17:37:02 +0200 Subject: drbd: Ensure to not trigger late-new-UUID creation multiple times Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 7e057b074bbd..a949fc45f7ef 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1217,7 +1217,8 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, mdev->p_uuid = NULL; if (get_ldev(mdev)) { if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && - mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) + mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE && + !atomic_read(&mdev->new_c_uuid)) atomic_set(&mdev->new_c_uuid, 2); put_ldev(mdev); } @@ -1225,7 +1226,8 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { /* Diskless peer becomes primary or got connected do diskless, primary peer.
*/ - if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) + if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0 && + !atomic_read(&mdev->new_c_uuid)) atomic_set(&mdev->new_c_uuid, 2); /* D_DISKLESS Peer becomes secondary */ @@ -1353,12 +1355,14 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, static int w_new_current_uuid(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { if (get_ldev(mdev)) { - drbd_uuid_new_current(mdev); - if (get_net_conf(mdev)) { - drbd_send_uuids(mdev); - put_net_conf(mdev); + if (mdev->ldev->md.uuid[UI_BITMAP] == 0) { + drbd_uuid_new_current(mdev); + if (get_net_conf(mdev)) { + drbd_send_uuids(mdev); + put_net_conf(mdev); + } + drbd_md_sync(mdev); } - drbd_md_sync(mdev); put_ldev(mdev); } atomic_dec(&mdev->new_c_uuid); -- cgit v1.2.3-59-g8ed1b From fc8ce1941d668c70e57a07f13f5a63e73e5dbff3 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 20 May 2010 10:04:17 +0200 Subject: drbd: Fix: Do not detach, if a bio with a barrier fails Introduced a few days ago: commit 45bb912bd5ea4d2b3a270a93cbdf767a0e2df6f5 Author: Lars Ellenberg Date: Fri May 14 17:10:48 2010 +0200 Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_worker.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 91085c1ab52f..15c96207e626 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -127,7 +127,7 @@ static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(lo drbd_bump_write_ordering(mdev, WO_bdev_flush); spin_lock_irqsave(&mdev->req_lock, flags); list_del(&e->w.list); - e->flags |= EE_RESUBMITTED; + e->flags = (e->flags & ~EE_WAS_ERROR) | EE_RESUBMITTED; e->w.cb = w_e_reissue; /* put_ldev actually happens below, once we come here again. 
*/ __release(local); -- cgit v1.2.3-59-g8ed1b From 23ce422748def9652fdc019f740cd7900fa2fe3b Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 20 May 2010 13:35:31 +0200 Subject: drbd: Null pointer deref fix to the large "multi bio rewrite" Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_receiver.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 461d9872d4d3..bc9ab7fb2cc7 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -98,6 +98,10 @@ static struct page *page_chain_del(struct page **head, int n) BUG_ON(!head); page = *head; + + if (!page) + return NULL; + while (page) { tmp = page_chain_next(page); if (--n == 0) -- cgit v1.2.3-59-g8ed1b From 4e23a59ed1c5f12e14f7899855f8379b3d42e578 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 20 May 2010 14:45:07 +0200 Subject: drbd: Do not free p_uuid early, this is done in the exit code of the receiver Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index a949fc45f7ef..be2d2da9cdba 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1213,8 +1213,6 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, && (ns.pdsk < D_INCONSISTENT || ns.pdsk == D_UNKNOWN || ns.pdsk == D_OUTDATED)) { - kfree(mdev->p_uuid); - mdev->p_uuid = NULL; if (get_ldev(mdev)) { if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE && -- cgit v1.2.3-59-g8ed1b From 3d42b3612891baecf709d93f28655a6882a65d41 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 20 May 2010 12:14:54 +0200 Subject: drbd: This is now equivalent to drbd release 8.3.8rc1 Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- include/linux/drbd.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 7944dd36ee3e..68530521ad00 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -53,10 +53,10 @@ extern const char *drbd_buildtag(void); -#define REL_VERSION "8.3.7" +#define REL_VERSION "8.3.8rc1" #define API_VERSION 88 #define PRO_VERSION_MIN 86 -#define PRO_VERSION_MAX 93 +#define PRO_VERSION_MAX 94 enum drbd_io_error_p { -- cgit v1.2.3-59-g8ed1b From 35f3d14dbbc58447c61e38a162ea10add6b31dc7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 20 May 2010 10:43:18 +0200 Subject: pipe: add support for shrinking and growing pipes This patch adds F_GETPIPE_SZ and F_SETPIPE_SZ fcntl() actions for growing and shrinking the size of a pipe and adjusts pipe.c and splice.c (and relay and network splice) usage to work with these larger (or smaller) pipes. 
Signed-off-by: Jens Axboe --- fs/fcntl.c | 5 ++ fs/pipe.c | 107 ++++++++++++++++++++++++++++---- fs/splice.c | 151 ++++++++++++++++++++++++++++++++-------------- include/linux/fcntl.h | 6 ++ include/linux/pipe_fs_i.h | 11 ++-- include/linux/splice.h | 7 +++ kernel/relay.c | 15 +++-- kernel/trace/trace.c | 60 ++++++++++-------- net/core/skbuff.c | 38 ++++++------ 9 files changed, 292 insertions(+), 108 deletions(-) diff --git a/fs/fcntl.c b/fs/fcntl.c index 452d02f9075e..bcba960328fa 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -412,6 +413,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_NOTIFY: err = fcntl_dirnotify(fd, filp, arg); break; + case F_SETPIPE_SZ: + case F_GETPIPE_SZ: + err = pipe_fcntl(filp, cmd, arg); + break; default: break; } diff --git a/fs/pipe.c b/fs/pipe.c index 37ba29ff3158..054b8a6a2c7a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -390,7 +391,7 @@ redo: if (!buf->len) { buf->ops = NULL; ops->release(pipe, buf); - curbuf = (curbuf + 1) & (PIPE_BUFFERS-1); + curbuf = (curbuf + 1) & (pipe->buffers - 1); pipe->curbuf = curbuf; pipe->nrbufs = --bufs; do_wakeup = 1; @@ -472,7 +473,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov, chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ if (pipe->nrbufs && chars != 0) { int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & - (PIPE_BUFFERS-1); + (pipe->buffers - 1); struct pipe_buffer *buf = pipe->bufs + lastbuf; const struct pipe_buf_operations *ops = buf->ops; int offset = buf->offset + buf->len; @@ -518,8 +519,8 @@ redo1: break; } bufs = pipe->nrbufs; - if (bufs < PIPE_BUFFERS) { - int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); + if (bufs < pipe->buffers) { + int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1); struct pipe_buffer *buf = pipe->bufs + newbuf; struct page *page = pipe->tmp_page; char *src; @@ -580,7 +581,7 @@ redo2: if (!total_len) break; } - if (bufs < PIPE_BUFFERS) + if (bufs < pipe->buffers) continue; if (filp->f_flags & O_NONBLOCK) { if (!ret) @@ -640,7 +641,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) nrbufs = pipe->nrbufs; while (--nrbufs >= 0) { count += pipe->bufs[buf].len; - buf = (buf+1) & (PIPE_BUFFERS-1); + buf = (buf+1) & (pipe->buffers - 1); } mutex_unlock(&inode->i_mutex); @@ -671,7 +672,7 @@ pipe_poll(struct file *filp, poll_table *wait) } if (filp->f_mode & FMODE_WRITE) { - mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0; + mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0; /* * Most Unices do not set POLLERR for FIFOs but on Linux they * behave exactly like pipes for poll(). 
@@ -877,25 +878,32 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode) pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); if (pipe) { - init_waitqueue_head(&pipe->wait); - pipe->r_counter = pipe->w_counter = 1; - pipe->inode = inode; + pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL); + if (pipe->bufs) { + init_waitqueue_head(&pipe->wait); + pipe->r_counter = pipe->w_counter = 1; + pipe->inode = inode; + pipe->buffers = PIPE_DEF_BUFFERS; + return pipe; + } + kfree(pipe); } - return pipe; + return NULL; } void __free_pipe_info(struct pipe_inode_info *pipe) { int i; - for (i = 0; i < PIPE_BUFFERS; i++) { + for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) buf->ops->release(pipe, buf); } if (pipe->tmp_page) __free_page(pipe->tmp_page); + kfree(pipe->bufs); kfree(pipe); } @@ -1093,6 +1101,81 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes) return sys_pipe2(fildes, 0); } +/* + * Allocate a new array of pipe buffers and copy the info over. Returns the + * pipe size if successful, or return -ERROR on error. + */ +static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) +{ + struct pipe_buffer *bufs; + + /* + * Must be a power-of-2 currently + */ + if (!is_power_of_2(arg)) + return -EINVAL; + + /* + * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't + * expect a lot of shrink+grow operations, just free and allocate + * again like we would do for growing. If the pipe currently + * contains more buffers than arg, then return busy. + */ + if (arg < pipe->nrbufs) + return -EBUSY; + + bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL); + if (unlikely(!bufs)) + return -ENOMEM; + + /* + * The pipe array wraps around, so just start the new one at zero + * and adjust the indexes. + */ + if (pipe->nrbufs) { + const unsigned int tail = pipe->nrbufs & (pipe->buffers - 1); + const unsigned int head = pipe->nrbufs - tail; + + if (head) + memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer)); + if (tail) + memcpy(bufs + head, pipe->bufs + pipe->curbuf, tail * sizeof(struct pipe_buffer)); + } + + pipe->curbuf = 0; + kfree(pipe->bufs); + pipe->bufs = bufs; + pipe->buffers = arg; + return arg; +} + +long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct pipe_inode_info *pipe; + long ret; + + pipe = file->f_path.dentry->d_inode->i_pipe; + if (!pipe) + return -EBADF; + + mutex_lock(&pipe->inode->i_mutex); + + switch (cmd) { + case F_SETPIPE_SZ: + ret = pipe_set_size(pipe, arg); + break; + case F_GETPIPE_SZ: + ret = pipe->buffers; + break; + default: + ret = -EINVAL; + break; + } + + mutex_unlock(&pipe->inode->i_mutex); + return ret; +} + /* * pipefs should _never_ be mounted by userland - too much of security hassle, * no real gain from having the whole whorehouse mounted. 
So we don't need diff --git a/fs/splice.c b/fs/splice.c index 9313b6124a2e..ac22b00d86c3 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -193,8 +193,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, break; } - if (pipe->nrbufs < PIPE_BUFFERS) { - int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); + if (pipe->nrbufs < pipe->buffers) { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); struct pipe_buffer *buf = pipe->bufs + newbuf; buf->page = spd->pages[page_nr]; @@ -214,7 +214,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, if (!--spd->nr_pages) break; - if (pipe->nrbufs < PIPE_BUFFERS) + if (pipe->nrbufs < pipe->buffers) continue; break; @@ -265,6 +265,36 @@ static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) page_cache_release(spd->pages[i]); } +/* + * Check if we need to grow the arrays holding pages and partial page + * descriptions. + */ +int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) +{ + if (pipe->buffers <= PIPE_DEF_BUFFERS) + return 0; + + spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL); + spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL); + + if (spd->pages && spd->partial) + return 0; + + kfree(spd->pages); + kfree(spd->partial); + return -ENOMEM; +} + +void splice_shrink_spd(struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) +{ + if (pipe->buffers <= PIPE_DEF_BUFFERS) + return; + + kfree(spd->pages); + kfree(spd->partial); +} + static int __generic_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, @@ -272,8 +302,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, { struct address_space *mapping = in->f_mapping; unsigned int loff, nr_pages, req_pages; - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; struct page *page; pgoff_t index, end_index; loff_t isize; @@ -286,15 +316,18 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, .spd_release = spd_release_page, }; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + index = *ppos >> PAGE_CACHE_SHIFT; loff = *ppos & ~PAGE_CACHE_MASK; req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS); + nr_pages = min(req_pages, pipe->buffers); /* * Lookup the (hopefully) full range of pages we need. 
*/ - spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages); + spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages); index += spd.nr_pages; /* @@ -335,7 +368,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, unlock_page(page); } - pages[spd.nr_pages++] = page; + spd.pages[spd.nr_pages++] = page; index++; } @@ -356,7 +389,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, * this_len is the max we'll use from this page */ this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); - page = pages[page_nr]; + page = spd.pages[page_nr]; if (PageReadahead(page)) page_cache_async_readahead(mapping, &in->f_ra, in, @@ -393,8 +426,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, error = -ENOMEM; break; } - page_cache_release(pages[page_nr]); - pages[page_nr] = page; + page_cache_release(spd.pages[page_nr]); + spd.pages[page_nr] = page; } /* * page was already under io and is now done, great @@ -451,8 +484,8 @@ fill_it: len = this_len; } - partial[page_nr].offset = loff; - partial[page_nr].len = this_len; + spd.partial[page_nr].offset = loff; + spd.partial[page_nr].len = this_len; len -= this_len; loff = 0; spd.nr_pages++; @@ -464,12 +497,13 @@ fill_it: * we got, 'nr_pages' is how many pages are in the map. */ while (page_nr < nr_pages) - page_cache_release(pages[page_nr++]); + page_cache_release(spd.pages[page_nr++]); in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; if (spd.nr_pages) - return splice_to_pipe(pipe, &spd); + error = splice_to_pipe(pipe, &spd); + splice_shrink_spd(pipe, &spd); return error; } @@ -560,9 +594,9 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, unsigned int nr_pages; unsigned int nr_freed; size_t offset; - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; - struct iovec vec[PIPE_BUFFERS]; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; + struct iovec *vec, __vec[PIPE_DEF_BUFFERS]; pgoff_t index; ssize_t res; size_t this_len; @@ -576,11 +610,22 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, .spd_release = spd_release_page, }; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + res = -ENOMEM; + vec = __vec; + if (pipe->buffers > PIPE_DEF_BUFFERS) { + vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL); + if (!vec) + goto shrink_ret; + } + index = *ppos >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) { + for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) { struct page *page; page = alloc_page(GFP_USER); @@ -591,7 +636,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); vec[i].iov_base = (void __user *) page_address(page); vec[i].iov_len = this_len; - pages[i] = page; + spd.pages[i] = page; spd.nr_pages++; len -= this_len; offset = 0; @@ -610,11 +655,11 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, nr_freed = 0; for (i = 0; i < spd.nr_pages; i++) { this_len = min_t(size_t, vec[i].iov_len, res); - partial[i].offset = 0; - partial[i].len = this_len; + spd.partial[i].offset = 0; + spd.partial[i].len = this_len; if (!this_len) { - __free_page(pages[i]); - pages[i] = NULL; + __free_page(spd.pages[i]); + spd.pages[i] = NULL; nr_freed++; } res -= this_len; @@ -625,13 +670,18 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, if 
(res > 0) *ppos += res; +shrink_ret: + if (vec != __vec) + kfree(vec); + splice_shrink_spd(pipe, &spd); return res; err: for (i = 0; i < spd.nr_pages; i++) - __free_page(pages[i]); + __free_page(spd.pages[i]); - return error; + res = error; + goto shrink_ret; } EXPORT_SYMBOL(default_file_splice_read); @@ -784,7 +834,7 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, if (!buf->len) { buf->ops = NULL; ops->release(pipe, buf); - pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); + pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); pipe->nrbufs--; if (pipe->inode) sd->need_wakeup = true; @@ -1211,7 +1261,7 @@ out_release: * If we did an incomplete transfer we must release * the pipe buffers in question: */ - for (i = 0; i < PIPE_BUFFERS; i++) { + for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) { @@ -1371,7 +1421,8 @@ static long do_splice(struct file *in, loff_t __user *off_in, */ static int get_iovec_page_array(const struct iovec __user *iov, unsigned int nr_vecs, struct page **pages, - struct partial_page *partial, int aligned) + struct partial_page *partial, int aligned, + unsigned int pipe_buffers) { int buffers = 0, error = 0; @@ -1414,8 +1465,8 @@ static int get_iovec_page_array(const struct iovec __user *iov, break; npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (npages > PIPE_BUFFERS - buffers) - npages = PIPE_BUFFERS - buffers; + if (npages > pipe_buffers - buffers) + npages = pipe_buffers - buffers; error = get_user_pages_fast((unsigned long)base, npages, 0, &pages[buffers]); @@ -1450,7 +1501,7 @@ static int get_iovec_page_array(const struct iovec __user *iov, * or if we mapped the max number of pages that we have * room for. */ - if (error < npages || buffers == PIPE_BUFFERS) + if (error < npages || buffers == pipe_buffers) break; nr_vecs--; @@ -1593,8 +1644,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, unsigned long nr_segs, unsigned int flags) { struct pipe_inode_info *pipe; - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { .pages = pages, .partial = partial, @@ -1602,17 +1653,25 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, .ops = &user_page_pipe_buf_ops, .spd_release = spd_release_page, }; + long ret; pipe = pipe_info(file->f_path.dentry->d_inode); if (!pipe) return -EBADF; - spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, - flags & SPLICE_F_GIFT); + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages, + spd.partial, flags & SPLICE_F_GIFT, + pipe->buffers); if (spd.nr_pages <= 0) - return spd.nr_pages; + ret = spd.nr_pages; + else + ret = splice_to_pipe(pipe, &spd); - return splice_to_pipe(pipe, &spd); + splice_shrink_spd(pipe, &spd); + return ret; } /* @@ -1738,13 +1797,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) * Check ->nrbufs without the inode lock first. This function * is speculative anyways, so missing one is ok. 
*/ - if (pipe->nrbufs < PIPE_BUFFERS) + if (pipe->nrbufs < pipe->buffers) return 0; ret = 0; pipe_lock(pipe); - while (pipe->nrbufs >= PIPE_BUFFERS) { + while (pipe->nrbufs >= pipe->buffers) { if (!pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; @@ -1810,7 +1869,7 @@ retry: * Cannot make any progress, because either the input * pipe is empty or the output pipe is full. */ - if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) { + if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) { /* Already processed some buffers, break */ if (ret) break; @@ -1831,7 +1890,7 @@ retry: } ibuf = ipipe->bufs + ipipe->curbuf; - nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS; + nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); obuf = opipe->bufs + nbuf; if (len >= ibuf->len) { @@ -1841,7 +1900,7 @@ retry: *obuf = *ibuf; ibuf->ops = NULL; opipe->nrbufs++; - ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS; + ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1); ipipe->nrbufs--; input_wakeup = true; } else { @@ -1914,11 +1973,11 @@ static int link_pipe(struct pipe_inode_info *ipipe, * If we have iterated all input buffers or ran out of * output room, break. */ - if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) + if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) break; - ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); - nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); + ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1)); + nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); /* * Get a reference to this pipe buffer, diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h index 86037400a6e3..afc00af3229b 100644 --- a/include/linux/fcntl.h +++ b/include/linux/fcntl.h @@ -21,6 +21,12 @@ */ #define F_NOTIFY (F_LINUX_SPECIFIC_BASE+2) +/* + * Set and get of pipe page size array + */ +#define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7) +#define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8) + /* * Types of directory notifications that may be requested. 
*/ diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index b43a9e039059..65f4282fcbaf 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -3,7 +3,7 @@ #define PIPEFS_MAGIC 0x50495045 -#define PIPE_BUFFERS (16) +#define PIPE_DEF_BUFFERS 16 #define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */ #define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ @@ -44,17 +44,17 @@ struct pipe_buffer { **/ struct pipe_inode_info { wait_queue_head_t wait; - unsigned int nrbufs, curbuf; - struct page *tmp_page; + unsigned int nrbufs, curbuf, buffers; unsigned int readers; unsigned int writers; unsigned int waiting_writers; unsigned int r_counter; unsigned int w_counter; + struct page *tmp_page; struct fasync_struct *fasync_readers; struct fasync_struct *fasync_writers; struct inode *inode; - struct pipe_buffer bufs[PIPE_BUFFERS]; + struct pipe_buffer *bufs; }; /* @@ -154,4 +154,7 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); +/* for F_SETPIPE_SZ and F_GETPIPE_SZ */ +long pipe_fcntl(struct file *, unsigned int, unsigned long arg); + #endif diff --git a/include/linux/splice.h b/include/linux/splice.h index 18e7c7c0cae6..997c3b4c212b 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -82,4 +82,11 @@ extern ssize_t splice_to_pipe(struct pipe_inode_info *, extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, splice_direct_actor *); +/* + * for dynamic pipe sizing + */ +extern int splice_grow_spd(struct pipe_inode_info *, struct splice_pipe_desc *); +extern void splice_shrink_spd(struct pipe_inode_info *, + struct splice_pipe_desc *); + #endif diff --git a/kernel/relay.c b/kernel/relay.c index 3d97f2821611..4268287148c1 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1231,8 +1231,8 @@ static ssize_t subbuf_splice_actor(struct file *in, size_t read_subbuf = read_start / subbuf_size; size_t padding = rbuf->padding[read_subbuf]; size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { .pages = pages, .nr_pages = 0, @@ -1245,6 +1245,8 @@ static ssize_t subbuf_splice_actor(struct file *in, if (rbuf->subbufs_produced == rbuf->subbufs_consumed) return 0; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; /* * Adjust read len, if longer than what is available @@ -1255,7 +1257,7 @@ static ssize_t subbuf_splice_actor(struct file *in, subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; pidx = (read_start / PAGE_SIZE) % subbuf_pages; poff = read_start & ~PAGE_MASK; - nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS); + nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers); for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { unsigned int this_len, this_end, private; @@ -1289,16 +1291,19 @@ static ssize_t subbuf_splice_actor(struct file *in, } } + ret = 0; if (!spd.nr_pages) - return 0; + goto out; ret = *nonpad_ret = splice_to_pipe(pipe, &spd); if (ret < 0 || ret < total_len) - return ret; + goto out; if (read_start + ret == nonpad_end) ret += padding; +out: + splice_shrink_spd(pipe, &spd); return ret; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 44f916a04065..7b155a0e6f31 100644 --- 
a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3269,12 +3269,12 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, size_t len, unsigned int flags) { - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; + struct page *pages_def[PIPE_DEF_BUFFERS]; + struct partial_page partial_def[PIPE_DEF_BUFFERS]; struct trace_iterator *iter = filp->private_data; struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, + .pages = pages_def, + .partial = partial_def, .nr_pages = 0, /* This gets updated below. */ .flags = flags, .ops = &tracing_pipe_buf_ops, @@ -3285,6 +3285,9 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, size_t rem; unsigned int i; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); if (unlikely(old_tracer != current_trace && current_trace)) { @@ -3315,23 +3318,23 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, trace_access_lock(iter->cpu_file); /* Fill as many pages as possible. */ - for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { - pages[i] = alloc_page(GFP_KERNEL); - if (!pages[i]) + for (i = 0, rem = len; i < pipe->buffers && rem; i++) { + spd.pages[i] = alloc_page(GFP_KERNEL); + if (!spd.pages[i]) break; rem = tracing_fill_pipe_page(rem, iter); /* Copy the data into the page, so we can start over. */ ret = trace_seq_to_buffer(&iter->seq, - page_address(pages[i]), + page_address(spd.pages[i]), iter->seq.len); if (ret < 0) { - __free_page(pages[i]); + __free_page(spd.pages[i]); break; } - partial[i].offset = 0; - partial[i].len = iter->seq.len; + spd.partial[i].offset = 0; + spd.partial[i].len = iter->seq.len; trace_seq_init(&iter->seq); } @@ -3342,12 +3345,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, spd.nr_pages = i; - return splice_to_pipe(pipe, &spd); + ret = splice_to_pipe(pipe, &spd); +out: + splice_shrink_spd(pipe, &spd); + return ret; out_err: mutex_unlock(&iter->mutex); - - return ret; + goto out; } static ssize_t @@ -3746,11 +3751,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, unsigned int flags) { struct ftrace_buffer_info *info = file->private_data; - struct partial_page partial[PIPE_BUFFERS]; - struct page *pages[PIPE_BUFFERS]; + struct partial_page partial_def[PIPE_DEF_BUFFERS]; + struct page *pages_def[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, + .pages = pages_def, + .partial = partial_def, .flags = flags, .ops = &buffer_pipe_buf_ops, .spd_release = buffer_spd_release, @@ -3759,22 +3764,28 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, int entries, size, i; size_t ret; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + if (*ppos & (PAGE_SIZE - 1)) { WARN_ONCE(1, "Ftrace: previous read must page-align\n"); - return -EINVAL; + ret = -EINVAL; + goto out; } if (len & (PAGE_SIZE - 1)) { WARN_ONCE(1, "Ftrace: splice_read should page-align\n"); - if (len < PAGE_SIZE) - return -EINVAL; + if (len < PAGE_SIZE) { + ret = -EINVAL; + goto out; + } len &= PAGE_MASK; } trace_access_lock(info->cpu); entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); - for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { + for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { struct page *page; int r; @@ -3829,11 +3840,12 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, else ret = 0; /* TODO: block */ - return ret; + goto out; } ret = 
splice_to_pipe(pipe, &spd); - + splice_shrink_spd(pipe, &spd); +out: return ret; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 93c4e060c91e..931981774b1a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1417,12 +1417,13 @@ new_page: /* * Fill page/offset/length into spd, if it can hold more pages. */ -static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page, +static inline int spd_fill_page(struct splice_pipe_desc *spd, + struct pipe_inode_info *pipe, struct page *page, unsigned int *len, unsigned int offset, struct sk_buff *skb, int linear, struct sock *sk) { - if (unlikely(spd->nr_pages == PIPE_BUFFERS)) + if (unlikely(spd->nr_pages == pipe->buffers)) return 1; if (linear) { @@ -1458,7 +1459,8 @@ static inline int __splice_segment(struct page *page, unsigned int poff, unsigned int plen, unsigned int *off, unsigned int *len, struct sk_buff *skb, struct splice_pipe_desc *spd, int linear, - struct sock *sk) + struct sock *sk, + struct pipe_inode_info *pipe) { if (!*len) return 1; @@ -1481,7 +1483,7 @@ static inline int __splice_segment(struct page *page, unsigned int poff, /* the linear region may spread across several pages */ flen = min_t(unsigned int, flen, PAGE_SIZE - poff); - if (spd_fill_page(spd, page, &flen, poff, skb, linear, sk)) + if (spd_fill_page(spd, pipe, page, &flen, poff, skb, linear, sk)) return 1; __segment_seek(&page, &poff, &plen, flen); @@ -1496,9 +1498,9 @@ static inline int __splice_segment(struct page *page, unsigned int poff, * Map linear and fragment data from the skb to spd. It reports failure if the * pipe is full or if we already spliced the requested length. */ -static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, - unsigned int *len, struct splice_pipe_desc *spd, - struct sock *sk) +static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, + unsigned int *offset, unsigned int *len, + struct splice_pipe_desc *spd, struct sock *sk) { int seg; @@ -1508,7 +1510,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, if (__splice_segment(virt_to_page(skb->data), (unsigned long) skb->data & (PAGE_SIZE - 1), skb_headlen(skb), - offset, len, skb, spd, 1, sk)) + offset, len, skb, spd, 1, sk, pipe)) return 1; /* @@ -1518,7 +1520,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; if (__splice_segment(f->page, f->page_offset, f->size, - offset, len, skb, spd, 0, sk)) + offset, len, skb, spd, 0, sk, pipe)) return 1; } @@ -1535,8 +1537,8 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, struct pipe_inode_info *pipe, unsigned int tlen, unsigned int flags) { - struct partial_page partial[PIPE_BUFFERS]; - struct page *pages[PIPE_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; + struct page *pages[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { .pages = pages, .partial = partial, @@ -1546,12 +1548,16 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, }; struct sk_buff *frag_iter; struct sock *sk = skb->sk; + int ret = 0; + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; /* * __skb_splice_bits() only fails if the output has no room left, * so no point in going over the frag_list for the error case. 
*/ - if (__skb_splice_bits(skb, &offset, &tlen, &spd, sk)) + if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk)) goto done; else if (!tlen) goto done; @@ -1562,14 +1568,12 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, skb_walk_frags(skb, frag_iter) { if (!tlen) break; - if (__skb_splice_bits(frag_iter, &offset, &tlen, &spd, sk)) + if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk)) break; } done: if (spd.nr_pages) { - int ret; - /* * Drop the socket lock, otherwise we have reverse * locking dependencies between sk_lock and i_mutex @@ -1582,10 +1586,10 @@ done: release_sock(sk); ret = splice_to_pipe(pipe, &spd); lock_sock(sk); - return ret; } - return 0; + splice_shrink_spd(pipe, &spd); + return ret; } /** -- cgit v1.2.3-59-g8ed1b From b492e95be0ae672922f4734acf3f5d35c30be948 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 May 2010 21:03:16 +0200 Subject: pipe: set lower and upper limit on max pages in the pipe page array We need at least two to guarantee proper POSIX behaviour, so never allow a smaller limit than that. Also expose a /proc/sys/fs/pipe-max-pages sysctl file that allows root to define a sane upper limit. Make it default to 16 times the default size, which is 16 pages. Signed-off-by: Jens Axboe --- fs/pipe.c | 15 +++++++++++++++ include/linux/pipe_fs_i.h | 2 ++ kernel/sysctl.c | 9 +++++++++ 3 files changed, 26 insertions(+) diff --git a/fs/pipe.c b/fs/pipe.c index 054b8a6a2c7a..d79872eba09a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -19,10 +19,17 @@ #include #include #include +#include #include #include +/* + * The max size that a non-root user is allowed to grow the pipe. Can + * be set by root in /proc/sys/fs/pipe-max-pages + */ +unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16; + /* * We use a start+len construction, which provides full use of the * allocated memory. @@ -1162,6 +1169,14 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) switch (cmd) { case F_SETPIPE_SZ: + if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages) + return -EINVAL; + /* + * The pipe needs to be at least 2 pages large to + * guarantee POSIX behaviour. + */ + if (arg < 2) + return -EINVAL; ret = pipe_set_size(pipe, arg); break; case F_GETPIPE_SZ: diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 65f4282fcbaf..16de3933c45e 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -139,6 +139,8 @@ void pipe_lock(struct pipe_inode_info *); void pipe_unlock(struct pipe_inode_info *); void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); +extern unsigned int pipe_max_pages; + /* Drop the inode semaphore and wait for a pipe event, atomically */ void pipe_wait(struct pipe_inode_info *pipe); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8686b0f5fc12..c649d1c5fe09 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include @@ -1423,6 +1424,14 @@ static struct ctl_table fs_table[] = { .child = binfmt_misc_table, }, #endif + { + .procname = "pipe-max-pages", + .data = &pipe_max_pages, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &two, + }, /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt -- cgit v1.2.3-59-g8ed1b