From 1e3335de05da3dfbe48b8caa03db1834a2133256 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 14 Feb 2007 19:59:49 +0100
Subject: cfq-iosched: improve preemption for cooperating tasks

When testing the syslet async io approach, I discovered that CFQ
sometimes didn't perform as well as expected. cfq_should_preempt()
needs to better check for cooperating tasks, so fix that by allowing
preemption of an equal priority queue if the recently queued request
is as good a candidate for IO as the one we are currently waiting for.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index f92ba2a869b4..bfb396774cbb 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -867,15 +867,11 @@ static int cfq_arm_slice_timer(struct cfq_data *cfqd)
 
 static void cfq_dispatch_insert(request_queue_t *q, struct request *rq)
 {
-	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct cfq_queue *cfqq = RQ_CFQQ(rq);
 
 	cfq_remove_request(rq);
 	cfqq->on_dispatch[rq_is_sync(rq)]++;
 	elv_dispatch_sort(q, rq);
-
-	rq = list_entry(q->queue_head.prev, struct request, queuelist);
-	cfqd->last_sector = rq->sector + rq->nr_sectors;
 }
 
 /*
@@ -1585,6 +1581,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 		   struct request *rq)
 {
 	struct cfq_queue *cfqq = cfqd->active_queue;
+	sector_t dist;
 
 	if (cfq_class_idle(new_cfqq))
 		return 0;
@@ -1594,14 +1591,14 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 
 	if (cfq_class_idle(cfqq))
 		return 1;
-	if (!cfq_cfqq_wait_request(new_cfqq))
-		return 0;
+
 	/*
 	 * if the new request is sync, but the currently running queue is
 	 * not, let the sync request have priority.
 	 */
 	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
 		return 1;
+
 	/*
 	 * So both queues are sync. Let the new request get disk time if
 	 * it's a metadata request and the current queue is doing regular IO.
@@ -1609,6 +1606,21 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 	if (rq_is_meta(rq) && !cfqq->meta_pending)
 		return 1;
 
+	if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
+		return 0;
+
+	/*
+	 * if this request is as-good as one we would expect from the
+	 * current cfqq, let it preempt
+	 */
+	if (rq->sector > cfqd->last_sector)
+		dist = rq->sector - cfqd->last_sector;
+	else
+		dist = cfqd->last_sector - rq->sector;
+
+	if (dist <= cfqd->active_cic->seek_mean)
+		return 1;
+
 	return 0;
 }
 
@@ -1719,6 +1731,8 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq)
 	cfqq->on_dispatch[sync]--;
 	cfqq->service_last = now;
 
+	cfqd->last_sector = rq->hard_sector + rq->hard_nr_sectors;
+
 	if (!cfq_class_idle(cfqq))
 		cfqd->last_end_request = now;
 
-- 
cgit v1.2.3-59-g8ed1b


From 6d048f5310aa2dda2b5acd947eab3598c25e269f Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 25 Apr 2007 12:44:27 +0200
Subject: cfq-iosched: development update

- Implement logic for detecting cooperating processes, so we
  choose the best available queue whenever possible.

- Improve residual slice time accounting.

- Remove dead code: we no longer see async requests coming in on
  sync queues. That part was removed a long time ago. That means
  that we can also remove the difference between cfq_cfqq_sync()
  and cfq_cfqq_class_sync(), they are now indentical. And we can
  kill the on_dispatch array, just make it a counter.

- Allow a process to go into the current list, if it hasn't been
  serviced in this scheduler tick yet.

Possible future improvements including caching the cfqq lookup
in cfq_close_cooperator(), so we don't have to look it up twice.
cfq_get_best_queue() should just use that last decision instead
of doing it again.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 381 +++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 261 insertions(+), 120 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index bfb396774cbb..28236f2cd908 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -56,13 +56,7 @@ static struct completion *ioc_gone;
 #define ASYNC			(0)
 #define SYNC			(1)
 
-#define cfq_cfqq_dispatched(cfqq)	\
-	((cfqq)->on_dispatch[ASYNC] + (cfqq)->on_dispatch[SYNC])
-
-#define cfq_cfqq_class_sync(cfqq)	((cfqq)->key != CFQ_KEY_ASYNC)
-
-#define cfq_cfqq_sync(cfqq)		\
-	(cfq_cfqq_class_sync(cfqq) || (cfqq)->on_dispatch[SYNC])
+#define cfq_cfqq_sync(cfqq)	((cfqq)->key != CFQ_KEY_ASYNC)
 
 #define sample_valid(samples)	((samples) > 80)
 
@@ -79,6 +73,7 @@ struct cfq_data {
 	struct list_head busy_rr;
 	struct list_head cur_rr;
 	struct list_head idle_rr;
+	unsigned long cur_rr_tick;
 	unsigned int busy_queues;
 
 	/*
@@ -98,11 +93,12 @@ struct cfq_data {
 	struct cfq_queue *active_queue;
 	struct cfq_io_context *active_cic;
 	int cur_prio, cur_end_prio;
+	unsigned long prio_time;
 	unsigned int dispatch_slice;
 
 	struct timer_list idle_class_timer;
 
-	sector_t last_sector;
+	sector_t last_position;
 	unsigned long last_end_request;
 
 	/*
@@ -117,6 +113,9 @@ struct cfq_data {
 	unsigned int cfq_slice_idle;
 
 	struct list_head cic_list;
+
+	sector_t new_seek_mean;
+	u64 new_seek_total;
 };
 
 /*
@@ -133,6 +132,8 @@ struct cfq_queue {
 	unsigned int key;
 	/* member of the rr/busy/cur/idle cfqd list */
 	struct list_head cfq_list;
+	/* in what tick we were last serviced */
+	unsigned long rr_tick;
 	/* sorted list of pending requests */
 	struct rb_root sort_list;
 	/* if fifo isn't expired, next request to serve */
@@ -148,10 +149,11 @@ struct cfq_queue {
 
 	unsigned long slice_end;
 	unsigned long service_last;
+	unsigned long slice_start;
 	long slice_resid;
 
-	/* number of requests that are on the dispatch list */
-	int on_dispatch[2];
+	/* number of requests that are on the dispatch list or inside driver */
+	int dispatched;
 
 	/* io prio of this group */
 	unsigned short ioprio, org_ioprio;
@@ -159,6 +161,8 @@ struct cfq_queue {
 
 	/* various state flags, see below */
 	unsigned int flags;
+
+	sector_t last_request_pos;
 };
 
 enum cfqq_state_flags {
@@ -259,6 +263,8 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	 * easily introduce oscillations.
 	 */
 	cfqq->slice_resid = 0;
+
+	cfqq->slice_start = jiffies;
 }
 
 /*
@@ -307,7 +313,7 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
 	s1 = rq1->sector;
 	s2 = rq2->sector;
 
-	last = cfqd->last_sector;
+	last = cfqd->last_position;
 
 	/*
 	 * by definition, 1KiB is 2 sectors
@@ -398,39 +404,42 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	return cfq_choose_req(cfqd, next, prev);
 }
 
-static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
+/*
+ * This function finds out where to insert a BE queue in the service hierarchy
+ */
+static void cfq_resort_be_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+				int preempted)
 {
-	struct cfq_data *cfqd = cfqq->cfqd;
 	struct list_head *list, *n;
 	struct cfq_queue *__cfqq;
+	int add_tail = 0;
 
 	/*
-	 * Resorting requires the cfqq to be on the RR list already.
+	 * if cfqq has requests in flight, don't allow it to be
+	 * found in cfq_set_active_queue before it has finished them.
+	 * this is done to increase fairness between a process that
+	 * has lots of io pending vs one that only generates one
+	 * sporadically or synchronously
 	 */
-	if (!cfq_cfqq_on_rr(cfqq))
-		return;
-
-	list_del(&cfqq->cfq_list);
-
-	if (cfq_class_rt(cfqq))
+	if (cfqq->dispatched)
+		list = &cfqd->busy_rr;
+	else if (cfqq->ioprio == (cfqd->cur_prio + 1) &&
+		 cfq_cfqq_sync(cfqq) &&
+		 (time_before(cfqd->prio_time, cfqq->service_last) ||
+		  cfq_cfqq_queue_new(cfqq) || preempted)) {
 		list = &cfqd->cur_rr;
-	else if (cfq_class_idle(cfqq))
-		list = &cfqd->idle_rr;
-	else {
+		add_tail = 1;
+	} else
+		list = &cfqd->rr_list[cfqq->ioprio];
+
+	if (!cfq_cfqq_sync(cfqq) || add_tail) {
 		/*
-		 * if cfqq has requests in flight, don't allow it to be
-		 * found in cfq_set_active_queue before it has finished them.
-		 * this is done to increase fairness between a process that
-		 * has lots of io pending vs one that only generates one
-		 * sporadically or synchronously
+		 * async queue always goes to the end. this wont be overly
+		 * unfair to writes, as the sort of the sync queue wont be
+		 * allowed to pass the async queue again.
 		 */
-		if (cfq_cfqq_dispatched(cfqq))
-			list = &cfqd->busy_rr;
-		else
-			list = &cfqd->rr_list[cfqq->ioprio];
-	}
-
-	if (preempted || cfq_cfqq_queue_new(cfqq)) {
+		list_add_tail(&cfqq->cfq_list, list);
+	} else if (preempted || cfq_cfqq_queue_new(cfqq)) {
 		/*
 		 * If this queue was preempted or is new (never been serviced),
 		 * let it be added first for fairness but beind other new
@@ -444,14 +453,7 @@ static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
 
 			n = n->next;
 		}
-		list_add_tail(&cfqq->cfq_list, n);
-	} else if (!cfq_cfqq_class_sync(cfqq)) {
-		/*
-		 * async queue always goes to the end. this wont be overly
-		 * unfair to writes, as the sort of the sync queue wont be
-		 * allowed to pass the async queue again.
-		 */
-		list_add_tail(&cfqq->cfq_list, list);
+		list_add(&cfqq->cfq_list, n);
 	} else {
 		/*
 		 * sort by last service, but don't cross a new or async
@@ -461,17 +463,54 @@ static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
 		 */
 		n = list;
 		while ((n = n->prev) != list) {
-			struct cfq_queue *__cfqq = list_entry_cfqq(n);
+			struct cfq_queue *__c = list_entry_cfqq(n);
 
-			if (!cfq_cfqq_class_sync(cfqq) || !__cfqq->service_last)
+			if (!cfq_cfqq_sync(__c) || !__c->service_last)
 				break;
-			if (time_before(__cfqq->service_last, cfqq->service_last))
+			if (time_before(__c->service_last, cfqq->service_last))
 				break;
 		}
 		list_add(&cfqq->cfq_list, n);
 	}
 }
 
+static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
+{
+	struct cfq_data *cfqd = cfqq->cfqd;
+	struct list_head *n;
+
+	/*
+	 * Resorting requires the cfqq to be on the RR list already.
+	 */
+	if (!cfq_cfqq_on_rr(cfqq))
+		return;
+
+	list_del(&cfqq->cfq_list);
+
+	if (cfq_class_rt(cfqq)) {
+		/*
+		 * At to the front of the current list, but behind other
+		 * RT queues.
+		 */
+		n = &cfqd->cur_rr;
+		while (n->next != &cfqd->cur_rr)
+			if (!cfq_class_rt(cfqq))
+				break;
+
+		list_add(&cfqq->cfq_list, n);
+	} else if (cfq_class_idle(cfqq)) {
+		/*
+		 * IDLE goes to the tail of the idle list
+		 */
+		list_add_tail(&cfqq->cfq_list, &cfqd->idle_rr);
+	} else {
+		/*
+		 * So we get here, ergo the queue is a regular best-effort queue
+		 */
+		cfq_resort_be_queue(cfqd, cfqq, preempted);
+	}
+}
+
 /*
  * add to busy list of queues for service, trying to be fair in ordering
  * the pending list according to last request service
@@ -579,6 +618,8 @@ static void cfq_activate_request(request_queue_t *q, struct request *rq)
 	 */
 	if (!cfqd->hw_tag && cfqd->rq_in_driver > 4)
 		cfqd->hw_tag = 1;
+
+	cfqd->last_position = rq->hard_sector + rq->hard_nr_sectors;
 }
 
 static void cfq_deactivate_request(request_queue_t *q, struct request *rq)
@@ -684,6 +725,7 @@ __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 		cfq_clear_cfqq_must_alloc_slice(cfqq);
 		cfq_clear_cfqq_fifo_expire(cfqq);
 		cfq_mark_cfqq_slice_new(cfqq);
+		cfqq->rr_tick = cfqd->cur_rr_tick;
 	}
 
 	cfqd->active_queue = cfqq;
@@ -786,10 +828,46 @@ static int cfq_get_next_prio_level(struct cfq_data *cfqd)
 		cfqd->cur_end_prio = 0;
 	}
 
+	cfqd->cur_rr_tick++;
+	cfqd->prio_time = jiffies;
 	return prio;
 }
 
-static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
+static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
+					  struct request *rq)
+{
+	if (rq->sector >= cfqd->last_position)
+		return rq->sector - cfqd->last_position;
+	else
+		return cfqd->last_position - rq->sector;
+}
+
+static struct cfq_queue *cfq_get_best_queue(struct cfq_data *cfqd)
+{
+	struct cfq_queue *cfqq = NULL, *__cfqq;
+	sector_t best = -1, dist;
+
+	list_for_each_entry(__cfqq, &cfqd->cur_rr, cfq_list) {
+		if (!__cfqq->next_rq || !cfq_cfqq_sync(__cfqq))
+			continue;
+
+		dist = cfq_dist_from_last(cfqd, __cfqq->next_rq);
+		if (dist < best) {
+			best = dist;
+			cfqq = __cfqq;
+		}
+	}
+
+	/*
+	 * Only async queue(s) available, grab first entry
+	 */
+	if (!cfqq)
+		cfqq = list_entry_cfqq(cfqd->cur_rr.next);
+
+	return cfqq;
+}
+
+static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 {
 	struct cfq_queue *cfqq = NULL;
 
@@ -799,7 +877,7 @@ static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
 		 * empty, get next prio level and grab first entry then if any
 		 * are spliced
 		 */
-		cfqq = list_entry_cfqq(cfqd->cur_rr.next);
+		cfqq = cfq_get_best_queue(cfqd);
 	} else if (!list_empty(&cfqd->busy_rr)) {
 		/*
 		 * If no new queues are available, check if the busy list has
@@ -820,49 +898,128 @@ static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
 			mod_timer(&cfqd->idle_class_timer, end);
 	}
 
+	return cfqq;
+}
+
+static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
+{
+	struct cfq_queue *cfqq;
+
+	do {
+		long prio;
+
+		cfqq = cfq_get_next_queue(cfqd);
+		if (!cfqq)
+			break;
+
+		prio = cfq_prio_to_slice(cfqd, cfqq);
+		if (cfqq->slice_resid > -prio)
+			break;
+
+		cfqq->slice_resid += prio;
+		list_del_init(&cfqq->cfq_list);
+		list_add_tail(&cfqq->cfq_list, &cfqd->rr_list[cfqq->ioprio]);
+		cfqq = NULL;
+	} while (1);
+
 	__cfq_set_active_queue(cfqd, cfqq);
 	return cfqq;
 }
 
-#define CIC_SEEKY(cic) ((cic)->seek_mean > (128 * 1024))
+static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
+{
+	struct cfq_io_context *cic = cfqd->active_cic;
+
+	if (!sample_valid(cic->seek_samples))
+		return 0;
+
+	return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean;
+}
+
+static struct cfq_queue *__cfq_close_cooperator(struct cfq_data *cfqd,
+						struct cfq_queue *cur_cfqq,
+						struct list_head *list)
+{
+	struct cfq_queue *cfqq;
+
+	list_for_each_entry(cfqq, list, cfq_list) {
+		if (cfqq == cur_cfqq || !cfq_cfqq_sync(cfqq))
+			continue;
+
+		BUG_ON(!cfqq->next_rq);
+
+		if (cfq_rq_close(cfqd, cfqq->next_rq))
+			return cfqq;
+	}
+
+	return NULL;
+}
+
+static int cfq_close_cooperator(struct cfq_data *cfqd,
+				struct cfq_queue *cur_cfqq)
+{
+	struct cfq_queue *cfqq;
+
+	if (!cfqd->busy_queues)
+		return 0;
+
+	/*
+	 * check cur_rr and same-prio rr_list for candidates
+	 */
+	cfqq = __cfq_close_cooperator(cfqd, cur_cfqq, &cfqd->cur_rr);
+	if (cfqq)
+		return 1;
+
+	cfqq = __cfq_close_cooperator(cfqd, cur_cfqq, &cfqd->rr_list[cur_cfqq->ioprio]);
+	if (cfqq && (cfqq->rr_tick == cfqd->cur_rr_tick))
+		cfqq = NULL;
+
+	return cfqq != NULL;
+}
+
+#define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024))
 
-static int cfq_arm_slice_timer(struct cfq_data *cfqd)
+static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 {
 	struct cfq_queue *cfqq = cfqd->active_queue;
 	struct cfq_io_context *cic;
 	unsigned long sl;
 
 	WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
+	WARN_ON(cfq_cfqq_slice_new(cfqq));
 
 	/*
 	 * idle is disabled, either manually or by past process history
 	 */
-	if (!cfqd->cfq_slice_idle)
-		return 0;
-	if (!cfq_cfqq_idle_window(cfqq))
-		return 0;
+	if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq))
+		return;
+
 	/*
 	 * task has exited, don't wait
 	 */
 	cic = cfqd->active_cic;
 	if (!cic || !cic->ioc->task)
-		return 0;
+		return;
+
+	/*
+	 * See if this prio level has a good candidate
+	 */
+	if (cfq_close_cooperator(cfqd, cfqq))
+		return;
 
 	cfq_mark_cfqq_must_dispatch(cfqq);
 	cfq_mark_cfqq_wait_request(cfqq);
 
-	sl = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle);
-
 	/*
 	 * we don't want to idle for seeks, but we do want to allow
 	 * fair distribution of slice time for a process doing back-to-back
 	 * seeks. so allow a little bit of time for him to submit a new rq
 	 */
+	sl = cfqd->cfq_slice_idle;
 	if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
 		sl = min(sl, msecs_to_jiffies(2));
 
 	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
-	return 1;
 }
 
 static void cfq_dispatch_insert(request_queue_t *q, struct request *rq)
@@ -870,7 +1027,7 @@ static void cfq_dispatch_insert(request_queue_t *q, struct request *rq)
 	struct cfq_queue *cfqq = RQ_CFQQ(rq);
 
 	cfq_remove_request(rq);
-	cfqq->on_dispatch[rq_is_sync(rq)]++;
+	cfqq->dispatched++;
 	elv_dispatch_sort(q, rq);
 }
 
@@ -891,13 +1048,13 @@ static inline struct request *cfq_check_fifo(struct cfq_queue *cfqq)
 	if (list_empty(&cfqq->fifo))
 		return NULL;
 
-	fifo = cfq_cfqq_class_sync(cfqq);
+	fifo = cfq_cfqq_sync(cfqq);
 	rq = rq_entry_fifo(cfqq->fifo.next);
 
-	if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo]))
-		return rq;
+	if (time_before(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo]))
+		return NULL;
 
-	return NULL;
+	return rq;
 }
 
 static inline int
@@ -922,23 +1079,26 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 		goto new_queue;
 
 	/*
-	 * slice has expired
+	 * The active queue has run out of time, expire it and select new.
 	 */
-	if (!cfq_cfqq_must_dispatch(cfqq) && cfq_slice_used(cfqq))
+	if (cfq_slice_used(cfqq))
 		goto expire;
 
 	/*
-	 * if queue has requests, dispatch one. if not, check if
-	 * enough slice is left to wait for one
+	 * The active queue has requests and isn't expired, allow it to
+	 * dispatch.
 	 */
 	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
 		goto keep_queue;
-	else if (cfq_cfqq_slice_new(cfqq) || cfq_cfqq_dispatched(cfqq)) {
+
+	/*
+	 * No requests pending. If the active queue still has requests in
+	 * flight or is idling for a new request, allow either of these
+	 * conditions to happen (or time out) before selecting a new queue.
+	 */
+	if (cfqq->dispatched || timer_pending(&cfqd->idle_slice_timer)) {
 		cfqq = NULL;
 		goto keep_queue;
-	} else if (cfq_cfqq_class_sync(cfqq)) {
-		if (cfq_arm_slice_timer(cfqd))
-			return NULL;
 	}
 
 expire:
@@ -1039,7 +1199,7 @@ static int
 cfq_dispatch_requests(request_queue_t *q, int force)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
-	struct cfq_queue *cfqq, *prev_cfqq;
+	struct cfq_queue *cfqq;
 	int dispatched;
 
 	if (!cfqd->busy_queues)
@@ -1049,23 +1209,19 @@ cfq_dispatch_requests(request_queue_t *q, int force)
 		return cfq_forced_dispatch(cfqd);
 
 	dispatched = 0;
-	prev_cfqq = NULL;
 	while ((cfqq = cfq_select_queue(cfqd)) != NULL) {
 		int max_dispatch;
 
 		if (cfqd->busy_queues > 1) {
-			/*
-			 * Don't repeat dispatch from the previous queue.
-			 */
-			if (prev_cfqq == cfqq)
-				break;
-
 			/*
 			 * So we have dispatched before in this round, if the
 			 * next queue has idling enabled (must be sync), don't
-			 * allow it service until the previous have continued.
+			 * allow it service until the previous have completed.
 			 */
-			if (cfqd->rq_in_driver && cfq_cfqq_idle_window(cfqq))
+			if (cfqd->rq_in_driver && cfq_cfqq_idle_window(cfqq) &&
+			    dispatched)
+				break;
+			if (cfqq->dispatched >= cfqd->cfq_quantum)
 				break;
 		}
 
@@ -1078,7 +1234,6 @@ cfq_dispatch_requests(request_queue_t *q, int force)
 			max_dispatch = 1;
 
 		dispatched += __cfq_dispatch_requests(cfqd, cfqq, max_dispatch);
-		prev_cfqq = cfqq;
 	}
 
 	return dispatched;
@@ -1520,7 +1675,8 @@ cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
 }
 
 static void
-cfq_update_io_seektime(struct cfq_io_context *cic, struct request *rq)
+cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic,
+		       struct request *rq)
 {
 	sector_t sdist;
 	u64 total;
@@ -1530,6 +1686,11 @@ cfq_update_io_seektime(struct cfq_io_context *cic, struct request *rq)
 	else
 		sdist = cic->last_request_pos - rq->sector;
 
+	if (!cic->seek_samples) {
+		cfqd->new_seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
+		cfqd->new_seek_mean = cfqd->new_seek_total / 256;
+	}
+
 	/*
 	 * Don't allow the seek distance to get too large from the
 	 * odd fragment, pagein, etc
@@ -1580,13 +1741,16 @@ static int
 cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 		   struct request *rq)
 {
-	struct cfq_queue *cfqq = cfqd->active_queue;
-	sector_t dist;
+	struct cfq_queue *cfqq;
 
-	if (cfq_class_idle(new_cfqq))
+	cfqq = cfqd->active_queue;
+	if (!cfqq)
 		return 0;
 
-	if (!cfqq)
+	if (cfq_slice_used(cfqq))
+		return 1;
+
+	if (cfq_class_idle(new_cfqq))
 		return 0;
 
 	if (cfq_class_idle(cfqq))
@@ -1613,12 +1777,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 	 * if this request is as-good as one we would expect from the
 	 * current cfqq, let it preempt
 	 */
-	if (rq->sector > cfqd->last_sector)
-		dist = rq->sector - cfqd->last_sector;
-	else
-		dist = cfqd->last_sector - rq->sector;
-
-	if (dist <= cfqd->active_cic->seek_mean)
+	if (cfq_rq_close(cfqd, rq))
 		return 1;
 
 	return 0;
@@ -1656,28 +1815,12 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	if (rq_is_meta(rq))
 		cfqq->meta_pending++;
 
-	/*
-	 * we never wait for an async request and we don't allow preemption
-	 * of an async request. so just return early
-	 */
-	if (!rq_is_sync(rq)) {
-		/*
-		 * sync process issued an async request, if it's waiting
-		 * then expire it and kick rq handling.
-		 */
-		if (cic == cfqd->active_cic &&
-		    del_timer(&cfqd->idle_slice_timer)) {
-			cfq_slice_expired(cfqd, 0, 0);
-			blk_start_queueing(cfqd->queue);
-		}
-		return;
-	}
-
 	cfq_update_io_thinktime(cfqd, cic);
-	cfq_update_io_seektime(cic, rq);
+	cfq_update_io_seektime(cfqd, cic, rq);
 	cfq_update_idle_window(cfqd, cfqq, cic);
 
 	cic->last_request_pos = rq->sector + rq->nr_sectors;
+	cfqq->last_request_pos = cic->last_request_pos;
 
 	if (cfqq == cfqd->active_queue) {
 		/*
@@ -1726,13 +1869,11 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq)
 	now = jiffies;
 
 	WARN_ON(!cfqd->rq_in_driver);
-	WARN_ON(!cfqq->on_dispatch[sync]);
+	WARN_ON(!cfqq->dispatched);
 	cfqd->rq_in_driver--;
-	cfqq->on_dispatch[sync]--;
+	cfqq->dispatched--;
 	cfqq->service_last = now;
 
-	cfqd->last_sector = rq->hard_sector + rq->hard_nr_sectors;
-
 	if (!cfq_class_idle(cfqq))
 		cfqd->last_end_request = now;
 
@@ -1752,11 +1893,12 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq)
 		}
 		if (cfq_slice_used(cfqq))
 			cfq_slice_expired(cfqd, 0, 1);
-		else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list)) {
-			if (!cfq_arm_slice_timer(cfqd))
-				cfq_schedule_dispatch(cfqd);
-		}
+		else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list))
+			cfq_arm_slice_timer(cfqd);
 	}
+
+	if (!cfqd->rq_in_driver)
+		cfq_schedule_dispatch(cfqd);
 }
 
 /*
@@ -2101,7 +2243,6 @@ fail:
 /*
  * sysfs parts below -->
  */
-
 static ssize_t
 cfq_var_show(unsigned int var, char *page)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 1afba0451c83cbff622a08f2d86fbb2e680dfd5f Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 17 Apr 2007 12:47:55 +0200
Subject: cfq-iosched: minor updates

- Move the queue_new flag clear to when the queue is selected
- Only select the non-first queue in cfq_get_best_queue(), if there's
  a substantial difference between the best and first.
- Get rid of ->busy_rr
- Only select a close cooperator, if the current queue is known to take
  a while to "think".

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 81 ++++++++++++-----------------------------------------
 1 file changed, 18 insertions(+), 63 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 28236f2cd908..9d6f04103f01 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -70,7 +70,6 @@ struct cfq_data {
 	 * rr list of queues with requests and the count of them
 	 */
 	struct list_head rr_list[CFQ_PRIO_LISTS];
-	struct list_head busy_rr;
 	struct list_head cur_rr;
 	struct list_head idle_rr;
 	unsigned long cur_rr_tick;
@@ -410,59 +409,18 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 static void cfq_resort_be_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 				int preempted)
 {
-	struct list_head *list, *n;
-	struct cfq_queue *__cfqq;
-	int add_tail = 0;
-
-	/*
-	 * if cfqq has requests in flight, don't allow it to be
-	 * found in cfq_set_active_queue before it has finished them.
-	 * this is done to increase fairness between a process that
-	 * has lots of io pending vs one that only generates one
-	 * sporadically or synchronously
-	 */
-	if (cfqq->dispatched)
-		list = &cfqd->busy_rr;
-	else if (cfqq->ioprio == (cfqd->cur_prio + 1) &&
-		 cfq_cfqq_sync(cfqq) &&
-		 (time_before(cfqd->prio_time, cfqq->service_last) ||
-		  cfq_cfqq_queue_new(cfqq) || preempted)) {
-		list = &cfqd->cur_rr;
-		add_tail = 1;
-	} else
-		list = &cfqd->rr_list[cfqq->ioprio];
-
-	if (!cfq_cfqq_sync(cfqq) || add_tail) {
-		/*
-		 * async queue always goes to the end. this wont be overly
-		 * unfair to writes, as the sort of the sync queue wont be
-		 * allowed to pass the async queue again.
-		 */
-		list_add_tail(&cfqq->cfq_list, list);
-	} else if (preempted || cfq_cfqq_queue_new(cfqq)) {
-		/*
-		 * If this queue was preempted or is new (never been serviced),
-		 * let it be added first for fairness but beind other new
-		 * queues.
-		 */
-		n = list;
-		while (n->next != list) {
-			__cfqq = list_entry_cfqq(n->next);
-			if (!cfq_cfqq_queue_new(__cfqq))
-				break;
+	if (!cfq_cfqq_sync(cfqq))
+		list_add_tail(&cfqq->cfq_list, &cfqd->rr_list[cfqq->ioprio]);
+	else {
+		struct list_head *n = &cfqd->rr_list[cfqq->ioprio];
 
-			n = n->next;
-		}
-		list_add(&cfqq->cfq_list, n);
-	} else {
 		/*
 		 * sort by last service, but don't cross a new or async
-		 * queue. we don't cross a new queue because it hasn't been
-		 * service before, and we don't cross an async queue because
-		 * it gets added to the end on expire.
+		 * queue. we don't cross a new queue because it hasn't
+		 * been service before, and we don't cross an async
+		 * queue because it gets added to the end on expire.
 		 */
-		n = list;
-		while ((n = n->prev) != list) {
+		while ((n = n->prev) != &cfqd->rr_list[cfqq->ioprio]) {
 			struct cfq_queue *__c = list_entry_cfqq(n);
 
 			if (!cfq_cfqq_sync(__c) || !__c->service_last)
@@ -725,6 +683,7 @@ __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 		cfq_clear_cfqq_must_alloc_slice(cfqq);
 		cfq_clear_cfqq_fifo_expire(cfqq);
 		cfq_mark_cfqq_slice_new(cfqq);
+		cfq_clear_cfqq_queue_new(cfqq);
 		cfqq->rr_tick = cfqd->cur_rr_tick;
 	}
 
@@ -743,7 +702,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 
 	cfq_clear_cfqq_must_dispatch(cfqq);
 	cfq_clear_cfqq_wait_request(cfqq);
-	cfq_clear_cfqq_queue_new(cfqq);
 
 	/*
 	 * store what was left of this slice, if the queue idled out
@@ -845,13 +803,15 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
 static struct cfq_queue *cfq_get_best_queue(struct cfq_data *cfqd)
 {
 	struct cfq_queue *cfqq = NULL, *__cfqq;
-	sector_t best = -1, dist;
+	sector_t best = -1, first = -1, dist;
 
 	list_for_each_entry(__cfqq, &cfqd->cur_rr, cfq_list) {
 		if (!__cfqq->next_rq || !cfq_cfqq_sync(__cfqq))
 			continue;
 
 		dist = cfq_dist_from_last(cfqd, __cfqq->next_rq);
+		if (first == -1)
+			first = dist;
 		if (dist < best) {
 			best = dist;
 			cfqq = __cfqq;
@@ -859,9 +819,11 @@ static struct cfq_queue *cfq_get_best_queue(struct cfq_data *cfqd)
 	}
 
 	/*
-	 * Only async queue(s) available, grab first entry
+	 * Only async queue(s) available, grab first entry. Do the same
+	 * if the difference between the first and best isn't more than
+	 * twice, to obey fairness.
 	 */
-	if (!cfqq)
+	if (!cfqq || (best && first != best && ((first / best) < 4)))
 		cfqq = list_entry_cfqq(cfqd->cur_rr.next);
 
 	return cfqq;
@@ -878,12 +840,6 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 		 * are spliced
 		 */
 		cfqq = cfq_get_best_queue(cfqd);
-	} else if (!list_empty(&cfqd->busy_rr)) {
-		/*
-		 * If no new queues are available, check if the busy list has
-		 * some before falling back to idle io.
-		 */
-		cfqq = list_entry_cfqq(cfqd->busy_rr.next);
 	} else if (!list_empty(&cfqd->idle_rr)) {
 		/*
 		 * if we have idle queues and no rt or be queues had pending
@@ -1004,7 +960,8 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 	/*
 	 * See if this prio level has a good candidate
 	 */
-	if (cfq_close_cooperator(cfqd, cfqq))
+	if (cfq_close_cooperator(cfqd, cfqq) &&
+	    (sample_valid(cic->ttime_samples) && cic->ttime_mean > 2))
 		return;
 
 	cfq_mark_cfqq_must_dispatch(cfqq);
@@ -1184,7 +1141,6 @@ cfq_forced_dispatch(struct cfq_data *cfqd)
 	for (i = 0; i < CFQ_PRIO_LISTS; i++)
 		dispatched += cfq_forced_dispatch_cfqqs(&cfqd->rr_list[i]);
 
-	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->busy_rr);
 	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->cur_rr);
 	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->idle_rr);
 
@@ -2174,7 +2130,6 @@ static void *cfq_init_queue(request_queue_t *q)
 	for (i = 0; i < CFQ_PRIO_LISTS; i++)
 		INIT_LIST_HEAD(&cfqd->rr_list[i]);
 
-	INIT_LIST_HEAD(&cfqd->busy_rr);
 	INIT_LIST_HEAD(&cfqd->cur_rr);
 	INIT_LIST_HEAD(&cfqd->idle_rr);
 	INIT_LIST_HEAD(&cfqd->cic_list);
-- 
cgit v1.2.3-59-g8ed1b


From d9e7620e60bc6648c3dcabbc8d1a320b69c846f9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 20 Apr 2007 14:27:50 +0200
Subject: cfq-iosched: rework the whole round-robin list concept

Drawing on some inspiration from the CFS CPU scheduler design, overhaul
the pending cfq_queue concept list management. Currently CFQ uses a
doubly linked list per priority level for sorting and service uses.
Kill those lists and maintain an rbtree of cfq_queue's, sorted by when
to service them.

This unfortunately means that the ionice levels aren't as strong
anymore, will work on improving those later. We only scale the slice
time now, not the number of times we service. This means that latency
is better (for all priority levels), but that the distinction between
the highest and lower levels aren't as big.

The diffstat speaks for itself.

 cfq-iosched.c |  363 +++++++++++++++++---------------------------------
 1 file changed, 125 insertions(+), 238 deletions(-)

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 361 ++++++++++++++++++----------------------------------
 1 file changed, 123 insertions(+), 238 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 9d6f04103f01..4838c2b16f2c 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -26,7 +26,16 @@ static int cfq_slice_async = HZ / 25;
 static const int cfq_slice_async_rq = 2;
 static int cfq_slice_idle = HZ / 125;
 
+/*
+ * grace period before allowing idle class to get disk access
+ */
 #define CFQ_IDLE_GRACE		(HZ / 10)
+
+/*
+ * below this threshold, we consider thinktime immediate
+ */
+#define CFQ_MIN_TT		(2)
+
 #define CFQ_SLICE_SCALE		(5)
 
 #define CFQ_KEY_ASYNC		(0)
@@ -69,10 +78,9 @@ struct cfq_data {
 	/*
 	 * rr list of queues with requests and the count of them
 	 */
-	struct list_head rr_list[CFQ_PRIO_LISTS];
+	struct rb_root service_tree;
 	struct list_head cur_rr;
 	struct list_head idle_rr;
-	unsigned long cur_rr_tick;
 	unsigned int busy_queues;
 
 	/*
@@ -91,8 +99,6 @@ struct cfq_data {
 
 	struct cfq_queue *active_queue;
 	struct cfq_io_context *active_cic;
-	int cur_prio, cur_end_prio;
-	unsigned long prio_time;
 	unsigned int dispatch_slice;
 
 	struct timer_list idle_class_timer;
@@ -131,8 +137,10 @@ struct cfq_queue {
 	unsigned int key;
 	/* member of the rr/busy/cur/idle cfqd list */
 	struct list_head cfq_list;
-	/* in what tick we were last serviced */
-	unsigned long rr_tick;
+	/* service_tree member */
+	struct rb_node rb_node;
+	/* service_tree key */
+	unsigned long rb_key;
 	/* sorted list of pending requests */
 	struct rb_root sort_list;
 	/* if fifo isn't expired, next request to serve */
@@ -147,8 +155,6 @@ struct cfq_queue {
 	struct list_head fifo;
 
 	unsigned long slice_end;
-	unsigned long service_last;
-	unsigned long slice_start;
 	long slice_resid;
 
 	/* number of requests that are on the dispatch list or inside driver */
@@ -240,30 +246,26 @@ static inline pid_t cfq_queue_pid(struct task_struct *task, int rw, int is_sync)
  * if a queue is marked sync and has sync io queued. A sync queue with async
  * io only, should not get full sync slice length.
  */
-static inline int
-cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+static inline int cfq_prio_slice(struct cfq_data *cfqd, int sync,
+				 unsigned short prio)
 {
-	const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)];
+	const int base_slice = cfqd->cfq_slice[sync];
 
-	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
+	WARN_ON(prio >= IOPRIO_BE_NR);
+
+	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
+}
 
-	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio));
+static inline int
+cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
 }
 
 static inline void
 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
-	cfqq->slice_end += cfqq->slice_resid;
-
-	/*
-	 * Don't carry over residual for more than one slice, we only want
-	 * to slightly correct the fairness. Carrying over forever would
-	 * easily introduce oscillations.
-	 */
-	cfqq->slice_resid = 0;
-
-	cfqq->slice_start = jiffies;
 }
 
 /*
@@ -403,33 +405,50 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	return cfq_choose_req(cfqd, next, prev);
 }
 
-/*
- * This function finds out where to insert a BE queue in the service hierarchy
- */
-static void cfq_resort_be_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq,
-				int preempted)
+static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
+				      struct cfq_queue *cfqq)
 {
-	if (!cfq_cfqq_sync(cfqq))
-		list_add_tail(&cfqq->cfq_list, &cfqd->rr_list[cfqq->ioprio]);
-	else {
-		struct list_head *n = &cfqd->rr_list[cfqq->ioprio];
+	/*
+	 * just an approximation, should be ok.
+	 */
+	return ((cfqd->busy_queues - 1) * cfq_prio_slice(cfqd, 1, 0));
+}
+
+static void cfq_service_tree_add(struct cfq_data *cfqd,
+				    struct cfq_queue *cfqq)
+{
+	struct rb_node **p = &cfqd->service_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct cfq_queue *__cfqq;
+	unsigned long rb_key;
+
+	rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
+	rb_key += cfqq->slice_resid;
+	cfqq->slice_resid = 0;
 
+	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
 		/*
-		 * sort by last service, but don't cross a new or async
-		 * queue. we don't cross a new queue because it hasn't
-		 * been service before, and we don't cross an async
-		 * queue because it gets added to the end on expire.
+		 * same position, nothing more to do
 		 */
-		while ((n = n->prev) != &cfqd->rr_list[cfqq->ioprio]) {
-			struct cfq_queue *__c = list_entry_cfqq(n);
+		if (rb_key == cfqq->rb_key)
+			return;
 
-			if (!cfq_cfqq_sync(__c) || !__c->service_last)
-				break;
-			if (time_before(__c->service_last, cfqq->service_last))
-				break;
-		}
-		list_add(&cfqq->cfq_list, n);
+		rb_erase(&cfqq->rb_node, &cfqd->service_tree);
 	}
+
+	while (*p) {
+		parent = *p;
+		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
+
+		if (rb_key < __cfqq->rb_key)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	cfqq->rb_key = rb_key;
+	rb_link_node(&cfqq->rb_node, parent, p);
+	rb_insert_color(&cfqq->rb_node, &cfqd->service_tree);
 }
 
 static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
@@ -443,7 +462,7 @@ static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
 	if (!cfq_cfqq_on_rr(cfqq))
 		return;
 
-	list_del(&cfqq->cfq_list);
+	list_del_init(&cfqq->cfq_list);
 
 	if (cfq_class_rt(cfqq)) {
 		/*
@@ -465,7 +484,7 @@ static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
 		/*
 		 * So we get here, ergo the queue is a regular best-effort queue
 		 */
-		cfq_resort_be_queue(cfqd, cfqq, preempted);
+		cfq_service_tree_add(cfqd, cfqq);
 	}
 }
 
@@ -490,6 +509,11 @@ cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	cfq_clear_cfqq_on_rr(cfqq);
 	list_del_init(&cfqq->cfq_list);
 
+	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
+		rb_erase(&cfqq->rb_node, &cfqd->service_tree);
+		RB_CLEAR_NODE(&cfqq->rb_node);
+	}
+
 	BUG_ON(!cfqd->busy_queues);
 	cfqd->busy_queues--;
 }
@@ -684,7 +708,6 @@ __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 		cfq_clear_cfqq_fifo_expire(cfqq);
 		cfq_mark_cfqq_slice_new(cfqq);
 		cfq_clear_cfqq_queue_new(cfqq);
-		cfqq->rr_tick = cfqd->cur_rr_tick;
 	}
 
 	cfqd->active_queue = cfqq;
@@ -732,114 +755,19 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted,
 		__cfq_slice_expired(cfqd, cfqq, preempted, timed_out);
 }
 
-/*
- * 0
- * 0,1
- * 0,1,2
- * 0,1,2,3
- * 0,1,2,3,4
- * 0,1,2,3,4,5
- * 0,1,2,3,4,5,6
- * 0,1,2,3,4,5,6,7
- */
-static int cfq_get_next_prio_level(struct cfq_data *cfqd)
-{
-	int prio, wrap;
-
-	prio = -1;
-	wrap = 0;
-	do {
-		int p;
-
-		for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) {
-			if (!list_empty(&cfqd->rr_list[p])) {
-				prio = p;
-				break;
-			}
-		}
-
-		if (prio != -1)
-			break;
-		cfqd->cur_prio = 0;
-		if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
-			cfqd->cur_end_prio = 0;
-			if (wrap)
-				break;
-			wrap = 1;
-		}
-	} while (1);
-
-	if (unlikely(prio == -1))
-		return -1;
-
-	BUG_ON(prio >= CFQ_PRIO_LISTS);
-
-	list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr);
-
-	cfqd->cur_prio = prio + 1;
-	if (cfqd->cur_prio > cfqd->cur_end_prio) {
-		cfqd->cur_end_prio = cfqd->cur_prio;
-		cfqd->cur_prio = 0;
-	}
-	if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
-		cfqd->cur_prio = 0;
-		cfqd->cur_end_prio = 0;
-	}
-
-	cfqd->cur_rr_tick++;
-	cfqd->prio_time = jiffies;
-	return prio;
-}
-
-static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
-					  struct request *rq)
-{
-	if (rq->sector >= cfqd->last_position)
-		return rq->sector - cfqd->last_position;
-	else
-		return cfqd->last_position - rq->sector;
-}
-
-static struct cfq_queue *cfq_get_best_queue(struct cfq_data *cfqd)
-{
-	struct cfq_queue *cfqq = NULL, *__cfqq;
-	sector_t best = -1, first = -1, dist;
-
-	list_for_each_entry(__cfqq, &cfqd->cur_rr, cfq_list) {
-		if (!__cfqq->next_rq || !cfq_cfqq_sync(__cfqq))
-			continue;
-
-		dist = cfq_dist_from_last(cfqd, __cfqq->next_rq);
-		if (first == -1)
-			first = dist;
-		if (dist < best) {
-			best = dist;
-			cfqq = __cfqq;
-		}
-	}
-
-	/*
-	 * Only async queue(s) available, grab first entry. Do the same
-	 * if the difference between the first and best isn't more than
-	 * twice, to obey fairness.
-	 */
-	if (!cfqq || (best && first != best && ((first / best) < 4)))
-		cfqq = list_entry_cfqq(cfqd->cur_rr.next);
-
-	return cfqq;
-}
-
 static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 {
 	struct cfq_queue *cfqq = NULL;
 
-	if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1) {
+	if (!list_empty(&cfqd->cur_rr)) {
 		/*
-		 * if current list is non-empty, grab first entry. if it is
-		 * empty, get next prio level and grab first entry then if any
-		 * are spliced
+		 * if current list is non-empty, grab first entry.
 		 */
-		cfqq = cfq_get_best_queue(cfqd);
+		cfqq = list_entry_cfqq(cfqd->cur_rr.next);
+	} else if (!RB_EMPTY_ROOT(&cfqd->service_tree)) {
+		struct rb_node *n = rb_first(&cfqd->service_tree);
+
+		cfqq = rb_entry(n, struct cfq_queue, rb_node);
 	} else if (!list_empty(&cfqd->idle_rr)) {
 		/*
 		 * if we have idle queues and no rt or be queues had pending
@@ -861,27 +789,20 @@ static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
 {
 	struct cfq_queue *cfqq;
 
-	do {
-		long prio;
-
-		cfqq = cfq_get_next_queue(cfqd);
-		if (!cfqq)
-			break;
-
-		prio = cfq_prio_to_slice(cfqd, cfqq);
-		if (cfqq->slice_resid > -prio)
-			break;
-
-		cfqq->slice_resid += prio;
-		list_del_init(&cfqq->cfq_list);
-		list_add_tail(&cfqq->cfq_list, &cfqd->rr_list[cfqq->ioprio]);
-		cfqq = NULL;
-	} while (1);
-
+	cfqq = cfq_get_next_queue(cfqd);
 	__cfq_set_active_queue(cfqd, cfqq);
 	return cfqq;
 }
 
+static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
+					  struct request *rq)
+{
+	if (rq->sector >= cfqd->last_position)
+		return rq->sector - cfqd->last_position;
+	else
+		return cfqd->last_position - rq->sector;
+}
+
 static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
 {
 	struct cfq_io_context *cic = cfqd->active_cic;
@@ -892,45 +813,15 @@ static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
 	return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean;
 }
 
-static struct cfq_queue *__cfq_close_cooperator(struct cfq_data *cfqd,
-						struct cfq_queue *cur_cfqq,
-						struct list_head *list)
-{
-	struct cfq_queue *cfqq;
-
-	list_for_each_entry(cfqq, list, cfq_list) {
-		if (cfqq == cur_cfqq || !cfq_cfqq_sync(cfqq))
-			continue;
-
-		BUG_ON(!cfqq->next_rq);
-
-		if (cfq_rq_close(cfqd, cfqq->next_rq))
-			return cfqq;
-	}
-
-	return NULL;
-}
-
-static int cfq_close_cooperator(struct cfq_data *cfqd,
-				struct cfq_queue *cur_cfqq)
+static int cfq_close_cooperator(struct cfq_data *cfq_data,
+				struct cfq_queue *cfqq)
 {
-	struct cfq_queue *cfqq;
-
-	if (!cfqd->busy_queues)
-		return 0;
-
 	/*
-	 * check cur_rr and same-prio rr_list for candidates
+	 * We should notice if some of the queues are cooperating, eg
+	 * working closely on the same area of the disk. In that case,
+	 * we can group them together and don't waste time idling.
 	 */
-	cfqq = __cfq_close_cooperator(cfqd, cur_cfqq, &cfqd->cur_rr);
-	if (cfqq)
-		return 1;
-
-	cfqq = __cfq_close_cooperator(cfqd, cur_cfqq, &cfqd->rr_list[cur_cfqq->ioprio]);
-	if (cfqq && (cfqq->rr_tick == cfqd->cur_rr_tick))
-		cfqq = NULL;
-
-	return cfqq != NULL;
+	return 0;
 }
 
 #define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024))
@@ -974,7 +865,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 	 */
 	sl = cfqd->cfq_slice_idle;
 	if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
-		sl = min(sl, msecs_to_jiffies(2));
+		sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));
 
 	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
 }
@@ -1115,31 +1006,41 @@ __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	return dispatched;
 }
 
-static int
-cfq_forced_dispatch_cfqqs(struct list_head *list)
+static inline int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
+{
+	int dispatched = 0;
+
+	while (cfqq->next_rq) {
+		cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
+		dispatched++;
+	}
+
+	BUG_ON(!list_empty(&cfqq->fifo));
+	return dispatched;
+}
+
+static int cfq_forced_dispatch_cfqqs(struct list_head *list)
 {
 	struct cfq_queue *cfqq, *next;
 	int dispatched;
 
 	dispatched = 0;
-	list_for_each_entry_safe(cfqq, next, list, cfq_list) {
-		while (cfqq->next_rq) {
-			cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
-			dispatched++;
-		}
-		BUG_ON(!list_empty(&cfqq->fifo));
-	}
+	list_for_each_entry_safe(cfqq, next, list, cfq_list)
+		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
 
 	return dispatched;
 }
 
-static int
-cfq_forced_dispatch(struct cfq_data *cfqd)
+static int cfq_forced_dispatch(struct cfq_data *cfqd)
 {
-	int i, dispatched = 0;
+	int dispatched = 0;
+	struct rb_node *n;
+
+	while ((n = rb_first(&cfqd->service_tree)) != NULL) {
+		struct cfq_queue *cfqq = rb_entry(n, struct cfq_queue, rb_node);
 
-	for (i = 0; i < CFQ_PRIO_LISTS; i++)
-		dispatched += cfq_forced_dispatch_cfqqs(&cfqd->rr_list[i]);
+		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
+	}
 
 	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->cur_rr);
 	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->idle_rr);
@@ -1151,8 +1052,7 @@ cfq_forced_dispatch(struct cfq_data *cfqd)
 	return dispatched;
 }
 
-static int
-cfq_dispatch_requests(request_queue_t *q, int force)
+static int cfq_dispatch_requests(request_queue_t *q, int force)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct cfq_queue *cfqq;
@@ -1222,7 +1122,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
 	/*
 	 * it's on the empty list and still hashed
 	 */
-	list_del(&cfqq->cfq_list);
 	hlist_del(&cfqq->cfq_hash);
 	kmem_cache_free(cfq_pool, cfqq);
 }
@@ -1391,8 +1290,6 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq)
 	 */
 	cfqq->org_ioprio = cfqq->ioprio;
 	cfqq->org_ioprio_class = cfqq->ioprio_class;
-
-	cfq_resort_rr_list(cfqq, 0);
 	cfq_clear_cfqq_prio_changed(cfqq);
 }
 
@@ -1478,6 +1375,7 @@ retry:
 
 		INIT_HLIST_NODE(&cfqq->cfq_hash);
 		INIT_LIST_HEAD(&cfqq->cfq_list);
+		RB_CLEAR_NODE(&cfqq->rb_node);
 		INIT_LIST_HEAD(&cfqq->fifo);
 
 		cfqq->key = key;
@@ -1752,7 +1650,8 @@ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	 * so we know that it will be selected next.
 	 */
 	BUG_ON(!cfq_cfqq_on_rr(cfqq));
-	list_move(&cfqq->cfq_list, &cfqd->cur_rr);
+	list_del_init(&cfqq->cfq_list);
+	list_add(&cfqq->cfq_list, &cfqd->cur_rr);
 
 	cfqq->slice_end = 0;
 	cfq_mark_cfqq_slice_new(cfqq);
@@ -1828,13 +1727,10 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq)
 	WARN_ON(!cfqq->dispatched);
 	cfqd->rq_in_driver--;
 	cfqq->dispatched--;
-	cfqq->service_last = now;
 
 	if (!cfq_class_idle(cfqq))
 		cfqd->last_end_request = now;
 
-	cfq_resort_rr_list(cfqq, 0);
-
 	if (sync)
 		RQ_CIC(rq)->last_end_request = now;
 
@@ -1863,9 +1759,6 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq)
  */
 static void cfq_prio_boost(struct cfq_queue *cfqq)
 {
-	const int ioprio_class = cfqq->ioprio_class;
-	const int ioprio = cfqq->ioprio;
-
 	if (has_fs_excl()) {
 		/*
 		 * boost idle prio on transactions that would lock out other
@@ -1884,12 +1777,6 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
 		if (cfqq->ioprio != cfqq->org_ioprio)
 			cfqq->ioprio = cfqq->org_ioprio;
 	}
-
-	/*
-	 * refile between round-robin lists if we moved the priority class
-	 */
-	if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio))
-		cfq_resort_rr_list(cfqq, 0);
 }
 
 static inline int __cfq_may_queue(struct cfq_queue *cfqq)
@@ -2127,9 +2014,7 @@ static void *cfq_init_queue(request_queue_t *q)
 
 	memset(cfqd, 0, sizeof(*cfqd));
 
-	for (i = 0; i < CFQ_PRIO_LISTS; i++)
-		INIT_LIST_HEAD(&cfqd->rr_list[i]);
-
+	cfqd->service_tree = RB_ROOT;
 	INIT_LIST_HEAD(&cfqd->cur_rr);
 	INIT_LIST_HEAD(&cfqd->idle_rr);
 	INIT_LIST_HEAD(&cfqd->cic_list);
-- 
cgit v1.2.3-59-g8ed1b


From cc09e2990fdd96d25fdbb9db6bc9b4c82d9e4a3c Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 26 Apr 2007 12:53:50 +0200
Subject: [PATCH] cfq-iosched: speed up rbtree handling

For cases where the rbtree is mainly used for sorting and min retrieval,
a nice speedup of the rbtree code is to maintain a cache of the leftmost
node in the tree.

Also spotted in the CFS CPU scheduler code.

Improved by Alan D. Brunelle <Alan.Brunelle@hp.com> by updating the
leftmost hint in cfq_rb_first() if it isn't set, instead of only
updating it on insert.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 62 +++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 48 insertions(+), 14 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 4838c2b16f2c..55c476baa692 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -69,6 +69,18 @@ static struct completion *ioc_gone;
 
 #define sample_valid(samples)	((samples) > 80)
 
+/*
+ * Most of our rbtree usage is for sorting with min extraction, so
+ * if we cache the leftmost node we don't have to walk down the tree
+ * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
+ * move this into the elevator for the rq sorting as well.
+ */
+struct cfq_rb_root {
+	struct rb_root rb;
+	struct rb_node *left;
+};
+#define CFQ_RB_ROOT	(struct cfq_rb_root) { RB_ROOT, NULL, }
+
 /*
  * Per block device queue structure
  */
@@ -78,7 +90,7 @@ struct cfq_data {
 	/*
 	 * rr list of queues with requests and the count of them
 	 */
-	struct rb_root service_tree;
+	struct cfq_rb_root service_tree;
 	struct list_head cur_rr;
 	struct list_head idle_rr;
 	unsigned int busy_queues;
@@ -378,6 +390,23 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
 	}
 }
 
+static struct rb_node *cfq_rb_first(struct cfq_rb_root *root)
+{
+	if (!root->left)
+		root->left = rb_first(&root->rb);
+
+	return root->left;
+}
+
+static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
+{
+	if (root->left == n)
+		root->left = NULL;
+
+	rb_erase(n, &root->rb);
+	RB_CLEAR_NODE(n);
+}
+
 /*
  * would be nice to take fifo expire time into account as well
  */
@@ -417,10 +446,10 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
 static void cfq_service_tree_add(struct cfq_data *cfqd,
 				    struct cfq_queue *cfqq)
 {
-	struct rb_node **p = &cfqd->service_tree.rb_node;
+	struct rb_node **p = &cfqd->service_tree.rb.rb_node;
 	struct rb_node *parent = NULL;
-	struct cfq_queue *__cfqq;
 	unsigned long rb_key;
+	int left = 1;
 
 	rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
 	rb_key += cfqq->slice_resid;
@@ -433,22 +462,29 @@ static void cfq_service_tree_add(struct cfq_data *cfqd,
 		if (rb_key == cfqq->rb_key)
 			return;
 
-		rb_erase(&cfqq->rb_node, &cfqd->service_tree);
+		cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
 	}
 
 	while (*p) {
+		struct cfq_queue *__cfqq;
+
 		parent = *p;
 		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
 
 		if (rb_key < __cfqq->rb_key)
 			p = &(*p)->rb_left;
-		else
+		else {
 			p = &(*p)->rb_right;
+			left = 0;
+		}
 	}
 
+	if (left)
+		cfqd->service_tree.left = &cfqq->rb_node;
+
 	cfqq->rb_key = rb_key;
 	rb_link_node(&cfqq->rb_node, parent, p);
-	rb_insert_color(&cfqq->rb_node, &cfqd->service_tree);
+	rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb);
 }
 
 static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
@@ -509,10 +545,8 @@ cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	cfq_clear_cfqq_on_rr(cfqq);
 	list_del_init(&cfqq->cfq_list);
 
-	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
-		rb_erase(&cfqq->rb_node, &cfqd->service_tree);
-		RB_CLEAR_NODE(&cfqq->rb_node);
-	}
+	if (!RB_EMPTY_NODE(&cfqq->rb_node))
+		cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
 
 	BUG_ON(!cfqd->busy_queues);
 	cfqd->busy_queues--;
@@ -764,8 +798,8 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 		 * if current list is non-empty, grab first entry.
 		 */
 		cfqq = list_entry_cfqq(cfqd->cur_rr.next);
-	} else if (!RB_EMPTY_ROOT(&cfqd->service_tree)) {
-		struct rb_node *n = rb_first(&cfqd->service_tree);
+	} else if (!RB_EMPTY_ROOT(&cfqd->service_tree.rb)) {
+		struct rb_node *n = cfq_rb_first(&cfqd->service_tree);
 
 		cfqq = rb_entry(n, struct cfq_queue, rb_node);
 	} else if (!list_empty(&cfqd->idle_rr)) {
@@ -1036,7 +1070,7 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
 	int dispatched = 0;
 	struct rb_node *n;
 
-	while ((n = rb_first(&cfqd->service_tree)) != NULL) {
+	while ((n = cfq_rb_first(&cfqd->service_tree)) != NULL) {
 		struct cfq_queue *cfqq = rb_entry(n, struct cfq_queue, rb_node);
 
 		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
@@ -2014,7 +2048,7 @@ static void *cfq_init_queue(request_queue_t *q)
 
 	memset(cfqd, 0, sizeof(*cfqd));
 
-	cfqd->service_tree = RB_ROOT;
+	cfqd->service_tree = CFQ_RB_ROOT;
 	INIT_LIST_HEAD(&cfqd->cur_rr);
 	INIT_LIST_HEAD(&cfqd->idle_rr);
 	INIT_LIST_HEAD(&cfqd->cic_list);
-- 
cgit v1.2.3-59-g8ed1b


From 0c534e0a463e2eeafc97ba25ab23c14f3cdf2bdb Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 18 Apr 2007 20:01:57 +0200
Subject: cfq-iosched: sort RT queues into the rbtree

Currently CFQ does a linked insert into the current list for RT
queues. We can just factor the class into the rb insertion,
and then we don't have to treat RT queues in a special way. It's
faster, too.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 55c476baa692..81c057eadfcc 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -471,7 +471,16 @@ static void cfq_service_tree_add(struct cfq_data *cfqd,
 		parent = *p;
 		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
 
-		if (rb_key < __cfqq->rb_key)
+		/*
+		 * sort RT queues first, we always want to give
+		 * preference to them. after that, sort on the next
+		 * service time.
+		 */
+		if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq))
+			p = &(*p)->rb_left;
+		else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq))
+			p = &(*p)->rb_right;
+		else if (rb_key < __cfqq->rb_key)
 			p = &(*p)->rb_left;
 		else {
 			p = &(*p)->rb_right;
@@ -490,7 +499,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd,
 static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
 {
 	struct cfq_data *cfqd = cfqq->cfqd;
-	struct list_head *n;
 
 	/*
 	 * Resorting requires the cfqq to be on the RR list already.
@@ -500,25 +508,14 @@ static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
 
 	list_del_init(&cfqq->cfq_list);
 
-	if (cfq_class_rt(cfqq)) {
-		/*
-		 * At to the front of the current list, but behind other
-		 * RT queues.
-		 */
-		n = &cfqd->cur_rr;
-		while (n->next != &cfqd->cur_rr)
-			if (!cfq_class_rt(cfqq))
-				break;
-
-		list_add(&cfqq->cfq_list, n);
-	} else if (cfq_class_idle(cfqq)) {
+	if (cfq_class_idle(cfqq)) {
 		/*
 		 * IDLE goes to the tail of the idle list
 		 */
 		list_add_tail(&cfqq->cfq_list, &cfqd->idle_rr);
 	} else {
 		/*
-		 * So we get here, ergo the queue is a regular best-effort queue
+		 * RT and BE queues, sort into the rbtree
 		 */
 		cfq_service_tree_add(cfqd, cfqq);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 67060e37994444ee9c0bd2413c8baa6cc58e7adb Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 18 Apr 2007 20:13:32 +0200
Subject: cfq-iosched: sort IDLE queues into the rbtree

Same treatment as the RT conversion, just put the sorted idle
branch at the end of the tree.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 67 +++++++++++++++++++++++++----------------------------
 1 file changed, 31 insertions(+), 36 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 81c057eadfcc..6a6a5f7930d8 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -92,7 +92,6 @@ struct cfq_data {
 	 */
 	struct cfq_rb_root service_tree;
 	struct list_head cur_rr;
-	struct list_head idle_rr;
 	unsigned int busy_queues;
 
 	/*
@@ -467,25 +466,33 @@ static void cfq_service_tree_add(struct cfq_data *cfqd,
 
 	while (*p) {
 		struct cfq_queue *__cfqq;
+		struct rb_node **n;
 
 		parent = *p;
 		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
 
 		/*
 		 * sort RT queues first, we always want to give
-		 * preference to them. after that, sort on the next
-		 * service time.
+		 * preference to them. IDLE queues goes to the back.
+		 * after that, sort on the next service time.
 		 */
 		if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq))
-			p = &(*p)->rb_left;
+			n = &(*p)->rb_left;
 		else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq))
-			p = &(*p)->rb_right;
+			n = &(*p)->rb_right;
+		else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq))
+			n = &(*p)->rb_left;
+		else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq))
+			n = &(*p)->rb_right;
 		else if (rb_key < __cfqq->rb_key)
-			p = &(*p)->rb_left;
-		else {
-			p = &(*p)->rb_right;
+			n = &(*p)->rb_left;
+		else
+			n = &(*p)->rb_right;
+
+		if (n == &(*p)->rb_right)
 			left = 0;
-		}
+
+		p = n;
 	}
 
 	if (left)
@@ -506,19 +513,7 @@ static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
 	if (!cfq_cfqq_on_rr(cfqq))
 		return;
 
-	list_del_init(&cfqq->cfq_list);
-
-	if (cfq_class_idle(cfqq)) {
-		/*
-		 * IDLE goes to the tail of the idle list
-		 */
-		list_add_tail(&cfqq->cfq_list, &cfqd->idle_rr);
-	} else {
-		/*
-		 * RT and BE queues, sort into the rbtree
-		 */
-		cfq_service_tree_add(cfqd, cfqq);
-	}
+	cfq_service_tree_add(cfqd, cfqq);
 }
 
 /*
@@ -797,20 +792,22 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 		cfqq = list_entry_cfqq(cfqd->cur_rr.next);
 	} else if (!RB_EMPTY_ROOT(&cfqd->service_tree.rb)) {
 		struct rb_node *n = cfq_rb_first(&cfqd->service_tree);
+		unsigned long end;
 
 		cfqq = rb_entry(n, struct cfq_queue, rb_node);
-	} else if (!list_empty(&cfqd->idle_rr)) {
-		/*
-		 * if we have idle queues and no rt or be queues had pending
-		 * requests, either allow immediate service if the grace period
-		 * has passed or arm the idle grace timer
-		 */
-		unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE;
-
-		if (time_after_eq(jiffies, end))
-			cfqq = list_entry_cfqq(cfqd->idle_rr.next);
-		else
-			mod_timer(&cfqd->idle_class_timer, end);
+		if (cfq_class_idle(cfqq)) {
+			/*
+			 * if we have idle queues and no rt or be queues had
+			 * pending requests, either allow immediate service if
+			 * the grace period has passed or arm the idle grace
+			 * timer
+			 */
+			end = cfqd->last_end_request + CFQ_IDLE_GRACE;
+			if (time_before(jiffies, end)) {
+				mod_timer(&cfqd->idle_class_timer, end);
+				cfqq = NULL;
+			}
+		}
 	}
 
 	return cfqq;
@@ -1074,7 +1071,6 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
 	}
 
 	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->cur_rr);
-	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->idle_rr);
 
 	cfq_slice_expired(cfqd, 0, 0);
 
@@ -2047,7 +2043,6 @@ static void *cfq_init_queue(request_queue_t *q)
 
 	cfqd->service_tree = CFQ_RB_ROOT;
 	INIT_LIST_HEAD(&cfqd->cur_rr);
-	INIT_LIST_HEAD(&cfqd->idle_rr);
 	INIT_LIST_HEAD(&cfqd->cic_list);
 
 	cfqd->cfq_hash = kmalloc_node(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL, q->node);
-- 
cgit v1.2.3-59-g8ed1b


From 498d3aa2b4f791059acd8c942ee8fa15c2ce36c2 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 26 Apr 2007 12:54:48 +0200
Subject: [PATCH] cfq-iosched: style cleanups and comments

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 66 ++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 50 insertions(+), 16 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 6a6a5f7930d8..29284fa06e6b 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -222,7 +222,7 @@ CFQ_CFQQ_FNS(slice_new);
 
 static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short);
 static void cfq_dispatch_insert(request_queue_t *, struct request *);
-static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask);
+static struct cfq_queue *cfq_get_queue(struct cfq_data *, unsigned int, struct task_struct *, gfp_t);
 
 /*
  * scheduler run of queue, if there are requests pending and no one in the
@@ -389,6 +389,9 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
 	}
 }
 
+/*
+ * The below is leftmost cache rbtree addon
+ */
 static struct rb_node *cfq_rb_first(struct cfq_rb_root *root)
 {
 	if (!root->left)
@@ -442,13 +445,18 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
 	return ((cfqd->busy_queues - 1) * cfq_prio_slice(cfqd, 1, 0));
 }
 
+/*
+ * The cfqd->service_tree holds all pending cfq_queue's that have
+ * requests waiting to be processed. It is sorted in the order that
+ * we will service the queues.
+ */
 static void cfq_service_tree_add(struct cfq_data *cfqd,
 				    struct cfq_queue *cfqq)
 {
 	struct rb_node **p = &cfqd->service_tree.rb.rb_node;
 	struct rb_node *parent = NULL;
 	unsigned long rb_key;
-	int left = 1;
+	int left;
 
 	rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
 	rb_key += cfqq->slice_resid;
@@ -464,6 +472,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd,
 		cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
 	}
 
+	left = 1;
 	while (*p) {
 		struct cfq_queue *__cfqq;
 		struct rb_node **n;
@@ -503,17 +512,16 @@ static void cfq_service_tree_add(struct cfq_data *cfqd,
 	rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb);
 }
 
+/*
+ * Update cfqq's position in the service tree.
+ */
 static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
 {
-	struct cfq_data *cfqd = cfqq->cfqd;
-
 	/*
 	 * Resorting requires the cfqq to be on the RR list already.
 	 */
-	if (!cfq_cfqq_on_rr(cfqq))
-		return;
-
-	cfq_service_tree_add(cfqd, cfqq);
+	if (cfq_cfqq_on_rr(cfqq))
+		cfq_service_tree_add(cfqq->cfqd, cfqq);
 }
 
 /*
@@ -530,6 +538,10 @@ cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	cfq_resort_rr_list(cfqq, 0);
 }
 
+/*
+ * Called when the cfqq no longer has requests pending, remove it from
+ * the service tree.
+ */
 static inline void
 cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
@@ -654,8 +666,7 @@ static void cfq_remove_request(struct request *rq)
 	}
 }
 
-static int
-cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
+static int cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct request *__rq;
@@ -781,6 +792,10 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted,
 		__cfq_slice_expired(cfqd, cfqq, preempted, timed_out);
 }
 
+/*
+ * Get next queue for service. Unless we have a queue preemption,
+ * we'll simply select the first cfqq in the service tree.
+ */
 static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 {
 	struct cfq_queue *cfqq = NULL;
@@ -792,10 +807,11 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 		cfqq = list_entry_cfqq(cfqd->cur_rr.next);
 	} else if (!RB_EMPTY_ROOT(&cfqd->service_tree.rb)) {
 		struct rb_node *n = cfq_rb_first(&cfqd->service_tree);
-		unsigned long end;
 
 		cfqq = rb_entry(n, struct cfq_queue, rb_node);
 		if (cfq_class_idle(cfqq)) {
+			unsigned long end;
+
 			/*
 			 * if we have idle queues and no rt or be queues had
 			 * pending requests, either allow immediate service if
@@ -813,6 +829,9 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 	return cfqq;
 }
 
+/*
+ * Get and set a new active queue for service.
+ */
 static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
 {
 	struct cfq_queue *cfqq;
@@ -898,6 +917,9 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
 }
 
+/*
+ * Move request from internal lists to the request queue dispatch list.
+ */
 static void cfq_dispatch_insert(request_queue_t *q, struct request *rq)
 {
 	struct cfq_queue *cfqq = RQ_CFQQ(rq);
@@ -944,7 +966,8 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 }
 
 /*
- * get next queue for service
+ * Select a queue for service. If we have a current active queue,
+ * check whether to continue servicing it, or retrieve and set a new one.
  */
 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 {
@@ -985,6 +1008,10 @@ keep_queue:
 	return cfqq;
 }
 
+/*
+ * Dispatch some requests from cfqq, moving them to the request queue
+ * dispatch list.
+ */
 static int
 __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 			int max_dispatch)
@@ -1059,6 +1086,10 @@ static int cfq_forced_dispatch_cfqqs(struct list_head *list)
 	return dispatched;
 }
 
+/*
+ * Drain our current requests. Used for barriers and when switching
+ * io schedulers on-the-fly.
+ */
 static int cfq_forced_dispatch(struct cfq_data *cfqd)
 {
 	int dispatched = 0;
@@ -1224,10 +1255,6 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
 	}
 }
 
-
-/*
- * Called with interrupts disabled
- */
 static void cfq_exit_single_io_context(struct cfq_io_context *cic)
 {
 	struct cfq_data *cfqd = cic->key;
@@ -1241,6 +1268,10 @@ static void cfq_exit_single_io_context(struct cfq_io_context *cic)
 	}
 }
 
+/*
+ * The process that ioc belongs to has exited, we need to clean up
+ * and put the internal structures we have that belongs to that process.
+ */
 static void cfq_exit_io_context(struct io_context *ioc)
 {
 	struct cfq_io_context *__cic;
@@ -1427,6 +1458,9 @@ out:
 	return cfqq;
 }
 
+/*
+ * We drop cfq io contexts lazily, so we may find a dead one.
+ */
 static void
 cfq_drop_dead_cic(struct io_context *ioc, struct cfq_io_context *cic)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 67e6b49e39e9b9bf5ce1351ef21dad391856183f Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 20 Apr 2007 14:18:00 +0200
Subject: cfq-iosched: slice offset should take ioprio into account

Use the max_slice-cur_slice as the multipler for the insertion offset.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 29284fa06e6b..4a0397022f5b 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -442,7 +442,8 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
 	/*
 	 * just an approximation, should be ok.
 	 */
-	return ((cfqd->busy_queues - 1) * cfq_prio_slice(cfqd, 1, 0));
+	return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) -
+		       cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
 }
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From edd75ffd92a5b7f6244431e8ff6c32b846f9ba86 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 19 Apr 2007 12:03:34 +0200
Subject: cfq-iosched: get rid of ->cur_rr and ->cfq_list

It's only used for preemption now that the IDLE and RT queues also
use the rbtree. If we pass an 'add_front' variable to
cfq_service_tree_add(), we can set ->rb_key to 0 to force insertion
at the front of the tree.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 87 ++++++++++++++++++++---------------------------------
 1 file changed, 32 insertions(+), 55 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 4a0397022f5b..a8437042e28a 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -45,9 +45,6 @@ static int cfq_slice_idle = HZ / 125;
  */
 #define CFQ_QHASH_SHIFT		6
 #define CFQ_QHASH_ENTRIES	(1 << CFQ_QHASH_SHIFT)
-#define list_entry_qhash(entry)	hlist_entry((entry), struct cfq_queue, cfq_hash)
-
-#define list_entry_cfqq(ptr)	list_entry((ptr), struct cfq_queue, cfq_list)
 
 #define RQ_CIC(rq)		((struct cfq_io_context*)(rq)->elevator_private)
 #define RQ_CFQQ(rq)		((rq)->elevator_private2)
@@ -91,7 +88,6 @@ struct cfq_data {
 	 * rr list of queues with requests and the count of them
 	 */
 	struct cfq_rb_root service_tree;
-	struct list_head cur_rr;
 	unsigned int busy_queues;
 
 	/*
@@ -146,8 +142,6 @@ struct cfq_queue {
 	struct hlist_node cfq_hash;
 	/* hash key */
 	unsigned int key;
-	/* member of the rr/busy/cur/idle cfqd list */
-	struct list_head cfq_list;
 	/* service_tree member */
 	struct rb_node rb_node;
 	/* service_tree key */
@@ -452,16 +446,19 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
  * we will service the queues.
  */
 static void cfq_service_tree_add(struct cfq_data *cfqd,
-				    struct cfq_queue *cfqq)
+				    struct cfq_queue *cfqq, int add_front)
 {
 	struct rb_node **p = &cfqd->service_tree.rb.rb_node;
 	struct rb_node *parent = NULL;
 	unsigned long rb_key;
 	int left;
 
-	rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
-	rb_key += cfqq->slice_resid;
-	cfqq->slice_resid = 0;
+	if (!add_front) {
+		rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
+		rb_key += cfqq->slice_resid;
+		cfqq->slice_resid = 0;
+	} else
+		rb_key = 0;
 
 	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
 		/*
@@ -516,13 +513,13 @@ static void cfq_service_tree_add(struct cfq_data *cfqd,
 /*
  * Update cfqq's position in the service tree.
  */
-static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
+static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	/*
 	 * Resorting requires the cfqq to be on the RR list already.
 	 */
 	if (cfq_cfqq_on_rr(cfqq))
-		cfq_service_tree_add(cfqq->cfqd, cfqq);
+		cfq_service_tree_add(cfqd, cfqq, 0);
 }
 
 /*
@@ -536,7 +533,7 @@ cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	cfq_mark_cfqq_on_rr(cfqq);
 	cfqd->busy_queues++;
 
-	cfq_resort_rr_list(cfqq, 0);
+	cfq_resort_rr_list(cfqd, cfqq);
 }
 
 /*
@@ -548,7 +545,6 @@ cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	BUG_ON(!cfq_cfqq_on_rr(cfqq));
 	cfq_clear_cfqq_on_rr(cfqq);
-	list_del_init(&cfqq->cfq_list);
 
 	if (!RB_EMPTY_NODE(&cfqq->rb_node))
 		cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
@@ -771,7 +767,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	if (timed_out && !cfq_cfqq_slice_new(cfqq))
 		cfqq->slice_resid = cfqq->slice_end - jiffies;
 
-	cfq_resort_rr_list(cfqq, preempted);
+	cfq_resort_rr_list(cfqd, cfqq);
 
 	if (cfqq == cfqd->active_queue)
 		cfqd->active_queue = NULL;
@@ -799,31 +795,28 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted,
  */
 static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 {
-	struct cfq_queue *cfqq = NULL;
+	struct cfq_queue *cfqq;
+	struct rb_node *n;
 
-	if (!list_empty(&cfqd->cur_rr)) {
-		/*
-		 * if current list is non-empty, grab first entry.
-		 */
-		cfqq = list_entry_cfqq(cfqd->cur_rr.next);
-	} else if (!RB_EMPTY_ROOT(&cfqd->service_tree.rb)) {
-		struct rb_node *n = cfq_rb_first(&cfqd->service_tree);
+	if (RB_EMPTY_ROOT(&cfqd->service_tree.rb))
+		return NULL;
 
-		cfqq = rb_entry(n, struct cfq_queue, rb_node);
-		if (cfq_class_idle(cfqq)) {
-			unsigned long end;
+	n = cfq_rb_first(&cfqd->service_tree);
+	cfqq = rb_entry(n, struct cfq_queue, rb_node);
 
-			/*
-			 * if we have idle queues and no rt or be queues had
-			 * pending requests, either allow immediate service if
-			 * the grace period has passed or arm the idle grace
-			 * timer
-			 */
-			end = cfqd->last_end_request + CFQ_IDLE_GRACE;
-			if (time_before(jiffies, end)) {
-				mod_timer(&cfqd->idle_class_timer, end);
-				cfqq = NULL;
-			}
+	if (cfq_class_idle(cfqq)) {
+		unsigned long end;
+
+		/*
+		 * if we have idle queues and no rt or be queues had
+		 * pending requests, either allow immediate service if
+		 * the grace period has passed or arm the idle grace
+		 * timer
+		 */
+		end = cfqd->last_end_request + CFQ_IDLE_GRACE;
+		if (time_before(jiffies, end)) {
+			mod_timer(&cfqd->idle_class_timer, end);
+			cfqq = NULL;
 		}
 	}
 
@@ -1075,18 +1068,6 @@ static inline int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
 	return dispatched;
 }
 
-static int cfq_forced_dispatch_cfqqs(struct list_head *list)
-{
-	struct cfq_queue *cfqq, *next;
-	int dispatched;
-
-	dispatched = 0;
-	list_for_each_entry_safe(cfqq, next, list, cfq_list)
-		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
-
-	return dispatched;
-}
-
 /*
  * Drain our current requests. Used for barriers and when switching
  * io schedulers on-the-fly.
@@ -1102,8 +1083,6 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
 		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
 	}
 
-	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->cur_rr);
-
 	cfq_slice_expired(cfqd, 0, 0);
 
 	BUG_ON(cfqd->busy_queues);
@@ -1433,7 +1412,6 @@ retry:
 		memset(cfqq, 0, sizeof(*cfqq));
 
 		INIT_HLIST_NODE(&cfqq->cfq_hash);
-		INIT_LIST_HEAD(&cfqq->cfq_list);
 		RB_CLEAR_NODE(&cfqq->rb_node);
 		INIT_LIST_HEAD(&cfqq->fifo);
 
@@ -1712,8 +1690,8 @@ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	 * so we know that it will be selected next.
 	 */
 	BUG_ON(!cfq_cfqq_on_rr(cfqq));
-	list_del_init(&cfqq->cfq_list);
-	list_add(&cfqq->cfq_list, &cfqd->cur_rr);
+
+	cfq_service_tree_add(cfqd, cfqq, 1);
 
 	cfqq->slice_end = 0;
 	cfq_mark_cfqq_slice_new(cfqq);
@@ -2077,7 +2055,6 @@ static void *cfq_init_queue(request_queue_t *q)
 	memset(cfqd, 0, sizeof(*cfqd));
 
 	cfqd->service_tree = CFQ_RB_ROOT;
-	INIT_LIST_HEAD(&cfqd->cur_rr);
 	INIT_LIST_HEAD(&cfqd->cic_list);
 
 	cfqd->cfq_hash = kmalloc_node(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL, q->node);
-- 
cgit v1.2.3-59-g8ed1b


From 6084cdda0ea4561feb68e00a8c50068bba98006d Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 23 Apr 2007 08:25:00 +0200
Subject: cfq-iosched: don't pass unused preemption variable around

We don't use it anymore in the slice expiry handling.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index a8437042e28a..f089eeecdf32 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -752,7 +752,7 @@ __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
  */
 static void
 __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
-		    int preempted, int timed_out)
+		    int timed_out)
 {
 	if (cfq_cfqq_wait_request(cfqq))
 		del_timer(&cfqd->idle_slice_timer);
@@ -761,8 +761,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	cfq_clear_cfqq_wait_request(cfqq);
 
 	/*
-	 * store what was left of this slice, if the queue idled out
-	 * or was preempted
+	 * store what was left of this slice, if the queue idled/timed out
 	 */
 	if (timed_out && !cfq_cfqq_slice_new(cfqq))
 		cfqq->slice_resid = cfqq->slice_end - jiffies;
@@ -780,13 +779,12 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	cfqd->dispatch_slice = 0;
 }
 
-static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted,
-				     int timed_out)
+static inline void cfq_slice_expired(struct cfq_data *cfqd, int timed_out)
 {
 	struct cfq_queue *cfqq = cfqd->active_queue;
 
 	if (cfqq)
-		__cfq_slice_expired(cfqd, cfqq, preempted, timed_out);
+		__cfq_slice_expired(cfqd, cfqq, timed_out);
 }
 
 /*
@@ -995,7 +993,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 	}
 
 expire:
-	cfq_slice_expired(cfqd, 0, 0);
+	cfq_slice_expired(cfqd, 0);
 new_queue:
 	cfqq = cfq_set_active_queue(cfqd);
 keep_queue:
@@ -1049,7 +1047,7 @@ __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	    cfqd->dispatch_slice >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
 	    cfq_class_idle(cfqq))) {
 		cfqq->slice_end = jiffies + 1;
-		cfq_slice_expired(cfqd, 0, 0);
+		cfq_slice_expired(cfqd, 0);
 	}
 
 	return dispatched;
@@ -1083,7 +1081,7 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
 		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
 	}
 
-	cfq_slice_expired(cfqd, 0, 0);
+	cfq_slice_expired(cfqd, 0);
 
 	BUG_ON(cfqd->busy_queues);
 
@@ -1153,7 +1151,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
 	BUG_ON(cfq_cfqq_on_rr(cfqq));
 
 	if (unlikely(cfqd->active_queue == cfqq)) {
-		__cfq_slice_expired(cfqd, cfqq, 0, 0);
+		__cfq_slice_expired(cfqd, cfqq, 0);
 		cfq_schedule_dispatch(cfqd);
 	}
 
@@ -1210,7 +1208,7 @@ static void cfq_free_io_context(struct io_context *ioc)
 static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	if (unlikely(cfqq == cfqd->active_queue)) {
-		__cfq_slice_expired(cfqd, cfqq, 0, 0);
+		__cfq_slice_expired(cfqd, cfqq, 0);
 		cfq_schedule_dispatch(cfqd);
 	}
 
@@ -1683,7 +1681,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
  */
 static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
-	cfq_slice_expired(cfqd, 1, 1);
+	cfq_slice_expired(cfqd, 1);
 
 	/*
 	 * Put the new queue at the front of the of the current list,
@@ -1784,7 +1782,7 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq)
 			cfq_clear_cfqq_slice_new(cfqq);
 		}
 		if (cfq_slice_used(cfqq))
-			cfq_slice_expired(cfqd, 0, 1);
+			cfq_slice_expired(cfqd, 1);
 		else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list))
 			cfq_arm_slice_timer(cfqd);
 	}
@@ -1979,7 +1977,7 @@ static void cfq_idle_slice_timer(unsigned long data)
 		}
 	}
 expire:
-	cfq_slice_expired(cfqd, 0, timed_out);
+	cfq_slice_expired(cfqd, timed_out);
 out_kick:
 	cfq_schedule_dispatch(cfqd);
 out_cont:
@@ -2025,7 +2023,7 @@ static void cfq_exit_queue(elevator_t *e)
 	spin_lock_irq(q->queue_lock);
 
 	if (cfqd->active_queue)
-		__cfq_slice_expired(cfqd, cfqd->active_queue, 0, 0);
+		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
 
 	while (!list_empty(&cfqd->cic_list)) {
 		struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
-- 
cgit v1.2.3-59-g8ed1b


From 20e493a8d03b3b2f51b619a453f7bbbebedd6bda Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 23 Apr 2007 08:26:36 +0200
Subject: cfq-iosched: get rid of ->dispatch_slice

We can track it fairly accurately locally, let the slice handling
take care of the rest.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index f089eeecdf32..839086dcb0af 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -106,7 +106,6 @@ struct cfq_data {
 
 	struct cfq_queue *active_queue;
 	struct cfq_io_context *active_cic;
-	unsigned int dispatch_slice;
 
 	struct timer_list idle_class_timer;
 
@@ -775,8 +774,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		put_io_context(cfqd->active_cic->ioc);
 		cfqd->active_cic = NULL;
 	}
-
-	cfqd->dispatch_slice = 0;
 }
 
 static inline void cfq_slice_expired(struct cfq_data *cfqd, int timed_out)
@@ -1026,7 +1023,6 @@ __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		 */
 		cfq_dispatch_insert(cfqd->queue, rq);
 
-		cfqd->dispatch_slice++;
 		dispatched++;
 
 		if (!cfqd->active_cic) {
@@ -1044,7 +1040,7 @@ __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	 * queue always expire after 1 dispatch round.
 	 */
 	if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
-	    cfqd->dispatch_slice >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
+	    dispatched >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
 	    cfq_class_idle(cfqq))) {
 		cfqq->slice_end = jiffies + 1;
 		cfq_slice_expired(cfqd, 0);
-- 
cgit v1.2.3-59-g8ed1b


From 1be92f2fc7b563db3a8909d2d1c6a6520aeca323 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 19 Apr 2007 14:32:26 +0200
Subject: cfq-iosched: never allow an async queue idling

We don't enable it by default, don't let it get enabled during
runtime.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 839086dcb0af..df82755ac40b 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1603,7 +1603,12 @@ static void
 cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		       struct cfq_io_context *cic)
 {
-	int enable_idle = cfq_cfqq_idle_window(cfqq);
+	int enable_idle;
+
+	if (!cfq_cfqq_sync(cfqq))
+		return;
+
+	enable_idle = cfq_cfqq_idle_window(cfqq);
 
 	if (!cic->ioc->task || !cfqd->cfq_slice_idle ||
 	    (cfqd->hw_tag && CIC_SEEKY(cic)))
-- 
cgit v1.2.3-59-g8ed1b


From 3ed9a2965c47636bc0ebafab31a39f1c105492ca Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 23 Apr 2007 08:33:33 +0200
Subject: cfq-iosched: improve sync vs async workloads

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index df82755ac40b..a8237be97a28 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -96,6 +96,7 @@ struct cfq_data {
 	struct hlist_head *cfq_hash;
 
 	int rq_in_driver;
+	int sync_flight;
 	int hw_tag;
 
 	/*
@@ -911,11 +912,15 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
  */
 static void cfq_dispatch_insert(request_queue_t *q, struct request *rq)
 {
+	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct cfq_queue *cfqq = RQ_CFQQ(rq);
 
 	cfq_remove_request(rq);
 	cfqq->dispatched++;
 	elv_dispatch_sort(q, rq);
+
+	if (cfq_cfqq_sync(cfqq))
+		cfqd->sync_flight++;
 }
 
 /*
@@ -1100,27 +1105,24 @@ static int cfq_dispatch_requests(request_queue_t *q, int force)
 	while ((cfqq = cfq_select_queue(cfqd)) != NULL) {
 		int max_dispatch;
 
-		if (cfqd->busy_queues > 1) {
-			/*
-			 * So we have dispatched before in this round, if the
-			 * next queue has idling enabled (must be sync), don't
-			 * allow it service until the previous have completed.
-			 */
-			if (cfqd->rq_in_driver && cfq_cfqq_idle_window(cfqq) &&
-			    dispatched)
+		max_dispatch = cfqd->cfq_quantum;
+		if (cfq_class_idle(cfqq))
+			max_dispatch = 1;
+
+		if (cfqq->dispatched >= max_dispatch) {
+			if (cfqd->busy_queues > 1)
 				break;
-			if (cfqq->dispatched >= cfqd->cfq_quantum)
+			if (cfqq->dispatched >= 4 * max_dispatch)
 				break;
 		}
 
+		if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq))
+			break;
+
 		cfq_clear_cfqq_must_dispatch(cfqq);
 		cfq_clear_cfqq_wait_request(cfqq);
 		del_timer(&cfqd->idle_slice_timer);
 
-		max_dispatch = cfqd->cfq_quantum;
-		if (cfq_class_idle(cfqq))
-			max_dispatch = 1;
-
 		dispatched += __cfq_dispatch_requests(cfqd, cfqq, max_dispatch);
 	}
 
@@ -1767,6 +1769,9 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq)
 	cfqd->rq_in_driver--;
 	cfqq->dispatched--;
 
+	if (cfq_cfqq_sync(cfqq))
+		cfqd->sync_flight--;
+
 	if (!cfq_class_idle(cfqq))
 		cfqd->last_end_request = now;
 
-- 
cgit v1.2.3-59-g8ed1b


From cc19747977824ece6aa1c56a29e974fef5ec2b32 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 20 Apr 2007 20:45:39 +0200
Subject: cfq-iosched: tighten queue request overlap condition

For tagged devices, allow overlap of requests if the idle window
isn't enabled on the current active queue.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index a8237be97a28..e859b4966e4c 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -989,7 +989,8 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 	 * flight or is idling for a new request, allow either of these
 	 * conditions to happen (or time out) before selecting a new queue.
 	 */
-	if (cfqq->dispatched || timer_pending(&cfqd->idle_slice_timer)) {
+	if (timer_pending(&cfqd->idle_slice_timer) ||
+	    (cfqq->dispatched && cfq_cfqq_idle_window(cfqq))) {
 		cfqq = NULL;
 		goto keep_queue;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 91fac317a34859986a2359a5a5c0e37dc17a9c3d Mon Sep 17 00:00:00 2001
From: Vasily Tarasov <vtaras@openvz.org>
Date: Wed, 25 Apr 2007 12:29:51 +0200
Subject: cfq-iosched: get rid of cfqq hash

cfq hash is no more necessary.  We always can get cfqq from io context.
cfq_get_io_context_noalloc() function is introduced, because we don't
want to allocate cic on merging and checking may_queue.  In order to
identify sync queue we've used hash key = CFQ_KEY_ASYNC. Since hash is
eliminated we need to use other criterion: sync flag for queue is added.
In all places where we dig in rb_tree we're in current context, so no
additional locking is required.

Advantages of this patch: no additional memory for hash, no seeking in
hash, code is cleaner. But it is necessary now to seek cic in per-ioc
rbtree, but it is faster:
- most processes work only with few devices
- most systems have only few block devices
- it is a rb-tree

Signed-off-by: Vasily Tarasov <vtaras@openvz.org>

Changes by me:

- Merge into CFQ devel branch
- Get rid of cfq_get_io_context_noalloc()
- Fix various bugs with dereferencing cic->cfqq[] with offset other
  than 0 or 1.
- Fix bug in cfqq setup, is_sync condition was reversed.
- Fix bug where only bio_sync() is used, we need to check for a READ too

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 167 +++++++++++++++++++++-------------------------------
 1 file changed, 67 insertions(+), 100 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index e859b4966e4c..94f53a1f4677 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -9,7 +9,6 @@
 #include <linux/module.h>
 #include <linux/blkdev.h>
 #include <linux/elevator.h>
-#include <linux/hash.h>
 #include <linux/rbtree.h>
 #include <linux/ioprio.h>
 
@@ -38,14 +37,6 @@ static int cfq_slice_idle = HZ / 125;
 
 #define CFQ_SLICE_SCALE		(5)
 
-#define CFQ_KEY_ASYNC		(0)
-
-/*
- * for the hash of cfqq inside the cfqd
- */
-#define CFQ_QHASH_SHIFT		6
-#define CFQ_QHASH_ENTRIES	(1 << CFQ_QHASH_SHIFT)
-
 #define RQ_CIC(rq)		((struct cfq_io_context*)(rq)->elevator_private)
 #define RQ_CFQQ(rq)		((rq)->elevator_private2)
 
@@ -62,8 +53,6 @@ static struct completion *ioc_gone;
 #define ASYNC			(0)
 #define SYNC			(1)
 
-#define cfq_cfqq_sync(cfqq)	((cfqq)->key != CFQ_KEY_ASYNC)
-
 #define sample_valid(samples)	((samples) > 80)
 
 /*
@@ -90,11 +79,6 @@ struct cfq_data {
 	struct cfq_rb_root service_tree;
 	unsigned int busy_queues;
 
-	/*
-	 * cfqq lookup hash
-	 */
-	struct hlist_head *cfq_hash;
-
 	int rq_in_driver;
 	int sync_flight;
 	int hw_tag;
@@ -138,10 +122,6 @@ struct cfq_queue {
 	atomic_t ref;
 	/* parent cfq_data */
 	struct cfq_data *cfqd;
-	/* cfqq lookup hash */
-	struct hlist_node cfq_hash;
-	/* hash key */
-	unsigned int key;
 	/* service_tree member */
 	struct rb_node rb_node;
 	/* service_tree key */
@@ -186,6 +166,7 @@ enum cfqq_state_flags {
 	CFQ_CFQQ_FLAG_prio_changed,	/* task priority has changed */
 	CFQ_CFQQ_FLAG_queue_new,	/* queue never been serviced */
 	CFQ_CFQQ_FLAG_slice_new,	/* no requests dispatched in slice */
+	CFQ_CFQQ_FLAG_sync,		/* synchronous queue */
 };
 
 #define CFQ_CFQQ_FNS(name)						\
@@ -212,11 +193,38 @@ CFQ_CFQQ_FNS(idle_window);
 CFQ_CFQQ_FNS(prio_changed);
 CFQ_CFQQ_FNS(queue_new);
 CFQ_CFQQ_FNS(slice_new);
+CFQ_CFQQ_FNS(sync);
 #undef CFQ_CFQQ_FNS
 
-static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short);
 static void cfq_dispatch_insert(request_queue_t *, struct request *);
-static struct cfq_queue *cfq_get_queue(struct cfq_data *, unsigned int, struct task_struct *, gfp_t);
+static struct cfq_queue *cfq_get_queue(struct cfq_data *, int,
+				       struct task_struct *, gfp_t);
+static struct cfq_io_context *cfq_cic_rb_lookup(struct cfq_data *,
+						struct io_context *);
+
+static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
+					    int is_sync)
+{
+	return cic->cfqq[!!is_sync];
+}
+
+static inline void cic_set_cfqq(struct cfq_io_context *cic,
+				struct cfq_queue *cfqq, int is_sync)
+{
+	cic->cfqq[!!is_sync] = cfqq;
+}
+
+/*
+ * We regard a request as SYNC, if it's either a read or has the SYNC bit
+ * set (in which case it could also be direct WRITE).
+ */
+static inline int cfq_bio_sync(struct bio *bio)
+{
+	if (bio_data_dir(bio) == READ || bio_sync(bio))
+		return 1;
+
+	return 0;
+}
 
 /*
  * scheduler run of queue, if there are requests pending and no one in the
@@ -235,17 +243,6 @@ static int cfq_queue_empty(request_queue_t *q)
 	return !cfqd->busy_queues;
 }
 
-static inline pid_t cfq_queue_pid(struct task_struct *task, int rw, int is_sync)
-{
-	/*
-	 * Use the per-process queue, for read requests and syncronous writes
-	 */
-	if (!(rw & REQ_RW) || is_sync)
-		return task->pid;
-
-	return CFQ_KEY_ASYNC;
-}
-
 /*
  * Scale schedule slice based on io priority. Use the sync time slice only
  * if a queue is marked sync and has sync io queued. A sync queue with async
@@ -608,10 +605,14 @@ static struct request *
 cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
 {
 	struct task_struct *tsk = current;
-	pid_t key = cfq_queue_pid(tsk, bio_data_dir(bio), bio_sync(bio));
+	struct cfq_io_context *cic;
 	struct cfq_queue *cfqq;
 
-	cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio);
+	cic = cfq_cic_rb_lookup(cfqd, tsk->io_context);
+	if (!cic)
+		return NULL;
+
+	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
 	if (cfqq) {
 		sector_t sector = bio->bi_sector + bio_sectors(bio);
 
@@ -705,23 +706,24 @@ static int cfq_allow_merge(request_queue_t *q, struct request *rq,
 			   struct bio *bio)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
-	const int rw = bio_data_dir(bio);
+	struct cfq_io_context *cic;
 	struct cfq_queue *cfqq;
-	pid_t key;
 
 	/*
 	 * Disallow merge of a sync bio into an async request.
 	 */
-	if ((bio_data_dir(bio) == READ || bio_sync(bio)) && !rq_is_sync(rq))
+	if (cfq_bio_sync(bio) && !rq_is_sync(rq))
 		return 0;
 
 	/*
 	 * Lookup the cfqq that this bio will be queued with. Allow
 	 * merge only if rq is queued there.
 	 */
-	key = cfq_queue_pid(current, rw, bio_sync(bio));
-	cfqq = cfq_find_cfq_hash(cfqd, key, current->ioprio);
+	cic = cfq_cic_rb_lookup(cfqd, current->io_context);
+	if (!cic)
+		return 0;
 
+	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
 	if (cfqq == RQ_CFQQ(rq))
 		return 1;
 
@@ -1154,37 +1156,9 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
 		cfq_schedule_dispatch(cfqd);
 	}
 
-	/*
-	 * it's on the empty list and still hashed
-	 */
-	hlist_del(&cfqq->cfq_hash);
 	kmem_cache_free(cfq_pool, cfqq);
 }
 
-static struct cfq_queue *
-__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio,
-		    const int hashval)
-{
-	struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
-	struct hlist_node *entry;
-	struct cfq_queue *__cfqq;
-
-	hlist_for_each_entry(__cfqq, entry, hash_list, cfq_hash) {
-		const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->org_ioprio_class, __cfqq->org_ioprio);
-
-		if (__cfqq->key == key && (__p == prio || !prio))
-			return __cfqq;
-	}
-
-	return NULL;
-}
-
-static struct cfq_queue *
-cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio)
-{
-	return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT));
-}
-
 static void cfq_free_io_context(struct io_context *ioc)
 {
 	struct cfq_io_context *__cic;
@@ -1342,7 +1316,7 @@ static inline void changed_ioprio(struct cfq_io_context *cic)
 	cfqq = cic->cfqq[ASYNC];
 	if (cfqq) {
 		struct cfq_queue *new_cfqq;
-		new_cfqq = cfq_get_queue(cfqd, CFQ_KEY_ASYNC, cic->ioc->task,
+		new_cfqq = cfq_get_queue(cfqd, ASYNC, cic->ioc->task,
 					 GFP_ATOMIC);
 		if (new_cfqq) {
 			cic->cfqq[ASYNC] = new_cfqq;
@@ -1374,16 +1348,16 @@ static void cfq_ioc_set_ioprio(struct io_context *ioc)
 }
 
 static struct cfq_queue *
-cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk,
+cfq_get_queue(struct cfq_data *cfqd, int is_sync, struct task_struct *tsk,
 	      gfp_t gfp_mask)
 {
-	const int hashval = hash_long(key, CFQ_QHASH_SHIFT);
 	struct cfq_queue *cfqq, *new_cfqq = NULL;
-	unsigned short ioprio;
+	struct cfq_io_context *cic;
 
 retry:
-	ioprio = tsk->ioprio;
-	cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval);
+	cic = cfq_cic_rb_lookup(cfqd, tsk->io_context);
+	/* cic always exists here */
+	cfqq = cic_to_cfqq(cic, is_sync);
 
 	if (!cfqq) {
 		if (new_cfqq) {
@@ -1408,20 +1382,20 @@ retry:
 
 		memset(cfqq, 0, sizeof(*cfqq));
 
-		INIT_HLIST_NODE(&cfqq->cfq_hash);
 		RB_CLEAR_NODE(&cfqq->rb_node);
 		INIT_LIST_HEAD(&cfqq->fifo);
 
-		cfqq->key = key;
-		hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
 		atomic_set(&cfqq->ref, 0);
 		cfqq->cfqd = cfqd;
 
-		if (key != CFQ_KEY_ASYNC)
+		if (is_sync) {
 			cfq_mark_cfqq_idle_window(cfqq);
+			cfq_mark_cfqq_sync(cfqq);
+		}
 
 		cfq_mark_cfqq_prio_changed(cfqq);
 		cfq_mark_cfqq_queue_new(cfqq);
+
 		cfq_init_prio_data(cfqq);
 	}
 
@@ -1453,6 +1427,9 @@ cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc)
 	struct cfq_io_context *cic;
 	void *k, *key = cfqd;
 
+	if (unlikely(!ioc))
+		return NULL;
+
 restart:
 	n = ioc->cic_root.rb_node;
 	while (n) {
@@ -1839,10 +1816,8 @@ static int cfq_may_queue(request_queue_t *q, int rw)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct task_struct *tsk = current;
+	struct cfq_io_context *cic;
 	struct cfq_queue *cfqq;
-	unsigned int key;
-
-	key = cfq_queue_pid(tsk, rw, rw & REQ_RW_SYNC);
 
 	/*
 	 * don't force setup of a queue from here, as a call to may_queue
@@ -1850,7 +1825,11 @@ static int cfq_may_queue(request_queue_t *q, int rw)
 	 * so just lookup a possibly existing queue, or return 'may queue'
 	 * if that fails
 	 */
-	cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio);
+	cic = cfq_cic_rb_lookup(cfqd, tsk->io_context);
+	if (!cic)
+		return ELV_MQUEUE_MAY;
+
+	cfqq = cic_to_cfqq(cic, rw & REQ_RW_SYNC);
 	if (cfqq) {
 		cfq_init_prio_data(cfqq);
 		cfq_prio_boost(cfqq);
@@ -1894,7 +1873,6 @@ cfq_set_request(request_queue_t *q, struct request *rq, gfp_t gfp_mask)
 	struct cfq_io_context *cic;
 	const int rw = rq_data_dir(rq);
 	const int is_sync = rq_is_sync(rq);
-	pid_t key = cfq_queue_pid(tsk, rw, is_sync);
 	struct cfq_queue *cfqq;
 	unsigned long flags;
 
@@ -1907,14 +1885,15 @@ cfq_set_request(request_queue_t *q, struct request *rq, gfp_t gfp_mask)
 	if (!cic)
 		goto queue_fail;
 
-	if (!cic->cfqq[is_sync]) {
-		cfqq = cfq_get_queue(cfqd, key, tsk, gfp_mask);
+	cfqq = cic_to_cfqq(cic, is_sync);
+	if (!cfqq) {
+		cfqq = cfq_get_queue(cfqd, is_sync, tsk, gfp_mask);
+
 		if (!cfqq)
 			goto queue_fail;
 
-		cic->cfqq[is_sync] = cfqq;
-	} else
-		cfqq = cic->cfqq[is_sync];
+		cic_set_cfqq(cic, cfqq, is_sync);
+	}
 
 	cfqq->allocated[rw]++;
 	cfq_clear_cfqq_must_alloc(cfqq);
@@ -2044,14 +2023,12 @@ static void cfq_exit_queue(elevator_t *e)
 
 	cfq_shutdown_timer_wq(cfqd);
 
-	kfree(cfqd->cfq_hash);
 	kfree(cfqd);
 }
 
 static void *cfq_init_queue(request_queue_t *q)
 {
 	struct cfq_data *cfqd;
-	int i;
 
 	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node);
 	if (!cfqd)
@@ -2062,13 +2039,6 @@ static void *cfq_init_queue(request_queue_t *q)
 	cfqd->service_tree = CFQ_RB_ROOT;
 	INIT_LIST_HEAD(&cfqd->cic_list);
 
-	cfqd->cfq_hash = kmalloc_node(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL, q->node);
-	if (!cfqd->cfq_hash)
-		goto out_free;
-
-	for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
-		INIT_HLIST_HEAD(&cfqd->cfq_hash[i]);
-
 	cfqd->queue = q;
 
 	init_timer(&cfqd->idle_slice_timer);
@@ -2092,9 +2062,6 @@ static void *cfq_init_queue(request_queue_t *q)
 	cfqd->cfq_slice_idle = cfq_slice_idle;
 
 	return cfqd;
-out_free:
-	kfree(cfqd);
-	return NULL;
 }
 
 static void cfq_slab_kill(void)
-- 
cgit v1.2.3-59-g8ed1b


From 4e521c27eee33cebd618c26649e2c93803004647 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 24 Apr 2007 21:17:33 +0200
Subject: ll_rw_blk: add io_context private pointer

To be used by as/cfq as they see fit.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/ll_rw_blk.c      | 1 +
 include/linux/blkdev.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 3de06953ac33..123003a90477 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -3741,6 +3741,7 @@ static struct io_context *current_io_context(gfp_t gfp_flags, int node)
 		ret->nr_batch_requests = 0; /* because this is 0 */
 		ret->aic = NULL;
 		ret->cic_root.rb_node = NULL;
+		ret->ioc_data = NULL;
 		/* make sure set_task_ioprio() sees the settings above */
 		smp_wmb();
 		tsk->io_context = ret;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 83dcd8c0e974..a686eabe22d6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -116,6 +116,7 @@ struct io_context {
 
 	struct as_io_context *aic;
 	struct rb_root cic_root;
+	void *ioc_data;
 };
 
 void put_io_context(struct io_context *ioc);
-- 
cgit v1.2.3-59-g8ed1b


From 597bc485d6906359ad667fc8ead5e5f0ede03a0a Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 24 Apr 2007 21:23:53 +0200
Subject: cfq-iosched: speedup cic rb lookup

We often lookup the same queue many times in succession, so cache
the last looked up queue to avoid browsing the rbtree.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 94f53a1f4677..64df3fa303b0 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1165,6 +1165,8 @@ static void cfq_free_io_context(struct io_context *ioc)
 	struct rb_node *n;
 	int freed = 0;
 
+	ioc->ioc_data = NULL;
+
 	while ((n = rb_first(&ioc->cic_root)) != NULL) {
 		__cic = rb_entry(n, struct cfq_io_context, rb_node);
 		rb_erase(&__cic->rb_node, &ioc->cic_root);
@@ -1228,10 +1230,11 @@ static void cfq_exit_io_context(struct io_context *ioc)
 	struct cfq_io_context *__cic;
 	struct rb_node *n;
 
+	ioc->ioc_data = NULL;
+
 	/*
 	 * put the reference this task is holding to the various queues
 	 */
-
 	n = rb_first(&ioc->cic_root);
 	while (n != NULL) {
 		__cic = rb_entry(n, struct cfq_io_context, rb_node);
@@ -1415,6 +1418,10 @@ static void
 cfq_drop_dead_cic(struct io_context *ioc, struct cfq_io_context *cic)
 {
 	WARN_ON(!list_empty(&cic->queue_list));
+
+	if (ioc->ioc_data == cic)
+		ioc->ioc_data = NULL;
+
 	rb_erase(&cic->rb_node, &ioc->cic_root);
 	kmem_cache_free(cfq_ioc_pool, cic);
 	elv_ioc_count_dec(ioc_count);
@@ -1430,6 +1437,13 @@ cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc)
 	if (unlikely(!ioc))
 		return NULL;
 
+	/*
+	 * we maintain a last-hit cache, to avoid browsing over the tree
+	 */
+	cic = ioc->ioc_data;
+	if (cic && cic->key == cfqd)
+		return cic;
+
 restart:
 	n = ioc->cic_root.rb_node;
 	while (n) {
@@ -1445,8 +1459,10 @@ restart:
 			n = n->rb_left;
 		else if (key > k)
 			n = n->rb_right;
-		else
+		else {
+			ioc->ioc_data = cic;
 			return cic;
+		}
 	}
 
 	return NULL;
-- 
cgit v1.2.3-59-g8ed1b


From 5972511b77809cb7c9ccdb79b825c54921c5c546 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 2 Apr 2007 10:06:42 +0200
Subject: [BLOCK] Don't pin lots of memory in mempools

Currently we scale the mempool sizes depending on memory installed
in the machine, except for the bio pool itself which sits at a fixed
256 entry pre-allocation.

There's really no point in "optimizing" this OOM path, we just need
enough preallocated to make progress. A single unit is enough, lets
scale it down to 2 just to be on the safe side.

This patch saves ~150kb of pinned kernel memory on a 32-bit box.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/md/dm-crypt.c   |  2 +-
 drivers/md/dm-io.c      |  2 +-
 drivers/md/dm.c         |  2 +-
 drivers/scsi/scsi_lib.c |  2 +-
 fs/bio.c                | 41 ++++++-----------------------------------
 include/linux/bio.h     |  2 +-
 6 files changed, 11 insertions(+), 40 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 4c2471ee054a..d8121234c347 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -867,7 +867,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad4;
 	}
 
-	cc->bs = bioset_create(MIN_IOS, MIN_IOS, 4);
+	cc->bs = bioset_create(MIN_IOS, MIN_IOS);
 	if (!cc->bs) {
 		ti->error = "Cannot allocate crypt bioset";
 		goto bad_bs;
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 4eb73d395213..8bdc8a87b249 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -60,7 +60,7 @@ static int resize_pool(unsigned int new_ios)
 		if (!_io_pool)
 			return -ENOMEM;
 
-		_bios = bioset_create(16, 16, 4);
+		_bios = bioset_create(16, 16);
 		if (!_bios) {
 			mempool_destroy(_io_pool);
 			_io_pool = NULL;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 3668b170ea68..11a98df298ec 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1012,7 +1012,7 @@ static struct mapped_device *alloc_dev(int minor)
 	if (!md->tio_pool)
 		goto bad3;
 
-	md->bs = bioset_create(16, 16, 4);
+	md->bs = bioset_create(16, 16);
 	if (!md->bs)
 		goto bad_no_bioset;
 
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 9f7482d0b594..05d79af5ab90 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -31,7 +31,7 @@
 
 
 #define SG_MEMPOOL_NR		ARRAY_SIZE(scsi_sg_pools)
-#define SG_MEMPOOL_SIZE		32
+#define SG_MEMPOOL_SIZE		2
 
 struct scsi_host_sg_pool {
 	size_t		size;
diff --git a/fs/bio.c b/fs/bio.c
index 7618bcb18368..693940da4090 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -28,7 +28,7 @@
 #include <linux/blktrace_api.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
 
-#define BIO_POOL_SIZE 256
+#define BIO_POOL_SIZE 2
 
 static struct kmem_cache *bio_slab __read_mostly;
 
@@ -38,7 +38,7 @@ static struct kmem_cache *bio_slab __read_mostly;
  * a small number of entries is fine, not going to be performance critical.
  * basically we just need to survive
  */
-#define BIO_SPLIT_ENTRIES 8	
+#define BIO_SPLIT_ENTRIES 2
 mempool_t *bio_split_pool __read_mostly;
 
 struct biovec_slab {
@@ -1120,7 +1120,7 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
  * create memory pools for biovec's in a bio_set.
  * use the global biovec slabs created for general use.
  */
-static int biovec_create_pools(struct bio_set *bs, int pool_entries, int scale)
+static int biovec_create_pools(struct bio_set *bs, int pool_entries)
 {
 	int i;
 
@@ -1128,9 +1128,6 @@ static int biovec_create_pools(struct bio_set *bs, int pool_entries, int scale)
 		struct biovec_slab *bp = bvec_slabs + i;
 		mempool_t **bvp = bs->bvec_pools + i;
 
-		if (pool_entries > 1 && i >= scale)
-			pool_entries >>= 1;
-
 		*bvp = mempool_create_slab_pool(pool_entries, bp->slab);
 		if (!*bvp)
 			return -ENOMEM;
@@ -1161,7 +1158,7 @@ void bioset_free(struct bio_set *bs)
 	kfree(bs);
 }
 
-struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size, int scale)
+struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size)
 {
 	struct bio_set *bs = kzalloc(sizeof(*bs), GFP_KERNEL);
 
@@ -1172,7 +1169,7 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size, int scale)
 	if (!bs->bio_pool)
 		goto bad;
 
-	if (!biovec_create_pools(bs, bvec_pool_size, scale))
+	if (!biovec_create_pools(bs, bvec_pool_size))
 		return bs;
 
 bad:
@@ -1196,38 +1193,12 @@ static void __init biovec_init_slabs(void)
 
 static int __init init_bio(void)
 {
-	int megabytes, bvec_pool_entries;
-	int scale = BIOVEC_NR_POOLS;
-
 	bio_slab = kmem_cache_create("bio", sizeof(struct bio), 0,
 				SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
 
 	biovec_init_slabs();
 
-	megabytes = nr_free_pages() >> (20 - PAGE_SHIFT);
-
-	/*
-	 * find out where to start scaling
-	 */
-	if (megabytes <= 16)
-		scale = 0;
-	else if (megabytes <= 32)
-		scale = 1;
-	else if (megabytes <= 64)
-		scale = 2;
-	else if (megabytes <= 96)
-		scale = 3;
-	else if (megabytes <= 128)
-		scale = 4;
-
-	/*
-	 * Limit number of entries reserved -- mempools are only used when
-	 * the system is completely unable to allocate memory, so we only
-	 * need enough to make progress.
-	 */
-	bvec_pool_entries = 1 + scale;
-
-	fs_bio_set = bioset_create(BIO_POOL_SIZE, bvec_pool_entries, scale);
+	fs_bio_set = bioset_create(BIO_POOL_SIZE, 2);
 	if (!fs_bio_set)
 		panic("bio: can't allocate bios\n");
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 08daf3272c02..4d85262b4fa4 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -276,7 +276,7 @@ extern struct bio_pair *bio_split(struct bio *bi, mempool_t *pool,
 extern mempool_t *bio_split_pool;
 extern void bio_pair_release(struct bio_pair *dbio);
 
-extern struct bio_set *bioset_create(int, int, int);
+extern struct bio_set *bioset_create(int, int);
 extern void bioset_free(struct bio_set *);
 
 extern struct bio *bio_alloc(gfp_t, int);
-- 
cgit v1.2.3-59-g8ed1b


From 2a12dcd71a5e0667b33f7b47bcac95c71d551840 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 26 Apr 2007 14:41:53 +0200
Subject: [PATCH] elevator: elv_list_lock does not need irq disabling

It's never grabbed from irq context, so just make it plain spin_lock().

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/elevator.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/block/elevator.c b/block/elevator.c
index 96a00c822748..ce866eb75f6a 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -134,13 +134,13 @@ static struct elevator_type *elevator_get(const char *name)
 {
 	struct elevator_type *e;
 
-	spin_lock_irq(&elv_list_lock);
+	spin_lock(&elv_list_lock);
 
 	e = elevator_find(name);
 	if (e && !try_module_get(e->elevator_owner))
 		e = NULL;
 
-	spin_unlock_irq(&elv_list_lock);
+	spin_unlock(&elv_list_lock);
 
 	return e;
 }
@@ -965,10 +965,11 @@ void elv_unregister_queue(struct request_queue *q)
 int elv_register(struct elevator_type *e)
 {
 	char *def = "";
-	spin_lock_irq(&elv_list_lock);
+
+	spin_lock(&elv_list_lock);
 	BUG_ON(elevator_find(e->elevator_name));
 	list_add_tail(&e->list, &elv_list);
-	spin_unlock_irq(&elv_list_lock);
+	spin_unlock(&elv_list_lock);
 
 	if (!strcmp(e->elevator_name, chosen_elevator) ||
 			(!*chosen_elevator &&
@@ -998,9 +999,9 @@ void elv_unregister(struct elevator_type *e)
 		read_unlock(&tasklist_lock);
 	}
 
-	spin_lock_irq(&elv_list_lock);
+	spin_lock(&elv_list_lock);
 	list_del_init(&e->list);
-	spin_unlock_irq(&elv_list_lock);
+	spin_unlock(&elv_list_lock);
 }
 EXPORT_SYMBOL_GPL(elv_unregister);
 
@@ -1118,7 +1119,7 @@ ssize_t elv_iosched_show(request_queue_t *q, char *name)
 	struct list_head *entry;
 	int len = 0;
 
-	spin_lock_irq(&elv_list_lock);
+	spin_lock(&elv_list_lock);
 	list_for_each(entry, &elv_list) {
 		struct elevator_type *__e;
 
@@ -1128,7 +1129,7 @@ ssize_t elv_iosched_show(request_queue_t *q, char *name)
 		else
 			len += sprintf(name+len, "%s ", __e->elevator_name);
 	}
-	spin_unlock_irq(&elv_list_lock);
+	spin_unlock(&elv_list_lock);
 
 	len += sprintf(len+name, "\n");
 	return len;
-- 
cgit v1.2.3-59-g8ed1b