Diffstat (limited to 'block')
-rw-r--r--  block/badblocks.c             2
-rw-r--r--  block/bfq-cgroup.c          155
-rw-r--r--  block/bfq-iosched.c         834
-rw-r--r--  block/bfq-iosched.h          23
-rw-r--r--  block/bfq-wf2q.c              8
-rw-r--r--  block/bio-integrity.c         8
-rw-r--r--  block/bio.c                 286
-rw-r--r--  block/blk-cgroup.c           20
-rw-r--r--  block/blk-core.c            398
-rw-r--r--  block/blk-exec.c              2
-rw-r--r--  block/blk-flush.c            37
-rw-r--r--  block/blk-lib.c             120
-rw-r--r--  block/blk-map.c              49
-rw-r--r--  block/blk-merge.c            42
-rw-r--r--  block/blk-mq-debugfs.c       25
-rw-r--r--  block/blk-mq-sched.c        206
-rw-r--r--  block/blk-mq-sched.h          2
-rw-r--r--  block/blk-mq-sysfs.c          9
-rw-r--r--  block/blk-mq-tag.c           24
-rw-r--r--  block/blk-mq-tag.h            7
-rw-r--r--  block/blk-mq.c              936
-rw-r--r--  block/blk-mq.h              112
-rw-r--r--  block/blk-settings.c          2
-rw-r--r--  block/blk-stat.c             51
-rw-r--r--  block/blk-sysfs.c            52
-rw-r--r--  block/blk-throttle.c        175
-rw-r--r--  block/blk-timeout.c          29
-rw-r--r--  block/blk-wbt.c              21
-rw-r--r--  block/blk-zoned.c            42
-rw-r--r--  block/blk.h                  80
-rw-r--r--  block/bounce.c               39
-rw-r--r--  block/bsg-lib.c               3
-rw-r--r--  block/bsg.c                  66
-rw-r--r--  block/deadline-iosched.c    114
-rw-r--r--  block/elevator.c             79
-rw-r--r--  block/genhd.c               152
-rw-r--r--  block/ioctl.c                19
-rw-r--r--  block/kyber-iosched.c        32
-rw-r--r--  block/mq-deadline.c         152
-rw-r--r--  block/partition-generic.c     6
-rw-r--r--  block/partitions/msdos.c      4
-rw-r--r--  block/scsi_ioctl.c           42
-rw-r--r--  block/sed-opal.c              2
43 files changed, 3002 insertions, 1465 deletions
diff --git a/block/badblocks.c b/block/badblocks.c
index 43c71166e1e2..91f7bcf979d3 100644
--- a/block/badblocks.c
+++ b/block/badblocks.c
@@ -178,7 +178,7 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
if (bb->shift < 0)
/* badblocks are disabled */
- return 0;
+ return 1;
if (bb->shift) {
/* round the start down, and the end up */
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index ceefb9a706d6..d819dc77fe65 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -24,7 +24,7 @@
#include "bfq-iosched.h"
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
+#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
/* bfqg stats flags */
enum bfqg_stats_flags {
@@ -152,6 +152,57 @@ void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
bfqg_stats_update_group_wait_time(stats);
}
+void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
+ unsigned int op)
+{
+ blkg_rwstat_add(&bfqg->stats.queued, op, 1);
+ bfqg_stats_end_empty_time(&bfqg->stats);
+ if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
+ bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
+}
+
+void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op)
+{
+ blkg_rwstat_add(&bfqg->stats.queued, op, -1);
+}
+
+void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
+{
+ blkg_rwstat_add(&bfqg->stats.merged, op, 1);
+}
+
+void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
+ uint64_t io_start_time, unsigned int op)
+{
+ struct bfqg_stats *stats = &bfqg->stats;
+ unsigned long long now = sched_clock();
+
+ if (time_after64(now, io_start_time))
+ blkg_rwstat_add(&stats->service_time, op,
+ now - io_start_time);
+ if (time_after64(io_start_time, start_time))
+ blkg_rwstat_add(&stats->wait_time, op,
+ io_start_time - start_time);
+}
+
+#else /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
+
+void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
+ unsigned int op) { }
+void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
+void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
+void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
+ uint64_t io_start_time, unsigned int op) { }
+void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
+void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
+void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
+void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
+void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
+
+#endif /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+
/*
* blk-cgroup policy-related handlers
* The following functions help in converting between blk-cgroup
@@ -229,42 +280,10 @@ void bfqg_and_blkg_put(struct bfq_group *bfqg)
blkg_put(bfqg_to_blkg(bfqg));
}
-void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
- unsigned int op)
-{
- blkg_rwstat_add(&bfqg->stats.queued, op, 1);
- bfqg_stats_end_empty_time(&bfqg->stats);
- if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
- bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
-}
-
-void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op)
-{
- blkg_rwstat_add(&bfqg->stats.queued, op, -1);
-}
-
-void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
-{
- blkg_rwstat_add(&bfqg->stats.merged, op, 1);
-}
-
-void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
- uint64_t io_start_time, unsigned int op)
-{
- struct bfqg_stats *stats = &bfqg->stats;
- unsigned long long now = sched_clock();
-
- if (time_after64(now, io_start_time))
- blkg_rwstat_add(&stats->service_time, op,
- now - io_start_time);
- if (time_after64(io_start_time, start_time))
- blkg_rwstat_add(&stats->wait_time, op,
- io_start_time - start_time);
-}
-
/* @stats = 0 */
static void bfqg_stats_reset(struct bfqg_stats *stats)
{
+#ifdef CONFIG_DEBUG_BLK_CGROUP
/* queued stats shouldn't be cleared */
blkg_rwstat_reset(&stats->merged);
blkg_rwstat_reset(&stats->service_time);
@@ -276,6 +295,7 @@ static void bfqg_stats_reset(struct bfqg_stats *stats)
blkg_stat_reset(&stats->group_wait_time);
blkg_stat_reset(&stats->idle_time);
blkg_stat_reset(&stats->empty_time);
+#endif
}
/* @to += @from */
@@ -284,6 +304,7 @@ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
if (!to || !from)
return;
+#ifdef CONFIG_DEBUG_BLK_CGROUP
/* queued stats shouldn't be cleared */
blkg_rwstat_add_aux(&to->merged, &from->merged);
blkg_rwstat_add_aux(&to->service_time, &from->service_time);
@@ -296,6 +317,7 @@ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
blkg_stat_add_aux(&to->idle_time, &from->idle_time);
blkg_stat_add_aux(&to->empty_time, &from->empty_time);
+#endif
}
/*
@@ -342,6 +364,7 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg)
static void bfqg_stats_exit(struct bfqg_stats *stats)
{
+#ifdef CONFIG_DEBUG_BLK_CGROUP
blkg_rwstat_exit(&stats->merged);
blkg_rwstat_exit(&stats->service_time);
blkg_rwstat_exit(&stats->wait_time);
@@ -353,10 +376,12 @@ static void bfqg_stats_exit(struct bfqg_stats *stats)
blkg_stat_exit(&stats->group_wait_time);
blkg_stat_exit(&stats->idle_time);
blkg_stat_exit(&stats->empty_time);
+#endif
}
static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
{
+#ifdef CONFIG_DEBUG_BLK_CGROUP
if (blkg_rwstat_init(&stats->merged, gfp) ||
blkg_rwstat_init(&stats->service_time, gfp) ||
blkg_rwstat_init(&stats->wait_time, gfp) ||
@@ -371,6 +396,7 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
bfqg_stats_exit(stats);
return -ENOMEM;
}
+#endif
return 0;
}
@@ -749,10 +775,11 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
unsigned long flags;
int i;
+ spin_lock_irqsave(&bfqd->lock, flags);
+
if (!entity) /* root group */
- return;
+ goto put_async_queues;
- spin_lock_irqsave(&bfqd->lock, flags);
/*
* Empty all service_trees belonging to this group before
* deactivating the group itself.
@@ -783,6 +810,8 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
}
__bfq_deactivate_entity(entity, false);
+
+put_async_queues:
bfq_put_async_queues(bfqd, bfqg);
spin_unlock_irqrestore(&bfqd->lock, flags);
@@ -887,6 +916,7 @@ static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
return bfq_io_set_weight_legacy(of_css(of), NULL, weight);
}
+#ifdef CONFIG_DEBUG_BLK_CGROUP
static int bfqg_print_stat(struct seq_file *sf, void *v)
{
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
@@ -991,6 +1021,7 @@ static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
0, false);
return 0;
}
+#endif /* CONFIG_DEBUG_BLK_CGROUP */
struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
{
@@ -1029,15 +1060,6 @@ struct cftype bfq_blkcg_legacy_files[] = {
/* statistics, covers only the tasks in the bfqg */
{
- .name = "bfq.time",
- .private = offsetof(struct bfq_group, stats.time),
- .seq_show = bfqg_print_stat,
- },
- {
- .name = "bfq.sectors",
- .seq_show = bfqg_print_stat_sectors,
- },
- {
.name = "bfq.io_service_bytes",
.private = (unsigned long)&blkcg_policy_bfq,
.seq_show = blkg_print_stat_bytes,
@@ -1047,6 +1069,16 @@ struct cftype bfq_blkcg_legacy_files[] = {
.private = (unsigned long)&blkcg_policy_bfq,
.seq_show = blkg_print_stat_ios,
},
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ {
+ .name = "bfq.time",
+ .private = offsetof(struct bfq_group, stats.time),
+ .seq_show = bfqg_print_stat,
+ },
+ {
+ .name = "bfq.sectors",
+ .seq_show = bfqg_print_stat_sectors,
+ },
{
.name = "bfq.io_service_time",
.private = offsetof(struct bfq_group, stats.service_time),
@@ -1067,18 +1099,10 @@ struct cftype bfq_blkcg_legacy_files[] = {
.private = offsetof(struct bfq_group, stats.queued),
.seq_show = bfqg_print_rwstat,
},
+#endif /* CONFIG_DEBUG_BLK_CGROUP */
/* the same statistics which cover the bfqg and its descendants */
{
- .name = "bfq.time_recursive",
- .private = offsetof(struct bfq_group, stats.time),
- .seq_show = bfqg_print_stat_recursive,
- },
- {
- .name = "bfq.sectors_recursive",
- .seq_show = bfqg_print_stat_sectors_recursive,
- },
- {
.name = "bfq.io_service_bytes_recursive",
.private = (unsigned long)&blkcg_policy_bfq,
.seq_show = blkg_print_stat_bytes_recursive,
@@ -1088,6 +1112,16 @@ struct cftype bfq_blkcg_legacy_files[] = {
.private = (unsigned long)&blkcg_policy_bfq,
.seq_show = blkg_print_stat_ios_recursive,
},
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ {
+ .name = "bfq.time_recursive",
+ .private = offsetof(struct bfq_group, stats.time),
+ .seq_show = bfqg_print_stat_recursive,
+ },
+ {
+ .name = "bfq.sectors_recursive",
+ .seq_show = bfqg_print_stat_sectors_recursive,
+ },
{
.name = "bfq.io_service_time_recursive",
.private = offsetof(struct bfq_group, stats.service_time),
@@ -1132,6 +1166,7 @@ struct cftype bfq_blkcg_legacy_files[] = {
.private = offsetof(struct bfq_group, stats.dequeue),
.seq_show = bfqg_print_stat,
},
+#endif /* CONFIG_DEBUG_BLK_CGROUP */
{ } /* terminate */
};
@@ -1147,18 +1182,6 @@ struct cftype bfq_blkg_files[] = {
#else /* CONFIG_BFQ_GROUP_IOSCHED */
-void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
- unsigned int op) { }
-void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
-void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
-void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
- uint64_t io_start_time, unsigned int op) { }
-void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
-void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
-void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
-void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
-void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
-
void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct bfq_group *bfqg) {}
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index a4783da90ba8..aeca22d91101 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -108,6 +108,7 @@
#include "blk-mq-tag.h"
#include "blk-mq-sched.h"
#include "bfq-iosched.h"
+#include "blk-wbt.h"
#define BFQ_BFQQ_FNS(name) \
void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
@@ -165,6 +166,20 @@ static const int bfq_async_charge_factor = 10;
/* Default timeout values, in jiffies, approximating CFQ defaults. */
const int bfq_timeout = HZ / 8;
+/*
+ * Time limit for merging (see comments in bfq_setup_cooperator). Set
+ * to the slowest value that, in our tests, proved to be effective in
+ * removing false positives, while not causing true positives to miss
+ * queue merging.
+ *
+ * As can be deduced from the low time limit below, queue merging, if
+ * successful, happens at the very beginning of the I/O of the involved
+ * cooperating processes, as a consequence of the arrival of the very
+ * first requests from each cooperator. After that, there is very
+ * little chance to find cooperators.
+ */
+static const unsigned long bfq_merge_time_limit = HZ/10;
+
static struct kmem_cache *bfq_pool;
/* Below this threshold (in ns), we consider thinktime immediate. */
@@ -177,7 +192,7 @@ static struct kmem_cache *bfq_pool;
#define BFQQ_SEEK_THR (sector_t)(8 * 100)
#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
#define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
-#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8)
+#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19)
/* Min number of samples required to perform peak-rate update */
#define BFQ_RATE_MIN_SAMPLES 32
@@ -194,15 +209,17 @@ static struct kmem_cache *bfq_pool;
* interactive applications automatically, using the following formula:
* duration = (R / r) * T, where r is the peak rate of the device, and
* R and T are two reference parameters.
- * In particular, R is the peak rate of the reference device (see below),
- * and T is a reference time: given the systems that are likely to be
- * installed on the reference device according to its speed class, T is
- * about the maximum time needed, under BFQ and while reading two files in
- * parallel, to load typical large applications on these systems.
- * In practice, the slower/faster the device at hand is, the more/less it
- * takes to load applications with respect to the reference device.
- * Accordingly, the longer/shorter BFQ grants weight raising to interactive
- * applications.
+ * In particular, R is the peak rate of the reference device (see
+ * below), and T is a reference time: given the systems that are
+ * likely to be installed on the reference device according to its
+ * speed class, T is about the maximum time needed, under BFQ and
+ * while reading two files in parallel, to load typical large
+ * applications on these systems (see the comments on
+ * max_service_from_wr below, for more details on how T is obtained).
+ * In practice, the slower/faster the device at hand is, the more/less
+ * it takes to load applications with respect to the reference device.
+ * Accordingly, the longer/shorter BFQ grants weight raising to
+ * interactive applications.
*
* BFQ uses four different reference pairs (R, T), depending on:
* . whether the device is rotational or non-rotational;
@@ -239,6 +256,60 @@ static int T_slow[2];
static int T_fast[2];
static int device_speed_thresh[2];
+/*
+ * BFQ uses the above-detailed, time-based weight-raising mechanism to
+ * privilege interactive tasks. This mechanism is vulnerable to the
+ * following false positives: I/O-bound applications that will go on
+ * doing I/O for much longer than the duration of weight
+ * raising. These applications have basically no benefit from being
+ * weight-raised at the beginning of their I/O. On the opposite end,
+ * while being weight-raised, these applications
+ * a) unjustly steal throughput to applications that may actually need
+ * low latency;
+ * b) make BFQ uselessly perform device idling; device idling results
+ * in loss of device throughput with most flash-based storage, and may
+ * increase latencies when used purposelessly.
+ *
+ * BFQ tries to reduce these problems, by adopting the following
+ * countermeasure. To introduce this countermeasure, we need first to
+ * finish explaining how the duration of weight-raising for
+ * interactive tasks is computed.
+ *
+ * For a bfq_queue deemed as interactive, the duration of weight
+ * raising is dynamically adjusted, as a function of the estimated
+ * peak rate of the device, so as to be equal to the time needed to
+ * execute the 'largest' interactive task we benchmarked so far. By
+ * largest task, we mean the task for which each involved process has
+ * to do more I/O than for any of the other tasks we benchmarked. This
+ * reference interactive task is the start-up of LibreOffice Writer,
+ * and in this task each process/bfq_queue needs to have at most ~110K
+ * sectors transferred.
+ *
+ * This last piece of information enables BFQ to reduce the actual
+ * duration of weight-raising for at least one class of I/O-bound
+ * applications: those doing sequential or quasi-sequential I/O. An
+ * example is file copy. In fact, once started, the main I/O-bound
+ * processes of these applications usually consume the above 110K
+ * sectors in much less time than the processes of an application that
+ * is starting, because these I/O-bound processes will greedily devote
+ * almost all their CPU cycles only to their target,
+ * throughput-friendly I/O operations. This is even more true if BFQ
+ * happens to be underestimating the device peak rate, and thus
+ * overestimating the duration of weight raising. But, according to
+ * our measurements, once transferred 110K sectors, these processes
+ * have no right to be weight-raised any longer.
+ *
+ * Based on the last consideration, BFQ ends weight-raising for a
+ * bfq_queue if the latter happens to have received an amount of
+ * service at least equal to the following constant. The constant is
+ * set to slightly more than 110K, to have a minimum safety margin.
+ *
+ * This early ending of weight-raising reduces the amount of time
+ * during which interactive false positives cause the two problems
+ * described at the beginning of these comments.
+ */
+static const unsigned long max_service_from_wr = 120000;
+
#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0])
#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
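To put max_service_from_wr on a concrete scale (an illustrative aside, not part of the patch): assuming, as elsewhere in the block layer, 512-byte sectors as the unit of service, 120000 sectors amount to roughly 58 MiB of served I/O, just above the ~110K sectors quoted for the reference start-up task. A minimal standalone sketch of the arithmetic:

#include <stdio.h>

int main(void)
{
        unsigned long sectors = 120000;                 /* max_service_from_wr */
        unsigned long long bytes = sectors * 512ULL;    /* assuming 512-byte sectors */

        /* 61,440,000 bytes, i.e. ~58 MiB of service before wr is ended early */
        printf("%llu bytes (~%llu MiB)\n", bytes, bytes >> 20);
        return 0;
}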
@@ -402,6 +473,82 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,
}
}
+/*
+ * See the comments on bfq_limit_depth for the purpose of
+ * the depths set in the function.
+ */
+static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
+{
+ bfqd->sb_shift = bt->sb.shift;
+
+ /*
+ * In-word depths if no bfq_queue is being weight-raised:
+ * leaving 25% of tags only for sync reads.
+ *
+ * In next formulas, right-shift the value
+ * (1U<<bfqd->sb_shift), instead of computing directly
+ * (1U<<(bfqd->sb_shift - something)), to be robust against
+ * any possible value of bfqd->sb_shift, without having to
+ * limit 'something'.
+ */
+ /* no more than 50% of tags for async I/O */
+ bfqd->word_depths[0][0] = max((1U<<bfqd->sb_shift)>>1, 1U);
+ /*
+ * no more than 75% of tags for sync writes (25% extra tags
+ * w.r.t. async I/O, to prevent async I/O from starving sync
+ * writes)
+ */
+ bfqd->word_depths[0][1] = max(((1U<<bfqd->sb_shift) * 3)>>2, 1U);
+
+ /*
+ * In-word depths in case some bfq_queue is being weight-
+ * raised: leaving ~63% of tags for sync reads. This is the
+ * highest percentage for which, in our tests, application
+ * start-up times didn't suffer from any regression due to tag
+ * shortage.
+ */
+ /* no more than ~18% of tags for async I/O */
+ bfqd->word_depths[1][0] = max(((1U<<bfqd->sb_shift) * 3)>>4, 1U);
+ /* no more than ~37% of tags for sync writes (~20% extra tags) */
+ bfqd->word_depths[1][1] = max(((1U<<bfqd->sb_shift) * 6)>>4, 1U);
+}
+
+/*
+ * Async I/O can easily starve sync I/O (both sync reads and sync
+ * writes), by consuming all tags. Similarly, storms of sync writes,
+ * such as those that sync(2) may trigger, can starve sync reads.
+ * Limit depths of async I/O and sync writes so as to counter both
+ * problems.
+ */
+static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
+{
+ struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+ struct bfq_data *bfqd = data->q->elevator->elevator_data;
+ struct sbitmap_queue *bt;
+
+ if (op_is_sync(op) && !op_is_write(op))
+ return;
+
+ if (data->flags & BLK_MQ_REQ_RESERVED) {
+ if (unlikely(!tags->nr_reserved_tags)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+ bt = &tags->breserved_tags;
+ } else
+ bt = &tags->bitmap_tags;
+
+ if (unlikely(bfqd->sb_shift != bt->sb.shift))
+ bfq_update_depths(bfqd, bt);
+
+ data->shallow_depth =
+ bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
+
+ bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
+ __func__, bfqd->wr_busy_queues, op_is_sync(op),
+ data->shallow_depth);
+}
+
static struct bfq_queue *
bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
sector_t sector, struct rb_node **ret_parent,
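To make the percentages in the bfq_update_depths() comment concrete, here is a standalone sketch (not part of the patch) that evaluates the four shift expressions for a hypothetical sbitmap word shift of 6, i.e. 64 tags per word:

#include <stdio.h>

int main(void)
{
        unsigned int sb_shift = 6;      /* hypothetical: 64 tags per sbitmap word */
        unsigned int d[2][2];

        d[0][0] = (1U << sb_shift) >> 1;        /* no wr queue, async:      32 (50%)  */
        d[0][1] = ((1U << sb_shift) * 3) >> 2;  /* no wr queue, sync write: 48 (75%)  */
        d[1][0] = ((1U << sb_shift) * 3) >> 4;  /* wr active,   async:      12 (~18%) */
        d[1][1] = ((1U << sb_shift) * 6) >> 4;  /* wr active,   sync write: 24 (~37%) */

        /* the patch additionally clamps each value to at least 1 via max() */
        printf("%u %u %u %u\n", d[0][0], d[0][1], d[1][0], d[1][1]);
        return 0;
}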
@@ -443,6 +590,13 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
return bfqq;
}
+static bool bfq_too_late_for_merging(struct bfq_queue *bfqq)
+{
+ return bfqq->service_from_backlogged > 0 &&
+ time_is_before_jiffies(bfqq->first_IO_time +
+ bfq_merge_time_limit);
+}
+
void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
struct rb_node **p, *parent;
@@ -453,6 +607,14 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfqq->pos_root = NULL;
}
+ /*
+ * bfqq cannot be merged any longer (see comments in
+ * bfq_setup_cooperator): no point in adding bfqq into the
+ * position tree.
+ */
+ if (bfq_too_late_for_merging(bfqq))
+ return;
+
if (bfq_class_idle(bfqq))
return;
if (!bfqq->next_rq)
@@ -724,6 +886,44 @@ static void bfq_updated_next_req(struct bfq_data *bfqd,
}
}
+static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
+{
+ u64 dur;
+
+ if (bfqd->bfq_wr_max_time > 0)
+ return bfqd->bfq_wr_max_time;
+
+ dur = bfqd->RT_prod;
+ do_div(dur, bfqd->peak_rate);
+
+ /*
+ * Limit duration between 3 and 13 seconds. Tests show that
+ * higher values than 13 seconds often yield the opposite of
+ * the desired result, i.e., worsen responsiveness by letting
+ * non-interactive and non-soft-real-time applications
+ * preserve weight raising for a too long time interval.
+ *
+ * On the other end, lower values than 3 seconds make it
+ * difficult for most interactive tasks to complete their jobs
+ * before weight-raising finishes.
+ */
+ if (dur > msecs_to_jiffies(13000))
+ dur = msecs_to_jiffies(13000);
+ else if (dur < msecs_to_jiffies(3000))
+ dur = msecs_to_jiffies(3000);
+
+ return dur;
+}
+
+/* switch back from soft real-time to interactive weight raising */
+static void switch_back_to_interactive_wr(struct bfq_queue *bfqq,
+ struct bfq_data *bfqd)
+{
+ bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+ bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt;
+}
+
static void
bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
struct bfq_io_cq *bic, bool bfq_already_existing)
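The 3-to-13-second clamp in bfq_wr_duration() can be exercised with made-up numbers; a minimal sketch (all values hypothetical, not from the patch):

#include <stdio.h>

int main(void)
{
        /* hypothetical R*T product and peak rate, chosen so that R*T/r = 20 s */
        unsigned long long rt_prod = 4500000000ULL;
        unsigned long long peak_rate = 225000ULL;
        unsigned long long dur_ms = rt_prod / peak_rate;        /* 20000 ms */

        if (dur_ms > 13000)             /* too long: cap at 13 s */
                dur_ms = 13000;
        else if (dur_ms < 3000)         /* too short: raise to 3 s */
                dur_ms = 3000;

        printf("%llu ms\n", dur_ms);    /* prints 13000 */
        return 0;
}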
@@ -750,10 +950,16 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) ||
time_is_before_jiffies(bfqq->last_wr_start_finish +
bfqq->wr_cur_max_time))) {
- bfq_log_bfqq(bfqq->bfqd, bfqq,
- "resume state: switching off wr");
-
- bfqq->wr_coeff = 1;
+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
+ !bfq_bfqq_in_large_burst(bfqq) &&
+ time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt +
+ bfq_wr_duration(bfqd))) {
+ switch_back_to_interactive_wr(bfqq, bfqd);
+ } else {
+ bfqq->wr_coeff = 1;
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "resume state: switching off wr");
+ }
}
/* make sure weight will be updated, however we got here */
@@ -1173,33 +1379,22 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
return wr_or_deserves_wr;
}
-static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
+/*
+ * Return the farthest future time instant according to jiffies
+ * macros.
+ */
+static unsigned long bfq_greatest_from_now(void)
{
- u64 dur;
-
- if (bfqd->bfq_wr_max_time > 0)
- return bfqd->bfq_wr_max_time;
-
- dur = bfqd->RT_prod;
- do_div(dur, bfqd->peak_rate);
-
- /*
- * Limit duration between 3 and 13 seconds. Tests show that
- * higher values than 13 seconds often yield the opposite of
- * the desired result, i.e., worsen responsiveness by letting
- * non-interactive and non-soft-real-time applications
- * preserve weight raising for a too long time interval.
- *
- * On the other end, lower values than 3 seconds make it
- * difficult for most interactive tasks to complete their jobs
- * before weight-raising finishes.
- */
- if (dur > msecs_to_jiffies(13000))
- dur = msecs_to_jiffies(13000);
- else if (dur < msecs_to_jiffies(3000))
- dur = msecs_to_jiffies(3000);
+ return jiffies + MAX_JIFFY_OFFSET;
+}
- return dur;
+/*
+ * Return the farthest past time instant according to jiffies
+ * macros.
+ */
+static unsigned long bfq_smallest_from_now(void)
+{
+ return jiffies - MAX_JIFFY_OFFSET;
}
static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
@@ -1213,10 +1408,23 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
if (old_wr_coeff == 1 && wr_or_deserves_wr) {
/* start a weight-raising period */
if (interactive) {
+ bfqq->service_from_wr = 0;
bfqq->wr_coeff = bfqd->bfq_wr_coeff;
bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
} else {
- bfqq->wr_start_at_switch_to_srt = jiffies;
+ /*
+ * No interactive weight raising in progress
+ * here: assign minus infinity to
+ * wr_start_at_switch_to_srt, to make sure
+ * that, at the end of the soft-real-time
+ * weight raising periods that is starting
+ * now, no interactive weight-raising period
+ * may be wrongly considered as still in
+ * progress (and thus actually started by
+ * mistake).
+ */
+ bfqq->wr_start_at_switch_to_srt =
+ bfq_smallest_from_now();
bfqq->wr_coeff = bfqd->bfq_wr_coeff *
BFQ_SOFTRT_WEIGHT_FACTOR;
bfqq->wr_cur_max_time =
@@ -1313,7 +1521,6 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
bfqq->ttime.last_end_request +
bfqd->bfq_slice_idle * 3;
- bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags);
/*
* bfqq deserves to be weight-raised if:
@@ -1582,12 +1789,13 @@ static void bfq_remove_request(struct request_queue *q,
rb_erase(&bfqq->pos_node, bfqq->pos_root);
bfqq->pos_root = NULL;
}
+ } else {
+ bfq_pos_tree_add_move(bfqd, bfqq);
}
if (rq->cmd_flags & REQ_META)
bfqq->meta_pending--;
- bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);
}
static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
@@ -1700,6 +1908,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq,
bfqq->next_rq = rq;
bfq_remove_request(q, next);
+ bfqg_stats_update_io_remove(bfqq_group(bfqq), next->cmd_flags);
spin_unlock_irq(&bfqq->bfqd->lock);
end:
@@ -1888,6 +2097,9 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
struct bfq_queue *new_bfqq)
{
+ if (bfq_too_late_for_merging(new_bfqq))
+ return false;
+
if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) ||
(bfqq->ioprio_class != new_bfqq->ioprio_class))
return false;
@@ -1912,20 +2124,6 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
}
/*
- * If this function returns true, then bfqq cannot be merged. The idea
- * is that true cooperation happens very early after processes start
- * to do I/O. Usually, late cooperations are just accidental false
- * positives. In case bfqq is weight-raised, such false positives
- * would evidently degrade latency guarantees for bfqq.
- */
-static bool wr_from_too_long(struct bfq_queue *bfqq)
-{
- return bfqq->wr_coeff > 1 &&
- time_is_before_jiffies(bfqq->last_wr_start_finish +
- msecs_to_jiffies(100));
-}
-
-/*
* Attempt to schedule a merge of bfqq with the currently in-service
* queue or with a close queue among the scheduled queues. Return
* NULL if no merge was scheduled, a pointer to the shared bfq_queue
@@ -1938,11 +2136,6 @@ static bool wr_from_too_long(struct bfq_queue *bfqq)
* to maintain. Besides, in such a critical condition as an out of memory,
* the benefits of queue merging may be little relevant, or even negligible.
*
- * Weight-raised queues can be merged only if their weight-raising
- * period has just started. In fact cooperating processes are usually
- * started together. Thus, with this filter we avoid false positives
- * that would jeopardize low-latency guarantees.
- *
* WARNING: queue merging may impair fairness among non-weight raised
* queues, for at least two reasons: 1) the original weight of a
* merged queue may change during the merged state, 2) even being the
@@ -1956,12 +2149,24 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
{
struct bfq_queue *in_service_bfqq, *new_bfqq;
+ /*
+ * Prevent bfqq from being merged if it has been created too
+ * long ago. The idea is that true cooperating processes, and
+ * thus their associated bfq_queues, are supposed to be
+ * created shortly after each other. This is the case, e.g.,
+ * for KVM/QEMU and dump I/O threads. Based on this
+ * assumption, the following filtering greatly reduces the
+ * probability that two non-cooperating processes, which just
+ * happen to do close I/O for some short time interval, have
+ * their queues merged by mistake.
+ */
+ if (bfq_too_late_for_merging(bfqq))
+ return NULL;
+
if (bfqq->new_bfqq)
return bfqq->new_bfqq;
- if (!io_struct ||
- wr_from_too_long(bfqq) ||
- unlikely(bfqq == &bfqd->oom_bfqq))
+ if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))
return NULL;
/* If there is only one backlogged queue, don't search. */
@@ -1970,12 +2175,9 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
in_service_bfqq = bfqd->in_service_queue;
- if (!in_service_bfqq || in_service_bfqq == bfqq
- || wr_from_too_long(in_service_bfqq) ||
- unlikely(in_service_bfqq == &bfqd->oom_bfqq))
- goto check_scheduled;
-
- if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
+ if (in_service_bfqq && in_service_bfqq != bfqq &&
+ likely(in_service_bfqq != &bfqd->oom_bfqq) &&
+ bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
bfqq->entity.parent == in_service_bfqq->entity.parent &&
bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
@@ -1987,12 +2189,10 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
* queues. The only thing we need is that the bio/request is not
* NULL, as we need it to establish whether a cooperator exists.
*/
-check_scheduled:
new_bfqq = bfq_find_close_cooperator(bfqd, bfqq,
bfq_io_struct_pos(io_struct, request));
- if (new_bfqq && !wr_from_too_long(new_bfqq) &&
- likely(new_bfqq != &bfqd->oom_bfqq) &&
+ if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) &&
bfq_may_be_close_cooperator(bfqq, new_bfqq))
return bfq_setup_merge(bfqq, new_bfqq);
@@ -2016,10 +2216,28 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
- bic->saved_wr_coeff = bfqq->wr_coeff;
- bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
- bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
- bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
+ if (unlikely(bfq_bfqq_just_created(bfqq) &&
+ !bfq_bfqq_in_large_burst(bfqq) &&
+ bfqq->bfqd->low_latency)) {
+ /*
+ * bfqq being merged right after being created: bfqq
+ * would have deserved interactive weight raising, but
+ * did not make it to be set in a weight-raised state,
+ * because of this early merge. Store directly the
+ * weight-raising state that would have been assigned
+ * to bfqq, so that to avoid that bfqq unjustly fails
+ * to enjoy weight raising if split soon.
+ */
+ bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff;
+ bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd);
+ bic->saved_last_wr_start_finish = jiffies;
+ } else {
+ bic->saved_wr_coeff = bfqq->wr_coeff;
+ bic->saved_wr_start_at_switch_to_srt =
+ bfqq->wr_start_at_switch_to_srt;
+ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
+ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
+ }
}
static void
@@ -2166,7 +2384,6 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
struct bfq_queue *bfqq)
{
if (bfqq) {
- bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
bfq_clear_bfqq_fifo_expire(bfqq);
bfqd->budgets_assigned = (bfqd->budgets_assigned * 7 + 256) / 8;
@@ -2856,63 +3073,87 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
* whereas soft_rt_next_start is set to infinity for applications that do
* not.
*
- * Unfortunately, even a greedy application may happen to behave in an
- * isochronous way if the CPU load is high. In fact, the application may
- * stop issuing requests while the CPUs are busy serving other processes,
- * then restart, then stop again for a while, and so on. In addition, if
- * the disk achieves a low enough throughput with the request pattern
- * issued by the application (e.g., because the request pattern is random
- * and/or the device is slow), then the application may meet the above
- * bandwidth requirement too. To prevent such a greedy application to be
- * deemed as soft real-time, a further rule is used in the computation of
- * soft_rt_next_start: soft_rt_next_start must be higher than the current
- * time plus the maximum time for which the arrival of a request is waited
- * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
- * This filters out greedy applications, as the latter issue instead their
- * next request as soon as possible after the last one has been completed
- * (in contrast, when a batch of requests is completed, a soft real-time
- * application spends some time processing data).
+ * Unfortunately, even a greedy (i.e., I/O-bound) application may
+ * happen to meet, occasionally or systematically, both the above
+ * bandwidth and isochrony requirements. This may happen at least in
+ * the following circumstances. First, if the CPU load is high. The
+ * application may stop issuing requests while the CPUs are busy
+ * serving other processes, then restart, then stop again for a while,
+ * and so on. The other circumstances are related to the storage
+ * device: the storage device is highly loaded or reaches a low-enough
+ * throughput with the I/O of the application (e.g., because the I/O
+ * is random and/or the device is slow). In all these cases, the
+ * I/O of the application may be simply slowed down enough to meet
+ * the bandwidth and isochrony requirements. To reduce the probability
+ * that greedy applications are deemed as soft real-time in these
+ * corner cases, a further rule is used in the computation of
+ * soft_rt_next_start: the return value of this function is forced to
+ * be higher than the maximum between the following two quantities.
+ *
+ * (a) Current time plus: (1) the maximum time for which the arrival
+ * of a request is waited for when a sync queue becomes idle,
+ * namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We
+ * postpone for a moment the reason for adding a few extra
+ * jiffies; we get back to it after next item (b). Lower-bounding
+ * the return value of this function with the current time plus
+ * bfqd->bfq_slice_idle tends to filter out greedy applications,
+ * because the latter issue their next request as soon as possible
+ * after the last one has been completed. In contrast, a soft
+ * real-time application spends some time processing data, after a
+ * batch of its requests has been completed.
*
- * Unfortunately, the last filter may easily generate false positives if
- * only bfqd->bfq_slice_idle is used as a reference time interval and one
- * or both the following cases occur:
- * 1) HZ is so low that the duration of a jiffy is comparable to or higher
- * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
- * HZ=100.
+ * (b) Current value of bfqq->soft_rt_next_start. As pointed out
+ * above, greedy applications may happen to meet both the
+ * bandwidth and isochrony requirements under heavy CPU or
+ * storage-device load. In more detail, in these scenarios, these
+ * applications happen, only for limited time periods, to do I/O
+ * slowly enough to meet all the requirements described so far,
+ * including the filtering in above item (a). These slow-speed
+ * time intervals are usually interspersed between other time
+ * intervals during which these applications do I/O at a very high
+ * speed. Fortunately, exactly because of the high speed of the
+ * I/O in the high-speed intervals, the values returned by this
+ * function happen to be so high, near the end of any such
+ * high-speed interval, to be likely to fall *after* the end of
+ * the low-speed time interval that follows. These high values are
+ * stored in bfqq->soft_rt_next_start after each invocation of
+ * this function. As a consequence, if the last value of
+ * bfqq->soft_rt_next_start is constantly used to lower-bound the
+ * next value that this function may return, then, from the very
+ * beginning of a low-speed interval, bfqq->soft_rt_next_start is
+ * likely to be constantly kept so high that any I/O request
+ * issued during the low-speed interval is considered as arriving
+ * too soon for the application to be deemed as soft
+ * real-time. Then, in the high-speed interval that follows, the
+ * application will not be deemed as soft real-time, just because
+ * it will do I/O at a high speed. And so on.
+ *
+ * Getting back to the filtering in item (a), in the following two
+ * cases this filtering might be easily passed by a greedy
+ * application, if the reference quantity was just
+ * bfqd->bfq_slice_idle:
+ * 1) HZ is so low that the duration of a jiffy is comparable to or
+ * higher than bfqd->bfq_slice_idle. This happens, e.g., on slow
+ * devices with HZ=100. The time granularity may be so coarse
+ * that the approximation, in jiffies, of bfqd->bfq_slice_idle
+ * is rather lower than the exact value.
* 2) jiffies, instead of increasing at a constant rate, may stop increasing
* for a while, then suddenly 'jump' by several units to recover the lost
* increments. This seems to happen, e.g., inside virtual machines.
- * To address this issue, we do not use as a reference time interval just
- * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
- * particular we add the minimum number of jiffies for which the filter
- * seems to be quite precise also in embedded systems and KVM/QEMU virtual
- * machines.
+ * To address this issue, in the filtering in (a) we do not use as a
+ * reference time interval just bfqd->bfq_slice_idle, but
+ * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the
+ * minimum number of jiffies for which the filter seems to be quite
+ * precise also in embedded systems and KVM/QEMU virtual machines.
*/
static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
struct bfq_queue *bfqq)
{
- return max(bfqq->last_idle_bklogged +
- HZ * bfqq->service_from_backlogged /
- bfqd->bfq_wr_max_softrt_rate,
- jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
-}
-
-/*
- * Return the farthest future time instant according to jiffies
- * macros.
- */
-static unsigned long bfq_greatest_from_now(void)
-{
- return jiffies + MAX_JIFFY_OFFSET;
-}
-
-/*
- * Return the farthest past time instant according to jiffies
- * macros.
- */
-static unsigned long bfq_smallest_from_now(void)
-{
- return jiffies - MAX_JIFFY_OFFSET;
+ return max3(bfqq->soft_rt_next_start,
+ bfqq->last_idle_bklogged +
+ HZ * bfqq->service_from_backlogged /
+ bfqd->bfq_wr_max_softrt_rate,
+ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
}
/**
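A worked instance of the max3() bound computed above, with made-up values (HZ = 250, slice_idle of about 8 ms, i.e. 2 jiffies, and a queue that consumed 1400 sectors against a 7000 sectors/s soft real-time ceiling); here the bandwidth term dominates:

#include <stdio.h>

#define HZ 250  /* hypothetical */

static unsigned long max3ul(unsigned long a, unsigned long b, unsigned long c)
{
        unsigned long m = a > b ? a : b;
        return m > c ? m : c;
}

int main(void)
{
        unsigned long now = 100000;                     /* current jiffies, made up */
        unsigned long soft_rt_next_start = now - 100;   /* previous value, already past */
        unsigned long last_idle_bklogged = now - 10;
        unsigned long service_from_backlogged = 1400;   /* sectors since idle->busy */
        unsigned long wr_max_softrt_rate = 7000;        /* sectors/s, hypothetical */
        unsigned long slice_idle_jiffies = 2;           /* ~8 ms at HZ = 250 */

        unsigned long next = max3ul(soft_rt_next_start,
                        last_idle_bklogged +
                                HZ * service_from_backlogged / wr_max_softrt_rate,
                        now + slice_idle_jiffies + 4);

        /* bandwidth term: now + 40; idle term: now + 6; result: now + 40 */
        printf("next - now = %lu jiffies\n", next - now);
        return 0;
}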
@@ -2957,17 +3198,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);
/*
- * Increase service_from_backlogged before next statement,
- * because the possible next invocation of
- * bfq_bfqq_charge_time would likely inflate
- * entity->service. In contrast, service_from_backlogged must
- * contain real service, to enable the soft real-time
- * heuristic to correctly compute the bandwidth consumed by
- * bfqq.
- */
- bfqq->service_from_backlogged += entity->service;
-
- /*
* As above explained, charge slow (typically seeky) and
* timed-out queues with the time and not the service
* received, to favor sequential workloads.
@@ -3425,7 +3655,6 @@ check_queue:
*/
bfq_clear_bfqq_wait_request(bfqq);
hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
- bfqg_stats_update_idle_time(bfqq_group(bfqq));
}
goto keep_queue;
}
@@ -3489,14 +3718,16 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfq_wr_duration(bfqd)))
bfq_bfqq_end_wr(bfqq);
else {
- /* switch back to interactive wr */
- bfqq->wr_coeff = bfqd->bfq_wr_coeff;
- bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
- bfqq->last_wr_start_finish =
- bfqq->wr_start_at_switch_to_srt;
+ switch_back_to_interactive_wr(bfqq, bfqd);
bfqq->entity.prio_changed = 1;
}
}
+ if (bfqq->wr_coeff > 1 &&
+ bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time &&
+ bfqq->service_from_wr > max_service_from_wr) {
+ /* see comments on max_service_from_wr */
+ bfq_bfqq_end_wr(bfqq);
+ }
}
/*
* To improve latency (for this or other queues), immediately
@@ -3592,20 +3823,22 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
}
/*
- * We exploit the put_rq_private hook to decrement
- * rq_in_driver, but put_rq_private will not be
- * invoked on this request. So, to avoid unbalance,
- * just start this request, without incrementing
- * rq_in_driver. As a negative consequence,
- * rq_in_driver is deceptively lower than it should be
- * while this request is in service. This may cause
- * bfq_schedule_dispatch to be invoked uselessly.
+ * We exploit the bfq_finish_requeue_request hook to
+ * decrement rq_in_driver, but
+ * bfq_finish_requeue_request will not be invoked on
+ * this request. So, to avoid unbalance, just start
+ * this request, without incrementing rq_in_driver. As
+ * a negative consequence, rq_in_driver is deceptively
+ * lower than it should be while this request is in
+ * service. This may cause bfq_schedule_dispatch to be
+ * invoked uselessly.
*
* As for implementing an exact solution, the
- * put_request hook, if defined, is probably invoked
- * also on this request. So, by exploiting this hook,
- * we could 1) increment rq_in_driver here, and 2)
- * decrement it in put_request. Such a solution would
+ * bfq_finish_requeue_request hook, if defined, is
+ * probably invoked also on this request. So, by
+ * exploiting this hook, we could 1) increment
+ * rq_in_driver here, and 2) decrement it in
+ * bfq_finish_requeue_request. Such a solution would
* let the value of the counter be always accurate,
* but it would entail using an extra interface
* function. This cost seems higher than the benefit,
@@ -3651,16 +3884,80 @@ exit:
return rq;
}
+#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
+static void bfq_update_dispatch_stats(struct request_queue *q,
+ struct request *rq,
+ struct bfq_queue *in_serv_queue,
+ bool idle_timer_disabled)
+{
+ struct bfq_queue *bfqq = rq ? RQ_BFQQ(rq) : NULL;
+
+ if (!idle_timer_disabled && !bfqq)
+ return;
+
+ /*
+ * rq and bfqq are guaranteed to exist until this function
+ * ends, for the following reasons. First, rq can be
+ * dispatched to the device, and then can be completed and
+ * freed, only after this function ends. Second, rq cannot be
+ * merged (and thus freed because of a merge) any longer,
+ * because it has already started. Thus rq cannot be freed
+ * before this function ends, and, since rq has a reference to
+ * bfqq, the same guarantee holds for bfqq too.
+ *
+ * In addition, the following queue lock guarantees that
+ * bfqq_group(bfqq) exists as well.
+ */
+ spin_lock_irq(q->queue_lock);
+ if (idle_timer_disabled)
+ /*
+ * Since the idle timer has been disabled,
+ * in_serv_queue contained some request when
+ * __bfq_dispatch_request was invoked above, which
+ * implies that rq was picked exactly from
+ * in_serv_queue. Thus in_serv_queue == bfqq, and is
+ * therefore guaranteed to exist because of the above
+ * arguments.
+ */
+ bfqg_stats_update_idle_time(bfqq_group(in_serv_queue));
+ if (bfqq) {
+ struct bfq_group *bfqg = bfqq_group(bfqq);
+
+ bfqg_stats_update_avg_queue_size(bfqg);
+ bfqg_stats_set_start_empty_time(bfqg);
+ bfqg_stats_update_io_remove(bfqg, rq->cmd_flags);
+ }
+ spin_unlock_irq(q->queue_lock);
+}
+#else
+static inline void bfq_update_dispatch_stats(struct request_queue *q,
+ struct request *rq,
+ struct bfq_queue *in_serv_queue,
+ bool idle_timer_disabled) {}
+#endif
+
static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
struct request *rq;
+ struct bfq_queue *in_serv_queue;
+ bool waiting_rq, idle_timer_disabled;
spin_lock_irq(&bfqd->lock);
+ in_serv_queue = bfqd->in_service_queue;
+ waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue);
+
rq = __bfq_dispatch_request(hctx);
+
+ idle_timer_disabled =
+ waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
+
spin_unlock_irq(&bfqd->lock);
+ bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue,
+ idle_timer_disabled);
+
return rq;
}
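The point of the bfq_update_dispatch_stats() split above is ordering: the in-service queue and the idle-timer state are sampled while bfqd->lock is held, and the blkg statistics are touched only afterwards, under q->queue_lock. A generic userspace sketch of that pattern (names invented, not kernel code):

#include <pthread.h>
#include <stdbool.h>

struct sched {
        pthread_mutex_t lock;           /* stand-in for bfqd->lock */
        bool waiting_for_request;       /* stand-in for bfq_bfqq_wait_request() */
        int next_rq;                    /* stand-in for the dispatched request */
};

struct stats {
        pthread_mutex_t lock;           /* stand-in for q->queue_lock */
        unsigned long dispatched;
        unsigned long idle_timer_disablings;
};

static int dispatch_and_account(struct sched *sd, struct stats *st)
{
        bool waiting, idle_timer_disabled;
        int rq;

        pthread_mutex_lock(&sd->lock);
        waiting = sd->waiting_for_request;      /* sample state under the scheduler lock */
        rq = sd->next_rq;                       /* stand-in for __bfq_dispatch_request() */
        sd->waiting_for_request = false;        /* dispatching stops the idle wait */
        idle_timer_disabled = waiting && !sd->waiting_for_request;
        pthread_mutex_unlock(&sd->lock);

        pthread_mutex_lock(&st->lock);          /* stats only after the scheduler lock is dropped */
        st->dispatched++;
        if (idle_timer_disabled)
                st->idle_timer_disablings++;
        pthread_mutex_unlock(&st->lock);

        return rq;
}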
@@ -3685,16 +3982,37 @@ void bfq_put_queue(struct bfq_queue *bfqq)
if (bfqq->ref)
return;
- if (bfq_bfqq_sync(bfqq))
+ if (!hlist_unhashed(&bfqq->burst_list_node)) {
+ hlist_del_init(&bfqq->burst_list_node);
/*
- * The fact that this queue is being destroyed does not
- * invalidate the fact that this queue may have been
- * activated during the current burst. As a consequence,
- * although the queue does not exist anymore, and hence
- * needs to be removed from the burst list if there,
- * the burst size has not to be decremented.
+ * Decrement also burst size after the removal, if the
+ * process associated with bfqq is exiting, and thus
+ * does not contribute to the burst any longer. This
+ * decrement helps filter out false positives of large
+ * bursts, when some short-lived process (often due to
+ * the execution of commands by some service) happens
+ * to start and exit while a complex application is
+ * starting, and thus spawning several processes that
+ * do I/O (and that *must not* be treated as a large
+ * burst, see comments on bfq_handle_burst).
+ *
+ * In particular, the decrement is performed only if:
+ * 1) bfqq is not a merged queue, because, if it is,
+ * then this free of bfqq is not triggered by the exit
+ * of the process bfqq is associated with, but exactly
+ * by the fact that bfqq has just been merged.
+ * 2) burst_size is greater than 0, to handle
+ * unbalanced decrements. Unbalanced decrements may
+ * happen in the following case: bfqq is inserted into
+ * the current burst list--without incrementing
+ * burst_size--because of a split, but the current
+ * burst list is not the burst list bfqq belonged to
+ * (see comments on the case of a split in
+ * bfq_set_request).
*/
- hlist_del_init(&bfqq->burst_list_node);
+ if (bfqq->bic && bfqq->bfqd->burst_size > 0)
+ bfqq->bfqd->burst_size--;
+ }
kmem_cache_free(bfq_pool, bfqq);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
@@ -3888,10 +4206,15 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqq->split_time = bfq_smallest_from_now();
/*
- * Set to the value for which bfqq will not be deemed as
- * soft rt when it becomes backlogged.
+ * To not forget the possibly high bandwidth consumed by a
+ * process/queue in the recent past,
+ * bfq_bfqq_softrt_next_start() returns a value at least equal
+ * to the current value of bfqq->soft_rt_next_start (see
+ * comments on bfq_bfqq_softrt_next_start). Set
+ * soft_rt_next_start to now, to mean that bfqq has consumed
+ * no bandwidth so far.
*/
- bfqq->soft_rt_next_start = bfq_greatest_from_now();
+ bfqq->soft_rt_next_start = jiffies;
/* first request is almost certainly seeky */
bfqq->seek_history = 1;
@@ -4097,7 +4420,6 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
*/
bfq_clear_bfqq_wait_request(bfqq);
hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
- bfqg_stats_update_idle_time(bfqq_group(bfqq));
/*
* The queue is not empty, because a new request just
@@ -4112,10 +4434,12 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
}
}
-static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
+/* returns true if it causes the idle timer to be disabled */
+static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
{
struct bfq_queue *bfqq = RQ_BFQQ(rq),
*new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
+ bool waiting, idle_timer_disabled = false;
if (new_bfqq) {
if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
@@ -4127,7 +4451,6 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
new_bfqq->allocated++;
bfqq->allocated--;
new_bfqq->ref++;
- bfq_clear_bfqq_just_created(bfqq);
/*
* If the bic associated with the process
* issuing this request still points to bfqq
@@ -4139,6 +4462,8 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
bfqq, new_bfqq);
+
+ bfq_clear_bfqq_just_created(bfqq);
/*
* rq is about to be enqueued into new_bfqq,
* release rq reference on bfqq
@@ -4148,19 +4473,60 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
bfqq = new_bfqq;
}
+ waiting = bfqq && bfq_bfqq_wait_request(bfqq);
bfq_add_request(rq);
+ idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq);
rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
list_add_tail(&rq->queuelist, &bfqq->fifo);
bfq_rq_enqueued(bfqd, bfqq, rq);
+
+ return idle_timer_disabled;
}
+#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
+static void bfq_update_insert_stats(struct request_queue *q,
+ struct bfq_queue *bfqq,
+ bool idle_timer_disabled,
+ unsigned int cmd_flags)
+{
+ if (!bfqq)
+ return;
+
+ /*
+ * bfqq still exists, because it can disappear only after
+ * either it is merged with another queue, or the process it
+ * is associated with exits. But both actions must be taken by
+ * the same process currently executing this flow of
+ * instructions.
+ *
+ * In addition, the following queue lock guarantees that
+ * bfqq_group(bfqq) exists as well.
+ */
+ spin_lock_irq(q->queue_lock);
+ bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags);
+ if (idle_timer_disabled)
+ bfqg_stats_update_idle_time(bfqq_group(bfqq));
+ spin_unlock_irq(q->queue_lock);
+}
+#else
+static inline void bfq_update_insert_stats(struct request_queue *q,
+ struct bfq_queue *bfqq,
+ bool idle_timer_disabled,
+ unsigned int cmd_flags) {}
+#endif
+
+static void bfq_prepare_request(struct request *rq, struct bio *bio);
+
static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
bool at_head)
{
struct request_queue *q = hctx->queue;
struct bfq_data *bfqd = q->elevator->elevator_data;
+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
+ bool idle_timer_disabled = false;
+ unsigned int cmd_flags;
spin_lock_irq(&bfqd->lock);
if (blk_mq_sched_try_insert_merge(q, rq)) {
@@ -4179,7 +4545,25 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
else
list_add_tail(&rq->queuelist, &bfqd->dispatch);
} else {
- __bfq_insert_request(bfqd, rq);
+ if (WARN_ON_ONCE(!bfqq)) {
+ /*
+ * This should never happen. Most likely rq is
+ * a requeued regular request, being
+ * re-inserted without being first
+ * re-prepared. Do a prepare, to avoid
+ * failure.
+ */
+ bfq_prepare_request(rq, rq->bio);
+ bfqq = RQ_BFQQ(rq);
+ }
+
+ idle_timer_disabled = __bfq_insert_request(bfqd, rq);
+ /*
+ * Update bfqq, because, if a queue merge has occurred
+ * in __bfq_insert_request, then rq has been
+ * redirected into a new queue.
+ */
+ bfqq = RQ_BFQQ(rq);
if (rq_mergeable(rq)) {
elv_rqhash_add(q, rq);
@@ -4188,7 +4572,17 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
}
}
+ /*
+ * Cache cmd_flags before releasing scheduler lock, because rq
+ * may disappear afterwards (for example, because of a request
+ * merge).
+ */
+ cmd_flags = rq->cmd_flags;
+
spin_unlock_irq(&bfqd->lock);
+
+ bfq_update_insert_stats(q, bfqq, idle_timer_disabled,
+ cmd_flags);
}
static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
@@ -4319,22 +4713,44 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
bfq_schedule_dispatch(bfqd);
}
-static void bfq_put_rq_priv_body(struct bfq_queue *bfqq)
+static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq)
{
bfqq->allocated--;
bfq_put_queue(bfqq);
}
-static void bfq_finish_request(struct request *rq)
+/*
+ * Handle either a requeue or a finish for rq. The things to do are
+ * the same in both cases: all references to rq are to be dropped. In
+ * particular, rq is considered completed from the point of view of
+ * the scheduler.
+ */
+static void bfq_finish_requeue_request(struct request *rq)
{
- struct bfq_queue *bfqq;
+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
struct bfq_data *bfqd;
- if (!rq->elv.icq)
+ /*
+ * Requeue and finish hooks are invoked in blk-mq without
+ * checking whether the involved request is actually still
+ * referenced in the scheduler. To handle this fact, the
+ * following two checks make this function exit in case of
+ * spurious invocations, for which there is nothing to do.
+ *
+ * First, check whether rq has nothing to do with an elevator.
+ */
+ if (unlikely(!(rq->rq_flags & RQF_ELVPRIV)))
+ return;
+
+ /*
+ * rq either is not associated with any icq, or is an already
+ * requeued request that has not (yet) been re-inserted into
+ * a bfq_queue.
+ */
+ if (!rq->elv.icq || !bfqq)
return;
- bfqq = RQ_BFQQ(rq);
bfqd = bfqq->bfqd;
if (rq->rq_flags & RQF_STARTED)
@@ -4349,13 +4765,14 @@ static void bfq_finish_request(struct request *rq)
spin_lock_irqsave(&bfqd->lock, flags);
bfq_completed_request(bfqq, bfqd);
- bfq_put_rq_priv_body(bfqq);
+ bfq_finish_requeue_request_body(bfqq);
spin_unlock_irqrestore(&bfqd->lock, flags);
} else {
/*
* Request rq may be still/already in the scheduler,
- * in which case we need to remove it. And we cannot
+ * in which case we need to remove it (this should
+ * never happen in case of requeue). And we cannot
* defer such a check and removal, to avoid
* inconsistencies in the time interval from the end
* of this function to the start of the deferred work.
@@ -4365,11 +4782,31 @@ static void bfq_finish_request(struct request *rq)
* lock is held.
*/
- if (!RB_EMPTY_NODE(&rq->rb_node))
+ if (!RB_EMPTY_NODE(&rq->rb_node)) {
bfq_remove_request(rq->q, rq);
- bfq_put_rq_priv_body(bfqq);
+ bfqg_stats_update_io_remove(bfqq_group(bfqq),
+ rq->cmd_flags);
+ }
+ bfq_finish_requeue_request_body(bfqq);
}
+ /*
+ * Reset private fields. In case of a requeue, this allows
+ * this function to correctly do nothing if it is spuriously
+ * invoked again on this same request (see the check at the
+ * beginning of the function). Probably, a better general
+ * design would be to prevent blk-mq from invoking the requeue
+ * or finish hooks of an elevator, for a request that is not
+ * referred by that elevator.
+ *
+ * Resetting the following fields would break the
+ * request-insertion logic if rq is re-inserted into a bfq
+ * internal queue, without a re-preparation. Here we assume
+ * that re-insertions of requeued requests, without
+ * re-preparation, can happen only for pass_through or at_head
+ * requests (which are not re-inserted into bfq internal
+ * queues).
+ */
rq->elv.priv[0] = NULL;
rq->elv.priv[1] = NULL;
}
@@ -4424,6 +4861,34 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
else {
bfq_clear_bfqq_in_large_burst(bfqq);
if (bic->was_in_burst_list)
+ /*
+ * If bfqq was in the current
+ * burst list before being
+ * merged, then we have to add
+ * it back. And we do not need
+ * to increase burst_size, as
+ * we did not decrement
+ * burst_size when we removed
+ * bfqq from the burst list as
+ * a consequence of a merge
+ * (see comments in
+ * bfq_put_queue). In this
+ * respect, it would be rather
+ * costly to know whether the
+ * current burst list is still
+ * the same burst list from
+ * which bfqq was removed on
+ * the merge. To avoid this
+ * cost, if bfqq was in a
+ * burst list, then we add
+ * bfqq to the current burst
+ * list without any further
+ * check. This can cause
+ * inappropriate insertions,
+ * but rarely enough to not
+ * harm the detection of large
+ * bursts significantly.
+ */
hlist_add_head(&bfqq->burst_list_node,
&bfqd->burst_list);
}
@@ -4624,6 +5089,9 @@ static void bfq_exit_queue(struct elevator_queue *e)
hrtimer_cancel(&bfqd->idle_slice_timer);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ /* release oom-queue reference to root group */
+ bfqg_and_blkg_put(bfqd->root_group);
+
blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
#else
spin_lock_irq(&bfqd->lock);
@@ -4775,7 +5243,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
bfq_init_root_group(bfqd->root_group, bfqd);
bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
-
+ wbt_disable_default(q);
return 0;
out_free:
@@ -5012,8 +5480,10 @@ static struct elv_fs_entry bfq_attrs[] = {
static struct elevator_type iosched_bfq_mq = {
.ops.mq = {
+ .limit_depth = bfq_limit_depth,
.prepare_request = bfq_prepare_request,
- .finish_request = bfq_finish_request,
+ .requeue_request = bfq_finish_requeue_request,
+ .finish_request = bfq_finish_requeue_request,
.exit_icq = bfq_exit_icq,
.insert_requests = bfq_insert_requests,
.dispatch_request = bfq_dispatch_request,
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index ac0809c72c98..350c39ae2896 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -337,6 +337,11 @@ struct bfq_queue {
* last transition from idle to backlogged.
*/
unsigned long service_from_backlogged;
+ /*
+ * Cumulative service received from the @bfq_queue since its
+ * last transition to weight-raised state.
+ */
+ unsigned long service_from_wr;
/*
* Value of wr start time when switching to soft rt
@@ -344,6 +349,8 @@ struct bfq_queue {
unsigned long wr_start_at_switch_to_srt;
unsigned long split_time; /* time of last split */
+
+ unsigned long first_IO_time; /* time of first I/O for this queue */
};
/**
@@ -627,6 +634,18 @@ struct bfq_data {
struct bfq_io_cq *bio_bic;
/* bfqq associated with the task issuing current bio for merging */
struct bfq_queue *bio_bfqq;
+
+ /*
+ * Cached sbitmap shift, used to compute depth limits in
+ * bfq_update_depths.
+ */
+ unsigned int sb_shift;
+
+ /*
+ * Depth limits used in bfq_limit_depth (see comments on the
+ * function)
+ */
+ unsigned int word_depths[2][2];
};
enum bfqq_state_flags {
@@ -689,7 +708,7 @@ enum bfqq_expiration {
};
struct bfqg_stats {
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
+#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
/* number of ios merged */
struct blkg_rwstat merged;
/* total time spent on device in ns, may not be accurate w/ queueing */
@@ -717,7 +736,7 @@ struct bfqg_stats {
uint64_t start_idle_time;
uint64_t start_empty_time;
uint16_t flags;
-#endif /* CONFIG_BFQ_GROUP_IOSCHED */
+#endif /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
};
#ifdef CONFIG_BFQ_GROUP_IOSCHED
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index 414ba686a847..4498c43245e2 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -835,6 +835,13 @@ void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
struct bfq_entity *entity = &bfqq->entity;
struct bfq_service_tree *st;
+ if (!bfqq->service_from_backlogged)
+ bfqq->first_IO_time = jiffies;
+
+ if (bfqq->wr_coeff > 1)
+ bfqq->service_from_wr += served;
+
+ bfqq->service_from_backlogged += served;
for_each_entity(entity) {
st = bfq_entity_service_tree(entity);
@@ -843,7 +850,6 @@ void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
st->vtime += bfq_delta(served, st->wsum);
bfq_forget_idle(st);
}
- bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
}
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 5df32907ff3b..9cfdd6c83b5b 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -374,7 +374,6 @@ static void bio_integrity_verify_fn(struct work_struct *work)
/**
* __bio_integrity_endio - Integrity I/O completion function
* @bio: Protected bio
- * @error: Pointer to errno
*
* Description: Completion for integrity I/O
*
@@ -485,11 +484,8 @@ EXPORT_SYMBOL(bioset_integrity_create);
void bioset_integrity_free(struct bio_set *bs)
{
- if (bs->bio_integrity_pool)
- mempool_destroy(bs->bio_integrity_pool);
-
- if (bs->bvec_integrity_pool)
- mempool_destroy(bs->bvec_integrity_pool);
+ mempool_destroy(bs->bio_integrity_pool);
+ mempool_destroy(bs->bvec_integrity_pool);
}
EXPORT_SYMBOL(bioset_integrity_free);
diff --git a/block/bio.c b/block/bio.c
index 101c2a9b5481..e1708db48258 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -400,7 +400,7 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
/**
* bio_alloc_bioset - allocate a bio for I/O
- * @gfp_mask: the GFP_ mask given to the slab allocator
+ * @gfp_mask: the GFP_* mask given to the slab allocator
* @nr_iovecs: number of iovecs to pre-allocate
* @bs: the bio_set to allocate from.
*
@@ -597,7 +597,10 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
* so we don't set nor calculate new physical/hw segment counts here
*/
bio->bi_disk = bio_src->bi_disk;
+ bio->bi_partno = bio_src->bi_partno;
bio_set_flag(bio, BIO_CLONED);
+ if (bio_flagged(bio_src, BIO_THROTTLED))
+ bio_set_flag(bio, BIO_THROTTLED);
bio->bi_opf = bio_src->bi_opf;
bio->bi_write_hint = bio_src->bi_write_hint;
bio->bi_iter = bio_src->bi_iter;
@@ -917,17 +920,9 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
}
EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
-struct submit_bio_ret {
- struct completion event;
- int error;
-};
-
static void submit_bio_wait_endio(struct bio *bio)
{
- struct submit_bio_ret *ret = bio->bi_private;
-
- ret->error = blk_status_to_errno(bio->bi_status);
- complete(&ret->event);
+ complete(bio->bi_private);
}
/**
@@ -943,16 +938,15 @@ static void submit_bio_wait_endio(struct bio *bio)
*/
int submit_bio_wait(struct bio *bio)
{
- struct submit_bio_ret ret;
+ DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map);
- init_completion(&ret.event);
- bio->bi_private = &ret;
+ bio->bi_private = &done;
bio->bi_end_io = submit_bio_wait_endio;
bio->bi_opf |= REQ_SYNC;
submit_bio(bio);
- wait_for_completion_io(&ret.event);
+ wait_for_completion_io(&done);
- return ret.error;
+ return blk_status_to_errno(bio->bi_status);
}
EXPORT_SYMBOL(submit_bio_wait);
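For context, callers are unchanged by this rework; only the internals moved to an on-stack completion. A minimal usage sketch (assumptions: a single-page synchronous read, error handling trimmed) might look like:

    static int example_sync_read_page(struct block_device *bdev,
                                      sector_t sector, struct page *page)
    {
        struct bio *bio = bio_alloc(GFP_KERNEL, 1);
        int ret;

        bio_set_dev(bio, bdev);
        bio->bi_iter.bi_sector = sector;
        bio_set_op_attrs(bio, REQ_OP_READ, 0);
        bio_add_page(bio, page, PAGE_SIZE, 0);

        /* Sleeps on the on-stack completion, returns a -errno value. */
        ret = submit_bio_wait(bio);
        bio_put(bio);
        return ret;
    }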
@@ -977,34 +971,6 @@ void bio_advance(struct bio *bio, unsigned bytes)
EXPORT_SYMBOL(bio_advance);
/**
- * bio_alloc_pages - allocates a single page for each bvec in a bio
- * @bio: bio to allocate pages for
- * @gfp_mask: flags for allocation
- *
- * Allocates pages up to @bio->bi_vcnt.
- *
- * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
- * freed.
- */
-int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
-{
- int i;
- struct bio_vec *bv;
-
- bio_for_each_segment_all(bv, bio, i) {
- bv->bv_page = alloc_page(gfp_mask);
- if (!bv->bv_page) {
- while (--bv >= bio->bi_io_vec)
- __free_page(bv->bv_page);
- return -ENOMEM;
- }
- }
-
- return 0;
-}
-EXPORT_SYMBOL(bio_alloc_pages);
-
-/**
* bio_copy_data - copy contents of data buffers from one chain of bios to
* another
* @src: source bio list
@@ -1070,14 +1036,21 @@ struct bio_map_data {
struct iovec iov[];
};
-static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count,
+static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data,
gfp_t gfp_mask)
{
- if (iov_count > UIO_MAXIOV)
+ struct bio_map_data *bmd;
+ if (data->nr_segs > UIO_MAXIOV)
return NULL;
- return kmalloc(sizeof(struct bio_map_data) +
- sizeof(struct iovec) * iov_count, gfp_mask);
+ bmd = kmalloc(sizeof(struct bio_map_data) +
+ sizeof(struct iovec) * data->nr_segs, gfp_mask);
+ if (!bmd)
+ return NULL;
+ memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs);
+ bmd->iter = *data;
+ bmd->iter.iov = bmd->iov;
+ return bmd;
}
/**
@@ -1088,7 +1061,7 @@ static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count,
* Copy all pages from iov_iter to bio.
* Returns 0 on success, or error on failure.
*/
-static int bio_copy_from_iter(struct bio *bio, struct iov_iter iter)
+static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter)
{
int i;
struct bio_vec *bvec;
@@ -1099,9 +1072,9 @@ static int bio_copy_from_iter(struct bio *bio, struct iov_iter iter)
ret = copy_page_from_iter(bvec->bv_page,
bvec->bv_offset,
bvec->bv_len,
- &iter);
+ iter);
- if (!iov_iter_count(&iter))
+ if (!iov_iter_count(iter))
break;
if (ret < bvec->bv_len)
@@ -1195,40 +1168,18 @@ int bio_uncopy_user(struct bio *bio)
*/
struct bio *bio_copy_user_iov(struct request_queue *q,
struct rq_map_data *map_data,
- const struct iov_iter *iter,
+ struct iov_iter *iter,
gfp_t gfp_mask)
{
struct bio_map_data *bmd;
struct page *page;
struct bio *bio;
- int i, ret;
- int nr_pages = 0;
+ int i = 0, ret;
+ int nr_pages;
unsigned int len = iter->count;
unsigned int offset = map_data ? offset_in_page(map_data->offset) : 0;
- for (i = 0; i < iter->nr_segs; i++) {
- unsigned long uaddr;
- unsigned long end;
- unsigned long start;
-
- uaddr = (unsigned long) iter->iov[i].iov_base;
- end = (uaddr + iter->iov[i].iov_len + PAGE_SIZE - 1)
- >> PAGE_SHIFT;
- start = uaddr >> PAGE_SHIFT;
-
- /*
- * Overflow, abort
- */
- if (end < start)
- return ERR_PTR(-EINVAL);
-
- nr_pages += end - start;
- }
-
- if (offset)
- nr_pages++;
-
- bmd = bio_alloc_map_data(iter->nr_segs, gfp_mask);
+ bmd = bio_alloc_map_data(iter, gfp_mask);
if (!bmd)
return ERR_PTR(-ENOMEM);
@@ -1238,9 +1189,10 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
* shortlived one.
*/
bmd->is_our_pages = map_data ? 0 : 1;
- memcpy(bmd->iov, iter->iov, sizeof(struct iovec) * iter->nr_segs);
- bmd->iter = *iter;
- bmd->iter.iov = bmd->iov;
+
+ nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
+ if (nr_pages > BIO_MAX_PAGES)
+ nr_pages = BIO_MAX_PAGES;
ret = -ENOMEM;
bio = bio_kmalloc(gfp_mask, nr_pages);
@@ -1289,17 +1241,24 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
if (ret)
goto cleanup;
+ if (map_data)
+ map_data->offset += bio->bi_iter.bi_size;
+
/*
* success
*/
if (((iter->type & WRITE) && (!map_data || !map_data->null_mapped)) ||
(map_data && map_data->from_user)) {
- ret = bio_copy_from_iter(bio, *iter);
+ ret = bio_copy_from_iter(bio, iter);
if (ret)
goto cleanup;
+ } else {
+ iov_iter_advance(iter, bio->bi_iter.bi_size);
}
bio->bi_private = bmd;
+ if (map_data && map_data->null_mapped)
+ bio_set_flag(bio, BIO_NULL_MAPPED);
return bio;
cleanup:
if (!map_data)
@@ -1320,111 +1279,74 @@ out_bmd:
* device. Returns an error pointer in case of error.
*/
struct bio *bio_map_user_iov(struct request_queue *q,
- const struct iov_iter *iter,
+ struct iov_iter *iter,
gfp_t gfp_mask)
{
int j;
- int nr_pages = 0;
- struct page **pages;
struct bio *bio;
- int cur_page = 0;
- int ret, offset;
- struct iov_iter i;
- struct iovec iov;
+ int ret;
struct bio_vec *bvec;
- iov_for_each(iov, i, *iter) {
- unsigned long uaddr = (unsigned long) iov.iov_base;
- unsigned long len = iov.iov_len;
- unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- unsigned long start = uaddr >> PAGE_SHIFT;
-
- /*
- * Overflow, abort
- */
- if (end < start)
- return ERR_PTR(-EINVAL);
-
- nr_pages += end - start;
- /*
- * buffer must be aligned to at least logical block size for now
- */
- if (uaddr & queue_dma_alignment(q))
- return ERR_PTR(-EINVAL);
- }
-
- if (!nr_pages)
+ if (!iov_iter_count(iter))
return ERR_PTR(-EINVAL);
- bio = bio_kmalloc(gfp_mask, nr_pages);
+ bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES));
if (!bio)
return ERR_PTR(-ENOMEM);
- ret = -ENOMEM;
- pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask);
- if (!pages)
- goto out;
+ while (iov_iter_count(iter)) {
+ struct page **pages;
+ ssize_t bytes;
+ size_t offs, added = 0;
+ int npages;
- iov_for_each(iov, i, *iter) {
- unsigned long uaddr = (unsigned long) iov.iov_base;
- unsigned long len = iov.iov_len;
- unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- unsigned long start = uaddr >> PAGE_SHIFT;
- const int local_nr_pages = end - start;
- const int page_limit = cur_page + local_nr_pages;
-
- ret = get_user_pages_fast(uaddr, local_nr_pages,
- (iter->type & WRITE) != WRITE,
- &pages[cur_page]);
- if (unlikely(ret < local_nr_pages)) {
- for (j = cur_page; j < page_limit; j++) {
- if (!pages[j])
- break;
- put_page(pages[j]);
- }
- ret = -EFAULT;
+ bytes = iov_iter_get_pages_alloc(iter, &pages, LONG_MAX, &offs);
+ if (unlikely(bytes <= 0)) {
+ ret = bytes ? bytes : -EFAULT;
goto out_unmap;
}
- offset = offset_in_page(uaddr);
- for (j = cur_page; j < page_limit; j++) {
- unsigned int bytes = PAGE_SIZE - offset;
- unsigned short prev_bi_vcnt = bio->bi_vcnt;
+ npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE);
- if (len <= 0)
- break;
-
- if (bytes > len)
- bytes = len;
-
- /*
- * sorry...
- */
- if (bio_add_pc_page(q, bio, pages[j], bytes, offset) <
- bytes)
- break;
+ if (unlikely(offs & queue_dma_alignment(q))) {
+ ret = -EINVAL;
+ j = 0;
+ } else {
+ for (j = 0; j < npages; j++) {
+ struct page *page = pages[j];
+ unsigned int n = PAGE_SIZE - offs;
+ unsigned short prev_bi_vcnt = bio->bi_vcnt;
- /*
- * check if vector was merged with previous
- * drop page reference if needed
- */
- if (bio->bi_vcnt == prev_bi_vcnt)
- put_page(pages[j]);
+ if (n > bytes)
+ n = bytes;
- len -= bytes;
- offset = 0;
- }
+ if (!bio_add_pc_page(q, bio, page, n, offs))
+ break;
- cur_page = j;
+ /*
+ * check if vector was merged with previous
+ * drop page reference if needed
+ */
+ if (bio->bi_vcnt == prev_bi_vcnt)
+ put_page(page);
+
+ added += n;
+ bytes -= n;
+ offs = 0;
+ }
+ iov_iter_advance(iter, added);
+ }
/*
* release the pages we didn't map into the bio, if any
*/
- while (j < page_limit)
+ while (j < npages)
put_page(pages[j++]);
+ kvfree(pages);
+ /* couldn't stuff something into bio? */
+ if (bytes)
+ break;
}
- kfree(pages);
-
bio_set_flag(bio, BIO_USER_MAPPED);
/*
@@ -1440,8 +1362,6 @@ struct bio *bio_map_user_iov(struct request_queue *q,
bio_for_each_segment_all(bvec, bio, j) {
put_page(bvec->bv_page);
}
- out:
- kfree(pages);
bio_put(bio);
return ERR_PTR(ret);
}
@@ -1873,7 +1793,7 @@ EXPORT_SYMBOL(bio_endio);
struct bio *bio_split(struct bio *bio, int sectors,
gfp_t gfp, struct bio_set *bs)
{
- struct bio *split = NULL;
+ struct bio *split;
BUG_ON(sectors <= 0);
BUG_ON(sectors >= bio_sectors(bio));
@@ -1890,7 +1810,7 @@ struct bio *bio_split(struct bio *bio, int sectors,
bio_advance(bio, split->bi_iter.bi_size);
if (bio_flagged(bio, BIO_TRACE_COMPLETION))
- bio_set_flag(bio, BIO_TRACE_COMPLETION);
+ bio_set_flag(split, BIO_TRACE_COMPLETION);
return split;
}
@@ -1940,11 +1860,8 @@ void bioset_free(struct bio_set *bs)
if (bs->rescue_workqueue)
destroy_workqueue(bs->rescue_workqueue);
- if (bs->bio_pool)
- mempool_destroy(bs->bio_pool);
-
- if (bs->bvec_pool)
- mempool_destroy(bs->bvec_pool);
+ mempool_destroy(bs->bio_pool);
+ mempool_destroy(bs->bvec_pool);
bioset_integrity_free(bs);
bio_put_slab(bs);
@@ -2045,37 +1962,6 @@ int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
EXPORT_SYMBOL_GPL(bio_associate_blkcg);
/**
- * bio_associate_current - associate a bio with %current
- * @bio: target bio
- *
- * Associate @bio with %current if it hasn't been associated yet. Block
- * layer will treat @bio as if it were issued by %current no matter which
- * task actually issues it.
- *
- * This function takes an extra reference of @task's io_context and blkcg
- * which will be put when @bio is released. The caller must own @bio,
- * ensure %current->io_context exists, and is responsible for synchronizing
- * calls to this function.
- */
-int bio_associate_current(struct bio *bio)
-{
- struct io_context *ioc;
-
- if (bio->bi_css)
- return -EBUSY;
-
- ioc = current->io_context;
- if (!ioc)
- return -ENOENT;
-
- get_io_context_active(ioc);
- bio->bi_ioc = ioc;
- bio->bi_css = task_get_css(current, io_cgrp_id);
- return 0;
-}
-EXPORT_SYMBOL_GPL(bio_associate_current);
-
-/**
* bio_disassociate_task - undo bio_associate_current()
* @bio: target bio
*/
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d3f56baee936..c2033a232a44 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -812,7 +812,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
struct gendisk *disk;
struct request_queue *q;
struct blkcg_gq *blkg;
- struct module *owner;
unsigned int major, minor;
int key_len, part, ret;
char *body;
@@ -904,9 +903,7 @@ fail_unlock:
spin_unlock_irq(q->queue_lock);
rcu_read_unlock();
fail:
- owner = disk->fops->owner;
- put_disk(disk);
- module_put(owner);
+ put_disk_and_module(disk);
/*
* If queue was bypassing, we should retry. Do so after a
* short msleep(). It isn't strictly necessary but queue
@@ -931,13 +928,9 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
__releases(ctx->disk->queue->queue_lock) __releases(rcu)
{
- struct module *owner;
-
spin_unlock_irq(ctx->disk->queue->queue_lock);
rcu_read_unlock();
- owner = ctx->disk->fops->owner;
- put_disk(ctx->disk);
- module_put(owner);
+ put_disk_and_module(ctx->disk);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
@@ -1419,6 +1412,11 @@ int blkcg_policy_register(struct blkcg_policy *pol)
if (i >= BLKCG_MAX_POLS)
goto err_unlock;
+	/* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn come in pairs */
+ if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
+ (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
+ goto err_unlock;
+
/* register @pol */
pol->plid = i;
blkcg_policy[pol->plid] = pol;
@@ -1452,7 +1450,7 @@ int blkcg_policy_register(struct blkcg_policy *pol)
return 0;
err_free_cpds:
- if (pol->cpd_alloc_fn) {
+ if (pol->cpd_free_fn) {
list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
if (blkcg->cpd[pol->plid]) {
pol->cpd_free_fn(blkcg->cpd[pol->plid]);
@@ -1492,7 +1490,7 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
/* remove cpds and unregister */
mutex_lock(&blkcg_pol_mutex);
- if (pol->cpd_alloc_fn) {
+ if (pol->cpd_free_fn) {
list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
if (blkcg->cpd[pol->plid]) {
pol->cpd_free_fn(blkcg->cpd[pol->plid]);
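A policy that passes the new pairing check simply wires each alloc callback together with its free callback. The sketch below is hypothetical (the callback bodies and the remaining blkcg_policy fields are omitted); only the field names match the ones tested above:

    static struct blkcg_policy example_blkcg_policy = {
        .cpd_alloc_fn = example_cpd_alloc,  /* paired ...        */
        .cpd_free_fn  = example_cpd_free,   /* ... with its free */
        .pd_alloc_fn  = example_pd_alloc,
        .pd_free_fn   = example_pd_free,
    };

    /*
     * blkcg_policy_register(&example_blkcg_policy) now rejects a policy
     * that wires only one half of either pair (the err_unlock path above).
     */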
diff --git a/block/blk-core.c b/block/blk-core.c
index 048be4aa6024..6d82c4f7fadd 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -34,6 +34,7 @@
#include <linux/pm_runtime.h>
#include <linux/blk-cgroup.h>
#include <linux/debugfs.h>
+#include <linux/bpf.h>
#define CREATE_TRACE_POINTS
#include <trace/events/block.h>
@@ -126,6 +127,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
rq->start_time = jiffies;
set_start_time_ns(rq);
rq->part = NULL;
+ seqcount_init(&rq->gstate_seq);
+ u64_stats_init(&rq->aborted_gstate_sync);
}
EXPORT_SYMBOL(blk_rq_init);
@@ -143,6 +146,7 @@ static const struct {
[BLK_STS_MEDIUM] = { -ENODATA, "critical medium" },
[BLK_STS_PROTECTION] = { -EILSEQ, "protection" },
[BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" },
+ [BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" },
[BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" },
/* device mapper special case, should not leak out: */
@@ -333,11 +337,13 @@ EXPORT_SYMBOL(blk_stop_queue);
void blk_sync_queue(struct request_queue *q)
{
del_timer_sync(&q->timeout);
+ cancel_work_sync(&q->timeout_work);
if (q->mq_ops) {
struct blk_mq_hw_ctx *hctx;
int i;
+ cancel_delayed_work_sync(&q->requeue_work);
queue_for_each_hw_ctx(q, hctx, i)
cancel_delayed_work_sync(&hctx->run_work);
} else {
@@ -347,6 +353,37 @@ void blk_sync_queue(struct request_queue *q)
EXPORT_SYMBOL(blk_sync_queue);
/**
+ * blk_set_preempt_only - set QUEUE_FLAG_PREEMPT_ONLY
+ * @q: request queue pointer
+ *
+ * Returns the previous value of the PREEMPT_ONLY flag - 0 if the flag was not
+ * set and 1 if the flag was already set.
+ */
+int blk_set_preempt_only(struct request_queue *q)
+{
+ unsigned long flags;
+ int res;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ res = queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+
+ return res;
+}
+EXPORT_SYMBOL_GPL(blk_set_preempt_only);
+
+void blk_clear_preempt_only(struct request_queue *q)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q);
+ wake_up_all(&q->mq_freeze_wq);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blk_clear_preempt_only);
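Taken together with the blk_queue_enter() change further down, a runtime-PM style user of these helpers could look like the following sketch (illustrative only; the surrounding suspend/resume logic is assumed, not shown):

    static void example_pm_window(struct request_queue *q)
    {
        blk_set_preempt_only(q);

        /*
         * From here on, blk_queue_enter(q, 0) blocks ordinary callers,
         * while callers passing BLK_MQ_REQ_PREEMPT still get through.
         */

        /* ... issue the RQF_PM requests needed for suspend/resume ... */

        blk_clear_preempt_only(q);   /* also wakes waiters on mq_freeze_wq */
    }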
+
+/**
* __blk_run_queue_uncond - run a queue whether or not it has been stopped
* @q: The queue to run
*
@@ -529,6 +566,13 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all)
}
}
+void blk_drain_queue(struct request_queue *q)
+{
+ spin_lock_irq(q->queue_lock);
+ __blk_drain_queue(q, true);
+ spin_unlock_irq(q->queue_lock);
+}
+
/**
* blk_queue_bypass_start - enter queue bypass mode
* @q: queue of interest
@@ -604,12 +648,15 @@ void blk_set_queue_dying(struct request_queue *q)
spin_lock_irq(q->queue_lock);
blk_queue_for_each_rl(rl, q) {
if (rl->rq_pool) {
- wake_up(&rl->wait[BLK_RW_SYNC]);
- wake_up(&rl->wait[BLK_RW_ASYNC]);
+ wake_up_all(&rl->wait[BLK_RW_SYNC]);
+ wake_up_all(&rl->wait[BLK_RW_ASYNC]);
}
}
spin_unlock_irq(q->queue_lock);
}
+
+ /* Make blk_queue_enter() reexamine the DYING flag. */
+ wake_up_all(&q->mq_freeze_wq);
}
EXPORT_SYMBOL_GPL(blk_set_queue_dying);
@@ -653,11 +700,18 @@ void blk_cleanup_queue(struct request_queue *q)
*/
blk_freeze_queue(q);
spin_lock_irq(lock);
- if (!q->mq_ops)
- __blk_drain_queue(q, true);
queue_flag_set(QUEUE_FLAG_DEAD, q);
spin_unlock_irq(lock);
+ /*
+	 * Make sure all in-progress dispatching has completed:
+	 * blk_freeze_queue() only waits for requests to complete, and
+	 * dispatch may still be in progress since requests are dispatched
+	 * from more than one context.
+ */
+ if (q->mq_ops)
+ blk_mq_quiesce_queue(q);
+
/* for synchronous bio-based driver finish in-flight integrity i/o */
blk_flush_integrity();
@@ -718,7 +772,7 @@ static void free_request_size(void *element, void *data)
int blk_init_rl(struct request_list *rl, struct request_queue *q,
gfp_t gfp_mask)
{
- if (unlikely(rl->rq_pool))
+ if (unlikely(rl->rq_pool) || q->mq_ops)
return 0;
rl->q = q;
@@ -760,15 +814,38 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
}
EXPORT_SYMBOL(blk_alloc_queue);
-int blk_queue_enter(struct request_queue *q, bool nowait)
+/**
+ * blk_queue_enter() - try to increase q->q_usage_counter
+ * @q: request queue pointer
+ * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT
+ */
+int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
{
+ const bool preempt = flags & BLK_MQ_REQ_PREEMPT;
+
while (true) {
+ bool success = false;
int ret;
- if (percpu_ref_tryget_live(&q->q_usage_counter))
+ rcu_read_lock_sched();
+ if (percpu_ref_tryget_live(&q->q_usage_counter)) {
+ /*
+ * The code that sets the PREEMPT_ONLY flag is
+ * responsible for ensuring that that flag is globally
+ * visible before the queue is unfrozen.
+ */
+ if (preempt || !blk_queue_preempt_only(q)) {
+ success = true;
+ } else {
+ percpu_ref_put(&q->q_usage_counter);
+ }
+ }
+ rcu_read_unlock_sched();
+
+ if (success)
return 0;
- if (nowait)
+ if (flags & BLK_MQ_REQ_NOWAIT)
return -EBUSY;
/*
@@ -781,7 +858,8 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
smp_rmb();
ret = wait_event_interruptible(q->mq_freeze_wq,
- !atomic_read(&q->mq_freeze_depth) ||
+ (atomic_read(&q->mq_freeze_depth) == 0 &&
+ (preempt || !blk_queue_preempt_only(q))) ||
blk_queue_dying(q));
if (blk_queue_dying(q))
return -ENODEV;
@@ -803,9 +881,9 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref)
wake_up_all(&q->mq_freeze_wq);
}
-static void blk_rq_timed_out_timer(unsigned long data)
+static void blk_rq_timed_out_timer(struct timer_list *t)
{
- struct request_queue *q = (struct request_queue *)data;
+ struct request_queue *q = from_timer(q, t, timeout);
kblockd_schedule_work(&q->timeout_work);
}
@@ -841,9 +919,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
q->backing_dev_info->name = "block";
q->node = node_id;
- setup_timer(&q->backing_dev_info->laptop_mode_wb_timer,
- laptop_mode_timer_fn, (unsigned long) q);
- setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
+ timer_setup(&q->backing_dev_info->laptop_mode_wb_timer,
+ laptop_mode_timer_fn, 0);
+ timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
+ INIT_WORK(&q->timeout_work, NULL);
INIT_LIST_HEAD(&q->queue_head);
INIT_LIST_HEAD(&q->timeout_list);
INIT_LIST_HEAD(&q->icq_list);
@@ -1154,7 +1233,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
* @rl: request list to allocate from
* @op: operation and flags
* @bio: bio to allocate request for (can be %NULL)
- * @gfp_mask: allocation mask
+ * @flags: BLK_MQ_REQ_* flags
*
* Get a free request from @q. This function may fail under memory
* pressure or if @q is dead.
@@ -1164,7 +1243,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
* Returns request pointer on success, with @q->queue_lock *not held*.
*/
static struct request *__get_request(struct request_list *rl, unsigned int op,
- struct bio *bio, gfp_t gfp_mask)
+ struct bio *bio, blk_mq_req_flags_t flags)
{
struct request_queue *q = rl->q;
struct request *rq;
@@ -1173,6 +1252,8 @@ static struct request *__get_request(struct request_list *rl, unsigned int op,
struct io_cq *icq = NULL;
const bool is_sync = op_is_sync(op);
int may_queue;
+ gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC :
+ __GFP_DIRECT_RECLAIM;
req_flags_t rq_flags = RQF_ALLOCED;
lockdep_assert_held(q->queue_lock);
@@ -1255,6 +1336,8 @@ static struct request *__get_request(struct request_list *rl, unsigned int op,
blk_rq_set_rl(rq, rl);
rq->cmd_flags = op;
rq->rq_flags = rq_flags;
+ if (flags & BLK_MQ_REQ_PREEMPT)
+ rq->rq_flags |= RQF_PREEMPT;
/* init elvpriv */
if (rq_flags & RQF_ELVPRIV) {
@@ -1333,7 +1416,7 @@ rq_starved:
* @q: request_queue to allocate request from
* @op: operation and flags
* @bio: bio to allocate request for (can be %NULL)
- * @gfp_mask: allocation mask
+ * @flags: BLK_MQ_REQ_* flags.
*
* Get a free request from @q. If %__GFP_DIRECT_RECLAIM is set in @gfp_mask,
* this function keeps retrying under memory pressure and fails iff @q is dead.
@@ -1343,7 +1426,7 @@ rq_starved:
* Returns request pointer on success, with @q->queue_lock *not held*.
*/
static struct request *get_request(struct request_queue *q, unsigned int op,
- struct bio *bio, gfp_t gfp_mask)
+ struct bio *bio, blk_mq_req_flags_t flags)
{
const bool is_sync = op_is_sync(op);
DEFINE_WAIT(wait);
@@ -1355,7 +1438,7 @@ static struct request *get_request(struct request_queue *q, unsigned int op,
rl = blk_get_rl(q, bio); /* transferred to @rq on success */
retry:
- rq = __get_request(rl, op, bio, gfp_mask);
+ rq = __get_request(rl, op, bio, flags);
if (!IS_ERR(rq))
return rq;
@@ -1364,7 +1447,7 @@ retry:
return ERR_PTR(-EAGAIN);
}
- if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) {
+ if ((flags & BLK_MQ_REQ_NOWAIT) || unlikely(blk_queue_dying(q))) {
blk_put_rl(rl);
return rq;
}
@@ -1391,20 +1474,28 @@ retry:
goto retry;
}
+/* flags: BLK_MQ_REQ_PREEMPT and/or BLK_MQ_REQ_NOWAIT. */
static struct request *blk_old_get_request(struct request_queue *q,
- unsigned int op, gfp_t gfp_mask)
+ unsigned int op, blk_mq_req_flags_t flags)
{
struct request *rq;
+ gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC :
+ __GFP_DIRECT_RECLAIM;
+ int ret = 0;
WARN_ON_ONCE(q->mq_ops);
/* create ioc upfront */
create_io_context(gfp_mask, q->node);
+ ret = blk_queue_enter(q, flags);
+ if (ret)
+ return ERR_PTR(ret);
spin_lock_irq(q->queue_lock);
- rq = get_request(q, op, NULL, gfp_mask);
+ rq = get_request(q, op, NULL, flags);
if (IS_ERR(rq)) {
spin_unlock_irq(q->queue_lock);
+ blk_queue_exit(q);
return rq;
}
@@ -1415,25 +1506,40 @@ static struct request *blk_old_get_request(struct request_queue *q,
return rq;
}
-struct request *blk_get_request(struct request_queue *q, unsigned int op,
- gfp_t gfp_mask)
+/**
+ * blk_get_request_flags - allocate a request
+ * @q: request queue to allocate a request for
+ * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC.
+ * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT.
+ */
+struct request *blk_get_request_flags(struct request_queue *q, unsigned int op,
+ blk_mq_req_flags_t flags)
{
struct request *req;
+ WARN_ON_ONCE(op & REQ_NOWAIT);
+ WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT));
+
if (q->mq_ops) {
- req = blk_mq_alloc_request(q, op,
- (gfp_mask & __GFP_DIRECT_RECLAIM) ?
- 0 : BLK_MQ_REQ_NOWAIT);
+ req = blk_mq_alloc_request(q, op, flags);
if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn)
q->mq_ops->initialize_rq_fn(req);
} else {
- req = blk_old_get_request(q, op, gfp_mask);
+ req = blk_old_get_request(q, op, flags);
if (!IS_ERR(req) && q->initialize_rq_fn)
q->initialize_rq_fn(req);
}
return req;
}
+EXPORT_SYMBOL(blk_get_request_flags);
+
+struct request *blk_get_request(struct request_queue *q, unsigned int op,
+ gfp_t gfp_mask)
+{
+ return blk_get_request_flags(q, op, gfp_mask & __GFP_DIRECT_RECLAIM ?
+ 0 : BLK_MQ_REQ_NOWAIT);
+}
EXPORT_SYMBOL(blk_get_request);
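A hypothetical power-management caller of the new interface (REQ_OP_DRV_IN chosen arbitrarily; payload setup and execution are omitted) might be:

    static int example_send_pm_cmd(struct request_queue *q)
    {
        struct request *rq;

        rq = blk_get_request_flags(q, REQ_OP_DRV_IN, BLK_MQ_REQ_PREEMPT);
        if (IS_ERR(rq))
            return PTR_ERR(rq);

        rq->rq_flags |= RQF_PM;   /* mark as a power-management request */
        /* ... fill in the driver-specific payload and execute rq ... */
        blk_put_request(rq);
        return 0;
    }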
/**
@@ -1553,6 +1659,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
lockdep_assert_held(q->queue_lock);
+ blk_req_zone_write_unlock(req);
blk_pm_put_request(req);
elv_completed_request(q, req);
@@ -1576,6 +1683,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
blk_free_request(rl, req);
freed_request(rl, sync, rq_flags);
blk_put_rl(rl);
+ blk_queue_exit(q);
}
}
EXPORT_SYMBOL_GPL(__blk_put_request);
@@ -1857,8 +1965,10 @@ get_rq:
	 * Grab a free request. This might sleep but cannot fail.
* Returns with the queue unlocked.
*/
- req = get_request(q, bio->bi_opf, bio, GFP_NOIO);
+ blk_queue_enter_live(q);
+ req = get_request(q, bio->bi_opf, bio, 0);
if (IS_ERR(req)) {
+ blk_queue_exit(q);
__wbt_done(q->rq_wb, wb_acct);
if (PTR_ERR(req) == -ENOMEM)
bio->bi_status = BLK_STS_RESOURCE;
@@ -1959,6 +2069,29 @@ static inline bool should_fail_request(struct hd_struct *part,
#endif /* CONFIG_FAIL_MAKE_REQUEST */
+static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
+{
+ if (part->policy && op_is_write(bio_op(bio))) {
+ char b[BDEVNAME_SIZE];
+
+ printk(KERN_ERR
+ "generic_make_request: Trying to write "
+ "to read-only block-device %s (partno %d)\n",
+ bio_devname(bio, b), part->partno);
+ return true;
+ }
+
+ return false;
+}
+
+static noinline int should_fail_bio(struct bio *bio)
+{
+ if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
+ return -EIO;
+ return 0;
+}
+ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
+
/*
* Remap block n of partition p to block n+start(p) of the disk.
*/
@@ -1967,27 +2100,28 @@ static inline int blk_partition_remap(struct bio *bio)
struct hd_struct *p;
int ret = 0;
+ rcu_read_lock();
+ p = __disk_get_part(bio->bi_disk, bio->bi_partno);
+ if (unlikely(!p || should_fail_request(p, bio->bi_iter.bi_size) ||
+ bio_check_ro(bio, p))) {
+ ret = -EIO;
+ goto out;
+ }
+
/*
* Zone reset does not include bi_size so bio_sectors() is always 0.
* Include a test for the reset op code and perform the remap if needed.
*/
- if (!bio->bi_partno ||
- (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET))
- return 0;
+ if (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET)
+ goto out;
- rcu_read_lock();
- p = __disk_get_part(bio->bi_disk, bio->bi_partno);
- if (likely(p && !should_fail_request(p, bio->bi_iter.bi_size))) {
- bio->bi_iter.bi_sector += p->start_sect;
- bio->bi_partno = 0;
- trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
- bio->bi_iter.bi_sector - p->start_sect);
- } else {
- printk("%s: fail for partition %d\n", __func__, bio->bi_partno);
- ret = -EIO;
- }
- rcu_read_unlock();
+ bio->bi_iter.bi_sector += p->start_sect;
+ bio->bi_partno = 0;
+ trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
+ bio->bi_iter.bi_sector - p->start_sect);
+out:
+ rcu_read_unlock();
return ret;
}
@@ -2046,15 +2180,19 @@ generic_make_request_checks(struct bio *bio)
* For a REQ_NOWAIT based request, return -EOPNOTSUPP
* if queue is not a request based queue.
*/
-
if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q))
goto not_supported;
- if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
+ if (should_fail_bio(bio))
goto end_io;
- if (blk_partition_remap(bio))
- goto end_io;
+ if (!bio->bi_partno) {
+ if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
+ goto end_io;
+ } else {
+ if (blk_partition_remap(bio))
+ goto end_io;
+ }
if (bio_check_eod(bio, nr_sectors))
goto end_io;
@@ -2200,8 +2338,10 @@ blk_qc_t generic_make_request(struct bio *bio)
current->bio_list = bio_list_on_stack;
do {
struct request_queue *q = bio->bi_disk->queue;
+ blk_mq_req_flags_t flags = bio->bi_opf & REQ_NOWAIT ?
+ BLK_MQ_REQ_NOWAIT : 0;
- if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) {
+ if (likely(blk_queue_enter(q, flags) == 0)) {
struct bio_list lower, same;
/* Create a fresh bio_list for all subordinate requests */
@@ -2242,6 +2382,40 @@ out:
EXPORT_SYMBOL(generic_make_request);
/**
+ * direct_make_request - hand a buffer directly to its device driver for I/O
+ * @bio: The bio describing the location in memory and on the device.
+ *
+ * This function behaves like generic_make_request(), but does not protect
+ * against recursion. Must only be used if the called driver is known
+ * to not call generic_make_request (or direct_make_request) again from
+ * its make_request function. (Calling direct_make_request again from
+ * a workqueue is perfectly fine as that doesn't recurse).
+ */
+blk_qc_t direct_make_request(struct bio *bio)
+{
+ struct request_queue *q = bio->bi_disk->queue;
+ bool nowait = bio->bi_opf & REQ_NOWAIT;
+ blk_qc_t ret;
+
+ if (!generic_make_request_checks(bio))
+ return BLK_QC_T_NONE;
+
+ if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) {
+ if (nowait && !blk_queue_dying(q))
+ bio->bi_status = BLK_STS_AGAIN;
+ else
+ bio->bi_status = BLK_STS_IOERR;
+ bio_endio(bio);
+ return BLK_QC_T_NONE;
+ }
+
+ ret = q->make_request_fn(q, bio);
+ blk_queue_exit(q);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(direct_make_request);
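A remapping driver that is known not to recurse could use it as in this sketch (example_lower_bdev() is a hypothetical helper that picks the lower device):

    static blk_qc_t example_make_request(struct request_queue *q,
                                         struct bio *bio)
    {
        /*
         * Redirect the bio to the lower device, then bypass the
         * recursion protection of generic_make_request().
         */
        bio_set_dev(bio, example_lower_bdev(bio));
        return direct_make_request(bio);
    }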
+
+/**
* submit_bio - submit a bio to the block device layer for I/O
* @bio: The &struct bio which describes the I/O
*
@@ -2260,7 +2434,7 @@ blk_qc_t submit_bio(struct bio *bio)
unsigned int count;
if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
- count = queue_logical_block_size(bio->bi_disk->queue);
+ count = queue_logical_block_size(bio->bi_disk->queue) >> 9;
else
count = bio_sectors(bio);
@@ -2285,6 +2459,17 @@ blk_qc_t submit_bio(struct bio *bio)
}
EXPORT_SYMBOL(submit_bio);
+bool blk_poll(struct request_queue *q, blk_qc_t cookie)
+{
+ if (!q->poll_fn || !blk_qc_t_valid(cookie))
+ return false;
+
+ if (current->plug)
+ blk_flush_plug_list(current->plug, false);
+ return q->poll_fn(q, cookie);
+}
+EXPORT_SYMBOL_GPL(blk_poll);
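A typical synchronous polling loop built on this export (sketch only; "done" is assumed to be a flag set by the bio's bi_end_io callback):

    blk_qc_t cookie = submit_bio(bio);

    /* Spin the driver's poll_fn until our completion handler fires. */
    while (!READ_ONCE(done))
        blk_poll(bdev_get_queue(bdev), cookie);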
+
/**
* blk_cloned_rq_check_limits - Helper function to check a cloned request
* for new the queue limits
@@ -2350,8 +2535,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
* bypass a potential scheduler on the bottom device for
* insert.
*/
- blk_mq_request_bypass_insert(rq);
- return BLK_STS_OK;
+ return blk_mq_request_issue_directly(rq);
}
spin_lock_irqsave(q->queue_lock, flags);
@@ -2464,20 +2648,22 @@ void blk_account_io_done(struct request *req)
* Don't process normal requests when queue is suspended
* or in the process of suspending/resuming
*/
-static struct request *blk_pm_peek_request(struct request_queue *q,
- struct request *rq)
+static bool blk_pm_allow_request(struct request *rq)
{
- if (q->dev && (q->rpm_status == RPM_SUSPENDED ||
- (q->rpm_status != RPM_ACTIVE && !(rq->rq_flags & RQF_PM))))
- return NULL;
- else
- return rq;
+ switch (rq->q->rpm_status) {
+ case RPM_RESUMING:
+ case RPM_SUSPENDING:
+ return rq->rq_flags & RQF_PM;
+ case RPM_SUSPENDED:
+ return false;
+ }
+
+ return true;
}
#else
-static inline struct request *blk_pm_peek_request(struct request_queue *q,
- struct request *rq)
+static bool blk_pm_allow_request(struct request *rq)
{
- return rq;
+ return true;
}
#endif
@@ -2517,6 +2703,48 @@ void blk_account_io_start(struct request *rq, bool new_io)
part_stat_unlock();
}
+static struct request *elv_next_request(struct request_queue *q)
+{
+ struct request *rq;
+ struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
+
+ WARN_ON_ONCE(q->mq_ops);
+
+ while (1) {
+ list_for_each_entry(rq, &q->queue_head, queuelist) {
+ if (blk_pm_allow_request(rq))
+ return rq;
+
+ if (rq->rq_flags & RQF_SOFTBARRIER)
+ break;
+ }
+
+ /*
+		 * If a flush request is running and flush requests aren't
+		 * queueable in the drive, hold the queue until the flush
+		 * request finishes. Even without this, the driver can't
+		 * dispatch the next requests and would requeue them, so this
+		 * can also improve throughput. For example, with requests
+		 * flush1, write1, flush2: flush1 is dispatched, the queue is
+		 * held, and write1 is not inserted. Once flush1 finishes,
+		 * flush2 is dispatched; since the disk cache is already
+		 * clean, flush2 completes almost immediately, effectively
+		 * folding it into flush1.
+		 * Because the queue is held, a flag is set to indicate that
+		 * it should be restarted later. See flush_end_io() for
+		 * details.
+ */
+ if (fq->flush_pending_idx != fq->flush_running_idx &&
+ !queue_flush_queueable(q)) {
+ fq->flush_queue_delayed = 1;
+ return NULL;
+ }
+ if (unlikely(blk_queue_bypass(q)) ||
+ !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0))
+ return NULL;
+ }
+}
+
/**
* blk_peek_request - peek at the top of a request queue
* @q: request queue to peek at
@@ -2538,12 +2766,7 @@ struct request *blk_peek_request(struct request_queue *q)
lockdep_assert_held(q->queue_lock);
WARN_ON_ONCE(q->mq_ops);
- while ((rq = __elv_next_request(q)) != NULL) {
-
- rq = blk_pm_peek_request(q, rq);
- if (!rq)
- break;
-
+ while ((rq = elv_next_request(q)) != NULL) {
if (!(rq->rq_flags & RQF_STARTED)) {
/*
* This is the first time the device driver
@@ -2664,7 +2887,7 @@ void blk_start_request(struct request *req)
wbt_issue(req->q->rq_wb, &req->issue_stat);
}
- BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
+ BUG_ON(blk_rq_is_complete(req));
blk_add_timer(req);
}
EXPORT_SYMBOL(blk_start_request);
@@ -2695,6 +2918,27 @@ struct request *blk_fetch_request(struct request_queue *q)
}
EXPORT_SYMBOL(blk_fetch_request);
+/*
+ * Steal bios from a request and add them to a bio list.
+ * The request must not have been partially completed before.
+ */
+void blk_steal_bios(struct bio_list *list, struct request *rq)
+{
+ if (rq->bio) {
+ if (list->tail)
+ list->tail->bi_next = rq->bio;
+ else
+ list->head = rq->bio;
+ list->tail = rq->biotail;
+
+ rq->bio = NULL;
+ rq->biotail = NULL;
+ }
+
+ rq->__data_len = 0;
+}
+EXPORT_SYMBOL_GPL(blk_steal_bios);
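A stacking driver failing a request over to another path might use it roughly like this (resubmit_bio_list() is a hypothetical helper, shown only for the calling convention):

    struct bio_list list;

    bio_list_init(&list);
    blk_steal_bios(&list, rq);          /* rq is now empty: no bios, len 0 */
    blk_mq_end_request(rq, BLK_STS_OK); /* complete the stripped request   */
    resubmit_bio_list(&list);           /* reissue the bios elsewhere      */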
+
/**
* blk_update_request - Special helper function for request stacking drivers
* @req: the request being processed
@@ -3048,6 +3292,8 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
{
if (bio_has_data(bio))
rq->nr_phys_segments = bio_phys_segments(q, bio);
+ else if (bio_op(bio) == REQ_OP_DISCARD)
+ rq->nr_phys_segments = 1;
rq->__data_len = bio->bi_iter.bi_size;
rq->bio = rq->biotail = bio;
@@ -3212,20 +3458,6 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
}
EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
-int kblockd_schedule_delayed_work(struct delayed_work *dwork,
- unsigned long delay)
-{
- return queue_delayed_work(kblockd_workqueue, dwork, delay);
-}
-EXPORT_SYMBOL(kblockd_schedule_delayed_work);
-
-int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
- unsigned long delay)
-{
- return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
-}
-EXPORT_SYMBOL(kblockd_schedule_delayed_work_on);
-
/**
* blk_start_plug - initialize blk_plug and track it inside the task_struct
* @plug: The &struct blk_plug that needs to be initialized
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 5c0f3dc446dc..f7b292f12449 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -61,7 +61,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
* be reused after dying flag is set
*/
if (q->mq_ops) {
- blk_mq_sched_insert_request(rq, at_head, true, false, false);
+ blk_mq_sched_insert_request(rq, at_head, true, false);
return;
}
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 4938bec8cfef..f17170675917 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -231,8 +231,13 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
/* release the tag's ownership to the req cloned from */
spin_lock_irqsave(&fq->mq_flush_lock, flags);
hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu);
- blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
- flush_rq->tag = -1;
+ if (!q->elevator) {
+ blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
+ flush_rq->tag = -1;
+ } else {
+ blk_mq_put_driver_tag_hctx(hctx, flush_rq);
+ flush_rq->internal_tag = -1;
+ }
}
running = &fq->flush_queue[fq->flush_running_idx];
@@ -318,19 +323,26 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
blk_rq_init(q, flush_rq);
/*
- * Borrow tag from the first request since they can't
- * be in flight at the same time. And acquire the tag's
- * ownership for flush req.
+	 * In case of no I/O scheduler, borrow the tag from the first
+	 * request, since the two can't be in flight at the same time, and
+	 * take over that tag's ownership for the flush request.
+	 *
+	 * In case of an I/O scheduler, the flush request only borrows the
+	 * scheduler tag, so that the driver-tag put/get accounting works out.
*/
if (q->mq_ops) {
struct blk_mq_hw_ctx *hctx;
flush_rq->mq_ctx = first_rq->mq_ctx;
- flush_rq->tag = first_rq->tag;
- fq->orig_rq = first_rq;
- hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu);
- blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
+ if (!q->elevator) {
+ fq->orig_rq = first_rq;
+ flush_rq->tag = first_rq->tag;
+ hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu);
+ blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
+ } else {
+ flush_rq->internal_tag = first_rq->internal_tag;
+ }
}
flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
@@ -394,6 +406,11 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
hctx = blk_mq_map_queue(q, ctx->cpu);
+ if (q->elevator) {
+ WARN_ON(rq->tag < 0);
+ blk_mq_put_driver_tag_hctx(hctx, rq);
+ }
+
/*
* After populating an empty queue, kick it to avoid stall. Read
* the comment in flush_end_io().
@@ -463,7 +480,7 @@ void blk_insert_flush(struct request *rq)
if ((policy & REQ_FSEQ_DATA) &&
!(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
if (q->mq_ops)
- blk_mq_sched_insert_request(rq, false, true, false, false);
+ blk_mq_request_bypass_insert(rq, false);
else
list_add_tail(&rq->queuelist, &q->queue_head);
return;
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 63fb971d6574..a676084d4740 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -37,6 +37,9 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
if (!q)
return -ENXIO;
+ if (bdev_read_only(bdev))
+ return -EPERM;
+
if (flags & BLKDEV_DISCARD_SECURE) {
if (!blk_queue_secure_erase(q))
return -EOPNOTSUPP;
@@ -156,6 +159,9 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
if (!q)
return -ENXIO;
+ if (bdev_read_only(bdev))
+ return -EPERM;
+
bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
if ((sector | nr_sects) & bs_mask)
return -EINVAL;
@@ -233,6 +239,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
if (!q)
return -ENXIO;
+ if (bdev_read_only(bdev))
+ return -EPERM;
+
/* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */
max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev);
@@ -275,6 +284,43 @@ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects)
return min(pages, (sector_t)BIO_MAX_PAGES);
}
+static int __blkdev_issue_zero_pages(struct block_device *bdev,
+ sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
+ struct bio **biop)
+{
+ struct request_queue *q = bdev_get_queue(bdev);
+ struct bio *bio = *biop;
+ int bi_size = 0;
+ unsigned int sz;
+
+ if (!q)
+ return -ENXIO;
+
+ if (bdev_read_only(bdev))
+ return -EPERM;
+
+ while (nr_sects != 0) {
+ bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
+ gfp_mask);
+ bio->bi_iter.bi_sector = sector;
+ bio_set_dev(bio, bdev);
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+
+ while (nr_sects != 0) {
+ sz = min((sector_t) PAGE_SIZE, nr_sects << 9);
+ bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0);
+ nr_sects -= bi_size >> 9;
+ sector += bi_size >> 9;
+ if (bi_size < sz)
+ break;
+ }
+ cond_resched();
+ }
+
+ *biop = bio;
+ return 0;
+}
+
/**
* __blkdev_issue_zeroout - generate number of zero filed write bios
* @bdev: blockdev to issue
@@ -288,12 +334,6 @@ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects)
* Zero-fill a block range, either using hardware offload or by explicitly
* writing zeroes to the device.
*
- * Note that this function may fail with -EOPNOTSUPP if the driver signals
- * zeroing offload support, but the device fails to process the command (for
- * some devices there is no non-destructive way to verify whether this
- * operation is actually supported). In this case the caller should call
- * retry the call to blkdev_issue_zeroout() and the fallback path will be used.
- *
* If a device is using logical block provisioning, the underlying space will
* not be released if %flags contains BLKDEV_ZERO_NOUNMAP.
*
@@ -305,9 +345,6 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
unsigned flags)
{
int ret;
- int bi_size = 0;
- struct bio *bio = *biop;
- unsigned int sz;
sector_t bs_mask;
bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
@@ -317,30 +354,10 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask,
biop, flags);
if (ret != -EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK))
- goto out;
-
- ret = 0;
- while (nr_sects != 0) {
- bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
- gfp_mask);
- bio->bi_iter.bi_sector = sector;
- bio_set_dev(bio, bdev);
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
-
- while (nr_sects != 0) {
- sz = min((sector_t) PAGE_SIZE, nr_sects << 9);
- bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0);
- nr_sects -= bi_size >> 9;
- sector += bi_size >> 9;
- if (bi_size < sz)
- break;
- }
- cond_resched();
- }
+ return ret;
- *biop = bio;
-out:
- return ret;
+ return __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask,
+ biop);
}
EXPORT_SYMBOL(__blkdev_issue_zeroout);
@@ -360,18 +377,49 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout);
int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, unsigned flags)
{
- int ret;
- struct bio *bio = NULL;
+ int ret = 0;
+ sector_t bs_mask;
+ struct bio *bio;
struct blk_plug plug;
+ bool try_write_zeroes = !!bdev_write_zeroes_sectors(bdev);
+ bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
+ if ((sector | nr_sects) & bs_mask)
+ return -EINVAL;
+
+retry:
+ bio = NULL;
blk_start_plug(&plug);
- ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
- &bio, flags);
+ if (try_write_zeroes) {
+ ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects,
+ gfp_mask, &bio, flags);
+ } else if (!(flags & BLKDEV_ZERO_NOFALLBACK)) {
+ ret = __blkdev_issue_zero_pages(bdev, sector, nr_sects,
+ gfp_mask, &bio);
+ } else {
+ /* No zeroing offload support */
+ ret = -EOPNOTSUPP;
+ }
if (ret == 0 && bio) {
ret = submit_bio_wait(bio);
bio_put(bio);
}
blk_finish_plug(&plug);
+ if (ret && try_write_zeroes) {
+ if (!(flags & BLKDEV_ZERO_NOFALLBACK)) {
+ try_write_zeroes = false;
+ goto retry;
+ }
+ if (!bdev_write_zeroes_sectors(bdev)) {
+ /*
+ * Zeroing offload support was indicated, but the
+ * device reported ILLEGAL REQUEST (for some devices
+ * there is no non-destructive way to verify whether
+ * WRITE ZEROES is actually supported).
+ */
+ ret = -EOPNOTSUPP;
+ }
+ }
return ret;
}
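From a caller's point of view the retry logic is transparent; the only visible change is when BLKDEV_ZERO_NOFALLBACK is requested, as in this sketch:

    int err = blkdev_issue_zeroout(bdev, sector, nr_sects, GFP_KERNEL,
                                   BLKDEV_ZERO_NOFALLBACK);
    if (err == -EOPNOTSUPP)
        /* No offload advertised, or the device rejected WRITE ZEROES. */
        pr_warn("zeroing offload unavailable, caller must fall back itself\n");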
diff --git a/block/blk-map.c b/block/blk-map.c
index d5251edcc0dd..db9373bd31ac 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -12,22 +12,29 @@
#include "blk.h"
/*
- * Append a bio to a passthrough request. Only works can be merged into
- * the request based on the driver constraints.
+ * Append a bio to a passthrough request. Only works if the bio can be merged
+ * into the request based on the driver constraints.
*/
-int blk_rq_append_bio(struct request *rq, struct bio *bio)
+int blk_rq_append_bio(struct request *rq, struct bio **bio)
{
- blk_queue_bounce(rq->q, &bio);
+ struct bio *orig_bio = *bio;
+
+ blk_queue_bounce(rq->q, bio);
if (!rq->bio) {
- blk_rq_bio_prep(rq->q, rq, bio);
+ blk_rq_bio_prep(rq->q, rq, *bio);
} else {
- if (!ll_back_merge_fn(rq->q, rq, bio))
+ if (!ll_back_merge_fn(rq->q, rq, *bio)) {
+ if (orig_bio != *bio) {
+ bio_put(*bio);
+ *bio = orig_bio;
+ }
return -EINVAL;
+ }
- rq->biotail->bi_next = bio;
- rq->biotail = bio;
- rq->__data_len += bio->bi_iter.bi_size;
+ rq->biotail->bi_next = *bio;
+ rq->biotail = *bio;
+ rq->__data_len += (*bio)->bi_iter.bi_size;
}
return 0;
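Callers now pass the bio by reference because blk_queue_bounce() may substitute a bounce bio; a minimal caller sketch (bio preparation omitted, example_prepared_bio is hypothetical) is:

    struct bio *bio = example_prepared_bio;   /* built by the caller */
    int ret;

    ret = blk_rq_append_bio(rq, &bio);
    if (ret)
        return ret;   /* on failure *bio was restored to the original bio */

    /* From here on use "bio": it may now point at the bounce bio. */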
@@ -67,27 +74,18 @@ static int __blk_rq_map_user_iov(struct request *rq,
bio->bi_opf &= ~REQ_OP_MASK;
bio->bi_opf |= req_op(rq);
- if (map_data && map_data->null_mapped)
- bio_set_flag(bio, BIO_NULL_MAPPED);
-
- iov_iter_advance(iter, bio->bi_iter.bi_size);
- if (map_data)
- map_data->offset += bio->bi_iter.bi_size;
-
orig_bio = bio;
/*
* We link the bounce buffer in and could have to traverse it
* later so we have to get a ref to prevent it from being freed
*/
- ret = blk_rq_append_bio(rq, bio);
- bio_get(bio);
+ ret = blk_rq_append_bio(rq, &bio);
if (ret) {
- bio_endio(bio);
__blk_rq_unmap_user(orig_bio);
- bio_put(bio);
return ret;
}
+ bio_get(bio);
return 0;
}
@@ -121,7 +119,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);
struct bio *bio = NULL;
struct iov_iter i;
- int ret;
+ int ret = -EINVAL;
if (!iter_is_iovec(iter))
goto fail;
@@ -150,7 +148,7 @@ unmap_rq:
__blk_rq_unmap_user(bio);
fail:
rq->bio = NULL;
- return -EINVAL;
+ return ret;
}
EXPORT_SYMBOL(blk_rq_map_user_iov);
@@ -220,7 +218,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
int reading = rq_data_dir(rq) == READ;
unsigned long addr = (unsigned long) kbuf;
int do_copy = 0;
- struct bio *bio;
+ struct bio *bio, *orig_bio;
int ret;
if (len > (queue_max_hw_sectors(q) << 9))
@@ -243,10 +241,11 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
if (do_copy)
rq->rq_flags |= RQF_COPY_USER;
- ret = blk_rq_append_bio(rq, bio);
+ orig_bio = bio;
+ ret = blk_rq_append_bio(rq, &bio);
if (unlikely(ret)) {
/* request is too big */
- bio_put(bio);
+ bio_put(orig_bio);
return ret;
}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index f5dedd57dff6..782940c65d8a 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -128,9 +128,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
nsegs++;
sectors = max_sectors;
}
- if (sectors)
- goto split;
- /* Make this single bvec as the 1st segment */
+ goto split;
}
if (bvprvp && blk_queue_cluster(q)) {
@@ -146,22 +144,21 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
bvprvp = &bvprv;
sectors += bv.bv_len >> 9;
- if (nsegs == 1 && seg_size > front_seg_size)
- front_seg_size = seg_size;
continue;
}
new_segment:
if (nsegs == queue_max_segments(q))
goto split;
+ if (nsegs == 1 && seg_size > front_seg_size)
+ front_seg_size = seg_size;
+
nsegs++;
bvprv = bv;
bvprvp = &bvprv;
seg_size = bv.bv_len;
sectors += bv.bv_len >> 9;
- if (nsegs == 1 && seg_size > front_seg_size)
- front_seg_size = seg_size;
}
do_split = false;
@@ -174,6 +171,8 @@ split:
bio = new;
}
+ if (nsegs == 1 && seg_size > front_seg_size)
+ front_seg_size = seg_size;
bio->bi_seg_front_size = front_seg_size;
if (seg_size > bio->bi_seg_back_size)
bio->bi_seg_back_size = seg_size;
@@ -551,6 +550,24 @@ static bool req_no_special_merge(struct request *req)
return !q->mq_ops && req->special;
}
+static bool req_attempt_discard_merge(struct request_queue *q, struct request *req,
+ struct request *next)
+{
+ unsigned short segments = blk_rq_nr_discard_segments(req);
+
+ if (segments >= queue_max_discard_segments(q))
+ goto no_merge;
+ if (blk_rq_sectors(req) + bio_sectors(next->bio) >
+ blk_rq_get_max_sectors(req, blk_rq_pos(req)))
+ goto no_merge;
+
+ req->nr_phys_segments = segments + blk_rq_nr_discard_segments(next);
+ return true;
+no_merge:
+ req_set_nomerge(q, req);
+ return false;
+}
+
static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
struct request *next)
{
@@ -684,9 +701,13 @@ static struct request *attempt_merge(struct request_queue *q,
* If we are allowed to merge, then append bio list
* from next to rq and release next. merge_requests_fn
* will have updated segment counts, update sector
- * counts here.
+ * counts here. Handle DISCARDs separately, as they
+ * have separate settings.
*/
- if (!ll_merge_requests_fn(q, req, next))
+ if (req_op(req) == REQ_OP_DISCARD) {
+ if (!req_attempt_discard_merge(q, req, next))
+ return NULL;
+ } else if (!ll_merge_requests_fn(q, req, next))
return NULL;
/*
@@ -716,7 +737,8 @@ static struct request *attempt_merge(struct request_queue *q,
req->__data_len += blk_rq_bytes(next);
- elv_merge_requests(q, req, next);
+ if (req_op(req) != REQ_OP_DISCARD)
+ elv_merge_requests(q, req, next);
/*
* 'next' is going away, so update stats accordingly
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index de294d775acf..21cbc1f071c6 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -54,7 +54,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(NOMERGES),
QUEUE_FLAG_NAME(SAME_COMP),
QUEUE_FLAG_NAME(FAIL_IO),
- QUEUE_FLAG_NAME(STACKABLE),
QUEUE_FLAG_NAME(NONROT),
QUEUE_FLAG_NAME(IO_STAT),
QUEUE_FLAG_NAME(DISCARD),
@@ -75,6 +74,7 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(REGISTERED),
QUEUE_FLAG_NAME(SCSI_PASSTHROUGH),
QUEUE_FLAG_NAME(QUIESCED),
+ QUEUE_FLAG_NAME(PREEMPT_ONLY),
};
#undef QUEUE_FLAG_NAME
@@ -180,7 +180,6 @@ static const char *const hctx_state_name[] = {
HCTX_STATE_NAME(STOPPED),
HCTX_STATE_NAME(TAG_ACTIVE),
HCTX_STATE_NAME(SCHED_RESTART),
- HCTX_STATE_NAME(TAG_WAITING),
HCTX_STATE_NAME(START_ON_RUN),
};
#undef HCTX_STATE_NAME
@@ -290,17 +289,12 @@ static const char *const rqf_name[] = {
RQF_NAME(HASHED),
RQF_NAME(STATS),
RQF_NAME(SPECIAL_PAYLOAD),
+ RQF_NAME(ZONE_WRITE_LOCKED),
+ RQF_NAME(MQ_TIMEOUT_EXPIRED),
+ RQF_NAME(MQ_POLL_SLEPT),
};
#undef RQF_NAME
-#define RQAF_NAME(name) [REQ_ATOM_##name] = #name
-static const char *const rqaf_name[] = {
- RQAF_NAME(COMPLETE),
- RQAF_NAME(STARTED),
- RQAF_NAME(POLL_SLEPT),
-};
-#undef RQAF_NAME
-
int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
{
const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
@@ -317,8 +311,7 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
seq_puts(m, ", .rq_flags=");
blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name,
ARRAY_SIZE(rqf_name));
- seq_puts(m, ", .atomic_flags=");
- blk_flags_show(m, rq->atomic_flags, rqaf_name, ARRAY_SIZE(rqaf_name));
+ seq_printf(m, ", complete=%d", blk_rq_is_complete(rq));
seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag,
rq->internal_tag);
if (mq_ops->show_rq)
@@ -410,7 +403,7 @@ static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved)
const struct show_busy_params *params = data;
if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx &&
- test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
+ blk_mq_rq_state(rq) != MQ_RQ_IDLE)
__blk_mq_debugfs_rq_show(params->m,
list_entry_rq(&rq->queuelist));
}
@@ -704,7 +697,11 @@ static ssize_t blk_mq_debugfs_write(struct file *file, const char __user *buf,
const struct blk_mq_debugfs_attr *attr = m->private;
void *data = d_inode(file->f_path.dentry->d_parent)->i_private;
- if (!attr->write)
+ /*
+ * Attributes that only implement .seq_ops are read-only and 'attr' is
+	 * the same as 'data' in this case.
+ */
+ if (attr == data || !attr->write)
return -EPERM;
return attr->write(data, buf, count, ppos);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 4ab69435708c..25c14c58385c 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -81,12 +81,95 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
} else
clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
- if (blk_mq_hctx_has_pending(hctx)) {
- blk_mq_run_hw_queue(hctx, true);
- return true;
- }
+ return blk_mq_run_hw_queue(hctx, true);
+}
- return false;
+/*
+ * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
+ * its queue by itself in its completion handler, so we don't need to
+ * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ */
+static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+{
+ struct request_queue *q = hctx->queue;
+ struct elevator_queue *e = q->elevator;
+ LIST_HEAD(rq_list);
+
+ do {
+ struct request *rq;
+
+ if (e->type->ops.mq.has_work &&
+ !e->type->ops.mq.has_work(hctx))
+ break;
+
+ if (!blk_mq_get_dispatch_budget(hctx))
+ break;
+
+ rq = e->type->ops.mq.dispatch_request(hctx);
+ if (!rq) {
+ blk_mq_put_dispatch_budget(hctx);
+ break;
+ }
+
+ /*
+ * Now this rq owns the budget which has to be released
+ * if this rq won't be queued to driver via .queue_rq()
+ * in blk_mq_dispatch_rq_list().
+ */
+ list_add(&rq->queuelist, &rq_list);
+ } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+}
+
+static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
+ struct blk_mq_ctx *ctx)
+{
+ unsigned idx = ctx->index_hw;
+
+ if (++idx == hctx->nr_ctx)
+ idx = 0;
+
+ return hctx->ctxs[idx];
+}
+
+/*
+ * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
+ * its queue by itself in its completion handler, so we don't need to
+ * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ */
+static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
+{
+ struct request_queue *q = hctx->queue;
+ LIST_HEAD(rq_list);
+ struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
+
+ do {
+ struct request *rq;
+
+ if (!sbitmap_any_bit_set(&hctx->ctx_map))
+ break;
+
+ if (!blk_mq_get_dispatch_budget(hctx))
+ break;
+
+ rq = blk_mq_dequeue_from_ctx(hctx, ctx);
+ if (!rq) {
+ blk_mq_put_dispatch_budget(hctx);
+ break;
+ }
+
+ /*
+ * Now this rq owns the budget which has to be released
+ * if this rq won't be queued to driver via .queue_rq()
+ * in blk_mq_dispatch_rq_list().
+ */
+ list_add(&rq->queuelist, &rq_list);
+
+ /* round robin for fair dispatch */
+ ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
+
+ } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+
+ WRITE_ONCE(hctx->dispatch_from, ctx);
}
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
@@ -94,7 +177,6 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
struct request_queue *q = hctx->queue;
struct elevator_queue *e = q->elevator;
const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
- bool did_work = false;
LIST_HEAD(rq_list);
/* RCU or SRCU read lock is needed before checking quiesced flag */
@@ -122,29 +204,34 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
* scheduler, we can no longer merge or sort them. So it's best to
* leave them there for as long as we can. Mark the hw queue as
* needing a restart in that case.
+ *
+ * We want to dispatch from the scheduler if there was nothing
+ * on the dispatch list or we were able to dispatch from the
+ * dispatch list.
*/
if (!list_empty(&rq_list)) {
blk_mq_sched_mark_restart_hctx(hctx);
- did_work = blk_mq_dispatch_rq_list(q, &rq_list);
- } else if (!has_sched_dispatch) {
+ if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
+ if (has_sched_dispatch)
+ blk_mq_do_dispatch_sched(hctx);
+ else
+ blk_mq_do_dispatch_ctx(hctx);
+ }
+ } else if (has_sched_dispatch) {
+ blk_mq_do_dispatch_sched(hctx);
+ } else if (q->mq_ops->get_budget) {
+ /*
+		 * If we need to get a budget before queueing the request, we
+		 * dequeue requests one by one from the sw queue to avoid
+		 * messing up I/O merging when dispatch runs out of resource.
+		 *
+		 * TODO: get more budgets and dequeue more requests at a time.
+ */
+ blk_mq_do_dispatch_ctx(hctx);
+ } else {
blk_mq_flush_busy_ctxs(hctx, &rq_list);
- blk_mq_dispatch_rq_list(q, &rq_list);
- }
-
- /*
- * We want to dispatch from the scheduler if we had no work left
- * on the dispatch list, OR if we did have work but weren't able
- * to make progress.
- */
- if (!did_work && has_sched_dispatch) {
- do {
- struct request *rq;
-
- rq = e->type->ops.mq.dispatch_request(hctx);
- if (!rq)
- break;
- list_add(&rq->queuelist, &rq_list);
- } while (blk_mq_dispatch_rq_list(q, &rq_list));
+ blk_mq_dispatch_rq_list(q, &rq_list, false);
}
}
@@ -172,6 +259,8 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
if (!*merged_request)
elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
return true;
+ case ELEVATOR_DISCARD_MERGE:
+ return bio_attempt_discard_merge(q, rq, bio);
default:
return false;
}
@@ -260,21 +349,21 @@ void blk_mq_sched_request_inserted(struct request *rq)
EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);
static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
+ bool has_sched,
struct request *rq)
{
- if (rq->tag == -1) {
- rq->rq_flags |= RQF_SORTED;
- return false;
+ /* dispatch flush rq directly */
+ if (rq->rq_flags & RQF_FLUSH_SEQ) {
+ spin_lock(&hctx->lock);
+ list_add(&rq->queuelist, &hctx->dispatch);
+ spin_unlock(&hctx->lock);
+ return true;
}
- /*
- * If we already have a real request tag, send directly to
- * the dispatch list.
- */
- spin_lock(&hctx->lock);
- list_add(&rq->queuelist, &hctx->dispatch);
- spin_unlock(&hctx->lock);
- return true;
+ if (has_sched)
+ rq->rq_flags |= RQF_SORTED;
+
+ return false;
}
/**
@@ -339,35 +428,23 @@ done:
}
}
-/*
- * Add flush/fua to the queue. If we fail getting a driver tag, then
- * punt to the requeue list. Requeue will re-invoke us from a context
- * that's safe to block from.
- */
-static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx,
- struct request *rq, bool can_block)
-{
- if (blk_mq_get_driver_tag(rq, &hctx, can_block)) {
- blk_insert_flush(rq);
- blk_mq_run_hw_queue(hctx, true);
- } else
- blk_mq_add_to_requeue_list(rq, false, true);
-}
-
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
- bool run_queue, bool async, bool can_block)
+ bool run_queue, bool async)
{
struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator;
struct blk_mq_ctx *ctx = rq->mq_ctx;
struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
- if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) {
- blk_mq_sched_insert_flush(hctx, rq, can_block);
- return;
+ /* flush rq in flush machinery need to be dispatched directly */
+ if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) {
+ blk_insert_flush(rq);
+ goto run;
}
- if (e && blk_mq_sched_bypass_insert(hctx, rq))
+ WARN_ON(e && (rq->tag != -1));
+
+ if (blk_mq_sched_bypass_insert(hctx, !!e, rq))
goto run;
if (e && e->type->ops.mq.insert_requests) {
@@ -393,23 +470,6 @@ void blk_mq_sched_insert_requests(struct request_queue *q,
struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
struct elevator_queue *e = hctx->queue->elevator;
- if (e) {
- struct request *rq, *next;
-
- /*
- * We bypass requests that already have a driver tag assigned,
- * which should only be flushes. Flushes are only ever inserted
- * as single requests, so we shouldn't ever hit the
- * WARN_ON_ONCE() below (but let's handle it just in case).
- */
- list_for_each_entry_safe(rq, next, list, queuelist) {
- if (WARN_ON_ONCE(rq->tag != -1)) {
- list_del_init(&rq->queuelist);
- blk_mq_sched_bypass_insert(hctx, rq);
- }
- }
- }
-
if (e && e->type->ops.mq.insert_requests)
e->type->ops.mq.insert_requests(hctx, list, false);
else
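
The two dispatch helpers added above (blk_mq_do_dispatch_sched() and blk_mq_do_dispatch_ctx()) share one budget contract: take a budget before dequeueing, hand it to the request on success, and give it back immediately if nothing was dequeued. A condensed, illustrative sketch of that pattern, not part of the patch; dequeue_next_rq() is a hypothetical stand-in for the scheduler- or ctx-specific dequeue:

/* Illustrative sketch of the budget ownership rule only. */
static void dispatch_with_budget(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	LIST_HEAD(rq_list);

	do {
		struct request *rq;

		if (!blk_mq_get_dispatch_budget(hctx))
			break;				/* out of budget: stop dispatching */

		rq = dequeue_next_rq(hctx);		/* hypothetical dequeue helper */
		if (!rq) {
			/* nothing to dispatch: the budget must be returned */
			blk_mq_put_dispatch_budget(hctx);
			break;
		}

		/*
		 * rq now owns the budget; blk_mq_dispatch_rq_list() releases
		 * it if the request never reaches ->queue_rq().
		 */
		list_add(&rq->queuelist, &rq_list);
	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
}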
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index ba1d1418a96d..1e9c9018ace1 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -18,7 +18,7 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
- bool run_queue, bool async, bool can_block);
+ bool run_queue, bool async);
void blk_mq_sched_insert_requests(struct request_queue *q,
struct blk_mq_ctx *ctx,
struct list_head *list, bool run_queue_async);
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 79969c3c234f..a54b4b070f1c 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -248,7 +248,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
return ret;
}
-static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
+void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i;
@@ -265,13 +265,6 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
q->mq_sysfs_init_done = false;
}
-void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
-{
- mutex_lock(&q->sysfs_lock);
- __blk_mq_unregister_dev(dev, q);
- mutex_unlock(&q->sysfs_lock);
-}
-
void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
{
kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
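
With the locking wrapper removed, blk_mq_unregister_dev() presumably relies on its caller to hold the queue's sysfs_lock; a minimal illustrative caller under that assumption:

	/* Illustrative only; assumes the caller now owns q->sysfs_lock. */
	mutex_lock(&q->sysfs_lock);
	blk_mq_unregister_dev(disk_to_dev(disk), q);
	mutex_unlock(&q->sysfs_lock);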
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 6714507aa6c7..336dde07b230 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -134,12 +134,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
ws = bt_wait_ptr(bt, data->hctx);
drop_ctx = data->ctx == NULL;
do {
- prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE);
-
- tag = __blk_mq_get_tag(data, bt);
- if (tag != -1)
- break;
-
/*
* We're out of tags on this hardware queue, kick any
* pending IO submits before going to sleep waiting for
@@ -155,6 +149,13 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
if (tag != -1)
break;
+ prepare_to_wait_exclusive(&ws->wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ tag = __blk_mq_get_tag(data, bt);
+ if (tag != -1)
+ break;
+
if (data->ctx)
blk_mq_put_ctx(data->ctx);
@@ -298,12 +299,12 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
}
EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
-int blk_mq_reinit_tagset(struct blk_mq_tag_set *set,
- int (reinit_request)(void *, struct request *))
+int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data,
+ int (fn)(void *, struct request *))
{
int i, j, ret = 0;
- if (WARN_ON_ONCE(!reinit_request))
+ if (WARN_ON_ONCE(!fn))
goto out;
for (i = 0; i < set->nr_hw_queues; i++) {
@@ -316,8 +317,7 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set,
if (!tags->static_rqs[j])
continue;
- ret = reinit_request(set->driver_data,
- tags->static_rqs[j]);
+ ret = fn(data, tags->static_rqs[j]);
if (ret)
goto out;
}
@@ -326,7 +326,7 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set,
out:
return ret;
}
-EXPORT_SYMBOL_GPL(blk_mq_reinit_tagset);
+EXPORT_SYMBOL_GPL(blk_mq_tagset_iter);
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
void *priv)
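
The renamed iterator now threads an opaque cookie to the callback instead of hard-coding set->driver_data; a minimal, hypothetical driver-side use (the mydrv_* names are invented for illustration):

/* Hypothetical example of the new blk_mq_tagset_iter() API. */
static int mydrv_reinit_rq(void *data, struct request *rq)
{
	struct mydrv_ctrl *ctrl = data;			/* invented driver type */

	/* refresh per-request driver state, e.g. after a controller reset */
	return mydrv_setup_cmd(ctrl, blk_mq_rq_to_pdu(rq));
}

	/* was: blk_mq_reinit_tagset(&ctrl->tag_set, mydrv_reinit_rq); */
	err = blk_mq_tagset_iter(&ctrl->tag_set, ctrl, mydrv_reinit_rq);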
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index c190165d92ea..61deab0b5a5a 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -45,13 +45,8 @@ static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
}
enum {
- BLK_MQ_TAG_CACHE_MIN = 1,
- BLK_MQ_TAG_CACHE_MAX = 64,
-};
-
-enum {
BLK_MQ_TAG_FAIL = -1U,
- BLK_MQ_TAG_MIN = BLK_MQ_TAG_CACHE_MIN,
+ BLK_MQ_TAG_MIN = 1,
BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1,
};
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 98a18609755e..16e83e6df404 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -37,6 +37,7 @@
#include "blk-wbt.h"
#include "blk-mq-sched.h"
+static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
static void blk_mq_poll_stats_start(struct request_queue *q);
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
@@ -60,10 +61,10 @@ static int blk_mq_poll_stats_bkt(const struct request *rq)
/*
* Check if any of the ctx's have pending work in this hardware queue
*/
-bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
+static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
- return sbitmap_any_bit_set(&hctx->ctx_map) ||
- !list_empty_careful(&hctx->dispatch) ||
+ return !list_empty_careful(&hctx->dispatch) ||
+ sbitmap_any_bit_set(&hctx->ctx_map) ||
blk_mq_sched_has_work(hctx);
}
@@ -94,8 +95,7 @@ static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
{
struct mq_inflight *mi = priv;
- if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags) &&
- !test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) {
+ if (blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) {
/*
* index[0] counts the specific partition that was asked
* for. index[1] counts the ones that are active on the
@@ -125,7 +125,8 @@ void blk_freeze_queue_start(struct request_queue *q)
freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
if (freeze_depth == 1) {
percpu_ref_kill(&q->q_usage_counter);
- blk_mq_run_hw_queues(q, false);
+ if (q->mq_ops)
+ blk_mq_run_hw_queues(q, false);
}
}
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
@@ -159,6 +160,8 @@ void blk_freeze_queue(struct request_queue *q)
* exported to drivers as the only user for unfreeze is blk_mq.
*/
blk_freeze_queue_start(q);
+ if (!q->mq_ops)
+ blk_drain_queue(q);
blk_mq_freeze_queue_wait(q);
}
@@ -218,7 +221,7 @@ void blk_mq_quiesce_queue(struct request_queue *q)
queue_for_each_hw_ctx(q, hctx, i) {
if (hctx->flags & BLK_MQ_F_BLOCKING)
- synchronize_srcu(hctx->queue_rq_srcu);
+ synchronize_srcu(hctx->srcu);
else
rcu = true;
}
@@ -255,13 +258,6 @@ void blk_mq_wake_waiters(struct request_queue *q)
queue_for_each_hw_ctx(q, hctx, i)
if (blk_mq_hw_queue_mapped(hctx))
blk_mq_tag_wakeup_all(hctx->tags, true);
-
- /*
- * If we are called because the queue has now been marked as
- * dying, we need to ensure that processes currently waiting on
- * the queue are notified as well.
- */
- wake_up_all(&q->mq_freeze_wq);
}
bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
@@ -275,15 +271,14 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
{
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
struct request *rq = tags->static_rqs[tag];
-
- rq->rq_flags = 0;
+ req_flags_t rq_flags = 0;
if (data->flags & BLK_MQ_REQ_INTERNAL) {
rq->tag = -1;
rq->internal_tag = tag;
} else {
if (blk_mq_tag_busy(data->hctx)) {
- rq->rq_flags = RQF_MQ_INFLIGHT;
+ rq_flags = RQF_MQ_INFLIGHT;
atomic_inc(&data->hctx->nr_active);
}
rq->tag = tag;
@@ -291,25 +286,22 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
data->hctx->tags->rqs[rq->tag] = rq;
}
- INIT_LIST_HEAD(&rq->queuelist);
/* csd/requeue_work/fifo_time is initialized before use */
rq->q = data->q;
rq->mq_ctx = data->ctx;
+ rq->rq_flags = rq_flags;
+ rq->cpu = -1;
rq->cmd_flags = op;
+ if (data->flags & BLK_MQ_REQ_PREEMPT)
+ rq->rq_flags |= RQF_PREEMPT;
if (blk_queue_io_stat(data->q))
rq->rq_flags |= RQF_IO_STAT;
- /* do not touch atomic flags, it needs atomic ops against the timer */
- rq->cpu = -1;
+ INIT_LIST_HEAD(&rq->queuelist);
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
rq->rq_disk = NULL;
rq->part = NULL;
rq->start_time = jiffies;
-#ifdef CONFIG_BLK_CGROUP
- rq->rl = NULL;
- set_start_time_ns(rq);
- rq->io_start_time_ns = 0;
-#endif
rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
rq->nr_integrity_segments = 0;
@@ -317,6 +309,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
rq->special = NULL;
/* tag was already set */
rq->extra_len = 0;
+ rq->__deadline = 0;
INIT_LIST_HEAD(&rq->timeout_list);
rq->timeout = 0;
@@ -325,6 +318,12 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
rq->end_io_data = NULL;
rq->next_rq = NULL;
+#ifdef CONFIG_BLK_CGROUP
+ rq->rl = NULL;
+ set_start_time_ns(rq);
+ rq->io_start_time_ns = 0;
+#endif
+
data->ctx->rq_dispatched[op_is_sync(op)]++;
return rq;
}
@@ -336,12 +335,14 @@ static struct request *blk_mq_get_request(struct request_queue *q,
struct elevator_queue *e = q->elevator;
struct request *rq;
unsigned int tag;
- struct blk_mq_ctx *local_ctx = NULL;
+ bool put_ctx_on_error = false;
blk_queue_enter_live(q);
data->q = q;
- if (likely(!data->ctx))
- data->ctx = local_ctx = blk_mq_get_ctx(q);
+ if (likely(!data->ctx)) {
+ data->ctx = blk_mq_get_ctx(q);
+ put_ctx_on_error = true;
+ }
if (likely(!data->hctx))
data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
if (op & REQ_NOWAIT)
@@ -360,8 +361,8 @@ static struct request *blk_mq_get_request(struct request_queue *q,
tag = blk_mq_get_tag(data);
if (tag == BLK_MQ_TAG_FAIL) {
- if (local_ctx) {
- blk_mq_put_ctx(local_ctx);
+ if (put_ctx_on_error) {
+ blk_mq_put_ctx(data->ctx);
data->ctx = NULL;
}
blk_queue_exit(q);
@@ -384,13 +385,13 @@ static struct request *blk_mq_get_request(struct request_queue *q,
}
struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
- unsigned int flags)
+ blk_mq_req_flags_t flags)
{
struct blk_mq_alloc_data alloc_data = { .flags = flags };
struct request *rq;
int ret;
- ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
+ ret = blk_queue_enter(q, flags);
if (ret)
return ERR_PTR(ret);
@@ -410,7 +411,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
EXPORT_SYMBOL(blk_mq_alloc_request);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
- unsigned int op, unsigned int flags, unsigned int hctx_idx)
+ unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
{
struct blk_mq_alloc_data alloc_data = { .flags = flags };
struct request *rq;
@@ -429,7 +430,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
if (hctx_idx >= q->nr_hw_queues)
return ERR_PTR(-EIO);
- ret = blk_queue_enter(q, true);
+ ret = blk_queue_enter(q, flags);
if (ret)
return ERR_PTR(ret);
@@ -442,7 +443,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
blk_queue_exit(q);
return ERR_PTR(-EXDEV);
}
- cpu = cpumask_first(alloc_data.hctx->cpumask);
+ cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
rq = blk_mq_get_request(q, NULL, op, &alloc_data);
@@ -476,10 +477,15 @@ void blk_mq_free_request(struct request *rq)
if (rq->rq_flags & RQF_MQ_INFLIGHT)
atomic_dec(&hctx->nr_active);
+ if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
+ laptop_io_completion(q->backing_dev_info);
+
wbt_done(q->rq_wb, &rq->issue_stat);
- clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
- clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
+ if (blk_rq_rl(rq))
+ blk_put_rl(blk_rq_rl(rq));
+
+ blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
if (rq->tag != -1)
blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
if (sched_tag != -1)
@@ -525,6 +531,9 @@ static void __blk_mq_complete_request(struct request *rq)
bool shared = false;
int cpu;
+ WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT);
+ blk_mq_rq_update_state(rq, MQ_RQ_COMPLETE);
+
if (rq->internal_tag != -1)
blk_mq_sched_completed_request(rq);
if (rq->rq_flags & RQF_STATS) {
@@ -552,6 +561,56 @@ static void __blk_mq_complete_request(struct request *rq)
put_cpu();
}
+static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
+ __releases(hctx->srcu)
+{
+ if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+ rcu_read_unlock();
+ else
+ srcu_read_unlock(hctx->srcu, srcu_idx);
+}
+
+static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
+ __acquires(hctx->srcu)
+{
+ if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
+ /* shut up gcc false positive */
+ *srcu_idx = 0;
+ rcu_read_lock();
+ } else
+ *srcu_idx = srcu_read_lock(hctx->srcu);
+}
+
+static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate)
+{
+ unsigned long flags;
+
+ /*
+ * blk_mq_rq_aborted_gstate() is used from the completion path and
+	 * can thus be called from irq context. A u64_stats fetch in the
+	 * middle of an update on the same CPU leads to a lockup. Disable
+	 * irqs while updating.
+ */
+ local_irq_save(flags);
+ u64_stats_update_begin(&rq->aborted_gstate_sync);
+ rq->aborted_gstate = gstate;
+ u64_stats_update_end(&rq->aborted_gstate_sync);
+ local_irq_restore(flags);
+}
+
+static u64 blk_mq_rq_aborted_gstate(struct request *rq)
+{
+ unsigned int start;
+ u64 aborted_gstate;
+
+ do {
+ start = u64_stats_fetch_begin(&rq->aborted_gstate_sync);
+ aborted_gstate = rq->aborted_gstate;
+ } while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start));
+
+ return aborted_gstate;
+}
+
/**
* blk_mq_complete_request - end I/O on a request
* @rq: the request being processed
@@ -563,17 +622,33 @@ static void __blk_mq_complete_request(struct request *rq)
void blk_mq_complete_request(struct request *rq)
{
struct request_queue *q = rq->q;
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
+ int srcu_idx;
if (unlikely(blk_should_fake_timeout(q)))
return;
- if (!blk_mark_rq_complete(rq))
+
+ /*
+ * If @rq->aborted_gstate equals the current instance, timeout is
+ * claiming @rq and we lost. This is synchronized through
+ * hctx_lock(). See blk_mq_timeout_work() for details.
+ *
+ * Completion path never blocks and we can directly use RCU here
+ * instead of hctx_lock() which can be either RCU or SRCU.
+ * However, that would complicate paths which want to synchronize
+	 * against us. Let's stay in sync with the issue path so that
+ * hctx_lock() covers both issue and completion paths.
+ */
+ hctx_lock(hctx, &srcu_idx);
+ if (blk_mq_rq_aborted_gstate(rq) != rq->gstate)
__blk_mq_complete_request(rq);
+ hctx_unlock(hctx, srcu_idx);
}
EXPORT_SYMBOL(blk_mq_complete_request);
int blk_mq_request_started(struct request *rq)
{
- return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
+ return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
}
EXPORT_SYMBOL_GPL(blk_mq_request_started);
@@ -591,24 +666,27 @@ void blk_mq_start_request(struct request *rq)
wbt_issue(q->rq_wb, &rq->issue_stat);
}
- blk_add_timer(rq);
+ WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
/*
- * Ensure that ->deadline is visible before set the started
- * flag and clear the completed flag.
+ * Mark @rq in-flight which also advances the generation number,
+ * and register for timeout. Protect with a seqcount to allow the
+ * timeout path to read both @rq->gstate and @rq->deadline
+ * coherently.
+ *
+ * This is the only place where a request is marked in-flight. If
+ * the timeout path reads an in-flight @rq->gstate, the
+ * @rq->deadline it reads together under @rq->gstate_seq is
+ * guaranteed to be the matching one.
*/
- smp_mb__before_atomic();
+ preempt_disable();
+ write_seqcount_begin(&rq->gstate_seq);
- /*
- * Mark us as started and clear complete. Complete might have been
- * set if requeue raced with timeout, which then marked it as
- * complete. So be sure to clear complete again when we start
- * the request, otherwise we'll ignore the completion event.
- */
- if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
- set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
- if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
- clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
+ blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT);
+ blk_add_timer(rq);
+
+ write_seqcount_end(&rq->gstate_seq);
+ preempt_enable();
if (q->dma_drain_size && blk_rq_bytes(rq)) {
/*
@@ -622,23 +700,21 @@ void blk_mq_start_request(struct request *rq)
EXPORT_SYMBOL(blk_mq_start_request);
/*
- * When we reach here because queue is busy, REQ_ATOM_COMPLETE
- * flag isn't set yet, so there may be race with timeout handler,
- * but given rq->deadline is just set in .queue_rq() under
- * this situation, the race won't be possible in reality because
- * rq->timeout should be set as big enough to cover the window
- * between blk_mq_start_request() called from .queue_rq() and
- * clearing REQ_ATOM_STARTED here.
+ * When we reach here because queue is busy, it's safe to change the state
+ * to IDLE without checking @rq->aborted_gstate because we should still be
+ * holding the RCU read lock and thus protected against timeout.
*/
static void __blk_mq_requeue_request(struct request *rq)
{
struct request_queue *q = rq->q;
+ blk_mq_put_driver_tag(rq);
+
trace_block_rq_requeue(q, rq);
wbt_requeue(q->rq_wb, &rq->issue_stat);
- blk_mq_sched_requeue_request(rq);
- if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
+ if (blk_mq_rq_state(rq) != MQ_RQ_IDLE) {
+ blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
if (q->dma_drain_size && blk_rq_bytes(rq))
rq->nr_phys_segments--;
}
@@ -648,6 +724,9 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
{
__blk_mq_requeue_request(rq);
+ /* this request will be re-inserted to io scheduler queue */
+ blk_mq_sched_requeue_request(rq);
+
BUG_ON(blk_queued_rq(rq));
blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
}
@@ -670,13 +749,13 @@ static void blk_mq_requeue_work(struct work_struct *work)
rq->rq_flags &= ~RQF_SOFTBARRIER;
list_del_init(&rq->queuelist);
- blk_mq_sched_insert_request(rq, true, false, false, true);
+ blk_mq_sched_insert_request(rq, true, false, false);
}
while (!list_empty(&rq_list)) {
rq = list_entry(rq_list.next, struct request, queuelist);
list_del_init(&rq->queuelist);
- blk_mq_sched_insert_request(rq, false, false, false, true);
+ blk_mq_sched_insert_request(rq, false, false, false);
}
blk_mq_run_hw_queues(q, false);
@@ -690,7 +769,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
/*
* We abuse this flag that is otherwise used by the I/O scheduler to
- * request head insertation from the workqueue.
+ * request head insertion from the workqueue.
*/
BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
@@ -710,7 +789,7 @@ EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
void blk_mq_kick_requeue_list(struct request_queue *q)
{
- kblockd_schedule_delayed_work(&q->requeue_work, 0);
+ kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
}
EXPORT_SYMBOL(blk_mq_kick_requeue_list);
@@ -736,24 +815,15 @@ EXPORT_SYMBOL(blk_mq_tag_to_rq);
struct blk_mq_timeout_data {
unsigned long next;
unsigned int next_set;
+ unsigned int nr_expired;
};
-void blk_mq_rq_timed_out(struct request *req, bool reserved)
+static void blk_mq_rq_timed_out(struct request *req, bool reserved)
{
const struct blk_mq_ops *ops = req->q->mq_ops;
enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
- /*
- * We know that complete is set at this point. If STARTED isn't set
- * anymore, then the request isn't active and the "timeout" should
- * just be ignored. This can happen due to the bitflag ordering.
- * Timeout first checks if STARTED is set, and if it is, assumes
- * the request is active. But if we race with completion, then
- * both flags will get cleared. So check here again, and ignore
- * a timeout event with a request that isn't active.
- */
- if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
- return;
+ req->rq_flags |= RQF_MQ_TIMEOUT_EXPIRED;
if (ops->timeout)
ret = ops->timeout(req, reserved);
@@ -763,8 +833,13 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved)
__blk_mq_complete_request(req);
break;
case BLK_EH_RESET_TIMER:
+ /*
+		 * As nothing prevents a completion from happening while
+ * ->aborted_gstate is set, this may lead to ignored
+ * completions and further spurious timeouts.
+ */
+ blk_mq_rq_update_aborted_gstate(req, 0);
blk_add_timer(req);
- blk_clear_rq_complete(req);
break;
case BLK_EH_NOT_HANDLED:
break;
@@ -778,32 +853,51 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
struct request *rq, void *priv, bool reserved)
{
struct blk_mq_timeout_data *data = priv;
+ unsigned long gstate, deadline;
+ int start;
+
+ might_sleep();
- if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
+ if (rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED)
return;
- /*
- * The rq being checked may have been freed and reallocated
- * out already here, we avoid this race by checking rq->deadline
- * and REQ_ATOM_COMPLETE flag together:
- *
- * - if rq->deadline is observed as new value because of
- * reusing, the rq won't be timed out because of timing.
- * - if rq->deadline is observed as previous value,
- * REQ_ATOM_COMPLETE flag won't be cleared in reuse path
- * because we put a barrier between setting rq->deadline
- * and clearing the flag in blk_mq_start_request(), so
- * this rq won't be timed out too.
- */
- if (time_after_eq(jiffies, rq->deadline)) {
- if (!blk_mark_rq_complete(rq))
- blk_mq_rq_timed_out(rq, reserved);
- } else if (!data->next_set || time_after(data->next, rq->deadline)) {
- data->next = rq->deadline;
+ /* read coherent snapshots of @rq->state_gen and @rq->deadline */
+ while (true) {
+ start = read_seqcount_begin(&rq->gstate_seq);
+ gstate = READ_ONCE(rq->gstate);
+ deadline = blk_rq_deadline(rq);
+ if (!read_seqcount_retry(&rq->gstate_seq, start))
+ break;
+ cond_resched();
+ }
+
+ /* if in-flight && overdue, mark for abortion */
+ if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT &&
+ time_after_eq(jiffies, deadline)) {
+ blk_mq_rq_update_aborted_gstate(rq, gstate);
+ data->nr_expired++;
+ hctx->nr_expired++;
+ } else if (!data->next_set || time_after(data->next, deadline)) {
+ data->next = deadline;
data->next_set = 1;
}
}
+static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
+ struct request *rq, void *priv, bool reserved)
+{
+ /*
+ * We marked @rq->aborted_gstate and waited for RCU. If there were
+ * completions that we lost to, they would have finished and
+ * updated @rq->gstate by now; otherwise, the completion path is
+ * now guaranteed to see @rq->aborted_gstate and yield. If
+ * @rq->aborted_gstate still matches @rq->gstate, @rq is ours.
+ */
+ if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) &&
+ READ_ONCE(rq->gstate) == rq->aborted_gstate)
+ blk_mq_rq_timed_out(rq, reserved);
+}
+
static void blk_mq_timeout_work(struct work_struct *work)
{
struct request_queue *q =
@@ -811,7 +905,9 @@ static void blk_mq_timeout_work(struct work_struct *work)
struct blk_mq_timeout_data data = {
.next = 0,
.next_set = 0,
+ .nr_expired = 0,
};
+ struct blk_mq_hw_ctx *hctx;
int i;
/* A deadlock might occur if a request is stuck requiring a
@@ -830,14 +926,46 @@ static void blk_mq_timeout_work(struct work_struct *work)
if (!percpu_ref_tryget(&q->q_usage_counter))
return;
+ /* scan for the expired ones and set their ->aborted_gstate */
blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
+ if (data.nr_expired) {
+ bool has_rcu = false;
+
+ /*
+ * Wait till everyone sees ->aborted_gstate. The
+ * sequential waits for SRCUs aren't ideal. If this ever
+ * becomes a problem, we can add per-hw_ctx rcu_head and
+ * wait in parallel.
+ */
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (!hctx->nr_expired)
+ continue;
+
+ if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+ has_rcu = true;
+ else
+ synchronize_srcu(hctx->srcu);
+
+ hctx->nr_expired = 0;
+ }
+ if (has_rcu)
+ synchronize_rcu();
+
+ /* terminate the ones we won */
+ blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL);
+ }
+
if (data.next_set) {
data.next = blk_rq_timeout(round_jiffies_up(data.next));
mod_timer(&q->timeout, data.next);
} else {
- struct blk_mq_hw_ctx *hctx;
-
+ /*
+ * Request timeouts are handled as a forward rolling timer. If
+ * we end up here it means that no requests are pending and
+ * also that no request has been pending for a while. Mark
+ * each hctx as idle.
+ */
queue_for_each_hw_ctx(q, hctx, i) {
/* the hctx may be unmapped, so check it here */
if (blk_mq_hw_queue_mapped(hctx))
@@ -880,6 +1008,45 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
}
EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
+struct dispatch_rq_data {
+ struct blk_mq_hw_ctx *hctx;
+ struct request *rq;
+};
+
+static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
+ void *data)
+{
+ struct dispatch_rq_data *dispatch_data = data;
+ struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
+ struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
+
+ spin_lock(&ctx->lock);
+ if (unlikely(!list_empty(&ctx->rq_list))) {
+ dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
+ list_del_init(&dispatch_data->rq->queuelist);
+ if (list_empty(&ctx->rq_list))
+ sbitmap_clear_bit(sb, bitnr);
+ }
+ spin_unlock(&ctx->lock);
+
+ return !dispatch_data->rq;
+}
+
+struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
+ struct blk_mq_ctx *start)
+{
+ unsigned off = start ? start->index_hw : 0;
+ struct dispatch_rq_data data = {
+ .hctx = hctx,
+ .rq = NULL,
+ };
+
+ __sbitmap_for_each_set(&hctx->ctx_map, off,
+ dispatch_rq_from_ctx, &data);
+
+ return data.rq;
+}
+
static inline unsigned int queued_to_index(unsigned int queued)
{
if (!queued)
@@ -920,136 +1087,131 @@ done:
return rq->tag != -1;
}
-static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
- struct request *rq)
-{
- blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag);
- rq->tag = -1;
-
- if (rq->rq_flags & RQF_MQ_INFLIGHT) {
- rq->rq_flags &= ~RQF_MQ_INFLIGHT;
- atomic_dec(&hctx->nr_active);
- }
-}
-
-static void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx,
- struct request *rq)
-{
- if (rq->tag == -1 || rq->internal_tag == -1)
- return;
-
- __blk_mq_put_driver_tag(hctx, rq);
-}
-
-static void blk_mq_put_driver_tag(struct request *rq)
+static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
+ int flags, void *key)
{
struct blk_mq_hw_ctx *hctx;
- if (rq->tag == -1 || rq->internal_tag == -1)
- return;
+ hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
- hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
- __blk_mq_put_driver_tag(hctx, rq);
+ list_del_init(&wait->entry);
+ blk_mq_run_hw_queue(hctx, true);
+ return 1;
}
/*
- * If we fail getting a driver tag because all the driver tags are already
- * assigned and on the dispatch list, BUT the first entry does not have a
- * tag, then we could deadlock. For that case, move entries with assigned
- * driver tags to the front, leaving the set of tagged requests in the
- * same order, and the untagged set in the same order.
+ * Mark us waiting for a tag. For shared tags, this involves hooking us into
+ * the tag wakeups. For non-shared tags, we can simply mark us needing a
+ * restart. For both cases, take care to check the condition again after
+ * marking us as waiting.
*/
-static bool reorder_tags_to_front(struct list_head *list)
+static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
+ struct request *rq)
{
- struct request *rq, *tmp, *first = NULL;
-
- list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) {
- if (rq == first)
- break;
- if (rq->tag != -1) {
- list_move(&rq->queuelist, list);
- if (!first)
- first = rq;
- }
- }
+ struct blk_mq_hw_ctx *this_hctx = *hctx;
+ struct sbq_wait_state *ws;
+ wait_queue_entry_t *wait;
+ bool ret;
- return first != NULL;
-}
+ if (!(this_hctx->flags & BLK_MQ_F_TAG_SHARED)) {
+ if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state))
+ set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state);
-static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
- void *key)
-{
- struct blk_mq_hw_ctx *hctx;
+ /*
+ * It's possible that a tag was freed in the window between the
+ * allocation failure and adding the hardware queue to the wait
+ * queue.
+ *
+ * Don't clear RESTART here, someone else could have set it.
+ * At most this will cost an extra queue run.
+ */
+ return blk_mq_get_driver_tag(rq, hctx, false);
+ }
- hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
+ wait = &this_hctx->dispatch_wait;
+ if (!list_empty_careful(&wait->entry))
+ return false;
- list_del(&wait->entry);
- clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
- blk_mq_run_hw_queue(hctx, true);
- return 1;
-}
+ spin_lock(&this_hctx->lock);
+ if (!list_empty(&wait->entry)) {
+ spin_unlock(&this_hctx->lock);
+ return false;
+ }
-static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
-{
- struct sbq_wait_state *ws;
+ ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
+ add_wait_queue(&ws->wait, wait);
/*
- * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait.
- * The thread which wins the race to grab this bit adds the hardware
- * queue to the wait queue.
+ * It's possible that a tag was freed in the window between the
+ * allocation failure and adding the hardware queue to the wait
+ * queue.
*/
- if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) ||
- test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state))
+ ret = blk_mq_get_driver_tag(rq, hctx, false);
+ if (!ret) {
+ spin_unlock(&this_hctx->lock);
return false;
-
- init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
- ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx);
+ }
/*
- * As soon as this returns, it's no longer safe to fiddle with
- * hctx->dispatch_wait, since a completion can wake up the wait queue
- * and unlock the bit.
+ * We got a tag, remove ourselves from the wait queue to ensure
+ * someone else gets the wakeup.
*/
- add_wait_queue(&ws->wait, &hctx->dispatch_wait);
+ spin_lock_irq(&ws->wait.lock);
+ list_del_init(&wait->entry);
+ spin_unlock_irq(&ws->wait.lock);
+ spin_unlock(&this_hctx->lock);
+
return true;
}
-bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
+#define BLK_MQ_RESOURCE_DELAY 3 /* ms units */
+
+bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
+ bool got_budget)
{
struct blk_mq_hw_ctx *hctx;
- struct request *rq;
+ struct request *rq, *nxt;
+ bool no_tag = false;
int errors, queued;
+ blk_status_t ret = BLK_STS_OK;
if (list_empty(list))
return false;
+ WARN_ON(!list_is_singular(list) && got_budget);
+
/*
* Now process all the entries, sending them to the driver.
*/
errors = queued = 0;
do {
struct blk_mq_queue_data bd;
- blk_status_t ret;
rq = list_first_entry(list, struct request, queuelist);
if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
- if (!queued && reorder_tags_to_front(list))
- continue;
-
/*
* The initial allocation attempt failed, so we need to
- * rerun the hardware queue when a tag is freed.
+ * rerun the hardware queue when a tag is freed. The
+ * waitqueue takes care of that. If the queue is run
+ * before we add this entry back on the dispatch list,
+ * we'll re-run it below.
*/
- if (!blk_mq_dispatch_wait_add(hctx))
+ if (!blk_mq_mark_tag_wait(&hctx, rq)) {
+ if (got_budget)
+ blk_mq_put_dispatch_budget(hctx);
+ /*
+ * For non-shared tags, the RESTART check
+ * will suffice.
+ */
+ if (hctx->flags & BLK_MQ_F_TAG_SHARED)
+ no_tag = true;
break;
+ }
+ }
- /*
- * It's possible that a tag was freed in the window
- * between the allocation failure and adding the
- * hardware queue to the wait queue.
- */
- if (!blk_mq_get_driver_tag(rq, &hctx, false))
- break;
+ if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {
+ blk_mq_put_driver_tag(rq);
+ break;
}
list_del_init(&rq->queuelist);
@@ -1063,15 +1225,21 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
if (list_empty(list))
bd.last = true;
else {
- struct request *nxt;
-
nxt = list_first_entry(list, struct request, queuelist);
bd.last = !blk_mq_get_driver_tag(nxt, NULL, false);
}
ret = q->mq_ops->queue_rq(hctx, &bd);
- if (ret == BLK_STS_RESOURCE) {
- blk_mq_put_driver_tag_hctx(hctx, rq);
+ if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
+ /*
+ * If an I/O scheduler has been configured and we got a
+ * driver tag for the next request already, free it
+ * again.
+ */
+ if (!list_empty(list)) {
+ nxt = list_first_entry(list, struct request, queuelist);
+ blk_mq_put_driver_tag(nxt);
+ }
list_add(&rq->queuelist, list);
__blk_mq_requeue_request(rq);
break;
@@ -1093,12 +1261,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
* that is where we will continue on next queue run.
*/
if (!list_empty(list)) {
- /*
- * If an I/O scheduler has been configured and we got a driver
- * tag for the next request already, free it again.
- */
- rq = list_first_entry(list, struct request, queuelist);
- blk_mq_put_driver_tag(rq);
+ bool needs_restart;
spin_lock(&hctx->lock);
list_splice_init(list, &hctx->dispatch);
@@ -1109,10 +1272,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
* it is no longer set that means that it was cleared by another
* thread and hence that a queue rerun is needed.
*
- * If TAG_WAITING is set that means that an I/O scheduler has
- * been configured and another thread is waiting for a driver
- * tag. To guarantee fairness, do not rerun this hardware queue
- * but let the other thread grab the driver tag.
+ * If 'no_tag' is set, that means that we failed getting
+ * a driver tag with an I/O scheduler attached. If our dispatch
+ * waitqueue is no longer active, ensure that we run the queue
+ * AFTER adding our entries back to the list.
*
* If no I/O scheduler has been configured it is possible that
* the hardware queue got stopped and restarted before requests
@@ -1123,10 +1286,17 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
* - Some but not all block drivers stop a queue before
* returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
* and dm-rq.
+ *
+ * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
+ * bit is set, run queue after a delay to avoid IO stalls
+ * that could otherwise occur if the queue is idle.
*/
- if (!blk_mq_sched_needs_restart(hctx) &&
- !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
+ needs_restart = blk_mq_sched_needs_restart(hctx);
+ if (!needs_restart ||
+ (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
blk_mq_run_hw_queue(hctx, true);
+ else if (needs_restart && (ret == BLK_STS_RESOURCE))
+ blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
}
return (queued + errors) != 0;
@@ -1139,9 +1309,27 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
/*
* We should be running this queue from one of the CPUs that
* are mapped to it.
+ *
+ * There are at least two related races now between setting
+ * hctx->next_cpu from blk_mq_hctx_next_cpu() and running
+ * __blk_mq_run_hw_queue():
+ *
+ * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(),
+	 *   but later it becomes online, in which case this warning is
+	 *   harmless
+ *
+ * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(),
+	 *   but later it becomes offline, in which case the warning can't be
+	 *   triggered, and we rely on the blk-mq timeout handler to
+	 *   handle requests dispatched to this hctx
*/
- WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
- cpu_online(hctx->next_cpu));
+ if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
+ cpu_online(hctx->next_cpu)) {
+ printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n",
+ raw_smp_processor_id(),
+ cpumask_empty(hctx->cpumask) ? "inactive": "active");
+ dump_stack();
+ }
/*
* We can't run the queue inline with ints disabled. Ensure that
@@ -1149,17 +1337,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
*/
WARN_ON_ONCE(in_interrupt());
- if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
- rcu_read_lock();
- blk_mq_sched_dispatch_requests(hctx);
- rcu_read_unlock();
- } else {
- might_sleep();
+ might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
- srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
- blk_mq_sched_dispatch_requests(hctx);
- srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
- }
+ hctx_lock(hctx, &srcu_idx);
+ blk_mq_sched_dispatch_requests(hctx);
+ hctx_unlock(hctx, srcu_idx);
}
/*
@@ -1170,20 +1352,47 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
*/
static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
{
+ bool tried = false;
+
if (hctx->queue->nr_hw_queues == 1)
return WORK_CPU_UNBOUND;
if (--hctx->next_cpu_batch <= 0) {
int next_cpu;
-
- next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
+select_cpu:
+ next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask,
+ cpu_online_mask);
if (next_cpu >= nr_cpu_ids)
- next_cpu = cpumask_first(hctx->cpumask);
+			next_cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);
- hctx->next_cpu = next_cpu;
+ /*
+ * No online CPU is found, so have to make sure hctx->next_cpu
+ * is set correctly for not breaking workqueue.
+ */
+ if (next_cpu >= nr_cpu_ids)
+ hctx->next_cpu = cpumask_first(hctx->cpumask);
+ else
+ hctx->next_cpu = next_cpu;
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
}
+ /*
+	 * Do an unbound schedule if we can't find an online CPU for this hctx,
+	 * which should only happen in the CPU DEAD handling path.
+ */
+ if (!cpu_online(hctx->next_cpu)) {
+ if (!tried) {
+ tried = true;
+ goto select_cpu;
+ }
+
+ /*
+ * Make sure to re-select CPU next time once after CPUs
+ * in hctx->cpumask become online again.
+ */
+ hctx->next_cpu_batch = 1;
+ return WORK_CPU_UNBOUND;
+ }
return hctx->next_cpu;
}
@@ -1207,9 +1416,8 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
put_cpu();
}
- kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
- &hctx->run_work,
- msecs_to_jiffies(msecs));
+ kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
+ msecs_to_jiffies(msecs));
}
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
@@ -1218,9 +1426,30 @@ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
-void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
+bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
- __blk_mq_delay_run_hw_queue(hctx, async, 0);
+ int srcu_idx;
+ bool need_run;
+
+ /*
+	 * When the queue is quiesced, we may be switching the io scheduler,
+	 * updating nr_hw_queues, or doing other things, and we can't run the
+	 * queue any more; even __blk_mq_hctx_has_pending() can't be called safely.
+	 *
+	 * The queue will be rerun in blk_mq_unquiesce_queue() if it is
+	 * quiesced.
+ */
+ hctx_lock(hctx, &srcu_idx);
+ need_run = !blk_queue_quiesced(hctx->queue) &&
+ blk_mq_hctx_has_pending(hctx);
+ hctx_unlock(hctx, srcu_idx);
+
+ if (need_run) {
+ __blk_mq_delay_run_hw_queue(hctx, async, 0);
+ return true;
+ }
+
+ return false;
}
EXPORT_SYMBOL(blk_mq_run_hw_queue);
@@ -1230,8 +1459,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
int i;
queue_for_each_hw_ctx(q, hctx, i) {
- if (!blk_mq_hctx_has_pending(hctx) ||
- blk_mq_hctx_stopped(hctx))
+ if (blk_mq_hctx_stopped(hctx))
continue;
blk_mq_run_hw_queue(hctx, async);
@@ -1405,7 +1633,7 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
* Should only be used carefully, when the caller knows we want to
* bypass a potential IO scheduler on the target device.
*/
-void blk_mq_request_bypass_insert(struct request *rq)
+void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
{
struct blk_mq_ctx *ctx = rq->mq_ctx;
struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
@@ -1414,7 +1642,8 @@ void blk_mq_request_bypass_insert(struct request *rq)
list_add_tail(&rq->queuelist, &hctx->dispatch);
spin_unlock(&hctx->lock);
- blk_mq_run_hw_queue(hctx, false);
+ if (run_queue)
+ blk_mq_run_hw_queue(hctx, false);
}
void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
@@ -1501,13 +1730,9 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
{
blk_init_request_from_bio(rq, bio);
- blk_account_io_start(rq, true);
-}
+ blk_rq_set_rl(rq, blk_get_rl(rq->q, bio));
-static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)
-{
- return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
- !blk_queue_nomerges(hctx->queue);
+ blk_account_io_start(rq, true);
}
static inline void blk_mq_queue_io(struct blk_mq_hw_ctx *hctx,
@@ -1527,9 +1752,9 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
}
-static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
- struct request *rq,
- blk_qc_t *cookie, bool may_sleep)
+static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
+ struct request *rq,
+ blk_qc_t *cookie)
{
struct request_queue *q = rq->q;
struct blk_mq_queue_data bd = {
@@ -1538,61 +1763,104 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
};
blk_qc_t new_cookie;
blk_status_t ret;
- bool run_queue = true;
-
- /* RCU or SRCU read lock is needed before checking quiesced flag */
- if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
- run_queue = false;
- goto insert;
- }
-
- if (q->elevator)
- goto insert;
-
- if (!blk_mq_get_driver_tag(rq, NULL, false))
- goto insert;
new_cookie = request_to_qc_t(hctx, rq);
/*
- * For OK queue, we are done. For error, kill it. Any other
- * error (busy), just add it to our list as we previously
- * would have done
+ * For OK queue, we are done. For error, caller may kill it.
+ * Any other error (busy), just add it to our list as we
+ * previously would have done.
*/
ret = q->mq_ops->queue_rq(hctx, &bd);
switch (ret) {
case BLK_STS_OK:
*cookie = new_cookie;
- return;
+ break;
case BLK_STS_RESOURCE:
+ case BLK_STS_DEV_RESOURCE:
__blk_mq_requeue_request(rq);
- goto insert;
+ break;
default:
*cookie = BLK_QC_T_NONE;
- blk_mq_end_request(rq, ret);
- return;
+ break;
+ }
+
+ return ret;
+}
+
+static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
+ struct request *rq,
+ blk_qc_t *cookie,
+ bool bypass_insert)
+{
+ struct request_queue *q = rq->q;
+ bool run_queue = true;
+
+ /*
+ * RCU or SRCU read lock is needed before checking quiesced flag.
+ *
+	 * When the queue is stopped or quiesced, ignore 'bypass_insert' from
+	 * blk_mq_request_issue_directly(), return BLK_STS_OK to the caller,
+	 * and avoid having the driver try to dispatch again.
+ */
+ if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
+ run_queue = false;
+ bypass_insert = false;
+ goto insert;
}
+ if (q->elevator && !bypass_insert)
+ goto insert;
+
+ if (!blk_mq_get_driver_tag(rq, NULL, false))
+ goto insert;
+
+ if (!blk_mq_get_dispatch_budget(hctx)) {
+ blk_mq_put_driver_tag(rq);
+ goto insert;
+ }
+
+ return __blk_mq_issue_directly(hctx, rq, cookie);
insert:
- blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep);
+ if (bypass_insert)
+ return BLK_STS_RESOURCE;
+
+ blk_mq_sched_insert_request(rq, false, run_queue, false);
+ return BLK_STS_OK;
}
static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
struct request *rq, blk_qc_t *cookie)
{
- if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
- rcu_read_lock();
- __blk_mq_try_issue_directly(hctx, rq, cookie, false);
- rcu_read_unlock();
- } else {
- unsigned int srcu_idx;
+ blk_status_t ret;
+ int srcu_idx;
- might_sleep();
+ might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
- srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
- __blk_mq_try_issue_directly(hctx, rq, cookie, true);
- srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
- }
+ hctx_lock(hctx, &srcu_idx);
+
+ ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
+ if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
+ blk_mq_sched_insert_request(rq, false, true, false);
+ else if (ret != BLK_STS_OK)
+ blk_mq_end_request(rq, ret);
+
+ hctx_unlock(hctx, srcu_idx);
+}
+
+blk_status_t blk_mq_request_issue_directly(struct request *rq)
+{
+ blk_status_t ret;
+ int srcu_idx;
+ blk_qc_t unused_cookie;
+ struct blk_mq_ctx *ctx = rq->mq_ctx;
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
+
+ hctx_lock(hctx, &srcu_idx);
+ ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
+ hctx_unlock(hctx, srcu_idx);
+
+ return ret;
}
static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
@@ -1641,13 +1909,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
if (unlikely(is_flush_fua)) {
blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
- if (q->elevator) {
- blk_mq_sched_insert_request(rq, false, true, true,
- true);
- } else {
- blk_insert_flush(rq);
- blk_mq_run_hw_queue(data.hctx, true);
- }
+
+ /* bypass scheduler for flush rq */
+ blk_insert_flush(rq);
+ blk_mq_run_hw_queue(data.hctx, true);
} else if (plug && q->nr_hw_queues == 1) {
struct request *last = NULL;
@@ -1706,7 +1971,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
} else if (q->elevator) {
blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
- blk_mq_sched_insert_request(rq, false, true, true, true);
+ blk_mq_sched_insert_request(rq, false, true, true);
} else {
blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
@@ -1799,6 +2064,22 @@ static size_t order_to_size(unsigned int order)
return (size_t)PAGE_SIZE << order;
}
+static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
+ unsigned int hctx_idx, int node)
+{
+ int ret;
+
+ if (set->ops->init_request) {
+ ret = set->ops->init_request(set, rq, hctx_idx, node);
+ if (ret)
+ return ret;
+ }
+
+ seqcount_init(&rq->gstate_seq);
+ u64_stats_init(&rq->aborted_gstate_sync);
+ return 0;
+}
+
int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx, unsigned int depth)
{
@@ -1860,12 +2141,9 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
struct request *rq = p;
tags->static_rqs[i] = rq;
- if (set->ops->init_request) {
- if (set->ops->init_request(set, rq, hctx_idx,
- node)) {
- tags->static_rqs[i] = NULL;
- goto fail;
- }
+ if (blk_mq_init_request(set, rq, hctx_idx, node)) {
+ tags->static_rqs[i] = NULL;
+ goto fail;
}
p += rq_size;
@@ -1924,7 +2202,8 @@ static void blk_mq_exit_hctx(struct request_queue *q,
{
blk_mq_debugfs_unregister_hctx(hctx);
- blk_mq_tag_idle(hctx);
+ if (blk_mq_hw_queue_mapped(hctx))
+ blk_mq_tag_idle(hctx);
if (set->ops->exit_request)
set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
@@ -1935,7 +2214,7 @@ static void blk_mq_exit_hctx(struct request_queue *q,
set->ops->exit_hctx(hctx, hctx_idx);
if (hctx->flags & BLK_MQ_F_BLOCKING)
- cleanup_srcu_struct(hctx->queue_rq_srcu);
+ cleanup_srcu_struct(hctx->srcu);
blk_mq_remove_cpuhp(hctx);
blk_free_flush_queue(hctx->fq);
@@ -1979,7 +2258,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
* Allocate space for all possible cpus to avoid allocation at
* runtime
*/
- hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
+ hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
GFP_KERNEL, node);
if (!hctx->ctxs)
goto unregister_cpu_notifier;
@@ -1990,6 +2269,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
hctx->nr_ctx = 0;
+ init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
+ INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
+
if (set->ops->init_hctx &&
set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
goto free_bitmap;
@@ -2001,13 +2283,11 @@ static int blk_mq_init_hctx(struct request_queue *q,
if (!hctx->fq)
goto sched_exit_hctx;
- if (set->ops->init_request &&
- set->ops->init_request(set, hctx->fq->flush_rq, hctx_idx,
- node))
+ if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
goto free_fq;
if (hctx->flags & BLK_MQ_F_BLOCKING)
- init_srcu_struct(hctx->queue_rq_srcu);
+ init_srcu_struct(hctx->srcu);
blk_mq_debugfs_register_hctx(q, hctx);
@@ -2043,16 +2323,11 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
INIT_LIST_HEAD(&__ctx->rq_list);
__ctx->queue = q;
- /* If the cpu isn't present, the cpu is mapped to first hctx */
- if (!cpu_present(i))
- continue;
-
- hctx = blk_mq_map_queue(q, i);
-
/*
* Set local node, IFF we have more than one hw queue. If
* not, we remain on the home node of the device
*/
+ hctx = blk_mq_map_queue(q, i);
if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
hctx->numa_node = local_memory_node(cpu_to_node(i));
}
@@ -2109,7 +2384,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
*
* If the cpu isn't present, the cpu is mapped to first hctx.
*/
- for_each_present_cpu(i) {
+ for_each_possible_cpu(i) {
hctx_idx = q->mq_map[i];
/* unmapped hw queue can be remapped after CPU topo changed */
if (!set->tags[hctx_idx] &&
@@ -2163,7 +2438,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
/*
* Initialize batch roundrobin counts
*/
- hctx->next_cpu = cpumask_first(hctx->cpumask);
+ hctx->next_cpu = cpumask_first_and(hctx->cpumask,
+ cpu_online_mask);
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
}
}
@@ -2229,8 +2505,11 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
mutex_lock(&set->tag_list_lock);
- /* Check to see if we're transitioning to shared (from 1 to 2 queues). */
- if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_SHARED)) {
+ /*
+ * Check to see if we're transitioning to shared (from 1 to 2 queues).
+ */
+ if (!list_empty(&set->tag_list) &&
+ !(set->flags & BLK_MQ_F_TAG_SHARED)) {
set->flags |= BLK_MQ_F_TAG_SHARED;
/* update existing queue */
blk_mq_update_tag_set_depth(set, true);
@@ -2293,7 +2572,7 @@ static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
{
int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
- BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, queue_rq_srcu),
+ BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
__alignof__(struct blk_mq_hw_ctx)) !=
sizeof(struct blk_mq_hw_ctx));
@@ -2310,6 +2589,9 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
blk_mq_sysfs_unregister(q);
+
+ /* protect against switching io scheduler */
+ mutex_lock(&q->sysfs_lock);
for (i = 0; i < set->nr_hw_queues; i++) {
int node;
@@ -2354,6 +2636,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
}
}
q->nr_hw_queues = i;
+ mutex_unlock(&q->sysfs_lock);
blk_mq_sysfs_register(q);
}
@@ -2404,6 +2687,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
spin_lock_init(&q->requeue_lock);
blk_queue_make_request(q, blk_mq_make_request);
+ if (q->mq_ops->poll)
+ q->poll_fn = blk_mq_poll;
/*
* Do this after blk_queue_make_request() overrides it...
@@ -2460,10 +2745,9 @@ static void blk_mq_queue_reinit(struct request_queue *q)
/*
* redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
- * we should change hctx numa_node according to new topology (this
- * involves free and re-allocate memory, worthy doing?)
+ * we should change hctx numa_node according to the new topology (this
+ * involves freeing and re-allocating memory, worth doing?)
*/
-
blk_mq_map_swqueue(q);
blk_mq_sysfs_register(q);
@@ -2524,9 +2808,27 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
{
- if (set->ops->map_queues)
+ if (set->ops->map_queues) {
+ int cpu;
+ /*
+ * transport .map_queues is usually done in the following
+ * way:
+ *
+ * for (queue = 0; queue < set->nr_hw_queues; queue++) {
+ * mask = get_cpu_mask(queue)
+ * for_each_cpu(cpu, mask)
+ * set->mq_map[cpu] = queue;
+ * }
+ *
+	 * When we need to remap, the table has to be cleared so that
+	 * stale mappings are removed, since a CPU may end up not being
+	 * mapped to any hw queue.
+ */
+ for_each_possible_cpu(cpu)
+ set->mq_map[cpu] = 0;
+
return set->ops->map_queues(set);
- else
+ } else
return blk_mq_map_queues(set);
}
@@ -2552,6 +2854,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (!set->ops->queue_rq)
return -EINVAL;
+ if (!set->ops->get_budget ^ !set->ops->put_budget)
+ return -EINVAL;
+
if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
pr_info("blk-mq: reduced tag depth to %u\n",
BLK_MQ_MAX_DEPTH);
@@ -2632,6 +2937,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
return -EINVAL;
blk_mq_freeze_queue(q);
+ blk_mq_quiesce_queue(q);
ret = 0;
queue_for_each_hw_ctx(q, hctx, i) {
@@ -2642,8 +2948,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
* queue depth. This is similar to what the old code would do.
*/
if (!hctx->sched_tags) {
- ret = blk_mq_tag_update_depth(hctx, &hctx->tags,
- min(nr, set->queue_depth),
+ ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
false);
} else {
ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
@@ -2656,6 +2961,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
if (!ret)
q->nr_requests = nr;
+ blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q);
return ret;
@@ -2771,7 +3077,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
unsigned int nsecs;
ktime_t kt;
- if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags))
+ if (rq->rq_flags & RQF_MQ_POLL_SLEPT)
return false;
/*
@@ -2791,7 +3097,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
if (!nsecs)
return false;
- set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
+ rq->rq_flags |= RQF_MQ_POLL_SLEPT;
/*
* This will be replaced with the stats tracking code, using
@@ -2805,7 +3111,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
hrtimer_init_sleeper(&hs, current);
do {
- if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
+ if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
break;
set_current_state(TASK_UNINTERRUPTIBLE);
hrtimer_start_expires(&hs.timer, mode);
@@ -2860,23 +3166,18 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
cpu_relax();
}
+ __set_current_state(TASK_RUNNING);
return false;
}
-bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
+static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
{
struct blk_mq_hw_ctx *hctx;
- struct blk_plug *plug;
struct request *rq;
- if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) ||
- !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+ if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
return false;
- plug = current->plug;
- if (plug)
- blk_flush_plug_list(plug, false);
-
hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
if (!blk_qc_t_is_internal(cookie))
rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
@@ -2894,7 +3195,6 @@ bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
return __blk_mq_poll(hctx, rq);
}
-EXPORT_SYMBOL_GPL(blk_mq_poll);
static int __init blk_mq_init(void)
{
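
The comment added in blk_mq_update_queue_map() above spells out the usual shape of a transport's .map_queues callback. Below is a minimal sketch of such a callback; the function name and the example_queue_cpumask() helper are hypothetical, only the set->mq_map[cpu] = queue assignment pattern comes from the comment.

/* Hypothetical transport .map_queues following the pattern described above. */
static int example_map_queues(struct blk_mq_tag_set *set)
{
	unsigned int queue;
	int cpu;

	for (queue = 0; queue < set->nr_hw_queues; queue++) {
		/* cpumask this hw queue was bound to by the transport (hypothetical) */
		const struct cpumask *mask = example_queue_cpumask(queue);

		for_each_cpu(cpu, mask)
			set->mq_map[cpu] = queue;
	}

	/*
	 * CPUs not covered by any mask keep the 0 that the core wrote into
	 * mq_map before calling ->map_queues(), i.e. they fall back to hw
	 * queue 0 instead of keeping a stale mapping.
	 */
	return 0;
}
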
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 4933af9d61f7..88c558f71819 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -3,6 +3,7 @@
#define INT_BLK_MQ_H
#include "blk-stat.h"
+#include "blk-mq-tag.h"
struct blk_mq_tag_set;
@@ -26,16 +27,30 @@ struct blk_mq_ctx {
struct kobject kobj;
} ____cacheline_aligned_in_smp;
-void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
+/*
+ * Bits for request->gstate. The lower two bits carry MQ_RQ_* state value
+ * and the upper bits the generation number.
+ */
+enum mq_rq_state {
+ MQ_RQ_IDLE = 0,
+ MQ_RQ_IN_FLIGHT = 1,
+ MQ_RQ_COMPLETE = 2,
+
+ MQ_RQ_STATE_BITS = 2,
+ MQ_RQ_STATE_MASK = (1 << MQ_RQ_STATE_BITS) - 1,
+ MQ_RQ_GEN_INC = 1 << MQ_RQ_STATE_BITS,
+};
+
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_free_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
-bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *);
+bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool);
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
-bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx);
bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
bool wait);
+struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
+ struct blk_mq_ctx *start);
/*
* Internal helpers for allocating/freeing the request map
@@ -55,10 +70,13 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
*/
void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
bool at_head);
-void blk_mq_request_bypass_insert(struct request *rq);
+void blk_mq_request_bypass_insert(struct request *rq, bool run_queue);
void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
struct list_head *list);
+/* Used by blk_insert_cloned_request() to issue request directly */
+blk_status_t blk_mq_request_issue_directly(struct request *rq);
+
/*
* CPU -> queue mappings
*/
@@ -80,10 +98,41 @@ extern int blk_mq_sysfs_register(struct request_queue *q);
extern void blk_mq_sysfs_unregister(struct request_queue *q);
extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
-extern void blk_mq_rq_timed_out(struct request *req, bool reserved);
-
void blk_mq_release(struct request_queue *q);
+/**
+ * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
+ * @rq: target request.
+ */
+static inline int blk_mq_rq_state(struct request *rq)
+{
+ return READ_ONCE(rq->gstate) & MQ_RQ_STATE_MASK;
+}
+
+/**
+ * blk_mq_rq_update_state() - set the current MQ_RQ_* state of a request
+ * @rq: target request.
+ * @state: new state to set.
+ *
+ * Set @rq's state to @state. The caller is responsible for ensuring that
+ * there are no other updaters. A request can transition into IN_FLIGHT
+ * only from IDLE and doing so increments the generation number.
+ */
+static inline void blk_mq_rq_update_state(struct request *rq,
+ enum mq_rq_state state)
+{
+ u64 old_val = READ_ONCE(rq->gstate);
+ u64 new_val = (old_val & ~MQ_RQ_STATE_MASK) | state;
+
+ if (state == MQ_RQ_IN_FLIGHT) {
+ WARN_ON_ONCE((old_val & MQ_RQ_STATE_MASK) != MQ_RQ_IDLE);
+ new_val += MQ_RQ_GEN_INC;
+ }
+
+ /* avoid exposing interim values */
+ WRITE_ONCE(rq->gstate, new_val);
+}
+
static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
unsigned int cpu)
{
@@ -109,7 +158,7 @@ static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
struct blk_mq_alloc_data {
/* input parameter */
struct request_queue *q;
- unsigned int flags;
+ blk_mq_req_flags_t flags;
unsigned int shallow_depth;
/* input & output parameter */
@@ -138,4 +187,53 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
unsigned int inflight[2]);
+static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx)
+{
+ struct request_queue *q = hctx->queue;
+
+ if (q->mq_ops->put_budget)
+ q->mq_ops->put_budget(hctx);
+}
+
+static inline bool blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx)
+{
+ struct request_queue *q = hctx->queue;
+
+ if (q->mq_ops->get_budget)
+ return q->mq_ops->get_budget(hctx);
+ return true;
+}
+
+static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
+ struct request *rq)
+{
+ blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag);
+ rq->tag = -1;
+
+ if (rq->rq_flags & RQF_MQ_INFLIGHT) {
+ rq->rq_flags &= ~RQF_MQ_INFLIGHT;
+ atomic_dec(&hctx->nr_active);
+ }
+}
+
+static inline void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx,
+ struct request *rq)
+{
+ if (rq->tag == -1 || rq->internal_tag == -1)
+ return;
+
+ __blk_mq_put_driver_tag(hctx, rq);
+}
+
+static inline void blk_mq_put_driver_tag(struct request *rq)
+{
+ struct blk_mq_hw_ctx *hctx;
+
+ if (rq->tag == -1 || rq->internal_tag == -1)
+ return;
+
+ hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
+ __blk_mq_put_driver_tag(hctx, rq);
+}
+
#endif
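
As a quick illustration of the request->gstate layout introduced above (the MQ_RQ_* state lives in the low two bits, the generation number in the bits above them), here is a stand-alone userspace sketch; the names are local to the example and only the bit manipulation mirrors blk_mq_rq_update_state():

#include <assert.h>
#include <stdint.h>

#define STATE_BITS 2
#define STATE_MASK ((1u << STATE_BITS) - 1)	/* mirrors MQ_RQ_STATE_MASK */
#define GEN_INC    (1u << STATE_BITS)		/* mirrors MQ_RQ_GEN_INC */

int main(void)
{
	uint64_t gstate = 0;				/* MQ_RQ_IDLE, generation 0 */

	gstate = (gstate & ~(uint64_t)STATE_MASK) | 1;	/* IDLE -> IN_FLIGHT ... */
	gstate += GEN_INC;				/* ... bumps the generation */
	assert((gstate & STATE_MASK) == 1);
	assert((gstate >> STATE_BITS) == 1);

	gstate = (gstate & ~(uint64_t)STATE_MASK) | 2;	/* IN_FLIGHT -> COMPLETE */
	assert((gstate >> STATE_BITS) == 1);		/* generation unchanged */
	return 0;
}

Bumping the generation on every IDLE -> IN_FLIGHT transition is what allows a reader to detect that a request it sampled has since been recycled for a new I/O.
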
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 8559e9563c52..48ebe6be07b7 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -157,7 +157,7 @@ EXPORT_SYMBOL(blk_set_stacking_limits);
* Caveat:
* The driver that does this *must* be able to deal appropriately
* with buffers in "highmemory". This can be accomplished by either calling
- * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling
+ * kmap_atomic() to get a temporary kernel mapping, or by calling
* blk_queue_bounce() to create a buffer in normal memory.
**/
void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
diff --git a/block/blk-stat.c b/block/blk-stat.c
index c52356d90fe3..28003bf9941c 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -11,8 +11,6 @@
#include "blk-mq.h"
#include "blk.h"
-#define BLK_RQ_STAT_BATCH 64
-
struct blk_queue_stats {
struct list_head callbacks;
spinlock_t lock;
@@ -23,45 +21,21 @@ static void blk_stat_init(struct blk_rq_stat *stat)
{
stat->min = -1ULL;
stat->max = stat->nr_samples = stat->mean = 0;
- stat->batch = stat->nr_batch = 0;
-}
-
-static void blk_stat_flush_batch(struct blk_rq_stat *stat)
-{
- const s32 nr_batch = READ_ONCE(stat->nr_batch);
- const s32 nr_samples = READ_ONCE(stat->nr_samples);
-
- if (!nr_batch)
- return;
- if (!nr_samples)
- stat->mean = div64_s64(stat->batch, nr_batch);
- else {
- stat->mean = div64_s64((stat->mean * nr_samples) +
- stat->batch,
- nr_batch + nr_samples);
- }
-
- stat->nr_samples += nr_batch;
- stat->nr_batch = stat->batch = 0;
+ stat->batch = 0;
}
+/* src is a per-cpu stat, mean isn't initialized */
static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
{
- blk_stat_flush_batch(src);
-
if (!src->nr_samples)
return;
dst->min = min(dst->min, src->min);
dst->max = max(dst->max, src->max);
- if (!dst->nr_samples)
- dst->mean = src->mean;
- else {
- dst->mean = div64_s64((src->mean * src->nr_samples) +
- (dst->mean * dst->nr_samples),
- dst->nr_samples + src->nr_samples);
- }
+ dst->mean = div_u64(src->batch + dst->mean * dst->nr_samples,
+ dst->nr_samples + src->nr_samples);
+
dst->nr_samples += src->nr_samples;
}
@@ -69,13 +43,8 @@ static void __blk_stat_add(struct blk_rq_stat *stat, u64 value)
{
stat->min = min(stat->min, value);
stat->max = max(stat->max, value);
-
- if (stat->batch + value < stat->batch ||
- stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
- blk_stat_flush_batch(stat);
-
stat->batch += value;
- stat->nr_batch++;
+ stat->nr_samples++;
}
void blk_stat_add(struct request *rq)
@@ -84,7 +53,7 @@ void blk_stat_add(struct request *rq)
struct blk_stat_callback *cb;
struct blk_rq_stat *stat;
int bucket;
- s64 now, value;
+ u64 now, value;
now = __blk_stat_time(ktime_to_ns(ktime_get()));
if (now < blk_stat_time(&rq->issue_stat))
@@ -110,9 +79,9 @@ void blk_stat_add(struct request *rq)
rcu_read_unlock();
}
-static void blk_stat_timer_fn(unsigned long data)
+static void blk_stat_timer_fn(struct timer_list *t)
{
- struct blk_stat_callback *cb = (void *)data;
+ struct blk_stat_callback *cb = from_timer(cb, t, timer);
unsigned int bucket;
int cpu;
@@ -161,7 +130,7 @@ blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *),
cb->bucket_fn = bucket_fn;
cb->data = data;
cb->buckets = buckets;
- setup_timer(&cb->timer, blk_stat_timer_fn, (unsigned long)cb);
+ timer_setup(&cb->timer, blk_stat_timer_fn, 0);
return cb;
}
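
A worked example of the simplified blk_stat_sum() above: with the per-cpu source holding batch = 500 over nr_samples = 2, and the destination already at mean = 100 over nr_samples = 3, the merge computes div_u64(500 + 100 * 3, 3 + 2) = 160 and bumps dst->nr_samples to 5, which is the exact mean of all five samples. Because the per-cpu side now only accumulates batch and nr_samples in __blk_stat_add() and the mean is computed once at sum time, the old BLK_RQ_STAT_BATCH flushing logic removed above is no longer needed.
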
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index e54be402899d..cbea895a5547 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -450,12 +450,9 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
ret = wbt_init(q);
if (ret)
return ret;
-
- rwb = q->rq_wb;
- if (!rwb)
- return -EINVAL;
}
+ rwb = q->rq_wb;
if (val == -1)
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
else if (val >= 0)
@@ -856,6 +853,10 @@ struct kobj_type blk_queue_ktype = {
.release = blk_release_queue,
};
+/**
+ * blk_register_queue - register a block layer queue with sysfs
+ * @disk: Disk of which the request queue should be registered with sysfs.
+ */
int blk_register_queue(struct gendisk *disk)
{
int ret;
@@ -912,11 +913,12 @@ int blk_register_queue(struct gendisk *disk)
if (q->request_fn || (q->mq_ops && q->elevator)) {
ret = elv_register_queue(q);
if (ret) {
+ mutex_unlock(&q->sysfs_lock);
kobject_uevent(&q->kobj, KOBJ_REMOVE);
kobject_del(&q->kobj);
blk_trace_remove_sysfs(dev);
kobject_put(&dev->kobj);
- goto unlock;
+ return ret;
}
}
ret = 0;
@@ -924,7 +926,15 @@ unlock:
mutex_unlock(&q->sysfs_lock);
return ret;
}
+EXPORT_SYMBOL_GPL(blk_register_queue);
+/**
+ * blk_unregister_queue - counterpart of blk_register_queue()
+ * @disk: Disk of which the request queue should be unregistered from sysfs.
+ *
+ * Note: the caller is responsible for guaranteeing that this function is called
+ * after blk_register_queue() has finished.
+ */
void blk_unregister_queue(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
@@ -932,21 +942,39 @@ void blk_unregister_queue(struct gendisk *disk)
if (WARN_ON(!q))
return;
- mutex_lock(&q->sysfs_lock);
- queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q);
- mutex_unlock(&q->sysfs_lock);
+ /* Return early if disk->queue was never registered. */
+ if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
+ return;
- wbt_exit(q);
+ /*
+ * Since sysfs_remove_dir() prevents adding new directory entries
+ * before removal of existing entries starts, protect against
+ * concurrent elv_iosched_store() calls.
+ */
+ mutex_lock(&q->sysfs_lock);
+ spin_lock_irq(q->queue_lock);
+ queue_flag_clear(QUEUE_FLAG_REGISTERED, q);
+ spin_unlock_irq(q->queue_lock);
+ /*
+ * Remove the sysfs attributes before unregistering the queue data
+ * structures that can be modified through sysfs.
+ */
if (q->mq_ops)
blk_mq_unregister_dev(disk_to_dev(disk), q);
-
- if (q->request_fn || (q->mq_ops && q->elevator))
- elv_unregister_queue(q);
+ mutex_unlock(&q->sysfs_lock);
kobject_uevent(&q->kobj, KOBJ_REMOVE);
kobject_del(&q->kobj);
blk_trace_remove_sysfs(disk_to_dev(disk));
+
+ wbt_exit(q);
+
+ mutex_lock(&q->sysfs_lock);
+ if (q->request_fn || (q->mq_ops && q->elevator))
+ elv_unregister_queue(q);
+ mutex_unlock(&q->sysfs_lock);
+
kobject_put(&disk_to_dev(disk)->kobj);
}

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 8631763866c6..c5a131673733 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -216,16 +216,16 @@ struct throtl_data
unsigned int scale;
- struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
- struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
- struct latency_bucket __percpu *latency_buckets;
+ struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE];
+ struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE];
+ struct latency_bucket __percpu *latency_buckets[2];
unsigned long last_calculate_time;
unsigned long filtered_latency;
bool track_bio_latency;
};
-static void throtl_pending_timer_fn(unsigned long arg);
+static void throtl_pending_timer_fn(struct timer_list *t);
static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
{
@@ -478,8 +478,7 @@ static void throtl_service_queue_init(struct throtl_service_queue *sq)
INIT_LIST_HEAD(&sq->queued[0]);
INIT_LIST_HEAD(&sq->queued[1]);
sq->pending_tree = RB_ROOT;
- setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
- (unsigned long)sq);
+ timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0);
}
static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
@@ -1249,9 +1248,9 @@ static bool throtl_can_upgrade(struct throtl_data *td,
* the top-level service_tree is reached, throtl_data->dispatch_work is
* kicked so that the ready bio's are issued.
*/
-static void throtl_pending_timer_fn(unsigned long arg)
+static void throtl_pending_timer_fn(struct timer_list *t)
{
- struct throtl_service_queue *sq = (void *)arg;
+ struct throtl_service_queue *sq = from_timer(sq, t, pending_timer);
struct throtl_grp *tg = sq_to_tg(sq);
struct throtl_data *td = sq_to_td(sq);
struct request_queue *q = td->queue;
@@ -1512,10 +1511,20 @@ static struct cftype throtl_legacy_files[] = {
.seq_show = blkg_print_stat_bytes,
},
{
+ .name = "throttle.io_service_bytes_recursive",
+ .private = (unsigned long)&blkcg_policy_throtl,
+ .seq_show = blkg_print_stat_bytes_recursive,
+ },
+ {
.name = "throttle.io_serviced",
.private = (unsigned long)&blkcg_policy_throtl,
.seq_show = blkg_print_stat_ios,
},
+ {
+ .name = "throttle.io_serviced_recursive",
+ .private = (unsigned long)&blkcg_policy_throtl,
+ .seq_show = blkg_print_stat_ios_recursive,
+ },
{ } /* terminate */
};
@@ -2041,10 +2050,10 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg)
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
static void throtl_update_latency_buckets(struct throtl_data *td)
{
- struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
- int i, cpu;
- unsigned long last_latency = 0;
- unsigned long latency;
+ struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
+ int i, cpu, rw;
+ unsigned long last_latency[2] = { 0 };
+ unsigned long latency[2];
if (!blk_queue_nonrot(td->queue))
return;
@@ -2053,56 +2062,67 @@ static void throtl_update_latency_buckets(struct throtl_data *td)
td->last_calculate_time = jiffies;
memset(avg_latency, 0, sizeof(avg_latency));
- for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
- struct latency_bucket *tmp = &td->tmp_buckets[i];
-
- for_each_possible_cpu(cpu) {
- struct latency_bucket *bucket;
-
- /* this isn't race free, but ok in practice */
- bucket = per_cpu_ptr(td->latency_buckets, cpu);
- tmp->total_latency += bucket[i].total_latency;
- tmp->samples += bucket[i].samples;
- bucket[i].total_latency = 0;
- bucket[i].samples = 0;
- }
+ for (rw = READ; rw <= WRITE; rw++) {
+ for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+ struct latency_bucket *tmp = &td->tmp_buckets[rw][i];
+
+ for_each_possible_cpu(cpu) {
+ struct latency_bucket *bucket;
+
+ /* this isn't race free, but ok in practice */
+ bucket = per_cpu_ptr(td->latency_buckets[rw],
+ cpu);
+ tmp->total_latency += bucket[i].total_latency;
+ tmp->samples += bucket[i].samples;
+ bucket[i].total_latency = 0;
+ bucket[i].samples = 0;
+ }
- if (tmp->samples >= 32) {
- int samples = tmp->samples;
+ if (tmp->samples >= 32) {
+ int samples = tmp->samples;
- latency = tmp->total_latency;
+ latency[rw] = tmp->total_latency;
- tmp->total_latency = 0;
- tmp->samples = 0;
- latency /= samples;
- if (latency == 0)
- continue;
- avg_latency[i].latency = latency;
+ tmp->total_latency = 0;
+ tmp->samples = 0;
+ latency[rw] /= samples;
+ if (latency[rw] == 0)
+ continue;
+ avg_latency[rw][i].latency = latency[rw];
+ }
}
}
- for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
- if (!avg_latency[i].latency) {
- if (td->avg_buckets[i].latency < last_latency)
- td->avg_buckets[i].latency = last_latency;
- continue;
- }
+ for (rw = READ; rw <= WRITE; rw++) {
+ for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+ if (!avg_latency[rw][i].latency) {
+ if (td->avg_buckets[rw][i].latency < last_latency[rw])
+ td->avg_buckets[rw][i].latency =
+ last_latency[rw];
+ continue;
+ }
- if (!td->avg_buckets[i].valid)
- latency = avg_latency[i].latency;
- else
- latency = (td->avg_buckets[i].latency * 7 +
- avg_latency[i].latency) >> 3;
+ if (!td->avg_buckets[rw][i].valid)
+ latency[rw] = avg_latency[rw][i].latency;
+ else
+ latency[rw] = (td->avg_buckets[rw][i].latency * 7 +
+ avg_latency[rw][i].latency) >> 3;
- td->avg_buckets[i].latency = max(latency, last_latency);
- td->avg_buckets[i].valid = true;
- last_latency = td->avg_buckets[i].latency;
+ td->avg_buckets[rw][i].latency = max(latency[rw],
+ last_latency[rw]);
+ td->avg_buckets[rw][i].valid = true;
+ last_latency[rw] = td->avg_buckets[rw][i].latency;
+ }
}
for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
throtl_log(&td->service_queue,
- "Latency bucket %d: latency=%ld, valid=%d", i,
- td->avg_buckets[i].latency, td->avg_buckets[i].valid);
+ "Latency bucket %d: read latency=%ld, read valid=%d, "
+ "write latency=%ld, write valid=%d", i,
+ td->avg_buckets[READ][i].latency,
+ td->avg_buckets[READ][i].valid,
+ td->avg_buckets[WRITE][i].latency,
+ td->avg_buckets[WRITE][i].valid);
}
#else
static inline void throtl_update_latency_buckets(struct throtl_data *td)
@@ -2113,8 +2133,12 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
{
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
- if (bio->bi_css)
+ if (bio->bi_css) {
+ if (bio->bi_cg_private)
+ blkg_put(tg_to_blkg(bio->bi_cg_private));
bio->bi_cg_private = tg;
+ blkg_get(tg_to_blkg(tg));
+ }
blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
#endif
}
@@ -2223,13 +2247,7 @@ again:
out_unlock:
spin_unlock_irq(q->queue_lock);
out:
- /*
- * As multiple blk-throtls may stack in the same issue path, we
- * don't want bios to leave with the flag set. Clear the flag if
- * being issued.
- */
- if (!throttled)
- bio_clear_flag(bio, BIO_THROTTLED);
+ bio_set_flag(bio, BIO_THROTTLED);
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
if (throttled || !td->track_bio_latency)
@@ -2245,16 +2263,17 @@ static void throtl_track_latency(struct throtl_data *td, sector_t size,
struct latency_bucket *latency;
int index;
- if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ ||
+ if (!td || td->limit_index != LIMIT_LOW ||
+ !(op == REQ_OP_READ || op == REQ_OP_WRITE) ||
!blk_queue_nonrot(td->queue))
return;
index = request_bucket_index(size);
- latency = get_cpu_ptr(td->latency_buckets);
+ latency = get_cpu_ptr(td->latency_buckets[op]);
latency[index].total_latency += time;
latency[index].samples++;
- put_cpu_ptr(td->latency_buckets);
+ put_cpu_ptr(td->latency_buckets[op]);
}
void blk_throtl_stat_add(struct request *rq, u64 time_ns)
@@ -2273,6 +2292,7 @@ void blk_throtl_bio_endio(struct bio *bio)
unsigned long finish_time;
unsigned long start_time;
unsigned long lat;
+ int rw = bio_data_dir(bio);
tg = bio->bi_cg_private;
if (!tg)
@@ -2284,8 +2304,10 @@ void blk_throtl_bio_endio(struct bio *bio)
start_time = blk_stat_time(&bio->bi_issue_stat) >> 10;
finish_time = __blk_stat_time(finish_time_ns) >> 10;
- if (!start_time || finish_time <= start_time)
+ if (!start_time || finish_time <= start_time) {
+ blkg_put(tg_to_blkg(tg));
return;
+ }
lat = finish_time - start_time;
/* this is only for bio based driver */
@@ -2299,7 +2321,7 @@ void blk_throtl_bio_endio(struct bio *bio)
bucket = request_bucket_index(
blk_stat_size(&bio->bi_issue_stat));
- threshold = tg->td->avg_buckets[bucket].latency +
+ threshold = tg->td->avg_buckets[rw][bucket].latency +
tg->latency_target;
if (lat > threshold)
tg->bad_bio_cnt++;
@@ -2315,6 +2337,8 @@ void blk_throtl_bio_endio(struct bio *bio)
tg->bio_cnt /= 2;
tg->bad_bio_cnt /= 2;
}
+
+ blkg_put(tg_to_blkg(tg));
}
#endif
@@ -2390,9 +2414,16 @@ int blk_throtl_init(struct request_queue *q)
td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
if (!td)
return -ENOMEM;
- td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) *
+ td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) *
LATENCY_BUCKET_SIZE, __alignof__(u64));
- if (!td->latency_buckets) {
+ if (!td->latency_buckets[READ]) {
+ kfree(td);
+ return -ENOMEM;
+ }
+ td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) *
+ LATENCY_BUCKET_SIZE, __alignof__(u64));
+ if (!td->latency_buckets[WRITE]) {
+ free_percpu(td->latency_buckets[READ]);
kfree(td);
return -ENOMEM;
}
@@ -2411,7 +2442,8 @@ int blk_throtl_init(struct request_queue *q)
/* activate policy */
ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
if (ret) {
- free_percpu(td->latency_buckets);
+ free_percpu(td->latency_buckets[READ]);
+ free_percpu(td->latency_buckets[WRITE]);
kfree(td);
}
return ret;
@@ -2422,7 +2454,8 @@ void blk_throtl_exit(struct request_queue *q)
BUG_ON(!q->td);
throtl_shutdown_wq(q);
blkcg_deactivate_policy(q, &blkcg_policy_throtl);
- free_percpu(q->td->latency_buckets);
+ free_percpu(q->td->latency_buckets[READ]);
+ free_percpu(q->td->latency_buckets[WRITE]);
kfree(q->td);
}
@@ -2440,15 +2473,17 @@ void blk_throtl_register_queue(struct request_queue *q)
} else {
td->throtl_slice = DFL_THROTL_SLICE_HD;
td->filtered_latency = LATENCY_FILTERED_HD;
- for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
- td->avg_buckets[i].latency = DFL_HD_BASELINE_LATENCY;
+ for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+ td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY;
+ td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY;
+ }
}
#ifndef CONFIG_BLK_DEV_THROTTLING_LOW
/* if no low limit, use previous default */
td->throtl_slice = DFL_THROTL_SLICE_HD;
#endif
- td->track_bio_latency = !q->mq_ops && !q->request_fn;
+ td->track_bio_latency = !queue_is_rq_based(q);
if (!td->track_bio_latency)
blk_stat_enable_accounting(q);
}
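
A worked example of the per-direction latency averaging above: if td->avg_buckets[WRITE][i].latency is currently 1000 and the new window's avg_latency[WRITE][i].latency comes out to 2000, the update computes (1000 * 7 + 2000) >> 3 = 1125, i.e. a 7/8-weighted moving average, which is then forced to be no smaller than the previous bucket via max(latency[rw], last_latency[rw]). Reads and writes now track this independently, which is why the buckets, the per-cpu counters and the throtl_log() line all gained a [2] (READ/WRITE) dimension.
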
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 17ec83bb0900..a05e3676d24a 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -112,7 +112,9 @@ static void blk_rq_timed_out(struct request *req)
static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
unsigned int *next_set)
{
- if (time_after_eq(jiffies, rq->deadline)) {
+ const unsigned long deadline = blk_rq_deadline(rq);
+
+ if (time_after_eq(jiffies, deadline)) {
list_del_init(&rq->timeout_list);
/*
@@ -120,8 +122,8 @@ static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout
*/
if (!blk_mark_rq_complete(rq))
blk_rq_timed_out(rq);
- } else if (!*next_set || time_after(*next_timeout, rq->deadline)) {
- *next_timeout = rq->deadline;
+ } else if (!*next_set || time_after(*next_timeout, deadline)) {
+ *next_timeout = deadline;
*next_set = 1;
}
}
@@ -134,8 +136,6 @@ void blk_timeout_work(struct work_struct *work)
struct request *rq, *tmp;
int next_set = 0;
- if (blk_queue_enter(q, true))
- return;
spin_lock_irqsave(q->queue_lock, flags);
list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list)
@@ -145,7 +145,6 @@ void blk_timeout_work(struct work_struct *work)
mod_timer(&q->timeout, round_jiffies_up(next));
spin_unlock_irqrestore(q->queue_lock, flags);
- blk_queue_exit(q);
}
/**
@@ -159,12 +158,17 @@ void blk_timeout_work(struct work_struct *work)
*/
void blk_abort_request(struct request *req)
{
- if (blk_mark_rq_complete(req))
- return;
-
if (req->q->mq_ops) {
- blk_mq_rq_timed_out(req, false);
+ /*
+		 * All we need to ensure is that the timeout scan takes place
+		 * immediately and that it sees the new timeout value.
+		 * No need for fancy synchronization.
+ */
+ blk_rq_set_deadline(req, jiffies);
+ mod_timer(&req->q->timeout, 0);
} else {
+ if (blk_mark_rq_complete(req))
+ return;
blk_delete_timer(req);
blk_rq_timed_out(req);
}
@@ -211,7 +215,8 @@ void blk_add_timer(struct request *req)
if (!req->timeout)
req->timeout = q->rq_timeout;
- req->deadline = jiffies + req->timeout;
+ blk_rq_set_deadline(req, jiffies + req->timeout);
+ req->rq_flags &= ~RQF_MQ_TIMEOUT_EXPIRED;
/*
* Only the non-mq case needs to add the request to a protected list.
@@ -225,7 +230,7 @@ void blk_add_timer(struct request *req)
* than an existing one, modify the timer. Round up to next nearest
* second.
*/
- expiry = blk_rq_timeout(round_jiffies_up(req->deadline));
+ expiry = blk_rq_timeout(round_jiffies_up(blk_rq_deadline(req)));
if (!timer_pending(&q->timeout) ||
time_before(expiry, q->timeout.expires)) {
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 6a9a0f03a67b..f92fc84b5e2c 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -178,12 +178,11 @@ void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat)
if (wbt_is_read(stat))
wb_timestamp(rwb, &rwb->last_comp);
- wbt_clear_state(stat);
} else {
WARN_ON_ONCE(stat == rwb->sync_cookie);
__wbt_done(rwb, wbt_stat_to_mask(stat));
- wbt_clear_state(stat);
}
+ wbt_clear_state(stat);
}
/*
@@ -261,7 +260,7 @@ static inline bool stat_sample_valid(struct blk_rq_stat *stat)
static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
{
- u64 now, issue = ACCESS_ONCE(rwb->sync_issue);
+ u64 now, issue = READ_ONCE(rwb->sync_issue);
if (!issue || !rwb->sync_cookie)
return 0;
@@ -482,7 +481,7 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
/*
* At this point we know it's a buffered write. If this is
- * kswapd trying to free memory, or REQ_SYNC is set, set, then
+ * kswapd trying to free memory, or REQ_SYNC is set, then
* it's WB_SYNC_ALL writeback, and we'll use the max limit for
* that. If the write is marked as a background write, then use
* the idle limit, or go to normal if we haven't had competing
@@ -654,7 +653,7 @@ void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
}
/*
- * Disable wbt, if enabled by default. Only called from CFQ.
+ * Disable wbt, if enabled by default.
*/
void wbt_disable_default(struct request_queue *q)
{
@@ -698,7 +697,15 @@ u64 wbt_default_latency_nsec(struct request_queue *q)
static int wbt_data_dir(const struct request *rq)
{
- return rq_data_dir(rq);
+ const int op = req_op(rq);
+
+ if (op == REQ_OP_READ)
+ return READ;
+ else if (op == REQ_OP_WRITE || op == REQ_OP_FLUSH)
+ return WRITE;
+
+ /* don't account */
+ return -1;
}
int wbt_init(struct request_queue *q)
@@ -723,8 +730,6 @@ int wbt_init(struct request_queue *q)
init_waitqueue_head(&rwb->rq_wait[i].wait);
}
- rwb->wc = 1;
- rwb->queue_depth = RWB_DEF_DEPTH;
rwb->last_comp = rwb->last_issue = jiffies;
rwb->queue = q;
rwb->win_nsec = RWB_WINDOW_NSEC;
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index ff57fb51b338..acb7252c7e81 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -22,6 +22,48 @@ static inline sector_t blk_zone_start(struct request_queue *q,
}
/*
+ * Return true if a request is a write request that needs zone write locking.
+ */
+bool blk_req_needs_zone_write_lock(struct request *rq)
+{
+ if (!rq->q->seq_zones_wlock)
+ return false;
+
+ if (blk_rq_is_passthrough(rq))
+ return false;
+
+ switch (req_op(rq)) {
+ case REQ_OP_WRITE_ZEROES:
+ case REQ_OP_WRITE_SAME:
+ case REQ_OP_WRITE:
+ return blk_rq_zone_is_seq(rq);
+ default:
+ return false;
+ }
+}
+EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);
+
+void __blk_req_zone_write_lock(struct request *rq)
+{
+ if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
+ rq->q->seq_zones_wlock)))
+ return;
+
+ WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
+ rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
+}
+EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);
+
+void __blk_req_zone_write_unlock(struct request *rq)
+{
+ rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
+ if (rq->q->seq_zones_wlock)
+ WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq),
+ rq->q->seq_zones_wlock));
+}
+EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
+
+/*
* Check that a zone report belongs to the partition.
* If yes, fix its start sector and write pointer, copy it in the
* zone information array and return true. Return false otherwise.
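
The exported __blk_req_zone_write_lock()/__blk_req_zone_write_unlock() helpers above are presumably meant to be reached through thin inline wrappers that first check whether locking applies at all; the deadline-iosched.c hunks further down call such wrappers. A sketch of what those wrappers would look like (assumed here, not shown in this diff):

static inline void blk_req_zone_write_lock(struct request *rq)
{
	if (blk_req_needs_zone_write_lock(rq))
		__blk_req_zone_write_lock(rq);
}

static inline void blk_req_zone_write_unlock(struct request *rq)
{
	if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
		__blk_req_zone_write_unlock(rq);
}
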
diff --git a/block/blk.h b/block/blk.h
index 85be8b232b37..46db5dc83dcb 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -120,26 +120,23 @@ void blk_account_io_completion(struct request *req, unsigned int bytes);
void blk_account_io_done(struct request *req);
/*
- * Internal atomic flags for request handling
- */
-enum rq_atomic_flags {
- REQ_ATOM_COMPLETE = 0,
- REQ_ATOM_STARTED,
- REQ_ATOM_POLL_SLEPT,
-};
-
-/*
* EH timer and IO completion will both attempt to 'grab' the request, make
- * sure that only one of them succeeds
+ * sure that only one of them succeeds. Steal the bottom bit of the
+ * __deadline field for this.
*/
static inline int blk_mark_rq_complete(struct request *rq)
{
- return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
+ return test_and_set_bit(0, &rq->__deadline);
}
static inline void blk_clear_rq_complete(struct request *rq)
{
- clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
+ clear_bit(0, &rq->__deadline);
+}
+
+static inline bool blk_rq_is_complete(struct request *rq)
+{
+ return test_bit(0, &rq->__deadline);
}
/*
@@ -149,45 +146,6 @@ static inline void blk_clear_rq_complete(struct request *rq)
void blk_insert_flush(struct request *rq);
-static inline struct request *__elv_next_request(struct request_queue *q)
-{
- struct request *rq;
- struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
-
- WARN_ON_ONCE(q->mq_ops);
-
- while (1) {
- if (!list_empty(&q->queue_head)) {
- rq = list_entry_rq(q->queue_head.next);
- return rq;
- }
-
- /*
- * Flush request is running and flush request isn't queueable
- * in the drive, we can hold the queue till flush request is
- * finished. Even we don't do this, driver can't dispatch next
- * requests and will requeue them. And this can improve
- * throughput too. For example, we have request flush1, write1,
- * flush 2. flush1 is dispatched, then queue is hold, write1
- * isn't inserted to queue. After flush1 is finished, flush2
- * will be dispatched. Since disk cache is already clean,
- * flush2 will be finished very soon, so looks like flush2 is
- * folded to flush1.
- * Since the queue is hold, a flag is set to indicate the queue
- * should be restarted later. Please see flush_end_io() for
- * details.
- */
- if (fq->flush_pending_idx != fq->flush_running_idx &&
- !queue_flush_queueable(q)) {
- fq->flush_queue_delayed = 1;
- return NULL;
- }
- if (unlikely(blk_queue_bypass(q)) ||
- !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0))
- return NULL;
- }
-}
-
static inline void elv_activate_rq(struct request_queue *q, struct request *rq)
{
struct elevator_queue *e = q->elevator;
@@ -204,6 +162,9 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq
e->type->ops.sq.elevator_deactivate_req_fn(q, rq);
}
+int elv_register_queue(struct request_queue *q);
+void elv_unregister_queue(struct request_queue *q);
+
struct hd_struct *__disk_get_part(struct gendisk *disk, int partno);
#ifdef CONFIG_FAIL_IO_TIMEOUT
@@ -278,6 +239,21 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req)
}
/*
+ * Steal a bit from this field for legacy IO path atomic IO marking. Note that
+ * setting the deadline clears the bottom bit, potentially clearing the
+ * completed bit. The user has to be OK with this (current ones are fine).
+ */
+static inline void blk_rq_set_deadline(struct request *rq, unsigned long time)
+{
+ rq->__deadline = time & ~0x1UL;
+}
+
+static inline unsigned long blk_rq_deadline(struct request *rq)
+{
+ return rq->__deadline & ~0x1UL;
+}
+
+/*
* Internal io_context interface
*/
void get_io_context(struct io_context *ioc);
@@ -362,4 +338,6 @@ static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
}
#endif /* CONFIG_BOUNCE */
+extern void blk_drain_queue(struct request_queue *q);
+
#endif /* BLK_INTERNAL_H */
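
To make the __deadline bit stealing above concrete, here is a tiny stand-alone illustration (the numbers are made up): bit 0 is the legacy-path "complete" marker, everything above it is the deadline.

#include <assert.h>

int main(void)
{
	unsigned long dl = 0;

	dl = 10001UL & ~0x1UL;	/* blk_rq_set_deadline(rq, 10001): stores 10000,
				   clearing any stale complete marker */
	dl |= 0x1UL;		/* blk_mark_rq_complete(): sets bit 0 */

	assert((dl & ~0x1UL) == 10000UL);	/* blk_rq_deadline() still reads 10000 */
	assert(dl & 0x1UL);			/* blk_rq_is_complete() reads true */
	return 0;
}
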
diff --git a/block/bounce.c b/block/bounce.c
index fceb1a96480b..6a3e68292273 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -113,45 +113,50 @@ int init_emergency_isa_pool(void)
static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
{
unsigned char *vfrom;
- struct bio_vec tovec, *fromvec = from->bi_io_vec;
+ struct bio_vec tovec, fromvec;
struct bvec_iter iter;
+ /*
+	 * The bio of @from is created by bounce, so we can iterate
+	 * its bvec from start to end, but @from->bi_iter can't be
+	 * trusted because it might have been changed by splitting.
+ */
+ struct bvec_iter from_iter = BVEC_ITER_ALL_INIT;
bio_for_each_segment(tovec, to, iter) {
- if (tovec.bv_page != fromvec->bv_page) {
+ fromvec = bio_iter_iovec(from, from_iter);
+ if (tovec.bv_page != fromvec.bv_page) {
/*
* fromvec->bv_offset and fromvec->bv_len might have
* been modified by the block layer, so use the original
* copy, bounce_copy_vec already uses tovec->bv_len
*/
- vfrom = page_address(fromvec->bv_page) +
+ vfrom = page_address(fromvec.bv_page) +
tovec.bv_offset;
bounce_copy_vec(&tovec, vfrom);
flush_dcache_page(tovec.bv_page);
}
-
- fromvec++;
+ bio_advance_iter(from, &from_iter, tovec.bv_len);
}
}
static void bounce_end_io(struct bio *bio, mempool_t *pool)
{
struct bio *bio_orig = bio->bi_private;
- struct bio_vec *bvec, *org_vec;
+ struct bio_vec *bvec, orig_vec;
int i;
- int start = bio_orig->bi_iter.bi_idx;
+ struct bvec_iter orig_iter = bio_orig->bi_iter;
/*
* free up bounce indirect pages used
*/
bio_for_each_segment_all(bvec, bio, i) {
- org_vec = bio_orig->bi_io_vec + i + start;
-
- if (bvec->bv_page == org_vec->bv_page)
- continue;
-
- dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
- mempool_free(bvec->bv_page, pool);
+ orig_vec = bio_iter_iovec(bio_orig, orig_iter);
+ if (bvec->bv_page != orig_vec.bv_page) {
+ dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
+ mempool_free(bvec->bv_page, pool);
+ }
+ bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len);
}
bio_orig->bi_status = bio->bi_status;
@@ -200,6 +205,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
unsigned i = 0;
bool bounce = false;
int sectors = 0;
+ bool passthrough = bio_is_passthrough(*bio_orig);
bio_for_each_segment(from, *bio_orig, iter) {
if (i++ < BIO_MAX_PAGES)
@@ -210,13 +216,14 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
if (!bounce)
return;
- if (sectors < bio_sectors(*bio_orig)) {
+ if (!passthrough && sectors < bio_sectors(*bio_orig)) {
bio = bio_split(*bio_orig, sectors, GFP_NOIO, bounce_bio_split);
bio_chain(bio, *bio_orig);
generic_make_request(*bio_orig);
*bio_orig = bio;
}
- bio = bio_clone_bioset(*bio_orig, GFP_NOIO, bounce_bio_set);
+ bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL :
+ bounce_bio_set);
bio_for_each_segment_all(to, bio, i) {
struct page *page = to->bv_page;
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 15d25ccd51a5..1474153f73e3 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -30,7 +30,7 @@
/**
* bsg_teardown_job - routine to teardown a bsg job
- * @job: bsg_job that is to be torn down
+ * @kref: kref inside bsg_job that is to be torn down
*/
static void bsg_teardown_job(struct kref *kref)
{
@@ -251,6 +251,7 @@ static void bsg_exit_rq(struct request_queue *q, struct request *req)
* @name: device to give bsg device
* @job_fn: bsg job handler
* @dd_job_size: size of LLD data needed for each job
+ * @release: @dev release function
*/
struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
bsg_job_fn *job_fn, int dd_job_size,
diff --git a/block/bsg.c b/block/bsg.c
index ee1335c68de7..06dc96e1f670 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -32,6 +32,9 @@
#define BSG_DESCRIPTION "Block layer SCSI generic (bsg) driver"
#define BSG_VERSION "0.4"
+#define bsg_dbg(bd, fmt, ...) \
+ pr_debug("%s: " fmt, (bd)->name, ##__VA_ARGS__)
+
struct bsg_device {
struct request_queue *queue;
spinlock_t lock;
@@ -55,14 +58,6 @@ enum {
#define BSG_DEFAULT_CMDS 64
#define BSG_MAX_DEVS 32768
-#undef BSG_DEBUG
-
-#ifdef BSG_DEBUG
-#define dprintk(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ##args)
-#else
-#define dprintk(fmt, args...)
-#endif
-
static DEFINE_MUTEX(bsg_mutex);
static DEFINE_IDR(bsg_minor_idr);
@@ -123,7 +118,7 @@ static struct bsg_command *bsg_alloc_command(struct bsg_device *bd)
bc->bd = bd;
INIT_LIST_HEAD(&bc->list);
- dprintk("%s: returning free cmd %p\n", bd->name, bc);
+ bsg_dbg(bd, "returning free cmd %p\n", bc);
return bc;
out:
spin_unlock_irq(&bd->lock);
@@ -137,7 +132,7 @@ static inline struct hlist_head *bsg_dev_idx_hash(int index)
static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
struct sg_io_v4 *hdr, struct bsg_device *bd,
- fmode_t has_write_perm)
+ fmode_t mode)
{
struct scsi_request *req = scsi_req(rq);
@@ -152,7 +147,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
return -EFAULT;
if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) {
- if (blk_verify_command(req->cmd, has_write_perm))
+ if (blk_verify_command(req->cmd, mode))
return -EPERM;
} else if (!capable(CAP_SYS_RAWIO))
return -EPERM;
@@ -206,7 +201,7 @@ bsg_validate_sgv4_hdr(struct sg_io_v4 *hdr, int *op)
* map sg_io_v4 to a request.
*/
static struct request *
-bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm)
+bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t mode)
{
struct request_queue *q = bd->queue;
struct request *rq, *next_rq = NULL;
@@ -222,7 +217,8 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm)
if (!bcd->class_dev)
return ERR_PTR(-ENXIO);
- dprintk("map hdr %llx/%u %llx/%u\n", (unsigned long long) hdr->dout_xferp,
+ bsg_dbg(bd, "map hdr %llx/%u %llx/%u\n",
+ (unsigned long long) hdr->dout_xferp,
hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp,
hdr->din_xfer_len);
@@ -237,7 +233,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm)
if (IS_ERR(rq))
return rq;
- ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, has_write_perm);
+ ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, mode);
if (ret)
goto out;
@@ -299,8 +295,8 @@ static void bsg_rq_end_io(struct request *rq, blk_status_t status)
struct bsg_device *bd = bc->bd;
unsigned long flags;
- dprintk("%s: finished rq %p bc %p, bio %p\n",
- bd->name, rq, bc, bc->bio);
+ bsg_dbg(bd, "finished rq %p bc %p, bio %p\n",
+ rq, bc, bc->bio);
bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration);
@@ -333,7 +329,7 @@ static void bsg_add_command(struct bsg_device *bd, struct request_queue *q,
list_add_tail(&bc->list, &bd->busy_list);
spin_unlock_irq(&bd->lock);
- dprintk("%s: queueing rq %p, bc %p\n", bd->name, rq, bc);
+ bsg_dbg(bd, "queueing rq %p, bc %p\n", rq, bc);
rq->end_io_data = bc;
blk_execute_rq_nowait(q, NULL, rq, at_head, bsg_rq_end_io);
@@ -379,7 +375,7 @@ static struct bsg_command *bsg_get_done_cmd(struct bsg_device *bd)
}
} while (1);
- dprintk("%s: returning done %p\n", bd->name, bc);
+ bsg_dbg(bd, "returning done %p\n", bc);
return bc;
}
@@ -390,7 +386,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
struct scsi_request *req = scsi_req(rq);
int ret = 0;
- dprintk("rq %p bio %p 0x%x\n", rq, bio, req->result);
+ pr_debug("rq %p bio %p 0x%x\n", rq, bio, req->result);
/*
* fill in all the output members
*/
@@ -469,7 +465,7 @@ static int bsg_complete_all_commands(struct bsg_device *bd)
struct bsg_command *bc;
int ret, tret;
- dprintk("%s: entered\n", bd->name);
+ bsg_dbg(bd, "entered\n");
/*
* wait for all commands to complete
@@ -572,7 +568,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
int ret;
ssize_t bytes_read;
- dprintk("%s: read %zd bytes\n", bd->name, count);
+ bsg_dbg(bd, "read %zd bytes\n", count);
bsg_set_block(bd, file);
@@ -587,8 +583,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
}
static int __bsg_write(struct bsg_device *bd, const char __user *buf,
- size_t count, ssize_t *bytes_written,
- fmode_t has_write_perm)
+ size_t count, ssize_t *bytes_written, fmode_t mode)
{
struct bsg_command *bc;
struct request *rq;
@@ -619,7 +614,7 @@ static int __bsg_write(struct bsg_device *bd, const char __user *buf,
/*
* get a request, fill in the blanks, and add to request queue
*/
- rq = bsg_map_hdr(bd, &bc->hdr, has_write_perm);
+ rq = bsg_map_hdr(bd, &bc->hdr, mode);
if (IS_ERR(rq)) {
ret = PTR_ERR(rq);
rq = NULL;
@@ -647,7 +642,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
ssize_t bytes_written;
int ret;
- dprintk("%s: write %zd bytes\n", bd->name, count);
+ bsg_dbg(bd, "write %zd bytes\n", count);
if (unlikely(uaccess_kernel()))
return -EINVAL;
@@ -655,8 +650,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
bsg_set_block(bd, file);
bytes_written = 0;
- ret = __bsg_write(bd, buf, count, &bytes_written,
- file->f_mode & FMODE_WRITE);
+ ret = __bsg_write(bd, buf, count, &bytes_written, file->f_mode);
*ppos = bytes_written;
@@ -666,7 +660,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
if (!bytes_written || err_block_err(ret))
bytes_written = ret;
- dprintk("%s: returning %zd\n", bd->name, bytes_written);
+ bsg_dbg(bd, "returning %zd\n", bytes_written);
return bytes_written;
}
@@ -719,7 +713,7 @@ static int bsg_put_device(struct bsg_device *bd)
hlist_del(&bd->dev_list);
mutex_unlock(&bsg_mutex);
- dprintk("%s: tearing down\n", bd->name);
+ bsg_dbg(bd, "tearing down\n");
/*
* close can always block
@@ -746,9 +740,7 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
struct file *file)
{
struct bsg_device *bd;
-#ifdef BSG_DEBUG
unsigned char buf[32];
-#endif
if (!blk_queue_scsi_passthrough(rq)) {
WARN_ONCE(true, "Attempt to register a non-SCSI queue\n");
@@ -773,7 +765,7 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode)));
strncpy(bd->name, dev_name(rq->bsg_dev.class_dev), sizeof(bd->name) - 1);
- dprintk("bound to <%s>, max queue %d\n",
+ bsg_dbg(bd, "bound to <%s>, max queue %d\n",
format_dev_t(buf, inode->i_rdev), bd->max_queue);
mutex_unlock(&bsg_mutex);
@@ -847,19 +839,19 @@ static int bsg_release(struct inode *inode, struct file *file)
return bsg_put_device(bd);
}
-static unsigned int bsg_poll(struct file *file, poll_table *wait)
+static __poll_t bsg_poll(struct file *file, poll_table *wait)
{
struct bsg_device *bd = file->private_data;
- unsigned int mask = 0;
+ __poll_t mask = 0;
poll_wait(file, &bd->wq_done, wait);
poll_wait(file, &bd->wq_free, wait);
spin_lock_irq(&bd->lock);
if (!list_empty(&bd->done_list))
- mask |= POLLIN | POLLRDNORM;
+ mask |= EPOLLIN | EPOLLRDNORM;
if (bd->queued_cmds < bd->max_queue)
- mask |= POLLOUT;
+ mask |= EPOLLOUT;
spin_unlock_irq(&bd->lock);
return mask;
@@ -915,7 +907,7 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
if (copy_from_user(&hdr, uarg, sizeof(hdr)))
return -EFAULT;
- rq = bsg_map_hdr(bd, &hdr, file->f_mode & FMODE_WRITE);
+ rq = bsg_map_hdr(bd, &hdr, file->f_mode);
if (IS_ERR(rq))
return PTR_ERR(rq);
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index b83f77460d28..9de9f156e203 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -50,8 +50,6 @@ struct deadline_data {
int front_merges;
};
-static void deadline_move_request(struct deadline_data *, struct request *);
-
static inline struct rb_root *
deadline_rb_root(struct deadline_data *dd, struct request *rq)
{
@@ -100,6 +98,12 @@ deadline_add_request(struct request_queue *q, struct request *rq)
struct deadline_data *dd = q->elevator->elevator_data;
const int data_dir = rq_data_dir(rq);
+ /*
+ * This may be a requeue of a write request that has locked its
+ * target zone. If it is the case, this releases the zone lock.
+ */
+ blk_req_zone_write_unlock(rq);
+
deadline_add_rq_rb(dd, rq);
/*
@@ -190,6 +194,12 @@ deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq)
{
struct request_queue *q = rq->q;
+ /*
+ * For a zoned block device, write requests must write lock their
+ * target zone.
+ */
+ blk_req_zone_write_lock(rq);
+
deadline_remove_request(q, rq);
elv_dispatch_add_tail(q, rq);
}
@@ -231,6 +241,69 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
}
/*
+ * For the specified data direction, return the next request to dispatch using
+ * arrival ordered lists.
+ */
+static struct request *
+deadline_fifo_request(struct deadline_data *dd, int data_dir)
+{
+ struct request *rq;
+
+ if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
+ return NULL;
+
+ if (list_empty(&dd->fifo_list[data_dir]))
+ return NULL;
+
+ rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+ if (data_dir == READ || !blk_queue_is_zoned(rq->q))
+ return rq;
+
+ /*
+	 * Look for a write request that can be dispatched, that is, one with
+ * an unlocked target zone.
+ */
+ list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
+ if (blk_req_can_dispatch_to_zone(rq))
+ return rq;
+ }
+
+ return NULL;
+}
+
+/*
+ * For the specified data direction, return the next request to dispatch using
+ * sector position sorted lists.
+ */
+static struct request *
+deadline_next_request(struct deadline_data *dd, int data_dir)
+{
+ struct request *rq;
+
+ if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
+ return NULL;
+
+ rq = dd->next_rq[data_dir];
+ if (!rq)
+ return NULL;
+
+ if (data_dir == READ || !blk_queue_is_zoned(rq->q))
+ return rq;
+
+ /*
+	 * Look for a write request that can be dispatched, that is, one with
+ * an unlocked target zone.
+ */
+ while (rq) {
+ if (blk_req_can_dispatch_to_zone(rq))
+ return rq;
+ rq = deadline_latter_request(rq);
+ }
+
+ return NULL;
+}
+
+/*
* deadline_dispatch_requests selects the best request according to
* read/write expire, fifo_batch, etc
*/
@@ -239,16 +312,15 @@ static int deadline_dispatch_requests(struct request_queue *q, int force)
struct deadline_data *dd = q->elevator->elevator_data;
const int reads = !list_empty(&dd->fifo_list[READ]);
const int writes = !list_empty(&dd->fifo_list[WRITE]);
- struct request *rq;
+ struct request *rq, *next_rq;
int data_dir;
/*
* batches are currently reads XOR writes
*/
- if (dd->next_rq[WRITE])
- rq = dd->next_rq[WRITE];
- else
- rq = dd->next_rq[READ];
+ rq = deadline_next_request(dd, WRITE);
+ if (!rq)
+ rq = deadline_next_request(dd, READ);
if (rq && dd->batching < dd->fifo_batch)
/* we have a next request are still entitled to batch */
@@ -262,7 +334,8 @@ static int deadline_dispatch_requests(struct request_queue *q, int force)
if (reads) {
BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
- if (writes && (dd->starved++ >= dd->writes_starved))
+ if (deadline_fifo_request(dd, WRITE) &&
+ (dd->starved++ >= dd->writes_starved))
goto dispatch_writes;
data_dir = READ;
@@ -291,21 +364,29 @@ dispatch_find_request:
/*
* we are not running a batch, find best request for selected data_dir
*/
- if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
+ next_rq = deadline_next_request(dd, data_dir);
+ if (deadline_check_fifo(dd, data_dir) || !next_rq) {
/*
* A deadline has expired, the last request was in the other
* direction, or we have run out of higher-sectored requests.
* Start again from the request with the earliest expiry time.
*/
- rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+ rq = deadline_fifo_request(dd, data_dir);
} else {
/*
* The last req was the same dir and we have a next request in
* sort order. No expired requests so continue on from here.
*/
- rq = dd->next_rq[data_dir];
+ rq = next_rq;
}
+ /*
+ * For a zoned block device, if we only have writes queued and none of
+ * them can be dispatched, rq will be NULL.
+ */
+ if (!rq)
+ return 0;
+
dd->batching = 0;
dispatch_request:
@@ -318,6 +399,16 @@ dispatch_request:
return 1;
}
+/*
+ * For zoned block devices, write unlock the target zone of completed
+ * write requests.
+ */
+static void
+deadline_completed_request(struct request_queue *q, struct request *rq)
+{
+ blk_req_zone_write_unlock(rq);
+}
+
static void deadline_exit_queue(struct elevator_queue *e)
{
struct deadline_data *dd = e->elevator_data;
@@ -439,6 +530,7 @@ static struct elevator_type iosched_deadline = {
.elevator_merged_fn = deadline_merged_request,
.elevator_merge_req_fn = deadline_merged_requests,
.elevator_dispatch_fn = deadline_dispatch_requests,
+ .elevator_completed_req_fn = deadline_completed_request,
.elevator_add_req_fn = deadline_add_request,
.elevator_former_req_fn = elv_rb_former_request,
.elevator_latter_req_fn = elv_rb_latter_request,
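
A concrete scenario for the zone-aware selection above: with three writes queued in FIFO order to zones A, A and B, and zone A currently write-locked by an in-flight request, deadline_fifo_request(dd, WRITE) skips the first two entries and returns the request targeting zone B. If B were locked as well it would return NULL, and deadline_dispatch_requests() would simply dispatch nothing this round (the "rq will be NULL" case handled above). The lock is released from deadline_completed_request(), and also on requeue via deadline_add_request(), so a stalled write direction unblocks as soon as the in-flight write completes.
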
diff --git a/block/elevator.c b/block/elevator.c
index 153926a90901..e87e9b43aba0 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -83,12 +83,25 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
}
EXPORT_SYMBOL(elv_bio_merge_ok);
-static struct elevator_type *elevator_find(const char *name)
+static bool elevator_match(const struct elevator_type *e, const char *name)
+{
+ if (!strcmp(e->elevator_name, name))
+ return true;
+ if (e->elevator_alias && !strcmp(e->elevator_alias, name))
+ return true;
+
+ return false;
+}
+
+/*
+ * Return scheduler with name 'name' and with matching 'mq' capability.
+ */
+static struct elevator_type *elevator_find(const char *name, bool mq)
{
struct elevator_type *e;
list_for_each_entry(e, &elv_list, list) {
- if (!strcmp(e->elevator_name, name))
+ if (elevator_match(e, name) && (mq == e->uses_mq))
return e;
}
@@ -100,25 +113,25 @@ static void elevator_put(struct elevator_type *e)
module_put(e->elevator_owner);
}
-static struct elevator_type *elevator_get(const char *name, bool try_loading)
+static struct elevator_type *elevator_get(struct request_queue *q,
+ const char *name, bool try_loading)
{
struct elevator_type *e;
spin_lock(&elv_list_lock);
- e = elevator_find(name);
+ e = elevator_find(name, q->mq_ops != NULL);
if (!e && try_loading) {
spin_unlock(&elv_list_lock);
request_module("%s-iosched", name);
spin_lock(&elv_list_lock);
- e = elevator_find(name);
+ e = elevator_find(name, q->mq_ops != NULL);
}
if (e && !try_module_get(e->elevator_owner))
e = NULL;
spin_unlock(&elv_list_lock);
-
return e;
}
@@ -144,8 +157,12 @@ void __init load_default_elevator_module(void)
if (!chosen_elevator[0])
return;
+ /*
+	 * The boot parameter is deprecated; it has never been supported for MQ.
+ * Only look for non-mq schedulers from here.
+ */
spin_lock(&elv_list_lock);
- e = elevator_find(chosen_elevator);
+ e = elevator_find(chosen_elevator, false);
spin_unlock(&elv_list_lock);
if (!e)
@@ -202,7 +219,7 @@ int elevator_init(struct request_queue *q, char *name)
q->boundary_rq = NULL;
if (name) {
- e = elevator_get(name, true);
+ e = elevator_get(q, name, true);
if (!e)
return -EINVAL;
}
@@ -214,7 +231,7 @@ int elevator_init(struct request_queue *q, char *name)
* allowed from async.
*/
if (!e && !q->mq_ops && *chosen_elevator) {
- e = elevator_get(chosen_elevator, false);
+ e = elevator_get(q, chosen_elevator, false);
if (!e)
printk(KERN_ERR "I/O scheduler %s not found\n",
chosen_elevator);
@@ -229,17 +246,17 @@ int elevator_init(struct request_queue *q, char *name)
*/
if (q->mq_ops) {
if (q->nr_hw_queues == 1)
- e = elevator_get("mq-deadline", false);
+ e = elevator_get(q, "mq-deadline", false);
if (!e)
return 0;
} else
- e = elevator_get(CONFIG_DEFAULT_IOSCHED, false);
+ e = elevator_get(q, CONFIG_DEFAULT_IOSCHED, false);
if (!e) {
printk(KERN_ERR
"Default I/O scheduler not found. " \
"Using noop.\n");
- e = elevator_get("noop", false);
+ e = elevator_get(q, "noop", false);
}
}
@@ -852,6 +869,8 @@ int elv_register_queue(struct request_queue *q)
struct elevator_queue *e = q->elevator;
int error;
+ lockdep_assert_held(&q->sysfs_lock);
+
error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
if (!error) {
struct elv_fs_entry *attr = e->type->elevator_attrs;
@@ -869,10 +888,11 @@ int elv_register_queue(struct request_queue *q)
}
return error;
}
-EXPORT_SYMBOL(elv_register_queue);
void elv_unregister_queue(struct request_queue *q)
{
+ lockdep_assert_held(&q->sysfs_lock);
+
if (q) {
struct elevator_queue *e = q->elevator;
@@ -883,7 +903,6 @@ void elv_unregister_queue(struct request_queue *q)
wbt_enable_default(q);
}
}
-EXPORT_SYMBOL(elv_unregister_queue);
int elv_register(struct elevator_type *e)
{
@@ -905,7 +924,7 @@ int elv_register(struct elevator_type *e)
/* register, don't allow duplicate names */
spin_lock(&elv_list_lock);
- if (elevator_find(e->elevator_name)) {
+ if (elevator_find(e->elevator_name, e->uses_mq)) {
spin_unlock(&elv_list_lock);
if (e->icq_cache)
kmem_cache_destroy(e->icq_cache);
@@ -915,9 +934,9 @@ int elv_register(struct elevator_type *e)
spin_unlock(&elv_list_lock);
/* print pretty message */
- if (!strcmp(e->elevator_name, chosen_elevator) ||
+ if (elevator_match(e, chosen_elevator) ||
(!*chosen_elevator &&
- !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED)))
+ elevator_match(e, CONFIG_DEFAULT_IOSCHED)))
def = " (default)";
printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name,
@@ -950,7 +969,10 @@ static int elevator_switch_mq(struct request_queue *q,
{
int ret;
+ lockdep_assert_held(&q->sysfs_lock);
+
blk_mq_freeze_queue(q);
+ blk_mq_quiesce_queue(q);
if (q->elevator) {
if (q->elevator->registered)
@@ -977,6 +999,7 @@ static int elevator_switch_mq(struct request_queue *q,
blk_add_trace_msg(q, "elv switch: none");
out:
+ blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q);
return ret;
}
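The quiesce added above pairs with the existing freeze: freezing waits for requests already handed to the driver, while quiescing stops new hardware-queue runs for the duration of the elevator swap. A minimal sketch of that pattern, assuming a caller-supplied update callback (the helper name is hypothetical):

static int mq_update_under_freeze(struct request_queue *q,
				  int (*update)(struct request_queue *q))
{
	int ret;

	blk_mq_freeze_queue(q);		/* drain in-flight requests */
	blk_mq_quiesce_queue(q);	/* stop new dispatch runs */

	ret = update(q);		/* elevator state may be swapped safely here */

	blk_mq_unquiesce_queue(q);
	blk_mq_unfreeze_queue(q);
	return ret;
}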
@@ -993,6 +1016,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
bool old_registered = false;
int err;
+ lockdep_assert_held(&q->sysfs_lock);
+
if (q->mq_ops)
return elevator_switch_mq(q, new_e);
@@ -1066,25 +1091,15 @@ static int __elevator_change(struct request_queue *q, const char *name)
return elevator_switch(q, NULL);
strlcpy(elevator_name, name, sizeof(elevator_name));
- e = elevator_get(strstrip(elevator_name), true);
+ e = elevator_get(q, strstrip(elevator_name), true);
if (!e)
return -EINVAL;
- if (q->elevator &&
- !strcmp(elevator_name, q->elevator->type->elevator_name)) {
+ if (q->elevator && elevator_match(q->elevator->type, elevator_name)) {
elevator_put(e);
return 0;
}
- if (!e->uses_mq && q->mq_ops) {
- elevator_put(e);
- return -EINVAL;
- }
- if (e->uses_mq && !q->mq_ops) {
- elevator_put(e);
- return -EINVAL;
- }
-
return elevator_switch(q, e);
}
@@ -1116,9 +1131,10 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
struct elevator_queue *e = q->elevator;
struct elevator_type *elv = NULL;
struct elevator_type *__e;
+ bool uses_mq = q->mq_ops != NULL;
int len = 0;
- if (!blk_queue_stackable(q))
+ if (!queue_is_rq_based(q))
return sprintf(name, "none\n");
if (!q->elevator)
@@ -1128,7 +1144,8 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
spin_lock(&elv_list_lock);
list_for_each_entry(__e, &elv_list, list) {
- if (elv && !strcmp(elv->elevator_name, __e->elevator_name)) {
+ if (elv && elevator_match(elv, __e->elevator_name) &&
+ (__e->uses_mq == uses_mq)) {
len += sprintf(name+len, "[%s] ", elv->elevator_name);
continue;
}
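elv_iosched_show() backs reads of /sys/block/<dev>/queue/scheduler, listing the registered schedulers that match the queue mode with the active one in brackets. A hedged userspace sketch of reading it (the device path is hypothetical):

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/sys/block/nvme0n1/queue/scheduler", "r");

	if (f && fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* e.g. "[none] mq-deadline kyber" */
	if (f)
		fclose(f);
	return 0;
}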
diff --git a/block/genhd.c b/block/genhd.c
index dd305c65ffb0..9656f9e9f99e 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -547,7 +547,7 @@ static int exact_lock(dev_t devt, void *data)
{
struct gendisk *p = data;
- if (!get_disk(p))
+ if (!get_disk_and_module(p))
return -1;
return 0;
}
@@ -588,6 +588,11 @@ static void register_disk(struct device *parent, struct gendisk *disk)
disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
+ if (disk->flags & GENHD_FL_HIDDEN) {
+ dev_set_uevent_suppress(ddev, 0);
+ return;
+ }
+
/* No minors to use for partitions */
if (!disk_part_scan_enabled(disk))
goto exit;
@@ -616,21 +621,27 @@ exit:
while ((part = disk_part_iter_next(&piter)))
kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
disk_part_iter_exit(&piter);
+
+ err = sysfs_create_link(&ddev->kobj,
+ &disk->queue->backing_dev_info->dev->kobj,
+ "bdi");
+ WARN_ON(err);
}
/**
- * device_add_disk - add partitioning information to kernel list
+ * __device_add_disk - add disk information to kernel list
* @parent: parent device for the disk
* @disk: per-device partitioning information
+ * @register_queue: register the queue if set to true
*
* This function registers the partitioning information in @disk
* with the kernel.
*
* FIXME: error handling
*/
-void device_add_disk(struct device *parent, struct gendisk *disk)
+static void __device_add_disk(struct device *parent, struct gendisk *disk,
+ bool register_queue)
{
- struct backing_dev_info *bdi;
dev_t devt;
int retval;
@@ -639,7 +650,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
* parameters make sense.
*/
WARN_ON(disk->minors && !(disk->major || disk->first_minor));
- WARN_ON(!disk->minors && !(disk->flags & GENHD_FL_EXT_DEVT));
+ WARN_ON(!disk->minors &&
+ !(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN)));
disk->flags |= GENHD_FL_UP;
@@ -648,24 +660,32 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
WARN_ON(1);
return;
}
- disk_to_dev(disk)->devt = devt;
-
- /* ->major and ->first_minor aren't supposed to be
- * dereferenced from here on, but set them just in case.
- */
disk->major = MAJOR(devt);
disk->first_minor = MINOR(devt);
disk_alloc_events(disk);
- /* Register BDI before referencing it from bdev */
- bdi = disk->queue->backing_dev_info;
- bdi_register_owner(bdi, disk_to_dev(disk));
-
- blk_register_region(disk_devt(disk), disk->minors, NULL,
- exact_match, exact_lock, disk);
+ if (disk->flags & GENHD_FL_HIDDEN) {
+ /*
+ * Don't let hidden disks show up in /proc/partitions,
+ * and don't bother scanning for partitions either.
+ */
+ disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
+ disk->flags |= GENHD_FL_NO_PART_SCAN;
+ } else {
+ int ret;
+
+ /* Register BDI before referencing it from bdev */
+ disk_to_dev(disk)->devt = devt;
+ ret = bdi_register_owner(disk->queue->backing_dev_info,
+ disk_to_dev(disk));
+ WARN_ON(ret);
+ blk_register_region(disk_devt(disk), disk->minors, NULL,
+ exact_match, exact_lock, disk);
+ }
register_disk(parent, disk);
- blk_register_queue(disk);
+ if (register_queue)
+ blk_register_queue(disk);
/*
* Take an extra ref on queue which will be put on disk_release()
@@ -673,15 +693,22 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
*/
WARN_ON_ONCE(!blk_get_queue(disk->queue));
- retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
- "bdi");
- WARN_ON(retval);
-
disk_add_events(disk);
blk_integrity_add(disk);
}
+
+void device_add_disk(struct device *parent, struct gendisk *disk)
+{
+ __device_add_disk(parent, disk, true);
+}
EXPORT_SYMBOL(device_add_disk);
+void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk)
+{
+ __device_add_disk(parent, disk, false);
+}
+EXPORT_SYMBOL(device_add_disk_no_queue_reg);
+
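A hedged sketch of how a driver might combine GENHD_FL_HIDDEN with the new registration entry points: a hidden disk gets no block device node, no /proc/partitions entry and no partition scan, so it can be allocated with zero minors. Everything below (function name, disk name, when the queue is registered) is illustrative, not any specific driver's code:

static struct gendisk *example_add_hidden_disk(struct device *parent,
		struct request_queue *q,
		const struct block_device_operations *fops)
{
	struct gendisk *disk = alloc_disk(0);	/* no minors needed when hidden */

	if (!disk)
		return NULL;

	disk->flags |= GENHD_FL_HIDDEN;
	disk->fops = fops;
	disk->queue = q;
	sprintf(disk->disk_name, "exampledisk0");

	/* Register the disk now; the queue can be registered separately later. */
	device_add_disk_no_queue_reg(parent, disk);
	return disk;
}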
void del_gendisk(struct gendisk *disk)
{
struct disk_part_iter piter;
@@ -690,6 +717,11 @@ void del_gendisk(struct gendisk *disk)
blk_integrity_del(disk);
disk_del_events(disk);
+ /*
+ * Block lookups of the disk until all bdevs are unhashed and the
+ * disk is marked as dead (GENHD_FL_UP cleared).
+ */
+ down_write(&disk->lookup_sem);
/* invalidate stuff */
disk_part_iter_init(&piter, disk,
DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
@@ -704,25 +736,30 @@ void del_gendisk(struct gendisk *disk)
bdev_unhash_inode(disk_devt(disk));
set_capacity(disk, 0);
disk->flags &= ~GENHD_FL_UP;
+ up_write(&disk->lookup_sem);
- sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
+ if (!(disk->flags & GENHD_FL_HIDDEN))
+ sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
if (disk->queue) {
/*
* Unregister bdi before releasing device numbers (as they can
* get reused and we'd get clashes in sysfs).
*/
- bdi_unregister(disk->queue->backing_dev_info);
+ if (!(disk->flags & GENHD_FL_HIDDEN))
+ bdi_unregister(disk->queue->backing_dev_info);
blk_unregister_queue(disk);
} else {
WARN_ON(1);
}
- blk_unregister_region(disk_devt(disk), disk->minors);
- part_stat_set_all(&disk->part0, 0);
- disk->part0.stamp = 0;
+ if (!(disk->flags & GENHD_FL_HIDDEN))
+ blk_unregister_region(disk_devt(disk), disk->minors);
kobject_put(disk->part0.holder_dir);
kobject_put(disk->slave_dir);
+
+ part_stat_set_all(&disk->part0, 0);
+ disk->part0.stamp = 0;
if (!sysfs_deprecated)
sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
@@ -778,13 +815,29 @@ struct gendisk *get_gendisk(dev_t devt, int *partno)
spin_lock_bh(&ext_devt_lock);
part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
- if (part && get_disk(part_to_disk(part))) {
+ if (part && get_disk_and_module(part_to_disk(part))) {
*partno = part->partno;
disk = part_to_disk(part);
}
spin_unlock_bh(&ext_devt_lock);
}
+ if (!disk)
+ return NULL;
+
+ /*
+ * Synchronize with del_gendisk() so we do not return a disk that is
+ * being destroyed.
+ */
+ down_read(&disk->lookup_sem);
+ if (unlikely((disk->flags & GENHD_FL_HIDDEN) ||
+ !(disk->flags & GENHD_FL_UP))) {
+ up_read(&disk->lookup_sem);
+ put_disk_and_module(disk);
+ disk = NULL;
+ } else {
+ up_read(&disk->lookup_sem);
+ }
return disk;
}
EXPORT_SYMBOL(get_gendisk);
@@ -1028,6 +1081,15 @@ static ssize_t disk_removable_show(struct device *dev,
(disk->flags & GENHD_FL_REMOVABLE ? 1 : 0));
}
+static ssize_t disk_hidden_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ return sprintf(buf, "%d\n",
+ (disk->flags & GENHD_FL_HIDDEN ? 1 : 0));
+}
+
static ssize_t disk_ro_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -1065,6 +1127,7 @@ static ssize_t disk_discard_alignment_show(struct device *dev,
static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
+static DEVICE_ATTR(hidden, S_IRUGO, disk_hidden_show, NULL);
static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
@@ -1089,6 +1152,7 @@ static struct attribute *disk_attrs[] = {
&dev_attr_range.attr,
&dev_attr_ext_range.attr,
&dev_attr_removable.attr,
+ &dev_attr_hidden.attr,
&dev_attr_ro.attr,
&dev_attr_size.attr,
&dev_attr_alignment_offset.attr,
@@ -1354,20 +1418,14 @@ dev_t blk_lookup_devt(const char *name, int partno)
}
EXPORT_SYMBOL(blk_lookup_devt);
-struct gendisk *alloc_disk(int minors)
-{
- return alloc_disk_node(minors, NUMA_NO_NODE);
-}
-EXPORT_SYMBOL(alloc_disk);
-
-struct gendisk *alloc_disk_node(int minors, int node_id)
+struct gendisk *__alloc_disk_node(int minors, int node_id)
{
struct gendisk *disk;
struct disk_part_tbl *ptbl;
if (minors > DISK_MAX_PARTS) {
printk(KERN_ERR
- "block: can't allocated more than %d partitions\n",
+ "block: can't allocate more than %d partitions\n",
DISK_MAX_PARTS);
minors = DISK_MAX_PARTS;
}
@@ -1378,6 +1436,7 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
kfree(disk);
return NULL;
}
+ init_rwsem(&disk->lookup_sem);
disk->node_id = node_id;
if (disk_expand_part_tbl(disk, 0)) {
free_part_stats(&disk->part0);
@@ -1411,9 +1470,9 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
}
return disk;
}
-EXPORT_SYMBOL(alloc_disk_node);
+EXPORT_SYMBOL(__alloc_disk_node);
-struct kobject *get_disk(struct gendisk *disk)
+struct kobject *get_disk_and_module(struct gendisk *disk)
{
struct module *owner;
struct kobject *kobj;
@@ -1431,17 +1490,30 @@ struct kobject *get_disk(struct gendisk *disk)
return kobj;
}
-
-EXPORT_SYMBOL(get_disk);
+EXPORT_SYMBOL(get_disk_and_module);
void put_disk(struct gendisk *disk)
{
if (disk)
kobject_put(&disk_to_dev(disk)->kobj);
}
-
EXPORT_SYMBOL(put_disk);
+/*
+ * This is a counterpart of get_disk_and_module() and thus also of
+ * get_gendisk().
+ */
+void put_disk_and_module(struct gendisk *disk)
+{
+ if (disk) {
+ struct module *owner = disk->fops->owner;
+
+ put_disk(disk);
+ module_put(owner);
+ }
+}
+EXPORT_SYMBOL(put_disk_and_module);
+
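Because get_gendisk() now acquires both a kobject and a module reference through get_disk_and_module(), lookups must be released with put_disk_and_module() rather than plain put_disk(). A hedged usage sketch (the helper is illustrative):

static bool example_devt_is_rotational(dev_t devt)
{
	int partno;
	bool rot = false;
	struct gendisk *disk = get_gendisk(devt, &partno);

	if (disk) {
		rot = !blk_queue_nonrot(disk->queue);
		/* Drops the kobject reference and the module reference. */
		put_disk_and_module(disk);
	}
	return rot;
}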
static void set_disk_ro_uevent(struct gendisk *gd, int ro)
{
char event[] = "DISK_RO=1";
diff --git a/block/ioctl.c b/block/ioctl.c
index 0de02ee67eed..3884d810efd2 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -202,10 +202,16 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
{
uint64_t range[2];
uint64_t start, len;
+ struct request_queue *q = bdev_get_queue(bdev);
+ struct address_space *mapping = bdev->bd_inode->i_mapping;
+
if (!(mode & FMODE_WRITE))
return -EBADF;
+ if (!blk_queue_discard(q))
+ return -EOPNOTSUPP;
+
if (copy_from_user(range, (void __user *)arg, sizeof(range)))
return -EFAULT;
@@ -216,12 +222,12 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
return -EINVAL;
if (len & 511)
return -EINVAL;
- start >>= 9;
- len >>= 9;
- if (start + len > (i_size_read(bdev->bd_inode) >> 9))
+ if (start + len > i_size_read(bdev->bd_inode))
return -EINVAL;
- return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags);
+ truncate_inode_pages_range(mapping, start, start + len - 1);
+ return blkdev_issue_discard(bdev, start >> 9, len >> 9,
+ GFP_KERNEL, flags);
}
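From userspace nothing changes: BLKDISCARD still takes a byte range, and the kernel now checks discard support up front, drops the affected page cache, and converts to 512-byte sectors itself. A hedged sketch of issuing the ioctl (the device path is hypothetical):

#include <fcntl.h>
#include <linux/fs.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	uint64_t range[2] = { 0, 1 << 20 };	/* start and length, in bytes */
	int fd = open("/dev/sdX", O_WRONLY);	/* hypothetical device */

	if (fd < 0 || ioctl(fd, BLKDISCARD, &range) < 0)
		perror("BLKDISCARD");
	if (fd >= 0)
		close(fd);
	return 0;
}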
static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
@@ -437,11 +443,12 @@ static int blkdev_roset(struct block_device *bdev, fmode_t mode,
{
int ret, n;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
if (!is_unrecognized_ioctl(ret))
return ret;
- if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
if (get_user(n, (int __user *)arg))
return -EFAULT;
set_device_ro(bdev, n);
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index f58cab82105b..0d6d25e32e1f 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -100,9 +100,13 @@ struct kyber_hctx_data {
unsigned int cur_domain;
unsigned int batching;
wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
+ struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
atomic_t wait_index[KYBER_NUM_DOMAINS];
};
+static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
+ void *key);
+
static int rq_sched_domain(const struct request *rq)
{
unsigned int op = rq->cmd_flags;
@@ -385,6 +389,9 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
INIT_LIST_HEAD(&khd->rqs[i]);
+ init_waitqueue_func_entry(&khd->domain_wait[i],
+ kyber_domain_wake);
+ khd->domain_wait[i].private = hctx;
INIT_LIST_HEAD(&khd->domain_wait[i].entry);
atomic_set(&khd->wait_index[i], 0);
}
@@ -524,19 +531,16 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
int nr;
nr = __sbitmap_queue_get(domain_tokens);
- if (nr >= 0)
- return nr;
/*
* If we failed to get a domain token, make sure the hardware queue is
* run when one becomes available. Note that this is serialized on
* khd->lock, but we still need to be careful about the waker.
*/
- if (list_empty_careful(&wait->entry)) {
- init_waitqueue_func_entry(wait, kyber_domain_wake);
- wait->private = hctx;
+ if (nr < 0 && list_empty_careful(&wait->entry)) {
ws = sbq_wait_ptr(domain_tokens,
&khd->wait_index[sched_domain]);
+ khd->domain_ws[sched_domain] = ws;
add_wait_queue(&ws->wait, wait);
/*
@@ -545,6 +549,21 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
*/
nr = __sbitmap_queue_get(domain_tokens);
}
+
+ /*
+ * If we got a token while we were on the wait queue, remove ourselves
+ * from the wait queue to ensure that all wake ups make forward
+ * progress. It's possible that the waker already deleted the entry
+ * between the !list_empty_careful() check and us grabbing the lock, but
+ * list_del_init() is okay with that.
+ */
+ if (nr >= 0 && !list_empty_careful(&wait->entry)) {
+ ws = khd->domain_ws[sched_domain];
+ spin_lock_irq(&ws->wait.lock);
+ list_del_init(&wait->entry);
+ spin_unlock_irq(&ws->wait.lock);
+ }
+
return nr;
}
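The wake callback installed in kyber_init_hctx() above is not part of this hunk; a hedged sketch of what it presumably does, which makes the list_del_init() above clearly its counterpart (each wakeup removes exactly one waiter and reruns the hardware queue):

static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode,
			     int flags, void *key)
{
	struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);

	list_del_init(&wait->entry);		/* one wakeup per waiter */
	blk_mq_run_hw_queue(hctx, true);	/* retry the token grab asynchronously */
	return 1;
}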
@@ -641,7 +660,7 @@ static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
if (!list_empty_careful(&khd->rqs[i]))
return true;
}
- return false;
+ return sbitmap_any_bit_set(&hctx->ctx_map);
}
#define KYBER_LAT_SHOW_STORE(op) \
@@ -814,6 +833,7 @@ static struct elevator_type kyber_sched = {
.limit_depth = kyber_limit_depth,
.prepare_request = kyber_prepare_request,
.finish_request = kyber_finish_request,
+ .requeue_request = kyber_finish_request,
.completed_request = kyber_completed_request,
.dispatch_request = kyber_dispatch_request,
.has_work = kyber_has_work,
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index a1cad4331edd..8ec0ba9f5386 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -59,6 +59,7 @@ struct deadline_data {
int front_merges;
spinlock_t lock;
+ spinlock_t zone_lock;
struct list_head dispatch;
};
@@ -192,13 +193,83 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
}
/*
+ * For the specified data direction, return the next request to
+ * dispatch using arrival ordered lists.
+ */
+static struct request *
+deadline_fifo_request(struct deadline_data *dd, int data_dir)
+{
+ struct request *rq;
+ unsigned long flags;
+
+ if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
+ return NULL;
+
+ if (list_empty(&dd->fifo_list[data_dir]))
+ return NULL;
+
+ rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+ if (data_dir == READ || !blk_queue_is_zoned(rq->q))
+ return rq;
+
+ /*
+ * Look for a write request that can be dispatched, that is one with
+ * an unlocked target zone.
+ */
+ spin_lock_irqsave(&dd->zone_lock, flags);
+ list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
+ if (blk_req_can_dispatch_to_zone(rq))
+ goto out;
+ }
+ rq = NULL;
+out:
+ spin_unlock_irqrestore(&dd->zone_lock, flags);
+
+ return rq;
+}
+
+/*
+ * For the specified data direction, return the next request to
+ * dispatch using sector position sorted lists.
+ */
+static struct request *
+deadline_next_request(struct deadline_data *dd, int data_dir)
+{
+ struct request *rq;
+ unsigned long flags;
+
+ if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
+ return NULL;
+
+ rq = dd->next_rq[data_dir];
+ if (!rq)
+ return NULL;
+
+ if (data_dir == READ || !blk_queue_is_zoned(rq->q))
+ return rq;
+
+ /*
+ * Look for a write request that can be dispatched, that is one with
+ * an unlocked target zone.
+ */
+ spin_lock_irqsave(&dd->zone_lock, flags);
+ while (rq) {
+ if (blk_req_can_dispatch_to_zone(rq))
+ break;
+ rq = deadline_latter_request(rq);
+ }
+ spin_unlock_irqrestore(&dd->zone_lock, flags);
+
+ return rq;
+}
+
+/*
* deadline_dispatch_requests selects the best request according to
* read/write expire, fifo_batch, etc
*/
-static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
+static struct request *__dd_dispatch_request(struct deadline_data *dd)
{
- struct deadline_data *dd = hctx->queue->elevator->elevator_data;
- struct request *rq;
+ struct request *rq, *next_rq;
bool reads, writes;
int data_dir;
@@ -214,10 +285,9 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
/*
* batches are currently reads XOR writes
*/
- if (dd->next_rq[WRITE])
- rq = dd->next_rq[WRITE];
- else
- rq = dd->next_rq[READ];
+ rq = deadline_next_request(dd, WRITE);
+ if (!rq)
+ rq = deadline_next_request(dd, READ);
if (rq && dd->batching < dd->fifo_batch)
/* we have a next request and are still entitled to batch */
@@ -231,7 +301,8 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
if (reads) {
BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
- if (writes && (dd->starved++ >= dd->writes_starved))
+ if (deadline_fifo_request(dd, WRITE) &&
+ (dd->starved++ >= dd->writes_starved))
goto dispatch_writes;
data_dir = READ;
@@ -260,21 +331,29 @@ dispatch_find_request:
/*
* we are not running a batch, find best request for selected data_dir
*/
- if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
+ next_rq = deadline_next_request(dd, data_dir);
+ if (deadline_check_fifo(dd, data_dir) || !next_rq) {
/*
* A deadline has expired, the last request was in the other
* direction, or we have run out of higher-sectored requests.
* Start again from the request with the earliest expiry time.
*/
- rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+ rq = deadline_fifo_request(dd, data_dir);
} else {
/*
* The last req was the same dir and we have a next request in
* sort order. No expired requests so continue on from here.
*/
- rq = dd->next_rq[data_dir];
+ rq = next_rq;
}
+ /*
+ * For a zoned block device, if we only have writes queued and none of
+ * them can be dispatched, rq will be NULL.
+ */
+ if (!rq)
+ return NULL;
+
dd->batching = 0;
dispatch_request:
@@ -284,17 +363,27 @@ dispatch_request:
dd->batching++;
deadline_move_request(dd, rq);
done:
+ /*
+ * If the request needs its target zone locked, do it.
+ */
+ blk_req_zone_write_lock(rq);
rq->rq_flags |= RQF_STARTED;
return rq;
}
+/*
+ * One confusing aspect here is that we get called for a specific
+ * hardware queue, but we may return a request that is for a
+ * different hardware queue. This is because mq-deadline has shared
+ * state for all hardware queues, in terms of sorting, FIFOs, etc.
+ */
static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
struct deadline_data *dd = hctx->queue->elevator->elevator_data;
struct request *rq;
spin_lock(&dd->lock);
- rq = __dd_dispatch_request(hctx);
+ rq = __dd_dispatch_request(dd);
spin_unlock(&dd->lock);
return rq;
@@ -339,6 +428,7 @@ static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
dd->front_merges = 1;
dd->fifo_batch = fifo_batch;
spin_lock_init(&dd->lock);
+ spin_lock_init(&dd->zone_lock);
INIT_LIST_HEAD(&dd->dispatch);
q->elevator = eq;
@@ -395,6 +485,12 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
struct deadline_data *dd = q->elevator->elevator_data;
const int data_dir = rq_data_dir(rq);
+ /*
+ * This may be a requeue of a write request that has locked its
+ * target zone. If that is the case, this releases the zone lock.
+ */
+ blk_req_zone_write_unlock(rq);
+
if (blk_mq_sched_try_insert_merge(q, rq))
return;
@@ -439,6 +535,35 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
spin_unlock(&dd->lock);
}
+/*
+ * Nothing to do here. This is defined only to ensure that the
+ * .finish_request method is called upon request completion.
+ */
+static void dd_prepare_request(struct request *rq, struct bio *bio)
+{
+}
+
+/*
+ * For zoned block devices, write unlock the target zone of
+ * completed write requests. Do this while holding the zone lock
+ * spinlock so that the zone is never unlocked while deadline_fifo_request()
+ * or deadline_next_request() are executing. This function is called for
+ * all requests, whether or not these requests complete successfully.
+ */
+static void dd_finish_request(struct request *rq)
+{
+ struct request_queue *q = rq->q;
+
+ if (blk_queue_is_zoned(q)) {
+ struct deadline_data *dd = q->elevator->elevator_data;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->zone_lock, flags);
+ blk_req_zone_write_unlock(rq);
+ spin_unlock_irqrestore(&dd->zone_lock, flags);
+ }
+}
+
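Taken together, the zone_lock changes enforce at most one write in flight per zone: dispatch skips writes whose target zone is locked, and completion unlocks it. A self-contained, hedged sketch of that idea using a plain bitmap (purely illustrative, not the kernel's zone helpers):

#include <stdbool.h>
#include <stdint.h>

struct example_zoned_sched {
	uint64_t zone_locked;		/* one bit per zone, up to 64 zones */
};

/* Dispatch path: only pick this write if its target zone is free. */
static bool example_try_dispatch_write(struct example_zoned_sched *s,
				       unsigned int zone)
{
	if (s->zone_locked & (1ULL << zone))
		return false;		/* zone busy, look at another request */
	s->zone_locked |= 1ULL << zone;	/* lock until the write completes */
	return true;
}

/* Completion path: the dd_finish_request() analogue. */
static void example_write_completed(struct example_zoned_sched *s,
				    unsigned int zone)
{
	s->zone_locked &= ~(1ULL << zone);
}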
static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
{
struct deadline_data *dd = hctx->queue->elevator->elevator_data;
@@ -640,6 +765,8 @@ static struct elevator_type mq_deadline = {
.ops.mq = {
.insert_requests = dd_insert_requests,
.dispatch_request = dd_dispatch_request,
+ .prepare_request = dd_prepare_request,
+ .finish_request = dd_finish_request,
.next_request = elv_rb_latter_request,
.former_request = elv_rb_former_request,
.bio_merge = dd_bio_merge,
@@ -657,6 +784,7 @@ static struct elevator_type mq_deadline = {
#endif
.elevator_attrs = deadline_attrs,
.elevator_name = "mq-deadline",
+ .elevator_alias = "deadline",
.elevator_owner = THIS_MODULE,
};
MODULE_ALIAS("mq-deadline-iosched");
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 91622db9aedf..08dabcd8b6ae 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -51,6 +51,12 @@ const char *bdevname(struct block_device *bdev, char *buf)
EXPORT_SYMBOL(bdevname);
+const char *bio_devname(struct bio *bio, char *buf)
+{
+ return disk_name(bio->bi_disk, bio->bi_partno, buf);
+}
+EXPORT_SYMBOL(bio_devname);
+
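A hedged usage sketch for the new helper: code that only holds a bio can log the originating device without first resolving a struct block_device (the function below is illustrative):

static void example_report_bio_error(struct bio *bio, int error)
{
	char b[BDEVNAME_SIZE];

	pr_err("I/O error %d on %s, sector %llu\n", error,
	       bio_devname(bio, b),
	       (unsigned long long)bio->bi_iter.bi_sector);
}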
/*
* There's very little reason to use this, you should really
* have a struct block_device just about everywhere and use
diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c
index 0af3a3db6fb0..82c44f7df911 100644
--- a/block/partitions/msdos.c
+++ b/block/partitions/msdos.c
@@ -301,7 +301,9 @@ static void parse_bsd(struct parsed_partitions *state,
continue;
bsd_start = le32_to_cpu(p->p_offset);
bsd_size = le32_to_cpu(p->p_size);
- if (memcmp(flavour, "bsd\0", 4) == 0)
+ /* FreeBSD uses a relative offset if the C partition offset is zero */
+ if (memcmp(flavour, "bsd\0", 4) == 0 &&
+ le32_to_cpu(l->d_partitions[2].p_offset) == 0)
bsd_start += offset;
if (offset == bsd_start && size == bsd_size)
/* full parent partition, we have it already */
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 7440de44dd85..60b471f8621b 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -207,7 +207,7 @@ static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter)
__set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok);
}
-int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm)
+int blk_verify_command(unsigned char *cmd, fmode_t mode)
{
struct blk_cmd_filter *filter = &blk_default_cmd_filter;
@@ -220,7 +220,7 @@ int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm)
return 0;
/* Write-safe commands require a writable open */
- if (test_bit(cmd[0], filter->write_ok) && has_write_perm)
+ if (test_bit(cmd[0], filter->write_ok) && (mode & FMODE_WRITE))
return 0;
return -EPERM;
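With the new signature a caller simply forwards the open mode and lets blk_verify_command() test FMODE_WRITE itself. A hedged sketch of such a caller (names are illustrative, not a specific driver path):

static int example_fill_scsi_cmd(struct request *rq,
				 const unsigned char __user *ucmd,
				 unsigned int len, fmode_t mode)
{
	struct scsi_request *req = scsi_req(rq);

	if (len > BLK_MAX_CDB)
		return -EINVAL;
	if (copy_from_user(req->cmd, ucmd, len))
		return -EFAULT;
	req->cmd_len = len;

	return blk_verify_command(req->cmd, mode);
}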
@@ -234,7 +234,7 @@ static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq,
if (copy_from_user(req->cmd, hdr->cmdp, hdr->cmd_len))
return -EFAULT;
- if (blk_verify_command(req->cmd, mode & FMODE_WRITE))
+ if (blk_verify_command(req->cmd, mode))
return -EPERM;
/*
@@ -384,9 +384,10 @@ out_put_request:
/**
* sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl
- * @file: file this ioctl operates on (optional)
* @q: request queue to send scsi commands down
* @disk: gendisk to operate on (optional)
+ * @mode: mode used to open the file through which the ioctl has been
+ * submitted
* @sic: userspace structure describing the command to perform
*
* Send down the scsi command described by @sic to the device below
@@ -415,10 +416,10 @@ out_put_request:
* Positive numbers returned are the compacted SCSI error codes (4
* bytes in one int) where the lowest byte is the SCSI status.
*/
-#define OMAX_SB_LEN 16 /* For backward compatibility */
int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
struct scsi_ioctl_command __user *sic)
{
+ enum { OMAX_SB_LEN = 16 }; /* For backward compatibility */
struct request *rq;
struct scsi_request *req;
int err;
@@ -469,7 +470,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len))
goto error;
- err = blk_verify_command(req->cmd, mode & FMODE_WRITE);
+ err = blk_verify_command(req->cmd, mode);
if (err)
goto error;
@@ -692,38 +693,9 @@ int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd)
if (bd && bd == bd->bd_contains)
return 0;
- /* Actually none of these is particularly useful on a partition,
- * but they are safe.
- */
- switch (cmd) {
- case SCSI_IOCTL_GET_IDLUN:
- case SCSI_IOCTL_GET_BUS_NUMBER:
- case SCSI_IOCTL_GET_PCI:
- case SCSI_IOCTL_PROBE_HOST:
- case SG_GET_VERSION_NUM:
- case SG_SET_TIMEOUT:
- case SG_GET_TIMEOUT:
- case SG_GET_RESERVED_SIZE:
- case SG_SET_RESERVED_SIZE:
- case SG_EMULATED_HOST:
- return 0;
- case CDROM_GET_CAPABILITY:
- /* Keep this until we remove the printk below. udev sends it
- * and we do not want to spam dmesg about it. CD-ROMs do
- * not have partitions, so we get here only for disks.
- */
- return -ENOIOCTLCMD;
- default:
- break;
- }
-
if (capable(CAP_SYS_RAWIO))
return 0;
- /* In particular, rule out all resets and host-specific ioctls. */
- printk_ratelimited(KERN_WARNING
- "%s: sending ioctl %x to a partition!\n", current->comm, cmd);
-
return -ENOIOCTLCMD;
}
EXPORT_SYMBOL(scsi_verify_blk_ioctl);
diff --git a/block/sed-opal.c b/block/sed-opal.c
index 9ed51d0c6b1d..e4929eec547f 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -490,7 +490,7 @@ static int opal_discovery0_end(struct opal_dev *dev)
if (!found_com_id) {
pr_debug("Could not find OPAL comid for device. Returning early\n");
- return -EOPNOTSUPP;;
+ return -EOPNOTSUPP;
}
dev->comid = comid;