aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/cgroups/blkio-controller.txt24
-rw-r--r--Documentation/cgroups/unified-hierarchy.txt61
-rw-r--r--block/bio.c2
-rw-r--r--block/blk-cgroup.c524
-rw-r--r--block/blk-core.c4
-rw-r--r--block/blk-throttle.c505
-rw-r--r--block/blk.h5
-rw-r--r--block/cfq-iosched.c651
-rw-r--r--fs/fs-writeback.c139
-rw-r--r--fs/kernfs/dir.c23
-rw-r--r--include/linux/backing-dev.h26
-rw-r--r--include/linux/blk-cgroup.h340
-rw-r--r--include/linux/cgroup_subsys.h2
-rw-r--r--include/linux/kernfs.h4
-rw-r--r--include/trace/events/writeback.h180
-rw-r--r--mm/backing-dev.c4
-rw-r--r--mm/page-writeback.c6
17 files changed, 1422 insertions, 1078 deletions
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index 68b6a6a470b0..12686bec37b9 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -201,7 +201,7 @@ Proportional weight policy files
specifies the number of bytes.
- blkio.io_serviced
- - Number of IOs completed to/from the disk by the group. These
+ - Number of IOs (bio) issued to the disk by the group. These
are further divided by the type of operation - read or write, sync
or async. First two fields specify the major and minor number of the
device, third field specifies the operation type and the fourth field
@@ -327,18 +327,11 @@ Note: If both BW and IOPS rules are specified for a device, then IO is
subjected to both the constraints.
- blkio.throttle.io_serviced
- - Number of IOs (bio) completed to/from the disk by the group (as
- seen by throttling policy). These are further divided by the type
- of operation - read or write, sync or async. First two fields specify
- the major and minor number of the device, third field specifies the
- operation type and the fourth field specifies the number of IOs.
-
- blkio.io_serviced does accounting as seen by CFQ and counts are in
- number of requests (struct request). On the other hand,
- blkio.throttle.io_serviced counts number of IO in terms of number
- of bios as seen by throttling policy. These bios can later be
- merged by elevator and total number of requests completed can be
- lesser.
+ - Number of IOs (bio) issued to the disk by the group. These
+ are further divided by the type of operation - read or write, sync
+ or async. First two fields specify the major and minor number of the
+ device, third field specifies the operation type and the fourth field
+ specifies the number of IOs.
- blkio.throttle.io_service_bytes
- Number of bytes transferred to/from the disk by the group. These
@@ -347,11 +340,6 @@ Note: If both BW and IOPS rules are specified for a device, then IO is
device, third field specifies the operation type and the fourth field
specifies the number of bytes.
- These numbers should roughly be same as blkio.io_service_bytes as
- updated by CFQ. The difference between two is that
- blkio.io_service_bytes will not be updated if CFQ is not operating
- on request queue.
-
Common files among various policies
-----------------------------------
- blkio.reset_stats
diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt
index 1ee9caf29e57..e0975c2cf03d 100644
--- a/Documentation/cgroups/unified-hierarchy.txt
+++ b/Documentation/cgroups/unified-hierarchy.txt
@@ -27,7 +27,7 @@ CONTENTS
5-3-1. Format
5-3-2. Control Knobs
5-4. Per-Controller Changes
- 5-4-1. blkio
+ 5-4-1. io
5-4-2. cpuset
5-4-3. memory
6. Planned Changes
@@ -203,7 +203,7 @@ other issues. The mapping from nice level to weight isn't obvious or
universal, and there are various other knobs which simply aren't
available for tasks.
-The blkio controller implicitly creates a hidden leaf node for each
+The io controller implicitly creates a hidden leaf node for each
cgroup to host the tasks. The hidden leaf has its own copies of all
the knobs with "leaf_" prefixed. While this allows equivalent control
over internal tasks, it's with serious drawbacks. It always adds an
@@ -438,9 +438,62 @@ may be specified in any order and not all pairs have to be specified.
5-4. Per-Controller Changes
-5-4-1. blkio
+5-4-1. io
-- blk-throttle becomes properly hierarchical.
+- blkio is renamed to io. The interface is overhauled anyway. The
+ new name is more in line with the other two major controllers, cpu
+ and memory, and better suited given that it may be used for cgroup
+ writeback without involving block layer.
+
+- Everything including stat is always hierarchical making separate
+ recursive stat files pointless and, as no internal node can have
+ tasks, leaf weights are meaningless. The operation model is
+ simplified and the interface is overhauled accordingly.
+
+ io.stat
+
+ The stat file. The reported stats are from the point where
+ bio's are issued to request_queue. The stats are counted
+ independent of which policies are enabled. Each line in the
+ file follows the following format. More fields may later be
+ added at the end.
+
+ $MAJ:$MIN rbytes=$RBYTES wbytes=$WBYTES rios=$RIOS wrios=$WIOS
+
+ io.weight
+
+ The weight setting, currently only available and effective if
+ cfq-iosched is in use for the target device. The weight is
+ between 1 and 10000 and defaults to 100. The first line
+ always contains the default weight in the following format to
+ use when per-device setting is missing.
+
+ default $WEIGHT
+
+ Subsequent lines list per-device weights of the following
+ format.
+
+ $MAJ:$MIN $WEIGHT
+
+ Writing "$WEIGHT" or "default $WEIGHT" changes the default
+ setting. Writing "$MAJ:$MIN $WEIGHT" sets per-device weight
+ while "$MAJ:$MIN default" clears it.
+
+ This file is available only on non-root cgroups.
+
+ io.max
+
+ The maximum bandwidth and/or iops setting, only available if
+ blk-throttle is enabled. The file is of the following format.
+
+ $MAJ:$MIN rbps=$RBPS wbps=$WBPS riops=$RIOPS wiops=$WIOPS
+
+ ${R|W}BPS are read/write bytes per second and ${R|W}IOPS are
+ read/write IOs per second. "max" indicates no limit. Writing
+ to the file follows the same format but the individual
+ settings may be ommitted or specified in any order.
+
+ This file is available only on non-root cgroups.
5-4-2. cpuset
diff --git a/block/bio.c b/block/bio.c
index 515b5434fe2d..ad3f276d74bc 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1990,7 +1990,7 @@ int bio_associate_current(struct bio *bio)
get_io_context_active(ioc);
bio->bi_ioc = ioc;
- bio->bi_css = task_get_css(current, blkio_cgrp_id);
+ bio->bi_css = task_get_css(current, io_cgrp_id);
return 0;
}
EXPORT_SYMBOL_GPL(bio_associate_current);
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d6283b3f5db5..ac8370cb2515 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -24,6 +24,7 @@
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
+#include <linux/ctype.h>
#include <linux/blk-cgroup.h>
#include "blk.h"
@@ -68,9 +69,14 @@ static void blkg_free(struct blkcg_gq *blkg)
return;
for (i = 0; i < BLKCG_MAX_POLS; i++)
- kfree(blkg->pd[i]);
+ if (blkg->pd[i])
+ blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
- blk_exit_rl(&blkg->rl);
+ if (blkg->blkcg != &blkcg_root)
+ blk_exit_rl(&blkg->rl);
+
+ blkg_rwstat_exit(&blkg->stat_ios);
+ blkg_rwstat_exit(&blkg->stat_bytes);
kfree(blkg);
}
@@ -93,6 +99,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
if (!blkg)
return NULL;
+ if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) ||
+ blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
+ goto err_free;
+
blkg->q = q;
INIT_LIST_HEAD(&blkg->q_node);
blkg->blkcg = blkcg;
@@ -113,7 +123,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
continue;
/* alloc per-policy data and attach it to blkg */
- pd = kzalloc_node(pol->pd_size, gfp_mask, q->node);
+ pd = pol->pd_alloc_fn(gfp_mask, q->node);
if (!pd)
goto err_free;
@@ -129,26 +139,11 @@ err_free:
return NULL;
}
-/**
- * __blkg_lookup - internal version of blkg_lookup()
- * @blkcg: blkcg of interest
- * @q: request_queue of interest
- * @update_hint: whether to update lookup hint with the result or not
- *
- * This is internal version and shouldn't be used by policy
- * implementations. Looks up blkgs for the @blkcg - @q pair regardless of
- * @q's bypass state. If @update_hint is %true, the caller should be
- * holding @q->queue_lock and lookup hint is updated on success.
- */
-struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
- bool update_hint)
+struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
+ struct request_queue *q, bool update_hint)
{
struct blkcg_gq *blkg;
- blkg = rcu_dereference(blkcg->blkg_hint);
- if (blkg && blkg->q == q)
- return blkg;
-
/*
* Hint didn't match. Look up from the radix tree. Note that the
* hint can only be updated under queue_lock as otherwise @blkg
@@ -166,29 +161,11 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
return NULL;
}
-
-/**
- * blkg_lookup - lookup blkg for the specified blkcg - q pair
- * @blkcg: blkcg of interest
- * @q: request_queue of interest
- *
- * Lookup blkg for the @blkcg - @q pair. This function should be called
- * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
- * - see blk_queue_bypass_start() for details.
- */
-struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
-{
- WARN_ON_ONCE(!rcu_read_lock_held());
-
- if (unlikely(blk_queue_bypass(q)))
- return NULL;
- return __blkg_lookup(blkcg, q, false);
-}
-EXPORT_SYMBOL_GPL(blkg_lookup);
+EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
/*
* If @new_blkg is %NULL, this function tries to allocate a new one as
- * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return.
+ * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return.
*/
static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
struct request_queue *q,
@@ -203,12 +180,12 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
/* blkg holds a reference to blkcg */
if (!css_tryget_online(&blkcg->css)) {
- ret = -EINVAL;
+ ret = -ENODEV;
goto err_free_blkg;
}
wb_congested = wb_congested_get_create(&q->backing_dev_info,
- blkcg->css.id, GFP_ATOMIC);
+ blkcg->css.id, GFP_NOWAIT);
if (!wb_congested) {
ret = -ENOMEM;
goto err_put_css;
@@ -216,7 +193,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
/* allocate */
if (!new_blkg) {
- new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
+ new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT);
if (unlikely(!new_blkg)) {
ret = -ENOMEM;
goto err_put_congested;
@@ -229,7 +206,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
if (blkcg_parent(blkcg)) {
blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
if (WARN_ON_ONCE(!blkg->parent)) {
- ret = -EINVAL;
+ ret = -ENODEV;
goto err_put_congested;
}
blkg_get(blkg->parent);
@@ -240,7 +217,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
struct blkcg_policy *pol = blkcg_policy[i];
if (blkg->pd[i] && pol->pd_init_fn)
- pol->pd_init_fn(blkg);
+ pol->pd_init_fn(blkg->pd[i]);
}
/* insert */
@@ -254,7 +231,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
struct blkcg_policy *pol = blkcg_policy[i];
if (blkg->pd[i] && pol->pd_online_fn)
- pol->pd_online_fn(blkg);
+ pol->pd_online_fn(blkg->pd[i]);
}
}
blkg->online = true;
@@ -303,7 +280,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
* we shouldn't allow anything to go through for a bypassing queue.
*/
if (unlikely(blk_queue_bypass(q)))
- return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY);
+ return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
blkg = __blkg_lookup(blkcg, q, true);
if (blkg)
@@ -327,11 +304,11 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
return blkg;
}
}
-EXPORT_SYMBOL_GPL(blkg_lookup_create);
static void blkg_destroy(struct blkcg_gq *blkg)
{
struct blkcg *blkcg = blkg->blkcg;
+ struct blkcg_gq *parent = blkg->parent;
int i;
lockdep_assert_held(blkg->q->queue_lock);
@@ -345,8 +322,14 @@ static void blkg_destroy(struct blkcg_gq *blkg)
struct blkcg_policy *pol = blkcg_policy[i];
if (blkg->pd[i] && pol->pd_offline_fn)
- pol->pd_offline_fn(blkg);
+ pol->pd_offline_fn(blkg->pd[i]);
+ }
+
+ if (parent) {
+ blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
+ blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
}
+
blkg->online = false;
radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
@@ -400,15 +383,6 @@ static void blkg_destroy_all(struct request_queue *q)
void __blkg_release_rcu(struct rcu_head *rcu_head)
{
struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
- int i;
-
- /* tell policies that this one is being freed */
- for (i = 0; i < BLKCG_MAX_POLS; i++) {
- struct blkcg_policy *pol = blkcg_policy[i];
-
- if (blkg->pd[i] && pol->pd_exit_fn)
- pol->pd_exit_fn(blkg);
- }
/* release the blkcg and parent blkg refs this blkg has been holding */
css_put(&blkg->blkcg->css);
@@ -472,12 +446,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
* anyway. If you get hit by a race, retry.
*/
hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
+ blkg_rwstat_reset(&blkg->stat_bytes);
+ blkg_rwstat_reset(&blkg->stat_ios);
+
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
- if (blkcg_policy_enabled(blkg->q, pol) &&
- pol->pd_reset_stats_fn)
- pol->pd_reset_stats_fn(blkg);
+ if (blkg->pd[i] && pol->pd_reset_stats_fn)
+ pol->pd_reset_stats_fn(blkg->pd[i]);
}
}
@@ -486,13 +462,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
return 0;
}
-static const char *blkg_dev_name(struct blkcg_gq *blkg)
+const char *blkg_dev_name(struct blkcg_gq *blkg)
{
/* some drivers (floppy) instantiate a queue w/o disk registered */
if (blkg->q->backing_dev_info.dev)
return dev_name(blkg->q->backing_dev_info.dev);
return NULL;
}
+EXPORT_SYMBOL_GPL(blkg_dev_name);
/**
* blkcg_print_blkgs - helper for printing per-blkg data
@@ -581,9 +558,10 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
for (i = 0; i < BLKG_RWSTAT_NR; i++)
seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
- (unsigned long long)rwstat->cnt[i]);
+ (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));
- v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
+ v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
+ atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]);
seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
return v;
}
@@ -620,31 +598,122 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
}
EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
+static u64 blkg_prfill_rwstat_field(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
+{
+ struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off);
+
+ return __blkg_prfill_rwstat(sf, pd, &rwstat);
+}
+
+/**
+ * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes
+ * @sf: seq_file to print to
+ * @v: unused
+ *
+ * To be used as cftype->seq_show to print blkg->stat_bytes.
+ * cftype->private must be set to the blkcg_policy.
+ */
+int blkg_print_stat_bytes(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
+ offsetof(struct blkcg_gq, stat_bytes), true);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_bytes);
+
+/**
+ * blkg_print_stat_bytes - seq_show callback for blkg->stat_ios
+ * @sf: seq_file to print to
+ * @v: unused
+ *
+ * To be used as cftype->seq_show to print blkg->stat_ios. cftype->private
+ * must be set to the blkcg_policy.
+ */
+int blkg_print_stat_ios(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
+ offsetof(struct blkcg_gq, stat_ios), true);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_ios);
+
+static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
+ struct blkg_policy_data *pd,
+ int off)
+{
+ struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg,
+ NULL, off);
+ return __blkg_prfill_rwstat(sf, pd, &rwstat);
+}
+
+/**
+ * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes
+ * @sf: seq_file to print to
+ * @v: unused
+ */
+int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ blkg_prfill_rwstat_field_recursive,
+ (void *)seq_cft(sf)->private,
+ offsetof(struct blkcg_gq, stat_bytes), true);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive);
+
+/**
+ * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios
+ * @sf: seq_file to print to
+ * @v: unused
+ */
+int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ blkg_prfill_rwstat_field_recursive,
+ (void *)seq_cft(sf)->private,
+ offsetof(struct blkcg_gq, stat_ios), true);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive);
+
/**
* blkg_stat_recursive_sum - collect hierarchical blkg_stat
- * @pd: policy private data of interest
- * @off: offset to the blkg_stat in @pd
+ * @blkg: blkg of interest
+ * @pol: blkcg_policy which contains the blkg_stat
+ * @off: offset to the blkg_stat in blkg_policy_data or @blkg
+ *
+ * Collect the blkg_stat specified by @blkg, @pol and @off and all its
+ * online descendants and their aux counts. The caller must be holding the
+ * queue lock for online tests.
*
- * Collect the blkg_stat specified by @off from @pd and all its online
- * descendants and return the sum. The caller must be holding the queue
- * lock for online tests.
+ * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is
+ * at @off bytes into @blkg's blkg_policy_data of the policy.
*/
-u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off)
+u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
+ struct blkcg_policy *pol, int off)
{
- struct blkcg_policy *pol = blkcg_policy[pd->plid];
struct blkcg_gq *pos_blkg;
struct cgroup_subsys_state *pos_css;
u64 sum = 0;
- lockdep_assert_held(pd->blkg->q->queue_lock);
+ lockdep_assert_held(blkg->q->queue_lock);
rcu_read_lock();
- blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
- struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
- struct blkg_stat *stat = (void *)pos_pd + off;
+ blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
+ struct blkg_stat *stat;
+
+ if (!pos_blkg->online)
+ continue;
+
+ if (pol)
+ stat = (void *)blkg_to_pd(pos_blkg, pol) + off;
+ else
+ stat = (void *)blkg + off;
- if (pos_blkg->online)
- sum += blkg_stat_read(stat);
+ sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt);
}
rcu_read_unlock();
@@ -654,37 +723,43 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
/**
* blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
- * @pd: policy private data of interest
- * @off: offset to the blkg_stat in @pd
+ * @blkg: blkg of interest
+ * @pol: blkcg_policy which contains the blkg_rwstat
+ * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
+ *
+ * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
+ * online descendants and their aux counts. The caller must be holding the
+ * queue lock for online tests.
*
- * Collect the blkg_rwstat specified by @off from @pd and all its online
- * descendants and return the sum. The caller must be holding the queue
- * lock for online tests.
+ * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
+ * is at @off bytes into @blkg's blkg_policy_data of the policy.
*/
-struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
- int off)
+struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
+ struct blkcg_policy *pol, int off)
{
- struct blkcg_policy *pol = blkcg_policy[pd->plid];
struct blkcg_gq *pos_blkg;
struct cgroup_subsys_state *pos_css;
struct blkg_rwstat sum = { };
int i;
- lockdep_assert_held(pd->blkg->q->queue_lock);
+ lockdep_assert_held(blkg->q->queue_lock);
rcu_read_lock();
- blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
- struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
- struct blkg_rwstat *rwstat = (void *)pos_pd + off;
- struct blkg_rwstat tmp;
+ blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
+ struct blkg_rwstat *rwstat;
if (!pos_blkg->online)
continue;
- tmp = blkg_rwstat_read(rwstat);
+ if (pol)
+ rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off;
+ else
+ rwstat = (void *)pos_blkg + off;
for (i = 0; i < BLKG_RWSTAT_NR; i++)
- sum.cnt[i] += tmp.cnt[i];
+ atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) +
+ percpu_counter_sum_positive(&rwstat->cpu_cnt[i]),
+ &sum.aux_cnt[i]);
}
rcu_read_unlock();
@@ -700,29 +775,34 @@ EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
* @ctx: blkg_conf_ctx to be filled
*
* Parse per-blkg config update from @input and initialize @ctx with the
- * result. @ctx->blkg points to the blkg to be updated and @ctx->v the new
- * value. This function returns with RCU read lock and queue lock held and
- * must be paired with blkg_conf_finish().
+ * result. @ctx->blkg points to the blkg to be updated and @ctx->body the
+ * part of @input following MAJ:MIN. This function returns with RCU read
+ * lock and queue lock held and must be paired with blkg_conf_finish().
*/
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
- const char *input, struct blkg_conf_ctx *ctx)
+ char *input, struct blkg_conf_ctx *ctx)
__acquires(rcu) __acquires(disk->queue->queue_lock)
{
struct gendisk *disk;
struct blkcg_gq *blkg;
unsigned int major, minor;
- unsigned long long v;
- int part, ret;
+ int key_len, part, ret;
+ char *body;
- if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
+ if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
return -EINVAL;
+ body = input + key_len;
+ if (!isspace(*body))
+ return -EINVAL;
+ body = skip_spaces(body);
+
disk = get_gendisk(MKDEV(major, minor), &part);
if (!disk)
- return -EINVAL;
+ return -ENODEV;
if (part) {
put_disk(disk);
- return -EINVAL;
+ return -ENODEV;
}
rcu_read_lock();
@@ -731,7 +811,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
if (blkcg_policy_enabled(disk->queue, pol))
blkg = blkg_lookup_create(blkcg, disk->queue);
else
- blkg = ERR_PTR(-EINVAL);
+ blkg = ERR_PTR(-EOPNOTSUPP);
if (IS_ERR(blkg)) {
ret = PTR_ERR(blkg);
@@ -753,7 +833,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
ctx->disk = disk;
ctx->blkg = blkg;
- ctx->v = v;
+ ctx->body = body;
return 0;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);
@@ -774,8 +854,55 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx)
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
+static int blkcg_print_stat(struct seq_file *sf, void *v)
+{
+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+ struct blkcg_gq *blkg;
+
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+ const char *dname;
+ struct blkg_rwstat rwstat;
+ u64 rbytes, wbytes, rios, wios;
+
+ dname = blkg_dev_name(blkg);
+ if (!dname)
+ continue;
+
+ spin_lock_irq(blkg->q->queue_lock);
+
+ rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
+ offsetof(struct blkcg_gq, stat_bytes));
+ rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
+ wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+ rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
+ offsetof(struct blkcg_gq, stat_ios));
+ rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
+ wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+ spin_unlock_irq(blkg->q->queue_lock);
+
+ if (rbytes || wbytes || rios || wios)
+ seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n",
+ dname, rbytes, wbytes, rios, wios);
+ }
+
+ rcu_read_unlock();
+ return 0;
+}
+
struct cftype blkcg_files[] = {
{
+ .name = "stat",
+ .seq_show = blkcg_print_stat,
+ },
+ { } /* terminate */
+};
+
+struct cftype blkcg_legacy_files[] = {
+ {
.name = "reset_stats",
.write_u64 = blkcg_reset_stats,
},
@@ -822,18 +949,19 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
static void blkcg_css_free(struct cgroup_subsys_state *css)
{
struct blkcg *blkcg = css_to_blkcg(css);
+ int i;
mutex_lock(&blkcg_pol_mutex);
+
list_del(&blkcg->all_blkcgs_node);
- mutex_unlock(&blkcg_pol_mutex);
- if (blkcg != &blkcg_root) {
- int i;
+ for (i = 0; i < BLKCG_MAX_POLS; i++)
+ if (blkcg->cpd[i])
+ blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
- for (i = 0; i < BLKCG_MAX_POLS; i++)
- kfree(blkcg->pd[i]);
- kfree(blkcg);
- }
+ mutex_unlock(&blkcg_pol_mutex);
+
+ kfree(blkcg);
}
static struct cgroup_subsys_state *
@@ -847,13 +975,12 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
if (!parent_css) {
blkcg = &blkcg_root;
- goto done;
- }
-
- blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
- if (!blkcg) {
- ret = ERR_PTR(-ENOMEM);
- goto free_blkcg;
+ } else {
+ blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
+ if (!blkcg) {
+ ret = ERR_PTR(-ENOMEM);
+ goto free_blkcg;
+ }
}
for (i = 0; i < BLKCG_MAX_POLS ; i++) {
@@ -866,23 +993,23 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
* check if the policy requires any specific per-cgroup
* data: if it does, allocate and initialize it.
*/
- if (!pol || !pol->cpd_size)
+ if (!pol || !pol->cpd_alloc_fn)
continue;
- BUG_ON(blkcg->pd[i]);
- cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
+ cpd = pol->cpd_alloc_fn(GFP_KERNEL);
if (!cpd) {
ret = ERR_PTR(-ENOMEM);
goto free_pd_blkcg;
}
- blkcg->pd[i] = cpd;
+ blkcg->cpd[i] = cpd;
+ cpd->blkcg = blkcg;
cpd->plid = i;
- pol->cpd_init_fn(blkcg);
+ if (pol->cpd_init_fn)
+ pol->cpd_init_fn(cpd);
}
-done:
spin_lock_init(&blkcg->lock);
- INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
+ INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT);
INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&blkcg->cgwb_list);
@@ -894,7 +1021,8 @@ done:
free_pd_blkcg:
for (i--; i >= 0; i--)
- kfree(blkcg->pd[i]);
+ if (blkcg->cpd[i])
+ blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
free_blkcg:
kfree(blkcg);
mutex_unlock(&blkcg_pol_mutex);
@@ -938,7 +1066,7 @@ int blkcg_init_queue(struct request_queue *q)
radix_tree_preload_end();
if (IS_ERR(blkg)) {
- kfree(new_blkg);
+ blkg_free(new_blkg);
return PTR_ERR(blkg);
}
@@ -1015,12 +1143,35 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css,
return ret;
}
-struct cgroup_subsys blkio_cgrp_subsys = {
+static void blkcg_bind(struct cgroup_subsys_state *root_css)
+{
+ int i;
+
+ mutex_lock(&blkcg_pol_mutex);
+
+ for (i = 0; i < BLKCG_MAX_POLS; i++) {
+ struct blkcg_policy *pol = blkcg_policy[i];
+ struct blkcg *blkcg;
+
+ if (!pol || !pol->cpd_bind_fn)
+ continue;
+
+ list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node)
+ if (blkcg->cpd[pol->plid])
+ pol->cpd_bind_fn(blkcg->cpd[pol->plid]);
+ }
+ mutex_unlock(&blkcg_pol_mutex);
+}
+
+struct cgroup_subsys io_cgrp_subsys = {
.css_alloc = blkcg_css_alloc,
.css_offline = blkcg_css_offline,
.css_free = blkcg_css_free,
.can_attach = blkcg_can_attach,
- .legacy_cftypes = blkcg_files,
+ .bind = blkcg_bind,
+ .dfl_cftypes = blkcg_files,
+ .legacy_cftypes = blkcg_legacy_files,
+ .legacy_name = "blkio",
#ifdef CONFIG_MEMCG
/*
* This ensures that, if available, memcg is automatically enabled
@@ -1030,7 +1181,7 @@ struct cgroup_subsys blkio_cgrp_subsys = {
.depends_on = 1 << memory_cgrp_id,
#endif
};
-EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
+EXPORT_SYMBOL_GPL(io_cgrp_subsys);
/**
* blkcg_activate_policy - activate a blkcg policy on a request_queue
@@ -1051,65 +1202,54 @@ EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
int blkcg_activate_policy(struct request_queue *q,
const struct blkcg_policy *pol)
{
- LIST_HEAD(pds);
+ struct blkg_policy_data *pd_prealloc = NULL;
struct blkcg_gq *blkg;
- struct blkg_policy_data *pd, *nd;
- int cnt = 0, ret;
+ int ret;
if (blkcg_policy_enabled(q, pol))
return 0;
- /* count and allocate policy_data for all existing blkgs */
blk_queue_bypass_start(q);
- spin_lock_irq(q->queue_lock);
- list_for_each_entry(blkg, &q->blkg_list, q_node)
- cnt++;
- spin_unlock_irq(q->queue_lock);
-
- /* allocate per-blkg policy data for all existing blkgs */
- while (cnt--) {
- pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
- if (!pd) {
+pd_prealloc:
+ if (!pd_prealloc) {
+ pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
+ if (!pd_prealloc) {
ret = -ENOMEM;
- goto out_free;
+ goto out_bypass_end;
}
- list_add_tail(&pd->alloc_node, &pds);
}
- /*
- * Install the allocated pds and cpds. With @q bypassing, no new blkg
- * should have been created while the queue lock was dropped.
- */
spin_lock_irq(q->queue_lock);
list_for_each_entry(blkg, &q->blkg_list, q_node) {
- if (WARN_ON(list_empty(&pds))) {
- /* umm... this shouldn't happen, just abort */
- ret = -ENOMEM;
- goto out_unlock;
- }
- pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
- list_del_init(&pd->alloc_node);
+ struct blkg_policy_data *pd;
- /* grab blkcg lock too while installing @pd on @blkg */
- spin_lock(&blkg->blkcg->lock);
+ if (blkg->pd[pol->plid])
+ continue;
+
+ pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node);
+ if (!pd)
+ swap(pd, pd_prealloc);
+ if (!pd) {
+ spin_unlock_irq(q->queue_lock);
+ goto pd_prealloc;
+ }
blkg->pd[pol->plid] = pd;
pd->blkg = blkg;
pd->plid = pol->plid;
- pol->pd_init_fn(blkg);
-
- spin_unlock(&blkg->blkcg->lock);
+ if (pol->pd_init_fn)
+ pol->pd_init_fn(pd);
}
__set_bit(pol->plid, q->blkcg_pols);
ret = 0;
-out_unlock:
+
spin_unlock_irq(q->queue_lock);
-out_free:
+out_bypass_end:
blk_queue_bypass_end(q);
- list_for_each_entry_safe(pd, nd, &pds, alloc_node)
- kfree(pd);
+ if (pd_prealloc)
+ pol->pd_free_fn(pd_prealloc);
return ret;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);
@@ -1139,13 +1279,12 @@ void blkcg_deactivate_policy(struct request_queue *q,
/* grab blkcg lock too while removing @pd from @blkg */
spin_lock(&blkg->blkcg->lock);
- if (pol->pd_offline_fn)
- pol->pd_offline_fn(blkg);
- if (pol->pd_exit_fn)
- pol->pd_exit_fn(blkg);
-
- kfree(blkg->pd[pol->plid]);
- blkg->pd[pol->plid] = NULL;
+ if (blkg->pd[pol->plid]) {
+ if (pol->pd_offline_fn)
+ pol->pd_offline_fn(blkg->pd[pol->plid]);
+ pol->pd_free_fn(blkg->pd[pol->plid]);
+ blkg->pd[pol->plid] = NULL;
+ }
spin_unlock(&blkg->blkcg->lock);
}
@@ -1167,9 +1306,6 @@ int blkcg_policy_register(struct blkcg_policy *pol)
struct blkcg *blkcg;
int i, ret;
- if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
- return -EINVAL;
-
mutex_lock(&blkcg_pol_register_mutex);
mutex_lock(&blkcg_pol_mutex);
@@ -1186,36 +1322,42 @@ int blkcg_policy_register(struct blkcg_policy *pol)
blkcg_policy[pol->plid] = pol;
/* allocate and install cpd's */
- if (pol->cpd_size) {
+ if (pol->cpd_alloc_fn) {
list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
struct blkcg_policy_data *cpd;
- cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
+ cpd = pol->cpd_alloc_fn(GFP_KERNEL);
if (!cpd) {
mutex_unlock(&blkcg_pol_mutex);
goto err_free_cpds;
}
- blkcg->pd[pol->plid] = cpd;
+ blkcg->cpd[pol->plid] = cpd;
+ cpd->blkcg = blkcg;
cpd->plid = pol->plid;
- pol->cpd_init_fn(blkcg);
+ pol->cpd_init_fn(cpd);
}
}
mutex_unlock(&blkcg_pol_mutex);
/* everything is in place, add intf files for the new policy */
- if (pol->cftypes)
- WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys,
- pol->cftypes));
+ if (pol->dfl_cftypes)
+ WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
+ pol->dfl_cftypes));
+ if (pol->legacy_cftypes)
+ WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
+ pol->legacy_cftypes));
mutex_unlock(&blkcg_pol_register_mutex);
return 0;
err_free_cpds:
- if (pol->cpd_size) {
+ if (pol->cpd_alloc_fn) {
list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
- kfree(blkcg->pd[pol->plid]);
- blkcg->pd[pol->plid] = NULL;
+ if (blkcg->cpd[pol->plid]) {
+ pol->cpd_free_fn(blkcg->cpd[pol->plid]);
+ blkcg->cpd[pol->plid] = NULL;
+ }
}
}
blkcg_policy[pol->plid] = NULL;
@@ -1242,16 +1384,20 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
goto out_unlock;
/* kill the intf files first */
- if (pol->cftypes)
- cgroup_rm_cftypes(pol->cftypes);
+ if (pol->dfl_cftypes)
+ cgroup_rm_cftypes(pol->dfl_cftypes);
+ if (pol->legacy_cftypes)
+ cgroup_rm_cftypes(pol->legacy_cftypes);
/* remove cpds and unregister */
mutex_lock(&blkcg_pol_mutex);
- if (pol->cpd_size) {
+ if (pol->cpd_alloc_fn) {
list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
- kfree(blkcg->pd[pol->plid]);
- blkcg->pd[pol->plid] = NULL;
+ if (blkcg->cpd[pol->plid]) {
+ pol->cpd_free_fn(blkcg->cpd[pol->plid]);
+ blkcg->cpd[pol->plid] = NULL;
+ }
}
}
blkcg_policy[pol->plid] = NULL;
diff --git a/block/blk-core.c b/block/blk-core.c
index 60912e983f16..2eb722d48773 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1888,8 +1888,8 @@ generic_make_request_checks(struct bio *bio)
*/
create_io_context(GFP_ATOMIC, q->node);
- if (blk_throtl_bio(q, bio))
- return false; /* throttled, will be resubmitted later */
+ if (!blkcg_bio_issue_check(q, bio))
+ return false;
trace_block_bio_queue(q, bio);
return true;
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index b23193518ac7..c75a2636dd40 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -83,14 +83,6 @@ enum tg_state_flags {
#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
-/* Per-cpu group stats */
-struct tg_stats_cpu {
- /* total bytes transferred */
- struct blkg_rwstat service_bytes;
- /* total IOs serviced, post merge */
- struct blkg_rwstat serviced;
-};
-
struct throtl_grp {
/* must be the first member */
struct blkg_policy_data pd;
@@ -141,12 +133,6 @@ struct throtl_grp {
/* When did we start a new slice */
unsigned long slice_start[2];
unsigned long slice_end[2];
-
- /* Per cpu stats pointer */
- struct tg_stats_cpu __percpu *stats_cpu;
-
- /* List of tgs waiting for per cpu stats memory to be allocated */
- struct list_head stats_alloc_node;
};
struct throtl_data
@@ -168,13 +154,6 @@ struct throtl_data
struct work_struct dispatch_work;
};
-/* list and work item to allocate percpu group stats */
-static DEFINE_SPINLOCK(tg_stats_alloc_lock);
-static LIST_HEAD(tg_stats_alloc_list);
-
-static void tg_stats_alloc_fn(struct work_struct *);
-static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
-
static void throtl_pending_timer_fn(unsigned long arg);
static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
@@ -192,11 +171,6 @@ static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
return pd_to_blkg(&tg->pd);
}
-static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
-{
- return blkg_to_tg(td->queue->root_blkg);
-}
-
/**
* sq_to_tg - return the throl_grp the specified service queue belongs to
* @sq: the throtl_service_queue of interest
@@ -256,53 +230,6 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
} \
} while (0)
-static void tg_stats_init(struct tg_stats_cpu *tg_stats)
-{
- blkg_rwstat_init(&tg_stats->service_bytes);
- blkg_rwstat_init(&tg_stats->serviced);
-}
-
-/*
- * Worker for allocating per cpu stat for tgs. This is scheduled on the
- * system_wq once there are some groups on the alloc_list waiting for
- * allocation.
- */
-static void tg_stats_alloc_fn(struct work_struct *work)
-{
- static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */
- struct delayed_work *dwork = to_delayed_work(work);
- bool empty = false;
-
-alloc_stats:
- if (!stats_cpu) {
- int cpu;
-
- stats_cpu = alloc_percpu(struct tg_stats_cpu);
- if (!stats_cpu) {
- /* allocation failed, try again after some time */
- schedule_delayed_work(dwork, msecs_to_jiffies(10));
- return;
- }
- for_each_possible_cpu(cpu)
- tg_stats_init(per_cpu_ptr(stats_cpu, cpu));
- }
-
- spin_lock_irq(&tg_stats_alloc_lock);
-
- if (!list_empty(&tg_stats_alloc_list)) {
- struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list,
- struct throtl_grp,
- stats_alloc_node);
- swap(tg->stats_cpu, stats_cpu);
- list_del_init(&tg->stats_alloc_node);
- }
-
- empty = list_empty(&tg_stats_alloc_list);
- spin_unlock_irq(&tg_stats_alloc_lock);
- if (!empty)
- goto alloc_stats;
-}
-
static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
{
INIT_LIST_HEAD(&qn->node);
@@ -387,29 +314,46 @@ static struct bio *throtl_pop_queued(struct list_head *queued,
}
/* init a service_queue, assumes the caller zeroed it */
-static void throtl_service_queue_init(struct throtl_service_queue *sq,
- struct throtl_service_queue *parent_sq)
+static void throtl_service_queue_init(struct throtl_service_queue *sq)
{
INIT_LIST_HEAD(&sq->queued[0]);
INIT_LIST_HEAD(&sq->queued[1]);
sq->pending_tree = RB_ROOT;
- sq->parent_sq = parent_sq;
setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
(unsigned long)sq);
}
-static void throtl_service_queue_exit(struct throtl_service_queue *sq)
+static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
{
- del_timer_sync(&sq->pending_timer);
+ struct throtl_grp *tg;
+ int rw;
+
+ tg = kzalloc_node(sizeof(*tg), gfp, node);
+ if (!tg)
+ return NULL;
+
+ throtl_service_queue_init(&tg->service_queue);
+
+ for (rw = READ; rw <= WRITE; rw++) {
+ throtl_qnode_init(&tg->qnode_on_self[rw], tg);
+ throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
+ }
+
+ RB_CLEAR_NODE(&tg->rb_node);
+ tg->bps[READ] = -1;
+ tg->bps[WRITE] = -1;
+ tg->iops[READ] = -1;
+ tg->iops[WRITE] = -1;
+
+ return &tg->pd;
}
-static void throtl_pd_init(struct blkcg_gq *blkg)
+static void throtl_pd_init(struct blkg_policy_data *pd)
{
- struct throtl_grp *tg = blkg_to_tg(blkg);
+ struct throtl_grp *tg = pd_to_tg(pd);
+ struct blkcg_gq *blkg = tg_to_blkg(tg);
struct throtl_data *td = blkg->q->td;
- struct throtl_service_queue *parent_sq;
- unsigned long flags;
- int rw;
+ struct throtl_service_queue *sq = &tg->service_queue;
/*
* If on the default hierarchy, we switch to properly hierarchical
@@ -424,35 +368,10 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
* Limits of a group don't interact with limits of other groups
* regardless of the position of the group in the hierarchy.
*/
- parent_sq = &td->service_queue;
-
+ sq->parent_sq = &td->service_queue;
if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent)
- parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
-
- throtl_service_queue_init(&tg->service_queue, parent_sq);
-
- for (rw = READ; rw <= WRITE; rw++) {
- throtl_qnode_init(&tg->qnode_on_self[rw], tg);
- throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
- }
-
- RB_CLEAR_NODE(&tg->rb_node);
+ sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
tg->td = td;
-
- tg->bps[READ] = -1;
- tg->bps[WRITE] = -1;
- tg->iops[READ] = -1;
- tg->iops[WRITE] = -1;
-
- /*
- * Ugh... We need to perform per-cpu allocation for tg->stats_cpu
- * but percpu allocator can't be called from IO path. Queue tg on
- * tg_stats_alloc_list and allocate from work item.
- */
- spin_lock_irqsave(&tg_stats_alloc_lock, flags);
- list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
- schedule_delayed_work(&tg_stats_alloc_work, 0);
- spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
}
/*
@@ -470,83 +389,21 @@ static void tg_update_has_rules(struct throtl_grp *tg)
(tg->bps[rw] != -1 || tg->iops[rw] != -1);
}
-static void throtl_pd_online(struct blkcg_gq *blkg)
+static void throtl_pd_online(struct blkg_policy_data *pd)
{
/*
* We don't want new groups to escape the limits of its ancestors.
* Update has_rules[] after a new group is brought online.
*/
- tg_update_has_rules(blkg_to_tg(blkg));
-}
-
-static void throtl_pd_exit(struct blkcg_gq *blkg)
-{
- struct throtl_grp *tg = blkg_to_tg(blkg);
- unsigned long flags;
-
- spin_lock_irqsave(&tg_stats_alloc_lock, flags);
- list_del_init(&tg->stats_alloc_node);
- spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
-
- free_percpu(tg->stats_cpu);
-
- throtl_service_queue_exit(&tg->service_queue);
-}
-
-static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
-{
- struct throtl_grp *tg = blkg_to_tg(blkg);
- int cpu;
-
- if (tg->stats_cpu == NULL)
- return;
-
- for_each_possible_cpu(cpu) {
- struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
-
- blkg_rwstat_reset(&sc->service_bytes);
- blkg_rwstat_reset(&sc->serviced);
- }
-}
-
-static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td,
- struct blkcg *blkcg)
-{
- /*
- * This is the common case when there are no blkcgs. Avoid lookup
- * in this case
- */
- if (blkcg == &blkcg_root)
- return td_root_tg(td);
-
- return blkg_to_tg(blkg_lookup(blkcg, td->queue));
+ tg_update_has_rules(pd_to_tg(pd));
}
-static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
- struct blkcg *blkcg)
+static void throtl_pd_free(struct blkg_policy_data *pd)
{
- struct request_queue *q = td->queue;
- struct throtl_grp *tg = NULL;
-
- /*
- * This is the common case when there are no blkcgs. Avoid lookup
- * in this case
- */
- if (blkcg == &blkcg_root) {
- tg = td_root_tg(td);
- } else {
- struct blkcg_gq *blkg;
-
- blkg = blkg_lookup_create(blkcg, q);
-
- /* if %NULL and @q is alive, fall back to root_tg */
- if (!IS_ERR(blkg))
- tg = blkg_to_tg(blkg);
- else if (!blk_queue_dying(q))
- tg = td_root_tg(td);
- }
+ struct throtl_grp *tg = pd_to_tg(pd);
- return tg;
+ del_timer_sync(&tg->service_queue.pending_timer);
+ kfree(tg);
}
static struct throtl_grp *
@@ -956,32 +813,6 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
return 0;
}
-static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
- int rw)
-{
- struct throtl_grp *tg = blkg_to_tg(blkg);
- struct tg_stats_cpu *stats_cpu;
- unsigned long flags;
-
- /* If per cpu stats are not allocated yet, don't do any accounting. */
- if (tg->stats_cpu == NULL)
- return;
-
- /*
- * Disabling interrupts to provide mutual exclusion between two
- * writes on same cpu. It probably is not needed for 64bit. Not
- * optimizing that case yet.
- */
- local_irq_save(flags);
-
- stats_cpu = this_cpu_ptr(tg->stats_cpu);
-
- blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
- blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
-
- local_irq_restore(flags);
-}
-
static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
{
bool rw = bio_data_dir(bio);
@@ -995,17 +826,9 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
* more than once as a throttled bio will go through blk-throtl the
* second time when it eventually gets issued. Set it when a bio
* is being charged to a tg.
- *
- * Dispatch stats aren't recursive and each @bio should only be
- * accounted by the @tg it was originally associated with. Let's
- * update the stats when setting REQ_THROTTLED for the first time
- * which is guaranteed to be for the @bio's original tg.
*/
- if (!(bio->bi_rw & REQ_THROTTLED)) {
+ if (!(bio->bi_rw & REQ_THROTTLED))
bio->bi_rw |= REQ_THROTTLED;
- throtl_update_dispatch_stats(tg_to_blkg(tg),
- bio->bi_iter.bi_size, bio->bi_rw);
- }
}
/**
@@ -1285,34 +1108,6 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
}
}
-static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
- struct blkg_policy_data *pd, int off)
-{
- struct throtl_grp *tg = pd_to_tg(pd);
- struct blkg_rwstat rwstat = { }, tmp;
- int i, cpu;
-
- if (tg->stats_cpu == NULL)
- return 0;
-
- for_each_possible_cpu(cpu) {
- struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
-
- tmp = blkg_rwstat_read((void *)sc + off);
- for (i = 0; i < BLKG_RWSTAT_NR; i++)
- rwstat.cnt[i] += tmp.cnt[i];
- }
-
- return __blkg_prfill_rwstat(sf, pd, &rwstat);
-}
-
-static int tg_print_cpu_rwstat(struct seq_file *sf, void *v)
-{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat,
- &blkcg_policy_throtl, seq_cft(sf)->private, true);
- return 0;
-}
-
static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
int off)
{
@@ -1349,31 +1144,11 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v)
return 0;
}
-static ssize_t tg_set_conf(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off, bool is_u64)
+static void tg_conf_updated(struct throtl_grp *tg)
{
- struct blkcg *blkcg = css_to_blkcg(of_css(of));
- struct blkg_conf_ctx ctx;
- struct throtl_grp *tg;
- struct throtl_service_queue *sq;
- struct blkcg_gq *blkg;
+ struct throtl_service_queue *sq = &tg->service_queue;
struct cgroup_subsys_state *pos_css;
- int ret;
-
- ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
- if (ret)
- return ret;
-
- tg = blkg_to_tg(ctx.blkg);
- sq = &tg->service_queue;
-
- if (!ctx.v)
- ctx.v = -1;
-
- if (is_u64)
- *(u64 *)((void *)tg + of_cft(of)->private) = ctx.v;
- else
- *(unsigned int *)((void *)tg + of_cft(of)->private) = ctx.v;
+ struct blkcg_gq *blkg;
throtl_log(&tg->service_queue,
"limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
@@ -1387,7 +1162,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
* restrictions in the whole hierarchy and allows them to bypass
* blk-throttle.
*/
- blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg)
+ blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg))
tg_update_has_rules(blkg_to_tg(blkg));
/*
@@ -1405,9 +1180,39 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
tg_update_disptime(tg);
throtl_schedule_next_dispatch(sq->parent_sq, true);
}
+}
+
+static ssize_t tg_set_conf(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off, bool is_u64)
+{
+ struct blkcg *blkcg = css_to_blkcg(of_css(of));
+ struct blkg_conf_ctx ctx;
+ struct throtl_grp *tg;
+ int ret;
+ u64 v;
+ ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
+ if (ret)
+ return ret;
+
+ ret = -EINVAL;
+ if (sscanf(ctx.body, "%llu", &v) != 1)
+ goto out_finish;
+ if (!v)
+ v = -1;
+
+ tg = blkg_to_tg(ctx.blkg);
+
+ if (is_u64)
+ *(u64 *)((void *)tg + of_cft(of)->private) = v;
+ else
+ *(unsigned int *)((void *)tg + of_cft(of)->private) = v;
+
+ tg_conf_updated(tg);
+ ret = 0;
+out_finish:
blkg_conf_finish(&ctx);
- return nbytes;
+ return ret ?: nbytes;
}
static ssize_t tg_set_conf_u64(struct kernfs_open_file *of,
@@ -1422,7 +1227,7 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
return tg_set_conf(of, buf, nbytes, off, false);
}
-static struct cftype throtl_files[] = {
+static struct cftype throtl_legacy_files[] = {
{
.name = "throttle.read_bps_device",
.private = offsetof(struct throtl_grp, bps[READ]),
@@ -1449,13 +1254,124 @@ static struct cftype throtl_files[] = {
},
{
.name = "throttle.io_service_bytes",
- .private = offsetof(struct tg_stats_cpu, service_bytes),
- .seq_show = tg_print_cpu_rwstat,
+ .private = (unsigned long)&blkcg_policy_throtl,
+ .seq_show = blkg_print_stat_bytes,
},
{
.name = "throttle.io_serviced",
- .private = offsetof(struct tg_stats_cpu, serviced),
- .seq_show = tg_print_cpu_rwstat,
+ .private = (unsigned long)&blkcg_policy_throtl,
+ .seq_show = blkg_print_stat_ios,
+ },
+ { } /* terminate */
+};
+
+static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd,
+ int off)
+{
+ struct throtl_grp *tg = pd_to_tg(pd);
+ const char *dname = blkg_dev_name(pd->blkg);
+ char bufs[4][21] = { "max", "max", "max", "max" };
+
+ if (!dname)
+ return 0;
+ if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 &&
+ tg->iops[READ] == -1 && tg->iops[WRITE] == -1)
+ return 0;
+
+ if (tg->bps[READ] != -1)
+ snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]);
+ if (tg->bps[WRITE] != -1)
+ snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]);
+ if (tg->iops[READ] != -1)
+ snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]);
+ if (tg->iops[WRITE] != -1)
+ snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]);
+
+ seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n",
+ dname, bufs[0], bufs[1], bufs[2], bufs[3]);
+ return 0;
+}
+
+static int tg_print_max(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max,
+ &blkcg_policy_throtl, seq_cft(sf)->private, false);
+ return 0;
+}
+
+static ssize_t tg_set_max(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct blkcg *blkcg = css_to_blkcg(of_css(of));
+ struct blkg_conf_ctx ctx;
+ struct throtl_grp *tg;
+ u64 v[4];
+ int ret;
+
+ ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
+ if (ret)
+ return ret;
+
+ tg = blkg_to_tg(ctx.blkg);
+
+ v[0] = tg->bps[READ];
+ v[1] = tg->bps[WRITE];
+ v[2] = tg->iops[READ];
+ v[3] = tg->iops[WRITE];
+
+ while (true) {
+ char tok[27]; /* wiops=18446744073709551616 */
+ char *p;
+ u64 val = -1;
+ int len;
+
+ if (sscanf(ctx.body, "%26s%n", tok, &len) != 1)
+ break;
+ if (tok[0] == '\0')
+ break;
+ ctx.body += len;
+
+ ret = -EINVAL;
+ p = tok;
+ strsep(&p, "=");
+ if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max")))
+ goto out_finish;
+
+ ret = -ERANGE;
+ if (!val)
+ goto out_finish;
+
+ ret = -EINVAL;
+ if (!strcmp(tok, "rbps"))
+ v[0] = val;
+ else if (!strcmp(tok, "wbps"))
+ v[1] = val;
+ else if (!strcmp(tok, "riops"))
+ v[2] = min_t(u64, val, UINT_MAX);
+ else if (!strcmp(tok, "wiops"))
+ v[3] = min_t(u64, val, UINT_MAX);
+ else
+ goto out_finish;
+ }
+
+ tg->bps[READ] = v[0];
+ tg->bps[WRITE] = v[1];
+ tg->iops[READ] = v[2];
+ tg->iops[WRITE] = v[3];
+
+ tg_conf_updated(tg);
+ ret = 0;
+out_finish:
+ blkg_conf_finish(&ctx);
+ return ret ?: nbytes;
+}
+
+static struct cftype throtl_files[] = {
+ {
+ .name = "max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = tg_print_max,
+ .write = tg_set_max,
},
{ } /* terminate */
};
@@ -1468,52 +1384,33 @@ static void throtl_shutdown_wq(struct request_queue *q)
}
static struct blkcg_policy blkcg_policy_throtl = {
- .pd_size = sizeof(struct throtl_grp),
- .cftypes = throtl_files,
+ .dfl_cftypes = throtl_files,
+ .legacy_cftypes = throtl_legacy_files,
+ .pd_alloc_fn = throtl_pd_alloc,
.pd_init_fn = throtl_pd_init,
.pd_online_fn = throtl_pd_online,
- .pd_exit_fn = throtl_pd_exit,
- .pd_reset_stats_fn = throtl_pd_reset_stats,
+ .pd_free_fn = throtl_pd_free,
};
-bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
+bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
+ struct bio *bio)
{
- struct throtl_data *td = q->td;
struct throtl_qnode *qn = NULL;
- struct throtl_grp *tg;
+ struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg);
struct throtl_service_queue *sq;
bool rw = bio_data_dir(bio);
- struct blkcg *blkcg;
bool throttled = false;
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
/* see throtl_charge_bio() */
- if (bio->bi_rw & REQ_THROTTLED)
+ if ((bio->bi_rw & REQ_THROTTLED) || !tg->has_rules[rw])
goto out;
- /*
- * A throtl_grp pointer retrieved under rcu can be used to access
- * basic fields like stats and io rates. If a group has no rules,
- * just update the dispatch stats in lockless manner and return.
- */
- rcu_read_lock();
- blkcg = bio_blkcg(bio);
- tg = throtl_lookup_tg(td, blkcg);
- if (tg) {
- if (!tg->has_rules[rw]) {
- throtl_update_dispatch_stats(tg_to_blkg(tg),
- bio->bi_iter.bi_size, bio->bi_rw);
- goto out_unlock_rcu;
- }
- }
-
- /*
- * Either group has not been allocated yet or it is not an unlimited
- * IO group
- */
spin_lock_irq(q->queue_lock);
- tg = throtl_lookup_create_tg(td, blkcg);
- if (unlikely(!tg))
+
+ if (unlikely(blk_queue_bypass(q)))
goto out_unlock;
sq = &tg->service_queue;
@@ -1580,8 +1477,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
out_unlock:
spin_unlock_irq(q->queue_lock);
-out_unlock_rcu:
- rcu_read_unlock();
out:
/*
* As multiple blk-throtls may stack in the same issue path, we
@@ -1667,7 +1562,7 @@ int blk_throtl_init(struct request_queue *q)
return -ENOMEM;
INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
- throtl_service_queue_init(&td->service_queue, NULL);
+ throtl_service_queue_init(&td->service_queue);
q->td = td;
td->queue = q;
diff --git a/block/blk.h b/block/blk.h
index 838188b35a83..98614ad37c81 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -272,15 +272,10 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node)
* Internal throttling interface
*/
#ifdef CONFIG_BLK_DEV_THROTTLING
-extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio);
extern void blk_throtl_drain(struct request_queue *q);
extern int blk_throtl_init(struct request_queue *q);
extern void blk_throtl_exit(struct request_queue *q);
#else /* CONFIG_BLK_DEV_THROTTLING */
-static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
-{
- return false;
-}
static inline void blk_throtl_drain(struct request_queue *q) { }
static inline int blk_throtl_init(struct request_queue *q) { return 0; }
static inline void blk_throtl_exit(struct request_queue *q) { }
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index c62bb2e650b8..04de88463a98 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -68,9 +68,9 @@ static struct kmem_cache *cfq_pool;
#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
/* blkio-related constants */
-#define CFQ_WEIGHT_MIN 10
-#define CFQ_WEIGHT_MAX 1000
-#define CFQ_WEIGHT_DEFAULT 500
+#define CFQ_WEIGHT_LEGACY_MIN 10
+#define CFQ_WEIGHT_LEGACY_DFL 500
+#define CFQ_WEIGHT_LEGACY_MAX 1000
struct cfq_ttime {
unsigned long last_end_request;
@@ -177,10 +177,6 @@ enum wl_type_t {
struct cfqg_stats {
#ifdef CONFIG_CFQ_GROUP_IOSCHED
- /* total bytes transferred */
- struct blkg_rwstat service_bytes;
- /* total IOs serviced, post merge */
- struct blkg_rwstat serviced;
/* number of ios merged */
struct blkg_rwstat merged;
/* total time spent on device in ns, may not be accurate w/ queueing */
@@ -189,8 +185,6 @@ struct cfqg_stats {
struct blkg_rwstat wait_time;
/* number of IOs queued up */
struct blkg_rwstat queued;
- /* total sectors transferred */
- struct blkg_stat sectors;
/* total disk time and nr sectors dispatched by this group */
struct blkg_stat time;
#ifdef CONFIG_DEBUG_BLK_CGROUP
@@ -220,7 +214,7 @@ struct cfqg_stats {
/* Per-cgroup data */
struct cfq_group_data {
/* must be the first member */
- struct blkcg_policy_data pd;
+ struct blkcg_policy_data cpd;
unsigned int weight;
unsigned int leaf_weight;
@@ -304,7 +298,11 @@ struct cfq_group {
int dispatched;
struct cfq_ttime ttime;
struct cfqg_stats stats; /* stats for this cfqg */
- struct cfqg_stats dead_stats; /* stats pushed from dead children */
+
+ /* async queue for each priority case */
+ struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
+ struct cfq_queue *async_idle_cfqq;
+
};
struct cfq_io_cq {
@@ -370,12 +368,6 @@ struct cfq_data {
struct cfq_queue *active_queue;
struct cfq_io_cq *active_cic;
- /*
- * async queue for each priority case
- */
- struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
- struct cfq_queue *async_idle_cfqq;
-
sector_t last_position;
/*
@@ -401,6 +393,7 @@ struct cfq_data {
};
static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
+static void cfq_put_queue(struct cfq_queue *cfqq);
static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
enum wl_class_t class,
@@ -612,7 +605,7 @@ static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
static struct cfq_group_data
*cpd_to_cfqgd(struct blkcg_policy_data *cpd)
{
- return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL;
+ return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL;
}
static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
@@ -693,14 +686,6 @@ static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw)
blkg_rwstat_add(&cfqg->stats.merged, rw, 1);
}
-static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
- uint64_t bytes, int rw)
-{
- blkg_stat_add(&cfqg->stats.sectors, bytes >> 9);
- blkg_rwstat_add(&cfqg->stats.serviced, rw, 1);
- blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes);
-}
-
static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
uint64_t start_time, uint64_t io_start_time, int rw)
{
@@ -718,8 +703,6 @@ static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
static void cfqg_stats_reset(struct cfqg_stats *stats)
{
/* queued stats shouldn't be cleared */
- blkg_rwstat_reset(&stats->service_bytes);
- blkg_rwstat_reset(&stats->serviced);
blkg_rwstat_reset(&stats->merged);
blkg_rwstat_reset(&stats->service_time);
blkg_rwstat_reset(&stats->wait_time);
@@ -736,28 +719,26 @@ static void cfqg_stats_reset(struct cfqg_stats *stats)
}
/* @to += @from */
-static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from)
+static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from)
{
/* queued stats shouldn't be cleared */
- blkg_rwstat_merge(&to->service_bytes, &from->service_bytes);
- blkg_rwstat_merge(&to->serviced, &from->serviced);
- blkg_rwstat_merge(&to->merged, &from->merged);
- blkg_rwstat_merge(&to->service_time, &from->service_time);
- blkg_rwstat_merge(&to->wait_time, &from->wait_time);
- blkg_stat_merge(&from->time, &from->time);
+ blkg_rwstat_add_aux(&to->merged, &from->merged);
+ blkg_rwstat_add_aux(&to->service_time, &from->service_time);
+ blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
+ blkg_stat_add_aux(&from->time, &from->time);
#ifdef CONFIG_DEBUG_BLK_CGROUP
- blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time);
- blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
- blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
- blkg_stat_merge(&to->dequeue, &from->dequeue);
- blkg_stat_merge(&to->group_wait_time, &from->group_wait_time);
- blkg_stat_merge(&to->idle_time, &from->idle_time);
- blkg_stat_merge(&to->empty_time, &from->empty_time);
+ blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);
+ blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+ blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
+ blkg_stat_add_aux(&to->dequeue, &from->dequeue);
+ blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
+ blkg_stat_add_aux(&to->idle_time, &from->idle_time);
+ blkg_stat_add_aux(&to->empty_time, &from->empty_time);
#endif
}
/*
- * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors'
+ * Transfer @cfqg's stats to its parent's aux counts so that the ancestors'
* recursive stats can still account for the amount used by this cfqg after
* it's gone.
*/
@@ -770,10 +751,8 @@ static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
if (unlikely(!parent))
return;
- cfqg_stats_merge(&parent->dead_stats, &cfqg->stats);
- cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats);
+ cfqg_stats_add_aux(&parent->stats, &cfqg->stats);
cfqg_stats_reset(&cfqg->stats);
- cfqg_stats_reset(&cfqg->dead_stats);
}
#else /* CONFIG_CFQ_GROUP_IOSCHED */
@@ -795,8 +774,6 @@ static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
unsigned long time, unsigned long unaccounted_time) { }
static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { }
static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { }
-static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
- uint64_t bytes, int rw) { }
static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
uint64_t start_time, uint64_t io_start_time, int rw) { }
@@ -883,8 +860,7 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
static void cfq_dispatch_insert(struct request_queue *, struct request *);
static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
- struct cfq_io_cq *cic, struct bio *bio,
- gfp_t gfp_mask);
+ struct cfq_io_cq *cic, struct bio *bio);
static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
{
@@ -1546,130 +1522,171 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg)
}
#ifdef CONFIG_CFQ_GROUP_IOSCHED
-static void cfqg_stats_init(struct cfqg_stats *stats)
+static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
+ bool on_dfl, bool reset_dev, bool is_leaf_weight);
+
+static void cfqg_stats_exit(struct cfqg_stats *stats)
{
- blkg_rwstat_init(&stats->service_bytes);
- blkg_rwstat_init(&stats->serviced);
- blkg_rwstat_init(&stats->merged);
- blkg_rwstat_init(&stats->service_time);
- blkg_rwstat_init(&stats->wait_time);
- blkg_rwstat_init(&stats->queued);
+ blkg_rwstat_exit(&stats->merged);
+ blkg_rwstat_exit(&stats->service_time);
+ blkg_rwstat_exit(&stats->wait_time);
+ blkg_rwstat_exit(&stats->queued);
+ blkg_stat_exit(&stats->time);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ blkg_stat_exit(&stats->unaccounted_time);
+ blkg_stat_exit(&stats->avg_queue_size_sum);
+ blkg_stat_exit(&stats->avg_queue_size_samples);
+ blkg_stat_exit(&stats->dequeue);
+ blkg_stat_exit(&stats->group_wait_time);
+ blkg_stat_exit(&stats->idle_time);
+ blkg_stat_exit(&stats->empty_time);
+#endif
+}
- blkg_stat_init(&stats->sectors);
- blkg_stat_init(&stats->time);
+static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp)
+{
+ if (blkg_rwstat_init(&stats->merged, gfp) ||
+ blkg_rwstat_init(&stats->service_time, gfp) ||
+ blkg_rwstat_init(&stats->wait_time, gfp) ||
+ blkg_rwstat_init(&stats->queued, gfp) ||
+ blkg_stat_init(&stats->time, gfp))
+ goto err;
#ifdef CONFIG_DEBUG_BLK_CGROUP
- blkg_stat_init(&stats->unaccounted_time);
- blkg_stat_init(&stats->avg_queue_size_sum);
- blkg_stat_init(&stats->avg_queue_size_samples);
- blkg_stat_init(&stats->dequeue);
- blkg_stat_init(&stats->group_wait_time);
- blkg_stat_init(&stats->idle_time);
- blkg_stat_init(&stats->empty_time);
+ if (blkg_stat_init(&stats->unaccounted_time, gfp) ||
+ blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
+ blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
+ blkg_stat_init(&stats->dequeue, gfp) ||
+ blkg_stat_init(&stats->group_wait_time, gfp) ||
+ blkg_stat_init(&stats->idle_time, gfp) ||
+ blkg_stat_init(&stats->empty_time, gfp))
+ goto err;
#endif
+ return 0;
+err:
+ cfqg_stats_exit(stats);
+ return -ENOMEM;
}
-static void cfq_cpd_init(const struct blkcg *blkcg)
+static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp)
{
- struct cfq_group_data *cgd =
- cpd_to_cfqgd(blkcg->pd[blkcg_policy_cfq.plid]);
+ struct cfq_group_data *cgd;
- if (blkcg == &blkcg_root) {
- cgd->weight = 2 * CFQ_WEIGHT_DEFAULT;
- cgd->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
- } else {
- cgd->weight = CFQ_WEIGHT_DEFAULT;
- cgd->leaf_weight = CFQ_WEIGHT_DEFAULT;
- }
+ cgd = kzalloc(sizeof(*cgd), GFP_KERNEL);
+ if (!cgd)
+ return NULL;
+ return &cgd->cpd;
+}
+
+static void cfq_cpd_init(struct blkcg_policy_data *cpd)
+{
+ struct cfq_group_data *cgd = cpd_to_cfqgd(cpd);
+ unsigned int weight = cgroup_on_dfl(blkcg_root.css.cgroup) ?
+ CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
+
+ if (cpd_to_blkcg(cpd) == &blkcg_root)
+ weight *= 2;
+
+ cgd->weight = weight;
+ cgd->leaf_weight = weight;
}
-static void cfq_pd_init(struct blkcg_gq *blkg)
+static void cfq_cpd_free(struct blkcg_policy_data *cpd)
{
- struct cfq_group *cfqg = blkg_to_cfqg(blkg);
- struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg);
+ kfree(cpd_to_cfqgd(cpd));
+}
+
+static void cfq_cpd_bind(struct blkcg_policy_data *cpd)
+{
+ struct blkcg *blkcg = cpd_to_blkcg(cpd);
+ bool on_dfl = cgroup_on_dfl(blkcg_root.css.cgroup);
+ unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
+
+ if (blkcg == &blkcg_root)
+ weight *= 2;
+
+ WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false));
+ WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true));
+}
+
+static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node)
+{
+ struct cfq_group *cfqg;
+
+ cfqg = kzalloc_node(sizeof(*cfqg), gfp, node);
+ if (!cfqg)
+ return NULL;
cfq_init_cfqg_base(cfqg);
+ if (cfqg_stats_init(&cfqg->stats, gfp)) {
+ kfree(cfqg);
+ return NULL;
+ }
+
+ return &cfqg->pd;
+}
+
+static void cfq_pd_init(struct blkg_policy_data *pd)
+{
+ struct cfq_group *cfqg = pd_to_cfqg(pd);
+ struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg);
+
cfqg->weight = cgd->weight;
cfqg->leaf_weight = cgd->leaf_weight;
- cfqg_stats_init(&cfqg->stats);
- cfqg_stats_init(&cfqg->dead_stats);
}
-static void cfq_pd_offline(struct blkcg_gq *blkg)
+static void cfq_pd_offline(struct blkg_policy_data *pd)
{
+ struct cfq_group *cfqg = pd_to_cfqg(pd);
+ int i;
+
+ for (i = 0; i < IOPRIO_BE_NR; i++) {
+ if (cfqg->async_cfqq[0][i])
+ cfq_put_queue(cfqg->async_cfqq[0][i]);
+ if (cfqg->async_cfqq[1][i])
+ cfq_put_queue(cfqg->async_cfqq[1][i]);
+ }
+
+ if (cfqg->async_idle_cfqq)
+ cfq_put_queue(cfqg->async_idle_cfqq);
+
/*
* @blkg is going offline and will be ignored by
* blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
* that they don't get lost. If IOs complete after this point, the
* stats for them will be lost. Oh well...
*/
- cfqg_stats_xfer_dead(blkg_to_cfqg(blkg));
+ cfqg_stats_xfer_dead(cfqg);
}
-/* offset delta from cfqg->stats to cfqg->dead_stats */
-static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) -
- offsetof(struct cfq_group, stats);
-
-/* to be used by recursive prfill, sums live and dead stats recursively */
-static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)
+static void cfq_pd_free(struct blkg_policy_data *pd)
{
- u64 sum = 0;
-
- sum += blkg_stat_recursive_sum(pd, off);
- sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta);
- return sum;
-}
-
-/* to be used by recursive prfill, sums live and dead rwstats recursively */
-static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd,
- int off)
-{
- struct blkg_rwstat a, b;
+ struct cfq_group *cfqg = pd_to_cfqg(pd);
- a = blkg_rwstat_recursive_sum(pd, off);
- b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta);
- blkg_rwstat_merge(&a, &b);
- return a;
+ cfqg_stats_exit(&cfqg->stats);
+ return kfree(cfqg);
}
-static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
+static void cfq_pd_reset_stats(struct blkg_policy_data *pd)
{
- struct cfq_group *cfqg = blkg_to_cfqg(blkg);
+ struct cfq_group *cfqg = pd_to_cfqg(pd);
cfqg_stats_reset(&cfqg->stats);
- cfqg_stats_reset(&cfqg->dead_stats);
}
-/*
- * Search for the cfq group current task belongs to. request_queue lock must
- * be held.
- */
-static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
- struct blkcg *blkcg)
+static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
+ struct blkcg *blkcg)
{
- struct request_queue *q = cfqd->queue;
- struct cfq_group *cfqg = NULL;
-
- /* avoid lookup for the common case where there's no blkcg */
- if (blkcg == &blkcg_root) {
- cfqg = cfqd->root_group;
- } else {
- struct blkcg_gq *blkg;
-
- blkg = blkg_lookup_create(blkcg, q);
- if (!IS_ERR(blkg))
- cfqg = blkg_to_cfqg(blkg);
- }
+ struct blkcg_gq *blkg;
- return cfqg;
+ blkg = blkg_lookup(blkcg, cfqd->queue);
+ if (likely(blkg))
+ return blkg_to_cfqg(blkg);
+ return NULL;
}
static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
{
- /* Currently, all async queues are mapped to root group */
- if (!cfq_cfqq_sync(cfqq))
- cfqg = cfqq->cfqd->root_group;
-
cfqq->cfqg = cfqg;
/* cfqq reference on cfqg */
cfqg_get(cfqg);
@@ -1739,36 +1756,48 @@ static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off,
- bool is_leaf_weight)
+ bool on_dfl, bool is_leaf_weight)
{
+ unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
+ unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
struct blkcg *blkcg = css_to_blkcg(of_css(of));
struct blkg_conf_ctx ctx;
struct cfq_group *cfqg;
struct cfq_group_data *cfqgd;
int ret;
+ u64 v;
ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
if (ret)
return ret;
- ret = -EINVAL;
+ if (sscanf(ctx.body, "%llu", &v) == 1) {
+ /* require "default" on dfl */
+ ret = -ERANGE;
+ if (!v && on_dfl)
+ goto out_finish;
+ } else if (!strcmp(strim(ctx.body), "default")) {
+ v = 0;
+ } else {
+ ret = -EINVAL;
+ goto out_finish;
+ }
+
cfqg = blkg_to_cfqg(ctx.blkg);
cfqgd = blkcg_to_cfqgd(blkcg);
- if (!cfqg || !cfqgd)
- goto err;
- if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
+ ret = -ERANGE;
+ if (!v || (v >= min && v <= max)) {
if (!is_leaf_weight) {
- cfqg->dev_weight = ctx.v;
- cfqg->new_weight = ctx.v ?: cfqgd->weight;
+ cfqg->dev_weight = v;
+ cfqg->new_weight = v ?: cfqgd->weight;
} else {
- cfqg->dev_leaf_weight = ctx.v;
- cfqg->new_leaf_weight = ctx.v ?: cfqgd->leaf_weight;
+ cfqg->dev_leaf_weight = v;
+ cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight;
}
ret = 0;
}
-
-err:
+out_finish:
blkg_conf_finish(&ctx);
return ret ?: nbytes;
}
@@ -1776,25 +1805,27 @@ err:
static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
- return __cfqg_set_weight_device(of, buf, nbytes, off, false);
+ return __cfqg_set_weight_device(of, buf, nbytes, off, false, false);
}
static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
- return __cfqg_set_weight_device(of, buf, nbytes, off, true);
+ return __cfqg_set_weight_device(of, buf, nbytes, off, false, true);
}
-static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
- u64 val, bool is_leaf_weight)
+static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
+ bool on_dfl, bool reset_dev, bool is_leaf_weight)
{
+ unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
+ unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
struct blkcg *blkcg = css_to_blkcg(css);
struct blkcg_gq *blkg;
struct cfq_group_data *cfqgd;
int ret = 0;
- if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
- return -EINVAL;
+ if (val < min || val > max)
+ return -ERANGE;
spin_lock_irq(&blkcg->lock);
cfqgd = blkcg_to_cfqgd(blkcg);
@@ -1815,9 +1846,13 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
continue;
if (!is_leaf_weight) {
+ if (reset_dev)
+ cfqg->dev_weight = 0;
if (!cfqg->dev_weight)
cfqg->new_weight = cfqgd->weight;
} else {
+ if (reset_dev)
+ cfqg->dev_leaf_weight = 0;
if (!cfqg->dev_leaf_weight)
cfqg->new_leaf_weight = cfqgd->leaf_weight;
}
@@ -1831,13 +1866,13 @@ out:
static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
u64 val)
{
- return __cfq_set_weight(css, cft, val, false);
+ return __cfq_set_weight(css, val, false, false, false);
}
static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
- return __cfq_set_weight(css, cft, val, true);
+ return __cfq_set_weight(css, val, false, false, true);
}
static int cfqg_print_stat(struct seq_file *sf, void *v)
@@ -1857,16 +1892,16 @@ static int cfqg_print_rwstat(struct seq_file *sf, void *v)
static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
struct blkg_policy_data *pd, int off)
{
- u64 sum = cfqg_stat_pd_recursive_sum(pd, off);
-
+ u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
+ &blkcg_policy_cfq, off);
return __blkg_prfill_u64(sf, pd, sum);
}
static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
struct blkg_policy_data *pd, int off)
{
- struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off);
-
+ struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
+ &blkcg_policy_cfq, off);
return __blkg_prfill_rwstat(sf, pd, &sum);
}
@@ -1886,6 +1921,40 @@ static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
return 0;
}
+static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
+ int off)
+{
+ u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
+
+ return __blkg_prfill_u64(sf, pd, sum >> 9);
+}
+
+static int cfqg_print_stat_sectors(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false);
+ return 0;
+}
+
+static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
+{
+ struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
+ offsetof(struct blkcg_gq, stat_bytes));
+ u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
+ atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+ return __blkg_prfill_u64(sf, pd, sum >> 9);
+}
+
+static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0,
+ false);
+ return 0;
+}
+
#ifdef CONFIG_DEBUG_BLK_CGROUP
static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
struct blkg_policy_data *pd, int off)
@@ -1912,7 +1981,7 @@ static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v)
}
#endif /* CONFIG_DEBUG_BLK_CGROUP */
-static struct cftype cfq_blkcg_files[] = {
+static struct cftype cfq_blkcg_legacy_files[] = {
/* on root, weight is mapped to leaf_weight */
{
.name = "weight_device",
@@ -1960,18 +2029,17 @@ static struct cftype cfq_blkcg_files[] = {
},
{
.name = "sectors",
- .private = offsetof(struct cfq_group, stats.sectors),
- .seq_show = cfqg_print_stat,
+ .seq_show = cfqg_print_stat_sectors,
},
{
.name = "io_service_bytes",
- .private = offsetof(struct cfq_group, stats.service_bytes),
- .seq_show = cfqg_print_rwstat,
+ .private = (unsigned long)&blkcg_policy_cfq,
+ .seq_show = blkg_print_stat_bytes,
},
{
.name = "io_serviced",
- .private = offsetof(struct cfq_group, stats.serviced),
- .seq_show = cfqg_print_rwstat,
+ .private = (unsigned long)&blkcg_policy_cfq,
+ .seq_show = blkg_print_stat_ios,
},
{
.name = "io_service_time",
@@ -2002,18 +2070,17 @@ static struct cftype cfq_blkcg_files[] = {
},
{
.name = "sectors_recursive",
- .private = offsetof(struct cfq_group, stats.sectors),
- .seq_show = cfqg_print_stat_recursive,
+ .seq_show = cfqg_print_stat_sectors_recursive,
},
{
.name = "io_service_bytes_recursive",
- .private = offsetof(struct cfq_group, stats.service_bytes),
- .seq_show = cfqg_print_rwstat_recursive,
+ .private = (unsigned long)&blkcg_policy_cfq,
+ .seq_show = blkg_print_stat_bytes_recursive,
},
{
.name = "io_serviced_recursive",
- .private = offsetof(struct cfq_group, stats.serviced),
- .seq_show = cfqg_print_rwstat_recursive,
+ .private = (unsigned long)&blkcg_policy_cfq,
+ .seq_show = blkg_print_stat_ios_recursive,
},
{
.name = "io_service_time_recursive",
@@ -2068,9 +2135,51 @@ static struct cftype cfq_blkcg_files[] = {
#endif /* CONFIG_DEBUG_BLK_CGROUP */
{ } /* terminate */
};
+
+static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v)
+{
+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+ struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
+
+ seq_printf(sf, "default %u\n", cgd->weight);
+ blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device,
+ &blkcg_policy_cfq, 0, false);
+ return 0;
+}
+
+static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ char *endp;
+ int ret;
+ u64 v;
+
+ buf = strim(buf);
+
+ /* "WEIGHT" or "default WEIGHT" sets the default weight */
+ v = simple_strtoull(buf, &endp, 0);
+ if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) {
+ ret = __cfq_set_weight(of_css(of), v, true, false, false);
+ return ret ?: nbytes;
+ }
+
+ /* "MAJ:MIN WEIGHT" */
+ return __cfqg_set_weight_device(of, buf, nbytes, off, true, false);
+}
+
+static struct cftype cfq_blkcg_files[] = {
+ {
+ .name = "weight",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cfq_print_weight_on_dfl,
+ .write = cfq_set_weight_on_dfl,
+ },
+ { } /* terminate */
+};
+
#else /* GROUP_IOSCHED */
-static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
- struct blkcg *blkcg)
+static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
+ struct blkcg *blkcg)
{
return cfqd->root_group;
}
@@ -2873,7 +2982,6 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
cfqq->nr_sectors += blk_rq_sectors(rq);
- cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags);
}
/*
@@ -3506,14 +3614,14 @@ static void cfq_exit_icq(struct io_cq *icq)
struct cfq_io_cq *cic = icq_to_cic(icq);
struct cfq_data *cfqd = cic_to_cfqd(cic);
- if (cic->cfqq[BLK_RW_ASYNC]) {
- cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
- cic->cfqq[BLK_RW_ASYNC] = NULL;
+ if (cic_to_cfqq(cic, false)) {
+ cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false));
+ cic_set_cfqq(cic, NULL, false);
}
- if (cic->cfqq[BLK_RW_SYNC]) {
- cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);
- cic->cfqq[BLK_RW_SYNC] = NULL;
+ if (cic_to_cfqq(cic, true)) {
+ cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true));
+ cic_set_cfqq(cic, NULL, true);
}
}
@@ -3572,18 +3680,14 @@ static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
return;
- cfqq = cic->cfqq[BLK_RW_ASYNC];
+ cfqq = cic_to_cfqq(cic, false);
if (cfqq) {
- struct cfq_queue *new_cfqq;
- new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio,
- GFP_ATOMIC);
- if (new_cfqq) {
- cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
- cfq_put_queue(cfqq);
- }
+ cfq_put_queue(cfqq);
+ cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio);
+ cic_set_cfqq(cic, cfqq, false);
}
- cfqq = cic->cfqq[BLK_RW_SYNC];
+ cfqq = cic_to_cfqq(cic, true);
if (cfqq)
cfq_mark_cfqq_prio_changed(cfqq);
@@ -3614,7 +3718,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
{
struct cfq_data *cfqd = cic_to_cfqd(cic);
- struct cfq_queue *sync_cfqq;
+ struct cfq_queue *cfqq;
uint64_t serial_nr;
rcu_read_lock();
@@ -3628,15 +3732,22 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr))
return;
- sync_cfqq = cic_to_cfqq(cic, 1);
- if (sync_cfqq) {
- /*
- * Drop reference to sync queue. A new sync queue will be
- * assigned in new group upon arrival of a fresh request.
- */
- cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
- cic_set_cfqq(cic, NULL, 1);
- cfq_put_queue(sync_cfqq);
+ /*
+ * Drop reference to queues. New queues will be assigned in new
+ * group upon arrival of fresh requests.
+ */
+ cfqq = cic_to_cfqq(cic, false);
+ if (cfqq) {
+ cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
+ cic_set_cfqq(cic, NULL, false);
+ cfq_put_queue(cfqq);
+ }
+
+ cfqq = cic_to_cfqq(cic, true);
+ if (cfqq) {
+ cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
+ cic_set_cfqq(cic, NULL, true);
+ cfq_put_queue(cfqq);
}
cic->blkcg_serial_nr = serial_nr;
@@ -3645,81 +3756,19 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { }
#endif /* CONFIG_CFQ_GROUP_IOSCHED */
-static struct cfq_queue *
-cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
- struct bio *bio, gfp_t gfp_mask)
-{
- struct blkcg *blkcg;
- struct cfq_queue *cfqq, *new_cfqq = NULL;
- struct cfq_group *cfqg;
-
-retry:
- rcu_read_lock();
-
- blkcg = bio_blkcg(bio);
- cfqg = cfq_lookup_create_cfqg(cfqd, blkcg);
- if (!cfqg) {
- cfqq = &cfqd->oom_cfqq;
- goto out;
- }
-
- cfqq = cic_to_cfqq(cic, is_sync);
-
- /*
- * Always try a new alloc if we fell back to the OOM cfqq
- * originally, since it should just be a temporary situation.
- */
- if (!cfqq || cfqq == &cfqd->oom_cfqq) {
- cfqq = NULL;
- if (new_cfqq) {
- cfqq = new_cfqq;
- new_cfqq = NULL;
- } else if (gfp_mask & __GFP_WAIT) {
- rcu_read_unlock();
- spin_unlock_irq(cfqd->queue->queue_lock);
- new_cfqq = kmem_cache_alloc_node(cfq_pool,
- gfp_mask | __GFP_ZERO,
- cfqd->queue->node);
- spin_lock_irq(cfqd->queue->queue_lock);
- if (new_cfqq)
- goto retry;
- else
- return &cfqd->oom_cfqq;
- } else {
- cfqq = kmem_cache_alloc_node(cfq_pool,
- gfp_mask | __GFP_ZERO,
- cfqd->queue->node);
- }
-
- if (cfqq) {
- cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
- cfq_init_prio_data(cfqq, cic);
- cfq_link_cfqq_cfqg(cfqq, cfqg);
- cfq_log_cfqq(cfqd, cfqq, "alloced");
- } else
- cfqq = &cfqd->oom_cfqq;
- }
-out:
- if (new_cfqq)
- kmem_cache_free(cfq_pool, new_cfqq);
-
- rcu_read_unlock();
- return cfqq;
-}
-
static struct cfq_queue **
-cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
+cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio)
{
switch (ioprio_class) {
case IOPRIO_CLASS_RT:
- return &cfqd->async_cfqq[0][ioprio];
+ return &cfqg->async_cfqq[0][ioprio];
case IOPRIO_CLASS_NONE:
ioprio = IOPRIO_NORM;
/* fall through */
case IOPRIO_CLASS_BE:
- return &cfqd->async_cfqq[1][ioprio];
+ return &cfqg->async_cfqq[1][ioprio];
case IOPRIO_CLASS_IDLE:
- return &cfqd->async_idle_cfqq;
+ return &cfqg->async_idle_cfqq;
default:
BUG();
}
@@ -3727,12 +3776,20 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
static struct cfq_queue *
cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
- struct bio *bio, gfp_t gfp_mask)
+ struct bio *bio)
{
int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
struct cfq_queue **async_cfqq = NULL;
- struct cfq_queue *cfqq = NULL;
+ struct cfq_queue *cfqq;
+ struct cfq_group *cfqg;
+
+ rcu_read_lock();
+ cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio));
+ if (!cfqg) {
+ cfqq = &cfqd->oom_cfqq;
+ goto out;
+ }
if (!is_sync) {
if (!ioprio_valid(cic->ioprio)) {
@@ -3740,22 +3797,32 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
ioprio = task_nice_ioprio(tsk);
ioprio_class = task_nice_ioclass(tsk);
}
- async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
+ async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio);
cfqq = *async_cfqq;
+ if (cfqq)
+ goto out;
}
- if (!cfqq)
- cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask);
+ cfqq = kmem_cache_alloc_node(cfq_pool, GFP_NOWAIT | __GFP_ZERO,
+ cfqd->queue->node);
+ if (!cfqq) {
+ cfqq = &cfqd->oom_cfqq;
+ goto out;
+ }
- /*
- * pin the queue now that it's allocated, scheduler exit will prune it
- */
- if (!is_sync && !(*async_cfqq)) {
+ cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
+ cfq_init_prio_data(cfqq, cic);
+ cfq_link_cfqq_cfqg(cfqq, cfqg);
+ cfq_log_cfqq(cfqd, cfqq, "alloced");
+
+ if (async_cfqq) {
+ /* a new async queue is created, pin and remember */
cfqq->ref++;
*async_cfqq = cfqq;
}
-
+out:
cfqq->ref++;
+ rcu_read_unlock();
return cfqq;
}
@@ -4289,8 +4356,6 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
const bool is_sync = rq_is_sync(rq);
struct cfq_queue *cfqq;
- might_sleep_if(gfp_mask & __GFP_WAIT);
-
spin_lock_irq(q->queue_lock);
check_ioprio_changed(cic, bio);
@@ -4298,7 +4363,9 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
new_queue:
cfqq = cic_to_cfqq(cic, is_sync);
if (!cfqq || cfqq == &cfqd->oom_cfqq) {
- cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask);
+ if (cfqq)
+ cfq_put_queue(cfqq);
+ cfqq = cfq_get_queue(cfqd, is_sync, cic, bio);
cic_set_cfqq(cic, cfqq, is_sync);
} else {
/*
@@ -4404,21 +4471,6 @@ static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
cancel_work_sync(&cfqd->unplug_work);
}
-static void cfq_put_async_queues(struct cfq_data *cfqd)
-{
- int i;
-
- for (i = 0; i < IOPRIO_BE_NR; i++) {
- if (cfqd->async_cfqq[0][i])
- cfq_put_queue(cfqd->async_cfqq[0][i]);
- if (cfqd->async_cfqq[1][i])
- cfq_put_queue(cfqd->async_cfqq[1][i]);
- }
-
- if (cfqd->async_idle_cfqq)
- cfq_put_queue(cfqd->async_idle_cfqq);
-}
-
static void cfq_exit_queue(struct elevator_queue *e)
{
struct cfq_data *cfqd = e->elevator_data;
@@ -4431,8 +4483,6 @@ static void cfq_exit_queue(struct elevator_queue *e)
if (cfqd->active_queue)
__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
- cfq_put_async_queues(cfqd);
-
spin_unlock_irq(q->queue_lock);
cfq_shutdown_timer_wq(cfqd);
@@ -4486,9 +4536,9 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
goto out_free;
cfq_init_cfqg_base(cfqd->root_group);
+ cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
+ cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
#endif
- cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;
- cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
/*
* Not strictly needed (since RB_ROOT just clears the node and we
@@ -4499,7 +4549,7 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
cfqd->prio_trees[i] = RB_ROOT;
/*
- * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
+ * Our fallback cfqq if cfq_get_queue() runs into OOM issues.
* Grab a permanent reference to it, so that the normal code flow
* will not attempt to free it. oom_cfqq is linked to root_group
* but shouldn't hold a reference as it'll never be unlinked. Lose
@@ -4683,13 +4733,18 @@ static struct elevator_type iosched_cfq = {
#ifdef CONFIG_CFQ_GROUP_IOSCHED
static struct blkcg_policy blkcg_policy_cfq = {
- .pd_size = sizeof(struct cfq_group),
- .cpd_size = sizeof(struct cfq_group_data),
- .cftypes = cfq_blkcg_files,
+ .dfl_cftypes = cfq_blkcg_files,
+ .legacy_cftypes = cfq_blkcg_legacy_files,
+ .cpd_alloc_fn = cfq_cpd_alloc,
.cpd_init_fn = cfq_cpd_init,
+ .cpd_free_fn = cfq_cpd_free,
+ .cpd_bind_fn = cfq_cpd_bind,
+
+ .pd_alloc_fn = cfq_pd_alloc,
.pd_init_fn = cfq_pd_init,
.pd_offline_fn = cfq_pd_offline,
+ .pd_free_fn = cfq_pd_free,
.pd_reset_stats_fn = cfq_pd_reset_stats,
};
#endif
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ae0f438c2ee6..24489126f8ca 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -53,8 +53,6 @@ struct wb_writeback_work {
unsigned int for_background:1;
unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
unsigned int auto_free:1; /* free on completion */
- unsigned int single_wait:1;
- unsigned int single_done:1;
enum wb_reason reason; /* why was writeback initiated? */
struct list_head list; /* pending work list */
@@ -178,14 +176,11 @@ static void wb_wakeup(struct bdi_writeback *wb)
static void wb_queue_work(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
- trace_writeback_queue(wb->bdi, work);
+ trace_writeback_queue(wb, work);
spin_lock_bh(&wb->work_lock);
- if (!test_bit(WB_registered, &wb->state)) {
- if (work->single_wait)
- work->single_done = 1;
+ if (!test_bit(WB_registered, &wb->state))
goto out_unlock;
- }
if (work->done)
atomic_inc(&work->done->cnt);
list_add_tail(&work->list, &wb->work_list);
@@ -706,7 +701,7 @@ EXPORT_SYMBOL_GPL(wbc_account_io);
/**
* inode_congested - test whether an inode is congested
- * @inode: inode to test for congestion
+ * @inode: inode to test for congestion (may be NULL)
* @cong_bits: mask of WB_[a]sync_congested bits to test
*
* Tests whether @inode is congested. @cong_bits is the mask of congestion
@@ -716,6 +711,9 @@ EXPORT_SYMBOL_GPL(wbc_account_io);
* determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
* associated with @inode is congested; otherwise, the root wb's congestion
* state is used.
+ *
+ * @inode is allowed to be NULL as this function is often called on
+ * mapping->host which is NULL for the swapper space.
*/
int inode_congested(struct inode *inode, int cong_bits)
{
@@ -738,32 +736,6 @@ int inode_congested(struct inode *inode, int cong_bits)
EXPORT_SYMBOL_GPL(inode_congested);
/**
- * wb_wait_for_single_work - wait for completion of a single bdi_writeback_work
- * @bdi: bdi the work item was issued to
- * @work: work item to wait for
- *
- * Wait for the completion of @work which was issued to one of @bdi's
- * bdi_writeback's. The caller must have set @work->single_wait before
- * issuing it. This wait operates independently fo
- * wb_wait_for_completion() and also disables automatic freeing of @work.
- */
-static void wb_wait_for_single_work(struct backing_dev_info *bdi,
- struct wb_writeback_work *work)
-{
- if (WARN_ON_ONCE(!work->single_wait))
- return;
-
- wait_event(bdi->wb_waitq, work->single_done);
-
- /*
- * Paired with smp_wmb() in wb_do_writeback() and ensures that all
- * modifications to @work prior to assertion of ->single_done is
- * visible to the caller once this function returns.
- */
- smp_rmb();
-}
-
-/**
* wb_split_bdi_pages - split nr_pages to write according to bandwidth
* @wb: target bdi_writeback to split @nr_pages to
* @nr_pages: number of pages to write for the whole bdi
@@ -792,38 +764,6 @@ static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
}
/**
- * wb_clone_and_queue_work - clone a wb_writeback_work and issue it to a wb
- * @wb: target bdi_writeback
- * @base_work: source wb_writeback_work
- *
- * Try to make a clone of @base_work and issue it to @wb. If cloning
- * succeeds, %true is returned; otherwise, @base_work is issued directly
- * and %false is returned. In the latter case, the caller is required to
- * wait for @base_work's completion using wb_wait_for_single_work().
- *
- * A clone is auto-freed on completion. @base_work never is.
- */
-static bool wb_clone_and_queue_work(struct bdi_writeback *wb,
- struct wb_writeback_work *base_work)
-{
- struct wb_writeback_work *work;
-
- work = kmalloc(sizeof(*work), GFP_ATOMIC);
- if (work) {
- *work = *base_work;
- work->auto_free = 1;
- work->single_wait = 0;
- } else {
- work = base_work;
- work->auto_free = 0;
- work->single_wait = 1;
- }
- work->single_done = 0;
- wb_queue_work(wb, work);
- return work != base_work;
-}
-
-/**
* bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
* @bdi: target backing_dev_info
* @base_work: wb_writeback_work to issue
@@ -838,15 +778,19 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
struct wb_writeback_work *base_work,
bool skip_if_busy)
{
- long nr_pages = base_work->nr_pages;
- int next_blkcg_id = 0;
+ int next_memcg_id = 0;
struct bdi_writeback *wb;
struct wb_iter iter;
might_sleep();
restart:
rcu_read_lock();
- bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) {
+ bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) {
+ DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
+ struct wb_writeback_work fallback_work;
+ struct wb_writeback_work *work;
+ long nr_pages;
+
/* SYNC_ALL writes out I_DIRTY_TIME too */
if (!wb_has_dirty_io(wb) &&
(base_work->sync_mode == WB_SYNC_NONE ||
@@ -855,13 +799,30 @@ restart:
if (skip_if_busy && writeback_in_progress(wb))
continue;
- base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages);
- if (!wb_clone_and_queue_work(wb, base_work)) {
- next_blkcg_id = wb->blkcg_css->id + 1;
- rcu_read_unlock();
- wb_wait_for_single_work(bdi, base_work);
- goto restart;
+ nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
+
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (work) {
+ *work = *base_work;
+ work->nr_pages = nr_pages;
+ work->auto_free = 1;
+ wb_queue_work(wb, work);
+ continue;
}
+
+ /* alloc failed, execute synchronously using on-stack fallback */
+ work = &fallback_work;
+ *work = *base_work;
+ work->nr_pages = nr_pages;
+ work->auto_free = 0;
+ work->done = &fallback_work_done;
+
+ wb_queue_work(wb, work);
+
+ next_memcg_id = wb->memcg_css->id + 1;
+ rcu_read_unlock();
+ wb_wait_for_completion(bdi, &fallback_work_done);
+ goto restart;
}
rcu_read_unlock();
}
@@ -902,8 +863,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
base_work->auto_free = 0;
- base_work->single_wait = 0;
- base_work->single_done = 0;
wb_queue_work(&bdi->wb, base_work);
}
}
@@ -924,7 +883,7 @@ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
*/
work = kzalloc(sizeof(*work), GFP_ATOMIC);
if (!work) {
- trace_writeback_nowork(wb->bdi);
+ trace_writeback_nowork(wb);
wb_wakeup(wb);
return;
}
@@ -954,7 +913,7 @@ void wb_start_background_writeback(struct bdi_writeback *wb)
* We just wake up the flusher thread. It will perform background
* writeback as soon as there is no other work to do.
*/
- trace_writeback_wake_background(wb->bdi);
+ trace_writeback_wake_background(wb);
wb_wakeup(wb);
}
@@ -1660,14 +1619,14 @@ static long wb_writeback(struct bdi_writeback *wb,
} else if (work->for_background)
oldest_jif = jiffies;
- trace_writeback_start(wb->bdi, work);
+ trace_writeback_start(wb, work);
if (list_empty(&wb->b_io))
queue_io(wb, work);
if (work->sb)
progress = writeback_sb_inodes(work->sb, wb, work);
else
progress = __writeback_inodes_wb(wb, work);
- trace_writeback_written(wb->bdi, work);
+ trace_writeback_written(wb, work);
wb_update_bandwidth(wb, wb_start);
@@ -1692,7 +1651,7 @@ static long wb_writeback(struct bdi_writeback *wb,
* we'll just busyloop.
*/
if (!list_empty(&wb->b_more_io)) {
- trace_writeback_wait(wb->bdi, work);
+ trace_writeback_wait(wb, work);
inode = wb_inode(wb->b_more_io.prev);
spin_lock(&inode->i_lock);
spin_unlock(&wb->list_lock);
@@ -1797,26 +1756,14 @@ static long wb_do_writeback(struct bdi_writeback *wb)
set_bit(WB_writeback_running, &wb->state);
while ((work = get_next_work_item(wb)) != NULL) {
struct wb_completion *done = work->done;
- bool need_wake_up = false;
- trace_writeback_exec(wb->bdi, work);
+ trace_writeback_exec(wb, work);
wrote += wb_writeback(wb, work);
- if (work->single_wait) {
- WARN_ON_ONCE(work->auto_free);
- /* paired w/ rmb in wb_wait_for_single_work() */
- smp_wmb();
- work->single_done = 1;
- need_wake_up = true;
- } else if (work->auto_free) {
+ if (work->auto_free)
kfree(work);
- }
-
if (done && atomic_dec_and_test(&done->cnt))
- need_wake_up = true;
-
- if (need_wake_up)
wake_up_all(&wb->bdi->wb_waitq);
}
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 2d48d28e1640..91e004518237 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -92,6 +92,29 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
}
/**
+ * kernfs_path_len - determine the length of the full path of a given node
+ * @kn: kernfs_node of interest
+ *
+ * The returned length doesn't include the space for the terminating '\0'.
+ */
+size_t kernfs_path_len(struct kernfs_node *kn)
+{
+ size_t len = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&kernfs_rename_lock, flags);
+
+ do {
+ len += strlen(kn->name) + 1;
+ kn = kn->parent;
+ } while (kn && kn->parent);
+
+ spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+
+ return len;
+}
+
+/**
* kernfs_path - build full path of a given node
* @kn: kernfs_node of interest
* @buf: buffer to copy @kn's name into
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 0fe9df983ab7..5a5d79ee256f 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -286,7 +286,7 @@ static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi
* %current's blkcg equals the effective blkcg of its memcg. No
* need to use the relatively expensive cgroup_get_e_css().
*/
- if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id)))
+ if (likely(wb && wb->blkcg_css == task_css(current, io_cgrp_id)))
return wb;
return NULL;
}
@@ -402,7 +402,7 @@ static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
}
struct wb_iter {
- int start_blkcg_id;
+ int start_memcg_id;
struct radix_tree_iter tree_iter;
void **slot;
};
@@ -414,9 +414,9 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
WARN_ON_ONCE(!rcu_read_lock_held());
- if (iter->start_blkcg_id >= 0) {
- iter->slot = radix_tree_iter_init(titer, iter->start_blkcg_id);
- iter->start_blkcg_id = -1;
+ if (iter->start_memcg_id >= 0) {
+ iter->slot = radix_tree_iter_init(titer, iter->start_memcg_id);
+ iter->start_memcg_id = -1;
} else {
iter->slot = radix_tree_next_slot(iter->slot, titer, 0);
}
@@ -430,30 +430,30 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter,
struct backing_dev_info *bdi,
- int start_blkcg_id)
+ int start_memcg_id)
{
- iter->start_blkcg_id = start_blkcg_id;
+ iter->start_memcg_id = start_memcg_id;
- if (start_blkcg_id)
+ if (start_memcg_id)
return __wb_iter_next(iter, bdi);
else
return &bdi->wb;
}
/**
- * bdi_for_each_wb - walk all wb's of a bdi in ascending blkcg ID order
+ * bdi_for_each_wb - walk all wb's of a bdi in ascending memcg ID order
* @wb_cur: cursor struct bdi_writeback pointer
* @bdi: bdi to walk wb's of
* @iter: pointer to struct wb_iter to be used as iteration buffer
- * @start_blkcg_id: blkcg ID to start iteration from
+ * @start_memcg_id: memcg ID to start iteration from
*
* Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending
- * blkcg ID order starting from @start_blkcg_id. @iter is struct wb_iter
+ * memcg ID order starting from @start_memcg_id. @iter is struct wb_iter
* to be used as temp storage during iteration. rcu_read_lock() must be
* held throughout iteration.
*/
-#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id) \
- for ((wb_cur) = __wb_iter_init(iter, bdi, start_blkcg_id); \
+#define bdi_for_each_wb(wb_cur, bdi, iter, start_memcg_id) \
+ for ((wb_cur) = __wb_iter_init(iter, bdi, start_memcg_id); \
(wb_cur); (wb_cur) = __wb_iter_next(iter, bdi))
#else /* CONFIG_CGROUP_WRITEBACK */
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index a4cd1641e9e2..0a5cc7a1109b 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -14,12 +14,15 @@
*/
#include <linux/cgroup.h>
-#include <linux/u64_stats_sync.h>
+#include <linux/percpu_counter.h>
#include <linux/seq_file.h>
#include <linux/radix-tree.h>
#include <linux/blkdev.h>
#include <linux/atomic.h>
+/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
+#define BLKG_STAT_CPU_BATCH (INT_MAX / 2)
+
/* Max limits for throttle policy */
#define THROTL_IOPS_MAX UINT_MAX
@@ -45,7 +48,7 @@ struct blkcg {
struct blkcg_gq *blkg_hint;
struct hlist_head blkg_list;
- struct blkcg_policy_data *pd[BLKCG_MAX_POLS];
+ struct blkcg_policy_data *cpd[BLKCG_MAX_POLS];
struct list_head all_blkcgs_node;
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -53,14 +56,19 @@ struct blkcg {
#endif
};
+/*
+ * blkg_[rw]stat->aux_cnt is excluded for local stats but included for
+ * recursive. Used to carry stats of dead children, and, for blkg_rwstat,
+ * to carry result values from read and sum operations.
+ */
struct blkg_stat {
- struct u64_stats_sync syncp;
- uint64_t cnt;
+ struct percpu_counter cpu_cnt;
+ atomic64_t aux_cnt;
};
struct blkg_rwstat {
- struct u64_stats_sync syncp;
- uint64_t cnt[BLKG_RWSTAT_NR];
+ struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR];
+ atomic64_t aux_cnt[BLKG_RWSTAT_NR];
};
/*
@@ -68,32 +76,28 @@ struct blkg_rwstat {
* request_queue (q). This is used by blkcg policies which need to track
* information per blkcg - q pair.
*
- * There can be multiple active blkcg policies and each has its private
- * data on each blkg, the size of which is determined by
- * blkcg_policy->pd_size. blkcg core allocates and frees such areas
- * together with blkg and invokes pd_init/exit_fn() methods.
- *
- * Such private data must embed struct blkg_policy_data (pd) at the
- * beginning and pd_size can't be smaller than pd.
+ * There can be multiple active blkcg policies and each blkg:policy pair is
+ * represented by a blkg_policy_data which is allocated and freed by each
+ * policy's pd_alloc/free_fn() methods. A policy can allocate private data
+ * area by allocating larger data structure which embeds blkg_policy_data
+ * at the beginning.
*/
struct blkg_policy_data {
/* the blkg and policy id this per-policy data belongs to */
struct blkcg_gq *blkg;
int plid;
-
- /* used during policy activation */
- struct list_head alloc_node;
};
/*
- * Policies that need to keep per-blkcg data which is independent
- * from any request_queue associated to it must specify its size
- * with the cpd_size field of the blkcg_policy structure and
- * embed a blkcg_policy_data in it. cpd_init() is invoked to let
- * each policy handle per-blkcg data.
+ * Policies that need to keep per-blkcg data which is independent from any
+ * request_queue associated to it should implement cpd_alloc/free_fn()
+ * methods. A policy can allocate private data area by allocating larger
+ * data structure which embeds blkcg_policy_data at the beginning.
+ * cpd_init() is invoked to let each policy handle per-blkcg data.
*/
struct blkcg_policy_data {
- /* the policy id this per-policy data belongs to */
+ /* the blkcg and policy id this per-policy data belongs to */
+ struct blkcg *blkcg;
int plid;
};
@@ -123,40 +127,50 @@ struct blkcg_gq {
/* is this blkg online? protected by both blkcg and q locks */
bool online;
+ struct blkg_rwstat stat_bytes;
+ struct blkg_rwstat stat_ios;
+
struct blkg_policy_data *pd[BLKCG_MAX_POLS];
struct rcu_head rcu_head;
};
-typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg);
-typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
+typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
+typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node);
+typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
struct blkcg_policy {
int plid;
- /* policy specific private data size */
- size_t pd_size;
- /* policy specific per-blkcg data size */
- size_t cpd_size;
/* cgroup files for the policy */
- struct cftype *cftypes;
+ struct cftype *dfl_cftypes;
+ struct cftype *legacy_cftypes;
/* operations */
+ blkcg_pol_alloc_cpd_fn *cpd_alloc_fn;
blkcg_pol_init_cpd_fn *cpd_init_fn;
+ blkcg_pol_free_cpd_fn *cpd_free_fn;
+ blkcg_pol_bind_cpd_fn *cpd_bind_fn;
+
+ blkcg_pol_alloc_pd_fn *pd_alloc_fn;
blkcg_pol_init_pd_fn *pd_init_fn;
blkcg_pol_online_pd_fn *pd_online_fn;
blkcg_pol_offline_pd_fn *pd_offline_fn;
- blkcg_pol_exit_pd_fn *pd_exit_fn;
+ blkcg_pol_free_pd_fn *pd_free_fn;
blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn;
};
extern struct blkcg blkcg_root;
extern struct cgroup_subsys_state * const blkcg_root_css;
-struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
+struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
+ struct request_queue *q, bool update_hint);
struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
struct request_queue *q);
int blkcg_init_queue(struct request_queue *q);
@@ -171,6 +185,7 @@ int blkcg_activate_policy(struct request_queue *q,
void blkcg_deactivate_policy(struct request_queue *q,
const struct blkcg_policy *pol);
+const char *blkg_dev_name(struct blkcg_gq *blkg);
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
u64 (*prfill)(struct seq_file *,
struct blkg_policy_data *, int),
@@ -182,19 +197,24 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
int off);
+int blkg_print_stat_bytes(struct seq_file *sf, void *v);
+int blkg_print_stat_ios(struct seq_file *sf, void *v);
+int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v);
+int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v);
-u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off);
-struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
- int off);
+u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
+ struct blkcg_policy *pol, int off);
+struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
+ struct blkcg_policy *pol, int off);
struct blkg_conf_ctx {
struct gendisk *disk;
struct blkcg_gq *blkg;
- u64 v;
+ char *body;
};
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
- const char *input, struct blkg_conf_ctx *ctx);
+ char *input, struct blkg_conf_ctx *ctx);
void blkg_conf_finish(struct blkg_conf_ctx *ctx);
@@ -205,7 +225,7 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
static inline struct blkcg *task_blkcg(struct task_struct *tsk)
{
- return css_to_blkcg(task_css(tsk, blkio_cgrp_id));
+ return css_to_blkcg(task_css(tsk, io_cgrp_id));
}
static inline struct blkcg *bio_blkcg(struct bio *bio)
@@ -218,7 +238,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
static inline struct cgroup_subsys_state *
task_get_blkcg_css(struct task_struct *task)
{
- return task_get_css(task, blkio_cgrp_id);
+ return task_get_css(task, io_cgrp_id);
}
/**
@@ -233,6 +253,52 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
}
/**
+ * __blkg_lookup - internal version of blkg_lookup()
+ * @blkcg: blkcg of interest
+ * @q: request_queue of interest
+ * @update_hint: whether to update lookup hint with the result or not
+ *
+ * This is internal version and shouldn't be used by policy
+ * implementations. Looks up blkgs for the @blkcg - @q pair regardless of
+ * @q's bypass state. If @update_hint is %true, the caller should be
+ * holding @q->queue_lock and lookup hint is updated on success.
+ */
+static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
+ struct request_queue *q,
+ bool update_hint)
+{
+ struct blkcg_gq *blkg;
+
+ if (blkcg == &blkcg_root)
+ return q->root_blkg;
+
+ blkg = rcu_dereference(blkcg->blkg_hint);
+ if (blkg && blkg->q == q)
+ return blkg;
+
+ return blkg_lookup_slowpath(blkcg, q, update_hint);
+}
+
+/**
+ * blkg_lookup - lookup blkg for the specified blkcg - q pair
+ * @blkcg: blkcg of interest
+ * @q: request_queue of interest
+ *
+ * Lookup blkg for the @blkcg - @q pair. This function should be called
+ * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
+ * - see blk_queue_bypass_start() for details.
+ */
+static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
+ struct request_queue *q)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ if (unlikely(blk_queue_bypass(q)))
+ return NULL;
+ return __blkg_lookup(blkcg, q, false);
+}
+
+/**
* blkg_to_pdata - get policy private data
* @blkg: blkg of interest
* @pol: policy of interest
@@ -248,7 +314,7 @@ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
struct blkcg_policy *pol)
{
- return blkcg ? blkcg->pd[pol->plid] : NULL;
+ return blkcg ? blkcg->cpd[pol->plid] : NULL;
}
/**
@@ -262,6 +328,11 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
return pd ? pd->blkg : NULL;
}
+static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd)
+{
+ return cpd ? cpd->blkcg : NULL;
+}
+
/**
* blkg_path - format cgroup path of blkg
* @blkg: blkg of interest
@@ -309,9 +380,6 @@ static inline void blkg_put(struct blkcg_gq *blkg)
call_rcu(&blkg->rcu_head, __blkg_release_rcu);
}
-struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
- bool update_hint);
-
/**
* blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
* @d_blkg: loop cursor pointing to the current descendant
@@ -373,8 +441,8 @@ static inline struct request_list *blk_get_rl(struct request_queue *q,
* or if either the blkcg or queue is going away. Fall back to
* root_rl in such cases.
*/
- blkg = blkg_lookup_create(blkcg, q);
- if (IS_ERR(blkg))
+ blkg = blkg_lookup(blkcg, q);
+ if (unlikely(!blkg))
goto root_rl;
blkg_get(blkg);
@@ -394,8 +462,7 @@ root_rl:
*/
static inline void blk_put_rl(struct request_list *rl)
{
- /* root_rl may not have blkg set */
- if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
+ if (rl->blkg->blkcg != &blkcg_root)
blkg_put(rl->blkg);
}
@@ -433,9 +500,21 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl,
#define blk_queue_for_each_rl(rl, q) \
for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
-static inline void blkg_stat_init(struct blkg_stat *stat)
+static inline int blkg_stat_init(struct blkg_stat *stat, gfp_t gfp)
{
- u64_stats_init(&stat->syncp);
+ int ret;
+
+ ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp);
+ if (ret)
+ return ret;
+
+ atomic64_set(&stat->aux_cnt, 0);
+ return 0;
+}
+
+static inline void blkg_stat_exit(struct blkg_stat *stat)
+{
+ percpu_counter_destroy(&stat->cpu_cnt);
}
/**
@@ -443,34 +522,21 @@ static inline void blkg_stat_init(struct blkg_stat *stat)
* @stat: target blkg_stat
* @val: value to add
*
- * Add @val to @stat. The caller is responsible for synchronizing calls to
- * this function.
+ * Add @val to @stat. The caller must ensure that IRQ on the same CPU
+ * don't re-enter this function for the same counter.
*/
static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
{
- u64_stats_update_begin(&stat->syncp);
- stat->cnt += val;
- u64_stats_update_end(&stat->syncp);
+ __percpu_counter_add(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH);
}
/**
* blkg_stat_read - read the current value of a blkg_stat
* @stat: blkg_stat to read
- *
- * Read the current value of @stat. This function can be called without
- * synchroniztion and takes care of u64 atomicity.
*/
static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
{
- unsigned int start;
- uint64_t v;
-
- do {
- start = u64_stats_fetch_begin_irq(&stat->syncp);
- v = stat->cnt;
- } while (u64_stats_fetch_retry_irq(&stat->syncp, start));
-
- return v;
+ return percpu_counter_sum_positive(&stat->cpu_cnt);
}
/**
@@ -479,24 +545,46 @@ static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
*/
static inline void blkg_stat_reset(struct blkg_stat *stat)
{
- stat->cnt = 0;
+ percpu_counter_set(&stat->cpu_cnt, 0);
+ atomic64_set(&stat->aux_cnt, 0);
}
/**
- * blkg_stat_merge - merge a blkg_stat into another
+ * blkg_stat_add_aux - add a blkg_stat into another's aux count
* @to: the destination blkg_stat
* @from: the source
*
- * Add @from's count to @to.
+ * Add @from's count including the aux one to @to's aux count.
*/
-static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from)
+static inline void blkg_stat_add_aux(struct blkg_stat *to,
+ struct blkg_stat *from)
{
- blkg_stat_add(to, blkg_stat_read(from));
+ atomic64_add(blkg_stat_read(from) + atomic64_read(&from->aux_cnt),
+ &to->aux_cnt);
}
-static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
+static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp)
{
- u64_stats_init(&rwstat->syncp);
+ int i, ret;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++) {
+ ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp);
+ if (ret) {
+ while (--i >= 0)
+ percpu_counter_destroy(&rwstat->cpu_cnt[i]);
+ return ret;
+ }
+ atomic64_set(&rwstat->aux_cnt[i], 0);
+ }
+ return 0;
+}
+
+static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat)
+{
+ int i;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++)
+ percpu_counter_destroy(&rwstat->cpu_cnt[i]);
}
/**
@@ -511,39 +599,38 @@ static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
int rw, uint64_t val)
{
- u64_stats_update_begin(&rwstat->syncp);
+ struct percpu_counter *cnt;
if (rw & REQ_WRITE)
- rwstat->cnt[BLKG_RWSTAT_WRITE] += val;
+ cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE];
else
- rwstat->cnt[BLKG_RWSTAT_READ] += val;
+ cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
+
+ __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
+
if (rw & REQ_SYNC)
- rwstat->cnt[BLKG_RWSTAT_SYNC] += val;
+ cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC];
else
- rwstat->cnt[BLKG_RWSTAT_ASYNC] += val;
+ cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC];
- u64_stats_update_end(&rwstat->syncp);
+ __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
}
/**
* blkg_rwstat_read - read the current values of a blkg_rwstat
* @rwstat: blkg_rwstat to read
*
- * Read the current snapshot of @rwstat and return it as the return value.
- * This function can be called without synchronization and takes care of
- * u64 atomicity.
+ * Read the current snapshot of @rwstat and return it in the aux counts.
*/
static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
{
- unsigned int start;
- struct blkg_rwstat tmp;
-
- do {
- start = u64_stats_fetch_begin_irq(&rwstat->syncp);
- tmp = *rwstat;
- } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start));
+ struct blkg_rwstat result;
+ int i;
- return tmp;
+ for (i = 0; i < BLKG_RWSTAT_NR; i++)
+ atomic64_set(&result.aux_cnt[i],
+ percpu_counter_sum_positive(&rwstat->cpu_cnt[i]));
+ return result;
}
/**
@@ -558,7 +645,8 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
{
struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
- return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
+ return atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
+ atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
}
/**
@@ -567,26 +655,71 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
*/
static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
{
- memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
+ int i;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++) {
+ percpu_counter_set(&rwstat->cpu_cnt[i], 0);
+ atomic64_set(&rwstat->aux_cnt[i], 0);
+ }
}
/**
- * blkg_rwstat_merge - merge a blkg_rwstat into another
+ * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count
* @to: the destination blkg_rwstat
* @from: the source
*
- * Add @from's counts to @to.
+ * Add @from's count including the aux one to @to's aux count.
*/
-static inline void blkg_rwstat_merge(struct blkg_rwstat *to,
- struct blkg_rwstat *from)
+static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to,
+ struct blkg_rwstat *from)
{
struct blkg_rwstat v = blkg_rwstat_read(from);
int i;
- u64_stats_update_begin(&to->syncp);
for (i = 0; i < BLKG_RWSTAT_NR; i++)
- to->cnt[i] += v.cnt[i];
- u64_stats_update_end(&to->syncp);
+ atomic64_add(atomic64_read(&v.aux_cnt[i]) +
+ atomic64_read(&from->aux_cnt[i]),
+ &to->aux_cnt[i]);
+}
+
+#ifdef CONFIG_BLK_DEV_THROTTLING
+extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
+ struct bio *bio);
+#else
+static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
+ struct bio *bio) { return false; }
+#endif
+
+static inline bool blkcg_bio_issue_check(struct request_queue *q,
+ struct bio *bio)
+{
+ struct blkcg *blkcg;
+ struct blkcg_gq *blkg;
+ bool throtl = false;
+
+ rcu_read_lock();
+ blkcg = bio_blkcg(bio);
+
+ blkg = blkg_lookup(blkcg, q);
+ if (unlikely(!blkg)) {
+ spin_lock_irq(q->queue_lock);
+ blkg = blkg_lookup_create(blkcg, q);
+ if (IS_ERR(blkg))
+ blkg = NULL;
+ spin_unlock_irq(q->queue_lock);
+ }
+
+ throtl = blk_throtl_bio(q, blkg, bio);
+
+ if (!throtl) {
+ blkg = blkg ?: q->root_blkg;
+ blkg_rwstat_add(&blkg->stat_bytes, bio->bi_flags,
+ bio->bi_iter.bi_size);
+ blkg_rwstat_add(&blkg->stat_ios, bio->bi_flags, 1);
+ }
+
+ rcu_read_unlock();
+ return !throtl;
}
#else /* CONFIG_BLK_CGROUP */
@@ -642,6 +775,9 @@ static inline void blk_put_rl(struct request_list *rl) { }
static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
+static inline bool blkcg_bio_issue_check(struct request_queue *q,
+ struct bio *bio) { return true; }
+
#define blk_queue_for_each_rl(rl, q) \
for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 1f36945fd23d..1a96fdaa33d5 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -27,7 +27,7 @@ SUBSYS(cpuacct)
#endif
#if IS_ENABLED(CONFIG_BLK_CGROUP)
-SUBSYS(blkio)
+SUBSYS(io)
#endif
#if IS_ENABLED(CONFIG_MEMCG)
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 123be25ea15a..5d4e9c4b821d 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -266,6 +266,7 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
}
int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen);
+size_t kernfs_path_len(struct kernfs_node *kn);
char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
size_t buflen);
void pr_cont_kernfs_name(struct kernfs_node *kn);
@@ -332,6 +333,9 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
{ return -ENOSYS; }
+static inline size_t kernfs_path_len(struct kernfs_node *kn)
+{ return 0; }
+
static inline char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
size_t buflen)
{ return NULL; }
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index a7aa607a4c55..fff846b512e6 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -131,6 +131,66 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode,
TP_ARGS(inode, flags)
);
+#ifdef CREATE_TRACE_POINTS
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
+{
+ return kernfs_path_len(wb->memcg_css->cgroup->kn) + 1;
+}
+
+static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
+{
+ struct cgroup *cgrp = wb->memcg_css->cgroup;
+ char *path;
+
+ path = cgroup_path(cgrp, buf, kernfs_path_len(cgrp->kn) + 1);
+ WARN_ON_ONCE(path != buf);
+}
+
+static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
+{
+ if (wbc->wb)
+ return __trace_wb_cgroup_size(wbc->wb);
+ else
+ return 2;
+}
+
+static inline void __trace_wbc_assign_cgroup(char *buf,
+ struct writeback_control *wbc)
+{
+ if (wbc->wb)
+ __trace_wb_assign_cgroup(buf, wbc->wb);
+ else
+ strcpy(buf, "/");
+}
+
+#else /* CONFIG_CGROUP_WRITEBACK */
+
+static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
+{
+ return 2;
+}
+
+static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
+{
+ strcpy(buf, "/");
+}
+
+static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
+{
+ return 2;
+}
+
+static inline void __trace_wbc_assign_cgroup(char *buf,
+ struct writeback_control *wbc)
+{
+ strcpy(buf, "/");
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+#endif /* CREATE_TRACE_POINTS */
+
DECLARE_EVENT_CLASS(writeback_write_inode_template,
TP_PROTO(struct inode *inode, struct writeback_control *wbc),
@@ -141,6 +201,7 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
__array(char, name, 32)
__field(unsigned long, ino)
__field(int, sync_mode)
+ __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
),
TP_fast_assign(
@@ -148,12 +209,14 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
dev_name(inode_to_bdi(inode)->dev), 32);
__entry->ino = inode->i_ino;
__entry->sync_mode = wbc->sync_mode;
+ __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
),
- TP_printk("bdi %s: ino=%lu sync_mode=%d",
+ TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup=%s",
__entry->name,
__entry->ino,
- __entry->sync_mode
+ __entry->sync_mode,
+ __get_str(cgroup)
)
);
@@ -172,8 +235,8 @@ DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode,
);
DECLARE_EVENT_CLASS(writeback_work_class,
- TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work),
- TP_ARGS(bdi, work),
+ TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work),
+ TP_ARGS(wb, work),
TP_STRUCT__entry(
__array(char, name, 32)
__field(long, nr_pages)
@@ -183,10 +246,11 @@ DECLARE_EVENT_CLASS(writeback_work_class,
__field(int, range_cyclic)
__field(int, for_background)
__field(int, reason)
+ __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
),
TP_fast_assign(
strncpy(__entry->name,
- bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32);
+ wb->bdi->dev ? dev_name(wb->bdi->dev) : "(unknown)", 32);
__entry->nr_pages = work->nr_pages;
__entry->sb_dev = work->sb ? work->sb->s_dev : 0;
__entry->sync_mode = work->sync_mode;
@@ -194,9 +258,10 @@ DECLARE_EVENT_CLASS(writeback_work_class,
__entry->range_cyclic = work->range_cyclic;
__entry->for_background = work->for_background;
__entry->reason = work->reason;
+ __trace_wb_assign_cgroup(__get_str(cgroup), wb);
),
TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d "
- "kupdate=%d range_cyclic=%d background=%d reason=%s",
+ "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup=%s",
__entry->name,
MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev),
__entry->nr_pages,
@@ -204,13 +269,14 @@ DECLARE_EVENT_CLASS(writeback_work_class,
__entry->for_kupdate,
__entry->range_cyclic,
__entry->for_background,
- __print_symbolic(__entry->reason, WB_WORK_REASON)
+ __print_symbolic(__entry->reason, WB_WORK_REASON),
+ __get_str(cgroup)
)
);
#define DEFINE_WRITEBACK_WORK_EVENT(name) \
DEFINE_EVENT(writeback_work_class, name, \
- TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \
- TP_ARGS(bdi, work))
+ TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \
+ TP_ARGS(wb, work))
DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
@@ -230,26 +296,42 @@ TRACE_EVENT(writeback_pages_written,
);
DECLARE_EVENT_CLASS(writeback_class,
- TP_PROTO(struct backing_dev_info *bdi),
- TP_ARGS(bdi),
+ TP_PROTO(struct bdi_writeback *wb),
+ TP_ARGS(wb),
TP_STRUCT__entry(
__array(char, name, 32)
+ __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
),
TP_fast_assign(
- strncpy(__entry->name, dev_name(bdi->dev), 32);
+ strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
+ __trace_wb_assign_cgroup(__get_str(cgroup), wb);
),
- TP_printk("bdi %s",
- __entry->name
+ TP_printk("bdi %s: cgroup=%s",
+ __entry->name,
+ __get_str(cgroup)
)
);
#define DEFINE_WRITEBACK_EVENT(name) \
DEFINE_EVENT(writeback_class, name, \
- TP_PROTO(struct backing_dev_info *bdi), \
- TP_ARGS(bdi))
+ TP_PROTO(struct bdi_writeback *wb), \
+ TP_ARGS(wb))
DEFINE_WRITEBACK_EVENT(writeback_nowork);
DEFINE_WRITEBACK_EVENT(writeback_wake_background);
-DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
+
+TRACE_EVENT(writeback_bdi_register,
+ TP_PROTO(struct backing_dev_info *bdi),
+ TP_ARGS(bdi),
+ TP_STRUCT__entry(
+ __array(char, name, 32)
+ ),
+ TP_fast_assign(
+ strncpy(__entry->name, dev_name(bdi->dev), 32);
+ ),
+ TP_printk("bdi %s",
+ __entry->name
+ )
+);
DECLARE_EVENT_CLASS(wbc_class,
TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
@@ -265,6 +347,7 @@ DECLARE_EVENT_CLASS(wbc_class,
__field(int, range_cyclic)
__field(long, range_start)
__field(long, range_end)
+ __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
),
TP_fast_assign(
@@ -278,11 +361,12 @@ DECLARE_EVENT_CLASS(wbc_class,
__entry->range_cyclic = wbc->range_cyclic;
__entry->range_start = (long)wbc->range_start;
__entry->range_end = (long)wbc->range_end;
+ __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
),
TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
"bgrd=%d reclm=%d cyclic=%d "
- "start=0x%lx end=0x%lx",
+ "start=0x%lx end=0x%lx cgroup=%s",
__entry->name,
__entry->nr_to_write,
__entry->pages_skipped,
@@ -292,7 +376,9 @@ DECLARE_EVENT_CLASS(wbc_class,
__entry->for_reclaim,
__entry->range_cyclic,
__entry->range_start,
- __entry->range_end)
+ __entry->range_end,
+ __get_str(cgroup)
+ )
)
#define DEFINE_WBC_EVENT(name) \
@@ -312,6 +398,7 @@ TRACE_EVENT(writeback_queue_io,
__field(long, age)
__field(int, moved)
__field(int, reason)
+ __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
),
TP_fast_assign(
unsigned long *older_than_this = work->older_than_this;
@@ -321,13 +408,15 @@ TRACE_EVENT(writeback_queue_io,
(jiffies - *older_than_this) * 1000 / HZ : -1;
__entry->moved = moved;
__entry->reason = work->reason;
+ __trace_wb_assign_cgroup(__get_str(cgroup), wb);
),
- TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s",
+ TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup=%s",
__entry->name,
__entry->older, /* older_than_this in jiffies */
__entry->age, /* older_than_this in relative milliseconds */
__entry->moved,
- __print_symbolic(__entry->reason, WB_WORK_REASON)
+ __print_symbolic(__entry->reason, WB_WORK_REASON),
+ __get_str(cgroup)
)
);
@@ -381,11 +470,11 @@ TRACE_EVENT(global_dirty_state,
TRACE_EVENT(bdi_dirty_ratelimit,
- TP_PROTO(struct backing_dev_info *bdi,
+ TP_PROTO(struct bdi_writeback *wb,
unsigned long dirty_rate,
unsigned long task_ratelimit),
- TP_ARGS(bdi, dirty_rate, task_ratelimit),
+ TP_ARGS(wb, dirty_rate, task_ratelimit),
TP_STRUCT__entry(
__array(char, bdi, 32)
@@ -395,36 +484,39 @@ TRACE_EVENT(bdi_dirty_ratelimit,
__field(unsigned long, dirty_ratelimit)
__field(unsigned long, task_ratelimit)
__field(unsigned long, balanced_dirty_ratelimit)
+ __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
),
TP_fast_assign(
- strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
- __entry->write_bw = KBps(bdi->wb.write_bandwidth);
- __entry->avg_write_bw = KBps(bdi->wb.avg_write_bandwidth);
+ strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32);
+ __entry->write_bw = KBps(wb->write_bandwidth);
+ __entry->avg_write_bw = KBps(wb->avg_write_bandwidth);
__entry->dirty_rate = KBps(dirty_rate);
- __entry->dirty_ratelimit = KBps(bdi->wb.dirty_ratelimit);
+ __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit);
__entry->task_ratelimit = KBps(task_ratelimit);
__entry->balanced_dirty_ratelimit =
- KBps(bdi->wb.balanced_dirty_ratelimit);
+ KBps(wb->balanced_dirty_ratelimit);
+ __trace_wb_assign_cgroup(__get_str(cgroup), wb);
),
TP_printk("bdi %s: "
"write_bw=%lu awrite_bw=%lu dirty_rate=%lu "
"dirty_ratelimit=%lu task_ratelimit=%lu "
- "balanced_dirty_ratelimit=%lu",
+ "balanced_dirty_ratelimit=%lu cgroup=%s",
__entry->bdi,
__entry->write_bw, /* write bandwidth */
__entry->avg_write_bw, /* avg write bandwidth */
__entry->dirty_rate, /* bdi dirty rate */
__entry->dirty_ratelimit, /* base ratelimit */
__entry->task_ratelimit, /* ratelimit with position control */
- __entry->balanced_dirty_ratelimit /* the balanced ratelimit */
+ __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */
+ __get_str(cgroup)
)
);
TRACE_EVENT(balance_dirty_pages,
- TP_PROTO(struct backing_dev_info *bdi,
+ TP_PROTO(struct bdi_writeback *wb,
unsigned long thresh,
unsigned long bg_thresh,
unsigned long dirty,
@@ -437,7 +529,7 @@ TRACE_EVENT(balance_dirty_pages,
long pause,
unsigned long start_time),
- TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
+ TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
dirty_ratelimit, task_ratelimit,
dirtied, period, pause, start_time),
@@ -456,11 +548,12 @@ TRACE_EVENT(balance_dirty_pages,
__field( long, pause)
__field(unsigned long, period)
__field( long, think)
+ __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
),
TP_fast_assign(
unsigned long freerun = (thresh + bg_thresh) / 2;
- strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
+ strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32);
__entry->limit = global_wb_domain.dirty_limit;
__entry->setpoint = (global_wb_domain.dirty_limit +
@@ -478,6 +571,7 @@ TRACE_EVENT(balance_dirty_pages,
__entry->period = period * 1000 / HZ;
__entry->pause = pause * 1000 / HZ;
__entry->paused = (jiffies - start_time) * 1000 / HZ;
+ __trace_wb_assign_cgroup(__get_str(cgroup), wb);
),
@@ -486,7 +580,7 @@ TRACE_EVENT(balance_dirty_pages,
"bdi_setpoint=%lu bdi_dirty=%lu "
"dirty_ratelimit=%lu task_ratelimit=%lu "
"dirtied=%u dirtied_pause=%u "
- "paused=%lu pause=%ld period=%lu think=%ld",
+ "paused=%lu pause=%ld period=%lu think=%ld cgroup=%s",
__entry->bdi,
__entry->limit,
__entry->setpoint,
@@ -500,7 +594,8 @@ TRACE_EVENT(balance_dirty_pages,
__entry->paused, /* ms */
__entry->pause, /* ms */
__entry->period, /* ms */
- __entry->think /* ms */
+ __entry->think, /* ms */
+ __get_str(cgroup)
)
);
@@ -514,6 +609,8 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
__field(unsigned long, ino)
__field(unsigned long, state)
__field(unsigned long, dirtied_when)
+ __dynamic_array(char, cgroup,
+ __trace_wb_cgroup_size(inode_to_wb(inode)))
),
TP_fast_assign(
@@ -522,14 +619,16 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
__entry->ino = inode->i_ino;
__entry->state = inode->i_state;
__entry->dirtied_when = inode->dirtied_when;
+ __trace_wb_assign_cgroup(__get_str(cgroup), inode_to_wb(inode));
),
- TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu",
+ TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup=%s",
__entry->name,
__entry->ino,
show_inode_state(__entry->state),
__entry->dirtied_when,
- (jiffies - __entry->dirtied_when) / HZ
+ (jiffies - __entry->dirtied_when) / HZ,
+ __get_str(cgroup)
)
);
@@ -585,6 +684,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
__field(unsigned long, writeback_index)
__field(long, nr_to_write)
__field(unsigned long, wrote)
+ __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
),
TP_fast_assign(
@@ -596,10 +696,11 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
__entry->writeback_index = inode->i_mapping->writeback_index;
__entry->nr_to_write = nr_to_write;
__entry->wrote = nr_to_write - wbc->nr_to_write;
+ __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
),
TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu "
- "index=%lu to_write=%ld wrote=%lu",
+ "index=%lu to_write=%ld wrote=%lu cgroup=%s",
__entry->name,
__entry->ino,
show_inode_state(__entry->state),
@@ -607,7 +708,8 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
(jiffies - __entry->dirtied_when) / HZ,
__entry->writeback_index,
__entry->nr_to_write,
- __entry->wrote
+ __entry->wrote,
+ __get_str(cgroup)
)
);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index ee8d7fd07be3..2df8ddcb0ca0 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -523,7 +523,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
int ret = 0;
memcg = mem_cgroup_from_css(memcg_css);
- blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &blkio_cgrp_subsys);
+ blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
blkcg = css_to_blkcg(blkcg_css);
memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
blkcg_cgwb_list = &blkcg->cgwb_list;
@@ -645,7 +645,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
/* see whether the blkcg association has changed */
blkcg_css = cgroup_get_e_css(memcg_css->cgroup,
- &blkio_cgrp_subsys);
+ &io_cgrp_subsys);
if (unlikely(wb->blkcg_css != blkcg_css ||
!wb_tryget(wb)))
wb = NULL;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5cccc127ef81..0a931cdd4f6b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1289,7 +1289,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
- trace_bdi_dirty_ratelimit(wb->bdi, dirty_rate, task_ratelimit);
+ trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
}
static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
@@ -1683,7 +1683,7 @@ static void balance_dirty_pages(struct address_space *mapping,
* do a reset, as it may be a light dirtier.
*/
if (pause < min_pause) {
- trace_balance_dirty_pages(bdi,
+ trace_balance_dirty_pages(wb,
sdtc->thresh,
sdtc->bg_thresh,
sdtc->dirty,
@@ -1712,7 +1712,7 @@ static void balance_dirty_pages(struct address_space *mapping,
}
pause:
- trace_balance_dirty_pages(bdi,
+ trace_balance_dirty_pages(wb,
sdtc->thresh,
sdtc->bg_thresh,
sdtc->dirty,