From 1d316e658374f700fdfff9299c70ce65d8d145e6 Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Fri, 13 Oct 2017 16:35:36 -0700 Subject: bcache: implement PI controller for writeback rate bcache uses a control system to attempt to keep the amount of dirty data in cache at a user-configured level, while not responding excessively to transients and variations in write rate. Previously, the system was a PD controller; but the output from it was integrated, turning the Proportional term into an Integral term, and turning the Derivative term into a crude Proportional term. Performance of the controller has been uneven in production, and it has tended to respond slowly, oscillate, and overshoot. This patch set replaces the current control system with an explicit PI controller and tuning that should be correct for most hardware. By default, it attempts to write at a rate that would retire 1/40th of the current excess blocks per second. An integral term in turn works to remove steady state errors. IMO, this yields benefits in simplicity (removing weighted average filtering, etc) and system performance. Another small change is a tunable parameter is introduced to allow the user to specify a minimum rate at which dirty blocks are retired. There is a slight difference from earlier versions of the patch in integral handling to prevent excessive negative integral windup. Signed-off-by: Michael Lyle Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'drivers/md/bcache/bcache.h') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 2ed9bd231d84..eb83be693d60 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -265,9 +265,6 @@ struct bcache_device { atomic_t *stripe_sectors_dirty; unsigned long *full_dirty_stripes; - unsigned long sectors_dirty_last; - long sectors_dirty_derivative; - struct bio_set *bio_split; unsigned data_csum:1; @@ -362,12 +359,14 @@ struct cached_dev { uint64_t writeback_rate_target; int64_t writeback_rate_proportional; - int64_t writeback_rate_derivative; + int64_t writeback_rate_integral; + int64_t writeback_rate_integral_scaled; int64_t writeback_rate_change; unsigned writeback_rate_update_seconds; - unsigned writeback_rate_d_term; + unsigned writeback_rate_i_term_inverse; unsigned writeback_rate_p_term_inverse; + unsigned writeback_rate_minimum; }; enum alloc_reserve { -- cgit v1.2.3-59-g8ed1b From e41166c5c44e30dbd620f7c77a27efe5d5cc551a Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Fri, 13 Oct 2017 16:35:38 -0700 Subject: bcache: writeback rate shouldn't artifically clamp The previous code artificially limited writeback rate to 1000000 blocks/second (NSEC_PER_MSEC), which is a rate that can be met on fast hardware. The rate limiting code works fine (though with decreased precision) up to 3 orders of magnitude faster, so use NSEC_PER_SEC. Additionally, ensure that uint32_t is used as a type for rate throughout the rate management so that type checking/clamp_t can work properly. bch_next_delay should be rewritten for increased precision and better handling of high rates and long sleep periods, but this is adequate for now. Signed-off-by: Michael Lyle Reported-by: Coly Li Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/util.h | 4 ++-- drivers/md/bcache/writeback.c | 7 ++++--- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'drivers/md/bcache/bcache.h') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index eb83be693d60..d77c4829c497 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -361,7 +361,7 @@ struct cached_dev { int64_t writeback_rate_proportional; int64_t writeback_rate_integral; int64_t writeback_rate_integral_scaled; - int64_t writeback_rate_change; + int32_t writeback_rate_change; unsigned writeback_rate_update_seconds; unsigned writeback_rate_i_term_inverse; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index cb8d2ccbb6c6..8f509290bb02 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -441,10 +441,10 @@ struct bch_ratelimit { uint64_t next; /* - * Rate at which we want to do work, in units per nanosecond + * Rate at which we want to do work, in units per second * The units here correspond to the units passed to bch_next_delay() */ - unsigned rate; + uint32_t rate; }; static inline void bch_ratelimit_reset(struct bch_ratelimit *d) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 8deb721c355e..897d28050656 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -52,7 +52,8 @@ static void __update_writeback_rate(struct cached_dev *dc) int64_t error = dirty - target; int64_t proportional_scaled = div_s64(error, dc->writeback_rate_p_term_inverse); - int64_t integral_scaled, new_rate; + int64_t integral_scaled; + uint32_t new_rate; if ((error < 0 && dc->writeback_rate_integral > 0) || (error > 0 && time_before64(local_clock(), @@ -74,8 +75,8 @@ static void __update_writeback_rate(struct cached_dev *dc) integral_scaled = div_s64(dc->writeback_rate_integral, dc->writeback_rate_i_term_inverse); - new_rate = clamp_t(int64_t, (proportional_scaled + integral_scaled), - dc->writeback_rate_minimum, NSEC_PER_MSEC); + new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled), + dc->writeback_rate_minimum, NSEC_PER_SEC); dc->writeback_rate_proportional = proportional_scaled; dc->writeback_rate_integral_scaled = integral_scaled; -- cgit v1.2.3-59-g8ed1b From 3b304d24a718ae779ee9c7f2014dd3b2d0893b70 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Mon, 30 Oct 2017 14:46:32 -0700 Subject: bcache: convert cached_dev.count from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable cached_dev.count is used as pure reference counter. Convert it to refcount_t and fix up the operations. Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Reviewed-by: Michael Lyle Signed-off-by: Elena Reshetova Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 7 ++++--- drivers/md/bcache/super.c | 6 +++--- drivers/md/bcache/writeback.h | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'drivers/md/bcache/bcache.h') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d77c4829c497..363ea6256b39 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -184,6 +184,7 @@ #include #include #include +#include #include #include @@ -296,7 +297,7 @@ struct cached_dev { struct semaphore sb_write_mutex; /* Refcount on the cache set. Always nonzero when we're caching. */ - atomic_t count; + refcount_t count; struct work_struct detach; /* @@ -805,13 +806,13 @@ do { \ static inline void cached_dev_put(struct cached_dev *dc) { - if (atomic_dec_and_test(&dc->count)) + if (refcount_dec_and_test(&dc->count)) schedule_work(&dc->detach); } static inline bool cached_dev_get(struct cached_dev *dc) { - if (!atomic_inc_not_zero(&dc->count)) + if (!refcount_inc_not_zero(&dc->count)) return false; /* Paired with the mb in cached_dev_attach */ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 72c3b2929ef0..46134c45c6f6 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -902,7 +902,7 @@ static void cached_dev_detach_finish(struct work_struct *w) closure_init_stack(&cl); BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); - BUG_ON(atomic_read(&dc->count)); + BUG_ON(refcount_read(&dc->count)); mutex_lock(&bch_register_lock); @@ -1029,7 +1029,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) * dc->c must be set before dc->count != 0 - paired with the mb in * cached_dev_get() */ - atomic_set(&dc->count, 1); + refcount_set(&dc->count, 1); /* Block writeback thread, but spawn it */ down_write(&dc->writeback_lock); @@ -1041,7 +1041,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { bch_sectors_dirty_init(&dc->disk); atomic_set(&dc->has_dirty, 1); - atomic_inc(&dc->count); + refcount_inc(&dc->count); bch_writeback_queue(dc); } diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 34bcf49d737b..7d25bff37a9b 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -91,7 +91,7 @@ static inline void bch_writeback_add(struct cached_dev *dc) { if (!atomic_read(&dc->has_dirty) && !atomic_xchg(&dc->has_dirty, 1)) { - atomic_inc(&dc->count); + refcount_inc(&dc->count); if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); -- cgit v1.2.3-59-g8ed1b From d44c2f9e7cc0041f0cd88df1fe7a1fceb713ab14 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Mon, 30 Oct 2017 14:46:33 -0700 Subject: bcache: update bucket_in_use in real time bucket_in_use is updated in gc thread which triggered by invalidating or writing sectors_to_gc dirty data, It's a long interval. Therefore, when we use it to compare with the threshold, it is often not timely, which leads to inaccurate judgment and often results in bucket depletion. We have send a patch before, by the means of updating bucket_in_use periodically In gc thread, which Coly thought that would lead high latency, In this patch, we add avail_nbuckets to record the count of available buckets, and we calculate bucket_in_use when alloc or free bucket in real time. [edited by ML: eliminated some whitespace errors] Signed-off-by: Tang Junhui Signed-off-by: Michael Lyle Reviewed-by: Michael Lyle Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 10 ++++++++++ drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/btree.c | 17 ++++++++++------- drivers/md/bcache/btree.h | 2 +- 4 files changed, 22 insertions(+), 8 deletions(-) (limited to 'drivers/md/bcache/bcache.h') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 4c40870e99f5..8c5a626343d4 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -442,6 +442,11 @@ out: b->prio = INITIAL_PRIO; } + if (ca->set->avail_nbuckets > 0) { + ca->set->avail_nbuckets--; + bch_update_bucket_in_use(ca->set, &ca->set->gc_stats); + } + return r; } @@ -449,6 +454,11 @@ void __bch_bucket_free(struct cache *ca, struct bucket *b) { SET_GC_MARK(b, 0); SET_GC_SECTORS_USED(b, 0); + + if (ca->set->avail_nbuckets < ca->set->nbuckets) { + ca->set->avail_nbuckets++; + bch_update_bucket_in_use(ca->set, &ca->set->gc_stats); + } } void bch_bucket_free(struct cache_set *c, struct bkey *k) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 363ea6256b39..e274082330dc 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -581,6 +581,7 @@ struct cache_set { uint8_t need_gc; struct gc_stat gc_stats; size_t nbuckets; + size_t avail_nbuckets; struct task_struct *gc_thread; /* Where in the btree gc currently is */ diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 866dcf78ff8e..d8865e6ead37 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1240,6 +1240,11 @@ void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k) __bch_btree_mark_key(c, level, k); } +void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats) +{ + stats->in_use = (c->nbuckets - c->avail_nbuckets) * 100 / c->nbuckets; +} + static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) { uint8_t stale = 0; @@ -1651,9 +1656,8 @@ static void btree_gc_start(struct cache_set *c) mutex_unlock(&c->bucket_lock); } -static size_t bch_btree_gc_finish(struct cache_set *c) +static void bch_btree_gc_finish(struct cache_set *c) { - size_t available = 0; struct bucket *b; struct cache *ca; unsigned i; @@ -1690,6 +1694,7 @@ static size_t bch_btree_gc_finish(struct cache_set *c) } rcu_read_unlock(); + c->avail_nbuckets = 0; for_each_cache(ca, c, i) { uint64_t *i; @@ -1711,18 +1716,16 @@ static size_t bch_btree_gc_finish(struct cache_set *c) BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) - available++; + c->avail_nbuckets++; } } mutex_unlock(&c->bucket_lock); - return available; } static void bch_btree_gc(struct cache_set *c) { int ret; - unsigned long available; struct gc_stat stats; struct closure writes; struct btree_op op; @@ -1745,14 +1748,14 @@ static void bch_btree_gc(struct cache_set *c) pr_warn("gc failed!"); } while (ret); - available = bch_btree_gc_finish(c); + bch_btree_gc_finish(c); wake_up_allocators(c); bch_time_stats_update(&c->btree_gc_time, start_time); stats.key_bytes *= sizeof(uint64_t); stats.data <<= 9; - stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; + bch_update_bucket_in_use(c, &stats); memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); trace_bcache_gc_end(c); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 73da1f5626cb..4073aca09a49 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -305,5 +305,5 @@ void bch_keybuf_del(struct keybuf *, struct keybuf_key *); struct keybuf_key *bch_keybuf_next(struct keybuf *); struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *, struct bkey *, keybuf_pred_fn *); - +void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats); #endif -- cgit v1.2.3-59-g8ed1b