From 429b3fb027492b2b9f96e00bb1a9255fc56d4934 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:14:04 -0400 Subject: mpage: make __mpage_writepage() honor cgroup writeback __mpage_writepage() is used to implement mpage_writepages() which in turn is used for ->writepages() of various filesystems. All writeback logic is now updated to handle cgroup writeback and the block cgroup to issue IOs for is encoded in writeback_control and can be retrieved from the inode; however, __mpage_writepage() currently ignores the blkcg indicated by the inode and issues all bio's without explicit blkcg association. This patch updates __mpage_writepage() so that the issued bio's are associated with inode_to_writeback_blkcg_css(inode). v2: Updated for per-inode wb association. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Andrew Morton Cc: Alexander Viro Signed-off-by: Jens Axboe --- fs/mpage.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/mpage.c') diff --git a/fs/mpage.c b/fs/mpage.c index 3e79220babac..a3ccb0bd465a 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -605,6 +605,8 @@ alloc_new: bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH); if (bio == NULL) goto confused; + + bio_associate_blkcg(bio, inode_to_wb_blkcg_css(inode)); } /* -- cgit v1.2.3-59-g8ed1b From b16b1deb553adcd7b3b7ce3e6d6fd1b923f314da Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Jun 2015 08:39:48 -0600 Subject: writeback: make writeback_control track the inode being written back Currently, for cgroup writeback, the IO submission paths directly associate the bio's with the blkcg from inode_to_wb_blkcg_css(); however, it'd be necessary to keep more writeback context to implement foreign inode writeback detection. wbc (writeback_control) is the natural fit for the extra context - it persists throughout the writeback of each inode and is passed all the way down to IO submission paths. This patch adds wbc_attach_and_unlock_inode(), wbc_detach_inode(), and wbc_attach_fdatawrite_inode() which are used to associate wbc with the inode being written back. IO submission paths now use wbc_init_bio() instead of directly associating bio's with blkcg themselves. This leaves inode_to_wb_blkcg_css() w/o any user. The function is removed. wbc currently only tracks the associated wb (bdi_writeback). Future patches will add more for foreign inode detection. The association is established under i_lock which will be depended upon when migrating foreign inodes to other wb's. As currently, once established, inode to wb association never changes, going through wbc when initializing bio's doesn't cause any behavior changes. v2: submit_blk_blkcg() now checks whether the wbc is associated with a wb before dereferencing it. This can happen when pageout() is writing pages directly without going through the usual writeback path. As pageout() path is single-threaded, we don't want it to be blocked behind a slow cgroup and ultimately want it to delegate actual writing to the usual writeback path. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- fs/buffer.c | 25 ++++++++--------- fs/fs-writeback.c | 37 ++++++++++++++++++++++-- fs/mpage.c | 2 +- include/linux/backing-dev.h | 12 -------- include/linux/writeback.h | 68 +++++++++++++++++++++++++++++++++++++++++++++ mm/filemap.c | 2 ++ 6 files changed, 118 insertions(+), 28 deletions(-) (limited to 'fs/mpage.c') diff --git a/fs/buffer.c b/fs/buffer.c index b85e94134ea6..d883c799fb45 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -45,9 +45,9 @@ #include static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); -static int submit_bh_blkcg(int rw, struct buffer_head *bh, - unsigned long bio_flags, - struct cgroup_subsys_state *blkcg_css); +static int submit_bh_wbc(int rw, struct buffer_head *bh, + unsigned long bio_flags, + struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) @@ -1709,7 +1709,6 @@ static int __block_write_full_page(struct inode *inode, struct page *page, unsigned int blocksize, bbits; int nr_underway = 0; int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); - struct cgroup_subsys_state *blkcg_css = inode_to_wb_blkcg_css(inode); head = create_page_buffers(page, inode, (1 << BH_Dirty)|(1 << BH_Uptodate)); @@ -1798,7 +1797,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh_blkcg(write_op, bh, 0, blkcg_css); + submit_bh_wbc(write_op, bh, 0, wbc); nr_underway++; } bh = next; @@ -1852,7 +1851,7 @@ recover: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh_blkcg(write_op, bh, 0, blkcg_css); + submit_bh_wbc(write_op, bh, 0, wbc); nr_underway++; } bh = next; @@ -3017,11 +3016,11 @@ void guard_bio_eod(int rw, struct bio *bio) } } -static int submit_bh_blkcg(int rw, struct buffer_head *bh, - unsigned long bio_flags, - struct cgroup_subsys_state *blkcg_css) +static int submit_bh_wbc(int rw, struct buffer_head *bh, + unsigned long bio_flags, struct writeback_control *wbc) { struct bio *bio; + int ret = 0; BUG_ON(!buffer_locked(bh)); BUG_ON(!buffer_mapped(bh)); @@ -3041,8 +3040,8 @@ static int submit_bh_blkcg(int rw, struct buffer_head *bh, */ bio = bio_alloc(GFP_NOIO, 1); - if (blkcg_css) - bio_associate_blkcg(bio, blkcg_css); + if (wbc) + wbc_init_bio(wbc, bio); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; @@ -3071,13 +3070,13 @@ static int submit_bh_blkcg(int rw, struct buffer_head *bh, int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) { - return submit_bh_blkcg(rw, bh, bio_flags, NULL); + return submit_bh_wbc(rw, bh, bio_flags, NULL); } EXPORT_SYMBOL_GPL(_submit_bh); int submit_bh(int rw, struct buffer_head *bh) { - return submit_bh_blkcg(rw, bh, 0, NULL); + return submit_bh_wbc(rw, bh, 0, NULL); } EXPORT_SYMBOL(submit_bh); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index cf6ccfb01e03..755e8ef8d1f0 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -244,6 +244,37 @@ void __inode_attach_wb(struct inode *inode, struct page *page) wb_put(wb); } +/** + * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it + * @wbc: writeback_control of interest + * @inode: target inode + * + * @inode is locked and about to be written back under the control of @wbc. + * Record @inode's writeback context into @wbc and unlock the i_lock. On + * writeback completion, wbc_detach_inode() should be called. This is used + * to track the cgroup writeback context. + */ +void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) +{ + wbc->wb = inode_to_wb(inode); + wb_get(wbc->wb); + spin_unlock(&inode->i_lock); +} + +/** + * wbc_detach_inode - disassociate wbc from its target inode + * @wbc: writeback_control of interest + * + * To be called after a writeback attempt of an inode finishes and undoes + * wbc_attach_and_unlock_inode(). Can be called under any context. + */ +void wbc_detach_inode(struct writeback_control *wbc) +{ + wb_put(wbc->wb); + wbc->wb = NULL; +} + /** * inode_congested - test whether an inode is congested * @inode: inode to test for congestion @@ -877,10 +908,11 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) goto out; inode->i_state |= I_SYNC; - spin_unlock(&inode->i_lock); + wbc_attach_and_unlock_inode(wbc, inode); ret = __writeback_single_inode(inode, wbc); + wbc_detach_inode(wbc); spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); /* @@ -1013,7 +1045,7 @@ static long writeback_sb_inodes(struct super_block *sb, continue; } inode->i_state |= I_SYNC; - spin_unlock(&inode->i_lock); + wbc_attach_and_unlock_inode(&wbc, inode); write_chunk = writeback_chunk_size(wb, work); wbc.nr_to_write = write_chunk; @@ -1025,6 +1057,7 @@ static long writeback_sb_inodes(struct super_block *sb, */ __writeback_single_inode(inode, &wbc); + wbc_detach_inode(&wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; wrote += write_chunk - wbc.nr_to_write; spin_lock(&wb->list_lock); diff --git a/fs/mpage.c b/fs/mpage.c index a3ccb0bd465a..388fde6ac255 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -606,7 +606,7 @@ alloc_new: if (bio == NULL) goto confused; - bio_associate_blkcg(bio, inode_to_wb_blkcg_css(inode)); + wbc_init_bio(wbc, bio); } /* diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 5c978a924157..b1d2489a6536 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -332,12 +332,6 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) return inode->i_wb; } -static inline struct cgroup_subsys_state * -inode_to_wb_blkcg_css(struct inode *inode) -{ - return inode_to_wb(inode)->blkcg_css; -} - struct wb_iter { int start_blkcg_id; struct radix_tree_iter tree_iter; @@ -434,12 +428,6 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg) { } -static inline struct cgroup_subsys_state * -inode_to_wb_blkcg_css(struct inode *inode) -{ - return blkcg_root_css; -} - struct wb_iter { int next_id; }; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 6726b7e56beb..8f964e558af5 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -86,6 +86,9 @@ struct writeback_control { unsigned for_reclaim:1; /* Invoked from the page allocator */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ +#ifdef CONFIG_CGROUP_WRITEBACK + struct bdi_writeback *wb; /* wb this writeback is issued under */ +#endif }; /* @@ -176,7 +179,14 @@ static inline void wait_on_inode(struct inode *inode) #ifdef CONFIG_CGROUP_WRITEBACK +#include +#include + void __inode_attach_wb(struct inode *inode, struct page *page); +void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) + __releases(&inode->i_lock); +void wbc_detach_inode(struct writeback_control *wbc); /** * inode_attach_wb - associate an inode with its wb @@ -207,6 +217,44 @@ static inline void inode_detach_wb(struct inode *inode) } } +/** + * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite + * @wbc: writeback_control of interest + * @inode: target inode + * + * This function is to be used by __filemap_fdatawrite_range(), which is an + * alternative entry point into writeback code, and first ensures @inode is + * associated with a bdi_writeback and attaches it to @wbc. + */ +static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, + struct inode *inode) +{ + spin_lock(&inode->i_lock); + inode_attach_wb(inode, NULL); + wbc_attach_and_unlock_inode(wbc, inode); +} + +/** + * wbc_init_bio - writeback specific initializtion of bio + * @wbc: writeback_control for the writeback in progress + * @bio: bio to be initialized + * + * @bio is a part of the writeback in progress controlled by @wbc. Perform + * writeback specific initialization. This is used to apply the cgroup + * writeback context. + */ +static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) +{ + /* + * pageout() path doesn't attach @wbc to the inode being written + * out. This is intentional as we don't want the function to block + * behind a slow cgroup. Ultimately, we want pageout() to kick off + * regular writeback instead of writing things out itself. + */ + if (wbc->wb) + bio_associate_blkcg(bio, wbc->wb->blkcg_css); +} + #else /* CONFIG_CGROUP_WRITEBACK */ static inline void inode_attach_wb(struct inode *inode, struct page *page) @@ -217,6 +265,26 @@ static inline void inode_detach_wb(struct inode *inode) { } +static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) + __releases(&inode->i_lock) +{ + spin_unlock(&inode->i_lock); +} + +static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, + struct inode *inode) +{ +} + +static inline void wbc_detach_inode(struct writeback_control *wbc) +{ +} + +static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) +{ +} + #endif /* CONFIG_CGROUP_WRITEBACK */ /* diff --git a/mm/filemap.c b/mm/filemap.c index 7b1443dc3ad0..2f065b19b635 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -290,7 +290,9 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, if (!mapping_cap_writeback_dirty(mapping)) return 0; + wbc_attach_fdatawrite_inode(&wbc, mapping->host); ret = do_writepages(mapping, &wbc); + wbc_detach_inode(&wbc); return ret; } -- cgit v1.2.3-59-g8ed1b From 2a81490811d0296d390c571bb64eaa93e5ed7def Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 May 2015 14:50:51 -0400 Subject: writeback: implement foreign cgroup inode detection As concurrent write sharing of an inode is expected to be very rare and memcg only tracks page ownership on first-use basis severely confining the usefulness of such sharing, cgroup writeback tracks ownership per-inode. While the support for concurrent write sharing of an inode is deemed unnecessary, an inode being written to by different cgroups at different points in time is a lot more common, and, more importantly, charging only by first-use can too readily lead to grossly incorrect behaviors (single foreign page can lead to gigabytes of writeback to be incorrectly attributed). To resolve this issue, cgroup writeback detects the majority dirtier of an inode and will transfer the ownership to it. To avoid unnnecessary oscillation, the detection mechanism keeps track of history and gives out the switch verdict only if the foreign usage pattern is stable over a certain amount of time and/or writeback attempts. The detection mechanism has fairly low space and computation overhead. It adds 8 bytes to struct inode (one int and two u16's) and minimal amount of calculation per IO. The detection mechanism converges to the correct answer usually in several seconds of IO time when there's a clear majority dirtier. Even when there isn't, it can reach an acceptable answer fairly quickly under most circumstances. Please see wb_detach_inode() for more details. This patch only implements detection. Following patches will implement actual switching. v2: wbc_account_io() now checks whether the wbc is associated with a wb before dereferencing it. This can happen when pageout() is writing pages directly without going through the usual writeback path. As pageout() path is single-threaded, we don't want it to be blocked behind a slow cgroup and ultimately want it to delegate actual writing to the usual writeback path. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- fs/buffer.c | 4 +- fs/fs-writeback.c | 177 +++++++++++++++++++++++++++++++++++++++++++++- fs/mpage.c | 1 + include/linux/fs.h | 5 ++ include/linux/writeback.h | 16 +++++ 5 files changed, 200 insertions(+), 3 deletions(-) (limited to 'fs/mpage.c') diff --git a/fs/buffer.c b/fs/buffer.c index d883c799fb45..aca687f966d7 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3040,8 +3040,10 @@ static int submit_bh_wbc(int rw, struct buffer_head *bh, */ bio = bio_alloc(GFP_NOIO, 1); - if (wbc) + if (wbc) { wbc_init_bio(wbc, bio); + wbc_account_io(wbc, bh->b_page, bh->b_size); + } bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 755e8ef8d1f0..f98d40333c85 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -214,6 +214,20 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi, #ifdef CONFIG_CGROUP_WRITEBACK +/* parameters for foreign inode detection, see wb_detach_inode() */ +#define WB_FRN_TIME_SHIFT 13 /* 1s = 2^13, upto 8 secs w/ 16bit */ +#define WB_FRN_TIME_AVG_SHIFT 3 /* avg = avg * 7/8 + new * 1/8 */ +#define WB_FRN_TIME_CUT_DIV 2 /* ignore rounds < avg / 2 */ +#define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT)) /* 2s */ + +#define WB_FRN_HIST_SLOTS 16 /* inode->i_wb_frn_history is 16bit */ +#define WB_FRN_HIST_UNIT (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS) + /* each slot's duration is 2s / 16 */ +#define WB_FRN_HIST_THR_SLOTS (WB_FRN_HIST_SLOTS / 2) + /* if foreign slots >= 8, switch */ +#define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1) + /* one round can affect upto 5 slots */ + void __inode_attach_wb(struct inode *inode, struct page *page) { struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -258,23 +272,182 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, struct inode *inode) { wbc->wb = inode_to_wb(inode); + wbc->inode = inode; + + wbc->wb_id = wbc->wb->memcg_css->id; + wbc->wb_lcand_id = inode->i_wb_frn_winner; + wbc->wb_tcand_id = 0; + wbc->wb_bytes = 0; + wbc->wb_lcand_bytes = 0; + wbc->wb_tcand_bytes = 0; + wb_get(wbc->wb); spin_unlock(&inode->i_lock); } /** - * wbc_detach_inode - disassociate wbc from its target inode - * @wbc: writeback_control of interest + * wbc_detach_inode - disassociate wbc from inode and perform foreign detection + * @wbc: writeback_control of the just finished writeback * * To be called after a writeback attempt of an inode finishes and undoes * wbc_attach_and_unlock_inode(). Can be called under any context. + * + * As concurrent write sharing of an inode is expected to be very rare and + * memcg only tracks page ownership on first-use basis severely confining + * the usefulness of such sharing, cgroup writeback tracks ownership + * per-inode. While the support for concurrent write sharing of an inode + * is deemed unnecessary, an inode being written to by different cgroups at + * different points in time is a lot more common, and, more importantly, + * charging only by first-use can too readily lead to grossly incorrect + * behaviors (single foreign page can lead to gigabytes of writeback to be + * incorrectly attributed). + * + * To resolve this issue, cgroup writeback detects the majority dirtier of + * an inode and transfers the ownership to it. To avoid unnnecessary + * oscillation, the detection mechanism keeps track of history and gives + * out the switch verdict only if the foreign usage pattern is stable over + * a certain amount of time and/or writeback attempts. + * + * On each writeback attempt, @wbc tries to detect the majority writer + * using Boyer-Moore majority vote algorithm. In addition to the byte + * count from the majority voting, it also counts the bytes written for the + * current wb and the last round's winner wb (max of last round's current + * wb, the winner from two rounds ago, and the last round's majority + * candidate). Keeping track of the historical winner helps the algorithm + * to semi-reliably detect the most active writer even when it's not the + * absolute majority. + * + * Once the winner of the round is determined, whether the winner is + * foreign or not and how much IO time the round consumed is recorded in + * inode->i_wb_frn_history. If the amount of recorded foreign IO time is + * over a certain threshold, the switch verdict is given. */ void wbc_detach_inode(struct writeback_control *wbc) { + struct bdi_writeback *wb = wbc->wb; + struct inode *inode = wbc->inode; + u16 history = inode->i_wb_frn_history; + unsigned long avg_time = inode->i_wb_frn_avg_time; + unsigned long max_bytes, max_time; + int max_id; + + /* pick the winner of this round */ + if (wbc->wb_bytes >= wbc->wb_lcand_bytes && + wbc->wb_bytes >= wbc->wb_tcand_bytes) { + max_id = wbc->wb_id; + max_bytes = wbc->wb_bytes; + } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) { + max_id = wbc->wb_lcand_id; + max_bytes = wbc->wb_lcand_bytes; + } else { + max_id = wbc->wb_tcand_id; + max_bytes = wbc->wb_tcand_bytes; + } + + /* + * Calculate the amount of IO time the winner consumed and fold it + * into the running average kept per inode. If the consumed IO + * time is lower than avag / WB_FRN_TIME_CUT_DIV, ignore it for + * deciding whether to switch or not. This is to prevent one-off + * small dirtiers from skewing the verdict. + */ + max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT, + wb->avg_write_bandwidth); + if (avg_time) + avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) - + (avg_time >> WB_FRN_TIME_AVG_SHIFT); + else + avg_time = max_time; /* immediate catch up on first run */ + + if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) { + int slots; + + /* + * The switch verdict is reached if foreign wb's consume + * more than a certain proportion of IO time in a + * WB_FRN_TIME_PERIOD. This is loosely tracked by 16 slot + * history mask where each bit represents one sixteenth of + * the period. Determine the number of slots to shift into + * history from @max_time. + */ + slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT), + (unsigned long)WB_FRN_HIST_MAX_SLOTS); + history <<= slots; + if (wbc->wb_id != max_id) + history |= (1U << slots) - 1; + + /* + * Switch if the current wb isn't the consistent winner. + * If there are multiple closely competing dirtiers, the + * inode may switch across them repeatedly over time, which + * is okay. The main goal is avoiding keeping an inode on + * the wrong wb for an extended period of time. + */ + if (hweight32(history) > WB_FRN_HIST_THR_SLOTS) { + /* switch */ + max_id = 0; + avg_time = 0; + history = 0; + } + } + + /* + * Multiple instances of this function may race to update the + * following fields but we don't mind occassional inaccuracies. + */ + inode->i_wb_frn_winner = max_id; + inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX); + inode->i_wb_frn_history = history; + wb_put(wbc->wb); wbc->wb = NULL; } +/** + * wbc_account_io - account IO issued during writeback + * @wbc: writeback_control of the writeback in progress + * @page: page being written out + * @bytes: number of bytes being written out + * + * @bytes from @page are about to written out during the writeback + * controlled by @wbc. Keep the book for foreign inode detection. See + * wbc_detach_inode(). + */ +void wbc_account_io(struct writeback_control *wbc, struct page *page, + size_t bytes) +{ + int id; + + /* + * pageout() path doesn't attach @wbc to the inode being written + * out. This is intentional as we don't want the function to block + * behind a slow cgroup. Ultimately, we want pageout() to kick off + * regular writeback instead of writing things out itself. + */ + if (!wbc->wb) + return; + + rcu_read_lock(); + id = mem_cgroup_css_from_page(page)->id; + rcu_read_unlock(); + + if (id == wbc->wb_id) { + wbc->wb_bytes += bytes; + return; + } + + if (id == wbc->wb_lcand_id) + wbc->wb_lcand_bytes += bytes; + + /* Boyer-Moore majority vote algorithm */ + if (!wbc->wb_tcand_bytes) + wbc->wb_tcand_id = id; + if (id == wbc->wb_tcand_id) + wbc->wb_tcand_bytes += bytes; + else + wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes); +} + /** * inode_congested - test whether an inode is congested * @inode: inode to test for congestion diff --git a/fs/mpage.c b/fs/mpage.c index 388fde6ac255..ca0244b69de8 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -614,6 +614,7 @@ alloc_new: * the confused fail path above (OOM) will be very confused when * it finds all bh marked clean (i.e. it will not write anything) */ + wbc_account_io(wbc, page, PAGE_SIZE); length = first_unmapped << blkbits; if (bio_add_page(bio, page, length, 0) < length) { bio = mpage_bio_submit(WRITE, bio); diff --git a/include/linux/fs.h b/include/linux/fs.h index 67a42ec95065..740126d7c44e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -638,6 +638,11 @@ struct inode { struct list_head i_wb_list; /* backing dev IO list */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *i_wb; /* the associated cgroup wb */ + + /* foreign inode detection, see wbc_detach_inode() */ + int i_wb_frn_winner; + u16 i_wb_frn_avg_time; + u16 i_wb_frn_history; #endif struct list_head i_lru; /* inode LRU list */ struct list_head i_sb_list; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 8f964e558af5..b333c945e571 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -88,6 +88,15 @@ struct writeback_control { unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb; /* wb this writeback is issued under */ + struct inode *inode; /* inode being written out */ + + /* foreign inode detection, see wbc_detach_inode() */ + int wb_id; /* current wb id */ + int wb_lcand_id; /* last foreign candidate wb id */ + int wb_tcand_id; /* this foreign candidate wb id */ + size_t wb_bytes; /* bytes written by current wb */ + size_t wb_lcand_bytes; /* bytes written by last candidate */ + size_t wb_tcand_bytes; /* bytes written by this candidate */ #endif }; @@ -187,6 +196,8 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, struct inode *inode) __releases(&inode->i_lock); void wbc_detach_inode(struct writeback_control *wbc); +void wbc_account_io(struct writeback_control *wbc, struct page *page, + size_t bytes); /** * inode_attach_wb - associate an inode with its wb @@ -285,6 +296,11 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) { } +static inline void wbc_account_io(struct writeback_control *wbc, + struct page *page, size_t bytes) +{ +} + #endif /* CONFIG_CGROUP_WRITEBACK */ /* -- cgit v1.2.3-59-g8ed1b