diff options
Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r-- | fs/fs-writeback.c | 820 |
1 files changed, 477 insertions, 343 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 76ac9c7d32ec..443f83382b9b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -42,7 +42,6 @@ struct wb_writeback_work { long nr_pages; struct super_block *sb; - unsigned long *older_than_this; enum writeback_sync_modes sync_mode; unsigned int tagged_writepages:1; unsigned int for_kupdate:1; @@ -121,6 +120,7 @@ static bool inode_io_list_move_locked(struct inode *inode, struct list_head *head) { assert_spin_locked(&wb->list_lock); + assert_spin_locked(&inode->i_lock); list_move(&inode->i_io_list, head); @@ -132,29 +132,12 @@ static bool inode_io_list_move_locked(struct inode *inode, return false; } -/** - * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list - * @inode: inode to be removed - * @wb: bdi_writeback @inode is being removed from - * - * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and - * clear %WB_has_dirty_io if all are empty afterwards. - */ -static void inode_io_list_del_locked(struct inode *inode, - struct bdi_writeback *wb) -{ - assert_spin_locked(&wb->list_lock); - - list_del_init(&inode->i_io_list); - wb_io_lists_depopulated(wb); -} - static void wb_wakeup(struct bdi_writeback *wb) { - spin_lock_bh(&wb->work_lock); + spin_lock_irq(&wb->work_lock); if (test_bit(WB_registered, &wb->state)) mod_delayed_work(bdi_wq, &wb->dwork, 0); - spin_unlock_bh(&wb->work_lock); + spin_unlock_irq(&wb->work_lock); } static void finish_writeback_work(struct bdi_writeback *wb, @@ -181,7 +164,7 @@ static void wb_queue_work(struct bdi_writeback *wb, if (work->done) atomic_inc(&work->done->cnt); - spin_lock_bh(&wb->work_lock); + spin_lock_irq(&wb->work_lock); if (test_bit(WB_registered, &wb->state)) { list_add_tail(&work->list, &wb->work_list); @@ -189,7 +172,7 @@ static void wb_queue_work(struct bdi_writeback *wb, } else finish_writeback_work(wb, work); - spin_unlock_bh(&wb->work_lock); + spin_unlock_irq(&wb->work_lock); } /** @@ -243,6 +226,13 @@ void wb_wait_for_completion(struct wb_completion *done) /* one round can affect upto 5 slots */ #define WB_FRN_MAX_IN_FLIGHT 1024 /* don't queue too many concurrently */ +/* + * Maximum inodes per isw. A specific value has been chosen to make + * struct inode_switch_wbs_context fit into 1024 bytes kmalloc. + */ +#define WB_MAX_INODES_PER_ISW ((1024UL - sizeof(struct inode_switch_wbs_context)) \ + / sizeof(struct inode *)) + static atomic_t isw_nr_in_flight = ATOMIC_INIT(0); static struct workqueue_struct *isw_wq; @@ -278,6 +268,28 @@ void __inode_attach_wb(struct inode *inode, struct page *page) EXPORT_SYMBOL_GPL(__inode_attach_wb); /** + * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list + * @inode: inode of interest with i_lock held + * @wb: target bdi_writeback + * + * Remove the inode from wb's io lists and if necessarily put onto b_attached + * list. Only inodes attached to cgwb's are kept on this list. + */ +static void inode_cgwb_move_to_attached(struct inode *inode, + struct bdi_writeback *wb) +{ + assert_spin_locked(&wb->list_lock); + assert_spin_locked(&inode->i_lock); + + inode->i_state &= ~I_SYNC_QUEUED; + if (wb != &wb->bdi->wb) + list_move(&inode->i_io_list, &wb->b_attached); + else + list_del_init(&inode->i_io_list); + wb_io_lists_depopulated(wb); +} + +/** * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it * @inode: inode of interest with i_lock held * @@ -331,11 +343,18 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) } struct inode_switch_wbs_context { - struct inode *inode; - struct bdi_writeback *new_wb; + struct rcu_work work; - struct rcu_head rcu_head; - struct work_struct work; + /* + * Multiple inodes can be switched at once. The switching procedure + * consists of two parts, separated by a RCU grace period. To make + * sure that the second part is executed for each inode gone through + * the first part, all inode pointers are placed into a NULL-terminated + * array embedded into struct inode_switch_wbs_context. Otherwise + * an inode could be left in a non-consistent state. + */ + struct bdi_writeback *new_wb; + struct inode *inodes[]; }; static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) @@ -348,91 +367,78 @@ static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) up_write(&bdi->wb_switch_rwsem); } -static void inode_switch_wbs_work_fn(struct work_struct *work) +static bool inode_do_switch_wbs(struct inode *inode, + struct bdi_writeback *old_wb, + struct bdi_writeback *new_wb) { - struct inode_switch_wbs_context *isw = - container_of(work, struct inode_switch_wbs_context, work); - struct inode *inode = isw->inode; - struct backing_dev_info *bdi = inode_to_bdi(inode); struct address_space *mapping = inode->i_mapping; - struct bdi_writeback *old_wb = inode->i_wb; - struct bdi_writeback *new_wb = isw->new_wb; XA_STATE(xas, &mapping->i_pages, 0); - struct page *page; + struct folio *folio; bool switched = false; - /* - * If @inode switches cgwb membership while sync_inodes_sb() is - * being issued, sync_inodes_sb() might miss it. Synchronize. - */ - down_read(&bdi->wb_switch_rwsem); - - /* - * By the time control reaches here, RCU grace period has passed - * since I_WB_SWITCH assertion and all wb stat update transactions - * between unlocked_inode_to_wb_begin/end() are guaranteed to be - * synchronizing against the i_pages lock. - * - * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock - * gives us exclusion against all wb related operations on @inode - * including IO list manipulations and stat updates. - */ - if (old_wb < new_wb) { - spin_lock(&old_wb->list_lock); - spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); - } else { - spin_lock(&new_wb->list_lock); - spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); - } spin_lock(&inode->i_lock); xa_lock_irq(&mapping->i_pages); /* - * Once I_FREEING is visible under i_lock, the eviction path owns - * the inode and we shouldn't modify ->i_io_list. + * Once I_FREEING or I_WILL_FREE are visible under i_lock, the eviction + * path owns the inode and we shouldn't modify ->i_io_list. */ - if (unlikely(inode->i_state & I_FREEING)) + if (unlikely(inode->i_state & (I_FREEING | I_WILL_FREE))) goto skip_switch; trace_inode_switch_wbs(inode, old_wb, new_wb); /* * Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points - * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to - * pages actually under writeback. + * to possibly dirty folios while PAGECACHE_TAG_WRITEBACK points to + * folios actually under writeback. */ - xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY) { - if (PageDirty(page)) { - dec_wb_stat(old_wb, WB_RECLAIMABLE); - inc_wb_stat(new_wb, WB_RECLAIMABLE); + xas_for_each_marked(&xas, folio, ULONG_MAX, PAGECACHE_TAG_DIRTY) { + if (folio_test_dirty(folio)) { + long nr = folio_nr_pages(folio); + wb_stat_mod(old_wb, WB_RECLAIMABLE, -nr); + wb_stat_mod(new_wb, WB_RECLAIMABLE, nr); } } xas_set(&xas, 0); - xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) { - WARN_ON_ONCE(!PageWriteback(page)); - dec_wb_stat(old_wb, WB_WRITEBACK); - inc_wb_stat(new_wb, WB_WRITEBACK); + xas_for_each_marked(&xas, folio, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) { + long nr = folio_nr_pages(folio); + WARN_ON_ONCE(!folio_test_writeback(folio)); + wb_stat_mod(old_wb, WB_WRITEBACK, -nr); + wb_stat_mod(new_wb, WB_WRITEBACK, nr); + } + + if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { + atomic_dec(&old_wb->writeback_inodes); + atomic_inc(&new_wb->writeback_inodes); } wb_get(new_wb); /* - * Transfer to @new_wb's IO list if necessary. The specific list - * @inode was on is ignored and the inode is put on ->b_dirty which - * is always correct including from ->b_dirty_time. The transfer - * preserves @inode->dirtied_when ordering. + * Transfer to @new_wb's IO list if necessary. If the @inode is dirty, + * the specific list @inode was on is ignored and the @inode is put on + * ->b_dirty which is always correct including from ->b_dirty_time. + * The transfer preserves @inode->dirtied_when ordering. If the @inode + * was clean, it means it was on the b_attached list, so move it onto + * the b_attached list of @new_wb. */ if (!list_empty(&inode->i_io_list)) { - struct inode *pos; - - inode_io_list_del_locked(inode, old_wb); inode->i_wb = new_wb; - list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) - if (time_after_eq(inode->dirtied_when, - pos->dirtied_when)) - break; - inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev); + + if (inode->i_state & I_DIRTY_ALL) { + struct inode *pos; + + list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) + if (time_after_eq(inode->dirtied_when, + pos->dirtied_when)) + break; + inode_io_list_move_locked(inode, new_wb, + pos->i_io_list.prev); + } else { + inode_cgwb_move_to_attached(inode, new_wb); + } } else { inode->i_wb = new_wb; } @@ -451,31 +457,94 @@ skip_switch: xa_unlock_irq(&mapping->i_pages); spin_unlock(&inode->i_lock); + + return switched; +} + +static void inode_switch_wbs_work_fn(struct work_struct *work) +{ + struct inode_switch_wbs_context *isw = + container_of(to_rcu_work(work), struct inode_switch_wbs_context, work); + struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]); + struct bdi_writeback *old_wb = isw->inodes[0]->i_wb; + struct bdi_writeback *new_wb = isw->new_wb; + unsigned long nr_switched = 0; + struct inode **inodep; + + /* + * If @inode switches cgwb membership while sync_inodes_sb() is + * being issued, sync_inodes_sb() might miss it. Synchronize. + */ + down_read(&bdi->wb_switch_rwsem); + + /* + * By the time control reaches here, RCU grace period has passed + * since I_WB_SWITCH assertion and all wb stat update transactions + * between unlocked_inode_to_wb_begin/end() are guaranteed to be + * synchronizing against the i_pages lock. + * + * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock + * gives us exclusion against all wb related operations on @inode + * including IO list manipulations and stat updates. + */ + if (old_wb < new_wb) { + spin_lock(&old_wb->list_lock); + spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock(&new_wb->list_lock); + spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); + } + + for (inodep = isw->inodes; *inodep; inodep++) { + WARN_ON_ONCE((*inodep)->i_wb != old_wb); + if (inode_do_switch_wbs(*inodep, old_wb, new_wb)) + nr_switched++; + } + spin_unlock(&new_wb->list_lock); spin_unlock(&old_wb->list_lock); up_read(&bdi->wb_switch_rwsem); - if (switched) { + if (nr_switched) { wb_wakeup(new_wb); - wb_put(old_wb); + wb_put_many(old_wb, nr_switched); } - wb_put(new_wb); - iput(inode); + for (inodep = isw->inodes; *inodep; inodep++) + iput(*inodep); + wb_put(new_wb); kfree(isw); - atomic_dec(&isw_nr_in_flight); } -static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head) +static bool inode_prepare_wbs_switch(struct inode *inode, + struct bdi_writeback *new_wb) { - struct inode_switch_wbs_context *isw = container_of(rcu_head, - struct inode_switch_wbs_context, rcu_head); + /* + * Paired with smp_mb() in cgroup_writeback_umount(). + * isw_nr_in_flight must be increased before checking SB_ACTIVE and + * grabbing an inode, otherwise isw_nr_in_flight can be observed as 0 + * in cgroup_writeback_umount() and the isw_wq will be not flushed. + */ + smp_mb(); + + if (IS_DAX(inode)) + return false; + + /* while holding I_WB_SWITCH, no one else can update the association */ + spin_lock(&inode->i_lock); + if (!(inode->i_sb->s_flags & SB_ACTIVE) || + inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) || + inode_to_wb(inode) == new_wb) { + spin_unlock(&inode->i_lock); + return false; + } + inode->i_state |= I_WB_SWITCH; + __iget(inode); + spin_unlock(&inode->i_lock); - /* needs to grab bh-unsafe locks, bounce to work item */ - INIT_WORK(&isw->work, inode_switch_wbs_work_fn); - queue_work(isw_wq, &isw->work); + return true; } /** @@ -500,32 +569,30 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT) return; - isw = kzalloc(sizeof(*isw), GFP_ATOMIC); + isw = kzalloc(struct_size(isw, inodes, 2), GFP_ATOMIC); if (!isw) return; + atomic_inc(&isw_nr_in_flight); + /* find and pin the new wb */ rcu_read_lock(); memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys); - if (memcg_css) - isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + if (memcg_css && !css_tryget(memcg_css)) + memcg_css = NULL; rcu_read_unlock(); + if (!memcg_css) + goto out_free; + + isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + css_put(memcg_css); if (!isw->new_wb) goto out_free; - /* while holding I_WB_SWITCH, no one else can update the association */ - spin_lock(&inode->i_lock); - if (!(inode->i_sb->s_flags & SB_ACTIVE) || - inode->i_state & (I_WB_SWITCH | I_FREEING) || - inode_to_wb(inode) == isw->new_wb) { - spin_unlock(&inode->i_lock); + if (!inode_prepare_wbs_switch(inode, isw->new_wb)) goto out_free; - } - inode->i_state |= I_WB_SWITCH; - __iget(inode); - spin_unlock(&inode->i_lock); - isw->inode = inode; + isw->inodes[0] = inode; /* * In addition to synchronizing among switchers, I_WB_SWITCH tells @@ -533,18 +600,85 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) * lock so that stat transfer can synchronize against them. * Let's continue after I_WB_SWITCH is guaranteed to be visible. */ - call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); - - atomic_inc(&isw_nr_in_flight); + INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); + queue_rcu_work(isw_wq, &isw->work); return; out_free: + atomic_dec(&isw_nr_in_flight); if (isw->new_wb) wb_put(isw->new_wb); kfree(isw); } /** + * cleanup_offline_cgwb - detach associated inodes + * @wb: target wb + * + * Switch all inodes attached to @wb to a nearest living ancestor's wb in order + * to eventually release the dying @wb. Returns %true if not all inodes were + * switched and the function has to be restarted. + */ +bool cleanup_offline_cgwb(struct bdi_writeback *wb) +{ + struct cgroup_subsys_state *memcg_css; + struct inode_switch_wbs_context *isw; + struct inode *inode; + int nr; + bool restart = false; + + isw = kzalloc(struct_size(isw, inodes, WB_MAX_INODES_PER_ISW), + GFP_KERNEL); + if (!isw) + return restart; + + atomic_inc(&isw_nr_in_flight); + + for (memcg_css = wb->memcg_css->parent; memcg_css; + memcg_css = memcg_css->parent) { + isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); + if (isw->new_wb) + break; + } + if (unlikely(!isw->new_wb)) + isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ + + nr = 0; + spin_lock(&wb->list_lock); + list_for_each_entry(inode, &wb->b_attached, i_io_list) { + if (!inode_prepare_wbs_switch(inode, isw->new_wb)) + continue; + + isw->inodes[nr++] = inode; + + if (nr >= WB_MAX_INODES_PER_ISW - 1) { + restart = true; + break; + } + } + spin_unlock(&wb->list_lock); + + /* no attached inodes? bail out */ + if (nr == 0) { + atomic_dec(&isw_nr_in_flight); + wb_put(isw->new_wb); + kfree(isw); + return restart; + } + + /* + * In addition to synchronizing among switchers, I_WB_SWITCH tells + * the RCU protected stat update paths to grab the i_page + * lock so that stat transfer can synchronize against them. + * Let's continue after I_WB_SWITCH is guaranteed to be visible. + */ + INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); + queue_rcu_work(isw_wq, &isw->work); + + return restart; +} + +/** * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it * @wbc: writeback_control of interest * @inode: target inode @@ -605,7 +739,7 @@ EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode); * incorrectly attributed). * * To resolve this issue, cgroup writeback detects the majority dirtier of - * an inode and transfers the ownership to it. To avoid unnnecessary + * an inode and transfers the ownership to it. To avoid unnecessary * oscillation, the detection mechanism keeps track of history and gives * out the switch verdict only if the foreign usage pattern is stable over * a certain amount of time and/or writeback attempts. @@ -761,43 +895,6 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner); /** - * inode_congested - test whether an inode is congested - * @inode: inode to test for congestion (may be NULL) - * @cong_bits: mask of WB_[a]sync_congested bits to test - * - * Tests whether @inode is congested. @cong_bits is the mask of congestion - * bits to test and the return value is the mask of set bits. - * - * If cgroup writeback is enabled for @inode, the congestion state is - * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg - * associated with @inode is congested; otherwise, the root wb's congestion - * state is used. - * - * @inode is allowed to be NULL as this function is often called on - * mapping->host which is NULL for the swapper space. - */ -int inode_congested(struct inode *inode, int cong_bits) -{ - /* - * Once set, ->i_wb never becomes NULL while the inode is alive. - * Start transaction iff ->i_wb is visible. - */ - if (inode && inode_to_wb_is_valid(inode)) { - struct bdi_writeback *wb; - struct wb_lock_cookie lock_cookie = {}; - bool congested; - - wb = unlocked_inode_to_wb_begin(inode, &lock_cookie); - congested = wb_congested(wb, cong_bits); - unlocked_inode_to_wb_end(inode, &lock_cookie); - return congested; - } - - return wb_congested(&inode_to_bdi(inode)->wb, cong_bits); -} -EXPORT_SYMBOL_GPL(inode_congested); - -/** * wb_split_bdi_pages - split nr_pages to write according to bandwidth * @wb: target bdi_writeback to split @nr_pages to * @nr_pages: number of pages to write for the whole bdi @@ -908,20 +1005,20 @@ restart: * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs * @bdi_id: target bdi id * @memcg_id: target memcg css id - * @nr: number of pages to write, 0 for best-effort dirty flushing * @reason: reason why some writeback work initiated * @done: target wb_completion * * Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id * with the specified parameters. */ -int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr, +int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, enum wb_reason reason, struct wb_completion *done) { struct backing_dev_info *bdi; struct cgroup_subsys_state *memcg_css; struct bdi_writeback *wb; struct wb_writeback_work *work; + unsigned long dirty; int ret; /* lookup bdi and memcg */ @@ -950,24 +1047,22 @@ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr, } /* - * If @nr is zero, the caller is attempting to write out most of + * The caller is attempting to write out most of * the currently dirty pages. Let's take the current dirty page * count and inflate it by 25% which should be large enough to * flush out most dirty pages while avoiding getting livelocked by * concurrent dirtiers. + * + * BTW the memcg stats are flushed periodically and this is best-effort + * estimation, so some potential error is ok. */ - if (!nr) { - unsigned long filepages, headroom, dirty, writeback; - - mem_cgroup_wb_stats(wb, &filepages, &headroom, &dirty, - &writeback); - nr = dirty * 10 / 8; - } + dirty = memcg_page_state(mem_cgroup_from_css(memcg_css), NR_FILE_DIRTY); + dirty = dirty * 10 / 8; /* issue the writeback work */ work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN); if (work) { - work->nr_pages = nr; + work->nr_pages = dirty; work->sync_mode = WB_SYNC_NONE; work->range_cyclic = 1; work->reason = reason; @@ -999,6 +1094,12 @@ out_bdi_put: */ void cgroup_writeback_umount(void) { + /* + * SB_ACTIVE should be reliably cleared before checking + * isw_nr_in_flight, see generic_shutdown_super(). + */ + smp_mb(); + if (atomic_read(&isw_nr_in_flight)) { /* * Use rcu_barrier() to wait for all pending callbacks to @@ -1023,6 +1124,17 @@ fs_initcall(cgroup_writeback_init); static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } +static void inode_cgwb_move_to_attached(struct inode *inode, + struct bdi_writeback *wb) +{ + assert_spin_locked(&wb->list_lock); + assert_spin_locked(&inode->i_lock); + + inode->i_state &= ~I_SYNC_QUEUED; + list_del_init(&inode->i_io_list); + wb_io_lists_depopulated(wb); +} + static struct bdi_writeback * locked_inode_to_wb_and_lock_list(struct inode *inode) __releases(&inode->i_lock) @@ -1070,7 +1182,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, static unsigned long get_nr_dirty_pages(void) { return global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS) + get_nr_dirty_inodes(); } @@ -1123,9 +1234,16 @@ void inode_io_list_del(struct inode *inode) struct bdi_writeback *wb; wb = inode_to_wb_and_lock_list(inode); - inode_io_list_del_locked(inode, wb); + spin_lock(&inode->i_lock); + + inode->i_state &= ~I_SYNC_QUEUED; + list_del_init(&inode->i_io_list); + wb_io_lists_depopulated(wb); + + spin_unlock(&inode->i_lock); spin_unlock(&wb->list_lock); } +EXPORT_SYMBOL(inode_io_list_del); /* * mark an inode as under writeback on the sb @@ -1172,8 +1290,10 @@ void sb_clear_inode_writeback(struct inode *inode) * the case then the inode must have been redirtied while it was being written * out and we don't reset its dirtied_when. */ -static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) +static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb) { + assert_spin_locked(&inode->i_lock); + if (!list_empty(&wb->b_dirty)) { struct inode *tail; @@ -1182,6 +1302,14 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) inode->dirtied_when = jiffies; } inode_io_list_move_locked(inode, wb, &wb->b_dirty); + inode->i_state &= ~I_SYNC_QUEUED; +} + +static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) +{ + spin_lock(&inode->i_lock); + redirty_tail_locked(inode, wb); + spin_unlock(&inode->i_lock); } /* @@ -1220,16 +1348,13 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) #define EXPIRE_DIRTY_ATIME 0x0001 /* - * Move expired (dirtied before work->older_than_this) dirty inodes from + * Move expired (dirtied before dirtied_before) dirty inodes from * @delaying_queue to @dispatch_queue. */ static int move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, - int flags, - struct wb_writeback_work *work) + unsigned long dirtied_before) { - unsigned long *older_than_this = NULL; - unsigned long expire_time; LIST_HEAD(tmp); struct list_head *pos, *node; struct super_block *sb = NULL; @@ -1237,21 +1362,15 @@ static int move_expired_inodes(struct list_head *delaying_queue, int do_sb_sort = 0; int moved = 0; - if ((flags & EXPIRE_DIRTY_ATIME) == 0) - older_than_this = work->older_than_this; - else if (!work->for_sync) { - expire_time = jiffies - (dirtytime_expire_interval * HZ); - older_than_this = &expire_time; - } while (!list_empty(delaying_queue)) { inode = wb_inode(delaying_queue->prev); - if (older_than_this && - inode_dirtied_after(inode, *older_than_this)) + if (inode_dirtied_after(inode, dirtied_before)) break; + spin_lock(&inode->i_lock); list_move(&inode->i_io_list, &tmp); moved++; - if (flags & EXPIRE_DIRTY_ATIME) - set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state); + inode->i_state |= I_SYNC_QUEUED; + spin_unlock(&inode->i_lock); if (sb_is_blkdev_sb(inode->i_sb)) continue; if (sb && sb != inode->i_sb) @@ -1265,7 +1384,12 @@ static int move_expired_inodes(struct list_head *delaying_queue, goto out; } - /* Move inodes from one superblock together */ + /* + * Although inode's i_io_list is moved from 'tmp' to 'dispatch_queue', + * we don't take inode->i_lock here because it is just a pointless overhead. + * Inode is already marked as I_SYNC_QUEUED so writeback list handling is + * fully under our control. + */ while (!list_empty(&tmp)) { sb = wb_inode(tmp.prev)->i_sb; list_for_each_prev_safe(pos, node, &tmp) { @@ -1289,18 +1413,22 @@ out: * | * +--> dequeue for IO */ -static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) +static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work, + unsigned long dirtied_before) { int moved; + unsigned long time_expire_jif = dirtied_before; assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); - moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work); + moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, dirtied_before); + if (!work->for_sync) + time_expire_jif = jiffies - dirtytime_expire_interval * HZ; moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io, - EXPIRE_DIRTY_ATIME, work); + time_expire_jif); if (moved) wb_io_lists_populated(wb); - trace_writeback_queue_io(wb, work, moved); + trace_writeback_queue_io(wb, work, dirtied_before, moved); } static int write_inode(struct inode *inode, struct writeback_control *wbc) @@ -1394,7 +1522,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * writeback is not making progress due to locked * buffers. Skip this inode for now. */ - redirty_tail(inode, wb); + redirty_tail_locked(inode, wb); return; } @@ -1414,7 +1542,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * retrying writeback of the dirty page/inode * that cannot be performed immediately. */ - redirty_tail(inode, wb); + redirty_tail_locked(inode, wb); } } else if (inode->i_state & I_DIRTY) { /* @@ -1422,20 +1550,27 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * such as delayed allocation during submission or metadata * updates after data IO completion. */ - redirty_tail(inode, wb); + redirty_tail_locked(inode, wb); } else if (inode->i_state & I_DIRTY_TIME) { inode->dirtied_when = jiffies; inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); + inode->i_state &= ~I_SYNC_QUEUED; } else { /* The inode is clean. Remove from writeback lists. */ - inode_io_list_del_locked(inode, wb); + inode_cgwb_move_to_attached(inode, wb); } } /* - * Write out an inode and its dirty pages. Do not update the writeback list - * linkage. That is left to the caller. The caller is also responsible for - * setting I_SYNC flag and calling inode_sync_complete() to clear it. + * Write out an inode and its dirty pages (or some of its dirty pages, depending + * on @wbc->nr_to_write), and clear the relevant dirty flags from i_state. + * + * This doesn't remove the inode from the writeback list it is on, except + * potentially to move it from b_dirty_time to b_dirty due to timestamp + * expiration. The caller is otherwise responsible for writeback list handling. + * + * The caller is also responsible for setting the I_SYNC flag beforehand and + * calling inode_sync_complete() to clear it afterwards. */ static int __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) @@ -1465,25 +1600,26 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) } /* - * Some filesystems may redirty the inode during the writeback - * due to delalloc, clear dirty metadata flags right before - * write_inode() + * If the inode has dirty timestamps and we need to write them, call + * mark_inode_dirty_sync() to notify the filesystem about it and to + * change I_DIRTY_TIME into I_DIRTY_SYNC. */ - spin_lock(&inode->i_lock); + if ((inode->i_state & I_DIRTY_TIME) && + (wbc->sync_mode == WB_SYNC_ALL || + time_after(jiffies, inode->dirtied_time_when + + dirtytime_expire_interval * HZ))) { + trace_writeback_lazytime(inode); + mark_inode_dirty_sync(inode); + } + /* + * Get and clear the dirty flags from i_state. This needs to be done + * after calling writepages because some filesystems may redirty the + * inode during writepages due to delalloc. It also needs to be done + * after handling timestamp expiration, as that may dirty the inode too. + */ + spin_lock(&inode->i_lock); dirty = inode->i_state & I_DIRTY; - if (inode->i_state & I_DIRTY_TIME) { - if ((dirty & I_DIRTY_INODE) || - wbc->sync_mode == WB_SYNC_ALL || - unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) || - unlikely(time_after(jiffies, - (inode->dirtied_time_when + - dirtytime_expire_interval * HZ)))) { - dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED; - trace_writeback_lazytime(inode); - } - } else - inode->i_state &= ~I_DIRTY_TIME_EXPIRED; inode->i_state &= ~dirty; /* @@ -1501,28 +1637,35 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) inode->i_state |= I_DIRTY_PAGES; + else if (unlikely(inode->i_state & I_PINNING_FSCACHE_WB)) { + if (!(inode->i_state & I_DIRTY_PAGES)) { + inode->i_state &= ~I_PINNING_FSCACHE_WB; + wbc->unpinned_fscache_wb = true; + dirty |= I_PINNING_FSCACHE_WB; /* Cause write_inode */ + } + } spin_unlock(&inode->i_lock); - if (dirty & I_DIRTY_TIME) - mark_inode_dirty_sync(inode); /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & ~I_DIRTY_PAGES) { int err = write_inode(inode, wbc); if (ret == 0) ret = err; } + wbc->unpinned_fscache_wb = false; trace_writeback_single_inode(inode, wbc, nr_to_write); return ret; } /* - * Write out an inode's dirty pages. Either the caller has an active reference - * on the inode or the inode has I_WILL_FREE set. + * Write out an inode's dirty data and metadata on-demand, i.e. separately from + * the regular batched writeback done by the flusher threads in + * writeback_sb_inodes(). @wbc controls various aspects of the write, such as + * whether it is a data-integrity sync (%WB_SYNC_ALL) or not (%WB_SYNC_NONE). * - * This function is designed to be called for writing back one inode which - * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode() - * and does more profound writeback list handling in writeback_sb_inodes(). + * To prevent the inode from going away, either the caller must have a reference + * to the inode, or the inode must have I_WILL_FREE or I_FREEING set. */ static int writeback_single_inode(struct inode *inode, struct writeback_control *wbc) @@ -1537,23 +1680,23 @@ static int writeback_single_inode(struct inode *inode, WARN_ON(inode->i_state & I_WILL_FREE); if (inode->i_state & I_SYNC) { - if (wbc->sync_mode != WB_SYNC_ALL) - goto out; /* - * It's a data-integrity sync. We must wait. Since callers hold - * inode reference or inode has I_WILL_FREE set, it cannot go - * away under us. + * Writeback is already running on the inode. For WB_SYNC_NONE, + * that's enough and we can just return. For WB_SYNC_ALL, we + * must wait for the existing writeback to complete, then do + * writeback again if there's anything left. */ + if (wbc->sync_mode != WB_SYNC_ALL) + goto out; __inode_wait_for_writeback(inode); } WARN_ON(inode->i_state & I_SYNC); /* - * Skip inode if it is clean and we have no outstanding writeback in - * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this - * function since flusher thread may be doing for example sync in - * parallel and if we move the inode, it could get skipped. So here we - * make sure inode is on some writeback list and leave it there unless - * we have completely cleaned the inode. + * If the inode is already fully clean, then there's nothing to do. + * + * For data-integrity syncs we also need to check whether any pages are + * still under writeback, e.g. due to prior WB_SYNC_NONE writeback. If + * there are any such pages, we'll need to wait for them. */ if (!(inode->i_state & I_DIRTY_ALL) && (wbc->sync_mode != WB_SYNC_ALL || @@ -1569,11 +1712,21 @@ static int writeback_single_inode(struct inode *inode, wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); /* - * If inode is clean, remove it from writeback lists. Otherwise don't - * touch it. See comment above for explanation. + * If the inode is now fully clean, then it can be safely removed from + * its writeback list (if any). Otherwise the flusher threads are + * responsible for the writeback lists. */ if (!(inode->i_state & I_DIRTY_ALL)) - inode_io_list_del_locked(inode, wb); + inode_cgwb_move_to_attached(inode, wb); + else if (!(inode->i_state & I_SYNC_QUEUED)) { + if ((inode->i_state & I_DIRTY)) + redirty_tail_locked(inode, wb); + else if (inode->i_state & I_DIRTY_TIME) { + inode->dirtied_when = jiffies; + inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); + } + } + spin_unlock(&wb->list_lock); inode_sync_complete(inode); out: @@ -1637,11 +1790,12 @@ static long writeback_sb_inodes(struct super_block *sb, }; unsigned long start_time = jiffies; long write_chunk; - long wrote = 0; /* count both pages and inodes */ + long total_wrote = 0; /* count both pages and inodes */ while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); struct bdi_writeback *tmp_wb; + long wrote; if (inode->i_sb != sb) { if (work->sb) { @@ -1669,8 +1823,8 @@ static long writeback_sb_inodes(struct super_block *sb, */ spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + redirty_tail_locked(inode, wb); spin_unlock(&inode->i_lock); - redirty_tail(inode, wb); continue; } if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) { @@ -1683,8 +1837,8 @@ static long writeback_sb_inodes(struct super_block *sb, * We'll have another go at writing back this inode * when we completed a full scan of b_io. */ - spin_unlock(&inode->i_lock); requeue_io(inode, wb); + spin_unlock(&inode->i_lock); trace_writeback_sb_inodes_requeue(inode); continue; } @@ -1717,7 +1871,9 @@ static long writeback_sb_inodes(struct super_block *sb, wbc_detach_inode(&wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; - wrote += write_chunk - wbc.nr_to_write; + wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped; + wrote = wrote < 0 ? 0 : wrote; + total_wrote += wrote; if (need_resched()) { /* @@ -1728,7 +1884,7 @@ static long writeback_sb_inodes(struct super_block *sb, * unplug, so get our IOs out the door before we * give up the CPU. */ - blk_flush_plug(current); + blk_flush_plug(current->plug, false); cond_resched(); } @@ -1739,7 +1895,7 @@ static long writeback_sb_inodes(struct super_block *sb, tmp_wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); if (!(inode->i_state & I_DIRTY_ALL)) - wrote++; + total_wrote++; requeue_inode(inode, tmp_wb, &wbc); inode_sync_complete(inode); spin_unlock(&inode->i_lock); @@ -1753,14 +1909,14 @@ static long writeback_sb_inodes(struct super_block *sb, * bail out to wb_writeback() often enough to check * background threshold and other termination conditions. */ - if (wrote) { + if (total_wrote) { if (time_is_before_jiffies(start_time + HZ / 10UL)) break; if (work->nr_pages <= 0) break; } } - return wrote; + return total_wrote; } static long __writeback_inodes_wb(struct bdi_writeback *wb, @@ -1811,7 +1967,7 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, blk_start_plug(&plug); spin_lock(&wb->list_lock); if (list_empty(&wb->b_io)) - queue_io(wb, &work); + queue_io(wb, &work, jiffies); __writeback_inodes_wb(wb, &work); spin_unlock(&wb->list_lock); blk_finish_plug(&plug); @@ -1831,22 +1987,18 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, * takes longer than a dirty_writeback_interval interval, then leave a * one-second gap. * - * older_than_this takes precedence over nr_to_write. So we'll only write back + * dirtied_before takes precedence over nr_to_write. So we'll only write back * all dirty pages if they are all attached to "old" mappings. */ static long wb_writeback(struct bdi_writeback *wb, struct wb_writeback_work *work) { - unsigned long wb_start = jiffies; long nr_pages = work->nr_pages; - unsigned long oldest_jif; + unsigned long dirtied_before = jiffies; struct inode *inode; long progress; struct blk_plug plug; - oldest_jif = jiffies; - work->older_than_this = &oldest_jif; - blk_start_plug(&plug); spin_lock(&wb->list_lock); for (;;) { @@ -1880,22 +2032,20 @@ static long wb_writeback(struct bdi_writeback *wb, * safe. */ if (work->for_kupdate) { - oldest_jif = jiffies - + dirtied_before = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); } else if (work->for_background) - oldest_jif = jiffies; + dirtied_before = jiffies; trace_writeback_start(wb, work); if (list_empty(&wb->b_io)) - queue_io(wb, work); + queue_io(wb, work, dirtied_before); if (work->sb) progress = writeback_sb_inodes(work->sb, wb, work); else progress = __writeback_inodes_wb(wb, work); trace_writeback_written(wb, work); - wb_update_bandwidth(wb, wb_start); - /* * Did we write something? Try for more * @@ -1937,13 +2087,13 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) { struct wb_writeback_work *work = NULL; - spin_lock_bh(&wb->work_lock); + spin_lock_irq(&wb->work_lock); if (!list_empty(&wb->work_list)) { work = list_entry(wb->work_list.next, struct wb_writeback_work, list); list_del_init(&work->list); } - spin_unlock_bh(&wb->work_lock); + spin_unlock_irq(&wb->work_lock); return work; } @@ -2064,7 +2214,6 @@ void wb_workfn(struct work_struct *work) long pages_written; set_worker_desc("flush-%s", bdi_dev_name(wb->bdi)); - current->flags |= PF_SWAPWRITE; if (likely(!current_is_workqueue_rescuer() || !test_bit(WB_registered, &wb->state))) { @@ -2093,8 +2242,6 @@ void wb_workfn(struct work_struct *work) wb_wakeup(wb); else if (wb_has_dirty_io(wb) && dirty_writeback_interval) wb_wakeup_delayed(wb); - - current->flags &= ~PF_SWAPWRITE; } /* @@ -2131,8 +2278,7 @@ void wakeup_flusher_threads(enum wb_reason reason) /* * If we are expecting writeback progress we must submit plugged IO. */ - if (blk_needs_flush_plug(current)) - blk_schedule_flush_plug(current); + blk_flush_plug(current->plug, true); rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) @@ -2182,7 +2328,7 @@ static int __init start_dirtytime_writeback(void) __initcall(start_dirtytime_writeback); int dirtytime_interval_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -2192,46 +2338,25 @@ int dirtytime_interval_handler(struct ctl_table *table, int write, return ret; } -static noinline void block_dump___mark_inode_dirty(struct inode *inode) -{ - if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { - struct dentry *dentry; - const char *name = "?"; - - dentry = d_find_alias(inode); - if (dentry) { - spin_lock(&dentry->d_lock); - name = (const char *) dentry->d_name.name; - } - printk(KERN_DEBUG - "%s(%d): dirtied inode %lu (%s) on %s\n", - current->comm, task_pid_nr(current), inode->i_ino, - name, inode->i_sb->s_id); - if (dentry) { - spin_unlock(&dentry->d_lock); - dput(dentry); - } - } -} - /** - * __mark_inode_dirty - internal function + * __mark_inode_dirty - internal function to mark an inode dirty * * @inode: inode to mark - * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) + * @flags: what kind of dirty, e.g. I_DIRTY_SYNC. This can be a combination of + * multiple I_DIRTY_* flags, except that I_DIRTY_TIME can't be combined + * with I_DIRTY_PAGES. * - * Mark an inode as dirty. Callers should use mark_inode_dirty or - * mark_inode_dirty_sync. + * Mark an inode as dirty. We notify the filesystem, then update the inode's + * dirty flags. Then, if needed we add the inode to the appropriate dirty list. * - * Put the inode on the super block's dirty list. + * Most callers should use mark_inode_dirty() or mark_inode_dirty_sync() + * instead of calling this directly. * - * CAREFUL! We mark it dirty unconditionally, but move it onto the - * dirty list only if it is hashed or if it refers to a blockdev. - * If it was not hashed, it will never be added to the dirty list - * even if it is later hashed, as it will have been marked dirty already. + * CAREFUL! We only add the inode to the dirty list if it is hashed or if it + * refers to a blockdev. Unhashed inodes will never be added to the dirty list + * even if they are later hashed, as they will have been marked dirty already. * - * In short, make sure you hash any inodes _before_ you start marking - * them dirty. + * In short, ensure you hash any inodes _before_ you start marking them dirty. * * Note that for blockdevs, inode->dirtied_when represents the dirtying time of * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of @@ -2243,25 +2368,50 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; - int dirtytime; + int dirtytime = 0; + struct bdi_writeback *wb = NULL; trace_writeback_mark_inode_dirty(inode, flags); - /* - * Don't do this for I_DIRTY_PAGES - that doesn't actually - * dirty the inode itself - */ - if (flags & (I_DIRTY_INODE | I_DIRTY_TIME)) { - trace_writeback_dirty_inode_start(inode, flags); + if (flags & I_DIRTY_INODE) { + /* + * Inode timestamp update will piggback on this dirtying. + * We tell ->dirty_inode callback that timestamps need to + * be updated by setting I_DIRTY_TIME in flags. + */ + if (inode->i_state & I_DIRTY_TIME) { + spin_lock(&inode->i_lock); + if (inode->i_state & I_DIRTY_TIME) { + inode->i_state &= ~I_DIRTY_TIME; + flags |= I_DIRTY_TIME; + } + spin_unlock(&inode->i_lock); + } + /* + * Notify the filesystem about the inode being dirtied, so that + * (if needed) it can update on-disk fields and journal the + * inode. This is only needed when the inode itself is being + * dirtied now. I.e. it's only needed for I_DIRTY_INODE, not + * for just I_DIRTY_PAGES or I_DIRTY_TIME. + */ + trace_writeback_dirty_inode_start(inode, flags); if (sb->s_op->dirty_inode) - sb->s_op->dirty_inode(inode, flags); - + sb->s_op->dirty_inode(inode, + flags & (I_DIRTY_INODE | I_DIRTY_TIME)); trace_writeback_dirty_inode(inode, flags); - } - if (flags & I_DIRTY_INODE) + + /* I_DIRTY_INODE supersedes I_DIRTY_TIME. */ flags &= ~I_DIRTY_TIME; - dirtytime = flags & I_DIRTY_TIME; + } else { + /* + * Else it's either I_DIRTY_PAGES, I_DIRTY_TIME, or nothing. + * (We don't support setting both I_DIRTY_PAGES and I_DIRTY_TIME + * in one call to __mark_inode_dirty().) + */ + dirtytime = flags & I_DIRTY_TIME; + WARN_ON_ONCE(dirtytime && flags != I_DIRTY_TIME); + } /* * Paired with smp_mb() in __writeback_single_inode() for the @@ -2269,32 +2419,36 @@ void __mark_inode_dirty(struct inode *inode, int flags) */ smp_mb(); - if (((inode->i_state & flags) == flags) || - (dirtytime && (inode->i_state & I_DIRTY_INODE))) + if ((inode->i_state & flags) == flags) return; - if (unlikely(block_dump)) - block_dump___mark_inode_dirty(inode); - spin_lock(&inode->i_lock); - if (dirtytime && (inode->i_state & I_DIRTY_INODE)) - goto out_unlock_inode; if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; inode_attach_wb(inode, NULL); - if (flags & I_DIRTY_INODE) - inode->i_state &= ~I_DIRTY_TIME; inode->i_state |= flags; /* - * If the inode is being synced, just update its dirty state. - * The unlocker will place the inode on the appropriate - * superblock list, based upon its state. + * Grab inode's wb early because it requires dropping i_lock and we + * need to make sure following checks happen atomically with dirty + * list handling so that we don't move inodes under flush worker's + * hands. + */ + if (!was_dirty) { + wb = locked_inode_to_wb_and_lock_list(inode); + spin_lock(&inode->i_lock); + } + + /* + * If the inode is queued for writeback by flush worker, just + * update its dirty state. Once the flush worker is done with + * the inode it will place it on the appropriate superblock + * list, based upon its state. */ - if (inode->i_state & I_SYNC) - goto out_unlock_inode; + if (inode->i_state & I_SYNC_QUEUED) + goto out_unlock; /* * Only add valid (hashed) inodes to the superblock's @@ -2302,26 +2456,19 @@ void __mark_inode_dirty(struct inode *inode, int flags) */ if (!S_ISBLK(inode->i_mode)) { if (inode_unhashed(inode)) - goto out_unlock_inode; + goto out_unlock; } if (inode->i_state & I_FREEING) - goto out_unlock_inode; + goto out_unlock; /* * If the inode was already on b_dirty/b_io/b_more_io, don't * reposition it (that would break b_dirty time-ordering). */ if (!was_dirty) { - struct bdi_writeback *wb; struct list_head *dirty_list; bool wakeup_bdi = false; - wb = locked_inode_to_wb_and_lock_list(inode); - - WARN(bdi_cap_writeback_dirty(wb->bdi) && - !test_bit(WB_registered, &wb->state), - "bdi-%s not registered\n", wb->bdi->name); - inode->dirtied_when = jiffies; if (dirtytime) inode->dirtied_time_when = jiffies; @@ -2335,6 +2482,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) dirty_list); spin_unlock(&wb->list_lock); + spin_unlock(&inode->i_lock); trace_writeback_dirty_inode_enqueue(inode); /* @@ -2343,12 +2491,15 @@ void __mark_inode_dirty(struct inode *inode, int flags) * to make sure background write-back happens * later. */ - if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi) + if (wakeup_bdi && + (wb->bdi->capabilities & BDI_CAP_WRITEBACK)) wb_wakeup_delayed(wb); return; } } -out_unlock_inode: +out_unlock: + if (wb) + spin_unlock(&wb->list_lock); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(__mark_inode_dirty); @@ -2578,7 +2729,7 @@ int write_inode_now(struct inode *inode, int sync) .range_end = LLONG_MAX, }; - if (!mapping_cap_writeback_dirty(inode->i_mapping)) + if (!mapping_can_writeback(inode->i_mapping)) wbc.nr_to_write = 0; might_sleep(); @@ -2587,23 +2738,6 @@ int write_inode_now(struct inode *inode, int sync) EXPORT_SYMBOL(write_inode_now); /** - * sync_inode - write an inode and its pages to disk. - * @inode: the inode to sync - * @wbc: controls the writeback mode - * - * sync_inode() will write an inode and its pages to disk. It will also - * correctly update the inode on its superblock's dirty inode lists and will - * update inode->i_state. - * - * The caller must have a ref on the inode. - */ -int sync_inode(struct inode *inode, struct writeback_control *wbc) -{ - return writeback_single_inode(inode, wbc); -} -EXPORT_SYMBOL(sync_inode); - -/** * sync_inode_metadata - write an inode to disk * @inode: the inode to sync * @wait: wait for I/O to complete. @@ -2619,6 +2753,6 @@ int sync_inode_metadata(struct inode *inode, int wait) .nr_to_write = 0, /* metadata-only */ }; - return sync_inode(inode, &wbc); + return writeback_single_inode(inode, &wbc); } EXPORT_SYMBOL(sync_inode_metadata); |