aboutsummaryrefslogtreecommitdiffstats
path: root/fs/fs-writeback.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r--fs/fs-writeback.c366
1 files changed, 242 insertions, 124 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e91980f49388..06d04a74ab6c 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -131,25 +131,6 @@ static bool inode_io_list_move_locked(struct inode *inode,
return false;
}
-/**
- * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
- * @inode: inode to be removed
- * @wb: bdi_writeback @inode is being removed from
- *
- * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
- * clear %WB_has_dirty_io if all are empty afterwards.
- */
-static void inode_io_list_del_locked(struct inode *inode,
- struct bdi_writeback *wb)
-{
- assert_spin_locked(&wb->list_lock);
- assert_spin_locked(&inode->i_lock);
-
- inode->i_state &= ~I_SYNC_QUEUED;
- list_del_init(&inode->i_io_list);
- wb_io_lists_depopulated(wb);
-}
-
static void wb_wakeup(struct bdi_writeback *wb)
{
spin_lock_bh(&wb->work_lock);
@@ -244,6 +225,13 @@ void wb_wait_for_completion(struct wb_completion *done)
/* one round can affect upto 5 slots */
#define WB_FRN_MAX_IN_FLIGHT 1024 /* don't queue too many concurrently */
+/*
+ * Maximum inodes per isw. A specific value has been chosen to make
+ * struct inode_switch_wbs_context fit into 1024 bytes kmalloc.
+ */
+#define WB_MAX_INODES_PER_ISW ((1024UL - sizeof(struct inode_switch_wbs_context)) \
+ / sizeof(struct inode *))
+
static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
static struct workqueue_struct *isw_wq;
@@ -279,6 +267,28 @@ void __inode_attach_wb(struct inode *inode, struct page *page)
EXPORT_SYMBOL_GPL(__inode_attach_wb);
/**
+ * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list
+ * @inode: inode of interest with i_lock held
+ * @wb: target bdi_writeback
+ *
+ * Remove the inode from wb's io lists and if necessarily put onto b_attached
+ * list. Only inodes attached to cgwb's are kept on this list.
+ */
+static void inode_cgwb_move_to_attached(struct inode *inode,
+ struct bdi_writeback *wb)
+{
+ assert_spin_locked(&wb->list_lock);
+ assert_spin_locked(&inode->i_lock);
+
+ inode->i_state &= ~I_SYNC_QUEUED;
+ if (wb != &wb->bdi->wb)
+ list_move(&inode->i_io_list, &wb->b_attached);
+ else
+ list_del_init(&inode->i_io_list);
+ wb_io_lists_depopulated(wb);
+}
+
+/**
* locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
* @inode: inode of interest with i_lock held
*
@@ -332,11 +342,18 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
}
struct inode_switch_wbs_context {
- struct inode *inode;
- struct bdi_writeback *new_wb;
+ struct rcu_work work;
- struct rcu_head rcu_head;
- struct work_struct work;
+ /*
+ * Multiple inodes can be switched at once. The switching procedure
+ * consists of two parts, separated by a RCU grace period. To make
+ * sure that the second part is executed for each inode gone through
+ * the first part, all inode pointers are placed into a NULL-terminated
+ * array embedded into struct inode_switch_wbs_context. Otherwise
+ * an inode could be left in a non-consistent state.
+ */
+ struct bdi_writeback *new_wb;
+ struct inode *inodes[];
};
static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
@@ -349,50 +366,23 @@ static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
up_write(&bdi->wb_switch_rwsem);
}
-static void inode_switch_wbs_work_fn(struct work_struct *work)
+static bool inode_do_switch_wbs(struct inode *inode,
+ struct bdi_writeback *old_wb,
+ struct bdi_writeback *new_wb)
{
- struct inode_switch_wbs_context *isw =
- container_of(work, struct inode_switch_wbs_context, work);
- struct inode *inode = isw->inode;
- struct backing_dev_info *bdi = inode_to_bdi(inode);
struct address_space *mapping = inode->i_mapping;
- struct bdi_writeback *old_wb = inode->i_wb;
- struct bdi_writeback *new_wb = isw->new_wb;
XA_STATE(xas, &mapping->i_pages, 0);
struct page *page;
bool switched = false;
- /*
- * If @inode switches cgwb membership while sync_inodes_sb() is
- * being issued, sync_inodes_sb() might miss it. Synchronize.
- */
- down_read(&bdi->wb_switch_rwsem);
-
- /*
- * By the time control reaches here, RCU grace period has passed
- * since I_WB_SWITCH assertion and all wb stat update transactions
- * between unlocked_inode_to_wb_begin/end() are guaranteed to be
- * synchronizing against the i_pages lock.
- *
- * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock
- * gives us exclusion against all wb related operations on @inode
- * including IO list manipulations and stat updates.
- */
- if (old_wb < new_wb) {
- spin_lock(&old_wb->list_lock);
- spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
- } else {
- spin_lock(&new_wb->list_lock);
- spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
- }
spin_lock(&inode->i_lock);
xa_lock_irq(&mapping->i_pages);
/*
- * Once I_FREEING is visible under i_lock, the eviction path owns
- * the inode and we shouldn't modify ->i_io_list.
+ * Once I_FREEING or I_WILL_FREE are visible under i_lock, the eviction
+ * path owns the inode and we shouldn't modify ->i_io_list.
*/
- if (unlikely(inode->i_state & I_FREEING))
+ if (unlikely(inode->i_state & (I_FREEING | I_WILL_FREE)))
goto skip_switch;
trace_inode_switch_wbs(inode, old_wb, new_wb);
@@ -419,21 +409,28 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
wb_get(new_wb);
/*
- * Transfer to @new_wb's IO list if necessary. The specific list
- * @inode was on is ignored and the inode is put on ->b_dirty which
- * is always correct including from ->b_dirty_time. The transfer
- * preserves @inode->dirtied_when ordering.
+ * Transfer to @new_wb's IO list if necessary. If the @inode is dirty,
+ * the specific list @inode was on is ignored and the @inode is put on
+ * ->b_dirty which is always correct including from ->b_dirty_time.
+ * The transfer preserves @inode->dirtied_when ordering. If the @inode
+ * was clean, it means it was on the b_attached list, so move it onto
+ * the b_attached list of @new_wb.
*/
if (!list_empty(&inode->i_io_list)) {
- struct inode *pos;
-
- inode_io_list_del_locked(inode, old_wb);
inode->i_wb = new_wb;
- list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
- if (time_after_eq(inode->dirtied_when,
- pos->dirtied_when))
- break;
- inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
+
+ if (inode->i_state & I_DIRTY_ALL) {
+ struct inode *pos;
+
+ list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
+ if (time_after_eq(inode->dirtied_when,
+ pos->dirtied_when))
+ break;
+ inode_io_list_move_locked(inode, new_wb,
+ pos->i_io_list.prev);
+ } else {
+ inode_cgwb_move_to_attached(inode, new_wb);
+ }
} else {
inode->i_wb = new_wb;
}
@@ -452,31 +449,91 @@ skip_switch:
xa_unlock_irq(&mapping->i_pages);
spin_unlock(&inode->i_lock);
+
+ return switched;
+}
+
+static void inode_switch_wbs_work_fn(struct work_struct *work)
+{
+ struct inode_switch_wbs_context *isw =
+ container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
+ struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
+ struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
+ struct bdi_writeback *new_wb = isw->new_wb;
+ unsigned long nr_switched = 0;
+ struct inode **inodep;
+
+ /*
+ * If @inode switches cgwb membership while sync_inodes_sb() is
+ * being issued, sync_inodes_sb() might miss it. Synchronize.
+ */
+ down_read(&bdi->wb_switch_rwsem);
+
+ /*
+ * By the time control reaches here, RCU grace period has passed
+ * since I_WB_SWITCH assertion and all wb stat update transactions
+ * between unlocked_inode_to_wb_begin/end() are guaranteed to be
+ * synchronizing against the i_pages lock.
+ *
+ * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock
+ * gives us exclusion against all wb related operations on @inode
+ * including IO list manipulations and stat updates.
+ */
+ if (old_wb < new_wb) {
+ spin_lock(&old_wb->list_lock);
+ spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
+ } else {
+ spin_lock(&new_wb->list_lock);
+ spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
+ }
+
+ for (inodep = isw->inodes; *inodep; inodep++) {
+ WARN_ON_ONCE((*inodep)->i_wb != old_wb);
+ if (inode_do_switch_wbs(*inodep, old_wb, new_wb))
+ nr_switched++;
+ }
+
spin_unlock(&new_wb->list_lock);
spin_unlock(&old_wb->list_lock);
up_read(&bdi->wb_switch_rwsem);
- if (switched) {
+ if (nr_switched) {
wb_wakeup(new_wb);
- wb_put(old_wb);
+ wb_put_many(old_wb, nr_switched);
}
- wb_put(new_wb);
- iput(inode);
+ for (inodep = isw->inodes; *inodep; inodep++)
+ iput(*inodep);
+ wb_put(new_wb);
kfree(isw);
-
atomic_dec(&isw_nr_in_flight);
}
-static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
+static bool inode_prepare_wbs_switch(struct inode *inode,
+ struct bdi_writeback *new_wb)
{
- struct inode_switch_wbs_context *isw = container_of(rcu_head,
- struct inode_switch_wbs_context, rcu_head);
+ /*
+ * Paired with smp_mb() in cgroup_writeback_umount().
+ * isw_nr_in_flight must be increased before checking SB_ACTIVE and
+ * grabbing an inode, otherwise isw_nr_in_flight can be observed as 0
+ * in cgroup_writeback_umount() and the isw_wq will be not flushed.
+ */
+ smp_mb();
- /* needs to grab bh-unsafe locks, bounce to work item */
- INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
- queue_work(isw_wq, &isw->work);
+ /* while holding I_WB_SWITCH, no one else can update the association */
+ spin_lock(&inode->i_lock);
+ if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
+ inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) ||
+ inode_to_wb(inode) == new_wb) {
+ spin_unlock(&inode->i_lock);
+ return false;
+ }
+ inode->i_state |= I_WB_SWITCH;
+ __iget(inode);
+ spin_unlock(&inode->i_lock);
+
+ return true;
}
/**
@@ -501,32 +558,30 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
return;
- isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
+ isw = kzalloc(sizeof(*isw) + 2 * sizeof(struct inode *), GFP_ATOMIC);
if (!isw)
return;
+ atomic_inc(&isw_nr_in_flight);
+
/* find and pin the new wb */
rcu_read_lock();
memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
- if (memcg_css)
- isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+ if (memcg_css && !css_tryget(memcg_css))
+ memcg_css = NULL;
rcu_read_unlock();
+ if (!memcg_css)
+ goto out_free;
+
+ isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+ css_put(memcg_css);
if (!isw->new_wb)
goto out_free;
- /* while holding I_WB_SWITCH, no one else can update the association */
- spin_lock(&inode->i_lock);
- if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
- inode->i_state & (I_WB_SWITCH | I_FREEING) ||
- inode_to_wb(inode) == isw->new_wb) {
- spin_unlock(&inode->i_lock);
+ if (!inode_prepare_wbs_switch(inode, isw->new_wb))
goto out_free;
- }
- inode->i_state |= I_WB_SWITCH;
- __iget(inode);
- spin_unlock(&inode->i_lock);
- isw->inode = inode;
+ isw->inodes[0] = inode;
/*
* In addition to synchronizing among switchers, I_WB_SWITCH tells
@@ -534,18 +589,85 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
* lock so that stat transfer can synchronize against them.
* Let's continue after I_WB_SWITCH is guaranteed to be visible.
*/
- call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
-
- atomic_inc(&isw_nr_in_flight);
+ INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
+ queue_rcu_work(isw_wq, &isw->work);
return;
out_free:
+ atomic_dec(&isw_nr_in_flight);
if (isw->new_wb)
wb_put(isw->new_wb);
kfree(isw);
}
/**
+ * cleanup_offline_cgwb - detach associated inodes
+ * @wb: target wb
+ *
+ * Switch all inodes attached to @wb to a nearest living ancestor's wb in order
+ * to eventually release the dying @wb. Returns %true if not all inodes were
+ * switched and the function has to be restarted.
+ */
+bool cleanup_offline_cgwb(struct bdi_writeback *wb)
+{
+ struct cgroup_subsys_state *memcg_css;
+ struct inode_switch_wbs_context *isw;
+ struct inode *inode;
+ int nr;
+ bool restart = false;
+
+ isw = kzalloc(sizeof(*isw) + WB_MAX_INODES_PER_ISW *
+ sizeof(struct inode *), GFP_KERNEL);
+ if (!isw)
+ return restart;
+
+ atomic_inc(&isw_nr_in_flight);
+
+ for (memcg_css = wb->memcg_css->parent; memcg_css;
+ memcg_css = memcg_css->parent) {
+ isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
+ if (isw->new_wb)
+ break;
+ }
+ if (unlikely(!isw->new_wb))
+ isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */
+
+ nr = 0;
+ spin_lock(&wb->list_lock);
+ list_for_each_entry(inode, &wb->b_attached, i_io_list) {
+ if (!inode_prepare_wbs_switch(inode, isw->new_wb))
+ continue;
+
+ isw->inodes[nr++] = inode;
+
+ if (nr >= WB_MAX_INODES_PER_ISW - 1) {
+ restart = true;
+ break;
+ }
+ }
+ spin_unlock(&wb->list_lock);
+
+ /* no attached inodes? bail out */
+ if (nr == 0) {
+ atomic_dec(&isw_nr_in_flight);
+ wb_put(isw->new_wb);
+ kfree(isw);
+ return restart;
+ }
+
+ /*
+ * In addition to synchronizing among switchers, I_WB_SWITCH tells
+ * the RCU protected stat update paths to grab the i_page
+ * lock so that stat transfer can synchronize against them.
+ * Let's continue after I_WB_SWITCH is guaranteed to be visible.
+ */
+ INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
+ queue_rcu_work(isw_wq, &isw->work);
+
+ return restart;
+}
+
+/**
* wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
* @wbc: writeback_control of interest
* @inode: target inode
@@ -1000,6 +1122,12 @@ out_bdi_put:
*/
void cgroup_writeback_umount(void)
{
+ /*
+ * SB_ACTIVE should be reliably cleared before checking
+ * isw_nr_in_flight, see generic_shutdown_super().
+ */
+ smp_mb();
+
if (atomic_read(&isw_nr_in_flight)) {
/*
* Use rcu_barrier() to wait for all pending callbacks to
@@ -1024,6 +1152,17 @@ fs_initcall(cgroup_writeback_init);
static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
+static void inode_cgwb_move_to_attached(struct inode *inode,
+ struct bdi_writeback *wb)
+{
+ assert_spin_locked(&wb->list_lock);
+ assert_spin_locked(&inode->i_lock);
+
+ inode->i_state &= ~I_SYNC_QUEUED;
+ list_del_init(&inode->i_io_list);
+ wb_io_lists_depopulated(wb);
+}
+
static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
__releases(&inode->i_lock)
@@ -1124,7 +1263,11 @@ void inode_io_list_del(struct inode *inode)
wb = inode_to_wb_and_lock_list(inode);
spin_lock(&inode->i_lock);
- inode_io_list_del_locked(inode, wb);
+
+ inode->i_state &= ~I_SYNC_QUEUED;
+ list_del_init(&inode->i_io_list);
+ wb_io_lists_depopulated(wb);
+
spin_unlock(&inode->i_lock);
spin_unlock(&wb->list_lock);
}
@@ -1437,7 +1580,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
inode->i_state &= ~I_SYNC_QUEUED;
} else {
/* The inode is clean. Remove from writeback lists. */
- inode_io_list_del_locked(inode, wb);
+ inode_cgwb_move_to_attached(inode, wb);
}
}
@@ -1589,7 +1732,7 @@ static int writeback_single_inode(struct inode *inode,
* responsible for the writeback lists.
*/
if (!(inode->i_state & I_DIRTY_ALL))
- inode_io_list_del_locked(inode, wb);
+ inode_cgwb_move_to_attached(inode, wb);
spin_unlock(&wb->list_lock);
inode_sync_complete(inode);
out:
@@ -2205,28 +2348,6 @@ int dirtytime_interval_handler(struct ctl_table *table, int write,
return ret;
}
-static noinline void block_dump___mark_inode_dirty(struct inode *inode)
-{
- if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
- struct dentry *dentry;
- const char *name = "?";
-
- dentry = d_find_alias(inode);
- if (dentry) {
- spin_lock(&dentry->d_lock);
- name = (const char *) dentry->d_name.name;
- }
- printk(KERN_DEBUG
- "%s(%d): dirtied inode %lu (%s) on %s\n",
- current->comm, task_pid_nr(current), inode->i_ino,
- name, inode->i_sb->s_id);
- if (dentry) {
- spin_unlock(&dentry->d_lock);
- dput(dentry);
- }
- }
-}
-
/**
* __mark_inode_dirty - internal function to mark an inode dirty
*
@@ -2296,9 +2417,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
(dirtytime && (inode->i_state & I_DIRTY_INODE)))
return;
- if (unlikely(block_dump))
- block_dump___mark_inode_dirty(inode);
-
spin_lock(&inode->i_lock);
if (dirtytime && (inode->i_state & I_DIRTY_INODE))
goto out_unlock_inode;