aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/backing-dev.c341
-rw-r--r--mm/page-writeback.c179
-rw-r--r--mm/vmscan.c2
3 files changed, 352 insertions, 170 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 6f163e0f0509..7f3fa79f25c0 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1,8 +1,11 @@
#include <linux/wait.h>
#include <linux/backing-dev.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
+#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/writeback.h>
@@ -22,8 +25,18 @@ struct backing_dev_info default_backing_dev_info = {
EXPORT_SYMBOL_GPL(default_backing_dev_info);
static struct class *bdi_class;
-DEFINE_MUTEX(bdi_lock);
+DEFINE_SPINLOCK(bdi_lock);
LIST_HEAD(bdi_list);
+LIST_HEAD(bdi_pending_list);
+
+static struct task_struct *sync_supers_tsk;
+static struct timer_list sync_supers_timer;
+
+static int bdi_sync_supers(void *);
+static void sync_supers_timer_fn(unsigned long);
+static void arm_supers_timer(void);
+
+static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
@@ -187,6 +200,13 @@ static int __init default_bdi_init(void)
{
int err;
+ sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
+ BUG_ON(IS_ERR(sync_supers_tsk));
+
+ init_timer(&sync_supers_timer);
+ setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
+ arm_supers_timer();
+
err = bdi_init(&default_backing_dev_info);
if (!err)
bdi_register(&default_backing_dev_info, NULL, "default");
@@ -195,6 +215,242 @@ static int __init default_bdi_init(void)
}
subsys_initcall(default_bdi_init);
+static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
+{
+ memset(wb, 0, sizeof(*wb));
+
+ wb->bdi = bdi;
+ wb->last_old_flush = jiffies;
+ INIT_LIST_HEAD(&wb->b_dirty);
+ INIT_LIST_HEAD(&wb->b_io);
+ INIT_LIST_HEAD(&wb->b_more_io);
+}
+
+static void bdi_task_init(struct backing_dev_info *bdi,
+ struct bdi_writeback *wb)
+{
+ struct task_struct *tsk = current;
+
+ spin_lock(&bdi->wb_lock);
+ list_add_tail_rcu(&wb->list, &bdi->wb_list);
+ spin_unlock(&bdi->wb_lock);
+
+ tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
+ set_freezable();
+
+ /*
+ * Our parent may run at a different priority, just set us to normal
+ */
+ set_user_nice(tsk, 0);
+}
+
+static int bdi_start_fn(void *ptr)
+{
+ struct bdi_writeback *wb = ptr;
+ struct backing_dev_info *bdi = wb->bdi;
+ int ret;
+
+ /*
+ * Add us to the active bdi_list
+ */
+ spin_lock(&bdi_lock);
+ list_add(&bdi->bdi_list, &bdi_list);
+ spin_unlock(&bdi_lock);
+
+ bdi_task_init(bdi, wb);
+
+ /*
+ * Clear pending bit and wakeup anybody waiting to tear us down
+ */
+ clear_bit(BDI_pending, &bdi->state);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&bdi->state, BDI_pending);
+
+ ret = bdi_writeback_task(wb);
+
+ /*
+ * Remove us from the list
+ */
+ spin_lock(&bdi->wb_lock);
+ list_del_rcu(&wb->list);
+ spin_unlock(&bdi->wb_lock);
+
+ /*
+ * Flush any work that raced with us exiting. No new work
+ * will be added, since this bdi isn't discoverable anymore.
+ */
+ if (!list_empty(&bdi->work_list))
+ wb_do_writeback(wb, 1);
+
+ wb->task = NULL;
+ return ret;
+}
+
+int bdi_has_dirty_io(struct backing_dev_info *bdi)
+{
+ return wb_has_dirty_io(&bdi->wb);
+}
+
+static void bdi_flush_io(struct backing_dev_info *bdi)
+{
+ struct writeback_control wbc = {
+ .bdi = bdi,
+ .sync_mode = WB_SYNC_NONE,
+ .older_than_this = NULL,
+ .range_cyclic = 1,
+ .nr_to_write = 1024,
+ };
+
+ writeback_inodes_wbc(&wbc);
+}
+
+/*
+ * kupdated() used to do this. We cannot do it from the bdi_forker_task()
+ * or we risk deadlocking on ->s_umount. The longer term solution would be
+ * to implement sync_supers_bdi() or similar and simply do it from the
+ * bdi writeback tasks individually.
+ */
+static int bdi_sync_supers(void *unused)
+{
+ set_user_nice(current, 0);
+
+ while (!kthread_should_stop()) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+
+ /*
+ * Do this periodically, like kupdated() did before.
+ */
+ sync_supers();
+ }
+
+ return 0;
+}
+
+static void arm_supers_timer(void)
+{
+ unsigned long next;
+
+ next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
+ mod_timer(&sync_supers_timer, round_jiffies_up(next));
+}
+
+static void sync_supers_timer_fn(unsigned long unused)
+{
+ wake_up_process(sync_supers_tsk);
+ arm_supers_timer();
+}
+
+static int bdi_forker_task(void *ptr)
+{
+ struct bdi_writeback *me = ptr;
+
+ bdi_task_init(me->bdi, me);
+
+ for (;;) {
+ struct backing_dev_info *bdi, *tmp;
+ struct bdi_writeback *wb;
+
+ /*
+ * Temporary measure, we want to make sure we don't see
+ * dirty data on the default backing_dev_info
+ */
+ if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
+ wb_do_writeback(me, 0);
+
+ spin_lock(&bdi_lock);
+
+ /*
+ * Check if any existing bdi's have dirty data without
+ * a thread registered. If so, set that up.
+ */
+ list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
+ if (bdi->wb.task)
+ continue;
+ if (list_empty(&bdi->work_list) &&
+ !bdi_has_dirty_io(bdi))
+ continue;
+
+ bdi_add_default_flusher_task(bdi);
+ }
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (list_empty(&bdi_pending_list)) {
+ unsigned long wait;
+
+ spin_unlock(&bdi_lock);
+ wait = msecs_to_jiffies(dirty_writeback_interval * 10);
+ schedule_timeout(wait);
+ try_to_freeze();
+ continue;
+ }
+
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * This is our real job - check for pending entries in
+ * bdi_pending_list, and create the tasks that got added
+ */
+ bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
+ bdi_list);
+ list_del_init(&bdi->bdi_list);
+ spin_unlock(&bdi_lock);
+
+ wb = &bdi->wb;
+ wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
+ dev_name(bdi->dev));
+ /*
+ * If task creation fails, then readd the bdi to
+ * the pending list and force writeout of the bdi
+ * from this forker thread. That will free some memory
+ * and we can try again.
+ */
+ if (IS_ERR(wb->task)) {
+ wb->task = NULL;
+
+ /*
+ * Add this 'bdi' to the back, so we get
+ * a chance to flush other bdi's to free
+ * memory.
+ */
+ spin_lock(&bdi_lock);
+ list_add_tail(&bdi->bdi_list, &bdi_pending_list);
+ spin_unlock(&bdi_lock);
+
+ bdi_flush_io(bdi);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Add the default flusher task that gets created for any bdi
+ * that has dirty data pending writeout
+ */
+void static bdi_add_default_flusher_task(struct backing_dev_info *bdi)
+{
+ if (!bdi_cap_writeback_dirty(bdi))
+ return;
+
+ /*
+ * Check with the helper whether to proceed adding a task. Will only
+ * abort if we two or more simultanous calls to
+ * bdi_add_default_flusher_task() occured, further additions will block
+ * waiting for previous additions to finish.
+ */
+ if (!test_and_set_bit(BDI_pending, &bdi->state)) {
+ list_move_tail(&bdi->bdi_list, &bdi_pending_list);
+
+ /*
+ * We are now on the pending list, wake up bdi_forker_task()
+ * to finish the job and add us back to the active bdi_list
+ */
+ wake_up_process(default_backing_dev_info.wb.task);
+ }
+}
+
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, ...)
{
@@ -213,13 +469,34 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
goto exit;
}
- mutex_lock(&bdi_lock);
+ spin_lock(&bdi_lock);
list_add_tail(&bdi->bdi_list, &bdi_list);
- mutex_unlock(&bdi_lock);
+ spin_unlock(&bdi_lock);
bdi->dev = dev;
- bdi_debug_register(bdi, dev_name(dev));
+ /*
+ * Just start the forker thread for our default backing_dev_info,
+ * and add other bdi's to the list. They will get a thread created
+ * on-demand when they need it.
+ */
+ if (bdi_cap_flush_forker(bdi)) {
+ struct bdi_writeback *wb = &bdi->wb;
+
+ wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s",
+ dev_name(dev));
+ if (IS_ERR(wb->task)) {
+ wb->task = NULL;
+ ret = -ENOMEM;
+
+ spin_lock(&bdi_lock);
+ list_del(&bdi->bdi_list);
+ spin_unlock(&bdi_lock);
+ goto exit;
+ }
+ }
+
+ bdi_debug_register(bdi, dev_name(dev));
exit:
return ret;
}
@@ -231,17 +508,42 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
}
EXPORT_SYMBOL(bdi_register_dev);
-static void bdi_remove_from_list(struct backing_dev_info *bdi)
+/*
+ * Remove bdi from the global list and shutdown any threads we have running
+ */
+static void bdi_wb_shutdown(struct backing_dev_info *bdi)
{
- mutex_lock(&bdi_lock);
+ struct bdi_writeback *wb;
+
+ if (!bdi_cap_writeback_dirty(bdi))
+ return;
+
+ /*
+ * If setup is pending, wait for that to complete first
+ */
+ wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
+ TASK_UNINTERRUPTIBLE);
+
+ /*
+ * Make sure nobody finds us on the bdi_list anymore
+ */
+ spin_lock(&bdi_lock);
list_del(&bdi->bdi_list);
- mutex_unlock(&bdi_lock);
+ spin_unlock(&bdi_lock);
+
+ /*
+ * Finally, kill the kernel threads. We don't need to be RCU
+ * safe anymore, since the bdi is gone from visibility.
+ */
+ list_for_each_entry(wb, &bdi->wb_list, list)
+ kthread_stop(wb->task);
}
void bdi_unregister(struct backing_dev_info *bdi)
{
if (bdi->dev) {
- bdi_remove_from_list(bdi);
+ if (!bdi_cap_flush_forker(bdi))
+ bdi_wb_shutdown(bdi);
bdi_debug_unregister(bdi);
device_unregister(bdi->dev);
bdi->dev = NULL;
@@ -251,18 +553,25 @@ EXPORT_SYMBOL(bdi_unregister);
int bdi_init(struct backing_dev_info *bdi)
{
- int i;
- int err;
+ int i, err;
bdi->dev = NULL;
bdi->min_ratio = 0;
bdi->max_ratio = 100;
bdi->max_prop_frac = PROP_FRAC_BASE;
+ spin_lock_init(&bdi->wb_lock);
INIT_LIST_HEAD(&bdi->bdi_list);
- INIT_LIST_HEAD(&bdi->b_io);
- INIT_LIST_HEAD(&bdi->b_dirty);
- INIT_LIST_HEAD(&bdi->b_more_io);
+ INIT_LIST_HEAD(&bdi->wb_list);
+ INIT_LIST_HEAD(&bdi->work_list);
+
+ bdi_wb_init(&bdi->wb, bdi);
+
+ /*
+ * Just one thread support for now, hard code mask and count
+ */
+ bdi->wb_mask = 1;
+ bdi->wb_cnt = 1;
for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
err = percpu_counter_init(&bdi->bdi_stat[i], 0);
@@ -277,8 +586,6 @@ int bdi_init(struct backing_dev_info *bdi)
err:
while (i--)
percpu_counter_destroy(&bdi->bdi_stat[i]);
-
- bdi_remove_from_list(bdi);
}
return err;
@@ -289,9 +596,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
{
int i;
- WARN_ON(!list_empty(&bdi->b_dirty));
- WARN_ON(!list_empty(&bdi->b_io));
- WARN_ON(!list_empty(&bdi->b_more_io));
+ WARN_ON(bdi_has_dirty_io(bdi));
bdi_unregister(bdi);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index f8341b6019bf..25e7770309b8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,15 +36,6 @@
#include <linux/pagevec.h>
/*
- * The maximum number of pages to writeout in a single bdflush/kupdate
- * operation. We do this so we don't hold I_SYNC against an inode for
- * enormous amounts of time, which would block a userspace task which has
- * been forced to throttle against that inode. Also, the code reevaluates
- * the dirty each time it has written this many pages.
- */
-#define MAX_WRITEBACK_PAGES 1024
-
-/*
* After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
* will look to see if it needs to force writeback or throttling.
*/
@@ -117,8 +108,6 @@ EXPORT_SYMBOL(laptop_mode);
/* End of sysctl-exported parameters */
-static void background_writeout(unsigned long _min_pages);
-
/*
* Scale the writeback cache size proportional to the relative writeout speeds.
*
@@ -326,7 +315,7 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{
int ret = 0;
- mutex_lock(&bdi_lock);
+ spin_lock(&bdi_lock);
if (min_ratio > bdi->max_ratio) {
ret = -EINVAL;
} else {
@@ -338,7 +327,7 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
ret = -EINVAL;
}
}
- mutex_unlock(&bdi_lock);
+ spin_unlock(&bdi_lock);
return ret;
}
@@ -350,14 +339,14 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
if (max_ratio > 100)
return -EINVAL;
- mutex_lock(&bdi_lock);
+ spin_lock(&bdi_lock);
if (bdi->min_ratio > max_ratio) {
ret = -EINVAL;
} else {
bdi->max_ratio = max_ratio;
bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
}
- mutex_unlock(&bdi_lock);
+ spin_unlock(&bdi_lock);
return ret;
}
@@ -543,7 +532,7 @@ static void balance_dirty_pages(struct address_space *mapping)
* up.
*/
if (bdi_nr_reclaimable > bdi_thresh) {
- writeback_inodes(&wbc);
+ writeback_inodes_wbc(&wbc);
pages_written += write_chunk - wbc.nr_to_write;
get_dirty_limits(&background_thresh, &dirty_thresh,
&bdi_thresh, bdi);
@@ -572,7 +561,7 @@ static void balance_dirty_pages(struct address_space *mapping)
if (pages_written >= write_chunk)
break; /* We've done our duty */
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ schedule_timeout(1);
}
if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
@@ -591,10 +580,18 @@ static void balance_dirty_pages(struct address_space *mapping)
* background_thresh, to keep the amount of dirty memory low.
*/
if ((laptop_mode && pages_written) ||
- (!laptop_mode && (global_page_state(NR_FILE_DIRTY)
- + global_page_state(NR_UNSTABLE_NFS)
- > background_thresh)))
- pdflush_operation(background_writeout, 0);
+ (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY)
+ + global_page_state(NR_UNSTABLE_NFS))
+ > background_thresh))) {
+ struct writeback_control wbc = {
+ .bdi = bdi,
+ .sync_mode = WB_SYNC_NONE,
+ .nr_to_write = nr_writeback,
+ };
+
+
+ bdi_start_writeback(&wbc);
+ }
}
void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -678,153 +675,35 @@ void throttle_vm_writeout(gfp_t gfp_mask)
}
}
-/*
- * writeback at least _min_pages, and keep writing until the amount of dirty
- * memory is less than the background threshold, or until we're all clean.
- */
-static void background_writeout(unsigned long _min_pages)
-{
- long min_pages = _min_pages;
- struct writeback_control wbc = {
- .bdi = NULL,
- .sync_mode = WB_SYNC_NONE,
- .older_than_this = NULL,
- .nr_to_write = 0,
- .nonblocking = 1,
- .range_cyclic = 1,
- };
-
- for ( ; ; ) {
- unsigned long background_thresh;
- unsigned long dirty_thresh;
-
- get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
- if (global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) < background_thresh
- && min_pages <= 0)
- break;
- wbc.more_io = 0;
- wbc.encountered_congestion = 0;
- wbc.nr_to_write = MAX_WRITEBACK_PAGES;
- wbc.pages_skipped = 0;
- writeback_inodes(&wbc);
- min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
- if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
- /* Wrote less than expected */
- if (wbc.encountered_congestion || wbc.more_io)
- congestion_wait(BLK_RW_ASYNC, HZ/10);
- else
- break;
- }
- }
-}
-
-/*
- * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
- * the whole world. Returns 0 if a pdflush thread was dispatched. Returns
- * -1 if all pdflush threads were busy.
- */
-int wakeup_pdflush(long nr_pages)
-{
- if (nr_pages == 0)
- nr_pages = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
- return pdflush_operation(background_writeout, nr_pages);
-}
-
-static void wb_timer_fn(unsigned long unused);
static void laptop_timer_fn(unsigned long unused);
-static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
/*
- * Periodic writeback of "old" data.
- *
- * Define "old": the first time one of an inode's pages is dirtied, we mark the
- * dirtying-time in the inode's address_space. So this periodic writeback code
- * just walks the superblock inode list, writing back any inodes which are
- * older than a specific point in time.
- *
- * Try to run once per dirty_writeback_interval. But if a writeback event
- * takes longer than a dirty_writeback_interval interval, then leave a
- * one-second gap.
- *
- * older_than_this takes precedence over nr_to_write. So we'll only write back
- * all dirty pages if they are all attached to "old" mappings.
- */
-static void wb_kupdate(unsigned long arg)
-{
- unsigned long oldest_jif;
- unsigned long start_jif;
- unsigned long next_jif;
- long nr_to_write;
- struct writeback_control wbc = {
- .bdi = NULL,
- .sync_mode = WB_SYNC_NONE,
- .older_than_this = &oldest_jif,
- .nr_to_write = 0,
- .nonblocking = 1,
- .for_kupdate = 1,
- .range_cyclic = 1,
- };
-
- sync_supers();
-
- oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10);
- start_jif = jiffies;
- next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
- nr_to_write = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) +
- (inodes_stat.nr_inodes - inodes_stat.nr_unused);
- while (nr_to_write > 0) {
- wbc.more_io = 0;
- wbc.encountered_congestion = 0;
- wbc.nr_to_write = MAX_WRITEBACK_PAGES;
- writeback_inodes(&wbc);
- if (wbc.nr_to_write > 0) {
- if (wbc.encountered_congestion || wbc.more_io)
- congestion_wait(BLK_RW_ASYNC, HZ/10);
- else
- break; /* All the old data is written */
- }
- nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
- }
- if (time_before(next_jif, jiffies + HZ))
- next_jif = jiffies + HZ;
- if (dirty_writeback_interval)
- mod_timer(&wb_timer, next_jif);
-}
-
-/*
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
int dirty_writeback_centisecs_handler(ctl_table *table, int write,
struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
proc_dointvec(table, write, file, buffer, length, ppos);
- if (dirty_writeback_interval)
- mod_timer(&wb_timer, jiffies +
- msecs_to_jiffies(dirty_writeback_interval * 10));
- else
- del_timer(&wb_timer);
return 0;
}
-static void wb_timer_fn(unsigned long unused)
-{
- if (pdflush_operation(wb_kupdate, 0) < 0)
- mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
-}
-
-static void laptop_flush(unsigned long unused)
+static void do_laptop_sync(struct work_struct *work)
{
- sys_sync();
+ wakeup_flusher_threads(0);
+ kfree(work);
}
static void laptop_timer_fn(unsigned long unused)
{
- pdflush_operation(laptop_flush, 0);
+ struct work_struct *work;
+
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (work) {
+ INIT_WORK(work, do_laptop_sync);
+ schedule_work(work);
+ }
}
/*
@@ -907,8 +786,6 @@ void __init page_writeback_init(void)
{
int shift;
- mod_timer(&wb_timer,
- jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
writeback_set_ratelimit();
register_cpu_notifier(&ratelimit_nb);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 94e86dd6954c..ba8228e0a806 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1720,7 +1720,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
*/
if (total_scanned > sc->swap_cluster_max +
sc->swap_cluster_max / 2) {
- wakeup_pdflush(laptop_mode ? 0 : total_scanned);
+ wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
sc->may_writepage = 1;
}