From 02afc27faec94c9e068517a22acf55400976c698 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Sep 2013 15:04:40 +0200 Subject: direct-io: Handle O_(D)SYNC AIO Call generic_write_sync() from the deferred I/O completion handler if O_DSYNC is set for a write request. Also make sure various callers don't call generic_write_sync if the direct I/O code returns -EIOCBQUEUED. Based on an earlier patch from Jan Kara with updates from Jeff Moyer and Darrick J. Wong . Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Al Viro --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 4b51ac1acae7..731a2c24532d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2550,7 +2550,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); - if (ret > 0 || ret == -EIOCBQUEUED) { + if (ret > 0) { ssize_t err; err = generic_write_sync(file, pos, ret); -- cgit v1.2.3-59-g8ed1b From 5e4c0d974139a98741b829b27cf38dc8f9284490 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 11 Sep 2013 14:26:05 -0700 Subject: lib/radix-tree.c: make radix_tree_node_alloc() work correctly within interrupt With users of radix_tree_preload() run from interrupt (block/blk-ioc.c is one such possible user), the following race can happen: radix_tree_preload() ... radix_tree_insert() radix_tree_node_alloc() if (rtp->nr) { ret = rtp->nodes[rtp->nr - 1]; ... radix_tree_preload() ... radix_tree_insert() radix_tree_node_alloc() if (rtp->nr) { ret = rtp->nodes[rtp->nr - 1]; And we give out one radix tree node twice. That clearly results in radix tree corruption with different results (usually OOPS) depending on which two users of radix tree race. We fix the problem by making radix_tree_node_alloc() always allocate fresh radix tree nodes when in interrupt. Using preloading when in interrupt doesn't make sense since all the allocations have to be atomic anyway and we cannot steal nodes from process-context users because some users rely on radix_tree_insert() succeeding after radix_tree_preload(). in_interrupt() check is somewhat ugly but we cannot simply key off passed gfp_mask as that is acquired from root_gfp_mask() and thus the same for all preload users. Another part of the fix is to avoid node preallocation in radix_tree_preload() when passed gfp_mask doesn't allow waiting. Again, preallocation in such case doesn't make sense and when preallocation would happen in interrupt we could possibly leak some allocated nodes. However, some users of radix_tree_preload() require following radix_tree_insert() to succeed. To avoid unexpected effects for these users, radix_tree_preload() only warns if passed gfp mask doesn't allow waiting and we provide a new function radix_tree_maybe_preload() for those users which get different gfp mask from different call sites and which are prepared to handle radix_tree_insert() failure. Signed-off-by: Jan Kara Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/blk-ioc.c | 2 +- fs/fscache/page.c | 2 +- include/linux/radix-tree.h | 1 + lib/radix-tree.c | 41 +++++++++++++++++++++++++++++++++++++++-- mm/filemap.c | 2 +- mm/shmem.c | 2 +- mm/swap_state.c | 4 ++-- 7 files changed, 46 insertions(+), 8 deletions(-) (limited to 'mm/filemap.c') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 4464c823cff2..46cd7bd18b34 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -367,7 +367,7 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, if (!icq) return NULL; - if (radix_tree_preload(gfp_mask) < 0) { + if (radix_tree_maybe_preload(gfp_mask) < 0) { kmem_cache_free(et->icq_cache, icq); return NULL; } diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 8702b732109a..73899c1c3449 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -913,7 +913,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, (1 << FSCACHE_OP_WAITING) | (1 << FSCACHE_OP_UNUSE_COOKIE); - ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); + ret = radix_tree_maybe_preload(gfp & ~__GFP_HIGHMEM); if (ret < 0) goto nomem_free; diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index ffc444c38b0a..403940787be1 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -231,6 +231,7 @@ unsigned long radix_tree_next_hole(struct radix_tree_root *root, unsigned long radix_tree_prev_hole(struct radix_tree_root *root, unsigned long index, unsigned long max_scan); int radix_tree_preload(gfp_t gfp_mask); +int radix_tree_maybe_preload(gfp_t gfp_mask); void radix_tree_init(void); void *radix_tree_tag_set(struct radix_tree_root *root, unsigned long index, unsigned int tag); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index e7964296fd50..7811ed3b4e70 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -32,6 +32,7 @@ #include #include #include +#include /* in_interrupt() */ #ifdef __KERNEL__ @@ -207,7 +208,12 @@ radix_tree_node_alloc(struct radix_tree_root *root) struct radix_tree_node *ret = NULL; gfp_t gfp_mask = root_gfp_mask(root); - if (!(gfp_mask & __GFP_WAIT)) { + /* + * Preload code isn't irq safe and it doesn't make sence to use + * preloading in the interrupt anyway as all the allocations have to + * be atomic. So just do normal allocation when in interrupt. + */ + if (!(gfp_mask & __GFP_WAIT) && !in_interrupt()) { struct radix_tree_preload *rtp; /* @@ -264,7 +270,7 @@ radix_tree_node_free(struct radix_tree_node *node) * To make use of this facility, the radix tree must be initialised without * __GFP_WAIT being passed to INIT_RADIX_TREE(). */ -int radix_tree_preload(gfp_t gfp_mask) +static int __radix_tree_preload(gfp_t gfp_mask) { struct radix_tree_preload *rtp; struct radix_tree_node *node; @@ -288,8 +294,39 @@ int radix_tree_preload(gfp_t gfp_mask) out: return ret; } + +/* + * Load up this CPU's radix_tree_node buffer with sufficient objects to + * ensure that the addition of a single element in the tree cannot fail. On + * success, return zero, with preemption disabled. On error, return -ENOMEM + * with preemption not disabled. + * + * To make use of this facility, the radix tree must be initialised without + * __GFP_WAIT being passed to INIT_RADIX_TREE(). + */ +int radix_tree_preload(gfp_t gfp_mask) +{ + /* Warn on non-sensical use... */ + WARN_ON_ONCE(!(gfp_mask & __GFP_WAIT)); + return __radix_tree_preload(gfp_mask); +} EXPORT_SYMBOL(radix_tree_preload); +/* + * The same as above function, except we don't guarantee preloading happens. + * We do it, if we decide it helps. On success, return zero with preemption + * disabled. On error, return -ENOMEM with preemption not disabled. + */ +int radix_tree_maybe_preload(gfp_t gfp_mask) +{ + if (gfp_mask & __GFP_WAIT) + return __radix_tree_preload(gfp_mask); + /* Preloading doesn't help anything with this gfp mask, skip it */ + preempt_disable(); + return 0; +} +EXPORT_SYMBOL(radix_tree_maybe_preload); + /* * Return the maximum key which can be store into a * radix tree with height HEIGHT. diff --git a/mm/filemap.c b/mm/filemap.c index 731a2c24532d..e607728db4a8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -469,7 +469,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, if (error) goto out; - error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); + error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); if (error == 0) { page_cache_get(page); page->mapping = mapping; diff --git a/mm/shmem.c b/mm/shmem.c index 526149846d0a..a1b8bf4391c2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1205,7 +1205,7 @@ repeat: gfp & GFP_RECLAIM_MASK); if (error) goto decused; - error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); + error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, gfp, NULL); diff --git a/mm/swap_state.c b/mm/swap_state.c index f24ab0dff554..e6f15f8ca2af 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -122,7 +122,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) { int error; - error = radix_tree_preload(gfp_mask); + error = radix_tree_maybe_preload(gfp_mask); if (!error) { error = __add_to_swap_cache(page, entry); radix_tree_preload_end(); @@ -328,7 +328,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * call radix_tree_preload() while we can wait. */ - err = radix_tree_preload(gfp_mask & GFP_KERNEL); + err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL); if (err) break; -- cgit v1.2.3-59-g8ed1b From 519e52473ebe9db5cdef44670d5a97f1fd53d721 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Sep 2013 15:13:42 -0700 Subject: mm: memcg: enable memcg OOM killer only for user faults System calls and kernel faults (uaccess, gup) can handle an out of memory situation gracefully and just return -ENOMEM. Enable the memcg OOM killer only for user faults, where it's really the only option available. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: azurIt Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 3 +++ mm/filemap.c | 11 ++++++++++- mm/memcontrol.c | 2 +- mm/memory.c | 40 ++++++++++++++++++++++++++++++---------- 5 files changed, 88 insertions(+), 12 deletions(-) (limited to 'mm/filemap.c') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d4d1f9b0dbba..34ac6497d01a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -151,6 +151,37 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, extern void mem_cgroup_replace_page_cache(struct page *oldpage, struct page *newpage); +/** + * mem_cgroup_toggle_oom - toggle the memcg OOM killer for the current task + * @new: true to enable, false to disable + * + * Toggle whether a failed memcg charge should invoke the OOM killer + * or just return -ENOMEM. Returns the previous toggle state. + */ +static inline bool mem_cgroup_toggle_oom(bool new) +{ + bool old; + + old = current->memcg_oom.may_oom; + current->memcg_oom.may_oom = new; + + return old; +} + +static inline void mem_cgroup_enable_oom(void) +{ + bool old = mem_cgroup_toggle_oom(true); + + WARN_ON(old == true); +} + +static inline void mem_cgroup_disable_oom(void) +{ + bool old = mem_cgroup_toggle_oom(false); + + WARN_ON(old == false); +} + #ifdef CONFIG_MEMCG_SWAP extern int do_swap_account; #endif @@ -383,6 +414,19 @@ static inline void mem_cgroup_end_update_page_stat(struct page *page, { } +static inline bool mem_cgroup_toggle_oom(bool new) +{ + return false; +} + +static inline void mem_cgroup_enable_oom(void) +{ +} + +static inline void mem_cgroup_disable_oom(void) +{ +} + static inline void mem_cgroup_inc_page_stat(struct page *page, enum mem_cgroup_page_stat_item idx) { diff --git a/include/linux/sched.h b/include/linux/sched.h index 45f254dddafc..9ce1fa53031f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1393,6 +1393,9 @@ struct task_struct { unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ } memcg_batch; unsigned int memcg_kmem_skip_account; + struct memcg_oom_info { + unsigned int may_oom:1; + } memcg_oom; #endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; diff --git a/mm/filemap.c b/mm/filemap.c index e607728db4a8..e3b6fc8c0b7b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1614,6 +1614,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = mapping->host; pgoff_t offset = vmf->pgoff; struct page *page; + bool memcg_oom; pgoff_t size; int ret = 0; @@ -1622,7 +1623,11 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return VM_FAULT_SIGBUS; /* - * Do we have something in the page cache already? + * Do we have something in the page cache already? Either + * way, try readahead, but disable the memcg OOM killer for it + * as readahead is optional and no errors are propagated up + * the fault stack. The OOM killer is enabled while trying to + * instantiate the faulting page individually below. */ page = find_get_page(mapping, offset); if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { @@ -1630,10 +1635,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * We found the page, so try async readahead before * waiting for the lock. */ + memcg_oom = mem_cgroup_toggle_oom(false); do_async_mmap_readahead(vma, ra, file, page, offset); + mem_cgroup_toggle_oom(memcg_oom); } else if (!page) { /* No page in the page cache at all */ + memcg_oom = mem_cgroup_toggle_oom(false); do_sync_mmap_readahead(vma, ra, file, offset); + mem_cgroup_toggle_oom(memcg_oom); count_vm_event(PGMAJFAULT); mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c4524458b7d0..0980bbf6438d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2454,7 +2454,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, return CHARGE_RETRY; /* If we don't need to call oom-killer at el, return immediately */ - if (!oom_check) + if (!oom_check || !current->memcg_oom.may_oom) return CHARGE_NOMEM; /* check OOM */ if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize))) diff --git a/mm/memory.c b/mm/memory.c index 2b73dbde2274..a8f9deab8719 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3754,22 +3754,14 @@ unlock: /* * By the time we get here, we already hold the mm semaphore */ -int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, unsigned int flags) +static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, unsigned int flags) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; - __set_current_state(TASK_RUNNING); - - count_vm_event(PGFAULT); - mem_cgroup_count_vm_event(mm, PGFAULT); - - /* do counter updates before entering really critical section. */ - check_sync_rss_stat(current); - if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, flags); @@ -3850,6 +3842,34 @@ retry: return handle_pte_fault(mm, vma, address, pte, pmd, flags); } +int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +{ + int ret; + + __set_current_state(TASK_RUNNING); + + count_vm_event(PGFAULT); + mem_cgroup_count_vm_event(mm, PGFAULT); + + /* do counter updates before entering really critical section. */ + check_sync_rss_stat(current); + + /* + * Enable the memcg OOM handling for faults triggered in user + * space. Kernel faults are handled more gracefully. + */ + if (flags & FAULT_FLAG_USER) + mem_cgroup_enable_oom(); + + ret = __handle_mm_fault(mm, vma, address, flags); + + if (flags & FAULT_FLAG_USER) + mem_cgroup_disable_oom(); + + return ret; +} + #ifndef __PAGETABLE_PUD_FOLDED /* * Allocate page upper directory. -- cgit v1.2.3-59-g8ed1b From 66a0c8ee3dce78362d59f00a8efbd752fbeddfb1 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 12 Sep 2013 15:13:59 -0700 Subject: mm: cleanup add_to_page_cache_locked() Make add_to_page_cache_locked() cleaner: - unindent most code of the function by inverting one condition; - streamline code no-error path; - move insert error path outside normal code path; - call radix_tree_preload_end() earlier; No functional changes. Signed-off-by: Kirill A. Shutemov Acked-by: Dave Hansen Cc: Andrea Arcangeli Cc: Al Viro Cc: Hugh Dickins Cc: Wu Fengguang Cc: Jan Kara Cc: Mel Gorman Cc: Andi Kleen Cc: Matthew Wilcox Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 48 +++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index e3b6fc8c0b7b..1e6aec4a2d2e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -467,32 +467,34 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, error = mem_cgroup_cache_charge(page, current->mm, gfp_mask & GFP_RECLAIM_MASK); if (error) - goto out; + return error; error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); - if (error == 0) { - page_cache_get(page); - page->mapping = mapping; - page->index = offset; - - spin_lock_irq(&mapping->tree_lock); - error = radix_tree_insert(&mapping->page_tree, offset, page); - if (likely(!error)) { - mapping->nrpages++; - __inc_zone_page_state(page, NR_FILE_PAGES); - spin_unlock_irq(&mapping->tree_lock); - trace_mm_filemap_add_to_page_cache(page); - } else { - page->mapping = NULL; - /* Leave page->index set: truncation relies upon it */ - spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_cache_page(page); - page_cache_release(page); - } - radix_tree_preload_end(); - } else + if (error) { mem_cgroup_uncharge_cache_page(page); -out: + return error; + } + + page_cache_get(page); + page->mapping = mapping; + page->index = offset; + + spin_lock_irq(&mapping->tree_lock); + error = radix_tree_insert(&mapping->page_tree, offset, page); + radix_tree_preload_end(); + if (unlikely(error)) + goto err_insert; + mapping->nrpages++; + __inc_zone_page_state(page, NR_FILE_PAGES); + spin_unlock_irq(&mapping->tree_lock); + trace_mm_filemap_add_to_page_cache(page); + return 0; +err_insert: + page->mapping = NULL; + /* Leave page->index set: truncation relies upon it */ + spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_cache_page(page); + page_cache_release(page); return error; } EXPORT_SYMBOL(add_to_page_cache_locked); -- cgit v1.2.3-59-g8ed1b