From 7512102cf64d36e3c7444480273623c7aab3563f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 5 Mar 2012 14:59:18 -0800 Subject: memcg: fix GPF when cgroup removal races with last exit When moving tasks from old memcg (with move_charge_at_immigrate on new memcg), followed by removal of old memcg, hit General Protection Fault in mem_cgroup_lru_del_list() (called from release_pages called from free_pages_and_swap_cache from tlb_flush_mmu from tlb_finish_mmu from exit_mmap from mmput from exit_mm from do_exit). Somewhat reproducible, takes a few hours: the old struct mem_cgroup has been freed and poisoned by SLAB_DEBUG, but mem_cgroup_lru_del_list() is still trying to update its stats, and take page off lru before freeing. A task, or a charge, or a page on lru: each secures a memcg against removal. In this case, the last task has been moved out of the old memcg, and it is exiting: anonymous pages are uncharged one by one from the memcg, as they are zapped from its pagetables, so the charge gets down to 0; but the pages themselves are queued in an mmu_gather for freeing. Most of those pages will be on lru (and force_empty is careful to lru_add_drain_all, to add pages from pagevec to lru first), but not necessarily all: perhaps some have been isolated for page reclaim, perhaps some isolated for other reasons. So, force_empty may find no task, no charge and no page on lru, and let the removal proceed. There would still be no problem if these pages were immediately freed; but typically (and the put_page_testzero protocol demands it) they have to be added back to lru before they are found freeable, then removed from lru and freed. We don't see the issue when adding, because the mem_cgroup_iter() loops keep their own reference to the memcg being scanned; but when it comes to mem_cgroup_lru_del_list(). I believe this was not an issue in v3.2: there, PageCgroupAcctLRU and PageCgroupUsed flags were used (like a trick with mirrors) to deflect view of pc->mem_cgroup to the stable root_mem_cgroup when neither set. 38c5d72f3ebe ("memcg: simplify LRU handling by new rule") mercifully removed those convolutions, but left this General Protection Fault. But it's surprisingly easy to restore the old behaviour: just check PageCgroupUsed in mem_cgroup_lru_add_list() (which decides on which lruvec to add), and reset pc to root_mem_cgroup if page is uncharged. A risky change? just going back to how it worked before; testing, and an audit of uses of pc->mem_cgroup, show no problem. And there's a nice bonus: with mem_cgroup_lru_add_list() itself making sure that an uncharged page goes to root lru, mem_cgroup_reset_owner() no longer has any purpose, and we can safely revert 4e5f01c2b9b9 ("memcg: clear pc->mem_cgroup if necessary"). Calling update_page_reclaim_stat() after add_page_to_lru_list() in swap.c is not strictly necessary: the lru_lock there, with RCU before memcg structures are freed, makes mem_cgroup_get_reclaim_stat_from_page safe without that; but it seems cleaner to rely on one dependency less. Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 4d34356fe644..b80de520670b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -129,7 +129,6 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, extern void mem_cgroup_replace_page_cache(struct page *oldpage, struct page *newpage); -extern void mem_cgroup_reset_owner(struct page *page); #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP extern int do_swap_account; #endif @@ -392,10 +391,6 @@ static inline void mem_cgroup_replace_page_cache(struct page *oldpage, struct page *newpage) { } - -static inline void mem_cgroup_reset_owner(struct page *page) -{ -} #endif /* CONFIG_CGROUP_MEM_CONT */ #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM) -- cgit v1.2.3-59-g8ed1b From e845e199362cc5712ba0e7eedc14eed70e144258 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 21 Mar 2012 16:34:10 -0700 Subject: mm, memcg: pass charge order to oom killer The oom killer typically displays the allocation order at the time of oom as a part of its diangostic messages (for global, cpuset, and mempolicy ooms). The memory controller may also pass the charge order to the oom killer so it can emit the same information. This is useful in determining how large the memory allocation is that triggered the oom killer. Signed-off-by: David Rientjes Cc: Johannes Weiner Cc: Michal Hocko Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 3 ++- mm/memcontrol.c | 6 +++--- mm/oom_kill.c | 7 ++++--- 3 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b80de520670b..d90965086fae 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -77,7 +77,8 @@ extern void mem_cgroup_uncharge_end(void); extern void mem_cgroup_uncharge_page(struct page *page); extern void mem_cgroup_uncharge_cache_page(struct page *page); -extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask); +extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, + int order); int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg); extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 37281816ff67..bb04067269bc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1811,7 +1811,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) /* * try to call OOM killer. returns false if we should exit memory-reclaim loop. */ -bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask) +bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { struct oom_wait_info owait; bool locked, need_to_kill; @@ -1841,7 +1841,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask) if (need_to_kill) { finish_wait(&memcg_oom_waitq, &owait.wait); - mem_cgroup_out_of_memory(memcg, mask); + mem_cgroup_out_of_memory(memcg, mask, order); } else { schedule(); finish_wait(&memcg_oom_waitq, &owait.wait); @@ -2212,7 +2212,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, if (!oom_check) return CHARGE_NOMEM; /* check OOM */ - if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) + if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize))) return CHARGE_OOM_DIE; return CHARGE_RETRY; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f23f33454645..4198e000f41a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -554,7 +554,8 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, } #ifdef CONFIG_CGROUP_MEM_RES_CTLR -void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask) +void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, + int order) { unsigned long limit; unsigned int points = 0; @@ -570,12 +571,12 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask) return; } - check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); + check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; read_lock(&tasklist_lock); p = select_bad_process(&points, limit, memcg, NULL, false); if (p && PTR_ERR(p) != -1UL) - oom_kill_process(p, gfp_mask, 0, points, limit, memcg, NULL, + oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL, "Memory cgroup out of memory"); read_unlock(&tasklist_lock); } -- cgit v1.2.3-59-g8ed1b From 31a79235fc75b506e282e43723107a40f3bc5c07 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 21 Mar 2012 16:34:18 -0700 Subject: memcg: replace MEM_CONT by MEM_RES_CTLR Correct an #endif comment in memcontrol.h from MEM_CONT to MEM_RES_CTLR. Signed-off-by: Hugh Dickins Reviewed-by: KOSAKI Motohiro Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d90965086fae..e76f10731a86 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -392,7 +392,7 @@ static inline void mem_cgroup_replace_page_cache(struct page *oldpage, struct page *newpage) { } -#endif /* CONFIG_CGROUP_MEM_CONT */ +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */ #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM) static inline bool -- cgit v1.2.3-59-g8ed1b From a710920caedfcf56543136bfea300a6c593f9838 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 21 Mar 2012 16:34:22 -0700 Subject: memcg: kill dead prev_priority stubs This code was removed in 25edde033291 ("vmscan: kill prev_priority completely") Signed-off-by: Konstantin Khlebnikov Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e76f10731a86..c54e5dfa1962 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -299,21 +299,6 @@ static inline void mem_cgroup_iter_break(struct mem_cgroup *root, { } -static inline int mem_cgroup_get_reclaim_priority(struct mem_cgroup *memcg) -{ - return 0; -} - -static inline void mem_cgroup_note_reclaim_priority(struct mem_cgroup *memcg, - int priority) -{ -} - -static inline void mem_cgroup_record_reclaim_priority(struct mem_cgroup *memcg, - int priority) -{ -} - static inline bool mem_cgroup_disabled(void) { return true; -- cgit v1.2.3-59-g8ed1b From 89c06bd52fb9ffceddf84f7309d2e8c9f1666216 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 21 Mar 2012 16:34:25 -0700 Subject: memcg: use new logic for page stat accounting Now, page-stat-per-memcg is recorded into per page_cgroup flag by duplicating page's status into the flag. The reason is that memcg has a feature to move a page from a group to another group and we have race between "move" and "page stat accounting", Under current logic, assume CPU-A and CPU-B. CPU-A does "move" and CPU-B does "page stat accounting". When CPU-A goes 1st, CPU-A CPU-B update "struct page" info. move_lock_mem_cgroup(memcg) see pc->flags copy page stat to new group overwrite pc->mem_cgroup. move_unlock_mem_cgroup(memcg) move_lock_mem_cgroup(mem) set pc->flags update page stat accounting move_unlock_mem_cgroup(mem) stat accounting is guarded by move_lock_mem_cgroup() and "move" logic (CPU-A) doesn't see changes in "struct page" information. But it's costly to have the same information both in 'struct page' and 'struct page_cgroup'. And, there is a potential problem. For example, assume we have PG_dirty accounting in memcg. PG_..is a flag for struct page. PCG_ is a flag for struct page_cgroup. (This is just an example. The same problem can be found in any kind of page stat accounting.) CPU-A CPU-B TestSet PG_dirty (delay) TestClear PG_dirty if (TestClear(PCG_dirty)) memcg->nr_dirty-- if (TestSet(PCG_dirty)) memcg->nr_dirty++ Here, memcg->nr_dirty = +1, this is wrong. This race was reported by Greg Thelen . Now, only FILE_MAPPED is supported but fortunately, it's serialized by page table lock and this is not real bug, _now_, If this potential problem is caused by having duplicated information in struct page and struct page_cgroup, we may be able to fix this by using original 'struct page' information. But we'll have a problem in "move account" Assume we use only PG_dirty. CPU-A CPU-B TestSet PG_dirty (delay) move_lock_mem_cgroup() if (PageDirty(page)) new_memcg->nr_dirty++ pc->mem_cgroup = new_memcg; move_unlock_mem_cgroup() move_lock_mem_cgroup() memcg = pc->mem_cgroup new_memcg->nr_dirty++ accounting information may be double-counted. This was original reason to have PCG_xxx flags but it seems PCG_xxx has another problem. I think we need a bigger lock as move_lock_mem_cgroup(page) TestSetPageDirty(page) update page stats (without any checks) move_unlock_mem_cgroup(page) This fixes both of problems and we don't have to duplicate page flag into page_cgroup. Please note: move_lock_mem_cgroup() is held only when there are possibility of "account move" under the system. So, in most path, status update will go without atomic locks. This patch introduces mem_cgroup_begin_update_page_stat() and mem_cgroup_end_update_page_stat() both should be called at modifying 'struct page' information if memcg takes care of it. as mem_cgroup_begin_update_page_stat() modify page information mem_cgroup_update_page_stat() => never check any 'struct page' info, just update counters. mem_cgroup_end_update_page_stat(). This patch is slow because we need to call begin_update_page_stat()/ end_update_page_stat() regardless of accounted will be changed or not. A following patch adds an easy optimization and reduces the cost. [akpm@linux-foundation.org: s/lock/locked/] [hughd@google.com: fix deadlock by avoiding stat lock when anon] Signed-off-by: KAMEZAWA Hiroyuki Cc: Greg Thelen Acked-by: Johannes Weiner Cc: Michal Hocko Cc: KOSAKI Motohiro Cc: Ying Han Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 35 ++++++++++++++++++++++++++ mm/memcontrol.c | 62 +++++++++++++++++++++++++++++++--------------- mm/rmap.c | 28 ++++++++++++++++++--- 3 files changed, 101 insertions(+), 24 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c54e5dfa1962..bf7ae01fc93b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -141,6 +141,31 @@ static inline bool mem_cgroup_disabled(void) return false; } +void __mem_cgroup_begin_update_page_stat(struct page *page, bool *locked, + unsigned long *flags); + +static inline void mem_cgroup_begin_update_page_stat(struct page *page, + bool *locked, unsigned long *flags) +{ + if (mem_cgroup_disabled()) + return; + rcu_read_lock(); + *locked = false; + return __mem_cgroup_begin_update_page_stat(page, locked, flags); +} + +void __mem_cgroup_end_update_page_stat(struct page *page, + unsigned long *flags); +static inline void mem_cgroup_end_update_page_stat(struct page *page, + bool *locked, unsigned long *flags) +{ + if (mem_cgroup_disabled()) + return; + if (*locked) + __mem_cgroup_end_update_page_stat(page, flags); + rcu_read_unlock(); +} + void mem_cgroup_update_page_stat(struct page *page, enum mem_cgroup_page_stat_item idx, int val); @@ -341,6 +366,16 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { } +static inline void mem_cgroup_begin_update_page_stat(struct page *page, + bool *locked, unsigned long *flags) +{ +} + +static inline void mem_cgroup_end_update_page_stat(struct page *page, + bool *locked, unsigned long *flags) +{ +} + static inline void mem_cgroup_inc_page_stat(struct page *page, enum mem_cgroup_page_stat_item idx) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8afed2819b8f..df1e180f6c30 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1910,32 +1910,59 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order) * If there is, we take a lock. */ +void __mem_cgroup_begin_update_page_stat(struct page *page, + bool *locked, unsigned long *flags) +{ + struct mem_cgroup *memcg; + struct page_cgroup *pc; + + pc = lookup_page_cgroup(page); +again: + memcg = pc->mem_cgroup; + if (unlikely(!memcg || !PageCgroupUsed(pc))) + return; + /* + * If this memory cgroup is not under account moving, we don't + * need to take move_lock_page_cgroup(). Because we already hold + * rcu_read_lock(), any calls to move_account will be delayed until + * rcu_read_unlock() if mem_cgroup_stealed() == true. + */ + if (!mem_cgroup_stealed(memcg)) + return; + + move_lock_mem_cgroup(memcg, flags); + if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { + move_unlock_mem_cgroup(memcg, flags); + goto again; + } + *locked = true; +} + +void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) +{ + struct page_cgroup *pc = lookup_page_cgroup(page); + + /* + * It's guaranteed that pc->mem_cgroup never changes while + * lock is held because a routine modifies pc->mem_cgroup + * should take move_lock_page_cgroup(). + */ + move_unlock_mem_cgroup(pc->mem_cgroup, flags); +} + void mem_cgroup_update_page_stat(struct page *page, enum mem_cgroup_page_stat_item idx, int val) { struct mem_cgroup *memcg; struct page_cgroup *pc = lookup_page_cgroup(page); - bool need_unlock = false; unsigned long uninitialized_var(flags); if (mem_cgroup_disabled()) return; -again: - rcu_read_lock(); + memcg = pc->mem_cgroup; if (unlikely(!memcg || !PageCgroupUsed(pc))) - goto out; - /* pc->mem_cgroup is unstable ? */ - if (unlikely(mem_cgroup_stealed(memcg))) { - /* take a lock against to access pc->mem_cgroup */ - move_lock_mem_cgroup(memcg, &flags); - if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { - move_unlock_mem_cgroup(memcg, &flags); - rcu_read_unlock(); - goto again; - } - need_unlock = true; - } + return; switch (idx) { case MEMCG_NR_FILE_MAPPED: @@ -1950,11 +1977,6 @@ again: } this_cpu_add(memcg->stat->count[idx], val); - -out: - if (unlikely(need_unlock)) - move_unlock_mem_cgroup(memcg, &flags); - rcu_read_unlock(); } /* diff --git a/mm/rmap.c b/mm/rmap.c index ebeb95e9150a..5b5ad584ffb7 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1148,10 +1148,15 @@ void page_add_new_anon_rmap(struct page *page, */ void page_add_file_rmap(struct page *page) { + bool locked; + unsigned long flags; + + mem_cgroup_begin_update_page_stat(page, &locked, &flags); if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); } + mem_cgroup_end_update_page_stat(page, &locked, &flags); } /** @@ -1162,9 +1167,21 @@ void page_add_file_rmap(struct page *page) */ void page_remove_rmap(struct page *page) { + bool anon = PageAnon(page); + bool locked; + unsigned long flags; + + /* + * The anon case has no mem_cgroup page_stat to update; but may + * uncharge_page() below, where the lock ordering can deadlock if + * we hold the lock against page_stat move: so avoid it on anon. + */ + if (!anon) + mem_cgroup_begin_update_page_stat(page, &locked, &flags); + /* page still mapped by someone else? */ if (!atomic_add_negative(-1, &page->_mapcount)) - return; + goto out; /* * Now that the last pte has gone, s390 must transfer dirty @@ -1173,7 +1190,7 @@ void page_remove_rmap(struct page *page) * not if it's in swapcache - there might be another pte slot * containing the swap entry, but page not yet written to swap. */ - if ((!PageAnon(page) || PageSwapCache(page)) && + if ((!anon || PageSwapCache(page)) && page_test_and_clear_dirty(page_to_pfn(page), 1)) set_page_dirty(page); /* @@ -1181,8 +1198,8 @@ void page_remove_rmap(struct page *page) * and not charged by memcg for now. */ if (unlikely(PageHuge(page))) - return; - if (PageAnon(page)) { + goto out; + if (anon) { mem_cgroup_uncharge_page(page); if (!PageTransHuge(page)) __dec_zone_page_state(page, NR_ANON_PAGES); @@ -1202,6 +1219,9 @@ void page_remove_rmap(struct page *page) * Leaving it set also helps swapoff to reinstate ptes * faster for those pages still in swapcache. */ +out: + if (!anon) + mem_cgroup_end_update_page_stat(page, &locked, &flags); } /* -- cgit v1.2.3-59-g8ed1b From 4331f7d339ee0b54603344b9d13662a9c022540c Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 21 Mar 2012 16:34:26 -0700 Subject: memcg: fix performance of mem_cgroup_begin_update_page_stat() mem_cgroup_begin_update_page_stat() should be very fast because it's called very frequently. Now, it needs to look up page_cgroup and its memcg....this is slow. This patch adds a global variable to check "any memcg is moving or not". With this, the caller doesn't need to visit page_cgroup and memcg. Here is a test result. A test program makes page faults onto a file, MAP_SHARED and makes each page's page_mapcount(page) > 1, and free the range by madvise() and page fault again. This program causes 26214400 times of page fault onto a file(size was 1G.) and shows shows the cost of mem_cgroup_begin_update_page_stat(). Before this patch for mem_cgroup_begin_update_page_stat() [kamezawa@bluextal test]$ time ./mmap 1G real 0m21.765s user 0m5.999s sys 0m15.434s 27.46% mmap mmap [.] reader 21.15% mmap [kernel.kallsyms] [k] page_fault 9.17% mmap [kernel.kallsyms] [k] filemap_fault 2.96% mmap [kernel.kallsyms] [k] __do_fault 2.83% mmap [kernel.kallsyms] [k] __mem_cgroup_begin_update_page_stat After this patch [root@bluextal test]# time ./mmap 1G real 0m21.373s user 0m6.113s sys 0m15.016s In usual path, calls to __mem_cgroup_begin_update_page_stat() goes away. Note: we may be able to remove this optimization in future if we can get pointer to memcg directly from struct page. [akpm@linux-foundation.org: don't return a void] Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Greg Thelen Acked-by: Johannes Weiner Cc: Michal Hocko Cc: KOSAKI Motohiro Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ++++- mm/memcontrol.c | 9 ++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index bf7ae01fc93b..f94efd2f6c27 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -144,6 +144,8 @@ static inline bool mem_cgroup_disabled(void) void __mem_cgroup_begin_update_page_stat(struct page *page, bool *locked, unsigned long *flags); +extern atomic_t memcg_moving; + static inline void mem_cgroup_begin_update_page_stat(struct page *page, bool *locked, unsigned long *flags) { @@ -151,7 +153,8 @@ static inline void mem_cgroup_begin_update_page_stat(struct page *page, return; rcu_read_lock(); *locked = false; - return __mem_cgroup_begin_update_page_stat(page, locked, flags); + if (atomic_read(&memcg_moving)) + __mem_cgroup_begin_update_page_stat(page, locked, flags); } void __mem_cgroup_end_update_page_stat(struct page *page, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0e13b2aeea61..eb1004f207b3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1306,8 +1306,13 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg) * rcu_read_unlock() * start move here. */ + +/* for quick checking without looking up memcg */ +atomic_t memcg_moving __read_mostly; + static void mem_cgroup_start_move(struct mem_cgroup *memcg) { + atomic_inc(&memcg_moving); atomic_inc(&memcg->moving_account); synchronize_rcu(); } @@ -1318,8 +1323,10 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg) * Now, mem_cgroup_clear_mc() may call this function with NULL. * We check NULL in callee rather than caller. */ - if (memcg) + if (memcg) { + atomic_dec(&memcg_moving); atomic_dec(&memcg->moving_account); + } } /* -- cgit v1.2.3-59-g8ed1b