From e42d9d5d47961fb5db0be65b56dd52fe7b2421f1 Mon Sep 17 00:00:00 2001
From: Wu Fengguang
Date: Wed, 16 Dec 2009 12:19:59 +0100
Subject: memcg: rename and export try_get_mem_cgroup_from_page()

So that the hwpoison injector can get the mem_cgroup for an arbitrary
page, and thus know whether it is owned by some mem_cgroup task(s).

[AK: Merged with latest git tree]

CC: KOSAKI Motohiro
CC: Hugh Dickins
CC: Daisuke Nishimura
CC: Balbir Singh
Acked-by: KAMEZAWA Hiroyuki
Signed-off-by: Wu Fengguang
Signed-off-by: Andi Kleen
---
 include/linux/memcontrol.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux/memcontrol.h')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index bf9213b2db8f..fc9bae82ac42 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -68,6 +68,7 @@ extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask);
 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem);
 
+extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
 extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
 
 static inline
@@ -189,6 +190,11 @@ mem_cgroup_move_lists(struct page *page,
 					enum lru_list from, enum lru_list to)
 {
 }
 
+static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
+{
+	return NULL;
+}
+
 static inline int mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *mem)
 {
 	return 1;
--
cgit v1.3-8-gc7d7


From d324236b3333e87c8825b35f2104184734020d35 Mon Sep 17 00:00:00 2001
From: Wu Fengguang
Date: Wed, 16 Dec 2009 12:19:59 +0100
Subject: memcg: add accessor to mem_cgroup.css

So that an outside user can free the reference count grabbed by
try_get_mem_cgroup_from_page().
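
For illustration only (the helper below is hypothetical and not part of
this patch), a caller such as the hwpoison injector can pair the two
interfaces like this:

	/*
	 * Sketch: decide whether a page is currently charged to any
	 * mem_cgroup, dropping the css reference taken by the lookup.
	 */
	static bool page_in_some_memcg(struct page *page)
	{
		struct mem_cgroup *mem;

		mem = try_get_mem_cgroup_from_page(page);
		if (!mem)
			return false;		/* not charged to any memcg */
		css_put(mem_cgroup_css(mem));	/* drop the grabbed reference */
		return true;
	}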
CC: KOSAKI Motohiro
CC: Hugh Dickins
CC: Daisuke Nishimura
CC: Balbir Singh
Acked-by: KAMEZAWA Hiroyuki
Signed-off-by: Wu Fengguang
Signed-off-by: Andi Kleen
---
 include/linux/memcontrol.h | 7 +++++++
 mm/memcontrol.c            | 5 +++++
 2 files changed, 12 insertions(+)

(limited to 'include/linux/memcontrol.h')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index fc9bae82ac42..2c30a1116d84 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -81,6 +81,8 @@ int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup)
 	return cgroup == mem;
 }
 
+extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem);
+
 extern int
 mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr);
 extern void mem_cgroup_end_migration(struct mem_cgroup *mem,
@@ -206,6 +208,11 @@ static inline int task_in_mem_cgroup(struct task_struct *task,
 	return 1;
 }
 
+static inline struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
+{
+	return NULL;
+}
+
 static inline int
 mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b5ac61ce7346..9eee80d6d490 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -282,6 +282,11 @@ mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
 	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
 }
 
+struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
+{
+	return &mem->css;
+}
+
 static struct mem_cgroup_per_zone *
 page_cgroup_zoneinfo(struct page_cgroup *pc)
 {
--
cgit v1.3-8-gc7d7


From 569b846df54ffb2827b83ce3244c5f032394cba4 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki
Date: Tue, 15 Dec 2009 16:47:03 -0800
Subject: memcg: coalesce uncharge during unmap/truncate

In a massively parallel environment, res_counter can be a performance
bottleneck.  One strong technique to reduce lock contention is to reduce
the number of calls by coalescing some amount of calls into one.

Considering the charge/uncharge characteristics:
  - charge is done one by one via demand-paging.
  - uncharge is done
      - in chunks at munmap, truncate, exit, execve...
      - one by one via vmscan/paging.

It seems we have a chance to coalesce uncharges to improve scalability
at unmap/truncation.

This patch is for coalescing uncharges.  To avoid scattering memcg's
structure into functions under /mm, it adds memcg batch uncharge
information to the task.  The reason for per-task batching is to make
use of the caller's context information: we do batched (delayed)
uncharge when truncation/unmap occurs, but direct uncharge when the
uncharge is called by memory reclaim (vmscan.c).

The degree of coalescing depends on the caller:
  - at invalidate/truncate ... pagevec size
  - at unmap               ... ZAP_BLOCK_SIZE
(memory itself will be freed in this degree), so we'll not coalesce
too much.
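
As a sketch of the intended calling pattern (illustrative only; the real
call sites added by this patch are unmap_page_range() and the pagevec
loops in mm/truncate.c):

	mem_cgroup_uncharge_start();	/* current->memcg_batch.do_batch++ */
	for (i = 0; i < pagevec_count(&pvec); i++)
		/* accumulates bytes in current->memcg_batch instead of
		 * touching the res_counter for every page */
		mem_cgroup_uncharge_cache_page(pvec.pages[i]);
	mem_cgroup_uncharge_end();	/* one res_counter_uncharge() here */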

On an x86-64 8-CPU server, I tested the overhead of memcg at page fault
by running a program which does map/fault/unmap in a loop, one task per
CPU (pinned with taskset), and summed the number of page faults in 60
seconds.

[without memcg config]
  40156968  page-faults   #  0.085 M/sec   ( +-  0.046% )
  27.67     cache-miss/faults

[root cgroup]
  36659599  page-faults   #  0.077 M/sec   ( +-  0.247% )
  31.58     miss/faults

[in a child cgroup]
  18444157  page-faults   #  0.039 M/sec   ( +-  0.133% )
  69.96     miss/faults

[child with this patch]
  27133719  page-faults   #  0.057 M/sec   ( +-  0.155% )
  47.16     miss/faults

We can see some amount of improvement.  (The root cgroup is not affected
by this patch.)  Another patch, for "charge", will follow this one, and
the above will be improved further.

Changelog (since 2009/10/02):
  - renamed fields of memcg_batch (pages to bytes, memsw to memsw_bytes)
  - some cleanup and commentary/description updates.
  - added initialization code to copy_process(). (possible bug fix)

Changelog (old):
  - fixed !CONFIG_MEM_CGROUP case.
  - rebased onto the latest mmotm + softlimit fix patches.
  - unified patch for callers.
  - added comments.
  - made ->do_batch a bool.
  - removed css_get() et al.; we don't need it.

Signed-off-by: KAMEZAWA Hiroyuki
Cc: Balbir Singh
Cc: Daisuke Nishimura
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memcontrol.h | 13 +++++++
 include/linux/sched.h      |  8 ++++
 kernel/fork.c              |  4 ++
 mm/memcontrol.c            | 96 +++++++++++++++++++++++++++++++++++++++++++---
 mm/memory.c                |  2 +
 mm/truncate.c              |  6 +++
 6 files changed, 123 insertions(+), 6 deletions(-)

(limited to 'include/linux/memcontrol.h')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index bf9213b2db8f..91300c972e76 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -54,6 +54,11 @@ extern void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru);
 extern void mem_cgroup_del_lru(struct page *page);
 extern void mem_cgroup_move_lists(struct page *page,
 				  enum lru_list from, enum lru_list to);
+
+/* For coalescing uncharges, to reduce memcg overhead */
+extern void mem_cgroup_uncharge_start(void);
+extern void mem_cgroup_uncharge_end(void);
+
 extern void mem_cgroup_uncharge_page(struct page *page);
 extern void mem_cgroup_uncharge_cache_page(struct page *page);
 extern int mem_cgroup_shmem_charge_fallback(struct page *page,
@@ -151,6 +156,14 @@ static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr)
 {
 }
 
+static inline void mem_cgroup_uncharge_start(void)
+{
+}
+
+static inline void mem_cgroup_uncharge_end(void)
+{
+}
+
 static inline void mem_cgroup_uncharge_page(struct page *page)
 {
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5c858f38e81a..f4c145410a8d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1544,6 +1544,14 @@ struct task_struct {
 	unsigned long trace_recursion;
 #endif /* CONFIG_TRACING */
 	unsigned long stack_start;
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */
+	struct memcg_batch_info {
+		int do_batch;		/* incremented when batch uncharge started */
+		struct mem_cgroup *memcg; /* target memcg of uncharge */
+		unsigned long bytes;	/* uncharged usage */
+		unsigned long memsw_bytes; /* uncharged mem+swap usage */
+	} memcg_batch;
+#endif
 };
 
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
diff --git a/kernel/fork.c b/kernel/fork.c
index 9bd91447e052..b6cbd33dde80 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1127,6 +1127,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_DEBUG_MUTEXES
 	p->blocked_on = NULL; /* not blocked yet */
 #endif
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+	p->memcg_batch.do_batch = 0;
+	p->memcg_batch.memcg = NULL;
+#endif
 
 	p->bts = NULL;
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7b5b108c1c6b..a730c91b8e69 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1827,6 +1827,50 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
 	css_put(&mem->css);
 }
 
+static void
+__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
+{
+	struct memcg_batch_info *batch = NULL;
+	bool uncharge_memsw = true;
+
+	/* If swapout, usage of swap doesn't decrease */
+	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
+		uncharge_memsw = false;
+	/*
+	 * do_batch > 0 when unmapping pages or invalidating/truncating an
+	 * inode.  In those cases, all pages freed continuously can be
+	 * expected to be in the same cgroup, and we have a chance to
+	 * coalesce uncharges.  But we do uncharge one by one if the task
+	 * is being killed by the OOM killer (TIF_MEMDIE), because we want
+	 * to do the uncharge as soon as possible.
+	 */
+	if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
+		goto direct_uncharge;
+
+	batch = &current->memcg_batch;
+	/*
+	 * Usually, we do css_get() when we remember a memcg pointer.
+	 * But in this case, we keep res->usage until the end of a series
+	 * of uncharges.  Then, it's ok to ignore memcg's refcnt.
+	 */
+	if (!batch->memcg)
+		batch->memcg = mem;
+	/*
+	 * In the typical case, batch->memcg == mem.  This means we can
+	 * merge a series of uncharges into one uncharge of the
+	 * res_counter.  If not, we uncharge the res_counter one by one.
+	 */
+	if (batch->memcg != mem)
+		goto direct_uncharge;
+	/* remember the freed charge and uncharge it later */
+	batch->bytes += PAGE_SIZE;
+	if (uncharge_memsw)
+		batch->memsw_bytes += PAGE_SIZE;
+	return;
+direct_uncharge:
+	res_counter_uncharge(&mem->res, PAGE_SIZE);
+	if (uncharge_memsw)
+		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+	return;
+}
 
 /*
  * uncharge if !page_mapped(page)
@@ -1875,12 +1919,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
-	if (!mem_cgroup_is_root(mem)) {
-		res_counter_uncharge(&mem->res, PAGE_SIZE);
-		if (do_swap_account &&
-				(ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
-			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
-	}
+	if (!mem_cgroup_is_root(mem))
+		__do_uncharge(mem, ctype);
 	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
 		mem_cgroup_swap_statistics(mem, true);
 	mem_cgroup_charge_statistics(mem, pc, false);
@@ -1926,6 +1966,50 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
 }
 
+/*
+ * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate.
+ * In those cases, pages are freed continuously and we can expect that they
+ * are in the same memcg.  All of these callers themselves limit the number
+ * of pages freed at once, so uncharge_start/end() is called properly.
+ * This may be called several times (i.e., nested) in one context.
+ */
+void mem_cgroup_uncharge_start(void)
+{
+	current->memcg_batch.do_batch++;
+	/* We can nest these calls. */
+	if (current->memcg_batch.do_batch == 1) {
+		current->memcg_batch.memcg = NULL;
+		current->memcg_batch.bytes = 0;
+		current->memcg_batch.memsw_bytes = 0;
+	}
+}
+
+void mem_cgroup_uncharge_end(void)
+{
+	struct memcg_batch_info *batch = &current->memcg_batch;
+
+	if (!batch->do_batch)
+		return;
+
+	batch->do_batch--;
+	if (batch->do_batch) /* If stacked, do nothing. */
+		return;
+
+	if (!batch->memcg)
+		return;
+	/*
+	 * This "batch->memcg" is valid without any css_get/put etc.,
+	 * because we hide the charges behind us.
+	 */
+	if (batch->bytes)
+		res_counter_uncharge(&batch->memcg->res, batch->bytes);
+	if (batch->memsw_bytes)
+		res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
+	/* forget this pointer (for sanity check) */
+	batch->memcg = NULL;
+}
+
 #ifdef CONFIG_SWAP
 /*
  * called after __delete_from_swap_cache() and drop "page" account.
diff --git a/mm/memory.c b/mm/memory.c
index a54b2c498444..aed45eaf8ac9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -956,6 +956,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
 		details = NULL;
 
 	BUG_ON(addr >= end);
+	mem_cgroup_uncharge_start();
 	tlb_start_vma(tlb, vma);
 	pgd = pgd_offset(vma->vm_mm, addr);
 	do {
@@ -968,6 +969,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
 					zap_work, details);
 	} while (pgd++, addr = next, (addr != end && *zap_work > 0));
 	tlb_end_vma(tlb, vma);
+	mem_cgroup_uncharge_end();
 
 	return addr;
 }
diff --git a/mm/truncate.c b/mm/truncate.c
index 2c147a7e5f2c..342deee22684 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -272,6 +272,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 			pagevec_release(&pvec);
 			break;
 		}
+		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
@@ -286,6 +287,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 			unlock_page(page);
 		}
 		pagevec_release(&pvec);
+		mem_cgroup_uncharge_end();
 	}
 }
 EXPORT_SYMBOL(truncate_inode_pages_range);
@@ -327,6 +329,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
 	pagevec_init(&pvec, 0);
 	while (next <= end &&
 		pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 			pgoff_t index;
@@ -354,6 +357,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
 				break;
 		}
 		pagevec_release(&pvec);
+		mem_cgroup_uncharge_end();
 		cond_resched();
 	}
 	return ret;
@@ -428,6 +432,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 	while (next <= end && !wrapped &&
 		pagevec_lookup(&pvec, mapping, next,
 			min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 			pgoff_t page_index;
@@ -477,6 +482,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 			unlock_page(page);
 		}
 		pagevec_release(&pvec);
+		mem_cgroup_uncharge_end();
 		cond_resched();
 	}
 	return ret;
--
cgit v1.3-8-gc7d7


From d8046582d5ee24448800e71c6933fdb6813aa062 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki
Date: Tue, 15 Dec 2009 16:47:09 -0800
Subject: memcg: make memcg's file mapped consistent with global VM

In the global VM, FILE_MAPPED is used, but memcg uses MAPPED_FILE.  This
makes grepping difficult.  Replace memcg's MAPPED_FILE with FILE_MAPPED.

And in the global VM, mapped shared memory is accounted into FILE_MAPPED,
but memcg doesn't account it.  Fix it.

Note: page_is_file_cache() just checks SwapBacked or not, so we need to
check PageAnon instead.
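
As an illustrative sketch (this helper is hypothetical; the patch
open-codes the test at its call sites), the accounting predicate becomes:

	/*
	 * Shmem pages are SwapBacked, so page_is_file_cache() returns
	 * false for them, yet they are not PageAnon and the global VM
	 * counts them in NR_FILE_MAPPED.  The test below matches that.
	 */
	static inline bool page_counts_as_file_mapped(struct page *page)
	{
		return page_mapped(page) && !PageAnon(page);
	}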
Cc: Balbir Singh
Reviewed-by: Daisuke Nishimura
Signed-off-by: KAMEZAWA Hiroyuki
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memcontrol.h |  4 ++--
 mm/memcontrol.c            | 21 +++++++++------------
 mm/rmap.c                  |  4 ++--
 3 files changed, 13 insertions(+), 16 deletions(-)

(limited to 'include/linux/memcontrol.h')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 91300c972e76..0b46c2068b96 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -122,7 +122,7 @@ static inline bool mem_cgroup_disabled(void)
 }
 
 extern bool mem_cgroup_oom_called(struct task_struct *task);
-void mem_cgroup_update_mapped_file_stat(struct page *page, int val);
+void mem_cgroup_update_file_mapped(struct page *page, int val);
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 						gfp_t gfp_mask, int nid,
 						int zid);
@@ -287,7 +287,7 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 {
 }
 
-static inline void mem_cgroup_update_mapped_file_stat(struct page *page,
+static inline void mem_cgroup_update_file_mapped(struct page *page,
 							int val)
 {
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6587f657d57c..0b3efb843a87 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -67,7 +67,7 @@ enum mem_cgroup_stat_index {
 	 */
 	MEM_CGROUP_STAT_CACHE,	   /* # of pages charged as cache */
 	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
-	MEM_CGROUP_STAT_MAPPED_FILE,  /* # of pages charged as file rss */
+	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
 	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
 	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
 	MEM_CGROUP_STAT_EVENTS,	/* sum of pagein + pageout for internal use */
@@ -1227,7 +1227,7 @@ static void record_last_oom(struct mem_cgroup *mem)
 /*
  * Currently used to update mapped file statistics, but the routine can be
  * generalized to update other statistics as well.
  */
-void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
+void mem_cgroup_update_file_mapped(struct page *page, int val)
 {
 	struct mem_cgroup *mem;
 	struct mem_cgroup_stat *stat;
@@ -1235,9 +1235,6 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
 	int cpu;
 	struct page_cgroup *pc;
 
-	if (!page_is_file_cache(page))
-		return;
-
 	pc = lookup_page_cgroup(page);
 	if (unlikely(!pc))
 		return;
@@ -1257,7 +1254,7 @@
 	stat = &mem->stat;
 	cpustat = &stat->cpustat[cpu];
 
-	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val);
+	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val);
 done:
 	unlock_page_cgroup(pc);
 }
@@ -1654,18 +1651,18 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 	mem_cgroup_charge_statistics(from, pc, false);
 
 	page = pc->page;
-	if (page_is_file_cache(page) && page_mapped(page)) {
+	if (page_mapped(page) && !PageAnon(page)) {
 		cpu = smp_processor_id();
 		/* Update mapped_file data for mem_cgroup "from" */
 		stat = &from->stat;
 		cpustat = &stat->cpustat[cpu];
-		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
+		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
						-1);
 
 		/* Update mapped_file data for mem_cgroup "to" */
 		stat = &to->stat;
 		cpustat = &stat->cpustat[cpu];
-		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
+		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
						1);
 	}
@@ -2889,7 +2886,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 enum {
 	MCS_CACHE,
 	MCS_RSS,
-	MCS_MAPPED_FILE,
+	MCS_FILE_MAPPED,
 	MCS_PGPGIN,
 	MCS_PGPGOUT,
 	MCS_SWAP,
@@ -2933,8 +2930,8 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
 	s->stat[MCS_CACHE] += val * PAGE_SIZE;
 	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
 	s->stat[MCS_RSS] += val * PAGE_SIZE;
-	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE);
-	s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE;
+	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED);
+	s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
 	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
 	s->stat[MCS_PGPGIN] += val;
 	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
diff --git a/mm/rmap.c b/mm/rmap.c
index 98135dbd25ba..278cd277bdec 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -721,7 +721,7 @@ void page_add_file_rmap(struct page *page)
 {
 	if (atomic_inc_and_test(&page->_mapcount)) {
 		__inc_zone_page_state(page, NR_FILE_MAPPED);
-		mem_cgroup_update_mapped_file_stat(page, 1);
+		mem_cgroup_update_file_mapped(page, 1);
 	}
 }
 
@@ -753,8 +753,8 @@ void page_remove_rmap(struct page *page)
 		__dec_zone_page_state(page, NR_ANON_PAGES);
 	} else {
 		__dec_zone_page_state(page, NR_FILE_MAPPED);
+		mem_cgroup_update_file_mapped(page, -1);
 	}
-	mem_cgroup_update_mapped_file_stat(page, -1);
 	/*
 	 * It would be tidy to reset the PageAnon mapping here,
 	 * but that might overwrite a racing page_add_anon_rmap
--
cgit v1.3-8-gc7d7