From 2c012a4ad1a2cd3fb5a0f9307b9d219f84eda1fa Mon Sep 17 00:00:00 2001 From: Kuo-Hsin Yang Date: Thu, 11 Jul 2019 20:52:04 -0700 Subject: mm: vmscan: scan anonymous pages on file refaults When file refaults are detected and there are many inactive file pages, the system never reclaim anonymous pages, the file pages are dropped aggressively when there are still a lot of cold anonymous pages and system thrashes. This issue impacts the performance of applications with large executable, e.g. chrome. With this patch, when file refault is detected, inactive_list_is_low() always returns true for file pages in get_scan_count() to enable scanning anonymous pages. The problem can be reproduced by the following test program. ---8<--- void fallocate_file(const char *filename, off_t size) { struct stat st; int fd; if (!stat(filename, &st) && st.st_size >= size) return; fd = open(filename, O_WRONLY | O_CREAT, 0600); if (fd < 0) { perror("create file"); exit(1); } if (posix_fallocate(fd, 0, size)) { perror("fallocate"); exit(1); } close(fd); } long *alloc_anon(long size) { long *start = malloc(size); memset(start, 1, size); return start; } long access_file(const char *filename, long size, long rounds) { int fd, i; volatile char *start1, *end1, *start2; const int page_size = getpagesize(); long sum = 0; fd = open(filename, O_RDONLY); if (fd == -1) { perror("open"); exit(1); } /* * Some applications, e.g. chrome, use a lot of executable file * pages, map some of the pages with PROT_EXEC flag to simulate * the behavior. */ start1 = mmap(NULL, size / 2, PROT_READ | PROT_EXEC, MAP_SHARED, fd, 0); if (start1 == MAP_FAILED) { perror("mmap"); exit(1); } end1 = start1 + size / 2; start2 = mmap(NULL, size / 2, PROT_READ, MAP_SHARED, fd, size / 2); if (start2 == MAP_FAILED) { perror("mmap"); exit(1); } for (i = 0; i < rounds; ++i) { struct timeval before, after; volatile char *ptr1 = start1, *ptr2 = start2; gettimeofday(&before, NULL); for (; ptr1 < end1; ptr1 += page_size, ptr2 += page_size) sum += *ptr1 + *ptr2; gettimeofday(&after, NULL); printf("File access time, round %d: %f (sec) ", i, (after.tv_sec - before.tv_sec) + (after.tv_usec - before.tv_usec) / 1000000.0); } return sum; } int main(int argc, char *argv[]) { const long MB = 1024 * 1024; long anon_mb, file_mb, file_rounds; const char filename[] = "large"; long *ret1; long ret2; if (argc != 4) { printf("usage: thrash ANON_MB FILE_MB FILE_ROUNDS "); exit(0); } anon_mb = atoi(argv[1]); file_mb = atoi(argv[2]); file_rounds = atoi(argv[3]); fallocate_file(filename, file_mb * MB); printf("Allocate %ld MB anonymous pages ", anon_mb); ret1 = alloc_anon(anon_mb * MB); printf("Access %ld MB file pages ", file_mb); ret2 = access_file(filename, file_mb * MB, file_rounds); printf("Print result to prevent optimization: %ld ", *ret1 + ret2); return 0; } ---8<--- Running the test program on 2GB RAM VM with kernel 5.2.0-rc5, the program fills ram with 2048 MB memory, access a 200 MB file for 10 times. Without this patch, the file cache is dropped aggresively and every access to the file is from disk. $ ./thrash 2048 200 10 Allocate 2048 MB anonymous pages Access 200 MB file pages File access time, round 0: 2.489316 (sec) File access time, round 1: 2.581277 (sec) File access time, round 2: 2.487624 (sec) File access time, round 3: 2.449100 (sec) File access time, round 4: 2.420423 (sec) File access time, round 5: 2.343411 (sec) File access time, round 6: 2.454833 (sec) File access time, round 7: 2.483398 (sec) File access time, round 8: 2.572701 (sec) File access time, round 9: 2.493014 (sec) With this patch, these file pages can be cached. $ ./thrash 2048 200 10 Allocate 2048 MB anonymous pages Access 200 MB file pages File access time, round 0: 2.475189 (sec) File access time, round 1: 2.440777 (sec) File access time, round 2: 2.411671 (sec) File access time, round 3: 1.955267 (sec) File access time, round 4: 0.029924 (sec) File access time, round 5: 0.000808 (sec) File access time, round 6: 0.000771 (sec) File access time, round 7: 0.000746 (sec) File access time, round 8: 0.000738 (sec) File access time, round 9: 0.000747 (sec) Checked the swap out stats during the test [1], 19006 pages swapped out with this patch, 3418 pages swapped out without this patch. There are more swap out, but I think it's within reasonable range when file backed data set doesn't fit into the memory. $ ./thrash 2000 100 2100 5 1 # ANON_MB FILE_EXEC FILE_NOEXEC ROUNDS PROCESSES Allocate 2000 MB anonymous pages active_anon: 1613644, inactive_anon: 348656, active_file: 892, inactive_file: 1384 (kB) pswpout: 7972443, pgpgin: 478615246 Access 100 MB executable file pages Access 2100 MB regular file pages File access time, round 0: 12.165, (sec) active_anon: 1433788, inactive_anon: 478116, active_file: 17896, inactive_file: 24328 (kB) File access time, round 1: 11.493, (sec) active_anon: 1430576, inactive_anon: 477144, active_file: 25440, inactive_file: 26172 (kB) File access time, round 2: 11.455, (sec) active_anon: 1427436, inactive_anon: 476060, active_file: 21112, inactive_file: 28808 (kB) File access time, round 3: 11.454, (sec) active_anon: 1420444, inactive_anon: 473632, active_file: 23216, inactive_file: 35036 (kB) File access time, round 4: 11.479, (sec) active_anon: 1413964, inactive_anon: 471460, active_file: 31728, inactive_file: 32224 (kB) pswpout: 7991449 (+ 19006), pgpgin: 489924366 (+ 11309120) With 4 processes accessing non-overlapping parts of a large file, 30316 pages swapped out with this patch, 5152 pages swapped out without this patch. The swapout number is small comparing to pgpgin. [1]: https://github.com/vovo/testing/blob/master/mem_thrash.c Link: http://lkml.kernel.org/r/20190701081038.GA83398@google.com Fixes: e9868505987a ("mm,vmscan: only evict file pages when we have plenty") Fixes: 7c5bd705d8f9 ("mm: memcg: only evict file pages when we have plenty") Signed-off-by: Kuo-Hsin Yang Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Sonny Rao Cc: Mel Gorman Cc: Rik van Riel Cc: Vladimir Davydov Cc: Minchan Kim Cc: [4.12+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 910e02c793ff..96aafbf8ce4e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2125,7 +2125,7 @@ static void shrink_active_list(unsigned long nr_to_scan, * 10TB 320 32GB */ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, - struct scan_control *sc, bool actual_reclaim) + struct scan_control *sc, bool trace) { enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE; struct pglist_data *pgdat = lruvec_pgdat(lruvec); @@ -2151,7 +2151,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, * rid of the stale workingset quickly. */ refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE); - if (file && actual_reclaim && lruvec->refaults != refaults) { + if (file && lruvec->refaults != refaults) { inactive_ratio = 0; } else { gb = (inactive + active) >> (30 - PAGE_SHIFT); @@ -2161,7 +2161,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, inactive_ratio = 1; } - if (actual_reclaim) + if (trace) trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx, lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, -- cgit v1.2.3-59-g8ed1b From dd9239900e12db84c198855b262ae7796db1123b Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Thu, 11 Jul 2019 20:52:11 -0700 Subject: mm/memcontrol: fix wrong statistics in memory.stat When we calculate total statistics for memcg1_stats and memcg1_events, we use the the index 'i' in the for loop as the events index. Actually we should use memcg1_stats[i] and memcg1_events[i] as the events index. Link: http://lkml.kernel.org/r/1562116978-19539-1-git-send-email-laoar.shao@gmail.com Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty"). Signed-off-by: Yafang Shao Cc: Michal Hocko Cc: Johannes Weiner Cc: Yafang Shao Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ba9138a4a1de..591eafafbd8c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3530,12 +3530,13 @@ static int memcg_stat_show(struct seq_file *m, void *v) if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], - (u64)memcg_page_state(memcg, i) * PAGE_SIZE); + (u64)memcg_page_state(memcg, memcg1_stats[i]) * + PAGE_SIZE); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], - (u64)memcg_events(memcg, i)); + (u64)memcg_events(memcg, memcg1_events[i])); for (i = 0; i < NR_LRU_LISTS; i++) seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], -- cgit v1.2.3-59-g8ed1b From 810481a246089117d98e3373a3cb735c3efc1f90 Mon Sep 17 00:00:00 2001 From: Henry Burns Date: Thu, 11 Jul 2019 20:52:14 -0700 Subject: mm/z3fold.c: lock z3fold page before __SetPageMovable() Following zsmalloc.c's example we call trylock_page() and unlock_page(). Also make z3fold_page_migrate() assert that newpage is passed in locked, as per the documentation. [akpm@linux-foundation.org: fix trylock_page return value test, per Shakeel] Link: http://lkml.kernel.org/r/20190702005122.41036-1-henryburns@google.com Link: http://lkml.kernel.org/r/20190702233538.52793-1-henryburns@google.com Signed-off-by: Henry Burns Suggested-by: Vitaly Wool Acked-by: Vitaly Wool Acked-by: David Rientjes Reviewed-by: Shakeel Butt Cc: Vitaly Vul Cc: Mike Rapoport Cc: Xidong Wang Cc: Jonathan Adams Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/z3fold.c b/mm/z3fold.c index 985732c8b025..dfcd69d08c1e 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -924,7 +924,16 @@ retry: set_bit(PAGE_HEADLESS, &page->private); goto headless; } - __SetPageMovable(page, pool->inode->i_mapping); + if (can_sleep) { + lock_page(page); + __SetPageMovable(page, pool->inode->i_mapping); + unlock_page(page); + } else { + if (trylock_page(page)) { + __SetPageMovable(page, pool->inode->i_mapping); + unlock_page(page); + } + } z3fold_page_lock(zhdr); found: @@ -1331,6 +1340,7 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); zhdr = page_address(page); pool = zhdr_to_pool(zhdr); -- cgit v1.2.3-59-g8ed1b From 598a0717a816abc8f5d3c4598628338b9190d127 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 11 Jul 2019 20:53:23 -0700 Subject: mm/slab: validate cache membership under freelist hardening Patch series "mm/slab: Improved sanity checking". This adds defenses against slab cache confusion (as seen in real-world exploits[1]) and gracefully handles type confusions when trying to look up slab caches from an arbitrary page. (Also is patch 3: new LKDTM tests for these defenses as well as for the existing double-free detection. This patch (of 3): When building under CONFIG_SLAB_FREELIST_HARDENING, it makes sense to perform sanity-checking on the assumed slab cache during kmem_cache_free() to make sure the kernel doesn't mix freelists across slab caches and corrupt memory (as seen in the exploitation of flaws like CVE-2018-9568[1]). Note that the prior code might WARN() but still corrupt memory (i.e. return the assumed cache instead of the owned cache). There is no noticeable performance impact (changes are within noise). Measuring parallel kernel builds, I saw the following with CONFIG_SLAB_FREELIST_HARDENED, before and after this patch: before: Run times: 288.85 286.53 287.09 287.07 287.21 Min: 286.53 Max: 288.85 Mean: 287.35 Std Dev: 0.79 after: Run times: 289.58 287.40 286.97 287.20 287.01 Min: 286.97 Max: 289.58 Mean: 287.63 Std Dev: 0.99 Delta: 0.1% which is well below the standard deviation [1] https://github.com/ThomasKing2014/slides/raw/master/Building%20universal%20Android%20rooting%20with%20a%20type%20confusion%20vulnerability.pdf Link: http://lkml.kernel.org/r/20190530045017.15252-2-keescook@chromium.org Signed-off-by: Kees Cook Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Alexander Popov Cc: Alexander Potapenko Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/slab.h b/mm/slab.h index 43ac818b8592..4dafae2c8620 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -310,7 +310,7 @@ static inline bool is_root_cache(struct kmem_cache *s) static inline bool slab_equal_or_root(struct kmem_cache *s, struct kmem_cache *p) { - return true; + return s == p; } static inline const char *cache_name(struct kmem_cache *s) @@ -363,18 +363,16 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) * will also be a constant. */ if (!memcg_kmem_enabled() && + !IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS)) return s; page = virt_to_head_page(x); cachep = page->slab_cache; - if (slab_equal_or_root(cachep, s)) - return cachep; - - pr_err("%s: Wrong slab cache. %s but object is from %s\n", - __func__, s->name, cachep->name); - WARN_ON_ONCE(1); - return s; + WARN_ONCE(!slab_equal_or_root(cachep, s), + "%s: Wrong slab cache. %s but object is from %s\n", + __func__, s->name, cachep->name); + return cachep; } static inline size_t slab_ksize(const struct kmem_cache *s) -- cgit v1.2.3-59-g8ed1b From a64b53780ec35b77daf817210c88aa42d172c98f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 11 Jul 2019 20:53:26 -0700 Subject: mm/slab: sanity-check page type when looking up cache This avoids any possible type confusion when looking up an object. For example, if a non-slab were to be passed to kfree(), the invalid slab_cache pointer (i.e. overlapped with some other value from the struct page union) would be used for subsequent slab manipulations that could lead to further memory corruption. Since the page is already in cache, adding the PageSlab() check will have nearly zero cost, so add a check and WARN() to virt_to_cache(). Additionally replaces an open-coded virt_to_cache(). To support the failure mode this also updates all callers of virt_to_cache() and cache_from_obj() to handle a NULL cache pointer return value (though note that several already handle this case gracefully). [dan.carpenter@oracle.com: restore IRQs in kfree()] Link: http://lkml.kernel.org/r/20190613065637.GE16334@mwanda Link: http://lkml.kernel.org/r/20190530045017.15252-3-keescook@chromium.org Signed-off-by: Kees Cook Signed-off-by: Dan Carpenter Cc: Alexander Popov Cc: Alexander Potapenko Cc: Christoph Lameter Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 16 +++++++++------- mm/slab.h | 17 +++++++++++++---- 2 files changed, 22 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index f7117ad9b3a3..db01e9aae31b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -371,12 +371,6 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) static int slab_max_order = SLAB_MAX_ORDER_LO; static bool slab_max_order_set __initdata; -static inline struct kmem_cache *virt_to_cache(const void *obj) -{ - struct page *page = virt_to_head_page(obj); - return page->slab_cache; -} - static inline void *index_to_obj(struct kmem_cache *cache, struct page *page, unsigned int idx) { @@ -3715,6 +3709,8 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) s = virt_to_cache(objp); else s = cache_from_obj(orig_s, objp); + if (!s) + continue; debug_check_no_locks_freed(objp, s->object_size); if (!(s->flags & SLAB_DEBUG_OBJECTS)) @@ -3749,6 +3745,10 @@ void kfree(const void *objp) local_irq_save(flags); kfree_debugcheck(objp); c = virt_to_cache(objp); + if (!c) { + local_irq_restore(flags); + return; + } debug_check_no_locks_freed(objp, c->object_size); debug_check_no_obj_freed(objp, c->object_size); @@ -4219,13 +4219,15 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, */ size_t ksize(const void *objp) { + struct kmem_cache *c; size_t size; BUG_ON(!objp); if (unlikely(objp == ZERO_SIZE_PTR)) return 0; - size = virt_to_cache(objp)->object_size; + c = virt_to_cache(objp); + size = c ? c->object_size : 0; /* We assume that ksize callers could use the whole allocated area, * so we need to unpoison this area. */ diff --git a/mm/slab.h b/mm/slab.h index 4dafae2c8620..739099af6cbb 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -350,10 +350,20 @@ static inline void memcg_link_cache(struct kmem_cache *s) #endif /* CONFIG_MEMCG_KMEM */ +static inline struct kmem_cache *virt_to_cache(const void *obj) +{ + struct page *page; + + page = virt_to_head_page(obj); + if (WARN_ONCE(!PageSlab(page), "%s: Object is not a Slab page!\n", + __func__)) + return NULL; + return page->slab_cache; +} + static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) { struct kmem_cache *cachep; - struct page *page; /* * When kmemcg is not being used, both assignments should return the @@ -367,9 +377,8 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS)) return s; - page = virt_to_head_page(x); - cachep = page->slab_cache; - WARN_ONCE(!slab_equal_or_root(cachep, s), + cachep = virt_to_cache(x); + WARN_ONCE(cachep && !slab_equal_or_root(cachep, s), "%s: Wrong slab cache. %s but object is from %s\n", __func__, s->name, cachep->name); return cachep; -- cgit v1.2.3-59-g8ed1b From 9cf3a8d847bd08977dc168ed243ffbef3c456d88 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 11 Jul 2019 20:53:33 -0700 Subject: mm/slub.c: avoid double string traverse in kmem_cache_flags() If ',' is not found, kmem_cache_flags() calls strlen() to find the end of line. We can do it in a single pass using strchrnul(). Link: http://lkml.kernel.org/r/20190501053111.7950-1-ynorov@marvell.com Signed-off-by: Yury Norov Acked-by: Aaron Tomlin Reviewed-by: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index cd04dbd2b5d0..1802c87799ff 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1313,9 +1313,7 @@ slab_flags_t kmem_cache_flags(unsigned int object_size, char *end, *glob; size_t cmplen; - end = strchr(iter, ','); - if (!end) - end = iter + strlen(iter); + end = strchrnul(iter, ','); glob = strnchr(iter, end - iter, '*'); if (glob) -- cgit v1.2.3-59-g8ed1b From cb097cd48313575b03a5de092a04f6af8aa0739e Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 11 Jul 2019 20:53:36 -0700 Subject: slub: don't panic for memcg kmem cache creation failure Currently for CONFIG_SLUB, if a memcg kmem cache creation is failed and the corresponding root kmem cache has SLAB_PANIC flag, the kernel will be crashed. This is unnecessary as the kernel can handle the creation failures of memcg kmem caches. Additionally CONFIG_SLAB does not implement this behavior. So, to keep the behavior consistent between SLAB and SLUB, removing the panic for memcg kmem cache creation failures. The root kmem cache creation failure for SLAB_PANIC correctly panics for both SLAB and SLUB. Link: http://lkml.kernel.org/r/20190619232514.58994-1-shakeelb@google.com Reported-by: Dave Hansen Signed-off-by: Shakeel Butt Acked-by: David Rientjes Acked-by: Michal Hocko Cc: Alexey Dobriyan Cc: Johannes Weiner Cc: Christoph Lameter Cc: Roman Gushchin Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 1802c87799ff..d46a91759b96 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3650,10 +3650,6 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) free_kmem_cache_nodes(s); error: - if (flags & SLAB_PANIC) - panic("Cannot create slab %s size=%u realsize=%u order=%u offset=%u flags=%lx\n", - s->name, s->size, s->size, - oo_order(s->oo), s->offset, (unsigned long)flags); return -EINVAL; } -- cgit v1.2.3-59-g8ed1b From 6ef9056952532c3b746de46aa10d45b4d7797bd8 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Thu, 11 Jul 2019 20:53:39 -0700 Subject: mm/kmemleak.c: fix check for softirq context in_softirq() is a wrong predicate to check if we are in a softirq context. It also returns true if we have BH disabled, so objects are falsely stamped with "softirq" comm. The correct predicate is in_serving_softirq(). If user does cat from /sys/kernel/debug/kmemleak previously they would see this, which is clearly wrong, this is system call context (see the comm): unreferenced object 0xffff88805bd661c0 (size 64): comm "softirq", pid 0, jiffies 4294942959 (age 12.400s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 ff ff ff ff 00 00 00 00 ................ 00 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00 ................ backtrace: [<0000000007dcb30c>] kmemleak_alloc_recursive include/linux/kmemleak.h:55 [inline] [<0000000007dcb30c>] slab_post_alloc_hook mm/slab.h:439 [inline] [<0000000007dcb30c>] slab_alloc mm/slab.c:3326 [inline] [<0000000007dcb30c>] kmem_cache_alloc_trace+0x13d/0x280 mm/slab.c:3553 [<00000000969722b7>] kmalloc include/linux/slab.h:547 [inline] [<00000000969722b7>] kzalloc include/linux/slab.h:742 [inline] [<00000000969722b7>] ip_mc_add1_src net/ipv4/igmp.c:1961 [inline] [<00000000969722b7>] ip_mc_add_src+0x36b/0x400 net/ipv4/igmp.c:2085 [<00000000a4134b5f>] ip_mc_msfilter+0x22d/0x310 net/ipv4/igmp.c:2475 [<00000000d20248ad>] do_ip_setsockopt.isra.0+0x19fe/0x1c00 net/ipv4/ip_sockglue.c:957 [<000000003d367be7>] ip_setsockopt+0x3b/0xb0 net/ipv4/ip_sockglue.c:1246 [<000000003c7c76af>] udp_setsockopt+0x4e/0x90 net/ipv4/udp.c:2616 [<000000000c1aeb23>] sock_common_setsockopt+0x3e/0x50 net/core/sock.c:3130 [<000000000157b92b>] __sys_setsockopt+0x9e/0x120 net/socket.c:2078 [<00000000a9f3d058>] __do_sys_setsockopt net/socket.c:2089 [inline] [<00000000a9f3d058>] __se_sys_setsockopt net/socket.c:2086 [inline] [<00000000a9f3d058>] __x64_sys_setsockopt+0x26/0x30 net/socket.c:2086 [<000000001b8da885>] do_syscall_64+0x7c/0x1a0 arch/x86/entry/common.c:301 [<00000000ba770c62>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 now they will see this: unreferenced object 0xffff88805413c800 (size 64): comm "syz-executor.4", pid 8960, jiffies 4294994003 (age 14.350s) hex dump (first 32 bytes): 00 7a 8a 57 80 88 ff ff e0 00 00 01 00 00 00 00 .z.W............ 00 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00 ................ backtrace: [<00000000c5d3be64>] kmemleak_alloc_recursive include/linux/kmemleak.h:55 [inline] [<00000000c5d3be64>] slab_post_alloc_hook mm/slab.h:439 [inline] [<00000000c5d3be64>] slab_alloc mm/slab.c:3326 [inline] [<00000000c5d3be64>] kmem_cache_alloc_trace+0x13d/0x280 mm/slab.c:3553 [<0000000023865be2>] kmalloc include/linux/slab.h:547 [inline] [<0000000023865be2>] kzalloc include/linux/slab.h:742 [inline] [<0000000023865be2>] ip_mc_add1_src net/ipv4/igmp.c:1961 [inline] [<0000000023865be2>] ip_mc_add_src+0x36b/0x400 net/ipv4/igmp.c:2085 [<000000003029a9d4>] ip_mc_msfilter+0x22d/0x310 net/ipv4/igmp.c:2475 [<00000000ccd0a87c>] do_ip_setsockopt.isra.0+0x19fe/0x1c00 net/ipv4/ip_sockglue.c:957 [<00000000a85a3785>] ip_setsockopt+0x3b/0xb0 net/ipv4/ip_sockglue.c:1246 [<00000000ec13c18d>] udp_setsockopt+0x4e/0x90 net/ipv4/udp.c:2616 [<0000000052d748e3>] sock_common_setsockopt+0x3e/0x50 net/core/sock.c:3130 [<00000000512f1014>] __sys_setsockopt+0x9e/0x120 net/socket.c:2078 [<00000000181758bc>] __do_sys_setsockopt net/socket.c:2089 [inline] [<00000000181758bc>] __se_sys_setsockopt net/socket.c:2086 [inline] [<00000000181758bc>] __x64_sys_setsockopt+0x26/0x30 net/socket.c:2086 [<00000000d4b73623>] do_syscall_64+0x7c/0x1a0 arch/x86/entry/common.c:301 [<00000000c1098bec>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Link: http://lkml.kernel.org/r/20190517171507.96046-1-dvyukov@gmail.com Signed-off-by: Dmitry Vyukov Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 9dd581d11565..3e147ea83182 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -575,7 +575,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, if (in_irq()) { object->pid = 0; strncpy(object->comm, "hardirq", sizeof(object->comm)); - } else if (in_softirq()) { + } else if (in_serving_softirq()) { object->pid = 0; strncpy(object->comm, "softirq", sizeof(object->comm)); } else { -- cgit v1.2.3-59-g8ed1b From 4e4dfce2278929de4379cdcfa2335dad7a6c4aa0 Mon Sep 17 00:00:00 2001 From: André Almeida Date: Thu, 11 Jul 2019 20:53:43 -0700 Subject: mm/kmemleak.c: change error at _write when kmemleak is disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to POSIX, EBUSY means that the "device or resource is busy", and this can lead to people thinking that the file `/sys/kernel/debug/kmemleak/` is somehow locked or being used by other process. Change this error code to a more appropriate one. Link: http://lkml.kernel.org/r/20190612155231.19448-1-andrealmeid@collabora.com Signed-off-by: André Almeida Reviewed-by: Andrew Morton Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 3e147ea83182..aa8f4fa93ca3 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1866,7 +1866,7 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, } if (!kmemleak_enabled) { - ret = -EBUSY; + ret = -EPERM; goto out; } -- cgit v1.2.3-59-g8ed1b From e89692190065c12386bd37272ae8b7d142dd079f Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 11 Jul 2019 20:53:49 -0700 Subject: mm/kasan: print frame description for stack bugs This adds support for printing stack frame description on invalid stack accesses. The frame description is embedded by the compiler, which is parsed and then pretty-printed. Currently, we can only print the stack frame info for accesses to the task's own stack, but not accesses to other tasks' stacks. Example of what it looks like: page dumped because: kasan: bad access detected addr ffff8880673ef98a is located in stack of task insmod/2008 at offset 106 in frame: kasan_stack_oob+0x0/0xf5 [test_kasan] this frame has 2 objects: [32, 36) 'i' [96, 106) 'stack_array' Memory state around the buggy address: Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=198435 Link: http://lkml.kernel.org/r/20190522100048.146841-1-elver@google.com Signed-off-by: Marco Elver Reviewed-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.h | 5 ++ mm/kasan/report.c | 165 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 170 insertions(+) (limited to 'mm') diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 3ce956efa0cb..1979db4763e2 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -43,6 +43,11 @@ #define KASAN_ALLOCA_REDZONE_SIZE 32 +/* + * Stack frame marker (compiler ABI). + */ +#define KASAN_CURRENT_STACK_FRAME_MAGIC 0x41B58AB3 + /* Don't break randconfig/all*config builds */ #ifndef KASAN_ABI_VERSION #define KASAN_ABI_VERSION 1 diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 03a443579386..0e5f965f1882 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -28,6 +28,7 @@ #include #include #include +#include #include @@ -181,6 +182,168 @@ static inline bool init_task_stack_addr(const void *addr) sizeof(init_thread_union.stack)); } +static bool __must_check tokenize_frame_descr(const char **frame_descr, + char *token, size_t max_tok_len, + unsigned long *value) +{ + const char *sep = strchr(*frame_descr, ' '); + + if (sep == NULL) + sep = *frame_descr + strlen(*frame_descr); + + if (token != NULL) { + const size_t tok_len = sep - *frame_descr; + + if (tok_len + 1 > max_tok_len) { + pr_err("KASAN internal error: frame description too long: %s\n", + *frame_descr); + return false; + } + + /* Copy token (+ 1 byte for '\0'). */ + strlcpy(token, *frame_descr, tok_len + 1); + } + + /* Advance frame_descr past separator. */ + *frame_descr = sep + 1; + + if (value != NULL && kstrtoul(token, 10, value)) { + pr_err("KASAN internal error: not a valid number: %s\n", token); + return false; + } + + return true; +} + +static void print_decoded_frame_descr(const char *frame_descr) +{ + /* + * We need to parse the following string: + * "n alloc_1 alloc_2 ... alloc_n" + * where alloc_i looks like + * "offset size len name" + * or "offset size len name:line". + */ + + char token[64]; + unsigned long num_objects; + + if (!tokenize_frame_descr(&frame_descr, token, sizeof(token), + &num_objects)) + return; + + pr_err("\n"); + pr_err("this frame has %lu %s:\n", num_objects, + num_objects == 1 ? "object" : "objects"); + + while (num_objects--) { + unsigned long offset; + unsigned long size; + + /* access offset */ + if (!tokenize_frame_descr(&frame_descr, token, sizeof(token), + &offset)) + return; + /* access size */ + if (!tokenize_frame_descr(&frame_descr, token, sizeof(token), + &size)) + return; + /* name length (unused) */ + if (!tokenize_frame_descr(&frame_descr, NULL, 0, NULL)) + return; + /* object name */ + if (!tokenize_frame_descr(&frame_descr, token, sizeof(token), + NULL)) + return; + + /* Strip line number; without filename it's not very helpful. */ + strreplace(token, ':', '\0'); + + /* Finally, print object information. */ + pr_err(" [%lu, %lu) '%s'", offset, offset + size, token); + } +} + +static bool __must_check get_address_stack_frame_info(const void *addr, + unsigned long *offset, + const char **frame_descr, + const void **frame_pc) +{ + unsigned long aligned_addr; + unsigned long mem_ptr; + const u8 *shadow_bottom; + const u8 *shadow_ptr; + const unsigned long *frame; + + BUILD_BUG_ON(IS_ENABLED(CONFIG_STACK_GROWSUP)); + + /* + * NOTE: We currently only support printing frame information for + * accesses to the task's own stack. + */ + if (!object_is_on_stack(addr)) + return false; + + aligned_addr = round_down((unsigned long)addr, sizeof(long)); + mem_ptr = round_down(aligned_addr, KASAN_SHADOW_SCALE_SIZE); + shadow_ptr = kasan_mem_to_shadow((void *)aligned_addr); + shadow_bottom = kasan_mem_to_shadow(end_of_stack(current)); + + while (shadow_ptr >= shadow_bottom && *shadow_ptr != KASAN_STACK_LEFT) { + shadow_ptr--; + mem_ptr -= KASAN_SHADOW_SCALE_SIZE; + } + + while (shadow_ptr >= shadow_bottom && *shadow_ptr == KASAN_STACK_LEFT) { + shadow_ptr--; + mem_ptr -= KASAN_SHADOW_SCALE_SIZE; + } + + if (shadow_ptr < shadow_bottom) + return false; + + frame = (const unsigned long *)(mem_ptr + KASAN_SHADOW_SCALE_SIZE); + if (frame[0] != KASAN_CURRENT_STACK_FRAME_MAGIC) { + pr_err("KASAN internal error: frame info validation failed; invalid marker: %lu\n", + frame[0]); + return false; + } + + *offset = (unsigned long)addr - (unsigned long)frame; + *frame_descr = (const char *)frame[1]; + *frame_pc = (void *)frame[2]; + + return true; +} + +static void print_address_stack_frame(const void *addr) +{ + unsigned long offset; + const char *frame_descr; + const void *frame_pc; + + if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) + return; + + if (!get_address_stack_frame_info(addr, &offset, &frame_descr, + &frame_pc)) + return; + + /* + * get_address_stack_frame_info only returns true if the given addr is + * on the current task's stack. + */ + pr_err("\n"); + pr_err("addr %px is located in stack of task %s/%d at offset %lu in frame:\n", + addr, current->comm, task_pid_nr(current), offset); + pr_err(" %pS\n", frame_pc); + + if (!frame_descr) + return; + + print_decoded_frame_descr(frame_descr); +} + static void print_address_description(void *addr) { struct page *page = addr_to_page(addr); @@ -204,6 +367,8 @@ static void print_address_description(void *addr) pr_err("The buggy address belongs to the page:\n"); dump_page(page, "kasan: bad access detected"); } + + print_address_stack_frame(addr); } static bool row_is_guilty(const void *row, const void *guilty) -- cgit v1.2.3-59-g8ed1b From 7d8ad890dad00f6cd64bfb44d9be4fceb10cf819 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 11 Jul 2019 20:54:03 -0700 Subject: mm/kasan: introduce __kasan_check_{read,write} Patch series "mm/kasan: Add object validation in ksize()", v3. This patch (of 5): This introduces __kasan_check_{read,write}. __kasan_check functions may be used from anywhere, even compilation units that disable instrumentation selectively. This change eliminates the need for the __KASAN_INTERNAL definition. [elver@google.com: v5] Link: http://lkml.kernel.org/r/20190708170706.174189-2-elver@google.com Link: http://lkml.kernel.org/r/20190626142014.141844-2-elver@google.com Signed-off-by: Marco Elver Acked-by: Mark Rutland Reviewed-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan-checks.h | 25 ++++++++++++++++++++++--- mm/kasan/common.c | 10 ++++------ 2 files changed, 26 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h index a61dc075e2ce..221f05fbddd7 100644 --- a/include/linux/kasan-checks.h +++ b/include/linux/kasan-checks.h @@ -2,9 +2,28 @@ #ifndef _LINUX_KASAN_CHECKS_H #define _LINUX_KASAN_CHECKS_H -#if defined(__SANITIZE_ADDRESS__) || defined(__KASAN_INTERNAL) -void kasan_check_read(const volatile void *p, unsigned int size); -void kasan_check_write(const volatile void *p, unsigned int size); +/* + * __kasan_check_*: Always available when KASAN is enabled. This may be used + * even in compilation units that selectively disable KASAN, but must use KASAN + * to validate access to an address. Never use these in header files! + */ +#ifdef CONFIG_KASAN +void __kasan_check_read(const volatile void *p, unsigned int size); +void __kasan_check_write(const volatile void *p, unsigned int size); +#else +static inline void __kasan_check_read(const volatile void *p, unsigned int size) +{ } +static inline void __kasan_check_write(const volatile void *p, unsigned int size) +{ } +#endif + +/* + * kasan_check_*: Only available when the particular compilation unit has KASAN + * instrumentation enabled. May be used in header files. + */ +#ifdef __SANITIZE_ADDRESS__ +#define kasan_check_read __kasan_check_read +#define kasan_check_write __kasan_check_write #else static inline void kasan_check_read(const volatile void *p, unsigned int size) { } diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 242fdc01aaa9..6bada42cc152 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -14,8 +14,6 @@ * */ -#define __KASAN_INTERNAL - #include #include #include @@ -89,17 +87,17 @@ void kasan_disable_current(void) current->kasan_depth--; } -void kasan_check_read(const volatile void *p, unsigned int size) +void __kasan_check_read(const volatile void *p, unsigned int size) { check_memory_region((unsigned long)p, size, false, _RET_IP_); } -EXPORT_SYMBOL(kasan_check_read); +EXPORT_SYMBOL(__kasan_check_read); -void kasan_check_write(const volatile void *p, unsigned int size) +void __kasan_check_write(const volatile void *p, unsigned int size) { check_memory_region((unsigned long)p, size, true, _RET_IP_); } -EXPORT_SYMBOL(kasan_check_write); +EXPORT_SYMBOL(__kasan_check_write); #undef memset void *memset(void *addr, int c, size_t len) -- cgit v1.2.3-59-g8ed1b From b5f6e0fc7d60e0234dac82498e90dfe9027bad1f Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 11 Jul 2019 20:54:07 -0700 Subject: mm/kasan: change kasan_check_{read,write} to return boolean This changes {,__}kasan_check_{read,write} functions to return a boolean denoting if the access was valid or not. [sfr@canb.auug.org.au: include types.h for "bool"] Link: http://lkml.kernel.org/r/20190705184949.13cdd021@canb.auug.org.au Link: http://lkml.kernel.org/r/20190626142014.141844-3-elver@google.com Signed-off-by: Marco Elver Signed-off-by: Stephen Rothwell Reviewed-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Mark Rutland Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan-checks.h | 30 ++++++++++++++++++++---------- mm/kasan/common.c | 8 ++++---- mm/kasan/generic.c | 13 +++++++------ mm/kasan/kasan.h | 10 +++++++++- mm/kasan/tags.c | 12 +++++++----- 5 files changed, 47 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h index 221f05fbddd7..ac6aba632f2d 100644 --- a/include/linux/kasan-checks.h +++ b/include/linux/kasan-checks.h @@ -2,19 +2,25 @@ #ifndef _LINUX_KASAN_CHECKS_H #define _LINUX_KASAN_CHECKS_H +#include + /* * __kasan_check_*: Always available when KASAN is enabled. This may be used * even in compilation units that selectively disable KASAN, but must use KASAN * to validate access to an address. Never use these in header files! */ #ifdef CONFIG_KASAN -void __kasan_check_read(const volatile void *p, unsigned int size); -void __kasan_check_write(const volatile void *p, unsigned int size); +bool __kasan_check_read(const volatile void *p, unsigned int size); +bool __kasan_check_write(const volatile void *p, unsigned int size); #else -static inline void __kasan_check_read(const volatile void *p, unsigned int size) -{ } -static inline void __kasan_check_write(const volatile void *p, unsigned int size) -{ } +static inline bool __kasan_check_read(const volatile void *p, unsigned int size) +{ + return true; +} +static inline bool __kasan_check_write(const volatile void *p, unsigned int size) +{ + return true; +} #endif /* @@ -25,10 +31,14 @@ static inline void __kasan_check_write(const volatile void *p, unsigned int size #define kasan_check_read __kasan_check_read #define kasan_check_write __kasan_check_write #else -static inline void kasan_check_read(const volatile void *p, unsigned int size) -{ } -static inline void kasan_check_write(const volatile void *p, unsigned int size) -{ } +static inline bool kasan_check_read(const volatile void *p, unsigned int size) +{ + return true; +} +static inline bool kasan_check_write(const volatile void *p, unsigned int size) +{ + return true; +} #endif #endif diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 6bada42cc152..2277b82902d8 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -87,15 +87,15 @@ void kasan_disable_current(void) current->kasan_depth--; } -void __kasan_check_read(const volatile void *p, unsigned int size) +bool __kasan_check_read(const volatile void *p, unsigned int size) { - check_memory_region((unsigned long)p, size, false, _RET_IP_); + return check_memory_region((unsigned long)p, size, false, _RET_IP_); } EXPORT_SYMBOL(__kasan_check_read); -void __kasan_check_write(const volatile void *p, unsigned int size) +bool __kasan_check_write(const volatile void *p, unsigned int size) { - check_memory_region((unsigned long)p, size, true, _RET_IP_); + return check_memory_region((unsigned long)p, size, true, _RET_IP_); } EXPORT_SYMBOL(__kasan_check_write); diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 504c79363a34..616f9dd82d12 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -166,29 +166,30 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) return memory_is_poisoned_n(addr, size); } -static __always_inline void check_memory_region_inline(unsigned long addr, +static __always_inline bool check_memory_region_inline(unsigned long addr, size_t size, bool write, unsigned long ret_ip) { if (unlikely(size == 0)) - return; + return true; if (unlikely((void *)addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { kasan_report(addr, size, write, ret_ip); - return; + return false; } if (likely(!memory_is_poisoned(addr, size))) - return; + return true; kasan_report(addr, size, write, ret_ip); + return false; } -void check_memory_region(unsigned long addr, size_t size, bool write, +bool check_memory_region(unsigned long addr, size_t size, bool write, unsigned long ret_ip) { - check_memory_region_inline(addr, size, write, ret_ip); + return check_memory_region_inline(addr, size, write, ret_ip); } void kasan_cache_shrink(struct kmem_cache *cache) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 1979db4763e2..014f19e76247 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -128,7 +128,15 @@ static inline bool addr_has_shadow(const void *addr) void kasan_poison_shadow(const void *address, size_t size, u8 value); -void check_memory_region(unsigned long addr, size_t size, bool write, +/** + * check_memory_region - Check memory region, and report if invalid access. + * @addr: the accessed address + * @size: the accessed size + * @write: true if access is a write access + * @ret_ip: return address + * @return: true if access was valid, false if invalid + */ +bool check_memory_region(unsigned long addr, size_t size, bool write, unsigned long ret_ip); void *find_first_bad_addr(void *addr, size_t size); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 63fca3172659..0e987c9ca052 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -76,7 +76,7 @@ void *kasan_reset_tag(const void *addr) return reset_tag(addr); } -void check_memory_region(unsigned long addr, size_t size, bool write, +bool check_memory_region(unsigned long addr, size_t size, bool write, unsigned long ret_ip) { u8 tag; @@ -84,7 +84,7 @@ void check_memory_region(unsigned long addr, size_t size, bool write, void *untagged_addr; if (unlikely(size == 0)) - return; + return true; tag = get_tag((const void *)addr); @@ -106,22 +106,24 @@ void check_memory_region(unsigned long addr, size_t size, bool write, * set to KASAN_TAG_KERNEL (0xFF)). */ if (tag == KASAN_TAG_KERNEL) - return; + return true; untagged_addr = reset_tag((const void *)addr); if (unlikely(untagged_addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { kasan_report(addr, size, write, ret_ip); - return; + return false; } shadow_first = kasan_mem_to_shadow(untagged_addr); shadow_last = kasan_mem_to_shadow(untagged_addr + size - 1); for (shadow = shadow_first; shadow <= shadow_last; shadow++) { if (*shadow != tag) { kasan_report(addr, size, write, ret_ip); - return; + return false; } } + + return true; } #define DEFINE_HWASAN_LOAD_STORE(size) \ -- cgit v1.2.3-59-g8ed1b From 10d1f8cb3965a6f633bf23eb984cda552927e3a5 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 11 Jul 2019 20:54:14 -0700 Subject: mm/slab: refactor common ksize KASAN logic into slab_common.c This refactors common code of ksize() between the various allocators into slab_common.c: __ksize() is the allocator-specific implementation without instrumentation, whereas ksize() includes the required KASAN logic. Link: http://lkml.kernel.org/r/20190626142014.141844-5-elver@google.com Signed-off-by: Marco Elver Acked-by: Christoph Lameter Reviewed-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Mark Rutland Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 1 + mm/slab.c | 22 +++++----------------- mm/slab_common.c | 26 ++++++++++++++++++++++++++ mm/slob.c | 4 ++-- mm/slub.c | 14 ++------------ 5 files changed, 36 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/include/linux/slab.h b/include/linux/slab.h index 9449b19c5f10..98c3d12b7275 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -184,6 +184,7 @@ void * __must_check __krealloc(const void *, size_t, gfp_t); void * __must_check krealloc(const void *, size_t, gfp_t); void kfree(const void *); void kzfree(const void *); +size_t __ksize(const void *); size_t ksize(const void *); #ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR diff --git a/mm/slab.c b/mm/slab.c index db01e9aae31b..3521a351ceb5 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4204,20 +4204,12 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, #endif /* CONFIG_HARDENED_USERCOPY */ /** - * ksize - get the actual amount of memory allocated for a given object - * @objp: Pointer to the object + * __ksize -- Uninstrumented ksize. * - * kmalloc may internally round up allocations and return more memory - * than requested. ksize() can be used to determine the actual amount of - * memory allocated. The caller may use this additional memory, even though - * a smaller amount of memory was initially specified with the kmalloc call. - * The caller must guarantee that objp points to a valid object previously - * allocated with either kmalloc() or kmem_cache_alloc(). The object - * must not be freed during the duration of the call. - * - * Return: size of the actual memory used by @objp in bytes + * Unlike ksize(), __ksize() is uninstrumented, and does not provide the same + * safety checks as ksize() with KASAN instrumentation enabled. */ -size_t ksize(const void *objp) +size_t __ksize(const void *objp) { struct kmem_cache *c; size_t size; @@ -4228,11 +4220,7 @@ size_t ksize(const void *objp) c = virt_to_cache(objp); size = c ? c->object_size : 0; - /* We assume that ksize callers could use the whole allocated area, - * so we need to unpoison this area. - */ - kasan_unpoison_shadow(objp, size); return size; } -EXPORT_SYMBOL(ksize); +EXPORT_SYMBOL(__ksize); diff --git a/mm/slab_common.c b/mm/slab_common.c index 58251ba63e4a..b7c6a40e436a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1597,6 +1597,32 @@ void kzfree(const void *p) } EXPORT_SYMBOL(kzfree); +/** + * ksize - get the actual amount of memory allocated for a given object + * @objp: Pointer to the object + * + * kmalloc may internally round up allocations and return more memory + * than requested. ksize() can be used to determine the actual amount of + * memory allocated. The caller may use this additional memory, even though + * a smaller amount of memory was initially specified with the kmalloc call. + * The caller must guarantee that objp points to a valid object previously + * allocated with either kmalloc() or kmem_cache_alloc(). The object + * must not be freed during the duration of the call. + * + * Return: size of the actual memory used by @objp in bytes + */ +size_t ksize(const void *objp) +{ + size_t size = __ksize(objp); + /* + * We assume that ksize callers could use whole allocated area, + * so we need to unpoison this area. + */ + kasan_unpoison_shadow(objp, size); + return size; +} +EXPORT_SYMBOL(ksize); + /* Tracepoints definitions. */ EXPORT_TRACEPOINT_SYMBOL(kmalloc); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); diff --git a/mm/slob.c b/mm/slob.c index 84aefd9b91ee..7f421d0ca9ab 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -527,7 +527,7 @@ void kfree(const void *block) EXPORT_SYMBOL(kfree); /* can't use ksize for kmem_cache_alloc memory, only kmalloc */ -size_t ksize(const void *block) +size_t __ksize(const void *block) { struct page *sp; int align; @@ -545,7 +545,7 @@ size_t ksize(const void *block) m = (unsigned int *)(block - align); return SLOB_UNITS(*m) * SLOB_UNIT; } -EXPORT_SYMBOL(ksize); +EXPORT_SYMBOL(__ksize); int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags) { diff --git a/mm/slub.c b/mm/slub.c index d46a91759b96..5e217653286c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3895,7 +3895,7 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, } #endif /* CONFIG_HARDENED_USERCOPY */ -static size_t __ksize(const void *object) +size_t __ksize(const void *object) { struct page *page; @@ -3911,17 +3911,7 @@ static size_t __ksize(const void *object) return slab_ksize(page->slab_cache); } - -size_t ksize(const void *object) -{ - size_t size = __ksize(object); - /* We assume that ksize callers could use whole allocated area, - * so we need to unpoison this area. - */ - kasan_unpoison_shadow(object, size); - return size; -} -EXPORT_SYMBOL(ksize); +EXPORT_SYMBOL(__ksize); void kfree(const void *x) { -- cgit v1.2.3-59-g8ed1b From 0d4ca4c9bab397b525c9a4f875d31410ce4bc738 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 11 Jul 2019 20:54:18 -0700 Subject: mm/kasan: add object validation in ksize() ksize() has been unconditionally unpoisoning the whole shadow memory region associated with an allocation. This can lead to various undetected bugs, for example, double-kzfree(). Specifically, kzfree() uses ksize() to determine the actual allocation size, and subsequently zeroes the memory. Since ksize() used to just unpoison the whole shadow memory region, no invalid free was detected. This patch addresses this as follows: 1. Add a check in ksize(), and only then unpoison the memory region. 2. Preserve kasan_unpoison_slab() semantics by explicitly unpoisoning the shadow memory region using the size obtained from __ksize(). Tested: 1. With SLAB allocator: a) normal boot without warnings; b) verified the added double-kzfree() is detected. 2. With SLUB allocator: a) normal boot without warnings; b) verified the added double-kzfree() is detected. [elver@google.com: s/BUG_ON/WARN_ON_ONCE/, per Kees] Link: http://lkml.kernel.org/r/20190627094445.216365-6-elver@google.com Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=199359 Link: http://lkml.kernel.org/r/20190626142014.141844-6-elver@google.com Signed-off-by: Marco Elver Acked-by: Kees Cook Reviewed-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Mark Rutland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 7 +++++-- mm/slab_common.c | 22 +++++++++++++++++++++- 2 files changed, 26 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index b40ea104dd36..cc8a03cc9674 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -76,8 +76,11 @@ void kasan_free_shadow(const struct vm_struct *vm); int kasan_add_zero_shadow(void *start, unsigned long size); void kasan_remove_zero_shadow(void *start, unsigned long size); -size_t ksize(const void *); -static inline void kasan_unpoison_slab(const void *ptr) { ksize(ptr); } +size_t __ksize(const void *); +static inline void kasan_unpoison_slab(const void *ptr) +{ + kasan_unpoison_shadow(ptr, __ksize(ptr)); +} size_t kasan_metadata_size(struct kmem_cache *cache); bool kasan_save_enable_multi_shot(void); diff --git a/mm/slab_common.c b/mm/slab_common.c index b7c6a40e436a..a09bb10aa026 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1613,7 +1613,27 @@ EXPORT_SYMBOL(kzfree); */ size_t ksize(const void *objp) { - size_t size = __ksize(objp); + size_t size; + + if (WARN_ON_ONCE(!objp)) + return 0; + /* + * We need to check that the pointed to object is valid, and only then + * unpoison the shadow memory below. We use __kasan_check_read(), to + * generate a more useful report at the time ksize() is called (rather + * than later where behaviour is undefined due to potential + * use-after-free or double-free). + * + * If the pointed to memory is invalid we return 0, to avoid users of + * ksize() writing to and potentially corrupting the memory region. + * + * We want to perform the check before __ksize(), to avoid potentially + * crashing in __ksize() due to accessing invalid metadata. + */ + if (unlikely(objp == ZERO_SIZE_PTR) || !__kasan_check_read(objp, 1)) + return 0; + + size = __ksize(objp); /* * We assume that ksize callers could use whole allocated area, * so we need to unpoison this area. -- cgit v1.2.3-59-g8ed1b From a7030aea20d9191c271607aa1478f72a41948b0a Mon Sep 17 00:00:00 2001 From: Bharath Vedartham Date: Thu, 11 Jul 2019 20:54:34 -0700 Subject: mm/gup.c: make follow_page_mask() static follow_page_mask() is only used in gup.c, make it static. Link: http://lkml.kernel.org/r/20190510190831.GA4061@bharath12345-Inspiron-5559 Signed-off-by: Bharath Vedartham Reviewed-by: Ira Weiny Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index ddde097cf9e4..f1c1daebc425 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -515,7 +515,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, * an error pointer if there is a mapping to something not represented * by a page descriptor (see also vm_normal_page()). */ -struct page *follow_page_mask(struct vm_area_struct *vma, +static struct page *follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct follow_page_context *ctx) { -- cgit v1.2.3-59-g8ed1b From 465fc3a9b3129722b0df395529b3894b0b90d2de Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 11 Jul 2019 20:54:37 -0700 Subject: mm/memory.c: trivial clean up in insert_page() Make the success case use the same cleanup path as the failure case. Link: http://lkml.kernel.org/r/20190523134024.GC24093@localhost.localdomain Signed-off-by: Miklos Szeredi Reviewed-by: Andrew Morton Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index ddf20bd0c317..ced4bedc660d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1475,8 +1475,6 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, set_pte_at(mm, addr, pte, mk_pte(page, prot)); retval = 0; - pte_unmap_unlock(pte, ptl); - return retval; out_unlock: pte_unmap_unlock(pte, ptl); out: -- cgit v1.2.3-59-g8ed1b From ac1c3e49a9a734150b33297eeca5b43d92fd5be8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:54:46 -0700 Subject: mm: remove the account_page_dirtied export account_page_dirtied() is only used by our set_page_dirty() helpers and should not be used anywhere else. Link: http://lkml.kernel.org/r/20190605183702.30572-1-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index bdbe8b6b1225..1804f64ff43c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2429,7 +2429,6 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) this_cpu_inc(bdp_ratelimits); } } -EXPORT_SYMBOL(account_page_dirtied); /* * Helper function for deaccounting dirty page without writeback. -- cgit v1.2.3-59-g8ed1b From 1fcf0a561cd09d7fb7f7afa2ddfe05f72f32050e Mon Sep 17 00:00:00 2001 From: Pingfan Liu Date: Thu, 11 Jul 2019 20:54:49 -0700 Subject: mm/page_isolation.c: change the prototype of undo_isolate_page_range() undo_isolate_page_range() never fails, so no need to return value. Link: http://lkml.kernel.org/r/1562075604-8979-1-git-send-email-kernelfans@gmail.com Signed-off-by: Pingfan Liu Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Reviewed-by: Anshuman Khandual Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-isolation.h | 2 +- mm/page_isolation.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 280ae96dc4c3..1099c2fee20f 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -50,7 +50,7 @@ start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE. * target range is [start_pfn, end_pfn) */ -int +void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned migratetype); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index e3638a5bafff..89c19c0feadb 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -230,7 +230,7 @@ undo: /* * Make isolated pages available again. */ -int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, +void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned migratetype) { unsigned long pfn; @@ -247,7 +247,6 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, continue; unset_migratetype_isolate(page, migratetype); } - return 0; } /* * Test all pages in the range is free(means isolated) or not. -- cgit v1.2.3-59-g8ed1b From 98ef2046f28b642ad35935972a173194ea58a21d Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Thu, 11 Jul 2019 20:54:56 -0700 Subject: mm: remove the exporting of totalram_pages Previously totalram_pages was the global variable. Currently, totalram_pages is the static inline function from the include/linux/mm.h However, the function is also marked as EXPORT_SYMBOL, which is at best an odd combination. Because there is no point for the static inline function from a public header to be exported, this commit removes the EXPORT_SYMBOL() marking. It will be still possible to use the function in modules because all the symbols it depends on are exported. Link: http://lkml.kernel.org/r/20190710141031.15642-1-efremov@linux.com Fixes: ca79b0c211af6 ("mm: convert totalram_pages and totalhigh_pages variables to atomic") Signed-off-by: Denis Efremov Acked-by: Michal Hocko Cc: Vlastimil Babka Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Mel Gorman Cc: Mike Rapoport Cc: Alexander Duyck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8e3bc949ebcc..060303496094 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -224,8 +224,6 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = { [ZONE_MOVABLE] = 0, }; -EXPORT_SYMBOL(totalram_pages); - static char * const zone_names[MAX_NR_ZONES] = { #ifdef CONFIG_ZONE_DMA "DMA", -- cgit v1.2.3-59-g8ed1b From a9659476d4b391aec3f5357f176b63b9f8c46231 Mon Sep 17 00:00:00 2001 From: Nicolas Boichat Date: Thu, 11 Jul 2019 20:55:03 -0700 Subject: mm/failslab.c: by default, do not fail allocations with direct reclaim only When failslab was originally written, the intention of the "ignore-gfp-wait" flag default value ("N") was to fail GFP_ATOMIC allocations. Those were defined as (__GFP_HIGH), and the code would test for __GFP_WAIT (0x10u). However, since then, __GFP_WAIT was replaced by __GFP_RECLAIM (___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM), and GFP_ATOMIC is now defined as (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM). This means that when the flag is false, almost no allocation ever fails (as even GFP_ATOMIC allocations contain ___GFP_KSWAPD_RECLAIM). Restore the original intent of the code, by ignoring calls that directly reclaim only (__GFP_DIRECT_RECLAIM), and thus, failing GFP_ATOMIC calls again by default. Link: http://lkml.kernel.org/r/20190520214514.81360-1-drinkcat@chromium.org Fixes: 71baba4b92dc1fa1 ("mm, page_alloc: rename __GFP_WAIT to __GFP_RECLAIM") Signed-off-by: Nicolas Boichat Reviewed-by: Akinobu Mita Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/failslab.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/failslab.c b/mm/failslab.c index ec5aad211c5b..f92fed91ac23 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -23,7 +23,8 @@ bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags) if (gfpflags & __GFP_NOFAIL) return false; - if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM)) + if (failslab.ignore_gfp_reclaim && + (gfpflags & __GFP_DIRECT_RECLAIM)) return false; if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB)) -- cgit v1.2.3-59-g8ed1b From 96a2b03f281d3a3b29c27028164f43090d6495b9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 11 Jul 2019 20:55:06 -0700 Subject: mm, debug_pagelloc: use static keys to enable debugging Patch series "debug_pagealloc improvements". I have been recently debugging some pcplist corruptions, where it would be useful to perform struct page checks immediately as pages are allocated from and freed to pcplists, which is now only possible by rebuilding the kernel with CONFIG_DEBUG_VM (details in Patch 2 changelog). To make this kind of debugging simpler in future on a distro kernel, I have improved CONFIG_DEBUG_PAGEALLOC so that it has even smaller overhead when not enabled at boot time (Patch 1) and also when enabled (Patch 3), and extended it to perform the struct page checks more often when enabled (Patch 2). Now it can be configured in when building a distro kernel without extra overhead, and debugging page use after free or double free can be enabled simply by rebooting with debug_pagealloc=on. This patch (of 3): CONFIG_DEBUG_PAGEALLOC has been redesigned by 031bc5743f15 ("mm/debug-pagealloc: make debug-pagealloc boottime configurable") to allow being always enabled in a distro kernel, but only perform its expensive functionality when booted with debug_pagelloc=on. We can further reduce the overhead when not boot-enabled (including page allocator fast paths) using static keys. This patch introduces one for debug_pagealloc core functionality, and another for the optional guard page functionality (enabled by booting with debug_guardpage_minorder=X). Link: http://lkml.kernel.org/r/20190603143451.27353-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Andrew Morton Cc: Joonsoo Kim Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Vlastimil Babka Cc: Matthew Wilcox Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 15 +++++++++++---- mm/page_alloc.c | 23 +++++++++++++++++------ 2 files changed, 28 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0a6dae2f2b84..2c2e98cae2d1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2701,11 +2701,18 @@ static inline void kernel_poison_pages(struct page *page, int numpages, int enable) { } #endif -extern bool _debug_pagealloc_enabled; +#ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT +DECLARE_STATIC_KEY_TRUE(_debug_pagealloc_enabled); +#else +DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); +#endif static inline bool debug_pagealloc_enabled(void) { - return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && _debug_pagealloc_enabled; + if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) + return false; + + return static_branch_unlikely(&_debug_pagealloc_enabled); } #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP) @@ -2859,7 +2866,7 @@ extern struct page_ext_operations debug_guardpage_ops; #ifdef CONFIG_DEBUG_PAGEALLOC extern unsigned int _debug_guardpage_minorder; -extern bool _debug_guardpage_enabled; +DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled); static inline unsigned int debug_guardpage_minorder(void) { @@ -2868,7 +2875,7 @@ static inline unsigned int debug_guardpage_minorder(void) static inline bool debug_guardpage_enabled(void) { - return _debug_guardpage_enabled; + return static_branch_unlikely(&_debug_guardpage_enabled); } static inline bool page_is_guard(struct page *page) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 060303496094..3180d79be20c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -644,16 +644,27 @@ void prep_compound_page(struct page *page, unsigned int order) #ifdef CONFIG_DEBUG_PAGEALLOC unsigned int _debug_guardpage_minorder; -bool _debug_pagealloc_enabled __read_mostly - = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); + +#ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT +DEFINE_STATIC_KEY_TRUE(_debug_pagealloc_enabled); +#else +DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); +#endif EXPORT_SYMBOL(_debug_pagealloc_enabled); -bool _debug_guardpage_enabled __read_mostly; + +DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled); static int __init early_debug_pagealloc(char *buf) { - if (!buf) + bool enable = false; + + if (kstrtobool(buf, &enable)) return -EINVAL; - return kstrtobool(buf, &_debug_pagealloc_enabled); + + if (enable) + static_branch_enable(&_debug_pagealloc_enabled); + + return 0; } early_param("debug_pagealloc", early_debug_pagealloc); @@ -677,7 +688,7 @@ static void init_debug_guardpage(void) if (!debug_guardpage_minorder()) return; - _debug_guardpage_enabled = true; + static_branch_enable(&_debug_guardpage_enabled); } struct page_ext_operations debug_guardpage_ops = { -- cgit v1.2.3-59-g8ed1b From 4462b32c9285b521ef378907aa66a5ca485aae41 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 11 Jul 2019 20:55:09 -0700 Subject: mm, page_alloc: more extensive free page checking with debug_pagealloc The page allocator checks struct pages for expected state (mapcount, flags etc) as pages are being allocated (check_new_page()) and freed (free_pages_check()) to provide some defense against errors in page allocator users. Prior commits 479f854a207c ("mm, page_alloc: defer debugging checks of pages allocated from the PCP") and 4db7548ccbd9 ("mm, page_alloc: defer debugging checks of freed pages until a PCP drain") this has happened for order-0 pages as they were allocated from or freed to the per-cpu caches (pcplists). Since those are fast paths, the checks are now performed only when pages are moved between pcplists and global free lists. This however lowers the chances of catching errors soon enough. In order to increase the chances of the checks to catch errors, the kernel has to be rebuilt with CONFIG_DEBUG_VM, which also enables multiple other internal debug checks (VM_BUG_ON() etc), which is suboptimal when the goal is to catch errors in mm users, not in mm code itself. To catch some wrong users of the page allocator we have CONFIG_DEBUG_PAGEALLOC, which is designed to have virtually no overhead unless enabled at boot time. Memory corruptions when writing to freed pages have often the same underlying errors (use-after-free, double free) as corrupting the corresponding struct pages, so this existing debugging functionality is a good fit to extend by also perform struct page checks at least as often as if CONFIG_DEBUG_VM was enabled. Specifically, after this patch, when debug_pagealloc is enabled on boot, and CONFIG_DEBUG_VM disabled, pages are checked when allocated from or freed to the pcplists *in addition* to being moved between pcplists and free lists. When both debug_pagealloc and CONFIG_DEBUG_VM are enabled, pages are checked when being moved between pcplists and free lists *in addition* to when allocated from or freed to the pcplists. When debug_pagealloc is not enabled on boot, the overhead in fast paths should be virtually none thanks to the use of static key. Link: http://lkml.kernel.org/r/20190603143451.27353-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Andrew Morton Cc: Mel Gorman Cc: Joonsoo Kim Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig.debug | 13 +++++++++---- mm/page_alloc.c | 53 +++++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 52 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index fa6d79281368..a35ab6c55192 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -19,12 +19,17 @@ config DEBUG_PAGEALLOC Depending on runtime enablement, this results in a small or large slowdown, but helps to find certain types of memory corruption. + Also, the state of page tracking structures is checked more often as + pages are being allocated and freed, as unexpected state changes + often happen for same reasons as memory corruption (e.g. double free, + use-after-free). + For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC, fill the pages with poison patterns after free_pages() and verify - the patterns before alloc_pages(). Additionally, - this option cannot be enabled in combination with hibernation as - that would result in incorrect warnings of memory corruption after - a resume because free pages are not saved to the suspend image. + the patterns before alloc_pages(). Additionally, this option cannot + be enabled in combination with hibernation as that would result in + incorrect warnings of memory corruption after a resume because free + pages are not saved to the suspend image. By default this option will have a small overhead, e.g. by not allowing the kernel mapping to be backed by large pages on some diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3180d79be20c..26b6ad8b065d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1160,19 +1160,36 @@ static __always_inline bool free_pages_prepare(struct page *page, } #ifdef CONFIG_DEBUG_VM -static inline bool free_pcp_prepare(struct page *page) +/* + * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed + * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when + * moved from pcp lists to free lists. + */ +static bool free_pcp_prepare(struct page *page) { return free_pages_prepare(page, 0, true); } -static inline bool bulkfree_pcp_prepare(struct page *page) +static bool bulkfree_pcp_prepare(struct page *page) { - return false; + if (debug_pagealloc_enabled()) + return free_pages_check(page); + else + return false; } #else +/* + * With DEBUG_VM disabled, order-0 pages being freed are checked only when + * moving from pcp lists to free list in order to reduce overhead. With + * debug_pagealloc enabled, they are checked also immediately when being freed + * to the pcp lists. + */ static bool free_pcp_prepare(struct page *page) { - return free_pages_prepare(page, 0, false); + if (debug_pagealloc_enabled()) + return free_pages_prepare(page, 0, true); + else + return free_pages_prepare(page, 0, false); } static bool bulkfree_pcp_prepare(struct page *page) @@ -2035,23 +2052,39 @@ static inline bool free_pages_prezeroed(void) } #ifdef CONFIG_DEBUG_VM -static bool check_pcp_refill(struct page *page) +/* + * With DEBUG_VM enabled, order-0 pages are checked for expected state when + * being allocated from pcp lists. With debug_pagealloc also enabled, they are + * also checked when pcp lists are refilled from the free lists. + */ +static inline bool check_pcp_refill(struct page *page) { - return false; + if (debug_pagealloc_enabled()) + return check_new_page(page); + else + return false; } -static bool check_new_pcp(struct page *page) +static inline bool check_new_pcp(struct page *page) { return check_new_page(page); } #else -static bool check_pcp_refill(struct page *page) +/* + * With DEBUG_VM disabled, free order-0 pages are checked for expected state + * when pcp lists are being refilled from the free lists. With debug_pagealloc + * enabled, they are also checked when being allocated from the pcp lists. + */ +static inline bool check_pcp_refill(struct page *page) { return check_new_page(page); } -static bool check_new_pcp(struct page *page) +static inline bool check_new_pcp(struct page *page) { - return false; + if (debug_pagealloc_enabled()) + return check_new_page(page); + else + return false; } #endif /* CONFIG_DEBUG_VM */ -- cgit v1.2.3-59-g8ed1b From 3972f6bb1c6ae1d32dcf2e4ff635d24b77f26dcb Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 11 Jul 2019 20:55:13 -0700 Subject: mm, debug_pagealloc: use a page type instead of page_ext flag When debug_pagealloc is enabled, we currently allocate the page_ext array to mark guard pages with the PAGE_EXT_DEBUG_GUARD flag. Now that we have the page_type field in struct page, we can use that instead, as guard pages are neither PageSlab nor mapped to userspace. This reduces memory overhead when debug_pagealloc is enabled and there are no other features requiring the page_ext array. Link: http://lkml.kernel.org/r/20190603143451.27353-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: "Kirill A. Shutemov" Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 10 +++---- include/linux/mm.h | 10 +------ include/linux/page-flags.h | 6 ++++ include/linux/page_ext.h | 1 - mm/Kconfig.debug | 1 - mm/page_alloc.c | 40 ++++--------------------- mm/page_ext.c | 3 -- 7 files changed, 17 insertions(+), 54 deletions(-) (limited to 'mm') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f1c433daef6b..aa4e7e7b87c2 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -805,12 +805,10 @@ tracking down these problems. debug_pagealloc= - [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this - parameter enables the feature at boot time. In - default, it is disabled. We can avoid allocating huge - chunk of memory for debug pagealloc if we don't enable - it at boot time and the system will work mostly same - with the kernel built without CONFIG_DEBUG_PAGEALLOC. + [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this parameter + enables the feature at boot time. By default, it is + disabled and the system will work mostly the same as a + kernel built without CONFIG_DEBUG_PAGEALLOC. on: enable the feature debugpat [X86] Enable PAT debugging diff --git a/include/linux/mm.h b/include/linux/mm.h index 2c2e98cae2d1..cb8d413d635e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2862,8 +2862,6 @@ extern long copy_huge_page_from_user(struct page *dst_page, bool allow_pagefault); #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ -extern struct page_ext_operations debug_guardpage_ops; - #ifdef CONFIG_DEBUG_PAGEALLOC extern unsigned int _debug_guardpage_minorder; DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled); @@ -2880,16 +2878,10 @@ static inline bool debug_guardpage_enabled(void) static inline bool page_is_guard(struct page *page) { - struct page_ext *page_ext; - if (!debug_guardpage_enabled()) return false; - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return false; - - return test_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); + return PageGuard(page); } #else static inline unsigned int debug_guardpage_minorder(void) { return 0; } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 9f8712a4b1a5..b848517da64c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -703,6 +703,7 @@ PAGEFLAG_FALSE(DoubleMap) #define PG_offline 0x00000100 #define PG_kmemcg 0x00000200 #define PG_table 0x00000400 +#define PG_guard 0x00000800 #define PageType(page, flag) \ ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) @@ -754,6 +755,11 @@ PAGE_TYPE_OPS(Kmemcg, kmemcg) */ PAGE_TYPE_OPS(Table, table) +/* + * Marks guardpages used with debug_pagealloc. + */ +PAGE_TYPE_OPS(Guard, guard) + extern bool is_free_buddy_page(struct page *page); __PAGEFLAG(Isolated, isolated, PF_ANY); diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index f84f167ec04c..09592951725c 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -17,7 +17,6 @@ struct page_ext_operations { #ifdef CONFIG_PAGE_EXTENSION enum page_ext_flags { - PAGE_EXT_DEBUG_GUARD, PAGE_EXT_OWNER, #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) PAGE_EXT_YOUNG, diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index a35ab6c55192..82b6a20898bd 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -12,7 +12,6 @@ config DEBUG_PAGEALLOC bool "Debug page memory allocations" depends on DEBUG_KERNEL depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC - select PAGE_EXTENSION select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC ---help--- Unmap pages from the kernel linear mapping after free_pages(). diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 26b6ad8b065d..ae56e8feec0c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -50,7 +50,6 @@ #include #include #include -#include #include #include #include @@ -668,18 +667,6 @@ static int __init early_debug_pagealloc(char *buf) } early_param("debug_pagealloc", early_debug_pagealloc); -static bool need_debug_guardpage(void) -{ - /* If we don't use debug_pagealloc, we don't need guard page */ - if (!debug_pagealloc_enabled()) - return false; - - if (!debug_guardpage_minorder()) - return false; - - return true; -} - static void init_debug_guardpage(void) { if (!debug_pagealloc_enabled()) @@ -691,11 +678,6 @@ static void init_debug_guardpage(void) static_branch_enable(&_debug_guardpage_enabled); } -struct page_ext_operations debug_guardpage_ops = { - .need = need_debug_guardpage, - .init = init_debug_guardpage, -}; - static int __init debug_guardpage_minorder_setup(char *buf) { unsigned long res; @@ -713,20 +695,13 @@ early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) { - struct page_ext *page_ext; - if (!debug_guardpage_enabled()) return false; if (order >= debug_guardpage_minorder()) return false; - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return false; - - __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); - + __SetPageGuard(page); INIT_LIST_HEAD(&page->lru); set_page_private(page, order); /* Guard pages are not available for any usage */ @@ -738,23 +713,16 @@ static inline bool set_page_guard(struct zone *zone, struct page *page, static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) { - struct page_ext *page_ext; - if (!debug_guardpage_enabled()) return; - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return; - - __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); + __ClearPageGuard(page); set_page_private(page, 0); if (!is_migrate_isolate(migratetype)) __mod_zone_freepage_state(zone, (1 << order), migratetype); } #else -struct page_ext_operations debug_guardpage_ops; static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) { return false; } static inline void clear_page_guard(struct zone *zone, struct page *page, @@ -1930,6 +1898,10 @@ void __init page_alloc_init_late(void) for_each_populated_zone(zone) set_zone_contiguous(zone); + +#ifdef CONFIG_DEBUG_PAGEALLOC + init_debug_guardpage(); +#endif } #ifdef CONFIG_CMA diff --git a/mm/page_ext.c b/mm/page_ext.c index d8f1aca4ad43..5f5769c7db3b 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -59,9 +59,6 @@ */ static struct page_ext_operations *page_ext_ops[] = { -#ifdef CONFIG_DEBUG_PAGEALLOC - &debug_guardpage_ops, -#endif #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif -- cgit v1.2.3-59-g8ed1b From d322a8e5e3e9742fa6b76a207e5df57e03f318f9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:55:17 -0700 Subject: mm/filemap.c: fix an overly long line in read_cache_page Patch series "fix filler_t callback type mismatches", v2. Casting mapping->a_ops->readpage to filler_t causes an indirect call type mismatch with Control-Flow Integrity checking. This change fixes the mismatch in read_cache_page_gfp and read_mapping_page by adding using a NULL filler argument as an indication to call ->readpage directly, and by passing the right parameter callbacks in nfs and jffs2. This patch (of 4): Code cleanup. Link: http://lkml.kernel.org/r/20190520055731.24538-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Kees Cook Cc: Nick Desaulniers Cc: Sami Tolvanen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index f1aa20ab8434..d6f7596f148f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2915,7 +2915,8 @@ struct page *read_cache_page(struct address_space *mapping, int (*filler)(void *, struct page *), void *data) { - return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); + return do_read_cache_page(mapping, index, filler, data, + mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(read_cache_page); -- cgit v1.2.3-59-g8ed1b From 6c45b454191b330c8bc21d1ed3cf39bb6da1a4eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:55:20 -0700 Subject: mm/filemap: don't cast ->readpage to filler_t for do_read_cache_page We can just pass a NULL filler and do the right thing inside of do_read_cache_page based on the NULL parameter. Link: http://lkml.kernel.org/r/20190520055731.24538-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Kees Cook Cc: Nick Desaulniers Cc: Sami Tolvanen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 3 +-- mm/filemap.c | 10 ++++++---- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 6fd0d3aa492c..c7552459a15f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -383,8 +383,7 @@ extern int read_cache_pages(struct address_space *mapping, static inline struct page *read_mapping_page(struct address_space *mapping, pgoff_t index, void *data) { - filler_t *filler = (filler_t *)mapping->a_ops->readpage; - return read_cache_page(mapping, index, filler, data); + return read_cache_page(mapping, index, NULL, data); } /* diff --git a/mm/filemap.c b/mm/filemap.c index d6f7596f148f..1e5e006b8557 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2825,7 +2825,11 @@ repeat: } filler: - err = filler(data, page); + if (filler) + err = filler(data, page); + else + err = mapping->a_ops->readpage(data, page); + if (err < 0) { put_page(page); return ERR_PTR(err); @@ -2937,9 +2941,7 @@ struct page *read_cache_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { - filler_t *filler = (filler_t *)mapping->a_ops->readpage; - - return do_read_cache_page(mapping, index, filler, NULL, gfp); + return do_read_cache_page(mapping, index, NULL, NULL, gfp); } EXPORT_SYMBOL(read_cache_page_gfp); -- cgit v1.2.3-59-g8ed1b From a4985833885b8f568bab90d5dc1886ae68dc82cf Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 11 Jul 2019 20:55:29 -0700 Subject: mm/filemap.c: correct the comment about VM_FAULT_RETRY Commit 6b4c9f446981 ("filemap: drop the mmap_sem for all blocking operations") changed when mmap_sem is dropped during filemap page fault and when returning VM_FAULT_RETRY. Correct the comment to reflect the change. Link: http://lkml.kernel.org/r/1556234531-108228-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Reviewed-by: Josef Bacik Acked-by: Song Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 1e5e006b8557..d0cf700bf201 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2504,10 +2504,8 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, * * vma->vm_mm->mmap_sem must be held on entry. * - * If our return value has VM_FAULT_RETRY set, it's because - * lock_page_or_retry() returned 0. - * The mmap_sem has usually been released in this case. - * See __lock_page_or_retry() for the exception. + * If our return value has VM_FAULT_RETRY set, it's because the mmap_sem + * may be dropped before doing I/O or by lock_page_maybe_drop_mmap(). * * If our return value does not have VM_FAULT_RETRY set, the mmap_sem * has not been released. -- cgit v1.2.3-59-g8ed1b From eb085574a7526c4375965c5fbf7e5b0c19cdd336 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 11 Jul 2019 20:55:33 -0700 Subject: mm, swap: fix race between swapoff and some swap operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When swapin is performed, after getting the swap entry information from the page table, system will swap in the swap entry, without any lock held to prevent the swap device from being swapoff. This may cause the race like below, CPU 1 CPU 2 ----- ----- do_swap_page swapin_readahead __read_swap_cache_async swapoff swapcache_prepare p->swap_map = NULL __swap_duplicate p->swap_map[?] /* !!! NULL pointer access */ Because swapoff is usually done when system shutdown only, the race may not hit many people in practice. But it is still a race need to be fixed. To fix the race, get_swap_device() is added to check whether the specified swap entry is valid in its swap device. If so, it will keep the swap entry valid via preventing the swap device from being swapoff, until put_swap_device() is called. Because swapoff() is very rare code path, to make the normal path runs as fast as possible, rcu_read_lock/unlock() and synchronize_rcu() instead of reference count is used to implement get/put_swap_device(). >From get_swap_device() to put_swap_device(), RCU reader side is locked, so synchronize_rcu() in swapoff() will wait until put_swap_device() is called. In addition to swap_map, cluster_info, etc. data structure in the struct swap_info_struct, the swap cache radix tree will be freed after swapoff, so this patch fixes the race between swap cache looking up and swapoff too. Races between some other swap cache usages and swapoff are fixed too via calling synchronize_rcu() between clearing PageSwapCache() and freeing swap cache data structure. Another possible method to fix this is to use preempt_off() + stop_machine() to prevent the swap device from being swapoff when its data structure is being accessed. The overhead in hot-path of both methods is similar. The advantages of RCU based method are, 1. stop_machine() may disturb the normal execution code path on other CPUs. 2. File cache uses RCU to protect its radix tree. If the similar mechanism is used for swap cache too, it is easier to share code between them. 3. RCU is used to protect swap cache in total_swapcache_pages() and exit_swap_address_space() already. The two mechanisms can be merged to simplify the logic. Link: http://lkml.kernel.org/r/20190522015423.14418-1-ying.huang@intel.com Fixes: 235b62176712 ("mm/swap: add cluster lock") Signed-off-by: "Huang, Ying" Reviewed-by: Andrea Parri Not-nacked-by: Hugh Dickins Cc: Andrea Arcangeli Cc: Paul E. McKenney Cc: Daniel Jordan Cc: Michal Hocko Cc: Minchan Kim Cc: Johannes Weiner Cc: Tim Chen Cc: Mel Gorman Cc: Jérôme Glisse Cc: Yang Shi Cc: David Rientjes Cc: Rik van Riel Cc: Jan Kara Cc: Dave Jiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 13 ++++- mm/memory.c | 2 +- mm/swap_state.c | 16 +++++- mm/swapfile.c | 154 ++++++++++++++++++++++++++++++++++++++++----------- 4 files changed, 146 insertions(+), 39 deletions(-) (limited to 'mm') diff --git a/include/linux/swap.h b/include/linux/swap.h index 4bfb5c4ac108..6358a6185634 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -175,8 +175,9 @@ enum { SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */ SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ + SWP_VALID = (1 << 13), /* swap is valid to be operated on? */ /* add others here before... */ - SWP_SCANNING = (1 << 13), /* refcount in scan_swap_map */ + SWP_SCANNING = (1 << 14), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL @@ -460,7 +461,7 @@ extern unsigned int count_swap_pages(int, int); extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); -extern int __swap_count(struct swap_info_struct *si, swp_entry_t entry); +extern int __swap_count(swp_entry_t entry); extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); @@ -470,6 +471,12 @@ extern int try_to_free_swap(struct page *); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); +extern struct swap_info_struct *get_swap_device(swp_entry_t entry); + +static inline void put_swap_device(struct swap_info_struct *si) +{ + rcu_read_unlock(); +} #else /* CONFIG_SWAP */ @@ -576,7 +583,7 @@ static inline int page_swapcount(struct page *page) return 0; } -static inline int __swap_count(struct swap_info_struct *si, swp_entry_t entry) +static inline int __swap_count(swp_entry_t entry) { return 0; } diff --git a/mm/memory.c b/mm/memory.c index ced4bedc660d..b47e4e56448a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2805,7 +2805,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) struct swap_info_struct *si = swp_swap_info(entry); if (si->flags & SWP_SYNCHRONOUS_IO && - __swap_count(si, entry) == 1) { + __swap_count(entry) == 1) { /* skip swapcache */ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); diff --git a/mm/swap_state.c b/mm/swap_state.c index 85245fdec8d9..61453f1faf72 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -310,8 +310,13 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr) { struct page *page; + struct swap_info_struct *si; + si = get_swap_device(entry); + if (!si) + return NULL; page = find_get_page(swap_address_space(entry), swp_offset(entry)); + put_swap_device(si); INC_CACHE_INFO(find_total); if (page) { @@ -354,8 +359,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, bool *new_page_allocated) { - struct page *found_page, *new_page = NULL; - struct address_space *swapper_space = swap_address_space(entry); + struct page *found_page = NULL, *new_page = NULL; + struct swap_info_struct *si; int err; *new_page_allocated = false; @@ -365,7 +370,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * called after lookup_swap_cache() failed, re-calling * that would confuse statistics. */ - found_page = find_get_page(swapper_space, swp_offset(entry)); + si = get_swap_device(entry); + if (!si) + break; + found_page = find_get_page(swap_address_space(entry), + swp_offset(entry)); + put_swap_device(si); if (found_page) break; diff --git a/mm/swapfile.c b/mm/swapfile.c index 596ac98051c5..dbab16ddefa6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1079,12 +1079,11 @@ fail: static struct swap_info_struct *__swap_info_get(swp_entry_t entry) { struct swap_info_struct *p; - unsigned long offset, type; + unsigned long offset; if (!entry.val) goto out; - type = swp_type(entry); - p = swap_type_to_swap_info(type); + p = swp_swap_info(entry); if (!p) goto bad_nofile; if (!(p->flags & SWP_USED)) @@ -1187,6 +1186,69 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, return usage; } +/* + * Check whether swap entry is valid in the swap device. If so, + * return pointer to swap_info_struct, and keep the swap entry valid + * via preventing the swap device from being swapoff, until + * put_swap_device() is called. Otherwise return NULL. + * + * The entirety of the RCU read critical section must come before the + * return from or after the call to synchronize_rcu() in + * enable_swap_info() or swapoff(). So if "si->flags & SWP_VALID" is + * true, the si->map, si->cluster_info, etc. must be valid in the + * critical section. + * + * Notice that swapoff or swapoff+swapon can still happen before the + * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock() + * in put_swap_device() if there isn't any other way to prevent + * swapoff, such as page lock, page table lock, etc. The caller must + * be prepared for that. For example, the following situation is + * possible. + * + * CPU1 CPU2 + * do_swap_page() + * ... swapoff+swapon + * __read_swap_cache_async() + * swapcache_prepare() + * __swap_duplicate() + * // check swap_map + * // verify PTE not changed + * + * In __swap_duplicate(), the swap_map need to be checked before + * changing partly because the specified swap entry may be for another + * swap device which has been swapoff. And in do_swap_page(), after + * the page is read from the swap device, the PTE is verified not + * changed with the page table locked to check whether the swap device + * has been swapoff or swapoff+swapon. + */ +struct swap_info_struct *get_swap_device(swp_entry_t entry) +{ + struct swap_info_struct *si; + unsigned long offset; + + if (!entry.val) + goto out; + si = swp_swap_info(entry); + if (!si) + goto bad_nofile; + + rcu_read_lock(); + if (!(si->flags & SWP_VALID)) + goto unlock_out; + offset = swp_offset(entry); + if (offset >= si->max) + goto unlock_out; + + return si; +bad_nofile: + pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); +out: + return NULL; +unlock_out: + rcu_read_unlock(); + return NULL; +} + static unsigned char __swap_entry_free(struct swap_info_struct *p, swp_entry_t entry, unsigned char usage) { @@ -1358,11 +1420,18 @@ int page_swapcount(struct page *page) return count; } -int __swap_count(struct swap_info_struct *si, swp_entry_t entry) +int __swap_count(swp_entry_t entry) { + struct swap_info_struct *si; pgoff_t offset = swp_offset(entry); + int count = 0; - return swap_count(si->swap_map[offset]); + si = get_swap_device(entry); + if (si) { + count = swap_count(si->swap_map[offset]); + put_swap_device(si); + } + return count; } static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) @@ -1387,9 +1456,11 @@ int __swp_swapcount(swp_entry_t entry) int count = 0; struct swap_info_struct *si; - si = __swap_info_get(entry); - if (si) + si = get_swap_device(entry); + if (si) { count = swap_swapcount(si, entry); + put_swap_device(si); + } return count; } @@ -2335,9 +2406,9 @@ static int swap_node(struct swap_info_struct *p) return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; } -static void _enable_swap_info(struct swap_info_struct *p, int prio, - unsigned char *swap_map, - struct swap_cluster_info *cluster_info) +static void setup_swap_info(struct swap_info_struct *p, int prio, + unsigned char *swap_map, + struct swap_cluster_info *cluster_info) { int i; @@ -2362,7 +2433,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, } p->swap_map = swap_map; p->cluster_info = cluster_info; - p->flags |= SWP_WRITEOK; +} + +static void _enable_swap_info(struct swap_info_struct *p) +{ + p->flags |= SWP_WRITEOK | SWP_VALID; atomic_long_add(p->pages, &nr_swap_pages); total_swap_pages += p->pages; @@ -2389,7 +2464,17 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, frontswap_init(p->type, frontswap_map); spin_lock(&swap_lock); spin_lock(&p->lock); - _enable_swap_info(p, prio, swap_map, cluster_info); + setup_swap_info(p, prio, swap_map, cluster_info); + spin_unlock(&p->lock); + spin_unlock(&swap_lock); + /* + * Guarantee swap_map, cluster_info, etc. fields are valid + * between get/put_swap_device() if SWP_VALID bit is set + */ + synchronize_rcu(); + spin_lock(&swap_lock); + spin_lock(&p->lock); + _enable_swap_info(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); } @@ -2398,7 +2483,8 @@ static void reinsert_swap_info(struct swap_info_struct *p) { spin_lock(&swap_lock); spin_lock(&p->lock); - _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info); + setup_swap_info(p, p->prio, p->swap_map, p->cluster_info); + _enable_swap_info(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); } @@ -2501,6 +2587,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) reenable_swap_slots_cache_unlock(); + spin_lock(&swap_lock); + spin_lock(&p->lock); + p->flags &= ~SWP_VALID; /* mark swap device as invalid */ + spin_unlock(&p->lock); + spin_unlock(&swap_lock); + /* + * wait for swap operations protected by get/put_swap_device() + * to complete + */ + synchronize_rcu(); + flush_work(&p->discard_work); destroy_swap_extents(p); @@ -3265,17 +3362,11 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) unsigned char has_cache; int err = -EINVAL; - if (non_swap_entry(entry)) - goto out; - - p = swp_swap_info(entry); + p = get_swap_device(entry); if (!p) - goto bad_file; - - offset = swp_offset(entry); - if (unlikely(offset >= p->max)) goto out; + offset = swp_offset(entry); ci = lock_cluster_or_swap_info(p, offset); count = p->swap_map[offset]; @@ -3321,11 +3412,9 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) unlock_out: unlock_cluster_or_swap_info(p, ci); out: + if (p) + put_swap_device(p); return err; - -bad_file: - pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val); - goto out; } /* @@ -3417,6 +3506,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) struct page *list_page; pgoff_t offset; unsigned char count; + int ret = 0; /* * When debugging, it's easier to use __GFP_ZERO here; but it's better @@ -3424,15 +3514,15 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) */ page = alloc_page(gfp_mask | __GFP_HIGHMEM); - si = swap_info_get(entry); + si = get_swap_device(entry); if (!si) { /* * An acceptable race has occurred since the failing - * __swap_duplicate(): the swap entry has been freed, - * perhaps even the whole swap_map cleared for swapoff. + * __swap_duplicate(): the swap device may be swapoff */ goto outer; } + spin_lock(&si->lock); offset = swp_offset(entry); @@ -3450,9 +3540,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) } if (!page) { - unlock_cluster(ci); - spin_unlock(&si->lock); - return -ENOMEM; + ret = -ENOMEM; + goto out; } /* @@ -3504,10 +3593,11 @@ out_unlock_cont: out: unlock_cluster(ci); spin_unlock(&si->lock); + put_swap_device(si); outer: if (page) __free_page(page); - return 0; + return ret; } /* -- cgit v1.2.3-59-g8ed1b From 054f1d1faaed6a7930b77286d607ae45c01d0443 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 11 Jul 2019 20:55:37 -0700 Subject: mm/swap_state.c: simplify total_swapcache_pages() with get_swap_device() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit total_swapcache_pages() may race with swapper_spaces[] allocation and freeing. Previously, this is protected with a swapper_spaces[] specific RCU mechanism. To simplify the logic/code complexity, it is replaced with get/put_swap_device(). The code line number is reduced too. Although not so important, the swapoff() performance improves too because one synchronize_rcu() call during swapoff() is deleted. [ying.huang@intel.com: fix bad swap file entry warning] Link: http://lkml.kernel.org/r/20190531024102.21723-1-ying.huang@intel.com Link: http://lkml.kernel.org/r/20190527082714.12151-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Andrew Morton Tested-by: Mike Kravetz Cc: Hugh Dickins Cc: Paul E. McKenney Cc: Minchan Kim Cc: Johannes Weiner Cc: Tim Chen Cc: Mel Gorman Cc: Jérôme Glisse Cc: Michal Hocko Cc: Andrea Arcangeli Cc: Yang Shi Cc: David Rientjes Cc: Rik van Riel Cc: Jan Kara Cc: Dave Jiang Cc: Daniel Jordan Cc: Andrea Parri Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/swap_state.c b/mm/swap_state.c index 61453f1faf72..8368621a0fc7 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -73,23 +73,24 @@ unsigned long total_swapcache_pages(void) unsigned int i, j, nr; unsigned long ret = 0; struct address_space *spaces; + struct swap_info_struct *si; - rcu_read_lock(); for (i = 0; i < MAX_SWAPFILES; i++) { - /* - * The corresponding entries in nr_swapper_spaces and - * swapper_spaces will be reused only after at least - * one grace period. So it is impossible for them - * belongs to different usage. - */ - nr = nr_swapper_spaces[i]; - spaces = rcu_dereference(swapper_spaces[i]); - if (!nr || !spaces) + swp_entry_t entry = swp_entry(i, 1); + + /* Avoid get_swap_device() to warn for bad swap entry */ + if (!swp_swap_info(entry)) + continue; + /* Prevent swapoff to free swapper_spaces */ + si = get_swap_device(entry); + if (!si) continue; + nr = nr_swapper_spaces[i]; + spaces = swapper_spaces[i]; for (j = 0; j < nr; j++) ret += spaces[j].nrpages; + put_swap_device(si); } - rcu_read_unlock(); return ret; } @@ -611,20 +612,16 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages) mapping_set_no_writeback_tags(space); } nr_swapper_spaces[type] = nr; - rcu_assign_pointer(swapper_spaces[type], spaces); + swapper_spaces[type] = spaces; return 0; } void exit_swap_address_space(unsigned int type) { - struct address_space *spaces; - - spaces = swapper_spaces[type]; + kvfree(swapper_spaces[type]); nr_swapper_spaces[type] = 0; - rcu_assign_pointer(swapper_spaces[type], NULL); - synchronize_rcu(); - kvfree(spaces); + swapper_spaces[type] = NULL; } static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma, -- cgit v1.2.3-59-g8ed1b From 4efaceb1c5f8136d5fec3f26549d294b8e898bd7 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Thu, 11 Jul 2019 20:55:41 -0700 Subject: mm, swap: use rbtree for swap_extent swap_extent is used to map swap page offset to backing device's block offset. For a continuous block range, one swap_extent is used and all these swap_extents are managed in a linked list. These swap_extents are used by map_swap_entry() during swap's read and write path. To find out the backing device's block offset for a page offset, the swap_extent list will be traversed linearly, with curr_swap_extent being used as a cache to speed up the search. This works well as long as swap_extents are not huge or when the number of processes that access swap device are few, but when the swap device has many extents and there are a number of processes accessing the swap device concurrently, it can be a problem. On one of our servers, the disk's remaining size is tight: $df -h Filesystem Size Used Avail Use% Mounted on ... ... /dev/nvme0n1p1 1.8T 1.3T 504G 72% /home/t4 When creating a 80G swapfile there, there are as many as 84656 swap extents. The end result is, kernel spends abou 30% time in map_swap_entry() and swap throughput is only 70MB/s. As a comparison, when I used smaller sized swapfile, like 4G whose swap_extent dropped to 2000, swap throughput is back to 400-500MB/s and map_swap_entry() is about 3%. One downside of using rbtree for swap_extent is, 'struct rbtree' takes 24 bytes while 'struct list_head' takes 16 bytes, that's 8 bytes more for each swap_extent. For a swapfile that has 80k swap_extents, that means 625KiB more memory consumed. Test: Since it's not possible to reboot that server, I can not test this patch diretly there. Instead, I tested it on another server with NVMe disk. I created a 20G swapfile on an NVMe backed XFS fs. By default, the filesystem is quite clean and the created swapfile has only 2 extents. Testing vanilla and this patch shows no obvious performance difference when swapfile is not fragmented. To see the patch's effects, I used some tweaks to manually fragment the swapfile by breaking the extent at 1M boundary. This made the swapfile have 20K extents. nr_task=4 kernel swapout(KB/s) map_swap_entry(perf) swapin(KB/s) map_swap_entry(perf) vanilla 165191 90.77% 171798 90.21% patched 858993 +420% 2.16% 715827 +317% 0.77% nr_task=8 kernel swapout(KB/s) map_swap_entry(perf) swapin(KB/s) map_swap_entry(perf) vanilla 306783 92.19% 318145 87.76% patched 954437 +211% 2.35% 1073741 +237% 1.57% swapout: the throughput of swap out, in KB/s, higher is better 1st map_swap_entry: cpu cycles percent sampled by perf swapin: the throughput of swap in, in KB/s, higher is better. 2nd map_swap_entry: cpu cycles percent sampled by perf nr_task=1 doesn't show any difference, this is due to the curr_swap_extent can be effectively used to cache the correct swap extent for single task workload. [akpm@linux-foundation.org: s/BUG_ON(1)/BUG()/] Link: http://lkml.kernel.org/r/20190523142404.GA181@aaronlu Signed-off-by: Aaron Lu Cc: Huang Ying Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 5 +- mm/page_io.c | 2 +- mm/swapfile.c | 137 ++++++++++++++++++++++++++++----------------------- 3 files changed, 78 insertions(+), 66 deletions(-) (limited to 'mm') diff --git a/include/linux/swap.h b/include/linux/swap.h index 6358a6185634..de2c67a33b7e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -148,7 +148,7 @@ struct zone; * We always assume that blocks are of size PAGE_SIZE. */ struct swap_extent { - struct list_head list; + struct rb_node rb_node; pgoff_t start_page; pgoff_t nr_pages; sector_t start_block; @@ -248,8 +248,7 @@ struct swap_info_struct { unsigned int cluster_next; /* likely index for next allocation */ unsigned int cluster_nr; /* countdown to next cluster search */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ - struct swap_extent *curr_swap_extent; - struct swap_extent first_swap_extent; + struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ struct file *swap_file; /* seldom referenced */ unsigned int old_block_size; /* seldom referenced */ diff --git a/mm/page_io.c b/mm/page_io.c index a39aac2f8c8d..24ee600f9131 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -163,7 +163,7 @@ int generic_swapfile_activate(struct swap_info_struct *sis, blocks_per_page = PAGE_SIZE >> blkbits; /* - * Map all the blocks into the extent list. This code doesn't try + * Map all the blocks into the extent tree. This code doesn't try * to be very smart. */ probe_block = 0; diff --git a/mm/swapfile.c b/mm/swapfile.c index dbab16ddefa6..0789a762ce2f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -152,6 +152,18 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, return ret; } +static inline struct swap_extent *first_se(struct swap_info_struct *sis) +{ + struct rb_node *rb = rb_first(&sis->swap_extent_root); + return rb_entry(rb, struct swap_extent, rb_node); +} + +static inline struct swap_extent *next_se(struct swap_extent *se) +{ + struct rb_node *rb = rb_next(&se->rb_node); + return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL; +} + /* * swapon tell device that all the old swap contents can be discarded, * to allow the swap device to optimize its wear-levelling. @@ -164,7 +176,7 @@ static int discard_swap(struct swap_info_struct *si) int err = 0; /* Do not discard the swap header page! */ - se = &si->first_swap_extent; + se = first_se(si); start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); if (nr_blocks) { @@ -175,7 +187,7 @@ static int discard_swap(struct swap_info_struct *si) cond_resched(); } - list_for_each_entry(se, &si->first_swap_extent.list, list) { + for (se = next_se(se); se; se = next_se(se)) { start_block = se->start_block << (PAGE_SHIFT - 9); nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); @@ -189,6 +201,26 @@ static int discard_swap(struct swap_info_struct *si) return err; /* That will often be -EOPNOTSUPP */ } +static struct swap_extent * +offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset) +{ + struct swap_extent *se; + struct rb_node *rb; + + rb = sis->swap_extent_root.rb_node; + while (rb) { + se = rb_entry(rb, struct swap_extent, rb_node); + if (offset < se->start_page) + rb = rb->rb_left; + else if (offset >= se->start_page + se->nr_pages) + rb = rb->rb_right; + else + return se; + } + /* It *must* be present */ + BUG(); +} + /* * swap allocation tell device that a cluster of swap can now be discarded, * to allow the swap device to optimize its wear-levelling. @@ -196,32 +228,25 @@ static int discard_swap(struct swap_info_struct *si) static void discard_swap_cluster(struct swap_info_struct *si, pgoff_t start_page, pgoff_t nr_pages) { - struct swap_extent *se = si->curr_swap_extent; - int found_extent = 0; + struct swap_extent *se = offset_to_swap_extent(si, start_page); while (nr_pages) { - if (se->start_page <= start_page && - start_page < se->start_page + se->nr_pages) { - pgoff_t offset = start_page - se->start_page; - sector_t start_block = se->start_block + offset; - sector_t nr_blocks = se->nr_pages - offset; - - if (nr_blocks > nr_pages) - nr_blocks = nr_pages; - start_page += nr_blocks; - nr_pages -= nr_blocks; - - if (!found_extent++) - si->curr_swap_extent = se; - - start_block <<= PAGE_SHIFT - 9; - nr_blocks <<= PAGE_SHIFT - 9; - if (blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_NOIO, 0)) - break; - } + pgoff_t offset = start_page - se->start_page; + sector_t start_block = se->start_block + offset; + sector_t nr_blocks = se->nr_pages - offset; + + if (nr_blocks > nr_pages) + nr_blocks = nr_pages; + start_page += nr_blocks; + nr_pages -= nr_blocks; + + start_block <<= PAGE_SHIFT - 9; + nr_blocks <<= PAGE_SHIFT - 9; + if (blkdev_issue_discard(si->bdev, start_block, + nr_blocks, GFP_NOIO, 0)) + break; - se = list_next_entry(se, list); + se = next_se(se); } } @@ -1755,7 +1780,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) return type; } if (bdev == sis->bdev) { - struct swap_extent *se = &sis->first_swap_extent; + struct swap_extent *se = first_se(sis); if (se->start_block == offset) { if (bdev_p) @@ -2232,7 +2257,6 @@ static void drain_mmlist(void) static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) { struct swap_info_struct *sis; - struct swap_extent *start_se; struct swap_extent *se; pgoff_t offset; @@ -2240,18 +2264,8 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) *bdev = sis->bdev; offset = swp_offset(entry); - start_se = sis->curr_swap_extent; - se = start_se; - - for ( ; ; ) { - if (se->start_page <= offset && - offset < (se->start_page + se->nr_pages)) { - return se->start_block + (offset - se->start_page); - } - se = list_next_entry(se, list); - sis->curr_swap_extent = se; - BUG_ON(se == start_se); /* It *must* be present */ - } + se = offset_to_swap_extent(sis, offset); + return se->start_block + (offset - se->start_page); } /* @@ -2269,12 +2283,11 @@ sector_t map_swap_page(struct page *page, struct block_device **bdev) */ static void destroy_swap_extents(struct swap_info_struct *sis) { - while (!list_empty(&sis->first_swap_extent.list)) { - struct swap_extent *se; + while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) { + struct rb_node *rb = sis->swap_extent_root.rb_node; + struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node); - se = list_first_entry(&sis->first_swap_extent.list, - struct swap_extent, list); - list_del(&se->list); + rb_erase(rb, &sis->swap_extent_root); kfree(se); } @@ -2290,7 +2303,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis) /* * Add a block range (and the corresponding page range) into this swapdev's - * extent list. The extent list is kept sorted in page order. + * extent tree. * * This function rather assumes that it is called in ascending page order. */ @@ -2298,20 +2311,21 @@ int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block) { + struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL; struct swap_extent *se; struct swap_extent *new_se; - struct list_head *lh; - - if (start_page == 0) { - se = &sis->first_swap_extent; - sis->curr_swap_extent = se; - se->start_page = 0; - se->nr_pages = nr_pages; - se->start_block = start_block; - return 1; - } else { - lh = sis->first_swap_extent.list.prev; /* Highest extent */ - se = list_entry(lh, struct swap_extent, list); + + /* + * place the new node at the right most since the + * function is called in ascending page order. + */ + while (*link) { + parent = *link; + link = &parent->rb_right; + } + + if (parent) { + se = rb_entry(parent, struct swap_extent, rb_node); BUG_ON(se->start_page + se->nr_pages != start_page); if (se->start_block + se->nr_pages == start_block) { /* Merge it */ @@ -2320,9 +2334,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, } } - /* - * No merge. Insert a new extent, preserving ordering. - */ + /* No merge, insert a new extent. */ new_se = kmalloc(sizeof(*se), GFP_KERNEL); if (new_se == NULL) return -ENOMEM; @@ -2330,7 +2342,8 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, new_se->nr_pages = nr_pages; new_se->start_block = start_block; - list_add_tail(&new_se->list, &sis->first_swap_extent.list); + rb_link_node(&new_se->rb_node, parent, link); + rb_insert_color(&new_se->rb_node, &sis->swap_extent_root); return 1; } EXPORT_SYMBOL_GPL(add_swap_extent); @@ -2846,7 +2859,7 @@ static struct swap_info_struct *alloc_swap_info(void) * would be relying on p->type to remain valid. */ } - INIT_LIST_HEAD(&p->first_swap_extent.list); + p->swap_extent_root = RB_ROOT; plist_node_init(&p->list, 0); for_each_node(i) plist_node_init(&p->avail_lists[i], 0); -- cgit v1.2.3-59-g8ed1b From aeb309b81c6bada783c3695528a3e10748e97285 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 11 Jul 2019 20:55:44 -0700 Subject: mm/mincore.c: fix race between swapoff and mincore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Via commit 4b3ef9daa4fc ("mm/swap: split swap cache into 64MB trunks"), after swapoff, the address_space associated with the swap device will be freed. So swap_address_space() users which touch the address_space need some kind of mechanism to prevent the address_space from being freed during accessing. When mincore processes an unmapped range for swapped shmem pages, it doesn't hold the lock to prevent swap device from being swapped off. So the following race is possible: CPU1 CPU2 do_mincore() swapoff() walk_page_range() mincore_unmapped_range() __mincore_unmapped_range mincore_page as = swap_address_space() ... exit_swap_address_space() ... kvfree(spaces) find_get_page(as) The address space may be accessed after being freed. To fix the race, get_swap_device()/put_swap_device() is used to enclose find_get_page() to check whether the swap entry is valid and prevent the swap device from being swapoff during accessing. Link: http://lkml.kernel.org/r/20190611020510.28251-1-ying.huang@intel.com Fixes: 4b3ef9daa4fc ("mm/swap: split swap cache into 64MB trunks") Signed-off-by: "Huang, Ying" Reviewed-by: Andrew Morton Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Paul E. McKenney Cc: Minchan Kim Cc: Johannes Weiner Cc: Tim Chen Cc: Mel Gorman Cc: Jérôme Glisse Cc: Andrea Arcangeli Cc: Yang Shi Cc: David Rientjes Cc: Rik van Riel Cc: Jan Kara Cc: Dave Jiang Cc: Daniel Jordan Cc: Andrea Parri Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mincore.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mincore.c b/mm/mincore.c index c3f058bd0faf..4fe91d497436 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -68,8 +68,16 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) */ if (xa_is_value(page)) { swp_entry_t swp = radix_to_swp_entry(page); - page = find_get_page(swap_address_space(swp), - swp_offset(swp)); + struct swap_info_struct *si; + + /* Prevent swap device to being swapoff under us */ + si = get_swap_device(swp); + if (si) { + page = find_get_page(swap_address_space(swp), + swp_offset(swp)); + put_swap_device(si); + } else + page = NULL; } } else page = find_get_page(mapping, pgoff); -- cgit v1.2.3-59-g8ed1b From 38d384932ed1b58f0ed8dd17c66e0cd89acb354a Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 11 Jul 2019 20:55:48 -0700 Subject: memcg, oom: no oom-kill for __GFP_RETRY_MAYFAIL The documentation of __GFP_RETRY_MAYFAIL clearly mentioned that the OOM killer will not be triggered and indeed the page alloc does not invoke OOM killer for such allocations. However we do trigger memcg OOM killer for __GFP_RETRY_MAYFAIL. Fix that. This flag will used later to not trigger oom-killer in the charging path for fanotify and inotify event allocations. Link: http://lkml.kernel.org/r/20190514212259.156585-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Cc: Amir Goldstein Cc: Jan Kara Cc: Johannes Weiner Cc: Roman Gushchin Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 591eafafbd8c..2ad94d0ce22f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2279,7 +2279,6 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned long nr_reclaimed; bool may_swap = true; bool drained = false; - bool oomed = false; enum oom_status oom_status; if (mem_cgroup_is_root(memcg)) @@ -2366,7 +2365,7 @@ retry: if (nr_retries--) goto retry; - if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed) + if (gfp_mask & __GFP_RETRY_MAYFAIL) goto nomem; if (gfp_mask & __GFP_NOFAIL) @@ -2385,7 +2384,6 @@ retry: switch (oom_status) { case OOM_SUCCESS: nr_retries = MEM_CGROUP_RECLAIM_RETRIES; - oomed = true; goto retry; case OOM_FAILED: goto force; -- cgit v1.2.3-59-g8ed1b From 1e577f970f66a53d429cbee37b36177c9712f488 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 11 Jul 2019 20:55:55 -0700 Subject: mm, memcg: introduce memory.events.local The memory controller in cgroup v2 exposes memory.events file for each memcg which shows the number of times events like low, high, max, oom and oom_kill have happened for the whole tree rooted at that memcg. Users can also poll or register notification to monitor the changes in that file. Any event at any level of the tree rooted at memcg will notify all the listeners along the path till root_mem_cgroup. There are existing users which depend on this behavior. However there are users which are only interested in the events happening at a specific level of the memcg tree and not in the events in the underlying tree rooted at that memcg. One such use-case is a centralized resource monitor which can dynamically adjust the limits of the jobs running on a system. The jobs can create their sub-hierarchy for their own sub-tasks. The centralized monitor is only interested in the events at the top level memcgs of the jobs as it can then act and adjust the limits of the jobs. Using the current memory.events for such centralized monitor is very inconvenient. The monitor will keep receiving events which it is not interested and to find if the received event is interesting, it has to read memory.event files of the next level and compare it with the top level one. So, let's introduce memory.events.local to the memcg which shows and notify for the events at the memcg level. Now, does memory.stat and memory.pressure need their local versions. IMHO no due to the no internal process contraint of the cgroup v2. The memory.stat file of the top level memcg of a job shows the stats and vmevents of the whole tree. The local stats or vmevents of the top level memcg will only change if there is a process running in that memcg but v2 does not allow that. Similarly for memory.pressure there will not be any process in the internal nodes and thus no chance of local pressure. Link: http://lkml.kernel.org/r/20190527174643.209172-1-shakeelb@google.com Signed-off-by: Shakeel Butt Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Chris Down Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 10 ++++++++++ include/linux/memcontrol.h | 7 ++++++- mm/memcontrol.c | 34 +++++++++++++++++++++++---------- 3 files changed, 40 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index a5c845338d6d..a9548de56ac9 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1146,6 +1146,11 @@ PAGE_SIZE multiple when read back. otherwise, a value change in this file generates a file modified event. + Note that all fields in this file are hierarchical and the + file modified event can be generated due to an event down the + hierarchy. For for the local events at the cgroup level see + memory.events.local. + low The number of times the cgroup is reclaimed due to high memory pressure even though its usage is under @@ -1185,6 +1190,11 @@ PAGE_SIZE multiple when read back. The number of processes belonging to this cgroup killed by any kind of OOM killer. + memory.events.local + Similar to memory.events but the fields in the file are local + to the cgroup i.e. not hierarchical. The file modified event + generated on this file reflects only the local events. + memory.stat A read-only flat-keyed file which exists on non-root cgroups. diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1dcb763bb610..22141ebc5e15 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -233,8 +233,9 @@ struct mem_cgroup { /* OOM-Killer disable */ int oom_kill_disable; - /* memory.events */ + /* memory.events and memory.events.local */ struct cgroup_file events_file; + struct cgroup_file events_local_file; /* handle for "memory.swap.events" */ struct cgroup_file swap_events_file; @@ -281,6 +282,7 @@ struct mem_cgroup { /* memory.events */ atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; + atomic_long_t memory_events_local[MEMCG_NR_MEMORY_EVENTS]; unsigned long socket_pressure; @@ -747,6 +749,9 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { + atomic_long_inc(&memcg->memory_events_local[event]); + cgroup_file_notify(&memcg->events_local_file); + do { atomic_long_inc(&memcg->memory_events[event]); cgroup_file_notify(&memcg->events_file); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2ad94d0ce22f..0a9bd604aa15 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5624,21 +5624,29 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, return nbytes; } +static void __memory_events_show(struct seq_file *m, atomic_long_t *events) +{ + seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW])); + seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH])); + seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX])); + seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM])); + seq_printf(m, "oom_kill %lu\n", + atomic_long_read(&events[MEMCG_OOM_KILL])); +} + static int memory_events_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - seq_printf(m, "low %lu\n", - atomic_long_read(&memcg->memory_events[MEMCG_LOW])); - seq_printf(m, "high %lu\n", - atomic_long_read(&memcg->memory_events[MEMCG_HIGH])); - seq_printf(m, "max %lu\n", - atomic_long_read(&memcg->memory_events[MEMCG_MAX])); - seq_printf(m, "oom %lu\n", - atomic_long_read(&memcg->memory_events[MEMCG_OOM])); - seq_printf(m, "oom_kill %lu\n", - atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); + __memory_events_show(m, memcg->memory_events); + return 0; +} + +static int memory_events_local_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + __memory_events_show(m, memcg->memory_events_local); return 0; } @@ -5800,6 +5808,12 @@ static struct cftype memory_files[] = { .file_offset = offsetof(struct mem_cgroup, events_file), .seq_show = memory_events_show, }, + { + .name = "events.local", + .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct mem_cgroup, events_local_file), + .seq_show = memory_events_local_show, + }, { .name = "stat", .flags = CFTYPE_NOT_ON_ROOT, -- cgit v1.2.3-59-g8ed1b From c8713d0b23123759c9d86b0421243c2c309505d7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 11 Jul 2019 20:55:59 -0700 Subject: mm: memcontrol: dump memory.stat during cgroup OOM The current cgroup OOM memory info dump doesn't include all the memory we are tracking, nor does it give insight into what the VM tried to do leading up to the OOM. All that useful info is in memory.stat. Furthermore, the recursive printing for every child cgroup can generate absurd amounts of data on the console for larger cgroup trees, and it's not like we provide a per-cgroup breakdown during global OOM kills. When an OOM kill is triggered, print one set of recursive memory.stat items at the level whose limit triggered the OOM condition. Example output: stress invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0 CPU: 2 PID: 210 Comm: stress Not tainted 5.2.0-rc2-mm1-00247-g47d49835983c #135 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-20181126_142135-anatol 04/01/2014 Call Trace: dump_stack+0x46/0x60 dump_header+0x4c/0x2d0 oom_kill_process.cold.10+0xb/0x10 out_of_memory+0x200/0x270 ? try_to_free_mem_cgroup_pages+0xdf/0x130 mem_cgroup_out_of_memory+0xb7/0xc0 try_charge+0x680/0x6f0 mem_cgroup_try_charge+0xb5/0x160 __add_to_page_cache_locked+0xc6/0x300 ? list_lru_destroy+0x80/0x80 add_to_page_cache_lru+0x45/0xc0 pagecache_get_page+0x11b/0x290 filemap_fault+0x458/0x6d0 ext4_filemap_fault+0x27/0x36 __do_fault+0x2f/0xb0 __handle_mm_fault+0x9c5/0x1140 ? apic_timer_interrupt+0xa/0x20 handle_mm_fault+0xc5/0x180 __do_page_fault+0x1ab/0x440 ? page_fault+0x8/0x30 page_fault+0x1e/0x30 RIP: 0033:0x55c32167fc10 Code: Bad RIP value. RSP: 002b:00007fff1d031c50 EFLAGS: 00010206 RAX: 000000000dc00000 RBX: 00007fd2db000010 RCX: 00007fd2db000010 RDX: 0000000000000000 RSI: 0000000010001000 RDI: 0000000000000000 RBP: 000055c321680a54 R08: 00000000ffffffff R09: 0000000000000000 R10: 0000000000000022 R11: 0000000000000246 R12: ffffffffffffffff R13: 0000000000000002 R14: 0000000000001000 R15: 0000000010000000 memory: usage 1024kB, limit 1024kB, failcnt 75131 swap: usage 0kB, limit 9007199254740988kB, failcnt 0 Memory cgroup stats for /foo: anon 0 file 0 kernel_stack 36864 slab 274432 sock 0 shmem 0 file_mapped 0 file_dirty 0 file_writeback 0 anon_thp 0 inactive_anon 126976 active_anon 0 inactive_file 0 active_file 0 unevictable 0 slab_reclaimable 0 slab_unreclaimable 274432 pgfault 59466 pgmajfault 1617 workingset_refault 2145 workingset_activate 0 workingset_nodereclaim 0 pgrefill 98952 pgscan 200060 pgsteal 59340 pgactivate 40095 pgdeactivate 96787 pglazyfree 0 pglazyfreed 0 thp_fault_alloc 0 thp_collapse_alloc 0 Tasks state (memory values in pages): [ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name [ 200] 0 200 1121 884 53248 29 0 bash [ 209] 0 209 905 246 45056 19 0 stress [ 210] 0 210 66442 56 499712 56349 0 stress oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),oom_memcg=/foo,task_memcg=/foo,task=stress,pid=210,uid=0 Memory cgroup out of memory: Killed process 210 (stress) total-vm:265768kB, anon-rss:0kB, file-rss:224kB, shmem-rss:0kB oom_reaper: reaped process 210 (stress), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB [hannes@cmpxchg.org: s/kvmalloc/kmalloc/ per Michal] Link: http://lkml.kernel.org/r/20190605161133.GA12453@cmpxchg.org Link: http://lkml.kernel.org/r/20190604210509.9744-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 289 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 157 insertions(+), 132 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0a9bd604aa15..6de79ec3cd21 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -57,6 +57,7 @@ #include #include #include +#include #include "internal.h" #include #include @@ -1356,27 +1357,114 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) return false; } -static const unsigned int memcg1_stats[] = { - MEMCG_CACHE, - MEMCG_RSS, - MEMCG_RSS_HUGE, - NR_SHMEM, - NR_FILE_MAPPED, - NR_FILE_DIRTY, - NR_WRITEBACK, - MEMCG_SWAP, -}; +static char *memory_stat_format(struct mem_cgroup *memcg) +{ + struct seq_buf s; + int i; -static const char *const memcg1_stat_names[] = { - "cache", - "rss", - "rss_huge", - "shmem", - "mapped_file", - "dirty", - "writeback", - "swap", -}; + seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE); + if (!s.buffer) + return NULL; + + /* + * Provide statistics on the state of the memory subsystem as + * well as cumulative event counters that show past behavior. + * + * This list is ordered following a combination of these gradients: + * 1) generic big picture -> specifics and details + * 2) reflecting userspace activity -> reflecting kernel heuristics + * + * Current memory state: + */ + + seq_buf_printf(&s, "anon %llu\n", + (u64)memcg_page_state(memcg, MEMCG_RSS) * + PAGE_SIZE); + seq_buf_printf(&s, "file %llu\n", + (u64)memcg_page_state(memcg, MEMCG_CACHE) * + PAGE_SIZE); + seq_buf_printf(&s, "kernel_stack %llu\n", + (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * + 1024); + seq_buf_printf(&s, "slab %llu\n", + (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) + + memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) * + PAGE_SIZE); + seq_buf_printf(&s, "sock %llu\n", + (u64)memcg_page_state(memcg, MEMCG_SOCK) * + PAGE_SIZE); + + seq_buf_printf(&s, "shmem %llu\n", + (u64)memcg_page_state(memcg, NR_SHMEM) * + PAGE_SIZE); + seq_buf_printf(&s, "file_mapped %llu\n", + (u64)memcg_page_state(memcg, NR_FILE_MAPPED) * + PAGE_SIZE); + seq_buf_printf(&s, "file_dirty %llu\n", + (u64)memcg_page_state(memcg, NR_FILE_DIRTY) * + PAGE_SIZE); + seq_buf_printf(&s, "file_writeback %llu\n", + (u64)memcg_page_state(memcg, NR_WRITEBACK) * + PAGE_SIZE); + + /* + * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter + * with the NR_ANON_THP vm counter, but right now it's a pain in the + * arse because it requires migrating the work out of rmap to a place + * where the page->mem_cgroup is set up and stable. + */ + seq_buf_printf(&s, "anon_thp %llu\n", + (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) * + PAGE_SIZE); + + for (i = 0; i < NR_LRU_LISTS; i++) + seq_buf_printf(&s, "%s %llu\n", mem_cgroup_lru_names[i], + (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * + PAGE_SIZE); + + seq_buf_printf(&s, "slab_reclaimable %llu\n", + (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) * + PAGE_SIZE); + seq_buf_printf(&s, "slab_unreclaimable %llu\n", + (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) * + PAGE_SIZE); + + /* Accumulated memory events */ + + seq_buf_printf(&s, "pgfault %lu\n", memcg_events(memcg, PGFAULT)); + seq_buf_printf(&s, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT)); + + seq_buf_printf(&s, "workingset_refault %lu\n", + memcg_page_state(memcg, WORKINGSET_REFAULT)); + seq_buf_printf(&s, "workingset_activate %lu\n", + memcg_page_state(memcg, WORKINGSET_ACTIVATE)); + seq_buf_printf(&s, "workingset_nodereclaim %lu\n", + memcg_page_state(memcg, WORKINGSET_NODERECLAIM)); + + seq_buf_printf(&s, "pgrefill %lu\n", memcg_events(memcg, PGREFILL)); + seq_buf_printf(&s, "pgscan %lu\n", + memcg_events(memcg, PGSCAN_KSWAPD) + + memcg_events(memcg, PGSCAN_DIRECT)); + seq_buf_printf(&s, "pgsteal %lu\n", + memcg_events(memcg, PGSTEAL_KSWAPD) + + memcg_events(memcg, PGSTEAL_DIRECT)); + seq_buf_printf(&s, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE)); + seq_buf_printf(&s, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE)); + seq_buf_printf(&s, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE)); + seq_buf_printf(&s, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED)); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + seq_buf_printf(&s, "thp_fault_alloc %lu\n", + memcg_events(memcg, THP_FAULT_ALLOC)); + seq_buf_printf(&s, "thp_collapse_alloc %lu\n", + memcg_events(memcg, THP_COLLAPSE_ALLOC)); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + + /* The above should easily fit into one page */ + WARN_ON_ONCE(seq_buf_has_overflowed(&s)); + + return s.buffer; +} #define K(x) ((x) << (PAGE_SHIFT-10)) /** @@ -1411,39 +1499,32 @@ void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct * */ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { - struct mem_cgroup *iter; - unsigned int i; + char *buf; pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->memory)), K((u64)memcg->memory.max), memcg->memory.failcnt); - pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", - K((u64)page_counter_read(&memcg->memsw)), - K((u64)memcg->memsw.max), memcg->memsw.failcnt); - pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", - K((u64)page_counter_read(&memcg->kmem)), - K((u64)memcg->kmem.max), memcg->kmem.failcnt); - - for_each_mem_cgroup_tree(iter, memcg) { - pr_info("Memory cgroup stats for "); - pr_cont_cgroup_path(iter->css.cgroup); - pr_cont(":"); - - for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { - if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account) - continue; - pr_cont(" %s:%luKB", memcg1_stat_names[i], - K(memcg_page_state_local(iter, - memcg1_stats[i]))); - } - - for (i = 0; i < NR_LRU_LISTS; i++) - pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], - K(memcg_page_state_local(iter, - NR_LRU_BASE + i))); - - pr_cont("\n"); + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n", + K((u64)page_counter_read(&memcg->swap)), + K((u64)memcg->swap.max), memcg->swap.failcnt); + else { + pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", + K((u64)page_counter_read(&memcg->memsw)), + K((u64)memcg->memsw.max), memcg->memsw.failcnt); + pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", + K((u64)page_counter_read(&memcg->kmem)), + K((u64)memcg->kmem.max), memcg->kmem.failcnt); } + + pr_info("Memory cgroup stats for "); + pr_cont_cgroup_path(memcg->css.cgroup); + pr_cont(":"); + buf = memory_stat_format(memcg); + if (!buf) + return; + pr_info("%s", buf); + kfree(buf); } /* @@ -3470,6 +3551,28 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) } #endif /* CONFIG_NUMA */ +static const unsigned int memcg1_stats[] = { + MEMCG_CACHE, + MEMCG_RSS, + MEMCG_RSS_HUGE, + NR_SHMEM, + NR_FILE_MAPPED, + NR_FILE_DIRTY, + NR_WRITEBACK, + MEMCG_SWAP, +}; + +static const char *const memcg1_stat_names[] = { + "cache", + "rss", + "rss_huge", + "shmem", + "mapped_file", + "dirty", + "writeback", + "swap", +}; + /* Universal VM events cgroup1 shows, original sort order */ static const unsigned int memcg1_events[] = { PGPGIN, @@ -5653,91 +5756,13 @@ static int memory_events_local_show(struct seq_file *m, void *v) static int memory_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - int i; - - /* - * Provide statistics on the state of the memory subsystem as - * well as cumulative event counters that show past behavior. - * - * This list is ordered following a combination of these gradients: - * 1) generic big picture -> specifics and details - * 2) reflecting userspace activity -> reflecting kernel heuristics - * - * Current memory state: - */ - - seq_printf(m, "anon %llu\n", - (u64)memcg_page_state(memcg, MEMCG_RSS) * PAGE_SIZE); - seq_printf(m, "file %llu\n", - (u64)memcg_page_state(memcg, MEMCG_CACHE) * PAGE_SIZE); - seq_printf(m, "kernel_stack %llu\n", - (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * 1024); - seq_printf(m, "slab %llu\n", - (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) + - memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) * - PAGE_SIZE); - seq_printf(m, "sock %llu\n", - (u64)memcg_page_state(memcg, MEMCG_SOCK) * PAGE_SIZE); - - seq_printf(m, "shmem %llu\n", - (u64)memcg_page_state(memcg, NR_SHMEM) * PAGE_SIZE); - seq_printf(m, "file_mapped %llu\n", - (u64)memcg_page_state(memcg, NR_FILE_MAPPED) * PAGE_SIZE); - seq_printf(m, "file_dirty %llu\n", - (u64)memcg_page_state(memcg, NR_FILE_DIRTY) * PAGE_SIZE); - seq_printf(m, "file_writeback %llu\n", - (u64)memcg_page_state(memcg, NR_WRITEBACK) * PAGE_SIZE); - - /* - * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter - * with the NR_ANON_THP vm counter, but right now it's a pain in the - * arse because it requires migrating the work out of rmap to a place - * where the page->mem_cgroup is set up and stable. - */ - seq_printf(m, "anon_thp %llu\n", - (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) * PAGE_SIZE); - - for (i = 0; i < NR_LRU_LISTS; i++) - seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i], - (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * - PAGE_SIZE); - - seq_printf(m, "slab_reclaimable %llu\n", - (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) * - PAGE_SIZE); - seq_printf(m, "slab_unreclaimable %llu\n", - (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) * - PAGE_SIZE); - - /* Accumulated memory events */ - - seq_printf(m, "pgfault %lu\n", memcg_events(memcg, PGFAULT)); - seq_printf(m, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT)); - - seq_printf(m, "workingset_refault %lu\n", - memcg_page_state(memcg, WORKINGSET_REFAULT)); - seq_printf(m, "workingset_activate %lu\n", - memcg_page_state(memcg, WORKINGSET_ACTIVATE)); - seq_printf(m, "workingset_nodereclaim %lu\n", - memcg_page_state(memcg, WORKINGSET_NODERECLAIM)); - - seq_printf(m, "pgrefill %lu\n", memcg_events(memcg, PGREFILL)); - seq_printf(m, "pgscan %lu\n", memcg_events(memcg, PGSCAN_KSWAPD) + - memcg_events(memcg, PGSCAN_DIRECT)); - seq_printf(m, "pgsteal %lu\n", memcg_events(memcg, PGSTEAL_KSWAPD) + - memcg_events(memcg, PGSTEAL_DIRECT)); - seq_printf(m, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE)); - seq_printf(m, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE)); - seq_printf(m, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE)); - seq_printf(m, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED)); - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - seq_printf(m, "thp_fault_alloc %lu\n", - memcg_events(memcg, THP_FAULT_ALLOC)); - seq_printf(m, "thp_collapse_alloc %lu\n", - memcg_events(memcg, THP_COLLAPSE_ALLOC)); -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + char *buf; + buf = memory_stat_format(memcg); + if (!buf) + return -ENOMEM; + seq_puts(m, buf); + kfree(buf); return 0; } -- cgit v1.2.3-59-g8ed1b From c03914b7aa319fb2b6701a6427c13752c7418b9b Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:02 -0700 Subject: mm: memcg/slab: postpone kmem_cache memcg pointer initialization to memcg_link_cache() Patch series "mm: reparent slab memory on cgroup removal", v7. # Why do we need this? We've noticed that the number of dying cgroups is steadily growing on most of our hosts in production. The following investigation revealed an issue in the userspace memory reclaim code [1], accounting of kernel stacks [2], and also the main reason: slab objects. The underlying problem is quite simple: any page charged to a cgroup holds a reference to it, so the cgroup can't be reclaimed unless all charged pages are gone. If a slab object is actively used by other cgroups, it won't be reclaimed, and will prevent the origin cgroup from being reclaimed. Slab objects, and first of all vfs cache, is shared between cgroups, which are using the same underlying fs, and what's even more important, it's shared between multiple generations of the same workload. So if something is running periodically every time in a new cgroup (like how systemd works), we do accumulate multiple dying cgroups. Strictly speaking pagecache isn't different here, but there is a key difference: we disable protection and apply some extra pressure on LRUs of dying cgroups, and these LRUs contain all charged pages. My experiments show that with the disabled kernel memory accounting the number of dying cgroups stabilizes at a relatively small number (~100, depends on memory pressure and cgroup creation rate), and with kernel memory accounting it grows pretty steadily up to several thousands. Memory cgroups are quite complex and big objects (mostly due to percpu stats), so it leads to noticeable memory losses. Memory occupied by dying cgroups is measured in hundreds of megabytes. I've even seen a host with more than 100Gb of memory wasted for dying cgroups. It leads to a degradation of performance with the uptime, and generally limits the usage of cgroups. My previous attempt [3] to fix the problem by applying extra pressure on slab shrinker lists caused a regressions with xfs and ext4, and has been reverted [4]. The following attempts to find the right balance [5, 6] were not successful. So instead of trying to find a maybe non-existing balance, let's do reparent accounted slab caches to the parent cgroup on cgroup removal. # Implementation approach There is however a significant problem with reparenting of slab memory: there is no list of charged pages. Some of them are in shrinker lists, but not all. Introducing of a new list is really not an option. But fortunately there is a way forward: every slab page has a stable pointer to the corresponding kmem_cache. So the idea is to reparent kmem_caches instead of slab pages. It's actually simpler and cheaper, but requires some underlying changes: 1) Make kmem_caches to hold a single reference to the memory cgroup, instead of a separate reference per every slab page. 2) Stop setting page->mem_cgroup pointer for memcg slab pages and use page->kmem_cache->memcg indirection instead. It's used only on slab page release, so performance overhead shouldn't be a big issue. 3) Introduce a refcounter for non-root slab caches. It's required to be able to destroy kmem_caches when they become empty and release the associated memory cgroup. There is a bonus: currently we release all memcg kmem_caches all together with the memory cgroup itself. This patchset allows individual kmem_caches to be released as soon as they become inactive and free. Some additional implementation details are provided in corresponding commit messages. # Results Below is the average number of dying cgroups on two groups of our production hosts. They do run some sort of web frontend workload, the memory pressure is moderate. As we can see, with the kernel memory reparenting the number stabilizes in 60s range; however with the original version it grows almost linearly and doesn't show any signs of plateauing. The difference in slab and percpu usage between patched and unpatched versions also grows linearly. In 7 days it exceeded 200Mb. day 0 1 2 3 4 5 6 7 original 56 362 628 752 1070 1250 1490 1560 patched 23 46 51 55 60 57 67 69 mem diff(Mb) 22 74 123 152 164 182 214 241 # Links [1]: commit 68600f623d69 ("mm: don't miss the last page because of round-off error") [2]: commit 9b6f7e163cd0 ("mm: rework memcg kernel stack accounting") [3]: commit 172b06c32b94 ("mm: slowly shrink slabs with a relatively small number of objects") [4]: commit a9a238e83fbb ("Revert "mm: slowly shrink slabs with a relatively small number of objects") [5]: https://lkml.org/lkml/2019/1/28/1865 [6]: https://marc.info/?l=linux-mm&m=155064763626437&w=2 This patch (of 10): Initialize kmem_cache->memcg_params.memcg pointer in memcg_link_cache() rather than in init_memcg_params(). Once kmem_cache will hold a reference to the memory cgroup, it will simplify the refcounting. For non-root kmem_caches memcg_link_cache() is always called before the kmem_cache becomes visible to a user, so it's safe. Link: http://lkml.kernel.org/r/20190611231813.3148843-2-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Waiman Long Cc: Michal Hocko Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrei Vagin Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 +- mm/slab.h | 5 +++-- mm/slab_common.c | 14 +++++++------- mm/slub.c | 2 +- 4 files changed, 12 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 3521a351ceb5..badd98f7e2f1 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1239,7 +1239,7 @@ void __init kmem_cache_init(void) nr_node_ids * sizeof(struct kmem_cache_node *), SLAB_HWCACHE_ALIGN, 0, 0); list_add(&kmem_cache->list, &slab_caches); - memcg_link_cache(kmem_cache); + memcg_link_cache(kmem_cache, NULL); slab_state = PARTIAL; /* diff --git a/mm/slab.h b/mm/slab.h index 739099af6cbb..86f7ede21203 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -289,7 +289,7 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, } extern void slab_init_memcg_params(struct kmem_cache *); -extern void memcg_link_cache(struct kmem_cache *s); +extern void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg); extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, void (*deact_fn)(struct kmem_cache *)); @@ -344,7 +344,8 @@ static inline void slab_init_memcg_params(struct kmem_cache *s) { } -static inline void memcg_link_cache(struct kmem_cache *s) +static inline void memcg_link_cache(struct kmem_cache *s, + struct mem_cgroup *memcg) { } diff --git a/mm/slab_common.c b/mm/slab_common.c index a09bb10aa026..07ee4189b40c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -140,13 +140,12 @@ void slab_init_memcg_params(struct kmem_cache *s) } static int init_memcg_params(struct kmem_cache *s, - struct mem_cgroup *memcg, struct kmem_cache *root_cache) + struct kmem_cache *root_cache) { struct memcg_cache_array *arr; if (root_cache) { s->memcg_params.root_cache = root_cache; - s->memcg_params.memcg = memcg; INIT_LIST_HEAD(&s->memcg_params.children_node); INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node); return 0; @@ -221,11 +220,12 @@ int memcg_update_all_caches(int num_memcgs) return ret; } -void memcg_link_cache(struct kmem_cache *s) +void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg) { if (is_root_cache(s)) { list_add(&s->root_caches_node, &slab_root_caches); } else { + s->memcg_params.memcg = memcg; list_add(&s->memcg_params.children_node, &s->memcg_params.root_cache->memcg_params.children); list_add(&s->memcg_params.kmem_caches_node, @@ -244,7 +244,7 @@ static void memcg_unlink_cache(struct kmem_cache *s) } #else static inline int init_memcg_params(struct kmem_cache *s, - struct mem_cgroup *memcg, struct kmem_cache *root_cache) + struct kmem_cache *root_cache) { return 0; } @@ -384,7 +384,7 @@ static struct kmem_cache *create_cache(const char *name, s->useroffset = useroffset; s->usersize = usersize; - err = init_memcg_params(s, memcg, root_cache); + err = init_memcg_params(s, root_cache); if (err) goto out_free_cache; @@ -394,7 +394,7 @@ static struct kmem_cache *create_cache(const char *name, s->refcount = 1; list_add(&s->list, &slab_caches); - memcg_link_cache(s); + memcg_link_cache(s, memcg); out: if (err) return ERR_PTR(err); @@ -997,7 +997,7 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, create_boot_cache(s, name, size, flags, useroffset, usersize); list_add(&s->list, &slab_caches); - memcg_link_cache(s); + memcg_link_cache(s, NULL); s->refcount = 1; return s; } diff --git a/mm/slub.c b/mm/slub.c index 5e217653286c..e1402ed19e74 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4199,7 +4199,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) } slab_init_memcg_params(s); list_add(&s->list, &slab_caches); - memcg_link_cache(s); + memcg_link_cache(s, NULL); return s; } -- cgit v1.2.3-59-g8ed1b From 0b14e8aa68223c2c124d408aa4b110b364d13c53 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:06 -0700 Subject: mm: memcg/slab: rename slab delayed deactivation functions and fields The delayed work/rcu deactivation infrastructure of non-root kmem_caches can be also used for asynchronous release of these objects. Let's get rid of the word "deactivation" in corresponding names to make the code look better after generalization. It's easier to make the renaming first, so that the generalized code will look consistent from scratch. Let's rename struct memcg_cache_params fields: deact_fn -> work_fn deact_rcu_head -> rcu_head deact_work -> work And RCU/delayed work callbacks in slab common code: kmemcg_deactivate_rcufn -> kmemcg_rcufn kmemcg_deactivate_workfn -> kmemcg_workfn This patch contains no functional changes, only renamings. Link: http://lkml.kernel.org/r/20190611231813.3148843-3-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Vladimir Davydov Reviewed-by: Shakeel Butt Cc: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Waiman Long Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Andrei Vagin Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 6 +++--- mm/slab.h | 2 +- mm/slab_common.c | 30 +++++++++++++++--------------- 3 files changed, 19 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/include/linux/slab.h b/include/linux/slab.h index 98c3d12b7275..6008d884e621 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -643,10 +643,10 @@ struct memcg_cache_params { struct list_head children_node; struct list_head kmem_caches_node; - void (*deact_fn)(struct kmem_cache *); + void (*work_fn)(struct kmem_cache *); union { - struct rcu_head deact_rcu_head; - struct work_struct deact_work; + struct rcu_head rcu_head; + struct work_struct work; }; }; }; diff --git a/mm/slab.h b/mm/slab.h index 86f7ede21203..7ef695b91919 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -291,7 +291,7 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, extern void slab_init_memcg_params(struct kmem_cache *); extern void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg); extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, - void (*deact_fn)(struct kmem_cache *)); + void (*work_fn)(struct kmem_cache *)); #else /* CONFIG_MEMCG_KMEM */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 07ee4189b40c..f4dd9f75751c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -691,17 +691,17 @@ out_unlock: put_online_cpus(); } -static void kmemcg_deactivate_workfn(struct work_struct *work) +static void kmemcg_workfn(struct work_struct *work) { struct kmem_cache *s = container_of(work, struct kmem_cache, - memcg_params.deact_work); + memcg_params.work); get_online_cpus(); get_online_mems(); mutex_lock(&slab_mutex); - s->memcg_params.deact_fn(s); + s->memcg_params.work_fn(s); mutex_unlock(&slab_mutex); @@ -712,36 +712,36 @@ static void kmemcg_deactivate_workfn(struct work_struct *work) css_put(&s->memcg_params.memcg->css); } -static void kmemcg_deactivate_rcufn(struct rcu_head *head) +static void kmemcg_rcufn(struct rcu_head *head) { struct kmem_cache *s = container_of(head, struct kmem_cache, - memcg_params.deact_rcu_head); + memcg_params.rcu_head); /* - * We need to grab blocking locks. Bounce to ->deact_work. The + * We need to grab blocking locks. Bounce to ->work. The * work item shares the space with the RCU head and can't be * initialized eariler. */ - INIT_WORK(&s->memcg_params.deact_work, kmemcg_deactivate_workfn); - queue_work(memcg_kmem_cache_wq, &s->memcg_params.deact_work); + INIT_WORK(&s->memcg_params.work, kmemcg_workfn); + queue_work(memcg_kmem_cache_wq, &s->memcg_params.work); } /** * slab_deactivate_memcg_cache_rcu_sched - schedule deactivation after a * sched RCU grace period * @s: target kmem_cache - * @deact_fn: deactivation function to call + * @work_fn: deactivation function to call * - * Schedule @deact_fn to be invoked with online cpus, mems and slab_mutex + * Schedule @work_fn to be invoked with online cpus, mems and slab_mutex * held after a sched RCU grace period. The slab is guaranteed to stay - * alive until @deact_fn is finished. This is to be used from + * alive until @work_fn is finished. This is to be used from * __kmemcg_cache_deactivate(). */ void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, - void (*deact_fn)(struct kmem_cache *)) + void (*work_fn)(struct kmem_cache *)) { if (WARN_ON_ONCE(is_root_cache(s)) || - WARN_ON_ONCE(s->memcg_params.deact_fn)) + WARN_ON_ONCE(s->memcg_params.work_fn)) return; if (s->memcg_params.root_cache->memcg_params.dying) @@ -750,8 +750,8 @@ void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, /* pin memcg so that @s doesn't get destroyed in the middle */ css_get(&s->memcg_params.memcg->css); - s->memcg_params.deact_fn = deact_fn; - call_rcu(&s->memcg_params.deact_rcu_head, kmemcg_deactivate_rcufn); + s->memcg_params.work_fn = work_fn; + call_rcu(&s->memcg_params.rcu_head, kmemcg_rcufn); } void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) -- cgit v1.2.3-59-g8ed1b From 434866947564b954409c2fe561605e22f7b49f64 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:09 -0700 Subject: mm: memcg/slab: generalize postponed non-root kmem_cache deactivation Currently SLUB uses a work scheduled after an RCU grace period to deactivate a non-root kmem_cache. This mechanism can be reused for kmem_caches release, but requires generalization for SLAB case. Introduce kmemcg_cache_deactivate() function, which calls allocator-specific __kmem_cache_deactivate() and schedules execution of __kmem_cache_deactivate_after_rcu() with all necessary locks in a worker context after an rcu grace period. Here is the new calling scheme: kmemcg_cache_deactivate() __kmemcg_cache_deactivate() SLAB/SLUB-specific kmemcg_rcufn() rcu kmemcg_workfn() work __kmemcg_cache_deactivate_after_rcu() SLAB/SLUB-specific instead of: __kmemcg_cache_deactivate() SLAB/SLUB-specific slab_deactivate_memcg_cache_rcu_sched() SLUB-only kmemcg_rcufn() rcu kmemcg_workfn() work kmemcg_cache_deact_after_rcu() SLUB-only For consistency, all allocator-specific functions start with "__". Link: http://lkml.kernel.org/r/20190611231813.3148843-4-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Vladimir Davydov Reviewed-by: Shakeel Butt Cc: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Waiman Long Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Andrei Vagin Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 4 ++++ mm/slab.h | 3 +-- mm/slab_common.c | 27 ++++++++------------------- mm/slub.c | 8 +------- 4 files changed, 14 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index badd98f7e2f1..30347bd3f19c 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2252,6 +2252,10 @@ void __kmemcg_cache_deactivate(struct kmem_cache *cachep) { __kmem_cache_shrink(cachep); } + +void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s) +{ +} #endif int __kmem_cache_shutdown(struct kmem_cache *cachep) diff --git a/mm/slab.h b/mm/slab.h index 7ef695b91919..dc83583ee9dd 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -172,6 +172,7 @@ int __kmem_cache_shutdown(struct kmem_cache *); void __kmem_cache_release(struct kmem_cache *); int __kmem_cache_shrink(struct kmem_cache *); void __kmemcg_cache_deactivate(struct kmem_cache *s); +void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s); void slab_kmem_cache_release(struct kmem_cache *); struct seq_file; @@ -290,8 +291,6 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, extern void slab_init_memcg_params(struct kmem_cache *); extern void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg); -extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, - void (*work_fn)(struct kmem_cache *)); #else /* CONFIG_MEMCG_KMEM */ diff --git a/mm/slab_common.c b/mm/slab_common.c index f4dd9f75751c..62733bbcc971 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -708,7 +708,7 @@ static void kmemcg_workfn(struct work_struct *work) put_online_mems(); put_online_cpus(); - /* done, put the ref from slab_deactivate_memcg_cache_rcu_sched() */ + /* done, put the ref from kmemcg_cache_deactivate() */ css_put(&s->memcg_params.memcg->css); } @@ -726,31 +726,21 @@ static void kmemcg_rcufn(struct rcu_head *head) queue_work(memcg_kmem_cache_wq, &s->memcg_params.work); } -/** - * slab_deactivate_memcg_cache_rcu_sched - schedule deactivation after a - * sched RCU grace period - * @s: target kmem_cache - * @work_fn: deactivation function to call - * - * Schedule @work_fn to be invoked with online cpus, mems and slab_mutex - * held after a sched RCU grace period. The slab is guaranteed to stay - * alive until @work_fn is finished. This is to be used from - * __kmemcg_cache_deactivate(). - */ -void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, - void (*work_fn)(struct kmem_cache *)) +static void kmemcg_cache_deactivate(struct kmem_cache *s) { if (WARN_ON_ONCE(is_root_cache(s)) || WARN_ON_ONCE(s->memcg_params.work_fn)) return; + __kmemcg_cache_deactivate(s); + if (s->memcg_params.root_cache->memcg_params.dying) return; /* pin memcg so that @s doesn't get destroyed in the middle */ css_get(&s->memcg_params.memcg->css); - s->memcg_params.work_fn = work_fn; + s->memcg_params.work_fn = __kmemcg_cache_deactivate_after_rcu; call_rcu(&s->memcg_params.rcu_head, kmemcg_rcufn); } @@ -773,7 +763,7 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) if (!c) continue; - __kmemcg_cache_deactivate(c); + kmemcg_cache_deactivate(c); arr->entries[idx] = NULL; } mutex_unlock(&slab_mutex); @@ -866,11 +856,10 @@ static void flush_memcg_workqueue(struct kmem_cache *s) mutex_unlock(&slab_mutex); /* - * SLUB deactivates the kmem_caches through call_rcu. Make + * SLAB and SLUB deactivate the kmem_caches through call_rcu. Make * sure all registered rcu callbacks have been invoked. */ - if (IS_ENABLED(CONFIG_SLUB)) - rcu_barrier(); + rcu_barrier(); /* * SLAB and SLUB create memcg kmem_caches through workqueue and SLUB diff --git a/mm/slub.c b/mm/slub.c index e1402ed19e74..845aeaa6c2d4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4008,7 +4008,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) } #ifdef CONFIG_MEMCG -static void kmemcg_cache_deact_after_rcu(struct kmem_cache *s) +void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s) { /* * Called with all the locks held after a sched RCU grace period. @@ -4034,12 +4034,6 @@ void __kmemcg_cache_deactivate(struct kmem_cache *s) */ slub_set_cpu_partial(s, 0); s->min_partial = 0; - - /* - * s->cpu_partial is checked locklessly (see put_cpu_partial), so - * we have to make sure the change is visible before shrinking. - */ - slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu); } #endif /* CONFIG_MEMCG */ -- cgit v1.2.3-59-g8ed1b From 49a18eae2e98a794477b5af5d85938e430c0be72 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:13 -0700 Subject: mm: memcg/slab: introduce __memcg_kmem_uncharge_memcg() Let's separate the page counter modification code out of __memcg_kmem_uncharge() in a way similar to what __memcg_kmem_charge() and __memcg_kmem_charge_memcg() work. This will allow to reuse this code later using a new memcg_kmem_uncharge_memcg() wrapper, which calls __memcg_kmem_uncharge_memcg() if memcg_kmem_enabled() check is passed. Link: http://lkml.kernel.org/r/20190611231813.3148843-5-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Vladimir Davydov Cc: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Waiman Long Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Andrei Vagin Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 ++++++++++ mm/memcontrol.c | 25 +++++++++++++++++-------- 2 files changed, 27 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 22141ebc5e15..68402842c337 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1278,6 +1278,8 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge(struct page *page, int order); int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, struct mem_cgroup *memcg); +void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg, + unsigned int nr_pages); extern struct static_key_false memcg_kmem_enabled_key; extern struct workqueue_struct *memcg_kmem_cache_wq; @@ -1319,6 +1321,14 @@ static inline int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, return __memcg_kmem_charge_memcg(page, gfp, order, memcg); return 0; } + +static inline void memcg_kmem_uncharge_memcg(struct page *page, int order, + struct mem_cgroup *memcg) +{ + if (memcg_kmem_enabled()) + __memcg_kmem_uncharge_memcg(memcg, 1 << order); +} + /* * helper for accessing a memcg's index. It will be used as an index in the * child cache array in kmem_cache, and also to derive its name. This function diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6de79ec3cd21..25e35a8b8ba2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2807,6 +2807,22 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) css_put(&memcg->css); return ret; } + +/** + * __memcg_kmem_uncharge_memcg: uncharge a kmem page + * @memcg: memcg to uncharge + * @nr_pages: number of pages to uncharge + */ +void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg, + unsigned int nr_pages) +{ + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + page_counter_uncharge(&memcg->kmem, nr_pages); + + page_counter_uncharge(&memcg->memory, nr_pages); + if (do_memsw_account()) + page_counter_uncharge(&memcg->memsw, nr_pages); +} /** * __memcg_kmem_uncharge: uncharge a kmem page * @page: page to uncharge @@ -2821,14 +2837,7 @@ void __memcg_kmem_uncharge(struct page *page, int order) return; VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); - - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_uncharge(&memcg->kmem, nr_pages); - - page_counter_uncharge(&memcg->memory, nr_pages); - if (do_memsw_account()) - page_counter_uncharge(&memcg->memsw, nr_pages); - + __memcg_kmem_uncharge_memcg(memcg, nr_pages); page->mem_cgroup = NULL; /* slab pages do not have PageKmemcg flag set */ -- cgit v1.2.3-59-g8ed1b From 6cea1d569d24af6f9e95f70cb301807440ae2981 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:16 -0700 Subject: mm: memcg/slab: unify SLAB and SLUB page accounting Currently the page accounting code is duplicated in SLAB and SLUB internals. Let's move it into new (un)charge_slab_page helpers in the slab_common.c file. These helpers will be responsible for statistics (global and memcg-aware) and memcg charging. So they are replacing direct memcg_(un)charge_slab() calls. Link: http://lkml.kernel.org/r/20190611231813.3148843-6-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Christoph Lameter Acked-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Waiman Long Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Andrei Vagin Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 19 +++---------------- mm/slab.h | 25 +++++++++++++++++++++++++ mm/slub.c | 14 ++------------ 3 files changed, 30 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 30347bd3f19c..e9d90b0da47b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1360,7 +1360,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) { struct page *page; - int nr_pages; flags |= cachep->allocflags; @@ -1370,17 +1369,11 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, return NULL; } - if (memcg_charge_slab(page, flags, cachep->gfporder, cachep)) { + if (charge_slab_page(page, flags, cachep->gfporder, cachep)) { __free_pages(page, cachep->gfporder); return NULL; } - nr_pages = (1 << cachep->gfporder); - if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, nr_pages); - else - mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE, nr_pages); - __SetPageSlab(page); /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ if (sk_memalloc_socks() && page_is_pfmemalloc(page)) @@ -1395,12 +1388,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, static void kmem_freepages(struct kmem_cache *cachep, struct page *page) { int order = cachep->gfporder; - unsigned long nr_freed = (1 << order); - - if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, -nr_freed); - else - mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE, -nr_freed); BUG_ON(!PageSlab(page)); __ClearPageSlabPfmemalloc(page); @@ -1409,8 +1396,8 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) page->mapping = NULL; if (current->reclaim_state) - current->reclaim_state->reclaimed_slab += nr_freed; - memcg_uncharge_slab(page, order, cachep); + current->reclaim_state->reclaimed_slab += 1 << order; + uncharge_slab_page(page, order, cachep); __free_pages(page, order); } diff --git a/mm/slab.h b/mm/slab.h index dc83583ee9dd..46623a576a3c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -205,6 +205,12 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); +static inline int cache_vmstat_idx(struct kmem_cache *s) +{ + return (s->flags & SLAB_RECLAIM_ACCOUNT) ? + NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE; +} + #ifdef CONFIG_MEMCG_KMEM /* List of all root caches. */ @@ -361,6 +367,25 @@ static inline struct kmem_cache *virt_to_cache(const void *obj) return page->slab_cache; } +static __always_inline int charge_slab_page(struct page *page, + gfp_t gfp, int order, + struct kmem_cache *s) +{ + int ret = memcg_charge_slab(page, gfp, order, s); + + if (!ret) + mod_lruvec_page_state(page, cache_vmstat_idx(s), 1 << order); + + return ret; +} + +static __always_inline void uncharge_slab_page(struct page *page, int order, + struct kmem_cache *s) +{ + mod_lruvec_page_state(page, cache_vmstat_idx(s), -(1 << order)); + memcg_uncharge_slab(page, order, s); +} + static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) { struct kmem_cache *cachep; diff --git a/mm/slub.c b/mm/slub.c index 845aeaa6c2d4..c9541a480627 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1488,7 +1488,7 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s, else page = __alloc_pages_node(node, flags, order); - if (page && memcg_charge_slab(page, flags, order, s)) { + if (page && charge_slab_page(page, flags, order, s)) { __free_pages(page, order); page = NULL; } @@ -1681,11 +1681,6 @@ out: if (!page) return NULL; - mod_lruvec_page_state(page, - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - 1 << oo_order(oo)); - inc_slabs_node(s, page_to_nid(page), page->objects); return page; @@ -1719,18 +1714,13 @@ static void __free_slab(struct kmem_cache *s, struct page *page) check_object(s, page, p, SLUB_RED_INACTIVE); } - mod_lruvec_page_state(page, - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - -pages); - __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); page->mapping = NULL; if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - memcg_uncharge_slab(page, order, s); + uncharge_slab_page(page, order, s); __free_pages(page, order); } -- cgit v1.2.3-59-g8ed1b From 570332978ea7fdbec86a07086a584d796a87da2c Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:20 -0700 Subject: mm: memcg/slab: don't check the dying flag on kmem_cache creation There is no point in checking the root_cache->memcg_params.dying flag on kmem_cache creation path. New allocations shouldn't be performed using a dead root kmem_cache, so no new memcg kmem_cache creation can be scheduled after the flag is set. And if it was scheduled before, flush_memcg_workqueue() will wait for it anyway. So let's drop this check to simplify the code. Link: http://lkml.kernel.org/r/20190611231813.3148843-7-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Vladimir Davydov Reviewed-by: Shakeel Butt Cc: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Waiman Long Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Andrei Vagin Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index 62733bbcc971..afdd73553b88 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -640,7 +640,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, * The memory cgroup could have been offlined while the cache * creation work was pending. */ - if (memcg->kmem_state != KMEM_ONLINE || root_cache->memcg_params.dying) + if (memcg->kmem_state != KMEM_ONLINE) goto out_unlock; idx = memcg_cache_id(memcg); -- cgit v1.2.3-59-g8ed1b From 63b02ef7dc4ec239df45c018ac0adbd02ba30a0c Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:24 -0700 Subject: mm: memcg/slab: synchronize access to kmem_cache dying flag using a spinlock Currently the memcg_params.dying flag and the corresponding workqueue used for the asynchronous deactivation of kmem_caches is synchronized using the slab_mutex. It makes impossible to check this flag from the irq context, which will be required in order to implement asynchronous release of kmem_caches. So let's switch over to the irq-save flavor of the spinlock-based synchronization. Link: http://lkml.kernel.org/r/20190611231813.3148843-8-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Vladimir Davydov Reviewed-by: Shakeel Butt Cc: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Waiman Long Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Andrei Vagin Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index afdd73553b88..a15557776d7d 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -130,6 +130,7 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, #ifdef CONFIG_MEMCG_KMEM LIST_HEAD(slab_root_caches); +static DEFINE_SPINLOCK(memcg_kmem_wq_lock); void slab_init_memcg_params(struct kmem_cache *s) { @@ -734,14 +735,22 @@ static void kmemcg_cache_deactivate(struct kmem_cache *s) __kmemcg_cache_deactivate(s); + /* + * memcg_kmem_wq_lock is used to synchronize memcg_params.dying + * flag and make sure that no new kmem_cache deactivation tasks + * are queued (see flush_memcg_workqueue() ). + */ + spin_lock_irq(&memcg_kmem_wq_lock); if (s->memcg_params.root_cache->memcg_params.dying) - return; + goto unlock; /* pin memcg so that @s doesn't get destroyed in the middle */ css_get(&s->memcg_params.memcg->css); s->memcg_params.work_fn = __kmemcg_cache_deactivate_after_rcu; call_rcu(&s->memcg_params.rcu_head, kmemcg_rcufn); +unlock: + spin_unlock_irq(&memcg_kmem_wq_lock); } void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) @@ -851,9 +860,9 @@ static int shutdown_memcg_caches(struct kmem_cache *s) static void flush_memcg_workqueue(struct kmem_cache *s) { - mutex_lock(&slab_mutex); + spin_lock_irq(&memcg_kmem_wq_lock); s->memcg_params.dying = true; - mutex_unlock(&slab_mutex); + spin_unlock_irq(&memcg_kmem_wq_lock); /* * SLAB and SLUB deactivate the kmem_caches through call_rcu. Make -- cgit v1.2.3-59-g8ed1b From f0a3a24b532d9a7e56a33c5112b2a212ed6ec580 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:27 -0700 Subject: mm: memcg/slab: rework non-root kmem_cache lifecycle management Currently each charged slab page holds a reference to the cgroup to which it's charged. Kmem_caches are held by the memcg and are released all together with the memory cgroup. It means that none of kmem_caches are released unless at least one reference to the memcg exists, which is very far from optimal. Let's rework it in a way that allows releasing individual kmem_caches as soon as the cgroup is offline, the kmem_cache is empty and there are no pending allocations. To make it possible, let's introduce a new percpu refcounter for non-root kmem caches. The counter is initialized to the percpu mode, and is switched to the atomic mode during kmem_cache deactivation. The counter is bumped for every charged page and also for every running allocation. So the kmem_cache can't be released unless all allocations complete. To shutdown non-active empty kmem_caches, let's reuse the work queue, previously used for the kmem_cache deactivation. Once the reference counter reaches 0, let's schedule an asynchronous kmem_cache release. * I used the following simple approach to test the performance (stolen from another patchset by T. Harding): time find / -name fname-no-exist echo 2 > /proc/sys/vm/drop_caches repeat 10 times Results: orig patched real 0m1.455s real 0m1.355s user 0m0.206s user 0m0.219s sys 0m0.855s sys 0m0.807s real 0m1.487s real 0m1.699s user 0m0.221s user 0m0.256s sys 0m0.806s sys 0m0.948s real 0m1.515s real 0m1.505s user 0m0.183s user 0m0.215s sys 0m0.876s sys 0m0.858s real 0m1.291s real 0m1.380s user 0m0.193s user 0m0.198s sys 0m0.843s sys 0m0.786s real 0m1.364s real 0m1.374s user 0m0.180s user 0m0.182s sys 0m0.868s sys 0m0.806s real 0m1.352s real 0m1.312s user 0m0.201s user 0m0.212s sys 0m0.820s sys 0m0.761s real 0m1.302s real 0m1.349s user 0m0.205s user 0m0.203s sys 0m0.803s sys 0m0.792s real 0m1.334s real 0m1.301s user 0m0.194s user 0m0.201s sys 0m0.806s sys 0m0.779s real 0m1.426s real 0m1.434s user 0m0.216s user 0m0.181s sys 0m0.824s sys 0m0.864s real 0m1.350s real 0m1.295s user 0m0.200s user 0m0.190s sys 0m0.842s sys 0m0.811s So it looks like the difference is not noticeable in this test. [cai@lca.pw: fix an use-after-free in kmemcg_workfn()] Link: http://lkml.kernel.org/r/1560977573-10715-1-git-send-email-cai@lca.pw Link: http://lkml.kernel.org/r/20190611231813.3148843-9-guro@fb.com Signed-off-by: Roman Gushchin Signed-off-by: Qian Cai Acked-by: Vladimir Davydov Cc: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Shakeel Butt Cc: Waiman Long Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Andrei Vagin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 3 +- mm/memcontrol.c | 50 +++++++++++++++++++++++++-------- mm/slab.h | 44 ++++++++--------------------- mm/slab_common.c | 78 +++++++++++++++++++++++++++++----------------------- 4 files changed, 96 insertions(+), 79 deletions(-) (limited to 'mm') diff --git a/include/linux/slab.h b/include/linux/slab.h index 6008d884e621..bc189a43e680 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -16,6 +16,7 @@ #include #include #include +#include /* @@ -152,7 +153,6 @@ int kmem_cache_shrink(struct kmem_cache *); void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *); void memcg_deactivate_kmem_caches(struct mem_cgroup *); -void memcg_destroy_kmem_caches(struct mem_cgroup *); /* * Please use this macro to create slab caches. Simply specify the @@ -642,6 +642,7 @@ struct memcg_cache_params { struct mem_cgroup *memcg; struct list_head children_node; struct list_head kmem_caches_node; + struct percpu_ref refcnt; void (*work_fn)(struct kmem_cache *); union { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 25e35a8b8ba2..ce4ce5e7937b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2667,12 +2667,13 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, { struct memcg_kmem_cache_create_work *cw; + if (!css_tryget_online(&memcg->css)) + return; + cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN); if (!cw) return; - css_get(&memcg->css); - cw->memcg = memcg; cw->cachep = cachep; INIT_WORK(&cw->work, memcg_kmem_cache_create_func); @@ -2707,6 +2708,7 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) { struct mem_cgroup *memcg; struct kmem_cache *memcg_cachep; + struct memcg_cache_array *arr; int kmemcg_id; VM_BUG_ON(!is_root_cache(cachep)); @@ -2714,14 +2716,28 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) if (memcg_kmem_bypass()) return cachep; - memcg = get_mem_cgroup_from_current(); + rcu_read_lock(); + + if (unlikely(current->active_memcg)) + memcg = current->active_memcg; + else + memcg = mem_cgroup_from_task(current); + + if (!memcg || memcg == root_mem_cgroup) + goto out_unlock; + kmemcg_id = READ_ONCE(memcg->kmemcg_id); if (kmemcg_id < 0) - goto out; + goto out_unlock; + + arr = rcu_dereference(cachep->memcg_params.memcg_caches); - memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id); - if (likely(memcg_cachep)) - return memcg_cachep; + /* + * Make sure we will access the up-to-date value. The code updating + * memcg_caches issues a write barrier to match the data dependency + * barrier inside READ_ONCE() (see memcg_create_kmem_cache()). + */ + memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]); /* * If we are in a safe context (can wait, and not in interrupt @@ -2734,10 +2750,20 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) * memcg_create_kmem_cache, this means no further allocation * could happen with the slab_mutex held. So it's better to * defer everything. + * + * If the memcg is dying or memcg_cache is about to be released, + * don't bother creating new kmem_caches. Because memcg_cachep + * is ZEROed as the fist step of kmem offlining, we don't need + * percpu_ref_tryget_live() here. css_tryget_online() check in + * memcg_schedule_kmem_cache_create() will prevent us from + * creation of a new kmem_cache. */ - memcg_schedule_kmem_cache_create(memcg, cachep); -out: - css_put(&memcg->css); + if (unlikely(!memcg_cachep)) + memcg_schedule_kmem_cache_create(memcg, cachep); + else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt)) + cachep = memcg_cachep; +out_unlock: + rcu_read_unlock(); return cachep; } @@ -2748,7 +2774,7 @@ out: void memcg_kmem_put_cache(struct kmem_cache *cachep) { if (!is_root_cache(cachep)) - css_put(&cachep->memcg_params.memcg->css); + percpu_ref_put(&cachep->memcg_params.refcnt); } /** @@ -3295,7 +3321,7 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) memcg_offline_kmem(memcg); if (memcg->kmem_state == KMEM_ALLOCATED) { - memcg_destroy_kmem_caches(memcg); + WARN_ON(!list_empty(&memcg->kmem_caches)); static_branch_dec(&memcg_kmem_enabled_key); WARN_ON(page_counter_read(&memcg->kmem)); } diff --git a/mm/slab.h b/mm/slab.h index 46623a576a3c..5d2b8511e6fb 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -248,31 +248,6 @@ static inline const char *cache_name(struct kmem_cache *s) return s->name; } -/* - * Note, we protect with RCU only the memcg_caches array, not per-memcg caches. - * That said the caller must assure the memcg's cache won't go away by either - * taking a css reference to the owner cgroup, or holding the slab_mutex. - */ -static inline struct kmem_cache * -cache_from_memcg_idx(struct kmem_cache *s, int idx) -{ - struct kmem_cache *cachep; - struct memcg_cache_array *arr; - - rcu_read_lock(); - arr = rcu_dereference(s->memcg_params.memcg_caches); - - /* - * Make sure we will access the up-to-date value. The code updating - * memcg_caches issues a write barrier to match this (see - * memcg_create_kmem_cache()). - */ - cachep = READ_ONCE(arr->entries[idx]); - rcu_read_unlock(); - - return cachep; -} - static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) { if (is_root_cache(s)) @@ -284,14 +259,25 @@ static __always_inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { + int ret; + if (is_root_cache(s)) return 0; - return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); + + ret = memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); + if (ret) + return ret; + + percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order); + + return 0; } static __always_inline void memcg_uncharge_slab(struct page *page, int order, struct kmem_cache *s) { + if (!is_root_cache(s)) + percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order); memcg_kmem_uncharge(page, order); } @@ -323,12 +309,6 @@ static inline const char *cache_name(struct kmem_cache *s) return s->name; } -static inline struct kmem_cache * -cache_from_memcg_idx(struct kmem_cache *s, int idx) -{ - return NULL; -} - static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) { return s; diff --git a/mm/slab_common.c b/mm/slab_common.c index a15557776d7d..ee3971f7fabc 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -132,6 +132,8 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, LIST_HEAD(slab_root_caches); static DEFINE_SPINLOCK(memcg_kmem_wq_lock); +static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref); + void slab_init_memcg_params(struct kmem_cache *s) { s->memcg_params.root_cache = NULL; @@ -146,6 +148,12 @@ static int init_memcg_params(struct kmem_cache *s, struct memcg_cache_array *arr; if (root_cache) { + int ret = percpu_ref_init(&s->memcg_params.refcnt, + kmemcg_cache_shutdown, + 0, GFP_KERNEL); + if (ret) + return ret; + s->memcg_params.root_cache = root_cache; INIT_LIST_HEAD(&s->memcg_params.children_node); INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node); @@ -171,6 +179,8 @@ static void destroy_memcg_params(struct kmem_cache *s) { if (is_root_cache(s)) kvfree(rcu_access_pointer(s->memcg_params.memcg_caches)); + else + percpu_ref_exit(&s->memcg_params.refcnt); } static void free_memcg_params(struct rcu_head *rcu) @@ -226,6 +236,7 @@ void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg) if (is_root_cache(s)) { list_add(&s->root_caches_node, &slab_root_caches); } else { + css_get(&memcg->css); s->memcg_params.memcg = memcg; list_add(&s->memcg_params.children_node, &s->memcg_params.root_cache->memcg_params.children); @@ -241,6 +252,7 @@ static void memcg_unlink_cache(struct kmem_cache *s) } else { list_del(&s->memcg_params.children_node); list_del(&s->memcg_params.kmem_caches_node); + css_put(&s->memcg_params.memcg->css); } } #else @@ -678,7 +690,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, } /* - * Since readers won't lock (see cache_from_memcg_idx()), we need a + * Since readers won't lock (see memcg_kmem_get_cache()), we need a * barrier here to ensure nobody will see the kmem_cache partially * initialized. */ @@ -701,16 +713,11 @@ static void kmemcg_workfn(struct work_struct *work) get_online_mems(); mutex_lock(&slab_mutex); - s->memcg_params.work_fn(s); - mutex_unlock(&slab_mutex); put_online_mems(); put_online_cpus(); - - /* done, put the ref from kmemcg_cache_deactivate() */ - css_put(&s->memcg_params.memcg->css); } static void kmemcg_rcufn(struct rcu_head *head) @@ -727,10 +734,38 @@ static void kmemcg_rcufn(struct rcu_head *head) queue_work(memcg_kmem_cache_wq, &s->memcg_params.work); } +static void kmemcg_cache_shutdown_fn(struct kmem_cache *s) +{ + WARN_ON(shutdown_cache(s)); +} + +static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref) +{ + struct kmem_cache *s = container_of(percpu_ref, struct kmem_cache, + memcg_params.refcnt); + unsigned long flags; + + spin_lock_irqsave(&memcg_kmem_wq_lock, flags); + if (s->memcg_params.root_cache->memcg_params.dying) + goto unlock; + + s->memcg_params.work_fn = kmemcg_cache_shutdown_fn; + INIT_WORK(&s->memcg_params.work, kmemcg_workfn); + queue_work(memcg_kmem_cache_wq, &s->memcg_params.work); + +unlock: + spin_unlock_irqrestore(&memcg_kmem_wq_lock, flags); +} + +static void kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s) +{ + __kmemcg_cache_deactivate_after_rcu(s); + percpu_ref_kill(&s->memcg_params.refcnt); +} + static void kmemcg_cache_deactivate(struct kmem_cache *s) { - if (WARN_ON_ONCE(is_root_cache(s)) || - WARN_ON_ONCE(s->memcg_params.work_fn)) + if (WARN_ON_ONCE(is_root_cache(s))) return; __kmemcg_cache_deactivate(s); @@ -744,10 +779,7 @@ static void kmemcg_cache_deactivate(struct kmem_cache *s) if (s->memcg_params.root_cache->memcg_params.dying) goto unlock; - /* pin memcg so that @s doesn't get destroyed in the middle */ - css_get(&s->memcg_params.memcg->css); - - s->memcg_params.work_fn = __kmemcg_cache_deactivate_after_rcu; + s->memcg_params.work_fn = kmemcg_cache_deactivate_after_rcu; call_rcu(&s->memcg_params.rcu_head, kmemcg_rcufn); unlock: spin_unlock_irq(&memcg_kmem_wq_lock); @@ -781,28 +813,6 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) put_online_cpus(); } -void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) -{ - struct kmem_cache *s, *s2; - - get_online_cpus(); - get_online_mems(); - - mutex_lock(&slab_mutex); - list_for_each_entry_safe(s, s2, &memcg->kmem_caches, - memcg_params.kmem_caches_node) { - /* - * The cgroup is about to be freed and therefore has no charges - * left. Hence, all its caches must be empty by now. - */ - BUG_ON(shutdown_cache(s)); - } - mutex_unlock(&slab_mutex); - - put_online_mems(); - put_online_cpus(); -} - static int shutdown_memcg_caches(struct kmem_cache *s) { struct memcg_cache_array *arr; -- cgit v1.2.3-59-g8ed1b From 4d96ba3530750fae3f3f01150adfecde96157815 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:31 -0700 Subject: mm: memcg/slab: stop setting page->mem_cgroup pointer for slab pages Every slab page charged to a non-root memory cgroup has a pointer to the memory cgroup and holds a reference to it, which protects a non-empty memory cgroup from being released. At the same time the page has a pointer to the corresponding kmem_cache, and also hold a reference to the kmem_cache. And kmem_cache by itself holds a reference to the cgroup. So there is clearly some redundancy, which allows to stop setting the page->mem_cgroup pointer and rely on getting memcg pointer indirectly via kmem_cache. Further it will allow to change this pointer easier, without a need to go over all charged pages. So let's stop setting page->mem_cgroup pointer for slab pages, and stop using the css refcounter directly for protecting the memory cgroup from going away. Instead rely on kmem_cache as an intermediate object. Make sure that vmstats and shrinker lists are working as previously, as well as /proc/kpagecgroup interface. Link: http://lkml.kernel.org/r/20190611231813.3148843-10-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Vladimir Davydov Reviewed-by: Shakeel Butt Cc: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Waiman Long Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Andrei Vagin Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/list_lru.c | 3 ++- mm/memcontrol.c | 12 ++++++---- mm/slab.h | 74 +++++++++++++++++++++++++++++++++++++++++++++++---------- 3 files changed, 70 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/list_lru.c b/mm/list_lru.c index 927d85be32f6..0f1f6b06b7f3 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -12,6 +12,7 @@ #include #include #include +#include "slab.h" #ifdef CONFIG_MEMCG_KMEM static LIST_HEAD(list_lrus); @@ -63,7 +64,7 @@ static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr) if (!memcg_kmem_enabled()) return NULL; page = virt_to_head_page(ptr); - return page->mem_cgroup; + return memcg_from_slab_page(page); } static inline struct list_lru_one * diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ce4ce5e7937b..fa39e51b3d94 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -486,7 +486,10 @@ ino_t page_cgroup_ino(struct page *page) unsigned long ino = 0; rcu_read_lock(); - memcg = READ_ONCE(page->mem_cgroup); + if (PageHead(page) && PageSlab(page)) + memcg = memcg_from_slab_page(page); + else + memcg = READ_ONCE(page->mem_cgroup); while (memcg && !(memcg->css.flags & CSS_ONLINE)) memcg = parent_mem_cgroup(memcg); if (memcg) @@ -2802,9 +2805,6 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, cancel_charge(memcg, nr_pages); return -ENOMEM; } - - page->mem_cgroup = memcg; - return 0; } @@ -2827,8 +2827,10 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) memcg = get_mem_cgroup_from_current(); if (!mem_cgroup_is_root(memcg)) { ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); - if (!ret) + if (!ret) { + page->mem_cgroup = memcg; __SetPageKmemcg(page); + } } css_put(&memcg->css); return ret; diff --git a/mm/slab.h b/mm/slab.h index 5d2b8511e6fb..7ead47cb9338 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -255,30 +255,67 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) return s->memcg_params.root_cache; } +/* + * Expects a pointer to a slab page. Please note, that PageSlab() check + * isn't sufficient, as it returns true also for tail compound slab pages, + * which do not have slab_cache pointer set. + * So this function assumes that the page can pass PageHead() and PageSlab() + * checks. + */ +static inline struct mem_cgroup *memcg_from_slab_page(struct page *page) +{ + struct kmem_cache *s; + + s = READ_ONCE(page->slab_cache); + if (s && !is_root_cache(s)) + return s->memcg_params.memcg; + + return NULL; +} + +/* + * Charge the slab page belonging to the non-root kmem_cache. + * Can be called for non-root kmem_caches only. + */ static __always_inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { + struct mem_cgroup *memcg; + struct lruvec *lruvec; int ret; - if (is_root_cache(s)) - return 0; - - ret = memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); + memcg = s->memcg_params.memcg; + ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); if (ret) return ret; + lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); + mod_lruvec_state(lruvec, cache_vmstat_idx(s), 1 << order); + + /* transer try_charge() page references to kmem_cache */ percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order); + css_put_many(&memcg->css, 1 << order); return 0; } +/* + * Uncharge a slab page belonging to a non-root kmem_cache. + * Can be called for non-root kmem_caches only. + */ static __always_inline void memcg_uncharge_slab(struct page *page, int order, struct kmem_cache *s) { - if (!is_root_cache(s)) - percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order); - memcg_kmem_uncharge(page, order); + struct mem_cgroup *memcg; + struct lruvec *lruvec; + + memcg = s->memcg_params.memcg; + lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); + mod_lruvec_state(lruvec, cache_vmstat_idx(s), -(1 << order)); + memcg_kmem_uncharge_memcg(page, order, memcg); + + percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order); } extern void slab_init_memcg_params(struct kmem_cache *); @@ -314,6 +351,11 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) return s; } +static inline struct mem_cgroup *memcg_from_slab_page(struct page *page) +{ + return NULL; +} + static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { @@ -351,18 +393,24 @@ static __always_inline int charge_slab_page(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { - int ret = memcg_charge_slab(page, gfp, order, s); - - if (!ret) - mod_lruvec_page_state(page, cache_vmstat_idx(s), 1 << order); + if (is_root_cache(s)) { + mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), + 1 << order); + return 0; + } - return ret; + return memcg_charge_slab(page, gfp, order, s); } static __always_inline void uncharge_slab_page(struct page *page, int order, struct kmem_cache *s) { - mod_lruvec_page_state(page, cache_vmstat_idx(s), -(1 << order)); + if (is_root_cache(s)) { + mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), + -(1 << order)); + return; + } + memcg_uncharge_slab(page, order, s); } -- cgit v1.2.3-59-g8ed1b From fb2f2b0adb98bbbbbb51c5a5327f3f90f5dc417e Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:34 -0700 Subject: mm: memcg/slab: reparent memcg kmem_caches on cgroup removal Let's reparent non-root kmem_caches on memcg offlining. This allows us to release the memory cgroup without waiting for the last outstanding kernel object (e.g. dentry used by another application). Since the parent cgroup is already charged, everything we need to do is to splice the list of kmem_caches to the parent's kmem_caches list, swap the memcg pointer, drop the css refcounter for each kmem_cache and adjust the parent's css refcounter. Please, note that kmem_cache->memcg_params.memcg isn't a stable pointer anymore. It's safe to read it under rcu_read_lock(), cgroup_mutex held, or any other way that protects the memory cgroup from being released. We can race with the slab allocation and deallocation paths. It's not a big problem: parent's charge and slab global stats are always correct, and we don't care anymore about the child usage and global stats. The child cgroup is already offline, so we don't use or show it anywhere. Local slab stats (NR_SLAB_RECLAIMABLE and NR_SLAB_UNRECLAIMABLE) aren't used anywhere except count_shadow_nodes(). But even there it won't break anything: after reparenting "nodes" will be 0 on child level (because we're already reparenting shrinker lists), and on parent level page stats always were 0, and this patch won't change anything. [guro@fb.com: properly handle kmem_caches reparented to root_mem_cgroup] Link: http://lkml.kernel.org/r/20190620213427.1691847-1-guro@fb.com Link: http://lkml.kernel.org/r/20190611231813.3148843-11-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Vladimir Davydov Reviewed-by: Shakeel Butt Acked-by: David Rientjes Cc: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Waiman Long Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Andrei Vagin Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 2 +- mm/memcontrol.c | 14 ++++++++------ mm/slab.h | 41 ++++++++++++++++++++++++++++++++--------- mm/slab_common.c | 19 +++++++++++++++++-- 4 files changed, 58 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/include/linux/slab.h b/include/linux/slab.h index bc189a43e680..fd0ef2e16178 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -152,7 +152,7 @@ void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *); -void memcg_deactivate_kmem_caches(struct mem_cgroup *); +void memcg_deactivate_kmem_caches(struct mem_cgroup *, struct mem_cgroup *); /* * Please use this macro to create slab caches. Simply specify the diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fa39e51b3d94..2cb7e4e5c51a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3284,15 +3284,15 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) */ memcg->kmem_state = KMEM_ALLOCATED; - memcg_deactivate_kmem_caches(memcg); - - kmemcg_id = memcg->kmemcg_id; - BUG_ON(kmemcg_id < 0); - parent = parent_mem_cgroup(memcg); if (!parent) parent = root_mem_cgroup; + memcg_deactivate_kmem_caches(memcg, parent); + + kmemcg_id = memcg->kmemcg_id; + BUG_ON(kmemcg_id < 0); + /* * Change kmemcg_id of this cgroup and all its descendants to the * parent's id, and then move all entries from this cgroup's list_lrus @@ -3325,7 +3325,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) if (memcg->kmem_state == KMEM_ALLOCATED) { WARN_ON(!list_empty(&memcg->kmem_caches)); static_branch_dec(&memcg_kmem_enabled_key); - WARN_ON(page_counter_read(&memcg->kmem)); } } #else @@ -4773,6 +4772,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) /* The following stuff does not apply to the root */ if (!parent) { +#ifdef CONFIG_MEMCG_KMEM + INIT_LIST_HEAD(&memcg->kmem_caches); +#endif root_mem_cgroup = memcg; return &memcg->css; } diff --git a/mm/slab.h b/mm/slab.h index 7ead47cb9338..a62372d0f271 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -261,6 +261,9 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) * which do not have slab_cache pointer set. * So this function assumes that the page can pass PageHead() and PageSlab() * checks. + * + * The kmem_cache can be reparented asynchronously. The caller must ensure + * the memcg lifetime, e.g. by taking rcu_read_lock() or cgroup_mutex. */ static inline struct mem_cgroup *memcg_from_slab_page(struct page *page) { @@ -268,7 +271,7 @@ static inline struct mem_cgroup *memcg_from_slab_page(struct page *page) s = READ_ONCE(page->slab_cache); if (s && !is_root_cache(s)) - return s->memcg_params.memcg; + return READ_ONCE(s->memcg_params.memcg); return NULL; } @@ -285,10 +288,22 @@ static __always_inline int memcg_charge_slab(struct page *page, struct lruvec *lruvec; int ret; - memcg = s->memcg_params.memcg; + rcu_read_lock(); + memcg = READ_ONCE(s->memcg_params.memcg); + while (memcg && !css_tryget_online(&memcg->css)) + memcg = parent_mem_cgroup(memcg); + rcu_read_unlock(); + + if (unlikely(!memcg || mem_cgroup_is_root(memcg))) { + mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), + (1 << order)); + percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order); + return 0; + } + ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); if (ret) - return ret; + goto out; lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); mod_lruvec_state(lruvec, cache_vmstat_idx(s), 1 << order); @@ -296,8 +311,9 @@ static __always_inline int memcg_charge_slab(struct page *page, /* transer try_charge() page references to kmem_cache */ percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order); css_put_many(&memcg->css, 1 << order); - - return 0; +out: + css_put(&memcg->css); + return ret; } /* @@ -310,10 +326,17 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, struct mem_cgroup *memcg; struct lruvec *lruvec; - memcg = s->memcg_params.memcg; - lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); - mod_lruvec_state(lruvec, cache_vmstat_idx(s), -(1 << order)); - memcg_kmem_uncharge_memcg(page, order, memcg); + rcu_read_lock(); + memcg = READ_ONCE(s->memcg_params.memcg); + if (likely(!mem_cgroup_is_root(memcg))) { + lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); + mod_lruvec_state(lruvec, cache_vmstat_idx(s), -(1 << order)); + memcg_kmem_uncharge_memcg(page, order, memcg); + } else { + mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), + -(1 << order)); + } + rcu_read_unlock(); percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order); } diff --git a/mm/slab_common.c b/mm/slab_common.c index ee3971f7fabc..b893eefb6229 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -252,7 +252,8 @@ static void memcg_unlink_cache(struct kmem_cache *s) } else { list_del(&s->memcg_params.children_node); list_del(&s->memcg_params.kmem_caches_node); - css_put(&s->memcg_params.memcg->css); + mem_cgroup_put(s->memcg_params.memcg); + WRITE_ONCE(s->memcg_params.memcg, NULL); } } #else @@ -785,11 +786,13 @@ unlock: spin_unlock_irq(&memcg_kmem_wq_lock); } -void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) +void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg, + struct mem_cgroup *parent) { int idx; struct memcg_cache_array *arr; struct kmem_cache *s, *c; + unsigned int nr_reparented; idx = memcg_cache_id(memcg); @@ -807,6 +810,18 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) kmemcg_cache_deactivate(c); arr->entries[idx] = NULL; } + nr_reparented = 0; + list_for_each_entry(s, &memcg->kmem_caches, + memcg_params.kmem_caches_node) { + WRITE_ONCE(s->memcg_params.memcg, parent); + css_put(&memcg->css); + nr_reparented++; + } + if (nr_reparented) { + list_splice_init(&memcg->kmem_caches, + &parent->kmem_caches); + css_get_many(&parent->css, nr_reparented); + } mutex_unlock(&slab_mutex); put_online_mems(); -- cgit v1.2.3-59-g8ed1b From fcf8a1e483490cd249df4e02d5425636c3f43c86 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 11 Jul 2019 20:56:38 -0700 Subject: mm, memcg: add a memcg_slabinfo debugfs file There are concerns about memory leaks from extensive use of memory cgroups as each memory cgroup creates its own set of kmem caches. There is a possiblity that the memcg kmem caches may remain even after the memory cgroups have been offlined. Therefore, it will be useful to show the status of each of memcg kmem caches. This patch introduces a new /memcg_slabinfo file which is somewhat similar to /proc/slabinfo in format, but lists only information about kmem caches that have child memcg kmem caches. Information available in /proc/slabinfo are not repeated in memcg_slabinfo. A portion of a sample output of the file was: # rpc_inode_cache root 13 51 1 1 rpc_inode_cache 48 0 0 0 0 fat_inode_cache root 1 45 1 1 fat_inode_cache 41 2 45 1 1 xfs_inode root 770 816 24 24 xfs_inode 92 22 34 1 1 xfs_inode 88:dead 1 34 1 1 xfs_inode 89:dead 23 34 1 1 xfs_inode 85 4 34 1 1 xfs_inode 84 9 34 1 1 The css id of the memcg is also listed. If a memcg is not online, the tag ":dead" will be attached as shown above. [longman@redhat.com: memcg: add ":deact" tag for reparented kmem caches in memcg_slabinfo] Link: http://lkml.kernel.org/r/20190621173005.31514-1-longman@redhat.com [longman@redhat.com: set the flag in the common code as suggested by Roman] Link: http://lkml.kernel.org/r/20190627184324.5875-1-longman@redhat.com Link: http://lkml.kernel.org/r/20190619171621.26209-1-longman@redhat.com Signed-off-by: Waiman Long Suggested-by: Shakeel Butt Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 4 ++++ mm/slab_common.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) (limited to 'mm') diff --git a/include/linux/slab.h b/include/linux/slab.h index fd0ef2e16178..56c9c7eed34e 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -116,6 +116,10 @@ /* Objects are reclaimable */ #define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000U) #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ + +/* Slab deactivation flag */ +#define SLAB_DEACTIVATED ((slab_flags_t __force)0x10000000U) + /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. * diff --git a/mm/slab_common.c b/mm/slab_common.c index b893eefb6229..6c49dbb3769e 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -770,6 +771,7 @@ static void kmemcg_cache_deactivate(struct kmem_cache *s) return; __kmemcg_cache_deactivate(s); + s->flags |= SLAB_DEACTIVATED; /* * memcg_kmem_wq_lock is used to synchronize memcg_params.dying @@ -1521,6 +1523,64 @@ static int __init slab_proc_init(void) return 0; } module_init(slab_proc_init); + +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_MEMCG_KMEM) +/* + * Display information about kmem caches that have child memcg caches. + */ +static int memcg_slabinfo_show(struct seq_file *m, void *unused) +{ + struct kmem_cache *s, *c; + struct slabinfo sinfo; + + mutex_lock(&slab_mutex); + seq_puts(m, "# "); + seq_puts(m, " \n"); + list_for_each_entry(s, &slab_root_caches, root_caches_node) { + /* + * Skip kmem caches that don't have any memcg children. + */ + if (list_empty(&s->memcg_params.children)) + continue; + + memset(&sinfo, 0, sizeof(sinfo)); + get_slabinfo(s, &sinfo); + seq_printf(m, "%-17s root %6lu %6lu %6lu %6lu\n", + cache_name(s), sinfo.active_objs, sinfo.num_objs, + sinfo.active_slabs, sinfo.num_slabs); + + for_each_memcg_cache(c, s) { + struct cgroup_subsys_state *css; + char *status = ""; + + css = &c->memcg_params.memcg->css; + if (!(css->flags & CSS_ONLINE)) + status = ":dead"; + else if (c->flags & SLAB_DEACTIVATED) + status = ":deact"; + + memset(&sinfo, 0, sizeof(sinfo)); + get_slabinfo(c, &sinfo); + seq_printf(m, "%-17s %4d%-6s %6lu %6lu %6lu %6lu\n", + cache_name(c), css->id, status, + sinfo.active_objs, sinfo.num_objs, + sinfo.active_slabs, sinfo.num_slabs); + } + } + mutex_unlock(&slab_mutex); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(memcg_slabinfo); + +static int __init memcg_slabinfo_init(void) +{ + debugfs_create_file("memcg_slabinfo", S_IFREG | S_IRUGO, + NULL, NULL, &memcg_slabinfo_fops); + return 0; +} + +late_initcall(memcg_slabinfo_init); +#endif /* CONFIG_DEBUG_FS && CONFIG_MEMCG_KMEM */ #endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */ static __always_inline void *__do_krealloc(const void *p, size_t new_size, -- cgit v1.2.3-59-g8ed1b From f455c854877dcce5d714b00203ea804bf601fb02 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:56:41 -0700 Subject: mm: use untagged_addr() for get_user_pages_fast addresses Patch series "switch the remaining architectures to use generic GUP", v4. A series to switch mips, sh and sparc64 to use the generic GUP code so that we only have one codebase to touch for further improvements to this code. This patch (of 16): This will allow sparc64, or any future architecture with memory tagging to override its tags for get_user_pages and get_user_pages_fast. Link: http://lkml.kernel.org/r/20190625143715.1689-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Khalid Aziz Reviewed-by: Jason Gunthorpe Cc: Paul Burton Cc: James Hogan Cc: Yoshinori Sato Cc: Rich Felker Cc: David Miller Cc: Nicholas Piggin Cc: Khalid Aziz Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index f1c1daebc425..fc704dc37914 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2146,7 +2146,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, unsigned long flags; int nr = 0; - start &= PAGE_MASK; + start = untagged_addr(start) & PAGE_MASK; len = (unsigned long) nr_pages << PAGE_SHIFT; end = start + len; @@ -2219,7 +2219,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, unsigned long addr, len, end; int nr = 0, ret = 0; - start &= PAGE_MASK; + start = untagged_addr(start) & PAGE_MASK; addr = start; len = (unsigned long) nr_pages << PAGE_SHIFT; end = start + len; -- cgit v1.2.3-59-g8ed1b From 26f4c328079d78d6cd0462c53c14ec0b69f4748e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:56:45 -0700 Subject: mm: simplify gup_fast_permitted Pass in the already calculated end value instead of recomputing it, and leave the end > start check in the callers instead of duplicating them in the arch code. Link: http://lkml.kernel.org/r/20190625143715.1689-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: David Miller Cc: James Hogan Cc: Khalid Aziz Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/pgtable.h | 8 +------- arch/x86/include/asm/pgtable_64.h | 8 +------- mm/gup.c | 17 +++++++---------- 3 files changed, 9 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 9f0195d5fa16..9b274fcaacb6 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1270,14 +1270,8 @@ static inline pte_t *pte_offset(pmd_t *pmd, unsigned long address) #define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address) #define pte_unmap(pte) do { } while (0) -static inline bool gup_fast_permitted(unsigned long start, int nr_pages) +static inline bool gup_fast_permitted(unsigned long start, unsigned long end) { - unsigned long len, end; - - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - if (end < start) - return false; return end <= current->mm->context.asce_limit; } #define gup_fast_permitted gup_fast_permitted diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 0bb566315621..4990d26dfc73 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -259,14 +259,8 @@ extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); #define gup_fast_permitted gup_fast_permitted -static inline bool gup_fast_permitted(unsigned long start, int nr_pages) +static inline bool gup_fast_permitted(unsigned long start, unsigned long end) { - unsigned long len, end; - - len = (unsigned long)nr_pages << PAGE_SHIFT; - end = start + len; - if (end < start) - return false; if (end >> __VIRTUAL_MASK_SHIFT) return false; return true; diff --git a/mm/gup.c b/mm/gup.c index fc704dc37914..84891c06d485 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2123,13 +2123,9 @@ static void gup_pgd_range(unsigned long addr, unsigned long end, * Check if it's allowed to use __get_user_pages_fast() for the range, or * we need to fall back to the slow version: */ -bool gup_fast_permitted(unsigned long start, int nr_pages) +static bool gup_fast_permitted(unsigned long start, unsigned long end) { - unsigned long len, end; - - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - return end >= start; + return true; } #endif @@ -2150,6 +2146,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, len = (unsigned long) nr_pages << PAGE_SHIFT; end = start + len; + if (end <= start) + return 0; if (unlikely(!access_ok((void __user *)start, len))) return 0; @@ -2165,7 +2163,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * block IPIs that come from THPs splitting. */ - if (gup_fast_permitted(start, nr_pages)) { + if (gup_fast_permitted(start, end)) { local_irq_save(flags); gup_pgd_range(start, end, write ? FOLL_WRITE : 0, pages, &nr); local_irq_restore(flags); @@ -2224,13 +2222,12 @@ int get_user_pages_fast(unsigned long start, int nr_pages, len = (unsigned long) nr_pages << PAGE_SHIFT; end = start + len; - if (nr_pages <= 0) + if (end <= start) return 0; - if (unlikely(!access_ok((void __user *)start, len))) return -EFAULT; - if (gup_fast_permitted(start, nr_pages)) { + if (gup_fast_permitted(start, end)) { local_irq_disable(); gup_pgd_range(addr, end, gup_flags, pages, &nr); local_irq_enable(); -- cgit v1.2.3-59-g8ed1b From 39656e83dab918861931ef96e5c41731b0899e56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:56:49 -0700 Subject: mm: lift the x86_32 PAE version of gup_get_pte to common code The split low/high access is the only non-READ_ONCE version of gup_get_pte that did show up in the various arch implemenations. Lift it to common code and drop the ifdef based arch override. Link: http://lkml.kernel.org/r/20190625143715.1689-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: David Miller Cc: James Hogan Cc: Khalid Aziz Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/include/asm/pgtable-3level.h | 47 -------------------------------- arch/x86/kvm/mmu.c | 2 +- mm/Kconfig | 3 +++ mm/gup.c | 51 ++++++++++++++++++++++++++++++++--- 5 files changed, 52 insertions(+), 52 deletions(-) (limited to 'mm') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index dce10b18f4bc..71c1f7864434 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -123,6 +123,7 @@ config X86 select GENERIC_STRNLEN_USER select GENERIC_TIME_VSYSCALL select GENERIC_GETTIMEOFDAY + select GUP_GET_PTE_LOW_HIGH if X86_PAE select HARDLOCKUP_CHECK_TIMESTAMP if X86_64 select HAVE_ACPI_APEI if ACPI select HAVE_ACPI_APEI_NMI if ACPI diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index f8b1ad2c3828..e3633795fb22 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -285,53 +285,6 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp) #define __pte_to_swp_entry(pte) (__swp_entry(__pteval_swp_type(pte), \ __pteval_swp_offset(pte))) -#define gup_get_pte gup_get_pte -/* - * WARNING: only to be used in the get_user_pages_fast() implementation. - * - * With get_user_pages_fast(), we walk down the pagetables without taking - * any locks. For this we would like to load the pointers atomically, - * but that is not possible (without expensive cmpxchg8b) on PAE. What - * we do have is the guarantee that a PTE will only either go from not - * present to present, or present to not present or both -- it will not - * switch to a completely different present page without a TLB flush in - * between; something that we are blocking by holding interrupts off. - * - * Setting ptes from not present to present goes: - * - * ptep->pte_high = h; - * smp_wmb(); - * ptep->pte_low = l; - * - * And present to not present goes: - * - * ptep->pte_low = 0; - * smp_wmb(); - * ptep->pte_high = 0; - * - * We must ensure here that the load of pte_low sees 'l' iff pte_high - * sees 'h'. We load pte_high *after* loading pte_low, which ensures we - * don't see an older value of pte_high. *Then* we recheck pte_low, - * which ensures that we haven't picked up a changed pte high. We might - * have gotten rubbish values from pte_low and pte_high, but we are - * guaranteed that pte_low will not have the present bit set *unless* - * it is 'l'. Because get_user_pages_fast() only operates on present ptes - * we're safe. - */ -static inline pte_t gup_get_pte(pte_t *ptep) -{ - pte_t pte; - - do { - pte.pte_low = ptep->pte_low; - smp_rmb(); - pte.pte_high = ptep->pte_high; - smp_rmb(); - } while (unlikely(pte.pte_low != ptep->pte_low)); - - return pte; -} - #include #endif /* _ASM_X86_PGTABLE_3LEVEL_H */ diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 98f6e4f88b04..4a9c63d1c20a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -650,7 +650,7 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) /* * The idea using the light way get the spte on x86_32 guest is from - * gup_get_pte(arch/x86/mm/gup.c). + * gup_get_pte (mm/gup.c). * * An spte tlb flush may be pending, because kvm_set_pte_rmapp * coalesces them and we are running out of the MMU lock. Therefore diff --git a/mm/Kconfig b/mm/Kconfig index ef6efedc5921..53e2ca54b385 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -762,6 +762,9 @@ config GUP_BENCHMARK See tools/testing/selftests/vm/gup_benchmark.c +config GUP_GET_PTE_LOW_HIGH + bool + config ARCH_HAS_PTE_SPECIAL bool diff --git a/mm/gup.c b/mm/gup.c index 84891c06d485..2093283e4933 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1684,17 +1684,60 @@ struct page *get_dump_page(unsigned long addr) * This code is based heavily on the PowerPC implementation by Nick Piggin. */ #ifdef CONFIG_HAVE_GENERIC_GUP +#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH +/* + * WARNING: only to be used in the get_user_pages_fast() implementation. + * + * With get_user_pages_fast(), we walk down the pagetables without taking any + * locks. For this we would like to load the pointers atomically, but sometimes + * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE). What + * we do have is the guarantee that a PTE will only either go from not present + * to present, or present to not present or both -- it will not switch to a + * completely different present page without a TLB flush in between; something + * that we are blocking by holding interrupts off. + * + * Setting ptes from not present to present goes: + * + * ptep->pte_high = h; + * smp_wmb(); + * ptep->pte_low = l; + * + * And present to not present goes: + * + * ptep->pte_low = 0; + * smp_wmb(); + * ptep->pte_high = 0; + * + * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'. + * We load pte_high *after* loading pte_low, which ensures we don't see an older + * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't + * picked up a changed pte high. We might have gotten rubbish values from + * pte_low and pte_high, but we are guaranteed that pte_low will not have the + * present bit set *unless* it is 'l'. Because get_user_pages_fast() only + * operates on present ptes we're safe. + */ +static inline pte_t gup_get_pte(pte_t *ptep) +{ + pte_t pte; -#ifndef gup_get_pte + do { + pte.pte_low = ptep->pte_low; + smp_rmb(); + pte.pte_high = ptep->pte_high; + smp_rmb(); + } while (unlikely(pte.pte_low != ptep->pte_low)); + + return pte; +} +#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */ /* - * We assume that the PTE can be read atomically. If this is not the case for - * your architecture, please provide the helper. + * We require that the PTE can be read atomically. */ static inline pte_t gup_get_pte(pte_t *ptep) { return READ_ONCE(*ptep); } -#endif +#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) { -- cgit v1.2.3-59-g8ed1b From 67a929e097b774c69253c8b61ef9eb8a42b463a3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:57:14 -0700 Subject: mm: rename CONFIG_HAVE_GENERIC_GUP to CONFIG_HAVE_FAST_GUP We only support the generic GUP now, so rename the config option to be more clear, and always use the mm/Kconfig definition of the symbol and select it from the arch Kconfigs. Link: http://lkml.kernel.org/r/20190625143715.1689-11-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Khalid Aziz Reviewed-by: Jason Gunthorpe Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: David Miller Cc: James Hogan Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 5 +---- arch/arm64/Kconfig | 4 +--- arch/mips/Kconfig | 2 +- arch/powerpc/Kconfig | 2 +- arch/s390/Kconfig | 2 +- arch/sh/Kconfig | 2 +- arch/sparc/Kconfig | 2 +- arch/x86/Kconfig | 4 +--- mm/Kconfig | 2 +- mm/gup.c | 4 ++-- 10 files changed, 11 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 0c55aa199c67..2bf1ce39a96d 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -75,6 +75,7 @@ config ARM select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE select HAVE_EFFICIENT_UNALIGNED_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && MMU select HAVE_EXIT_THREAD + select HAVE_FAST_GUP if ARM_LPAE select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL select HAVE_FUNCTION_GRAPH_TRACER if !THUMB2_KERNEL && !CC_IS_CLANG select HAVE_FUNCTION_TRACER if !XIP_KERNEL @@ -1625,10 +1626,6 @@ config ARCH_SPARSEMEM_DEFAULT config HAVE_ARCH_PFN_VALID def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM -config HAVE_GENERIC_GUP - def_bool y - depends on ARM_LPAE - config HIGHMEM bool "High Memory Support" depends on MMU diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index c085aec9459b..a36ff61321ce 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -143,6 +143,7 @@ config ARM64 select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_EFFICIENT_UNALIGNED_ACCESS + select HAVE_FAST_GUP select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER @@ -267,9 +268,6 @@ config ZONE_DMA32 bool "Support DMA32 zone" if EXPERT default y -config HAVE_GENERIC_GUP - def_bool y - config ARCH_ENABLE_MEMORY_HOTPLUG def_bool y diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 51c05f669d57..7957d3457156 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -53,10 +53,10 @@ config MIPS select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_EXIT_THREAD + select HAVE_FAST_GUP select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER - select HAVE_GENERIC_GUP select HAVE_IDE select HAVE_IOREMAP_PROT select HAVE_IRQ_EXIT_ON_IRQ_STACK diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 3b795a0cab62..959866c156de 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -185,12 +185,12 @@ config PPC select HAVE_DYNAMIC_FTRACE_WITH_REGS if MPROFILE_KERNEL select HAVE_EBPF_JIT if PPC64 select HAVE_EFFICIENT_UNALIGNED_ACCESS if !(CPU_LITTLE_ENDIAN && POWER7_CPU) + select HAVE_FAST_GUP select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_ERROR_INJECTION select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER select HAVE_GCC_PLUGINS if GCC_VERSION >= 50200 # plugin support on gcc <= 5.1 is buggy on PPC - select HAVE_GENERIC_GUP select HAVE_HW_BREAKPOINT if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx) select HAVE_IDE select HAVE_IOREMAP_PROT diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index f089ae375f6b..5d8570ed6cab 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -139,6 +139,7 @@ config S390 select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS + select HAVE_FAST_GUP select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_FENTRY select HAVE_FTRACE_MCOUNT_RECORD @@ -146,7 +147,6 @@ config S390 select HAVE_FUNCTION_TRACER select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_GCC_PLUGINS - select HAVE_GENERIC_GUP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZ4 diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 02ae8132ebfe..31a7d12db705 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -15,7 +15,7 @@ config SUPERH select HAVE_ARCH_TRACEHOOK select HAVE_PERF_EVENTS select HAVE_DEBUG_BUGVERBOSE - select HAVE_GENERIC_GUP + select HAVE_FAST_GUP if MMU select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A) select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index c59464755764..e9f5d62e9817 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -28,7 +28,7 @@ config SPARC select RTC_DRV_M48T59 select RTC_SYSTOHC select HAVE_ARCH_JUMP_LABEL if SPARC64 - select HAVE_GENERIC_GUP if SPARC64 + select HAVE_FAST_GUP if SPARC64 select GENERIC_IRQ_SHOW select ARCH_WANT_IPC_PARSE_VERSION select GENERIC_PCI_IOMAP diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 71c1f7864434..9df2d1cb7a9e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -159,6 +159,7 @@ config X86 select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_EISA select HAVE_EXIT_THREAD + select HAVE_FAST_GUP select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_TRACER @@ -2907,9 +2908,6 @@ config HAVE_ATOMIC_IOMAP config X86_DEV_DMA_OPS bool -config HAVE_GENERIC_GUP - def_bool y - source "drivers/firmware/Kconfig" source "arch/x86/kvm/Kconfig" diff --git a/mm/Kconfig b/mm/Kconfig index 53e2ca54b385..b5a258d62465 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -132,7 +132,7 @@ config HAVE_MEMBLOCK_NODE_MAP config HAVE_MEMBLOCK_PHYS_MAP bool -config HAVE_GENERIC_GUP +config HAVE_FAST_GUP bool config ARCH_KEEP_MEMBLOCK diff --git a/mm/gup.c b/mm/gup.c index 2093283e4933..2f69b30a11d3 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1651,7 +1651,7 @@ struct page *get_dump_page(unsigned long addr) #endif /* CONFIG_ELF_CORE */ /* - * Generic Fast GUP + * Fast GUP * * get_user_pages_fast attempts to pin user pages by walking the page * tables directly and avoids taking locks. Thus the walker needs to be @@ -1683,7 +1683,7 @@ struct page *get_dump_page(unsigned long addr) * * This code is based heavily on the PowerPC implementation by Nick Piggin. */ -#ifdef CONFIG_HAVE_GENERIC_GUP +#ifdef CONFIG_HAVE_FAST_GUP #ifdef CONFIG_GUP_GET_PTE_LOW_HIGH /* * WARNING: only to be used in the get_user_pages_fast() implementation. -- cgit v1.2.3-59-g8ed1b From d3649f68b4336e7ef7aa264cf05ba1265feb0968 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:57:18 -0700 Subject: mm: reorder code blocks in gup.c This moves the actually exported functions towards the end of the file, and reorders some functions to be in more logical blocks as a preparation for moving various stubs inline into the main functionality using IS_ENABLED(). Link: http://lkml.kernel.org/r/20190625143715.1689-12-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: David Miller Cc: James Hogan Cc: Jason Gunthorpe Cc: Khalid Aziz Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 410 +++++++++++++++++++++++++++++++-------------------------------- 1 file changed, 205 insertions(+), 205 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index 2f69b30a11d3..d36b82c05e79 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1100,86 +1100,6 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, return pages_done; } -/* - * We can leverage the VM_FAULT_RETRY functionality in the page fault - * paths better by using either get_user_pages_locked() or - * get_user_pages_unlocked(). - * - * get_user_pages_locked() is suitable to replace the form: - * - * down_read(&mm->mmap_sem); - * do_something() - * get_user_pages(tsk, mm, ..., pages, NULL); - * up_read(&mm->mmap_sem); - * - * to: - * - * int locked = 1; - * down_read(&mm->mmap_sem); - * do_something() - * get_user_pages_locked(tsk, mm, ..., pages, &locked); - * if (locked) - * up_read(&mm->mmap_sem); - */ -long get_user_pages_locked(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - int *locked) -{ - /* - * FIXME: Current FOLL_LONGTERM behavior is incompatible with - * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on - * vmas. As there are no users of this flag in this call we simply - * disallow this option for now. - */ - if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) - return -EINVAL; - - return __get_user_pages_locked(current, current->mm, start, nr_pages, - pages, NULL, locked, - gup_flags | FOLL_TOUCH); -} -EXPORT_SYMBOL(get_user_pages_locked); - -/* - * get_user_pages_unlocked() is suitable to replace the form: - * - * down_read(&mm->mmap_sem); - * get_user_pages(tsk, mm, ..., pages, NULL); - * up_read(&mm->mmap_sem); - * - * with: - * - * get_user_pages_unlocked(tsk, mm, ..., pages); - * - * It is functionally equivalent to get_user_pages_fast so - * get_user_pages_fast should be used instead if specific gup_flags - * (e.g. FOLL_FORCE) are not required. - */ -long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, - struct page **pages, unsigned int gup_flags) -{ - struct mm_struct *mm = current->mm; - int locked = 1; - long ret; - - /* - * FIXME: Current FOLL_LONGTERM behavior is incompatible with - * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on - * vmas. As there are no users of this flag in this call we simply - * disallow this option for now. - */ - if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) - return -EINVAL; - - down_read(&mm->mmap_sem); - ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL, - &locked, gup_flags | FOLL_TOUCH); - if (locked) - up_read(&mm->mmap_sem); - return ret; -} -EXPORT_SYMBOL(get_user_pages_unlocked); - /* * get_user_pages_remote() - pin user pages in memory * @tsk: the task_struct to use for page fault accounting, or @@ -1256,6 +1176,153 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, } EXPORT_SYMBOL(get_user_pages_remote); +/** + * populate_vma_page_range() - populate a range of pages in the vma. + * @vma: target vma + * @start: start address + * @end: end address + * @nonblocking: + * + * This takes care of mlocking the pages too if VM_LOCKED is set. + * + * return 0 on success, negative error code on error. + * + * vma->vm_mm->mmap_sem must be held. + * + * If @nonblocking is NULL, it may be held for read or write and will + * be unperturbed. + * + * If @nonblocking is non-NULL, it must held for read only and may be + * released. If it's released, *@nonblocking will be set to 0. + */ +long populate_vma_page_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, int *nonblocking) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long nr_pages = (end - start) / PAGE_SIZE; + int gup_flags; + + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(end & ~PAGE_MASK); + VM_BUG_ON_VMA(start < vma->vm_start, vma); + VM_BUG_ON_VMA(end > vma->vm_end, vma); + VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); + + gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; + if (vma->vm_flags & VM_LOCKONFAULT) + gup_flags &= ~FOLL_POPULATE; + /* + * We want to touch writable mappings with a write fault in order + * to break COW, except for shared mappings because these don't COW + * and we would not want to dirty them for nothing. + */ + if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) + gup_flags |= FOLL_WRITE; + + /* + * We want mlock to succeed for regions that have any permissions + * other than PROT_NONE. + */ + if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) + gup_flags |= FOLL_FORCE; + + /* + * We made sure addr is within a VMA, so the following will + * not result in a stack expansion that recurses back here. + */ + return __get_user_pages(current, mm, start, nr_pages, gup_flags, + NULL, NULL, nonblocking); +} + +/* + * __mm_populate - populate and/or mlock pages within a range of address space. + * + * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap + * flags. VMAs must be already marked with the desired vm_flags, and + * mmap_sem must not be held. + */ +int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) +{ + struct mm_struct *mm = current->mm; + unsigned long end, nstart, nend; + struct vm_area_struct *vma = NULL; + int locked = 0; + long ret = 0; + + end = start + len; + + for (nstart = start; nstart < end; nstart = nend) { + /* + * We want to fault in pages for [nstart; end) address range. + * Find first corresponding VMA. + */ + if (!locked) { + locked = 1; + down_read(&mm->mmap_sem); + vma = find_vma(mm, nstart); + } else if (nstart >= vma->vm_end) + vma = vma->vm_next; + if (!vma || vma->vm_start >= end) + break; + /* + * Set [nstart; nend) to intersection of desired address + * range with the first VMA. Also, skip undesirable VMA types. + */ + nend = min(end, vma->vm_end); + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + continue; + if (nstart < vma->vm_start) + nstart = vma->vm_start; + /* + * Now fault in a range of pages. populate_vma_page_range() + * double checks the vma flags, so that it won't mlock pages + * if the vma was already munlocked. + */ + ret = populate_vma_page_range(vma, nstart, nend, &locked); + if (ret < 0) { + if (ignore_errors) { + ret = 0; + continue; /* continue at next VMA */ + } + break; + } + nend = nstart + ret * PAGE_SIZE; + ret = 0; + } + if (locked) + up_read(&mm->mmap_sem); + return ret; /* 0 or negative error code */ +} + +/** + * get_dump_page() - pin user page in memory while writing it to core dump + * @addr: user address + * + * Returns struct page pointer of user page pinned for dump, + * to be freed afterwards by put_page(). + * + * Returns NULL on any kind of failure - a hole must then be inserted into + * the corefile, to preserve alignment with its headers; and also returns + * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - + * allowing a hole to be left in the corefile to save diskspace. + * + * Called without mmap_sem, but after all other threads have been killed. + */ +#ifdef CONFIG_ELF_CORE +struct page *get_dump_page(unsigned long addr) +{ + struct vm_area_struct *vma; + struct page *page; + + if (__get_user_pages(current, current->mm, addr, 1, + FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, + NULL) < 1) + return NULL; + flush_cache_page(vma, addr, page_to_pfn(page)); + return page; +} +#endif /* CONFIG_ELF_CORE */ + #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA) static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) { @@ -1503,152 +1570,85 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, } EXPORT_SYMBOL(get_user_pages); -/** - * populate_vma_page_range() - populate a range of pages in the vma. - * @vma: target vma - * @start: start address - * @end: end address - * @nonblocking: - * - * This takes care of mlocking the pages too if VM_LOCKED is set. +/* + * We can leverage the VM_FAULT_RETRY functionality in the page fault + * paths better by using either get_user_pages_locked() or + * get_user_pages_unlocked(). * - * return 0 on success, negative error code on error. + * get_user_pages_locked() is suitable to replace the form: * - * vma->vm_mm->mmap_sem must be held. + * down_read(&mm->mmap_sem); + * do_something() + * get_user_pages(tsk, mm, ..., pages, NULL); + * up_read(&mm->mmap_sem); * - * If @nonblocking is NULL, it may be held for read or write and will - * be unperturbed. + * to: * - * If @nonblocking is non-NULL, it must held for read only and may be - * released. If it's released, *@nonblocking will be set to 0. + * int locked = 1; + * down_read(&mm->mmap_sem); + * do_something() + * get_user_pages_locked(tsk, mm, ..., pages, &locked); + * if (locked) + * up_read(&mm->mmap_sem); */ -long populate_vma_page_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, int *nonblocking) +long get_user_pages_locked(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + int *locked) { - struct mm_struct *mm = vma->vm_mm; - unsigned long nr_pages = (end - start) / PAGE_SIZE; - int gup_flags; - - VM_BUG_ON(start & ~PAGE_MASK); - VM_BUG_ON(end & ~PAGE_MASK); - VM_BUG_ON_VMA(start < vma->vm_start, vma); - VM_BUG_ON_VMA(end > vma->vm_end, vma); - VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); - - gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; - if (vma->vm_flags & VM_LOCKONFAULT) - gup_flags &= ~FOLL_POPULATE; - /* - * We want to touch writable mappings with a write fault in order - * to break COW, except for shared mappings because these don't COW - * and we would not want to dirty them for nothing. - */ - if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) - gup_flags |= FOLL_WRITE; - /* - * We want mlock to succeed for regions that have any permissions - * other than PROT_NONE. + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. */ - if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) - gup_flags |= FOLL_FORCE; + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; - /* - * We made sure addr is within a VMA, so the following will - * not result in a stack expansion that recurses back here. - */ - return __get_user_pages(current, mm, start, nr_pages, gup_flags, - NULL, NULL, nonblocking); + return __get_user_pages_locked(current, current->mm, start, nr_pages, + pages, NULL, locked, + gup_flags | FOLL_TOUCH); } +EXPORT_SYMBOL(get_user_pages_locked); /* - * __mm_populate - populate and/or mlock pages within a range of address space. + * get_user_pages_unlocked() is suitable to replace the form: * - * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap - * flags. VMAs must be already marked with the desired vm_flags, and - * mmap_sem must not be held. + * down_read(&mm->mmap_sem); + * get_user_pages(tsk, mm, ..., pages, NULL); + * up_read(&mm->mmap_sem); + * + * with: + * + * get_user_pages_unlocked(tsk, mm, ..., pages); + * + * It is functionally equivalent to get_user_pages_fast so + * get_user_pages_fast should be used instead if specific gup_flags + * (e.g. FOLL_FORCE) are not required. */ -int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) +long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, + struct page **pages, unsigned int gup_flags) { struct mm_struct *mm = current->mm; - unsigned long end, nstart, nend; - struct vm_area_struct *vma = NULL; - int locked = 0; - long ret = 0; + int locked = 1; + long ret; - end = start + len; + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; - for (nstart = start; nstart < end; nstart = nend) { - /* - * We want to fault in pages for [nstart; end) address range. - * Find first corresponding VMA. - */ - if (!locked) { - locked = 1; - down_read(&mm->mmap_sem); - vma = find_vma(mm, nstart); - } else if (nstart >= vma->vm_end) - vma = vma->vm_next; - if (!vma || vma->vm_start >= end) - break; - /* - * Set [nstart; nend) to intersection of desired address - * range with the first VMA. Also, skip undesirable VMA types. - */ - nend = min(end, vma->vm_end); - if (vma->vm_flags & (VM_IO | VM_PFNMAP)) - continue; - if (nstart < vma->vm_start) - nstart = vma->vm_start; - /* - * Now fault in a range of pages. populate_vma_page_range() - * double checks the vma flags, so that it won't mlock pages - * if the vma was already munlocked. - */ - ret = populate_vma_page_range(vma, nstart, nend, &locked); - if (ret < 0) { - if (ignore_errors) { - ret = 0; - continue; /* continue at next VMA */ - } - break; - } - nend = nstart + ret * PAGE_SIZE; - ret = 0; - } + down_read(&mm->mmap_sem); + ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL, + &locked, gup_flags | FOLL_TOUCH); if (locked) up_read(&mm->mmap_sem); - return ret; /* 0 or negative error code */ -} - -/** - * get_dump_page() - pin user page in memory while writing it to core dump - * @addr: user address - * - * Returns struct page pointer of user page pinned for dump, - * to be freed afterwards by put_page(). - * - * Returns NULL on any kind of failure - a hole must then be inserted into - * the corefile, to preserve alignment with its headers; and also returns - * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - - * allowing a hole to be left in the corefile to save diskspace. - * - * Called without mmap_sem, but after all other threads have been killed. - */ -#ifdef CONFIG_ELF_CORE -struct page *get_dump_page(unsigned long addr) -{ - struct vm_area_struct *vma; - struct page *page; - - if (__get_user_pages(current, current->mm, addr, 1, - FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, - NULL) < 1) - return NULL; - flush_cache_page(vma, addr, page_to_pfn(page)); - return page; + return ret; } -#endif /* CONFIG_ELF_CORE */ +EXPORT_SYMBOL(get_user_pages_unlocked); /* * Fast GUP -- cgit v1.2.3-59-g8ed1b From 050a9adc64383aed3429a31432b4f5a7b0cdc8ac Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:57:21 -0700 Subject: mm: consolidate the get_user_pages* implementations Always build mm/gup.c so that we don't have to provide separate nommu stubs. Also merge the get_user_pages_fast and __get_user_pages_fast stubs when HAVE_FAST_GUP into the main implementations, which will never call the fast path if HAVE_FAST_GUP is not set. This also ensures the new put_user_pages* helpers are available for nommu, as those are currently missing, which would create a problem as soon as we actually grew users for it. Link: http://lkml.kernel.org/r/20190625143715.1689-13-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: David Miller Cc: James Hogan Cc: Jason Gunthorpe Cc: Khalid Aziz Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 1 + mm/Makefile | 4 +-- mm/gup.c | 67 ++++++++++++++++++++++++++++++++++++++++++---- mm/nommu.c | 88 ------------------------------------------------------------- mm/util.c | 47 --------------------------------- 5 files changed, 65 insertions(+), 142 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index b5a258d62465..48840b28482b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -133,6 +133,7 @@ config HAVE_MEMBLOCK_PHYS_MAP bool config HAVE_FAST_GUP + depends on MMU bool config ARCH_KEEP_MEMBLOCK diff --git a/mm/Makefile b/mm/Makefile index ac5e5ba78874..dc0746ca1109 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -22,7 +22,7 @@ KCOV_INSTRUMENT_mmzone.o := n KCOV_INSTRUMENT_vmstat.o := n mmu-y := nommu.o -mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ +mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ msync.o page_vma_mapped.o pagewalk.o \ pgtable-generic.o rmap.o vmalloc.o @@ -39,7 +39,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ compaction.o vmacache.o \ interval_tree.o list_lru.o workingset.o \ - debug.o $(mmu-y) + debug.o gup.o $(mmu-y) # Give 'page_alloc' its own module-parameter namespace page-alloc-y := page_alloc.o diff --git a/mm/gup.c b/mm/gup.c index d36b82c05e79..cc5ddd8869b7 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -134,6 +134,7 @@ void put_user_pages(struct page **pages, unsigned long npages) } EXPORT_SYMBOL(put_user_pages); +#ifdef CONFIG_MMU static struct page *no_page_table(struct vm_area_struct *vma, unsigned int flags) { @@ -1322,6 +1323,51 @@ struct page *get_dump_page(unsigned long addr) return page; } #endif /* CONFIG_ELF_CORE */ +#else /* CONFIG_MMU */ +static long __get_user_pages_locked(struct task_struct *tsk, + struct mm_struct *mm, unsigned long start, + unsigned long nr_pages, struct page **pages, + struct vm_area_struct **vmas, int *locked, + unsigned int foll_flags) +{ + struct vm_area_struct *vma; + unsigned long vm_flags; + int i; + + /* calculate required read or write permissions. + * If FOLL_FORCE is set, we only require the "MAY" flags. + */ + vm_flags = (foll_flags & FOLL_WRITE) ? + (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + vm_flags &= (foll_flags & FOLL_FORCE) ? + (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + + for (i = 0; i < nr_pages; i++) { + vma = find_vma(mm, start); + if (!vma) + goto finish_or_fault; + + /* protect what we can, including chardevs */ + if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) || + !(vm_flags & vma->vm_flags)) + goto finish_or_fault; + + if (pages) { + pages[i] = virt_to_page(start); + if (pages[i]) + get_page(pages[i]); + } + if (vmas) + vmas[i] = vma; + start = (start + PAGE_SIZE) & PAGE_MASK; + } + + return i; + +finish_or_fault: + return i ? : -EFAULT; +} +#endif /* !CONFIG_MMU */ #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA) static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) @@ -1484,7 +1530,7 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk, { return nr_pages; } -#endif +#endif /* CONFIG_CMA */ /* * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which @@ -2160,6 +2206,12 @@ static void gup_pgd_range(unsigned long addr, unsigned long end, return; } while (pgdp++, addr = next, addr != end); } +#else +static inline void gup_pgd_range(unsigned long addr, unsigned long end, + unsigned int flags, struct page **pages, int *nr) +{ +} +#endif /* CONFIG_HAVE_FAST_GUP */ #ifndef gup_fast_permitted /* @@ -2177,6 +2229,9 @@ static bool gup_fast_permitted(unsigned long start, unsigned long end) * the regular GUP. * Note a difference with get_user_pages_fast: this always returns the * number of pages pinned, 0 if no pages were pinned. + * + * If the architecture does not support this function, simply return with no + * pages pinned. */ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) @@ -2206,7 +2261,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * block IPIs that come from THPs splitting. */ - if (gup_fast_permitted(start, end)) { + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && + gup_fast_permitted(start, end)) { local_irq_save(flags); gup_pgd_range(start, end, write ? FOLL_WRITE : 0, pages, &nr); local_irq_restore(flags); @@ -2214,6 +2270,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, return nr; } +EXPORT_SYMBOL_GPL(__get_user_pages_fast); static int __gup_longterm_unlocked(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) @@ -2270,7 +2327,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, if (unlikely(!access_ok((void __user *)start, len))) return -EFAULT; - if (gup_fast_permitted(start, end)) { + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && + gup_fast_permitted(start, end)) { local_irq_disable(); gup_pgd_range(addr, end, gup_flags, pages, &nr); local_irq_enable(); @@ -2296,5 +2354,4 @@ int get_user_pages_fast(unsigned long start, int nr_pages, return ret; } - -#endif /* CONFIG_HAVE_GENERIC_GUP */ +EXPORT_SYMBOL_GPL(get_user_pages_fast); diff --git a/mm/nommu.c b/mm/nommu.c index d8c02fbe03b5..07165ad2e548 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -111,94 +111,6 @@ unsigned int kobjsize(const void *objp) return PAGE_SIZE << compound_order(page); } -static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - unsigned int foll_flags, struct page **pages, - struct vm_area_struct **vmas, int *nonblocking) -{ - struct vm_area_struct *vma; - unsigned long vm_flags; - int i; - - /* calculate required read or write permissions. - * If FOLL_FORCE is set, we only require the "MAY" flags. - */ - vm_flags = (foll_flags & FOLL_WRITE) ? - (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); - vm_flags &= (foll_flags & FOLL_FORCE) ? - (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); - - for (i = 0; i < nr_pages; i++) { - vma = find_vma(mm, start); - if (!vma) - goto finish_or_fault; - - /* protect what we can, including chardevs */ - if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) || - !(vm_flags & vma->vm_flags)) - goto finish_or_fault; - - if (pages) { - pages[i] = virt_to_page(start); - if (pages[i]) - get_page(pages[i]); - } - if (vmas) - vmas[i] = vma; - start = (start + PAGE_SIZE) & PAGE_MASK; - } - - return i; - -finish_or_fault: - return i ? : -EFAULT; -} - -/* - * get a list of pages in an address range belonging to the specified process - * and indicate the VMA that covers each page - * - this is potentially dodgy as we may end incrementing the page count of a - * slab page or a secondary page from a compound page - * - don't permit access to VMAs that don't support it, such as I/O mappings - */ -long get_user_pages(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) -{ - return __get_user_pages(current, current->mm, start, nr_pages, - gup_flags, pages, vmas, NULL); -} -EXPORT_SYMBOL(get_user_pages); - -long get_user_pages_locked(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - int *locked) -{ - return get_user_pages(start, nr_pages, gup_flags, pages, NULL); -} -EXPORT_SYMBOL(get_user_pages_locked); - -static long __get_user_pages_unlocked(struct task_struct *tsk, - struct mm_struct *mm, unsigned long start, - unsigned long nr_pages, struct page **pages, - unsigned int gup_flags) -{ - long ret; - down_read(&mm->mmap_sem); - ret = __get_user_pages(tsk, mm, start, nr_pages, gup_flags, pages, - NULL, NULL); - up_read(&mm->mmap_sem); - return ret; -} - -long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, - struct page **pages, unsigned int gup_flags) -{ - return __get_user_pages_unlocked(current, current->mm, start, nr_pages, - pages, gup_flags); -} -EXPORT_SYMBOL(get_user_pages_unlocked); - /** * follow_pfn - look up PFN at a user virtual address * @vma: memory mapping diff --git a/mm/util.c b/mm/util.c index 9834c4ab7d8e..68575a315dc5 100644 --- a/mm/util.c +++ b/mm/util.c @@ -300,53 +300,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) } #endif -/* - * Like get_user_pages_fast() except its IRQ-safe in that it won't fall - * back to the regular GUP. - * Note a difference with get_user_pages_fast: this always returns the - * number of pages pinned, 0 if no pages were pinned. - * If the architecture does not support this function, simply return with no - * pages pinned. - */ -int __weak __get_user_pages_fast(unsigned long start, - int nr_pages, int write, struct page **pages) -{ - return 0; -} -EXPORT_SYMBOL_GPL(__get_user_pages_fast); - -/** - * get_user_pages_fast() - pin user pages in memory - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @gup_flags: flags modifying pin behaviour - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. - * - * get_user_pages_fast provides equivalent functionality to get_user_pages, - * operating on current and current->mm, with force=0 and vma=NULL. However - * unlike get_user_pages, it must be called without mmap_sem held. - * - * get_user_pages_fast may take mmap_sem and page table locks, so no - * assumptions can be made about lack of locking. get_user_pages_fast is to be - * implemented in a way that is advantageous (vs get_user_pages()) when the - * user memory area is already faulted in and present in ptes. However if the - * pages have to be faulted in, it may turn out to be slightly slower so - * callers need to carefully consider what to use. On many architectures, - * get_user_pages_fast simply falls back to get_user_pages. - * - * Return: number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. - */ -int __weak get_user_pages_fast(unsigned long start, - int nr_pages, unsigned int gup_flags, - struct page **pages) -{ - return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); -} -EXPORT_SYMBOL_GPL(get_user_pages_fast); - unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long pgoff) -- cgit v1.2.3-59-g8ed1b From 817be129e6f254e5bd8c17b1da834c8f612dca28 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:57:25 -0700 Subject: mm: validate get_user_pages_fast flags We can only deal with FOLL_WRITE and/or FOLL_LONGTERM in get_user_pages_fast, so reject all other flags. Link: http://lkml.kernel.org/r/20190625143715.1689-14-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: David Miller Cc: James Hogan Cc: Jason Gunthorpe Cc: Khalid Aziz Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index cc5ddd8869b7..9d68cef2fa90 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2317,6 +2317,9 @@ int get_user_pages_fast(unsigned long start, int nr_pages, unsigned long addr, len, end; int nr = 0, ret = 0; + if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM))) + return -EINVAL; + start = untagged_addr(start) & PAGE_MASK; addr = start; len = (unsigned long) nr_pages << PAGE_SHIFT; -- cgit v1.2.3-59-g8ed1b From cbd34da7dc9afd521e0bea5e7d12701f4a9da7c7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:57:28 -0700 Subject: mm: move the powerpc hugepd code to mm/gup.c While only powerpc supports the hugepd case, the code is pretty generic and I'd like to keep all GUP internals in one place. Link: http://lkml.kernel.org/r/20190625143715.1689-15-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: David Miller Cc: James Hogan Cc: Jason Gunthorpe Cc: Khalid Aziz Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/Kconfig | 1 + arch/powerpc/mm/hugetlbpage.c | 72 ------------------------------------- include/linux/hugetlb.h | 18 ---------- mm/Kconfig | 10 ++++++ mm/gup.c | 82 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 93 insertions(+), 90 deletions(-) (limited to 'mm') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 959866c156de..24a41f919309 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -125,6 +125,7 @@ config PPC select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_KCOV + select ARCH_HAS_HUGEPD if HUGETLB_PAGE select ARCH_HAS_MMIOWB if PPC64 select ARCH_HAS_PHYS_TO_DMA select ARCH_HAS_PMEM_API if PPC64 diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index b5d92dc32844..51716c11d0fb 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -511,13 +511,6 @@ retry: return page; } -static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, - unsigned long sz) -{ - unsigned long __boundary = (addr + sz) & ~(sz-1); - return (__boundary - 1 < end - 1) ? __boundary : end; -} - #ifdef CONFIG_PPC_MM_SLICES unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, @@ -665,68 +658,3 @@ void flush_dcache_icache_hugepage(struct page *page) } } } - -static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - unsigned long pte_end; - struct page *head, *page; - pte_t pte; - int refs; - - pte_end = (addr + sz) & ~(sz-1); - if (pte_end < end) - end = pte_end; - - pte = READ_ONCE(*ptep); - - if (!pte_access_permitted(pte, write)) - return 0; - - /* hugepages are never "special" */ - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - - refs = 0; - head = pte_page(pte); - - page = head + ((addr & (sz-1)) >> PAGE_SHIFT); - do { - VM_BUG_ON(compound_head(page) != head); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - - if (!page_cache_add_speculative(head, refs)) { - *nr -= refs; - return 0; - } - - if (unlikely(pte_val(pte) != pte_val(*ptep))) { - /* Could be optimized better */ - *nr -= refs; - while (refs--) - put_page(head); - return 0; - } - - return 1; -} - -int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned int pdshift, - unsigned long end, int write, struct page **pages, int *nr) -{ - pte_t *ptep; - unsigned long sz = 1UL << hugepd_shift(hugepd); - unsigned long next; - - ptep = hugepte_offset(hugepd, addr, pdshift); - do { - next = hugepte_addr_end(addr, end, sz); - if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr)) - return 0; - } while (ptep++, addr = next, addr != end); - - return 1; -} diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f895a79c6f5c..edfca4278319 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -16,29 +16,11 @@ struct user_struct; struct mmu_gather; #ifndef is_hugepd -/* - * Some architectures requires a hugepage directory format that is - * required to support multiple hugepage sizes. For example - * a4fe3ce76 "powerpc/mm: Allow more flexible layouts for hugepage pagetables" - * introduced the same on powerpc. This allows for a more flexible hugepage - * pagetable layout. - */ typedef struct { unsigned long pd; } hugepd_t; #define is_hugepd(hugepd) (0) #define __hugepd(x) ((hugepd_t) { (x) }) -static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr, - unsigned pdshift, unsigned long end, - int write, struct page **pages, int *nr) -{ - return 0; -} -#else -extern int gup_huge_pd(hugepd_t hugepd, unsigned long addr, - unsigned pdshift, unsigned long end, - int write, struct page **pages, int *nr); #endif - #ifdef CONFIG_HUGETLB_PAGE #include diff --git a/mm/Kconfig b/mm/Kconfig index 48840b28482b..0b4352557dd5 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -769,4 +769,14 @@ config GUP_GET_PTE_LOW_HIGH config ARCH_HAS_PTE_SPECIAL bool +# +# Some architectures require a special hugepage directory format that is +# required to support multiple hugepage sizes. For example a4fe3ce76 +# "powerpc/mm: Allow more flexible layouts for hugepage pagetables" +# introduced it on powerpc. This allows for a more flexible hugepage +# pagetable layouts. +# +config ARCH_HAS_HUGEPD + bool + endmenu diff --git a/mm/gup.c b/mm/gup.c index 9d68cef2fa90..2f8bf7a71c74 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1966,6 +1966,88 @@ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr, } #endif +#ifdef CONFIG_ARCH_HAS_HUGEPD +static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, + unsigned long sz) +{ + unsigned long __boundary = (addr + sz) & ~(sz-1); + return (__boundary - 1 < end - 1) ? __boundary : end; +} + +static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long pte_end; + struct page *head, *page; + pte_t pte; + int refs; + + pte_end = (addr + sz) & ~(sz-1); + if (pte_end < end) + end = pte_end; + + pte = READ_ONCE(*ptep); + + if (!pte_access_permitted(pte, write)) + return 0; + + /* hugepages are never "special" */ + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + + refs = 0; + head = pte_page(pte); + + page = head + ((addr & (sz-1)) >> PAGE_SHIFT); + do { + VM_BUG_ON(compound_head(page) != head); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + + if (!page_cache_add_speculative(head, refs)) { + *nr -= refs; + return 0; + } + + if (unlikely(pte_val(pte) != pte_val(*ptep))) { + /* Could be optimized better */ + *nr -= refs; + while (refs--) + put_page(head); + return 0; + } + + return 1; +} + +static int gup_huge_pd(hugepd_t hugepd, unsigned long addr, + unsigned int pdshift, unsigned long end, int write, + struct page **pages, int *nr) +{ + pte_t *ptep; + unsigned long sz = 1UL << hugepd_shift(hugepd); + unsigned long next; + + ptep = hugepte_offset(hugepd, addr, pdshift); + do { + next = hugepte_addr_end(addr, end, sz); + if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr)) + return 0; + } while (ptep++, addr = next, addr != end); + + return 1; +} +#else +static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr, + unsigned pdshift, unsigned long end, int write, + struct page **pages, int *nr) +{ + return 0; +} +#endif /* CONFIG_ARCH_HAS_HUGEPD */ + static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { -- cgit v1.2.3-59-g8ed1b From 01a369160bbea43727aa2b99877f86ebddba9acc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:57:32 -0700 Subject: mm: switch gup_hugepte to use try_get_compound_head This applies the overflow fixes from 8fde12ca79aff ("mm: prevent get_user_pages() from overflowing page refcount") to the powerpc hugepd code and brings it back in sync with the other GUP cases. Link: http://lkml.kernel.org/r/20190625143715.1689-16-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: David Miller Cc: James Hogan Cc: Jason Gunthorpe Cc: Khalid Aziz Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index 2f8bf7a71c74..7763abd16405 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2006,7 +2006,8 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, refs++; } while (addr += PAGE_SIZE, addr != end); - if (!page_cache_add_speculative(head, refs)) { + head = try_get_compound_head(head, refs); + if (!head) { *nr -= refs; return 0; } -- cgit v1.2.3-59-g8ed1b From 520b4a4496f12b117b94f3ac7c493651881c5fe3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:57:36 -0700 Subject: mm: mark the page referenced in gup_hugepte All other get_user_page_fast cases mark the page referenced, so do this here as well. Link: http://lkml.kernel.org/r/20190625143715.1689-17-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: David Miller Cc: James Hogan Cc: Jason Gunthorpe Cc: Khalid Aziz Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index 7763abd16405..83d480e9b05f 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2020,6 +2020,7 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, return 0; } + SetPageReferenced(head); return 1; } -- cgit v1.2.3-59-g8ed1b From aa712399c1e8245c375a5c44760de684ec2ebefb Mon Sep 17 00:00:00 2001 From: Pingfan Liu Date: Thu, 11 Jul 2019 20:57:39 -0700 Subject: mm/gup: speed up check_and_migrate_cma_pages() on huge page Both hugetlb and thp locate on the same migration type of pageblock, since they are allocated from a free_list[]. Based on this fact, it is enough to check on a single subpage to decide the migration type of the whole huge page. By this way, it saves (2M/4K - 1) times loop for pmd_huge on x86, similar on other archs. Furthermore, when executing isolate_huge_page(), it avoid taking global hugetlb_lock many times, and meanless remove/add to the local link list cma_page_list. [akpm@linux-foundation.org: make `i' and `step' unsigned] Link: http://lkml.kernel.org/r/1561612545-28997-1-git-send-email-kernelfans@gmail.com Signed-off-by: Pingfan Liu Reviewed-by: Andrew Morton Reviewed-by: Ira Weiny Cc: Mike Rapoport Cc: "Kirill A. Shutemov" Cc: Thomas Gleixner Cc: John Hubbard Cc: "Aneesh Kumar K.V" Cc: Christoph Hellwig Cc: Keith Busch Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index 83d480e9b05f..f411bab037f5 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1449,25 +1449,31 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk, struct vm_area_struct **vmas, unsigned int gup_flags) { - long i; + unsigned long i; + unsigned long step; bool drain_allow = true; bool migrate_allow = true; LIST_HEAD(cma_page_list); check_again: - for (i = 0; i < nr_pages; i++) { + for (i = 0; i < nr_pages;) { + + struct page *head = compound_head(pages[i]); + + /* + * gup may start from a tail page. Advance step by the left + * part. + */ + step = (1 << compound_order(head)) - (pages[i] - head); /* * If we get a page from the CMA zone, since we are going to * be pinning these entries, we might as well move them out * of the CMA zone if possible. */ - if (is_migrate_cma_page(pages[i])) { - - struct page *head = compound_head(pages[i]); - - if (PageHuge(head)) { + if (is_migrate_cma_page(head)) { + if (PageHuge(head)) isolate_huge_page(head, &cma_page_list); - } else { + else { if (!PageLRU(head) && drain_allow) { lru_add_drain_all(); drain_allow = false; @@ -1482,6 +1488,8 @@ check_again: } } } + + i += step; } if (!list_empty(&cma_page_list)) { -- cgit v1.2.3-59-g8ed1b From b5d1c39f34d1c9bca0c4b9ae2e339fbbe264a9c7 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 11 Jul 2019 20:57:43 -0700 Subject: mm/gup.c: remove some BUG_ONs from get_gate_page() If we end up without a PGD or PUD entry backing the gate area, don't BUG -- just fail gracefully. It's not entirely implausible that this could happen some day on x86. It doesn't right now even with an execute-only emulated vsyscall page because the fixmap shares the PUD, but the core mm code shouldn't rely on that particular detail to avoid OOPSing. Link: http://lkml.kernel.org/r/a1d9f4efb75b9d464e59fd6af00104b21c58f6f7.1561610798.git.luto@kernel.org Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Reviewed-by: Andrew Morton Cc: Florian Weimer Cc: Jann Horn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index f411bab037f5..bb4ad57d20e3 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -586,11 +586,14 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, pgd = pgd_offset_k(address); else pgd = pgd_offset_gate(mm, address); - BUG_ON(pgd_none(*pgd)); + if (pgd_none(*pgd)) + return -EFAULT; p4d = p4d_offset(pgd, address); - BUG_ON(p4d_none(*p4d)); + if (p4d_none(*p4d)) + return -EFAULT; pud = pud_offset(p4d, address); - BUG_ON(pud_none(*pud)); + if (pud_none(*pud)) + return -EFAULT; pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return -EFAULT; -- cgit v1.2.3-59-g8ed1b From 790c73690c2bbecb3f6f8becbdb11ddc9bcff8cc Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 11 Jul 2019 20:57:46 -0700 Subject: mm/gup.c: mark undo_dev_pagemap as __maybe_unused Several mips builds generate the following build warning. mm/gup.c:1788:13: warning: 'undo_dev_pagemap' defined but not used The function is declared unconditionally but only called from behind various ifdefs. Mark it __maybe_unused. Link: http://lkml.kernel.org/r/1562072523-22311-1-git-send-email-linux@roeck-us.net Signed-off-by: Guenter Roeck Reviewed-by: Andrew Morton Cc: Stephen Rothwell Cc: Robin Murphy Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index bb4ad57d20e3..43b7d875de37 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1796,7 +1796,8 @@ static inline pte_t gup_get_pte(pte_t *ptep) } #endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */ -static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) +static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start, + struct page **pages) { while ((*nr) - nr_start) { struct page *page = pages[--(*nr)]; -- cgit v1.2.3-59-g8ed1b From 8b1e0f81fb6fcf3109465a168b2e2da3f711fa86 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 11 Jul 2019 20:58:43 -0700 Subject: mm/pgtable: drop pgtable_t variable from pte_fn_t functions Drop the pgtable_t variable from all implementation for pte_fn_t as none of them use it. apply_to_pte_range() should stop computing it as well. Should help us save some cycles. Link: http://lkml.kernel.org/r/1556803126-26596-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Matthew Wilcox Cc: Ard Biesheuvel Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Michal Hocko Cc: Logan Gunthorpe Cc: "Kirill A. Shutemov" Cc: Dan Williams Cc: Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/efi.c | 3 +-- arch/arm/mm/dma-mapping.c | 3 +-- arch/arm/mm/pageattr.c | 3 +-- arch/arm64/kernel/efi.c | 3 +-- arch/arm64/mm/pageattr.c | 3 +-- arch/x86/xen/mmu_pv.c | 3 +-- drivers/gpu/drm/i915/i915_mm.c | 3 +-- drivers/xen/gntdev.c | 6 ++---- drivers/xen/privcmd.c | 6 ++---- drivers/xen/xlate_mmu.c | 3 +-- include/linux/mm.h | 3 +-- mm/memory.c | 5 +---- mm/vmalloc.c | 2 +- 13 files changed, 15 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/arch/arm/kernel/efi.c b/arch/arm/kernel/efi.c index ed005870671a..e57dbcc89123 100644 --- a/arch/arm/kernel/efi.c +++ b/arch/arm/kernel/efi.c @@ -8,8 +8,7 @@ #include #include -static int __init set_permissions(pte_t *ptep, pgtable_t token, - unsigned long addr, void *data) +static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) { efi_memory_desc_t *md = data; pte_t pte = *ptep; diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 439bb6a59a04..1fb5c0ca1ed8 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -493,8 +493,7 @@ void __init dma_contiguous_remap(void) } } -static int __dma_update_pte(pte_t *pte, pgtable_t token, unsigned long addr, - void *data) +static int __dma_update_pte(pte_t *pte, unsigned long addr, void *data) { struct page *page = virt_to_page(addr); pgprot_t prot = *(pgprot_t *)data; diff --git a/arch/arm/mm/pageattr.c b/arch/arm/mm/pageattr.c index 0f5faf30d9bf..d546efad7e97 100644 --- a/arch/arm/mm/pageattr.c +++ b/arch/arm/mm/pageattr.c @@ -14,8 +14,7 @@ struct page_change_data { pgprot_t clear_mask; }; -static int change_page_range(pte_t *ptep, pgtable_t token, unsigned long addr, - void *data) +static int change_page_range(pte_t *ptep, unsigned long addr, void *data) { struct page_change_data *cdata = data; pte_t pte = *ptep; diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 3c33d0dd8e0e..d0cf596db82c 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -82,8 +82,7 @@ int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) return 0; } -static int __init set_permissions(pte_t *ptep, pgtable_t token, - unsigned long addr, void *data) +static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) { efi_memory_desc_t *md = data; pte_t pte = READ_ONCE(*ptep); diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index fcdcf6cd7677..03c53f16ee77 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -19,8 +19,7 @@ struct page_change_data { bool rodata_full __ro_after_init = IS_ENABLED(CONFIG_RODATA_FULL_DEFAULT_ENABLED); -static int change_page_range(pte_t *ptep, pgtable_t token, unsigned long addr, - void *data) +static int change_page_range(pte_t *ptep, unsigned long addr, void *data) { struct page_change_data *cdata = data; pte_t pte = READ_ONCE(*ptep); diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index beb44e22afdf..f6e5eeecfc69 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2700,8 +2700,7 @@ struct remap_data { struct mmu_update *mmu_update; }; -static int remap_area_pfn_pte_fn(pte_t *ptep, pgtable_t token, - unsigned long addr, void *data) +static int remap_area_pfn_pte_fn(pte_t *ptep, unsigned long addr, void *data) { struct remap_data *rmd = data; pte_t pte = pte_mkspecial(mfn_pte(*rmd->pfn, rmd->prot)); diff --git a/drivers/gpu/drm/i915/i915_mm.c b/drivers/gpu/drm/i915/i915_mm.c index e4935dd1fd37..c23bb29e6d3e 100644 --- a/drivers/gpu/drm/i915/i915_mm.c +++ b/drivers/gpu/drm/i915/i915_mm.c @@ -35,8 +35,7 @@ struct remap_pfn { pgprot_t prot; }; -static int remap_pfn(pte_t *pte, pgtable_t token, - unsigned long addr, void *data) +static int remap_pfn(pte_t *pte, unsigned long addr, void *data) { struct remap_pfn *r = data; diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 469dfbd6cf90..4c339c7e66e5 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -264,8 +264,7 @@ void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map) /* ------------------------------------------------------------------ */ -static int find_grant_ptes(pte_t *pte, pgtable_t token, - unsigned long addr, void *data) +static int find_grant_ptes(pte_t *pte, unsigned long addr, void *data) { struct gntdev_grant_map *map = data; unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT; @@ -292,8 +291,7 @@ static int find_grant_ptes(pte_t *pte, pgtable_t token, } #ifdef CONFIG_X86 -static int set_grant_ptes_as_special(pte_t *pte, pgtable_t token, - unsigned long addr, void *data) +static int set_grant_ptes_as_special(pte_t *pte, unsigned long addr, void *data) { set_pte_at(current->mm, addr, pte, pte_mkspecial(*pte)); return 0; diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 1ff38d8036e9..2f5ce7230a43 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -731,8 +731,7 @@ struct remap_pfn { unsigned long i; }; -static int remap_pfn_fn(pte_t *ptep, pgtable_t token, unsigned long addr, - void *data) +static int remap_pfn_fn(pte_t *ptep, unsigned long addr, void *data) { struct remap_pfn *r = data; struct page *page = r->pages[r->i]; @@ -966,8 +965,7 @@ static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) * on a per pfn/pte basis. Mapping calls that fail with ENOENT * can be then retried until success. */ -static int is_mapped_fn(pte_t *pte, struct page *pmd_page, - unsigned long addr, void *data) +static int is_mapped_fn(pte_t *pte, unsigned long addr, void *data) { return pte_none(*pte) ? 0 : -EBUSY; } diff --git a/drivers/xen/xlate_mmu.c b/drivers/xen/xlate_mmu.c index e7df65d32c91..ba883a80b3c0 100644 --- a/drivers/xen/xlate_mmu.c +++ b/drivers/xen/xlate_mmu.c @@ -93,8 +93,7 @@ static void setup_hparams(unsigned long gfn, void *data) info->fgfn++; } -static int remap_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, - void *data) +static int remap_pte_fn(pte_t *ptep, unsigned long addr, void *data) { struct remap_data *info = data; struct page *page = info->pages[info->index++]; diff --git a/include/linux/mm.h b/include/linux/mm.h index cb8d413d635e..bb242ad810eb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2686,8 +2686,7 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) return 0; } -typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, - void *data); +typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); diff --git a/mm/memory.c b/mm/memory.c index b47e4e56448a..0428ff5ee339 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2036,7 +2036,6 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, { pte_t *pte; int err; - pgtable_t token; spinlock_t *uninitialized_var(ptl); pte = (mm == &init_mm) ? @@ -2049,10 +2048,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, arch_enter_lazy_mmu_mode(); - token = pmd_pgtable(*pmd); - do { - err = fn(pte++, token, addr, data); + err = fn(pte++, addr, data); if (err) break; } while (addr += PAGE_SIZE, addr != end); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 030a544e6602..a5413a6e51fa 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2996,7 +2996,7 @@ void __weak vmalloc_sync_all(void) } -static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data) +static int f(pte_t *pte, unsigned long addr, void *data) { pte_t ***p = data; -- cgit v1.2.3-59-g8ed1b From 96756fcb831ddec3ad15f3a107b6e2749084aafb Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Thu, 11 Jul 2019 20:58:47 -0700 Subject: mm/memory.c: fail when offset == num in first check of __vm_map_pages() If the caller asks us for offset == num, we should already fail in the first check, i.e. the one testing for offsets beyond the object. At the moment, we are failing on the second test anyway, since count cannot be 0. Still, to agree with the comment of the first test, we should first test it there. Link: http://lkml.kernel.org/r/20190528193004.GA7744@gmail.com Signed-off-by: Miguel Ojeda Reviewed-by: Andrew Morton Cc: Souptick Joarder Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: "Aneesh Kumar K.V" Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 0428ff5ee339..ad4bf1a1a0ef 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1545,7 +1545,7 @@ static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages, int ret, i; /* Fail if the user requested offset is beyond the end of the object */ - if (offset > num) + if (offset >= num) return -ENXIO; /* Fail if the user requested size exceeds available object size */ -- cgit v1.2.3-59-g8ed1b From 543bdb2d825fe2400d6e951f1786d92139a16931 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Thu, 11 Jul 2019 20:58:50 -0700 Subject: mm/mmu_notifier: use hlist_add_head_rcu() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make mmu_notifier_register() safer by issuing a memory barrier before registering a new notifier. This fixes a theoretical bug on weakly ordered CPUs. For example, take this simplified use of notifiers by a driver: my_struct->mn.ops = &my_ops; /* (1) */ mmu_notifier_register(&my_struct->mn, mm) ... hlist_add_head(&mn->hlist, &mm->mmu_notifiers); /* (2) */ ... Once mmu_notifier_register() releases the mm locks, another thread can invalidate a range: mmu_notifier_invalidate_range() ... hlist_for_each_entry_rcu(mn, &mm->mmu_notifiers, hlist) { if (mn->ops->invalidate_range) The read side relies on the data dependency between mn and ops to ensure that the pointer is properly initialized. But the write side doesn't have any dependency between (1) and (2), so they could be reordered and the readers could dereference an invalid mn->ops. mmu_notifier_register() does take all the mm locks before adding to the hlist, but those have acquire semantics which isn't sufficient. By calling hlist_add_head_rcu() instead of hlist_add_head() we update the hlist using a store-release, ensuring that readers see prior initialization of my_struct. This situation is better illustated by litmus test MP+onceassign+derefonce. Link: http://lkml.kernel.org/r/20190502133532.24981-1-jean-philippe.brucker@arm.com Fixes: cddb8a5c14aa ("mmu-notifiers: core") Signed-off-by: Jean-Philippe Brucker Cc: Jérôme Glisse Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmu_notifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 513b9607409d..b5670620aea0 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -274,7 +274,7 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, * thanks to mm_take_all_locks(). */ spin_lock(&mm->mmu_notifier_mm->lock); - hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); + hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list); spin_unlock(&mm->mmu_notifier_mm->lock); mm_drop_all_locks(mm); -- cgit v1.2.3-59-g8ed1b From cacca6baf0b0a2dfe8eb3430b5f81916f35284cc Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 11 Jul 2019 20:58:53 -0700 Subject: mm/vmalloc.c: remove "node" argument Patch series "Some cleanups for the KVA/vmalloc", v5. This patch (of 4): Remove unused argument from the __alloc_vmap_area() function. Link: http://lkml.kernel.org/r/20190606120411.8298-2-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Andrew Morton Reviewed-by: Roman Gushchin Cc: Hillf Danton Cc: Matthew Wilcox Cc: Michal Hocko Cc: Oleksiy Avramchenko Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a5413a6e51fa..b645686ef9b6 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -986,7 +986,7 @@ adjust_va_to_fit_type(struct vmap_area *va, */ static __always_inline unsigned long __alloc_vmap_area(unsigned long size, unsigned long align, - unsigned long vstart, unsigned long vend, int node) + unsigned long vstart, unsigned long vend) { unsigned long nva_start_addr; struct vmap_area *va; @@ -1063,7 +1063,7 @@ retry: * If an allocation fails, the "vend" address is * returned. Therefore trigger the overflow path. */ - addr = __alloc_vmap_area(size, align, vstart, vend, node); + addr = __alloc_vmap_area(size, align, vstart, vend); if (unlikely(addr == vend)) goto overflow; -- cgit v1.2.3-59-g8ed1b From 82dd23e84be3ead53b6d584d836f51852d1096e6 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 11 Jul 2019 20:58:57 -0700 Subject: mm/vmalloc.c: preload a CPU with one object for split purpose Refactor the NE_FIT_TYPE split case when it comes to an allocation of one extra object. We need it in order to build a remaining space. The preload is done per CPU in non-atomic context with GFP_KERNEL flags. More permissive parameters can be beneficial for systems which are suffer from high memory pressure or low memory condition. For example on my KVM system(4xCPUs, no swap, 256MB RAM) i can simulate the failure of page allocation with GFP_NOWAIT flags. Using "stress-ng" tool and starting N workers spinning on fork() and exit(), i can trigger below trace: [ 179.815161] stress-ng-fork: page allocation failure: order:0, mode:0x40800(GFP_NOWAIT|__GFP_COMP), nodemask=(null),cpuset=/,mems_allowed=0 [ 179.815168] CPU: 0 PID: 12612 Comm: stress-ng-fork Not tainted 5.2.0-rc3+ #1003 [ 179.815170] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [ 179.815171] Call Trace: [ 179.815178] dump_stack+0x5c/0x7b [ 179.815182] warn_alloc+0x108/0x190 [ 179.815187] __alloc_pages_slowpath+0xdc7/0xdf0 [ 179.815191] __alloc_pages_nodemask+0x2de/0x330 [ 179.815194] cache_grow_begin+0x77/0x420 [ 179.815197] fallback_alloc+0x161/0x200 [ 179.815200] kmem_cache_alloc+0x1c9/0x570 [ 179.815202] alloc_vmap_area+0x32c/0x990 [ 179.815206] __get_vm_area_node+0xb0/0x170 [ 179.815208] __vmalloc_node_range+0x6d/0x230 [ 179.815211] ? _do_fork+0xce/0x3d0 [ 179.815213] copy_process.part.46+0x850/0x1b90 [ 179.815215] ? _do_fork+0xce/0x3d0 [ 179.815219] _do_fork+0xce/0x3d0 [ 179.815226] ? __do_page_fault+0x2bf/0x4e0 [ 179.815229] do_syscall_64+0x55/0x130 [ 179.815231] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 179.815234] RIP: 0033:0x7fedec4c738b ... [ 179.815237] RSP: 002b:00007ffda469d730 EFLAGS: 00000246 ORIG_RAX: 0000000000000038 [ 179.815239] RAX: ffffffffffffffda RBX: 00007ffda469d730 RCX: 00007fedec4c738b [ 179.815240] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000001200011 [ 179.815241] RBP: 00007ffda469d780 R08: 00007fededd6e300 R09: 00007ffda47f50a0 [ 179.815242] R10: 00007fededd6e5d0 R11: 0000000000000246 R12: 0000000000000000 [ 179.815243] R13: 0000000000000020 R14: 0000000000000000 R15: 0000000000000000 [ 179.815245] Mem-Info: [ 179.815249] active_anon:12686 inactive_anon:14760 isolated_anon:0 active_file:502 inactive_file:61 isolated_file:70 unevictable:2 dirty:0 writeback:0 unstable:0 slab_reclaimable:2380 slab_unreclaimable:7520 mapped:15069 shmem:14813 pagetables:10833 bounce:0 free:1922 free_pcp:229 free_cma:0 Link: http://lkml.kernel.org/r/20190606120411.8298-3-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Hillf Danton Cc: Matthew Wilcox Cc: Michal Hocko Cc: Oleksiy Avramchenko Cc: Roman Gushchin Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b645686ef9b6..45e0dc0e09f8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -365,6 +365,13 @@ static LIST_HEAD(free_vmap_area_list); */ static struct rb_root free_vmap_area_root = RB_ROOT; +/* + * Preload a CPU with one object for "no edge" split case. The + * aim is to get rid of allocations from the atomic context, thus + * to use more permissive allocation masks. + */ +static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node); + static __always_inline unsigned long va_size(struct vmap_area *va) { @@ -951,9 +958,24 @@ adjust_va_to_fit_type(struct vmap_area *va, * L V NVA V R * |---|-------|---| */ - lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); - if (unlikely(!lva)) - return -1; + lva = __this_cpu_xchg(ne_fit_preload_node, NULL); + if (unlikely(!lva)) { + /* + * For percpu allocator we do not do any pre-allocation + * and leave it as it is. The reason is it most likely + * never ends up with NE_FIT_TYPE splitting. In case of + * percpu allocations offsets and sizes are aligned to + * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE + * are its main fitting cases. + * + * There are a few exceptions though, as an example it is + * a first allocation (early boot up) when we have "one" + * big free space that has to be split. + */ + lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); + if (!lva) + return -1; + } /* * Build the remainder. @@ -1032,7 +1054,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, unsigned long vstart, unsigned long vend, int node, gfp_t gfp_mask) { - struct vmap_area *va; + struct vmap_area *va, *pva; unsigned long addr; int purged = 0; @@ -1057,7 +1079,32 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK); retry: + /* + * Preload this CPU with one extra vmap_area object to ensure + * that we have it available when fit type of free area is + * NE_FIT_TYPE. + * + * The preload is done in non-atomic context, thus it allows us + * to use more permissive allocation masks to be more stable under + * low memory condition and high memory pressure. + * + * Even if it fails we do not really care about that. Just proceed + * as it is. "overflow" path will refill the cache we allocate from. + */ + preempt_disable(); + if (!__this_cpu_read(ne_fit_preload_node)) { + preempt_enable(); + pva = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, node); + preempt_disable(); + + if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) { + if (pva) + kmem_cache_free(vmap_area_cachep, pva); + } + } + spin_lock(&vmap_area_lock); + preempt_enable(); /* * If an allocation fails, the "vend" address is -- cgit v1.2.3-59-g8ed1b From 54f63d9d8a39118486eb8a7168cda5845240c3d2 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 11 Jul 2019 20:59:00 -0700 Subject: mm/vmalloc.c: get rid of one single unlink_va() when merge It does not make sense to try to "unlink" the node that is definitely not linked with a list nor tree. On the first merge step VA just points to the previously disconnected busy area. On the second step, check if the node has been merged and do "unlink" if so, because now it points to an object that must be linked. Link: http://lkml.kernel.org/r/20190606120411.8298-4-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Hillf Danton Reviewed-by: Roman Gushchin Cc: Matthew Wilcox Cc: Michal Hocko Cc: Oleksiy Avramchenko Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 45e0dc0e09f8..857dd8415a2e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -719,9 +719,6 @@ merge_or_add_vmap_area(struct vmap_area *va, /* Check and update the tree if needed. */ augment_tree_propagate_from(sibling); - /* Remove this VA, it has been merged. */ - unlink_va(va, root); - /* Free vmap_area object. */ kmem_cache_free(vmap_area_cachep, va); @@ -746,12 +743,11 @@ merge_or_add_vmap_area(struct vmap_area *va, /* Check and update the tree if needed. */ augment_tree_propagate_from(sibling); - /* Remove this VA, it has been merged. */ - unlink_va(va, root); + if (merged) + unlink_va(va, root); /* Free vmap_area object. */ kmem_cache_free(vmap_area_cachep, va); - return; } } -- cgit v1.2.3-59-g8ed1b From 460e42d19a13d49455c5b269e8e0a1b1d522a895 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 11 Jul 2019 20:59:03 -0700 Subject: mm/vmalloc.c: switch to WARN_ON() and move it under unlink_va() Trigger a warning if an object that is about to be freed is detached. We used to have a BUG_ON(), but even though it is considered as faulty behaviour that is not a good reason to break a system. Link: http://lkml.kernel.org/r/20190606120411.8298-5-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Roman Gushchin Cc: Hillf Danton Cc: Michal Hocko Cc: Matthew Wilcox Cc: Oleksiy Avramchenko Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 857dd8415a2e..84f50d7e40bc 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -534,20 +534,17 @@ link_va(struct vmap_area *va, struct rb_root *root, static __always_inline void unlink_va(struct vmap_area *va, struct rb_root *root) { - /* - * During merging a VA node can be empty, therefore - * not linked with the tree nor list. Just check it. - */ - if (!RB_EMPTY_NODE(&va->rb_node)) { - if (root == &free_vmap_area_root) - rb_erase_augmented(&va->rb_node, - root, &free_vmap_area_rb_augment_cb); - else - rb_erase(&va->rb_node, root); + if (WARN_ON(RB_EMPTY_NODE(&va->rb_node))) + return; - list_del(&va->list); - RB_CLEAR_NODE(&va->rb_node); - } + if (root == &free_vmap_area_root) + rb_erase_augmented(&va->rb_node, + root, &free_vmap_area_rb_augment_cb); + else + rb_erase(&va->rb_node, root); + + list_del(&va->list); + RB_CLEAR_NODE(&va->rb_node); } #if DEBUG_AUGMENT_PROPAGATE_CHECK @@ -1162,8 +1159,6 @@ EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier); static void __free_vmap_area(struct vmap_area *va) { - BUG_ON(RB_EMPTY_NODE(&va->rb_node)); - /* * Remove from the busy tree/list. */ -- cgit v1.2.3-59-g8ed1b From d9009d67f42e59760aae5471ba2f62b3d5d531d1 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 11 Jul 2019 20:59:06 -0700 Subject: mm/vmalloc.c: spelling> s/informaion/information/ Link: http://lkml.kernel.org/r/20190607113509.15032-1-geert+renesas@glider.be Signed-off-by: Geert Uytterhoeven Reviewed-by: Andrew Morton Acked-by: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 84f50d7e40bc..edb212298c8a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2812,7 +2812,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count) * Note: In usual ops, vread() is never necessary because the caller * should know vmalloc() area is valid and can use memcpy(). * This is for routines which have to access vmalloc area without - * any informaion, as /dev/kmem. + * any information, as /dev/kmem. * * Return: number of bytes for which addr and buf should be increased * (same number as @count) or %0 if [addr...addr+count) doesn't @@ -2891,7 +2891,7 @@ finished: * Note: In usual ops, vwrite() is never necessary because the caller * should know vmalloc() area is valid and can use memcpy(). * This is for routines which have to access vmalloc area without - * any informaion, as /dev/kmem. + * any information, as /dev/kmem. * * Return: number of bytes for which addr and buf should be * increased (same number as @count) or %0 if [addr...addr+count) -- cgit v1.2.3-59-g8ed1b From ec11408a1630eed2cb03db55b8b372267f5f1032 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 11 Jul 2019 20:59:09 -0700 Subject: mm/large system hash: use vmalloc for size > MAX_ORDER when !hashdist The kernel currently clamps large system hashes to MAX_ORDER when hashdist is not set, which is rather arbitrary. vmalloc space is limited on 32-bit machines, but this shouldn't result in much more used because of small physical memory limiting system hash sizes. Include "vmalloc" or "linear" in the kernel log message. Link: http://lkml.kernel.org/r/20190605144814.29319-1-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ae56e8feec0c..05143e0f821f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7981,6 +7981,7 @@ void *__init alloc_large_system_hash(const char *tablename, unsigned long log2qty, size; void *table = NULL; gfp_t gfp_flags; + bool virt; /* allow the kernel cmdline to have a say */ if (!numentries) { @@ -8037,6 +8038,7 @@ void *__init alloc_large_system_hash(const char *tablename, gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; do { + virt = false; size = bucketsize << log2qty; if (flags & HASH_EARLY) { if (flags & HASH_ZERO) @@ -8044,26 +8046,26 @@ void *__init alloc_large_system_hash(const char *tablename, else table = memblock_alloc_raw(size, SMP_CACHE_BYTES); - } else if (hashdist) { + } else if (get_order(size) >= MAX_ORDER || hashdist) { table = __vmalloc(size, gfp_flags, PAGE_KERNEL); + virt = true; } else { /* * If bucketsize is not a power-of-two, we may free * some pages at the end of hash table which * alloc_pages_exact() automatically does */ - if (get_order(size) < MAX_ORDER) { - table = alloc_pages_exact(size, gfp_flags); - kmemleak_alloc(table, size, 1, gfp_flags); - } + table = alloc_pages_exact(size, gfp_flags); + kmemleak_alloc(table, size, 1, gfp_flags); } } while (!table && size > PAGE_SIZE && --log2qty); if (!table) panic("Failed to allocate %s hash table\n", tablename); - pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n", - tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size); + pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n", + tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size, + virt ? "vmalloc" : "linear"); if (_hash_shift) *_hash_shift = log2qty; -- cgit v1.2.3-59-g8ed1b From e03a5125ec7bd1d4e8b062816a8813436876dc7c Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 11 Jul 2019 20:59:12 -0700 Subject: mm/large system hash: clear hashdist when only one node with memory is booted CONFIG_NUMA on 64-bit CPUs currently enables hashdist unconditionally even when booting on single node machines. This causes the large system hashes to be allocated with vmalloc, and mapped with small pages. This change clears hashdist if only one node has come up with memory. This results in the important large inode and dentry hashes using memblock allocations. All others are within 4MB size up to about 128GB of RAM, which allows them to be allocated from the linear map on most non-NUMA images. Other big hashes like futex and TCP should eventually be moved over to the same style of allocation as those vfs caches that use HASH_EARLY if !hashdist, so they don't exceed MAX_ORDER on very large non-NUMA images. This brings dTLB misses for linux kernel tree `git diff` from ~45,000 to ~8,000 on a Kaby Lake KVM guest with 8MB dentry hash and mitigations=off (performance is in the noise, under 1% difference, page tables are likely to be well cached for this workload). Link: http://lkml.kernel.org/r/20190605144814.29319-2-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 05143e0f821f..3a555ce69006 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7534,10 +7534,28 @@ static int page_alloc_cpu_dead(unsigned int cpu) return 0; } +#ifdef CONFIG_NUMA +int hashdist = HASHDIST_DEFAULT; + +static int __init set_hashdist(char *str) +{ + if (!str) + return 0; + hashdist = simple_strtoul(str, &str, 0); + return 1; +} +__setup("hashdist=", set_hashdist); +#endif + void __init page_alloc_init(void) { int ret; +#ifdef CONFIG_NUMA + if (num_node_state(N_MEMORY) == 1) + hashdist = 0; +#endif + ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD, "mm/page_alloc:dead", NULL, page_alloc_cpu_dead); @@ -7922,19 +7940,6 @@ out: return ret; } -#ifdef CONFIG_NUMA -int hashdist = HASHDIST_DEFAULT; - -static int __init set_hashdist(char *str) -{ - if (!str) - return 0; - hashdist = simple_strtoul(str, &str, 0); - return 1; -} -__setup("hashdist=", set_hashdist); -#endif - #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES /* * Returns the number of pages that arch has reserved but -- cgit v1.2.3-59-g8ed1b From 6471384af2a6530696fc0203bafe4de41a23c9ef Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 11 Jul 2019 20:59:19 -0700 Subject: mm: security: introduce init_on_alloc=1 and init_on_free=1 boot options Patch series "add init_on_alloc/init_on_free boot options", v10. Provide init_on_alloc and init_on_free boot options. These are aimed at preventing possible information leaks and making the control-flow bugs that depend on uninitialized values more deterministic. Enabling either of the options guarantees that the memory returned by the page allocator and SL[AU]B is initialized with zeroes. SLOB allocator isn't supported at the moment, as its emulation of kmem caches complicates handling of SLAB_TYPESAFE_BY_RCU caches correctly. Enabling init_on_free also guarantees that pages and heap objects are initialized right after they're freed, so it won't be possible to access stale data by using a dangling pointer. As suggested by Michal Hocko, right now we don't let the heap users to disable initialization for certain allocations. There's not enough evidence that doing so can speed up real-life cases, and introducing ways to opt-out may result in things going out of control. This patch (of 2): The new options are needed to prevent possible information leaks and make control-flow bugs that depend on uninitialized values more deterministic. This is expected to be on-by-default on Android and Chrome OS. And it gives the opportunity for anyone else to use it under distros too via the boot args. (The init_on_free feature is regularly requested by folks where memory forensics is included in their threat models.) init_on_alloc=1 makes the kernel initialize newly allocated pages and heap objects with zeroes. Initialization is done at allocation time at the places where checks for __GFP_ZERO are performed. init_on_free=1 makes the kernel initialize freed pages and heap objects with zeroes upon their deletion. This helps to ensure sensitive data doesn't leak via use-after-free accesses. Both init_on_alloc=1 and init_on_free=1 guarantee that the allocator returns zeroed memory. The two exceptions are slab caches with constructors and SLAB_TYPESAFE_BY_RCU flag. Those are never zero-initialized to preserve their semantics. Both init_on_alloc and init_on_free default to zero, but those defaults can be overridden with CONFIG_INIT_ON_ALLOC_DEFAULT_ON and CONFIG_INIT_ON_FREE_DEFAULT_ON. If either SLUB poisoning or page poisoning is enabled, those options take precedence over init_on_alloc and init_on_free: initialization is only applied to unpoisoned allocations. Slowdown for the new features compared to init_on_free=0, init_on_alloc=0: hackbench, init_on_free=1: +7.62% sys time (st.err 0.74%) hackbench, init_on_alloc=1: +7.75% sys time (st.err 2.14%) Linux build with -j12, init_on_free=1: +8.38% wall time (st.err 0.39%) Linux build with -j12, init_on_free=1: +24.42% sys time (st.err 0.52%) Linux build with -j12, init_on_alloc=1: -0.13% wall time (st.err 0.42%) Linux build with -j12, init_on_alloc=1: +0.57% sys time (st.err 0.40%) The slowdown for init_on_free=0, init_on_alloc=0 compared to the baseline is within the standard error. The new features are also going to pave the way for hardware memory tagging (e.g. arm64's MTE), which will require both on_alloc and on_free hooks to set the tags for heap objects. With MTE, tagging will have the same cost as memory initialization. Although init_on_free is rather costly, there are paranoid use-cases where in-memory data lifetime is desired to be minimized. There are various arguments for/against the realism of the associated threat models, but given that we'll need the infrastructure for MTE anyway, and there are people who want wipe-on-free behavior no matter what the performance cost, it seems reasonable to include it in this series. [glider@google.com: v8] Link: http://lkml.kernel.org/r/20190626121943.131390-2-glider@google.com [glider@google.com: v9] Link: http://lkml.kernel.org/r/20190627130316.254309-2-glider@google.com [glider@google.com: v10] Link: http://lkml.kernel.org/r/20190628093131.199499-2-glider@google.com Link: http://lkml.kernel.org/r/20190617151050.92663-2-glider@google.com Signed-off-by: Alexander Potapenko Acked-by: Kees Cook Acked-by: Michal Hocko [page and dmapool parts Acked-by: James Morris ] Cc: Christoph Lameter Cc: Masahiro Yamada Cc: "Serge E. Hallyn" Cc: Nick Desaulniers Cc: Kostya Serebryany Cc: Dmitry Vyukov Cc: Sandeep Patil Cc: Laura Abbott Cc: Randy Dunlap Cc: Jann Horn Cc: Mark Rutland Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 9 ++++ drivers/infiniband/core/uverbs_ioctl.c | 2 +- include/linux/mm.h | 24 +++++++++ mm/dmapool.c | 4 +- mm/page_alloc.c | 71 ++++++++++++++++++++++--- mm/slab.c | 16 ++++-- mm/slab.h | 20 +++++++ mm/slub.c | 40 ++++++++++++-- net/core/sock.c | 2 +- security/Kconfig.hardening | 29 ++++++++++ 10 files changed, 199 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index aa4e7e7b87c2..099c5a4be95b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1668,6 +1668,15 @@ initrd= [BOOT] Specify the location of the initial ramdisk + init_on_alloc= [MM] Fill newly allocated pages and heap objects with + zeroes. + Format: 0 | 1 + Default set by CONFIG_INIT_ON_ALLOC_DEFAULT_ON. + + init_on_free= [MM] Fill freed pages and heap objects with zeroes. + Format: 0 | 1 + Default set by CONFIG_INIT_ON_FREE_DEFAULT_ON. + init_pkru= [x86] Specify the default memory protection keys rights register contents for all processes. 0x55555554 by default (disallow access to all but pkey 0). Can diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 829b0c6944d8..61758201d9b2 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -127,7 +127,7 @@ __malloc void *_uverbs_alloc(struct uverbs_attr_bundle *bundle, size_t size, res = (void *)pbundle->internal_buffer + pbundle->internal_used; pbundle->internal_used = ALIGN(new_used, sizeof(*pbundle->internal_buffer)); - if (flags & __GFP_ZERO) + if (want_init_on_alloc(flags)) memset(res, 0, size); return res; } diff --git a/include/linux/mm.h b/include/linux/mm.h index bb242ad810eb..f88f0eabcc5e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2700,6 +2700,30 @@ static inline void kernel_poison_pages(struct page *page, int numpages, int enable) { } #endif +#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON +DECLARE_STATIC_KEY_TRUE(init_on_alloc); +#else +DECLARE_STATIC_KEY_FALSE(init_on_alloc); +#endif +static inline bool want_init_on_alloc(gfp_t flags) +{ + if (static_branch_unlikely(&init_on_alloc) && + !page_poisoning_enabled()) + return true; + return flags & __GFP_ZERO; +} + +#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON +DECLARE_STATIC_KEY_TRUE(init_on_free); +#else +DECLARE_STATIC_KEY_FALSE(init_on_free); +#endif +static inline bool want_init_on_free(void) +{ + return static_branch_unlikely(&init_on_free) && + !page_poisoning_enabled(); +} + #ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT DECLARE_STATIC_KEY_TRUE(_debug_pagealloc_enabled); #else diff --git a/mm/dmapool.c b/mm/dmapool.c index 8c94c89a6f7e..fe5d33060415 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -378,7 +378,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, #endif spin_unlock_irqrestore(&pool->lock, flags); - if (mem_flags & __GFP_ZERO) + if (want_init_on_alloc(mem_flags)) memset(retval, 0, pool->size); return retval; @@ -428,6 +428,8 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) } offset = vaddr - page->vaddr; + if (want_init_on_free()) + memset(vaddr, 0, pool->size); #ifdef DMAPOOL_DEBUG if ((dma - page->dma) != offset) { spin_unlock_irqrestore(&pool->lock, flags); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3a555ce69006..dbd0d5cbbcbb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -135,6 +135,55 @@ unsigned long totalcma_pages __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; +#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON +DEFINE_STATIC_KEY_TRUE(init_on_alloc); +#else +DEFINE_STATIC_KEY_FALSE(init_on_alloc); +#endif +EXPORT_SYMBOL(init_on_alloc); + +#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON +DEFINE_STATIC_KEY_TRUE(init_on_free); +#else +DEFINE_STATIC_KEY_FALSE(init_on_free); +#endif +EXPORT_SYMBOL(init_on_free); + +static int __init early_init_on_alloc(char *buf) +{ + int ret; + bool bool_result; + + if (!buf) + return -EINVAL; + ret = kstrtobool(buf, &bool_result); + if (bool_result && page_poisoning_enabled()) + pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n"); + if (bool_result) + static_branch_enable(&init_on_alloc); + else + static_branch_disable(&init_on_alloc); + return ret; +} +early_param("init_on_alloc", early_init_on_alloc); + +static int __init early_init_on_free(char *buf) +{ + int ret; + bool bool_result; + + if (!buf) + return -EINVAL; + ret = kstrtobool(buf, &bool_result); + if (bool_result && page_poisoning_enabled()) + pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n"); + if (bool_result) + static_branch_enable(&init_on_free); + else + static_branch_disable(&init_on_free); + return ret; +} +early_param("init_on_free", early_init_on_free); /* * A cached value of the page's pageblock's migratetype, used when the page is @@ -1067,6 +1116,14 @@ out: return ret; } +static void kernel_init_free_pages(struct page *page, int numpages) +{ + int i; + + for (i = 0; i < numpages; i++) + clear_highpage(page + i); +} + static __always_inline bool free_pages_prepare(struct page *page, unsigned int order, bool check_free) { @@ -1118,6 +1175,9 @@ static __always_inline bool free_pages_prepare(struct page *page, PAGE_SIZE << order); } arch_free_page(page, order); + if (want_init_on_free()) + kernel_init_free_pages(page, 1 << order); + kernel_poison_pages(page, 1 << order, 0); if (debug_pagealloc_enabled()) kernel_map_pages(page, 1 << order, 0); @@ -2019,8 +2079,8 @@ static inline int check_new_page(struct page *page) static inline bool free_pages_prezeroed(void) { - return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && - page_poisoning_enabled(); + return (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && + page_poisoning_enabled()) || want_init_on_free(); } #ifdef CONFIG_DEBUG_VM @@ -2090,13 +2150,10 @@ inline void post_alloc_hook(struct page *page, unsigned int order, static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, unsigned int alloc_flags) { - int i; - post_alloc_hook(page, order, gfp_flags); - if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO)) - for (i = 0; i < (1 << order); i++) - clear_highpage(page + i); + if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags)) + kernel_init_free_pages(page, 1 << order); if (order && (gfp_flags & __GFP_COMP)) prep_compound_page(page, order); diff --git a/mm/slab.c b/mm/slab.c index e9d90b0da47b..9df370558e5d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1811,6 +1811,14 @@ static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, cachep->num = 0; + /* + * If slab auto-initialization on free is enabled, store the freelist + * off-slab, so that its contents don't end up in one of the allocated + * objects. + */ + if (unlikely(slab_want_init_on_free(cachep))) + return false; + if (cachep->ctor || flags & SLAB_TYPESAFE_BY_RCU) return false; @@ -3248,7 +3256,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, local_irq_restore(save_flags); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); - if (unlikely(flags & __GFP_ZERO) && ptr) + if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr) memset(ptr, 0, cachep->object_size); slab_post_alloc_hook(cachep, flags, 1, &ptr); @@ -3305,7 +3313,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); prefetchw(objp); - if (unlikely(flags & __GFP_ZERO) && objp) + if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp) memset(objp, 0, cachep->object_size); slab_post_alloc_hook(cachep, flags, 1, &objp); @@ -3426,6 +3434,8 @@ void ___cache_free(struct kmem_cache *cachep, void *objp, struct array_cache *ac = cpu_cache_get(cachep); check_irq_off(); + if (unlikely(slab_want_init_on_free(cachep))) + memset(objp, 0, cachep->object_size); kmemleak_free_recursive(objp, cachep->flags); objp = cache_free_debugcheck(cachep, objp, caller); @@ -3513,7 +3523,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_); /* Clear memory outside IRQ disabled section */ - if (unlikely(flags & __GFP_ZERO)) + if (unlikely(slab_want_init_on_alloc(flags, s))) for (i = 0; i < size; i++) memset(p[i], 0, s->object_size); diff --git a/mm/slab.h b/mm/slab.h index a62372d0f271..9057b8056b07 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -607,4 +607,24 @@ static inline int cache_random_seq_create(struct kmem_cache *cachep, static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } #endif /* CONFIG_SLAB_FREELIST_RANDOM */ +static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) +{ + if (static_branch_unlikely(&init_on_alloc)) { + if (c->ctor) + return false; + if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) + return flags & __GFP_ZERO; + return true; + } + return flags & __GFP_ZERO; +} + +static inline bool slab_want_init_on_free(struct kmem_cache *c) +{ + if (static_branch_unlikely(&init_on_free)) + return !(c->ctor || + (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))); + return false; +} + #endif /* MM_SLAB_H */ diff --git a/mm/slub.c b/mm/slub.c index c9541a480627..e6c030e47364 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1279,6 +1279,10 @@ check_slabs: if (*str == ',') slub_debug_slabs = str + 1; out: + if ((static_branch_unlikely(&init_on_alloc) || + static_branch_unlikely(&init_on_free)) && + (slub_debug & SLAB_POISON)) + pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n"); return 1; } @@ -1422,6 +1426,28 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x) static inline bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail) { + + void *object; + void *next = *head; + void *old_tail = *tail ? *tail : *head; + int rsize; + + if (slab_want_init_on_free(s)) + do { + object = next; + next = get_freepointer(s, object); + /* + * Clear the object and the metadata, but don't touch + * the redzone. + */ + memset(object, 0, s->object_size); + rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad + : 0; + memset((char *)object + s->inuse, 0, + s->size - s->inuse - rsize); + set_freepointer(s, object, next); + } while (object != old_tail); + /* * Compiler cannot detect this function can be removed if slab_free_hook() * evaluates to nothing. Thus, catch all relevant config debug options here. @@ -1431,9 +1457,7 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, defined(CONFIG_DEBUG_OBJECTS_FREE) || \ defined(CONFIG_KASAN) - void *object; - void *next = *head; - void *old_tail = *tail ? *tail : *head; + next = *head; /* Head and tail of the reconstructed freelist */ *head = NULL; @@ -2729,8 +2753,14 @@ redo: prefetch_freepointer(s, next_object); stat(s, ALLOC_FASTPATH); } + /* + * If the object has been wiped upon free, make sure it's fully + * initialized by zeroing out freelist pointer. + */ + if (unlikely(slab_want_init_on_free(s)) && object) + memset(object + s->offset, 0, sizeof(void *)); - if (unlikely(gfpflags & __GFP_ZERO) && object) + if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object) memset(object, 0, s->object_size); slab_post_alloc_hook(s, gfpflags, 1, &object); @@ -3151,7 +3181,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, local_irq_enable(); /* Clear memory outside IRQ disabled fastpath loop */ - if (unlikely(flags & __GFP_ZERO)) { + if (unlikely(slab_want_init_on_alloc(flags, s))) { int j; for (j = 0; j < i; j++) diff --git a/net/core/sock.c b/net/core/sock.c index 3e073ca6138f..d57b0cc995a0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1597,7 +1597,7 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); if (!sk) return sk; - if (priority & __GFP_ZERO) + if (want_init_on_alloc(priority)) sk_prot_clear_nulls(sk, prot->obj_size); } else sk = kmalloc(prot->obj_size, priority); diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening index c6cb2d9b2905..a1ffe2eb4d5f 100644 --- a/security/Kconfig.hardening +++ b/security/Kconfig.hardening @@ -160,6 +160,35 @@ config STACKLEAK_RUNTIME_DISABLE runtime to control kernel stack erasing for kernels built with CONFIG_GCC_PLUGIN_STACKLEAK. +config INIT_ON_ALLOC_DEFAULT_ON + bool "Enable heap memory zeroing on allocation by default" + help + This has the effect of setting "init_on_alloc=1" on the kernel + command line. This can be disabled with "init_on_alloc=0". + When "init_on_alloc" is enabled, all page allocator and slab + allocator memory will be zeroed when allocated, eliminating + many kinds of "uninitialized heap memory" flaws, especially + heap content exposures. The performance impact varies by + workload, but most cases see <1% impact. Some synthetic + workloads have measured as high as 7%. + +config INIT_ON_FREE_DEFAULT_ON + bool "Enable heap memory zeroing on free by default" + help + This has the effect of setting "init_on_free=1" on the kernel + command line. This can be disabled with "init_on_free=0". + Similar to "init_on_alloc", when "init_on_free" is enabled, + all page allocator and slab allocator memory will be zeroed + when freed, eliminating many kinds of "uninitialized heap memory" + flaws, especially heap content exposures. The primary difference + with "init_on_free" is that data lifetime in memory is reduced, + as anything freed is wiped immediately, making live forensics or + cold boot memory attacks unable to recover freed memory contents. + The performance impact varies by workload, but is more expensive + than "init_on_alloc" due to the negative cache effects of + touching "cold" memory areas. Most cases see 3-5% impact. Some + synthetic workloads have measured as high as 8%. + endmenu endmenu -- cgit v1.2.3-59-g8ed1b From af5d440365894b5ca51f29866c1a01496dce52c4 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 11 Jul 2019 20:59:27 -0700 Subject: mm: vmscan: remove double slab pressure by inc'ing sc->nr_scanned Commit 9092c71bb724 ("mm: use sc->priority for slab shrink targets") has broken up the relationship between sc->nr_scanned and slab pressure. The sc->nr_scanned can't double slab pressure anymore. So, it sounds no sense to still keep sc->nr_scanned inc'ed. Actually, it would prevent from adding pressure on slab shrink since excessive sc->nr_scanned would prevent from scan->priority raise. The bonnie test doesn't show this would change the behavior of slab shrinkers. w/ w/o /sec %CP /sec %CP Sequential delete: 3960.6 94.6 3997.6 96.2 Random delete: 2518 63.8 2561.6 64.6 The slight increase of "/sec" without the patch would be caused by the slight increase of CPU usage. Link: http://lkml.kernel.org/r/1559025859-72759-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Acked-by: Johannes Weiner Cc: Josef Bacik Cc: Michal Hocko Cc: Mel Gorman Cc: "Kirill A . Shutemov" Cc: Hugh Dickins Cc: Shakeel Butt Cc: Hillf Danton Cc: "Huang, Ying" Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 96aafbf8ce4e..277a36de11c1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1137,11 +1137,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (!sc->may_unmap && page_mapped(page)) goto keep_locked; - /* Double the slab pressure for mapped and swapcache pages */ - if ((page_mapped(page) || PageSwapCache(page)) && - !(PageAnon(page) && !PageSwapBacked(page))) - sc->nr_scanned++; - may_enter_fs = (sc->gfp_mask & __GFP_FS) || (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); -- cgit v1.2.3-59-g8ed1b From 98879b3b9edc1604f2d1a6686576ef4d08ed3310 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 11 Jul 2019 20:59:30 -0700 Subject: mm: vmscan: correct some vmscan counters for THP swapout Commit bd4c82c22c36 ("mm, THP, swap: delay splitting THP after swapped out"), THP can be swapped out in a whole. But, nr_reclaimed and some other vm counters still get inc'ed by one even though a whole THP (512 pages) gets swapped out. This doesn't make too much sense to memory reclaim. For example, direct reclaim may just need reclaim SWAP_CLUSTER_MAX pages, reclaiming one THP could fulfill it. But, if nr_reclaimed is not increased correctly, direct reclaim may just waste time to reclaim more pages, SWAP_CLUSTER_MAX * 512 pages in worst case. And, it may cause pgsteal_{kswapd|direct} is greater than pgscan_{kswapd|direct}, like the below: pgsteal_kswapd 122933 pgsteal_direct 26600225 pgscan_kswapd 174153 pgscan_direct 14678312 nr_reclaimed and nr_scanned must be fixed in parallel otherwise it would break some page reclaim logic, e.g. vmpressure: this looks at the scanned/reclaimed ratio so it won't change semantics as long as scanned & reclaimed are fixed in parallel. compaction/reclaim: compaction wants a certain number of physical pages freed up before going back to compacting. kswapd priority raising: kswapd raises priority if we scan fewer pages than the reclaim target (which itself is obviously expressed in order-0 pages). As a result, kswapd can falsely raise its aggressiveness even when it's making great progress. Other than nr_scanned and nr_reclaimed, some other counters, e.g. pgactivate, nr_skipped, nr_ref_keep and nr_unmap_fail need to be fixed too since they are user visible via cgroup, /proc/vmstat or trace points, otherwise they would be underreported. When isolating pages from LRUs, nr_taken has been accounted in base page, but nr_scanned and nr_skipped are still accounted in THP. It doesn't make too much sense too since this may cause trace point underreport the numbers as well. So accounting those counters in base page instead of accounting THP as one page. nr_dirty, nr_unqueued_dirty, nr_congested and nr_writeback are used by file cache, so they are not impacted by THP swap. This change may result in lower steal/scan ratio in some cases since THP may get split during page reclaim, then a part of tail pages get reclaimed instead of the whole 512 pages, but nr_scanned is accounted by 512, particularly for direct reclaim. But, this should be not a significant issue. Link: http://lkml.kernel.org/r/1559025859-72759-2-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Reviewed-by: "Huang, Ying" Cc: Johannes Weiner Cc: Michal Hocko Cc: Mel Gorman Cc: "Kirill A . Shutemov" Cc: Hugh Dickins Cc: Shakeel Butt Cc: Hillf Danton Cc: Josef Bacik Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 277a36de11c1..f8e3dcd527b8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1118,6 +1118,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, int may_enter_fs; enum page_references references = PAGEREF_RECLAIM_CLEAN; bool dirty, writeback; + unsigned int nr_pages; cond_resched(); @@ -1129,7 +1130,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, VM_BUG_ON_PAGE(PageActive(page), page); - sc->nr_scanned++; + nr_pages = 1 << compound_order(page); + + /* Account the number of base pages even though THP */ + sc->nr_scanned += nr_pages; if (unlikely(!page_evictable(page))) goto activate_locked; @@ -1250,7 +1254,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, case PAGEREF_ACTIVATE: goto activate_locked; case PAGEREF_KEEP: - stat->nr_ref_keep++; + stat->nr_ref_keep += nr_pages; goto keep_locked; case PAGEREF_RECLAIM: case PAGEREF_RECLAIM_CLEAN: @@ -1282,7 +1286,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } if (!add_to_swap(page)) { if (!PageTransHuge(page)) - goto activate_locked; + goto activate_locked_split; /* Fallback to swap normal pages */ if (split_huge_page_to_list(page, page_list)) @@ -1291,7 +1295,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, count_vm_event(THP_SWPOUT_FALLBACK); #endif if (!add_to_swap(page)) - goto activate_locked; + goto activate_locked_split; } may_enter_fs = 1; @@ -1305,6 +1309,18 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; } + /* + * THP may get split above, need minus tail pages and update + * nr_pages to avoid accounting tail pages twice. + * + * The tail pages that are added into swap cache successfully + * reach here. + */ + if ((nr_pages > 1) && !PageTransHuge(page)) { + sc->nr_scanned -= (nr_pages - 1); + nr_pages = 1; + } + /* * The page is mapped into the page tables of one or more * processes. Try to unmap it here. @@ -1315,7 +1331,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (unlikely(PageTransHuge(page))) flags |= TTU_SPLIT_HUGE_PMD; if (!try_to_unmap(page, flags)) { - stat->nr_unmap_fail++; + stat->nr_unmap_fail += nr_pages; goto activate_locked; } } @@ -1442,7 +1458,11 @@ static unsigned long shrink_page_list(struct list_head *page_list, unlock_page(page); free_it: - nr_reclaimed++; + /* + * THP may get swapped out in a whole, need account + * all base pages. + */ + nr_reclaimed += nr_pages; /* * Is there need to periodically free_page_list? It would @@ -1455,6 +1475,15 @@ free_it: list_add(&page->lru, &free_pages); continue; +activate_locked_split: + /* + * The tail pages that are failed to add into swap cache + * reach here. Fixup nr_scanned and nr_pages. + */ + if (nr_pages > 1) { + sc->nr_scanned -= (nr_pages - 1); + nr_pages = 1; + } activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ if (PageSwapCache(page) && (mem_cgroup_swap_full(page) || @@ -1464,8 +1493,7 @@ activate_locked: if (!PageMlocked(page)) { int type = page_is_file_cache(page); SetPageActive(page); - pgactivate++; - stat->nr_activate[type] += hpage_nr_pages(page); + stat->nr_activate[type] += nr_pages; count_memcg_page_event(page, PGACTIVATE); } keep_locked: @@ -1475,6 +1503,8 @@ keep: VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); } + pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; + mem_cgroup_uncharge_list(&free_pages); try_to_unmap_flush(); free_unref_page_list(&free_pages); @@ -1646,10 +1676,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, LIST_HEAD(pages_skipped); isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED); + total_scan = 0; scan = 0; - for (total_scan = 0; - scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src); - total_scan++) { + while (scan < nr_to_scan && !list_empty(src)) { struct page *page; page = lru_to_page(src); @@ -1657,9 +1686,12 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, VM_BUG_ON_PAGE(!PageLRU(page), page); + nr_pages = 1 << compound_order(page); + total_scan += nr_pages; + if (page_zonenum(page) > sc->reclaim_idx) { list_move(&page->lru, &pages_skipped); - nr_skipped[page_zonenum(page)]++; + nr_skipped[page_zonenum(page)] += nr_pages; continue; } @@ -1668,11 +1700,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, * return with no isolated pages if the LRU mostly contains * ineligible pages. This causes the VM to not reclaim any * pages, triggering a premature OOM. + * + * Account all tail pages of THP. This would not cause + * premature OOM since __isolate_lru_page() returns -EBUSY + * only when the page is being freed somewhere else. */ - scan++; + scan += nr_pages; switch (__isolate_lru_page(page, mode)) { case 0: - nr_pages = hpage_nr_pages(page); nr_taken += nr_pages; nr_zone_taken[page_zonenum(page)] += nr_pages; list_move(&page->lru, dst); -- cgit v1.2.3-59-g8ed1b From 1e426fe28261b03f297992e89da3320b42816f4e Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 11 Jul 2019 21:00:07 -0700 Subject: mm: use down_read_killable for locking mmap_sem in access_remote_vm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This function is used by ptrace and proc files like /proc/pid/cmdline and /proc/pid/environ. Access_remote_vm never returns error codes, all errors are ignored and only size of successfully read data is returned. So, if current task was killed we'll simply return 0 (bytes read). Mmap_sem could be locked for a long time or forever if something goes wrong. Using a killable lock permits cleanup of stuck tasks and simplifies investigation. Link: http://lkml.kernel.org/r/156007494202.3335.16782303099589302087.stgit@buzz Signed-off-by: Konstantin Khlebnikov Reviewed-by: Michal Koutný Acked-by: Oleg Nesterov Acked-by: Michal Hocko Cc: Alexey Dobriyan Cc: Matthew Wilcox Cc: Cyrill Gorcunov Cc: Kirill Tkhai Cc: Al Viro Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 4 +++- mm/nommu.c | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index ad4bf1a1a0ef..53bd59579861 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4344,7 +4344,9 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, void *old_buf = buf; int write = gup_flags & FOLL_WRITE; - down_read(&mm->mmap_sem); + if (down_read_killable(&mm->mmap_sem)) + return 0; + /* ignore errors, just check how much was successfully transferred */ while (len) { int bytes, ret, offset; diff --git a/mm/nommu.c b/mm/nommu.c index 07165ad2e548..eb3e2e558da1 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1704,7 +1704,8 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, struct vm_area_struct *vma; int write = gup_flags & FOLL_WRITE; - down_read(&mm->mmap_sem); + if (down_read_killable(&mm->mmap_sem)) + return 0; /* the access must start within one of the target process's mappings */ vma = find_vma(mm, addr); -- cgit v1.2.3-59-g8ed1b From 97105f0ab7b877a8ece2005e214894e93793950c Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 21:00:13 -0700 Subject: mm: vmalloc: show number of vmalloc pages in /proc/meminfo Vmalloc() is getting more and more used these days (kernel stacks, bpf and percpu allocator are new top users), and the total % of memory consumed by vmalloc() can be pretty significant and changes dynamically. /proc/meminfo is the best place to display this information: its top goal is to show top consumers of the memory. Since the VmallocUsed field in /proc/meminfo is not in use for quite a long time (it has been defined to 0 by a5ad88ce8c7f ("mm: get rid of 'vmalloc_info' from /proc/meminfo")), let's reuse it for showing the actual physical memory consumption of vmalloc(). Link: http://lkml.kernel.org/r/20190417194002.12369-3-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Reviewed-by: Andrew Morton Cc: Matthew Wilcox Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 2 +- include/linux/vmalloc.h | 2 ++ mm/vmalloc.c | 10 ++++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 568d90e17c17..465ea0153b2a 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -120,7 +120,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "Committed_AS: ", committed); seq_printf(m, "VmallocTotal: %8lu kB\n", (unsigned long)VMALLOC_TOTAL >> 10); - show_val_kb(m, "VmallocUsed: ", 0ul); + show_val_kb(m, "VmallocUsed: ", vmalloc_nr_pages()); show_val_kb(m, "VmallocChunk: ", 0ul); show_val_kb(m, "Percpu: ", pcpu_nr_pages()); diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 51e131245379..9b21d0047710 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -72,10 +72,12 @@ extern void vm_unmap_aliases(void); #ifdef CONFIG_MMU extern void __init vmalloc_init(void); +extern unsigned long vmalloc_nr_pages(void); #else static inline void vmalloc_init(void) { } +static inline unsigned long vmalloc_nr_pages(void) { return 0; } #endif extern void *vmalloc(unsigned long size); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index edb212298c8a..4fa8d84599b0 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -406,6 +406,13 @@ static void purge_vmap_area_lazy(void); static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); static unsigned long lazy_max_pages(void); +static atomic_long_t nr_vmalloc_pages; + +unsigned long vmalloc_nr_pages(void) +{ + return atomic_long_read(&nr_vmalloc_pages); +} + static struct vmap_area *__find_vmap_area(unsigned long addr) { struct rb_node *n = vmap_area_root.rb_node; @@ -2237,6 +2244,7 @@ static void __vunmap(const void *addr, int deallocate_pages) BUG_ON(!page); __free_pages(page, 0); } + atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); kvfree(area->pages); } @@ -2414,12 +2422,14 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ area->nr_pages = i; + atomic_long_add(area->nr_pages, &nr_vmalloc_pages); goto fail; } area->pages[i] = page; if (gfpflags_allow_blocking(gfp_mask|highmem_mask)) cond_resched(); } + atomic_long_add(area->nr_pages, &nr_vmalloc_pages); if (map_vm_area(area, prot, pages)) goto fail; -- cgit v1.2.3-59-g8ed1b From 135e53514ef2cb200b616bf3fa4272cfa6c39291 Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Thu, 11 Jul 2019 21:00:17 -0700 Subject: mm/memory-failure.c: clarify error message Some user who install SIGBUS handler that does longjmp out therefore keeping the process alive is confused by the error message "[188988.765862] Memory failure: 0x1840200: Killing cellsrv:33395 due to hardware memory corruption" Slightly modify the error message to improve clarity. Link: http://lkml.kernel.org/r/1558403523-22079-1-git-send-email-jane.chu@oracle.com Signed-off-by: Jane Chu Acked-by: Naoya Horiguchi Acked-by: Pankaj Gupta Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index f045514d8d20..7e08cbf3ba49 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -213,7 +213,7 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) short addr_lsb = tk->size_shift; int ret; - pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n", + pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n", pfn, t->comm, t->pid); if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) { -- cgit v1.2.3-59-g8ed1b From f168a9a54ec39b3f832c353733898b713b6b5c1f Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 11 Jul 2019 21:00:20 -0700 Subject: mm: memcontrol: use CSS_TASK_ITER_PROCS at mem_cgroup_scan_tasks() Since commit c03cd7738a83 ("cgroup: Include dying leaders with live threads in PROCS iterations") corrected how CSS_TASK_ITER_PROCS works, mem_cgroup_scan_tasks() can use CSS_TASK_ITER_PROCS in order to check only one thread from each thread group. [penguin-kernel@I-love.SAKURA.ne.jp: remove thread group leader check in oom_evaluate_task()] Link: http://lkml.kernel.org/r/1560853257-14934-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp Link: http://lkml.kernel.org/r/c763afc8-f0ae-756a-56a7-395f625b95fc@i-love.sakura.ne.jp Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Reviewed-by: Shakeel Butt Cc: Johannes Weiner Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- mm/oom_kill.c | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2cb7e4e5c51a..773ae5674e12 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1167,7 +1167,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&iter->css, 0, &it); + css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it); while (!ret && (task = css_task_iter_next(&it))) ret = fn(task, arg); css_task_iter_end(&it); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f719b64741d6..606e5e4c6a3e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -346,9 +346,6 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) if (!points || points < oc->chosen_points) goto next; - /* Prefer thread group leaders for display purposes */ - if (points == oc->chosen_points && thread_group_leader(oc->chosen)) - goto next; select: if (oc->chosen) put_task_struct(oc->chosen); -- cgit v1.2.3-59-g8ed1b From 5eee7e1cdb97123bb55ac14ccd3af8b6edc31537 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 11 Jul 2019 21:00:23 -0700 Subject: mm, oom: refactor dump_tasks for memcg OOMs dump_tasks() traverses all the existing processes even for the memcg OOM context which is not only unnecessary but also wasteful. This imposes a long RCU critical section even from a contained context which can be quite disruptive. Change dump_tasks() to be aligned with select_bad_process and use mem_cgroup_scan_tasks to selectively traverse only processes of the target memcg hierarchy during memcg OOM. Link: http://lkml.kernel.org/r/20190617231207.160865-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Tetsuo Handa Cc: Vladimir Davydov Cc: David Rientjes Cc: KOSAKI Motohiro Cc: Paul Jackson Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 68 +++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 606e5e4c6a3e..59326614508a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -382,10 +382,38 @@ static void select_bad_process(struct oom_control *oc) oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages; } +static int dump_task(struct task_struct *p, void *arg) +{ + struct oom_control *oc = arg; + struct task_struct *task; + + if (oom_unkillable_task(p, NULL, oc->nodemask)) + return 0; + + task = find_lock_task_mm(p); + if (!task) { + /* + * This is a kthread or all of p's threads have already + * detached their mm's. There's no need to report + * them; they can't be oom killed anyway. + */ + return 0; + } + + pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n", + task->pid, from_kuid(&init_user_ns, task_uid(task)), + task->tgid, task->mm->total_vm, get_mm_rss(task->mm), + mm_pgtables_bytes(task->mm), + get_mm_counter(task->mm, MM_SWAPENTS), + task->signal->oom_score_adj, task->comm); + task_unlock(task); + + return 0; +} + /** * dump_tasks - dump current memory state of all system tasks - * @memcg: current's memory controller, if constrained - * @nodemask: nodemask passed to page allocator for mempolicy ooms + * @oc: pointer to struct oom_control * * Dumps the current memory state of all eligible tasks. Tasks not in the same * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes @@ -393,37 +421,21 @@ static void select_bad_process(struct oom_control *oc) * State information includes task's pid, uid, tgid, vm size, rss, * pgtables_bytes, swapents, oom_score_adj value, and name. */ -static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) +static void dump_tasks(struct oom_control *oc) { - struct task_struct *p; - struct task_struct *task; - pr_info("Tasks state (memory values in pages):\n"); pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); - rcu_read_lock(); - for_each_process(p) { - if (oom_unkillable_task(p, memcg, nodemask)) - continue; - task = find_lock_task_mm(p); - if (!task) { - /* - * This is a kthread or all of p's threads have already - * detached their mm's. There's no need to report - * them; they can't be oom killed anyway. - */ - continue; - } + if (is_memcg_oom(oc)) + mem_cgroup_scan_tasks(oc->memcg, dump_task, oc); + else { + struct task_struct *p; - pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n", - task->pid, from_kuid(&init_user_ns, task_uid(task)), - task->tgid, task->mm->total_vm, get_mm_rss(task->mm), - mm_pgtables_bytes(task->mm), - get_mm_counter(task->mm, MM_SWAPENTS), - task->signal->oom_score_adj, task->comm); - task_unlock(task); + rcu_read_lock(); + for_each_process(p) + dump_task(p, oc); + rcu_read_unlock(); } - rcu_read_unlock(); } static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim) @@ -455,7 +467,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) dump_unreclaimable_slab(); } if (sysctl_oom_dump_tasks) - dump_tasks(oc->memcg, oc->nodemask); + dump_tasks(oc); if (p) dump_oom_summary(oc, p); } -- cgit v1.2.3-59-g8ed1b From 6ba749ee78ef42ffdf4b95c042fc574a37d229d9 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 11 Jul 2019 21:00:26 -0700 Subject: mm, oom: remove redundant task_in_mem_cgroup() check oom_unkillable_task() can be called from three different contexts i.e. global OOM, memcg OOM and oom_score procfs interface. At the moment oom_unkillable_task() does a task_in_mem_cgroup() check on the given process. Since there is no reason to perform task_in_mem_cgroup() check for global OOM and oom_score procfs interface, those contexts provide NULL memcg and skips the task_in_mem_cgroup() check. However for memcg OOM context, the oom_unkillable_task() is always called from mem_cgroup_scan_tasks() and thus task_in_mem_cgroup() check becomes redundant and effectively dead code. So, just remove the task_in_mem_cgroup() check altogether. Link: http://lkml.kernel.org/r/20190624212631.87212-2-shakeelb@google.com Signed-off-by: Shakeel Butt Signed-off-by: Tetsuo Handa Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: David Rientjes Cc: Johannes Weiner Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Paul Jackson Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- include/linux/memcontrol.h | 7 ------- include/linux/oom.h | 2 +- mm/memcontrol.c | 26 -------------------------- mm/oom_kill.c | 19 +++++++------------ 5 files changed, 9 insertions(+), 47 deletions(-) (limited to 'mm') diff --git a/fs/proc/base.c b/fs/proc/base.c index 534fb1ae498a..64dadd469786 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -532,7 +532,7 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, unsigned long totalpages = totalram_pages() + total_swap_pages; unsigned long points = 0; - points = oom_badness(task, NULL, NULL, totalpages) * + points = oom_badness(task, NULL, totalpages) * 1000 / totalpages; seq_printf(m, "%lu\n", points); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 68402842c337..44c41462be33 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -394,7 +394,6 @@ out: struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *); -bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg); struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); @@ -875,12 +874,6 @@ static inline bool mm_match_cgroup(struct mm_struct *mm, return true; } -static inline bool task_in_mem_cgroup(struct task_struct *task, - const struct mem_cgroup *memcg) -{ - return true; -} - static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { return NULL; diff --git a/include/linux/oom.h b/include/linux/oom.h index d07992009265..b75104690311 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -108,7 +108,7 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm) bool __oom_reap_task_mm(struct mm_struct *mm); extern unsigned long oom_badness(struct task_struct *p, - struct mem_cgroup *memcg, const nodemask_t *nodemask, + const nodemask_t *nodemask, unsigned long totalpages); extern bool out_of_memory(struct oom_control *oc); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 773ae5674e12..4f05735b02d3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1259,32 +1259,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, *lru_size += nr_pages; } -bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) -{ - struct mem_cgroup *task_memcg; - struct task_struct *p; - bool ret; - - p = find_lock_task_mm(task); - if (p) { - task_memcg = get_mem_cgroup_from_mm(p->mm); - task_unlock(p); - } else { - /* - * All threads may have already detached their mm's, but the oom - * killer still needs to detect if they have already been oom - * killed to prevent needlessly killing additional tasks. - */ - rcu_read_lock(); - task_memcg = mem_cgroup_from_task(task); - css_get(&task_memcg->css); - rcu_read_unlock(); - } - ret = mem_cgroup_is_descendant(task_memcg, memcg); - css_put(&task_memcg->css); - return ret; -} - /** * mem_cgroup_margin - calculate chargeable space of a memory cgroup * @memcg: the memory cgroup diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 59326614508a..b353f468a36a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -153,17 +153,13 @@ static inline bool is_memcg_oom(struct oom_control *oc) /* return true if the task is not adequate as candidate victim task. */ static bool oom_unkillable_task(struct task_struct *p, - struct mem_cgroup *memcg, const nodemask_t *nodemask) + const nodemask_t *nodemask) { if (is_global_init(p)) return true; if (p->flags & PF_KTHREAD) return true; - /* When mem_cgroup_out_of_memory() and p is not member of the group */ - if (memcg && !task_in_mem_cgroup(p, memcg)) - return true; - /* p may not have freeable memory in nodemask */ if (!has_intersects_mems_allowed(p, nodemask)) return true; @@ -194,20 +190,19 @@ static bool is_dump_unreclaim_slabs(void) * oom_badness - heuristic function to determine which candidate task to kill * @p: task struct of which task we should calculate * @totalpages: total present RAM allowed for page allocation - * @memcg: task's memory controller, if constrained * @nodemask: nodemask passed to page allocator for mempolicy ooms * * The heuristic for determining which task to kill is made to be as simple and * predictable as possible. The goal is to return the highest value for the * task consuming the most memory to avoid subsequent oom failures. */ -unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, +unsigned long oom_badness(struct task_struct *p, const nodemask_t *nodemask, unsigned long totalpages) { long points; long adj; - if (oom_unkillable_task(p, memcg, nodemask)) + if (oom_unkillable_task(p, nodemask)) return 0; p = find_lock_task_mm(p); @@ -318,7 +313,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) struct oom_control *oc = arg; unsigned long points; - if (oom_unkillable_task(task, NULL, oc->nodemask)) + if (oom_unkillable_task(task, oc->nodemask)) goto next; /* @@ -342,7 +337,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) goto select; } - points = oom_badness(task, NULL, oc->nodemask, oc->totalpages); + points = oom_badness(task, oc->nodemask, oc->totalpages); if (!points || points < oc->chosen_points) goto next; @@ -387,7 +382,7 @@ static int dump_task(struct task_struct *p, void *arg) struct oom_control *oc = arg; struct task_struct *task; - if (oom_unkillable_task(p, NULL, oc->nodemask)) + if (oom_unkillable_task(p, oc->nodemask)) return 0; task = find_lock_task_mm(p); @@ -1084,7 +1079,7 @@ bool out_of_memory(struct oom_control *oc) check_panic_on_oom(oc); if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task && - current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) && + current->mm && !oom_unkillable_task(current, oc->nodemask) && current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { get_task_struct(current); oc->chosen = current; -- cgit v1.2.3-59-g8ed1b From ac311a14c682dcd8a120a6244d0542ec654e3d93 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 11 Jul 2019 21:00:31 -0700 Subject: oom: decouple mems_allowed from oom_unkillable_task Commit ef08e3b4981a ("[PATCH] cpusets: confine oom_killer to mem_exclusive cpuset") introduces a heuristic where a potential oom-killer victim is skipped if the intersection of the potential victim and the current (the process triggered the oom) is empty based on the reason that killing such victim most probably will not help the current allocating process. However the commit 7887a3da753e ("[PATCH] oom: cpuset hint") changed the heuristic to just decrease the oom_badness scores of such potential victim based on the reason that the cpuset of such processes might have changed and previously they may have allocated memory on mems where the current allocating process can allocate from. Unintentionally 7887a3da753e ("[PATCH] oom: cpuset hint") introduced a side effect as the oom_badness is also exposed to the user space through /proc/[pid]/oom_score, so, readers with different cpusets can read different oom_score of the same process. Later, commit 6cf86ac6f36b ("oom: filter tasks not sharing the same cpuset") fixed the side effect introduced by 7887a3da753e by moving the cpuset intersection back to only oom-killer context and out of oom_badness. However the combination of ab290adbaf8f ("oom: make oom_unkillable_task() helper function") and 26ebc984913b ("oom: /proc//oom_score treat kernel thread honestly") unintentionally brought back the cpuset intersection check into the oom_badness calculation function. Other than doing cpuset/mempolicy intersection from oom_badness, the memcg oom context is also doing cpuset/mempolicy intersection which is quite wrong and is caught by syzcaller with the following report: kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 28426 Comm: syz-executor.5 Not tainted 5.2.0-rc3-next-20190607 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__read_once_size include/linux/compiler.h:194 [inline] RIP: 0010:has_intersects_mems_allowed mm/oom_kill.c:84 [inline] RIP: 0010:oom_unkillable_task mm/oom_kill.c:168 [inline] RIP: 0010:oom_unkillable_task+0x180/0x400 mm/oom_kill.c:155 Code: c1 ea 03 80 3c 02 00 0f 85 80 02 00 00 4c 8b a3 10 07 00 00 48 b8 00 00 00 00 00 fc ff df 4d 8d 74 24 10 4c 89 f2 48 c1 ea 03 <80> 3c 02 00 0f 85 67 02 00 00 49 8b 44 24 10 4c 8d a0 68 fa ff ff RSP: 0018:ffff888000127490 EFLAGS: 00010a03 RAX: dffffc0000000000 RBX: ffff8880a4cd5438 RCX: ffffffff818dae9c RDX: 100000000c3cc602 RSI: ffffffff818dac8d RDI: 0000000000000001 RBP: ffff8880001274d0 R08: ffff888000086180 R09: ffffed1015d26be0 R10: ffffed1015d26bdf R11: ffff8880ae935efb R12: 8000000061e63007 R13: 0000000000000000 R14: 8000000061e63017 R15: 1ffff11000024ea6 FS: 00005555561f5940(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000607304 CR3: 000000009237e000 CR4: 00000000001426f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600 Call Trace: oom_evaluate_task+0x49/0x520 mm/oom_kill.c:321 mem_cgroup_scan_tasks+0xcc/0x180 mm/memcontrol.c:1169 select_bad_process mm/oom_kill.c:374 [inline] out_of_memory mm/oom_kill.c:1088 [inline] out_of_memory+0x6b2/0x1280 mm/oom_kill.c:1035 mem_cgroup_out_of_memory+0x1ca/0x230 mm/memcontrol.c:1573 mem_cgroup_oom mm/memcontrol.c:1905 [inline] try_charge+0xfbe/0x1480 mm/memcontrol.c:2468 mem_cgroup_try_charge+0x24d/0x5e0 mm/memcontrol.c:6073 mem_cgroup_try_charge_delay+0x1f/0xa0 mm/memcontrol.c:6088 do_huge_pmd_wp_page_fallback+0x24f/0x1680 mm/huge_memory.c:1201 do_huge_pmd_wp_page+0x7fc/0x2160 mm/huge_memory.c:1359 wp_huge_pmd mm/memory.c:3793 [inline] __handle_mm_fault+0x164c/0x3eb0 mm/memory.c:4006 handle_mm_fault+0x3b7/0xa90 mm/memory.c:4053 do_user_addr_fault arch/x86/mm/fault.c:1455 [inline] __do_page_fault+0x5ef/0xda0 arch/x86/mm/fault.c:1521 do_page_fault+0x71/0x57d arch/x86/mm/fault.c:1552 page_fault+0x1e/0x30 arch/x86/entry/entry_64.S:1156 RIP: 0033:0x400590 Code: 06 e9 49 01 00 00 48 8b 44 24 10 48 0b 44 24 28 75 1f 48 8b 14 24 48 8b 7c 24 20 be 04 00 00 00 e8 f5 56 00 00 48 8b 74 24 08 <89> 06 e9 1e 01 00 00 48 8b 44 24 08 48 8b 14 24 be 04 00 00 00 8b RSP: 002b:00007fff7bc49780 EFLAGS: 00010206 RAX: 0000000000000001 RBX: 0000000000760000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 000000002000cffc RDI: 0000000000000001 RBP: fffffffffffffffe R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000075 R11: 0000000000000246 R12: 0000000000760008 R13: 00000000004c55f2 R14: 0000000000000000 R15: 00007fff7bc499b0 Modules linked in: ---[ end trace a65689219582ffff ]--- RIP: 0010:__read_once_size include/linux/compiler.h:194 [inline] RIP: 0010:has_intersects_mems_allowed mm/oom_kill.c:84 [inline] RIP: 0010:oom_unkillable_task mm/oom_kill.c:168 [inline] RIP: 0010:oom_unkillable_task+0x180/0x400 mm/oom_kill.c:155 Code: c1 ea 03 80 3c 02 00 0f 85 80 02 00 00 4c 8b a3 10 07 00 00 48 b8 00 00 00 00 00 fc ff df 4d 8d 74 24 10 4c 89 f2 48 c1 ea 03 <80> 3c 02 00 0f 85 67 02 00 00 49 8b 44 24 10 4c 8d a0 68 fa ff ff RSP: 0018:ffff888000127490 EFLAGS: 00010a03 RAX: dffffc0000000000 RBX: ffff8880a4cd5438 RCX: ffffffff818dae9c RDX: 100000000c3cc602 RSI: ffffffff818dac8d RDI: 0000000000000001 RBP: ffff8880001274d0 R08: ffff888000086180 R09: ffffed1015d26be0 R10: ffffed1015d26bdf R11: ffff8880ae935efb R12: 8000000061e63007 R13: 0000000000000000 R14: 8000000061e63017 R15: 1ffff11000024ea6 FS: 00005555561f5940(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b2f823000 CR3: 000000009237e000 CR4: 00000000001426f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600 The fix is to decouple the cpuset/mempolicy intersection check from oom_unkillable_task() and make sure cpuset/mempolicy intersection check is only done in the global oom context. [shakeelb@google.com: change function name and update comment] Link: http://lkml.kernel.org/r/20190628152421.198994-3-shakeelb@google.com Link: http://lkml.kernel.org/r/20190624212631.87212-3-shakeelb@google.com Signed-off-by: Shakeel Butt Reported-by: syzbot+d0fc9d3c166bc5e4a94b@syzkaller.appspotmail.com Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: David Rientjes Cc: Johannes Weiner Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Paul Jackson Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 3 +-- include/linux/oom.h | 1 - mm/oom_kill.c | 57 ++++++++++++++++++++++++++++++----------------------- 3 files changed, 33 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/fs/proc/base.c b/fs/proc/base.c index 64dadd469786..77eb628ecc7f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -532,8 +532,7 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, unsigned long totalpages = totalram_pages() + total_swap_pages; unsigned long points = 0; - points = oom_badness(task, NULL, totalpages) * - 1000 / totalpages; + points = oom_badness(task, totalpages) * 1000 / totalpages; seq_printf(m, "%lu\n", points); return 0; diff --git a/include/linux/oom.h b/include/linux/oom.h index b75104690311..c696c265f019 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -108,7 +108,6 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm) bool __oom_reap_task_mm(struct mm_struct *mm); extern unsigned long oom_badness(struct task_struct *p, - const nodemask_t *nodemask, unsigned long totalpages); extern bool out_of_memory(struct oom_control *oc); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b353f468a36a..d1c9c4e66d59 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -64,21 +64,33 @@ int sysctl_oom_dump_tasks = 1; */ DEFINE_MUTEX(oom_lock); +static inline bool is_memcg_oom(struct oom_control *oc) +{ + return oc->memcg != NULL; +} + #ifdef CONFIG_NUMA /** - * has_intersects_mems_allowed() - check task eligiblity for kill + * oom_cpuset_eligible() - check task eligiblity for kill * @start: task struct of which task to consider * @mask: nodemask passed to page allocator for mempolicy ooms * * Task eligibility is determined by whether or not a candidate task, @tsk, * shares the same mempolicy nodes as current if it is bound by such a policy * and whether or not it has the same set of allowed cpuset nodes. + * + * This function is assuming oom-killer context and 'current' has triggered + * the oom-killer. */ -static bool has_intersects_mems_allowed(struct task_struct *start, - const nodemask_t *mask) +static bool oom_cpuset_eligible(struct task_struct *start, + struct oom_control *oc) { struct task_struct *tsk; bool ret = false; + const nodemask_t *mask = oc->nodemask; + + if (is_memcg_oom(oc)) + return true; rcu_read_lock(); for_each_thread(start, tsk) { @@ -105,8 +117,7 @@ static bool has_intersects_mems_allowed(struct task_struct *start, return ret; } #else -static bool has_intersects_mems_allowed(struct task_struct *tsk, - const nodemask_t *mask) +static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc) { return true; } @@ -146,24 +157,13 @@ static inline bool is_sysrq_oom(struct oom_control *oc) return oc->order == -1; } -static inline bool is_memcg_oom(struct oom_control *oc) -{ - return oc->memcg != NULL; -} - /* return true if the task is not adequate as candidate victim task. */ -static bool oom_unkillable_task(struct task_struct *p, - const nodemask_t *nodemask) +static bool oom_unkillable_task(struct task_struct *p) { if (is_global_init(p)) return true; if (p->flags & PF_KTHREAD) return true; - - /* p may not have freeable memory in nodemask */ - if (!has_intersects_mems_allowed(p, nodemask)) - return true; - return false; } @@ -190,19 +190,17 @@ static bool is_dump_unreclaim_slabs(void) * oom_badness - heuristic function to determine which candidate task to kill * @p: task struct of which task we should calculate * @totalpages: total present RAM allowed for page allocation - * @nodemask: nodemask passed to page allocator for mempolicy ooms * * The heuristic for determining which task to kill is made to be as simple and * predictable as possible. The goal is to return the highest value for the * task consuming the most memory to avoid subsequent oom failures. */ -unsigned long oom_badness(struct task_struct *p, - const nodemask_t *nodemask, unsigned long totalpages) +unsigned long oom_badness(struct task_struct *p, unsigned long totalpages) { long points; long adj; - if (oom_unkillable_task(p, nodemask)) + if (oom_unkillable_task(p)) return 0; p = find_lock_task_mm(p); @@ -313,7 +311,11 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) struct oom_control *oc = arg; unsigned long points; - if (oom_unkillable_task(task, oc->nodemask)) + if (oom_unkillable_task(task)) + goto next; + + /* p may not have freeable memory in nodemask */ + if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc)) goto next; /* @@ -337,7 +339,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) goto select; } - points = oom_badness(task, oc->nodemask, oc->totalpages); + points = oom_badness(task, oc->totalpages); if (!points || points < oc->chosen_points) goto next; @@ -382,7 +384,11 @@ static int dump_task(struct task_struct *p, void *arg) struct oom_control *oc = arg; struct task_struct *task; - if (oom_unkillable_task(p, oc->nodemask)) + if (oom_unkillable_task(p)) + return 0; + + /* p may not have freeable memory in nodemask */ + if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc)) return 0; task = find_lock_task_mm(p); @@ -1079,7 +1085,8 @@ bool out_of_memory(struct oom_control *oc) check_panic_on_oom(oc); if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task && - current->mm && !oom_unkillable_task(current, oc->nodemask) && + current->mm && !oom_unkillable_task(current) && + oom_cpuset_eligible(current, oc) && current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { get_task_struct(current); oc->chosen = current; -- cgit v1.2.3-59-g8ed1b From 2c207985f354dfb549e5a543102a3e084eea81f6 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 11 Jul 2019 21:00:34 -0700 Subject: mm/oom_kill.c: remove redundant OOM score normalization in select_bad_process() Since commit bbbe48029720 ("mm, oom: remove 'prefer children over parent' heuristic") removed the "%s: Kill process %d (%s) score %u or sacrifice child\n" line, oc->chosen_points is no longer used after select_bad_process(). Link: http://lkml.kernel.org/r/1560853435-15575-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Cc: Shakeel Butt Cc: Roman Gushchin Cc: Johannes Weiner Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d1c9c4e66d59..eda2e2a0bdc6 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -375,8 +375,6 @@ static void select_bad_process(struct oom_control *oc) break; rcu_read_unlock(); } - - oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages; } static int dump_task(struct task_struct *p, void *arg) -- cgit v1.2.3-59-g8ed1b