From ab77dab46210bb630e06c6803c5d84074bacd351 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Thu, 7 Jun 2018 17:04:29 -0700 Subject: fs/dax.c: use new return type vm_fault_t Use new return type vm_fault_t for fault handler. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. commit 1c8f422059ae ("mm: change return type to vm_fault_t") There was an existing bug inside dax_load_hole() if vm_insert_mixed had failed to allocate a page table, we'd return VM_FAULT_NOPAGE instead of VM_FAULT_OOM. With new vmf_insert_mixed() this issue is addressed. vm_insert_mixed_mkwrite has inefficiency when it returns an error value, driver has to convert it to vm_fault_t type. With new vmf_insert_mixed_mkwrite() this limitation will be addressed. Link: http://lkml.kernel.org/r/20180510181121.GA15239@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Reviewed-by: Jan Kara Reviewed-by: Matthew Wilcox Reviewed-by: Ross Zwisler Cc: Alexander Viro Cc: Dan Williams Cc: Michal Hocko Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 01f5464e0fd2..6a97893a8de7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1955,12 +1955,25 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_mixed); -int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn) +/* + * If the insertion of PTE failed because someone else already added a + * different entry in the mean time, we treat that as success as we assume + * the same entry was actually inserted. + */ + +vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, + unsigned long addr, pfn_t pfn) { - return __vm_insert_mixed(vma, addr, pfn, true); + int err; + + err = __vm_insert_mixed(vma, addr, pfn, true); + if (err == -ENOMEM) + return VM_FAULT_OOM; + if (err < 0 && err != -EBUSY) + return VM_FAULT_SIGBUS; + return VM_FAULT_NOPAGE; } -EXPORT_SYMBOL(vm_insert_mixed_mkwrite); +EXPORT_SYMBOL(vmf_insert_mixed_mkwrite); /* * maps a range of physical memory into the requested pages. the old -- cgit v1.2.3-59-g8ed1b From 128227e7fe4087b60f1bd31f762e61237eb23790 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:05:13 -0700 Subject: slab: __GFP_ZERO is incompatible with a constructor __GFP_ZERO requests that the object be initialised to all-zeroes, while the purpose of a constructor is to initialise an object to a particular pattern. We cannot do both. Add a warning to catch any users who mistakenly pass a __GFP_ZERO flag when allocating a slab with a constructor. Link: http://lkml.kernel.org/r/20180412191322.GA21205@bombadil.infradead.org Fixes: d07dbea46405 ("Slab allocators: support __GFP_ZERO in all allocators") Signed-off-by: Matthew Wilcox Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Acked-by: Michal Hocko Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 ++ mm/slob.c | 4 +++- mm/slub.c | 2 ++ 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 2f308253c3d7..c1fe8099b3cd 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2665,6 +2665,7 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep, invalid_mask, &invalid_mask, flags, &flags); dump_stack(); } + WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); check_irq_off(); @@ -3071,6 +3072,7 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, void *objp, unsigned long caller) { + WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); if (!objp) return objp; if (cachep->flags & SLAB_POISON) { diff --git a/mm/slob.c b/mm/slob.c index 623e8a5c46ce..307c2c9feb44 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -555,8 +555,10 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) flags, node); } - if (b && c->ctor) + if (b && c->ctor) { + WARN_ON_ONCE(flags & __GFP_ZERO); c->ctor(b); + } kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); return b; diff --git a/mm/slub.c b/mm/slub.c index 44aa7847324a..dc960401ce90 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2444,6 +2444,8 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, struct kmem_cache_cpu *c = *pc; struct page *page; + WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO)); + freelist = get_partial(s, flags, node, c); if (freelist) -- cgit v1.2.3-59-g8ed1b From a38965bf941b7c2af50de09c96bc5f03e136caef Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Thu, 7 Jun 2018 17:05:17 -0700 Subject: mm/slub.c: add __printf verification to slab_err() __printf is useful to verify format and arguments. Remove the following warning (with W=1): mm/slub.c:721:2: warning: function might be possible candidate for `gnu_printf' format attribute [-Wsuggest-attribute=format] Link: http://lkml.kernel.org/r/20180505200706.19986-1-malat@debian.org Signed-off-by: Mathieu Malaterre Reviewed-by: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index dc960401ce90..ec5916483b07 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -711,7 +711,7 @@ void object_err(struct kmem_cache *s, struct page *page, print_trailer(s, page, object); } -static void slab_err(struct kmem_cache *s, struct page *page, +static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page, const char *fmt, ...) { va_list args; -- cgit v1.2.3-59-g8ed1b From 05088e5de02d274efba5e902d6cc9618e5de671a Mon Sep 17 00:00:00 2001 From: Canjiang Lu Date: Thu, 7 Jun 2018 17:05:20 -0700 Subject: mm/slub: remove obsolete comment The obsolete comment removed in this patch was introduced by 51df1142816e4 ("slub: Dynamically size kmalloc cache allocations"). I paste related modification from that commit: +#ifdef CONFIG_NUMA + /* + * Allocate kmem_cache_node properly from the kmem_cache slab. + * kmem_cache_node is separately allocated so no need to + * update any list pointers. + */ + temp_kmem_cache_node = kmem_cache_node; + kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); + memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size); + + kmem_cache_bootstrap_fixup(kmem_cache_node); + + caches++; +#else + /* + * kmem_cache has kmem_cache_node embedded and we moved it! + * Update the list heads + */ + INIT_LIST_HEAD(&kmem_cache->local_node.partial); + list_splice(&temp_kmem_cache->local_node.partial, &kmem_cache->local_node.partial); +#ifdef CONFIG_SLUB_DEBUG + INIT_LIST_HEAD(&kmem_cache->local_node.full); + list_splice(&temp_kmem_cache->local_node.full, &kmem_cache->local_node.full); +#endif As we can see there're used to distinguish the difference handling between NUMA/non-NUMA configuration in the original commit. I think it doesn't make any sense in current implementation which is placed above kmem_cache_node = bootstrap(&boot_kmem_cache_node); So maybe it's better to remove them now? Link: http://lkml.kernel.org/r/5af26f58.1c69fb81.1be0e.c520SMTPIN_ADDED_BROKEN@mx.google.com Signed-off-by: Canjiang Lu Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index ec5916483b07..48f75872c356 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4241,12 +4241,6 @@ void __init kmem_cache_init(void) SLAB_HWCACHE_ALIGN, 0, 0); kmem_cache = bootstrap(&boot_kmem_cache); - - /* - * Allocate kmem_cache_node properly from the kmem_cache slab. - * kmem_cache_node is separately allocated so no need to - * update any list pointers. - */ kmem_cache_node = bootstrap(&boot_kmem_cache_node); /* Now we can use the kmem_cache to allocate kmalloc slabs */ -- cgit v1.2.3-59-g8ed1b From 88aa7cc688d48ddd84558b41d5905a0db9535c4b Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 7 Jun 2018 17:05:28 -0700 Subject: mm: introduce arg_lock to protect arg_start|end and env_start|end in mm_struct mmap_sem is on the hot path of kernel, and it very contended, but it is abused too. It is used to protect arg_start|end and evn_start|end when reading /proc/$PID/cmdline and /proc/$PID/environ, but it doesn't make sense since those proc files just expect to read 4 values atomically and not related to VM, they could be set to arbitrary values by C/R. And, the mmap_sem contention may cause unexpected issue like below: INFO: task ps:14018 blocked for more than 120 seconds. Tainted: G E 4.9.79-009.ali3000.alios7.x86_64 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. ps D 0 14018 1 0x00000004 Call Trace: schedule+0x36/0x80 rwsem_down_read_failed+0xf0/0x150 call_rwsem_down_read_failed+0x18/0x30 down_read+0x20/0x40 proc_pid_cmdline_read+0xd9/0x4e0 __vfs_read+0x37/0x150 vfs_read+0x96/0x130 SyS_read+0x55/0xc0 entry_SYSCALL_64_fastpath+0x1a/0xc5 Both Alexey Dobriyan and Michal Hocko suggested to use dedicated lock for them to mitigate the abuse of mmap_sem. So, introduce a new spinlock in mm_struct to protect the concurrent access to arg_start|end, env_start|end and others, as well as replace write map_sem to read to protect the race condition between prctl and sys_brk which might break check_data_rlimit(), and makes prctl more friendly to other VM operations. This patch just eliminates the abuse of mmap_sem, but it can't resolve the above hung task warning completely since the later access_remote_vm() call needs acquire mmap_sem. The mmap_sem scalability issue will be solved in the future. [yang.shi@linux.alibaba.com: add comment about mmap_sem and arg_lock] Link: http://lkml.kernel.org/r/1524077799-80690-1-git-send-email-yang.shi@linux.alibaba.com Link: http://lkml.kernel.org/r/1523730291-109696-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Reviewed-by: Cyrill Gorcunov Acked-by: Michal Hocko Cc: Alexey Dobriyan Cc: Matthew Wilcox Cc: Mateusz Guzik Cc: Kirill Tkhai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 8 ++++---- include/linux/mm_types.h | 2 ++ kernel/fork.c | 1 + kernel/sys.c | 10 ++++++++-- mm/init-mm.c | 1 + 5 files changed, 16 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/fs/proc/base.c b/fs/proc/base.c index af128b374143..ef3f7df50023 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -239,12 +239,12 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, goto out_mmput; } - down_read(&mm->mmap_sem); + spin_lock(&mm->arg_lock); arg_start = mm->arg_start; arg_end = mm->arg_end; env_start = mm->env_start; env_end = mm->env_end; - up_read(&mm->mmap_sem); + spin_unlock(&mm->arg_lock); BUG_ON(arg_start > arg_end); BUG_ON(env_start > env_end); @@ -927,10 +927,10 @@ static ssize_t environ_read(struct file *file, char __user *buf, if (!mmget_not_zero(mm)) goto free; - down_read(&mm->mmap_sem); + spin_lock(&mm->arg_lock); env_start = mm->env_start; env_end = mm->env_end; - up_read(&mm->mmap_sem); + spin_unlock(&mm->arg_lock); while (count > 0) { size_t this_len, max_len; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 21612347d311..49dd59ea90f7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -413,6 +413,8 @@ struct mm_struct { unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ unsigned long stack_vm; /* VM_STACK */ unsigned long def_flags; + + spinlock_t arg_lock; /* protect the below fields */ unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; diff --git a/kernel/fork.c b/kernel/fork.c index 80b48a8fb47b..c6d1c1ce9ed7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -899,6 +899,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->pinned_vm = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); + spin_lock_init(&mm->arg_lock); mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); diff --git a/kernel/sys.c b/kernel/sys.c index d1b2b8d934bb..38509dc1f77b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2018,7 +2018,11 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data return error; } - down_write(&mm->mmap_sem); + /* + * arg_lock protects concurent updates but we still need mmap_sem for + * read to exclude races with sys_brk. + */ + down_read(&mm->mmap_sem); /* * We don't validate if these members are pointing to @@ -2032,6 +2036,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data * to any problem in kernel itself */ + spin_lock(&mm->arg_lock); mm->start_code = prctl_map.start_code; mm->end_code = prctl_map.end_code; mm->start_data = prctl_map.start_data; @@ -2043,6 +2048,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data mm->arg_end = prctl_map.arg_end; mm->env_start = prctl_map.env_start; mm->env_end = prctl_map.env_end; + spin_unlock(&mm->arg_lock); /* * Note this update of @saved_auxv is lockless thus @@ -2055,7 +2061,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (prctl_map.auxv_size) memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); - up_write(&mm->mmap_sem); + up_read(&mm->mmap_sem); return 0; } #endif /* CONFIG_CHECKPOINT_RESTORE */ diff --git a/mm/init-mm.c b/mm/init-mm.c index f94d5d15ebc0..f0179c9c04c2 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -22,6 +22,7 @@ struct mm_struct init_mm = { .mm_count = ATOMIC_INIT(1), .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), .user_ns = &init_user_ns, INIT_MM_CONTEXT(init_mm) -- cgit v1.2.3-59-g8ed1b From bb98f2c5ac5db92d34908dbac81a8de7c47c8353 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 Jun 2018 17:05:31 -0700 Subject: mm, memcontrol: move swap charge handling into get_swap_page() Patch series "mm, memcontrol: Implement memory.swap.events", v2. This patchset implements memory.swap.events which contains max and fail events so that userland can monitor and respond to swap running out. This patch (of 2): get_swap_page() is always followed by mem_cgroup_try_charge_swap(). This patch moves mem_cgroup_try_charge_swap() into get_swap_page() and makes get_swap_page() call the function even after swap allocation failure. This simplifies the callers and consolidates memcg related logic and will ease adding swap related memcg events. Link: http://lkml.kernel.org/r/20180416230934.GH1911913@devbig577.frc2.facebook.com Signed-off-by: Tejun Heo Reviewed-by: Andrew Morton Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Roman Gushchin Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 +++ mm/shmem.c | 4 ---- mm/swap_slots.c | 10 +++++++--- mm/swap_state.c | 3 --- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1695f38630f1..e8166521a474 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6012,6 +6012,9 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) if (!memcg) return 0; + if (!entry.val) + return 0; + memcg = mem_cgroup_id_get_online(memcg); if (!mem_cgroup_is_root(memcg) && diff --git a/mm/shmem.c b/mm/shmem.c index 9d6c7e595415..8c43e207cd3b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1322,9 +1322,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (!swap.val) goto redirty; - if (mem_cgroup_try_charge_swap(page, swap)) - goto free_swap; - /* * Add inode to shmem_unuse()'s list of swapped-out inodes, * if it's not already there. Do it now before the page is @@ -1353,7 +1350,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) } mutex_unlock(&shmem_swaplist_mutex); -free_swap: put_swap_page(page, swap); redirty: set_page_dirty(page); diff --git a/mm/swap_slots.c b/mm/swap_slots.c index f2641894f440..f51ac051c0c9 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -317,7 +317,7 @@ swp_entry_t get_swap_page(struct page *page) if (PageTransHuge(page)) { if (IS_ENABLED(CONFIG_THP_SWAP)) get_swap_pages(1, true, &entry); - return entry; + goto out; } /* @@ -347,10 +347,14 @@ repeat: } mutex_unlock(&cache->alloc_lock); if (entry.val) - return entry; + goto out; } get_swap_pages(1, false, &entry); - +out: + if (mem_cgroup_try_charge_swap(page, entry)) { + put_swap_page(page, entry); + entry.val = 0; + } return entry; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 07f9aa2340c3..ab8e59cd18ea 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -216,9 +216,6 @@ int add_to_swap(struct page *page) if (!entry.val) return 0; - if (mem_cgroup_try_charge_swap(page, entry)) - goto fail; - /* * Radix-tree node allocations from PF_MEMALLOC contexts could * completely exhaust the page allocator. __GFP_NOMEMALLOC -- cgit v1.2.3-59-g8ed1b From f3a53a3a1e5b3b9bc114b5b7930fbe89d9ffed5d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 Jun 2018 17:05:35 -0700 Subject: mm, memcontrol: implement memory.swap.events Add swap max and fail events so that userland can monitor and respond to running out of swap. I'm not too sure about the fail event. Right now, it's a bit confusing which stats / events are recursive and which aren't and also which ones reflect events which originate from a given cgroup and which targets the cgroup. No idea what the right long term solution is and it could just be that growing them organically is actually the only right thing to do. Link: http://lkml.kernel.org/r/20180416231151.GI1911913@devbig577.frc2.facebook.com Signed-off-by: Tejun Heo Reviewed-by: Andrew Morton Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Roman Gushchin Cc: Rik van Riel Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 16 ++++++++++++++++ include/linux/memcontrol.h | 5 +++++ mm/memcontrol.c | 24 +++++++++++++++++++++++- 3 files changed, 44 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 74cdeaed9f7a..b0dda10b9382 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1199,6 +1199,22 @@ PAGE_SIZE multiple when read back. Swap usage hard limit. If a cgroup's swap usage reaches this limit, anonymous memory of the cgroup will not be swapped out. + memory.swap.events + A read-only flat-keyed file which exists on non-root cgroups. + The following entries are defined. Unless specified + otherwise, a value change in this file generates a file + modified event. + + max + The number of times the cgroup's swap usage was about + to go over the max boundary and swap allocation + failed. + + fail + The number of times swap allocation failed either + because of running out of swap system-wide or max + limit. + Usage Guidelines ~~~~~~~~~~~~~~~~ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d99b71bc2c66..517096c3cc99 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -53,6 +53,8 @@ enum memcg_memory_event { MEMCG_HIGH, MEMCG_MAX, MEMCG_OOM, + MEMCG_SWAP_MAX, + MEMCG_SWAP_FAIL, MEMCG_NR_MEMORY_EVENTS, }; @@ -208,6 +210,9 @@ struct mem_cgroup { atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; struct cgroup_file events_file; + /* handle for "memory.swap.events" */ + struct cgroup_file swap_events_file; + /* protect arrays of thresholds */ struct mutex thresholds_lock; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e8166521a474..d86665cf4a49 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6012,13 +6012,17 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) if (!memcg) return 0; - if (!entry.val) + if (!entry.val) { + memcg_memory_event(memcg, MEMCG_SWAP_FAIL); return 0; + } memcg = mem_cgroup_id_get_online(memcg); if (!mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { + memcg_memory_event(memcg, MEMCG_SWAP_MAX); + memcg_memory_event(memcg, MEMCG_SWAP_FAIL); mem_cgroup_id_put(memcg); return -ENOMEM; } @@ -6156,6 +6160,18 @@ static ssize_t swap_max_write(struct kernfs_open_file *of, return nbytes; } +static int swap_events_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + seq_printf(m, "max %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); + seq_printf(m, "fail %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL])); + + return 0; +} + static struct cftype swap_files[] = { { .name = "swap.current", @@ -6168,6 +6184,12 @@ static struct cftype swap_files[] = { .seq_show = swap_max_show, .write = swap_max_write, }, + { + .name = "swap.events", + .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct mem_cgroup, swap_events_file), + .seq_show = swap_events_show, + }, { } /* terminate */ }; -- cgit v1.2.3-59-g8ed1b From 5b9c98f3082b4e8a24ef5e6bb21ed0558b54349a Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 7 Jun 2018 17:05:53 -0700 Subject: mm/shmem: add __rcu annotations and properly deref radix entry Patch series "restructure memfd code", v4. This patch (of 3): In preparation for memfd code restucture, clean up sparse warnings. Most changes required adding __rcu annotations. The routine find_swap_entry was modified to properly deference radix tree entries. Link: http://lkml.kernel.org/r/20180415182119.4517-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Signed-off-by: Matthew Wilcox Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Michal Hocko Cc: Marc-Andr Lureau Cc: David Herrmann Cc: Khalid Aziz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 8c43e207cd3b..54bec60380df 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -327,7 +327,7 @@ static int shmem_radix_tree_replace(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { struct radix_tree_node *node; - void **pslot; + void __rcu **pslot; void *item; VM_BUG_ON(!expected); @@ -395,7 +395,7 @@ static bool shmem_confirm_swap(struct address_space *mapping, #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE /* ifdef here to avoid bloating shmem.o when not necessary */ -int shmem_huge __read_mostly; +static int shmem_huge __read_mostly; #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) static int shmem_parse_huge(const char *str) @@ -682,7 +682,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end) { struct radix_tree_iter iter; - void **slot; + void __rcu **slot; struct page *page; unsigned long swapped = 0; @@ -1098,13 +1098,19 @@ static void shmem_evict_inode(struct inode *inode) static unsigned long find_swap_entry(struct radix_tree_root *root, void *item) { struct radix_tree_iter iter; - void **slot; + void __rcu **slot; unsigned long found = -1; unsigned int checked = 0; rcu_read_lock(); radix_tree_for_each_slot(slot, root, &iter, 0) { - if (*slot == item) { + void *entry = radix_tree_deref_slot(slot); + + if (radix_tree_deref_retry(entry)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + if (entry == item) { found = iter.index; break; } @@ -2622,7 +2628,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) static void shmem_tag_pins(struct address_space *mapping) { struct radix_tree_iter iter; - void **slot; + void __rcu **slot; pgoff_t start; struct page *page; @@ -2664,7 +2670,7 @@ static void shmem_tag_pins(struct address_space *mapping) static int shmem_wait_for_pins(struct address_space *mapping) { struct radix_tree_iter iter; - void **slot; + void __rcu **slot; pgoff_t start; struct page *page; int error, scan; -- cgit v1.2.3-59-g8ed1b From c49fcfcda8b5b06529103664f0291f433f5e6d24 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 7 Jun 2018 17:05:57 -0700 Subject: mm/shmem: update file sealing comments and file checking In preparation for memfd code restructure, update comments, definitions and function names dealing with file sealing to indicate that tmpfs and hugetlbfs are the supported filesystems. Also, change file pointer checks in memfd_file_seals_ptr to use defined interfaces instead of directly referencing file_operation structs. Link: http://lkml.kernel.org/r/20180415182119.4517-3-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Khalid Aziz Cc: Andrea Arcangeli Cc: David Herrmann Cc: Hugh Dickins Cc: Marc-Andr Lureau Cc: Matthew Wilcox Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 50 ++++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 54bec60380df..ceba031d9985 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2620,12 +2620,13 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) /* * We need a tag: a new tag would expand every radix_tree_node by 8 bytes, - * so reuse a tag which we firmly believe is never set or cleared on shmem. + * so reuse a tag which we firmly believe is never set or cleared on tmpfs + * or hugetlbfs because they are memory only filesystems. */ -#define SHMEM_TAG_PINNED PAGECACHE_TAG_TOWRITE +#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE #define LAST_SCAN 4 /* about 150ms max */ -static void shmem_tag_pins(struct address_space *mapping) +static void memfd_tag_pins(struct address_space *mapping) { struct radix_tree_iter iter; void __rcu **slot; @@ -2646,7 +2647,7 @@ static void shmem_tag_pins(struct address_space *mapping) } else if (page_count(page) - page_mapcount(page) > 1) { xa_lock_irq(&mapping->i_pages); radix_tree_tag_set(&mapping->i_pages, iter.index, - SHMEM_TAG_PINNED); + MEMFD_TAG_PINNED); xa_unlock_irq(&mapping->i_pages); } @@ -2667,7 +2668,7 @@ static void shmem_tag_pins(struct address_space *mapping) * The caller must guarantee that no new user will acquire writable references * to those pages to avoid races. */ -static int shmem_wait_for_pins(struct address_space *mapping) +static int memfd_wait_for_pins(struct address_space *mapping) { struct radix_tree_iter iter; void __rcu **slot; @@ -2675,11 +2676,11 @@ static int shmem_wait_for_pins(struct address_space *mapping) struct page *page; int error, scan; - shmem_tag_pins(mapping); + memfd_tag_pins(mapping); error = 0; for (scan = 0; scan <= LAST_SCAN; scan++) { - if (!radix_tree_tagged(&mapping->i_pages, SHMEM_TAG_PINNED)) + if (!radix_tree_tagged(&mapping->i_pages, MEMFD_TAG_PINNED)) break; if (!scan) @@ -2690,7 +2691,7 @@ static int shmem_wait_for_pins(struct address_space *mapping) start = 0; rcu_read_lock(); radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, - start, SHMEM_TAG_PINNED) { + start, MEMFD_TAG_PINNED) { page = radix_tree_deref_slot(slot); if (radix_tree_exception(page)) { @@ -2717,7 +2718,7 @@ static int shmem_wait_for_pins(struct address_space *mapping) xa_lock_irq(&mapping->i_pages); radix_tree_tag_clear(&mapping->i_pages, - iter.index, SHMEM_TAG_PINNED); + iter.index, MEMFD_TAG_PINNED); xa_unlock_irq(&mapping->i_pages); continue_resched: if (need_resched()) { @@ -2733,11 +2734,11 @@ continue_resched: static unsigned int *memfd_file_seals_ptr(struct file *file) { - if (file->f_op == &shmem_file_operations) + if (shmem_file(file)) return &SHMEM_I(file_inode(file))->seals; #ifdef CONFIG_HUGETLBFS - if (file->f_op == &hugetlbfs_file_operations) + if (is_file_hugepages(file)) return &HUGETLBFS_I(file_inode(file))->seals; #endif @@ -2757,16 +2758,17 @@ static int memfd_add_seals(struct file *file, unsigned int seals) /* * SEALING - * Sealing allows multiple parties to share a shmem-file but restrict - * access to a specific subset of file operations. Seals can only be - * added, but never removed. This way, mutually untrusted parties can - * share common memory regions with a well-defined policy. A malicious - * peer can thus never perform unwanted operations on a shared object. + * Sealing allows multiple parties to share a tmpfs or hugetlbfs file + * but restrict access to a specific subset of file operations. Seals + * can only be added, but never removed. This way, mutually untrusted + * parties can share common memory regions with a well-defined policy. + * A malicious peer can thus never perform unwanted operations on a + * shared object. * - * Seals are only supported on special shmem-files and always affect - * the whole underlying inode. Once a seal is set, it may prevent some - * kinds of access to the file. Currently, the following seals are - * defined: + * Seals are only supported on special tmpfs or hugetlbfs files and + * always affect the whole underlying inode. Once a seal is set, it + * may prevent some kinds of access to the file. Currently, the + * following seals are defined: * SEAL_SEAL: Prevent further seals from being set on this file * SEAL_SHRINK: Prevent the file from shrinking * SEAL_GROW: Prevent the file from growing @@ -2780,9 +2782,9 @@ static int memfd_add_seals(struct file *file, unsigned int seals) * added. * * Semantics of sealing are only defined on volatile files. Only - * anonymous shmem files support sealing. More importantly, seals are - * never written to disk. Therefore, there's no plan to support it on - * other file types. + * anonymous tmpfs and hugetlbfs files support sealing. More + * importantly, seals are never written to disk. Therefore, there's + * no plan to support it on other file types. */ if (!(file->f_mode & FMODE_WRITE)) @@ -2808,7 +2810,7 @@ static int memfd_add_seals(struct file *file, unsigned int seals) if (error) goto unlock; - error = shmem_wait_for_pins(file->f_mapping); + error = memfd_wait_for_pins(file->f_mapping); if (error) { mapping_allow_writable(file->f_mapping); goto unlock; -- cgit v1.2.3-59-g8ed1b From 5d752600a8c373382264392f5b573b2fc9c0e8ea Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 7 Jun 2018 17:06:01 -0700 Subject: mm: restructure memfd code With the addition of memfd hugetlbfs support, we now have the situation where memfd depends on TMPFS -or- HUGETLBFS. Previously, memfd was only supported on tmpfs, so it made sense that the code resided in shmem.c. In the current code, memfd is only functional if TMPFS is defined. If HUGETLFS is defined and TMPFS is not defined, then memfd functionality will not be available for hugetlbfs. This does not cause BUGs, just a lack of potentially desired functionality. Code is restructured in the following way: - include/linux/memfd.h is a new file containing memfd specific definitions previously contained in shmem_fs.h. - mm/memfd.c is a new file containing memfd specific code previously contained in shmem.c. - memfd specific code is removed from shmem_fs.h and shmem.c. - A new config option MEMFD_CREATE is added that is defined if TMPFS or HUGETLBFS is defined. No functional changes are made to the code: restructuring only. Link: http://lkml.kernel.org/r/20180415182119.4517-4-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Khalid Aziz Cc: Andrea Arcangeli Cc: David Herrmann Cc: Hugh Dickins Cc: Marc-Andr Lureau Cc: Matthew Wilcox Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 3 + fs/fcntl.c | 2 +- include/linux/memfd.h | 16 +++ include/linux/shmem_fs.h | 13 -- mm/Makefile | 1 + mm/memfd.c | 345 +++++++++++++++++++++++++++++++++++++++++++++++ mm/shmem.c | 324 -------------------------------------------- 7 files changed, 366 insertions(+), 338 deletions(-) create mode 100644 include/linux/memfd.h create mode 100644 mm/memfd.c (limited to 'mm') diff --git a/fs/Kconfig b/fs/Kconfig index ac4ac908f001..51f78a28072a 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -203,6 +203,9 @@ config HUGETLBFS config HUGETLB_PAGE def_bool HUGETLBFS +config MEMFD_CREATE + def_bool TMPFS || HUGETLBFS + config ARCH_HAS_GIGANTIC_PAGE bool diff --git a/fs/fcntl.c b/fs/fcntl.c index c42169459298..12273b6ea56d 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/linux/memfd.h b/include/linux/memfd.h new file mode 100644 index 000000000000..4f1600413f91 --- /dev/null +++ b/include/linux/memfd.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_MEMFD_H +#define __LINUX_MEMFD_H + +#include + +#ifdef CONFIG_MEMFD_CREATE +extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg); +#else +static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a) +{ + return -EINVAL; +} +#endif + +#endif /* __LINUX_MEMFD_H */ diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 73b5e655a76e..f155dc607112 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -110,19 +110,6 @@ static inline bool shmem_file(struct file *file) extern bool shmem_charge(struct inode *inode, long pages); extern void shmem_uncharge(struct inode *inode, long pages); -#ifdef CONFIG_TMPFS - -extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg); - -#else - -static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a) -{ - return -EINVAL; -} - -#endif - #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE extern bool shmem_huge_enabled(struct vm_area_struct *vma); #else diff --git a/mm/Makefile b/mm/Makefile index b4e54a9ae9c5..8716bdabe1e6 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -105,3 +105,4 @@ obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o obj-$(CONFIG_HMM) += hmm.o +obj-$(CONFIG_MEMFD_CREATE) += memfd.o diff --git a/mm/memfd.c b/mm/memfd.c new file mode 100644 index 000000000000..27069518e3c5 --- /dev/null +++ b/mm/memfd.c @@ -0,0 +1,345 @@ +/* + * memfd_create system call and file sealing support + * + * Code was originally included in shmem.c, and broken out to facilitate + * use by hugetlbfs as well as tmpfs. + * + * This file is released under the GPL. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * We need a tag: a new tag would expand every radix_tree_node by 8 bytes, + * so reuse a tag which we firmly believe is never set or cleared on tmpfs + * or hugetlbfs because they are memory only filesystems. + */ +#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE +#define LAST_SCAN 4 /* about 150ms max */ + +static void memfd_tag_pins(struct address_space *mapping) +{ + struct radix_tree_iter iter; + void __rcu **slot; + pgoff_t start; + struct page *page; + + lru_add_drain(); + start = 0; + rcu_read_lock(); + + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { + page = radix_tree_deref_slot(slot); + if (!page || radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + } else if (page_count(page) - page_mapcount(page) > 1) { + xa_lock_irq(&mapping->i_pages); + radix_tree_tag_set(&mapping->i_pages, iter.index, + MEMFD_TAG_PINNED); + xa_unlock_irq(&mapping->i_pages); + } + + if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); + cond_resched_rcu(); + } + } + rcu_read_unlock(); +} + +/* + * Setting SEAL_WRITE requires us to verify there's no pending writer. However, + * via get_user_pages(), drivers might have some pending I/O without any active + * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages + * and see whether it has an elevated ref-count. If so, we tag them and wait for + * them to be dropped. + * The caller must guarantee that no new user will acquire writable references + * to those pages to avoid races. + */ +static int memfd_wait_for_pins(struct address_space *mapping) +{ + struct radix_tree_iter iter; + void __rcu **slot; + pgoff_t start; + struct page *page; + int error, scan; + + memfd_tag_pins(mapping); + + error = 0; + for (scan = 0; scan <= LAST_SCAN; scan++) { + if (!radix_tree_tagged(&mapping->i_pages, MEMFD_TAG_PINNED)) + break; + + if (!scan) + lru_add_drain_all(); + else if (schedule_timeout_killable((HZ << scan) / 200)) + scan = LAST_SCAN; + + start = 0; + rcu_read_lock(); + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, + start, MEMFD_TAG_PINNED) { + + page = radix_tree_deref_slot(slot); + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + + page = NULL; + } + + if (page && + page_count(page) - page_mapcount(page) != 1) { + if (scan < LAST_SCAN) + goto continue_resched; + + /* + * On the last scan, we clean up all those tags + * we inserted; but make a note that we still + * found pages pinned. + */ + error = -EBUSY; + } + + xa_lock_irq(&mapping->i_pages); + radix_tree_tag_clear(&mapping->i_pages, + iter.index, MEMFD_TAG_PINNED); + xa_unlock_irq(&mapping->i_pages); +continue_resched: + if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); + cond_resched_rcu(); + } + } + rcu_read_unlock(); + } + + return error; +} + +static unsigned int *memfd_file_seals_ptr(struct file *file) +{ + if (shmem_file(file)) + return &SHMEM_I(file_inode(file))->seals; + +#ifdef CONFIG_HUGETLBFS + if (is_file_hugepages(file)) + return &HUGETLBFS_I(file_inode(file))->seals; +#endif + + return NULL; +} + +#define F_ALL_SEALS (F_SEAL_SEAL | \ + F_SEAL_SHRINK | \ + F_SEAL_GROW | \ + F_SEAL_WRITE) + +static int memfd_add_seals(struct file *file, unsigned int seals) +{ + struct inode *inode = file_inode(file); + unsigned int *file_seals; + int error; + + /* + * SEALING + * Sealing allows multiple parties to share a tmpfs or hugetlbfs file + * but restrict access to a specific subset of file operations. Seals + * can only be added, but never removed. This way, mutually untrusted + * parties can share common memory regions with a well-defined policy. + * A malicious peer can thus never perform unwanted operations on a + * shared object. + * + * Seals are only supported on special tmpfs or hugetlbfs files and + * always affect the whole underlying inode. Once a seal is set, it + * may prevent some kinds of access to the file. Currently, the + * following seals are defined: + * SEAL_SEAL: Prevent further seals from being set on this file + * SEAL_SHRINK: Prevent the file from shrinking + * SEAL_GROW: Prevent the file from growing + * SEAL_WRITE: Prevent write access to the file + * + * As we don't require any trust relationship between two parties, we + * must prevent seals from being removed. Therefore, sealing a file + * only adds a given set of seals to the file, it never touches + * existing seals. Furthermore, the "setting seals"-operation can be + * sealed itself, which basically prevents any further seal from being + * added. + * + * Semantics of sealing are only defined on volatile files. Only + * anonymous tmpfs and hugetlbfs files support sealing. More + * importantly, seals are never written to disk. Therefore, there's + * no plan to support it on other file types. + */ + + if (!(file->f_mode & FMODE_WRITE)) + return -EPERM; + if (seals & ~(unsigned int)F_ALL_SEALS) + return -EINVAL; + + inode_lock(inode); + + file_seals = memfd_file_seals_ptr(file); + if (!file_seals) { + error = -EINVAL; + goto unlock; + } + + if (*file_seals & F_SEAL_SEAL) { + error = -EPERM; + goto unlock; + } + + if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { + error = mapping_deny_writable(file->f_mapping); + if (error) + goto unlock; + + error = memfd_wait_for_pins(file->f_mapping); + if (error) { + mapping_allow_writable(file->f_mapping); + goto unlock; + } + } + + *file_seals |= seals; + error = 0; + +unlock: + inode_unlock(inode); + return error; +} + +static int memfd_get_seals(struct file *file) +{ + unsigned int *seals = memfd_file_seals_ptr(file); + + return seals ? *seals : -EINVAL; +} + +long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +{ + long error; + + switch (cmd) { + case F_ADD_SEALS: + /* disallow upper 32bit */ + if (arg > UINT_MAX) + return -EINVAL; + + error = memfd_add_seals(file, arg); + break; + case F_GET_SEALS: + error = memfd_get_seals(file); + break; + default: + error = -EINVAL; + break; + } + + return error; +} + +#define MFD_NAME_PREFIX "memfd:" +#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) +#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) + +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) + +SYSCALL_DEFINE2(memfd_create, + const char __user *, uname, + unsigned int, flags) +{ + unsigned int *file_seals; + struct file *file; + int fd, error; + char *name; + long len; + + if (!(flags & MFD_HUGETLB)) { + if (flags & ~(unsigned int)MFD_ALL_FLAGS) + return -EINVAL; + } else { + /* Allow huge page size encoding in flags. */ + if (flags & ~(unsigned int)(MFD_ALL_FLAGS | + (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) + return -EINVAL; + } + + /* length includes terminating zero */ + len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); + if (len <= 0) + return -EFAULT; + if (len > MFD_NAME_MAX_LEN + 1) + return -EINVAL; + + name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL); + if (!name) + return -ENOMEM; + + strcpy(name, MFD_NAME_PREFIX); + if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { + error = -EFAULT; + goto err_name; + } + + /* terminating-zero may have changed after strnlen_user() returned */ + if (name[len + MFD_NAME_PREFIX_LEN - 1]) { + error = -EFAULT; + goto err_name; + } + + fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); + if (fd < 0) { + error = fd; + goto err_name; + } + + if (flags & MFD_HUGETLB) { + struct user_struct *user = NULL; + + file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user, + HUGETLB_ANONHUGE_INODE, + (flags >> MFD_HUGE_SHIFT) & + MFD_HUGE_MASK); + } else + file = shmem_file_setup(name, 0, VM_NORESERVE); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_fd; + } + file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; + file->f_flags |= O_RDWR | O_LARGEFILE; + + if (flags & MFD_ALLOW_SEALING) { + file_seals = memfd_file_seals_ptr(file); + *file_seals &= ~F_SEAL_SEAL; + } + + fd_install(fd, file); + kfree(name); + return fd; + +err_fd: + put_unused_fd(fd); +err_name: + kfree(name); + return error; +} diff --git a/mm/shmem.c b/mm/shmem.c index ceba031d9985..cb4d7fa389d0 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2618,243 +2618,6 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) return offset; } -/* - * We need a tag: a new tag would expand every radix_tree_node by 8 bytes, - * so reuse a tag which we firmly believe is never set or cleared on tmpfs - * or hugetlbfs because they are memory only filesystems. - */ -#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE -#define LAST_SCAN 4 /* about 150ms max */ - -static void memfd_tag_pins(struct address_space *mapping) -{ - struct radix_tree_iter iter; - void __rcu **slot; - pgoff_t start; - struct page *page; - - lru_add_drain(); - start = 0; - rcu_read_lock(); - - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - page = radix_tree_deref_slot(slot); - if (!page || radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - } else if (page_count(page) - page_mapcount(page) > 1) { - xa_lock_irq(&mapping->i_pages); - radix_tree_tag_set(&mapping->i_pages, iter.index, - MEMFD_TAG_PINNED); - xa_unlock_irq(&mapping->i_pages); - } - - if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); - cond_resched_rcu(); - } - } - rcu_read_unlock(); -} - -/* - * Setting SEAL_WRITE requires us to verify there's no pending writer. However, - * via get_user_pages(), drivers might have some pending I/O without any active - * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages - * and see whether it has an elevated ref-count. If so, we tag them and wait for - * them to be dropped. - * The caller must guarantee that no new user will acquire writable references - * to those pages to avoid races. - */ -static int memfd_wait_for_pins(struct address_space *mapping) -{ - struct radix_tree_iter iter; - void __rcu **slot; - pgoff_t start; - struct page *page; - int error, scan; - - memfd_tag_pins(mapping); - - error = 0; - for (scan = 0; scan <= LAST_SCAN; scan++) { - if (!radix_tree_tagged(&mapping->i_pages, MEMFD_TAG_PINNED)) - break; - - if (!scan) - lru_add_drain_all(); - else if (schedule_timeout_killable((HZ << scan) / 200)) - scan = LAST_SCAN; - - start = 0; - rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, - start, MEMFD_TAG_PINNED) { - - page = radix_tree_deref_slot(slot); - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - - page = NULL; - } - - if (page && - page_count(page) - page_mapcount(page) != 1) { - if (scan < LAST_SCAN) - goto continue_resched; - - /* - * On the last scan, we clean up all those tags - * we inserted; but make a note that we still - * found pages pinned. - */ - error = -EBUSY; - } - - xa_lock_irq(&mapping->i_pages); - radix_tree_tag_clear(&mapping->i_pages, - iter.index, MEMFD_TAG_PINNED); - xa_unlock_irq(&mapping->i_pages); -continue_resched: - if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); - cond_resched_rcu(); - } - } - rcu_read_unlock(); - } - - return error; -} - -static unsigned int *memfd_file_seals_ptr(struct file *file) -{ - if (shmem_file(file)) - return &SHMEM_I(file_inode(file))->seals; - -#ifdef CONFIG_HUGETLBFS - if (is_file_hugepages(file)) - return &HUGETLBFS_I(file_inode(file))->seals; -#endif - - return NULL; -} - -#define F_ALL_SEALS (F_SEAL_SEAL | \ - F_SEAL_SHRINK | \ - F_SEAL_GROW | \ - F_SEAL_WRITE) - -static int memfd_add_seals(struct file *file, unsigned int seals) -{ - struct inode *inode = file_inode(file); - unsigned int *file_seals; - int error; - - /* - * SEALING - * Sealing allows multiple parties to share a tmpfs or hugetlbfs file - * but restrict access to a specific subset of file operations. Seals - * can only be added, but never removed. This way, mutually untrusted - * parties can share common memory regions with a well-defined policy. - * A malicious peer can thus never perform unwanted operations on a - * shared object. - * - * Seals are only supported on special tmpfs or hugetlbfs files and - * always affect the whole underlying inode. Once a seal is set, it - * may prevent some kinds of access to the file. Currently, the - * following seals are defined: - * SEAL_SEAL: Prevent further seals from being set on this file - * SEAL_SHRINK: Prevent the file from shrinking - * SEAL_GROW: Prevent the file from growing - * SEAL_WRITE: Prevent write access to the file - * - * As we don't require any trust relationship between two parties, we - * must prevent seals from being removed. Therefore, sealing a file - * only adds a given set of seals to the file, it never touches - * existing seals. Furthermore, the "setting seals"-operation can be - * sealed itself, which basically prevents any further seal from being - * added. - * - * Semantics of sealing are only defined on volatile files. Only - * anonymous tmpfs and hugetlbfs files support sealing. More - * importantly, seals are never written to disk. Therefore, there's - * no plan to support it on other file types. - */ - - if (!(file->f_mode & FMODE_WRITE)) - return -EPERM; - if (seals & ~(unsigned int)F_ALL_SEALS) - return -EINVAL; - - inode_lock(inode); - - file_seals = memfd_file_seals_ptr(file); - if (!file_seals) { - error = -EINVAL; - goto unlock; - } - - if (*file_seals & F_SEAL_SEAL) { - error = -EPERM; - goto unlock; - } - - if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { - error = mapping_deny_writable(file->f_mapping); - if (error) - goto unlock; - - error = memfd_wait_for_pins(file->f_mapping); - if (error) { - mapping_allow_writable(file->f_mapping); - goto unlock; - } - } - - *file_seals |= seals; - error = 0; - -unlock: - inode_unlock(inode); - return error; -} - -static int memfd_get_seals(struct file *file) -{ - unsigned int *seals = memfd_file_seals_ptr(file); - - return seals ? *seals : -EINVAL; -} - -long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) -{ - long error; - - switch (cmd) { - case F_ADD_SEALS: - /* disallow upper 32bit */ - if (arg > UINT_MAX) - return -EINVAL; - - error = memfd_add_seals(file, arg); - break; - case F_GET_SEALS: - error = memfd_get_seals(file); - break; - default: - error = -EINVAL; - break; - } - - return error; -} - static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { @@ -3677,93 +3440,6 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) return 0; } -#define MFD_NAME_PREFIX "memfd:" -#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) -#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) - -#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) - -SYSCALL_DEFINE2(memfd_create, - const char __user *, uname, - unsigned int, flags) -{ - unsigned int *file_seals; - struct file *file; - int fd, error; - char *name; - long len; - - if (!(flags & MFD_HUGETLB)) { - if (flags & ~(unsigned int)MFD_ALL_FLAGS) - return -EINVAL; - } else { - /* Allow huge page size encoding in flags. */ - if (flags & ~(unsigned int)(MFD_ALL_FLAGS | - (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) - return -EINVAL; - } - - /* length includes terminating zero */ - len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); - if (len <= 0) - return -EFAULT; - if (len > MFD_NAME_MAX_LEN + 1) - return -EINVAL; - - name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL); - if (!name) - return -ENOMEM; - - strcpy(name, MFD_NAME_PREFIX); - if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { - error = -EFAULT; - goto err_name; - } - - /* terminating-zero may have changed after strnlen_user() returned */ - if (name[len + MFD_NAME_PREFIX_LEN - 1]) { - error = -EFAULT; - goto err_name; - } - - fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); - if (fd < 0) { - error = fd; - goto err_name; - } - - if (flags & MFD_HUGETLB) { - struct user_struct *user = NULL; - - file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user, - HUGETLB_ANONHUGE_INODE, - (flags >> MFD_HUGE_SHIFT) & - MFD_HUGE_MASK); - } else - file = shmem_file_setup(name, 0, VM_NORESERVE); - if (IS_ERR(file)) { - error = PTR_ERR(file); - goto err_fd; - } - file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; - file->f_flags |= O_RDWR | O_LARGEFILE; - - if (flags & MFD_ALLOW_SEALING) { - file_seals = memfd_file_seals_ptr(file); - *file_seals &= ~F_SEAL_SEAL; - } - - fd_install(fd, file); - kfree(name); - return fd; - -err_fd: - put_unused_fd(fd); -err_name: - kfree(name); - return error; -} - #endif /* CONFIG_TMPFS */ static void shmem_put_super(struct super_block *sb) -- cgit v1.2.3-59-g8ed1b From e69438596bb3e97809e76be315e54a4a444f4797 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 7 Jun 2018 17:06:04 -0700 Subject: mm/page_alloc: remove realsize in free_area_init_core() Highmem's realsize always equals to freesize, so it is not necessary to spare a variable to record this. Link: http://lkml.kernel.org/r/20180413083859.65888-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Andrew Morton Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 22320ea27489..4fbe211340e0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6229,18 +6229,18 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; - unsigned long size, realsize, freesize, memmap_pages; + unsigned long size, freesize, memmap_pages; unsigned long zone_start_pfn = zone->zone_start_pfn; size = zone->spanned_pages; - realsize = freesize = zone->present_pages; + freesize = zone->present_pages; /* * Adjust freesize so that it accounts for how much memory * is used by this zone for memmap. This affects the watermark * and per-cpu initialisations */ - memmap_pages = calc_memmap_size(size, realsize); + memmap_pages = calc_memmap_size(size, freesize); if (!is_highmem_idx(j)) { if (freesize >= memmap_pages) { freesize -= memmap_pages; @@ -6272,7 +6272,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) * when the bootmem allocator frees pages into the buddy system. * And all highmem pages will be managed by the buddy system. */ - zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; + zone->managed_pages = freesize; #ifdef CONFIG_NUMA zone->node = nid; #endif -- cgit v1.2.3-59-g8ed1b From 3010a5ea665a089361e435093bd737399123fcc4 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Thu, 7 Jun 2018 17:06:08 -0700 Subject: mm: introduce ARCH_HAS_PTE_SPECIAL Currently the PTE special supports is turned on in per architecture header files. Most of the time, it is defined in arch/*/include/asm/pgtable.h depending or not on some other per architecture static definition. This patch introduce a new configuration variable to manage this directly in the Kconfig files. It would later replace __HAVE_ARCH_PTE_SPECIAL. Here notes for some architecture where the definition of __HAVE_ARCH_PTE_SPECIAL is not obvious: arm __HAVE_ARCH_PTE_SPECIAL which is currently defined in arch/arm/include/asm/pgtable-3level.h which is included by arch/arm/include/asm/pgtable.h when CONFIG_ARM_LPAE is set. So select ARCH_HAS_PTE_SPECIAL if ARM_LPAE. powerpc __HAVE_ARCH_PTE_SPECIAL is defined in 2 files: - arch/powerpc/include/asm/book3s/64/pgtable.h - arch/powerpc/include/asm/pte-common.h The first one is included if (PPC_BOOK3S & PPC64) while the second is included in all the other cases. So select ARCH_HAS_PTE_SPECIAL all the time. sparc: __HAVE_ARCH_PTE_SPECIAL is defined if defined(__sparc__) && defined(__arch64__) which are defined through the compiler in sparc/Makefile if !SPARC32 which I assume to be if SPARC64. So select ARCH_HAS_PTE_SPECIAL if SPARC64 There is no functional change introduced by this patch. Link: http://lkml.kernel.org/r/1523433816-14460-2-git-send-email-ldufour@linux.vnet.ibm.com Signed-off-by: Laurent Dufour Suggested-by: Jerome Glisse Reviewed-by: Jerome Glisse Acked-by: David Rientjes Cc: Michal Hocko Cc: "Aneesh Kumar K . V" Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Jonathan Corbet Cc: Catalin Marinas Cc: Will Deacon Cc: Yoshinori Sato Cc: Rich Felker Cc: David S. Miller Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Vineet Gupta Cc: Palmer Dabbelt Cc: Albert Ou Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: David Rientjes Cc: Robin Murphy Cc: Christophe LEROY Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/features/vm/pte_special/arch-support.txt | 2 +- arch/arc/Kconfig | 1 + arch/arc/include/asm/pgtable.h | 2 -- arch/arm/Kconfig | 1 + arch/arm/include/asm/pgtable-3level.h | 1 - arch/arm64/Kconfig | 1 + arch/arm64/include/asm/pgtable.h | 2 -- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/book3s/64/pgtable.h | 3 --- arch/powerpc/include/asm/pte-common.h | 3 --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/pgtable-bits.h | 3 --- arch/s390/Kconfig | 1 + arch/s390/include/asm/pgtable.h | 1 - arch/sh/Kconfig | 1 + arch/sh/include/asm/pgtable.h | 2 -- arch/sparc/Kconfig | 1 + arch/sparc/include/asm/pgtable_64.h | 3 --- arch/x86/Kconfig | 1 + arch/x86/include/asm/pgtable_types.h | 1 - include/linux/pfn_t.h | 4 ++-- mm/Kconfig | 3 +++ mm/gup.c | 4 ++-- mm/memory.c | 2 +- 24 files changed, 18 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/Documentation/features/vm/pte_special/arch-support.txt b/Documentation/features/vm/pte_special/arch-support.txt index 6a608a6dcf71..a8378424bc98 100644 --- a/Documentation/features/vm/pte_special/arch-support.txt +++ b/Documentation/features/vm/pte_special/arch-support.txt @@ -1,6 +1,6 @@ # # Feature name: pte_special -# Kconfig: __HAVE_ARCH_PTE_SPECIAL +# Kconfig: ARCH_HAS_PTE_SPECIAL # description: arch supports the pte_special()/pte_mkspecial() VM APIs # ----------------------- diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 89d47eac18b2..e81bcd271be7 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -48,6 +48,7 @@ config ARC select HAVE_GENERIC_DMA_COHERENT select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZMA + select ARCH_HAS_PTE_SPECIAL config MIGHT_HAVE_PCI bool diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index 08fe33830d4b..8ec5599a0957 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -320,8 +320,6 @@ PTE_BIT_FUNC(mkexec, |= (_PAGE_EXECUTE)); PTE_BIT_FUNC(mkspecial, |= (_PAGE_SPECIAL)); PTE_BIT_FUNC(mkhuge, |= (_PAGE_HW_SZ)); -#define __HAVE_ARCH_PTE_SPECIAL - static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 8f460bdd4be1..534563ac7f5f 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -8,6 +8,7 @@ config ARM select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE + select ARCH_HAS_PTE_SPECIAL if ARM_LPAE select ARCH_HAS_SET_MEMORY select ARCH_HAS_PHYS_TO_DMA select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 2a4836087358..6d50a11d7793 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -219,7 +219,6 @@ static inline pte_t pte_mkspecial(pte_t pte) pte_val(pte) |= L_PTE_SPECIAL; return pte; } -#define __HAVE_ARCH_PTE_SPECIAL #define pmd_write(pmd) (pmd_isclear((pmd), L_PMD_SECT_RDONLY)) #define pmd_dirty(pmd) (pmd_isset((pmd), L_PMD_SECT_DIRTY)) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index b25ed7834f6c..4759566a78cb 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -17,6 +17,7 @@ config ARM64 select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV select ARCH_HAS_MEMBARRIER_SYNC_CORE + select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN select ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 7c4c8f318ba9..9f82d6b53851 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -306,8 +306,6 @@ static inline int pte_same(pte_t pte_a, pte_t pte_b) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) -#define __HAVE_ARCH_PTE_SPECIAL - static inline pte_t pgd_pte(pgd_t pgd) { return __pte(pgd_val(pgd)); diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 076fe3094856..8f959df2de7a 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -135,6 +135,7 @@ config PPC select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_PHYS_TO_DMA select ARCH_HAS_PMEM_API if PPC64 + select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_MEMBARRIER_CALLBACKS select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE select ARCH_HAS_SG_CHAIN diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 42fe7c2ff2df..63cee159022b 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -335,9 +335,6 @@ extern unsigned long pci_io_base; /* Advertise special mapping type for AGP */ #define HAVE_PAGE_AGP -/* Advertise support for _PAGE_SPECIAL */ -#define __HAVE_ARCH_PTE_SPECIAL - #ifndef __ASSEMBLY__ /* diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index 050b0d775324..bef56141a549 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h @@ -208,9 +208,6 @@ static inline bool pte_user(pte_t pte) #define PAGE_AGP (PAGE_KERNEL_NC) #define HAVE_PAGE_AGP -/* Advertise support for _PAGE_SPECIAL */ -#define __HAVE_ARCH_PTE_SPECIAL - #ifndef _PAGE_READ /* if not defined, we should not find _PAGE_WRITE too */ #define _PAGE_READ 0 diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 274bc064c41f..17f19e67993b 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -42,6 +42,7 @@ config RISCV select THREAD_INFO_IN_TASK select RISCV_TIMER select GENERIC_IRQ_MULTI_HANDLER + select ARCH_HAS_PTE_SPECIAL config MMU def_bool y diff --git a/arch/riscv/include/asm/pgtable-bits.h b/arch/riscv/include/asm/pgtable-bits.h index 997ddbb1d370..2fa2942be221 100644 --- a/arch/riscv/include/asm/pgtable-bits.h +++ b/arch/riscv/include/asm/pgtable-bits.h @@ -42,7 +42,4 @@ _PAGE_WRITE | _PAGE_EXEC | \ _PAGE_USER | _PAGE_GLOBAL)) -/* Advertise support for _PAGE_SPECIAL */ -#define __HAVE_ARCH_PTE_SPECIAL - #endif /* _ASM_RISCV_PGTABLE_BITS_H */ diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index b7deee7e738f..baed39772c84 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -65,6 +65,7 @@ config S390 select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV + select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN select ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 2d24d33bf188..9809694e1389 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -171,7 +171,6 @@ static inline int is_module_addr(void *addr) #define _PAGE_WRITE 0x020 /* SW pte write bit */ #define _PAGE_SPECIAL 0x040 /* SW associated with special page */ #define _PAGE_UNUSED 0x080 /* SW bit for pgste usage state */ -#define __HAVE_ARCH_PTE_SPECIAL #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SOFT_DIRTY 0x002 /* SW pte soft dirty bit */ diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index ae619d54018c..4d61a085982b 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 config SUPERH def_bool y + select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_NO_COHERENT_DMA_MMAP if !MMU diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h index 89c513a982fc..f6abfe2bca93 100644 --- a/arch/sh/include/asm/pgtable.h +++ b/arch/sh/include/asm/pgtable.h @@ -156,8 +156,6 @@ extern void page_table_range_init(unsigned long start, unsigned long end, #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN -#define __HAVE_ARCH_PTE_SPECIAL - #include #endif /* __ASM_SH_PGTABLE_H */ diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index b42ba888217d..9a2b8877f174 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -88,6 +88,7 @@ config SPARC64 select ARCH_USE_QUEUED_SPINLOCKS select GENERIC_TIME_VSYSCALL select ARCH_CLOCKSOURCE_DATA + select ARCH_HAS_PTE_SPECIAL config ARCH_DEFCONFIG string diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 44d6ac47e035..1393a8ac596b 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -117,9 +117,6 @@ bool kern_addr_valid(unsigned long addr); #define _PAGE_PMD_HUGE _AC(0x0100000000000000,UL) /* Huge page */ #define _PAGE_PUD_HUGE _PAGE_PMD_HUGE -/* Advertise support for _PAGE_SPECIAL */ -#define __HAVE_ARCH_PTE_SPECIAL - /* SUN4U pte bits... */ #define _PAGE_SZ4MB_4U _AC(0x6000000000000000,UL) /* 4MB Page */ #define _PAGE_SZ512K_4U _AC(0x4000000000000000,UL) /* 512K Page */ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cb6e3a219294..f182a4e8e5bd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -60,6 +60,7 @@ config X86 select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_PMEM_API if X86_64 + select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_REFCOUNT select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_UACCESS_MCSAFE if X86_64 diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 1e5a40673953..99fff853c944 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -65,7 +65,6 @@ #define _PAGE_PKEY_BIT2 (_AT(pteval_t, 0)) #define _PAGE_PKEY_BIT3 (_AT(pteval_t, 0)) #endif -#define __HAVE_ARCH_PTE_SPECIAL #define _PAGE_PKEY_MASK (_PAGE_PKEY_BIT0 | \ _PAGE_PKEY_BIT1 | \ diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index a03c2642a87c..21713dc14ce2 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -122,7 +122,7 @@ pud_t pud_mkdevmap(pud_t pud); #endif #endif /* __HAVE_ARCH_PTE_DEVMAP */ -#ifdef __HAVE_ARCH_PTE_SPECIAL +#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL static inline bool pfn_t_special(pfn_t pfn) { return (pfn.val & PFN_SPECIAL) == PFN_SPECIAL; @@ -132,5 +132,5 @@ static inline bool pfn_t_special(pfn_t pfn) { return false; } -#endif /* __HAVE_ARCH_PTE_SPECIAL */ +#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ #endif /* _LINUX_PFN_T_H_ */ diff --git a/mm/Kconfig b/mm/Kconfig index 3e0b6e87f65d..00bffa7a5112 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -754,3 +754,6 @@ config GUP_BENCHMARK performance of get_user_pages_fast(). See tools/testing/selftests/vm/gup_benchmark.c + +config ARCH_HAS_PTE_SPECIAL + bool diff --git a/mm/gup.c b/mm/gup.c index 541904a7c60f..010153989b9b 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1354,7 +1354,7 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) } } -#ifdef __HAVE_ARCH_PTE_SPECIAL +#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { @@ -1430,7 +1430,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, { return 0; } -#endif /* __HAVE_ARCH_PTE_SPECIAL */ +#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ #if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __gup_device_huge(unsigned long pfn, unsigned long addr, diff --git a/mm/memory.c b/mm/memory.c index 6a97893a8de7..42d2d4366c11 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -817,7 +817,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, * PFNMAP mappings in order to support COWable mappings. * */ -#ifdef __HAVE_ARCH_PTE_SPECIAL +#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL # define HAVE_PTE_SPECIAL 1 #else # define HAVE_PTE_SPECIAL 0 -- cgit v1.2.3-59-g8ed1b From 00b3a331fdf7f1466ac93a008ccd2dab45cb46c5 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Thu, 7 Jun 2018 17:06:12 -0700 Subject: mm: remove odd HAVE_PTE_SPECIAL Remove the additional define HAVE_PTE_SPECIAL and rely directly on CONFIG_ARCH_HAS_PTE_SPECIAL. There is no functional change introduced by this patch Link: http://lkml.kernel.org/r/1523533733-25437-1-git-send-email-ldufour@linux.vnet.ibm.com Signed-off-by: Laurent Dufour Acked-by: David Rientjes Reviewed-by: Andrew Morton Cc: Jerome Glisse Cc: Michal Hocko Cc: Christophe LEROY Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 42d2d4366c11..bc381486d527 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -817,17 +817,12 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, * PFNMAP mappings in order to support COWable mappings. * */ -#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL -# define HAVE_PTE_SPECIAL 1 -#else -# define HAVE_PTE_SPECIAL 0 -#endif struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte, bool with_public_device) { unsigned long pfn = pte_pfn(pte); - if (HAVE_PTE_SPECIAL) { + if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) { if (likely(!pte_special(pte))) goto check_pfn; if (vma->vm_ops && vma->vm_ops->find_special_page) @@ -862,7 +857,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, return NULL; } - /* !HAVE_PTE_SPECIAL case follows: */ + /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { @@ -881,6 +876,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, if (is_zero_pfn(pfn)) return NULL; + check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { print_bad_pte(vma, addr, pte, NULL); @@ -904,7 +900,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, /* * There is no pmd_special() but there may be special pmds, e.g. * in a direct-access (dax) mapping, so let's just replicate the - * !HAVE_PTE_SPECIAL case from vm_normal_page() here. + * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here. */ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { @@ -1933,7 +1929,8 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP * without pte special, it would there be refcounted as a normal page. */ - if (!HAVE_PTE_SPECIAL && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) { + if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && + !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) { struct page *page; /* -- cgit v1.2.3-59-g8ed1b From 1c4bc43ddfd52cbe5a08bb86ae636f55d2799424 Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Thu, 7 Jun 2018 17:06:15 -0700 Subject: mm/memblock: introduce PHYS_ADDR_MAX So far code was using ULLONG_MAX and type casting to obtain a phys_addr_t with all bits set. The typecast is necessary to silence compiler warnings on 32-bit platforms. Use the simpler but still type safe approach "~(phys_addr_t)0" to create a preprocessor define for all bits set. Link: http://lkml.kernel.org/r/20180406213809.566-1-stefan@agner.ch Signed-off-by: Stefan Agner Suggested-by: Linus Torvalds Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Catalin Marinas Cc: Pavel Tatashin Cc: Ard Biesheuvel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 1 + mm/memblock.c | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 7aed92624531..7c4e8f1f72d8 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -29,6 +29,7 @@ #define LLONG_MIN (-LLONG_MAX - 1) #define ULLONG_MAX (~0ULL) #define SIZE_MAX (~(size_t)0) +#define PHYS_ADDR_MAX (~(phys_addr_t)0) #define U8_MAX ((u8)~0U) #define S8_MAX ((s8)(U8_MAX>>1)) diff --git a/mm/memblock.c b/mm/memblock.c index 5108356ad8aa..eec988c21c7e 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -68,7 +68,7 @@ ulong __init_memblock choose_memblock_flags(void) /* adjust *@size so that (@base + *@size) doesn't overflow, return new size */ static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size) { - return *size = min(*size, (phys_addr_t)ULLONG_MAX - base); + return *size = min(*size, PHYS_ADDR_MAX - base); } /* @@ -925,7 +925,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, r = &type_b->regions[idx_b]; r_start = idx_b ? r[-1].base + r[-1].size : 0; r_end = idx_b < type_b->cnt ? - r->base : (phys_addr_t)ULLONG_MAX; + r->base : PHYS_ADDR_MAX; /* * if idx_b advanced past idx_a, @@ -1041,7 +1041,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags, r = &type_b->regions[idx_b]; r_start = idx_b ? r[-1].base + r[-1].size : 0; r_end = idx_b < type_b->cnt ? - r->base : (phys_addr_t)ULLONG_MAX; + r->base : PHYS_ADDR_MAX; /* * if idx_b advanced past idx_a, * break out to advance idx_a @@ -1516,13 +1516,13 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void) static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit) { - phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; + phys_addr_t max_addr = PHYS_ADDR_MAX; struct memblock_region *r; /* * translate the memory @limit size into the max address within one of * the memory memblock regions, if the @limit exceeds the total size - * of those regions, max_addr will keep original value ULLONG_MAX + * of those regions, max_addr will keep original value PHYS_ADDR_MAX */ for_each_memblock(memory, r) { if (limit <= r->size) { @@ -1537,7 +1537,7 @@ static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit) void __init memblock_enforce_memory_limit(phys_addr_t limit) { - phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; + phys_addr_t max_addr = PHYS_ADDR_MAX; if (!limit) return; @@ -1545,14 +1545,14 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit) max_addr = __find_max_addr(limit); /* @limit exceeds the total size of the memory, do nothing */ - if (max_addr == (phys_addr_t)ULLONG_MAX) + if (max_addr == PHYS_ADDR_MAX) return; /* truncate both memory and reserved regions */ memblock_remove_range(&memblock.memory, max_addr, - (phys_addr_t)ULLONG_MAX); + PHYS_ADDR_MAX); memblock_remove_range(&memblock.reserved, max_addr, - (phys_addr_t)ULLONG_MAX); + PHYS_ADDR_MAX); } void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size) @@ -1580,7 +1580,7 @@ void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size) /* truncate the reserved regions */ memblock_remove_range(&memblock.reserved, 0, base); memblock_remove_range(&memblock.reserved, - base + size, (phys_addr_t)ULLONG_MAX); + base + size, PHYS_ADDR_MAX); } void __init memblock_mem_limit_remove_map(phys_addr_t limit) @@ -1593,7 +1593,7 @@ void __init memblock_mem_limit_remove_map(phys_addr_t limit) max_addr = __find_max_addr(limit); /* @limit exceeds the total size of the memory, do nothing */ - if (max_addr == (phys_addr_t)ULLONG_MAX) + if (max_addr == PHYS_ADDR_MAX) return; memblock_cap_memory_range(0, max_addr); -- cgit v1.2.3-59-g8ed1b From bbec2e15170aae3e084d7d9afc730aeebe01b654 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 7 Jun 2018 17:06:18 -0700 Subject: mm: rename page_counter's count/limit into usage/max This patch renames struct page_counter fields: count -> usage limit -> max and the corresponding functions: page_counter_limit() -> page_counter_set_max() mem_cgroup_get_limit() -> mem_cgroup_get_max() mem_cgroup_resize_limit() -> mem_cgroup_resize_max() memcg_update_kmem_limit() -> memcg_update_kmem_max() memcg_update_tcp_limit() -> memcg_update_tcp_max() The idea behind this renaming is to have the direct matching between memory cgroup knobs (low, high, max) and page_counters API. This is pure renaming, this patch doesn't bring any functional change. Link: http://lkml.kernel.org/r/20180405185921.4942-1-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 4 +- include/linux/page_counter.h | 12 ++--- mm/hugetlb_cgroup.c | 6 +-- mm/memcontrol.c | 112 +++++++++++++++++++++---------------------- mm/oom_kill.c | 2 +- mm/page_counter.c | 28 +++++------ 6 files changed, 82 insertions(+), 82 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 517096c3cc99..577a19a6a93b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -467,7 +467,7 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, void mem_cgroup_handle_over_high(void); -unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg); +unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p); @@ -858,7 +858,7 @@ mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, return 0; } -static inline unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) +static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { return 0; } diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index c15ab80ad32d..94029dad9317 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -7,8 +7,8 @@ #include struct page_counter { - atomic_long_t count; - unsigned long limit; + atomic_long_t usage; + unsigned long max; struct page_counter *parent; /* legacy */ @@ -25,14 +25,14 @@ struct page_counter { static inline void page_counter_init(struct page_counter *counter, struct page_counter *parent) { - atomic_long_set(&counter->count, 0); - counter->limit = PAGE_COUNTER_MAX; + atomic_long_set(&counter->usage, 0); + counter->max = PAGE_COUNTER_MAX; counter->parent = parent; } static inline unsigned long page_counter_read(struct page_counter *counter) { - return atomic_long_read(&counter->count); + return atomic_long_read(&counter->usage); } void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages); @@ -41,7 +41,7 @@ bool page_counter_try_charge(struct page_counter *counter, unsigned long nr_pages, struct page_counter **fail); void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); -int page_counter_limit(struct page_counter *counter, unsigned long limit); +int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages); int page_counter_memparse(const char *buf, const char *max, unsigned long *nr_pages); diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index eec1150125b9..68c2f2f3c05b 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -84,7 +84,7 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, limit = round_down(PAGE_COUNTER_MAX, 1 << huge_page_order(&hstates[idx])); - ret = page_counter_limit(counter, limit); + ret = page_counter_set_max(counter, limit); VM_BUG_ON(ret); } } @@ -273,7 +273,7 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, case RES_USAGE: return (u64)page_counter_read(counter) * PAGE_SIZE; case RES_LIMIT: - return (u64)counter->limit * PAGE_SIZE; + return (u64)counter->max * PAGE_SIZE; case RES_MAX_USAGE: return (u64)counter->watermark * PAGE_SIZE; case RES_FAILCNT: @@ -306,7 +306,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_LIMIT: mutex_lock(&hugetlb_limit_mutex); - ret = page_counter_limit(&h_cg->hugepage[idx], nr_pages); + ret = page_counter_set_max(&h_cg->hugepage[idx], nr_pages); mutex_unlock(&hugetlb_limit_mutex); break; default: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d86665cf4a49..79bb5aeaa800 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1034,13 +1034,13 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) unsigned long limit; count = page_counter_read(&memcg->memory); - limit = READ_ONCE(memcg->memory.limit); + limit = READ_ONCE(memcg->memory.max); if (count < limit) margin = limit - count; if (do_memsw_account()) { count = page_counter_read(&memcg->memsw); - limit = READ_ONCE(memcg->memsw.limit); + limit = READ_ONCE(memcg->memsw.max); if (count <= limit) margin = min(margin, limit - count); else @@ -1148,13 +1148,13 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->memory)), - K((u64)memcg->memory.limit), memcg->memory.failcnt); + K((u64)memcg->memory.max), memcg->memory.failcnt); pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->memsw)), - K((u64)memcg->memsw.limit), memcg->memsw.failcnt); + K((u64)memcg->memsw.max), memcg->memsw.failcnt); pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->kmem)), - K((u64)memcg->kmem.limit), memcg->kmem.failcnt); + K((u64)memcg->kmem.max), memcg->kmem.failcnt); for_each_mem_cgroup_tree(iter, memcg) { pr_info("Memory cgroup stats for "); @@ -1179,21 +1179,21 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) /* * Return the memory (and swap, if configured) limit for a memcg. */ -unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) +unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { - unsigned long limit; + unsigned long max; - limit = memcg->memory.limit; + max = memcg->memory.max; if (mem_cgroup_swappiness(memcg)) { - unsigned long memsw_limit; - unsigned long swap_limit; + unsigned long memsw_max; + unsigned long swap_max; - memsw_limit = memcg->memsw.limit; - swap_limit = memcg->swap.limit; - swap_limit = min(swap_limit, (unsigned long)total_swap_pages); - limit = min(limit + swap_limit, memsw_limit); + memsw_max = memcg->memsw.max; + swap_max = memcg->swap.max; + swap_max = min(swap_max, (unsigned long)total_swap_pages); + max = min(max + swap_max, memsw_max); } - return limit; + return max; } static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, @@ -2444,10 +2444,10 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, } #endif -static DEFINE_MUTEX(memcg_limit_mutex); +static DEFINE_MUTEX(memcg_max_mutex); -static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, - unsigned long limit, bool memsw) +static int mem_cgroup_resize_max(struct mem_cgroup *memcg, + unsigned long max, bool memsw) { bool enlarge = false; int ret; @@ -2460,22 +2460,22 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, break; } - mutex_lock(&memcg_limit_mutex); + mutex_lock(&memcg_max_mutex); /* * Make sure that the new limit (memsw or memory limit) doesn't - * break our basic invariant rule memory.limit <= memsw.limit. + * break our basic invariant rule memory.max <= memsw.max. */ - limits_invariant = memsw ? limit >= memcg->memory.limit : - limit <= memcg->memsw.limit; + limits_invariant = memsw ? max >= memcg->memory.max : + max <= memcg->memsw.max; if (!limits_invariant) { - mutex_unlock(&memcg_limit_mutex); + mutex_unlock(&memcg_max_mutex); ret = -EINVAL; break; } - if (limit > counter->limit) + if (max > counter->max) enlarge = true; - ret = page_counter_limit(counter, limit); - mutex_unlock(&memcg_limit_mutex); + ret = page_counter_set_max(counter, max); + mutex_unlock(&memcg_max_mutex); if (!ret) break; @@ -2757,7 +2757,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; return (u64)page_counter_read(counter) * PAGE_SIZE; case RES_LIMIT: - return (u64)counter->limit * PAGE_SIZE; + return (u64)counter->max * PAGE_SIZE; case RES_MAX_USAGE: return (u64)counter->watermark * PAGE_SIZE; case RES_FAILCNT: @@ -2871,24 +2871,24 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) } #endif /* !CONFIG_SLOB */ -static int memcg_update_kmem_limit(struct mem_cgroup *memcg, - unsigned long limit) +static int memcg_update_kmem_max(struct mem_cgroup *memcg, + unsigned long max) { int ret; - mutex_lock(&memcg_limit_mutex); - ret = page_counter_limit(&memcg->kmem, limit); - mutex_unlock(&memcg_limit_mutex); + mutex_lock(&memcg_max_mutex); + ret = page_counter_set_max(&memcg->kmem, max); + mutex_unlock(&memcg_max_mutex); return ret; } -static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) +static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) { int ret; - mutex_lock(&memcg_limit_mutex); + mutex_lock(&memcg_max_mutex); - ret = page_counter_limit(&memcg->tcpmem, limit); + ret = page_counter_set_max(&memcg->tcpmem, max); if (ret) goto out; @@ -2913,7 +2913,7 @@ static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) memcg->tcpmem_active = true; } out: - mutex_unlock(&memcg_limit_mutex); + mutex_unlock(&memcg_max_mutex); return ret; } @@ -2941,16 +2941,16 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, } switch (MEMFILE_TYPE(of_cft(of)->private)) { case _MEM: - ret = mem_cgroup_resize_limit(memcg, nr_pages, false); + ret = mem_cgroup_resize_max(memcg, nr_pages, false); break; case _MEMSWAP: - ret = mem_cgroup_resize_limit(memcg, nr_pages, true); + ret = mem_cgroup_resize_max(memcg, nr_pages, true); break; case _KMEM: - ret = memcg_update_kmem_limit(memcg, nr_pages); + ret = memcg_update_kmem_max(memcg, nr_pages); break; case _TCP: - ret = memcg_update_tcp_limit(memcg, nr_pages); + ret = memcg_update_tcp_max(memcg, nr_pages); break; } break; @@ -3126,8 +3126,8 @@ static int memcg_stat_show(struct seq_file *m, void *v) /* Hierarchical information */ memory = memsw = PAGE_COUNTER_MAX; for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { - memory = min(memory, mi->memory.limit); - memsw = min(memsw, mi->memsw.limit); + memory = min(memory, mi->memory.max); + memsw = min(memsw, mi->memsw.max); } seq_printf(m, "hierarchical_memory_limit %llu\n", (u64)memory * PAGE_SIZE); @@ -3626,7 +3626,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, *pheadroom = PAGE_COUNTER_MAX; while ((parent = parent_mem_cgroup(memcg))) { - unsigned long ceiling = min(memcg->memory.limit, memcg->high); + unsigned long ceiling = min(memcg->memory.max, memcg->high); unsigned long used = page_counter_read(&memcg->memory); *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); @@ -4319,12 +4319,12 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX); memcg->low = 0; + page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; memcg_wb_domain_size_changed(memcg); @@ -5131,7 +5131,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, static int memory_max_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long max = READ_ONCE(memcg->memory.limit); + unsigned long max = READ_ONCE(memcg->memory.max); if (max == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -5155,7 +5155,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (err) return err; - xchg(&memcg->memory.limit, max); + xchg(&memcg->memory.max, max); for (;;) { unsigned long nr_pages = page_counter_read(&memcg->memory); @@ -6074,7 +6074,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) return nr_swap_pages; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) nr_swap_pages = min_t(long, nr_swap_pages, - READ_ONCE(memcg->swap.limit) - + READ_ONCE(memcg->swap.max) - page_counter_read(&memcg->swap)); return nr_swap_pages; } @@ -6095,7 +6095,7 @@ bool mem_cgroup_swap_full(struct page *page) return false; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) - if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit) + if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max) return true; return false; @@ -6129,7 +6129,7 @@ static u64 swap_current_read(struct cgroup_subsys_state *css, static int swap_max_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long max = READ_ONCE(memcg->swap.limit); + unsigned long max = READ_ONCE(memcg->swap.max); if (max == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -6151,9 +6151,9 @@ static ssize_t swap_max_write(struct kernfs_open_file *of, if (err) return err; - mutex_lock(&memcg_limit_mutex); - err = page_counter_limit(&memcg->swap, max); - mutex_unlock(&memcg_limit_mutex); + mutex_lock(&memcg_max_mutex); + err = page_counter_set_max(&memcg->swap, max); + mutex_unlock(&memcg_max_mutex); if (err) return err; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 8ba6cb88cf58..6694348b27e9 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -256,7 +256,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) int nid; if (is_memcg_oom(oc)) { - oc->totalpages = mem_cgroup_get_limit(oc->memcg) ?: 1; + oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1; return CONSTRAINT_MEMCG; } diff --git a/mm/page_counter.c b/mm/page_counter.c index 2a8df3ad60a4..41937c9a9d11 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -22,7 +22,7 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) { long new; - new = atomic_long_sub_return(nr_pages, &counter->count); + new = atomic_long_sub_return(nr_pages, &counter->usage); /* More uncharges than charges? */ WARN_ON_ONCE(new < 0); } @@ -41,7 +41,7 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) for (c = counter; c; c = c->parent) { long new; - new = atomic_long_add_return(nr_pages, &c->count); + new = atomic_long_add_return(nr_pages, &c->usage); /* * This is indeed racy, but we can live with some * inaccuracy in the watermark. @@ -82,9 +82,9 @@ bool page_counter_try_charge(struct page_counter *counter, * we either see the new limit or the setter sees the * counter has changed and retries. */ - new = atomic_long_add_return(nr_pages, &c->count); - if (new > c->limit) { - atomic_long_sub(nr_pages, &c->count); + new = atomic_long_add_return(nr_pages, &c->usage); + if (new > c->max) { + atomic_long_sub(nr_pages, &c->usage); /* * This is racy, but we can live with some * inaccuracy in the failcnt. @@ -123,20 +123,20 @@ void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages) } /** - * page_counter_limit - limit the number of pages allowed + * page_counter_set_max - set the maximum number of pages allowed * @counter: counter - * @limit: limit to set + * @nr_pages: limit to set * * Returns 0 on success, -EBUSY if the current number of pages on the * counter already exceeds the specified limit. * * The caller must serialize invocations on the same counter. */ -int page_counter_limit(struct page_counter *counter, unsigned long limit) +int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) { for (;;) { unsigned long old; - long count; + long usage; /* * Update the limit while making sure that it's not @@ -149,17 +149,17 @@ int page_counter_limit(struct page_counter *counter, unsigned long limit) * the limit, so if it sees the old limit, we see the * modified counter and retry. */ - count = atomic_long_read(&counter->count); + usage = atomic_long_read(&counter->usage); - if (count > limit) + if (usage > nr_pages) return -EBUSY; - old = xchg(&counter->limit, limit); + old = xchg(&counter->max, nr_pages); - if (atomic_long_read(&counter->count) <= count) + if (atomic_long_read(&counter->usage) <= usage) return 0; - counter->limit = old; + counter->max = old; cond_resched(); } } -- cgit v1.2.3-59-g8ed1b From 230671533d64631116be3ff9d407bd9ca5a58e1b Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 7 Jun 2018 17:06:22 -0700 Subject: mm: memory.low hierarchical behavior This patch aims to address an issue in current memory.low semantics, which makes it hard to use it in a hierarchy, where some leaf memory cgroups are more valuable than others. For example, there are memcgs A, A/B, A/C, A/D and A/E: A A/memory.low = 2G, A/memory.current = 6G //\\ BC DE B/memory.low = 3G B/memory.current = 2G C/memory.low = 1G C/memory.current = 2G D/memory.low = 0 D/memory.current = 2G E/memory.low = 10G E/memory.current = 0 If we apply memory pressure, B, C and D are reclaimed at the same pace while A's usage exceeds 2G. This is obviously wrong, as B's usage is fully below B's memory.low, and C has 1G of protection as well. Also, A is pushed to the size, which is less than A's 2G memory.low, which is also wrong. A simple bash script (provided below) can be used to reproduce the problem. Current results are: A: 1430097920 A/B: 711929856 A/C: 717426688 A/D: 741376 A/E: 0 To address the issue a concept of effective memory.low is introduced. Effective memory.low is always equal or less than original memory.low. In a case, when there is no memory.low overcommittment (and also for top-level cgroups), these two values are equal. Otherwise it's a part of parent's effective memory.low, calculated as a cgroup's memory.low usage divided by sum of sibling's memory.low usages (under memory.low usage I mean the size of actually protected memory: memory.current if memory.current < memory.low, 0 otherwise). It's necessary to track the actual usage, because otherwise an empty cgroup with memory.low set (A/E in my example) will affect actual memory distribution, which makes no sense. To avoid traversing the cgroup tree twice, page_counters code is reused. Calculating effective memory.low can be done in the reclaim path, as we conveniently traversing the cgroup tree from top to bottom and check memory.low on each level. So, it's a perfect place to calculate effective memory low and save it to use it for children cgroups. This also eliminates a need to traverse the cgroup tree from bottom to top each time to check if parent's guarantee is not exceeded. Setting/resetting effective memory.low is intentionally racy, but it's fine and shouldn't lead to any significant differences in actual memory distribution. With this patch applied results are matching the expectations: A: 2147930112 A/B: 1428721664 A/C: 718393344 A/D: 815104 A/E: 0 Test script: #!/bin/bash CGPATH="/sys/fs/cgroup" truncate /file1 --size 2G truncate /file2 --size 2G truncate /file3 --size 2G truncate /file4 --size 50G mkdir "${CGPATH}/A" echo "+memory" > "${CGPATH}/A/cgroup.subtree_control" mkdir "${CGPATH}/A/B" "${CGPATH}/A/C" "${CGPATH}/A/D" "${CGPATH}/A/E" echo 2G > "${CGPATH}/A/memory.low" echo 3G > "${CGPATH}/A/B/memory.low" echo 1G > "${CGPATH}/A/C/memory.low" echo 0 > "${CGPATH}/A/D/memory.low" echo 10G > "${CGPATH}/A/E/memory.low" echo $$ > "${CGPATH}/A/B/cgroup.procs" && vmtouch -qt /file1 echo $$ > "${CGPATH}/A/C/cgroup.procs" && vmtouch -qt /file2 echo $$ > "${CGPATH}/A/D/cgroup.procs" && vmtouch -qt /file3 echo $$ > "${CGPATH}/cgroup.procs" && vmtouch -qt /file4 echo "A: " `cat "${CGPATH}/A/memory.current"` echo "A/B: " `cat "${CGPATH}/A/B/memory.current"` echo "A/C: " `cat "${CGPATH}/A/C/memory.current"` echo "A/D: " `cat "${CGPATH}/A/D/memory.current"` echo "A/E: " `cat "${CGPATH}/A/E/memory.current"` rmdir "${CGPATH}/A/B" "${CGPATH}/A/C" "${CGPATH}/A/D" "${CGPATH}/A/E" rmdir "${CGPATH}/A" rm /file1 /file2 /file3 /file4 Link: http://lkml.kernel.org/r/20180405185921.4942-2-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 3 +- include/linux/page_counter.h | 7 +++ mm/memcontrol.c | 112 ++++++++++++++++++++++++++++++++----------- mm/page_counter.c | 43 +++++++++++++++++ 4 files changed, 134 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 577a19a6a93b..891945507044 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -181,8 +181,7 @@ struct mem_cgroup { struct page_counter kmem; struct page_counter tcpmem; - /* Normal memory consumption range */ - unsigned long low; + /* Upper bound of normal memory consumption range */ unsigned long high; /* Range enforcement for interrupt charges */ diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 94029dad9317..7902a727d3b6 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -9,8 +9,14 @@ struct page_counter { atomic_long_t usage; unsigned long max; + unsigned long low; struct page_counter *parent; + /* effective memory.low and memory.low usage tracking */ + unsigned long elow; + atomic_long_t low_usage; + atomic_long_t children_low_usage; + /* legacy */ unsigned long watermark; unsigned long failcnt; @@ -42,6 +48,7 @@ bool page_counter_try_charge(struct page_counter *counter, struct page_counter **fail); void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages); +void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages); int page_counter_memparse(const char *buf, const char *max, unsigned long *nr_pages); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 79bb5aeaa800..d0261059aab9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4270,7 +4270,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) } spin_unlock(&memcg->event_list_lock); - memcg->low = 0; + page_counter_set_low(&memcg->memory, 0); memcg_offline_kmem(memcg); wb_memcg_offline(memcg); @@ -4319,12 +4319,12 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - memcg->low = 0; page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); + page_counter_set_low(&memcg->memory, 0); memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; memcg_wb_domain_size_changed(memcg); @@ -5064,7 +5064,7 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, static int memory_low_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long low = READ_ONCE(memcg->low); + unsigned long low = READ_ONCE(memcg->memory.low); if (low == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -5086,7 +5086,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, if (err) return err; - memcg->low = low; + page_counter_set_low(&memcg->memory, low); return nbytes; } @@ -5348,36 +5348,72 @@ struct cgroup_subsys memory_cgrp_subsys = { * @root: the top ancestor of the sub-tree being checked * @memcg: the memory cgroup to check * - * Returns %true if memory consumption of @memcg, and that of all - * ancestors up to (but not including) @root, is below the normal range. + * WARNING: This function is not stateless! It can only be used as part + * of a top-down tree iteration, not for isolated queries. * - * @root is exclusive; it is never low when looked at directly and isn't - * checked when traversing the hierarchy. + * Returns %true if memory consumption of @memcg is below the normal range. * - * Excluding @root enables using memory.low to prioritize memory usage - * between cgroups within a subtree of the hierarchy that is limited by - * memory.high or memory.max. + * @root is exclusive; it is never low when looked at directly * - * For example, given cgroup A with children B and C: + * To provide a proper hierarchical behavior, effective memory.low value + * is used. * - * A - * / \ - * B C + * Effective memory.low is always equal or less than the original memory.low. + * If there is no memory.low overcommittment (which is always true for + * top-level memory cgroups), these two values are equal. + * Otherwise, it's a part of parent's effective memory.low, + * calculated as a cgroup's memory.low usage divided by sum of sibling's + * memory.low usages, where memory.low usage is the size of actually + * protected memory. * - * and + * low_usage + * elow = min( memory.low, parent->elow * ------------------ ), + * siblings_low_usage * - * 1. A/memory.current > A/memory.high - * 2. A/B/memory.current < A/B/memory.low - * 3. A/C/memory.current >= A/C/memory.low + * | memory.current, if memory.current < memory.low + * low_usage = | + | 0, otherwise. * - * As 'A' is high, i.e. triggers reclaim from 'A', and 'B' is low, we - * should reclaim from 'C' until 'A' is no longer high or until we can - * no longer reclaim from 'C'. If 'A', i.e. @root, isn't excluded by - * mem_cgroup_low when reclaming from 'A', then 'B' won't be considered - * low and we will reclaim indiscriminately from both 'B' and 'C'. + * + * Such definition of the effective memory.low provides the expected + * hierarchical behavior: parent's memory.low value is limiting + * children, unprotected memory is reclaimed first and cgroups, + * which are not using their guarantee do not affect actual memory + * distribution. + * + * For example, if there are memcgs A, A/B, A/C, A/D and A/E: + * + * A A/memory.low = 2G, A/memory.current = 6G + * //\\ + * BC DE B/memory.low = 3G B/memory.current = 2G + * C/memory.low = 1G C/memory.current = 2G + * D/memory.low = 0 D/memory.current = 2G + * E/memory.low = 10G E/memory.current = 0 + * + * and the memory pressure is applied, the following memory distribution + * is expected (approximately): + * + * A/memory.current = 2G + * + * B/memory.current = 1.3G + * C/memory.current = 0.6G + * D/memory.current = 0 + * E/memory.current = 0 + * + * These calculations require constant tracking of the actual low usages + * (see propagate_low_usage()), as well as recursive calculation of + * effective memory.low values. But as we do call mem_cgroup_low() + * path for each memory cgroup top-down from the reclaim, + * it's possible to optimize this part, and save calculated elow + * for next usage. This part is intentionally racy, but it's ok, + * as memory.low is a best-effort mechanism. */ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) { + unsigned long usage, low_usage, siblings_low_usage; + unsigned long elow, parent_elow; + struct mem_cgroup *parent; + if (mem_cgroup_disabled()) return false; @@ -5386,12 +5422,30 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) if (memcg == root) return false; - for (; memcg != root; memcg = parent_mem_cgroup(memcg)) { - if (page_counter_read(&memcg->memory) >= memcg->low) - return false; - } + elow = memcg->memory.low; + usage = page_counter_read(&memcg->memory); + parent = parent_mem_cgroup(memcg); - return true; + if (parent == root) + goto exit; + + parent_elow = READ_ONCE(parent->memory.elow); + elow = min(elow, parent_elow); + + if (!elow || !parent_elow) + goto exit; + + low_usage = min(usage, memcg->memory.low); + siblings_low_usage = atomic_long_read( + &parent->memory.children_low_usage); + + if (!low_usage || !siblings_low_usage) + goto exit; + + elow = min(elow, parent_elow * low_usage / siblings_low_usage); +exit: + memcg->memory.elow = elow; + return usage < elow; } /** diff --git a/mm/page_counter.c b/mm/page_counter.c index 41937c9a9d11..a5ff4cbc355a 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -13,6 +13,28 @@ #include #include +static void propagate_low_usage(struct page_counter *c, unsigned long usage) +{ + unsigned long low_usage, old; + long delta; + + if (!c->parent) + return; + + if (!c->low && !atomic_long_read(&c->low_usage)) + return; + + if (usage <= c->low) + low_usage = usage; + else + low_usage = 0; + + old = atomic_long_xchg(&c->low_usage, low_usage); + delta = low_usage - old; + if (delta) + atomic_long_add(delta, &c->parent->children_low_usage); +} + /** * page_counter_cancel - take pages out of the local counter * @counter: counter @@ -23,6 +45,7 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) long new; new = atomic_long_sub_return(nr_pages, &counter->usage); + propagate_low_usage(counter, new); /* More uncharges than charges? */ WARN_ON_ONCE(new < 0); } @@ -42,6 +65,7 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) long new; new = atomic_long_add_return(nr_pages, &c->usage); + propagate_low_usage(counter, new); /* * This is indeed racy, but we can live with some * inaccuracy in the watermark. @@ -85,6 +109,7 @@ bool page_counter_try_charge(struct page_counter *counter, new = atomic_long_add_return(nr_pages, &c->usage); if (new > c->max) { atomic_long_sub(nr_pages, &c->usage); + propagate_low_usage(counter, new); /* * This is racy, but we can live with some * inaccuracy in the failcnt. @@ -93,6 +118,7 @@ bool page_counter_try_charge(struct page_counter *counter, *fail = c; goto failed; } + propagate_low_usage(counter, new); /* * Just like with failcnt, we can live with some * inaccuracy in the watermark. @@ -164,6 +190,23 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) } } +/** + * page_counter_set_low - set the amount of protected memory + * @counter: counter + * @nr_pages: value to set + * + * The caller must serialize invocations on the same counter. + */ +void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages) +{ + struct page_counter *c; + + counter->low = nr_pages; + + for (c = counter; c; c = c->parent) + propagate_low_usage(c, atomic_long_read(&c->usage)); +} + /** * page_counter_memparse - memparse() for page counter limits * @buf: string to parse -- cgit v1.2.3-59-g8ed1b From 5f93ad67436b3da8312db5ad4eedfa739c00787f Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 7 Jun 2018 17:06:26 -0700 Subject: mm: treat memory.low value inclusive If memcg's usage is equal to the memory.low value, avoid reclaiming from this cgroup while there is a surplus of reclaimable memory. This sounds more logical and also matches memory.high and memory.max behavior: both are inclusive. Empty cgroups are not considered protected, so MEMCG_LOW events are not emitted for empty cgroups, if there is no more reclaimable memory in the system. Link: http://lkml.kernel.org/r/20180406122132.GA7185@castle Signed-off-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d0261059aab9..8bab4660e6aa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5344,14 +5344,14 @@ struct cgroup_subsys memory_cgrp_subsys = { }; /** - * mem_cgroup_low - check if memory consumption is below the normal range + * mem_cgroup_low - check if memory consumption is in the normal range * @root: the top ancestor of the sub-tree being checked * @memcg: the memory cgroup to check * * WARNING: This function is not stateless! It can only be used as part * of a top-down tree iteration, not for isolated queries. * - * Returns %true if memory consumption of @memcg is below the normal range. + * Returns %true if memory consumption of @memcg is in the normal range. * * @root is exclusive; it is never low when looked at directly * @@ -5445,7 +5445,7 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) elow = min(elow, parent_elow * low_usage / siblings_low_usage); exit: memcg->memory.elow = elow; - return usage < elow; + return usage && usage <= elow; } /** -- cgit v1.2.3-59-g8ed1b From 688272809fcce5b17fcefd5892b59f3788efb144 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 7 Jun 2018 17:06:34 -0700 Subject: mm, gup: prevent pmd checking race in follow_pmd_mask() mmap_sem will be read locked when calling follow_pmd_mask(). But this cannot prevent PMD from being changed for all cases when PTL is unlocked, for example, from pmd_trans_huge() to pmd_none() via MADV_DONTNEED. So it is possible for the pmd_present() check in follow_pmd_mask() to encounter an invalid PMD. This may cause an incorrect VM_BUG_ON() or an infinite loop. Fix this by reading the PMD entry into a local variable with READ_ONCE() and checking the local variable and pmd_none() in the retry loop. As Kirill pointed out, with PTL unlocked, the *pmd may be changed under us, so reading it directly again and again may incur weird bugs. So although using *pmd directly other than for pmd_present() checking may be safe, it is still better to replace them to read *pmd once and check the local variable multiple times. When PTL unlocked, replace all *pmd with local variable was suggested by Kirill. Link: http://lkml.kernel.org/r/20180419083514.1365-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Zi Yan Cc: "Kirill A. Shutemov" Cc: Al Viro Cc: "Aneesh Kumar K.V" Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index 010153989b9b..1020c7f8f5ee 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -212,53 +212,69 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, unsigned int flags, unsigned int *page_mask) { - pmd_t *pmd; + pmd_t *pmd, pmdval; spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; pmd = pmd_offset(pudp, address); - if (pmd_none(*pmd)) + /* + * The READ_ONCE() will stabilize the pmdval in a register or + * on the stack so that it will stop changing under the code. + */ + pmdval = READ_ONCE(*pmd); + if (pmd_none(pmdval)) return no_page_table(vma, flags); - if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { + if (pmd_huge(pmdval) && vma->vm_flags & VM_HUGETLB) { page = follow_huge_pmd(mm, address, pmd, flags); if (page) return page; return no_page_table(vma, flags); } - if (is_hugepd(__hugepd(pmd_val(*pmd)))) { + if (is_hugepd(__hugepd(pmd_val(pmdval)))) { page = follow_huge_pd(vma, address, - __hugepd(pmd_val(*pmd)), flags, + __hugepd(pmd_val(pmdval)), flags, PMD_SHIFT); if (page) return page; return no_page_table(vma, flags); } retry: - if (!pmd_present(*pmd)) { + if (!pmd_present(pmdval)) { if (likely(!(flags & FOLL_MIGRATION))) return no_page_table(vma, flags); VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(*pmd)); - if (is_pmd_migration_entry(*pmd)) + !is_pmd_migration_entry(pmdval)); + if (is_pmd_migration_entry(pmdval)) pmd_migration_entry_wait(mm, pmd); + pmdval = READ_ONCE(*pmd); + /* + * MADV_DONTNEED may convert the pmd to null because + * mmap_sem is held in read mode + */ + if (pmd_none(pmdval)) + return no_page_table(vma, flags); goto retry; } - if (pmd_devmap(*pmd)) { + if (pmd_devmap(pmdval)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags); spin_unlock(ptl); if (page) return page; } - if (likely(!pmd_trans_huge(*pmd))) + if (likely(!pmd_trans_huge(pmdval))) return follow_page_pte(vma, address, pmd, flags); - if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) + if ((flags & FOLL_NUMA) && pmd_protnone(pmdval)) return no_page_table(vma, flags); retry_locked: ptl = pmd_lock(mm, pmd); + if (unlikely(pmd_none(*pmd))) { + spin_unlock(ptl); + return no_page_table(vma, flags); + } if (unlikely(!pmd_present(*pmd))) { spin_unlock(ptl); if (likely(!(flags & FOLL_MIGRATION))) -- cgit v1.2.3-59-g8ed1b From d538c164fc018bdb79f07f904fa9d48acf18bb0a Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 7 Jun 2018 17:06:39 -0700 Subject: mm/sparse.c: check __highest_present_section_nr only for a present section When searching a present section, there are two boundaries: * __highest_present_section_nr * NR_MEM_SECTIONS And it is known, __highest_present_section_nr is a more strict boundary than NR_MEM_SECTIONS. This means it would be necessary to check __highest_present_section_nr only. Link: http://lkml.kernel.org/r/20180326081956.75275-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: David Rientjes Reviewed-by: Andrew Morton Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index 73dc2fcc0eab..3570ff294ab1 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -190,15 +190,13 @@ static inline int next_present_section_nr(int section_nr) section_nr++; if (present_section_nr(section_nr)) return section_nr; - } while ((section_nr < NR_MEM_SECTIONS) && - (section_nr <= __highest_present_section_nr)); + } while ((section_nr <= __highest_present_section_nr)); return -1; } #define for_each_present_section_nr(start, section_nr) \ for (section_nr = next_present_section_nr(start-1); \ ((section_nr >= 0) && \ - (section_nr < NR_MEM_SECTIONS) && \ (section_nr <= __highest_present_section_nr)); \ section_nr = next_present_section_nr(section_nr)) -- cgit v1.2.3-59-g8ed1b From 08994b24673b6ae33ee40fc3b5e265c6762848e4 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 7 Jun 2018 17:06:43 -0700 Subject: mm/sparse.c: pass the __highest_present_section_nr + 1 to alloc_func() In commit c4e1be9ec113 ("mm, sparsemem: break out of loops early") __highest_present_section_nr is introduced to reduce the loop counts for present section. This is also helpful for usemap and memmap allocation. This patch uses __highest_present_section_nr + 1 to optimize the loop. Link: http://lkml.kernel.org/r/20180326081956.75275-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Andrew Morton Cc: David Rientjes Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index 3570ff294ab1..f13f2723950a 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -522,7 +522,7 @@ static void __init alloc_usemap_and_memmap(void (*alloc_func) map_count = 1; } /* ok, last chunk */ - alloc_func(data, pnum_begin, NR_MEM_SECTIONS, + alloc_func(data, pnum_begin, __highest_present_section_nr+1, map_count, nodeid_begin); } -- cgit v1.2.3-59-g8ed1b From 82a2e924ff2c6accbc840dfa46c42b24da457a31 Mon Sep 17 00:00:00 2001 From: Chintan Pandya Date: Thu, 7 Jun 2018 17:06:46 -0700 Subject: mm: vmalloc: clean up vunmap to avoid pgtable ops twice vunmap does page table clear operations twice in the case when DEBUG_PAGEALLOC_ENABLE_DEFAULT is enabled. So, clean up the code as that is unintended. As a perf gain, we save few us. Below ftrace data was obtained while doing 1 MB of vmalloc/vfree on ARM64 based SoC *without* this patch applied. After this patch, we can save ~3 us (on 1 extra vunmap_page_range). CPU DURATION FUNCTION CALLS | | | | | | | 6) | __vunmap() { 6) | vmap_debug_free_range() { 6) 3.281 us | vunmap_page_range(); 6) + 45.468 us | } 6) 2.760 us | vunmap_page_range(); 6) ! 505.105 us | } [cpandya@codeaurora.org: v3] Link: http://lkml.kernel.org/r/1525176960-18408-1-git-send-email-cpandya@codeaurora.org Link: http://lkml.kernel.org/r/1523876342-10545-1-git-send-email-cpandya@codeaurora.org Signed-off-by: Chintan Pandya Reviewed-by: Andrew Morton Cc: Vlastimil Babka Cc: Laura Abbott Cc: Catalin Marinas Cc: Johannes Weiner Cc: Florian Fainelli Cc: Yisheng Xie Cc: Ard Biesheuvel Cc: Wei Yang Cc: Byungchul Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 63a5f502da08..12bd82e6554e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -603,26 +603,6 @@ static void unmap_vmap_area(struct vmap_area *va) vunmap_page_range(va->va_start, va->va_end); } -static void vmap_debug_free_range(unsigned long start, unsigned long end) -{ - /* - * Unmap page tables and force a TLB flush immediately if pagealloc - * debugging is enabled. This catches use after free bugs similarly to - * those in linear kernel virtual address space after a page has been - * freed. - * - * All the lazy freeing logic is still retained, in order to minimise - * intrusiveness of this debugging feature. - * - * This is going to be *slow* (linear kernel virtual address debugging - * doesn't do a broadcast TLB flush so it is a lot faster). - */ - if (debug_pagealloc_enabled()) { - vunmap_page_range(start, end); - flush_tlb_kernel_range(start, end); - } -} - /* * lazy_max_pages is the maximum amount of virtual address space we gather up * before attempting to purge with a TLB flush. @@ -756,6 +736,9 @@ static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); unmap_vmap_area(va); + if (debug_pagealloc_enabled()) + flush_tlb_kernel_range(va->va_start, va->va_end); + free_vmap_area_noflush(va); } @@ -1053,6 +1036,10 @@ static void vb_free(const void *addr, unsigned long size) vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); + if (debug_pagealloc_enabled()) + flush_tlb_kernel_range((unsigned long)addr, + (unsigned long)addr + size); + spin_lock(&vb->lock); /* Expand dirty range */ @@ -1142,7 +1129,6 @@ void vm_unmap_ram(const void *mem, unsigned int count) BUG_ON(!PAGE_ALIGNED(addr)); debug_check_no_locks_freed(mem, size); - vmap_debug_free_range(addr, addr+size); if (likely(count <= VMAP_MAX_ALLOC)) { vb_free(mem, size); @@ -1499,7 +1485,6 @@ struct vm_struct *remove_vm_area(const void *addr) va->flags |= VM_LAZY_FREE; spin_unlock(&vmap_area_lock); - vmap_debug_free_range(va->va_start, va->va_end); kasan_free_shadow(vm); free_unmap_vmap_area(va); -- cgit v1.2.3-59-g8ed1b From f3c01d2f3ade6790db67f80fef60df84424f8964 Mon Sep 17 00:00:00 2001 From: Chintan Pandya Date: Thu, 7 Jun 2018 17:06:50 -0700 Subject: mm: vmalloc: avoid racy handling of debugobjects in vunmap Currently, __vunmap flow is, 1) Release the VM area 2) Free the debug objects corresponding to that vm area. This leave some race window open. 1) Release the VM area 1.5) Some other client gets the same vm area 1.6) This client allocates new debug objects on the same vm area 2) Free the debug objects corresponding to this vm area. Here, we actually free 'other' client's debug objects. Fix this by freeing the debug objects first and then releasing the VM area. Link: http://lkml.kernel.org/r/1523961828-9485-2-git-send-email-cpandya@codeaurora.org Signed-off-by: Chintan Pandya Reviewed-by: Andrew Morton Cc: Ard Biesheuvel Cc: Byungchul Park Cc: Catalin Marinas Cc: Florian Fainelli Cc: Johannes Weiner Cc: Laura Abbott Cc: Vlastimil Babka Cc: Wei Yang Cc: Yisheng Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 12bd82e6554e..4df66e1abeb1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1504,7 +1504,7 @@ static void __vunmap(const void *addr, int deallocate_pages) addr)) return; - area = remove_vm_area(addr); + area = find_vmap_area((unsigned long)addr)->vm; if (unlikely(!area)) { WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); @@ -1514,6 +1514,7 @@ static void __vunmap(const void *addr, int deallocate_pages) debug_check_no_locks_freed(addr, get_vm_area_size(area)); debug_check_no_obj_freed(addr, get_vm_area_size(area)); + remove_vm_area(addr); if (deallocate_pages) { int i; -- cgit v1.2.3-59-g8ed1b From 05e3ff9505858a39dc696ca195b5d79e524aac03 Mon Sep 17 00:00:00 2001 From: Chintan Pandya Date: Thu, 7 Jun 2018 17:06:53 -0700 Subject: mm: vmalloc: pass proper vm_start into debugobjects Client can call vunmap with some intermediate 'addr' which may not be the start of the VM area. Entire unmap code works with vm->vm_start which is proper but debug object API is called with 'addr'. This could be a problem within debug objects. Pass proper start address into debug object API. [akpm@linux-foundation.org: fix warning] Link: http://lkml.kernel.org/r/1523961828-9485-3-git-send-email-cpandya@codeaurora.org Signed-off-by: Chintan Pandya Reviewed-by: Andrew Morton Cc: Ard Biesheuvel Cc: Byungchul Park Cc: Catalin Marinas Cc: Florian Fainelli Cc: Johannes Weiner Cc: Laura Abbott Cc: Vlastimil Babka Cc: Wei Yang Cc: Yisheng Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4df66e1abeb1..89efac3a020e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1128,15 +1128,16 @@ void vm_unmap_ram(const void *mem, unsigned int count) BUG_ON(addr > VMALLOC_END); BUG_ON(!PAGE_ALIGNED(addr)); - debug_check_no_locks_freed(mem, size); - if (likely(count <= VMAP_MAX_ALLOC)) { + debug_check_no_locks_freed(mem, size); vb_free(mem, size); return; } va = find_vmap_area(addr); BUG_ON(!va); + debug_check_no_locks_freed((void *)va->va_start, + (va->va_end - va->va_start)); free_unmap_vmap_area(va); } EXPORT_SYMBOL(vm_unmap_ram); @@ -1511,8 +1512,8 @@ static void __vunmap(const void *addr, int deallocate_pages) return; } - debug_check_no_locks_freed(addr, get_vm_area_size(area)); - debug_check_no_obj_freed(addr, get_vm_area_size(area)); + debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); + debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); remove_vm_area(addr); if (deallocate_pages) { -- cgit v1.2.3-59-g8ed1b From 89fdcd262fd40712da65e922558872a12b203332 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 7 Jun 2018 17:06:59 -0700 Subject: mm: shmem: make stat.st_blksize return huge page size if THP is on Since tmpfs THP was supported in 4.8, hugetlbfs is not the only filesystem with huge page support anymore. tmpfs can use huge page via THP when mounting by "huge=" mount option. When applications use huge page on hugetlbfs, it just need check the filesystem magic number, but it is not enough for tmpfs. Make stat.st_blksize return huge page size if it is mounted by appropriate "huge=" option to give applications a hint to optimize the behavior with THP. Some applications may not do wisely with THP. For example, QEMU may mmap file on non huge page aligned hint address with MAP_FIXED, which results in no pages are PMD mapped even though THP is used. Some applications may mmap file with non huge page aligned offset. Both behaviors make THP pointless. statfs.f_bsize still returns 4KB for tmpfs since THP could be split, and it also may fallback to 4KB page silently if there is not enough huge page. Furthermore, different f_bsize makes max_blocks and free_blocks calculation harder but without too much benefit. Returning huge page size via stat.st_blksize sounds good enough. Since PUD size huge page for THP has not been supported, now it just returns HPAGE_PMD_SIZE. Hugh said: : Sorry, I have no enthusiasm for this patch; but do I feel strongly : enough to override you and everyone else to NAK it? No, I don't feel : that strongly, maybe st_blksize isn't worth arguing over. : : We did look at struct stat when designing huge tmpfs, to see if there : were any fields that should be adjusted for it; but concluded none. : Yes, it would sometimes be nice to have a quickly accessible indicator : for when tmpfs has been mounted huge (scanning /proc/mounts for options : can be tiresome, agreed); but since tmpfs tries to supply huge (or not) : pages transparently, no difference seemed right. : : So, because st_blksize is a not very useful field of struct stat, with : "size" in the name, we're going to put HPAGE_PMD_SIZE in there instead : of PAGE_SIZE, if the tmpfs was mounted with one of the huge "huge" : options (force or always, okay; within_size or advise, not so much). : Though HPAGE_PMD_SIZE is no more its "preferred I/O size" or "blocksize : for file system I/O" than PAGE_SIZE was. : : Which we can expect to speed up some applications and disadvantage : others, depending on how they interpret st_blksize: just like if we : changed it in the same way on non-huge tmpfs. (Did I actually try : changing st_blksize early on, and find it broke something? If so, I've : now forgotten what, and a search through commit messages didn't find : it; but I guess we'll find out soon enough.) : : If there were an mstat() syscall, returning a field "preferred : alignment", then we could certainly agree to put HPAGE_PMD_SIZE in : there; but in stat()'s st_blksize? And what happens when (in future) : mm maps this or that hard-disk filesystem's blocks with a pmd mapping - : should that filesystem then advertise a bigger st_blksize, despite the : same disk layout as before? What happens with DAX? : : And this change is not going to help the QEMU suboptimality that : brought you here (or does QEMU align mmaps according to st_blksize?). : QEMU ought to work well with kernels without this change, and kernels : with this change; and I hope it can easily deal with both by avoiding : that use of MAP_FIXED which prevented the kernel's intended alignment. [akpm@linux-foundation.org: remove unneeded `else'] Link: http://lkml.kernel.org/r/1524665633-83806-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Acked-by: Kirill A. Shutemov Cc: Hugh Dickins Cc: Michal Hocko Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index cb4d7fa389d0..103f3d1040f8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -571,6 +571,15 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, } #endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ +static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo) +{ + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && + (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) && + shmem_huge != SHMEM_HUGE_DENY) + return true; + return false; +} + /* * Like add_to_page_cache_locked, but error if expected item has gone. */ @@ -988,6 +997,7 @@ static int shmem_getattr(const struct path *path, struct kstat *stat, { struct inode *inode = path->dentry->d_inode; struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb); if (info->alloced - info->swapped != inode->i_mapping->nrpages) { spin_lock_irq(&info->lock); @@ -995,6 +1005,10 @@ static int shmem_getattr(const struct path *path, struct kstat *stat, spin_unlock_irq(&info->lock); } generic_fillattr(inode, stat); + + if (is_huge_enabled(sb_info)) + stat->blksize = HPAGE_PMD_SIZE; + return 0; } -- cgit v1.2.3-59-g8ed1b From 93781325da6e07af57a21f111d1f2b9f128fa397 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 7 Jun 2018 17:07:02 -0700 Subject: lockdep: fix fs_reclaim annotation While revisiting my Btrfs swapfile series [1], I introduced a situation in which reclaim would lock i_rwsem, and even though the swapon() path clearly made GFP_KERNEL allocations while holding i_rwsem, I got no complaints from lockdep. It turns out that the rework of the fs_reclaim annotation was broken: if the current task has PF_MEMALLOC set, we don't acquire the dummy fs_reclaim lock, but when reclaiming we always check this _after_ we've just set the PF_MEMALLOC flag. In most cases, we can fix this by moving the fs_reclaim_{acquire,release}() outside of the memalloc_noreclaim_{save,restore}(), althought kswapd is slightly different. After applying this, I got the expected lockdep splats. 1: https://lwn.net/Articles/625412/ Link: http://lkml.kernel.org/r/9f8aa70652a98e98d7c4de0fc96a4addcee13efe.1523778026.git.osandov@fb.com Fixes: d92a8cfcb37e ("locking/lockdep: Rework FS_RECLAIM annotation") Signed-off-by: Omar Sandoval Reviewed-by: Andrew Morton Cc: Peter Zijlstra Cc: Tetsuo Handa Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 4 ++++ mm/page_alloc.c | 20 +++++++++++++++----- mm/vmscan.c | 20 +++++++++++++------- 3 files changed, 32 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 76a8cb4ef178..44d356f5e47c 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -163,9 +163,13 @@ static inline gfp_t current_gfp_context(gfp_t flags) } #ifdef CONFIG_LOCKDEP +extern void __fs_reclaim_acquire(void); +extern void __fs_reclaim_release(void); extern void fs_reclaim_acquire(gfp_t gfp_mask); extern void fs_reclaim_release(gfp_t gfp_mask); #else +static inline void __fs_reclaim_acquire(void) { } +static inline void __fs_reclaim_release(void) { } static inline void fs_reclaim_acquire(gfp_t gfp_mask) { } static inline void fs_reclaim_release(gfp_t gfp_mask) { } #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4fbe211340e0..b93cc79a8db4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3701,7 +3701,7 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla #endif /* CONFIG_COMPACTION */ #ifdef CONFIG_LOCKDEP -struct lockdep_map __fs_reclaim_map = +static struct lockdep_map __fs_reclaim_map = STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map); static bool __need_fs_reclaim(gfp_t gfp_mask) @@ -3726,17 +3726,27 @@ static bool __need_fs_reclaim(gfp_t gfp_mask) return true; } +void __fs_reclaim_acquire(void) +{ + lock_map_acquire(&__fs_reclaim_map); +} + +void __fs_reclaim_release(void) +{ + lock_map_release(&__fs_reclaim_map); +} + void fs_reclaim_acquire(gfp_t gfp_mask) { if (__need_fs_reclaim(gfp_mask)) - lock_map_acquire(&__fs_reclaim_map); + __fs_reclaim_acquire(); } EXPORT_SYMBOL_GPL(fs_reclaim_acquire); void fs_reclaim_release(gfp_t gfp_mask) { if (__need_fs_reclaim(gfp_mask)) - lock_map_release(&__fs_reclaim_map); + __fs_reclaim_release(); } EXPORT_SYMBOL_GPL(fs_reclaim_release); #endif @@ -3754,8 +3764,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - noreclaim_flag = memalloc_noreclaim_save(); fs_reclaim_acquire(gfp_mask); + noreclaim_flag = memalloc_noreclaim_save(); reclaim_state.reclaimed_slab = 0; current->reclaim_state = &reclaim_state; @@ -3763,8 +3773,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, ac->nodemask); current->reclaim_state = NULL; - fs_reclaim_release(gfp_mask); memalloc_noreclaim_restore(noreclaim_flag); + fs_reclaim_release(gfp_mask); cond_resched(); diff --git a/mm/vmscan.c b/mm/vmscan.c index 9270a4370d54..0e67b477ecef 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3318,11 +3318,15 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) .may_unmap = 1, .may_swap = 1, }; + + __fs_reclaim_acquire(); + count_vm_event(PAGEOUTRUN); do { unsigned long nr_reclaimed = sc.nr_reclaimed; bool raise_priority = true; + bool ret; sc.reclaim_idx = classzone_idx; @@ -3395,7 +3399,10 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ - if (try_to_freeze() || kthread_should_stop()) + __fs_reclaim_release(); + ret = try_to_freeze(); + __fs_reclaim_acquire(); + if (ret || kthread_should_stop()) break; /* @@ -3412,6 +3419,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) out: snapshot_refaults(NULL, pgdat); + __fs_reclaim_release(); /* * Return the order kswapd stopped reclaiming at as * prepare_kswapd_sleep() takes it into account. If another caller @@ -3600,9 +3608,7 @@ kswapd_try_sleep: */ trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx, alloc_order); - fs_reclaim_acquire(GFP_KERNEL); reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); - fs_reclaim_release(GFP_KERNEL); if (reclaim_order < alloc_order) goto kswapd_try_sleep; } @@ -3684,16 +3690,16 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) unsigned long nr_reclaimed; unsigned int noreclaim_flag; - noreclaim_flag = memalloc_noreclaim_save(); fs_reclaim_acquire(sc.gfp_mask); + noreclaim_flag = memalloc_noreclaim_save(); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; nr_reclaimed = do_try_to_free_pages(zonelist, &sc); p->reclaim_state = NULL; - fs_reclaim_release(sc.gfp_mask); memalloc_noreclaim_restore(noreclaim_flag); + fs_reclaim_release(sc.gfp_mask); return nr_reclaimed; } @@ -3870,6 +3876,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in }; cond_resched(); + fs_reclaim_acquire(sc.gfp_mask); /* * We need to be able to allocate from the reserves for RECLAIM_UNMAP * and we also need to be able to write out pages for RECLAIM_WRITE @@ -3877,7 +3884,6 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in */ noreclaim_flag = memalloc_noreclaim_save(); p->flags |= PF_SWAPWRITE; - fs_reclaim_acquire(sc.gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; @@ -3892,9 +3898,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in } p->reclaim_state = NULL; - fs_reclaim_release(gfp_mask); current->flags &= ~PF_SWAPWRITE; memalloc_noreclaim_restore(noreclaim_flag); + fs_reclaim_release(sc.gfp_mask); return sc.nr_reclaimed >= nr_pages; } -- cgit v1.2.3-59-g8ed1b From 88484826bc67844eeae1855ebe32cce9edd937c9 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 7 Jun 2018 17:07:11 -0700 Subject: mm/ksm: move [set_]page_stable_node from ksm.h to ksm.c page_stable_node() and set_page_stable_node() are only used in mm/ksm.c and there is no point to keep them in the include/linux/ksm.h [akpm@linux-foundation.org: fix SYSFS=n build] Link: http://lkml.kernel.org/r/1524552106-7356-3-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrew Morton Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ksm.h | 11 ----------- mm/ksm.c | 11 +++++++++++ 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index bbdfca3db6e1..161e8164abcf 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -37,17 +37,6 @@ static inline void ksm_exit(struct mm_struct *mm) __ksm_exit(mm); } -static inline struct stable_node *page_stable_node(struct page *page) -{ - return PageKsm(page) ? page_rmapping(page) : NULL; -} - -static inline void set_page_stable_node(struct page *page, - struct stable_node *stable_node) -{ - page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); -} - /* * When do_swap_page() first faults in from swap what used to be a KSM page, * no problem, it will be assigned to this vma's anon_vma; but thereafter, diff --git a/mm/ksm.c b/mm/ksm.c index 7d6558f3bac9..e2d2886fb1df 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -840,6 +840,17 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, return err; } +static inline struct stable_node *page_stable_node(struct page *page) +{ + return PageKsm(page) ? page_rmapping(page) : NULL; +} + +static inline void set_page_stable_node(struct page *page, + struct stable_node *stable_node) +{ + page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); +} + #ifdef CONFIG_SYSFS /* * Only called through the sysfs control interface: -- cgit v1.2.3-59-g8ed1b From 12ba780d64f6c96d40de9c4d1468fb732e64be4c Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 7 Jun 2018 17:07:15 -0700 Subject: tmpfs: allow decoding a file handle of an unlinked file tmpfs uses the helper d_find_alias() to find a dentry from a decoded inode, but d_find_alias() skips unhashed dentries, so unlinked files cannot be decoded from a file handle. This can be reproduced using xfstests test program open_by_handle: $ open_by handle -c /tmp/testdir $ open_by_handle -dk /tmp/testdir open_by_handle(/tmp/testdir/file000000) returned 116 incorrectly on an unlinked open file! To fix this, if d_find_alias() can't find a hashed alias, call d_find_any_alias() to return an unhashed one. Link: http://lkml.kernel.org/r/CAOQ4uxg+qSLP0KwdW+h1tcPqOCQd+_pGZVXiePQB1TXCMBMctQ@mail.gmail.com Signed-off-by: Amir Goldstein Reviewed-by: NeilBrown Cc: Hugh Dickins Cc: Jeff Layton Cc: "J. Bruce Fields" Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 103f3d1040f8..9a1b553e30e0 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3209,6 +3209,15 @@ static int shmem_match(struct inode *ino, void *vfh) return ino->i_ino == inum && fh[0] == ino->i_generation; } +/* Find any alias of inode, but prefer a hashed alias */ +static struct dentry *shmem_find_alias(struct inode *inode) +{ + struct dentry *alias = d_find_alias(inode); + + return alias ?: d_find_any_alias(inode); +} + + static struct dentry *shmem_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { @@ -3225,7 +3234,7 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb, inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), shmem_match, fid->raw); if (inode) { - dentry = d_find_alias(inode); + dentry = shmem_find_alias(inode); iput(inode); } -- cgit v1.2.3-59-g8ed1b From 9ccc361716362e8abb01d9944f8df97b4aac3e23 Mon Sep 17 00:00:00 2001 From: Wang Long Date: Thu, 7 Jun 2018 17:07:19 -0700 Subject: memcg: writeback: use memcg->cgwb_list directly mem_cgroup_cgwb_list is a very simple wrapper and it will never be used outside of code under CONFIG_CGROUP_WRITEBACK. so use memcg->cgwb_list directly. Link: http://lkml.kernel.org/r/1524406173-212182-1-git-send-email-wanglong19@meituan.com Signed-off-by: Wang Long Reviewed-by: Jan Kara Acked-by: Tejun Heo Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 1 - mm/backing-dev.c | 4 ++-- mm/memcontrol.c | 5 ----- 3 files changed, 2 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 891945507044..10d741e8fe51 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1097,7 +1097,6 @@ static inline void dec_lruvec_page_state(struct page *page, #ifdef CONFIG_CGROUP_WRITEBACK -struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg); struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, unsigned long *pheadroom, unsigned long *pdirty, diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 8fe3ebd6ac00..347cc834c04a 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -557,7 +557,7 @@ static int cgwb_create(struct backing_dev_info *bdi, memcg = mem_cgroup_from_css(memcg_css); blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); blkcg = css_to_blkcg(blkcg_css); - memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); + memcg_cgwb_list = &memcg->cgwb_list; blkcg_cgwb_list = &blkcg->cgwb_list; /* look up again under lock and discard on blkcg mismatch */ @@ -736,7 +736,7 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi) */ void wb_memcg_offline(struct mem_cgroup *memcg) { - struct list_head *memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); + struct list_head *memcg_cgwb_list = &memcg->cgwb_list; struct bdi_writeback *wb, *next; spin_lock_irq(&cgwb_lock); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8bab4660e6aa..2751259f0a76 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3562,11 +3562,6 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, #ifdef CONFIG_CGROUP_WRITEBACK -struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg) -{ - return &memcg->cgwb_list; -} - static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) { return wb_domain_init(&memcg->cgwb_domain, gfp); -- cgit v1.2.3-59-g8ed1b From 8dd53fd3b702c12050ec5ddd07b56f8c4608ebea Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Thu, 7 Jun 2018 17:07:23 -0700 Subject: memcg: mark memcg1_events static const Mark memcg1_events static: it's only used by memcontrol.c. And mark it const: it's not modified. Link: http://lkml.kernel.org/r/20180503192940.94971-1-gthelen@google.com Signed-off-by: Greg Thelen Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2751259f0a76..847a775af225 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3083,7 +3083,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) #endif /* CONFIG_NUMA */ /* Universal VM events cgroup1 shows, original sort order */ -unsigned int memcg1_events[] = { +static const unsigned int memcg1_events[] = { PGPGIN, PGPGOUT, PGFAULT, -- cgit v1.2.3-59-g8ed1b From bb4a7ea2b1449722cec9d787dca5a74ca36e8eeb Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 7 Jun 2018 17:07:27 -0700 Subject: mm: memcontrol: drain stocks on resize limit Resizing the memcg limit for cgroup-v2 drains the stocks before triggering the memcg reclaim. Do the same for cgroup-v1 to make the behavior consistent. Link: http://lkml.kernel.org/r/20180504205548.110696-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Greg Thelen Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 847a775af225..18dcdac0b158 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2450,6 +2450,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg, unsigned long max, bool memsw) { bool enlarge = false; + bool drained = false; int ret; bool limits_invariant; struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; @@ -2480,6 +2481,12 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg, if (!ret) break; + if (!drained) { + drain_all_stock(memcg); + drained = true; + continue; + } + if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, !memsw)) { ret = -EBUSY; -- cgit v1.2.3-59-g8ed1b From d12c60f64cf8c768fddfd848eaf7ee57a13f18b1 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Thu, 7 Jun 2018 17:07:31 -0700 Subject: mm: memcontrol: drain memcg stock on force_empty The per-cpu memcg stock can retain a charge of upto 32 pages. On a machine with large number of cpus, this can amount to a decent amount of memory. Additionally force_empty interface might be triggering unneeded memcg reclaims. Link: http://lkml.kernel.org/r/20180507201651.165879-1-shakeelb@google.com Signed-off-by: Junaid Shahid Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Cc: Greg Thelen Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 18dcdac0b158..e6de0d6a3a8d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2610,6 +2610,9 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) /* we call try-to-free pages for make this cgroup empty */ lru_add_drain_all(); + + drain_all_stock(memcg); + /* try to free all pages in this cgroup */ while (nr_retries && page_counter_read(&memcg->memory)) { int progress; -- cgit v1.2.3-59-g8ed1b From 25cf23d7a95716fc6eb165208b5eb2e3b2e86f82 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 7 Jun 2018 17:07:35 -0700 Subject: mm/memblock: print memblock_remove memblock_remove report is useful to see why MemTotal of /proc/meminfo between two kernels makes difference. Link: http://lkml.kernel.org/r/20180508104223.8028-1-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Andrew Morton Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index eec988c21c7e..93ad42bc8a73 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -697,6 +697,11 @@ static int __init_memblock memblock_remove_range(struct memblock_type *type, int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) { + phys_addr_t end = base + size - 1; + + memblock_dbg("memblock_remove: [%pa-%pa] %pS\n", + &base, &end, (void *)_RET_IP_); + return memblock_remove_range(&memblock.memory, base, size); } -- cgit v1.2.3-59-g8ed1b From fb52bbaee598f58352d8732637ebe7013b2df79f Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Thu, 7 Jun 2018 17:07:43 -0700 Subject: mm: move is_pageblock_removable_nolock() to mm/memory_hotplug.c is_pageblock_removable_nolock() is not used outside of mm/memory_hotplug.c. Move it next to unique caller is_mem_section_removable() and make it static. Remove prototype in to silence gcc warning (W=1): mm/page_alloc.c:7704:6: warning: no previous prototype for `is_pageblock_removable_nolock' [-Wmissing-prototypes] Link: http://lkml.kernel.org/r/20180509190001.24789-1-malat@debian.org Signed-off-by: Mathieu Malaterre Suggested-by: Michal Hocko Reviewed-by: Andrew Morton Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 1 - mm/memory_hotplug.c | 23 +++++++++++++++++++++++ mm/page_alloc.c | 23 ----------------------- 3 files changed, 23 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 2b0265265c28..4e9828cda7a2 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -107,7 +107,6 @@ static inline bool movable_node_is_enabled(void) } #ifdef CONFIG_MEMORY_HOTREMOVE -extern bool is_pageblock_removable_nolock(struct page *page); extern int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap); extern int __remove_pages(struct zone *zone, unsigned long start_pfn, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 25982467800b..7deb49f69e27 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1237,6 +1237,29 @@ static struct page *next_active_pageblock(struct page *page) return page + pageblock_nr_pages; } +static bool is_pageblock_removable_nolock(struct page *page) +{ + struct zone *zone; + unsigned long pfn; + + /* + * We have to be careful here because we are iterating over memory + * sections which are not zone aware so we might end up outside of + * the zone but still within the section. + * We have to take care about the node as well. If the node is offline + * its NODE_DATA will be NULL - see page_zone. + */ + if (!node_online(page_to_nid(page))) + return false; + + zone = page_zone(page); + pfn = page_to_pfn(page); + if (!zone_spans_pfn(zone, pfn)) + return false; + + return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true); +} + /* Checks if this range of memory is likely to be hot-removable. */ bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b93cc79a8db4..d0d26da12086 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7692,29 +7692,6 @@ unmovable: return true; } -bool is_pageblock_removable_nolock(struct page *page) -{ - struct zone *zone; - unsigned long pfn; - - /* - * We have to be careful here because we are iterating over memory - * sections which are not zone aware so we might end up outside of - * the zone but still within the section. - * We have to take care about the node as well. If the node is offline - * its NODE_DATA will be NULL - see page_zone. - */ - if (!node_online(page_to_nid(page))) - return false; - - zone = page_zone(page); - pfn = page_to_pfn(page); - if (!zone_spans_pfn(zone, pfn)) - return false; - - return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true); -} - #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) static unsigned long pfn_max_align_down(unsigned long pfn) -- cgit v1.2.3-59-g8ed1b From bf8d5d52ffe89aac5b46ddb39dd1a4351fae5df4 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 7 Jun 2018 17:07:46 -0700 Subject: memcg: introduce memory.min Memory controller implements the memory.low best-effort memory protection mechanism, which works perfectly in many cases and allows protecting working sets of important workloads from sudden reclaim. But its semantics has a significant limitation: it works only as long as there is a supply of reclaimable memory. This makes it pretty useless against any sort of slow memory leaks or memory usage increases. This is especially true for swapless systems. If swap is enabled, memory soft protection effectively postpones problems, allowing a leaking application to fill all swap area, which makes no sense. The only effective way to guarantee the memory protection in this case is to invoke the OOM killer. It's possible to handle this case in userspace by reacting on MEMCG_LOW events; but there is still a place for a fail-safe in-kernel mechanism to provide stronger guarantees. This patch introduces the memory.min interface for cgroup v2 memory controller. It works very similarly to memory.low (sharing the same hierarchical behavior), except that it's not disabled if there is no more reclaimable memory in the system. If cgroup is not populated, its memory.min is ignored, because otherwise even the OOM killer wouldn't be able to reclaim the protected memory, and the system can stall. [guro@fb.com: s/low/min/ in docs] Link: http://lkml.kernel.org/r/20180510130758.GA9129@castle.DHCP.thefacebook.com Link: http://lkml.kernel.org/r/20180509180734.GA4856@castle.DHCP.thefacebook.com Signed-off-by: Roman Gushchin Reviewed-by: Randy Dunlap Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 27 +++++++- include/linux/memcontrol.h | 15 ++-- include/linux/page_counter.h | 11 ++- mm/memcontrol.c | 118 +++++++++++++++++++++++++------- mm/page_counter.c | 63 ++++++++++++----- mm/vmscan.c | 18 ++++- 6 files changed, 202 insertions(+), 50 deletions(-) (limited to 'mm') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 7b56ca80e37a..e34d3c938729 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1001,6 +1001,29 @@ PAGE_SIZE multiple when read back. The total amount of memory currently being used by the cgroup and its descendants. + memory.min + A read-write single value file which exists on non-root + cgroups. The default is "0". + + Hard memory protection. If the memory usage of a cgroup + is within its effective min boundary, the cgroup's memory + won't be reclaimed under any conditions. If there is no + unprotected reclaimable memory available, OOM killer + is invoked. + + Effective min boundary is limited by memory.min values of + all ancestor cgroups. If there is memory.min overcommitment + (child cgroup or cgroups are requiring more protected memory + than parent will allow), then each child cgroup will get + the part of parent's protection proportional to its + actual memory usage below memory.min. + + Putting more memory than generally available under this + protection is discouraged and may lead to constant OOMs. + + If a memory cgroup is not populated with processes, + its memory.min is ignored. + memory.low A read-write single value file which exists on non-root cgroups. The default is "0". @@ -1012,9 +1035,9 @@ PAGE_SIZE multiple when read back. Effective low boundary is limited by memory.low values of all ancestor cgroups. If there is memory.low overcommitment - (child cgroup or cgroups are requiring more protected memory, + (child cgroup or cgroups are requiring more protected memory than parent will allow), then each child cgroup will get - the part of parent's protection proportional to the its + the part of parent's protection proportional to its actual memory usage below memory.low. Putting more memory than generally available under this diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 10d741e8fe51..9c04cf8e6487 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -58,6 +58,12 @@ enum memcg_memory_event { MEMCG_NR_MEMORY_EVENTS, }; +enum mem_cgroup_protection { + MEMCG_PROT_NONE, + MEMCG_PROT_LOW, + MEMCG_PROT_MIN, +}; + struct mem_cgroup_reclaim_cookie { pg_data_t *pgdat; int priority; @@ -289,7 +295,8 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } -bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg); +enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, + struct mem_cgroup *memcg); int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup **memcgp, @@ -734,10 +741,10 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg, { } -static inline bool mem_cgroup_low(struct mem_cgroup *root, - struct mem_cgroup *memcg) +static inline enum mem_cgroup_protection mem_cgroup_protected( + struct mem_cgroup *root, struct mem_cgroup *memcg) { - return false; + return MEMCG_PROT_NONE; } static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 7902a727d3b6..bab7e57f659b 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -8,10 +8,16 @@ struct page_counter { atomic_long_t usage; - unsigned long max; + unsigned long min; unsigned long low; + unsigned long max; struct page_counter *parent; + /* effective memory.min and memory.min usage tracking */ + unsigned long emin; + atomic_long_t min_usage; + atomic_long_t children_min_usage; + /* effective memory.low and memory.low usage tracking */ unsigned long elow; atomic_long_t low_usage; @@ -47,8 +53,9 @@ bool page_counter_try_charge(struct page_counter *counter, unsigned long nr_pages, struct page_counter **fail); void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); -int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages); +void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages); void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages); +int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages); int page_counter_memparse(const char *buf, const char *max, unsigned long *nr_pages); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e6de0d6a3a8d..e3d56927a724 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4275,6 +4275,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) } spin_unlock(&memcg->event_list_lock); + page_counter_set_min(&memcg->memory, 0); page_counter_set_low(&memcg->memory, 0); memcg_offline_kmem(memcg); @@ -4329,6 +4330,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); + page_counter_set_min(&memcg->memory, 0); page_counter_set_low(&memcg->memory, 0); memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; @@ -5066,6 +5068,36 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; } +static int memory_min_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long min = READ_ONCE(memcg->memory.min); + + if (min == PAGE_COUNTER_MAX) + seq_puts(m, "max\n"); + else + seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE); + + return 0; +} + +static ssize_t memory_min_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long min; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &min); + if (err) + return err; + + page_counter_set_min(&memcg->memory, min); + + return nbytes; +} + static int memory_low_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); @@ -5300,6 +5332,12 @@ static struct cftype memory_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = memory_current_read, }, + { + .name = "min", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_min_show, + .write = memory_min_write, + }, { .name = "low", .flags = CFTYPE_NOT_ON_ROOT, @@ -5349,19 +5387,24 @@ struct cgroup_subsys memory_cgrp_subsys = { }; /** - * mem_cgroup_low - check if memory consumption is in the normal range + * mem_cgroup_protected - check if memory consumption is in the normal range * @root: the top ancestor of the sub-tree being checked * @memcg: the memory cgroup to check * * WARNING: This function is not stateless! It can only be used as part * of a top-down tree iteration, not for isolated queries. * - * Returns %true if memory consumption of @memcg is in the normal range. + * Returns one of the following: + * MEMCG_PROT_NONE: cgroup memory is not protected + * MEMCG_PROT_LOW: cgroup memory is protected as long there is + * an unprotected supply of reclaimable memory from other cgroups. + * MEMCG_PROT_MIN: cgroup memory is protected * - * @root is exclusive; it is never low when looked at directly + * @root is exclusive; it is never protected when looked at directly * - * To provide a proper hierarchical behavior, effective memory.low value - * is used. + * To provide a proper hierarchical behavior, effective memory.min/low values + * are used. Below is the description of how effective memory.low is calculated. + * Effective memory.min values is calculated in the same way. * * Effective memory.low is always equal or less than the original memory.low. * If there is no memory.low overcommittment (which is always true for @@ -5406,51 +5449,78 @@ struct cgroup_subsys memory_cgrp_subsys = { * E/memory.current = 0 * * These calculations require constant tracking of the actual low usages - * (see propagate_low_usage()), as well as recursive calculation of - * effective memory.low values. But as we do call mem_cgroup_low() + * (see propagate_protected_usage()), as well as recursive calculation of + * effective memory.low values. But as we do call mem_cgroup_protected() * path for each memory cgroup top-down from the reclaim, * it's possible to optimize this part, and save calculated elow * for next usage. This part is intentionally racy, but it's ok, * as memory.low is a best-effort mechanism. */ -bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) +enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, + struct mem_cgroup *memcg) { - unsigned long usage, low_usage, siblings_low_usage; - unsigned long elow, parent_elow; struct mem_cgroup *parent; + unsigned long emin, parent_emin; + unsigned long elow, parent_elow; + unsigned long usage; if (mem_cgroup_disabled()) - return false; + return MEMCG_PROT_NONE; if (!root) root = root_mem_cgroup; if (memcg == root) - return false; + return MEMCG_PROT_NONE; - elow = memcg->memory.low; usage = page_counter_read(&memcg->memory); - parent = parent_mem_cgroup(memcg); + if (!usage) + return MEMCG_PROT_NONE; + + emin = memcg->memory.min; + elow = memcg->memory.low; + parent = parent_mem_cgroup(memcg); if (parent == root) goto exit; + parent_emin = READ_ONCE(parent->memory.emin); + emin = min(emin, parent_emin); + if (emin && parent_emin) { + unsigned long min_usage, siblings_min_usage; + + min_usage = min(usage, memcg->memory.min); + siblings_min_usage = atomic_long_read( + &parent->memory.children_min_usage); + + if (min_usage && siblings_min_usage) + emin = min(emin, parent_emin * min_usage / + siblings_min_usage); + } + parent_elow = READ_ONCE(parent->memory.elow); elow = min(elow, parent_elow); + if (elow && parent_elow) { + unsigned long low_usage, siblings_low_usage; - if (!elow || !parent_elow) - goto exit; + low_usage = min(usage, memcg->memory.low); + siblings_low_usage = atomic_long_read( + &parent->memory.children_low_usage); - low_usage = min(usage, memcg->memory.low); - siblings_low_usage = atomic_long_read( - &parent->memory.children_low_usage); - - if (!low_usage || !siblings_low_usage) - goto exit; + if (low_usage && siblings_low_usage) + elow = min(elow, parent_elow * low_usage / + siblings_low_usage); + } - elow = min(elow, parent_elow * low_usage / siblings_low_usage); exit: + memcg->memory.emin = emin; memcg->memory.elow = elow; - return usage && usage <= elow; + + if (usage <= emin) + return MEMCG_PROT_MIN; + else if (usage <= elow) + return MEMCG_PROT_LOW; + else + return MEMCG_PROT_NONE; } /** diff --git a/mm/page_counter.c b/mm/page_counter.c index a5ff4cbc355a..de31470655f6 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -13,26 +13,38 @@ #include #include -static void propagate_low_usage(struct page_counter *c, unsigned long usage) +static void propagate_protected_usage(struct page_counter *c, + unsigned long usage) { - unsigned long low_usage, old; + unsigned long protected, old_protected; long delta; if (!c->parent) return; - if (!c->low && !atomic_long_read(&c->low_usage)) - return; + if (c->min || atomic_long_read(&c->min_usage)) { + if (usage <= c->min) + protected = usage; + else + protected = 0; + + old_protected = atomic_long_xchg(&c->min_usage, protected); + delta = protected - old_protected; + if (delta) + atomic_long_add(delta, &c->parent->children_min_usage); + } - if (usage <= c->low) - low_usage = usage; - else - low_usage = 0; + if (c->low || atomic_long_read(&c->low_usage)) { + if (usage <= c->low) + protected = usage; + else + protected = 0; - old = atomic_long_xchg(&c->low_usage, low_usage); - delta = low_usage - old; - if (delta) - atomic_long_add(delta, &c->parent->children_low_usage); + old_protected = atomic_long_xchg(&c->low_usage, protected); + delta = protected - old_protected; + if (delta) + atomic_long_add(delta, &c->parent->children_low_usage); + } } /** @@ -45,7 +57,7 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) long new; new = atomic_long_sub_return(nr_pages, &counter->usage); - propagate_low_usage(counter, new); + propagate_protected_usage(counter, new); /* More uncharges than charges? */ WARN_ON_ONCE(new < 0); } @@ -65,7 +77,7 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) long new; new = atomic_long_add_return(nr_pages, &c->usage); - propagate_low_usage(counter, new); + propagate_protected_usage(counter, new); /* * This is indeed racy, but we can live with some * inaccuracy in the watermark. @@ -109,7 +121,7 @@ bool page_counter_try_charge(struct page_counter *counter, new = atomic_long_add_return(nr_pages, &c->usage); if (new > c->max) { atomic_long_sub(nr_pages, &c->usage); - propagate_low_usage(counter, new); + propagate_protected_usage(counter, new); /* * This is racy, but we can live with some * inaccuracy in the failcnt. @@ -118,7 +130,7 @@ bool page_counter_try_charge(struct page_counter *counter, *fail = c; goto failed; } - propagate_low_usage(counter, new); + propagate_protected_usage(counter, new); /* * Just like with failcnt, we can live with some * inaccuracy in the watermark. @@ -190,6 +202,23 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) } } +/** + * page_counter_set_min - set the amount of protected memory + * @counter: counter + * @nr_pages: value to set + * + * The caller must serialize invocations on the same counter. + */ +void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages) +{ + struct page_counter *c; + + counter->min = nr_pages; + + for (c = counter; c; c = c->parent) + propagate_protected_usage(c, atomic_long_read(&c->usage)); +} + /** * page_counter_set_low - set the amount of protected memory * @counter: counter @@ -204,7 +233,7 @@ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages) counter->low = nr_pages; for (c = counter; c; c = c->parent) - propagate_low_usage(c, atomic_long_read(&c->usage)); + propagate_protected_usage(c, atomic_long_read(&c->usage)); } /** diff --git a/mm/vmscan.c b/mm/vmscan.c index 0e67b477ecef..03822f86f288 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2544,12 +2544,28 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) unsigned long reclaimed; unsigned long scanned; - if (mem_cgroup_low(root, memcg)) { + switch (mem_cgroup_protected(root, memcg)) { + case MEMCG_PROT_MIN: + /* + * Hard protection. + * If there is no reclaimable memory, OOM. + */ + continue; + case MEMCG_PROT_LOW: + /* + * Soft protection. + * Respect the protection only as long as + * there is an unprotected supply + * of reclaimable memory from other cgroups. + */ if (!sc->memcg_low_reclaim) { sc->memcg_low_skipped = 1; continue; } memcg_memory_event(memcg, MEMCG_LOW); + break; + case MEMCG_PROT_NONE: + break; } reclaimed = sc->nr_reclaimed; -- cgit v1.2.3-59-g8ed1b From d62ff365b812ecd7415252568c1836811b3dde02 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 7 Jun 2018 17:07:50 -0700 Subject: mm/vmpressure.c: use kstrndup instead of kmalloc+strncpy Using kstrndup() simplifies the code. Link: http://lkml.kernel.org/r/20180503201807.24941-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Acked-by: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmpressure.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 85350ce2d25d..7142207224d3 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -390,12 +390,11 @@ int vmpressure_register_event(struct mem_cgroup *memcg, char *token; int ret = 0; - spec_orig = spec = kzalloc(MAX_VMPRESSURE_ARGS_LEN + 1, GFP_KERNEL); + spec_orig = spec = kstrndup(args, MAX_VMPRESSURE_ARGS_LEN, GFP_KERNEL); if (!spec) { ret = -ENOMEM; goto out; } - strncpy(spec, args, MAX_VMPRESSURE_ARGS_LEN); /* Find required level */ token = strsep(&spec, ","); -- cgit v1.2.3-59-g8ed1b From 3cadfa2b9497abbd3ae11a790d53ed51b4cf8ac5 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 7 Jun 2018 17:07:53 -0700 Subject: mm/vmpressure.c: convert to use match_string() helper The new helper returns index of the matching string in an array. We are going to use it here. Link: http://lkml.kernel.org/r/20180503203206.44046-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmpressure.c | 32 ++++++-------------------------- 1 file changed, 6 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 7142207224d3..4854584ec436 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -342,26 +342,6 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) vmpressure(gfp, memcg, true, vmpressure_win, 0); } -static enum vmpressure_levels str_to_level(const char *arg) -{ - enum vmpressure_levels level; - - for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) - if (!strcmp(vmpressure_str_levels[level], arg)) - return level; - return -1; -} - -static enum vmpressure_modes str_to_mode(const char *arg) -{ - enum vmpressure_modes mode; - - for (mode = 0; mode < VMPRESSURE_NUM_MODES; mode++) - if (!strcmp(vmpressure_str_modes[mode], arg)) - return mode; - return -1; -} - #define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2) /** @@ -398,18 +378,18 @@ int vmpressure_register_event(struct mem_cgroup *memcg, /* Find required level */ token = strsep(&spec, ","); - level = str_to_level(token); - if (level == -1) { - ret = -EINVAL; + level = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token); + if (level < 0) { + ret = level; goto out; } /* Find optional mode */ token = strsep(&spec, ","); if (token) { - mode = str_to_mode(token); - if (mode == -1) { - ret = -EINVAL; + mode = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token); + if (mode < 0) { + ret = mode; goto out; } } -- cgit v1.2.3-59-g8ed1b From a380b40abbcecda28037b712fc437e157a1e9b57 Mon Sep 17 00:00:00 2001 From: Huaisheng Ye Date: Thu, 7 Jun 2018 17:07:57 -0700 Subject: mm/page_alloc.c: remove useless parameter of finalise_ac() finalise_ac() has parameter order which is not used at all. Remove it. Signed-off-by: Huaisheng Ye Acked-by: Michal Hocko Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d0d26da12086..96d48d24e1ef 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4336,8 +4336,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, } /* Determine whether to spread dirty pages and what the first usable zone */ -static inline void finalise_ac(gfp_t gfp_mask, - unsigned int order, struct alloc_context *ac) +static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac) { /* Dirty zone balancing only done in the fast path */ ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); @@ -4368,7 +4367,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) return NULL; - finalise_ac(gfp_mask, order, &ac); + finalise_ac(gfp_mask, &ac); /* First allocation attempt */ page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); -- cgit v1.2.3-59-g8ed1b From 2bcd6454bae78775632e1848ce1eabf65c6fbd4f Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Thu, 7 Jun 2018 17:08:00 -0700 Subject: mm: use new return type vm_fault_t Use new return type vm_fault_t for fault handler in struct vm_operations_struct. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. Link: http://lkml.kernel.org/r/20180511190542.GA2412@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Reviewed-by: Matthew Wilcox Cc: Dan Williams Cc: Jan Kara Cc: Ross Zwisler Cc: Rik van Riel Cc: Matthew Wilcox Cc: Hugh Dickins Cc: Pavel Tatashin Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++-- mm/filemap.c | 8 ++++---- mm/nommu.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index b7012bb1d218..d1dc618a56bf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2303,10 +2303,10 @@ extern void truncate_inode_pages_range(struct address_space *, extern void truncate_inode_pages_final(struct address_space *); /* generic vm_area_ops exported for stackable file systems */ -extern int filemap_fault(struct vm_fault *vmf); +extern vm_fault_t filemap_fault(struct vm_fault *vmf); extern void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); -extern int filemap_page_mkwrite(struct vm_fault *vmf); +extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); /* mm/page-writeback.c */ int __must_check write_one_page(struct page *page); diff --git a/mm/filemap.c b/mm/filemap.c index 0604cb02e6f3..52517f28e6f4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2489,7 +2489,7 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, * * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. */ -int filemap_fault(struct vm_fault *vmf) +vm_fault_t filemap_fault(struct vm_fault *vmf) { int error; struct file *file = vmf->vma->vm_file; @@ -2499,7 +2499,7 @@ int filemap_fault(struct vm_fault *vmf) pgoff_t offset = vmf->pgoff; pgoff_t max_off; struct page *page; - int ret = 0; + vm_fault_t ret = 0; max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(offset >= max_off)) @@ -2693,11 +2693,11 @@ next: } EXPORT_SYMBOL(filemap_map_pages); -int filemap_page_mkwrite(struct vm_fault *vmf) +vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); - int ret = VM_FAULT_LOCKED; + vm_fault_t ret = VM_FAULT_LOCKED; sb_start_pagefault(inode->i_sb); file_update_time(vmf->vma->vm_file); diff --git a/mm/nommu.c b/mm/nommu.c index 13723736d38f..4452d8bd9ae4 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1763,7 +1763,7 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; } -int filemap_fault(struct vm_fault *vmf) +vm_fault_t filemap_fault(struct vm_fault *vmf) { BUG(); return 0; -- cgit v1.2.3-59-g8ed1b From b3ec9f33acb8d3a6173515d3d547be969dc70566 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Thu, 7 Jun 2018 17:08:04 -0700 Subject: mm: change return type to vm_fault_t Use new return type vm_fault_t for fault handler in struct vm_operations_struct. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. See commit 1c8f422059ae ("mm: change return type to vm_fault_t") Link: http://lkml.kernel.org/r/20180512063745.GA26866@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Reviewed-by: Matthew Wilcox Reviewed-by: Andrew Morton Cc: Joe Perches Cc: Michal Hocko Cc: Hugh Dickins Cc: Dan Williams Cc: David Rientjes Cc: Mike Kravetz Cc: Naoya Horiguchi Cc: Aneesh Kumar K.V Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 6 +++--- mm/hugetlb.c | 2 +- mm/mmap.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 49dd59ea90f7..eb9da245286d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -629,9 +629,9 @@ struct vm_special_mapping { * If non-NULL, then this is called to resolve page faults * on the special mapping. If used, .pages is not checked. */ - int (*fault)(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, - struct vm_fault *vmf); + vm_fault_t (*fault)(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, + struct vm_fault *vmf); int (*mremap)(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 129088710510..6c6decc63469 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3159,7 +3159,7 @@ static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get * this far. */ -static int hugetlb_vm_op_fault(struct vm_fault *vmf) +static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) { BUG(); return 0; diff --git a/mm/mmap.c b/mm/mmap.c index d817764a9974..d1eb87ef4b1a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3277,7 +3277,7 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) mm->data_vm += npages; } -static int special_mapping_fault(struct vm_fault *vmf); +static vm_fault_t special_mapping_fault(struct vm_fault *vmf); /* * Having a close hook prevents vma merging regardless of flags. @@ -3316,7 +3316,7 @@ static const struct vm_operations_struct legacy_special_mapping_vmops = { .fault = special_mapping_fault, }; -static int special_mapping_fault(struct vm_fault *vmf) +static vm_fault_t special_mapping_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; pgoff_t pgoff; -- cgit v1.2.3-59-g8ed1b From 285b8dcaacfc36b0468aaa03e3c628006ae31381 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 7 Jun 2018 17:08:08 -0700 Subject: mm, hugetlbfs: pass fault address to no page handler This is to take better advantage of general huge page clearing optimization (commit c79b57e462b5: "mm: hugetlb: clear target sub-page last when clearing huge page") for hugetlbfs. In the general optimization patch, the sub-page to access will be cleared last to avoid the cache lines of to access sub-page to be evicted when clearing other sub-pages. This works better if we have the address of the sub-page to access, that is, the fault address inside the huge page. So the hugetlbfs no page fault handler is changed to pass that information. This will benefit workloads which don't access the begin of the hugetlbfs huge page after the page fault under heavy cache contention for shared last level cache. The patch is a generic optimization which should benefit quite some workloads, not for a specific use case. To demonstrate the performance benefit of the patch, we tested it with vm-scalability run on hugetlbfs. With this patch, the throughput increases ~28.1% in vm-scalability anon-w-seq test case with 88 processes on a 2 socket Xeon E5 2699 v4 system (44 cores, 88 threads). The test case creates 88 processes, each process mmaps a big anonymous memory area with MAP_HUGETLB and writes to it from the end to the begin. For each process, other processes could be seen as other workload which generates heavy cache pressure. At the same time, the cache miss rate reduced from ~36.3% to ~25.6%, the IPC (instruction per cycle) increased from 0.3 to 0.37, and the time spent in user space is reduced ~19.3%. Link: http://lkml.kernel.org/r/20180517083539.9242-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Mike Kravetz Cc: Michal Hocko Cc: David Rientjes Cc: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Andi Kleen Cc: Jan Kara Cc: Matthew Wilcox Cc: Hugh Dickins Cc: Minchan Kim Cc: Shaohua Li Cc: Christopher Lameter Cc: "Aneesh Kumar K.V" Cc: Punit Agrawal Cc: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6c6decc63469..696befffe6f7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3686,6 +3686,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; pte_t new_pte; spinlock_t *ptl; + unsigned long haddr = address & huge_page_mask(h); /* * Currently, we are forced to kill the process in the event the @@ -3716,7 +3717,7 @@ retry: u32 hash; struct vm_fault vmf = { .vma = vma, - .address = address, + .address = haddr, .flags = flags, /* * Hard to debug if it ends up being @@ -3733,14 +3734,14 @@ retry: * fault to make calling code simpler. */ hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, - idx, address); + idx, haddr); mutex_unlock(&hugetlb_fault_mutex_table[hash]); ret = handle_userfault(&vmf, VM_UFFD_MISSING); mutex_lock(&hugetlb_fault_mutex_table[hash]); goto out; } - page = alloc_huge_page(vma, address, 0); + page = alloc_huge_page(vma, haddr, 0); if (IS_ERR(page)) { ret = PTR_ERR(page); if (ret == -ENOMEM) @@ -3789,12 +3790,12 @@ retry: * the spinlock. */ if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { - if (vma_needs_reservation(h, vma, address) < 0) { + if (vma_needs_reservation(h, vma, haddr) < 0) { ret = VM_FAULT_OOM; goto backout_unlocked; } /* Just decrements count, does not deallocate */ - vma_end_reservation(h, vma, address); + vma_end_reservation(h, vma, haddr); } ptl = huge_pte_lock(h, mm, ptep); @@ -3808,17 +3809,17 @@ retry: if (anon_rmap) { ClearPagePrivate(page); - hugepage_add_new_anon_rmap(page, vma, address); + hugepage_add_new_anon_rmap(page, vma, haddr); } else page_dup_rmap(page, true); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); - set_huge_pte_at(mm, address, ptep, new_pte); + set_huge_pte_at(mm, haddr, ptep, new_pte); hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_cow(mm, vma, address, ptep, page, ptl); + ret = hugetlb_cow(mm, vma, haddr, ptep, page, ptl); } spin_unlock(ptl); @@ -3830,7 +3831,7 @@ backout: spin_unlock(ptl); backout_unlocked: unlock_page(page); - restore_reserve_on_error(h, vma, address, page); + restore_reserve_on_error(h, vma, haddr, page); put_page(page); goto out; } @@ -3883,10 +3884,9 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, struct hstate *h = hstate_vma(vma); struct address_space *mapping; int need_wait_lock = 0; + unsigned long haddr = address & huge_page_mask(h); - address &= huge_page_mask(h); - - ptep = huge_pte_offset(mm, address, huge_page_size(h)); + ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (ptep) { entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { @@ -3896,20 +3896,20 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); } else { - ptep = huge_pte_alloc(mm, address, huge_page_size(h)); + ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); if (!ptep) return VM_FAULT_OOM; } mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, address); + idx = vma_hugecache_offset(h, vma, haddr); /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address); + hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); mutex_lock(&hugetlb_fault_mutex_table[hash]); entry = huge_ptep_get(ptep); @@ -3939,16 +3939,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * consumed. */ if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { - if (vma_needs_reservation(h, vma, address) < 0) { + if (vma_needs_reservation(h, vma, haddr) < 0) { ret = VM_FAULT_OOM; goto out_mutex; } /* Just decrements count, does not deallocate */ - vma_end_reservation(h, vma, address); + vma_end_reservation(h, vma, haddr); if (!(vma->vm_flags & VM_MAYSHARE)) pagecache_page = hugetlbfs_pagecache_page(h, - vma, address); + vma, haddr); } ptl = huge_pte_lock(h, mm, ptep); @@ -3973,16 +3973,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_WRITE) { if (!huge_pte_write(entry)) { - ret = hugetlb_cow(mm, vma, address, ptep, + ret = hugetlb_cow(mm, vma, haddr, ptep, pagecache_page, ptl); goto out_put_page; } entry = huge_pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (huge_ptep_set_access_flags(vma, address, ptep, entry, + if (huge_ptep_set_access_flags(vma, haddr, ptep, entry, flags & FAULT_FLAG_WRITE)) - update_mmu_cache(vma, address, ptep); + update_mmu_cache(vma, haddr, ptep); out_put_page: if (page != pagecache_page) unlock_page(page); -- cgit v1.2.3-59-g8ed1b From 6e292b9be7f4358985ce33ae1f59ab30a8c09e08 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:18 -0700 Subject: mm: split page_type out from _mapcount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We're already using a union of many fields here, so stop abusing the _mapcount and make page_type its own field. That implies renaming some of the machinery that creates PageBuddy, PageBalloon and PageKmemcg; bring back the PG_buddy, PG_balloon and PG_kmemcg names. As suggested by Kirill, make page_type a bitmask. Because it starts out life as -1 (thanks to sharing the storage with _mapcount), setting a page flag means clearing the appropriate bit. This gives us space for probably twenty or so extra bits (depending how paranoid we want to be about _mapcount underflow). Link: http://lkml.kernel.org/r/20180518194519.3820-3-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 13 ++++++++----- include/linux/page-flags.h | 45 ++++++++++++++++++++++++++------------------- kernel/crash_core.c | 1 + mm/page_alloc.c | 13 +++++-------- scripts/tags.sh | 6 +++--- 5 files changed, 43 insertions(+), 35 deletions(-) (limited to 'mm') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index eb9da245286d..3a554fdf45c2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -96,6 +96,14 @@ struct page { }; union { + /* + * If the page is neither PageSlab nor mappable to userspace, + * the value stored here may help determine what this page + * is used for. See page-flags.h for a list of page types + * which are currently stored here. + */ + unsigned int page_type; + _slub_counter_t counters; unsigned int active; /* SLAB */ struct { /* SLUB */ @@ -109,11 +117,6 @@ struct page { /* * Count of ptes mapped in mms, to show when * page is mapped & limit reverse map searches. - * - * Extra information about page type may be - * stored here for pages that are never mapped, - * in which case the value MUST BE <= -2. - * See page-flags.h for more details. */ atomic_t _mapcount; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e34a27727b9a..8c25b28a35aa 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -642,49 +642,56 @@ PAGEFLAG_FALSE(DoubleMap) #endif /* - * For pages that are never mapped to userspace, page->mapcount may be - * used for storing extra information about page type. Any value used - * for this purpose must be <= -2, but it's better start not too close - * to -2 so that an underflow of the page_mapcount() won't be mistaken - * for a special page. + * For pages that are never mapped to userspace (and aren't PageSlab), + * page_type may be used. Because it is initialised to -1, we invert the + * sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and + * __ClearPageFoo *sets* the bit used for PageFoo. We reserve a few high and + * low bits so that an underflow or overflow of page_mapcount() won't be + * mistaken for a page type value. */ -#define PAGE_MAPCOUNT_OPS(uname, lname) \ + +#define PAGE_TYPE_BASE 0xf0000000 +/* Reserve 0x0000007f to catch underflows of page_mapcount */ +#define PG_buddy 0x00000080 +#define PG_balloon 0x00000100 +#define PG_kmemcg 0x00000200 + +#define PageType(page, flag) \ + ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) + +#define PAGE_TYPE_OPS(uname, lname) \ static __always_inline int Page##uname(struct page *page) \ { \ - return atomic_read(&page->_mapcount) == \ - PAGE_##lname##_MAPCOUNT_VALUE; \ + return PageType(page, PG_##lname); \ } \ static __always_inline void __SetPage##uname(struct page *page) \ { \ - VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page); \ - atomic_set(&page->_mapcount, PAGE_##lname##_MAPCOUNT_VALUE); \ + VM_BUG_ON_PAGE(!PageType(page, 0), page); \ + page->page_type &= ~PG_##lname; \ } \ static __always_inline void __ClearPage##uname(struct page *page) \ { \ VM_BUG_ON_PAGE(!Page##uname(page), page); \ - atomic_set(&page->_mapcount, -1); \ + page->page_type |= PG_##lname; \ } /* - * PageBuddy() indicate that the page is free and in the buddy system + * PageBuddy() indicates that the page is free and in the buddy system * (see mm/page_alloc.c). */ -#define PAGE_BUDDY_MAPCOUNT_VALUE (-128) -PAGE_MAPCOUNT_OPS(Buddy, BUDDY) +PAGE_TYPE_OPS(Buddy, buddy) /* - * PageBalloon() is set on pages that are on the balloon page list + * PageBalloon() is true for pages that are on the balloon page list * (see mm/balloon_compaction.c). */ -#define PAGE_BALLOON_MAPCOUNT_VALUE (-256) -PAGE_MAPCOUNT_OPS(Balloon, BALLOON) +PAGE_TYPE_OPS(Balloon, balloon) /* * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on * pages allocated with __GFP_ACCOUNT. It gets cleared on page free. */ -#define PAGE_KMEMCG_MAPCOUNT_VALUE (-512) -PAGE_MAPCOUNT_OPS(Kmemcg, KMEMCG) +PAGE_TYPE_OPS(Kmemcg, kmemcg) extern bool is_free_buddy_page(struct page *page); diff --git a/kernel/crash_core.c b/kernel/crash_core.c index f7674d676889..b66aced5e8c2 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -460,6 +460,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PG_hwpoison); #endif VMCOREINFO_NUMBER(PG_head_mask); +#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy) VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); #ifdef CONFIG_HUGETLB_PAGE VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 96d48d24e1ef..5afdf495c374 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -705,16 +705,14 @@ static inline void rmv_page_order(struct page *page) /* * This function checks whether a page is free && is the buddy - * we can do coalesce a page and its buddy if + * we can coalesce a page and its buddy if * (a) the buddy is not in a hole (check before calling!) && * (b) the buddy is in the buddy system && * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. * - * For recording whether a page is in the buddy system, we set ->_mapcount - * PAGE_BUDDY_MAPCOUNT_VALUE. - * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is - * serialized by zone->lock. + * For recording whether a page is in the buddy system, we set PageBuddy. + * Setting, clearing, and testing PageBuddy is serialized by zone->lock. * * For recording page's order, we use page_private(page). */ @@ -759,9 +757,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, * as necessary, plus some accounting needed to play nicely with other * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous - * free pages of length of (1 << order) and marked with _mapcount - * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page) - * field. + * free pages of length of (1 << order) and marked with PageBuddy. + * Page's order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were * free, the remainder of the region must be split into blocks. diff --git a/scripts/tags.sh b/scripts/tags.sh index e587610d1492..66f08bb1cce9 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -179,9 +179,9 @@ regex_c=( '/\ Date: Thu, 7 Jun 2018 17:08:26 -0700 Subject: mm: switch s_mem and slab_cache in struct page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will allow us to store slub's counters in the same bits as slab's s_mem. slub now needs to set page->mapping to NULL as it frees the page, just like slab does. Link: http://lkml.kernel.org/r/20180518194519.3820-5-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Christoph Lameter Acked-by: Vlastimil Babka Cc: Dave Hansen Cc: Jérôme Glisse Cc: "Kirill A . Shutemov" Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 4 ++-- mm/slub.c | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3a554fdf45c2..6f153f2fab50 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -83,7 +83,7 @@ struct page { /* See page-flags.h for the definition of PAGE_MAPPING_FLAGS */ struct address_space *mapping; - void *s_mem; /* slab first object */ + struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ atomic_t compound_mapcount; /* first tail page */ /* page_deferred_list().next -- second tail page */ }; @@ -194,7 +194,7 @@ struct page { spinlock_t ptl; #endif #endif - struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ + void *s_mem; /* slab first object */ }; #ifdef CONFIG_MEMCG diff --git a/mm/slub.c b/mm/slub.c index 48f75872c356..0170ea8a97fe 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1695,6 +1695,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlab(page); page_mapcount_reset(page); + page->mapping = NULL; if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; memcg_uncharge_slab(page, order, s); -- cgit v1.2.3-59-g8ed1b From 7d27a04bb2b5bd665f147439d18ae236080eef32 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:31 -0700 Subject: mm: move 'private' union within struct page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit By moving page->private to the fourth word of struct page, we can put the SLUB counters in the same word as SLAB's s_mem and still do the cmpxchg_double trick. Now the SLUB counters no longer overlap with the mapcount or refcount so we can drop the call to page_mapcount_reset() and simplify set_page_slub_counters() to a single line. Link: http://lkml.kernel.org/r/20180518194519.3820-6-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Vlastimil Babka Acked-by: Kirill A. Shutemov Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 56 +++++++++++++++++++++--------------------------- mm/slub.c | 20 ++--------------- 2 files changed, 27 insertions(+), 49 deletions(-) (limited to 'mm') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6f153f2fab50..bcc5ee8b7b07 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -65,15 +65,9 @@ struct hmm; */ #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE #define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) -#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) -#define _slub_counter_t unsigned long #else -#define _slub_counter_t unsigned int -#endif -#else /* !CONFIG_HAVE_ALIGNED_STRUCT_PAGE */ #define _struct_page_alignment -#define _slub_counter_t unsigned int -#endif /* !CONFIG_HAVE_ALIGNED_STRUCT_PAGE */ +#endif struct page { /* First double word block */ @@ -95,6 +89,30 @@ struct page { /* page_deferred_list().prev -- second tail page */ }; + union { + /* + * Mapping-private opaque data: + * Usually used for buffer_heads if PagePrivate + * Used for swp_entry_t if PageSwapCache + * Indicates order in the buddy system if PageBuddy + */ + unsigned long private; +#if USE_SPLIT_PTE_PTLOCKS +#if ALLOC_SPLIT_PTLOCKS + spinlock_t *ptl; +#else + spinlock_t ptl; +#endif +#endif + void *s_mem; /* slab first object */ + unsigned long counters; /* SLUB */ + struct { /* SLUB */ + unsigned inuse:16; + unsigned objects:15; + unsigned frozen:1; + }; + }; + union { /* * If the page is neither PageSlab nor mappable to userspace, @@ -104,13 +122,7 @@ struct page { */ unsigned int page_type; - _slub_counter_t counters; unsigned int active; /* SLAB */ - struct { /* SLUB */ - unsigned inuse:16; - unsigned objects:15; - unsigned frozen:1; - }; int units; /* SLOB */ struct { /* Page cache */ @@ -179,24 +191,6 @@ struct page { #endif }; - union { - /* - * Mapping-private opaque data: - * Usually used for buffer_heads if PagePrivate - * Used for swp_entry_t if PageSwapCache - * Indicates order in the buddy system if PageBuddy - */ - unsigned long private; -#if USE_SPLIT_PTE_PTLOCKS -#if ALLOC_SPLIT_PTLOCKS - spinlock_t *ptl; -#else - spinlock_t ptl; -#endif -#endif - void *s_mem; /* slab first object */ - }; - #ifdef CONFIG_MEMCG struct mem_cgroup *mem_cgroup; #endif diff --git a/mm/slub.c b/mm/slub.c index 0170ea8a97fe..f5db87839ab4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -356,21 +356,6 @@ static __always_inline void slab_unlock(struct page *page) __bit_spin_unlock(PG_locked, &page->flags); } -static inline void set_page_slub_counters(struct page *page, unsigned long counters_new) -{ - struct page tmp; - tmp.counters = counters_new; - /* - * page->counters can cover frozen/inuse/objects as well - * as page->_refcount. If we assign to ->counters directly - * we run the risk of losing updates to page->_refcount, so - * be careful and only assign to the fields we need. - */ - page->frozen = tmp.frozen; - page->inuse = tmp.inuse; - page->objects = tmp.objects; -} - /* Interrupts must be disabled (for the fallback code to work right) */ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, void *freelist_old, unsigned long counters_old, @@ -392,7 +377,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page if (page->freelist == freelist_old && page->counters == counters_old) { page->freelist = freelist_new; - set_page_slub_counters(page, counters_new); + page->counters = counters_new; slab_unlock(page); return true; } @@ -431,7 +416,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, if (page->freelist == freelist_old && page->counters == counters_old) { page->freelist = freelist_new; - set_page_slub_counters(page, counters_new); + page->counters = counters_new; slab_unlock(page); local_irq_restore(flags); return true; @@ -1694,7 +1679,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); - page_mapcount_reset(page); page->mapping = NULL; if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; -- cgit v1.2.3-59-g8ed1b From fa3015b7eed5da68a40c7667e8b3d5c20ef56e82 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:42 -0700 Subject: mm: use page->deferred_list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that we can represent the location of 'deferred_list' in C instead of comments, make use of that ability. Link: http://lkml.kernel.org/r/20180518194519.3820-9-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Vlastimil Babka Acked-by: Kirill A. Shutemov Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 7 ++----- mm/page_alloc.c | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ac5591d8622c..ba8fdc0b6e7f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -483,11 +483,8 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) static inline struct list_head *page_deferred_list(struct page *page) { - /* - * ->lru in the tail pages is occupied by compound_head. - * Let's use ->mapping + ->index in the second tail page as list_head. - */ - return (struct list_head *)&page[2].mapping; + /* ->lru in the tail pages is occupied by compound_head. */ + return &page[2].deferred_list; } void prep_transhuge_page(struct page *page) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5afdf495c374..628fd3e24731 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -952,7 +952,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) case 2: /* * the second tail page: ->mapping is - * page_deferred_list().next -- ignore value. + * deferred_list.next -- ignore value. */ break; default: -- cgit v1.2.3-59-g8ed1b From b7ccc7f8c643de0b71cd2da3245d0c27038a8dd7 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:46 -0700 Subject: mm: move lru union within struct page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since the LRU is two words, this does not affect the double-word alignment of SLUB's freelist. Link: http://lkml.kernel.org/r/20180518194519.3820-10-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Vlastimil Babka Acked-by: Kirill A. Shutemov Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 102 +++++++++++++++++++++++------------------------ mm/slub.c | 8 ++-- 2 files changed, 55 insertions(+), 55 deletions(-) (limited to 'mm') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 158f9693f865..5b97bd445ada 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -72,6 +72,57 @@ struct hmm; struct page { unsigned long flags; /* Atomic flags, some possibly * updated asynchronously */ + /* + * WARNING: bit 0 of the first word encode PageTail(). That means + * the rest users of the storage space MUST NOT use the bit to + * avoid collision and false-positive PageTail(). + */ + union { + struct list_head lru; /* Pageout list, eg. active_list + * protected by zone_lru_lock ! + * Can be used as a generic list + * by the page owner. + */ + struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an + * lru or handled by a slab + * allocator, this points to the + * hosting device page map. + */ + struct { /* slub per cpu partial pages */ + struct page *next; /* Next partial slab */ +#ifdef CONFIG_64BIT + int pages; /* Nr of partial slabs left */ + int pobjects; /* Approximate # of objects */ +#else + short int pages; + short int pobjects; +#endif + }; + + struct rcu_head rcu_head; /* Used by SLAB + * when destroying via RCU + */ + /* Tail pages of compound page */ + struct { + unsigned long compound_head; /* If bit zero is set */ + + /* First tail page only */ + unsigned char compound_dtor; + unsigned char compound_order; + /* two/six bytes available here */ + }; + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS + struct { + unsigned long __pad; /* do not overlay pmd_huge_pte + * with compound_head to avoid + * possible bit 0 collision. + */ + pgtable_t pmd_huge_pte; /* protected by page->ptl */ + }; +#endif + }; + /* Three words (12/24 bytes) are available in this union. */ union { struct { /* Page cache and anonymous pages */ @@ -135,57 +186,6 @@ struct page { /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */ atomic_t _refcount; - /* - * WARNING: bit 0 of the first word encode PageTail(). That means - * the rest users of the storage space MUST NOT use the bit to - * avoid collision and false-positive PageTail(). - */ - union { - struct list_head lru; /* Pageout list, eg. active_list - * protected by zone_lru_lock ! - * Can be used as a generic list - * by the page owner. - */ - struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an - * lru or handled by a slab - * allocator, this points to the - * hosting device page map. - */ - struct { /* slub per cpu partial pages */ - struct page *next; /* Next partial slab */ -#ifdef CONFIG_64BIT - int pages; /* Nr of partial slabs left */ - int pobjects; /* Approximate # of objects */ -#else - short int pages; - short int pobjects; -#endif - }; - - struct rcu_head rcu_head; /* Used by SLAB - * when destroying via RCU - */ - /* Tail pages of compound page */ - struct { - unsigned long compound_head; /* If bit zero is set */ - - /* First tail page only */ - unsigned char compound_dtor; - unsigned char compound_order; - /* two/six bytes available here */ - }; - -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS - struct { - unsigned long __pad; /* do not overlay pmd_huge_pte - * with compound_head to avoid - * possible bit 0 collision. - */ - pgtable_t pmd_huge_pte; /* protected by page->ptl */ - }; -#endif - }; - #ifdef CONFIG_MEMCG struct mem_cgroup *mem_cgroup; #endif diff --git a/mm/slub.c b/mm/slub.c index f5db87839ab4..a96bf429af08 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -52,11 +52,11 @@ * and to synchronize major metadata changes to slab cache structures. * * The slab_lock is only used for debugging and on arches that do not - * have the ability to do a cmpxchg_double. It only protects the second - * double word in the page struct. Meaning + * have the ability to do a cmpxchg_double. It only protects: * A. page->freelist -> List of object free in a page - * B. page->counters -> Counters of objects - * C. page->frozen -> frozen state + * B. page->inuse -> Number of objects in use + * C. page->objects -> Number of objects in page + * D. page->frozen -> frozen state * * If a slab is frozen then it is exempt from list management. It is not * on any list. The processor that froze the slab is the one who can -- cgit v1.2.3-59-g8ed1b From 4da1984edbbed0ffc961006d265d78a7b0095ea4 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:50 -0700 Subject: mm: combine LRU and main union in struct page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This gives us five words of space in a single union in struct page. The compound_mapcount moves position (from offset 24 to offset 20) on 64-bit systems, but that does not seem likely to cause any trouble. Link: http://lkml.kernel.org/r/20180518194519.3820-11-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Vlastimil Babka Acked-by: Kirill A. Shutemov Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 97 +++++++++++++++++++++++------------------------- mm/page_alloc.c | 2 +- 2 files changed, 47 insertions(+), 52 deletions(-) (limited to 'mm') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5b97bd445ada..dfe6d648e055 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -73,59 +73,19 @@ struct page { unsigned long flags; /* Atomic flags, some possibly * updated asynchronously */ /* - * WARNING: bit 0 of the first word encode PageTail(). That means - * the rest users of the storage space MUST NOT use the bit to + * Five words (20/40 bytes) are available in this union. + * WARNING: bit 0 of the first word is used for PageTail(). That + * means the other users of this union MUST NOT use the bit to * avoid collision and false-positive PageTail(). */ - union { - struct list_head lru; /* Pageout list, eg. active_list - * protected by zone_lru_lock ! - * Can be used as a generic list - * by the page owner. - */ - struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an - * lru or handled by a slab - * allocator, this points to the - * hosting device page map. - */ - struct { /* slub per cpu partial pages */ - struct page *next; /* Next partial slab */ -#ifdef CONFIG_64BIT - int pages; /* Nr of partial slabs left */ - int pobjects; /* Approximate # of objects */ -#else - short int pages; - short int pobjects; -#endif - }; - - struct rcu_head rcu_head; /* Used by SLAB - * when destroying via RCU - */ - /* Tail pages of compound page */ - struct { - unsigned long compound_head; /* If bit zero is set */ - - /* First tail page only */ - unsigned char compound_dtor; - unsigned char compound_order; - /* two/six bytes available here */ - }; - -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS - struct { - unsigned long __pad; /* do not overlay pmd_huge_pte - * with compound_head to avoid - * possible bit 0 collision. - */ - pgtable_t pmd_huge_pte; /* protected by page->ptl */ - }; -#endif - }; - - /* Three words (12/24 bytes) are available in this union. */ union { struct { /* Page cache and anonymous pages */ + /** + * @lru: Pageout list, eg. active_list protected by + * zone_lru_lock. Sometimes used as a generic list + * by the page owner. + */ + struct list_head lru; /* See page-flags.h for PAGE_MAPPING_FLAGS */ struct address_space *mapping; pgoff_t index; /* Our offset within mapping. */ @@ -138,6 +98,19 @@ struct page { unsigned long private; }; struct { /* slab, slob and slub */ + union { + struct list_head slab_list; /* uses lru */ + struct { /* Partial pages */ + struct page *next; +#ifdef CONFIG_64BIT + int pages; /* Nr of pages left */ + int pobjects; /* Approximate count */ +#else + short int pages; + short int pobjects; +#endif + }; + }; struct kmem_cache *slab_cache; /* not slob */ /* Double-word boundary */ void *freelist; /* first free object */ @@ -151,9 +124,22 @@ struct page { }; }; }; - atomic_t compound_mapcount; /* first tail page */ - struct list_head deferred_list; /* second tail page */ + struct { /* Tail pages of compound page */ + unsigned long compound_head; /* Bit zero is set */ + + /* First tail page only */ + unsigned char compound_dtor; + unsigned char compound_order; + atomic_t compound_mapcount; + }; + struct { /* Second tail page of compound page */ + unsigned long _compound_pad_1; /* compound_head */ + unsigned long _compound_pad_2; + struct list_head deferred_list; + }; struct { /* Page table pages */ + unsigned long _pt_pad_1; /* compound_head */ + pgtable_t pmd_huge_pte; /* protected by page->ptl */ unsigned long _pt_pad_2; /* mapping */ unsigned long _pt_pad_3; #if ALLOC_SPLIT_PTLOCKS @@ -162,6 +148,15 @@ struct page { spinlock_t ptl; #endif }; + + /** @rcu_head: You can use this to free a page by RCU. */ + struct rcu_head rcu_head; + + /** + * @pgmap: For ZONE_DEVICE pages, this points to the hosting + * device page map. + */ + struct dev_pagemap *pgmap; }; union { /* This union is 4 bytes in size. */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 628fd3e24731..ef1811531999 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -943,7 +943,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) } switch (page - head_page) { case 1: - /* the first tail page: ->mapping is compound_mapcount() */ + /* the first tail page: ->mapping may be compound_mapcount() */ if (unlikely(compound_mapcount(page))) { bad_page(page, "nonzero compound_mapcount", 0); goto out; -- cgit v1.2.3-59-g8ed1b From bf68c214df6d73e85fc19b6bbcb642554b491b98 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:09:05 -0700 Subject: slab,slub: remove rcu_head size checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rcu_head may now grow larger than list_head without affecting slab or slub. Link: http://lkml.kernel.org/r/20180518194519.3820-15-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Christoph Lameter Acked-by: Vlastimil Babka Cc: Dave Hansen Cc: Jérôme Glisse Cc: "Kirill A . Shutemov" Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 -- mm/slub.c | 27 ++------------------------- 2 files changed, 2 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index c1fe8099b3cd..36688f6c87eb 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1235,8 +1235,6 @@ void __init kmem_cache_init(void) { int i; - BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) < - sizeof(struct rcu_head)); kmem_cache = &kmem_cache_boot; if (!IS_ENABLED(CONFIG_NUMA) || num_possible_nodes() == 1) diff --git a/mm/slub.c b/mm/slub.c index a96bf429af08..d5bddf0f4792 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1686,17 +1686,9 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __free_pages(page, order); } -#define need_reserve_slab_rcu \ - (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head)) - static void rcu_free_slab(struct rcu_head *h) { - struct page *page; - - if (need_reserve_slab_rcu) - page = virt_to_head_page(h); - else - page = container_of((struct list_head *)h, struct page, lru); + struct page *page = container_of(h, struct page, rcu_head); __free_slab(page->slab_cache, page); } @@ -1704,19 +1696,7 @@ static void rcu_free_slab(struct rcu_head *h) static void free_slab(struct kmem_cache *s, struct page *page) { if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) { - struct rcu_head *head; - - if (need_reserve_slab_rcu) { - int order = compound_order(page); - int offset = (PAGE_SIZE << order) - s->reserved; - - VM_BUG_ON(s->reserved != sizeof(*head)); - head = page_address(page) + offset; - } else { - head = &page->rcu_head; - } - - call_rcu(head, rcu_free_slab); + call_rcu(&page->rcu_head, rcu_free_slab); } else __free_slab(s, page); } @@ -3583,9 +3563,6 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) s->random = get_random_long(); #endif - if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU)) - s->reserved = sizeof(struct rcu_head); - if (!calculate_sizes(s, -1)) goto error; if (disable_higher_order_debug) { -- cgit v1.2.3-59-g8ed1b From 9736d2a95e36ac3f60b063f498961103f3d4f165 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:09:10 -0700 Subject: slub: remove kmem_cache->reserved MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reserved field was only used for embedding an rcu_head in the data structure. With the previous commit, we no longer need it. That lets us remove the 'reserved' argument to a lot of functions. Link: http://lkml.kernel.org/r/20180518194519.3820-16-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: "Kirill A . Shutemov" Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 1 - mm/slub.c | 41 ++++++++++++++++++++--------------------- 2 files changed, 20 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 3773e26c08c1..09fa2c6f0e68 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -101,7 +101,6 @@ struct kmem_cache { void (*ctor)(void *); unsigned int inuse; /* Offset to metadata */ unsigned int align; /* Alignment */ - unsigned int reserved; /* Reserved bytes at the end of slabs */ unsigned int red_left_pad; /* Left redzone padding size */ const char *name; /* Name (only for display!) */ struct list_head list; /* List of slab caches */ diff --git a/mm/slub.c b/mm/slub.c index d5bddf0f4792..f885dcf09750 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -316,16 +316,16 @@ static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr) return (p - addr) / s->size; } -static inline unsigned int order_objects(unsigned int order, unsigned int size, unsigned int reserved) +static inline unsigned int order_objects(unsigned int order, unsigned int size) { - return (((unsigned int)PAGE_SIZE << order) - reserved) / size; + return ((unsigned int)PAGE_SIZE << order) / size; } static inline struct kmem_cache_order_objects oo_make(unsigned int order, - unsigned int size, unsigned int reserved) + unsigned int size) { struct kmem_cache_order_objects x = { - (order << OO_SHIFT) + order_objects(order, size, reserved) + (order << OO_SHIFT) + order_objects(order, size) }; return x; @@ -832,7 +832,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) return 1; start = page_address(page); - length = (PAGE_SIZE << compound_order(page)) - s->reserved; + length = PAGE_SIZE << compound_order(page); end = start + length; remainder = length % s->size; if (!remainder) @@ -921,7 +921,7 @@ static int check_slab(struct kmem_cache *s, struct page *page) return 0; } - maxobj = order_objects(compound_order(page), s->size, s->reserved); + maxobj = order_objects(compound_order(page), s->size); if (page->objects > maxobj) { slab_err(s, page, "objects %u > max %u", page->objects, maxobj); @@ -971,7 +971,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) nr++; } - max_objects = order_objects(compound_order(page), s->size, s->reserved); + max_objects = order_objects(compound_order(page), s->size); if (max_objects > MAX_OBJS_PER_PAGE) max_objects = MAX_OBJS_PER_PAGE; @@ -3193,21 +3193,21 @@ static unsigned int slub_min_objects; */ static inline unsigned int slab_order(unsigned int size, unsigned int min_objects, unsigned int max_order, - unsigned int fract_leftover, unsigned int reserved) + unsigned int fract_leftover) { unsigned int min_order = slub_min_order; unsigned int order; - if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) + if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE) return get_order(size * MAX_OBJS_PER_PAGE) - 1; - for (order = max(min_order, (unsigned int)get_order(min_objects * size + reserved)); + for (order = max(min_order, (unsigned int)get_order(min_objects * size)); order <= max_order; order++) { unsigned int slab_size = (unsigned int)PAGE_SIZE << order; unsigned int rem; - rem = (slab_size - reserved) % size; + rem = slab_size % size; if (rem <= slab_size / fract_leftover) break; @@ -3216,7 +3216,7 @@ static inline unsigned int slab_order(unsigned int size, return order; } -static inline int calculate_order(unsigned int size, unsigned int reserved) +static inline int calculate_order(unsigned int size) { unsigned int order; unsigned int min_objects; @@ -3233,7 +3233,7 @@ static inline int calculate_order(unsigned int size, unsigned int reserved) min_objects = slub_min_objects; if (!min_objects) min_objects = 4 * (fls(nr_cpu_ids) + 1); - max_objects = order_objects(slub_max_order, size, reserved); + max_objects = order_objects(slub_max_order, size); min_objects = min(min_objects, max_objects); while (min_objects > 1) { @@ -3242,7 +3242,7 @@ static inline int calculate_order(unsigned int size, unsigned int reserved) fraction = 16; while (fraction >= 4) { order = slab_order(size, min_objects, - slub_max_order, fraction, reserved); + slub_max_order, fraction); if (order <= slub_max_order) return order; fraction /= 2; @@ -3254,14 +3254,14 @@ static inline int calculate_order(unsigned int size, unsigned int reserved) * We were unable to place multiple objects in a slab. Now * lets see if we can place a single object there. */ - order = slab_order(size, 1, slub_max_order, 1, reserved); + order = slab_order(size, 1, slub_max_order, 1); if (order <= slub_max_order) return order; /* * Doh this slab cannot be placed using slub_max_order. */ - order = slab_order(size, 1, MAX_ORDER, 1, reserved); + order = slab_order(size, 1, MAX_ORDER, 1); if (order < MAX_ORDER) return order; return -ENOSYS; @@ -3529,7 +3529,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) if (forced_order >= 0) order = forced_order; else - order = calculate_order(size, s->reserved); + order = calculate_order(size); if ((int)order < 0) return 0; @@ -3547,8 +3547,8 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) /* * Determine the number of objects per slab */ - s->oo = oo_make(order, size, s->reserved); - s->min = oo_make(get_order(size), size, s->reserved); + s->oo = oo_make(order, size); + s->min = oo_make(get_order(size), size); if (oo_objects(s->oo) > oo_objects(s->max)) s->max = s->oo; @@ -3558,7 +3558,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) { s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); - s->reserved = 0; #ifdef CONFIG_SLAB_FREELIST_HARDENED s->random = get_random_long(); #endif @@ -5077,7 +5076,7 @@ SLAB_ATTR_RO(destroy_by_rcu); static ssize_t reserved_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%u\n", s->reserved); + return sprintf(buf, "0\n"); } SLAB_ATTR_RO(reserved); -- cgit v1.2.3-59-g8ed1b From 325d7d4a968669bec18c054db6b5de5f4fc82874 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:09:14 -0700 Subject: slub: remove 'reserved' file from sysfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Christoph doubts anyone was using the 'reserved' file in sysfs, so remove it. Link: http://lkml.kernel.org/r/20180518194519.3820-17-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: "Kirill A . Shutemov" Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index f885dcf09750..15505479c3ab 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5074,12 +5074,6 @@ static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(destroy_by_rcu); -static ssize_t reserved_show(struct kmem_cache *s, char *buf) -{ - return sprintf(buf, "0\n"); -} -SLAB_ATTR_RO(reserved); - #ifdef CONFIG_SLUB_DEBUG static ssize_t slabs_show(struct kmem_cache *s, char *buf) { @@ -5392,7 +5386,6 @@ static struct attribute *slab_attrs[] = { &reclaim_account_attr.attr, &destroy_by_rcu_attr.attr, &shrink_attr.attr, - &reserved_attr.attr, &slabs_cpu_partial_attr.attr, #ifdef CONFIG_SLUB_DEBUG &total_objects_attr.attr, -- cgit v1.2.3-59-g8ed1b From 20acce679910fb6d30a89fa02586a3b4a134dfeb Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Thu, 7 Jun 2018 17:09:17 -0700 Subject: mm/shmem.c: use new return type vm_fault_t Use new return type vm_fault_t for fault handler. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. See commit 1c8f422059ae ("mm: change return type to vm_fault_t") vmf_error() is the newly introduce inline function in 4.17-rc6. Link: http://lkml.kernel.org/r/20180521202410.GA17912@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Reviewed-by: Matthew Wilcox Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 9a1b553e30e0..2b686e3f53ad 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1947,14 +1947,14 @@ static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, in return ret; } -static int shmem_fault(struct vm_fault *vmf) +static vm_fault_t shmem_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); enum sgp_type sgp; - int error; - int ret = VM_FAULT_LOCKED; + int err; + vm_fault_t ret = VM_FAULT_LOCKED; /* * Trinity finds that probing a hole which tmpfs is punching can @@ -2022,10 +2022,10 @@ static int shmem_fault(struct vm_fault *vmf) else if (vma->vm_flags & VM_HUGEPAGE) sgp = SGP_HUGE; - error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, + err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, gfp, vma, vmf, &ret); - if (error) - return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); + if (err) + return vmf_error(err); return ret; } -- cgit v1.2.3-59-g8ed1b From be09102b4190561b67e3809b07a7fd29c9774152 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 Jun 2018 17:09:21 -0700 Subject: mm: memcg: allow lowering memory.swap.max below the current usage Currently an attempt to set swap.max into a value lower than the actual swap usage fails, which causes configuration problems as there's no way of lowering the configuration below the current usage short of turning off swap entirely. This makes swap.max difficult to use and allows delegatees to lock the delegator out of reducing swap allocation. This patch updates swap_max_write() so that the limit can be lowered below the current usage. It doesn't implement active reclaiming of swap entries for the following reasons. * mem_cgroup_swap_full() already tells the swap machinary to aggressively reclaim swap entries if the usage is above 50% of limit, so simply lowering the limit automatically triggers gradual reclaim. * Forcing back swapped out pages is likely to heavily impact the workload and mess up the working set. Given that swap usually is a lot less valuable and less scarce, letting the existing usage dissipate over time through the above gradual reclaim and as they're falted back in is likely the better behavior. Link: http://lkml.kernel.org/r/20180523185041.GR1718769@devbig577.frc2.facebook.com Signed-off-by: Tejun Heo Acked-by: Roman Gushchin Acked-by: Rik van Riel Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 5 +++++ mm/memcontrol.c | 6 +----- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index e34d3c938729..8a2c52d5c53b 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1245,6 +1245,11 @@ PAGE_SIZE multiple when read back. because of running out of swap system-wide or max limit. + When reduced under the current usage, the existing swap + entries are reclaimed gradually and the swap usage may stay + higher than the limit for an extended period of time. This + reduces the impact on the workload and memory management. + Usage Guidelines ~~~~~~~~~~~~~~~~ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e3d56927a724..c1e64d60ed02 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6280,11 +6280,7 @@ static ssize_t swap_max_write(struct kernfs_open_file *of, if (err) return err; - mutex_lock(&memcg_max_mutex); - err = page_counter_set_max(&memcg->swap, max); - mutex_unlock(&memcg_max_mutex); - if (err) - return err; + xchg(&memcg->swap.max, max); return nbytes; } -- cgit v1.2.3-59-g8ed1b From df2cc96e77011cf7989208b206da9817e0321028 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 7 Jun 2018 17:09:25 -0700 Subject: userfaultfd: prevent non-cooperative events vs mcopy_atomic races If a process monitored with userfaultfd changes it's memory mappings or forks() at the same time as uffd monitor fills the process memory with UFFDIO_COPY, the actual creation of page table entries and copying of the data in mcopy_atomic may happen either before of after the memory mapping modifications and there is no way for the uffd monitor to maintain consistent view of the process memory layout. For instance, let's consider fork() running in parallel with userfaultfd_copy(): process | uffd monitor ---------------------------------+------------------------------ fork() | userfaultfd_copy() ... | ... dup_mmap() | down_read(mmap_sem) down_write(mmap_sem) | /* create PTEs, copy data */ dup_uffd() | up_read(mmap_sem) copy_page_range() | up_write(mmap_sem) | dup_uffd_complete() | /* notify monitor */ | If the userfaultfd_copy() takes the mmap_sem first, the new page(s) will be present by the time copy_page_range() is called and they will appear in the child's memory mappings. However, if the fork() is the first to take the mmap_sem, the new pages won't be mapped in the child's address space. If the pages are not present and child tries to access them, the monitor will get page fault notification and everything is fine. However, if the pages *are present*, the child can access them without uffd noticing. And if we copy them into child it'll see the wrong data. Since we are talking about background copy, we'd need to decide whether the pages should be copied or not regardless #PF notifications. Since userfaultfd monitor has no way to determine what was the order, let's disallow userfaultfd_copy in parallel with the non-cooperative events. In such case we return -EAGAIN and the uffd monitor can understand that userfaultfd_copy() clashed with a non-cooperative event and take an appropriate action. Link: http://lkml.kernel.org/r/1527061324-19949-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Acked-by: Pavel Emelyanov Cc: Andrea Arcangeli Cc: Mike Kravetz Cc: Andrei Vagin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 22 ++++++++++++++++++++-- include/linux/userfaultfd_k.h | 6 ++++-- mm/userfaultfd.c | 22 +++++++++++++++++----- 3 files changed, 41 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index cec550c8468f..123bf7d516fc 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -62,6 +62,8 @@ struct userfaultfd_ctx { enum userfaultfd_state state; /* released */ bool released; + /* memory mappings are changing because of non-cooperative event */ + bool mmap_changing; /* mm with one ore more vmas attached to this userfaultfd_ctx */ struct mm_struct *mm; }; @@ -641,6 +643,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, * already released. */ out: + WRITE_ONCE(ctx->mmap_changing, false); userfaultfd_ctx_put(ctx); } @@ -686,10 +689,12 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) ctx->state = UFFD_STATE_RUNNING; ctx->features = octx->features; ctx->released = false; + ctx->mmap_changing = false; ctx->mm = vma->vm_mm; mmgrab(ctx->mm); userfaultfd_ctx_get(octx); + WRITE_ONCE(octx->mmap_changing, true); fctx->orig = octx; fctx->new = ctx; list_add_tail(&fctx->list, fcs); @@ -732,6 +737,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) { vm_ctx->ctx = ctx; userfaultfd_ctx_get(ctx); + WRITE_ONCE(ctx->mmap_changing, true); } } @@ -772,6 +778,7 @@ bool userfaultfd_remove(struct vm_area_struct *vma, return true; userfaultfd_ctx_get(ctx); + WRITE_ONCE(ctx->mmap_changing, true); up_read(&mm->mmap_sem); msg_init(&ewq.msg); @@ -815,6 +822,7 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, return -ENOMEM; userfaultfd_ctx_get(ctx); + WRITE_ONCE(ctx->mmap_changing, true); unmap_ctx->ctx = ctx; unmap_ctx->start = start; unmap_ctx->end = end; @@ -1653,6 +1661,10 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, user_uffdio_copy = (struct uffdio_copy __user *) arg; + ret = -EAGAIN; + if (READ_ONCE(ctx->mmap_changing)) + goto out; + ret = -EFAULT; if (copy_from_user(&uffdio_copy, user_uffdio_copy, /* don't copy "copy" last field */ @@ -1674,7 +1686,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, goto out; if (mmget_not_zero(ctx->mm)) { ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, - uffdio_copy.len); + uffdio_copy.len, &ctx->mmap_changing); mmput(ctx->mm); } else { return -ESRCH; @@ -1705,6 +1717,10 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; + ret = -EAGAIN; + if (READ_ONCE(ctx->mmap_changing)) + goto out; + ret = -EFAULT; if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage, /* don't copy "zeropage" last field */ @@ -1721,7 +1737,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, if (mmget_not_zero(ctx->mm)) { ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, - uffdio_zeropage.range.len); + uffdio_zeropage.range.len, + &ctx->mmap_changing); mmput(ctx->mm); } else { return -ESRCH; @@ -1900,6 +1917,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) ctx->features = 0; ctx->state = UFFD_STATE_WAIT_API; ctx->released = false; + ctx->mmap_changing = false; ctx->mm = current->mm; /* prevent the mm struct to be freed */ mmgrab(ctx->mm); diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index f2f3b68ba910..e091f0a11b11 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -31,10 +31,12 @@ extern int handle_userfault(struct vm_fault *vmf, unsigned long reason); extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long src_start, unsigned long len); + unsigned long src_start, unsigned long len, + bool *mmap_changing); extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long len); + unsigned long len, + bool *mmap_changing); /* mm helpers */ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 39791b81ede7..5029f241908f 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -404,7 +404,8 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool zeropage) + bool zeropage, + bool *mmap_changing) { struct vm_area_struct *dst_vma; ssize_t err; @@ -430,6 +431,15 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, retry: down_read(&dst_mm->mmap_sem); + /* + * If memory mappings are changing because of non-cooperative + * operation (e.g. mremap) running in parallel, bail out and + * request the user to retry later + */ + err = -EAGAIN; + if (mmap_changing && READ_ONCE(*mmap_changing)) + goto out_unlock; + /* * Make sure the vma is not shared, that the dst range is * both valid and fully within a single existing vma. @@ -563,13 +573,15 @@ out: } ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long src_start, unsigned long len) + unsigned long src_start, unsigned long len, + bool *mmap_changing) { - return __mcopy_atomic(dst_mm, dst_start, src_start, len, false); + return __mcopy_atomic(dst_mm, dst_start, src_start, len, false, + mmap_changing); } ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, - unsigned long len) + unsigned long len, bool *mmap_changing) { - return __mcopy_atomic(dst_mm, start, 0, len, true); + return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing); } -- cgit v1.2.3-59-g8ed1b From 7810e6781e0fcbca78b91cf65053f895bf59e85f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 7 Jun 2018 17:09:29 -0700 Subject: mm, page_alloc: do not break __GFP_THISNODE by zonelist reset In __alloc_pages_slowpath() we reset zonelist and preferred_zoneref for allocations that can ignore memory policies. The zonelist is obtained from current CPU's node. This is a problem for __GFP_THISNODE allocations that want to allocate on a different node, e.g. because the allocating thread has been migrated to a different CPU. This has been observed to break SLAB in our 4.4-based kernel, because there it relies on __GFP_THISNODE working as intended. If a slab page is put on wrong node's list, then further list manipulations may corrupt the list because page_to_nid() is used to determine which node's list_lock should be locked and thus we may take a wrong lock and race. Current SLAB implementation seems to be immune by luck thanks to commit 511e3a058812 ("mm/slab: make cache_grow() handle the page allocated on arbitrary node") but there may be others assuming that __GFP_THISNODE works as promised. We can fix it by simply removing the zonelist reset completely. There is actually no reason to reset it, because memory policies and cpusets don't affect the zonelist choice in the first place. This was different when commit 183f6371aac2 ("mm: ignore mempolicies when using ALLOC_NO_WATERMARK") introduced the code, as mempolicies provided their own restricted zonelists. We might consider this for 4.17 although I don't know if there's anything currently broken. SLAB is currently not affected, but in kernels older than 4.7 that don't yet have 511e3a058812 ("mm/slab: make cache_grow() handle the page allocated on arbitrary node") it is. That's at least 4.4 LTS. Older ones I'll have to check. So stable backports should be more important, but will have to be reviewed carefully, as the code went through many changes. BTW I think that also the ac->preferred_zoneref reset is currently useless if we don't also reset ac->nodemask from a mempolicy to NULL first (which we probably should for the OOM victims etc?), but I would leave that for a separate patch. Link: http://lkml.kernel.org/r/20180525130853.13915-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Fixes: 183f6371aac2 ("mm: ignore mempolicies when using ALLOC_NO_WATERMARK") Acked-by: Mel Gorman Cc: Michal Hocko Cc: David Rientjes Cc: Joonsoo Kim Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef1811531999..07b3c23762ad 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4169,7 +4169,6 @@ retry: * orientated. */ if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { - ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, ac->nodemask); } -- cgit v1.2.3-59-g8ed1b From daa280753cefa692607190852a98a9c06ae9ec9a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 7 Jun 2018 17:09:32 -0700 Subject: mm/shmem.c: zero out unused vma fields in shmem_pseudo_vma_init() shmem/tmpfs uses pseudo vma to allocate page with correct NUMA policy. The pseudo vma doesn't have vm_page_prot set. We are going to encode encryption KeyID in vm_page_prot. Having garbage there causes problems. Zero out all unused fields in the pseudo vma. Link: http://lkml.kernel.org/r/20180531135602.20321-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Reviewed-by: Andrew Morton Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 2b686e3f53ad..e9a7ac74823d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1420,10 +1420,9 @@ static void shmem_pseudo_vma_init(struct vm_area_struct *vma, struct shmem_inode_info *info, pgoff_t index) { /* Create a pseudo vma that just contains the policy */ - vma->vm_start = 0; + memset(vma, 0, sizeof(*vma)); /* Bias interleave by inode number to distribute better across nodes */ vma->vm_pgoff = index + info->vfs_inode.i_ino; - vma->vm_ops = NULL; vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index); } -- cgit v1.2.3-59-g8ed1b From ce91f6ee5b3bbbad8caff61b1c46d845c8db19bf Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 7 Jun 2018 17:09:40 -0700 Subject: mm: kvmalloc does not fallback to vmalloc for incompatible gfp flags kvmalloc warned about incompatible gfp_mask to catch abusers (mostly GFP_NOFS) with an intention that this will motivate authors of the code to fix those. Linus argues that this just motivates people to do even more hacks like if (gfp == GFP_KERNEL) kvmalloc else kmalloc I haven't seen this happening much (Linus pointed to bucket_lock special cases an atomic allocation but my git foo hasn't found much more) but it is true that we can grow those in future. Therefore Linus suggested to simply not fallback to vmalloc for incompatible gfp flags and rather stick with the kmalloc path. Link: http://lkml.kernel.org/r/20180601115329.27807-1-mhocko@kernel.org Signed-off-by: Michal Hocko Suggested-by: Linus Torvalds Cc: Tom Herbert Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/bucket_locks.c | 5 +---- mm/util.c | 6 ++++-- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/lib/bucket_locks.c b/lib/bucket_locks.c index 266a97c5708b..ade3ce6c4af6 100644 --- a/lib/bucket_locks.c +++ b/lib/bucket_locks.c @@ -30,10 +30,7 @@ int alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *locks_mask, } if (sizeof(spinlock_t) != 0) { - if (gfpflags_allow_blocking(gfp)) - tlocks = kvmalloc(size * sizeof(spinlock_t), gfp); - else - tlocks = kmalloc_array(size, sizeof(spinlock_t), gfp); + tlocks = kvmalloc_array(size, sizeof(spinlock_t), gfp); if (!tlocks) return -ENOMEM; for (i = 0; i < size; i++) diff --git a/mm/util.c b/mm/util.c index c2d0a7cdb189..3351659200e6 100644 --- a/mm/util.c +++ b/mm/util.c @@ -391,7 +391,8 @@ EXPORT_SYMBOL(vm_mmap); * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is * preferable to the vmalloc fallback, due to visible performance drawbacks. * - * Any use of gfp flags outside of GFP_KERNEL should be consulted with mm people. + * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not + * fall back to vmalloc. */ void *kvmalloc_node(size_t size, gfp_t flags, int node) { @@ -402,7 +403,8 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables) * so the given set of flags has to be compatible. */ - WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL); + if ((flags & GFP_KERNEL) != GFP_KERNEL) + return kmalloc_node(size, flags, node); /* * We want to attempt a large physically contiguous block first because -- cgit v1.2.3-59-g8ed1b