From fa76da461bb0be13c8339d984dcf179151167c8f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 26 Oct 2018 15:02:16 -0700 Subject: mm: /proc/pid/smaps_rollup: fix NULL pointer deref in smaps_pte_range() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Leonardo reports an apparent regression in 4.19-rc7: BUG: unable to handle kernel NULL pointer dereference at 00000000000000f0 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 3 PID: 6032 Comm: python Not tainted 4.19.0-041900rc7-lowlatency #201810071631 Hardware name: LENOVO 80UG/Toronto 4A2, BIOS 0XCN45WW 08/09/2018 RIP: 0010:smaps_pte_range+0x32d/0x540 Code: 80 00 00 00 00 74 a9 48 89 de 41 f6 40 52 40 0f 85 04 02 00 00 49 2b 30 48 c1 ee 0c 49 03 b0 98 00 00 00 49 8b 80 a0 00 00 00 <48> 8b b8 f0 00 00 00 e8 b7 ef ec ff 48 85 c0 0f 84 71 ff ff ff a8 RSP: 0018:ffffb0cbc484fb88 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 0000560ddb9e9000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000560ddb9e9 RDI: 0000000000000001 RBP: ffffb0cbc484fbc0 R08: ffff94a5a227a578 R09: ffff94a5a227a578 R10: 0000000000000000 R11: 0000560ddbbe7000 R12: ffffe903098ba728 R13: ffffb0cbc484fc78 R14: ffffb0cbc484fcf8 R15: ffff94a5a2e9cf48 FS: 00007f6dfb683740(0000) GS:ffff94a5aaf80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000000f0 CR3: 000000011c118001 CR4: 00000000003606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __walk_page_range+0x3c2/0x6f0 walk_page_vma+0x42/0x60 smap_gather_stats+0x79/0xe0 ? gather_pte_stats+0x320/0x320 ? gather_hugetlb_stats+0x70/0x70 show_smaps_rollup+0xcd/0x1c0 seq_read+0x157/0x400 __vfs_read+0x3a/0x180 ? security_file_permission+0x93/0xc0 ? 
security_file_permission+0x93/0xc0 vfs_read+0x8f/0x140 ksys_read+0x55/0xc0 __x64_sys_read+0x1a/0x20 do_syscall_64+0x5a/0x110 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Decoded code matched to local compilation+disassembly points to smaps_pte_entry(): } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap && pte_none(*pte))) { page = find_get_entry(vma->vm_file->f_mapping, linear_page_index(vma, addr)); Here, vma->vm_file is NULL. mss->check_shmem_swap should be false in that case, however for smaps_rollup, smap_gather_stats() can set the flag true for one vma and leave it true for subsequent vma's where it should be false. To fix, reset the check_shmem_swap flag to false. There's also a related bug which sets mss->swap to shmem_swapped, which in the context of smaps_rollup overwrites any value accumulated from previous vma's. Fix that as well. Note that the report suggests a regression between 4.17.19 and 4.19-rc7, which makes the 4.19 series ending with commit 258f669e7e88 ("mm: /proc/pid/smaps_rollup: convert to single value seq_file") suspicious. But the mss was reused for rollup since 493b0e9d945f ("mm: add /proc/pid/smaps_rollup") so let's play it safe with the stable backport. 
Link: http://lkml.kernel.org/r/555fbd1f-4ac9-0b58-dcd4-5dc4380ff7ca@suse.cz Link: https://bugzilla.kernel.org/show_bug.cgi?id=201377 Fixes: 493b0e9d945f ("mm: add /proc/pid/smaps_rollup") Signed-off-by: Vlastimil Babka Reported-by: Leonardo Soares Müller Tested-by: Leonardo Soares Müller Cc: Greg Kroah-Hartman Cc: Daniel Colascione Cc: Alexey Dobriyan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5ea1d64cb0b4..a027473561c6 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -713,6 +713,8 @@ static void smap_gather_stats(struct vm_area_struct *vma, smaps_walk.private = mss; #ifdef CONFIG_SHMEM + /* In case of smaps_rollup, reset the value from previous vma */ + mss->check_shmem_swap = false; if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { /* * For shared or readonly shmem mappings we know that all @@ -728,7 +730,7 @@ static void smap_gather_stats(struct vm_area_struct *vma, if (!shmem_swapped || (vma->vm_flags & VM_SHARED) || !(vma->vm_flags & VM_WRITE)) { - mss->swap = shmem_swapped; + mss->swap += shmem_swapped; } else { mss->check_shmem_swap = true; smaps_walk.pte_hole = smaps_pte_hole; -- cgit v1.2.3-59-g8ed1b From ae62c16e105a869524afcf8a07ee85c5ae5d0479 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Oct 2018 15:02:19 -0700 Subject: userfaultfd: disable irqs when taking the waitqueue lock userfaultfd contains home-grown locking of the waitqueue lock, and does not disable interrupts. This relies on the fact that no one else takes it from interrupt context and violates an invariant of the normal waitqueue locking scheme. With aio poll it is easy to trigger other locks that disable interrupts (or are called from interrupt context). 
Link: http://lkml.kernel.org/r/20181018154101.18750-1-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Andrea Arcangeli Reviewed-by: Andrew Morton Cc: [4.19.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index bfa0ec69f924..356d2b8568c1 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1026,7 +1026,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, struct userfaultfd_ctx *fork_nctx = NULL; /* always take the fd_wqh lock before the fault_pending_wqh lock */ - spin_lock(&ctx->fd_wqh.lock); + spin_lock_irq(&ctx->fd_wqh.lock); __add_wait_queue(&ctx->fd_wqh, &wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); @@ -1112,13 +1112,13 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, ret = -EAGAIN; break; } - spin_unlock(&ctx->fd_wqh.lock); + spin_unlock_irq(&ctx->fd_wqh.lock); schedule(); - spin_lock(&ctx->fd_wqh.lock); + spin_lock_irq(&ctx->fd_wqh.lock); } __remove_wait_queue(&ctx->fd_wqh, &wait); __set_current_state(TASK_RUNNING); - spin_unlock(&ctx->fd_wqh.lock); + spin_unlock_irq(&ctx->fd_wqh.lock); if (!ret && msg->event == UFFD_EVENT_FORK) { ret = resolve_userfault_fork(ctx, fork_nctx, msg); -- cgit v1.2.3-59-g8ed1b From 32c1b90dcd90c7f6343e1b328801f7be5f7da88d Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 26 Oct 2018 15:02:41 -0700 Subject: ocfs2/dlm: remove unnecessary parentheses Clang warns when more than one set of parentheses is used for a single conditional statement: fs/ocfs2/dlm/dlmthread.c:534:18: warning: equality comparison with extraneous parentheses [-Wparentheses-equality] if ((res->owner == dlm->node_num)) { ~~~~~~~~~~~^~~~~~~~~~~~~~~~ fs/ocfs2/dlm/dlmthread.c:534:18: note: remove extraneous parentheses around the comparison to silence this warning if ((res->owner == dlm->node_num)) { ~ ^ ~ 
Link: http://lkml.kernel.org/r/20180924181929.6853-1-natechancellor@gmail.com Signed-off-by: Nathan Chancellor Reported-by: Nick Desaulniers Reviewed-by: Nick Desaulniers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmthread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 838a06d4066a..074d5de17bb2 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -531,7 +531,7 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) assert_spin_locked(&res->spinlock); /* don't shuffle secondary queues */ - if ((res->owner == dlm->node_num)) { + if (res->owner == dlm->node_num) { if (res->state & (DLM_LOCK_RES_MIGRATING | DLM_LOCK_RES_BLOCK_DIRTY)) return; -- cgit v1.2.3-59-g8ed1b From 2de24cb742d4f0c41358aa078bed7f089c827ac7 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 26 Oct 2018 15:02:45 -0700 Subject: ocfs2: remove unused pointer 'eb' Pointer 'eb' is being assigned but is never used hence it is redundant and can be removed. Cleans up clang warning: warning: variable 'eb' set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/20180828141907.10826-1-colin.king@canonical.com Signed-off-by: Colin Ian King Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index a342f008e42f..d1cbb27808e2 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5106,8 +5106,6 @@ int ocfs2_split_extent(handle_t *handle, * rightmost extent list. 
*/ if (path->p_tree_depth) { - struct ocfs2_extent_block *eb; - ret = ocfs2_read_extent_block(et->et_ci, ocfs2_et_get_last_eb_blk(et), &last_eb_bh); @@ -5115,8 +5113,6 @@ int ocfs2_split_extent(handle_t *handle, mlog_errno(ret); goto out; } - - eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; } if (rec->e_cpos == split_rec->e_cpos && -- cgit v1.2.3-59-g8ed1b From 0ae1c2dbdccc090762eb3c89183d7fc80dafca76 Mon Sep 17 00:00:00 2001 From: Ding Xiang Date: Fri, 26 Oct 2018 15:02:48 -0700 Subject: ocfs2: remove unneeded null check Null check for kfree is unnecessary, so remove it. Link: http://lkml.kernel.org/r/1535704514-26559-1-git-send-email-dingxiang@cmss.chinamobile.com Signed-off-by: Ding Xiang Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 302cd7caa4a7..da578ad4c08f 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1392,8 +1392,7 @@ retry: unlock: spin_unlock(&oi->ip_lock); out: - if (new) - kfree(new); + kfree(new); return ret; } -- cgit v1.2.3-59-g8ed1b From 999865764f5f128896402572b439269acb471022 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Fri, 26 Oct 2018 15:02:52 -0700 Subject: fs/ocfs2/dlm/dlmdebug.c: fix a sleep-in-atomic-context bug in dlm_print_one_mle() The kernel module may sleep with holding a spinlock. 
The function call paths (from bottom to top) in Linux-4.16 are: [FUNC] get_zeroed_page(GFP_NOFS) fs/ocfs2/dlm/dlmdebug.c, 332: get_zeroed_page in dlm_print_one_mle fs/ocfs2/dlm/dlmmaster.c, 240: dlm_print_one_mle in __dlm_put_mle fs/ocfs2/dlm/dlmmaster.c, 255: __dlm_put_mle in dlm_put_mle fs/ocfs2/dlm/dlmmaster.c, 254: spin_lock in dlm_put_mle [FUNC] get_zeroed_page(GFP_NOFS) fs/ocfs2/dlm/dlmdebug.c, 332: get_zeroed_page in dlm_print_one_mle fs/ocfs2/dlm/dlmmaster.c, 240: dlm_print_one_mle in __dlm_put_mle fs/ocfs2/dlm/dlmmaster.c, 222: __dlm_put_mle in dlm_put_mle_inuse fs/ocfs2/dlm/dlmmaster.c, 219: spin_lock in dlm_put_mle_inuse To fix this bug, GFP_NOFS is replaced with GFP_ATOMIC. This bug was found by my static analysis tool DSAC. Link: http://lkml.kernel.org/r/20180901112528.27025-1-baijiaju1990@gmail.com Signed-off-by: Jia-Ju Bai Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmdebug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 9b984cae4c4e..1d6dc8422899 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -329,7 +329,7 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle) { char *buf; - buf = (char *) get_zeroed_page(GFP_NOFS); + buf = (char *) get_zeroed_page(GFP_ATOMIC); if (buf) { dump_mle(mle, buf, PAGE_SIZE - 1); free_page((unsigned long)buf); -- cgit v1.2.3-59-g8ed1b From 867632d6a6126a9ef3e8d8015423d90f8613f2ef Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 26 Oct 2018 15:02:56 -0700 Subject: ocfs2: remove set but not used variable 'rb' Fixes gcc '-Wunused-but-set-variable' warning: fs/ocfs2/refcounttree.c: In function 'ocfs2_create_reflink_node': fs/ocfs2/refcounttree.c:4138:31: warning: variable 'rb' set but not used [-Wunused-but-set-variable] Link: 
http://lkml.kernel.org/r/1536198443-113047-1-git-send-email-yuehaibing@huawei.com Signed-off-by: YueHaibing Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/refcounttree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 7a5ee145c733..1114ef02e780 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4135,7 +4135,6 @@ static int ocfs2_create_reflink_node(struct inode *s_inode, struct buffer_head *ref_root_bh = NULL; struct ocfs2_cached_dealloc_ctxt dealloc; struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); - struct ocfs2_refcount_block *rb; struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data; struct ocfs2_refcount_tree *ref_tree; @@ -4162,7 +4161,6 @@ static int ocfs2_create_reflink_node(struct inode *s_inode, mlog_errno(ret); goto out; } - rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh, &ref_tree->rf_ci, ref_root_bh, -- cgit v1.2.3-59-g8ed1b From 5780a02fd1e87641ad6a8dd6891a1e890cf45c5d Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Fri, 26 Oct 2018 15:02:59 -0700 Subject: fs/iomap.c: change return type to vm_fault_t Change iomap_page_mkwrite() return type to vm_fault_t. see commit 1c8f422059ae ("mm: change return type to vm_fault_t") for reference. 
Link: http://lkml.kernel.org/r/20180827172050.GA18673@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Reviewed-by: Matthew Wilcox Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/iomap.c | 2 +- include/linux/iomap.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/iomap.c b/fs/iomap.c index ec15cf2ec696..90c2febc93ac 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1057,7 +1057,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, return length; } -int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) { struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 3555d54bf79a..9a4258154b25 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -6,6 +6,7 @@ #include #include #include +#include struct address_space; struct fiemap_extent_info; @@ -141,7 +142,8 @@ int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); -int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops); +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, + const struct iomap_ops *ops); int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, loff_t start, loff_t len, const struct iomap_ops *ops); loff_t iomap_seek_hole(struct inode *inode, loff_t offset, -- cgit v1.2.3-59-g8ed1b From 7f2764cfbd85a18170f9d7a4cf01454dead8b0bc Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Fri, 26 Oct 2018 15:04:06 -0700 Subject: cramfs: convert to use vmf_insert_mixed cramfs is the only remaining user of vm_insert_mixed() and should be converted to vmf_insert_mixed(). Based on a previous patch from Matthew Wilcox. 
Link: http://lkml.kernel.org/r/nycvar.YSQ.7.76.1808290945450.10215@knanqh.ubzr Signed-off-by: Nicolas Pitre Reviewed-by: Andrew Morton Cc: Souptick Joarder a Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/cramfs/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index f408994fc632..0c35e62f108d 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -418,9 +418,12 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) int i; vma->vm_flags |= VM_MIXEDMAP; for (i = 0; i < pages && !ret; i++) { + vm_fault_t vmf; unsigned long off = i * PAGE_SIZE; pfn_t pfn = phys_to_pfn_t(address + off, PFN_DEV); - ret = vm_insert_mixed(vma, vma->vm_start + off, pfn); + vmf = vmf_insert_mixed(vma, vma->vm_start + off, pfn); + if (vmf & VM_FAULT_ERROR) + ret = vm_fault_to_errno(vmf, 0); } } -- cgit v1.2.3-59-g8ed1b From 2e03b4bc4ae84fcc0eee00e5ba5d228901d38809 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 26 Oct 2018 15:05:41 -0700 Subject: dcache: allocate external names from reclaimable kmalloc caches We can use the newly introduced kmalloc-reclaimable-X caches, to allocate external names in dcache, which will take care of the proper accounting automatically, and also improve anti-fragmentation page grouping. This effectively reverts commit f1782c9bc547 ("dcache: account external names as indirectly reclaimable memory") and instead passes __GFP_RECLAIMABLE to kmalloc(). The accounting thus moves from NR_INDIRECTLY_RECLAIMABLE_BYTES to NR_SLAB_RECLAIMABLE, which is also considered in MemAvailable calculation and overcommit decisions. 
Link: http://lkml.kernel.org/r/20180731090649.16028-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Acked-by: Roman Gushchin Cc: Christoph Lameter Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Laura Abbott Cc: Matthew Wilcox Cc: Michal Hocko Cc: Sumit Semwal Cc: Vijayanand Jitta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 38 +++++++++----------------------------- 1 file changed, 9 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 2e7e8d85e9b4..c2e443fb76ae 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -257,24 +257,10 @@ static void __d_free(struct rcu_head *head) kmem_cache_free(dentry_cache, dentry); } -static void __d_free_external_name(struct rcu_head *head) -{ - struct external_name *name = container_of(head, struct external_name, - u.head); - - mod_node_page_state(page_pgdat(virt_to_page(name)), - NR_INDIRECTLY_RECLAIMABLE_BYTES, - -ksize(name)); - - kfree(name); -} - static void __d_free_external(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); - - __d_free_external_name(&external_name(dentry)->u.head); - + kfree(external_name(dentry)); kmem_cache_free(dentry_cache, dentry); } @@ -306,7 +292,7 @@ void release_dentry_name_snapshot(struct name_snapshot *name) struct external_name *p; p = container_of(name->name, struct external_name, name[0]); if (unlikely(atomic_dec_and_test(&p->u.count))) - call_rcu(&p->u.head, __d_free_external_name); + kfree_rcu(p, u.head); } } EXPORT_SYMBOL(release_dentry_name_snapshot); @@ -1606,7 +1592,6 @@ EXPORT_SYMBOL(d_invalidate); struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) { - struct external_name *ext = NULL; struct dentry *dentry; char *dname; int err; @@ -1627,14 +1612,15 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dname = dentry->d_iname; } else if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct 
external_name, name[1]); - - ext = kmalloc(size + name->len, GFP_KERNEL_ACCOUNT); - if (!ext) { + struct external_name *p = kmalloc(size + name->len, + GFP_KERNEL_ACCOUNT | + __GFP_RECLAIMABLE); + if (!p) { kmem_cache_free(dentry_cache, dentry); return NULL; } - atomic_set(&ext->u.count, 1); - dname = ext->name; + atomic_set(&p->u.count, 1); + dname = p->name; } else { dname = dentry->d_iname; } @@ -1673,12 +1659,6 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) } } - if (unlikely(ext)) { - pg_data_t *pgdat = page_pgdat(virt_to_page(ext)); - mod_node_page_state(pgdat, NR_INDIRECTLY_RECLAIMABLE_BYTES, - ksize(ext)); - } - this_cpu_inc(nr_dentry); return dentry; @@ -2707,7 +2687,7 @@ static void copy_name(struct dentry *dentry, struct dentry *target) dentry->d_name.hash_len = target->d_name.hash_len; } if (old_name && likely(atomic_dec_and_test(&old_name->u.count))) - call_rcu(&old_name->u.head, __d_free_external_name); + kfree_rcu(old_name, u.head); } /* -- cgit v1.2.3-59-g8ed1b From 61f94e18de94f79abaad3bb83549ff78923ac785 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 26 Oct 2018 15:05:50 -0700 Subject: mm, proc: add KReclaimable to /proc/meminfo The vmstat NR_KERNEL_MISC_RECLAIMABLE counter is for kernel non-slab allocations that can be reclaimed via shrinker. In /proc/meminfo, we can show the sum of all reclaimable kernel allocations (including slab) as "KReclaimable". Add the same counter also to per-node meminfo under /sys With this counter, users will have more complete information about kernel memory usage. Non-slab reclaimable pages (currently just the ION allocator) will not be missing from /proc/meminfo, making users wonder where part of their memory went. More precisely, they already appear in MemAvailable, but without the new counter, it's not obvious why the value in MemAvailable doesn't fully correspond with the sum of other counters participating in it. 
Link: http://lkml.kernel.org/r/20180731090649.16028-6-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Roman Gushchin Cc: Christoph Lameter Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Laura Abbott Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Sumit Semwal Cc: Vijayanand Jitta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 4 ++++ drivers/base/node.c | 19 ++++++++++++------- fs/proc/meminfo.c | 16 ++++++++-------- 3 files changed, 24 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 22b4b00dee31..12a5e6e693b6 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -858,6 +858,7 @@ Writeback: 0 kB AnonPages: 861800 kB Mapped: 280372 kB Shmem: 644 kB +KReclaimable: 168048 kB Slab: 284364 kB SReclaimable: 159856 kB SUnreclaim: 124508 kB @@ -925,6 +926,9 @@ AnonHugePages: Non-file backed huge pages mapped into userspace page tables ShmemHugePages: Memory used by shared memory (shmem) and tmpfs allocated with huge pages ShmemPmdMapped: Shared memory mapped into userspace with huge pages +KReclaimable: Kernel allocations that the kernel will attempt to reclaim + under memory pressure. Includes SReclaimable (below), and other + direct allocations with a shrinker. 
Slab: in-kernel data structures cache SReclaimable: Part of Slab, that might be reclaimed, such as caches SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure diff --git a/drivers/base/node.c b/drivers/base/node.c index 1ac4c36e13bb..86d6cd92ce3d 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -67,8 +67,11 @@ static ssize_t node_read_meminfo(struct device *dev, int nid = dev->id; struct pglist_data *pgdat = NODE_DATA(nid); struct sysinfo i; + unsigned long sreclaimable, sunreclaimable; si_meminfo_node(&i, nid); + sreclaimable = node_page_state(pgdat, NR_SLAB_RECLAIMABLE); + sunreclaimable = node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE); n = sprintf(buf, "Node %d MemTotal: %8lu kB\n" "Node %d MemFree: %8lu kB\n" @@ -118,6 +121,7 @@ static ssize_t node_read_meminfo(struct device *dev, "Node %d NFS_Unstable: %8lu kB\n" "Node %d Bounce: %8lu kB\n" "Node %d WritebackTmp: %8lu kB\n" + "Node %d KReclaimable: %8lu kB\n" "Node %d Slab: %8lu kB\n" "Node %d SReclaimable: %8lu kB\n" "Node %d SUnreclaim: %8lu kB\n" @@ -138,20 +142,21 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)), nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)), nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), - nid, K(node_page_state(pgdat, NR_SLAB_RECLAIMABLE) + - node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE)), - nid, K(node_page_state(pgdat, NR_SLAB_RECLAIMABLE)), + nid, K(sreclaimable + + node_page_state(pgdat, NR_KERNEL_MISC_RECLAIMABLE)), + nid, K(sreclaimable + sunreclaimable), + nid, K(sreclaimable), + nid, K(sunreclaimable) #ifdef CONFIG_TRANSPARENT_HUGEPAGE - nid, K(node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE)), + , nid, K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), nid, K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * - HPAGE_PMD_NR)); -#else - nid, K(node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE))); + HPAGE_PMD_NR) #endif + ); n += 
hugetlb_report_node_meminfo(nid, buf + n); return n; } diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index edda898714eb..568d90e17c17 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -38,6 +38,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) long cached; long available; unsigned long pages[NR_LRU_LISTS]; + unsigned long sreclaimable, sunreclaim; int lru; si_meminfo(&i); @@ -53,6 +54,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) pages[lru] = global_node_page_state(NR_LRU_BASE + lru); available = si_mem_available(); + sreclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE); + sunreclaim = global_node_page_state(NR_SLAB_UNRECLAIMABLE); show_val_kb(m, "MemTotal: ", i.totalram); show_val_kb(m, "MemFree: ", i.freeram); @@ -94,14 +97,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "Mapped: ", global_node_page_state(NR_FILE_MAPPED)); show_val_kb(m, "Shmem: ", i.sharedram); - show_val_kb(m, "Slab: ", - global_node_page_state(NR_SLAB_RECLAIMABLE) + - global_node_page_state(NR_SLAB_UNRECLAIMABLE)); - - show_val_kb(m, "SReclaimable: ", - global_node_page_state(NR_SLAB_RECLAIMABLE)); - show_val_kb(m, "SUnreclaim: ", - global_node_page_state(NR_SLAB_UNRECLAIMABLE)); + show_val_kb(m, "KReclaimable: ", sreclaimable + + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE)); + show_val_kb(m, "Slab: ", sreclaimable + sunreclaim); + show_val_kb(m, "SReclaimable: ", sreclaimable); + show_val_kb(m, "SUnreclaim: ", sunreclaim); seq_printf(m, "KernelStack: %8lu kB\n", global_zone_page_state(NR_KERNEL_STACK_KB)); show_val_kb(m, "PageTables: ", -- cgit v1.2.3-59-g8ed1b From 8508cf3ffad4defa202b303e5b6379efc4cd9054 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:11 -0700 Subject: sched: loadavg: consolidate LOAD_INT, LOAD_FRAC, CALC_LOAD There are several definitions of those functions/macros in places that mess with fixed-point load averages. Provide an official version. 
[akpm@linux-foundation.org: fix missed conversion in block/blk-iolatency.c] Link: http://lkml.kernel.org/r/20180828172258.3185-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Peter Zijlstra (Intel) Tested-by: Suren Baghdasaryan Tested-by: Daniel Drake Cc: Christopher Lameter Cc: Ingo Molnar Cc: Johannes Weiner Cc: Mike Galbraith Cc: Peter Enderborg Cc: Randy Dunlap Cc: Shakeel Butt Cc: Tejun Heo Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/cell/cpufreq_spudemand.c | 2 +- arch/powerpc/platforms/cell/spufs/sched.c | 9 +++------ arch/s390/appldata/appldata_os.c | 4 ---- block/blk-iolatency.c | 8 +++++--- drivers/cpuidle/governors/menu.c | 4 ---- fs/proc/loadavg.c | 3 --- include/linux/sched/loadavg.h | 21 +++++++++++++++++---- kernel/debug/kdb/kdb_main.c | 7 +------ kernel/sched/loadavg.c | 15 --------------- 9 files changed, 27 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/arch/powerpc/platforms/cell/cpufreq_spudemand.c b/arch/powerpc/platforms/cell/cpufreq_spudemand.c index 882944c36ef5..5d8e8b6bb1cc 100644 --- a/arch/powerpc/platforms/cell/cpufreq_spudemand.c +++ b/arch/powerpc/platforms/cell/cpufreq_spudemand.c @@ -49,7 +49,7 @@ static int calc_freq(struct spu_gov_info_struct *info) cpu = info->policy->cpu; busy_spus = atomic_read(&cbe_spu_info[cpu_to_node(cpu)].busy_spus); - CALC_LOAD(info->busy_spus, EXP, busy_spus * FIXED_1); + info->busy_spus = calc_load(info->busy_spus, EXP, busy_spus * FIXED_1); pr_debug("cpu %d: busy_spus=%d, info->busy_spus=%ld\n", cpu, busy_spus, info->busy_spus); diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index c9ef3c532169..9fcccb4490b9 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -987,9 +987,9 @@ static void spu_calc_load(void) unsigned long active_tasks; /* fixed-point */ active_tasks = count_active_contexts() * FIXED_1; - 
CALC_LOAD(spu_avenrun[0], EXP_1, active_tasks); - CALC_LOAD(spu_avenrun[1], EXP_5, active_tasks); - CALC_LOAD(spu_avenrun[2], EXP_15, active_tasks); + spu_avenrun[0] = calc_load(spu_avenrun[0], EXP_1, active_tasks); + spu_avenrun[1] = calc_load(spu_avenrun[1], EXP_5, active_tasks); + spu_avenrun[2] = calc_load(spu_avenrun[2], EXP_15, active_tasks); } static void spusched_wake(struct timer_list *unused) @@ -1071,9 +1071,6 @@ void spuctx_switch_state(struct spu_context *ctx, } } -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) - static int show_spu_loadavg(struct seq_file *s, void *private) { int a, b, c; diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c index 433a994b1a89..54f375627532 100644 --- a/arch/s390/appldata/appldata_os.c +++ b/arch/s390/appldata/appldata_os.c @@ -25,10 +25,6 @@ #include "appldata.h" - -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) - /* * OS data * diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 35c48d7b8f78..28f80d227528 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -153,7 +153,7 @@ struct iolatency_grp { #define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC /* * These are the constants used to fake the fixed-point moving average - * calculation just like load average. The call to CALC_LOAD folds + * calculation just like load average. The call to calc_load() folds * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg. The sampling * window size is bucketed to try to approximately calculate average * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows @@ -248,7 +248,7 @@ static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat, return; /* - * CALC_LOAD takes in a number stored in fixed point representation. + * calc_load() takes in a number stored in fixed point representation. 
* Because we are using this for IO time in ns, the values stored * are significantly larger than the FIXED_1 denominator (2048). * Therefore, rounding errors in the calculation are negligible and @@ -257,7 +257,9 @@ static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat, exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1, div64_u64(iolat->cur_win_nsec, BLKIOLATENCY_EXP_BUCKET_SIZE)); - CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat->rqs.mean); + iolat->lat_avg = calc_load(iolat->lat_avg, + iolatency_exp_factors[exp_idx], + stat->rqs.mean); } static inline bool iolatency_may_queue(struct iolatency_grp *iolat, diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 575a68f31761..71979605246e 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -130,10 +130,6 @@ struct menu_device { int interval_ptr; }; - -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) - static inline int get_loadavg(unsigned long load) { return LOAD_INT(load) * 10 + LOAD_FRAC(load) / 10; diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index d06694757201..8468baee951d 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -10,9 +10,6 @@ #include #include -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) - static int loadavg_proc_show(struct seq_file *m, void *v) { unsigned long avnrun[3]; diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h index 80bc84ba5d2a..cc9cc62bb1f8 100644 --- a/include/linux/sched/loadavg.h +++ b/include/linux/sched/loadavg.h @@ -22,10 +22,23 @@ extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift); #define EXP_5 2014 /* 1/exp(5sec/5min) */ #define EXP_15 2037 /* 1/exp(5sec/15min) */ -#define CALC_LOAD(load,exp,n) \ - load *= exp; \ - load += n*(FIXED_1-exp); \ - load >>= FSHIFT; +/* + * a1 = a0 * e + a * (1 - e) + */ +static inline 
unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ + unsigned long newload; + + newload = load * exp + active * (FIXED_1 - exp); + if (active >= load) + newload += FIXED_1-1; + + return newload / FIXED_1; +} + +#define LOAD_INT(x) ((x) >> FSHIFT) +#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) extern void calc_global_load(unsigned long ticks); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 2ddfce8f1e8f..bb4fe4e1a601 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2556,16 +2556,11 @@ static int kdb_summary(int argc, const char **argv) } kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60); - /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */ - -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n", LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]), LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]), LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2])); -#undef LOAD_INT -#undef LOAD_FRAC + /* Display in kilobytes */ #define K(x) ((x) << (PAGE_SHIFT - 10)) kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n" diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index a171c1258109..54fbdfb2d86c 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -91,21 +91,6 @@ long calc_load_fold_active(struct rq *this_rq, long adjust) return delta; } -/* - * a1 = a0 * e + a * (1 - e) - */ -static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) -{ - unsigned long newload; - - newload = load * exp + active * (FIXED_1 - exp); - if (active >= load) - newload += FIXED_1-1; - - return newload / FIXED_1; -} - #ifdef CONFIG_NO_HZ_COMMON /* * Handle NO_HZ for the global load-average. 
-- 
cgit v1.2.3-59-g8ed1b

From 4b85afbdacd290c7a22c96df40a6433fdcacb509 Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Fri, 26 Oct 2018 15:06:42 -0700
Subject: mm: zero-seek shrinkers

The page cache and most shrinkable slab caches hold data that has been
read from disk, but there are some caches that only cache CPU work, such
as the dentry and inode caches of procfs and sysfs, as well as the subset
of radix tree nodes that track non-resident page cache.

Currently, all these are shrunk at the same rate: using DEFAULT_SEEKS for
the shrinker's seeks setting tells the reclaim algorithm that for every
two page cache pages scanned it should scan one slab object.

This is a bogus setting. A virtual inode that required no IO to create is
not twice as valuable as a page cache page; shadow cache entries with
eviction distances beyond the size of memory aren't either.

In most cases, the behavior in practice is still fine. Such virtual
caches don't tend to grow and assert themselves aggressively, and usually
get picked up before they cause problems. But there are scenarios where
that's not true.

Our database workloads suffer from two of those. For one, their file
workingset is several times bigger than available memory, which has the
kernel aggressively create shadow page cache entries for the non-resident
parts of it. The workingset code does tell the VM that most of these are
expendable, but the VM ends up balancing them 2:1 to cache pages as per
the seeks setting. This is a huge waste of memory.

These workloads also deal with tens of thousands of open files and use
/proc for introspection, which ends up growing the proc_inode_cache to
absurdly large sizes - again at the cost of valuable cache space, which
isn't a reasonable trade-off, given that proc inodes can be re-created
without involving the disk.

This patch implements a "zero-seek" setting for shrinkers that results in
a target ratio of 0:1 between their objects and IO-backed caches.
This allows such virtual caches to grow when memory is available (they do cache/avoid CPU work after all), but effectively disables them as soon as IO-backed objects are under pressure. It then switches the shrinkers for procfs and sysfs metadata, as well as excess page cache shadow nodes, to the new zero-seek setting. Link: http://lkml.kernel.org/r/20181009184732.762-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Domas Mituzas Reviewed-by: Andrew Morton Reviewed-by: Rik van Riel Acked-by: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/kernfs/mount.c | 3 +++ fs/proc/inode.c | 3 +++ mm/vmscan.c | 15 ++++++++++++--- mm/workingset.c | 2 +- 4 files changed, 19 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index ff2716f9322e..fdf527b6d79c 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -236,6 +236,9 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) sb->s_export_op = &kernfs_export_ops; sb->s_time_gran = 1; + /* sysfs dentries and inodes don't require IO to create */ + sb->s_shrink.seeks = 0; + /* get root inode, initialize and unlock it */ mutex_lock(&kernfs_mutex); inode = kernfs_get_inode(sb, info->root->kn); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index fc5306a31a1d..5792f9e39466 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -516,6 +516,9 @@ int proc_fill_super(struct super_block *s, void *data, int silent) */ s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; + /* procfs dentries and inodes don't require IO to create */ + s->s_shrink.seeks = 0; + pde_get(&proc_root); root_inode = proc_get_inode(s, &proc_root); if (!root_inode) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 8ea87586925e..28c9ae5633b9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -474,9 +474,18 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); total_scan = nr; - 
delta = freeable >> priority; - delta *= 4; - do_div(delta, shrinker->seeks); + if (shrinker->seeks) { + delta = freeable >> priority; + delta *= 4; + do_div(delta, shrinker->seeks); + } else { + /* + * These objects don't require any IO to create. Trim + * them aggressively under memory pressure to keep + * them from causing refetches in the IO caches. + */ + delta = freeable / 2; + } /* * Make sure we apply some minimal pressure on default priority diff --git a/mm/workingset.c b/mm/workingset.c index 7e6ef312cea5..cbc13d4dfa79 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -534,7 +534,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, static struct shrinker workingset_shadow_shrinker = { .count_objects = count_shadow_nodes, .scan_objects = scan_shadow_nodes, - .seeks = DEFAULT_SEEKS, + .seeks = 0, /* ->count reports only fully expendable nodes */ .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, }; -- cgit v1.2.3-59-g8ed1b