From 313a5257b84c26b7f080c5d294aabe7d38ca439c Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Thu, 25 Jun 2020 20:29:17 -0700 Subject: openrisc: fix boot oops when DEBUG_VM is enabled Since v5.8-rc1 OpenRISC Linux fails to boot when DEBUG_VM is enabled. This has been bisected to commit 42fc541404f2 ("mmap locking API: add mmap_assert_locked() and mmap_assert_write_locked()"). The added locking checks exposed the issue that OpenRISC was not taking the mmap lock during page walks for DMA operations. This patch locks and unlocks the mmap lock around page walking. Link: http://lkml.kernel.org/r/20200617090247.1680188-1-shorne@gmail.com Fixes: 42fc541404f2 ("mmap locking API: add mmap_assert_locked() and mmap_assert_write_locked()") Signed-off-by: Stafford Horne Reviewed-by: Michel Lespinasse Cc: Jonas Bonn Cc: Stefan Kristiansson Cc: Jason Gunthorpe Cc: Steven Price Cc: Thomas Hellstrom Cc: Robin Murphy Cc: Vlastimil Babka Cc: Daniel Jordan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/openrisc/kernel/dma.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c index c152a68811dd..345727638d52 100644 --- a/arch/openrisc/kernel/dma.c +++ b/arch/openrisc/kernel/dma.c @@ -74,8 +74,11 @@ void *arch_dma_set_uncached(void *cpu_addr, size_t size) * We need to iterate through the pages, clearing the dcache for * them and setting the cache-inhibit bit. */ + mmap_read_lock(&init_mm); error = walk_page_range(&init_mm, va, va + size, &set_nocache_walk_ops, NULL); + mmap_read_unlock(&init_mm); + if (error) return ERR_PTR(error); return cpu_addr; @@ -85,9 +88,11 @@ void arch_dma_clear_uncached(void *cpu_addr, size_t size) { unsigned long va = (unsigned long)cpu_addr; + mmap_read_lock(&init_mm); /* walk_page_range shouldn't be able to fail here */ WARN_ON(walk_page_range(&init_mm, va, va + size, &clear_nocache_walk_ops, NULL)); + mmap_read_unlock(&init_mm); } void arch_sync_dma_for_device(phys_addr_t addr, size_t size, -- cgit v1.2.3-59-g8ed1b From 545b1b077ca6b359820436af097bc65e3f6f6cc9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 25 Jun 2020 20:29:21 -0700 Subject: mm: do_swap_page(): fix up the error code do_swap_page() returns error codes from the VM_FAULT* space. try_charge() might return -ENOMEM, though, and then do_swap_page() simply returns 0, which means success. We almost never return ENOMEM for a GFP_KERNEL single page charge, except for async OOM handling (oom_disabled v1). So this needs to be translated to VM_FAULT_OOM, otherwise the page fault path will not notify userspace and wait for an action. Link: http://lkml.kernel.org/r/20200617090238.GL9499@dhcp22.suse.cz Fixes: 4c6355b25e8b ("mm: memcontrol: charge swapin pages on instantiation") Signed-off-by: Michal Hocko Acked-by: Johannes Weiner Cc: Alex Shi Cc: Joonsoo Kim Cc: Shakeel Butt Cc: Hugh Dickins Cc: "Kirill A. Shutemov"
Shutemov" Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index dc7f3543b1fd..1c632faa2611 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3140,8 +3140,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) err = mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL); ClearPageSwapCache(page); - if (err) + if (err) { + ret = VM_FAULT_OOM; goto out_page; + } lru_cache_add(page); swap_readpage(page, true); -- cgit v1.2.3-59-g8ed1b From b9e20f0da1f5c9c68689450a8cb436c9486434c8 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 25 Jun 2020 20:29:24 -0700 Subject: mm, compaction: make capture control handling safe wrt interrupts Hugh reports: "While stressing compaction, one run oopsed on NULL capc->cc in __free_one_page()'s task_capc(zone): compact_zone_order() had been interrupted, and a page was being freed in the return from interrupt. Though you would not expect it from the source, both gccs I was using (4.8.1 and 7.5.0) had chosen to compile compact_zone_order() with the ".cc = &cc" implemented by mov %rbx,-0xb0(%rbp) immediately before callq compact_zone - long after the "current->capture_control = &capc". An interrupt in between those finds capc->cc NULL (zeroed by an earlier rep stos). This could presumably be fixed by a barrier() before setting current->capture_control in compact_zone_order(); but would also need more care on return from compact_zone(), in order not to risk leaking a page captured by interrupt just before capture_control is reset. Maybe that is the preferable fix, but I felt safer for task_capc() to exclude the rather surprising possibility of capture at interrupt time" I have checked that gcc10 also behaves the same. The advantage of fix in compact_zone_order() is that we don't add another test in the page freeing hot path, and that it might prevent future problems if we stop exposing pointers to uninitialized structures in current task. So this patch implements the suggestion for compact_zone_order() with barrier() (and WRITE_ONCE() to prevent store tearing) for setting current->capture_control, and prevents page leaking with WRITE_ONCE/READ_ONCE in the proper order. Link: http://lkml.kernel.org/r/20200616082649.27173-1-vbabka@suse.cz Fixes: 5e1f0f098b46 ("mm, compaction: capture a page under direct compaction") Signed-off-by: Vlastimil Babka Reported-by: Hugh Dickins Suggested-by: Hugh Dickins Acked-by: Hugh Dickins Cc: Alex Shi Cc: Li Wang Cc: Mel Gorman Cc: [5.1+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index fd988b7e5f2b..86375605faa9 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2316,15 +2316,26 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .page = NULL, }; - current->capture_control = &capc; + /* + * Make sure the structs are really initialized before we expose the + * capture control, in case we are interrupted and the interrupt handler + * frees a page. 
+ */ + barrier(); + WRITE_ONCE(current->capture_control, &capc); ret = compact_zone(&cc, &capc); VM_BUG_ON(!list_empty(&cc.freepages)); VM_BUG_ON(!list_empty(&cc.migratepages)); - *capture = capc.page; - current->capture_control = NULL; + /* + * Make sure we hide capture control first before we read the captured + * page pointer, otherwise an interrupt could free and capture a page + * and we would leak it. + */ + WRITE_ONCE(current->capture_control, NULL); + *capture = READ_ONCE(capc.page); return ret; } -- cgit v1.2.3-59-g8ed1b From fd7af71be54271a9f03b2e6f63e4b3ac1ecd113d Mon Sep 17 00:00:00 2001 From: Lianbo Jiang Date: Thu, 25 Jun 2020 20:29:27 -0700 Subject: kexec: do not verify the signature without the lockdown or mandatory signature Signature verification is an important security feature, protecting the system from being attacked with a kernel of unknown origin. Kexec rebooting is a way to replace the running kernel, hence it needs to be secured carefully. In the current code handling signature verification of the kexec kernel, the logic is very twisted. It mixes signature verification, IMA signature appraising, and kexec lockdown. Without KEXEC_SIG_FORCE, if the kexec kernel image lacks a signature, the supported crypto, or the key, this is not treated as an error unless kexec lockdown is in effect; IMA is considered another kind of signature appraising method. But if the kexec kernel image does have a signature/crypto/key, it has to go through signature verification and pass; otherwise it is seen as a verification failure and won't be loaded. So a kexec kernel image with an unqualified signature is treated as even worse than one without any signature at all, which is unreasonable. E.g. if people get an unsigned kernel to load, or a kernel signed with an expired key, which one is more dangerous? So, here, let's simplify the logic to improve code readability. If KEXEC_SIG_FORCE is enabled or kexec lockdown is in effect, signature verification is mandated. Otherwise, we lift the bar for any kernel image. Link: http://lkml.kernel.org/r/20200602045952.27487-1-lijiang@redhat.com Signed-off-by: Lianbo Jiang Reviewed-by: Jiri Bohac Acked-by: Dave Young Acked-by: Baoquan He Cc: James Morris Cc: Matthew Garrett Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 34 ++++++---------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index bb05fd52de85..09cc78df53c6 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -181,34 +181,19 @@ void kimage_file_post_load_cleanup(struct kimage *image) static int kimage_validate_signature(struct kimage *image) { - const char *reason; int ret; ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, image->kernel_buf_len); - switch (ret) { - case 0: - break; + if (ret) { - /* Certain verification errors are non-fatal if we're not - * checking errors, provided we aren't mandating that there - * must be a valid signature.
- */ - case -ENODATA: - reason = "kexec of unsigned image"; - goto decide; - case -ENOPKG: - reason = "kexec of image with unsupported crypto"; - goto decide; - case -ENOKEY: - reason = "kexec of image with unavailable key"; - decide: if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE)) { - pr_notice("%s rejected\n", reason); + pr_notice("Enforced kernel signature verification failed (%d).\n", ret); return ret; } - /* If IMA is guaranteed to appraise a signature on the kexec + /* + * If IMA is guaranteed to appraise a signature on the kexec * image, permit it even if the kernel is otherwise locked * down. */ @@ -216,17 +201,10 @@ kimage_validate_signature(struct kimage *image) security_locked_down(LOCKDOWN_KEXEC)) return -EPERM; - return 0; - - /* All other errors are fatal, including nomem, unparseable - * signatures and signature check failures - even if signatures - * aren't required. - */ - default: - pr_notice("kernel signature verification failed (%d).\n", ret); + pr_debug("kernel signature verification failed (%d).\n", ret); } - return ret; + return 0; } #endif -- cgit v1.2.3-59-g8ed1b From 4cd9973f9ff69e37dd0ba2bd6e6423f8179c329a Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Thu, 25 Jun 2020 20:29:30 -0700 Subject: ocfs2: avoid inode removal while nfsd is accessing it Patch series "ocfs2: fix nfsd over ocfs2 issues", v2. This is a series of patches to fix issues with nfsd over ocfs2: patch 1 avoids an inode being removed while nfsd is accessing it; patches 2 & 3 fix a panic issue. This patch (of 4): When nfsd gets a file dentry using a handle, or the parent dentry of some dentry, a cluster lock is used to prevent the inode from being removed by another node, but it could still be removed by the local node; use a rw lock to prevent this. Link: http://lkml.kernel.org/r/20200616183829.87211-1-junxiao.bi@oracle.com Link: http://lkml.kernel.org/r/20200616183829.87211-2-junxiao.bi@oracle.com Signed-off-by: Junxiao Bi Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Joel Becker Cc: Jun Piao Cc: Mark Fasheh Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 17 ++++++++++++++++- fs/ocfs2/ocfs2.h | 1 + 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 152a0fc4e905..751bc4dc7466 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -689,6 +689,12 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res, &ocfs2_nfs_sync_lops, osb); } +static void ocfs2_nfs_sync_lock_init(struct ocfs2_super *osb) +{ + ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb); + init_rwsem(&osb->nfs_sync_rwlock); +} + void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb) { struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; @@ -2855,6 +2861,11 @@ int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex) if (ocfs2_is_hard_readonly(osb)) return -EROFS; + if (ex) + down_write(&osb->nfs_sync_rwlock); + else + down_read(&osb->nfs_sync_rwlock); + if (ocfs2_mount_local(osb)) return 0; @@ -2873,6 +2884,10 @@ void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex) if (!ocfs2_mount_local(osb)) ocfs2_cluster_unlock(osb, lockres, ex ?
LKM_EXMODE : LKM_PRMODE); + if (ex) + up_write(&osb->nfs_sync_rwlock); + else + up_read(&osb->nfs_sync_rwlock); } int ocfs2_trim_fs_lock(struct ocfs2_super *osb, @@ -3340,7 +3355,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) local: ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); - ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb); + ocfs2_nfs_sync_lock_init(osb); ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb); osb->cconn = conn; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index ee5d98516212..2dd71d626196 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -395,6 +395,7 @@ struct ocfs2_super struct ocfs2_lock_res osb_super_lockres; struct ocfs2_lock_res osb_rename_lockres; struct ocfs2_lock_res osb_nfs_sync_lockres; + struct rw_semaphore nfs_sync_rwlock; struct ocfs2_lock_res osb_trim_fs_lockres; struct mutex obs_trim_fs_mutex; struct ocfs2_dlm_debug *osb_dlm_debug; -- cgit v1.2.3-59-g8ed1b From 7569d3c754e452769a5747eeeba488179e38a5da Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Thu, 25 Jun 2020 20:29:33 -0700 Subject: ocfs2: load global_inode_alloc Set global_inode_alloc as OCFS2_FIRST_ONLINE_SYSTEM_INODE so that it is loaded during mount. It can then be used to test whether some global/system inodes are valid. One use case is that nfsd will test whether the root inode is valid. Link: http://lkml.kernel.org/r/20200616183829.87211-3-junxiao.bi@oracle.com Signed-off-by: Junxiao Bi Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Joel Becker Cc: Jun Piao Cc: Mark Fasheh Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/ocfs2_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 0dd8c41bafd4..3fc99659ed09 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -326,8 +326,8 @@ struct ocfs2_system_inode_info { enum { BAD_BLOCK_SYSTEM_INODE = 0, GLOBAL_INODE_ALLOC_SYSTEM_INODE, +#define OCFS2_FIRST_ONLINE_SYSTEM_INODE GLOBAL_INODE_ALLOC_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE, -#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE HEARTBEAT_SYSTEM_INODE, GLOBAL_BITMAP_SYSTEM_INODE, USER_QUOTA_SYSTEM_INODE, -- cgit v1.2.3-59-g8ed1b From e5a15e17a78d58f933d17cafedfcf7486a29f5b4 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Thu, 25 Jun 2020 20:29:37 -0700 Subject: ocfs2: fix panic on nfs server over ocfs2 The following kernel panic was captured when running an nfs server over ocfs2. At the time, ocfs2_test_inode_bit() was checking whether the inode located at "blkno" 5 was valid; that is the ocfs2 root inode, whose "suballoc_slot" was OCFS2_INVALID_SLOT (65535) and which was allocated from //global_inode_alloc, but the code wrongly assumed it came from the per-slot inode allocator, which caused an array overflow and triggered a kernel panic.
BUG: unable to handle kernel paging request at 0000000000001088 IP: [] _raw_spin_lock+0x18/0xf0 PGD 1e06ba067 PUD 1e9e7d067 PMD 0 Oops: 0002 [#1] SMP CPU: 6 PID: 24873 Comm: nfsd Not tainted 4.1.12-124.36.1.el6uek.x86_64 #2 Hardware name: Huawei CH121 V3/IT11SGCA1, BIOS 3.87 02/02/2018 RIP: _raw_spin_lock+0x18/0xf0 RSP: e02b:ffff88005ae97908 EFLAGS: 00010206 RAX: ffff88005ae98000 RBX: 0000000000001088 RCX: 0000000000000000 RDX: 0000000000020000 RSI: 0000000000000009 RDI: 0000000000001088 RBP: ffff88005ae97928 R08: 0000000000000000 R09: ffff880212878e00 R10: 0000000000007ff0 R11: 0000000000000000 R12: 0000000000001088 R13: ffff8800063c0aa8 R14: ffff8800650c27d0 R15: 000000000000ffff FS: 0000000000000000(0000) GS:ffff880218180000(0000) knlGS:ffff880218180000 CS: e033 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000001088 CR3: 00000002033d0000 CR4: 0000000000042660 Call Trace: igrab+0x1e/0x60 ocfs2_get_system_file_inode+0x63/0x3a0 [ocfs2] ocfs2_test_inode_bit+0x328/0xa00 [ocfs2] ocfs2_get_parent+0xba/0x3e0 [ocfs2] reconnect_path+0xb5/0x300 exportfs_decode_fh+0xf6/0x2b0 fh_verify+0x350/0x660 [nfsd] nfsd4_putfh+0x4d/0x60 [nfsd] nfsd4_proc_compound+0x3d3/0x6f0 [nfsd] nfsd_dispatch+0xe0/0x290 [nfsd] svc_process_common+0x412/0x6a0 [sunrpc] svc_process+0x123/0x210 [sunrpc] nfsd+0xff/0x170 [nfsd] kthread+0xcb/0xf0 ret_from_fork+0x61/0x90 Code: 83 c2 02 0f b7 f2 e8 18 dc 91 ff 66 90 eb bf 0f 1f 40 00 55 48 89 e5 41 56 41 55 41 54 53 0f 1f 44 00 00 48 89 fb ba 00 00 02 00 0f c1 17 89 d0 45 31 e4 45 31 ed c1 e8 10 66 39 d0 41 89 c6 RIP _raw_spin_lock+0x18/0xf0 CR2: 0000000000001088 ---[ end trace 7264463cd1aac8f9 ]--- Kernel panic - not syncing: Fatal exception Link: http://lkml.kernel.org/r/20200616183829.87211-4-junxiao.bi@oracle.com Signed-off-by: Junxiao Bi Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Joel Becker Cc: Jun Piao Cc: Mark Fasheh Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/suballoc.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 4836becb7578..45745cc3408a 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -2825,9 +2825,12 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) goto bail; } - inode_alloc_inode = - ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, - suballoc_slot); + if (suballoc_slot == (u16)OCFS2_INVALID_SLOT) + inode_alloc_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_INODE_ALLOC_SYSTEM_INODE, suballoc_slot); + else + inode_alloc_inode = ocfs2_get_system_file_inode(osb, + INODE_ALLOC_SYSTEM_INODE, suballoc_slot); if (!inode_alloc_inode) { /* the error code could be inaccurate, but we are not able to * get the correct one. */ -- cgit v1.2.3-59-g8ed1b From 9277f8334ffc719fe922d776444d6e4e884dbf30 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Thu, 25 Jun 2020 20:29:40 -0700 Subject: ocfs2: fix value of OCFS2_INVALID_SLOT In the ocfs2 disk layout, the slot number is 16 bits, but in the ocfs2 implementation it is 32 bits. Usually this does not cause any issue, because the slot number is converted from u16 to u32. But OCFS2_INVALID_SLOT was defined as -1: when an invalid slot number was read from disk, its value was (u16)-1, and it was converted to u32. The following check in get_local_system_inode() would then always be skipped: static struct inode **get_local_system_inode(struct ocfs2_super *osb, int type, u32 slot) { BUG_ON(slot == OCFS2_INVALID_SLOT); ...
} Link: http://lkml.kernel.org/r/20200616183829.87211-5-junxiao.bi@oracle.com Signed-off-by: Junxiao Bi Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/ocfs2_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 3fc99659ed09..19137c6d087b 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -290,7 +290,7 @@ #define OCFS2_MAX_SLOTS 255 /* Slot map indicator for an empty slot */ -#define OCFS2_INVALID_SLOT -1 +#define OCFS2_INVALID_SLOT ((u16)-1) #define OCFS2_VOL_UUID_LEN 16 #define OCFS2_MAX_VOL_LABEL_LEN 64 -- cgit v1.2.3-59-g8ed1b From 786ae133e07f2a6b352a0efad16b555ee45a2898 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 25 Jun 2020 20:29:43 -0700 Subject: lib: fix test_hmm.c reference after free Coccinelle scripts report the following errors: lib/test_hmm.c:523:20-26: ERROR: reference preceded by free on line 521 lib/test_hmm.c:524:21-27: ERROR: reference preceded by free on line 521 lib/test_hmm.c:523:28-35: ERROR: devmem is NULL but dereferenced. lib/test_hmm.c:524:29-36: ERROR: devmem is NULL but dereferenced. Fix these by using the local variable 'res' instead of devmem. Link: http://lkml.kernel.org/r/c845c158-9c65-9665-0d0b-00342846dd07@infradead.org Signed-off-by: Randy Dunlap Reviewed-by: Ralph Campbell Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_hmm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 28528285942c..a2a82262b97b 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -520,8 +520,7 @@ static bool dmirror_allocate_chunk(struct dmirror_device *mdevice, err_free: kfree(devmem); err_release: - release_mem_region(devmem->pagemap.res.start, - resource_size(&devmem->pagemap.res)); + release_mem_region(res->start, resource_size(res)); err: mutex_unlock(&mdevice->devmem_lock); return false; -- cgit v1.2.3-59-g8ed1b From d7670879c5c4aa443d518fb234a9e5f30931efa3 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 25 Jun 2020 20:29:49 -0700 Subject: mm, slab: fix sign conversion problem in memcg_uncharge_slab() It was found that running the LTP test on a PowerPC system could produce erroneous values in /proc/meminfo, like: MemTotal: 531915072 kB MemFree: 507962176 kB MemAvailable: 1100020596352 kB Using bisection, the problem was tracked down to commit 9c315e4d7d8c ("mm: memcg/slab: cache page number in memcg_(un)charge_slab()"). In memcg_uncharge_slab() with an "int order" argument: unsigned int nr_pages = 1 << order; : mod_lruvec_state(lruvec, cache_vmstat_idx(s), -nr_pages); The mod_lruvec_state() function will eventually call __mod_zone_page_state(), which accepts a long argument. Depending on the compiler and how inlining is done, "-nr_pages" may be treated as a negative number or a very large positive number. Apparently, it was treated as a large positive number on that PowerPC system, leading to incorrect stat counts. This problem hasn't been seen on x86-64 yet; perhaps the gcc compiler there behaves slightly differently. It is fixed by making nr_pages a signed value. For consistency, a similar change is applied to memcg_charge_slab() as well.
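
To illustrate the conversion hazard, here is a minimal user-space sketch (not from the patch; mod_state() is a hypothetical stand-in for the long-typed statistics helpers):

	#include <stdio.h>

	/* stand-in for the mod_lruvec_state()/__mod_zone_page_state() path,
	 * which ultimately takes a long delta */
	static void mod_state(long delta)
	{
		printf("delta = %ld\n", delta);
	}

	int main(void)
	{
		unsigned int nr_pages = 1 << 3;	/* as in the buggy code */
		int snr_pages = 1 << 3;		/* the fixed, signed version */

		/* -nr_pages is evaluated as unsigned int and wraps to
		 * 4294967288, which stays a huge positive value when it is
		 * converted to long on a 64-bit target: the inflated-counter
		 * symptom seen in /proc/meminfo */
		mod_state(-nr_pages);
		mod_state(-snr_pages);	/* arrives as -8, as intended */
		return 0;
	}
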
Link: http://lkml.kernel.org/r/20200620184719.10994-1-longman@redhat.com Fixes: 9c315e4d7d8c ("mm: memcg/slab: cache page number in memcg_(un)charge_slab()") Signed-off-by: Waiman Long Acked-by: Roman Gushchin Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 207c83ef6e06..74f7e09a7cfd 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -348,7 +348,7 @@ static __always_inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { - unsigned int nr_pages = 1 << order; + int nr_pages = 1 << order; struct mem_cgroup *memcg; struct lruvec *lruvec; int ret; @@ -388,7 +388,7 @@ out: static __always_inline void memcg_uncharge_slab(struct page *page, int order, struct kmem_cache *s) { - unsigned int nr_pages = 1 << order; + int nr_pages = 1 << order; struct mem_cgroup *memcg; struct lruvec *lruvec; -- cgit v1.2.3-59-g8ed1b From 8982ae527fbef170ef298650c15d55a9ccd33973 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 25 Jun 2020 20:29:52 -0700 Subject: mm/slab: use memzero_explicit() in kzfree() The kzfree() function is normally used to clear some sensitive information, like encryption keys, in the buffer before freeing it back to the pool. memset() is currently used for the buffer clearing. However unlikely, there is still a non-zero probability that the compiler may choose to optimize away the memory clearing, especially if LTO is being used in the future. To make sure that this optimization will never happen, memzero_explicit(), which was introduced in v3.18, is now used in kzfree() to future-proof it. Link: http://lkml.kernel.org/r/20200616154311.12314-2-longman@redhat.com Fixes: 3ef0e5ba4673 ("slab: introduce kzfree()") Signed-off-by: Waiman Long Acked-by: Michal Hocko Cc: David Howells Cc: Jarkko Sakkinen Cc: James Morris Cc: "Serge E. Hallyn" Cc: Joe Perches Cc: Matthew Wilcox Cc: David Rientjes Cc: Johannes Weiner Cc: Dan Carpenter Cc: "Jason A . Donenfeld" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 9e72ba224175..37d48a56431d 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1726,7 +1726,7 @@ void kzfree(const void *p) if (unlikely(ZERO_OR_NULL_PTR(mem))) return; ks = ksize(mem); - memset(mem, 0, ks); + memzero_explicit(mem, ks); kfree(mem); } EXPORT_SYMBOL(kzfree); -- cgit v1.2.3-59-g8ed1b From 55860d96ca59265d35427da0ee7d7f61e404f8e7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 25 Jun 2020 20:29:55 -0700 Subject: slub: cure list_slab_objects() from double fix According to Christopher Lameter, two fixes have been merged for the same problem. As far as I can tell, the code does not acquire the list_lock and invoke kmalloc(). list_slab_objects() misses an unlock (the counterpart to get_map()) and the memory allocated in free_partial() isn't used. Revert the mentioned commit.
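
For context, a rough sketch of the get_map()/put_map() pairing mentioned above, simplified from the mm/slub.c of this era (the bitmap-filling details are elided and assumed): get_map() hands out one shared, statically allocated object bitmap under a spinlock, so every call needs a matching put_map() to drop the lock:

	static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
	static DEFINE_SPINLOCK(object_map_lock);

	static unsigned long *get_map(struct kmem_cache *s, struct page *page)
		__acquires(&object_map_lock)
	{
		spin_lock(&object_map_lock);
		/* ... mark this slab page's free objects in object_map ... */
		return object_map;
	}

	static void put_map(unsigned long *map) __releases(&object_map_lock)
	{
		spin_unlock(&object_map_lock);
	}
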
Link: http://lkml.kernel.org/r/20200618201234.795692-1-bigeasy@linutronix.de Fixes: aa456c7aebb14 ("slub: remove kmalloc under list_lock from list_slab_objects() V2") Link: https://lkml.kernel.org/r/alpine.DEB.2.22.394.2006181501480.12014@www.lameter.com Signed-off-by: Sebastian Andrzej Siewior Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Thomas Gleixner Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index fe81773fd97e..ef303070d175 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3766,15 +3766,13 @@ error: } static void list_slab_objects(struct kmem_cache *s, struct page *page, - const char *text, unsigned long *map) + const char *text) { #ifdef CONFIG_SLUB_DEBUG void *addr = page_address(page); + unsigned long *map; void *p; - if (!map) - return; - slab_err(s, page, text, s->name); slab_lock(page); @@ -3786,6 +3784,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, print_tracking(s, p); } } + put_map(map); slab_unlock(page); #endif } @@ -3799,11 +3798,6 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { LIST_HEAD(discard); struct page *page, *h; - unsigned long *map = NULL; - -#ifdef CONFIG_SLUB_DEBUG - map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL); -#endif BUG_ON(irqs_disabled()); spin_lock_irq(&n->list_lock); @@ -3813,16 +3807,11 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) list_add(&page->slab_list, &discard); } else { list_slab_objects(s, page, - "Objects remaining in %s on __kmem_cache_shutdown()", - map); + "Objects remaining in %s on __kmem_cache_shutdown()"); } } spin_unlock_irq(&n->list_lock); -#ifdef CONFIG_SLUB_DEBUG - bitmap_free(map); -#endif - list_for_each_entry_safe(page, h, &discard, slab_list) discard_slab(s, page); } -- cgit v1.2.3-59-g8ed1b From 243bce09c91b0145aeaedd5afba799d81841c030 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 25 Jun 2020 20:29:59 -0700 Subject: mm: fix swap cache node allocation mask Chris Murphy reports that a slightly overcommitted load, testing swap and zram along with i915, splats and keeps on splatting, when it had better fail less noisily: gnome-shell: page allocation failure: order:0, mode:0x400d0(__GFP_IO|__GFP_FS|__GFP_COMP|__GFP_RECLAIMABLE), nodemask=(null),cpuset=/,mems_allowed=0 CPU: 2 PID: 1155 Comm: gnome-shell Not tainted 5.7.0-1.fc33.x86_64 #1 Call Trace: dump_stack+0x64/0x88 warn_alloc.cold+0x75/0xd9 __alloc_pages_slowpath.constprop.0+0xcfa/0xd30 __alloc_pages_nodemask+0x2df/0x320 alloc_slab_page+0x195/0x310 allocate_slab+0x3c5/0x440 ___slab_alloc+0x40c/0x5f0 __slab_alloc+0x1c/0x30 kmem_cache_alloc+0x20e/0x220 xas_nomem+0x28/0x70 add_to_swap_cache+0x321/0x400 __read_swap_cache_async+0x105/0x240 swap_cluster_readahead+0x22c/0x2e0 shmem_swapin+0x8e/0xc0 shmem_swapin_page+0x196/0x740 shmem_getpage_gfp+0x3a2/0xa60 shmem_read_mapping_page_gfp+0x32/0x60 shmem_get_pages+0x155/0x5e0 [i915] __i915_gem_object_get_pages+0x68/0xa0 [i915] i915_vma_pin+0x3fe/0x6c0 [i915] eb_add_vma+0x10b/0x2c0 [i915] i915_gem_do_execbuffer+0x704/0x3430 [i915] i915_gem_execbuffer2_ioctl+0x1ea/0x3e0 [i915] drm_ioctl_kernel+0x86/0xd0 [drm] drm_ioctl+0x206/0x390 [drm] ksys_ioctl+0x82/0xc0 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x5b/0xf0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Reported on 5.7, but it goes back really to 3.1: when shmem_read_mapping_page_gfp() was implemented for use by i915, and allowed for 
__GFP_NORETRY and __GFP_NOWARN flags in most places, but missed swapin's "& GFP_KERNEL" mask for page tree node allocation in __read_swap_cache_async() - that was to mask off HIGHUSER_MOVABLE bits from what page cache uses, but GFP_RECLAIM_MASK is now what's needed. Link: https://bugzilla.kernel.org/show_bug.cgi?id=208085 Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2006151330070.11064@eggly.anvils Fixes: 68da9f055755 ("tmpfs: pass gfp to shmem_getpage_gfp") Signed-off-by: Hugh Dickins Reviewed-by: Vlastimil Babka Reviewed-by: Matthew Wilcox (Oracle) Reported-by: Chris Murphy Analyzed-by: Vlastimil Babka Analyzed-by: Matthew Wilcox Tested-by: Chris Murphy Cc: [3.1+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index e98ff460e9e9..05889e8e3c97 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -21,7 +21,7 @@ #include #include #include - +#include "internal.h" /* * swapper_space is a fiction, retained to simplify the path through @@ -429,7 +429,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, __SetPageSwapBacked(page); /* May fail (-ENOMEM) if XArray node allocation failed. */ - if (add_to_swap_cache(page, entry, gfp_mask & GFP_KERNEL)) { + if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK)) { put_swap_page(page, entry); goto fail_unlock; } -- cgit v1.2.3-59-g8ed1b From 7f70c2a68a51496289df163f6969d4db7c383f30 Mon Sep 17 00:00:00 2001 From: Arjun Roy Date: Thu, 25 Jun 2020 20:30:01 -0700 Subject: mm/memory.c: properly pte_offset_map_lock/unlock in vm_insert_pages() Calls to pte_offset_map() in vm_insert_pages() are erroneously not matched with a call to pte_unmap(). This would cause problems on architectures where that is not a no-op. This patch does away with the non-traditional locking in the existing code, and instead uses pte_offset_map_lock/unlock() as usual, incrementing PTE as necessary. The PTE pointer is kept within bounds since we clamp it with PTRS_PER_PTE. Link: http://lkml.kernel.org/r/20200618220446.20284-1-arjunroy.kdev@gmail.com Fixes: 8cd3984d81d5 ("mm/memory.c: add vm_insert_pages()") Signed-off-by: Arjun Roy Acked-by: David Rientjes Cc: Eric Dumazet Cc: Hugh Dickins Cc: Soheil Hassas Yeganeh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 1c632faa2611..0e5b25c9b151 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1498,7 +1498,7 @@ out: } #ifdef pte_index -static int insert_page_in_batch_locked(struct mm_struct *mm, pmd_t *pmd, +static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { int err; @@ -1506,8 +1506,9 @@ static int insert_page_in_batch_locked(struct mm_struct *mm, pmd_t *pmd, if (!page_count(page)) return -EINVAL; err = validate_page_before_insert(page); - return err ? 
err : insert_page_into_pte_locked( - mm, pte_offset_map(pmd, addr), addr, page, prot); + if (err) + return err; + return insert_page_into_pte_locked(mm, pte, addr, page, prot); } /* insert_pages() amortizes the cost of spinlock operations @@ -1517,7 +1518,8 @@ static int insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num, pgprot_t prot) { pmd_t *pmd = NULL; - spinlock_t *pte_lock = NULL; + pte_t *start_pte, *pte; + spinlock_t *pte_lock; struct mm_struct *const mm = vma->vm_mm; unsigned long curr_page_idx = 0; unsigned long remaining_pages_total = *num; @@ -1536,18 +1538,17 @@ more: ret = -ENOMEM; if (pte_alloc(mm, pmd)) goto out; - pte_lock = pte_lockptr(mm, pmd); while (pages_to_write_in_pmd) { int pte_idx = 0; const int batch_size = min_t(int, pages_to_write_in_pmd, 8); - spin_lock(pte_lock); - for (; pte_idx < batch_size; ++pte_idx) { - int err = insert_page_in_batch_locked(mm, pmd, + start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock); + for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) { + int err = insert_page_in_batch_locked(mm, pte, addr, pages[curr_page_idx], prot); if (unlikely(err)) { - spin_unlock(pte_lock); + pte_unmap_unlock(start_pte, pte_lock); ret = err; remaining_pages_total -= pte_idx; goto out; @@ -1555,7 +1556,7 @@ more: addr += PAGE_SIZE; ++curr_page_idx; } - spin_unlock(pte_lock); + pte_unmap_unlock(start_pte, pte_lock); pages_to_write_in_pmd -= batch_size; remaining_pages_total -= batch_size; } -- cgit v1.2.3-59-g8ed1b From 9449c9cb420b249eb6d7dad3953e4686443d7bd9 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Jun 2020 20:30:04 -0700 Subject: mm/debug_vm_pgtable: fix build failure with powerpc 8xx Since commit 9e343b467c70 ("READ_ONCE: Enforce atomicity for {READ,WRITE}_ONCE() memory accesses"), READ_ONCE() cannot be used anymore to read complex page table entries. This leads to: CC mm/debug_vm_pgtable.o In file included from ./include/asm-generic/bug.h:5, from ./arch/powerpc/include/asm/bug.h:109, from ./include/linux/bug.h:5, from ./include/linux/mmdebug.h:5, from ./include/linux/gfp.h:5, from mm/debug_vm_pgtable.c:13: In function 'pte_clear_tests', inlined from 'debug_vm_pgtable' at mm/debug_vm_pgtable.c:363:2: ./include/linux/compiler.h:392:38: error: Unsupported access size for {READ,WRITE}_ONCE(). mm/debug_vm_pgtable.c:249:14: note: in expansion of macro 'READ_ONCE' 249 | pte_t pte = READ_ONCE(*ptep); | ^~~~~~~~~ make[2]: *** [mm/debug_vm_pgtable.o] Error 1 Fix it by using the recently added ptep_get() helper. 
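
For reference, a sketch of the generic ptep_get() fallback (assumed from the helper's introduction around this time): it is just a READ_ONCE() of the entry, and architectures such as powerpc 8xx, whose page table entries span multiple words, override it with an accessor that is safe for their layout:

	#ifndef ptep_get
	static inline pte_t ptep_get(pte_t *ptep)
	{
		/* simple case: the PTE fits in one word and can be read atomically */
		return READ_ONCE(*ptep);
	}
	#endif

Going through the helper keeps the common case unchanged while avoiding the compile-time size check that READ_ONCE() now enforces.
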
Link: http://lkml.kernel.org/r/6ca8c972e6c920dc4ae0d4affbed9703afa4d010.1592490570.git.christophe.leroy@csgroup.eu Fixes: 9e343b467c70 ("READ_ONCE: Enforce atomicity for {READ,WRITE}_ONCE() memory accesses") Signed-off-by: Christophe Leroy Acked-by: Will Deacon Reviewed-by: Anshuman Khandual Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: "Peter Zijlstra (Intel)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index e45623016aea..61ab16fb2e36 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -246,13 +246,13 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp, static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep, unsigned long vaddr) { - pte_t pte = READ_ONCE(*ptep); + pte_t pte = ptep_get(ptep); pte = __pte(pte_val(pte) | RANDOM_ORVALUE); set_pte_at(mm, vaddr, ptep, pte); barrier(); pte_clear(mm, vaddr, ptep); - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); WARN_ON(!pte_none(pte)); } -- cgit v1.2.3-59-g8ed1b From 8dbdd5049cfa8f8848711ba83c9bbf67e08f5b2d Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 25 Jun 2020 20:30:07 -0700 Subject: make asm-generic/cacheflush.h more standalone Some s390 builds get these warnings: include/asm-generic/cacheflush.h:16:42: warning: 'struct mm_struct' declared inside parameter list will not be visible outside of this definition or declaration include/asm-generic/cacheflush.h:22:46: warning: 'struct mm_struct' declared inside parameter list will not be visible outside of this definition or declaration include/asm-generic/cacheflush.h:28:45: warning: 'struct vm_area_struct' declared inside parameter list will not be visible outside of this definition or declaration include/asm-generic/cacheflush.h:36:44: warning: 'struct vm_area_struct' declared inside parameter list will not be visible outside of this definition or declaration include/asm-generic/cacheflush.h:44:45: warning: 'struct page' declared inside parameter list will not be visible outside of this definition or declaration include/asm-generic/cacheflush.h:52:50: warning: 'struct address_space' declared inside parameter list will not be visible outside of this definition or declaration include/asm-generic/cacheflush.h:58:52: warning: 'struct address_space' declared inside parameter list will not be visible outside of this definition or declaration include/asm-generic/cacheflush.h:75:17: warning: 'struct page' declared inside parameter list will not be visible outside of this definition or declaration include/asm-generic/cacheflush.h:74:45: warning: 'struct vm_area_struct' declared inside parameter list will not be visible outside of this definition or declaration include/asm-generic/cacheflush.h:82:16: warning: 'struct page' declared inside parameter list will not be visible outside of this definition or declaration include/asm-generic/cacheflush.h:81:50: warning: 'struct vm_area_struct' declared inside parameter list will not be visible outside of this definition or declaration Forward declare the named structs to get rid of these. 
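
The warnings come from a C scoping rule: a struct type first named inside a prototype's parameter list is scoped to that prototype alone, distinct from the real type declared elsewhere. A standalone sketch of the effect (signatures simplified from cacheflush.h):

	/* without a prior declaration, this 'struct mm_struct' names a new,
	 * prototype-local type, incompatible with the file-scope one */
	static inline void flush_cache_mm(struct mm_struct *mm) { }

	/* a forward declaration first makes the prototype refer to the
	 * ordinary file-scope type, silencing the warning */
	struct mm_struct;
	static inline void flush_cache_dup_mm(struct mm_struct *mm) { }
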
Link: http://lkml.kernel.org/r/20200623135714.4dae4b8a@canb.auug.org.au Fixes: e0cf615d725c ("asm-generic: don't include <linux/mm.h> in cacheflush.h") Signed-off-by: Stephen Rothwell Reviewed-by: Christoph Hellwig Acked-by: Arnd Bergmann Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/cacheflush.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h index 907fa5d16494..4a674db4e1fa 100644 --- a/include/asm-generic/cacheflush.h +++ b/include/asm-generic/cacheflush.h @@ -2,6 +2,11 @@ #ifndef _ASM_GENERIC_CACHEFLUSH_H #define _ASM_GENERIC_CACHEFLUSH_H +struct mm_struct; +struct vm_area_struct; +struct page; +struct address_space; + /* * The cache doesn't need to be flushed when TLB entries change when * the cache is mapped to physical memory, not virtual memory -- cgit v1.2.3-59-g8ed1b From 3c7858268411dc6473e73965a23854d8f9825424 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 25 Jun 2020 20:30:11 -0700 Subject: media: omap3isp: remove cacheflush.h After mm.h was removed from the asm-generic version of cacheflush.h, s390 allyesconfig shows several warnings of the following nature: In file included from arch/s390/include/generated/asm/cacheflush.h:1, from drivers/media/platform/omap3isp/isp.c:42: include/asm-generic/cacheflush.h:16:42: warning: 'struct mm_struct' declared inside parameter list will not be visible outside of this definition or declaration As Geert and Laurent point out, this driver does not need this header in the two files that include it. Remove it so there are no warnings. Link: http://lkml.kernel.org/r/20200622234740.72825-2-natechancellor@gmail.com Fixes: e0cf615d725c ("asm-generic: don't include <linux/mm.h> in cacheflush.h") Signed-off-by: Nathan Chancellor Suggested-by: Geert Uytterhoeven Suggested-by: Laurent Pinchart Reviewed-by: Laurent Pinchart Reviewed-by: Christoph Hellwig Reviewed-by: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/media/platform/omap3isp/isp.c | 2 -- drivers/media/platform/omap3isp/ispvideo.c | 1 - 2 files changed, 3 deletions(-) diff --git a/drivers/media/platform/omap3isp/isp.c b/drivers/media/platform/omap3isp/isp.c index a4ee6b86663e..b91e472ee764 100644 --- a/drivers/media/platform/omap3isp/isp.c +++ b/drivers/media/platform/omap3isp/isp.c @@ -39,8 +39,6 @@ * Troy Laramy */ -#include <asm/cacheflush.h> - #include #include #include diff --git a/drivers/media/platform/omap3isp/ispvideo.c b/drivers/media/platform/omap3isp/ispvideo.c index 10c214bd0903..1ac9aef70dff 100644 --- a/drivers/media/platform/omap3isp/ispvideo.c +++ b/drivers/media/platform/omap3isp/ispvideo.c @@ -18,7 +18,6 @@ #include #include #include -#include <asm/cacheflush.h> #include #include -- cgit v1.2.3-59-g8ed1b From 8eab7035b231aa3ac27b20ec77f85375e4413083 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Thu, 25 Jun 2020 20:30:13 -0700 Subject: mm/vmalloc.c: fix a warning while make xmldocs This patch fixes the following warning from "make xmldocs": mm/vmalloc.c:1877: warning: Excess function parameter 'prot' description in 'vm_map_ram' This warning has been present since commit d4efd79a81ab ("mm: remove the prot argument from vm_map_ram").
Link: http://lkml.kernel.org/r/20200622152850.140871-1-standby24x7@gmail.com Fixes: d4efd79a81ab ("mm: remove the prot argument from vm_map_ram") Signed-off-by: Masanari Iida Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3091c2ca60df..957a0be77270 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1862,7 +1862,6 @@ EXPORT_SYMBOL(vm_unmap_ram); * @pages: an array of pointers to the pages to be mapped * @count: number of pages * @node: prefer to allocate data structures on this node - * @prot: memory protection to use. PAGE_KERNEL for regular RAM * * If you use this function for less than VMAP_MAX_ALLOC pages, it could be * faster than vmap so it's good. But if you mix long-life and short-life -- cgit v1.2.3-59-g8ed1b From cd324edce598ebddde44162a2aa01321c1261b9e Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 25 Jun 2020 20:30:16 -0700 Subject: mm: memcontrol: handle div0 crash race condition in memory.low Tejun reports seeing rare div0 crashes in memory.low stress testing: RIP: 0010:mem_cgroup_calculate_protection+0xed/0x150 Code: 0f 46 d1 4c 39 d8 72 57 f6 05 16 d6 42 01 40 74 1f 4c 39 d8 76 1a 4c 39 d1 76 15 4c 29 d1 4c 29 d8 4d 29 d9 31 d2 48 0f af c1 <49> f7 f1 49 01 c2 4c 89 96 38 01 00 00 5d c3 48 0f af c7 31 d2 49 RSP: 0018:ffffa14e01d6fcd0 EFLAGS: 00010246 RAX: 000000000243e384 RBX: 0000000000000000 RCX: 0000000000008f4b RDX: 0000000000000000 RSI: ffff8b89bee84000 RDI: 0000000000000000 RBP: ffffa14e01d6fcd0 R08: ffff8b89ca7d40f8 R09: 0000000000000000 R10: 0000000000000000 R11: 00000000006422f7 R12: 0000000000000000 R13: ffff8b89d9617000 R14: ffff8b89bee84000 R15: ffffa14e01d6fdb8 FS: 0000000000000000(0000) GS:ffff8b8a1f1c0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f93b1fc175b CR3: 000000016100a000 CR4: 0000000000340ea0 Call Trace: shrink_node+0x1e5/0x6c0 balance_pgdat+0x32d/0x5f0 kswapd+0x1d7/0x3d0 kthread+0x11c/0x160 ret_from_fork+0x1f/0x30 This happens when parent_usage == siblings_protected. We check that usage is bigger than protected, which should imply parent_usage being bigger than siblings_protected. However, we don't read (or even update) these values atomically, and they can be out of sync as the memory state changes under us. A bit of fluctuation around the target protection isn't a big deal, but we need to handle the div0 case. Check the parent state explicitly to make sure we have a reasonable positive value for the divisor. Link: http://lkml.kernel.org/r/20200615140658.601684-1-hannes@cmpxchg.org Fixes: 8a931f801340 ("mm: memcontrol: recursive memory.low protection") Signed-off-by: Johannes Weiner Reported-by: Tejun Heo Acked-by: Michal Hocko Acked-by: Chris Down Cc: Roman Gushchin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0b38b6ad547d..5de0a9035b5f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6360,11 +6360,16 @@ static unsigned long effective_protection(unsigned long usage, * We're using unprotected memory for the weight so that if * some cgroups DO claim explicit protection, we don't protect * the same bytes twice. + * + * Check both usage and parent_usage against the respective + * protected values. One should imply the other, but they + * aren't read atomically - make sure the division is sane. 
*/ if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)) return ep; - - if (parent_effective > siblings_protected && usage > protected) { + if (parent_effective > siblings_protected && + parent_usage > siblings_protected && + usage > protected) { unsigned long unclaimed; unclaimed = parent_effective - siblings_protected; -- cgit v1.2.3-59-g8ed1b From 3a98990ae2150277ed34d3b248c60e68bf2244b2 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 25 Jun 2020 20:30:19 -0700 Subject: mm/memcontrol.c: add missed css_put() We should put the css reference when memory allocation fails. Link: http://lkml.kernel.org/r/20200614122653.98829-1-songmuchun@bytedance.com Fixes: f0a3a24b532d ("mm: memcg/slab: rework non-root kmem_cache lifecycle management") Signed-off-by: Muchun Song Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Qian Cai Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5de0a9035b5f..da15b686caa8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2772,8 +2772,10 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, return; cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN); - if (!cw) + if (!cw) { + css_put(&memcg->css); return; + } cw->memcg = memcg; cw->cachep = cachep; -- cgit v1.2.3-59-g8ed1b From 03960e33187ae969187281c3aa6c308d7282c468 Mon Sep 17 00:00:00 2001 From: Chris Down Date: Thu, 25 Jun 2020 20:30:22 -0700 Subject: mm/memcontrol.c: prevent missed memory.low load tears Looks like one of these got missed when massaging in f86b810c2610 ("mm, memcg: prevent memory.low load/store tearing") with other linux-mm changes. Link: http://lkml.kernel.org/r/20200612174437.GA391453@chrisdown.name Signed-off-by: Chris Down Reported-by: Michal Koutny Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index da15b686caa8..19622328e4b5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6423,7 +6423,7 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, if (parent == root) { memcg->memory.emin = READ_ONCE(memcg->memory.min); - memcg->memory.elow = memcg->memory.low; + memcg->memory.elow = READ_ONCE(memcg->memory.low); goto out; } @@ -6435,7 +6435,8 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, atomic_long_read(&parent->memory.children_min_usage))); WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage, - memcg->memory.low, READ_ONCE(parent->memory.elow), + READ_ONCE(memcg->memory.low), + READ_ONCE(parent->memory.elow), atomic_long_read(&parent->memory.children_low_usage))); out: -- cgit v1.2.3-59-g8ed1b From f9e559703d0899cc74684128244682182157aa64 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Thu, 25 Jun 2020 20:30:25 -0700 Subject: docs: mm/gup: minor documentation update Now there are five cases; update the documentation to match. Link: http://lkml.kernel.org/r/1592422023-7401-1-git-send-email-jrdr.linux@gmail.com Signed-off-by: Souptick Joarder Reviewed-by: John Hubbard Cc: Jonathan Corbet Cc: Jan Kara Cc: Kirill A. Shutemov
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/core-api/pin_user_pages.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst index 6068266dd303..7ca8c7bac650 100644 --- a/Documentation/core-api/pin_user_pages.rst +++ b/Documentation/core-api/pin_user_pages.rst @@ -33,7 +33,7 @@ all combinations of get*(), pin*(), FOLL_LONGTERM, and more. Also, the pin_user_pages*() APIs are clearly distinct from the get_user_pages*() APIs, so that's a natural dividing line, and a good point to make separate wrapper calls. In other words, use pin_user_pages*() for DMA-pinned pages, and -get_user_pages*() for other cases. There are four cases described later on in +get_user_pages*() for other cases. There are five cases described later on in this document, to further clarify that concept. FOLL_PIN and FOLL_GET are mutually exclusive for a given gup call. However, -- cgit v1.2.3-59-g8ed1b From 2a8bef321749219a6f236dc9f5ee5571a5f1efc6 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 25 Jun 2020 20:30:28 -0700 Subject: doc: THP CoW fault no longer allocate THP Since commit 3917c80280c9 ("thp: change CoW semantics for anon-THP"), the THP CoW page fault path has been rewritten. Now it just splits the pmd and falls back to a base page fault; it doesn't try to allocate a THP anymore. So it is no longer counted in THP_FAULT_ALLOC. Remove the obsolete statement in the documentation about THP CoW allocation to avoid confusion. Link: http://lkml.kernel.org/r/1592424895-5421-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Cc: Kirill A. Shutemov Cc: Zi Yan Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 4 ++-- Documentation/admin-guide/mm/transhuge.rst | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index ce3e05e41724..d09471aa7443 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1356,8 +1356,8 @@ PAGE_SIZE multiple when read back. thp_fault_alloc Number of transparent hugepages which were allocated to satisfy - a page fault, including COW faults. This counter is not present - when CONFIG_TRANSPARENT_HUGEPAGE is not set. + a page fault. This counter is not present when CONFIG_TRANSPARENT_HUGEPAGE + is not set. thp_collapse_alloc Number of transparent hugepages which were allocated to allow diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 6a233e42be08..b2acd0d395ca 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -305,8 +305,7 @@ monitor how successfully the system is providing huge pages for use. thp_fault_alloc is incremented every time a huge page is successfully - allocated to handle a page fault. This applies to both the - first time a page is faulted and for COW faults. + allocated to handle a page fault.
thp_collapse_alloc is incremented by khugepaged when it has found -- cgit v1.2.3-59-g8ed1b From 31d8fcac00fcf4007f3921edc69ab4dcb3abcd4d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 25 Jun 2020 20:30:31 -0700 Subject: mm: workingset: age nonresident information alongside anonymous pages Patch series "fix for "mm: balance LRU lists based on relative thrashing" patchset" This patchset fixes some problems in the patchset "mm: balance LRU lists based on relative thrashing", which is now merged in mainline. Patch "mm: workingset: let cache workingset challenge anon fix" is the result of discussion with Johannes. See the following link. http://lkml.kernel.org/r/20200520232525.798933-6-hannes@cmpxchg.org The other two are minor things found while rebasing my patchset. This patch (of 3): After ("mm: workingset: let cache workingset challenge anon fix"), we compare refault distances to active_file + anon. But the age of the non-resident information is only driven by the file LRU. As a result, we may overestimate the recency of any incoming refaults and activate them too eagerly, causing unnecessary LRU churn in certain situations. Make anon aging drive nonresident age as well to address that. Link: http://lkml.kernel.org/r/1592288204-27734-1-git-send-email-iamjoonsoo.kim@lge.com Link: http://lkml.kernel.org/r/1592288204-27734-2-git-send-email-iamjoonsoo.kim@lge.com Fixes: 34e58cac6d8f2a ("mm: workingset: let cache workingset challenge anon") Reported-by: Joonsoo Kim Signed-off-by: Johannes Weiner Signed-off-by: Joonsoo Kim Cc: Rik van Riel Cc: Minchan Kim Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 4 ++-- include/linux/swap.h | 1 + mm/vmscan.c | 3 +++ mm/workingset.c | 46 +++++++++++++++++++++++++++------------------- 4 files changed, 33 insertions(+), 21 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c4c37fd12104..f6f884970511 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -257,8 +257,8 @@ struct lruvec { */ unsigned long anon_cost; unsigned long file_cost; - /* Evictions & activations on the inactive file list */ - atomic_long_t inactive_age; + /* Non-resident age, driven by LRU movement */ + atomic_long_t nonresident_age; /* Refaults at the time of last reclaim cycle */ unsigned long refaults; /* Various lruvec state flags (enum lruvec_flags) */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 4c5974bb9ba9..5b3216ba39a9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -313,6 +313,7 @@ struct vma_swap_readahead { }; /* linux/mm/workingset.c */ +void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg); void workingset_refault(struct page *page, void *shadow); void workingset_activation(struct page *page); diff --git a/mm/vmscan.c b/mm/vmscan.c index b6d84326bdf2..749d239c62b2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -904,6 +904,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, __delete_from_swap_cache(page, swap); xa_unlock_irqrestore(&mapping->i_pages, flags); put_swap_page(page, swap); + workingset_eviction(page, target_memcg); } else { void (*freepage)(struct page *); void *shadow = NULL; @@ -1884,6 +1885,8 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, list_add(&page->lru, &pages_to_free); } else { nr_moved += nr_pages; + if (PageActive(page)) +
workingset_age_nonresident(lruvec, nr_pages); } } diff --git a/mm/workingset.c b/mm/workingset.c index d481ea452eeb..50b7937bab32 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -156,8 +156,8 @@ * * Implementation * - * For each node's file LRU lists, a counter for inactive evictions - * and activations is maintained (node->inactive_age). + * For each node's LRU lists, a counter for inactive evictions and + * activations is maintained (node->nonresident_age). * * On eviction, a snapshot of this counter (along with some bits to * identify the node) is stored in the now empty page cache @@ -213,7 +213,17 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, *workingsetp = workingset; } -static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat) +/** + * workingset_age_nonresident - age non-resident entries as LRU ages + * @memcg: the lruvec that was aged + * @nr_pages: the number of pages to count + * + * As in-memory pages are aged, non-resident pages need to be aged as + * well, in order for the refault distances later on to be comparable + * to the in-memory dimensions. This function allows reclaim and LRU + * operations to drive the non-resident aging along in parallel. + */ +void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages) { /* * Reclaiming a cgroup means reclaiming all its children in a @@ -227,11 +237,8 @@ static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat) * the root cgroup's, age as well. */ do { - struct lruvec *lruvec; - - lruvec = mem_cgroup_lruvec(memcg, pgdat); - atomic_long_inc(&lruvec->inactive_age); - } while (memcg && (memcg = parent_mem_cgroup(memcg))); + atomic_long_add(nr_pages, &lruvec->nonresident_age); + } while ((lruvec = parent_lruvec(lruvec))); } /** @@ -254,12 +261,11 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); - advance_inactive_age(page_memcg(page), pgdat); - lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + workingset_age_nonresident(lruvec, hpage_nr_pages(page)); /* XXX: target_memcg can be NULL, go through lruvec */ memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); - eviction = atomic_long_read(&lruvec->inactive_age); + eviction = atomic_long_read(&lruvec->nonresident_age); return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); } @@ -309,20 +315,20 @@ void workingset_refault(struct page *page, void *shadow) if (!mem_cgroup_disabled() && !eviction_memcg) goto out; eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); - refault = atomic_long_read(&eviction_lruvec->inactive_age); + refault = atomic_long_read(&eviction_lruvec->nonresident_age); /* * Calculate the refault distance * * The unsigned subtraction here gives an accurate distance - * across inactive_age overflows in most cases. There is a + * across nonresident_age overflows in most cases. There is a * special case: usually, shadow entries have a short lifetime * and are either refaulted or reclaimed along with the inode * before they get too old. But it is not impossible for the - * inactive_age to lap a shadow entry in the field, which can - * then result in a false small refault distance, leading to a - * false activation should this old entry actually refault - * again. 
+	 * nonresident_age to lap a shadow entry in the field, which
+	 * can then result in a false small refault distance, leading
+	 * to a false activation should this old entry actually
+	 * refault again.  However, earlier kernels used to deactivate
 	 * unconditionally with *every* reclaim invocation for the
 	 * longest time, so the occasional inappropriate activation
 	 * leading to pressure on the active list is not a problem.
@@ -359,7 +365,7 @@ void workingset_refault(struct page *page, void *shadow)
 		goto out;

 	SetPageActive(page);
-	advance_inactive_age(memcg, pgdat);
+	workingset_age_nonresident(lruvec, hpage_nr_pages(page));
 	inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);

 	/* Page was active prior to eviction */
@@ -382,6 +388,7 @@ out:
 void workingset_activation(struct page *page)
 {
 	struct mem_cgroup *memcg;
+	struct lruvec *lruvec;

 	rcu_read_lock();
 	/*
@@ -394,7 +401,8 @@ void workingset_activation(struct page *page)
 	memcg = page_memcg_rcu(page);
 	if (!mem_cgroup_disabled() && !memcg)
 		goto out;
-	advance_inactive_age(memcg, page_pgdat(page));
+	lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+	workingset_age_nonresident(lruvec, hpage_nr_pages(page));
 out:
 	rcu_read_unlock();
 }
-- cgit v1.2.3-59-g8ed1b

From cb6868832ede5cd73b346ec11cf89814d26ff7c7 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim
Date: Thu, 25 Jun 2020 20:30:34 -0700
Subject: mm/swap: fix for "mm: workingset: age nonresident information alongside anonymous pages"

A non-file-LRU page can also be activated in mark_page_accessed(), and we
need to count that activation toward nonresident_age.

Note that it's better for this patch to be squashed into the patch "mm:
workingset: age nonresident information alongside anonymous pages".

Link: http://lkml.kernel.org/r/1592288204-27734-3-git-send-email-iamjoonsoo.kim@lge.com
Signed-off-by: Joonsoo Kim
Acked-by: Johannes Weiner
Cc: Joonsoo Kim
Cc: Michal Hocko
Cc: Minchan Kim
Cc: Rik van Riel
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/swap.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mm/swap.c b/mm/swap.c
index dbcab84c6fce..a82efc33411f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -443,8 +443,7 @@ void mark_page_accessed(struct page *page)
 		else
 			__lru_cache_activate_page(page);
 		ClearPageReferenced(page);
-		if (page_is_file_lru(page))
-			workingset_activation(page);
+		workingset_activation(page);
 	}
 	if (page_is_idle(page))
 		clear_page_idle(page);
-- cgit v1.2.3-59-g8ed1b

From 0076f029cb2906d32baf3bf4401ef09663071d16 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim
Date: Thu, 25 Jun 2020 20:30:37 -0700
Subject: mm/memory: fix IO cost for anonymous page

With a synchronous IO swap device, swap-in is handled directly in the
fault code.  Since the IO cost is not accounted there, LRU balancing can
be wrongly biased.  Fix this by counting the IO cost in the fault code.
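For readers unfamiliar with the cost model being fixed here, a toy
userspace sketch of the idea (illustrative only — the toy_* names are
invented; the real counters are the anon_cost/file_cost fields of struct
lruvec, updated via lru_note_cost()):

    /* Toy model of relative LRU cost accounting, compilable as plain C. */
    #include <stdbool.h>
    #include <stdio.h>

    struct toy_lruvec {
            unsigned long anon_cost;
            unsigned long file_cost;
    };

    /* Charge IO cost to the LRU type that caused it, roughly what
     * lru_note_cost() does; the synchronous swap-in path was missing
     * this accounting entirely. */
    static void toy_note_cost(struct toy_lruvec *lruvec, bool file,
                              unsigned long nr_pages)
    {
            if (file)
                    lruvec->file_cost += nr_pages;
            else
                    lruvec->anon_cost += nr_pages;
    }

    int main(void)
    {
            struct toy_lruvec lruvec = { 0, 0 };

            toy_note_cost(&lruvec, false, 1); /* one anon page swapped in */
            printf("anon_cost=%lu file_cost=%lu\n",
                   lruvec.anon_cost, lruvec.file_cost);
            return 0;
    }

Reclaim compares these relative costs to decide which list to shrink, so
an LRU type whose IO is never charged looks artificially cheap.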
Link: http://lkml.kernel.org/r/1592288204-27734-4-git-send-email-iamjoonsoo.kim@lge.com
Fixes: 314b57fb0460 ("mm: balance LRU lists based on relative thrashing")
Signed-off-by: Joonsoo Kim
Acked-by: Johannes Weiner
Cc: Joonsoo Kim
Cc: Michal Hocko
Cc: Minchan Kim
Cc: Rik van Riel
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/mm/memory.c b/mm/memory.c
index 0e5b25c9b151..87ec87cdc1ff 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3146,6 +3146,14 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 				goto out_page;
 			}

+			/*
+			 * XXX: Move to lru_cache_add() when it
+			 * supports new vs putback
+			 */
+			spin_lock_irq(&page_pgdat(page)->lru_lock);
+			lru_note_cost_page(page);
+			spin_unlock_irq(&page_pgdat(page)->lru_lock);
+
 			lru_cache_add(page);
 			swap_readpage(page, true);
 		}
-- cgit v1.2.3-59-g8ed1b

From 800e26b81311dcc0080b8784f80620bb8f2baaa5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Thu, 25 Jun 2020 20:30:40 -0700
Subject: x86/hyperv: allocate the hypercall page with only read and execute bits

Patch series "fix a hyperv W^X violation and remove vmalloc_exec"

Dexuan reported a W^X violation caused by the hyperv hypercall page,
which switched to being allocated using vmalloc_exec.

The problem is that PAGE_KERNEL_EXEC as used by vmalloc_exec actually
sets writable permissions in the pte.  This series fixes the issue by
switching to the low-level __vmalloc_node_range interface, which allows
specifying more detailed permissions instead.  It then also open codes
the other two callers and removes the somewhat confusing vmalloc_exec
interface.

Peter noted that the hyperv hypercall page allocation also has another
long standing issue in that it shouldn't use the full vmalloc space but
just the module space.  This issue is so far theoretical, as the
allocation is done early in the boot process.  I plan to fix it with
another bigger series for 5.9.

This patch (of 3):

Avoid a W^X violation caused by the fact that PAGE_KERNEL_EXEC includes
the writable bit.

For this resurrect the removed PAGE_KERNEL_RX definition, but as
PAGE_KERNEL_ROX to match arm64 and powerpc.
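To make the violation concrete — a hedged sketch, not the kernel's actual
checker — W^X means no mapping may be both writable and executable; on
x86 that means writable (RW set) with the NX bit clear:

    /* Minimal W^X predicate, mirroring the bit patterns in the
     * pgtable_types.h hunk below: __PAGE_KERNEL_EXEC has __RW and no
     * __NX, __PAGE_KERNEL_ROX has neither. Illustrative only. */
    #include <stdbool.h>
    #include <stdio.h>

    struct toy_prot {
            bool writable;  /* _PAGE_RW set */
            bool no_exec;   /* _PAGE_NX set */
    };

    static bool wx_violation(struct toy_prot p)
    {
            return p.writable && !p.no_exec; /* writable AND executable */
    }

    int main(void)
    {
            struct toy_prot kernel_exec = { .writable = true,  .no_exec = false };
            struct toy_prot kernel_rox  = { .writable = false, .no_exec = false };

            printf("PAGE_KERNEL_EXEC violates W^X: %d\n", wx_violation(kernel_exec));
            printf("PAGE_KERNEL_ROX  violates W^X: %d\n", wx_violation(kernel_rox));
            return 0;
    }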
Link: http://lkml.kernel.org/r/20200618064307.32739-2-hch@lst.de
Fixes: 78bb17f76edc ("x86/hyperv: use vmalloc_exec for the hypercall page")
Signed-off-by: Christoph Hellwig
Reported-by: Dexuan Cui
Tested-by: Vitaly Kuznetsov
Acked-by: Wei Liu
Acked-by: Peter Zijlstra (Intel)
Cc: Catalin Marinas
Cc: Will Deacon
Cc: Jessica Yu
Cc: David Hildenbrand
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/x86/hyperv/hv_init.c            | 4 +++-
 arch/x86/include/asm/pgtable_types.h | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index a54c6a401581..2bdc72e6890e 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -375,7 +375,9 @@ void __init hyperv_init(void)
 	guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0);
 	wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id);

-	hv_hypercall_pg = vmalloc_exec(PAGE_SIZE);
+	hv_hypercall_pg = __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START,
+			VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_ROX,
+			VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, __builtin_return_address(0));
 	if (hv_hypercall_pg == NULL) {
 		wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
 		goto remove_cpuhp_state;

diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 2da1f95b88d7..816b31c68550 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -194,6 +194,7 @@ enum page_cache_mode {
 #define _PAGE_TABLE_NOENC	(__PP|__RW|_USR|___A|   0|___D|   0|   0)
 #define _PAGE_TABLE		(__PP|__RW|_USR|___A|   0|___D|   0|   0| _ENC)
 #define __PAGE_KERNEL_RO	(__PP|   0|   0|___A|__NX|___D|   0|___G)
+#define __PAGE_KERNEL_ROX	(__PP|   0|   0|___A|   0|___D|   0|___G)
 #define __PAGE_KERNEL_NOCACHE	(__PP|__RW|   0|___A|__NX|___D|   0|___G| __NC)
 #define __PAGE_KERNEL_VVAR	(__PP|   0|_USR|___A|__NX|___D|   0|___G)
 #define __PAGE_KERNEL_LARGE	(__PP|__RW|   0|___A|__NX|___D|_PSE|___G)
@@ -219,6 +220,7 @@ enum page_cache_mode {
 #define PAGE_KERNEL_RO		__pgprot_mask(__PAGE_KERNEL_RO         | _ENC)
 #define PAGE_KERNEL_EXEC	__pgprot_mask(__PAGE_KERNEL_EXEC       | _ENC)
 #define PAGE_KERNEL_EXEC_NOENC	__pgprot_mask(__PAGE_KERNEL_EXEC       |    0)
+#define PAGE_KERNEL_ROX		__pgprot_mask(__PAGE_KERNEL_ROX        | _ENC)
 #define PAGE_KERNEL_NOCACHE	__pgprot_mask(__PAGE_KERNEL_NOCACHE    | _ENC)
 #define PAGE_KERNEL_LARGE	__pgprot_mask(__PAGE_KERNEL_LARGE      | _ENC)
 #define PAGE_KERNEL_LARGE_EXEC	__pgprot_mask(__PAGE_KERNEL_LARGE_EXEC | _ENC)
-- cgit v1.2.3-59-g8ed1b

From 10d5e97c1bf816facbc7c431c6caf47ee35fc1ed Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Thu, 25 Jun 2020 20:30:43 -0700
Subject: arm64: use PAGE_KERNEL_ROX directly in alloc_insn_page

Use PAGE_KERNEL_ROX directly instead of allocating RWX and setting the
page read-only just after the allocation.
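The same allocate-then-restrict window exists in userspace, which makes
for a runnable analogy (illustrative only — the kernel case uses vmalloc
page protections, not mmap): between creating a writable mapping and the
mprotect() call, the memory is briefly writable and executable at once.

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    int main(void)
    {
            size_t len = 4096;

            /* Old pattern: writable+executable first, restricted after.
             * Between memset() and mprotect() the region is both W and X.
             * (Hardened systems may refuse the W+X request outright.) */
            void *p = mmap(NULL, len, PROT_READ | PROT_WRITE | PROT_EXEC,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (p == MAP_FAILED)
                    return 1;
            memset(p, 0xcc, len);
            mprotect(p, len, PROT_READ | PROT_EXEC);

            /* New pattern: request the final, non-writable permissions
             * up front, so no W+X window ever exists. */
            void *q = mmap(NULL, len, PROT_READ | PROT_EXEC,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (q == MAP_FAILED)
                    return 1;

            printf("rwx-then-rx at %p, rx-from-start at %p\n", p, q);
            return 0;
    }

In the kernel case the ROX page is never written through this mapping at
all: the instruction slots are populated via the text patching machinery,
which is why requesting PAGE_KERNEL_ROX up front is safe here.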
Link: http://lkml.kernel.org/r/20200618064307.32739-3-hch@lst.de
Signed-off-by: Christoph Hellwig
Acked-by: David Hildenbrand
Acked-by: Peter Zijlstra (Intel)
Cc: Catalin Marinas
Cc: Dexuan Cui
Cc: Jessica Yu
Cc: Vitaly Kuznetsov
Cc: Wei Liu
Cc: Will Deacon
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/arm64/kernel/probes/kprobes.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c
index d1c95dcf1d78..cbe49cd117cf 100644
--- a/arch/arm64/kernel/probes/kprobes.c
+++ b/arch/arm64/kernel/probes/kprobes.c
@@ -120,15 +120,9 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)

 void *alloc_insn_page(void)
 {
-	void *page;
-
-	page = vmalloc_exec(PAGE_SIZE);
-	if (page) {
-		set_memory_ro((unsigned long)page, 1);
-		set_vm_flush_reset_perms(page);
-	}
-
-	return page;
+	return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END,
+			GFP_KERNEL, PAGE_KERNEL_ROX, VM_FLUSH_RESET_PERMS,
+			NUMA_NO_NODE, __builtin_return_address(0));
 }

 /* arm kprobe: install breakpoint in text */
-- cgit v1.2.3-59-g8ed1b

From 7a0e27b2a0ce2735e27e21ebc8b777550fe0ed81 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Thu, 25 Jun 2020 20:30:47 -0700
Subject: mm: remove vmalloc_exec

Merge vmalloc_exec into its only caller.  Note that for !CONFIG_MMU,
__vmalloc_node_range maps to __vmalloc, which directly clears the
__GFP_HIGHMEM added by the vmalloc_exec stub anyway.

Link: http://lkml.kernel.org/r/20200618064307.32739-4-hch@lst.de
Signed-off-by: Christoph Hellwig
Reviewed-by: David Hildenbrand
Acked-by: Peter Zijlstra (Intel)
Cc: Catalin Marinas
Cc: Dexuan Cui
Cc: Jessica Yu
Cc: Vitaly Kuznetsov
Cc: Wei Liu
Cc: Will Deacon
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/vmalloc.h |  1 -
 kernel/module.c         |  4 +++-
 mm/nommu.c              | 17 -----------------
 mm/vmalloc.c            | 20 --------------------
 4 files changed, 3 insertions(+), 39 deletions(-)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 48bb681e6c2a..0221f852a7e1 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -106,7 +106,6 @@ extern void *vzalloc(unsigned long size);
 extern void *vmalloc_user(unsigned long size);
 extern void *vmalloc_node(unsigned long size, int node);
 extern void *vzalloc_node(unsigned long size, int node);
-extern void *vmalloc_exec(unsigned long size);
 extern void *vmalloc_32(unsigned long size);
 extern void *vmalloc_32_user(unsigned long size);
 extern void *__vmalloc(unsigned long size, gfp_t gfp_mask);

diff --git a/kernel/module.c b/kernel/module.c
index e8a198588f26..0c6573b98c36 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2783,7 +2783,9 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)

 void * __weak module_alloc(unsigned long size)
 {
-	return vmalloc_exec(size);
+	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+			GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
+			NUMA_NO_NODE, __builtin_return_address(0));
 }

 bool __weak module_init_section(const char *name)

diff --git a/mm/nommu.c b/mm/nommu.c
index cdcad5d61dd1..f32a69095d50 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -290,23 +290,6 @@ void *vzalloc_node(unsigned long size, int node)
 }
 EXPORT_SYMBOL(vzalloc_node);

-/**
- * vmalloc_exec - allocate virtually contiguous, executable memory
- * @size: allocation size
- *
- * Kernel-internal function to allocate enough pages to cover @size
- * the page level allocator and map them into contiguous and
- * executable kernel virtual space.
- *
- * For tight control over page level allocator and protection flags
- * use __vmalloc() instead.
- */
-
-void *vmalloc_exec(unsigned long size)
-{
-	return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM);
-}
-
 /**
  * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
  * @size: allocation size

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 957a0be77270..5a2b55c8dd9a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2695,26 +2695,6 @@ void *vzalloc_node(unsigned long size, int node)
 }
 EXPORT_SYMBOL(vzalloc_node);

-/**
- * vmalloc_exec - allocate virtually contiguous, executable memory
- * @size: allocation size
- *
- * Kernel-internal function to allocate enough pages to cover @size
- * the page level allocator and map them into contiguous and
- * executable kernel virtual space.
- *
- * For tight control over page level allocator and protection flags
- * use __vmalloc() instead.
- *
- * Return: pointer to the allocated memory or %NULL on error
- */
-void *vmalloc_exec(unsigned long size)
-{
-	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
-			GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
-			NUMA_NO_NODE, __builtin_return_address(0));
-}
-
 #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
 #define GFP_VMALLOC32	(GFP_DMA32 | GFP_KERNEL)
 #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
-- cgit v1.2.3-59-g8ed1b

From b7e3debdd0408c0dca5d4750371afa5003f792dc Mon Sep 17 00:00:00 2001
From: Ben Widawsky
Date: Thu, 25 Jun 2020 20:30:51 -0700
Subject: mm/memory_hotplug.c: fix false softlockup during pfn range removal

When working with very large nodes, poisoning the struct pages (of which
there will be very many) can take a very long time.  If the system is
using voluntary preemption, nothing in this path yields the CPU, so the
softlockup watchdog will flag the task as stuck.  This patch addresses
the issue by periodically yielding the CPU via cond_resched(), like
__remove_pages() does.

The poisoning behavior was introduced in v5.6 with:

commit d33695b16a9f ("mm/memory_hotplug: poison memmap in remove_pfn_range_from_zone()")

Alternatively, init_page_poison() could do the cond_resched() itself, but
the caller of init_page_poison() is what actually knows whether or not it
should relax its own priority.

Based on Dan's notes, I think this is perfectly safe:

commit f931ab479dd2 ("mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done}")

Aside from fixing the lockup, it is also a friendlier thing to do on
lower-core-count systems that might wipe out large chunks of hotplug
memory (probably not a very common case).

Fixes this kind of splat:
 watchdog: BUG: soft lockup - CPU#46 stuck for 22s! [daxctl:9922]
 irq event stamp: 138450
 hardirqs last enabled at (138449): [] trace_hardirqs_on_thunk+0x1a/0x1c
 hardirqs last disabled at (138450): [] trace_hardirqs_off_thunk+0x1a/0x1c
 softirqs last enabled at (138448): [] __do_softirq+0x347/0x456
 softirqs last disabled at (138443): [] irq_exit+0x7d/0xb0
 CPU: 46 PID: 9922 Comm: daxctl Not tainted 5.7.0-BEN-14238-g373c6049b336 #30
 Hardware name: Intel Corporation PURLEY/PURLEY, BIOS PLYXCRB1.86B.0578.D07.1902280810 02/28/2019
 RIP: 0010:memset_erms+0x9/0x10
 Code: c1 e9 03 40 0f b6 f6 48 b8 01 01 01 01 01 01 01 01 48 0f af c6 f3 48 ab 89 d1 f3 aa 4c 89 c8 c3 90 49 89 f9 40 88 f0 48 89 d1 aa 4c 89 c8 c3 90 49 89 fa 40 0f b6 ce 48 b8 01 01 01 01 01 01
 Call Trace:
  remove_pfn_range_from_zone+0x3a/0x380
  memunmap_pages+0x17f/0x280
  release_nodes+0x22a/0x260
  __device_release_driver+0x172/0x220
  device_driver_detach+0x3e/0xa0
  unbind_store+0x113/0x130
  kernfs_fop_write+0xdc/0x1c0
  vfs_write+0xde/0x1d0
  ksys_write+0x58/0xd0
  do_syscall_64+0x5a/0x120
  entry_SYSCALL_64_after_hwframe+0x49/0xb3
 Built 2 zonelists, mobility grouping on.  Total pages: 49050381
 Policy zone: Normal
 Built 3 zonelists, mobility grouping on.  Total pages: 49312525
 Policy zone: Normal

David said:
 "It really only is an issue for devmem.  Ordinary hotplugged system
  memory is not affected (onlined/offlined in memory block granularity)."

Link: http://lkml.kernel.org/r/20200619231213.1160351-1-ben.widawsky@intel.com
Fixes: d33695b16a9f ("mm/memory_hotplug: poison memmap in remove_pfn_range_from_zone()")
Signed-off-by: Ben Widawsky
Reported-by: "Scargall, Steve"
Reported-by: Ben Widawsky
Acked-by: David Hildenbrand
Cc: Dan Williams
Cc: Vishal Verma
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory_hotplug.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9b34e03e730a..da374cd3d45b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -471,11 +471,20 @@ void __ref remove_pfn_range_from_zone(struct zone *zone,
 				      unsigned long start_pfn,
 				      unsigned long nr_pages)
 {
+	const unsigned long end_pfn = start_pfn + nr_pages;
 	struct pglist_data *pgdat = zone->zone_pgdat;
-	unsigned long flags;
+	unsigned long pfn, cur_nr_pages, flags;

 	/* Poison struct pages because they are now uninitialized again. */
-	page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);
+	for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) {
+		cond_resched();
+
+		/* Select all remaining pages up to the next section boundary */
+		cur_nr_pages =
+			min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn);
+		page_init_poison(pfn_to_page(pfn),
+				 sizeof(struct page) * cur_nr_pages);
+	}

 #ifdef CONFIG_ZONE_DEVICE
 	/*
-- cgit v1.2.3-59-g8ed1b

From 19ef1f9dfeffe2bfbeaf624277a179b1c0eef201 Mon Sep 17 00:00:00 2001
From: Luc Van Oostenryck
Date: Thu, 25 Jun 2020 20:30:54 -0700
Subject: MAINTAINERS: update info for sparse

Update the info for sparse.
More specifically:
- change the W entry to point to sparse.docs.kernel.org
- add Q & B entries (patchwork & bugzilla)

Link: http://lkml.kernel.org/r/20200621144204.53938-1-luc.vanoostenryck@gmail.com
Signed-off-by: Luc Van Oostenryck
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 MAINTAINERS | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 7b5ffd646c6b..f7f4513f09b6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16058,8 +16058,10 @@ SPARSE CHECKER
 M:	"Luc Van Oostenryck"
 L:	linux-sparse@vger.kernel.org
 S:	Maintained
-W:	https://sparse.wiki.kernel.org/
+W:	https://sparse.docs.kernel.org/
 T:	git git://git.kernel.org/pub/scm/devel/sparse/sparse.git
+Q:	https://patchwork.kernel.org/project/linux-sparse/list/
+B:	https://bugzilla.kernel.org/enter_bug.cgi?component=Sparse&product=Tools
 F:	include/linux/compiler.h

 SPEAR CLOCK FRAMEWORK SUPPORT
-- cgit v1.2.3-59-g8ed1b