From cea582247a0aea78b4674c32833c10b8820fcdbe Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 12 May 2017 15:46:44 -0700 Subject: Tigran has moved Cc: Tigran Aivazian Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/bfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index f2deec0a62f0..25e312cb6071 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -1,7 +1,7 @@ /* * fs/bfs/inode.c * BFS superblock and inode operations. - * Copyright (C) 1999-2006 Tigran Aivazian + * Copyright (C) 1999-2006 Tigran Aivazian * From fs/minix, Copyright (C) 1991, 1992 Linus Torvalds. * * Made endianness-clean by Andrew Stribblehill , 2005. @@ -19,7 +19,7 @@ #include #include "bfs.h" -MODULE_AUTHOR("Tigran Aivazian "); +MODULE_AUTHOR("Tigran Aivazian "); MODULE_DESCRIPTION("SCO UnixWare BFS filesystem for Linux"); MODULE_LICENSE("GPL"); -- cgit v1.2.3-59-g8ed1b From 4636e70bb0a8b871998b6841a2e4b205cf2bc863 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 12 May 2017 15:46:47 -0700 Subject: dax: prevent invalidation of mapped DAX entries Patch series "mm,dax: Fix data corruption due to mmap inconsistency", v4. This series fixes data corruption that can happen for DAX mounts when page faults race with write(2) and as a result page tables get out of sync with block mappings in the filesystem and thus data seen through mmap is different from data seen through read(2). The series passes testing with t_mmap_stale test program from Ross and also other mmap related tests on DAX filesystem. This patch (of 4): dax_invalidate_mapping_entry() currently removes DAX exceptional entries only if they are clean and unlocked. This is done via: invalidate_mapping_pages() invalidate_exceptional_entry() dax_invalidate_mapping_entry() However, for page cache pages removed in invalidate_mapping_pages() there is an additional criteria which is that the page must not be mapped. This is noted in the comments above invalidate_mapping_pages() and is checked in invalidate_inode_page(). For DAX entries this means that we can can end up in a situation where a DAX exceptional entry, either a huge zero page or a regular DAX entry, could end up mapped but without an associated radix tree entry. This is inconsistent with the rest of the DAX code and with what happens in the page cache case. We aren't able to unmap the DAX exceptional entry because according to its comments invalidate_mapping_pages() isn't allowed to block, and unmap_mapping_range() takes a write lock on the mapping->i_mmap_rwsem. Since we essentially never have unmapped DAX entries to evict from the radix tree, just remove dax_invalidate_mapping_entry(). Fixes: c6dcf52c23d2 ("mm: Invalidate DAX radix tree entries only if appropriate") Link: http://lkml.kernel.org/r/20170510085419.27601-2-jack@suse.cz Signed-off-by: Ross Zwisler Signed-off-by: Jan Kara Reported-by: Jan Kara Cc: Dan Williams Cc: [4.10+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 29 ----------------------------- include/linux/dax.h | 1 - mm/truncate.c | 9 +++------ 3 files changed, 3 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 66d79067eedf..38deebb8c86e 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -460,35 +460,6 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) return ret; } -/* - * Invalidate exceptional DAX entry if easily possible. This handles DAX - * entries for invalidate_inode_pages() so we evict the entry only if we can - * do so without blocking. - */ -int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index) -{ - int ret = 0; - void *entry, **slot; - struct radix_tree_root *page_tree = &mapping->page_tree; - - spin_lock_irq(&mapping->tree_lock); - entry = __radix_tree_lookup(page_tree, index, NULL, &slot); - if (!entry || !radix_tree_exceptional_entry(entry) || - slot_locked(mapping, slot)) - goto out; - if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || - radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) - goto out; - radix_tree_delete(page_tree, index); - mapping->nrexceptional--; - ret = 1; -out: - spin_unlock_irq(&mapping->tree_lock); - if (ret) - dax_wake_mapping_entry_waiter(mapping, index, entry, true); - return ret; -} - /* * Invalidate exceptional DAX entry if it is clean. */ diff --git a/include/linux/dax.h b/include/linux/dax.h index d3158e74a59e..d1236d16ef00 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -63,7 +63,6 @@ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, const struct iomap_ops *ops); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); -int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); void dax_wake_mapping_entry_waiter(struct address_space *mapping, diff --git a/mm/truncate.c b/mm/truncate.c index 83a059e8cd1d..706cff171a15 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -67,17 +67,14 @@ static void truncate_exceptional_entry(struct address_space *mapping, /* * Invalidate exceptional entry if easily possible. This handles exceptional - * entries for invalidate_inode_pages() so for DAX it evicts only unlocked and - * clean entries. + * entries for invalidate_inode_pages(). */ static int invalidate_exceptional_entry(struct address_space *mapping, pgoff_t index, void *entry) { - /* Handled by shmem itself */ - if (shmem_mapping(mapping)) + /* Handled by shmem itself, or for DAX we do nothing. */ + if (shmem_mapping(mapping) || dax_mapping(mapping)) return 1; - if (dax_mapping(mapping)) - return dax_invalidate_mapping_entry(mapping, index); clear_shadow_entry(mapping, index, entry); return 1; } -- cgit v1.2.3-59-g8ed1b From cd656375f94632d7b5af57bf67b7b5c0270c591c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 12 May 2017 15:46:50 -0700 Subject: mm: fix data corruption due to stale mmap reads Currently, we didn't invalidate page tables during invalidate_inode_pages2() for DAX. That could result in e.g. 2MiB zero page being mapped into page tables while there were already underlying blocks allocated and thus data seen through mmap were different from data seen by read(2). The following sequence reproduces the problem: - open an mmap over a 2MiB hole - read from a 2MiB hole, faulting in a 2MiB zero page - write to the hole with write(3p). The write succeeds but we incorrectly leave the 2MiB zero page mapping intact. - via the mmap, read the data that was just written. Since the zero page mapping is still intact we read back zeroes instead of the new data. Fix the problem by unconditionally calling invalidate_inode_pages2_range() in dax_iomap_actor() for new block allocations and by properly invalidating page tables in invalidate_inode_pages2_range() for DAX mappings. Fixes: c6dcf52c23d2d3fb5235cec42d7dd3f786b87d55 Link: http://lkml.kernel.org/r/20170510085419.27601-3-jack@suse.cz Signed-off-by: Jan Kara Signed-off-by: Ross Zwisler Cc: Dan Williams Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 2 +- mm/truncate.c | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 38deebb8c86e..123d9903c77d 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1015,7 +1015,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, * into page tables. We have to tear down these mappings so that data * written by write(2) is visible in mmap. */ - if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) { + if (iomap->flags & IOMAP_F_NEW) { invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_SHIFT, (end - 1) >> PAGE_SHIFT); diff --git a/mm/truncate.c b/mm/truncate.c index 706cff171a15..6479ed2afc53 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -686,7 +686,17 @@ int invalidate_inode_pages2_range(struct address_space *mapping, cond_resched(); index++; } - + /* + * For DAX we invalidate page tables after invalidating radix tree. We + * could invalidate page tables while invalidating each entry however + * that would be expensive. And doing range unmapping before doesn't + * work as we have no cheap way to find whether radix tree entry didn't + * get remapped later. + */ + if (dax_mapping(mapping)) { + unmap_mapping_range(mapping, (loff_t)start << PAGE_SHIFT, + (loff_t)(end - start + 1) << PAGE_SHIFT, 0); + } out: cleancache_invalidate_inode(mapping); return ret; -- cgit v1.2.3-59-g8ed1b From fb26a1cbed8c90025572d48bc9eabe59f7571e88 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 12 May 2017 15:46:54 -0700 Subject: ext4: return to starting transaction in ext4_dax_huge_fault() DAX will return to locking exceptional entry before mapping blocks for a page fault to fix possible races with concurrent writes. To avoid lock inversion between exceptional entry lock and transaction start, start the transaction already in ext4_dax_huge_fault(). Fixes: 9f141d6ef6258a3a37a045842d9ba7e68f368956 Link: http://lkml.kernel.org/r/20170510085419.27601-4-jack@suse.cz Signed-off-by: Jan Kara Cc: Ross Zwisler Cc: Dan Williams Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext4/file.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index cefa9835f275..831fd6beebf0 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -257,6 +257,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, enum page_entry_size pe_size) { int result; + handle_t *handle = NULL; struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; bool write = vmf->flags & FAULT_FLAG_WRITE; @@ -264,12 +265,24 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, if (write) { sb_start_pagefault(sb); file_update_time(vmf->vma->vm_file); + down_read(&EXT4_I(inode)->i_mmap_sem); + handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, + EXT4_DATA_TRANS_BLOCKS(sb)); + } else { + down_read(&EXT4_I(inode)->i_mmap_sem); } - down_read(&EXT4_I(inode)->i_mmap_sem); - result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops); - up_read(&EXT4_I(inode)->i_mmap_sem); - if (write) + if (!IS_ERR(handle)) + result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops); + else + result = VM_FAULT_SIGBUS; + if (write) { + if (!IS_ERR(handle)) + ext4_journal_stop(handle); + up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); + } else { + up_read(&EXT4_I(inode)->i_mmap_sem); + } return result; } -- cgit v1.2.3-59-g8ed1b From 13e451fdc1af05568ea379d71c02a126295d2244 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 12 May 2017 15:46:57 -0700 Subject: dax: fix data corruption when fault races with write Currently DAX read fault can race with write(2) in the following way: CPU1 - write(2) CPU2 - read fault dax_iomap_pte_fault() ->iomap_begin() - sees hole dax_iomap_rw() iomap_apply() ->iomap_begin - allocates blocks dax_iomap_actor() invalidate_inode_pages2_range() - there's nothing to invalidate grab_mapping_entry() - we add zero page in the radix tree and map it to page tables The result is that hole page is mapped into page tables (and thus zeros are seen in mmap) while file has data written in that place. Fix the problem by locking exception entry before mapping blocks for the fault. That way we are sure invalidate_inode_pages2_range() call for racing write will either block on entry lock waiting for the fault to finish (and unmap stale page tables after that) or read fault will see already allocated blocks by write(2). Fixes: 9f141d6ef6258a3a37a045842d9ba7e68f368956 Link: http://lkml.kernel.org/r/20170510085419.27601-5-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Ross Zwisler Cc: Dan Williams Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 123d9903c77d..32f020c9cedf 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1148,6 +1148,12 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) flags |= IOMAP_WRITE; + entry = grab_mapping_entry(mapping, vmf->pgoff, 0); + if (IS_ERR(entry)) { + vmf_ret = dax_fault_return(PTR_ERR(entry)); + goto out; + } + /* * Note that we don't bother to use iomap_apply here: DAX required * the file system block size to be equal the page size, which means @@ -1156,17 +1162,11 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); if (error) { vmf_ret = dax_fault_return(error); - goto out; + goto unlock_entry; } if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { - vmf_ret = dax_fault_return(-EIO); /* fs corruption? */ - goto finish_iomap; - } - - entry = grab_mapping_entry(mapping, vmf->pgoff, 0); - if (IS_ERR(entry)) { - vmf_ret = dax_fault_return(PTR_ERR(entry)); - goto finish_iomap; + error = -EIO; /* fs corruption? */ + goto error_finish_iomap; } sector = dax_iomap_sector(&iomap, pos); @@ -1188,13 +1188,13 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, } if (error) - goto error_unlock_entry; + goto error_finish_iomap; __SetPageUptodate(vmf->cow_page); vmf_ret = finish_fault(vmf); if (!vmf_ret) vmf_ret = VM_FAULT_DONE_COW; - goto unlock_entry; + goto finish_iomap; } switch (iomap.type) { @@ -1214,7 +1214,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, case IOMAP_HOLE: if (!(vmf->flags & FAULT_FLAG_WRITE)) { vmf_ret = dax_load_hole(mapping, &entry, vmf); - goto unlock_entry; + goto finish_iomap; } /*FALLTHRU*/ default: @@ -1223,10 +1223,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, break; } - error_unlock_entry: + error_finish_iomap: vmf_ret = dax_fault_return(error) | major; - unlock_entry: - put_locked_mapping_entry(mapping, vmf->pgoff, entry); finish_iomap: if (ops->iomap_end) { int copied = PAGE_SIZE; @@ -1241,7 +1239,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, */ ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); } -out: + unlock_entry: + put_locked_mapping_entry(mapping, vmf->pgoff, entry); + out: trace_dax_pte_fault_done(inode, vmf, vmf_ret); return vmf_ret; } -- cgit v1.2.3-59-g8ed1b From 876f29460cbd4086b43475890c1bf2488fa11d40 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 12 May 2017 15:47:00 -0700 Subject: dax: fix PMD data corruption when fault races with write This is based on a patch from Jan Kara that fixed the equivalent race in the DAX PTE fault path. Currently DAX PMD read fault can race with write(2) in the following way: CPU1 - write(2) CPU2 - read fault dax_iomap_pmd_fault() ->iomap_begin() - sees hole dax_iomap_rw() iomap_apply() ->iomap_begin - allocates blocks dax_iomap_actor() invalidate_inode_pages2_range() - there's nothing to invalidate grab_mapping_entry() - we add huge zero page to the radix tree and map it to page tables The result is that hole page is mapped into page tables (and thus zeros are seen in mmap) while file has data written in that place. Fix the problem by locking exception entry before mapping blocks for the fault. That way we are sure invalidate_inode_pages2_range() call for racing write will either block on entry lock waiting for the fault to finish (and unmap stale page tables after that) or read fault will see already allocated blocks by write(2). Fixes: 9f141d6ef6258 ("dax: Call ->iomap_begin without entry lock during dax fault") Link: http://lkml.kernel.org/r/20170510172700.18991-1-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Dan Williams Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 32f020c9cedf..93ae87297ffa 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1387,6 +1387,16 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, if ((pgoff | PG_PMD_COLOUR) > max_pgoff) goto fallback; + /* + * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX + * PMD or a HZP entry. If it can't (because a 4k page is already in + * the tree, for instance), it will return -EEXIST and we just fall + * back to 4k entries. + */ + entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); + if (IS_ERR(entry)) + goto fallback; + /* * Note that we don't use iomap_apply here. We aren't doing I/O, only * setting up a mapping, so really we're using iomap_begin() as a way @@ -1395,21 +1405,11 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pos = (loff_t)pgoff << PAGE_SHIFT; error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); if (error) - goto fallback; + goto unlock_entry; if (iomap.offset + iomap.length < pos + PMD_SIZE) goto finish_iomap; - /* - * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX - * PMD or a HZP entry. If it can't (because a 4k page is already in - * the tree, for instance), it will return -EEXIST and we just fall - * back to 4k entries. - */ - entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); - if (IS_ERR(entry)) - goto finish_iomap; - switch (iomap.type) { case IOMAP_MAPPED: result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry); @@ -1417,7 +1417,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (WARN_ON_ONCE(write)) - goto unlock_entry; + break; result = dax_pmd_load_hole(vmf, &iomap, &entry); break; default: @@ -1425,8 +1425,6 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, break; } - unlock_entry: - put_locked_mapping_entry(mapping, pgoff, entry); finish_iomap: if (ops->iomap_end) { int copied = PMD_SIZE; @@ -1442,6 +1440,8 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags, &iomap); } + unlock_entry: + put_locked_mapping_entry(mapping, pgoff, entry); fallback: if (result == VM_FAULT_FALLBACK) { split_huge_pmd(vma, vmf->pmd, vmf->address); -- cgit v1.2.3-59-g8ed1b