From fe2b51145c9ffd5a49013fe180e42e92ef0e6df9 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 4 Dec 2017 19:33:30 -0500 Subject: nilfs2: Use xa_erase_irq This code simply opencoded xa_erase_irq(). Signed-off-by: Matthew Wilcox --- fs/nilfs2/btnode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index de99db518571..f2129a5d9f23 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -266,9 +266,7 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc, return; if (nbh == NULL) { /* blocksize == pagesize */ - xa_lock_irq(&btnc->i_pages); - __xa_erase(&btnc->i_pages, newkey); - xa_unlock_irq(&btnc->i_pages); + xa_erase_irq(&btnc->i_pages, newkey); unlock_page(ctxt->bh->b_page); } else brelse(nbh); -- cgit v1.2.3-59-g8ed1b From b469e7e47c8a075cc08bcd1e85d4365134bdcdd5 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 30 Oct 2018 20:29:53 +0200 Subject: fanotify: fix handling of events on child sub-directory When an event is reported on a sub-directory and the parent inode has a mark mask with FS_EVENT_ON_CHILD|FS_ISDIR, the event will be sent to fsnotify() even if the event type is not in the parent mark mask (e.g. FS_OPEN). Further more, if that event happened on a mount or a filesystem with a mount/sb mark that does have that event type in their mask, the "on child" event will be reported on the mount/sb mark. That is not desired, because user will get a duplicate event for the same action. Note that the event reported on the victim inode is never merged with the event reported on the parent inode, because of the check in should_merge(): old_fsn->inode == new_fsn->inode. Fix this by looking for a match of an actual event type (i.e. not just FS_ISDIR) in parent's inode mark mask and by not reporting an "on child" event to group if event type is only found on mount/sb marks. [backport hint: The bug seems to have always been in fanotify, but this patch will only apply cleanly to v4.19.y] Cc: # v4.19 Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 10 +++++----- fs/notify/fsnotify.c | 7 +++++-- 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 5769cf3ff035..e08a6647267b 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -115,12 +115,12 @@ static bool fanotify_should_send_event(struct fsnotify_iter_info *iter_info, continue; mark = iter_info->marks[type]; /* - * if the event is for a child and this inode doesn't care about - * events on the child, don't send it! + * If the event is for a child and this mark doesn't care about + * events on a child, don't send it! 
*/ - if (type == FSNOTIFY_OBJ_TYPE_INODE && - (event_mask & FS_EVENT_ON_CHILD) && - !(mark->mask & FS_EVENT_ON_CHILD)) + if (event_mask & FS_EVENT_ON_CHILD && + (type != FSNOTIFY_OBJ_TYPE_INODE || + !(mark->mask & FS_EVENT_ON_CHILD))) continue; marks_mask |= mark->mask; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 2172ba516c61..d2c34900ae05 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -167,9 +167,9 @@ int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask parent = dget_parent(dentry); p_inode = parent->d_inode; - if (unlikely(!fsnotify_inode_watches_children(p_inode))) + if (unlikely(!fsnotify_inode_watches_children(p_inode))) { __fsnotify_update_child_dentry_flags(p_inode); - else if (p_inode->i_fsnotify_mask & mask) { + } else if (p_inode->i_fsnotify_mask & mask & ALL_FSNOTIFY_EVENTS) { struct name_snapshot name; /* we are notifying a parent so come up with the new mask which @@ -339,6 +339,9 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, sb = mnt->mnt.mnt_sb; mnt_or_sb_mask = mnt->mnt_fsnotify_mask | sb->s_fsnotify_mask; } + /* An event "on child" is not intended for a mount/sb mark */ + if (mask & FS_EVENT_ON_CHILD) + mnt_or_sb_mask = 0; /* * Optimization: srcu_read_lock() has a memory barrier which can -- cgit v1.2.3-59-g8ed1b From 01310bb7c9c98752cc763b36532fab028e0f8f81 Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Thu, 8 Nov 2018 11:11:36 -0500 Subject: nfsd: COPY and CLONE operations require the saved filehandle to be set Make sure we have a saved filehandle, otherwise we'll oops with a null pointer dereference in nfs4_preprocess_stateid_op(). Signed-off-by: Scott Mayhew Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index edff074d38c7..d505990dac7c 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1038,6 +1038,9 @@ nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { __be32 status; + if (!cstate->save_fh.fh_dentry) + return nfserr_nofilehandle; + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh, src_stateid, RD_STATE, src, NULL); if (status) { -- cgit v1.2.3-59-g8ed1b From 10283ea525d30f2e99828978fd04d8427876a7ad Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 5 Nov 2018 22:57:24 +0000 Subject: gfs2: Put bitmap buffers in put_super gfs2_put_super calls gfs2_clear_rgrpd to destroy the gfs2_rgrpd objects attached to the resource group glocks. That function should release the buffers attached to the gfs2_bitmap objects (bi_bh), but the call to gfs2_rgrp_brelse for doing that is missing. When gfs2_releasepage later runs across these buffers which are still referenced, it refuses to free them. This causes the pages the buffers are attached to to remain referenced as well. With enough mount/unmount cycles, the system will eventually run out of memory. Fix this by adding the missing call to gfs2_rgrp_brelse in gfs2_clear_rgrpd. (Also fix a gfs2_rgrp_relse -> gfs2_rgrp_brelse typo in a comment.) 
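In code form, the missing release slots in right before the glock is dropped; a minimal sketch of the resulting teardown in gfs2_clear_rgrpd() (condensed from the patch below, not the whole function):

	if (gl) {
		glock_clear_object(gl, rgd);
		gfs2_rgrp_brelse(rgd);	/* release the bi_bh buffer heads */
		gfs2_glock_put(gl);
	}

With the buffers released here, gfs2_releasepage() no longer finds referenced buffer heads on those pages, so they can be freed after unmount.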
Fixes: 39b0f1e92908 ("GFS2: Don't brelse rgrp buffer_heads every allocation") Cc: stable@vger.kernel.org # v4.2+ Signed-off-by: Andreas Gruenbacher --- fs/gfs2/rgrp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index ffe3032b1043..b08a530433ad 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -733,6 +733,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) if (gl) { glock_clear_object(gl, rgd); + gfs2_rgrp_brelse(rgd); gfs2_glock_put(gl); } @@ -1174,7 +1175,7 @@ static u32 count_unlinked(struct gfs2_rgrpd *rgd) * @rgd: the struct gfs2_rgrpd describing the RG to read in * * Read in all of a Resource Group's header and bitmap blocks. - * Caller must eventually call gfs2_rgrp_relse() to free the bitmaps. + * Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps. * * Returns: errno */ -- cgit v1.2.3-59-g8ed1b From e7445ceddfc220c1aede6d42758a5acb8844e9c3 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 8 Nov 2018 20:14:29 +0000 Subject: gfs2: Fix metadata read-ahead during truncate (2) The previous attempt to fix for metadata read-ahead during truncate was incorrect: for files with a height > 2 (1006989312 bytes with a block size of 4096 bytes), read-ahead requests were not being issued for some of the indirect blocks discovered while walking the metadata tree, leading to significant slow-downs when deleting large files. Fix that. In addition, only issue read-ahead requests in the first pass through the meta-data tree, while deallocating data blocks. Fixes: c3ce5aa9b0 ("gfs2: Fix metadata read-ahead during truncate") Cc: stable@vger.kernel.org # v4.16+ Signed-off-by: Andreas Gruenbacher --- fs/gfs2/bmap.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 5f3ea07ef5e2..38d88fcb6988 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1908,10 +1908,16 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length) if (ret < 0) goto out; - /* issue read-ahead on metadata */ - if (mp.mp_aheight > 1) { - for (; ret > 1; ret--) { - metapointer_range(&mp, mp.mp_aheight - ret, + /* On the first pass, issue read-ahead on metadata. */ + if (mp.mp_aheight > 1 && strip_h == ip->i_height - 1) { + unsigned int height = mp.mp_aheight - 1; + + /* No read-ahead for data blocks. */ + if (mp.mp_aheight - 1 == strip_h) + height--; + + for (; height >= mp.mp_aheight - ret; height--) { + metapointer_range(&mp, height, start_list, start_aligned, end_list, end_aligned, &start, &end); -- cgit v1.2.3-59-g8ed1b From 7fabaf303458fcabb694999d6fa772cc13d4e217 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 9 Nov 2018 15:52:16 +0100 Subject: fuse: fix leaked notify reply fuse_request_send_notify_reply() may fail if the connection was reset for some reason (e.g. fs was unmounted). Don't leak request reference in this case. Besides leaking memory, this resulted in fc->num_waiting not being decremented and hence fuse_wait_aborted() left in a hanging and unkillable state. 
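The fix is the usual "drop the reference on the failed-send path" pattern; condensed from the patch below (a sketch, not the complete fuse_retrieve()):

	err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique);
	if (err) {
		fuse_retrieve_end(fc, req);
		fuse_put_request(fc, req);	/* drop the otherwise-leaked request reference */
	}
	return err;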
Fixes: 2d45ba381a74 ("fuse: add retrieve request") Fixes: b8f95e5d13f5 ("fuse: umount should wait for all requests") Reported-and-tested-by: syzbot+6339eda9cb4ebbc4c37b@syzkaller.appspotmail.com Signed-off-by: Miklos Szeredi Cc: #v2.6.36 --- fs/fuse/dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ae813e609932..6fe330cc9709 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1768,8 +1768,10 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, req->in.args[1].size = total_len; err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique); - if (err) + if (err) { fuse_retrieve_end(fc, req); + fuse_put_request(fc, req); + } return err; } -- cgit v1.2.3-59-g8ed1b From 2d84a2d19b6150c6dbac1e6ebad9c82e4c123772 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 9 Nov 2018 15:52:16 +0100 Subject: fuse: fix possibly missed wake-up after abort In current fuse_drop_waiting() implementation it's possible that fuse_wait_aborted() will not be woken up in the unlikely case that fuse_abort_conn() + fuse_wait_aborted() runs in between checking fc->connected and calling atomic_dec(&fc->num_waiting). Do the atomic_dec_and_test() unconditionally, which also provides the necessary barrier against reordering with the fc->connected check. The explicit smp_mb() in fuse_wait_aborted() is not actually needed, since the spin_unlock() in fuse_abort_conn() provides the necessary RELEASE barrier after resetting fc->connected. However, this is not a performance sensitive path, and adding the explicit barrier makes it easier to document. Signed-off-by: Miklos Szeredi Fixes: b8f95e5d13f5 ("fuse: umount should wait for all requests") Cc: #v4.19 --- fs/fuse/dev.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6fe330cc9709..a5e516a40e7a 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -165,9 +165,13 @@ static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background) static void fuse_drop_waiting(struct fuse_conn *fc) { - if (fc->connected) { - atomic_dec(&fc->num_waiting); - } else if (atomic_dec_and_test(&fc->num_waiting)) { + /* + * lockess check of fc->connected is okay, because atomic_dec_and_test() + * provides a memory barrier mached with the one in fuse_wait_aborted() + * to ensure no wake-up is missed. + */ + if (atomic_dec_and_test(&fc->num_waiting) && + !READ_ONCE(fc->connected)) { /* wake up aborters */ wake_up_all(&fc->blocked_waitq); } @@ -2221,6 +2225,8 @@ EXPORT_SYMBOL_GPL(fuse_abort_conn); void fuse_wait_aborted(struct fuse_conn *fc) { + /* matches implicit memory barrier in fuse_drop_waiting() */ + smp_mb(); wait_event(fc->blocked_waitq, atomic_read(&fc->num_waiting) == 0); } -- cgit v1.2.3-59-g8ed1b From ebacb81273599555a7a19f7754a1451206a5fc4f Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 9 Nov 2018 14:51:46 +0100 Subject: fuse: fix use-after-free in fuse_direct_IO() In async IO blocking case the additional reference to the io is taken for it to survive fuse_aio_complete(). In non blocking case this additional reference is not needed, however we still reference io to figure out whether to wait for completion or not. This is wrong and will lead to use-after-free. Fix it by storing blocking information in separate variable. This was spotted by KASAN when running generic/208 fstest. 
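Condensed from the patch below, the fix snapshots the flag before the call that may drop the last reference to io (a sketch of the relevant branch only):

	if (io->async) {
		bool blocking = io->blocking;	/* read before io can be freed */

		fuse_aio_complete(io, ret < 0 ? ret : 0, -1);	/* may release io */

		/* non-extending async request: io must not be touched again */
		if (!blocking)
			return -EIOCBQUEUED;
		wait_for_completion(&wait);
	}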
Signed-off-by: Lukas Czerner Reported-by: Zorro Lang Signed-off-by: Miklos Szeredi Fixes: 744742d692e3 ("fuse: Add reference counting for fuse_io_priv") Cc: # v4.6 --- fs/fuse/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index cc2121b37bf5..b52f9baaa3e7 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2924,10 +2924,12 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } if (io->async) { + bool blocking = io->blocking; + fuse_aio_complete(io, ret < 0 ? ret : 0, -1); /* we have a non-extending, async request, so return */ - if (!io->blocking) + if (!blocking) return -EIOCBQUEUED; wait_for_completion(&wait); -- cgit v1.2.3-59-g8ed1b From c4b7d1ba7d263b74bb72e9325262a67139605cde Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 10 Nov 2018 04:13:24 +0000 Subject: sysv: return 'err' instead of 0 in __sysv_write_inode Fixes gcc '-Wunused-but-set-variable' warning: fs/sysv/inode.c: In function '__sysv_write_inode': fs/sysv/inode.c:239:6: warning: variable 'err' set but not used [-Wunused-but-set-variable] __sysv_write_inode should return 'err' instead of 0 Fixes: 05459ca81ac3 ("repair sysv_write_inode(), switch sysv to simple_fsync()") Signed-off-by: YueHaibing Signed-off-by: Al Viro --- fs/sysv/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 499a20a5a010..273736f41be3 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -275,7 +275,7 @@ static int __sysv_write_inode(struct inode *inode, int wait) } } brelse(bh); - return 0; + return err; } int sysv_write_inode(struct inode *inode, struct writeback_control *wbc) -- cgit v1.2.3-59-g8ed1b From 1e9c75fb9c47a75a9aec0cd17db5f6dc36b58e00 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 3 Oct 2018 10:18:33 -0400 Subject: mnt: fix __detach_mounts infinite loop Since commit ff17fa561a04 ("d_invalidate(): unhash immediately") immediately unhashes the dentry, we'll never return the mountpoint in lookup_mountpoint(), which can lead to an unbreakable loop in d_invalidate(). I have reports of NFS clients getting into this condition after the server removes an export of an existing mount created through follow_automount(), but I suspect there are various other ways to produce this problem if we hunt down users of d_invalidate(). For example, it is possible to get into this state by using XFS' d_invalidate() call in xfs_vn_unlink(): truncate -s 100m img{1,2} mkfs.xfs -q -n version=ci img1 mkfs.xfs -q -n version=ci img2 mkdir -p /mnt/xfs mount img1 /mnt/xfs mkdir /mnt/xfs/sub1 mount img2 /mnt/xfs/sub1 cat > /mnt/xfs/sub1/foo & umount -l /mnt/xfs/sub1 mount img2 /mnt/xfs/sub1 mount --make-private /mnt/xfs mkdir /mnt/xfs/sub2 mount --move /mnt/xfs/sub1 /mnt/xfs/sub2 rmdir /mnt/xfs/sub1 Fix this by moving the check for an unlinked dentry out of the detach_mounts() path. Fixes: ff17fa561a04 ("d_invalidate(): unhash immediately") Cc: stable@vger.kernel.org Reviewed-by: "Eric W. Biederman" Signed-off-by: Benjamin Coddington Signed-off-by: Eric W. 
Biederman --- fs/namespace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 74f64294a410..a7f91265ea67 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -695,9 +695,6 @@ static struct mountpoint *lookup_mountpoint(struct dentry *dentry) hlist_for_each_entry(mp, chain, m_hash) { if (mp->m_dentry == dentry) { - /* might be worth a WARN_ON() */ - if (d_unlinked(dentry)) - return ERR_PTR(-ENOENT); mp->m_count++; return mp; } @@ -711,6 +708,9 @@ static struct mountpoint *get_mountpoint(struct dentry *dentry) int ret; if (d_mountpoint(dentry)) { + /* might be worth a WARN_ON() */ + if (d_unlinked(dentry)) + return ERR_PTR(-ENOENT); mountpoint: read_seqlock_excl(&mount_lock); mp = lookup_mountpoint(dentry); -- cgit v1.2.3-59-g8ed1b From 21a446cf186570168b7281b154b1993968598aca Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 5 Nov 2018 11:10:50 -0500 Subject: NFSv4: Don't exit the state manager without clearing NFS4CLNT_MANAGER_RUNNING If we exit the NFSv4 state manager due to a umount, then we can end up leaving the NFS4CLNT_MANAGER_RUNNING flag set. If another mount causes the nfs4_client to be rereferenced before it is destroyed, then we end up never being able to recover state. Fixes: 47c2199b6eb5 ("NFSv4.1: Ensure state manager thread dies on last ...") Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org # v4.15+ --- fs/nfs/nfs4state.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 62ae0fd345ad..98d1b6a6646a 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2601,11 +2601,12 @@ static void nfs4_state_manager(struct nfs_client *clp) nfs4_clear_state_manager_bit(clp); /* Did we race with an attempt to give us more work? */ if (clp->cl_state == 0) - break; + return; if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) - break; + return; } while (refcount_read(&clp->cl_count) > 1); - return; + goto out_drain; + out_error: if (strlen(section)) section_sep = ": "; @@ -2613,6 +2614,7 @@ out_error: " with error %d\n", section_sep, section, clp->cl_hostname, -status); ssleep(1); +out_drain: nfs4_end_drain_session(clp); nfs4_clear_state_manager_bit(clp); } -- cgit v1.2.3-59-g8ed1b From a1aa09be21fa344d1f5585aab8164bfae55f57e3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 5 Nov 2018 12:17:01 -0500 Subject: NFSv4: Ensure that the state manager exits the loop on SIGKILL Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 98d1b6a6646a..ffea57885394 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2604,7 +2604,7 @@ static void nfs4_state_manager(struct nfs_client *clp) return; if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) return; - } while (refcount_read(&clp->cl_count) > 1); + } while (refcount_read(&clp->cl_count) > 1 && !signalled()); goto out_drain; out_error: -- cgit v1.2.3-59-g8ed1b From f8397d69daef06d358430d3054662fb597e37c00 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 6 Nov 2018 16:40:20 +0200 Subject: btrfs: Always try all copies when reading extent buffers When a metadata read is served the endio routine btree_readpage_end_io_hook is called which eventually runs the tree-checker. If tree-checker fails to validate the read eb then it sets EXTENT_BUFFER_CORRUPT flag. 
This leads to btree_read_extent_buffer_pages wrongly assuming that all available copies of this extent buffer are wrong and failing prematurely. Fix this modify btree_read_extent_buffer_pages to read all copies of the data. This failure was exhibitted in xfstests btrfs/124 which would spuriously fail its balance operations. The reason was that when balance was run following re-introduction of the missing raid1 disk __btrfs_map_block would map the read request to stripe 0, which corresponded to devid 2 (the disk which is being removed in the test): item 2 key (FIRST_CHUNK_TREE CHUNK_ITEM 3553624064) itemoff 15975 itemsize 112 length 1073741824 owner 2 stripe_len 65536 type DATA|RAID1 io_align 65536 io_width 65536 sector_size 4096 num_stripes 2 sub_stripes 1 stripe 0 devid 2 offset 2156920832 dev_uuid 8466c350-ed0c-4c3b-b17d-6379b445d5c8 stripe 1 devid 1 offset 3553624064 dev_uuid 1265d8db-5596-477e-af03-df08eb38d2ca This caused read requests for a checksum item that to be routed to the stale disk which triggered the aforementioned logic involving EXTENT_BUFFER_CORRUPT flag. This then triggered cascading failures of the balance operation. Fixes: a826d6dcb32d ("Btrfs: check items for correctness as we search") CC: stable@vger.kernel.org # 4.4+ Suggested-by: Qu Wenruo Reviewed-by: Qu Wenruo Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3f0b6d1936e8..6d776717d8b3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -477,9 +477,9 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, int mirror_num = 0; int failed_mirror = 0; - clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; while (1) { + clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE, mirror_num); if (!ret) { @@ -493,15 +493,6 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, break; } - /* - * This buffer's crc is fine, but its contents are corrupted, so - * there is no reason to read the other copies, they won't be - * any less wrong. - */ - if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags) || - ret == -EUCLEAN) - break; - num_copies = btrfs_num_copies(fs_info, eb->start, eb->len); if (num_copies == 1) -- cgit v1.2.3-59-g8ed1b From aab15e8ec25765cf7968c72cbec7583acf99d8a4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 12 Nov 2018 10:23:58 +0000 Subject: Btrfs: fix rare chances for data loss when doing a fast fsync After the simplification of the fast fsync patch done recently by commit b5e6c3e170b7 ("btrfs: always wait on ordered extents at fsync time") and commit e7175a692765 ("btrfs: remove the wait ordered logic in the log_one_extent path"), we got a very short time window where we can get extents logged without writeback completing first or extents logged without logging the respective data checksums. Both issues can only happen when doing a non-full (fast) fsync. As soon as we enter btrfs_sync_file() we trigger writeback, then lock the inode and then wait for the writeback to complete before starting to log the inode. However before we acquire the inode's lock and after we started writeback, it's possible that more writes happened and dirtied more pages. 
If that happened and those pages get writeback triggered while we are logging the inode (for example, the VM subsystem triggering it due to memory pressure, or another concurrent fsync), we end up seeing the respective extent maps in the inode's list of modified extents and will log matching file extent items without waiting for the respective ordered extents to complete, meaning that either of the following will happen: 1) We log an extent after its writeback finishes but before its checksums are added to the csum tree, leading to -EIO errors when attempting to read the extent after a log replay. 2) We log an extent before its writeback finishes. Therefore after the log replay we will have a file extent item pointing to an unwritten extent (and without the respective data checksums as well). This could not happen before the fast fsync patch simplification, because for any extent we found in the list of modified extents, we would wait for its respective ordered extent to finish writeback or collect its checksums for logging if it did not complete yet. Fix this by triggering writeback again after acquiring the inode's lock and before waiting for ordered extents to complete. Fixes: e7175a692765 ("btrfs: remove the wait ordered logic in the log_one_extent path") Fixes: b5e6c3e170b7 ("btrfs: always wait on ordered extents at fsync time") CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 97c7a086f7bd..b92b7f05c3d5 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2088,6 +2088,30 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); + /* + * Before we acquired the inode's lock, someone may have dirtied more + * pages in the target range. We need to make sure that writeback for + * any such pages does not start while we are logging the inode, because + * if it does, any of the following might happen when we are not doing a + * full inode sync: + * + * 1) We log an extent after its writeback finishes but before its + * checksums are added to the csum tree, leading to -EIO errors + * when attempting to read the extent after a log replay. + * + * 2) We can end up logging an extent before its writeback finishes. + * Therefore after the log replay we will have a file extent item + * pointing to an unwritten extent (and no data checksums as well). + * + * So trigger writeback for any eventual new dirty pages and then we + * wait for all ordered extents to complete below. + */ + ret = start_ordered_ops(inode, start, end); + if (ret) { + inode_unlock(inode); + goto out; + } + /* * We have to do this here to avoid the priority inversion of waiting on * IO of a lower priority task while holding a transaciton open. -- cgit v1.2.3-59-g8ed1b From e39d8a186ed002854196668cb7562ffdfbc6d379 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 13 Nov 2018 16:37:54 -0500 Subject: NFSv4: Fix an Oops during delegation callbacks If the server sends a CB_GETATTR or a CB_RECALL while the filesystem is being unmounted, then we can Oops when releasing the inode in nfs4_callback_getattr() and nfs4_callback_recall(). 
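Condensed from the patch below, the delegation lookup now only hands the inode out if the superblock can also be pinned, so a concurrent unmount cannot tear it down under the callback (a sketch of the core of nfs_delegation_find_inode_server()):

	freeme = igrab(delegation->inode);
	if (freeme && nfs_sb_active(freeme->i_sb))
		res = freeme;	/* callers release it with nfs_iput_and_deactive() */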
Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 4 ++-- fs/nfs/delegation.c | 11 +++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index fa515d5ea5ba..7b861bbc0b43 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -66,7 +66,7 @@ __be32 nfs4_callback_getattr(void *argp, void *resp, out_iput: rcu_read_unlock(); trace_nfs4_cb_getattr(cps->clp, &args->fh, inode, -ntohl(res->status)); - iput(inode); + nfs_iput_and_deactive(inode); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status)); return res->status; @@ -108,7 +108,7 @@ __be32 nfs4_callback_recall(void *argp, void *resp, } trace_nfs4_cb_recall(cps->clp, &args->fh, inode, &args->stateid, -ntohl(res)); - iput(inode); + nfs_iput_and_deactive(inode); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); return res; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 07b839560576..6ec2f78c1e19 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -850,16 +850,23 @@ nfs_delegation_find_inode_server(struct nfs_server *server, const struct nfs_fh *fhandle) { struct nfs_delegation *delegation; - struct inode *res = NULL; + struct inode *freeme, *res = NULL; list_for_each_entry_rcu(delegation, &server->delegations, super_list) { spin_lock(&delegation->lock); if (delegation->inode != NULL && nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { - res = igrab(delegation->inode); + freeme = igrab(delegation->inode); + if (freeme && nfs_sb_active(freeme->i_sb)) + res = freeme; spin_unlock(&delegation->lock); if (res != NULL) return res; + if (freeme) { + rcu_read_unlock(); + iput(freeme); + rcu_read_lock(); + } return ERR_PTR(-EAGAIN); } spin_unlock(&delegation->lock); -- cgit v1.2.3-59-g8ed1b From f505754fd6599230371cb01b9332754ddc104be1 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 14 Nov 2018 11:35:24 +0000 Subject: Btrfs: ensure path name is null terminated at btrfs_control_ioctl We were using the path name received from user space without checking that it is null terminated. While btrfs-progs is well behaved and does proper validation and null termination, someone could call the ioctl and pass a non-null terminated patch, leading to buffer overrun problems in the kernel. The ioctl is protected by CAP_SYS_ADMIN. So just set the last byte of the path to a null character, similar to what we do in other ioctls (add/remove/resize device, snapshot creation, etc). 
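Condensed from the patch below, this is the standard treatment for a fixed-size string copied from user space: force a terminating NUL into the last byte of the buffer before using it as a C string (sketch only):

	vol = memdup_user((void __user *)arg, sizeof(*vol));
	if (IS_ERR(vol))
		return PTR_ERR(vol);
	vol->name[BTRFS_PATH_NAME_MAX] = '\0';	/* last byte of the name buffer */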
CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index cbc9d0d2c12d..645fc81e2a94 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2237,6 +2237,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, vol = memdup_user((void __user *)arg, sizeof(*vol)); if (IS_ERR(vol)) return PTR_ERR(vol); + vol->name[BTRFS_PATH_NAME_MAX] = '\0'; switch (cmd) { case BTRFS_IOC_SCAN_DEV: -- cgit v1.2.3-59-g8ed1b From 7150ceaacb27f7b3bf494e72cd4be4e11612dfff Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 12 Nov 2018 22:33:22 +0000 Subject: rxrpc: Fix life check The life-checking function, which is used by kAFS to make sure that a call is still live in the event of a pending signal, only samples the received packet serial number counter; it doesn't actually provoke a change in the counter, rather relying on the server to happen to give us a packet in the time window. Fix this by adding a function to force a ping to be transmitted. kAFS then keeps track of whether there's been a stall, and if so, uses the new function to ping the server, resetting the timeout to allow the reply to come back. If there's a stall, a ping and the call is *still* stalled in the same place after another period, then the call will be aborted. Fixes: bc5e3a546d55 ("rxrpc: Use MSG_WAITALL to tell sendmsg() to temporarily ignore signals") Fixes: f4d15fb6f99a ("rxrpc: Provide functions for allowing cleaner handling of signals") Signed-off-by: David Howells Signed-off-by: David S. Miller --- Documentation/networking/rxrpc.txt | 17 +++++++++++------ fs/afs/rxrpc.c | 11 ++++++++++- include/net/af_rxrpc.h | 3 ++- include/trace/events/rxrpc.h | 2 ++ net/rxrpc/af_rxrpc.c | 27 +++++++++++++++++++++++---- 5 files changed, 48 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt index 605e00cdd6be..89f1302d593a 100644 --- a/Documentation/networking/rxrpc.txt +++ b/Documentation/networking/rxrpc.txt @@ -1056,18 +1056,23 @@ The kernel interface functions are as follows: u32 rxrpc_kernel_check_life(struct socket *sock, struct rxrpc_call *call); + void rxrpc_kernel_probe_life(struct socket *sock, + struct rxrpc_call *call); - This returns a number that is updated when ACKs are received from the peer - (notably including PING RESPONSE ACKs which we can elicit by sending PING - ACKs to see if the call still exists on the server). The caller should - compare the numbers of two calls to see if the call is still alive after - waiting for a suitable interval. + The first function returns a number that is updated when ACKs are received + from the peer (notably including PING RESPONSE ACKs which we can elicit by + sending PING ACKs to see if the call still exists on the server). The + caller should compare the numbers of two calls to see if the call is still + alive after waiting for a suitable interval. This allows the caller to work out if the server is still contactable and if the call is still alive on the server whilst waiting for the server to process a client operation. - This function may transmit a PING ACK. + The second function causes a ping ACK to be transmitted to try to provoke + the peer into responding, which would then cause the value returned by the + first function to change. 
Note that this must be called in TASK_RUNNING + state. (*) Get reply timestamp. diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 59970886690f..a7b44863d502 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -576,6 +576,7 @@ static long afs_wait_for_call_to_complete(struct afs_call *call, { signed long rtt2, timeout; long ret; + bool stalled = false; u64 rtt; u32 life, last_life; @@ -609,12 +610,20 @@ static long afs_wait_for_call_to_complete(struct afs_call *call, life = rxrpc_kernel_check_life(call->net->socket, call->rxcall); if (timeout == 0 && - life == last_life && signal_pending(current)) + life == last_life && signal_pending(current)) { + if (stalled) break; + __set_current_state(TASK_RUNNING); + rxrpc_kernel_probe_life(call->net->socket, call->rxcall); + timeout = rtt2; + stalled = true; + continue; + } if (life != last_life) { timeout = rtt2; last_life = life; + stalled = false; } timeout = schedule_timeout(timeout); diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index de587948042a..1adefe42c0a6 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -77,7 +77,8 @@ int rxrpc_kernel_retry_call(struct socket *, struct rxrpc_call *, struct sockaddr_rxrpc *, struct key *); int rxrpc_kernel_check_call(struct socket *, struct rxrpc_call *, enum rxrpc_call_completion *, u32 *); -u32 rxrpc_kernel_check_life(struct socket *, struct rxrpc_call *); +u32 rxrpc_kernel_check_life(const struct socket *, const struct rxrpc_call *); +void rxrpc_kernel_probe_life(struct socket *, struct rxrpc_call *); u32 rxrpc_kernel_get_epoch(struct socket *, struct rxrpc_call *); bool rxrpc_kernel_get_reply_time(struct socket *, struct rxrpc_call *, ktime_t *); diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 573d5b901fb1..5b50fe4906d2 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -181,6 +181,7 @@ enum rxrpc_timer_trace { enum rxrpc_propose_ack_trace { rxrpc_propose_ack_client_tx_end, rxrpc_propose_ack_input_data, + rxrpc_propose_ack_ping_for_check_life, rxrpc_propose_ack_ping_for_keepalive, rxrpc_propose_ack_ping_for_lost_ack, rxrpc_propose_ack_ping_for_lost_reply, @@ -380,6 +381,7 @@ enum rxrpc_tx_point { #define rxrpc_propose_ack_traces \ EM(rxrpc_propose_ack_client_tx_end, "ClTxEnd") \ EM(rxrpc_propose_ack_input_data, "DataIn ") \ + EM(rxrpc_propose_ack_ping_for_check_life, "ChkLife") \ EM(rxrpc_propose_ack_ping_for_keepalive, "KeepAlv") \ EM(rxrpc_propose_ack_ping_for_lost_ack, "LostAck") \ EM(rxrpc_propose_ack_ping_for_lost_reply, "LostRpl") \ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 64362d078da8..a2522f9d71e2 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -375,16 +375,35 @@ EXPORT_SYMBOL(rxrpc_kernel_end_call); * getting ACKs from the server. Returns a number representing the life state * which can be compared to that returned by a previous call. * - * If this is a client call, ping ACKs will be sent to the server to find out - * whether it's still responsive and whether the call is still alive on the - * server. + * If the life state stalls, rxrpc_kernel_probe_life() should be called and + * then 2RTT waited. 
*/ -u32 rxrpc_kernel_check_life(struct socket *sock, struct rxrpc_call *call) +u32 rxrpc_kernel_check_life(const struct socket *sock, + const struct rxrpc_call *call) { return call->acks_latest; } EXPORT_SYMBOL(rxrpc_kernel_check_life); +/** + * rxrpc_kernel_probe_life - Poke the peer to see if it's still alive + * @sock: The socket the call is on + * @call: The call to check + * + * In conjunction with rxrpc_kernel_check_life(), allow a kernel service to + * find out whether a call is still alive by pinging it. This should cause the + * life state to be bumped in about 2*RTT. + * + * The must be called in TASK_RUNNING state on pain of might_sleep() objecting. + */ +void rxrpc_kernel_probe_life(struct socket *sock, struct rxrpc_call *call) +{ + rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, false, + rxrpc_propose_ack_ping_for_check_life); + rxrpc_send_ack_packet(call, true, NULL); +} +EXPORT_SYMBOL(rxrpc_kernel_probe_life); + /** * rxrpc_kernel_get_epoch - Retrieve the epoch value from a call. * @sock: The socket the call is on -- cgit v1.2.3-59-g8ed1b From c26b5aa8ef0d46035060fded475e6ab957b9f69f Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sun, 11 Nov 2018 11:15:21 +0000 Subject: gfs2: Fix iomap buffer head reference counting bug GFS2 passes the inode buffer head (dibh) from gfs2_iomap_begin to gfs2_iomap_end in iomap->private. It sets that private pointer in gfs2_iomap_get. Users of gfs2_iomap_get other than gfs2_iomap_begin would have to release iomap->private, but this isn't done correctly, leading to a leak of buffer head references. To fix this, move the code for setting iomap->private from gfs2_iomap_get to gfs2_iomap_begin. Fixes: 64bc06bb32 ("gfs2: iomap buffered write support") Cc: stable@vger.kernel.org # v4.19+ Signed-off-by: Andreas Gruenbacher Signed-off-by: Linus Torvalds --- fs/gfs2/bmap.c | 40 +++++++++++++++++----------------------- 1 file changed, 17 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 38d88fcb6988..0d643306c255 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -826,7 +826,7 @@ static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, ret = gfs2_meta_inode_buffer(ip, &dibh); if (ret) goto unlock; - iomap->private = dibh; + mp->mp_bh[0] = dibh; if (gfs2_is_stuffed(ip)) { if (flags & IOMAP_WRITE) { @@ -863,9 +863,6 @@ unstuff: len = lblock_stop - lblock + 1; iomap->length = len << inode->i_blkbits; - get_bh(dibh); - mp->mp_bh[0] = dibh; - height = ip->i_height; while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height]) height++; @@ -898,8 +895,6 @@ out: iomap->bdev = inode->i_sb->s_bdev; unlock: up_read(&ip->i_rw_mutex); - if (ret && dibh) - brelse(dibh); return ret; do_alloc: @@ -980,9 +975,9 @@ static void gfs2_iomap_journaled_page_done(struct inode *inode, loff_t pos, static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, loff_t length, unsigned flags, - struct iomap *iomap) + struct iomap *iomap, + struct metapath *mp) { - struct metapath mp = { .mp_aheight = 1, }; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); unsigned int data_blocks = 0, ind_blocks = 0, rblocks; @@ -996,9 +991,9 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, unstuff = gfs2_is_stuffed(ip) && pos + length > gfs2_max_stuffed_size(ip); - ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp); + ret = gfs2_iomap_get(inode, pos, length, flags, iomap, mp); if (ret) - goto out_release; + goto out_unlock; alloc_required = 
unstuff || iomap->type == IOMAP_HOLE; @@ -1013,7 +1008,7 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, ret = gfs2_quota_lock_check(ip, &ap); if (ret) - goto out_release; + goto out_unlock; ret = gfs2_inplace_reserve(ip, &ap); if (ret) @@ -1038,17 +1033,15 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, ret = gfs2_unstuff_dinode(ip, NULL); if (ret) goto out_trans_end; - release_metapath(&mp); - brelse(iomap->private); - iomap->private = NULL; + release_metapath(mp); ret = gfs2_iomap_get(inode, iomap->offset, iomap->length, - flags, iomap, &mp); + flags, iomap, mp); if (ret) goto out_trans_end; } if (iomap->type == IOMAP_HOLE) { - ret = gfs2_iomap_alloc(inode, iomap, flags, &mp); + ret = gfs2_iomap_alloc(inode, iomap, flags, mp); if (ret) { gfs2_trans_end(sdp); gfs2_inplace_release(ip); @@ -1056,7 +1049,6 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, goto out_qunlock; } } - release_metapath(&mp); if (gfs2_is_jdata(ip)) iomap->page_done = gfs2_iomap_journaled_page_done; return 0; @@ -1069,10 +1061,7 @@ out_trans_fail: out_qunlock: if (alloc_required) gfs2_quota_unlock(ip); -out_release: - if (iomap->private) - brelse(iomap->private); - release_metapath(&mp); +out_unlock: gfs2_write_unlock(inode); return ret; } @@ -1088,10 +1077,10 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, trace_gfs2_iomap_start(ip, pos, length, flags); if ((flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT)) { - ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap); + ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp); } else { ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp); - release_metapath(&mp); + /* * Silently fall back to buffered I/O for stuffed files or if * we've hot a hole (see gfs2_file_direct_write). @@ -1100,6 +1089,11 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, iomap->type != IOMAP_MAPPED) ret = -ENOTBLK; } + if (!ret) { + get_bh(mp.mp_bh[0]); + iomap->private = mp.mp_bh[0]; + } + release_metapath(&mp); trace_gfs2_iomap_end(ip, iomap, ret); return ret; } -- cgit v1.2.3-59-g8ed1b From 6d7cd8c1373746a93dc868ee9d38a82df78b38aa Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 6 Nov 2018 13:11:57 -0500 Subject: dax: Remove optimisation from dax_lock_mapping_entry Skipping some of the revalidation after we sleep can lead to returning a mapping which has already been freed. Just drop this optimisation. Reported-by: Dan Williams Fixes: 9f32d221301c ("dax: Convert dax_lock_mapping_entry to XArray") Signed-off-by: Matthew Wilcox --- fs/dax.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 616e36ea6aaa..529ac9d7c10a 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -383,11 +383,8 @@ bool dax_lock_mapping_entry(struct page *page) entry = xas_load(&xas); if (dax_is_locked(entry)) { entry = get_unlocked_entry(&xas); - /* Did the page move while we slept? */ - if (dax_to_pfn(entry) != page_to_pfn(page)) { - xas_unlock_irq(&xas); - continue; - } + xas_unlock_irq(&xas); + continue; } dax_lock_entry(&xas, entry); xas_unlock_irq(&xas); -- cgit v1.2.3-59-g8ed1b From 7ae2ea7dc45e8250a74cfaaecdce578427669ae5 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 9 Nov 2018 20:09:37 -0500 Subject: dax: Make sure the unlocking entry isn't locked I wrote the semantics in the commit message, but didn't document it in the source code. 
Use a BUG_ON instead (if any code does do this, it's really buggy; we can't recover and it's worth taking the machine down). Signed-off-by: Matthew Wilcox --- fs/dax.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 529ac9d7c10a..7944417f5a71 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -255,6 +255,7 @@ static void dax_unlock_entry(struct xa_state *xas, void *entry) { void *old; + BUG_ON(dax_is_locked(entry)); xas_reset(xas); xas_lock_irq(xas); old = xas_store(xas, entry); -- cgit v1.2.3-59-g8ed1b From c5bbd4515a05f8acb7e6ab6297044a529762cbf5 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 16 Nov 2018 14:37:06 -0500 Subject: dax: Reinstate RCU protection of inode For the device-dax case, it is possible that the inode can go away underneath us. The rcu_read_lock() was there to prevent it from being freed, and not (as I thought) to protect the tree. Bring back the rcu_read_lock() protection. Also add a little kernel-doc; while this function is not exported to modules, it is used from outside dax.c Reported-by: Dan Williams Fixes: 9f32d221301c ("dax: Convert dax_lock_mapping_entry to XArray") Signed-off-by: Matthew Wilcox --- fs/dax.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 7944417f5a71..ce87d21b3805 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -353,16 +353,27 @@ static struct page *dax_busy_page(void *entry) return NULL; } +/* + * dax_lock_mapping_entry - Lock the DAX entry corresponding to a page + * @page: The page whose entry we want to lock + * + * Context: Process context. + * Return: %true if the entry was locked or does not need to be locked. + */ bool dax_lock_mapping_entry(struct page *page) { XA_STATE(xas, NULL, 0); void *entry; + bool locked; + /* Ensure page->mapping isn't freed while we look at it */ + rcu_read_lock(); for (;;) { struct address_space *mapping = READ_ONCE(page->mapping); + locked = false; if (!dax_mapping(mapping)) - return false; + break; /* * In the device-dax case there's no need to lock, a @@ -371,8 +382,9 @@ bool dax_lock_mapping_entry(struct page *page) * otherwise we would not have a valid pfn_to_page() * translation. */ + locked = true; if (S_ISCHR(mapping->host->i_mode)) - return true; + break; xas.xa = &mapping->i_pages; xas_lock_irq(&xas); @@ -383,14 +395,18 @@ bool dax_lock_mapping_entry(struct page *page) xas_set(&xas, page->index); entry = xas_load(&xas); if (dax_is_locked(entry)) { + rcu_read_unlock(); entry = get_unlocked_entry(&xas); xas_unlock_irq(&xas); + rcu_read_lock(); continue; } dax_lock_entry(&xas, entry); xas_unlock_irq(&xas); - return true; + break; } + rcu_read_unlock(); + return locked; } void dax_unlock_mapping_entry(struct page *page) -- cgit v1.2.3-59-g8ed1b From 53fffe29a9e664a999dd3787e4428da8c30533e0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 17 Nov 2018 07:43:42 -0700 Subject: aio: fix failure to put the file pointer If the ioprio capability check fails, we return without putting the file pointer. 
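Condensed from the patch below, the early-return path now drops the file reference held by the request (a sketch of the error path in aio_prep_rw() only):

	ret = ioprio_check_cap(iocb->aio_reqprio);
	if (ret) {
		pr_debug("aio ioprio check cap error: %d\n", ret);
		fput(req->ki_filp);	/* drop the file reference held by the request */
		return ret;
	}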
Fixes: d9a08a9e616b ("fs: Add aio iopriority support") Signed-off-by: Jens Axboe Signed-off-by: Al Viro --- fs/aio.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 301e6314183b..97f983592925 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1436,6 +1436,7 @@ static int aio_prep_rw(struct kiocb *req, struct iocb *iocb) ret = ioprio_check_cap(iocb->aio_reqprio); if (ret) { pr_debug("aio ioprio check cap error: %d\n", ret); + fput(req->ki_filp); return ret; } -- cgit v1.2.3-59-g8ed1b From fda490d39fc0668d92e170d95c11e35a010019aa Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 16 Nov 2018 15:07:31 -0500 Subject: dax: Fix dax_unlock_mapping_entry for PMD pages Device DAX PMD pages do not set the PageHead bit for compound pages. Fix for now by retrieving the PMD bit from the entry, but eventually we will be passed the page size by the caller. Reported-by: Dan Williams Fixes: 9f32d221301c ("dax: Convert dax_lock_mapping_entry to XArray") Signed-off-by: Matthew Wilcox --- fs/dax.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index ce87d21b3805..5426252375f6 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -98,12 +98,6 @@ static void *dax_make_entry(pfn_t pfn, unsigned long flags) return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT)); } -static void *dax_make_page_entry(struct page *page) -{ - pfn_t pfn = page_to_pfn_t(page); - return dax_make_entry(pfn, PageHead(page) ? DAX_PMD : 0); -} - static bool dax_is_locked(void *entry) { return xa_to_value(entry) & DAX_LOCKED; @@ -116,12 +110,12 @@ static unsigned int dax_entry_order(void *entry) return 0; } -static int dax_is_pmd_entry(void *entry) +static unsigned long dax_is_pmd_entry(void *entry) { return xa_to_value(entry) & DAX_PMD; } -static int dax_is_pte_entry(void *entry) +static bool dax_is_pte_entry(void *entry) { return !(xa_to_value(entry) & DAX_PMD); } @@ -413,11 +407,16 @@ void dax_unlock_mapping_entry(struct page *page) { struct address_space *mapping = page->mapping; XA_STATE(xas, &mapping->i_pages, page->index); + void *entry; if (S_ISCHR(mapping->host->i_mode)) return; - dax_unlock_entry(&xas, dax_make_page_entry(page)); + rcu_read_lock(); + entry = xas_load(&xas); + rcu_read_unlock(); + entry = dax_make_entry(page_to_pfn_t(page), dax_is_pmd_entry(entry)); + dax_unlock_entry(&xas, entry); } /* -- cgit v1.2.3-59-g8ed1b From 0e40de0338d005f73d46898a21544cd26f01b4ce Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 16 Nov 2018 15:19:13 -0500 Subject: dax: Fix huge page faults Using xas_load() with a PMD-sized xa_state would work if either a PMD-sized entry was present or a PTE sized entry was present in the first 64 entries (of the 512 PTEs in a PMD on x86). If there was no PTE in the first 64 entries, grab_mapping_entry() would believe there were no entries present, allocate a PMD-sized entry and overwrite the PTE in the page cache. Use xas_find_conflict() instead which turns out to simplify both get_unlocked_entry() and grab_mapping_entry(). Also remove a WARN_ON_ONCE from grab_mapping_entry() as it will have already triggered in get_unlocked_entry(). 
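A simplified sketch of the difference, using the XArray calls named in the patch (the xa_state setup and the index variable here are illustrative, not the actual dax.c code): xas_find_conflict() walks the whole range covered by a multi-order state and returns any conflicting entry, whereas xas_load() can miss a PTE entry sitting further into the PMD range:

	XA_STATE_ORDER(xas, &mapping->i_pages, index, PMD_ORDER);

	entry = xas_find_conflict(&xas);	/* any entry overlapping the PMD range */
	if (!entry) {
		/* range really is empty: safe to install a PMD-sized entry */
	}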
Fixes: cfc93c6c6c96 ("dax: Convert dax_insert_pfn_mkwrite to XArray") Signed-off-by: Matthew Wilcox --- fs/dax.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 5426252375f6..cf2394e2bf4b 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -216,9 +216,8 @@ static void *get_unlocked_entry(struct xa_state *xas) ewait.wait.func = wake_exceptional_entry_func; for (;;) { - entry = xas_load(xas); - if (!entry || xa_is_internal(entry) || - WARN_ON_ONCE(!xa_is_value(entry)) || + entry = xas_find_conflict(xas); + if (!entry || WARN_ON_ONCE(!xa_is_value(entry)) || !dax_is_locked(entry)) return entry; @@ -458,11 +457,9 @@ static void *grab_mapping_entry(struct xa_state *xas, retry: xas_lock_irq(xas); entry = get_unlocked_entry(xas); - if (xa_is_internal(entry)) - goto fallback; if (entry) { - if (WARN_ON_ONCE(!xa_is_value(entry))) { + if (!xa_is_value(entry)) { xas_set_err(xas, EIO); goto out_unlock; } @@ -1641,8 +1638,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) /* Did we race with someone splitting entry or so? */ if (!entry || (order == 0 && !dax_is_pte_entry(entry)) || - (order == PMD_ORDER && (xa_is_internal(entry) || - !dax_is_pmd_entry(entry)))) { + (order == PMD_ORDER && !dax_is_pmd_entry(entry))) { put_unlocked_entry(&xas, entry); xas_unlock_irq(&xas); trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, -- cgit v1.2.3-59-g8ed1b From a76cf1a474d7dbcd9336b5f5afb0162baa142cf0 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 16 Nov 2018 15:08:18 -0800 Subject: mm: don't reclaim inodes with many attached pages Spock reported that commit 172b06c32b94 ("mm: slowly shrink slabs with a relatively small number of objects") leads to a regression on his setup: periodically the majority of the pagecache is evicted without an obvious reason, while before the change the amount of free memory was balancing around the watermark. The reason behind is that the mentioned above change created some minimal background pressure on the inode cache. The problem is that if an inode is considered to be reclaimed, all belonging pagecache page are stripped, no matter how many of them are there. So, if a huge multi-gigabyte file is cached in the memory, and the goal is to reclaim only few slab objects (unused inodes), we still can eventually evict all gigabytes of the pagecache at once. The workload described by Spock has few large non-mapped files in the pagecache, so it's especially noticeable. To solve the problem let's postpone the reclaim of inodes, which have more than 1 attached page. Let's wait until the pagecache pages will be evicted naturally by scanning the corresponding LRU lists, and only then reclaim the inode structure. Link: http://lkml.kernel.org/r/20181023164302.20436-1-guro@fb.com Signed-off-by: Roman Gushchin Reported-by: Spock Tested-by: Spock Reviewed-by: Andrew Morton Cc: Michal Hocko Cc: Rik van Riel Cc: Randy Dunlap Cc: [4.19.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index 9e198f00b64c..35d2108d567c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -730,8 +730,11 @@ static enum lru_status inode_lru_isolate(struct list_head *item, return LRU_REMOVED; } - /* recently referenced inodes get one more pass */ - if (inode->i_state & I_REFERENCED) { + /* + * Recently referenced inodes and inodes with many attached pages + * get one more pass. 
+ */ + if (inode->i_state & I_REFERENCED || inode->i_data.nrpages > 1) { inode->i_state &= ~I_REFERENCED; spin_unlock(&inode->i_lock); return LRU_ROTATE; -- cgit v1.2.3-59-g8ed1b From 5040f8df56fb90c7919f1c9b0b6e54c843437456 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Fri, 16 Nov 2018 15:08:25 -0800 Subject: ocfs2: free up write context when direct IO failed The write context should also be freed even when direct IO failed. Otherwise a memory leak is introduced and entries remain in oi->ip_unwritten_list causing the following BUG later in unlink path: ERROR: bug expression: !list_empty(&oi->ip_unwritten_list) ERROR: Clear inode of 215043, inode has unwritten extents ... Call Trace: ? __set_current_blocked+0x42/0x68 ocfs2_evict_inode+0x91/0x6a0 [ocfs2] ? bit_waitqueue+0x40/0x33 evict+0xdb/0x1af iput+0x1a2/0x1f7 do_unlinkat+0x194/0x28f SyS_unlinkat+0x1b/0x2f do_syscall_64+0x79/0x1ae entry_SYSCALL_64_after_hwframe+0x151/0x0 This patch also logs, with frequency limit, direct IO failures. Link: http://lkml.kernel.org/r/20181102170632.25921-1-wen.gang.wang@oracle.com Signed-off-by: Wengang Wang Reviewed-by: Junxiao Bi Reviewed-by: Changwei Ge Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 12 ++++++++++-- fs/ocfs2/cluster/masklog.h | 9 +++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index da578ad4c08f..eb1ce30412dc 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2411,8 +2411,16 @@ static int ocfs2_dio_end_io(struct kiocb *iocb, /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); - if (bytes > 0 && private) - ret = ocfs2_dio_end_io_write(inode, private, offset, bytes); + if (bytes <= 0) + mlog_ratelimited(ML_ERROR, "Direct IO failed, bytes = %lld", + (long long)bytes); + if (private) { + if (bytes > 0) + ret = ocfs2_dio_end_io_write(inode, private, offset, + bytes); + else + ocfs2_dio_free_write_ctx(inode, private); + } ocfs2_iocb_clear_rw_locked(iocb); diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 308ea0eb35fd..a396096a5099 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -178,6 +178,15 @@ do { \ ##__VA_ARGS__); \ } while (0) +#define mlog_ratelimited(mask, fmt, ...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + if (__ratelimit(&_rs)) \ + mlog(mask, fmt, ##__VA_ARGS__); \ +} while (0) + #define mlog_errno(st) ({ \ int _st = (st); \ if (_st != -ERESTARTSYS && _st != -EINTR && \ -- cgit v1.2.3-59-g8ed1b From 909e22e05353a783c526829427e9a8de122fba9c Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Mon, 19 Nov 2018 11:32:41 +0800 Subject: exportfs: fix 'passing zero to ERR_PTR()' warning Fix a static code checker warning: fs/exportfs/expfs.c:171 reconnect_one() warn: passing zero to 'ERR_PTR' The error path for lookup_one_len_unlocked failure should set err to PTR_ERR. 
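Condensed from the patch below, the error path simply propagates the real error instead of the stale zero (sketch):

	tmp = lookup_one_len_unlocked(nbuf, parent, strlen(nbuf));
	if (IS_ERR(tmp)) {
		dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp));
		err = PTR_ERR(tmp);	/* avoid returning ERR_PTR(0) */
		goto out_err;
	}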
Fixes: bbf7a8a3562f ("exportfs: move most of reconnect_path to helper function") Signed-off-by: YueHaibing Signed-off-by: Al Viro --- fs/exportfs/expfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 645158dc33f1..c8a3dfda1764 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -147,6 +147,7 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, tmp = lookup_one_len_unlocked(nbuf, parent, strlen(nbuf)); if (IS_ERR(tmp)) { dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp)); + err = PTR_ERR(tmp); goto out_err; } if (tmp != dentry) { -- cgit v1.2.3-59-g8ed1b From b54e41f5efcb4316b2f30b30c2535cc194270373 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Nov 2018 13:43:17 +0100 Subject: udf: Allow mounting volumes with incorrect identification strings Commit c26f6c615788 ("udf: Fix conversion of 'dstring' fields to UTF8") started to be more strict when checking whether converted strings are properly formatted. Sudip reports that there are DVDs where the volume identification string is actually too long - UDF reports: [ 632.309320] UDF-fs: incorrect dstring lengths (32/32) during mount and fails the mount. This is mostly harmless failure as we don't need volume identification (and even less volume set identification) for anything. So just truncate the volume identification string if it is too long and replace it with 'Invalid' if we just cannot convert it for other reasons. This keeps slightly incorrect media still mountable. CC: stable@vger.kernel.org Fixes: c26f6c615788 ("udf: Fix conversion of 'dstring' fields to UTF8") Reported-and-tested-by: Sudip Mukherjee Signed-off-by: Jan Kara --- fs/udf/super.c | 16 ++++++++++------ fs/udf/unicode.c | 14 +++++++++++--- 2 files changed, 21 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/udf/super.c b/fs/udf/super.c index 8f2f56d9a1bb..e3d684ea3203 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -827,16 +827,20 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) ret = udf_dstrCS0toChar(sb, outstr, 31, pvoldesc->volIdent, 32); - if (ret < 0) - goto out_bh; - - strncpy(UDF_SB(sb)->s_volume_ident, outstr, ret); + if (ret < 0) { + strcpy(UDF_SB(sb)->s_volume_ident, "InvalidName"); + pr_warn("incorrect volume identification, setting to " + "'InvalidName'\n"); + } else { + strncpy(UDF_SB(sb)->s_volume_ident, outstr, ret); + } udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident); ret = udf_dstrCS0toChar(sb, outstr, 127, pvoldesc->volSetIdent, 128); - if (ret < 0) + if (ret < 0) { + ret = 0; goto out_bh; - + } outstr[ret] = 0; udf_debug("volSetIdent[] = '%s'\n", outstr); diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index 45234791fec2..5fcfa96463eb 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -351,6 +351,11 @@ try_again: return u_len; } +/* + * Convert CS0 dstring to output charset. Warning: This function may truncate + * input string if it is too long as it is used for informational strings only + * and it is better to truncate the string than to refuse mounting a media. 
+ */ int udf_dstrCS0toChar(struct super_block *sb, uint8_t *utf_o, int o_len, const uint8_t *ocu_i, int i_len) { @@ -359,9 +364,12 @@ int udf_dstrCS0toChar(struct super_block *sb, uint8_t *utf_o, int o_len, if (i_len > 0) { s_len = ocu_i[i_len - 1]; if (s_len >= i_len) { - pr_err("incorrect dstring lengths (%d/%d)\n", - s_len, i_len); - return -EINVAL; + pr_warn("incorrect dstring lengths (%d/%d)," + " truncating\n", s_len, i_len); + s_len = i_len - 1; + /* 2-byte encoding? Need to round properly... */ + if (ocu_i[0] == 16) + s_len -= (s_len - 1) & 2; } } -- cgit v1.2.3-59-g8ed1b From c22397888f1eed98cd59f0a88f2a5f6925f80e15 Mon Sep 17 00:00:00 2001 From: Chanho Min Date: Mon, 12 Nov 2018 12:54:45 +0900 Subject: exec: make de_thread() freezable Suspend fails due to the exec family of functions blocking the freezer. The cause is that de_thread() sleeps in TASK_UNINTERRUPTIBLE waiting for all sub-threads to die, and we have a deadlock if one of them is frozen. This can also occur when schedule() is waiting for the group thread leader to exit, if the leader is frozen. On our machine, it causes a freeze timeout as follows: Freezing of tasks failed after 20.010 seconds (1 tasks refusing to freeze, wq_busy=0): setcpushares-ls D ffffffc00008ed70 0 5817 1483 0x0040000d Call trace: [] __switch_to+0x88/0xa0 [] __schedule+0x1bc/0x720 [] schedule+0x40/0xa8 [] flush_old_exec+0xdc/0x640 [] load_elf_binary+0x2a8/0x1090 [] search_binary_handler+0x9c/0x240 [] load_script+0x20c/0x228 [] search_binary_handler+0x9c/0x240 [] do_execveat_common.isra.14+0x4f8/0x6e8 [] compat_SyS_execve+0x38/0x48 [] el0_svc_naked+0x24/0x28 To fix this, make de_thread() freezable. It looks safe and works fine. Suggested-by: Oleg Nesterov Signed-off-by: Chanho Min Acked-by: Oleg Nesterov Acked-by: Pavel Machek Acked-by: Michal Hocko Signed-off-by: Rafael J. Wysocki --- fs/exec.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index fc281b738a98..acc3a5536384 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -1083,7 +1084,7 @@ static int de_thread(struct task_struct *tsk) while (sig->notify_count) { __set_current_state(TASK_KILLABLE); spin_unlock_irq(lock); - schedule(); + freezable_schedule(); if (unlikely(__fatal_signal_pending(tsk))) goto killed; spin_lock_irq(lock); @@ -1111,7 +1112,7 @@ static int de_thread(struct task_struct *tsk) __set_current_state(TASK_KILLABLE); write_unlock_irq(&tasklist_lock); cgroup_threadgroup_change_end(tsk); - schedule(); + freezable_schedule(); if (unlikely(__fatal_signal_pending(tsk))) goto killed; } -- cgit v1.2.3-59-g8ed1b From 25bbe21bf427a81b8e3ccd480ea0e1d940256156 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 16 Nov 2018 15:50:02 -0500 Subject: dax: Avoid losing wakeup in dax_lock_mapping_entry After calling get_unlocked_entry(), you have to call put_unlocked_entry() to avoid subsequent waiters losing wakeups.
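(An aside, not part of the patch: the rule being enforced is that a waiter which consumed a wakeup for an entry but then decides not to take it must pass the wakeup on. Below is a loose, simplified pthread analogy of that hand-off; the names mirror the DAX helpers but the code is illustrative only, not the kernel implementation.)

/* Loose userspace analogy (illustration only): a consumed wakeup must be
 * passed on if the entry is not actually taken, or later waiters stall. */
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t xa_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t entry_waitq = PTHREAD_COND_INITIALIZER;
static bool entry_locked;

/* analogue of get_unlocked_entry(): sleep until the entry is unlocked */
static void get_unlocked_entry_model(void)
{
	pthread_mutex_lock(&xa_lock);
	while (entry_locked)
		pthread_cond_wait(&entry_waitq, &xa_lock);
	/* returns holding xa_lock, having consumed one wakeup */
}

/* analogue of put_unlocked_entry(): we are bailing out (e.g. to retry with
 * a different mapping), so hand the wakeup to the next waiter before
 * dropping the lock; skipping this signal is the lost-wakeup bug. */
static void put_unlocked_entry_model(void)
{
	pthread_cond_signal(&entry_waitq);
	pthread_mutex_unlock(&xa_lock);
}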
Fixes: c2a7d2a11552 ("filesystem-dax: Introduce dax_lock_mapping_entry()") Cc: stable@vger.kernel.org Signed-off-by: Matthew Wilcox --- fs/dax.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index cf2394e2bf4b..9bcce89ea18e 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -391,6 +391,7 @@ bool dax_lock_mapping_entry(struct page *page) rcu_read_unlock(); entry = get_unlocked_entry(&xas); xas_unlock_irq(&xas); + put_unlocked_entry(&xas, entry); rcu_read_lock(); continue; } -- cgit v1.2.3-59-g8ed1b From 59e4293149106fb92530f8e56fa3992d8548c5e6 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 14 Nov 2018 07:46:40 -0800 Subject: xfs: fix shared extent data corruption due to missing cow reservation Page writeback indirectly handles shared extents via the existence of overlapping COW fork blocks. If COW fork blocks exist, writeback always performs the associated copy-on-write regardless if the underlying blocks are actually shared. If the blocks are shared, then overlapping COW fork blocks must always exist. fstests shared/010 reproduces a case where a buffered write occurs over a shared block without performing the requisite COW fork reservation. This ultimately causes writeback to the shared extent and data corruption that is detected across md5 checks of the filesystem across a mount cycle. The problem occurs when a buffered write lands over a shared extent that crosses an extent size hint boundary and that also happens to have a partial COW reservation that doesn't cover the start and end blocks of the data fork extent. For example, a buffered write occurs across the file offset (in FSB units) range of [29, 57]. A shared extent exists at blocks [29, 35] and COW reservation already exists at blocks [32, 34]. After accommodating a COW extent size hint of 32 blocks and the existing reservation at offset 32, xfs_reflink_reserve_cow() allocates 32 blocks of reservation at offset 0 and returns with COW reservation across the range of [0, 34]. The associated data fork extent is still [29, 35], however, which isn't fully covered by the COW reservation. This leads to a buffered write at file offset 35 over a shared extent without associated COW reservation. Writeback eventually kicks in, performs an overwrite of the underlying shared block and causes the associated data corruption. Update xfs_reflink_reserve_cow() to accommodate the fact that a delalloc allocation request may not fully cover the extent in the data fork. Trim the data fork extent appropriately, just as is done for shared extent boundaries and/or existing COW reservations that happen to overlap the start of the data fork extent. This prevents shared/010 failures due to data corruption on reflink enabled filesystems. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_reflink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index ecdb086bc23e..c56bdbfcf7ae 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -296,6 +296,7 @@ xfs_reflink_reserve_cow( if (error) return error; + xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); trace_xfs_reflink_cow_alloc(ip, &got); return 0; } -- cgit v1.2.3-59-g8ed1b From da034bcc6aaaf2a6ba6c5b5e63565c5ef4816a0e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 14 Nov 2018 21:48:18 -0800 Subject: xfs: make xfs_file_remap_range() static xfs_file_remap_range() is only used in fs/xfs/xfs_file.c, so make it static. This addresses a gcc warning when -Wmissing-prototypes is enabled. Signed-off-by: Eric Biggers Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 53c9ab8fb777..e47425071e65 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -920,7 +920,7 @@ out_unlock: } -loff_t +STATIC loff_t xfs_file_remap_range( struct file *file_in, loff_t pos_in, -- cgit v1.2.3-59-g8ed1b From aeabb3c96186a0f944fc2b1f25c84d5eb3a93fa9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 19 Nov 2018 20:11:45 -0500 Subject: NFSv4: Fix a NFSv4 state manager deadlock Fix a deadlock whereby the NFSv4 state manager can get stuck in the delegation return code, waiting for a layout return to complete in another thread. If the server reboots before that other thread completes, then we need to be able to start a second state manager thread in order to perform recovery. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 2 ++ fs/nfs/nfs4state.c | 16 +++++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 8d59c9655ec4..1b994b527518 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -41,6 +41,8 @@ enum nfs4_client_state { NFS4CLNT_MOVED, NFS4CLNT_LEASE_MOVED, NFS4CLNT_DELEGATION_EXPIRED, + NFS4CLNT_RUN_MANAGER, + NFS4CLNT_DELEGRETURN_RUNNING, }; #define NFS4_RENEW_TIMEOUT 0x01 diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index ffea57885394..d8decf2ec48f 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1210,6 +1210,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) struct task_struct *task; char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1]; + set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) return; __module_get(THIS_MODULE); @@ -2503,6 +2504,7 @@ static void nfs4_state_manager(struct nfs_client *clp) /* Ensure exclusive access to NFSv4 state */ do { + clear_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) { section = "purge state"; status = nfs4_purge_lease(clp); @@ -2593,14 +2595,18 @@ static void nfs4_state_manager(struct nfs_client *clp) } nfs4_end_drain_session(clp); - if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) { - nfs_client_return_marked_delegations(clp); - continue; + nfs4_clear_state_manager_bit(clp); + + if (!test_and_set_bit(NFS4CLNT_DELEGRETURN_RUNNING, &clp->cl_state)) { + if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) { + nfs_client_return_marked_delegations(clp); + set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); + } + clear_bit(NFS4CLNT_DELEGRETURN_RUNNING, &clp->cl_state); } - 
nfs4_clear_state_manager_bit(clp); /* Did we race with an attempt to give us more work? */ - if (clp->cl_state == 0) + if (!test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state)) return; if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) return; -- cgit v1.2.3-59-g8ed1b From d61fa8cbf3da85ffca6620f261354941c126ee23 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 19 Nov 2018 13:31:07 -0800 Subject: xfs: uncached buffer tracing needs to print bno Useless: xfs_buf_get_uncached: dev 253:32 bno 0xffffffffffffffff nblks 0x1 ... xfs_buf_unlock: dev 253:32 bno 0xffffffffffffffff nblks 0x1 ... xfs_buf_submit: dev 253:32 bno 0xffffffffffffffff nblks 0x1 ... xfs_buf_hold: dev 253:32 bno 0xffffffffffffffff nblks 0x1 ... xfs_buf_iowait: dev 253:32 bno 0xffffffffffffffff nblks 0x1 ... xfs_buf_iodone: dev 253:32 bno 0xffffffffffffffff nblks 0x1 ... xfs_buf_iowait_done: dev 253:32 bno 0xffffffffffffffff nblks 0x1 ... xfs_buf_rele: dev 253:32 bno 0xffffffffffffffff nblks 0x1 ... Useful: xfs_buf_get_uncached: dev 253:32 bno 0xffffffffffffffff nblks 0x1 ... xfs_buf_unlock: dev 253:32 bno 0xffffffffffffffff nblks 0x1 ... xfs_buf_submit: dev 253:32 bno 0x200b5 nblks 0x1 ... xfs_buf_hold: dev 253:32 bno 0x200b5 nblks 0x1 ... xfs_buf_iowait: dev 253:32 bno 0x200b5 nblks 0x1 ... xfs_buf_iodone: dev 253:32 bno 0x200b5 nblks 0x1 ... xfs_buf_iowait_done: dev 253:32 bno 0x200b5 nblks 0x1 ... xfs_buf_rele: dev 253:32 bno 0x200b5 nblks 0x1 ... Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_trace.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 3043e5ed6495..8a6532aae779 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -280,7 +280,10 @@ DECLARE_EVENT_CLASS(xfs_buf_class, ), TP_fast_assign( __entry->dev = bp->b_target->bt_dev; - __entry->bno = bp->b_bn; + if (bp->b_bn == XFS_BUF_DADDR_NULL) + __entry->bno = bp->b_maps[0].bm_bn; + else + __entry->bno = bp->b_bn; __entry->nblks = bp->b_length; __entry->hold = atomic_read(&bp->b_hold); __entry->pincount = atomic_read(&bp->b_pin_count); -- cgit v1.2.3-59-g8ed1b From d43aaf1685aa471f0593685c9f54d53e3af3cf3f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 19 Nov 2018 13:31:08 -0800 Subject: xfs: fix transient reference count error in xfs_buf_resubmit_failed_buffers When retrying a failed inode or dquot buffer, xfs_buf_resubmit_failed_buffers() clears all the failed flags from the inde/dquot log items. In doing so, it also drops all the reference counts on the buffer that the failed log items hold. This means it can drop all the active references on the buffer and hence free the buffer before it queues it for write again. Putting the buffer on the delwri queue takes a reference to the buffer (so that it hangs around until it has been written and completed), but this goes bang if the buffer has already been freed. Hence we need to add the buffer to the delwri queue before we remove the failed flags from the log items attached to the buffer to ensure it always remains referenced during the resubmit process. Reported-by: Josef Bacik Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_buf_item.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 12d8455bfbb2..010db5f8fb00 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1233,9 +1233,23 @@ xfs_buf_iodone( } /* - * Requeue a failed buffer for writeback + * Requeue a failed buffer for writeback. * - * Return true if the buffer has been re-queued properly, false otherwise + * We clear the log item failed state here as well, but we have to be careful + * about reference counts because the only active reference counts on the buffer + * may be the failed log items. Hence if we clear the log item failed state + * before queuing the buffer for IO we can release all active references to + * the buffer and free it, leading to use after free problems in + * xfs_buf_delwri_queue. It makes no difference to the buffer or log items which + * order we process them in - the buffer is locked, and we own the buffer list + * so nothing on them is going to change while we are performing this action. + * + * Hence we can safely queue the buffer for IO before we clear the failed log + * item state, therefore always having an active reference to the buffer and + * avoiding the transient zero-reference state that leads to use-after-free. + * + * Return true if the buffer was added to the buffer list, false if it was + * already on the buffer list. */ bool xfs_buf_resubmit_failed_buffers( @@ -1243,16 +1257,16 @@ xfs_buf_resubmit_failed_buffers( struct list_head *buffer_list) { struct xfs_log_item *lip; + bool ret; + + ret = xfs_buf_delwri_queue(bp, buffer_list); /* - * Clear XFS_LI_FAILED flag from all items before resubmit - * - * XFS_LI_FAILED set/clear is protected by ail_lock, caller this + * XFS_LI_FAILED set/clear is protected by ail_lock, caller of this * function already have it acquired */ list_for_each_entry(lip, &bp->b_li_list, li_bio_list) xfs_clear_li_failed(lip); - /* Add this buffer back to the delayed write list */ - return xfs_buf_delwri_queue(bp, buffer_list); + return ret; } -- cgit v1.2.3-59-g8ed1b From c08768977b9a65cab9bcfd1ba30ffb686b2b7c69 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 19 Nov 2018 13:31:08 -0800 Subject: xfs: finobt AG reserves don't consider last AG can be a runt The last AG may be very small compared to all other AGs, and hence AG reservations based on the superblock AG size may actually consume more space than the AG actually has. This results in assert failures like: XFS: Assertion failed: xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved <= pag->pagf_freeblks + pag->pagf_flcount, file: fs/xfs/libxfs/xfs_ag_resv.c, line: 319 [ 48.932891] xfs_ag_resv_init+0x1bd/0x1d0 [ 48.933853] xfs_fs_reserve_ag_blocks+0x37/0xb0 [ 48.934939] xfs_mountfs+0x5b3/0x920 [ 48.935804] xfs_fs_fill_super+0x462/0x640 [ 48.936784] ? xfs_test_remount_options+0x60/0x60 [ 48.937908] mount_bdev+0x178/0x1b0 [ 48.938751] mount_fs+0x36/0x170 [ 48.939533] vfs_kern_mount.part.43+0x54/0x130 [ 48.940596] do_mount+0x20e/0xcb0 [ 48.941396] ? memdup_user+0x3e/0x70 [ 48.942249] ksys_mount+0xba/0xd0 [ 48.943046] __x64_sys_mount+0x21/0x30 [ 48.943953] do_syscall_64+0x54/0x170 [ 48.944835] entry_SYSCALL_64_after_hwframe+0x49/0xbe Hence we need to ensure the finobt per-ag space reservations take into account the size of the last AG rather than treat it like all the other full size AGs.
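(To make the over-reservation concrete, and not taken from the patch: the geometry below is made up, but it shows how a reservation "ask" sized from sb_agblocks can dwarf anything the runt last AG could ever need, and hence exceed its free space.)

/* Back-of-the-envelope only; all numbers are assumed, not real XFS geometry. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t sb_agblocks = 245760;		/* nominal AG size in blocks (assumed) */
	uint64_t runt_agblocks = 10240;		/* actual size of the runt last AG (assumed) */
	uint64_t inopblock = 16;		/* inodes per block (assumed) */
	uint64_t inodes_per_chunk = 64;		/* inodes per inode chunk (assumed) */

	/* upper bound on finobt records the reservation is sized for */
	printf("sized from sb_agblocks: %llu records\n",
	       (unsigned long long)(sb_agblocks * inopblock / inodes_per_chunk));	/* 61440 */
	printf("sized from the real AG: %llu records\n",
	       (unsigned long long)(runt_agblocks * inopblock / inodes_per_chunk));	/*  2560 */
	return 0;
}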
Note that both refcountbt and rmapbt already take the size of the AG into account via reading the AGF length directly. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ialloc_btree.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 86c50208a143..7fbf8af0b159 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -538,15 +538,18 @@ xfs_inobt_rec_check_count( static xfs_extlen_t xfs_inobt_max_size( - struct xfs_mount *mp) + struct xfs_mount *mp, + xfs_agnumber_t agno) { + xfs_agblock_t agblocks = xfs_ag_block_count(mp, agno); + /* Bail out if we're uninitialized, which can happen in mkfs. */ if (mp->m_inobt_mxr[0] == 0) return 0; return xfs_btree_calc_size(mp->m_inobt_mnr, - (uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock / - XFS_INODES_PER_CHUNK); + (uint64_t)agblocks * mp->m_sb.sb_inopblock / + XFS_INODES_PER_CHUNK); } static int @@ -594,7 +597,7 @@ xfs_finobt_calc_reserves( if (error) return error; - *ask += xfs_inobt_max_size(mp); + *ask += xfs_inobt_max_size(mp, agno); *used += tree_len; return 0; } -- cgit v1.2.3-59-g8ed1b From 7f9f71be84bcab368e58020a42f6d0dd97adf0ce Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 19 Nov 2018 13:31:09 -0800 Subject: xfs: extent shifting doesn't fully invalidate page cache The extent shifting code uses a flush and invalidate mechanism prior to shifting extents around. This is similar to what xfs_free_file_space() does, but it doesn't take into account things like page cache vs block size differences, and it will fail if there is a page that is currently busy. xfs_flush_unmap_range() handles all of these cases, so just convert xfs_prepare_shift() to use that mechanism rather than having its own special sauce. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_util.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 5d263dfdb3bc..167ff4297e5c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1195,13 +1195,7 @@ xfs_prepare_shift( * Writeback and invalidate cache for the remainder of the file as we're * about to shift down every extent from offset to EOF. */ - error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, offset, -1); - if (error) - return error; - error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, - offset >> PAGE_SHIFT, -1); - if (error) - return error; + error = xfs_flush_unmap_range(ip, offset, XFS_ISIZE(ip)); /* * Clean out anything hanging around in the cow fork now that -- cgit v1.2.3-59-g8ed1b From a4390aee72713d9e73f1132bcdeb17d72fbbf974 Mon Sep 17 00:00:00 2001 From: Robbie Ko Date: Wed, 14 Nov 2018 18:32:37 +0000 Subject: Btrfs: send, fix infinite loop due to directory rename dependencies When doing an incremental send, due to the need of delaying directory move (rename) operations we can end up in an infinite loop at apply_children_dir_moves(). An example scenario that triggers this problem is described below, where directory names correspond to the numbers of their respective inodes. Parent snapshot: .
|--- 261/ |--- 271/ |--- 266/ |--- 259/ |--- 260/ | |--- 267 | |--- 264/ | |--- 258/ | |--- 257/ | |--- 265/ |--- 268/ |--- 269/ | |--- 262/ | |--- 270/ |--- 272/ | |--- 263/ | |--- 275/ | |--- 274/ |--- 273/ Send snapshot: . |-- 275/ |-- 274/ |-- 273/ |-- 262/ |-- 269/ |-- 258/ |-- 271/ |-- 268/ |-- 267/ |-- 270/ |-- 259/ | |-- 265/ | |-- 272/ |-- 257/ |-- 260/ |-- 264/ |-- 263/ |-- 261/ |-- 266/ When processing inode 257 we delay its move (rename) operation because its new parent in the send snapshot, inode 272, was not yet processed. Then when processing inode 272, we delay the move operation for that inode because inode 274 is its ancestor in the send snapshot. Finally we delay the move operation for inode 274 when processing it because inode 275 is its new parent in the send snapshot and was not yet moved. When finishing processing inode 275, we start to do the move operations that were previously delayed (at apply_children_dir_moves()), resulting in the following iterations: 1) We issue the move operation for inode 274; 2) Because inode 262 depended on the move operation of inode 274 (it was delayed because 274 is its ancestor in the send snapshot), we issue the move operation for inode 262; 3) We issue the move operation for inode 272, because it was delayed by inode 274 too (ancestor of 272 in the send snapshot); 4) We issue the move operation for inode 269 (it was delayed by 262); 5) We issue the move operation for inode 257 (it was delayed by 272); 6) We issue the move operation for inode 260 (it was delayed by 272); 7) We issue the move operation for inode 258 (it was delayed by 269); 8) We issue the move operation for inode 264 (it was delayed by 257); 9) We issue the move operation for inode 271 (it was delayed by 258); 10) We issue the move operation for inode 263 (it was delayed by 264); 11) We issue the move operation for inode 268 (it was delayed by 271); 12) We verify if we can issue the move operation for inode 270 (it was delayed by 271). We detect a path loop in the current state, because inode 267 needs to be moved first before we can issue the move operation for inode 270. So we delay again the move operation for inode 270, this time we will attempt to do it after inode 267 is moved; 13) We issue the move operation for inode 261 (it was delayed by 263); 14) We verify if we can issue the move operation for inode 266 (it was delayed by 263). We detect a path loop in the current state, because inode 270 needs to be moved first before we can issue the move operation for inode 266. So we delay again the move operation for inode 266, this time we will attempt to do it after inode 270 is moved (its move operation was delayed in step 12); 15) We issue the move operation for inode 267 (it was delayed by 268); 16) We verify if we can issue the move operation for inode 266 (it was delayed by 270). We detect a path loop in the current state, because inode 270 needs to be moved first before we can issue the move operation for inode 266. So we delay again the move operation for inode 266, this time we will attempt to do it after inode 270 is moved (its move operation was delayed in step 12). So here we added again the same delayed move operation that we added in step 14; 17) We attempt again to see if we can issue the move operation for inode 266, and as in step 16, we realize we can not due to a path loop in the current state due to a dependency on inode 270. 
Again we delay inode's 266 rename to happen after inode's 270 move operation, adding the same dependency to the empty stack that we did in steps 14 and 16. The next iteration will pick the same move dependency on the stack (the only entry) and realize again there is still a path loop and then again the same dependency to the stack, over and over, resulting in an infinite loop. So fix this by preventing adding the same move dependency entries to the stack by removing each pending move record from the red black tree of pending moves. This way the next call to get_pending_dir_moves() will not return anything for the current parent inode. A test case for fstests, with this reproducer, follows soon. Signed-off-by: Robbie Ko Reviewed-by: Filipe Manana [Wrote changelog with example and more clear explanation] Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/send.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 094cc1444a90..5be83b5a1b43 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3340,7 +3340,8 @@ static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m) kfree(m); } -static void tail_append_pending_moves(struct pending_dir_move *moves, +static void tail_append_pending_moves(struct send_ctx *sctx, + struct pending_dir_move *moves, struct list_head *stack) { if (list_empty(&moves->list)) { @@ -3351,6 +3352,10 @@ static void tail_append_pending_moves(struct pending_dir_move *moves, list_add_tail(&moves->list, stack); list_splice_tail(&list, stack); } + if (!RB_EMPTY_NODE(&moves->node)) { + rb_erase(&moves->node, &sctx->pending_dir_moves); + RB_CLEAR_NODE(&moves->node); + } } static int apply_children_dir_moves(struct send_ctx *sctx) @@ -3365,7 +3370,7 @@ static int apply_children_dir_moves(struct send_ctx *sctx) return 0; INIT_LIST_HEAD(&stack); - tail_append_pending_moves(pm, &stack); + tail_append_pending_moves(sctx, pm, &stack); while (!list_empty(&stack)) { pm = list_first_entry(&stack, struct pending_dir_move, list); @@ -3376,7 +3381,7 @@ static int apply_children_dir_moves(struct send_ctx *sctx) goto out; pm = get_pending_dir_moves(sctx, parent_ino); if (pm) - tail_append_pending_moves(pm, &stack); + tail_append_pending_moves(sctx, pm, &stack); } return 0; -- cgit v1.2.3-59-g8ed1b From 2c307174ab77e34645e75e12827646e044d273c3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 19 Nov 2018 13:31:10 -0800 Subject: xfs: flush removing page cache in xfs_reflink_remap_prep On a sub-page block size filesystem, fsx is failing with a data corruption after a series of operations involving copying a file with the destination offset beyond EOF of the destination of the file: 8093(157 mod 256): TRUNCATE DOWN from 0x7a120 to 0x50000 ******WWWW 8094(158 mod 256): INSERT 0x25000 thru 0x25fff (0x1000 bytes) 8095(159 mod 256): COPY 0x18000 thru 0x1afff (0x3000 bytes) to 0x2f400 8096(160 mod 256): WRITE 0x5da00 thru 0x651ff (0x7800 bytes) HOLE 8097(161 mod 256): COPY 0x2000 thru 0x5fff (0x4000 bytes) to 0x6fc00 The second copy here is beyond EOF, and it is to sub-page (4k) but block aligned (1k) offset. The clone runs the EOF zeroing, landing in a pre-existing post-eof delalloc extent. This zeroes the post-eof extents in the page cache just fine, dirtying the pages correctly. The problem is that xfs_reflink_remap_prep() now truncates the page cache over the range that it is copying it to, and rounds that down to cover the entire start page. 
This removes the dirty page over the delalloc extent from the page cache without having written it back. Hence later, when the page cache is flushed, the page at offset 0x6f000 has not been written back and hence exposes stale data, which fsx trips over less than 10 operations later. Fix this by changing xfs_reflink_remap_prep() to use xfs_flush_unmap_range(). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_util.c | 2 +- fs/xfs/xfs_bmap_util.h | 3 +++ fs/xfs/xfs_reflink.c | 17 +++++++++++++---- 3 files changed, 17 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 167ff4297e5c..404e581f1ea1 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1042,7 +1042,7 @@ out_trans_cancel: goto out_unlock; } -static int +int xfs_flush_unmap_range( struct xfs_inode *ip, xfs_off_t offset, diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 87363d136bb6..7a78229cf1a7 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -80,4 +80,7 @@ int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, xfs_extnum_t *nextents, xfs_filblks_t *count); +int xfs_flush_unmap_range(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t len); + #endif /* __XFS_BMAP_UTIL_H__ */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index c56bdbfcf7ae..322a852ce284 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1352,10 +1352,19 @@ xfs_reflink_remap_prep( if (ret) goto out_unlock; - /* Zap any page cache for the destination file's range. */ - truncate_inode_pages_range(&inode_out->i_data, - round_down(pos_out, PAGE_SIZE), - round_up(pos_out + *len, PAGE_SIZE) - 1); + /* + * If pos_out > EOF, we may have dirtied blocks between EOF and + * pos_out. In that case, we need to extend the flush and unmap to cover + * from EOF to the end of the copy length. + */ + if (pos_out > XFS_ISIZE(dest)) { + loff_t flen = *len + (pos_out - XFS_ISIZE(dest)); + ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen); + } else { + ret = xfs_flush_unmap_range(dest, pos_out, *len); + } + if (ret) + goto out_unlock; return 1; out_unlock: -- cgit v1.2.3-59-g8ed1b From 9230a0b65b47fe6856c4468ec0175c4987e5bede Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 19 Nov 2018 22:50:08 -0800 Subject: xfs: delalloc -> unwritten COW fork allocation can go wrong Long saga. There have been days spent following this through dead end after dead end in multi-GB event traces. This morning, after writing a trace-cmd wrapper that enabled me to be more selective about XFS trace points, I discovered that I could get just enough essential tracepoints enabled that there was a 50:50 chance the fsx config would fail at ~115k ops. If it didn't fail at op 115547, I stopped fsx at op 115548 anyway. That gave me two traces - one where the problem manifested, and one where it didn't. After refining the traces to have the necessary information, I found that in the failing case there was a real extent in the COW fork compared to an unwritten extent in the working case. Walking back through the two traces to the point where the CWO fork extents actually diverged, I found that the bad case had an extra unwritten extent in it. This is likely because the bug it led me to had triggered multiple times in those 115k ops, leaving stray COW extents around. 
What I saw was a COW delalloc conversion to an unwritten extent (as they should always be through xfs_iomap_write_allocate()) resulted in a /written extent/: xfs_writepage: dev 259:0 ino 0x83 pgoff 0x17000 size 0x79a00 offset 0 length 0 xfs_iext_remove: dev 259:0 ino 0x83 state RC|LF|RF|COW cur 0xffff888247b899c0/2 offset 32 block 152 count 20 flag 1 caller xfs_bmap_add_extent_delay_real xfs_bmap_pre_update: dev 259:0 ino 0x83 state RC|LF|RF|COW cur 0xffff888247b899c0/1 offset 1 block 4503599627239429 count 31 flag 0 caller xfs_bmap_add_extent_delay_real xfs_bmap_post_update: dev 259:0 ino 0x83 state RC|LF|RF|COW cur 0xffff888247b899c0/1 offset 1 block 121 count 51 flag 0 caller xfs_bmap_add_ex Basically, Cow fork before: 0 1 32 52 +H+DDDDDDDDDDDD+UUUUUUUUUUU+ PREV RIGHT COW delalloc conversion allocates: 1 32 +uuuuuuuuuuuu+ NEW And the result according to the xfs_bmap_post_update trace was: 0 1 32 52 +H+wwwwwwwwwwwwwwwwwwwwwwww+ PREV Which is clearly wrong - it should be a merged unwritten extent, not an unwritten extent. That lead me to look at the LEFT_FILLING|RIGHT_FILLING|RIGHT_CONTIG case in xfs_bmap_add_extent_delay_real(), and sure enough, there's the bug. It takes the old delalloc extent (PREV) and adds the length of the RIGHT extent to it, takes the start block from NEW, removes the RIGHT extent and then updates PREV with the new extent. What it fails to do is update PREV.br_state. For delalloc, this is always XFS_EXT_NORM, while in this case we are converting the delayed allocation to unwritten, so it needs to be updated to XFS_EXT_UNWRITTEN. This LF|RF|RC case does not do this, and so the resultant extent is always written. And that's the bug I've been chasing for a week - a bmap btree bug, not a reflink/dedupe/copy_file_range bug, but a BMBT bug introduced with the recent in core extent tree scalability enhancements. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 74d7228e755b..19e921d1586f 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1694,10 +1694,13 @@ xfs_bmap_add_extent_delay_real( case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: /* * Filling in all of a previously delayed allocation extent. - * The right neighbor is contiguous, the left is not. + * The right neighbor is contiguous, the left is not. Take care + * with delay -> unwritten extent allocation here because the + * delalloc record we are overwriting is always written. */ PREV.br_startblock = new->br_startblock; PREV.br_blockcount += RIGHT.br_blockcount; + PREV.br_state = new->br_state; xfs_iext_next(ifp, &bma->icur); xfs_iext_remove(bma->ip, &bma->icur, state); -- cgit v1.2.3-59-g8ed1b From 0929d8580071c6a1cec1a7916a8f674c243ceee1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 19 Nov 2018 13:31:10 -0800 Subject: iomap: FUA is wrong for DIO O_DSYNC writes into unwritten extents When we write into an unwritten extent via direct IO, we dirty metadata on IO completion to convert the unwritten extent to written. However, when we do the FUA optimisation checks, the inode may be clean and so we issue a FUA write into the unwritten extent. This means we then bypass the generic_write_sync() call after unwritten extent conversion has ben done and we don't force the modified metadata to stable storage. 
This violates O_DSYNC semantics. The window of exposure is a single IO, as the next DIO write will see the inode has dirty metadata and hence will not use the FUA optimisation. Calling generic_write_sync() after completion of the second IO will also sync the first write and it's metadata. Fix this by avoiding the FUA optimisation when writing to unwritten extents. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/iomap.c b/fs/iomap.c index 64ce240217a1..72f3864a2e6b 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1596,12 +1596,13 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, if (iomap->flags & IOMAP_F_NEW) { need_zeroout = true; - } else { + } else if (iomap->type == IOMAP_MAPPED) { /* - * Use a FUA write if we need datasync semantics, this - * is a pure data IO that doesn't require any metadata - * updates and the underlying device supports FUA. This - * allows us to avoid cache flushes on IO completion. + * Use a FUA write if we need datasync semantics, this is a pure + * data IO that doesn't require any metadata updates (including + * after IO completion such as unwritten extent conversion) and + * the underlying device supports FUA. This allows us to avoid + * cache flushes on IO completion. */ if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && (dio->flags & IOMAP_DIO_WRITE_FUA) && -- cgit v1.2.3-59-g8ed1b From b450672fb66b4a991a5b55ee24209ac7ae7690ce Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 19 Nov 2018 13:31:10 -0800 Subject: iomap: sub-block dio needs to zeroout beyond EOF If we are doing sub-block dio that extends EOF, we need to zero the unused tail of the block to initialise the data in it it. If we do not zero the tail of the block, then an immediate mmap read of the EOF block will expose stale data beyond EOF to userspace. Found with fsx running sub-block DIO sizes vs MAPREAD/MAPWRITE operations. Fix this by detecting if the end of the DIO write is beyond EOF and zeroing the tail if necessary. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/iomap.c b/fs/iomap.c index 72f3864a2e6b..77c214194edf 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1677,7 +1677,14 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, dio->submit.cookie = submit_bio(bio); } while (nr_pages); - if (need_zeroout) { + /* + * We need to zeroout the tail of a sub-block write if the extent type + * requires zeroing or the write extends beyond EOF. If we don't zero + * the block tail in the latter case, we can expose stale data via mmap + * reads of the EOF block. + */ + if (need_zeroout || + ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) { /* zero out from the end of the write to the end of the block */ pad = pos & (fs_block_size - 1); if (pad) -- cgit v1.2.3-59-g8ed1b From 4721a6010990971440b4ffefbdf014976b8eda2f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 19 Nov 2018 13:31:11 -0800 Subject: iomap: dio data corruption and spurious errors when pipes fill When doing direct IO to a pipe for do_splice_direct(), then pipe is trivial to fill up and overflow as it can only hold 16 pages. 
At this point bio_iov_iter_get_pages() then returns -EFAULT, and we abort the IO submission process. Unfortunately, iomap_dio_rw() propagates the error back up the stack. The error is converted from EFAULT to EAGAIN in generic_file_splice_read() to tell the splice layers that the pipe is full. do_splice_direct() completely fails to handle EAGAIN errors (it aborts on error) and returns EAGAIN to the caller. copy_file_write() then completely fails to handle EAGAIN as well, and so returns EAGAIN to userspace, having failed to copy the data it was asked to. Avoid this whole steaming pile of fail by having iomap_dio_rw() silently swallow EFAULT errors and so do short reads. To make matters worse, iomap_dio_actor() has a stale data exposure bug when bio_iov_iter_get_pages() fails - it does not zero the tail block that may have been left uncovered by a partial IO. Fix the error handling case to drop to the sub-block zeroing rather than immediately returning the -EFAULT error. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong --- fs/iomap.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/iomap.c b/fs/iomap.c index 77c214194edf..d51e7a2ae641 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1580,7 +1580,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, struct bio *bio; bool need_zeroout = false; bool use_fua = false; - int nr_pages, ret; + int nr_pages, ret = 0; size_t copied = 0; if ((pos | length | align) & ((1 << blkbits) - 1)) @@ -1645,8 +1645,14 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, ret = bio_iov_iter_get_pages(bio, &iter); if (unlikely(ret)) { + /* + * We have to stop part way through an IO. We must fall + * through to the sub-block tail zeroing here, otherwise + * this short IO may expose stale data in the tail of + * the block we haven't written data to. + */ bio_put(bio); - return copied ? copied : ret; + goto zero_tail; } n = bio->bi_iter.bi_size; @@ -1683,6 +1689,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, * the block tail in the latter case, we can expose stale data via mmap * reads of the EOF block. */ +zero_tail: if (need_zeroout || ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) { /* zero out from the end of the write to the end of the block */ @@ -1690,7 +1697,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, if (pad) iomap_dio_zero(dio, iomap, pos, fs_block_size - pad); } - return copied; + return copied ? copied : ret; } static loff_t @@ -1865,6 +1872,15 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->wait_for_completion = true; ret = 0; } + + /* + * Splicing to pipes can fail on a full pipe. We have to + * swallow this to make it look like a short IO + * otherwise the higher splice layers will completely + * mishandle the error and stop moving data. + */ + if (ret == -EFAULT) + ret = 0; break; } pos += ret; -- cgit v1.2.3-59-g8ed1b From 494633fac7896afc2bce6f83fe7319946270540b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 19 Nov 2018 13:31:12 -0800 Subject: vfs: vfs_dedupe_file_range() doesn't return EOPNOTSUPP It returns EINVAL when the operation is not supported by the filesystem. Fix it to return EOPNOTSUPP to be consistent with the man page and clone_file_range(). Clean up the inconsistent error return handling while I'm there. (I know, lipstick on a pig, but every little bit helps...)
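(For reference, not part of the patch: after this change a userspace caller of the dedupe ioctl can tell "not supported on this filesystem" apart from "bad arguments". The sketch below is a minimal caller; the file paths, offsets and length are placeholders.)

/* Minimal FIDEDUPERANGE caller (illustration only; paths and offsets are placeholders). */
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(void)
{
	int src = open("/mnt/test/a", O_RDONLY);
	int dst = open("/mnt/test/b", O_RDWR);
	struct file_dedupe_range *arg;

	if (src < 0 || dst < 0)
		return 1;

	arg = calloc(1, sizeof(*arg) + sizeof(struct file_dedupe_range_info));
	if (!arg)
		return 1;
	arg->src_offset = 0;
	arg->src_length = 4096;
	arg->dest_count = 1;
	arg->info[0].dest_fd = dst;
	arg->info[0].dest_offset = 0;

	if (ioctl(src, FIDEDUPERANGE, arg) < 0) {
		if (errno == EOPNOTSUPP)	/* fs has no dedupe/clone support */
			fprintf(stderr, "dedupe not supported here\n");
		else
			perror("FIDEDUPERANGE");
		return 1;
	}
	printf("status %d, %llu bytes deduped\n", arg->info[0].status,
	       (unsigned long long)arg->info[0].bytes_deduped);
	return 0;
}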
Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/read_write.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/read_write.c b/fs/read_write.c index bfcb4ced5664..4dae0399c75a 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -2094,17 +2094,18 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) off = same->src_offset; len = same->src_length; - ret = -EISDIR; if (S_ISDIR(src->i_mode)) - goto out; + return -EISDIR; - ret = -EINVAL; if (!S_ISREG(src->i_mode)) - goto out; + return -EINVAL; + + if (!file->f_op->remap_file_range) + return -EOPNOTSUPP; ret = remap_verify_area(file, off, len, false); if (ret < 0) - goto out; + return ret; ret = 0; if (off + len > i_size_read(src)) @@ -2147,10 +2148,8 @@ next_fdput: fdput(dst_fd); next_loop: if (fatal_signal_pending(current)) - goto out; + break; } - -out: return ret; } EXPORT_SYMBOL(vfs_dedupe_file_range); -- cgit v1.2.3-59-g8ed1b From 8c110d43c6bca4b24dd13272a9d4e0ba6f2ec957 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 21 Nov 2018 08:06:37 -0800 Subject: iomap: readpages doesn't zero page tail beyond EOF When we read the EOF page of the file via readpages, we need to zero the region beyond EOF that we either do not read or should not contain data so that mmap does not expose stale data to user applications. However, iomap_adjust_read_range() fails to detect EOF correctly, and so fsx on 1k block size filesystems fails very quickly with mapreads exposing data beyond EOF. There are two problems here. Firstly, when calculating the end block of the EOF byte, we have to round the size by one to avoid a block aligned EOF from reporting a block too large. i.e. a size of 1024 bytes is 1 block, which in index terms is block 0. Therefore we have to calculate the end block from (isize - 1), not isize. The second bug is determining if the current page spans EOF, and so whether we need split it into two half, one for the IO, and the other for zeroing. Unfortunately, the code that checks whether we should split the block doesn't actually check if we span EOF, it just checks if the read spans the /offset in the page/ that EOF sits on. So it splits every read into two if EOF is not page aligned, regardless of whether we are reading the EOF block or not. Hence we need to restrict the "does the read span EOF" check to just the page that spans EOF, not every page we read. 
This patch results in correct EOF detection through readpages: xfs_vm_readpages: dev 259:0 ino 0x43 nr_pages 24 xfs_iomap_found: dev 259:0 ino 0x43 size 0x66c00 offset 0x4f000 count 98304 type hole startoff 0x13c startblock 1368 blockcount 0x4 iomap_readpage_actor: orig pos 323584 pos 323584, length 4096, poff 0 plen 4096, isize 420864 xfs_iomap_found: dev 259:0 ino 0x43 size 0x66c00 offset 0x50000 count 94208 type hole startoff 0x140 startblock 1497 blockcount 0x5c iomap_readpage_actor: orig pos 327680 pos 327680, length 94208, poff 0 plen 4096, isize 420864 iomap_readpage_actor: orig pos 331776 pos 331776, length 90112, poff 0 plen 4096, isize 420864 iomap_readpage_actor: orig pos 335872 pos 335872, length 86016, poff 0 plen 4096, isize 420864 iomap_readpage_actor: orig pos 339968 pos 339968, length 81920, poff 0 plen 4096, isize 420864 iomap_readpage_actor: orig pos 344064 pos 344064, length 77824, poff 0 plen 4096, isize 420864 iomap_readpage_actor: orig pos 348160 pos 348160, length 73728, poff 0 plen 4096, isize 420864 iomap_readpage_actor: orig pos 352256 pos 352256, length 69632, poff 0 plen 4096, isize 420864 iomap_readpage_actor: orig pos 356352 pos 356352, length 65536, poff 0 plen 4096, isize 420864 iomap_readpage_actor: orig pos 360448 pos 360448, length 61440, poff 0 plen 4096, isize 420864 iomap_readpage_actor: orig pos 364544 pos 364544, length 57344, poff 0 plen 4096, isize 420864 iomap_readpage_actor: orig pos 368640 pos 368640, length 53248, poff 0 plen 4096, isize 420864 iomap_readpage_actor: orig pos 372736 pos 372736, length 49152, poff 0 plen 4096, isize 420864 iomap_readpage_actor: orig pos 376832 pos 376832, length 45056, poff 0 plen 4096, isize 420864 iomap_readpage_actor: orig pos 380928 pos 380928, length 40960, poff 0 plen 4096, isize 420864 iomap_readpage_actor: orig pos 385024 pos 385024, length 36864, poff 0 plen 4096, isize 420864 iomap_readpage_actor: orig pos 389120 pos 389120, length 32768, poff 0 plen 4096, isize 420864 iomap_readpage_actor: orig pos 393216 pos 393216, length 28672, poff 0 plen 4096, isize 420864 iomap_readpage_actor: orig pos 397312 pos 397312, length 24576, poff 0 plen 4096, isize 420864 iomap_readpage_actor: orig pos 401408 pos 401408, length 20480, poff 0 plen 4096, isize 420864 iomap_readpage_actor: orig pos 405504 pos 405504, length 16384, poff 0 plen 4096, isize 420864 iomap_readpage_actor: orig pos 409600 pos 409600, length 12288, poff 0 plen 4096, isize 420864 iomap_readpage_actor: orig pos 413696 pos 413696, length 8192, poff 0 plen 4096, isize 420864 iomap_readpage_actor: orig pos 417792 pos 417792, length 4096, poff 0 plen 3072, isize 420864 iomap_readpage_actor: orig pos 420864 pos 420864, length 1024, poff 3072 plen 1024, isize 420864 As you can see, it now does full page reads until the last one which is split correctly at the block aligned EOF, reading 3072 bytes and zeroing the last 1024 bytes. The original version of the patch got this right, but it got another case wrong. The EOF detection crossing really needs to the the original length as plen, while it starts at the end of the block, will be shortened as up-to-date blocks are found on the page. This means "orig_pos + plen" no longer points to the end of the page, and so will not correctly detect EOF crossing. 
Hence we have to use the length passed in to detect this partial page case: xfs_filemap_fault: dev 259:1 ino 0x43 write_fault 0 xfs_vm_readpage: dev 259:1 ino 0x43 nr_pages 1 xfs_iomap_found: dev 259:1 ino 0x43 size 0x2cc00 offset 0x2c000 count 4096 type hole startoff 0xb0 startblock 282 blockcount 0x4 iomap_readpage_actor: orig pos 180224 pos 181248, length 4096, poff 1024 plen 2048, isize 183296 xfs_iomap_found: dev 259:1 ino 0x43 size 0x2cc00 offset 0x2cc00 count 1024 type hole startoff 0xb3 startblock 285 blockcount 0x1 iomap_readpage_actor: orig pos 183296 pos 183296, length 1024, poff 3072 plen 1024, isize 183296 Heere we see a trace where the first block on the EOF page is up to date, hence poff = 1024 bytes. The offset into the page of EOF is 3072, so the range we want to read is 1024 - 3071, and the range we want to zero is 3072 - 4095. You can see this is split correctly now. This fixes the stale data beyond EOF problem that fsx quickly uncovers on 1k block size filesystems. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/iomap.c b/fs/iomap.c index d51e7a2ae641..3ffb776fbebe 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -142,13 +142,14 @@ static void iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp) { + loff_t orig_pos = *pos; + loff_t isize = i_size_read(inode); unsigned block_bits = inode->i_blkbits; unsigned block_size = (1 << block_bits); unsigned poff = offset_in_page(*pos); unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length); unsigned first = poff >> block_bits; unsigned last = (poff + plen - 1) >> block_bits; - unsigned end = offset_in_page(i_size_read(inode)) >> block_bits; /* * If the block size is smaller than the page size we need to check the @@ -183,8 +184,12 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, * handle both halves separately so that we properly zero data in the * page cache for blocks that are entirely outside of i_size. 
*/ - if (first <= end && last > end) - plen -= (last - end) * block_size; + if (orig_pos <= isize && orig_pos + length > isize) { + unsigned end = offset_in_page(isize - 1) >> block_bits; + + if (first <= end && last > end) + plen -= (last - end) * block_size; + } *offp = poff; *lenp = plen; -- cgit v1.2.3-59-g8ed1b From 552f0329c75b3e1d7f9bb8c9e421d37403f192cd Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 19 Nov 2018 16:20:34 +0000 Subject: Btrfs: fix race between enabling quotas and subvolume creation We have a race between enabling quotas and subvolume creation that causes subvolume creation to fail with -EINVAL, and the following diagram shows how it happens: CPU 0 CPU 1 btrfs_ioctl() btrfs_ioctl_quota_ctl() btrfs_quota_enable() mutex_lock(fs_info->qgroup_ioctl_lock) btrfs_ioctl() create_subvol() btrfs_qgroup_inherit() -> save fs_info->quota_root into quota_root -> stores a NULL value -> tries to lock the mutex qgroup_ioctl_lock -> blocks waiting for the task at CPU0 -> sets BTRFS_FS_QUOTA_ENABLED in fs_info -> sets quota_root in fs_info->quota_root (non-NULL value) mutex_unlock(fs_info->qgroup_ioctl_lock) -> checks quota enabled flag is set -> returns -EINVAL because fs_info->quota_root was NULL before it acquired the mutex qgroup_ioctl_lock -> ioctl returns -EINVAL Returning -EINVAL to user space will be confusing if all the arguments passed to the subvolume creation ioctl were valid. Fix it by grabbing the value from fs_info->quota_root after acquiring the mutex. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 45868fd76209..f70825af6438 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2659,7 +2659,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, int i; u64 *i_qgroups; struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *quota_root = fs_info->quota_root; + struct btrfs_root *quota_root; struct btrfs_qgroup *srcgroup; struct btrfs_qgroup *dstgroup; u32 level_size = 0; @@ -2669,6 +2669,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) goto out; + quota_root = fs_info->quota_root; if (!quota_root) { ret = -EINVAL; goto out; -- cgit v1.2.3-59-g8ed1b From 99f2c55591fb5c1b536263970d98c2ebc2089906 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Wed, 21 Nov 2018 11:24:22 -0500 Subject: NFSv4.2 copy do not allocate memory under the lock Bruce pointed out that we shouldn't allocate memory while holding a lock in nfs4_callback_offload() and handle_async_copy(), which deal with the case of a racing CB_OFFLOAD and reply to COPY.
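(The general shape of the change, shown here as a simplified userspace sketch rather than the actual NFS code: allocate before taking the lock, and throw the allocation away if a matching entry is already queued. Types and names below are illustrative assumptions.)

/* Illustrative pattern only, not fs/nfs code: preallocate outside the lock. */
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct copy_state {
	char stateid[16];
	struct copy_state *next;
};

static pthread_mutex_t cl_lock = PTHREAD_MUTEX_INITIALIZER;
static struct copy_state *pending;

static int record_copy(const char stateid[16])
{
	struct copy_state *new = calloc(1, sizeof(*new));	/* no lock held yet */
	struct copy_state *cur, *found = NULL;

	if (!new)
		return -1;

	pthread_mutex_lock(&cl_lock);
	for (cur = pending; cur; cur = cur->next) {
		if (!memcmp(cur->stateid, stateid, 16)) {
			found = cur;		/* a racing entry is already queued */
			break;
		}
	}
	if (!found) {
		memcpy(new->stateid, stateid, 16);
		new->next = pending;
		pending = new;			/* publish the preallocation */
		new = NULL;			/* ownership moved to the list */
	}
	pthread_mutex_unlock(&cl_lock);

	free(new);				/* drop the unused preallocation, if any */
	return 0;
}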
Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 22 +++++++++++----------- fs/nfs/nfs42proc.c | 19 ++++++++++--------- 2 files changed, 21 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 7b861bbc0b43..315967354954 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -686,20 +686,24 @@ __be32 nfs4_callback_offload(void *data, void *dummy, { struct cb_offloadargs *args = data; struct nfs_server *server; - struct nfs4_copy_state *copy; + struct nfs4_copy_state *copy, *tmp_copy; bool found = false; + copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS); + if (!copy) + return htonl(NFS4ERR_SERVERFAULT); + spin_lock(&cps->clp->cl_lock); rcu_read_lock(); list_for_each_entry_rcu(server, &cps->clp->cl_superblocks, client_link) { - list_for_each_entry(copy, &server->ss_copies, copies) { + list_for_each_entry(tmp_copy, &server->ss_copies, copies) { if (memcmp(args->coa_stateid.other, - copy->stateid.other, + tmp_copy->stateid.other, sizeof(args->coa_stateid.other))) continue; - nfs4_copy_cb_args(copy, args); - complete(©->completion); + nfs4_copy_cb_args(tmp_copy, args); + complete(&tmp_copy->completion); found = true; goto out; } @@ -707,15 +711,11 @@ __be32 nfs4_callback_offload(void *data, void *dummy, out: rcu_read_unlock(); if (!found) { - copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS); - if (!copy) { - spin_unlock(&cps->clp->cl_lock); - return htonl(NFS4ERR_SERVERFAULT); - } memcpy(©->stateid, &args->coa_stateid, NFS4_STATEID_SIZE); nfs4_copy_cb_args(copy, args); list_add_tail(©->copies, &cps->clp->pending_cb_stateids); - } + } else + kfree(copy); spin_unlock(&cps->clp->cl_lock); return 0; diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index ac5b784a1de0..fed06fd9998d 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -137,31 +137,32 @@ static int handle_async_copy(struct nfs42_copy_res *res, struct file *dst, nfs4_stateid *src_stateid) { - struct nfs4_copy_state *copy; + struct nfs4_copy_state *copy, *tmp_copy; int status = NFS4_OK; bool found_pending = false; struct nfs_open_context *ctx = nfs_file_open_context(dst); + copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS); + if (!copy) + return -ENOMEM; + spin_lock(&server->nfs_client->cl_lock); - list_for_each_entry(copy, &server->nfs_client->pending_cb_stateids, + list_for_each_entry(tmp_copy, &server->nfs_client->pending_cb_stateids, copies) { - if (memcmp(&res->write_res.stateid, ©->stateid, + if (memcmp(&res->write_res.stateid, &tmp_copy->stateid, NFS4_STATEID_SIZE)) continue; found_pending = true; - list_del(©->copies); + list_del(&tmp_copy->copies); break; } if (found_pending) { spin_unlock(&server->nfs_client->cl_lock); + kfree(copy); + copy = tmp_copy; goto out; } - copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS); - if (!copy) { - spin_unlock(&server->nfs_client->cl_lock); - return -ENOMEM; - } memcpy(©->stateid, &res->write_res.stateid, NFS4_STATEID_SIZE); init_completion(©->completion); copy->parent_state = ctx->state; -- cgit v1.2.3-59-g8ed1b From bb21ce0ad227b69ec0f83279297ee44232105d96 Mon Sep 17 00:00:00 2001 From: Tigran Mkrtchyan Date: Wed, 21 Nov 2018 12:25:41 +0100 Subject: flexfiles: use per-mirror specified stateid for IO rfc8435 says: For tight coupling, ffds_stateid provides the stateid to be used by the client to access the file. However current implementation replaces per-mirror provided stateid with by open or lock stateid. 
Ensure that per-mirror stateid is used by ff_layout_write_prepare_v4 and nfs4_ff_layout_prepare_ds. Signed-off-by: Tigran Mkrtchyan Signed-off-by: Rick Macklem Signed-off-by: Trond Myklebust --- fs/nfs/flexfilelayout/flexfilelayout.c | 21 +++++++++------------ fs/nfs/flexfilelayout/flexfilelayout.h | 4 ++++ fs/nfs/flexfilelayout/flexfilelayoutdev.c | 19 +++++++++++++++++++ 3 files changed, 32 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 86bcba40ca61..74b36ed883ca 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1361,12 +1361,7 @@ static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data) task)) return; - if (ff_layout_read_prepare_common(task, hdr)) - return; - - if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, - hdr->args.lock_context, FMODE_READ) == -EIO) - rpc_exit(task, -EIO); /* lost lock, terminate I/O */ + ff_layout_read_prepare_common(task, hdr); } static void ff_layout_read_call_done(struct rpc_task *task, void *data) @@ -1542,12 +1537,7 @@ static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data) task)) return; - if (ff_layout_write_prepare_common(task, hdr)) - return; - - if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, - hdr->args.lock_context, FMODE_WRITE) == -EIO) - rpc_exit(task, -EIO); /* lost lock, terminate I/O */ + ff_layout_write_prepare_common(task, hdr); } static void ff_layout_write_call_done(struct rpc_task *task, void *data) @@ -1742,6 +1732,10 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) fh = nfs4_ff_layout_select_ds_fh(lseg, idx); if (fh) hdr->args.fh = fh; + + if (!nfs4_ff_layout_select_ds_stateid(lseg, idx, &hdr->args.stateid)) + goto out_failed; + /* * Note that if we ever decide to split across DSes, * then we may need to handle dense-like offsets. @@ -1804,6 +1798,9 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) if (fh) hdr->args.fh = fh; + if (!nfs4_ff_layout_select_ds_stateid(lseg, idx, &hdr->args.stateid)) + goto out_failed; + /* * Note that if we ever decide to split across DSes, * then we may need to handle dense-like offsets. 
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 411798346e48..de50a342d5a5 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -215,6 +215,10 @@ unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo, unsigned int maxnum); struct nfs_fh * nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx); +int +nfs4_ff_layout_select_ds_stateid(struct pnfs_layout_segment *lseg, + u32 mirror_idx, + nfs4_stateid *stateid); struct nfs4_pnfs_ds * nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 74d8d5352438..d23347389626 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -370,6 +370,25 @@ out: return fh; } +int +nfs4_ff_layout_select_ds_stateid(struct pnfs_layout_segment *lseg, + u32 mirror_idx, + nfs4_stateid *stateid) +{ + struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx); + + if (!ff_layout_mirror_valid(lseg, mirror, false)) { + pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n", + __func__, mirror_idx); + goto out; + } + + nfs4_stateid_copy(stateid, &mirror->stateid); + return 1; +out: + return 0; +} + /** * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call * @lseg: the layout segment we're operating on -- cgit v1.2.3-59-g8ed1b From 42a657f57628402c73237547f0134e083e2f6764 Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Fri, 23 Nov 2018 18:10:15 +0800 Subject: btrfs: relocation: set trans to be NULL after ending transaction The function relocate_block_group calls btrfs_end_transaction to release trans when update_backref_cache returns 1, and then continues the loop body. If btrfs_block_rsv_refill fails this time, it will jump out the loop and the freed trans will be accessed. This may result in a use-after-free bug. The patch assigns NULL to trans after trans is released so that it will not be accessed. Fixes: 0647bf564f1 ("Btrfs: improve forever loop when doing balance relocation") CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Qu Wenruo Signed-off-by: Pan Bian Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 924116f654a1..a3f75b8926d4 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3959,6 +3959,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) restart: if (update_backref_cache(trans, &rc->backref_cache)) { btrfs_end_transaction(trans); + trans = NULL; continue; } -- cgit v1.2.3-59-g8ed1b From 2084ac6c505a58f7efdec13eba633c6aaa085ca5 Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Fri, 23 Nov 2018 15:56:33 +0800 Subject: exportfs: do not read dentry after free The function dentry_connected calls dput(dentry) to drop the previously acquired reference to dentry. In this case, dentry can be released. After that, IS_ROOT(dentry) checks the condition (dentry == dentry->d_parent), which may result in a use-after-free bug. This patch directly compares dentry with its parent obtained before dropping the reference. 
Fixes: a056cc8934c ("exportfs: stop retrying once we race with rename/remove") Signed-off-by: Pan Bian Signed-off-by: Al Viro --- fs/exportfs/expfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index c8a3dfda1764..c69927bed4ef 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -77,7 +77,7 @@ static bool dentry_connected(struct dentry *dentry) struct dentry *parent = dget_parent(dentry); dput(dentry); - if (IS_ROOT(dentry)) { + if (dentry == parent) { dput(parent); return false; } -- cgit v1.2.3-59-g8ed1b From e5f5b717983bccfa033282e9886811635602510e Mon Sep 17 00:00:00 2001 From: xingaopeng Date: Sat, 24 Nov 2018 19:21:59 +0800 Subject: ext2: initialize opts.s_mount_opt as zero before using it We need to initialize opts.s_mount_opt to zero before using it, otherwise we may get some unexpected mount options. Fixes: 088519572ca8 ("ext2: Parse mount options into a dedicated structure") CC: stable@vger.kernel.org Signed-off-by: xingaopeng Signed-off-by: Jan Kara --- fs/ext2/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index cb91baa4275d..eb11502e3fcd 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -892,6 +892,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_magic != EXT2_SUPER_MAGIC) goto cantfind_ext2; + opts.s_mount_opt = 0; /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); if (def_mount_opts & EXT2_DEFM_DEBUG) -- cgit v1.2.3-59-g8ed1b From ecebf55d27a11538ea84aee0be643dd953f830d5 Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Sun, 25 Nov 2018 08:58:02 +0800 Subject: ext2: fix potential use after free The function ext2_xattr_set calls brelse(bh) to drop the reference count of bh. After that, bh may be freed. However, following brelse(bh), it reads bh->b_data via macro HDR(bh). This may result in a use-after-free bug. This patch moves brelse(bh) to after the field is read. CC: stable@vger.kernel.org Signed-off-by: Pan Bian Signed-off-by: Jan Kara --- fs/ext2/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 62d9a659a8ff..dd8f10db82e9 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -612,9 +612,9 @@ skip_replace: } cleanup: - brelse(bh); if (!(bh && header == HDR(bh))) kfree(header); + brelse(bh); up_write(&EXT2_I(inode)->xattr_sem); return error; -- cgit v1.2.3-59-g8ed1b From e6bc06faf64a83384cc0abc537df954c9d3ff942 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 27 Nov 2018 16:34:55 +0000 Subject: cachefiles: Fix an assertion failure when trying to update a failed object If cachefiles gets an error other than ENOENT when trying to look up an object in the cache (in this case, EACCES), the object state machine will eventually transition to the DROP_OBJECT state. This state invokes fscache_drop_object() which tries to sync the auxiliary data with the cache (this is done lazily since commit 402cb8dda949d) on an incomplete cache object struct. The problem comes when cachefiles_update_object_xattr() is called to rewrite the xattr holding the data. There's an assertion there that the cache object points to a dentry as we're going to update its xattr. The assertion trips, however, as the dentry didn't get set. Fix the problem by skipping the update in cachefiles if the object doesn't refer to a dentry.
A better way to do it could be to skip the update from the DROP_OBJECT state handler in fscache, but that might deny the cache the opportunity to update intermediate state. If this error occurs, the kernel log includes lines that look like the following: CacheFiles: Lookup failed error -13 CacheFiles: CacheFiles: Assertion failed ------------[ cut here ]------------ kernel BUG at fs/cachefiles/xattr.c:138! ... Workqueue: fscache_object fscache_object_work_func [fscache] RIP: 0010:cachefiles_update_object_xattr.cold.4+0x18/0x1a [cachefiles] ... Call Trace: cachefiles_update_object+0xdd/0x1c0 [cachefiles] fscache_update_aux_data+0x23/0x30 [fscache] fscache_drop_object+0x18e/0x1c0 [fscache] fscache_object_work_func+0x74/0x2b0 [fscache] process_one_work+0x18d/0x340 worker_thread+0x2e/0x390 ? pwq_unbound_release_workfn+0xd0/0xd0 kthread+0x112/0x130 ? kthread_bind+0x30/0x30 ret_from_fork+0x35/0x40 Note that there are actually two issues here: (1) EACCES happened on a cache object and (2) an oops occurred. I think that the second is a consequence of the first (it certainly looks like it ought to be). This patch only deals with the second. Fixes: 402cb8dda949 ("fscache: Attach the index key and aux data to the cookie") Reported-by: Zhibin Li Signed-off-by: David Howells --- fs/cachefiles/xattr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 0a29a00aed2e..511e6c68156a 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -135,7 +135,8 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object, struct dentry *dentry = object->dentry; int ret; - ASSERT(dentry); + if (!dentry) + return -ESTALE; _enter("%p,#%d", object, auxdata->len); -- cgit v1.2.3-59-g8ed1b From 9a24ce5b66f9c8190d63b15f4473600db4935f1f Mon Sep 17 00:00:00 2001 From: Kiran Kumar Modukuri Date: Mon, 24 Sep 2018 12:02:39 +1000 Subject: cachefiles: Fix page leak in cachefiles_read_backing_file while vmscan is active [Description] In a heavily loaded system where the system pagecache is nearing memory limits and fscache is enabled, pages can be leaked by fscache while trying read pages from cachefiles backend. This can happen because two applications can be reading same page from a single mount, two threads can be trying to read the backing page at same time. This results in one of the threads finding that a page for the backing file or netfs file is already in the radix tree. During the error handling cachefiles does not clean up the reference on backing page, leading to page leak. [Fix] The fix is straightforward, to decrement the reference when error is encountered. [dhowells: Note that I've removed the clearance and put of newpage as they aren't attested in the commit message and don't appear to actually achieve anything since a new page is only allocated is newpage!=NULL and any residual new page is cleared before returning.] [Testing] I have tested the fix using following method for 12+ hrs. 1) mkdir -p /mnt/nfs ; mount -o vers=3,fsc :/export /mnt/nfs 2) create 10000 files of 2.8MB in a NFS mount. 3) start a thread to simulate heavy VM presssure (while true ; do echo 3 > /proc/sys/vm/drop_caches ; sleep 1 ; done)& 4) start multiple parallel reader for data set at same time find /mnt/nfs -type f | xargs -P 80 cat > /dev/null & find /mnt/nfs -type f | xargs -P 80 cat > /dev/null & find /mnt/nfs -type f | xargs -P 80 cat > /dev/null & .. .. 
find /mnt/nfs -type f | xargs -P 80 cat > /dev/null & find /mnt/nfs -type f | xargs -P 80 cat > /dev/null & 5) finally check using cat /proc/fs/fscache/stats | grep -i pages ; free -h , cat /proc/meminfo and page-types -r -b lru to ensure all pages are freed. Reviewed-by: Daniel Axtens Signed-off-by: Shantanu Goel Signed-off-by: Kiran Kumar Modukuri [dja: forward ported to current upstream] Signed-off-by: Daniel Axtens Signed-off-by: David Howells --- fs/cachefiles/rdwr.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 40f7595aad10..db233588a69a 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -535,7 +535,10 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, netpage->index, cachefiles_gfp); if (ret < 0) { if (ret == -EEXIST) { + put_page(backpage); + backpage = NULL; put_page(netpage); + netpage = NULL; fscache_retrieval_complete(op, 1); continue; } @@ -608,7 +611,10 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, netpage->index, cachefiles_gfp); if (ret < 0) { if (ret == -EEXIST) { + put_page(backpage); + backpage = NULL; put_page(netpage); + netpage = NULL; fscache_retrieval_complete(op, 1); continue; } -- cgit v1.2.3-59-g8ed1b From 89d328f637b9904b6d4c9af73c8a608b8dd4d6f8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 1 Nov 2018 16:17:22 -0700 Subject: pstore/ram: Correctly calculate usable PRZ bytes The actual number of bytes stored in a PRZ is smaller than the bytes requested by platform data, since there is a header on each PRZ. Additionally, if ECC is enabled, there are trailing bytes used as well. Normally this mismatch doesn't matter since PRZs are circular buffers and the leading "overflow" bytes are just thrown away. However, in the case of a compressed record, this rather badly corrupts the results. This corruption was visible with "ramoops.mem_size=204800 ramoops.ecc=1". Any stored crashes would not be uncompressable (producing a pstorefs "dmesg-*.enc.z" file), and triggering errors at boot: [ 2.790759] pstore: crypto_comp_decompress failed, ret = -22! Backporting this depends on commit 70ad35db3321 ("pstore: Convert console write to use ->write_buf") Reported-by: Joel Fernandes Fixes: b0aad7a99c1d ("pstore: Add compression support to pstore") Signed-off-by: Kees Cook Reviewed-by: Joel Fernandes (Google) --- fs/pstore/ram.c | 15 ++++++--------- include/linux/pstore.h | 5 ++++- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 712960e117fe..8646fe6e916f 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -816,17 +816,14 @@ static int ramoops_probe(struct platform_device *pdev) cxt->pstore.data = cxt; /* - * Console can handle any buffer size, so prefer LOG_LINE_MAX. If we - * have to handle dumps, we must have at least record_size buffer. And - * for ftrace, bufsize is irrelevant (if bufsize is 0, buf will be - * ZERO_SIZE_PTR). + * Since bufsize is only used for dmesg crash dumps, it + * must match the size of the dprz record (after PRZ header + * and ECC bytes have been accounted for). 
*/ - if (cxt->console_size) - cxt->pstore.bufsize = 1024; /* LOG_LINE_MAX */ - cxt->pstore.bufsize = max(cxt->record_size, cxt->pstore.bufsize); - cxt->pstore.buf = kmalloc(cxt->pstore.bufsize, GFP_KERNEL); + cxt->pstore.bufsize = cxt->dprzs[0]->buffer_size; + cxt->pstore.buf = kzalloc(cxt->pstore.bufsize, GFP_KERNEL); if (!cxt->pstore.buf) { - pr_err("cannot allocate pstore buffer\n"); + pr_err("cannot allocate pstore crash dump buffer\n"); err = -ENOMEM; goto fail_clear; } diff --git a/include/linux/pstore.h b/include/linux/pstore.h index a15bc4d48752..30fcec375a3a 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -90,7 +90,10 @@ struct pstore_record { * * @buf_lock: spinlock to serialize access to @buf * @buf: preallocated crash dump buffer - * @bufsize: size of @buf available for crash dump writes + * @bufsize: size of @buf available for crash dump bytes (must match + * smallest number of bytes available for writing to a + * backend entry, since compressed bytes don't take kindly + * to being truncated) * * @read_mutex: serializes @open, @read, @close, and @erase callbacks * @flags: bitfield of frontends the backend can accept writes for -- cgit v1.2.3-59-g8ed1b From ae3b7361dc0ee9a425bf7d77ce211f533500b39b Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 13 Nov 2018 23:20:21 +0000 Subject: afs: Fix validation/callback interaction When afs_validate() is called to validate a vnode (inode), there are two unhandled cases in the fastpath at the top of the function: (1) If the vnode is promised (AFS_VNODE_CB_PROMISED is set), the break counters match and the data has expired, then there's an implicit case in which the vnode needs revalidating. This has no consequences since the default "valid = false" set at the top of the function happens to do the right thing. (2) If the vnode is not promised and it hasn't been deleted (AFS_VNODE_DELETED is not set) then there's a default case we're not handling in which the vnode is invalid. If the vnode is invalid, we need to bring cb_s_break and cb_v_break up to date before we refetch the status. As a consequence, once the server loses track of the client (ie. sufficient time has passed since we last sent it an operation), it will send us a CB.InitCallBackState* operation when we next try to talk to it. This calls afs_init_callback_state() which increments afs_server::cb_s_break, but this then doesn't propagate to the afs_vnode record. The result being that every afs_validate() call thereafter sends a status fetch operation to the server. Clarify and fix this by: (A) Setting valid in all the branches rather than initialising it at the top so that the compiler catches where we've missed. (B) Restructuring the logic in the 'promised' branch so that we set valid to false if the callback is due to expire (or has expired) and so that the final case is that the vnode is still valid. (C) Adding an else-statement that ups cb_s_break and cb_v_break if the promised and deleted cases don't match. 
Fixes: c435ee34551e ("afs: Overhaul the callback handling") Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/afs/inode.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 4c6d8e1112c2..6b17d3620414 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -382,7 +382,7 @@ void afs_zap_data(struct afs_vnode *vnode) int afs_validate(struct afs_vnode *vnode, struct key *key) { time64_t now = ktime_get_real_seconds(); - bool valid = false; + bool valid; int ret; _enter("{v={%llx:%llu} fl=%lx},%x", @@ -402,15 +402,21 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) vnode->cb_v_break = vnode->volume->cb_v_break; valid = false; } else if (vnode->status.type == AFS_FTYPE_DIR && - test_bit(AFS_VNODE_DIR_VALID, &vnode->flags) && - vnode->cb_expires_at - 10 > now) { - valid = true; - } else if (!test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) && - vnode->cb_expires_at - 10 > now) { + (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags) || + vnode->cb_expires_at - 10 <= now)) { + valid = false; + } else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) || + vnode->cb_expires_at - 10 <= now) { + valid = false; + } else { valid = true; } } else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { valid = true; + } else { + vnode->cb_s_break = vnode->cb_interest->server->cb_s_break; + vnode->cb_v_break = vnode->volume->cb_v_break; + valid = false; } read_sequnlock_excl(&vnode->cb_lock); -- cgit v1.2.3-59-g8ed1b From 4584ae96ae307613625e80cb9c7d9a981bed47a7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 13 Nov 2018 23:20:28 +0000 Subject: afs: Fix missing net error handling kAFS can be given certain network errors (EADDRNOTAVAIL, EHOSTDOWN and ERFKILL) that it doesn't handle in its server/address rotation algorithms. They cause the probing and rotation to abort immediately rather than rotating. Fix this by: (1) Abstracting out the error prioritisation from the VL and FS rotation algorithms into a common function and expand usage into the server probing code. When multiple errors are available, this code selects the one we'd prefer to return. (2) Add handling for EADDRNOTAVAIL, EHOSTDOWN and ERFKILL. Fixes: 0fafdc9f888b ("afs: Fix file locking") Fixes: 0338747d8454 ("afs: Probe multiple fileservers simultaneously") Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/afs/fs_probe.c | 39 ++++++++++++++++++++++++--------------- fs/afs/internal.h | 9 +++++++++ fs/afs/misc.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/afs/rotate.c | 53 +++++++++++++---------------------------------------- fs/afs/vl_probe.c | 45 +++++++++++++++++++++++++++------------------ fs/afs/vl_rotate.c | 50 ++++++++++---------------------------------------- 6 files changed, 135 insertions(+), 113 deletions(-) (limited to 'fs') diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c index d049cb459742..fde6b4d4121e 100644 --- a/fs/afs/fs_probe.c +++ b/fs/afs/fs_probe.c @@ -61,8 +61,11 @@ void afs_fileserver_probe_result(struct afs_call *call) afs_io_error(call, afs_io_error_fs_probe_fail); goto out; case -ECONNRESET: /* Responded, but call expired. 
*/ + case -ERFKILL: + case -EADDRNOTAVAIL: case -ENETUNREACH: case -EHOSTUNREACH: + case -EHOSTDOWN: case -ECONNREFUSED: case -ETIMEDOUT: case -ETIME: @@ -132,12 +135,14 @@ out: static int afs_do_probe_fileserver(struct afs_net *net, struct afs_server *server, struct key *key, - unsigned int server_index) + unsigned int server_index, + struct afs_error *_e) { struct afs_addr_cursor ac = { .index = 0, }; - int ret; + bool in_progress = false; + int err; _enter("%pU", &server->uuid); @@ -151,15 +156,17 @@ static int afs_do_probe_fileserver(struct afs_net *net, server->probe.rtt = UINT_MAX; for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) { - ret = afs_fs_get_capabilities(net, server, &ac, key, server_index, + err = afs_fs_get_capabilities(net, server, &ac, key, server_index, true); - if (ret != -EINPROGRESS) { - afs_fs_probe_done(server); - return ret; - } + if (err == -EINPROGRESS) + in_progress = true; + else + afs_prioritise_error(_e, err, ac.abort_code); } - return 0; + if (!in_progress) + afs_fs_probe_done(server); + return in_progress; } /* @@ -169,21 +176,23 @@ int afs_probe_fileservers(struct afs_net *net, struct key *key, struct afs_server_list *list) { struct afs_server *server; - int i, ret; + struct afs_error e; + bool in_progress = false; + int i; + e.error = 0; + e.responded = false; for (i = 0; i < list->nr_servers; i++) { server = list->servers[i].server; if (test_bit(AFS_SERVER_FL_PROBED, &server->flags)) continue; - if (!test_and_set_bit_lock(AFS_SERVER_FL_PROBING, &server->flags)) { - ret = afs_do_probe_fileserver(net, server, key, i); - if (ret) - return ret; - } + if (!test_and_set_bit_lock(AFS_SERVER_FL_PROBING, &server->flags) && + afs_do_probe_fileserver(net, server, key, i, &e)) + in_progress = true; } - return 0; + return in_progress ? 0 : e.error; } /* diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 5da3b09b7518..8871b9e8645f 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -695,6 +695,14 @@ struct afs_interface { unsigned mtu; /* MTU of interface */ }; +/* + * Error prioritisation and accumulation. + */ +struct afs_error { + short error; /* Accumulated error */ + bool responded; /* T if server responded */ +}; + /* * Cursor for iterating over a server's address list. */ @@ -1015,6 +1023,7 @@ static inline void __afs_stat(atomic_t *s) * misc.c */ extern int afs_abort_to_error(u32); +extern void afs_prioritise_error(struct afs_error *, int, u32); /* * mntpt.c diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 700a5fa7f4ec..bbb1fd51b019 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -118,3 +118,55 @@ int afs_abort_to_error(u32 abort_code) default: return -EREMOTEIO; } } + +/* + * Select the error to report from a set of errors. + */ +void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code) +{ + switch (error) { + case 0: + return; + default: + if (e->error == -ETIMEDOUT || + e->error == -ETIME) + return; + case -ETIMEDOUT: + case -ETIME: + if (e->error == -ENOMEM || + e->error == -ENONET) + return; + case -ENOMEM: + case -ENONET: + if (e->error == -ERFKILL) + return; + case -ERFKILL: + if (e->error == -EADDRNOTAVAIL) + return; + case -EADDRNOTAVAIL: + if (e->error == -ENETUNREACH) + return; + case -ENETUNREACH: + if (e->error == -EHOSTUNREACH) + return; + case -EHOSTUNREACH: + if (e->error == -EHOSTDOWN) + return; + case -EHOSTDOWN: + if (e->error == -ECONNREFUSED) + return; + case -ECONNREFUSED: + if (e->error == -ECONNRESET) + return; + case -ECONNRESET: /* Responded, but call expired. 
*/ + if (e->responded) + return; + e->error = error; + return; + + case -ECONNABORTED: + e->responded = true; + e->error = afs_abort_to_error(abort_code); + return; + } +} diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index 00504254c1c2..c3ae324781f8 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -136,7 +136,8 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) struct afs_addr_list *alist; struct afs_server *server; struct afs_vnode *vnode = fc->vnode; - u32 rtt, abort_code; + struct afs_error e; + u32 rtt; int error = fc->ac.error, i; _enter("%lx[%d],%lx[%d],%d,%d", @@ -306,8 +307,11 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) if (fc->error != -EDESTADDRREQ) goto iterate_address; /* Fall through */ + case -ERFKILL: + case -EADDRNOTAVAIL: case -ENETUNREACH: case -EHOSTUNREACH: + case -EHOSTDOWN: case -ECONNREFUSED: _debug("no conn"); fc->error = error; @@ -446,50 +450,15 @@ no_more_servers: if (fc->flags & AFS_FS_CURSOR_VBUSY) goto restart_from_beginning; - abort_code = 0; - error = -EDESTADDRREQ; + e.error = -EDESTADDRREQ; + e.responded = false; for (i = 0; i < fc->server_list->nr_servers; i++) { struct afs_server *s = fc->server_list->servers[i].server; - int probe_error = READ_ONCE(s->probe.error); - switch (probe_error) { - case 0: - continue; - default: - if (error == -ETIMEDOUT || - error == -ETIME) - continue; - case -ETIMEDOUT: - case -ETIME: - if (error == -ENOMEM || - error == -ENONET) - continue; - case -ENOMEM: - case -ENONET: - if (error == -ENETUNREACH) - continue; - case -ENETUNREACH: - if (error == -EHOSTUNREACH) - continue; - case -EHOSTUNREACH: - if (error == -ECONNREFUSED) - continue; - case -ECONNREFUSED: - if (error == -ECONNRESET) - continue; - case -ECONNRESET: /* Responded, but call expired. */ - if (error == -ECONNABORTED) - continue; - case -ECONNABORTED: - abort_code = s->probe.abort_code; - error = probe_error; - continue; - } + afs_prioritise_error(&e, READ_ONCE(s->probe.error), + s->probe.abort_code); } - if (error == -ECONNABORTED) - error = afs_abort_to_error(abort_code); - failed_set_error: fc->error = error; failed: @@ -553,8 +522,11 @@ bool afs_select_current_fileserver(struct afs_fs_cursor *fc) _leave(" = f [abort]"); return false; + case -ERFKILL: + case -EADDRNOTAVAIL: case -ENETUNREACH: case -EHOSTUNREACH: + case -EHOSTDOWN: case -ECONNREFUSED: case -ETIMEDOUT: case -ETIME: @@ -633,6 +605,7 @@ int afs_end_vnode_operation(struct afs_fs_cursor *fc) struct afs_net *net = afs_v2net(fc->vnode); if (fc->error == -EDESTADDRREQ || + fc->error == -EADDRNOTAVAIL || fc->error == -ENETUNREACH || fc->error == -EHOSTUNREACH) afs_dump_edestaddrreq(fc); diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c index c0f616bd70cb..f0b032976487 100644 --- a/fs/afs/vl_probe.c +++ b/fs/afs/vl_probe.c @@ -61,8 +61,11 @@ void afs_vlserver_probe_result(struct afs_call *call) afs_io_error(call, afs_io_error_vl_probe_fail); goto out; case -ECONNRESET: /* Responded, but call expired. */ + case -ERFKILL: + case -EADDRNOTAVAIL: case -ENETUNREACH: case -EHOSTUNREACH: + case -EHOSTDOWN: case -ECONNREFUSED: case -ETIMEDOUT: case -ETIME: @@ -129,15 +132,17 @@ out: * Probe all of a vlserver's addresses to find out the best route and to * query its capabilities. 
*/ -static int afs_do_probe_vlserver(struct afs_net *net, - struct afs_vlserver *server, - struct key *key, - unsigned int server_index) +static bool afs_do_probe_vlserver(struct afs_net *net, + struct afs_vlserver *server, + struct key *key, + unsigned int server_index, + struct afs_error *_e) { struct afs_addr_cursor ac = { .index = 0, }; - int ret; + bool in_progress = false; + int err; _enter("%s", server->name); @@ -151,15 +156,17 @@ static int afs_do_probe_vlserver(struct afs_net *net, server->probe.rtt = UINT_MAX; for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) { - ret = afs_vl_get_capabilities(net, &ac, key, server, + err = afs_vl_get_capabilities(net, &ac, key, server, server_index, true); - if (ret != -EINPROGRESS) { - afs_vl_probe_done(server); - return ret; - } + if (err == -EINPROGRESS) + in_progress = true; + else + afs_prioritise_error(_e, err, ac.abort_code); } - return 0; + if (!in_progress) + afs_vl_probe_done(server); + return in_progress; } /* @@ -169,21 +176,23 @@ int afs_send_vl_probes(struct afs_net *net, struct key *key, struct afs_vlserver_list *vllist) { struct afs_vlserver *server; - int i, ret; + struct afs_error e; + bool in_progress = false; + int i; + e.error = 0; + e.responded = false; for (i = 0; i < vllist->nr_servers; i++) { server = vllist->servers[i].server; if (test_bit(AFS_VLSERVER_FL_PROBED, &server->flags)) continue; - if (!test_and_set_bit_lock(AFS_VLSERVER_FL_PROBING, &server->flags)) { - ret = afs_do_probe_vlserver(net, server, key, i); - if (ret) - return ret; - } + if (!test_and_set_bit_lock(AFS_VLSERVER_FL_PROBING, &server->flags) && + afs_do_probe_vlserver(net, server, key, i, &e)) + in_progress = true; } - return 0; + return in_progress ? 0 : e.error; } /* diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c index b64a284b99d2..7adde83a0648 100644 --- a/fs/afs/vl_rotate.c +++ b/fs/afs/vl_rotate.c @@ -71,8 +71,9 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc) { struct afs_addr_list *alist; struct afs_vlserver *vlserver; + struct afs_error e; u32 rtt; - int error = vc->ac.error, abort_code, i; + int error = vc->ac.error, i; _enter("%lx[%d],%lx[%d],%d,%d", vc->untried, vc->index, @@ -119,8 +120,11 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc) goto failed; } + case -ERFKILL: + case -EADDRNOTAVAIL: case -ENETUNREACH: case -EHOSTUNREACH: + case -EHOSTDOWN: case -ECONNREFUSED: case -ETIMEDOUT: case -ETIME: @@ -235,50 +239,15 @@ no_more_servers: if (vc->flags & AFS_VL_CURSOR_RETRY) goto restart_from_beginning; - abort_code = 0; - error = -EDESTADDRREQ; + e.error = -EDESTADDRREQ; + e.responded = false; for (i = 0; i < vc->server_list->nr_servers; i++) { struct afs_vlserver *s = vc->server_list->servers[i].server; - int probe_error = READ_ONCE(s->probe.error); - switch (probe_error) { - case 0: - continue; - default: - if (error == -ETIMEDOUT || - error == -ETIME) - continue; - case -ETIMEDOUT: - case -ETIME: - if (error == -ENOMEM || - error == -ENONET) - continue; - case -ENOMEM: - case -ENONET: - if (error == -ENETUNREACH) - continue; - case -ENETUNREACH: - if (error == -EHOSTUNREACH) - continue; - case -EHOSTUNREACH: - if (error == -ECONNREFUSED) - continue; - case -ECONNREFUSED: - if (error == -ECONNRESET) - continue; - case -ECONNRESET: /* Responded, but call expired. 
*/ - if (error == -ECONNABORTED) - continue; - case -ECONNABORTED: - abort_code = s->probe.abort_code; - error = probe_error; - continue; - } + afs_prioritise_error(&e, READ_ONCE(s->probe.error), + s->probe.abort_code); } - if (error == -ECONNABORTED) - error = afs_abort_to_error(abort_code); - failed_set_error: vc->error = error; failed: @@ -341,6 +310,7 @@ int afs_end_vlserver_operation(struct afs_vl_cursor *vc) struct afs_net *net = vc->cell->net; if (vc->error == -EDESTADDRREQ || + vc->error == -EADDRNOTAVAIL || vc->error == -ENETUNREACH || vc->error == -EHOSTUNREACH) afs_vl_dump_edestaddrreq(vc); -- cgit v1.2.3-59-g8ed1b From 73116df7bb90435ccb2817f44113295240d15034 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 13 Nov 2018 23:20:35 +0000 Subject: afs: Use d_instantiate() rather than d_add() and don't d_drop() Use d_instantiate() rather than d_add() and don't d_drop() in afs_vnode_new_inode(). The dentry shouldn't be removed as it's not changing its name. Reported-by: Al Viro Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/afs/dir.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 43dea3b00c29..8a2562e3a316 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -1075,8 +1075,6 @@ static void afs_vnode_new_inode(struct afs_fs_cursor *fc, if (fc->ac.error < 0) return; - d_drop(new_dentry); - inode = afs_iget(fc->vnode->vfs_inode.i_sb, fc->key, newfid, newstatus, newcb, fc->cbi); if (IS_ERR(inode)) { @@ -1090,7 +1088,7 @@ static void afs_vnode_new_inode(struct afs_fs_cursor *fc, vnode = AFS_FS_I(inode); set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); afs_vnode_commit_status(fc, vnode, 0); - d_add(new_dentry, inode); + d_instantiate(new_dentry, inode); } /* -- cgit v1.2.3-59-g8ed1b From 41e817bca3acd3980efe5dd7d28af0e6f4ab9247 Mon Sep 17 00:00:00 2001 From: Maximilian Heyne Date: Fri, 30 Nov 2018 08:35:14 -0700 Subject: fs: fix lost error code in dio_complete commit e259221763a40403d5bb232209998e8c45804ab8 ("fs: simplify the generic_write_sync prototype") reworked callers of generic_write_sync(), and ended up dropping the error return for the directio path. Prior to that commit, in dio_complete(), an error would be bubbled up the stack, but after that commit, errors passed on to dio_complete were eaten up. This was reported on the list earlier, and a fix was proposed in https://lore.kernel.org/lkml/20160921141539.GA17898@infradead.org/, but never followed up with. We recently hit this bug in our testing where fencing io errors, which were previously erroring out with EIO, were being returned as success operations after this commit. The fix proposed on the list earlier was a little short -- it would have still called generic_write_sync() in case `ret` already contained an error. This fix ensures generic_write_sync() is only called when there's no pending error in the write. Additionally, transferred is replaced with ret to bring this code in line with other callers. 
Fixes: e259221763a4 ("fs: simplify the generic_write_sync prototype") Reported-by: Ravi Nankani Signed-off-by: Maximilian Heyne Reviewed-by: Christoph Hellwig CC: Torsten Mehlan CC: Uwe Dannowski CC: Amit Shah CC: David Woodhouse CC: stable@vger.kernel.org Signed-off-by: Jens Axboe --- fs/direct-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index 722d17c88edb..41a0e97252ae 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -325,8 +325,8 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) */ dio->iocb->ki_pos += transferred; - if (dio->op == REQ_OP_WRITE) - ret = generic_write_sync(dio->iocb, transferred); + if (ret > 0 && dio->op == REQ_OP_WRITE) + ret = generic_write_sync(dio->iocb, ret); dio->iocb->ki_complete(dio->iocb, ret, 0); } -- cgit v1.2.3-59-g8ed1b From c5a94f434c82529afda290df3235e4d85873c5b4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 26 Oct 2018 17:16:29 +1100 Subject: fscache: fix race between enablement and dropping of object It was observed that a process blocked indefinitely in __fscache_read_or_alloc_page(), waiting for FSCACHE_COOKIE_LOOKING_UP to be cleared via fscache_wait_for_deferred_lookup(). At this time, ->backing_objects was empty, which would normally prevent __fscache_read_or_alloc_page() from getting to the point of waiting. This implies that ->backing_objects was cleared *after* __fscache_read_or_alloc_page was entered. When an object is "killed" and then "dropped", FSCACHE_COOKIE_LOOKING_UP is cleared in fscache_lookup_failure(), then KILL_OBJECT and DROP_OBJECT are "called" and only in DROP_OBJECT is ->backing_objects cleared. This leaves a window where something else can set FSCACHE_COOKIE_LOOKING_UP and __fscache_read_or_alloc_page() can start waiting, before ->backing_objects is cleared. There is some uncertainty in this analysis, but it seems to fit the observations. Adding the wake in this patch will be handled correctly by __fscache_read_or_alloc_page(), as it checks if ->backing_objects is empty again, after waiting. The customer who reported the hang also reports that the hang cannot be reproduced with this fix.
The backtrace for the blocked process looked like: PID: 29360 TASK: ffff881ff2ac0f80 CPU: 3 COMMAND: "zsh" #0 [ffff881ff43efbf8] schedule at ffffffff815e56f1 #1 [ffff881ff43efc58] bit_wait at ffffffff815e64ed #2 [ffff881ff43efc68] __wait_on_bit at ffffffff815e61b8 #3 [ffff881ff43efca0] out_of_line_wait_on_bit at ffffffff815e625e #4 [ffff881ff43efd08] fscache_wait_for_deferred_lookup at ffffffffa04f2e8f [fscache] #5 [ffff881ff43efd18] __fscache_read_or_alloc_page at ffffffffa04f2ffe [fscache] #6 [ffff881ff43efd58] __nfs_readpage_from_fscache at ffffffffa0679668 [nfs] #7 [ffff881ff43efd78] nfs_readpage at ffffffffa067092b [nfs] #8 [ffff881ff43efda0] generic_file_read_iter at ffffffff81187a73 #9 [ffff881ff43efe50] nfs_file_read at ffffffffa066544b [nfs] #10 [ffff881ff43efe70] __vfs_read at ffffffff811fc756 #11 [ffff881ff43efee8] vfs_read at ffffffff811fccfa #12 [ffff881ff43eff18] sys_read at ffffffff811fda62 #13 [ffff881ff43eff50] entry_SYSCALL_64_fastpath at ffffffff815e986e Signed-off-by: NeilBrown Signed-off-by: David Howells --- fs/fscache/object.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 9edc920f651f..6d9cb1719de5 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -730,6 +730,9 @@ static const struct fscache_state *fscache_drop_object(struct fscache_object *ob if (awaken) wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); + if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); + /* Prevent a race with our last child, which has to signal EV_CLEARED * before dropping our spinlock. -- cgit v1.2.3-59-g8ed1b From b7e768b7e3522695ed36dcb48ecdcd344bd30a9b Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 24 Sep 2018 10:33:44 -0700 Subject: cachefiles: Explicitly cast enumerated type in put_object Clang warns when one enumerated type is implicitly converted to another. fs/cachefiles/namei.c:247:50: warning: implicit conversion from enumeration type 'enum cachefiles_obj_ref_trace' to different enumeration type 'enum fscache_obj_ref_trace' [-Wenum-conversion] cache->cache.ops->put_object(&xobject->fscache, cachefiles_obj_put_wait_retry); Silence this warning by explicitly casting to fscache_obj_ref_trace, which is also done in put_object. 
Reported-by: Nick Desaulniers Signed-off-by: Nathan Chancellor Signed-off-by: David Howells --- fs/cachefiles/namei.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 95983c744164..5ab411d4bc59 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -244,11 +244,13 @@ wait_for_old_object: ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)); - cache->cache.ops->put_object(&xobject->fscache, cachefiles_obj_put_wait_retry); + cache->cache.ops->put_object(&xobject->fscache, + (enum fscache_obj_ref_trace)cachefiles_obj_put_wait_retry); goto try_again; requeue: - cache->cache.ops->put_object(&xobject->fscache, cachefiles_obj_put_wait_timeo); + cache->cache.ops->put_object(&xobject->fscache, + (enum fscache_obj_ref_trace)cachefiles_obj_put_wait_timeo); _leave(" = -ETIMEDOUT"); return -ETIMEDOUT; } -- cgit v1.2.3-59-g8ed1b From 34e06fe4d05bd120556a95d5ebf1bcc97b0a1ca0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Jul 2018 16:27:44 +0200 Subject: cachefiles: avoid deprecated get_seconds() get_seconds() returns an unsigned long that can overflow on some architectures and is deprecated because of that. In cachefs, we cast that number to a 32-bit integer, which will overflow in year 2106 on all architectures. As confirmed by David Howells, the overflow probably isn't harmful in the end, since the timestamps are only used to make the file names unique, but they don't strictly have to be in monotonically increasing order since the files only exist in order to be deleted as quickly as possible. Moving to ktime_get_real_seconds() avoids the deprecated interface. Signed-off-by: Arnd Bergmann Signed-off-by: David Howells --- fs/cachefiles/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 5ab411d4bc59..1645fcfd9691 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -338,7 +338,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, try_again: /* first step is to make up a grave dentry in the graveyard */ sprintf(nbuffer, "%08x%08x", - (uint32_t) get_seconds(), + (uint32_t) ktime_get_real_seconds(), (uint32_t) atomic_inc_return(&cache->gravecounter)); /* do the multiway lock magic */ -- cgit v1.2.3-59-g8ed1b From 31ffa563833576bd49a8bf53120568312755e6e2 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 17 Jul 2018 09:53:42 +0100 Subject: fscache, cachefiles: remove redundant variable 'cache' Variable 'cache' is being assigned but is never used, hence it is redundant and can be removed.
Cleans up clang warning: warning: variable 'cache' set but not used [-Wunused-but-set-variable] Signed-off-by: Colin Ian King Signed-off-by: David Howells --- fs/cachefiles/rdwr.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index db233588a69a..8a577409d030 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -968,11 +968,8 @@ void cachefiles_uncache_page(struct fscache_object *_object, struct page *page) __releases(&object->fscache.cookie->lock) { struct cachefiles_object *object; - struct cachefiles_cache *cache; object = container_of(_object, struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); _enter("%p,{%lu}", object, page->index); -- cgit v1.2.3-59-g8ed1b From e21e57445a64598b29a6f629688f9b9a39e7242a Mon Sep 17 00:00:00 2001 From: Larry Chen Date: Fri, 30 Nov 2018 14:08:56 -0800 Subject: ocfs2: fix deadlock caused by ocfs2_defrag_extent() ocfs2_defrag_extent may fall into deadlock. ocfs2_ioctl_move_extents ocfs2_ioctl_move_extents ocfs2_move_extents ocfs2_defrag_extent ocfs2_lock_allocators_move_extents ocfs2_reserve_clusters inode_lock GLOBAL_BITMAP_SYSTEM_INODE __ocfs2_flush_truncate_log inode_lock GLOBAL_BITMAP_SYSTEM_INODE As backtrace shows above, ocfs2_reserve_clusters() will call inode_lock against the global bitmap if local allocator has not sufficient cluters. Once global bitmap could meet the demand, ocfs2_reserve_cluster will return success with global bitmap locked. After ocfs2_reserve_cluster(), if truncate log is full, __ocfs2_flush_truncate_log() will definitely fall into deadlock because it needs to inode_lock global bitmap, which has already been locked. To fix this bug, we could remove from ocfs2_lock_allocators_move_extents() the code which intends to lock global allocator, and put the removed code after __ocfs2_flush_truncate_log(). ocfs2_lock_allocators_move_extents() is referred by 2 places, one is here, the other does not need the data allocator context, which means this patch does not affect the caller so far. Link: http://lkml.kernel.org/r/20181101071422.14470-1-lchen@suse.com Signed-off-by: Larry Chen Reviewed-by: Changwei Ge Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/move_extents.c | 47 ++++++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 3f1685d7d43b..1565dd8e8856 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -157,18 +157,14 @@ out: } /* - * lock allocators, and reserving appropriate number of bits for - * meta blocks and data clusters. - * - * in some cases, we don't need to reserve clusters, just let data_ac - * be NULL. + * lock allocator, and reserve appropriate number of bits for + * meta blocks. 
*/ -static int ocfs2_lock_allocators_move_extents(struct inode *inode, +static int ocfs2_lock_meta_allocator_move_extents(struct inode *inode, struct ocfs2_extent_tree *et, u32 clusters_to_move, u32 extents_to_split, struct ocfs2_alloc_context **meta_ac, - struct ocfs2_alloc_context **data_ac, int extra_blocks, int *credits) { @@ -193,13 +189,6 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode, goto out; } - if (data_ac) { - ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac); - if (ret) { - mlog_errno(ret); - goto out; - } - } *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el); @@ -259,10 +248,10 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, } } - ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1, - &context->meta_ac, - &context->data_ac, - extra_blocks, &credits); + ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et, + *len, 1, + &context->meta_ac, + extra_blocks, &credits); if (ret) { mlog_errno(ret); goto out; @@ -285,6 +274,21 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, } } + /* + * Make sure ocfs2_reserve_cluster is called after + * __ocfs2_flush_truncate_log, otherwise, dead lock may happen. + * + * If ocfs2_reserve_cluster is called + * before __ocfs2_flush_truncate_log, dead lock on global bitmap + * may happen. + * + */ + ret = ocfs2_reserve_clusters(osb, *len, &context->data_ac); + if (ret) { + mlog_errno(ret); + goto out_unlock_mutex; + } + handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); @@ -617,9 +621,10 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, } } - ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1, - &context->meta_ac, - NULL, extra_blocks, &credits); + ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et, + len, 1, + &context->meta_ac, + extra_blocks, &credits); if (ret) { mlog_errno(ret); goto out; -- cgit v1.2.3-59-g8ed1b From ce96a407adef126870b3f4a1b73529dd8aa80f49 Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Fri, 30 Nov 2018 14:09:14 -0800 Subject: hfs: do not free node before using hfs_bmap_free() frees the node via hfs_bnode_put(node). However, it then reads node->this when dumping error message on an error path, which may result in a use-after-free bug. This patch frees the node only when it is never again used. Link: http://lkml.kernel.org/r/1542963889-128825-1-git-send-email-bianpan2016@163.com Fixes: a1185ffa2fc ("HFS rewrite") Signed-off-by: Pan Bian Reviewed-by: Andrew Morton Cc: Joe Perches Cc: Ernesto A. Fernandez Cc: Viacheslav Dubeyko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfs/btree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 98b96ffb95ed..19017d296173 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -338,13 +338,14 @@ void hfs_bmap_free(struct hfs_bnode *node) nidx -= len * 8; i = node->next; - hfs_bnode_put(node); if (!i) { /* panic */; pr_crit("unable to free bnode %u. bmap not found!\n", node->this); + hfs_bnode_put(node); return; } + hfs_bnode_put(node); node = hfs_bnode_find(tree, i); if (IS_ERR(node)) return; -- cgit v1.2.3-59-g8ed1b From c7d7d620dcbd2a1c595092280ca943f2fced7bbd Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Fri, 30 Nov 2018 14:09:18 -0800 Subject: hfsplus: do not free node before using hfs_bmap_free() frees node via hfs_bnode_put(node). 
However it then reads node->this when dumping error message on an error path, which may result in a use-after-free bug. This patch frees node only when it is never used. Link: http://lkml.kernel.org/r/1543053441-66942-1-git-send-email-bianpan2016@163.com Signed-off-by: Pan Bian Reviewed-by: Andrew Morton Cc: Ernesto A. Fernandez Cc: Joe Perches Cc: Viacheslav Dubeyko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/btree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index 236efe51eca6..66774f4cb4fd 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -466,14 +466,15 @@ void hfs_bmap_free(struct hfs_bnode *node) nidx -= len * 8; i = node->next; - hfs_bnode_put(node); if (!i) { /* panic */; pr_crit("unable to free bnode %u. " "bmap not found!\n", node->this); + hfs_bnode_put(node); return; } + hfs_bnode_put(node); node = hfs_bnode_find(tree, i); if (IS_ERR(node)) return; -- cgit v1.2.3-59-g8ed1b From 29ec90660d68bbdd69507c1c8b4e33aa299278b1 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 30 Nov 2018 14:09:32 -0800 Subject: userfaultfd: shmem/hugetlbfs: only allow to register VM_MAYWRITE vmas After the VMA to register the uffd onto is found, check that it has VM_MAYWRITE set before allowing registration. This way we inherit all common code checks before allowing to fill file holes in shmem and hugetlbfs with UFFDIO_COPY. The userfaultfd memory model is not applicable for readonly files unless it's a MAP_PRIVATE. Link: http://lkml.kernel.org/r/20181126173452.26955-4-aarcange@redhat.com Fixes: ff62a3421044 ("hugetlb: implement memfd sealing") Signed-off-by: Andrea Arcangeli Reviewed-by: Mike Rapoport Reviewed-by: Hugh Dickins Reported-by: Jann Horn Fixes: 4c27fe4c4c84 ("userfaultfd: shmem: add shmem_mcopy_atomic_pte for userfaultfd support") Cc: Cc: "Dr. David Alan Gilbert" Cc: Mike Kravetz Cc: Peter Xu Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 15 +++++++++++++++ mm/userfaultfd.c | 15 ++++++--------- 2 files changed, 21 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 356d2b8568c1..cd58939dc977 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1361,6 +1361,19 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, ret = -EINVAL; if (!vma_can_userfault(cur)) goto out_unlock; + + /* + * UFFDIO_COPY will fill file holes even without + * PROT_WRITE. This check enforces that if this is a + * MAP_SHARED, the process has write permission to the backing + * file. If VM_MAYWRITE is set it also enforces that on a + * MAP_SHARED vma: there is no F_WRITE_SEAL and no further + * F_WRITE_SEAL can be taken until the vma is destroyed. + */ + ret = -EPERM; + if (unlikely(!(cur->vm_flags & VM_MAYWRITE))) + goto out_unlock; + /* * If this vma contains ending address, and huge pages * check alignment. 
@@ -1406,6 +1419,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, BUG_ON(!vma_can_userfault(vma)); BUG_ON(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); + WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); /* * Nothing to do: this vma is already registered into this @@ -1552,6 +1566,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, cond_resched(); BUG_ON(!vma_can_userfault(vma)); + WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); /* * Nothing to do: this vma is already registered into this diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 471b6457f95f..43cf314cfddd 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -205,8 +205,9 @@ retry: if (!dst_vma || !is_vm_hugetlb_page(dst_vma)) goto out_unlock; /* - * Only allow __mcopy_atomic_hugetlb on userfaultfd - * registered ranges. + * Check the vma is registered in uffd, this is + * required to enforce the VM_MAYWRITE check done at + * uffd registration time. */ if (!dst_vma->vm_userfaultfd_ctx.ctx) goto out_unlock; @@ -459,13 +460,9 @@ retry: if (!dst_vma) goto out_unlock; /* - * Be strict and only allow __mcopy_atomic on userfaultfd - * registered ranges to prevent userland errors going - * unnoticed. As far as the VM consistency is concerned, it - * would be perfectly safe to remove this check, but there's - * no useful usage for __mcopy_atomic ouside of userfaultfd - * registered ranges. This is after all why these are ioctls - * belonging to the userfaultfd and not syscalls. + * Check the vma is registered in uffd, this is required to + * enforce the VM_MAYWRITE check done at uffd registration + * time. */ if (!dst_vma->vm_userfaultfd_ctx.ctx) goto out_unlock; -- cgit v1.2.3-59-g8ed1b From 164f7e586739d07eb56af6f6d66acebb11f315c8 Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Fri, 30 Nov 2018 14:10:54 -0800 Subject: ocfs2: fix potential use after free ocfs2_get_dentry() calls iput(inode) to drop the reference count of inode, and if the reference count hits 0, inode is freed. However, in this function, it then reads inode->i_generation, which may result in a use after free bug. Move the put operation later. Link: http://lkml.kernel.org/r/1543109237-110227-1-git-send-email-bianpan2016@163.com Fixes: 781f200cb7a("ocfs2: Remove masklog ML_EXPORT.") Signed-off-by: Pan Bian Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/export.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 9f88188060db..4bf8d5854b27 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -125,10 +125,10 @@ check_err: check_gen: if (handle->ih_generation != inode->i_generation) { - iput(inode); trace_ocfs2_get_dentry_generation((unsigned long long)blkno, handle->ih_generation, inode->i_generation); + iput(inode); result = ERR_PTR(-ESTALE); goto bail; } -- cgit v1.2.3-59-g8ed1b
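The hfs, hfsplus and ocfs2 fixes above all follow the same rule: read whatever is still needed from a refcounted object before dropping the last reference to it. Below is a minimal, self-contained sketch of that ordering in plain C; struct obj and the obj_get()/obj_put() helpers are hypothetical stand-ins for illustration, not kernel APIs.

/*
 * Minimal sketch (not kernel code) of the use-after-free pattern the
 * patches above fix.  Reading a field after the final reference drop may
 * touch freed memory; reading it first is safe.
 */
#include <stdio.h>
#include <stdlib.h>

struct obj {
	int refcount;
	unsigned int generation;
};

static struct obj *obj_get(unsigned int gen)
{
	struct obj *o = malloc(sizeof(*o));

	if (!o)
		return NULL;
	o->refcount = 1;
	o->generation = gen;
	return o;
}

static void obj_put(struct obj *o)
{
	if (--o->refcount == 0)
		free(o);	/* last reference gone: object is freed */
}

int main(void)
{
	struct obj *o = obj_get(42);
	unsigned int gen;

	if (!o)
		return 1;

	/*
	 * Buggy ordering would be obj_put(o) followed by a read of
	 * o->generation, i.e. a read of freed memory.  As in the
	 * ocfs2_get_dentry() and hfs_bmap_free() fixes, read first,
	 * then drop the reference.
	 */
	gen = o->generation;
	obj_put(o);
	printf("generation=%u\n", gen);
	return 0;
}

The kernel patches above are exactly this reordering: iput() and hfs_bnode_put() are moved to after the last use of inode->i_generation and node->this respectively.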