From a1f32eeca60e74f6f103ce74f586570deddaef7a Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 1 Jul 2019 13:26:28 -0700
Subject: f2fs: use generic checking and prep function for FS_IOC_SETFLAGS

Make the f2fs implementation of FS_IOC_SETFLAGS use the new VFS helper
function vfs_ioc_setflags_prepare().

This is based on a patch from Darrick Wong, but reworked to apply after
commit 360985573b55 ("f2fs: separate f2fs i_flags from fs_flags and ext4
i_flags").

Originally-from: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/file.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index f8d46df8fa9e..b7aa0144d874 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1770,7 +1770,8 @@ static int f2fs_ioc_getflags(struct file *filp, unsigned long arg)
 static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
-	u32 fsflags;
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	u32 fsflags, old_fsflags;
 	u32 iflags;
 	int ret;
 
@@ -1794,8 +1795,14 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
 
 	inode_lock(inode);
 
+	old_fsflags = f2fs_iflags_to_fsflags(fi->i_flags);
+	ret = vfs_ioc_setflags_prepare(inode, old_fsflags, fsflags);
+	if (ret)
+		goto out;
+
 	ret = f2fs_setflags_common(inode, iflags,
 			f2fs_fsflags_to_iflags(F2FS_SETTABLE_FS_FL));
+out:
 	inode_unlock(inode);
 	mnt_drop_write_file(filp);
 	return ret;
-- 
cgit v1.2.3-59-g8ed1b


From 6fc93c4e0ad12e34475755315749143013c702ab Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 1 Jul 2019 13:26:29 -0700
Subject: f2fs: use generic checking function for FS_IOC_FSSETXATTR

Make the f2fs implementation of FS_IOC_FSSETXATTR use the new VFS helper
function vfs_ioc_fssetxattr_check(), and remove the project quota check
since it's now done by the helper function.

This is based on a patch from Darrick Wong, but reworked to apply after
commit 360985573b55 ("f2fs: separate f2fs i_flags from fs_flags and ext4
i_flags").

Originally-from: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/file.c | 45 ++++++++++++++-------------------------------
 1 file changed, 14 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index b7aa0144d874..717bad8d31b0 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -2862,52 +2862,32 @@ static inline u32 f2fs_xflags_to_iflags(u32 xflags)
 	return iflags;
 }
 
-static int f2fs_ioc_fsgetxattr(struct file *filp, unsigned long arg)
+static void f2fs_fill_fsxattr(struct inode *inode, struct fsxattr *fa)
 {
-	struct inode *inode = file_inode(filp);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
-	struct fsxattr fa;
 
-	memset(&fa, 0, sizeof(struct fsxattr));
-	fa.fsx_xflags = f2fs_iflags_to_xflags(fi->i_flags);
+	simple_fill_fsxattr(fa, f2fs_iflags_to_xflags(fi->i_flags));
 
 	if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)))
-		fa.fsx_projid = (__u32)from_kprojid(&init_user_ns,
-							fi->i_projid);
-
-	if (copy_to_user((struct fsxattr __user *)arg, &fa, sizeof(fa)))
-		return -EFAULT;
-	return 0;
+		fa->fsx_projid = from_kprojid(&init_user_ns, fi->i_projid);
 }
 
-static int f2fs_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
+static int f2fs_ioc_fsgetxattr(struct file *filp, unsigned long arg)
 {
-	/*
-	 * Project Quota ID state is only allowed to change from within the init
-	 * namespace. Enforce that restriction only if we are trying to change
-	 * the quota ID state. Everything else is allowed in user namespaces.
-	 */
-	if (current_user_ns() == &init_user_ns)
-		return 0;
-
-	if (__kprojid_val(F2FS_I(inode)->i_projid) != fa->fsx_projid)
-		return -EINVAL;
+	struct inode *inode = file_inode(filp);
+	struct fsxattr fa;
 
-	if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL) {
-		if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
-			return -EINVAL;
-	} else {
-		if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
-			return -EINVAL;
-	}
+	f2fs_fill_fsxattr(inode, &fa);
 
+	if (copy_to_user((struct fsxattr __user *)arg, &fa, sizeof(fa)))
+		return -EFAULT;
 	return 0;
 }
 
 static int f2fs_ioc_fssetxattr(struct file *filp, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
-	struct fsxattr fa;
+	struct fsxattr fa, old_fa;
 	u32 iflags;
 	int err;
 
@@ -2930,9 +2910,12 @@ static int f2fs_ioc_fssetxattr(struct file *filp, unsigned long arg)
 		return err;
 
 	inode_lock(inode);
-	err = f2fs_ioctl_check_project(inode, &fa);
+
+	f2fs_fill_fsxattr(inode, &old_fa);
+	err = vfs_ioc_fssetxattr_check(inode, &old_fa, &fa);
 	if (err)
 		goto out;
+
 	err = f2fs_setflags_common(inode, iflags,
 			f2fs_xflags_to_iflags(F2FS_SUPPORTED_XFLAGS));
 	if (err)
-- 
cgit v1.2.3-59-g8ed1b


From d5e5efa250cd644896f541551610fe7f0de35e0a Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 1 Jul 2019 13:26:30 -0700
Subject: f2fs: remove redundant check from f2fs_setflags_common()

Now that f2fs_ioc_setflags() and f2fs_ioc_fssetxattr() call the VFS
helper functions which check for permission to change the immutable and
append-only flags, it's no longer needed to do this check in
f2fs_setflags_common() too.  So remove it.

This is based on a patch from Darrick Wong, but reworked to apply after
commit 360985573b55 ("f2fs: separate f2fs i_flags from fs_flags and ext4
i_flags").

Originally-from: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/file.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 717bad8d31b0..3e58a6f697dd 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1653,19 +1653,12 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id)
 static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
 {
 	struct f2fs_inode_info *fi = F2FS_I(inode);
-	u32 oldflags;
 
 	/* Is it quota file? Do not allow user to mess with it */
 	if (IS_NOQUOTA(inode))
 		return -EPERM;
 
-	oldflags = fi->i_flags;
-
-	if ((iflags ^ oldflags) & (F2FS_APPEND_FL | F2FS_IMMUTABLE_FL))
-		if (!capable(CAP_LINUX_IMMUTABLE))
-			return -EPERM;
-
-	fi->i_flags = iflags | (oldflags & ~mask);
+	fi->i_flags = iflags | (fi->i_flags & ~mask);
 
 	if (fi->i_flags & F2FS_PROJINHERIT_FL)
 		set_inode_flag(inode, FI_PROJ_INHERIT);
-- 
cgit v1.2.3-59-g8ed1b


From dbd0f6d6c2a11eb9c31ca9cd454f95bb5713e92e Mon Sep 17 00:00:00 2001
From: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
Date: Sat, 13 Jul 2019 11:58:26 +0800
Subject: io_uring: fix the sequence comparison in io_sequence_defer

sq->cached_sq_head and cq->cached_cq_tail are both unsigned int. If
cached_sq_head overflows before cached_cq_tail, then we may miss a
barrier req. As cached_cq_tail always follows cached_sq_head, the NQ
should be enough.

Cc: stable@vger.kernel.org
Fixes: de0617e46717 ("io_uring: add support for marking commands as draining")
Signed-off-by: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index d682049c07b2..708d133492a1 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -429,7 +429,7 @@ static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
 	if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
 		return false;
 
-	return req->sequence > ctx->cached_cq_tail + ctx->sq_ring->dropped;
+	return req->sequence != ctx->cached_cq_tail + ctx->sq_ring->dropped;
 }
 
 static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
-- 
cgit v1.2.3-59-g8ed1b


From f7b76ac9d17e16e44feebb6d2749fec92bfd6dd4 Mon Sep 17 00:00:00 2001
From: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
Date: Tue, 16 Jul 2019 23:26:14 +0800
Subject: io_uring: fix counter inc/dec mismatch in async_list

We could queue a work for each req in defer and link list without
increasing async_list->cnt, so we shouldn't decrease it while exiting
from workqueue as well if we didn't process the req in async list.

Thanks to Jens Axboe <axboe@kernel.dk> for his guidance.

Fixes: 31b515106428 ("io_uring: allow workqueue item to handle multiple buffered requests")
Signed-off-by: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 708d133492a1..5ec06e5ba0be 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -333,7 +333,8 @@ struct io_kiocb {
 #define REQ_F_IO_DRAIN		16	/* drain existing IO first */
 #define REQ_F_IO_DRAINED	32	/* drain done */
 #define REQ_F_LINK		64	/* linked sqes */
-#define REQ_F_FAIL_LINK		128	/* fail rest of links */
+#define REQ_F_LINK_DONE		128	/* linked sqes done */
+#define REQ_F_FAIL_LINK		256	/* fail rest of links */
 	u64			user_data;
 	u32			result;
 	u32			sequence;
@@ -632,6 +633,7 @@ static void io_req_link_next(struct io_kiocb *req)
 			nxt->flags |= REQ_F_LINK;
 		}
 
+		nxt->flags |= REQ_F_LINK_DONE;
 		INIT_WORK(&nxt->work, io_sq_wq_submit_work);
 		queue_work(req->ctx->sqo_wq, &nxt->work);
 	}
@@ -1844,6 +1846,10 @@ restart:
 		/* async context always use a copy of the sqe */
 		kfree(sqe);
 
+		/* req from defer and link list needn't decrease async cnt */
+		if (req->flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE))
+			goto out;
+
 		if (!async_list)
 			break;
 		if (!list_empty(&req_list)) {
@@ -1891,6 +1897,7 @@ restart:
 		}
 	}
 
+out:
 	if (cur_mm) {
 		set_fs(old_fs);
 		unuse_mm(cur_mm);
-- 
cgit v1.2.3-59-g8ed1b


From c0e48f9dea9129aa11bec3ed13803bcc26e96e49 Mon Sep 17 00:00:00 2001
From: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
Date: Thu, 18 Jul 2019 20:44:00 +0800
Subject: io_uring: add a memory barrier before atomic_read

There is a hang issue while using fio to do some basic test. The issue
can be easily reproduced using the below script:

        while true
        do
                fio  --ioengine=io_uring  -rw=write -bs=4k -numjobs=1 \
                     -size=1G -iodepth=64 -name=uring   --filename=/dev/zero
        done

After several minutes (or more), fio would block at
io_uring_enter->io_cqring_wait in order to waiting for previously
committed sqes to be completed and can't return to user anymore until
we send a SIGTERM to fio. After receiving SIGTERM, fio hangs at
io_ring_ctx_wait_and_kill with a backtrace like this:

        [54133.243816] Call Trace:
        [54133.243842]  __schedule+0x3a0/0x790
        [54133.243868]  schedule+0x38/0xa0
        [54133.243880]  schedule_timeout+0x218/0x3b0
        [54133.243891]  ? sched_clock+0x9/0x10
        [54133.243903]  ? wait_for_completion+0xa3/0x130
        [54133.243916]  ? _raw_spin_unlock_irq+0x2c/0x40
        [54133.243930]  ? trace_hardirqs_on+0x3f/0xe0
        [54133.243951]  wait_for_completion+0xab/0x130
        [54133.243962]  ? wake_up_q+0x70/0x70
        [54133.243984]  io_ring_ctx_wait_and_kill+0xa0/0x1d0
        [54133.243998]  io_uring_release+0x20/0x30
        [54133.244008]  __fput+0xcf/0x270
        [54133.244029]  ____fput+0xe/0x10
        [54133.244040]  task_work_run+0x7f/0xa0
        [54133.244056]  do_exit+0x305/0xc40
        [54133.244067]  ? get_signal+0x13b/0xbd0
        [54133.244088]  do_group_exit+0x50/0xd0
        [54133.244103]  get_signal+0x18d/0xbd0
        [54133.244112]  ? _raw_spin_unlock_irqrestore+0x36/0x60
        [54133.244142]  do_signal+0x34/0x720
        [54133.244171]  ? exit_to_usermode_loop+0x7e/0x130
        [54133.244190]  exit_to_usermode_loop+0xc0/0x130
        [54133.244209]  do_syscall_64+0x16b/0x1d0
        [54133.244221]  entry_SYSCALL_64_after_hwframe+0x49/0xbe

The reason is that we had added a req to ctx->pending_async at the very
end, but it didn't get a chance to be processed. How could this happen?

        fio#cpu0                                        wq#cpu1

        io_add_to_prev_work                    io_sq_wq_submit_work

          atomic_read() <<< 1

                                                  atomic_dec_return() << 1->0
                                                  list_empty();    <<< true;

          list_add_tail()
          atomic_read() << 0 or 1?

As atomic_ops.rst states, atomic_read does not guarantee that the
runtime modification by any other thread is visible yet, so we must take
care of that with a proper implicit or explicit memory barrier.

This issue was detected with the help of Jackie's <liuyun01@kylinos.cn>

Fixes: 31b515106428 ("io_uring: allow workqueue item to handle multiple buffered requests")
Signed-off-by: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5ec06e5ba0be..324530c4d2ce 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1924,6 +1924,10 @@ static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
 	ret = true;
 	spin_lock(&list->lock);
 	list_add_tail(&req->list, &list->list);
+	/*
+	 * Ensure we see a simultaneous modification from io_sq_wq_submit_work()
+	 */
+	smp_mb();
 	if (!atomic_read(&list->cnt)) {
 		list_del_init(&req->list);
 		ret = false;
-- 
cgit v1.2.3-59-g8ed1b


From 6a43074e2f461c2c49a607f9f6f5218d53f97d1e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 16 Jul 2019 13:56:42 -0600
Subject: block: properly handle IOCB_NOWAIT for async O_DIRECT IO

A caller is supposed to pass in REQ_NOWAIT if we can't block for any
given operation, but O_DIRECT for block devices just ignore this. Hence
we'll block for various resource shortages on the block layer side,
like having to wait for requests.

Use the new REQ_NOWAIT_INLINE to ask for this error to be returned
inline, so we can handle it appropriately and return -EAGAIN to the
caller.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/block_dev.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 50 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index f00b569a9f89..5dc613eec4d2 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -344,15 +344,24 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 	struct bio *bio;
 	bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
 	bool is_read = (iov_iter_rw(iter) == READ), is_sync;
+	bool nowait = (iocb->ki_flags & IOCB_NOWAIT) != 0;
 	loff_t pos = iocb->ki_pos;
 	blk_qc_t qc = BLK_QC_T_NONE;
-	int ret = 0;
+	gfp_t gfp;
+	ssize_t ret;
 
 	if ((pos | iov_iter_alignment(iter)) &
 	    (bdev_logical_block_size(bdev) - 1))
 		return -EINVAL;
 
-	bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
+	if (nowait)
+		gfp = GFP_NOWAIT;
+	else
+		gfp = GFP_KERNEL;
+
+	bio = bio_alloc_bioset(gfp, nr_pages, &blkdev_dio_pool);
+	if (!bio)
+		return -EAGAIN;
 
 	dio = container_of(bio, struct blkdev_dio, bio);
 	dio->is_sync = is_sync = is_sync_kiocb(iocb);
@@ -374,7 +383,10 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 	if (!is_poll)
 		blk_start_plug(&plug);
 
+	ret = 0;
 	for (;;) {
+		int err;
+
 		bio_set_dev(bio, bdev);
 		bio->bi_iter.bi_sector = pos >> 9;
 		bio->bi_write_hint = iocb->ki_hint;
@@ -382,8 +394,10 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 		bio->bi_end_io = blkdev_bio_end_io;
 		bio->bi_ioprio = iocb->ki_ioprio;
 
-		ret = bio_iov_iter_get_pages(bio, iter);
-		if (unlikely(ret)) {
+		err = bio_iov_iter_get_pages(bio, iter);
+		if (unlikely(err)) {
+			if (!ret)
+				ret = err;
 			bio->bi_status = BLK_STS_IOERR;
 			bio_endio(bio);
 			break;
@@ -398,6 +412,14 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 			task_io_account_write(bio->bi_iter.bi_size);
 		}
 
+		/*
+		 * Tell underlying layer to not block for resource shortage.
+		 * And if we would have blocked, return error inline instead
+		 * of through the bio->bi_end_io() callback.
+		 */
+		if (nowait)
+			bio->bi_opf |= (REQ_NOWAIT | REQ_NOWAIT_INLINE);
+
 		dio->size += bio->bi_iter.bi_size;
 		pos += bio->bi_iter.bi_size;
 
@@ -411,6 +433,11 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 			}
 
 			qc = submit_bio(bio);
+			if (qc == BLK_QC_T_EAGAIN) {
+				if (!ret)
+					ret = -EAGAIN;
+				goto error;
+			}
 
 			if (polled)
 				WRITE_ONCE(iocb->ki_cookie, qc);
@@ -431,8 +458,20 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 			atomic_inc(&dio->ref);
 		}
 
-		submit_bio(bio);
-		bio = bio_alloc(GFP_KERNEL, nr_pages);
+		qc = submit_bio(bio);
+		if (qc == BLK_QC_T_EAGAIN) {
+			if (!ret)
+				ret = -EAGAIN;
+			goto error;
+		}
+		ret += bio->bi_iter.bi_size;
+
+		bio = bio_alloc(gfp, nr_pages);
+		if (!bio) {
+			if (!ret)
+				ret = -EAGAIN;
+			goto error;
+		}
 	}
 
 	if (!is_poll)
@@ -452,13 +491,16 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 	}
 	__set_current_state(TASK_RUNNING);
 
+out:
 	if (!ret)
 		ret = blk_status_to_errno(dio->bio.bi_status);
-	if (likely(!ret))
-		ret = dio->size;
 
 	bio_put(&dio->bio);
 	return ret;
+error:
+	if (!is_poll)
+		blk_finish_plug(&plug);
+	goto out;
 }
 
 static ssize_t
-- 
cgit v1.2.3-59-g8ed1b


From bd11b3a391e3df6fa958facbe4b3f9f4cca9bd49 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 20 Jul 2019 08:37:31 -0600
Subject: io_uring: don't use iov_iter_advance() for fixed buffers

Hrvoje reports that when a large fixed buffer is registered and IO is
being done to the latter pages of said buffer, the IO submission time
is much worse:

reading to the start of the buffer: 11238 ns
reading to the end of the buffer:   1039879 ns

In fact, it's worse by two orders of magnitude. The reason for that is
how io_uring figures out how to setup the iov_iter. We point the iter
at the first bvec, and then use iov_iter_advance() to fast-forward to
the offset within that buffer we need.

However, that is abysmally slow, as it entails iterating the bvecs
that we setup as part of buffer registration. There's really no need
to use this generic helper, as we know it's a BVEC type iterator, and
we also know that each bvec is PAGE_SIZE in size, apart from possibly
the first and last. Hence we can just use a shift on the offset to
find the right index, and then adjust the iov_iter appropriately.
After this fix, the timings are:

reading to the start of the buffer: 10135 ns
reading to the end of the buffer:   1377 ns

Or about an 755x improvement for the tail page.

Reported-by: Hrvoje Zeba <zeba.hrvoje@gmail.com>
Tested-by: Hrvoje Zeba <zeba.hrvoje@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 40 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 38 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 324530c4d2ce..0e6bd60cb113 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1066,8 +1066,44 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
 	 */
 	offset = buf_addr - imu->ubuf;
 	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
-	if (offset)
-		iov_iter_advance(iter, offset);
+
+	if (offset) {
+		/*
+		 * Don't use iov_iter_advance() here, as it's really slow for
+		 * using the latter parts of a big fixed buffer - it iterates
+		 * over each segment manually. We can cheat a bit here, because
+		 * we know that:
+		 *
+		 * 1) it's a BVEC iter, we set it up
+		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
+		 *    first and last bvec
+		 *
+		 * So just find our index, and adjust the iterator afterwards.
+		 * If the offset is within the first bvec (or the whole first
+		 * bvec, just use iov_iter_advance(). This makes it easier
+		 * since we can just skip the first segment, which may not
+		 * be PAGE_SIZE aligned.
+		 */
+		const struct bio_vec *bvec = imu->bvec;
+
+		if (offset <= bvec->bv_len) {
+			iov_iter_advance(iter, offset);
+		} else {
+			unsigned long seg_skip;
+
+			/* skip first vec */
+			offset -= bvec->bv_len;
+			seg_skip = 1 + (offset >> PAGE_SHIFT);
+
+			iter->bvec = bvec + seg_skip;
+			iter->nr_segs -= seg_skip;
+			iter->count -= (seg_skip << PAGE_SHIFT);
+			iter->iov_offset = offset & ~PAGE_MASK;
+			if (iter->iov_offset)
+				iter->count -= iter->iov_offset;
+		}
+	}
+
 	return 0;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 9310a7ba6de8cce6209e3e8a3cdf733f824cdd9b Mon Sep 17 00:00:00 2001
From: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
Date: Mon, 22 Jul 2019 10:23:27 +0800
Subject: io_uring: track io length in async_list based on bytes

We are using PAGE_SIZE as the unit to determine if the total len in
async_list has exceeded max_pages, it's not fair for smaller io sizes.
For example, if we are doing 1k-size io streams, we will never exceed
max_pages since len >>= PAGE_SHIFT always gets zero. So use original
bytes to make it more accurate.

Signed-off-by: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0e6bd60cb113..2039f888197e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -202,7 +202,7 @@ struct async_list {
 
 	struct file		*file;
 	off_t			io_end;
-	size_t			io_pages;
+	size_t			io_len;
 };
 
 struct io_ring_ctx {
@@ -1158,28 +1158,26 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
 	off_t io_end = kiocb->ki_pos + len;
 
 	if (filp == async_list->file && kiocb->ki_pos == async_list->io_end) {
-		unsigned long max_pages;
+		unsigned long max_bytes;
 
 		/* Use 8x RA size as a decent limiter for both reads/writes */
-		max_pages = filp->f_ra.ra_pages;
-		if (!max_pages)
-			max_pages = VM_READAHEAD_PAGES;
-		max_pages *= 8;
-
-		/* If max pages are exceeded, reset the state */
-		len >>= PAGE_SHIFT;
-		if (async_list->io_pages + len <= max_pages) {
+		max_bytes = filp->f_ra.ra_pages << (PAGE_SHIFT + 3);
+		if (!max_bytes)
+			max_bytes = VM_READAHEAD_PAGES << (PAGE_SHIFT + 3);
+
+		/* If max len are exceeded, reset the state */
+		if (async_list->io_len + len <= max_bytes) {
 			req->flags |= REQ_F_SEQ_PREV;
-			async_list->io_pages += len;
+			async_list->io_len += len;
 		} else {
 			io_end = 0;
-			async_list->io_pages = 0;
+			async_list->io_len = 0;
 		}
 	}
 
 	/* New file? Reset state. */
 	if (async_list->file != filp) {
-		async_list->io_pages = 0;
+		async_list->io_len = 0;
 		async_list->file = filp;
 	}
 	async_list->io_end = io_end;
-- 
cgit v1.2.3-59-g8ed1b


From d7852fbd0f0423937fa287a598bfde188bb68c22 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 11 Jul 2019 09:54:40 -0700
Subject: access: avoid the RCU grace period for the temporary subjective
 credentials

It turns out that 'access()' (and 'faccessat()') can cause a lot of RCU
work because it installs a temporary credential that gets allocated and
freed for each system call.

The allocation and freeing overhead is mostly benign, but because
credentials can be accessed under the RCU read lock, the freeing
involves a RCU grace period.

Which is not a huge deal normally, but if you have a lot of access()
calls, this causes a fair amount of seconday damage: instead of having a
nice alloc/free patterns that hits in hot per-CPU slab caches, you have
all those delayed free's, and on big machines with hundreds of cores,
the RCU overhead can end up being enormous.

But it turns out that all of this is entirely unnecessary.  Exactly
because access() only installs the credential as the thread-local
subjective credential, the temporary cred pointer doesn't actually need
to be RCU free'd at all.  Once we're done using it, we can just free it
synchronously and avoid all the RCU overhead.

So add a 'non_rcu' flag to 'struct cred', which can be set by users that
know they only use it in non-RCU context (there are other potential
users for this).  We can make it a union with the rcu freeing list head
that we need for the RCU case, so this doesn't need any extra storage.

Note that this also makes 'get_current_cred()' clear the new non_rcu
flag, in case we have filesystems that take a long-term reference to the
cred and then expect the RCU delayed freeing afterwards.  It's not
entirely clear that this is required, but it makes for clear semantics:
the subjective cred remains non-RCU as long as you only access it
synchronously using the thread-local accessors, but you _can_ use it as
a generic cred if you want to.

It is possible that we should just remove the whole RCU markings for
->cred entirely.  Only ->real_cred is really supposed to be accessed
through RCU, and the long-term cred copies that nfs uses might want to
explicitly re-enable RCU freeing if required, rather than have
get_current_cred() do it implicitly.

But this is a "minimal semantic changes" change for the immediate
problem.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Paul E. McKenney <paulmck@linux.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Jan Glauber <jglauber@marvell.com>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Jayachandran Chandrasekharan Nair <jnair@marvell.com>
Cc: Greg KH <greg@kroah.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/open.c            | 19 +++++++++++++++++++
 include/linux/cred.h |  8 +++++++-
 kernel/cred.c        | 21 +++++++++++++++++++--
 3 files changed, 45 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index b5b80469b93d..a59abe3c669a 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -374,6 +374,25 @@ long do_faccessat(int dfd, const char __user *filename, int mode)
 				override_cred->cap_permitted;
 	}
 
+	/*
+	 * The new set of credentials can *only* be used in
+	 * task-synchronous circumstances, and does not need
+	 * RCU freeing, unless somebody then takes a separate
+	 * reference to it.
+	 *
+	 * NOTE! This is _only_ true because this credential
+	 * is used purely for override_creds() that installs
+	 * it as the subjective cred. Other threads will be
+	 * accessing ->real_cred, not the subjective cred.
+	 *
+	 * If somebody _does_ make a copy of this (using the
+	 * 'get_current_cred()' function), that will clear the
+	 * non_rcu field, because now that other user may be
+	 * expecting RCU freeing. But normal thread-synchronous
+	 * cred accesses will keep things non-RCY.
+	 */
+	override_cred->non_rcu = 1;
+
 	old_cred = override_creds(override_cred);
 retry:
 	res = user_path_at(dfd, filename, lookup_flags, &path);
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 7eb43a038330..f7a30e0099be 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -145,7 +145,11 @@ struct cred {
 	struct user_struct *user;	/* real user ID subscription */
 	struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
 	struct group_info *group_info;	/* supplementary groups for euid/fsgid */
-	struct rcu_head	rcu;		/* RCU deletion hook */
+	/* RCU deletion */
+	union {
+		int non_rcu;			/* Can we skip RCU deletion? */
+		struct rcu_head	rcu;		/* RCU deletion hook */
+	};
 } __randomize_layout;
 
 extern void __put_cred(struct cred *);
@@ -246,6 +250,7 @@ static inline const struct cred *get_cred(const struct cred *cred)
 	if (!cred)
 		return cred;
 	validate_creds(cred);
+	nonconst_cred->non_rcu = 0;
 	return get_new_cred(nonconst_cred);
 }
 
@@ -257,6 +262,7 @@ static inline const struct cred *get_cred_rcu(const struct cred *cred)
 	if (!atomic_inc_not_zero(&nonconst_cred->usage))
 		return NULL;
 	validate_creds(cred);
+	nonconst_cred->non_rcu = 0;
 	return cred;
 }
 
diff --git a/kernel/cred.c b/kernel/cred.c
index c73a87a4df13..153ae369e024 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -144,7 +144,10 @@ void __put_cred(struct cred *cred)
 	BUG_ON(cred == current->cred);
 	BUG_ON(cred == current->real_cred);
 
-	call_rcu(&cred->rcu, put_cred_rcu);
+	if (cred->non_rcu)
+		put_cred_rcu(&cred->rcu);
+	else
+		call_rcu(&cred->rcu, put_cred_rcu);
 }
 EXPORT_SYMBOL(__put_cred);
 
@@ -256,6 +259,7 @@ struct cred *prepare_creds(void)
 	old = task->cred;
 	memcpy(new, old, sizeof(struct cred));
 
+	new->non_rcu = 0;
 	atomic_set(&new->usage, 1);
 	set_cred_subscribers(new, 0);
 	get_group_info(new->group_info);
@@ -535,7 +539,19 @@ const struct cred *override_creds(const struct cred *new)
 
 	validate_creds(old);
 	validate_creds(new);
-	get_cred(new);
+
+	/*
+	 * NOTE! This uses 'get_new_cred()' rather than 'get_cred()'.
+	 *
+	 * That means that we do not clear the 'non_rcu' flag, since
+	 * we are only installing the cred into the thread-synchronous
+	 * '->cred' pointer, not the '->real_cred' pointer that is
+	 * visible to other threads under RCU.
+	 *
+	 * Also note that we did validate_creds() manually, not depending
+	 * on the validation in 'get_cred()'.
+	 */
+	get_new_cred((struct cred *)new);
 	alter_cred_subscribers(new, 1);
 	rcu_assign_pointer(current->cred, new);
 	alter_cred_subscribers(old, -1);
@@ -672,6 +688,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
 	validate_creds(old);
 
 	*new = *old;
+	new->non_rcu = 0;
 	atomic_set(&new->usage, 1);
 	set_cred_subscribers(new, 0);
 	get_uid(new->user);
-- 
cgit v1.2.3-59-g8ed1b


From 0ce38c5f929c83dff8ea805f6c6ef2eb97b66431 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Thu, 25 Jul 2019 16:58:33 +0900
Subject: iomap: fix Invalid License ID

Detected by:

  $ ./scripts/spdxcheck.py
  fs/iomap/Makefile: 1:27 Invalid License ID: GPL-2.0-or-newer

Fixes: 1c230208f53d ("iomap: start moving code to fs/iomap/")
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/iomap/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile
index 2d165388d952..93cd11938bf5 100644
--- a/fs/iomap/Makefile
+++ b/fs/iomap/Makefile
@@ -1,4 +1,4 @@
-# SPDX-License-Identifier: GPL-2.0-or-newer
+# SPDX-License-Identifier: GPL-2.0-or-later
 #
 # Copyright (c) 2019 Oracle.
 # All Rights Reserved.
-- 
cgit v1.2.3-59-g8ed1b


From 16d51a590a8ce3befb1308e0e7ab77f3b661af33 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Tue, 16 Jul 2019 17:20:45 +0200
Subject: sched/fair: Don't free p->numa_faults with concurrent readers

When going through execve(), zero out the NUMA fault statistics instead of
freeing them.

During execve, the task is reachable through procfs and the scheduler. A
concurrent /proc/*/sched reader can read data from a freed ->numa_faults
allocation (confirmed by KASAN) and write it back to userspace.
I believe that it would also be possible for a use-after-free read to occur
through a race between a NUMA fault and execve(): task_numa_fault() can
lead to task_numa_compare(), which invokes task_weight() on the currently
running task of a different CPU.

Another way to fix this would be to make ->numa_faults RCU-managed or add
extra locking, but it seems easier to wipe the NUMA fault statistics on
execve.

Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Fixes: 82727018b0d3 ("sched/numa: Call task_numa_free() from do_execve()")
Link: https://lkml.kernel.org/r/20190716152047.14424-1-jannh@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 fs/exec.c                            |  2 +-
 include/linux/sched/numa_balancing.h |  4 ++--
 kernel/fork.c                        |  2 +-
 kernel/sched/fair.c                  | 24 ++++++++++++++++++++----
 4 files changed, 24 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index c71cbfe6826a..f7f6a140856a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1828,7 +1828,7 @@ static int __do_execve_file(int fd, struct filename *filename,
 	membarrier_execve(current);
 	rseq_execve(current);
 	acct_update_integrals(current);
-	task_numa_free(current);
+	task_numa_free(current, false);
 	free_bprm(bprm);
 	kfree(pathbuf);
 	if (filename)
diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index e7dd04a84ba8..3988762efe15 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -19,7 +19,7 @@
 extern void task_numa_fault(int last_node, int node, int pages, int flags);
 extern pid_t task_numa_group_id(struct task_struct *p);
 extern void set_numabalancing_state(bool enabled);
-extern void task_numa_free(struct task_struct *p);
+extern void task_numa_free(struct task_struct *p, bool final);
 extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
 					int src_nid, int dst_cpu);
 #else
@@ -34,7 +34,7 @@ static inline pid_t task_numa_group_id(struct task_struct *p)
 static inline void set_numabalancing_state(bool enabled)
 {
 }
-static inline void task_numa_free(struct task_struct *p)
+static inline void task_numa_free(struct task_struct *p, bool final)
 {
 }
 static inline bool should_numa_migrate_memory(struct task_struct *p,
diff --git a/kernel/fork.c b/kernel/fork.c
index d8ae0f1b4148..2852d0e76ea3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -726,7 +726,7 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(tsk == current);
 
 	cgroup_free(tsk);
-	task_numa_free(tsk);
+	task_numa_free(tsk, true);
 	security_task_free(tsk);
 	exit_creds(tsk);
 	delayacct_tsk_free(tsk);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 036be95a87e9..6adb0e0f5feb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2353,13 +2353,23 @@ no_join:
 	return;
 }
 
-void task_numa_free(struct task_struct *p)
+/*
+ * Get rid of NUMA staticstics associated with a task (either current or dead).
+ * If @final is set, the task is dead and has reached refcount zero, so we can
+ * safely free all relevant data structures. Otherwise, there might be
+ * concurrent reads from places like load balancing and procfs, and we should
+ * reset the data back to default state without freeing ->numa_faults.
+ */
+void task_numa_free(struct task_struct *p, bool final)
 {
 	struct numa_group *grp = p->numa_group;
-	void *numa_faults = p->numa_faults;
+	unsigned long *numa_faults = p->numa_faults;
 	unsigned long flags;
 	int i;
 
+	if (!numa_faults)
+		return;
+
 	if (grp) {
 		spin_lock_irqsave(&grp->lock, flags);
 		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
@@ -2372,8 +2382,14 @@ void task_numa_free(struct task_struct *p)
 		put_numa_group(grp);
 	}
 
-	p->numa_faults = NULL;
-	kfree(numa_faults);
+	if (final) {
+		p->numa_faults = NULL;
+		kfree(numa_faults);
+	} else {
+		p->total_numa_faults = 0;
+		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+			numa_faults[i] = 0;
+	}
 }
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 6e7ca09b583de4be6c27d9d4b06e8c5dd46a58fa Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Thu, 25 Jul 2019 11:27:29 +0300
Subject: btrfs: Fix deadlock caused by missing memory barrier

Commit 06297d8cefca ("btrfs: switch extent_buffer blocking_writers from
atomic to int") changed the type of blocking_writers but forgot to
adjust relevant code in btrfs_tree_unlock by converting the
smp_mb__after_atomic to smp_mb.  This opened up the possibility of a
deadlock due to re-ordering of setting blocking_writers and
checking/waking up the waiter. This particular lockup is explained in a
comment above waitqueue_active() function.

Fix it by converting the memory barrier to a full smp_mb, accounting
for the fact that blocking_writers is a simple integer.

Fixes: 06297d8cefca ("btrfs: switch extent_buffer blocking_writers from atomic to int")
Tested-by: Johannes Thumshirn <jthumshirn@suse.com>
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/locking.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 98fccce4208c..393eceda57c8 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -346,9 +346,12 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
 	if (blockers) {
 		btrfs_assert_no_spinning_writers(eb);
 		eb->blocking_writers--;
-		/* Use the lighter barrier after atomic */
-		smp_mb__after_atomic();
-		cond_wake_up_nomb(&eb->write_lock_wq);
+		/*
+		 * We need to order modifying blocking_writers above with
+		 * actually waking up the sleepers to ensure they see the
+		 * updated value of blocking_writers
+		 */
+		cond_wake_up(&eb->write_lock_wq);
 	} else {
 		btrfs_assert_spinning_writers_put(eb);
 		write_unlock(&eb->lock);
-- 
cgit v1.2.3-59-g8ed1b


From 36703247d5f52a679df9da51192b6950fe81689f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 25 Jul 2019 10:20:18 -0600
Subject: io_uring: ensure ->list is initialized for poll commands

Daniel reports that when testing an http server that uses io_uring
to poll for incoming connections, sometimes it hard crashes. This is
due to an uninitialized list member for the io_uring request. Normally
this doesn't trigger and none of the test cases caught it.

Reported-by: Daniel Kozak <kozzi11@gmail.com>
Tested-by: Daniel Kozak <kozzi11@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2039f888197e..15d9b16ed29d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1666,6 +1666,8 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	INIT_LIST_HEAD(&poll->wait.entry);
 	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
 
+	INIT_LIST_HEAD(&req->list);
+
 	mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
 
 	spin_lock_irq(&ctx->completion_lock);
-- 
cgit v1.2.3-59-g8ed1b


From 35a3a90cc56cee54fb4d45afbbc6dfea0835de0c Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Sun, 19 May 2019 18:56:50 -0500
Subject: afs: yfsclient: Mark expected switch fall-throughs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In preparation to enabling -Wimplicit-fallthrough, mark switch
cases where we are expecting to fall through.

This patch fixes the following warnings:

fs/afs/yfsclient.c: In function ‘yfs_deliver_fs_fetch_opaque_acl’:
fs/afs/yfsclient.c:1984:19: warning: this statement may fall through [-Wimplicit-fallthrough=]
   call->unmarshall++;
   ~~~~~~~~~~~~~~~~^~
fs/afs/yfsclient.c:1987:2: note: here
  case 1:
  ^~~~
fs/afs/yfsclient.c:2005:19: warning: this statement may fall through [-Wimplicit-fallthrough=]
   call->unmarshall++;
   ~~~~~~~~~~~~~~~~^~
fs/afs/yfsclient.c:2008:2: note: here
  case 2:
  ^~~~
fs/afs/yfsclient.c:2014:19: warning: this statement may fall through [-Wimplicit-fallthrough=]
   call->unmarshall++;
   ~~~~~~~~~~~~~~~~^~
fs/afs/yfsclient.c:2017:2: note: here
  case 3:
  ^~~~
fs/afs/yfsclient.c:2035:19: warning: this statement may fall through [-Wimplicit-fallthrough=]
   call->unmarshall++;
   ~~~~~~~~~~~~~~~~^~
fs/afs/yfsclient.c:2038:2: note: here
  case 4:
  ^~~~
fs/afs/yfsclient.c:2047:19: warning: this statement may fall through [-Wimplicit-fallthrough=]
   call->unmarshall++;
   ~~~~~~~~~~~~~~~~^~
fs/afs/yfsclient.c:2050:2: note: here
  case 5:
  ^~~~

Warning level 3 was used: -Wimplicit-fallthrough=3

Also, fix some commenting style issues.

This patch is part of the ongoing efforts to enable
-Wimplicit-fallthrough.

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
 fs/afs/yfsclient.c | 54 +++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 35 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 18722aaeda33..2575503170fc 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -450,8 +450,9 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
 		req->offset = req->pos & (PAGE_SIZE - 1);
 		afs_extract_to_tmp64(call);
 		call->unmarshall++;
+		/* Fall through */
 
-		/* Fall through - and extract the returned data length */
+		/* extract the returned data length */
 	case 1:
 		_debug("extract data length");
 		ret = afs_extract_data(call, true);
@@ -477,8 +478,9 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
 		call->bvec[0].bv_page = req->pages[req->index];
 		iov_iter_bvec(&call->iter, READ, call->bvec, 1, size);
 		ASSERTCMP(size, <=, PAGE_SIZE);
+		/* Fall through */
 
-		/* Fall through - and extract the returned data */
+		/* extract the returned data */
 	case 2:
 		_debug("extract data %zu/%llu",
 		       iov_iter_count(&call->iter), req->remain);
@@ -505,8 +507,8 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
 		/* Discard any excess data the server gave us */
 		iov_iter_discard(&call->iter, READ, req->actual_len - req->len);
 		call->unmarshall = 3;
-
 		/* Fall through */
+
 	case 3:
 		_debug("extract discard %zu/%llu",
 		       iov_iter_count(&call->iter), req->actual_len - req->len);
@@ -521,8 +523,9 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
 				   sizeof(struct yfs_xdr_YFSFetchStatus) +
 				   sizeof(struct yfs_xdr_YFSCallBack) +
 				   sizeof(struct yfs_xdr_YFSVolSync));
+		/* Fall through */
 
-		/* Fall through - and extract the metadata */
+		/* extract the metadata */
 	case 4:
 		ret = afs_extract_data(call, false);
 		if (ret < 0)
@@ -539,8 +542,8 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
 		req->file_size = call->out_scb->status.size;
 
 		call->unmarshall++;
-
 		/* Fall through */
+
 	case 5:
 		break;
 	}
@@ -1429,8 +1432,9 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
 	case 0:
 		call->unmarshall++;
 		afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSFetchVolumeStatus));
+		/* Fall through */
 
-		/* Fall through - and extract the returned status record */
+		/* extract the returned status record */
 	case 1:
 		_debug("extract status");
 		ret = afs_extract_data(call, true);
@@ -1441,8 +1445,9 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
 		xdr_decode_YFSFetchVolumeStatus(&bp, call->out_volstatus);
 		call->unmarshall++;
 		afs_extract_to_tmp(call);
+		/* Fall through */
 
-		/* Fall through - and extract the volume name length */
+		/* extract the volume name length */
 	case 2:
 		ret = afs_extract_data(call, true);
 		if (ret < 0)
@@ -1456,8 +1461,9 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
 		size = (call->count + 3) & ~3; /* It's padded */
 		afs_extract_to_buf(call, size);
 		call->unmarshall++;
+		/* Fall through */
 
-		/* Fall through - and extract the volume name */
+		/* extract the volume name */
 	case 3:
 		_debug("extract volname");
 		ret = afs_extract_data(call, true);
@@ -1469,8 +1475,9 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
 		_debug("volname '%s'", p);
 		afs_extract_to_tmp(call);
 		call->unmarshall++;
+		/* Fall through */
 
-		/* Fall through - and extract the offline message length */
+		/* extract the offline message length */
 	case 4:
 		ret = afs_extract_data(call, true);
 		if (ret < 0)
@@ -1484,8 +1491,9 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
 		size = (call->count + 3) & ~3; /* It's padded */
 		afs_extract_to_buf(call, size);
 		call->unmarshall++;
+		/* Fall through */
 
-		/* Fall through - and extract the offline message */
+		/* extract the offline message */
 	case 5:
 		_debug("extract offline");
 		ret = afs_extract_data(call, true);
@@ -1498,8 +1506,9 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
 
 		afs_extract_to_tmp(call);
 		call->unmarshall++;
+		/* Fall through */
 
-		/* Fall through - and extract the message of the day length */
+		/* extract the message of the day length */
 	case 6:
 		ret = afs_extract_data(call, true);
 		if (ret < 0)
@@ -1513,8 +1522,9 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
 		size = (call->count + 3) & ~3; /* It's padded */
 		afs_extract_to_buf(call, size);
 		call->unmarshall++;
+		/* Fall through */
 
-		/* Fall through - and extract the message of the day */
+		/* extract the message of the day */
 	case 7:
 		_debug("extract motd");
 		ret = afs_extract_data(call, false);
@@ -1526,8 +1536,8 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
 		_debug("motd '%s'", p);
 
 		call->unmarshall++;
-
 		/* Fall through */
+
 	case 8:
 		break;
 	}
@@ -1805,9 +1815,9 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
 	case 0:
 		afs_extract_to_tmp(call);
 		call->unmarshall++;
+		/* Fall through */
 
 		/* Extract the file status count and array in two steps */
-		/* Fall through */
 	case 1:
 		_debug("extract status count");
 		ret = afs_extract_data(call, true);
@@ -1824,8 +1834,8 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
 		call->unmarshall++;
 	more_counts:
 		afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSFetchStatus));
-
 		/* Fall through */
+
 	case 2:
 		_debug("extract status array %u", call->count);
 		ret = afs_extract_data(call, true);
@@ -1845,9 +1855,9 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
 		call->count = 0;
 		call->unmarshall++;
 		afs_extract_to_tmp(call);
+		/* Fall through */
 
 		/* Extract the callback count and array in two steps */
-		/* Fall through */
 	case 3:
 		_debug("extract CB count");
 		ret = afs_extract_data(call, true);
@@ -1863,8 +1873,8 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
 		call->unmarshall++;
 	more_cbs:
 		afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSCallBack));
-
 		/* Fall through */
+
 	case 4:
 		_debug("extract CB array");
 		ret = afs_extract_data(call, true);
@@ -1881,8 +1891,8 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
 
 		afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSVolSync));
 		call->unmarshall++;
-
 		/* Fall through */
+
 	case 5:
 		ret = afs_extract_data(call, false);
 		if (ret < 0)
@@ -1892,8 +1902,8 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
 		xdr_decode_YFSVolSync(&bp, call->out_volsync);
 
 		call->unmarshall++;
-
 		/* Fall through */
+
 	case 6:
 		break;
 	}
@@ -1978,6 +1988,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
 	case 0:
 		afs_extract_to_tmp(call);
 		call->unmarshall++;
+		/* Fall through */
 
 		/* Extract the file ACL length */
 	case 1:
@@ -1999,6 +2010,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
 			iov_iter_discard(&call->iter, READ, size);
 		}
 		call->unmarshall++;
+		/* Fall through */
 
 		/* Extract the file ACL */
 	case 2:
@@ -2008,6 +2020,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
 
 		afs_extract_to_tmp(call);
 		call->unmarshall++;
+		/* Fall through */
 
 		/* Extract the volume ACL length */
 	case 3:
@@ -2029,6 +2042,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
 			iov_iter_discard(&call->iter, READ, size);
 		}
 		call->unmarshall++;
+		/* Fall through */
 
 		/* Extract the volume ACL */
 	case 4:
@@ -2041,6 +2055,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
 				   sizeof(struct yfs_xdr_YFSFetchStatus) +
 				   sizeof(struct yfs_xdr_YFSVolSync));
 		call->unmarshall++;
+		/* Fall through */
 
 		/* extract the metadata */
 	case 5:
@@ -2057,6 +2072,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
 		xdr_decode_YFSVolSync(&bp, call->out_volsync);
 
 		call->unmarshall++;
+		/* Fall through */
 
 	case 6:
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From 2988160827b780eec3f958ade2100cc223d4d2d6 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Sun, 19 May 2019 18:43:53 -0500
Subject: afs: fsclient: Mark expected switch fall-throughs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In preparation to enabling -Wimplicit-fallthrough, mark switch
cases where we are expecting to fall through.

This patch fixes the following warnings:

Warning level 3 was used: -Wimplicit-fallthrough=3

fs/afs/fsclient.c: In function ‘afs_deliver_fs_fetch_acl’:
fs/afs/fsclient.c:2199:19: warning: this statement may fall through [-Wimplicit-fallthrough=]
   call->unmarshall++;
   ~~~~~~~~~~~~~~~~^~
fs/afs/fsclient.c:2202:2: note: here
  case 1:
  ^~~~
fs/afs/fsclient.c:2216:19: warning: this statement may fall through [-Wimplicit-fallthrough=]
   call->unmarshall++;
   ~~~~~~~~~~~~~~~~^~
fs/afs/fsclient.c:2219:2: note: here
  case 2:
  ^~~~
fs/afs/fsclient.c:2225:19: warning: this statement may fall through [-Wimplicit-fallthrough=]
   call->unmarshall++;
   ~~~~~~~~~~~~~~~~^~
fs/afs/fsclient.c:2228:2: note: here
  case 3:
  ^~~~

This patch is part of the ongoing efforts to enable
-Wimplicit-fallthrough.

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
 fs/afs/fsclient.c | 51 +++++++++++++++++++++++++++++++++------------------
 1 file changed, 33 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 1ce73e014139..114f281f3687 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -339,8 +339,9 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
 			call->tmp_u = htonl(0);
 			afs_extract_to_tmp(call);
 		}
+		/* Fall through */
 
-		/* Fall through - and extract the returned data length */
+		/* extract the returned data length */
 	case 1:
 		_debug("extract data length");
 		ret = afs_extract_data(call, true);
@@ -366,8 +367,9 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
 		call->bvec[0].bv_page = req->pages[req->index];
 		iov_iter_bvec(&call->iter, READ, call->bvec, 1, size);
 		ASSERTCMP(size, <=, PAGE_SIZE);
+		/* Fall through */
 
-		/* Fall through - and extract the returned data */
+		/* extract the returned data */
 	case 2:
 		_debug("extract data %zu/%llu",
 		       iov_iter_count(&call->iter), req->remain);
@@ -394,8 +396,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
 		/* Discard any excess data the server gave us */
 		iov_iter_discard(&call->iter, READ, req->actual_len - req->len);
 		call->unmarshall = 3;
-
 		/* Fall through */
+
 	case 3:
 		_debug("extract discard %zu/%llu",
 		       iov_iter_count(&call->iter), req->actual_len - req->len);
@@ -407,8 +409,9 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
 	no_more_data:
 		call->unmarshall = 4;
 		afs_extract_to_buf(call, (21 + 3 + 6) * 4);
+		/* Fall through */
 
-		/* Fall through - and extract the metadata */
+		/* extract the metadata */
 	case 4:
 		ret = afs_extract_data(call, false);
 		if (ret < 0)
@@ -1471,8 +1474,9 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
 	case 0:
 		call->unmarshall++;
 		afs_extract_to_buf(call, 12 * 4);
+		/* Fall through */
 
-		/* Fall through - and extract the returned status record */
+		/* extract the returned status record */
 	case 1:
 		_debug("extract status");
 		ret = afs_extract_data(call, true);
@@ -1483,8 +1487,9 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
 		xdr_decode_AFSFetchVolumeStatus(&bp, call->out_volstatus);
 		call->unmarshall++;
 		afs_extract_to_tmp(call);
+		/* Fall through */
 
-		/* Fall through - and extract the volume name length */
+		/* extract the volume name length */
 	case 2:
 		ret = afs_extract_data(call, true);
 		if (ret < 0)
@@ -1498,8 +1503,9 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
 		size = (call->count + 3) & ~3; /* It's padded */
 		afs_extract_to_buf(call, size);
 		call->unmarshall++;
+		/* Fall through */
 
-		/* Fall through - and extract the volume name */
+		/* extract the volume name */
 	case 3:
 		_debug("extract volname");
 		ret = afs_extract_data(call, true);
@@ -1511,8 +1517,9 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
 		_debug("volname '%s'", p);
 		afs_extract_to_tmp(call);
 		call->unmarshall++;
+		/* Fall through */
 
-		/* Fall through - and extract the offline message length */
+		/* extract the offline message length */
 	case 4:
 		ret = afs_extract_data(call, true);
 		if (ret < 0)
@@ -1526,8 +1533,9 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
 		size = (call->count + 3) & ~3; /* It's padded */
 		afs_extract_to_buf(call, size);
 		call->unmarshall++;
+		/* Fall through */
 
-		/* Fall through - and extract the offline message */
+		/* extract the offline message */
 	case 5:
 		_debug("extract offline");
 		ret = afs_extract_data(call, true);
@@ -1540,8 +1548,9 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
 
 		afs_extract_to_tmp(call);
 		call->unmarshall++;
+		/* Fall through */
 
-		/* Fall through - and extract the message of the day length */
+		/* extract the message of the day length */
 	case 6:
 		ret = afs_extract_data(call, true);
 		if (ret < 0)
@@ -1555,8 +1564,9 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
 		size = (call->count + 3) & ~3; /* It's padded */
 		afs_extract_to_buf(call, size);
 		call->unmarshall++;
+		/* Fall through */
 
-		/* Fall through - and extract the message of the day */
+		/* extract the message of the day */
 	case 7:
 		_debug("extract motd");
 		ret = afs_extract_data(call, false);
@@ -1850,8 +1860,9 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call)
 	case 0:
 		afs_extract_to_tmp(call);
 		call->unmarshall++;
+		/* Fall through */
 
-		/* Fall through - and extract the capabilities word count */
+		/* Extract the capabilities word count */
 	case 1:
 		ret = afs_extract_data(call, true);
 		if (ret < 0)
@@ -1863,8 +1874,9 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call)
 		call->count2 = count;
 		iov_iter_discard(&call->iter, READ, count * sizeof(__be32));
 		call->unmarshall++;
+		/* Fall through */
 
-		/* Fall through - and extract capabilities words */
+		/* Extract capabilities words */
 	case 2:
 		ret = afs_extract_data(call, false);
 		if (ret < 0)
@@ -2020,9 +2032,9 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
 	case 0:
 		afs_extract_to_tmp(call);
 		call->unmarshall++;
+		/* Fall through */
 
 		/* Extract the file status count and array in two steps */
-		/* Fall through */
 	case 1:
 		_debug("extract status count");
 		ret = afs_extract_data(call, true);
@@ -2039,8 +2051,8 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
 		call->unmarshall++;
 	more_counts:
 		afs_extract_to_buf(call, 21 * sizeof(__be32));
-
 		/* Fall through */
+
 	case 2:
 		_debug("extract status array %u", call->count);
 		ret = afs_extract_data(call, true);
@@ -2060,9 +2072,9 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
 		call->count = 0;
 		call->unmarshall++;
 		afs_extract_to_tmp(call);
+		/* Fall through */
 
 		/* Extract the callback count and array in two steps */
-		/* Fall through */
 	case 3:
 		_debug("extract CB count");
 		ret = afs_extract_data(call, true);
@@ -2078,8 +2090,8 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
 		call->unmarshall++;
 	more_cbs:
 		afs_extract_to_buf(call, 3 * sizeof(__be32));
-
 		/* Fall through */
+
 	case 4:
 		_debug("extract CB array");
 		ret = afs_extract_data(call, true);
@@ -2096,8 +2108,8 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
 
 		afs_extract_to_buf(call, 6 * sizeof(__be32));
 		call->unmarshall++;
-
 		/* Fall through */
+
 	case 5:
 		ret = afs_extract_data(call, false);
 		if (ret < 0)
@@ -2193,6 +2205,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call)
 	case 0:
 		afs_extract_to_tmp(call);
 		call->unmarshall++;
+		/* Fall through */
 
 		/* extract the returned data length */
 	case 1:
@@ -2210,6 +2223,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call)
 		acl->size = call->count2;
 		afs_extract_begin(call, acl->data, size);
 		call->unmarshall++;
+		/* Fall through */
 
 		/* extract the returned data */
 	case 2:
@@ -2219,6 +2233,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call)
 
 		afs_extract_to_buf(call, (21 + 6) * 4);
 		call->unmarshall++;
+		/* Fall through */
 
 		/* extract the metadata */
 	case 3:
-- 
cgit v1.2.3-59-g8ed1b


From a3b46b86ca76d7f9d487e6a0b594fd1984e0796e Mon Sep 17 00:00:00 2001
From: Naohiro Aota <naohiro.aota@wdc.com>
Date: Fri, 26 Jul 2019 16:47:05 +0900
Subject: btrfs: fix extent_state leak in btrfs_lock_and_flush_ordered_range

btrfs_lock_and_flush_ordered_range() loads given "*cached_state" into
cachedp, which, in general, is NULL. Then, lock_extent_bits() updates
"cachedp", but it never goes backs to the caller. Thus the caller still
see its "cached_state" to be NULL and never free the state allocated
under btrfs_lock_and_flush_ordered_range(). As a result, we will
see massive state leak with e.g. fstests btrfs/005. Fix this bug by
properly handling the pointers.

Fixes: bd80d94efb83 ("btrfs: Always use a cached extent_state in btrfs_lock_and_flush_ordered_range")
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ordered-data.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 1744ba8b2754..ae7f64a8facb 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -985,13 +985,14 @@ void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
 					struct extent_state **cached_state)
 {
 	struct btrfs_ordered_extent *ordered;
-	struct extent_state *cachedp = NULL;
+	struct extent_state *cache = NULL;
+	struct extent_state **cachedp = &cache;
 
 	if (cached_state)
-		cachedp = *cached_state;
+		cachedp = cached_state;
 
 	while (1) {
-		lock_extent_bits(tree, start, end, &cachedp);
+		lock_extent_bits(tree, start, end, cachedp);
 		ordered = btrfs_lookup_ordered_range(inode, start,
 						     end - start + 1);
 		if (!ordered) {
@@ -1001,10 +1002,10 @@ void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
 			 * aren't exposing it outside of this function
 			 */
 			if (!cached_state)
-				refcount_dec(&cachedp->refs);
+				refcount_dec(&cache->refs);
 			break;
 		}
-		unlock_extent_cached(tree, start, end, &cachedp);
+		unlock_extent_cached(tree, start, end, cachedp);
 		btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
 		btrfs_put_ordered_extent(ordered);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 19a1c4092e7ca1ad1a72ac5535f902c483372cd5 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 24 Jul 2019 12:45:46 -0400
Subject: fix the struct mount leak in umount_tree()

	We need to drop everything we remove from the tree, whether
mnt_has_parent() is true or not.  Usually the bug manifests as a slow
memory leak (leaked struct mount for initramfs); it becomes much more
visible in mount_subtree() users, such as btrfs.  There we leak
a struct mount for btrfs superblock being mounted, which prevents
fs shutdown on subsequent umount.

Fixes: 56cbb429d911 ("switch the remnants of releasing the mountpoint away from fs_pin")
Reported-by: Nikolay Borisov <nborisov@suse.com>
Tested-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 6464ea4acba9..d28d30b13043 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1463,7 +1463,6 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 			p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
 
 		disconnect = disconnect_mount(p, how);
-
 		if (mnt_has_parent(p)) {
 			mnt_add_count(p->mnt_parent, -1);
 			if (!disconnect) {
@@ -1471,10 +1470,11 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 				list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
 			} else {
 				umount_mnt(p);
-				hlist_add_head(&p->mnt_umount, &unmounted);
 			}
 		}
 		change_mnt_propagation(p, MS_PRIVATE);
+		if (disconnect)
+			hlist_add_head(&p->mnt_umount, &unmounted);
 	}
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 543b8c468f55f27f3c0178a22a91a51aabbbc428 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Wed, 17 Jul 2019 18:31:53 -0700
Subject: f2fs: fix to read source block before invalidating it

f2fs_allocate_data_block() invalidates old block address and enable new block
address. Then, if we try to read old block by f2fs_submit_page_bio(), it will
give WARN due to reading invalid blocks.

Let's make the order sanely back.

Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/gc.c | 70 +++++++++++++++++++++++++++++-------------------------------
 1 file changed, 34 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 6691f526fa40..8974672db78f 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -796,6 +796,29 @@ static int move_data_block(struct inode *inode, block_t bidx,
 	if (lfs_mode)
 		down_write(&fio.sbi->io_order_lock);
 
+	mpage = f2fs_grab_cache_page(META_MAPPING(fio.sbi),
+					fio.old_blkaddr, false);
+	if (!mpage)
+		goto up_out;
+
+	fio.encrypted_page = mpage;
+
+	/* read source block in mpage */
+	if (!PageUptodate(mpage)) {
+		err = f2fs_submit_page_bio(&fio);
+		if (err) {
+			f2fs_put_page(mpage, 1);
+			goto up_out;
+		}
+		lock_page(mpage);
+		if (unlikely(mpage->mapping != META_MAPPING(fio.sbi) ||
+						!PageUptodate(mpage))) {
+			err = -EIO;
+			f2fs_put_page(mpage, 1);
+			goto up_out;
+		}
+	}
+
 	f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
 					&sum, CURSEG_COLD_DATA, NULL, false);
 
@@ -803,44 +826,18 @@ static int move_data_block(struct inode *inode, block_t bidx,
 				newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS);
 	if (!fio.encrypted_page) {
 		err = -ENOMEM;
-		goto recover_block;
-	}
-
-	mpage = f2fs_pagecache_get_page(META_MAPPING(fio.sbi),
-					fio.old_blkaddr, FGP_LOCK, GFP_NOFS);
-	if (mpage) {
-		bool updated = false;
-
-		if (PageUptodate(mpage)) {
-			memcpy(page_address(fio.encrypted_page),
-					page_address(mpage), PAGE_SIZE);
-			updated = true;
-		}
 		f2fs_put_page(mpage, 1);
-		invalidate_mapping_pages(META_MAPPING(fio.sbi),
-					fio.old_blkaddr, fio.old_blkaddr);
-		if (updated)
-			goto write_page;
-	}
-
-	err = f2fs_submit_page_bio(&fio);
-	if (err)
-		goto put_page_out;
-
-	/* write page */
-	lock_page(fio.encrypted_page);
-
-	if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) {
-		err = -EIO;
-		goto put_page_out;
-	}
-	if (unlikely(!PageUptodate(fio.encrypted_page))) {
-		err = -EIO;
-		goto put_page_out;
+		goto recover_block;
 	}
 
-write_page:
+	/* write target block */
 	f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true, true);
+	memcpy(page_address(fio.encrypted_page),
+				page_address(mpage), PAGE_SIZE);
+	f2fs_put_page(mpage, 1);
+	invalidate_mapping_pages(META_MAPPING(fio.sbi),
+				fio.old_blkaddr, fio.old_blkaddr);
+
 	set_page_dirty(fio.encrypted_page);
 	if (clear_page_dirty_for_io(fio.encrypted_page))
 		dec_page_count(fio.sbi, F2FS_DIRTY_META);
@@ -871,11 +868,12 @@ write_page:
 put_page_out:
 	f2fs_put_page(fio.encrypted_page, 1);
 recover_block:
-	if (lfs_mode)
-		up_write(&fio.sbi->io_order_lock);
 	if (err)
 		f2fs_do_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr,
 								true, true);
+up_out:
+	if (lfs_mode)
+		up_write(&fio.sbi->io_order_lock);
 put_out:
 	f2fs_put_dnode(&dn);
 out:
-- 
cgit v1.2.3-59-g8ed1b


From 2e616d9f9ce8d469db4cd0a019cdc2ff3feab577 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Sun, 28 Jul 2019 21:12:32 -0700
Subject: xfs: fix stack contents leakage in the v1 inumber ioctls

Explicitly initialize the onstack structures to zero so we don't leak
kernel memory into userspace when converting the in-core inumbers
structure to the v1 inogrp ioctl structure.  Add a comment about why we
have to use memset to ensure that the padding holes in the structures
are set to zero.

Fixes: 5f19c7fc6873351 ("xfs: introduce v5 inode group structure")
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
---
 fs/xfs/xfs_itable.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index a8a06bb78ea8..f5c955d35be4 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -272,6 +272,7 @@ xfs_bulkstat_to_bstat(
 	struct xfs_bstat		*bs1,
 	const struct xfs_bulkstat	*bstat)
 {
+	/* memset is needed here because of padding holes in the structure. */
 	memset(bs1, 0, sizeof(struct xfs_bstat));
 	bs1->bs_ino = bstat->bs_ino;
 	bs1->bs_mode = bstat->bs_mode;
@@ -388,6 +389,8 @@ xfs_inumbers_to_inogrp(
 	struct xfs_inogrp		*ig1,
 	const struct xfs_inumbers	*ig)
 {
+	/* memset is needed here because of padding holes in the structure. */
+	memset(ig1, 0, sizeof(struct xfs_inogrp));
 	ig1->xi_startino = ig->xi_startino;
 	ig1->xi_alloccount = ig->xi_alloccount;
 	ig1->xi_allocmask = ig->xi_allocmask;
-- 
cgit v1.2.3-59-g8ed1b


From 38fb6d0ea34299d97b031ed64fe994158b6f8eb3 Mon Sep 17 00:00:00 2001
From: Icenowy Zheng <icenowy@aosc.io>
Date: Thu, 25 Jul 2019 11:08:52 +0800
Subject: f2fs: use EINVAL for superblock with invalid magic

The kernel mount_block_root() function expects -EACESS or -EINVAL for a
unmountable filesystem when trying to mount the root with different
filesystem types.

However, in 5.3-rc1 the behavior when F2FS code cannot find valid block
changed to return -EFSCORRUPTED(-EUCLEAN), and this error code makes
mount_block_root() fail when trying to probe F2FS.

When the magic number of the superblock mismatches, it has a high
probability that it's just not a F2FS. In this case return -EINVAL seems
to be a better result, and this return value can make mount_block_root()
probing work again.

Return -EINVAL when the superblock has magic mismatch, -EFSCORRUPTED in
other cases (the magic matches but the superblock cannot be recognized).

Fixes: 10f966bbf521 ("f2fs: use generic EFSBADCRC/EFSCORRUPTED")
Signed-off-by: Icenowy Zheng <icenowy@aosc.io>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/super.c | 48 ++++++++++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index d95a681ef7c9..1838dd852a50 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -2422,6 +2422,12 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
 	size_t crc_offset = 0;
 	__u32 crc = 0;
 
+	if (le32_to_cpu(raw_super->magic) != F2FS_SUPER_MAGIC) {
+		f2fs_info(sbi, "Magic Mismatch, valid(0x%x) - read(0x%x)",
+			  F2FS_SUPER_MAGIC, le32_to_cpu(raw_super->magic));
+		return -EINVAL;
+	}
+
 	/* Check checksum_offset and crc in superblock */
 	if (__F2FS_HAS_FEATURE(raw_super, F2FS_FEATURE_SB_CHKSUM)) {
 		crc_offset = le32_to_cpu(raw_super->checksum_offset);
@@ -2429,26 +2435,20 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
 			offsetof(struct f2fs_super_block, crc)) {
 			f2fs_info(sbi, "Invalid SB checksum offset: %zu",
 				  crc_offset);
-			return 1;
+			return -EFSCORRUPTED;
 		}
 		crc = le32_to_cpu(raw_super->crc);
 		if (!f2fs_crc_valid(sbi, crc, raw_super, crc_offset)) {
 			f2fs_info(sbi, "Invalid SB checksum value: %u", crc);
-			return 1;
+			return -EFSCORRUPTED;
 		}
 	}
 
-	if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) {
-		f2fs_info(sbi, "Magic Mismatch, valid(0x%x) - read(0x%x)",
-			  F2FS_SUPER_MAGIC, le32_to_cpu(raw_super->magic));
-		return 1;
-	}
-
 	/* Currently, support only 4KB page cache size */
 	if (F2FS_BLKSIZE != PAGE_SIZE) {
 		f2fs_info(sbi, "Invalid page_cache_size (%lu), supports only 4KB",
 			  PAGE_SIZE);
-		return 1;
+		return -EFSCORRUPTED;
 	}
 
 	/* Currently, support only 4KB block size */
@@ -2456,14 +2456,14 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
 	if (blocksize != F2FS_BLKSIZE) {
 		f2fs_info(sbi, "Invalid blocksize (%u), supports only 4KB",
 			  blocksize);
-		return 1;
+		return -EFSCORRUPTED;
 	}
 
 	/* check log blocks per segment */
 	if (le32_to_cpu(raw_super->log_blocks_per_seg) != 9) {
 		f2fs_info(sbi, "Invalid log blocks per segment (%u)",
 			  le32_to_cpu(raw_super->log_blocks_per_seg));
-		return 1;
+		return -EFSCORRUPTED;
 	}
 
 	/* Currently, support 512/1024/2048/4096 bytes sector size */
@@ -2473,7 +2473,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
 				F2FS_MIN_LOG_SECTOR_SIZE) {
 		f2fs_info(sbi, "Invalid log sectorsize (%u)",
 			  le32_to_cpu(raw_super->log_sectorsize));
-		return 1;
+		return -EFSCORRUPTED;
 	}
 	if (le32_to_cpu(raw_super->log_sectors_per_block) +
 		le32_to_cpu(raw_super->log_sectorsize) !=
@@ -2481,7 +2481,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
 		f2fs_info(sbi, "Invalid log sectors per block(%u) log sectorsize(%u)",
 			  le32_to_cpu(raw_super->log_sectors_per_block),
 			  le32_to_cpu(raw_super->log_sectorsize));
-		return 1;
+		return -EFSCORRUPTED;
 	}
 
 	segment_count = le32_to_cpu(raw_super->segment_count);
@@ -2495,7 +2495,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
 	if (segment_count > F2FS_MAX_SEGMENT ||
 				segment_count < F2FS_MIN_SEGMENTS) {
 		f2fs_info(sbi, "Invalid segment count (%u)", segment_count);
-		return 1;
+		return -EFSCORRUPTED;
 	}
 
 	if (total_sections > segment_count ||
@@ -2503,25 +2503,25 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
 			segs_per_sec > segment_count || !segs_per_sec) {
 		f2fs_info(sbi, "Invalid segment/section count (%u, %u x %u)",
 			  segment_count, total_sections, segs_per_sec);
-		return 1;
+		return -EFSCORRUPTED;
 	}
 
 	if ((segment_count / segs_per_sec) < total_sections) {
 		f2fs_info(sbi, "Small segment_count (%u < %u * %u)",
 			  segment_count, segs_per_sec, total_sections);
-		return 1;
+		return -EFSCORRUPTED;
 	}
 
 	if (segment_count > (le64_to_cpu(raw_super->block_count) >> 9)) {
 		f2fs_info(sbi, "Wrong segment_count / block_count (%u > %llu)",
 			  segment_count, le64_to_cpu(raw_super->block_count));
-		return 1;
+		return -EFSCORRUPTED;
 	}
 
 	if (secs_per_zone > total_sections || !secs_per_zone) {
 		f2fs_info(sbi, "Wrong secs_per_zone / total_sections (%u, %u)",
 			  secs_per_zone, total_sections);
-		return 1;
+		return -EFSCORRUPTED;
 	}
 	if (le32_to_cpu(raw_super->extension_count) > F2FS_MAX_EXTENSION ||
 			raw_super->hot_ext_count > F2FS_MAX_EXTENSION ||
@@ -2531,7 +2531,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
 			  le32_to_cpu(raw_super->extension_count),
 			  raw_super->hot_ext_count,
 			  F2FS_MAX_EXTENSION);
-		return 1;
+		return -EFSCORRUPTED;
 	}
 
 	if (le32_to_cpu(raw_super->cp_payload) >
@@ -2539,7 +2539,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
 		f2fs_info(sbi, "Insane cp_payload (%u > %u)",
 			  le32_to_cpu(raw_super->cp_payload),
 			  blocks_per_seg - F2FS_CP_PACKS);
-		return 1;
+		return -EFSCORRUPTED;
 	}
 
 	/* check reserved ino info */
@@ -2550,12 +2550,12 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
 			  le32_to_cpu(raw_super->node_ino),
 			  le32_to_cpu(raw_super->meta_ino),
 			  le32_to_cpu(raw_super->root_ino));
-		return 1;
+		return -EFSCORRUPTED;
 	}
 
 	/* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */
 	if (sanity_check_area_boundary(sbi, bh))
-		return 1;
+		return -EFSCORRUPTED;
 
 	return 0;
 }
@@ -2872,10 +2872,10 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi,
 		}
 
 		/* sanity checking of raw super */
-		if (sanity_check_raw_super(sbi, bh)) {
+		err = sanity_check_raw_super(sbi, bh);
+		if (err) {
 			f2fs_err(sbi, "Can't find valid F2FS filesystem in %dth superblock",
 				 block + 1);
-			err = -EFSCORRUPTED;
 			brelse(bh);
 			continue;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From 61c30c98ef17e5a330d7bb8494b78b3d6dffe9b8 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 29 Jul 2019 13:57:49 +0200
Subject: dax: Fix missed wakeup in put_unlocked_entry()

The condition checking whether put_unlocked_entry() needs to wake up
following waiter got broken by commit 23c84eb78375 ("dax: Fix missed
wakeup with PMD faults"). We need to wake the waiter whenever the passed
entry is valid (i.e., non-NULL and not special conflict entry). This
could lead to processes never being woken up when waiting for entry
lock. Fix the condition.

Cc: <stable@vger.kernel.org>
Link: http://lore.kernel.org/r/20190729120228.GC17833@quack2.suse.cz
Fixes: 23c84eb78375 ("dax: Fix missed wakeup with PMD faults")
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dax.c b/fs/dax.c
index a237141d8787..b64964ef44f6 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -266,7 +266,7 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry)
 static void put_unlocked_entry(struct xa_state *xas, void *entry)
 {
 	/* If we were the only waiter woken, wake the next one */
-	if (entry && dax_is_conflict(entry))
+	if (entry && !dax_is_conflict(entry))
 		dax_wake_entry(xas, entry, false);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From b4f9a1a87a48c255bb90d8a6c3d555a1abb88130 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Wed, 17 Jul 2019 13:23:39 +0100
Subject: Btrfs: fix incremental send failure after deduplication

When doing an incremental send operation we can fail if we previously did
deduplication operations against a file that exists in both snapshots. In
that case we will fail the send operation with -EIO and print a message
to dmesg/syslog like the following:

  BTRFS error (device sdc): Send: inconsistent snapshot, found updated \
  extent for inode 257 without updated inode item, send root is 258, \
  parent root is 257

This requires that we deduplicate to the same file in both snapshots for
the same amount of times on each snapshot. The issue happens because a
deduplication only updates the iversion of an inode and does not update
any other field of the inode, therefore if we deduplicate the file on
each snapshot for the same amount of time, the inode will have the same
iversion value (stored as the "sequence" field on the inode item) on both
snapshots, therefore it will be seen as unchanged between in the send
snapshot while there are new/updated/deleted extent items when comparing
to the parent snapshot. This makes the send operation return -EIO and
print an error message.

Example reproducer:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  # Create our first file. The first half of the file has several 64Kb
  # extents while the second half as a single 512Kb extent.
  $ xfs_io -f -s -c "pwrite -S 0xb8 -b 64K 0 512K" /mnt/foo
  $ xfs_io -c "pwrite -S 0xb8 512K 512K" /mnt/foo

  # Create the base snapshot and the parent send stream from it.
  $ btrfs subvolume snapshot -r /mnt /mnt/mysnap1
  $ btrfs send -f /tmp/1.snap /mnt/mysnap1

  # Create our second file, that has exactly the same data as the first
  # file.
  $ xfs_io -f -c "pwrite -S 0xb8 0 1M" /mnt/bar

  # Create the second snapshot, used for the incremental send, before
  # doing the file deduplication.
  $ btrfs subvolume snapshot -r /mnt /mnt/mysnap2

  # Now before creating the incremental send stream:
  #
  # 1) Deduplicate into a subrange of file foo in snapshot mysnap1. This
  #    will drop several extent items and add a new one, also updating
  #    the inode's iversion (sequence field in inode item) by 1, but not
  #    any other field of the inode;
  #
  # 2) Deduplicate into a different subrange of file foo in snapshot
  #    mysnap2. This will replace an extent item with a new one, also
  #    updating the inode's iversion by 1 but not any other field of the
  #    inode.
  #
  # After these two deduplication operations, the inode items, for file
  # foo, are identical in both snapshots, but we have different extent
  # items for this inode in both snapshots. We want to check this doesn't
  # cause send to fail with an error or produce an incorrect stream.

  $ xfs_io -r -c "dedupe /mnt/bar 0 0 512K" /mnt/mysnap1/foo
  $ xfs_io -r -c "dedupe /mnt/bar 512K 512K 512K" /mnt/mysnap2/foo

  # Create the incremental send stream.
  $ btrfs send -p /mnt/mysnap1 -f /tmp/2.snap /mnt/mysnap2
  ERROR: send ioctl failed with -5: Input/output error

This issue started happening back in 2015 when deduplication was updated
to not update the inode's ctime and mtime and update only the iversion.
Back then we would hit a BUG_ON() in send, but later in 2016 send was
updated to return -EIO and print the error message instead of doing the
BUG_ON().

A test case for fstests follows soon.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=203933
Fixes: 1c919a5e13702c ("btrfs: don't update mtime/ctime on deduped inodes")
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/send.c | 77 +++++++++++----------------------------------------------
 1 file changed, 15 insertions(+), 62 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 69b59bf75882..c3c0c064c25d 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -6322,68 +6322,21 @@ static int changed_extent(struct send_ctx *sctx,
 {
 	int ret = 0;
 
-	if (sctx->cur_ino != sctx->cmp_key->objectid) {
-
-		if (result == BTRFS_COMPARE_TREE_CHANGED) {
-			struct extent_buffer *leaf_l;
-			struct extent_buffer *leaf_r;
-			struct btrfs_file_extent_item *ei_l;
-			struct btrfs_file_extent_item *ei_r;
-
-			leaf_l = sctx->left_path->nodes[0];
-			leaf_r = sctx->right_path->nodes[0];
-			ei_l = btrfs_item_ptr(leaf_l,
-					      sctx->left_path->slots[0],
-					      struct btrfs_file_extent_item);
-			ei_r = btrfs_item_ptr(leaf_r,
-					      sctx->right_path->slots[0],
-					      struct btrfs_file_extent_item);
-
-			/*
-			 * We may have found an extent item that has changed
-			 * only its disk_bytenr field and the corresponding
-			 * inode item was not updated. This case happens due to
-			 * very specific timings during relocation when a leaf
-			 * that contains file extent items is COWed while
-			 * relocation is ongoing and its in the stage where it
-			 * updates data pointers. So when this happens we can
-			 * safely ignore it since we know it's the same extent,
-			 * but just at different logical and physical locations
-			 * (when an extent is fully replaced with a new one, we
-			 * know the generation number must have changed too,
-			 * since snapshot creation implies committing the current
-			 * transaction, and the inode item must have been updated
-			 * as well).
-			 * This replacement of the disk_bytenr happens at
-			 * relocation.c:replace_file_extents() through
-			 * relocation.c:btrfs_reloc_cow_block().
-			 */
-			if (btrfs_file_extent_generation(leaf_l, ei_l) ==
-			    btrfs_file_extent_generation(leaf_r, ei_r) &&
-			    btrfs_file_extent_ram_bytes(leaf_l, ei_l) ==
-			    btrfs_file_extent_ram_bytes(leaf_r, ei_r) &&
-			    btrfs_file_extent_compression(leaf_l, ei_l) ==
-			    btrfs_file_extent_compression(leaf_r, ei_r) &&
-			    btrfs_file_extent_encryption(leaf_l, ei_l) ==
-			    btrfs_file_extent_encryption(leaf_r, ei_r) &&
-			    btrfs_file_extent_other_encoding(leaf_l, ei_l) ==
-			    btrfs_file_extent_other_encoding(leaf_r, ei_r) &&
-			    btrfs_file_extent_type(leaf_l, ei_l) ==
-			    btrfs_file_extent_type(leaf_r, ei_r) &&
-			    btrfs_file_extent_disk_bytenr(leaf_l, ei_l) !=
-			    btrfs_file_extent_disk_bytenr(leaf_r, ei_r) &&
-			    btrfs_file_extent_disk_num_bytes(leaf_l, ei_l) ==
-			    btrfs_file_extent_disk_num_bytes(leaf_r, ei_r) &&
-			    btrfs_file_extent_offset(leaf_l, ei_l) ==
-			    btrfs_file_extent_offset(leaf_r, ei_r) &&
-			    btrfs_file_extent_num_bytes(leaf_l, ei_l) ==
-			    btrfs_file_extent_num_bytes(leaf_r, ei_r))
-				return 0;
-		}
-
-		inconsistent_snapshot_error(sctx, result, "extent");
-		return -EIO;
-	}
+	/*
+	 * We have found an extent item that changed without the inode item
+	 * having changed. This can happen either after relocation (where the
+	 * disk_bytenr of an extent item is replaced at
+	 * relocation.c:replace_file_extents()) or after deduplication into a
+	 * file in both the parent and send snapshots (where an extent item can
+	 * get modified or replaced with a new one). Note that deduplication
+	 * updates the inode item, but it only changes the iversion (sequence
+	 * field in the inode item) of the inode, so if a file is deduplicated
+	 * the same amount of times in both the parent and send snapshots, its
+	 * iversion becames the same in both snapshots, whence the inode item is
+	 * the same on both snapshots.
+	 */
+	if (sctx->cur_ino != sctx->cmp_key->objectid)
+		return 0;
 
 	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
 		if (result != BTRFS_COMPARE_TREE_DELETED)
-- 
cgit v1.2.3-59-g8ed1b


From cb2d3daddbfb6318d170e79aac1f7d5e4d49f0d7 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 25 Jul 2019 11:27:04 +0100
Subject: Btrfs: fix race leading to fs corruption after transaction abort

When one transaction is finishing its commit, it is possible for another
transaction to start and enter its initial commit phase as well. If the
first ends up getting aborted, we have a small time window where the second
transaction commit does not notice that the previous transaction aborted
and ends up committing, writing a superblock that points to btrees that
reference extent buffers (nodes and leafs) that were not persisted to disk.
The consequence is that after mounting the filesystem again, we will be
unable to load some btree nodes/leafs, either because the content on disk
is either garbage (or just zeroes) or corresponds to the old content of a
previouly COWed or deleted node/leaf, resulting in the well known error
messages "parent transid verify failed on ...".
The following sequence diagram illustrates how this can happen.

        CPU 1                                           CPU 2

 <at transaction N>

 btrfs_commit_transaction()
   (...)
   --> sets transaction state to
       TRANS_STATE_UNBLOCKED
   --> sets fs_info->running_transaction
       to NULL

                                                    (...)
                                                    btrfs_start_transaction()
                                                      start_transaction()
                                                        wait_current_trans()
                                                          --> returns immediately
                                                              because
                                                              fs_info->running_transaction
                                                              is NULL
                                                        join_transaction()
                                                          --> creates transaction N + 1
                                                          --> sets
                                                              fs_info->running_transaction
                                                              to transaction N + 1
                                                          --> adds transaction N + 1 to
                                                              the fs_info->trans_list list
                                                        --> returns transaction handle
                                                            pointing to the new
                                                            transaction N + 1
                                                    (...)

                                                    btrfs_sync_file()
                                                      btrfs_start_transaction()
                                                        --> returns handle to
                                                            transaction N + 1
                                                      (...)

   btrfs_write_and_wait_transaction()
     --> writeback of some extent
         buffer fails, returns an
	 error
   btrfs_handle_fs_error()
     --> sets BTRFS_FS_STATE_ERROR in
         fs_info->fs_state
   --> jumps to label "scrub_continue"
   cleanup_transaction()
     btrfs_abort_transaction(N)
       --> sets BTRFS_FS_STATE_TRANS_ABORTED
           flag in fs_info->fs_state
       --> sets aborted field in the
           transaction and transaction
	   handle structures, for
           transaction N only
     --> removes transaction from the
         list fs_info->trans_list
                                                      btrfs_commit_transaction(N + 1)
                                                        --> transaction N + 1 was not
							    aborted, so it proceeds
                                                        (...)
                                                        --> sets the transaction's state
                                                            to TRANS_STATE_COMMIT_START
                                                        --> does not find the previous
                                                            transaction (N) in the
                                                            fs_info->trans_list, so it
                                                            doesn't know that transaction
                                                            was aborted, and the commit
                                                            of transaction N + 1 proceeds
                                                        (...)
                                                        --> sets transaction N + 1 state
                                                            to TRANS_STATE_UNBLOCKED
                                                        btrfs_write_and_wait_transaction()
                                                          --> succeeds writing all extent
                                                              buffers created in the
                                                              transaction N + 1
                                                        write_all_supers()
                                                           --> succeeds
                                                           --> we now have a superblock on
                                                               disk that points to trees
                                                               that refer to at least one
                                                               extent buffer that was
                                                               never persisted

So fix this by updating the transaction commit path to check if the flag
BTRFS_FS_STATE_TRANS_ABORTED is set on fs_info->fs_state if after setting
the transaction to the TRANS_STATE_COMMIT_START we do not find any previous
transaction in the fs_info->trans_list. If the flag is set, just fail the
transaction commit with -EROFS, as we do in other places. The exact error
code for the previous transaction abort was already logged and reported.

Fixes: 49b25e0540904b ("btrfs: enhance transaction abort infrastructure")
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/transaction.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 3b8ae1a8f02d..39b7bcde3c6f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -2037,6 +2037,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 		}
 	} else {
 		spin_unlock(&fs_info->trans_lock);
+		/*
+		 * The previous transaction was aborted and was already removed
+		 * from the list of transactions at fs_info->trans_list. So we
+		 * abort to prevent writing a new superblock that reflects a
+		 * corrupt state (pointing to trees with unwritten nodes/leafs).
+		 */
+		if (test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) {
+			ret = -EROFS;
+			goto cleanup_transaction;
+		}
 	}
 
 	extwriter_counter_dec(cur_trans, trans->type);
-- 
cgit v1.2.3-59-g8ed1b


From a6d155d2e363f26290ffd50591169cb96c2a609e Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 29 Jul 2019 09:37:10 +0100
Subject: Btrfs: fix deadlock between fiemap and transaction commits

The fiemap handler locks a file range that can have unflushed delalloc,
and after locking the range, it tries to attach to a running transaction.
If the running transaction started its commit, that is, it is in state
TRANS_STATE_COMMIT_START, and either the filesystem was mounted with the
flushoncommit option or the transaction is creating a snapshot for the
subvolume that contains the file that fiemap is operating on, we end up
deadlocking. This happens because fiemap is blocked on the transaction,
waiting for it to complete, and the transaction is waiting for the flushed
dealloc to complete, which requires locking the file range that the fiemap
task already locked. The following stack traces serve as an example of
when this deadlock happens:

  (...)
  [404571.515510] Workqueue: btrfs-endio-write btrfs_endio_write_helper [btrfs]
  [404571.515956] Call Trace:
  [404571.516360]  ? __schedule+0x3ae/0x7b0
  [404571.516730]  schedule+0x3a/0xb0
  [404571.517104]  lock_extent_bits+0x1ec/0x2a0 [btrfs]
  [404571.517465]  ? remove_wait_queue+0x60/0x60
  [404571.517832]  btrfs_finish_ordered_io+0x292/0x800 [btrfs]
  [404571.518202]  normal_work_helper+0xea/0x530 [btrfs]
  [404571.518566]  process_one_work+0x21e/0x5c0
  [404571.518990]  worker_thread+0x4f/0x3b0
  [404571.519413]  ? process_one_work+0x5c0/0x5c0
  [404571.519829]  kthread+0x103/0x140
  [404571.520191]  ? kthread_create_worker_on_cpu+0x70/0x70
  [404571.520565]  ret_from_fork+0x3a/0x50
  [404571.520915] kworker/u8:6    D    0 31651      2 0x80004000
  [404571.521290] Workqueue: btrfs-flush_delalloc btrfs_flush_delalloc_helper [btrfs]
  (...)
  [404571.537000] fsstress        D    0 13117  13115 0x00004000
  [404571.537263] Call Trace:
  [404571.537524]  ? __schedule+0x3ae/0x7b0
  [404571.537788]  schedule+0x3a/0xb0
  [404571.538066]  wait_current_trans+0xc8/0x100 [btrfs]
  [404571.538349]  ? remove_wait_queue+0x60/0x60
  [404571.538680]  start_transaction+0x33c/0x500 [btrfs]
  [404571.539076]  btrfs_check_shared+0xa3/0x1f0 [btrfs]
  [404571.539513]  ? extent_fiemap+0x2ce/0x650 [btrfs]
  [404571.539866]  extent_fiemap+0x2ce/0x650 [btrfs]
  [404571.540170]  do_vfs_ioctl+0x526/0x6f0
  [404571.540436]  ksys_ioctl+0x70/0x80
  [404571.540734]  __x64_sys_ioctl+0x16/0x20
  [404571.540997]  do_syscall_64+0x60/0x1d0
  [404571.541279]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
  (...)
  [404571.543729] btrfs           D    0 14210  14208 0x00004000
  [404571.544023] Call Trace:
  [404571.544275]  ? __schedule+0x3ae/0x7b0
  [404571.544526]  ? wait_for_completion+0x112/0x1a0
  [404571.544795]  schedule+0x3a/0xb0
  [404571.545064]  schedule_timeout+0x1ff/0x390
  [404571.545351]  ? lock_acquire+0xa6/0x190
  [404571.545638]  ? wait_for_completion+0x49/0x1a0
  [404571.545890]  ? wait_for_completion+0x112/0x1a0
  [404571.546228]  wait_for_completion+0x131/0x1a0
  [404571.546503]  ? wake_up_q+0x70/0x70
  [404571.546775]  btrfs_wait_ordered_extents+0x27c/0x400 [btrfs]
  [404571.547159]  btrfs_commit_transaction+0x3b0/0xae0 [btrfs]
  [404571.547449]  ? btrfs_mksubvol+0x4a4/0x640 [btrfs]
  [404571.547703]  ? remove_wait_queue+0x60/0x60
  [404571.547969]  btrfs_mksubvol+0x605/0x640 [btrfs]
  [404571.548226]  ? __sb_start_write+0xd4/0x1c0
  [404571.548512]  ? mnt_want_write_file+0x24/0x50
  [404571.548789]  btrfs_ioctl_snap_create_transid+0x169/0x1a0 [btrfs]
  [404571.549048]  btrfs_ioctl_snap_create_v2+0x11d/0x170 [btrfs]
  [404571.549307]  btrfs_ioctl+0x133f/0x3150 [btrfs]
  [404571.549549]  ? mem_cgroup_charge_statistics+0x4c/0xd0
  [404571.549792]  ? mem_cgroup_commit_charge+0x84/0x4b0
  [404571.550064]  ? __handle_mm_fault+0xe3e/0x11f0
  [404571.550306]  ? do_raw_spin_unlock+0x49/0xc0
  [404571.550608]  ? _raw_spin_unlock+0x24/0x30
  [404571.550976]  ? __handle_mm_fault+0xedf/0x11f0
  [404571.551319]  ? do_vfs_ioctl+0xa2/0x6f0
  [404571.551659]  ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
  [404571.552087]  do_vfs_ioctl+0xa2/0x6f0
  [404571.552355]  ksys_ioctl+0x70/0x80
  [404571.552621]  __x64_sys_ioctl+0x16/0x20
  [404571.552864]  do_syscall_64+0x60/0x1d0
  [404571.553104]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
  (...)

If we were joining the transaction instead of attaching to it, we would
not risk a deadlock because a join only blocks if the transaction is in a
state greater then or equals to TRANS_STATE_COMMIT_DOING, and the delalloc
flush performed by a transaction is done before it reaches that state,
when it is in the state TRANS_STATE_COMMIT_START. However a transaction
join is intended for use cases where we do modify the filesystem, and
fiemap only needs to peek at delayed references from the current
transaction in order to determine if extents are shared, and, besides
that, when there is no current transaction or when it blocks to wait for
a current committing transaction to complete, it creates a new transaction
without reserving any space. Such unnecessary transactions, besides doing
unnecessary IO, can cause transaction aborts (-ENOSPC) and unnecessary
rotation of the precious backup roots.

So fix this by adding a new transaction join variant, named join_nostart,
which behaves like the regular join, but it does not create a transaction
when none currently exists or after waiting for a committing transaction
to complete.

Fixes: 03628cdbc64db6 ("Btrfs: do not start a transaction during fiemap")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c     |  2 +-
 fs/btrfs/transaction.c | 22 ++++++++++++++++++----
 fs/btrfs/transaction.h |  3 +++
 3 files changed, 22 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 89116afda7a2..e5d85311d5d5 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1483,7 +1483,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
 	ulist_init(roots);
 	ulist_init(tmp);
 
-	trans = btrfs_attach_transaction(root);
+	trans = btrfs_join_transaction_nostart(root);
 	if (IS_ERR(trans)) {
 		if (PTR_ERR(trans) != -ENOENT && PTR_ERR(trans) != -EROFS) {
 			ret = PTR_ERR(trans);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 39b7bcde3c6f..e3adb714c04b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -28,15 +28,18 @@ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
 	[TRANS_STATE_COMMIT_START]	= (__TRANS_START | __TRANS_ATTACH),
 	[TRANS_STATE_COMMIT_DOING]	= (__TRANS_START |
 					   __TRANS_ATTACH |
-					   __TRANS_JOIN),
+					   __TRANS_JOIN |
+					   __TRANS_JOIN_NOSTART),
 	[TRANS_STATE_UNBLOCKED]		= (__TRANS_START |
 					   __TRANS_ATTACH |
 					   __TRANS_JOIN |
-					   __TRANS_JOIN_NOLOCK),
+					   __TRANS_JOIN_NOLOCK |
+					   __TRANS_JOIN_NOSTART),
 	[TRANS_STATE_COMPLETED]		= (__TRANS_START |
 					   __TRANS_ATTACH |
 					   __TRANS_JOIN |
-					   __TRANS_JOIN_NOLOCK),
+					   __TRANS_JOIN_NOLOCK |
+					   __TRANS_JOIN_NOSTART),
 };
 
 void btrfs_put_transaction(struct btrfs_transaction *transaction)
@@ -543,7 +546,8 @@ again:
 		ret = join_transaction(fs_info, type);
 		if (ret == -EBUSY) {
 			wait_current_trans(fs_info);
-			if (unlikely(type == TRANS_ATTACH))
+			if (unlikely(type == TRANS_ATTACH ||
+				     type == TRANS_JOIN_NOSTART))
 				ret = -ENOENT;
 		}
 	} while (ret == -EBUSY);
@@ -659,6 +663,16 @@ struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root
 				 BTRFS_RESERVE_NO_FLUSH, true);
 }
 
+/*
+ * Similar to regular join but it never starts a transaction when none is
+ * running or after waiting for the current one to finish.
+ */
+struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root)
+{
+	return start_transaction(root, 0, TRANS_JOIN_NOSTART,
+				 BTRFS_RESERVE_NO_FLUSH, true);
+}
+
 /*
  * btrfs_attach_transaction() - catch the running transaction
  *
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 527ea94b57d9..2c5a6f6e5bb0 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -94,11 +94,13 @@ struct btrfs_transaction {
 #define __TRANS_JOIN		(1U << 11)
 #define __TRANS_JOIN_NOLOCK	(1U << 12)
 #define __TRANS_DUMMY		(1U << 13)
+#define __TRANS_JOIN_NOSTART	(1U << 14)
 
 #define TRANS_START		(__TRANS_START | __TRANS_FREEZABLE)
 #define TRANS_ATTACH		(__TRANS_ATTACH)
 #define TRANS_JOIN		(__TRANS_JOIN | __TRANS_FREEZABLE)
 #define TRANS_JOIN_NOLOCK	(__TRANS_JOIN_NOLOCK)
+#define TRANS_JOIN_NOSTART	(__TRANS_JOIN_NOSTART)
 
 #define TRANS_EXTWRITERS	(__TRANS_START | __TRANS_ATTACH)
 
@@ -183,6 +185,7 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
 					int min_factor);
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
 					struct btrfs_root *root);
-- 
cgit v1.2.3-59-g8ed1b


From afa1d96d1430c2138c545fb76e6dcb21222098d4 Mon Sep 17 00:00:00 2001
From: Jia-Ju Bai <baijiaju1990@gmail.com>
Date: Tue, 30 Jul 2019 11:28:20 -0700
Subject: xfs: Fix possible null-pointer dereferences in
 xchk_da_btree_block_check_sibling()

In xchk_da_btree_block_check_sibling(), there is an if statement on
line 274 to check whether ds->state->altpath.blk[level].bp is NULL:
    if (ds->state->altpath.blk[level].bp)

When ds->state->altpath.blk[level].bp is NULL, it is used on line 281:
    xfs_trans_brelse(..., ds->state->altpath.blk[level].bp);
        struct xfs_buf_log_item *bip = bp->b_log_item;
        ASSERT(bp->b_transp == tp);

Thus, possible null-pointer dereferences may occur.

To fix these bugs, ds->state->altpath.blk[level].bp is checked before
being used.

These bugs are found by a static analysis tool STCheck written by us.

Signed-off-by: Jia-Ju Bai <baijiaju1990@gmail.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/scrub/dabtree.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index 94c4f1de1922..77ff9f97bcda 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -278,7 +278,11 @@ xchk_da_btree_block_check_sibling(
 	/* Compare upper level pointer to sibling pointer. */
 	if (ds->state->altpath.blk[level].blkno != sibling)
 		xchk_da_set_corrupt(ds, level);
-	xfs_trans_brelse(ds->dargs.trans, ds->state->altpath.blk[level].bp);
+	if (ds->state->altpath.blk[level].bp) {
+		xfs_trans_brelse(ds->dargs.trans,
+				ds->state->altpath.blk[level].bp);
+		ds->state->altpath.blk[level].bp = NULL;
+	}
 out:
 	return error;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 89e524c04fa966330e2e80ab2bc50b9944c5847a Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 30 Jul 2019 13:10:14 +0200
Subject: loop: Fix mount(2) failure due to race with LOOP_SET_FD

Commit 33ec3e53e7b1 ("loop: Don't change loop device under exclusive
opener") made LOOP_SET_FD ioctl acquire exclusive block device reference
while it updates loop device binding. However this can make perfectly
valid mount(2) fail with EBUSY due to racing LOOP_SET_FD holding
temporarily the exclusive bdev reference in cases like this:

for i in {a..z}{a..z}; do
        dd if=/dev/zero of=$i.image bs=1k count=0 seek=1024
        mkfs.ext2 $i.image
        mkdir mnt$i
done

echo "Run"
for i in {a..z}{a..z}; do
        mount -o loop -t ext2 $i.image mnt$i &
done

Fix the problem by not getting full exclusive bdev reference in
LOOP_SET_FD but instead just mark the bdev as being claimed while we
update the binding information. This just blocks new exclusive openers
instead of failing them with EBUSY thus fixing the problem.

Fixes: 33ec3e53e7b1 ("loop: Don't change loop device under exclusive opener")
Cc: stable@vger.kernel.org
Tested-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 16 +++++-----
 fs/block_dev.c       | 83 ++++++++++++++++++++++++++++++++++++----------------
 include/linux/fs.h   |  6 ++++
 3 files changed, 73 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 44c9985f352a..3036883fc9f8 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -924,6 +924,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	struct file	*file;
 	struct inode	*inode;
 	struct address_space *mapping;
+	struct block_device *claimed_bdev = NULL;
 	int		lo_flags = 0;
 	int		error;
 	loff_t		size;
@@ -942,10 +943,11 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	 * here to avoid changing device under exclusive owner.
 	 */
 	if (!(mode & FMODE_EXCL)) {
-		bdgrab(bdev);
-		error = blkdev_get(bdev, mode | FMODE_EXCL, loop_set_fd);
-		if (error)
+		claimed_bdev = bd_start_claiming(bdev, loop_set_fd);
+		if (IS_ERR(claimed_bdev)) {
+			error = PTR_ERR(claimed_bdev);
 			goto out_putf;
+		}
 	}
 
 	error = mutex_lock_killable(&loop_ctl_mutex);
@@ -1015,15 +1017,15 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	mutex_unlock(&loop_ctl_mutex);
 	if (partscan)
 		loop_reread_partitions(lo, bdev);
-	if (!(mode & FMODE_EXCL))
-		blkdev_put(bdev, mode | FMODE_EXCL);
+	if (claimed_bdev)
+		bd_abort_claiming(bdev, claimed_bdev, loop_set_fd);
 	return 0;
 
 out_unlock:
 	mutex_unlock(&loop_ctl_mutex);
 out_bdev:
-	if (!(mode & FMODE_EXCL))
-		blkdev_put(bdev, mode | FMODE_EXCL);
+	if (claimed_bdev)
+		bd_abort_claiming(bdev, claimed_bdev, loop_set_fd);
 out_putf:
 	fput(file);
 out:
diff --git a/fs/block_dev.c b/fs/block_dev.c
index c2a85b587922..22591bad9353 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1181,8 +1181,7 @@ static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno)
  * Pointer to the block device containing @bdev on success, ERR_PTR()
  * value on failure.
  */
-static struct block_device *bd_start_claiming(struct block_device *bdev,
-					      void *holder)
+struct block_device *bd_start_claiming(struct block_device *bdev, void *holder)
 {
 	struct gendisk *disk;
 	struct block_device *whole;
@@ -1229,6 +1228,62 @@ static struct block_device *bd_start_claiming(struct block_device *bdev,
 		return ERR_PTR(err);
 	}
 }
+EXPORT_SYMBOL(bd_start_claiming);
+
+static void bd_clear_claiming(struct block_device *whole, void *holder)
+{
+	lockdep_assert_held(&bdev_lock);
+	/* tell others that we're done */
+	BUG_ON(whole->bd_claiming != holder);
+	whole->bd_claiming = NULL;
+	wake_up_bit(&whole->bd_claiming, 0);
+}
+
+/**
+ * bd_finish_claiming - finish claiming of a block device
+ * @bdev: block device of interest
+ * @whole: whole block device (returned from bd_start_claiming())
+ * @holder: holder that has claimed @bdev
+ *
+ * Finish exclusive open of a block device. Mark the device as exlusively
+ * open by the holder and wake up all waiters for exclusive open to finish.
+ */
+void bd_finish_claiming(struct block_device *bdev, struct block_device *whole,
+			void *holder)
+{
+	spin_lock(&bdev_lock);
+	BUG_ON(!bd_may_claim(bdev, whole, holder));
+	/*
+	 * Note that for a whole device bd_holders will be incremented twice,
+	 * and bd_holder will be set to bd_may_claim before being set to holder
+	 */
+	whole->bd_holders++;
+	whole->bd_holder = bd_may_claim;
+	bdev->bd_holders++;
+	bdev->bd_holder = holder;
+	bd_clear_claiming(whole, holder);
+	spin_unlock(&bdev_lock);
+}
+EXPORT_SYMBOL(bd_finish_claiming);
+
+/**
+ * bd_abort_claiming - abort claiming of a block device
+ * @bdev: block device of interest
+ * @whole: whole block device (returned from bd_start_claiming())
+ * @holder: holder that has claimed @bdev
+ *
+ * Abort claiming of a block device when the exclusive open failed. This can be
+ * also used when exclusive open is not actually desired and we just needed
+ * to block other exclusive openers for a while.
+ */
+void bd_abort_claiming(struct block_device *bdev, struct block_device *whole,
+		       void *holder)
+{
+	spin_lock(&bdev_lock);
+	bd_clear_claiming(whole, holder);
+	spin_unlock(&bdev_lock);
+}
+EXPORT_SYMBOL(bd_abort_claiming);
 
 #ifdef CONFIG_SYSFS
 struct bd_holder_disk {
@@ -1698,29 +1753,7 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
 
 		/* finish claiming */
 		mutex_lock(&bdev->bd_mutex);
-		spin_lock(&bdev_lock);
-
-		if (!res) {
-			BUG_ON(!bd_may_claim(bdev, whole, holder));
-			/*
-			 * Note that for a whole device bd_holders
-			 * will be incremented twice, and bd_holder
-			 * will be set to bd_may_claim before being
-			 * set to holder
-			 */
-			whole->bd_holders++;
-			whole->bd_holder = bd_may_claim;
-			bdev->bd_holders++;
-			bdev->bd_holder = holder;
-		}
-
-		/* tell others that we're done */
-		BUG_ON(whole->bd_claiming != holder);
-		whole->bd_claiming = NULL;
-		wake_up_bit(&whole->bd_claiming, 0);
-
-		spin_unlock(&bdev_lock);
-
+		bd_finish_claiming(bdev, whole, holder);
 		/*
 		 * Block event polling for write claims if requested.  Any
 		 * write holder makes the write_holder state stick until
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 56b8e358af5c..997a530ff4e9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2598,6 +2598,12 @@ extern struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
 					       void *holder);
 extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode,
 					      void *holder);
+extern struct block_device *bd_start_claiming(struct block_device *bdev,
+					      void *holder);
+extern void bd_finish_claiming(struct block_device *bdev,
+			       struct block_device *whole, void *holder);
+extern void bd_abort_claiming(struct block_device *bdev,
+			      struct block_device *whole, void *holder);
 extern void blkdev_put(struct block_device *bdev, fmode_t mode);
 extern int __blkdev_reread_part(struct block_device *bdev);
 extern int blkdev_reread_part(struct block_device *bdev);
-- 
cgit v1.2.3-59-g8ed1b


From 055d88242a6046a1ceac3167290f054c72571cd9 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 30 Jul 2019 21:25:20 +0200
Subject: compat_ioctl: pppoe: fix PPPOEIOCSFWD handling

Support for handling the PPPOEIOCSFWD ioctl in compat mode was added in
linux-2.5.69 along with hundreds of other commands, but was always broken
sincen only the structure is compatible, but the command number is not,
due to the size being sizeof(size_t), or at first sizeof(sizeof((struct
sockaddr_pppox)), which is different on 64-bit architectures.

Guillaume Nault adds:

  And the implementation was broken until 2016 (see 29e73269aa4d ("pppoe:
  fix reference counting in PPPoE proxy")), and nobody ever noticed. I
  should probably have removed this ioctl entirely instead of fixing it.
  Clearly, it has never been used.

Fix it by adding a compat_ioctl handler for all pppoe variants that
translates the command number and then calls the regular ioctl function.

All other ioctl commands handled by pppoe are compatible between 32-bit
and 64-bit, and require compat_ptr() conversion.

This should apply to all stable kernels.

Acked-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ppp/pppoe.c  |  3 +++
 drivers/net/ppp/pppox.c  | 13 +++++++++++++
 drivers/net/ppp/pptp.c   |  3 +++
 fs/compat_ioctl.c        |  3 ---
 include/linux/if_pppox.h |  3 +++
 net/l2tp/l2tp_ppp.c      |  3 +++
 6 files changed, 25 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c
index 1d902ecb4aa8..a44dd3c8af63 100644
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -1115,6 +1115,9 @@ static const struct proto_ops pppoe_ops = {
 	.recvmsg	= pppoe_recvmsg,
 	.mmap		= sock_no_mmap,
 	.ioctl		= pppox_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= pppox_compat_ioctl,
+#endif
 };
 
 static const struct pppox_proto pppoe_proto = {
diff --git a/drivers/net/ppp/pppox.c b/drivers/net/ppp/pppox.c
index 5ef422a43d70..08364f10a43f 100644
--- a/drivers/net/ppp/pppox.c
+++ b/drivers/net/ppp/pppox.c
@@ -17,6 +17,7 @@
 #include <linux/string.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/compat.h>
 #include <linux/errno.h>
 #include <linux/netdevice.h>
 #include <linux/net.h>
@@ -98,6 +99,18 @@ int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 
 EXPORT_SYMBOL(pppox_ioctl);
 
+#ifdef CONFIG_COMPAT
+int pppox_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	if (cmd == PPPOEIOCSFWD32)
+		cmd = PPPOEIOCSFWD;
+
+	return pppox_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
+}
+
+EXPORT_SYMBOL(pppox_compat_ioctl);
+#endif
+
 static int pppox_create(struct net *net, struct socket *sock, int protocol,
 			int kern)
 {
diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c
index a8e52c8e4128..734de7de03f7 100644
--- a/drivers/net/ppp/pptp.c
+++ b/drivers/net/ppp/pptp.c
@@ -623,6 +623,9 @@ static const struct proto_ops pptp_ops = {
 	.recvmsg    = sock_no_recvmsg,
 	.mmap       = sock_no_mmap,
 	.ioctl      = pppox_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = pppox_compat_ioctl,
+#endif
 };
 
 static const struct pppox_proto pppox_pptp_proto = {
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 6e30949d9f77..a7ec2d3dff92 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -638,9 +638,6 @@ COMPATIBLE_IOCTL(PPPIOCDISCONN)
 COMPATIBLE_IOCTL(PPPIOCATTCHAN)
 COMPATIBLE_IOCTL(PPPIOCGCHAN)
 COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS)
-/* PPPOX */
-COMPATIBLE_IOCTL(PPPOEIOCSFWD)
-COMPATIBLE_IOCTL(PPPOEIOCDFWD)
 /* Big A */
 /* sparc only */
 /* Big Q for sound/OSS */
diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h
index 8b728750a625..69e813bcb947 100644
--- a/include/linux/if_pppox.h
+++ b/include/linux/if_pppox.h
@@ -80,6 +80,9 @@ extern int register_pppox_proto(int proto_num, const struct pppox_proto *pp);
 extern void unregister_pppox_proto(int proto_num);
 extern void pppox_unbind_sock(struct sock *sk);/* delete ppp-channel binding */
 extern int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
+extern int pppox_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
+
+#define PPPOEIOCSFWD32    _IOW(0xB1 ,0, compat_size_t)
 
 /* PPPoX socket states */
 enum {
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 1d0e5904dedf..c54cb59593ef 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -1681,6 +1681,9 @@ static const struct proto_ops pppol2tp_ops = {
 	.recvmsg	= pppol2tp_recvmsg,
 	.mmap		= sock_no_mmap,
 	.ioctl		= pppox_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = pppox_compat_ioctl,
+#endif
 };
 
 static const struct pppox_proto pppol2tp_proto = {
-- 
cgit v1.2.3-59-g8ed1b


From d0ee879187df966ef638031b5f5183078d672141 Mon Sep 17 00:00:00 2001
From: Jackie Liu <liuyun01@kylinos.cn>
Date: Wed, 31 Jul 2019 14:39:33 +0800
Subject: io_uring: fix KASAN use after free in io_sq_wq_submit_work

[root@localhost ~]# ./liburing/test/link

QEMU Standard PC report that:

[   29.379892] CPU: 0 PID: 84 Comm: kworker/u2:2 Not tainted 5.3.0-rc2-00051-g4010b622f1d2-dirty #86
[   29.379902] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
[   29.379913] Workqueue: io_ring-wq io_sq_wq_submit_work
[   29.379929] Call Trace:
[   29.379953]  dump_stack+0xa9/0x10e
[   29.379970]  ? io_sq_wq_submit_work+0xbf4/0xe90
[   29.379986]  print_address_description.cold.6+0x9/0x317
[   29.379999]  ? io_sq_wq_submit_work+0xbf4/0xe90
[   29.380010]  ? io_sq_wq_submit_work+0xbf4/0xe90
[   29.380026]  __kasan_report.cold.7+0x1a/0x34
[   29.380044]  ? io_sq_wq_submit_work+0xbf4/0xe90
[   29.380061]  kasan_report+0xe/0x12
[   29.380076]  io_sq_wq_submit_work+0xbf4/0xe90
[   29.380104]  ? io_sq_thread+0xaf0/0xaf0
[   29.380152]  process_one_work+0xb59/0x19e0
[   29.380184]  ? pwq_dec_nr_in_flight+0x2c0/0x2c0
[   29.380221]  worker_thread+0x8c/0xf40
[   29.380248]  ? __kthread_parkme+0xab/0x110
[   29.380265]  ? process_one_work+0x19e0/0x19e0
[   29.380278]  kthread+0x30b/0x3d0
[   29.380292]  ? kthread_create_on_node+0xe0/0xe0
[   29.380311]  ret_from_fork+0x3a/0x50

[   29.380635] Allocated by task 209:
[   29.381255]  save_stack+0x19/0x80
[   29.381268]  __kasan_kmalloc.constprop.6+0xc1/0xd0
[   29.381279]  kmem_cache_alloc+0xc0/0x240
[   29.381289]  io_submit_sqe+0x11bc/0x1c70
[   29.381300]  io_ring_submit+0x174/0x3c0
[   29.381311]  __x64_sys_io_uring_enter+0x601/0x780
[   29.381322]  do_syscall_64+0x9f/0x4d0
[   29.381336]  entry_SYSCALL_64_after_hwframe+0x49/0xbe

[   29.381633] Freed by task 84:
[   29.382186]  save_stack+0x19/0x80
[   29.382198]  __kasan_slab_free+0x11d/0x160
[   29.382210]  kmem_cache_free+0x8c/0x2f0
[   29.382220]  io_put_req+0x22/0x30
[   29.382230]  io_sq_wq_submit_work+0x28b/0xe90
[   29.382241]  process_one_work+0xb59/0x19e0
[   29.382251]  worker_thread+0x8c/0xf40
[   29.382262]  kthread+0x30b/0x3d0
[   29.382272]  ret_from_fork+0x3a/0x50

[   29.382569] The buggy address belongs to the object at ffff888067172140
                which belongs to the cache io_kiocb of size 224
[   29.384692] The buggy address is located 120 bytes inside of
                224-byte region [ffff888067172140, ffff888067172220)
[   29.386723] The buggy address belongs to the page:
[   29.387575] page:ffffea00019c5c80 refcount:1 mapcount:0 mapping:ffff88806ace5180 index:0x0
[   29.387587] flags: 0x100000000000200(slab)
[   29.387603] raw: 0100000000000200 dead000000000100 dead000000000122 ffff88806ace5180
[   29.387617] raw: 0000000000000000 00000000800c000c 00000001ffffffff 0000000000000000
[   29.387624] page dumped because: kasan: bad access detected

[   29.387920] Memory state around the buggy address:
[   29.388771]  ffff888067172080: fb fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc
[   29.390062]  ffff888067172100: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb
[   29.391325] >ffff888067172180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[   29.392578]                                         ^
[   29.393480]  ffff888067172200: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc
[   29.394744]  ffff888067172280: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[   29.396003] ==================================================================
[   29.397260] Disabling lock debugging due to kernel taint

io_sq_wq_submit_work free and read req again.

Cc: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
Cc: linux-block@vger.kernel.org
Cc: stable@vger.kernel.org
Fixes: f7b76ac9d17e ("io_uring: fix counter inc/dec mismatch in async_list")
Signed-off-by: Jackie Liu <liuyun01@kylinos.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 012bc0efb9d3..d542f1cf4428 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1838,6 +1838,7 @@ restart:
 	do {
 		struct sqe_submit *s = &req->submit;
 		const struct io_uring_sqe *sqe = s->sqe;
+		unsigned int flags = req->flags;
 
 		/* Ensure we clear previously set non-block flag */
 		req->rw.ki_flags &= ~IOCB_NOWAIT;
@@ -1883,7 +1884,7 @@ restart:
 		kfree(sqe);
 
 		/* req from defer and link list needn't decrease async cnt */
-		if (req->flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE))
+		if (flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE))
 			goto out;
 
 		if (!async_list)
-- 
cgit v1.2.3-59-g8ed1b


From c2c44ec20a8496f7a3b3401c092afe96908eced1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 27 Jul 2019 08:29:57 -0400
Subject: Unbreak mount_capable()

In "consolidate the capability checks in sget_{fc,userns}())" the
wrong argument had been passed to mount_capable() by sget_fc().
That mistake had been further obscured later, when switching
mount_capable() to fs_context has moved the calculation of
bogus argument from sget_fc() to mount_capable() itself.  It
should've been fc->user_ns all along.

Screwed-up-by: Al Viro <viro@zeniv.linux.org.uk>
Reported-by: Christian Brauner <christian@brauner.io>
Tested-by: Christian Brauner <christian@brauner.io>
Reviewed-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/super.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/super.c b/fs/super.c
index 113c58f19425..5960578a4076 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -478,13 +478,10 @@ EXPORT_SYMBOL(generic_shutdown_super);
 
 bool mount_capable(struct fs_context *fc)
 {
-	struct user_namespace *user_ns = fc->global ? &init_user_ns
-						    : fc->user_ns;
-
 	if (!(fc->fs_type->fs_flags & FS_USERNS_MOUNT))
 		return capable(CAP_SYS_ADMIN);
 	else
-		return ns_capable(user_ns, CAP_SYS_ADMIN);
+		return ns_capable(fc->user_ns, CAP_SYS_ADMIN);
 }
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 706cb5492c8c459199fa0ab3b5fd2ba54ee53b0c Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Sat, 27 Jul 2019 17:12:54 +0200
Subject: gfs2: Inode dirtying fix

With the recent iomap write page reclaim deadlock fix, it turns out that the
GLF_DIRTY flag isn't always set when it needs to be anymore: previously, this
happened as a side effect of always adding the inode buffer head to the current
transaction with gfs2_trans_add_meta, but this isn't happening consistently
anymore.  Fix by removing an additional unnecessary gfs2_trans_add_meta call
and by setting the GLF_DIRTY flag in gfs2_iomap_end.

(The GLF_DIRTY flag causes inode_go_sync to flush the transaction log when
syncing out the glock of that inode.  When the flag isn't set, inode_go_sync
will skip inodes, including ones with an i_state of I_DIRTY_PAGES, which will
lead to cluster incoherency.)

In addition, in gfs2_iomap_page_done, if the metadata has changed, mark the
inode as I_DIRTY_DATASYNC to have the inode added to the current transaction:
we don't expect metadata to change here, but let's err on the safe side.

Fixes: d0a22a4b03b8 ("gfs2: Fix iomap write page reclaim deadlock");
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/bmap.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 79581b9bdebb..4df26ef2b2b1 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1002,11 +1002,16 @@ static void gfs2_iomap_page_done(struct inode *inode, loff_t pos,
 				 unsigned copied, struct page *page,
 				 struct iomap *iomap)
 {
+	struct gfs2_trans *tr = current->journal_info;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 
 	if (page && !gfs2_is_stuffed(ip))
 		gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied);
+
+	if (tr->tr_num_buf_new)
+		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+
 	gfs2_trans_end(sdp);
 }
 
@@ -1099,8 +1104,6 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
 		tr = current->journal_info;
 		if (tr->tr_num_buf_new)
 			__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
-		else
-			gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[0]);
 
 		gfs2_trans_end(sdp);
 	}
@@ -1181,10 +1184,16 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
 
 	if (ip->i_qadata && ip->i_qadata->qa_qd_num)
 		gfs2_quota_unlock(ip);
+
+	if (unlikely(!written))
+		goto out_unlock;
+
 	if (iomap->flags & IOMAP_F_SIZE_CHANGED)
 		mark_inode_dirty(inode);
-	gfs2_write_unlock(inode);
+	set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
 
+out_unlock:
+	gfs2_write_unlock(inode);
 out:
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 0eb6ddfb865c7d7ec05cb330050671363515e67f Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Thu, 1 Aug 2019 19:21:51 +0900
Subject: block: Fix __blkdev_direct_IO() for bio fragments

The recent fix to properly handle IOCB_NOWAIT for async O_DIRECT IO
(patch 6a43074e2f46) introduced two problems with BIO fragment handling
for direct IOs:
1) The dio size processed is calculated by incrementing the ret variable
by the size of the bio fragment issued for the dio. However, this size
is obtained directly from bio->bi_iter.bi_size AFTER the bio submission
which may result in referencing the bi_size value after the bio
completed, resulting in an incorrect value use.
2) The ret variable is not incremented by the size of the last bio
fragment issued for the bio, leading to an invalid IO size being
returned to the user.

Fix both problem by using dio->size (which is incremented before the bio
submission) to update the value of ret after bio submissions, including
for the last bio fragment issued.

Fixes: 6a43074e2f46 ("block: properly handle IOCB_NOWAIT for async O_DIRECT IO")
Reported-by: Masato Suzuki <masato.suzuki@wdc.com>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/block_dev.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 22591bad9353..a6f7c892cb4a 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -439,6 +439,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 					ret = -EAGAIN;
 				goto error;
 			}
+			ret = dio->size;
 
 			if (polled)
 				WRITE_ONCE(iocb->ki_cookie, qc);
@@ -465,7 +466,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 				ret = -EAGAIN;
 			goto error;
 		}
-		ret += bio->bi_iter.bi_size;
+		ret = dio->size;
 
 		bio = bio_alloc(gfp, nr_pages);
 		if (!bio) {
-- 
cgit v1.2.3-59-g8ed1b


From 7bc36e3ce91471b6377c8eadc0a2f220a2280083 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Fri, 2 Aug 2019 21:48:40 -0700
Subject: ocfs2: remove set but not used variable 'last_hash'

Fixes gcc '-Wunused-but-set-variable' warning:

  fs/ocfs2/xattr.c: In function ocfs2_xattr_bucket_find:
  fs/ocfs2/xattr.c:3828:6: warning: variable last_hash set but not used [-Wunused-but-set-variable]

It's never used and can be removed.

Link: http://lkml.kernel.org/r/20190716132110.34836-1-yuehaibing@huawei.com
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Acked-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Gang He <ghe@suse.com>
Cc: Jun Piao <piaojun@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/xattr.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 385f3aaa2448..90c830e3758e 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3825,7 +3825,6 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	int low_bucket = 0, bucket, high_bucket;
 	struct ocfs2_xattr_bucket *search;
-	u32 last_hash;
 	u64 blkno, lower_blkno = 0;
 
 	search = ocfs2_xattr_bucket_new(inode);
@@ -3869,8 +3868,6 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 		if (xh->xh_count)
 			xe = &xh->xh_entries[le16_to_cpu(xh->xh_count) - 1];
 
-		last_hash = le32_to_cpu(xe->xe_name_hash);
-
 		/* record lower_blkno which may be the insert place. */
 		lower_blkno = blkno;
 
-- 
cgit v1.2.3-59-g8ed1b


From 315c69261dd3fa12dbc830d4fa00d1fad98d3b03 Mon Sep 17 00:00:00 2001
From: Paul Wise <pabs3@bonedaddy.net>
Date: Fri, 2 Aug 2019 21:49:05 -0700
Subject: coredump: split pipe command whitespace before expanding template

Save the offsets of the start of each argument to avoid having to update
pointers to each argument after every corename krealloc and to avoid
having to duplicate the memory for the dump command.

Executable names containing spaces were previously being expanded from
%e or %E and then split in the middle of the filename.  This is
incorrect behaviour since an argument list can represent arguments with
spaces.

The splitting could lead to extra arguments being passed to the core
dump handler that it might have interpreted as options or ignored
completely.

Core dump handlers that are not aware of this Linux kernel issue will be
using %e or %E without considering that it may be split and so they will
be vulnerable to processes with spaces in their names breaking their
argument list.  If their internals are otherwise well written, such as
if they are written in shell but quote arguments, they will work better
after this change than before.  If they are not well written, then there
is a slight chance of breakage depending on the details of the code but
they will already be fairly broken by the split filenames.

Core dump handlers that are aware of this Linux kernel issue will be
placing %e or %E as the last item in their core_pattern and then
aggregating all of the remaining arguments into one, separated by
spaces.  Alternatively they will be obtaining the filename via other
methods.  Both of these will be compatible with the new arrangement.

A side effect from this change is that unknown template types (for
example %z) result in an empty argument to the dump handler instead of
the argument being dropped.  This is a desired change as:

It is easier for dump handlers to process empty arguments than dropped
ones, especially if they are written in shell or don't pass each
template item with a preceding command-line option in order to
differentiate between individual template types.  Most core_patterns in
the wild do not use options so they can confuse different template types
(especially numeric ones) if an earlier one gets dropped in old kernels.
If the kernel introduces a new template type and a core_pattern uses it,
the core dump handler might not expect that the argument can be dropped
in old kernels.

For example, this can result in security issues when %d is dropped in
old kernels.  This happened with the corekeeper package in Debian and
resulted in the interface between corekeeper and Linux having to be
rewritten to use command-line options to differentiate between template
types.

The core_pattern for most core dump handlers is written by the handler
author who would generally not insert unknown template types so this
change should be compatible with all the core dump handlers that exist.

Link: http://lkml.kernel.org/r/20190528051142.24939-1-pabs3@bonedaddy.net
Fixes: 74aadce98605 ("core_pattern: allow passing of arguments to user mode helper when core_pattern is a pipe")
Signed-off-by: Paul Wise <pabs3@bonedaddy.net>
Reported-by: Jakub Wilk <jwilk@jwilk.net> [https://bugs.debian.org/924398]
Reported-by: Paul Wise <pabs3@bonedaddy.net> [https://lore.kernel.org/linux-fsdevel/c8b7ecb8508895bf4adb62a748e2ea2c71854597.camel@bonedaddy.net/]
Suggested-by: Jakub Wilk <jwilk@jwilk.net>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/coredump.c | 44 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 39 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/coredump.c b/fs/coredump.c
index e42e17e55bfd..b1ea7dfbd149 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -7,6 +7,7 @@
 #include <linux/stat.h>
 #include <linux/fcntl.h>
 #include <linux/swap.h>
+#include <linux/ctype.h>
 #include <linux/string.h>
 #include <linux/init.h>
 #include <linux/pagemap.h>
@@ -187,11 +188,13 @@ put_exe_file:
  * name into corename, which must have space for at least
  * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
  */
-static int format_corename(struct core_name *cn, struct coredump_params *cprm)
+static int format_corename(struct core_name *cn, struct coredump_params *cprm,
+			   size_t **argv, int *argc)
 {
 	const struct cred *cred = current_cred();
 	const char *pat_ptr = core_pattern;
 	int ispipe = (*pat_ptr == '|');
+	bool was_space = false;
 	int pid_in_pattern = 0;
 	int err = 0;
 
@@ -201,12 +204,35 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
 		return -ENOMEM;
 	cn->corename[0] = '\0';
 
-	if (ispipe)
+	if (ispipe) {
+		int argvs = sizeof(core_pattern) / 2;
+		(*argv) = kmalloc_array(argvs, sizeof(**argv), GFP_KERNEL);
+		if (!(*argv))
+			return -ENOMEM;
+		(*argv)[(*argc)++] = 0;
 		++pat_ptr;
+	}
 
 	/* Repeat as long as we have more pattern to process and more output
 	   space */
 	while (*pat_ptr) {
+		/*
+		 * Split on spaces before doing template expansion so that
+		 * %e and %E don't get split if they have spaces in them
+		 */
+		if (ispipe) {
+			if (isspace(*pat_ptr)) {
+				was_space = true;
+				pat_ptr++;
+				continue;
+			} else if (was_space) {
+				was_space = false;
+				err = cn_printf(cn, "%c", '\0');
+				if (err)
+					return err;
+				(*argv)[(*argc)++] = cn->used;
+			}
+		}
 		if (*pat_ptr != '%') {
 			err = cn_printf(cn, "%c", *pat_ptr++);
 		} else {
@@ -546,6 +572,8 @@ void do_coredump(const kernel_siginfo_t *siginfo)
 	struct cred *cred;
 	int retval = 0;
 	int ispipe;
+	size_t *argv = NULL;
+	int argc = 0;
 	struct files_struct *displaced;
 	/* require nonrelative corefile path and be extra careful */
 	bool need_suid_safe = false;
@@ -592,9 +620,10 @@ void do_coredump(const kernel_siginfo_t *siginfo)
 
 	old_cred = override_creds(cred);
 
-	ispipe = format_corename(&cn, &cprm);
+	ispipe = format_corename(&cn, &cprm, &argv, &argc);
 
 	if (ispipe) {
+		int argi;
 		int dump_count;
 		char **helper_argv;
 		struct subprocess_info *sub_info;
@@ -637,12 +666,16 @@ void do_coredump(const kernel_siginfo_t *siginfo)
 			goto fail_dropcount;
 		}
 
-		helper_argv = argv_split(GFP_KERNEL, cn.corename, NULL);
+		helper_argv = kmalloc_array(argc + 1, sizeof(*helper_argv),
+					    GFP_KERNEL);
 		if (!helper_argv) {
 			printk(KERN_WARNING "%s failed to allocate memory\n",
 			       __func__);
 			goto fail_dropcount;
 		}
+		for (argi = 0; argi < argc; argi++)
+			helper_argv[argi] = cn.corename + argv[argi];
+		helper_argv[argi] = NULL;
 
 		retval = -ENOMEM;
 		sub_info = call_usermodehelper_setup(helper_argv[0],
@@ -652,7 +685,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
 			retval = call_usermodehelper_exec(sub_info,
 							  UMH_WAIT_EXEC);
 
-		argv_free(helper_argv);
+		kfree(helper_argv);
 		if (retval) {
 			printk(KERN_INFO "Core dump to |%s pipe failed\n",
 			       cn.corename);
@@ -766,6 +799,7 @@ fail_dropcount:
 	if (ispipe)
 		atomic_dec(&core_dump_count);
 fail_unlock:
+	kfree(argv);
 	kfree(cn.corename);
 	coredump_finish(mm, core_dumped);
 	revert_creds(old_cred);
-- 
cgit v1.2.3-59-g8ed1b