From 64ad6c488975d7516230cf7849190a991fd615ae Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@osandov.com>
Date: Tue, 2 Jun 2015 17:31:00 -0700
Subject: Btrfs: don't invalidate root dentry when subvolume deletion fails

Since commit bafc9b754f75 ("vfs: More precise tests in d_invalidate"),
mounted subvolumes can be deleted because d_invalidate() won't fail.
However, we run into problems when we attempt to delete the default
subvolume while it is mounted as the root filesystem:

	# btrfs subvol list /
	ID 257 gen 306 top level 5 path rootvol
	ID 267 gen 334 top level 5 path snap1
	# btrfs subvol get-default /
	ID 267 gen 334 top level 5 path snap1
	# btrfs inspect-internal rootid /
	267
	# mount -o subvol=/ /dev/vda1 /mnt
	# btrfs subvol del /mnt/snap1
	Delete subvolume (no-commit): '/mnt/snap1'
	ERROR: cannot delete '/mnt/snap1' - Operation not permitted
	# findmnt /
	findmnt: can't read /proc/mounts: No such file or directory
	# ls /proc
	#

Markus reported that this same scenario simply led to a kernel oops.

This happens because in btrfs_ioctl_snap_destroy(), we call
d_invalidate() before we check may_destroy_subvol(), which means that we
detach the submounts and drop the dentry before erroring out. Instead,
we should only invalidate the dentry once the deletion has succeeded.
Additionally, the shrink_dcache_sb() isn't necessary; d_invalidate()
will prune the dcache for the deleted subvolume.

Cc: <stable@vger.kernel.org>
Fixes: bafc9b754f75 ("vfs: More precise tests in d_invalidate")
Reported-by: Markus Schauler <mschauler@gmail.com>
Signed-off-by: Omar Sandoval <osandov@osandov.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs/btrfs/ioctl.c')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 1c22c6518504..6f790bcddfc1 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2413,8 +2413,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 		goto out_unlock_inode;
 	}
 
-	d_invalidate(dentry);
-
 	down_write(&root->fs_info->subvol_sem);
 
 	err = may_destroy_subvol(dest);
@@ -2508,7 +2506,7 @@ out_up_write:
 out_unlock_inode:
 	mutex_unlock(&inode->i_mutex);
 	if (!err) {
-		shrink_dcache_sb(root->fs_info->sb);
+		d_invalidate(dentry);
 		btrfs_invalidate_inodes(dest);
 		d_delete(dentry);
 		ASSERT(dest->send_in_progress == 0);
-- 
cgit v1.2.3-59-g8ed1b


From 6d13f5497f9c44597ba566f741f7ce66ed099456 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Fri, 24 Apr 2015 19:12:01 +0200
Subject: btrfs: fix warnings after changes in btrfs_abort_transaction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fs/btrfs/volumes.c: In function ‘btrfs_create_uuid_tree’:
fs/btrfs/volumes.c:3909:3: warning: format ‘%d’ expects argument of type ‘int’, but argument 4 has type ‘long int’ [-Wformat=]
   btrfs_abort_transaction(trans, tree_root,
   ^
  CC [M]  fs/btrfs/ioctl.o
fs/btrfs/ioctl.c: In function ‘create_subvol’:
fs/btrfs/ioctl.c:549:3: warning: format ‘%d’ expects argument of type ‘int’, but argument 4 has type ‘long int’ [-Wformat=]
   btrfs_abort_transaction(trans, root, PTR_ERR(new_root));

PTR_ERR returns long, but we're really using 'int' for the error codes
everywhere so just set and use the local variable.

Signed-off-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c   | 2 +-
 fs/btrfs/volumes.c | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs/ioctl.c')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 6f790bcddfc1..f77f6b3d24b9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -553,8 +553,8 @@ static noinline int create_subvol(struct inode *dir,
 	key.offset = (u64)-1;
 	new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
 	if (IS_ERR(new_root)) {
-		btrfs_abort_transaction(trans, root, PTR_ERR(new_root));
 		ret = PTR_ERR(new_root);
+		btrfs_abort_transaction(trans, root, ret);
 		goto fail;
 	}
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d0582b785485..1fcbd93ca0ec 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3908,9 +3908,9 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
 	uuid_root = btrfs_create_tree(trans, fs_info,
 				      BTRFS_UUID_TREE_OBJECTID);
 	if (IS_ERR(uuid_root)) {
-		btrfs_abort_transaction(trans, tree_root,
-					PTR_ERR(uuid_root));
-		return PTR_ERR(uuid_root);
+		ret = PTR_ERR(uuid_root);
+		btrfs_abort_transaction(trans, tree_root, ret);
+		return ret;
 	}
 
 	fs_info->uuid_root = uuid_root;
-- 
cgit v1.2.3-59-g8ed1b


From 01b810b889d557257580970e1a7ba9c85b54766b Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Tue, 12 May 2015 19:14:49 +0200
Subject: btrfs: make root id query unprivileged

The INO_LOOKUP ioctl can lookup path for a given inode number and is
thus restricted. As a sideefect it can find the root id of the
containing subvolume and we're using this int the 'btrfs inspect rootid'
command.

The restriction is unnecessary in case we set the ioctl args
 args::treeid    = 0
 args::objectid  = 256 (BTRFS_FIRST_FREE_OBJECTID)

Then the path will be empty and the treeid is filled with the root id of
the inode on which the ioctl is called. This behaviour is unchanged,
after the root restriction is removed.

Signed-off-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs/ioctl.c')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f77f6b3d24b9..f7c65ca056f8 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2271,10 +2271,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
 {
 	 struct btrfs_ioctl_ino_lookup_args *args;
 	 struct inode *inode;
-	 int ret;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
+	int ret = 0;
 
 	args = memdup_user(argp, sizeof(*args));
 	if (IS_ERR(args))
@@ -2282,13 +2279,28 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
 
 	inode = file_inode(file);
 
+	/*
+	 * Unprivileged query to obtain the containing subvolume root id. The
+	 * path is reset so it's consistent with btrfs_search_path_in_tree.
+	 */
 	if (args->treeid == 0)
 		args->treeid = BTRFS_I(inode)->root->root_key.objectid;
 
+	if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) {
+		args->name[0] = 0;
+		goto out;
+	}
+
+	if (!capable(CAP_SYS_ADMIN)) {
+		ret = -EPERM;
+		goto out;
+	}
+
 	ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
 					args->treeid, args->objectid,
 					args->name);
 
+out:
 	if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
 		ret = -EFAULT;
 
-- 
cgit v1.2.3-59-g8ed1b


From e4826a5b2430f23ad4ec7823efcd413fce9f2d64 Mon Sep 17 00:00:00 2001
From: chandan <chandan@linux.vnet.ibm.com>
Date: Tue, 9 Jun 2015 17:38:32 +0530
Subject: Btrfs: btrfs_defrag_file: Fix ra_index computation.

Read-ahead is done for the pages in the range [ra_index, ra_index + cluster -
1]. So the next read-ahead should be starting from the page at index 'ra_index
+ cluster' (unless we deemed that the extent at 'ra_index + cluster' as
non-defraggable) rather than from the page at index 'ra_index +
max_cluster'. This patch fixes this. I did run the xfstests suite to make sure
that the code does not regress.

Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/ioctl.c')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f7c65ca056f8..25c422b11bdb 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1368,7 +1368,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 			ra_index = max(i, ra_index);
 			btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
 				       cluster);
-			ra_index += max_cluster;
+			ra_index += cluster;
 		}
 
 		mutex_lock(&inode->i_mutex);
-- 
cgit v1.2.3-59-g8ed1b


From 070034bdf98544b23a7fcf500618fd31dec06ab2 Mon Sep 17 00:00:00 2001
From: chandan <chandan@linux.vnet.ibm.com>
Date: Tue, 9 Jun 2015 10:35:11 +0530
Subject: Btrfs: btrfs_defrag_file: Fix calculation of max_to_defrag.

max_to_defrag represents the number of pages to defrag rather than the last
page of the file range to be defragged.

Consider a file having 10 4k blocks (i.e. blocks in the range [0 - 9]). If the
defrag ioctl was invoked for the block range [3 - 6], then max_to_defrag
should actually have the value 4. Instead in the current code we end up
setting it to 6.

Now, this does not (yet) cause an issue since the first part of the while loop
condition in btrfs_defrag_file() (i.e. "i <= last_index") causes the control
to flow out of the while loop before any buggy behavior is actually caused. So
the patch just makes sure that max_to_defrag ends up having the right value
rather than fixing a bug. I did run the xfstests suite to make sure that the
code does not regress.

Changelog: v1->v2:
Provide a much descriptive commit message.

Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/ioctl.c')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 25c422b11bdb..9041f154cc32 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1318,7 +1318,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		i = range->start >> PAGE_CACHE_SHIFT;
 	}
 	if (!max_to_defrag)
-		max_to_defrag = last_index + 1;
+		max_to_defrag = last_index - i + 1;
 
 	/*
 	 * make writeback starts from i, so the defrag range can be
-- 
cgit v1.2.3-59-g8ed1b


From e1d227a42ea2b4664f94212bd1106b9a3413ffb8 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.de>
Date: Mon, 8 Jun 2015 15:05:25 -0700
Subject: btrfs: Handle unaligned length in extent_same

The extent-same code rejects requests with an unaligned length. This
poses a problem when we want to dedupe the tail extent of files as we
skip cloning the portion between i_size and the extent boundary.

If we don't clone the entire extent, it won't be deleted. So the
combination of these behaviors winds up giving us worst-case dedupe on
many files.

We can fix this by allowing a length that extents to i_size and
internally aligining those to the end of the block. This is what
btrfs_ioctl_clone() so we can just copy that check over.

Signed-off-by: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'fs/btrfs/ioctl.c')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9041f154cc32..c86b835da7a8 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2889,12 +2889,19 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
 	return ret;
 }
 
-static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len)
+static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen,
+				     u64 olen)
 {
+	u64 len = *plen;
 	u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize;
 
-	if (off + len > inode->i_size || off + len < off)
+	if (off + olen > inode->i_size || off + olen < off)
 		return -EINVAL;
+
+	/* if we extend to eof, continue to block boundary */
+	if (off + len == inode->i_size)
+		*plen = len = ALIGN(inode->i_size, bs) - off;
+
 	/* Check that we are block aligned - btrfs_clone() requires this */
 	if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs))
 		return -EINVAL;
@@ -2902,10 +2909,11 @@ static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len)
 	return 0;
 }
 
-static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
+static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
 			     struct inode *dst, u64 dst_loff)
 {
 	int ret;
+	u64 len = olen;
 
 	/*
 	 * btrfs_clone() can't handle extents in the same file
@@ -2920,11 +2928,11 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
 
 	btrfs_double_lock(src, loff, dst, dst_loff, len);
 
-	ret = extent_same_check_offsets(src, loff, len);
+	ret = extent_same_check_offsets(src, loff, &len, olen);
 	if (ret)
 		goto out_unlock;
 
-	ret = extent_same_check_offsets(dst, dst_loff, len);
+	ret = extent_same_check_offsets(dst, dst_loff, &len, olen);
 	if (ret)
 		goto out_unlock;
 
@@ -2937,7 +2945,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
 
 	ret = btrfs_cmp_data(src, loff, dst, dst_loff, len);
 	if (ret == 0)
-		ret = btrfs_clone(src, dst, loff, len, len, dst_loff);
+		ret = btrfs_clone(src, dst, loff, olen, len, dst_loff);
 
 out_unlock:
 	btrfs_double_unlock(src, loff, dst, dst_loff, len);
-- 
cgit v1.2.3-59-g8ed1b