Btrfs: account for missing devices in RAID allocation profiles

When we mount in RAID degraded mode without adding a new device to replace the failed one, we can end up using the wrong RAID flags for allocations.

This results in strange combinations of block groups (raid1 in a raid10 filesystem) and corruptions when we try to allocate blocks from single spindle chunks on drives that are actually missing.

The first device has two small 4MB chunks in it that mkfs creates and these are usually unused in a raid1 or raid10 setup. But, in -o degraded, the allocator will fall back to these because the mask of desired raid groups isn't correct.

The fix here is to count the missing devices as we build up the list of devices in the system. This count is used when picking the raid level to make sure we continue using the same levels that were in place before we lost a drive.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
author: Chris Mason <chris.mason@oracle.com> 2010-12-13 14:56:23 -0500
committer: Chris Mason <chris.mason@oracle.com> 2010-12-13 20:06:52 -0500
commit: cd02dca56442e1504fd6bc5b96f7f1870162b266 (patch)
tree: 1a38d99fc581974ba6d8136c42ca81f3b1216ea3 /fs
parent: Btrfs: EIO when we fail to read tree roots (diff)
download: linux-dev-cd02dca56442e1504fd6bc5b96f7f1870162b266.tar.xz
linux-dev-cd02dca56442e1504fd6bc5b96f7f1870162b266.zip
3 files changed, 36 insertions, 3 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 41133b064d72..4be231e0d2bd 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3044,7 +3044,13 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
-	u64 num_devices = root->fs_info->fs_devices->rw_devices;
+	/*
+	 * we add in the count of missing devices because we want
+	 * to make sure that any RAID levels on a degraded FS
+	 * continue to be honored.
+	 */
+	u64 num_devices = root->fs_info->fs_devices->rw_devices +
+		root->fs_info->fs_devices->missing_devices;
 
 	if (num_devices == 1)
 		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
@@ -7891,7 +7897,14 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 	u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
 
-	num_devices = root->fs_info->fs_devices->rw_devices;
+	/*
+	 * we add in the count of missing devices because we want
+	 * to make sure that any RAID levels on a degraded FS
+	 * continue to be honored.
+	 */
+	num_devices = root->fs_info->fs_devices->rw_devices +
+		root->fs_info->fs_devices->missing_devices;
+
 	if (num_devices == 1) {
 		stripped |= BTRFS_BLOCK_GROUP_DUP;
 		stripped = flags & ~stripped;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 91851b555e2e..177b73179590 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -413,12 +413,16 @@ static noinline int device_list_add(const char *path,
 
 		device->fs_devices = fs_devices;
 		fs_devices->num_devices++;
-	} else if (strcmp(device->name, path)) {
+	} else if (!device->name || strcmp(device->name, path)) {
 		name = kstrdup(path, GFP_NOFS);
 		if (!name)
 			return -ENOMEM;
 		kfree(device->name);
 		device->name = name;
+		if (device->missing) {
+			fs_devices->missing_devices--;
+			device->missing = 0;
+		}
 	}
 
 	if (found_transid > fs_devices->latest_trans) {
@@ -1238,6 +1242,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
 	device->fs_devices->num_devices--;
 
+	if (device->missing)
+		root->fs_info->fs_devices->missing_devices--;
+
 	next_device = list_entry(root->fs_info->fs_devices->devices.next,
 				 struct btrfs_device, dev_list);
 	if (device->bdev == root->fs_info->sb->s_bdev)
@@ -3084,7 +3091,9 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 	device->devid = devid;
 	device->work.func = pending_bios_fn;
 	device->fs_devices = fs_devices;
+	device->missing = 1;
 	fs_devices->num_devices++;
+	fs_devices->missing_devices++;
 	spin_lock_init(&device->io_lock);
 	INIT_LIST_HEAD(&device->dev_alloc_list);
 	memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
@@ -3282,6 +3291,15 @@ static int read_one_dev(struct btrfs_root *root,
 			device = add_missing_dev(root, devid, dev_uuid);
 			if (!device)
 				return -ENOMEM;
+		} else if (!device->missing) {
+			/*
+			 * this happens when a device that was properly setup
+			 * in the device info lists suddenly goes bad.
+			 * device->bdev is NULL, and so we have to set
+			 * device->missing to one here
+			 */
+			root->fs_info->fs_devices->missing_devices++;
+			device->missing = 1;
 		}
 	}
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 31b0fabdd2ea..a668c0116982 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -45,6 +45,7 @@ struct btrfs_device {
 	int barriers;
 	int writeable;
 	int in_fs_metadata;
+	int missing;
 
 	spinlock_t io_lock;
 
@@ -94,6 +95,7 @@ struct btrfs_fs_devices {
 	u64 num_devices;
 	u64 open_devices;
 	u64 rw_devices;
+	u64 missing_devices;
 	u64 total_rw_bytes;
 	struct block_device *latest_bdev;
author	Chris Mason <chris.mason@oracle.com>	2010-12-13 14:56:23 -0500
committer	Chris Mason <chris.mason@oracle.com>	2010-12-13 20:06:52 -0500
commit	cd02dca56442e1504fd6bc5b96f7f1870162b266 (patch)
tree	1a38d99fc581974ba6d8136c42ca81f3b1216ea3 /fs
parent	Btrfs: EIO when we fail to read tree roots (diff)
download	linux-dev-cd02dca56442e1504fd6bc5b96f7f1870162b266.tar.xz linux-dev-cd02dca56442e1504fd6bc5b96f7f1870162b266.zip