Diffstat (limited to 'drivers/md/raid5.c')
 drivers/md/raid5.c | 180
 1 file changed, 143 insertions(+), 37 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 11c3d7bfa797..8d59914f2057 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1050,7 +1050,7 @@ static void compute_parity5(struct stripe_head *sh, int method)
static void compute_parity6(struct stripe_head *sh, int method)
{
raid6_conf_t *conf = sh->raid_conf;
- int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
+ int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
struct bio *chosen;
/**** FIX THIS: This could be very bad if disks is close to 256 ****/
void *ptrs[disks];
@@ -1131,8 +1131,7 @@ static void compute_parity6(struct stripe_head *sh, int method)
/* Compute one missing block */
static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
{
- raid6_conf_t *conf = sh->raid_conf;
- int i, count, disks = conf->raid_disks;
+ int i, count, disks = sh->disks;
void *ptr[MAX_XOR_BLOCKS], *p;
int pd_idx = sh->pd_idx;
int qd_idx = raid6_next_disk(pd_idx, disks);
@@ -1170,8 +1169,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
/* Compute two missing blocks */
static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
{
- raid6_conf_t *conf = sh->raid_conf;
- int i, count, disks = conf->raid_disks;
+ int i, count, disks = sh->disks;
int pd_idx = sh->pd_idx;
int qd_idx = raid6_next_disk(pd_idx, disks);
int d0_idx = raid6_next_disk(qd_idx, disks);
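The index arithmetic in the three hunks above is easier to follow with the wrap-around step spelled out: Q immediately follows P, and the first data block follows Q, both modulo the per-stripe disk count that now comes from sh->disks. A minimal user-space sketch, not the kernel code itself; next_disk() here only mirrors what raid6_next_disk() computes, and the example values are invented.

#include <stdio.h>

/* Model of the wrap-around index step; NOT the kernel helper. */
static int next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

int main(void)
{
	int disks  = 6;	/* sh->disks for this stripe (example value) */
	int pd_idx = 4;	/* parity (P) slot chosen for the example */
	int qd_idx = next_disk(pd_idx, disks);	/* Q sits right after P */
	int d0_idx = next_disk(qd_idx, disks);	/* first data slot, wraps to 0 */

	printf("P=%d Q=%d d0=%d of %d disks\n", pd_idx, qd_idx, d0_idx, disks);
	return 0;
}
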
@@ -1887,11 +1885,11 @@ static void handle_stripe5(struct stripe_head *sh)
static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
{
raid6_conf_t *conf = sh->raid_conf;
- int disks = conf->raid_disks;
+ int disks = sh->disks;
struct bio *return_bi= NULL;
struct bio *bi;
int i;
- int syncing;
+ int syncing, expanding, expanded;
int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
int non_overwrite = 0;
int failed_num[2] = {0, 0};
@@ -1909,6 +1907,8 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
clear_bit(STRIPE_DELAYED, &sh->state);
syncing = test_bit(STRIPE_SYNCING, &sh->state);
+ expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+ expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
/* Now to look around and see what can be done */
rcu_read_lock();
@@ -2114,13 +2114,15 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
* parity, or to satisfy requests
* or to load a block that is being partially written.
*/
- if (to_read || non_overwrite || (to_write && failed) || (syncing && (uptodate < disks))) {
+ if (to_read || non_overwrite || (to_write && failed) ||
+ (syncing && (uptodate < disks)) || expanding) {
for (i=disks; i--;) {
dev = &sh->dev[i];
if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
(dev->toread ||
(dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
syncing ||
+ expanding ||
(failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) ||
(failed >= 2 && (sh->dev[failed_num[1]].toread || to_write))
)
@@ -2355,6 +2357,79 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
}
}
}
+
+ if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
+ /* Need to write out all blocks after computing P&Q */
+ sh->disks = conf->raid_disks;
+ sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
+ conf->raid_disks);
+ compute_parity6(sh, RECONSTRUCT_WRITE);
+ for (i = conf->raid_disks ; i-- ; ) {
+ set_bit(R5_LOCKED, &sh->dev[i].flags);
+ locked++;
+ set_bit(R5_Wantwrite, &sh->dev[i].flags);
+ }
+ clear_bit(STRIPE_EXPANDING, &sh->state);
+ } else if (expanded) {
+ clear_bit(STRIPE_EXPAND_READY, &sh->state);
+ atomic_dec(&conf->reshape_stripes);
+ wake_up(&conf->wait_for_overlap);
+ md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
+ }
+
+ if (expanding && locked == 0) {
+ /* We have read all the blocks in this stripe and now we need to
+ * copy some of them into a target stripe for expand.
+ */
+ clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+ for (i = 0; i < sh->disks ; i++)
+ if (i != pd_idx && i != qd_idx) {
+ int dd_idx2, pd_idx2, j;
+ struct stripe_head *sh2;
+
+ sector_t bn = compute_blocknr(sh, i);
+ sector_t s = raid5_compute_sector(
+ bn, conf->raid_disks,
+ conf->raid_disks - conf->max_degraded,
+ &dd_idx2, &pd_idx2, conf);
+ sh2 = get_active_stripe(conf, s,
+ conf->raid_disks,
+ pd_idx2, 1);
+ if (sh2 == NULL)
+ /* so far only the early blocks of
+ * this stripe have been requested.
+ * When later blocks get requested, we
+ * will try again
+ */
+ continue;
+ if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
+ test_bit(R5_Expanded,
+ &sh2->dev[dd_idx2].flags)) {
+ /* must have already done this block */
+ release_stripe(sh2);
+ continue;
+ }
+ memcpy(page_address(sh2->dev[dd_idx2].page),
+ page_address(sh->dev[i].page),
+ STRIPE_SIZE);
+ set_bit(R5_Expanded, &sh2->dev[dd_idx2].flags);
+ set_bit(R5_UPTODATE, &sh2->dev[dd_idx2].flags);
+ for (j = 0 ; j < conf->raid_disks ; j++)
+ if (j != sh2->pd_idx &&
+ j != raid6_next_disk(sh2->pd_idx,
+ sh2->disks) &&
+ !test_bit(R5_Expanded,
+ &sh2->dev[j].flags))
+ break;
+ if (j == conf->raid_disks) {
+ set_bit(STRIPE_EXPAND_READY,
+ &sh2->state);
+ set_bit(STRIPE_HANDLE, &sh2->state);
+ }
+ release_stripe(sh2);
+ }
+ }
+
spin_unlock(&sh->lock);
while ((bi=return_bi)) {
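
The expanding branch added above copies each data block of the source stripe into its destination stripe, and marks a destination EXPAND_READY once every data slot there carries R5_Expanded. A reduced user-space sketch of that completeness test; the flag array and helper names are illustrative only, and in the kernel the P and Q slots are found via sh2->pd_idx and raid6_next_disk().

#include <stdbool.h>
#include <stdio.h>

/* Model of raid6_next_disk(); not the kernel helper. */
static int next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

/* True once every slot except P and Q has received its copied block,
 * i.e. the destination stripe could be flagged EXPAND_READY. */
static bool all_data_expanded(const bool *expanded, int raid_disks, int pd_idx)
{
	int qd_idx = next_disk(pd_idx, raid_disks);

	for (int j = 0; j < raid_disks; j++) {
		if (j == pd_idx || j == qd_idx)
			continue;
		if (!expanded[j])
			return false;
	}
	return true;
}

int main(void)
{
	bool expanded[6] = { true, true, false, true, true, true };

	/* slot 2 is still missing, so the stripe is not ready yet */
	printf("ready: %d\n", all_data_expanded(expanded, 6, 4));
	expanded[2] = true;
	printf("ready: %d\n", all_data_expanded(expanded, 6, 4));
	return 0;
}
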
@@ -2395,7 +2470,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
rcu_read_unlock();
if (rdev) {
- if (syncing)
+ if (syncing || expanding || expanded)
md_sync_acct(rdev->bdev, STRIPE_SECTORS);
bi->bi_bdev = rdev->bdev;
@@ -2915,8 +2990,9 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
struct stripe_head *sh;
int pd_idx;
sector_t first_sector, last_sector;
- int raid_disks;
- int data_disks;
+ int raid_disks = conf->previous_raid_disks;
+ int data_disks = raid_disks - conf->max_degraded;
+ int new_data_disks = conf->raid_disks - conf->max_degraded;
int i;
int dd_idx;
sector_t writepos, safepos, gap;
@@ -2925,7 +3001,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
conf->expand_progress != 0) {
/* restarting in the middle, skip the initial sectors */
sector_nr = conf->expand_progress;
- sector_div(sector_nr, conf->raid_disks-1);
+ sector_div(sector_nr, new_data_disks);
*skipped = 1;
return sector_nr;
}
@@ -2939,14 +3015,14 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
* to after where expand_lo old_maps to
*/
writepos = conf->expand_progress +
- conf->chunk_size/512*(conf->raid_disks-1);
- sector_div(writepos, conf->raid_disks-1);
+ conf->chunk_size/512*(new_data_disks);
+ sector_div(writepos, new_data_disks);
safepos = conf->expand_lo;
- sector_div(safepos, conf->previous_raid_disks-1);
+ sector_div(safepos, data_disks);
gap = conf->expand_progress - conf->expand_lo;
if (writepos >= safepos ||
- gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) {
+ gap > (new_data_disks)*3000*2 /*3Meg*/) {
/* Cannot proceed until we've updated the superblock... */
wait_event(conf->wait_for_overlap,
atomic_read(&conf->reshape_stripes)==0);
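
The throttling decision above compares per-disk offsets in the two geometries: the new-layout offset about to be written must stay behind the old-layout offset still protected by the superblock, and the relocation window must not grow past roughly 3MB worth of sectors per data disk. A stand-alone sketch of the arithmetic with invented sample values; sector_div() becomes plain integer division and all quantities are 512-byte sectors.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t chunk_sectors   = 128;   /* conf->chunk_size / 512 */
	int      data_disks      = 4;     /* old geometry minus parity disks */
	int      new_data_disks  = 5;     /* new geometry minus parity disks */
	uint64_t expand_progress = 10240; /* array sectors already relocated */
	uint64_t expand_lo       = 7680;  /* safe point recorded in the superblock */

	/* per-device offset (new geometry) the next writes will land on */
	uint64_t writepos = (expand_progress + chunk_sectors * new_data_disks)
			    / new_data_disks;
	/* per-device offset (old geometry) that may still be read from */
	uint64_t safepos = expand_lo / data_disks;
	uint64_t gap = expand_progress - expand_lo;

	/* must pause and update the superblock before overwriting live data
	 * or letting the window exceed ~3MB per data disk */
	int must_wait = writepos >= safepos ||
			gap > (uint64_t)new_data_disks * 3000 * 2;

	printf("writepos=%llu safepos=%llu gap=%llu wait=%d\n",
	       (unsigned long long)writepos, (unsigned long long)safepos,
	       (unsigned long long)gap, must_wait);
	return 0;
}
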
@@ -2976,6 +3052,9 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
sector_t s;
if (j == sh->pd_idx)
continue;
+ if (conf->level == 6 &&
+ j == raid6_next_disk(sh->pd_idx, sh->disks))
+ continue;
s = compute_blocknr(sh, j);
if (s < (mddev->array_size<<1)) {
skipped = 1;
@@ -2992,28 +3071,27 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
release_stripe(sh);
}
spin_lock_irq(&conf->device_lock);
- conf->expand_progress = (sector_nr + i)*(conf->raid_disks-1);
+ conf->expand_progress = (sector_nr + i) * new_data_disks;
spin_unlock_irq(&conf->device_lock);
/* Ok, those stripes are ready. We can start scheduling
* reads on the source stripes.
* The source stripes are determined by mapping the first and last
* block on the destination stripes.
*/
- raid_disks = conf->previous_raid_disks;
- data_disks = raid_disks - 1;
first_sector =
- raid5_compute_sector(sector_nr*(conf->raid_disks-1),
+ raid5_compute_sector(sector_nr*(new_data_disks),
raid_disks, data_disks,
&dd_idx, &pd_idx, conf);
last_sector =
raid5_compute_sector((sector_nr+conf->chunk_size/512)
- *(conf->raid_disks-1) -1,
+ *(new_data_disks) -1,
raid_disks, data_disks,
&dd_idx, &pd_idx, conf);
if (last_sector >= (mddev->size<<1))
last_sector = (mddev->size<<1)-1;
while (first_sector <= last_sector) {
- pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks);
+ pd_idx = stripe_to_pdidx(first_sector, conf,
+ conf->previous_raid_disks);
sh = get_active_stripe(conf, first_sector,
conf->previous_raid_disks, pd_idx, 0);
set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
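
first_sector and last_sector above are found by expanding the destination window back into logical array sectors and mapping those through the old layout. The mapping itself (raid5_compute_sector) is not reproduced here; this sketch only shows the window arithmetic, with invented numbers.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t sector_nr      = 2048; /* per-disk progress in the new layout */
	uint64_t chunk_sectors  = 128;  /* conf->chunk_size / 512 */
	int      new_data_disks = 5;    /* conf->raid_disks - max_degraded */

	/* logical array sectors covered by the chunk being built at sector_nr */
	uint64_t first_logical = sector_nr * new_data_disks;
	uint64_t last_logical  = (sector_nr + chunk_sectors) * new_data_disks - 1;

	printf("read logical sectors %llu..%llu via the old layout\n",
	       (unsigned long long)first_logical,
	       (unsigned long long)last_logical);
	return 0;
}
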
@@ -3348,35 +3426,44 @@ static int run(mddev_t *mddev)
*/
sector_t here_new, here_old;
int old_disks;
+ int max_degraded = (mddev->level == 5 ? 1 : 2);
if (mddev->new_level != mddev->level ||
mddev->new_layout != mddev->layout ||
mddev->new_chunk != mddev->chunk_size) {
- printk(KERN_ERR "raid5: %s: unsupported reshape required - aborting.\n",
+ printk(KERN_ERR "raid5: %s: unsupported reshape "
+ "required - aborting.\n",
mdname(mddev));
return -EINVAL;
}
if (mddev->delta_disks <= 0) {
- printk(KERN_ERR "raid5: %s: unsupported reshape (reduce disks) required - aborting.\n",
+ printk(KERN_ERR "raid5: %s: unsupported reshape "
+ "(reduce disks) required - aborting.\n",
mdname(mddev));
return -EINVAL;
}
old_disks = mddev->raid_disks - mddev->delta_disks;
/* reshape_position must be on a new-stripe boundary, and one
- * further up in new geometry must map after here in old geometry.
+ * further up in new geometry must map after here in old
+ * geometry.
*/
here_new = mddev->reshape_position;
- if (sector_div(here_new, (mddev->chunk_size>>9)*(mddev->raid_disks-1))) {
- printk(KERN_ERR "raid5: reshape_position not on a stripe boundary\n");
+ if (sector_div(here_new, (mddev->chunk_size>>9)*
+ (mddev->raid_disks - max_degraded))) {
+ printk(KERN_ERR "raid5: reshape_position not "
+ "on a stripe boundary\n");
return -EINVAL;
}
/* here_new is the stripe we will write to */
here_old = mddev->reshape_position;
- sector_div(here_old, (mddev->chunk_size>>9)*(old_disks-1));
- /* here_old is the first stripe that we might need to read from */
+ sector_div(here_old, (mddev->chunk_size>>9)*
+ (old_disks-max_degraded));
+ /* here_old is the first stripe that we might need to read
+ * from */
if (here_new >= here_old) {
/* Reading from the same stripe as writing to - bad */
- printk(KERN_ERR "raid5: reshape_position too early for auto-recovery - aborting.\n");
+ printk(KERN_ERR "raid5: reshape_position too early for "
+ "auto-recovery - aborting.\n");
return -EINVAL;
}
printk(KERN_INFO "raid5: reshape will continue\n");
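
The restart check above requires reshape_position to sit on a new-geometry stripe boundary and to map strictly before the first old-geometry stripe that may still need to be read. A user-space sketch with invented values; sector_div() is again plain division with a remainder test.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t reshape_position = 1280000; /* sectors, example value */
	uint64_t chunk_sectors    = 128;     /* mddev->chunk_size >> 9 */
	int max_degraded          = 2;       /* 1 for raid5, 2 for raid6 */
	int raid_disks            = 7;       /* new member count */
	int old_disks             = 5;       /* raid_disks - delta_disks */

	uint64_t new_stripe_secs = chunk_sectors * (raid_disks - max_degraded);
	uint64_t old_stripe_secs = chunk_sectors * (old_disks  - max_degraded);

	if (reshape_position % new_stripe_secs) {
		printf("reshape_position not on a stripe boundary\n");
		return 1;
	}
	uint64_t here_new = reshape_position / new_stripe_secs;
	uint64_t here_old = reshape_position / old_stripe_secs;

	if (here_new >= here_old) {
		printf("reshape_position too early for auto-recovery\n");
		return 1;
	}
	printf("stripe %llu (new) < stripe %llu (old): reshape will continue\n",
	       (unsigned long long)here_new, (unsigned long long)here_old);
	return 0;
}
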
@@ -3555,12 +3642,15 @@ static int run(mddev_t *mddev)
}
/* Ok, everything is just fine now */
- sysfs_create_group(&mddev->kobj, &raid5_attrs_group);
+ if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
+ printk(KERN_WARNING
+ "raid5: failed to create sysfs attributes for %s\n",
+ mdname(mddev));
mddev->queue->unplug_fn = raid5_unplug_device;
mddev->queue->issue_flush_fn = raid5_issue_flush;
- mddev->queue->backing_dev_info.congested_fn = raid5_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
+ mddev->queue->backing_dev_info.congested_fn = raid5_congested;
mddev->array_size = mddev->size * (conf->previous_raid_disks -
conf->max_degraded);
@@ -3591,6 +3681,7 @@ static int stop(mddev_t *mddev)
mddev->thread = NULL;
shrink_stripes(conf);
kfree(conf->stripe_hashtbl);
+ mddev->queue->backing_dev_info.congested_fn = NULL;
blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
kfree(conf->disks);
@@ -3814,6 +3905,8 @@ static int raid5_check_reshape(mddev_t *mddev)
if (err)
return err;
+ if (mddev->degraded > conf->max_degraded)
+ return -EINVAL;
/* looks like we might be able to manage this */
return 0;
}
@@ -3827,8 +3920,7 @@ static int raid5_start_reshape(mddev_t *mddev)
int added_devices = 0;
unsigned long flags;
- if (mddev->degraded ||
- test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return -EBUSY;
ITERATE_RDEV(mddev, rdev, rtmp)
@@ -3836,7 +3928,7 @@ static int raid5_start_reshape(mddev_t *mddev)
!test_bit(Faulty, &rdev->flags))
spares++;
- if (spares < mddev->delta_disks-1)
+ if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
/* Not enough devices even to make a degraded array
* of that size
*/
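
The relaxed spare test above lets a reshape start whenever the grown array could exist at least in degraded form: the usable extra devices only have to cover all but max_degraded of the disks being added. A tiny sketch of that inequality with illustrative numbers.

#include <stdio.h>

int main(void)
{
	int spares       = 2;  /* usable spare devices found */
	int degraded     = 1;  /* missing or failed members of the current array */
	int delta_disks  = 2;  /* disks being added by the reshape */
	int max_degraded = 2;  /* 1 for raid5, 2 for raid6 */

	if (spares - degraded < delta_disks - max_degraded)
		printf("not enough devices, even for a degraded array\n");
	else
		printf("reshape can proceed (possibly degraded)\n");
	return 0;
}
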
@@ -3862,7 +3954,12 @@ static int raid5_start_reshape(mddev_t *mddev)
added_devices++;
rdev->recovery_offset = 0;
sprintf(nm, "rd%d", rdev->raid_disk);
- sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
+ if (sysfs_create_link(&mddev->kobj,
+ &rdev->kobj, nm))
+ printk(KERN_WARNING
+ "raid5: failed to create "
+ " link %s for %s\n",
+ nm, mdname(mddev));
} else
break;
}
@@ -3899,7 +3996,8 @@ static void end_reshape(raid5_conf_t *conf)
struct block_device *bdev;
if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
- conf->mddev->array_size = conf->mddev->size * (conf->raid_disks-1);
+ conf->mddev->array_size = conf->mddev->size *
+ (conf->raid_disks - conf->max_degraded);
set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
conf->mddev->changed = 1;
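
The exported capacity after the reshape counts only data disks, which is what the raid_disks - max_degraded factor above expresses. A trivial worked example, assuming the 1KiB units that mddev->size and array_size appear to use in this kernel (hence the << 1 to obtain sectors for set_capacity()).

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t per_device   = 488386496; /* usable space per member, ~465GiB, in KiB */
	int      raid_disks   = 7;         /* member count after the grow */
	int      max_degraded = 2;         /* raid6: two parity blocks per stripe */

	uint64_t array_size = per_device * (raid_disks - max_degraded);
	printf("array_size = %llu KiB (%llu sectors)\n",
	       (unsigned long long)array_size,
	       (unsigned long long)(array_size << 1));
	return 0;
}
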
@@ -3972,6 +4070,10 @@ static struct mdk_personality raid6_personality =
.spare_active = raid5_spare_active,
.sync_request = sync_request,
.resize = raid5_resize,
+#ifdef CONFIG_MD_RAID5_RESHAPE
+ .check_reshape = raid5_check_reshape,
+ .start_reshape = raid5_start_reshape,
+#endif
.quiesce = raid5_quiesce,
};
static struct mdk_personality raid5_personality =
@@ -4011,6 +4113,10 @@ static struct mdk_personality raid4_personality =
.spare_active = raid5_spare_active,
.sync_request = sync_request,
.resize = raid5_resize,
+#ifdef CONFIG_MD_RAID5_RESHAPE
+ .check_reshape = raid5_check_reshape,
+ .start_reshape = raid5_start_reshape,
+#endif
.quiesce = raid5_quiesce,
};