diff options
Diffstat (limited to 'drivers/md')
| -rw-r--r-- | drivers/md/Kconfig | 4 | ||||
| -rw-r--r-- | drivers/md/bitmap.c | 47 | ||||
| -rw-r--r-- | drivers/md/bitmap.h | 2 | ||||
| -rw-r--r-- | drivers/md/faulty.c | 9 | ||||
| -rw-r--r-- | drivers/md/linear.c | 36 | ||||
| -rw-r--r-- | drivers/md/md.c | 543 | ||||
| -rw-r--r-- | drivers/md/md.h | 16 | ||||
| -rw-r--r-- | drivers/md/multipath.c | 13 | ||||
| -rw-r--r-- | drivers/md/raid0.c | 251 | ||||
| -rw-r--r-- | drivers/md/raid0.h | 3 | ||||
| -rw-r--r-- | drivers/md/raid1.c | 114 | ||||
| -rw-r--r-- | drivers/md/raid10.c | 300 | ||||
| -rw-r--r-- | drivers/md/raid10.h | 12 | ||||
| -rw-r--r-- | drivers/md/raid5.c | 233 | 
14 files changed, 994 insertions, 589 deletions
| diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index acb3a4e404ff..4a6feac8c94a 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -100,8 +100,8 @@ config MD_RAID1  	  If unsure, say Y.  config MD_RAID10 -	tristate "RAID-10 (mirrored striping) mode (EXPERIMENTAL)" -	depends on BLK_DEV_MD && EXPERIMENTAL +	tristate "RAID-10 (mirrored striping) mode" +	depends on BLK_DEV_MD  	---help---  	  RAID-10 provides a combination of striping (RAID-0) and  	  mirroring (RAID-1) with easier configuration and more flexible diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 26ac8aad0b19..1742435ce3ae 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -505,7 +505,7 @@ void bitmap_update_sb(struct bitmap *bitmap)  		return;  	}  	spin_unlock_irqrestore(&bitmap->lock, flags); -	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); +	sb = kmap_atomic(bitmap->sb_page, KM_USER0);  	sb->events = cpu_to_le64(bitmap->mddev->events);  	if (bitmap->mddev->events < bitmap->events_cleared) {  		/* rocking back to read-only */ @@ -526,7 +526,7 @@ void bitmap_print_sb(struct bitmap *bitmap)  	if (!bitmap || !bitmap->sb_page)  		return; -	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); +	sb = kmap_atomic(bitmap->sb_page, KM_USER0);  	printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap));  	printk(KERN_DEBUG "         magic: %08x\n", le32_to_cpu(sb->magic));  	printk(KERN_DEBUG "       version: %d\n", le32_to_cpu(sb->version)); @@ -575,7 +575,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)  		return err;  	} -	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); +	sb = kmap_atomic(bitmap->sb_page, KM_USER0);  	chunksize = le32_to_cpu(sb->chunksize);  	daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; @@ -661,7 +661,7 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,  		return 0;  	}  	spin_unlock_irqrestore(&bitmap->lock, flags); -	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); +	sb = kmap_atomic(bitmap->sb_page, KM_USER0);  	old = le32_to_cpu(sb->state) & bits;  	switch (op) {  		case MASK_SET: sb->state |= cpu_to_le32(bits); @@ -1292,9 +1292,14 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect  	if (!bitmap) return 0;  	if (behind) { +		int bw;  		atomic_inc(&bitmap->behind_writes); +		bw = atomic_read(&bitmap->behind_writes); +		if (bw > bitmap->behind_writes_used) +			bitmap->behind_writes_used = bw; +  		PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n", -		  atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); +		       bw, bitmap->max_write_behind);  	}  	while (sectors) { @@ -1351,7 +1356,8 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto  {  	if (!bitmap) return;  	if (behind) { -		atomic_dec(&bitmap->behind_writes); +		if (atomic_dec_and_test(&bitmap->behind_writes)) +			wake_up(&bitmap->behind_wait);  		PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n",  		  atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);  	} @@ -1675,12 +1681,13 @@ int bitmap_create(mddev_t *mddev)  	atomic_set(&bitmap->pending_writes, 0);  	init_waitqueue_head(&bitmap->write_wait);  	init_waitqueue_head(&bitmap->overflow_wait); +	init_waitqueue_head(&bitmap->behind_wait);  	bitmap->mddev = mddev; -	bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap"); +	bm = sysfs_get_dirent(mddev->kobj.sd, NULL, "bitmap");  	if (bm) { -		bitmap->sysfs_can_clear = sysfs_get_dirent(bm, "can_clear"); +		bitmap->sysfs_can_clear = sysfs_get_dirent(bm, NULL, "can_clear");  		sysfs_put(bm);  	} else  		bitmap->sysfs_can_clear = NULL; @@ -1692,7 +1699,7 @@ int bitmap_create(mddev_t *mddev)  		 * and bypass the page cache, we must sync the file  		 * first.  		 */ -		vfs_fsync(file, file->f_dentry, 1); +		vfs_fsync(file, 1);  	}  	/* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */  	if (!mddev->bitmap_info.external) @@ -2006,6 +2013,27 @@ static ssize_t can_clear_store(mddev_t *mddev, const char *buf, size_t len)  static struct md_sysfs_entry bitmap_can_clear =  __ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store); +static ssize_t +behind_writes_used_show(mddev_t *mddev, char *page) +{ +	if (mddev->bitmap == NULL) +		return sprintf(page, "0\n"); +	return sprintf(page, "%lu\n", +		       mddev->bitmap->behind_writes_used); +} + +static ssize_t +behind_writes_used_reset(mddev_t *mddev, const char *buf, size_t len) +{ +	if (mddev->bitmap) +		mddev->bitmap->behind_writes_used = 0; +	return len; +} + +static struct md_sysfs_entry max_backlog_used = +__ATTR(max_backlog_used, S_IRUGO | S_IWUSR, +       behind_writes_used_show, behind_writes_used_reset); +  static struct attribute *md_bitmap_attrs[] = {  	&bitmap_location.attr,  	&bitmap_timeout.attr, @@ -2013,6 +2041,7 @@ static struct attribute *md_bitmap_attrs[] = {  	&bitmap_chunksize.attr,  	&bitmap_metadata.attr,  	&bitmap_can_clear.attr, +	&max_backlog_used.attr,  	NULL  };  struct attribute_group md_bitmap_group = { diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index cb821d76d1b4..3797dea4723a 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -227,6 +227,7 @@ struct bitmap {  	int allclean;  	atomic_t behind_writes; +	unsigned long behind_writes_used; /* highest actual value at runtime */  	/*  	 * the bitmap daemon - periodically wakes up and sweeps the bitmap @@ -239,6 +240,7 @@ struct bitmap {  	atomic_t pending_writes; /* pending writes to the bitmap file */  	wait_queue_head_t write_wait;  	wait_queue_head_t overflow_wait; +	wait_queue_head_t behind_wait;  	struct sysfs_dirent *sysfs_can_clear;  }; diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 8e3850b98cca..1a8987884614 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -169,10 +169,9 @@ static void add_sector(conf_t *conf, sector_t start, int mode)  		conf->nfaults = n+1;  } -static int make_request(struct request_queue *q, struct bio *bio) +static int make_request(mddev_t *mddev, struct bio *bio)  { -	mddev_t *mddev = q->queuedata; -	conf_t *conf = (conf_t*)mddev->private; +	conf_t *conf = mddev->private;  	int failit = 0;  	if (bio_data_dir(bio) == WRITE) { @@ -225,7 +224,7 @@ static int make_request(struct request_queue *q, struct bio *bio)  static void status(struct seq_file *seq, mddev_t *mddev)  { -	conf_t *conf = (conf_t*)mddev->private; +	conf_t *conf = mddev->private;  	int n;  	if ((n=atomic_read(&conf->counters[WriteTransient])) != 0) @@ -328,7 +327,7 @@ static int run(mddev_t *mddev)  static int stop(mddev_t *mddev)  { -	conf_t *conf = (conf_t *)mddev->private; +	conf_t *conf = mddev->private;  	kfree(conf);  	mddev->private = NULL; diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 09437e958235..7e0e057db9a7 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -159,7 +159,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)  		sector_t sectors;  		if (j < 0 || j >= raid_disks || disk->rdev) { -			printk("linear: disk numbering problem. Aborting!\n"); +			printk(KERN_ERR "md/linear:%s: disk numbering problem. Aborting!\n", +			       mdname(mddev));  			goto out;  		} @@ -187,7 +188,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)  	}  	if (cnt != raid_disks) { -		printk("linear: not enough drives present. Aborting!\n"); +		printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n", +		       mdname(mddev));  		goto out;  	} @@ -282,29 +284,21 @@ static int linear_stop (mddev_t *mddev)  	rcu_barrier();  	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/  	kfree(conf); +	mddev->private = NULL;  	return 0;  } -static int linear_make_request (struct request_queue *q, struct bio *bio) +static int linear_make_request (mddev_t *mddev, struct bio *bio)  { -	const int rw = bio_data_dir(bio); -	mddev_t *mddev = q->queuedata;  	dev_info_t *tmp_dev;  	sector_t start_sector; -	int cpu;  	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {  		md_barrier_request(mddev, bio);  		return 0;  	} -	cpu = part_stat_lock(); -	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); -	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], -		      bio_sectors(bio)); -	part_stat_unlock(); -  	rcu_read_lock();  	tmp_dev = which_dev(mddev, bio->bi_sector);  	start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; @@ -314,12 +308,14 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)  		     || (bio->bi_sector < start_sector))) {  		char b[BDEVNAME_SIZE]; -		printk("linear_make_request: Sector %llu out of bounds on " -			"dev %s: %llu sectors, offset %llu\n", -			(unsigned long long)bio->bi_sector, -			bdevname(tmp_dev->rdev->bdev, b), -			(unsigned long long)tmp_dev->rdev->sectors, -			(unsigned long long)start_sector); +		printk(KERN_ERR +		       "md/linear:%s: make_request: Sector %llu out of bounds on " +		       "dev %s: %llu sectors, offset %llu\n", +		       mdname(mddev), +		       (unsigned long long)bio->bi_sector, +		       bdevname(tmp_dev->rdev->bdev, b), +		       (unsigned long long)tmp_dev->rdev->sectors, +		       (unsigned long long)start_sector);  		rcu_read_unlock();  		bio_io_error(bio);  		return 0; @@ -336,9 +332,9 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)  		bp = bio_split(bio, end_sector - bio->bi_sector); -		if (linear_make_request(q, &bp->bio1)) +		if (linear_make_request(mddev, &bp->bio1))  			generic_make_request(&bp->bio1); -		if (linear_make_request(q, &bp->bio2)) +		if (linear_make_request(mddev, &bp->bio2))  			generic_make_request(&bp->bio2);  		bio_pair_release(bp);  		return 0; diff --git a/drivers/md/md.c b/drivers/md/md.c index cefd63daff31..46b3a044eadf 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -215,8 +215,11 @@ static DEFINE_SPINLOCK(all_mddevs_lock);   */  static int md_make_request(struct request_queue *q, struct bio *bio)  { +	const int rw = bio_data_dir(bio);  	mddev_t *mddev = q->queuedata;  	int rv; +	int cpu; +  	if (mddev == NULL || mddev->pers == NULL) {  		bio_io_error(bio);  		return 0; @@ -237,13 +240,27 @@ static int md_make_request(struct request_queue *q, struct bio *bio)  	}  	atomic_inc(&mddev->active_io);  	rcu_read_unlock(); -	rv = mddev->pers->make_request(q, bio); + +	rv = mddev->pers->make_request(mddev, bio); + +	cpu = part_stat_lock(); +	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); +	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], +		      bio_sectors(bio)); +	part_stat_unlock(); +  	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)  		wake_up(&mddev->sb_wait);  	return rv;  } +/* mddev_suspend makes sure no new requests are submitted + * to the device, and that any requests that have been submitted + * are completely handled. + * Once ->stop is called and completes, the module will be completely + * unused. + */  static void mddev_suspend(mddev_t *mddev)  {  	BUG_ON(mddev->suspended); @@ -251,13 +268,6 @@ static void mddev_suspend(mddev_t *mddev)  	synchronize_rcu();  	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);  	mddev->pers->quiesce(mddev, 1); -	md_unregister_thread(mddev->thread); -	mddev->thread = NULL; -	/* we now know that no code is executing in the personality module, -	 * except possibly the tail end of a ->bi_end_io function, but that -	 * is certain to complete before the module has a chance to get -	 * unloaded -	 */  }  static void mddev_resume(mddev_t *mddev) @@ -344,7 +354,7 @@ static void md_submit_barrier(struct work_struct *ws)  		bio_endio(bio, 0);  	else {  		bio->bi_rw &= ~(1<<BIO_RW_BARRIER); -		if (mddev->pers->make_request(mddev->queue, bio)) +		if (mddev->pers->make_request(mddev, bio))  			generic_make_request(bio);  		mddev->barrier = POST_REQUEST_BARRIER;  		submit_barriers(mddev); @@ -406,6 +416,27 @@ static void mddev_put(mddev_t *mddev)  	spin_unlock(&all_mddevs_lock);  } +static void mddev_init(mddev_t *mddev) +{ +	mutex_init(&mddev->open_mutex); +	mutex_init(&mddev->reconfig_mutex); +	mutex_init(&mddev->bitmap_info.mutex); +	INIT_LIST_HEAD(&mddev->disks); +	INIT_LIST_HEAD(&mddev->all_mddevs); +	init_timer(&mddev->safemode_timer); +	atomic_set(&mddev->active, 1); +	atomic_set(&mddev->openers, 0); +	atomic_set(&mddev->active_io, 0); +	spin_lock_init(&mddev->write_lock); +	atomic_set(&mddev->flush_pending, 0); +	init_waitqueue_head(&mddev->sb_wait); +	init_waitqueue_head(&mddev->recovery_wait); +	mddev->reshape_position = MaxSector; +	mddev->resync_min = 0; +	mddev->resync_max = MaxSector; +	mddev->level = LEVEL_NONE; +} +  static mddev_t * mddev_find(dev_t unit)  {  	mddev_t *mddev, *new = NULL; @@ -472,23 +503,7 @@ static mddev_t * mddev_find(dev_t unit)  	else  		new->md_minor = MINOR(unit) >> MdpMinorShift; -	mutex_init(&new->open_mutex); -	mutex_init(&new->reconfig_mutex); -	mutex_init(&new->bitmap_info.mutex); -	INIT_LIST_HEAD(&new->disks); -	INIT_LIST_HEAD(&new->all_mddevs); -	init_timer(&new->safemode_timer); -	atomic_set(&new->active, 1); -	atomic_set(&new->openers, 0); -	atomic_set(&new->active_io, 0); -	spin_lock_init(&new->write_lock); -	atomic_set(&new->flush_pending, 0); -	init_waitqueue_head(&new->sb_wait); -	init_waitqueue_head(&new->recovery_wait); -	new->reshape_position = MaxSector; -	new->resync_min = 0; -	new->resync_max = MaxSector; -	new->level = LEVEL_NONE; +	mddev_init(new);  	goto retry;  } @@ -508,9 +523,36 @@ static inline int mddev_trylock(mddev_t * mddev)  	return mutex_trylock(&mddev->reconfig_mutex);  } -static inline void mddev_unlock(mddev_t * mddev) +static struct attribute_group md_redundancy_group; + +static void mddev_unlock(mddev_t * mddev)  { -	mutex_unlock(&mddev->reconfig_mutex); +	if (mddev->to_remove) { +		/* These cannot be removed under reconfig_mutex as +		 * an access to the files will try to take reconfig_mutex +		 * while holding the file unremovable, which leads to +		 * a deadlock. +		 * So hold open_mutex instead - we are allowed to take +		 * it while holding reconfig_mutex, and md_run can +		 * use it to wait for the remove to complete. +		 */ +		struct attribute_group *to_remove = mddev->to_remove; +		mddev->to_remove = NULL; +		mutex_lock(&mddev->open_mutex); +		mutex_unlock(&mddev->reconfig_mutex); + +		if (to_remove != &md_redundancy_group) +			sysfs_remove_group(&mddev->kobj, to_remove); +		if (mddev->pers == NULL || +		    mddev->pers->sync_request == NULL) { +			sysfs_remove_group(&mddev->kobj, &md_redundancy_group); +			if (mddev->sysfs_action) +				sysfs_put(mddev->sysfs_action); +			mddev->sysfs_action = NULL; +		} +		mutex_unlock(&mddev->open_mutex); +	} else +		mutex_unlock(&mddev->reconfig_mutex);  	md_wakeup_thread(mddev->thread);  } @@ -1029,10 +1071,13 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)  				mddev->bitmap_info.default_offset;  	} else if (mddev->pers == NULL) { -		/* Insist on good event counter while assembling */ +		/* Insist on good event counter while assembling, except +		 * for spares (which don't need an event count) */  		++ev1; -		if (ev1 < mddev->events)  -			return -EINVAL; +		if (sb->disks[rdev->desc_nr].state & ( +			    (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) +			if (ev1 < mddev->events)  +				return -EINVAL;  	} else if (mddev->bitmap) {  		/* if adding to array with a bitmap, then we can accept an  		 * older device ... but not too old. @@ -1428,10 +1473,14 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)  		}  	} else if (mddev->pers == NULL) { -		/* Insist of good event counter while assembling */ +		/* Insist of good event counter while assembling, except for +		 * spares (which don't need an event count) */  		++ev1; -		if (ev1 < mddev->events) -			return -EINVAL; +		if (rdev->desc_nr >= 0 && +		    rdev->desc_nr < le32_to_cpu(sb->max_dev) && +		    le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe) +			if (ev1 < mddev->events) +				return -EINVAL;  	} else if (mddev->bitmap) {  		/* If adding to array with a bitmap, then we can accept an  		 * older device, but not too old. @@ -1766,7 +1815,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)  		kobject_del(&rdev->kobj);  		goto fail;  	} -	rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, "state"); +	rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, NULL, "state");  	list_add_rcu(&rdev->same_set, &mddev->disks);  	bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); @@ -2047,7 +2096,6 @@ static void sync_sbs(mddev_t * mddev, int nospares)  		if (rdev->sb_events == mddev->events ||  		    (nospares &&  		     rdev->raid_disk < 0 && -		     (rdev->sb_events&1)==0 &&  		     rdev->sb_events+1 == mddev->events)) {  			/* Don't update this superblock */  			rdev->sb_loaded = 2; @@ -2100,28 +2148,14 @@ repeat:  	 * and 'events' is odd, we can roll back to the previous clean state */  	if (nospares  	    && (mddev->in_sync && mddev->recovery_cp == MaxSector) -	    && (mddev->events & 1) -	    && mddev->events != 1) +	    && mddev->can_decrease_events +	    && mddev->events != 1) {  		mddev->events--; -	else { +		mddev->can_decrease_events = 0; +	} else {  		/* otherwise we have to go forward and ... */  		mddev->events ++; -		if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ -			/* .. if the array isn't clean, an 'even' event must also go -			 * to spares. */ -			if ((mddev->events&1)==0) { -				nospares = 0; -				sync_req = 2; /* force a second update to get the -					       * even/odd in sync */ -			} -		} else { -			/* otherwise an 'odd' event must go to spares */ -			if ((mddev->events&1)) { -				nospares = 0; -				sync_req = 2; /* force a second update to get the -					       * even/odd in sync */ -			} -		} +		mddev->can_decrease_events = nospares;  	}  	if (!mddev->events) { @@ -2365,6 +2399,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)  			return err;  		sprintf(nm, "rd%d", rdev->raid_disk);  		sysfs_remove_link(&rdev->mddev->kobj, nm); +		rdev->raid_disk = -1;  		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);  		md_wakeup_thread(rdev->mddev->thread);  	} else if (rdev->mddev->pers) { @@ -2780,8 +2815,9 @@ static void analyze_sbs(mddev_t * mddev)  	i = 0;  	rdev_for_each(rdev, tmp, mddev) { -		if (rdev->desc_nr >= mddev->max_disks || -		    i > mddev->max_disks) { +		if (mddev->max_disks && +		    (rdev->desc_nr >= mddev->max_disks || +		     i > mddev->max_disks)) {  			printk(KERN_WARNING  			       "md: %s: %s: only %d devices permitted\n",  			       mdname(mddev), bdevname(rdev->bdev, b), @@ -2897,9 +2933,10 @@ level_show(mddev_t *mddev, char *page)  static ssize_t  level_store(mddev_t *mddev, const char *buf, size_t len)  { -	char level[16]; +	char clevel[16];  	ssize_t rv = len;  	struct mdk_personality *pers; +	long level;  	void *priv;  	mdk_rdev_t *rdev; @@ -2932,19 +2969,22 @@ level_store(mddev_t *mddev, const char *buf, size_t len)  	}  	/* Now find the new personality */ -	if (len == 0 || len >= sizeof(level)) +	if (len == 0 || len >= sizeof(clevel))  		return -EINVAL; -	strncpy(level, buf, len); -	if (level[len-1] == '\n') +	strncpy(clevel, buf, len); +	if (clevel[len-1] == '\n')  		len--; -	level[len] = 0; +	clevel[len] = 0; +	if (strict_strtol(clevel, 10, &level)) +		level = LEVEL_NONE; -	request_module("md-%s", level); +	if (request_module("md-%s", clevel) != 0) +		request_module("md-level-%s", clevel);  	spin_lock(&pers_lock); -	pers = find_pers(LEVEL_NONE, level); +	pers = find_pers(level, clevel);  	if (!pers || !try_module_get(pers->owner)) {  		spin_unlock(&pers_lock); -		printk(KERN_WARNING "md: personality %s not loaded\n", level); +		printk(KERN_WARNING "md: personality %s not loaded\n", clevel);  		return -EINVAL;  	}  	spin_unlock(&pers_lock); @@ -2957,7 +2997,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)  	if (!pers->takeover) {  		module_put(pers->owner);  		printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", -		       mdname(mddev), level); +		       mdname(mddev), clevel);  		return -EINVAL;  	} @@ -2973,13 +3013,44 @@ level_store(mddev_t *mddev, const char *buf, size_t len)  		mddev->delta_disks = 0;  		module_put(pers->owner);  		printk(KERN_WARNING "md: %s: %s would not accept array\n", -		       mdname(mddev), level); +		       mdname(mddev), clevel);  		return PTR_ERR(priv);  	}  	/* Looks like we have a winner */  	mddev_suspend(mddev);  	mddev->pers->stop(mddev); +	 +	if (mddev->pers->sync_request == NULL && +	    pers->sync_request != NULL) { +		/* need to add the md_redundancy_group */ +		if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) +			printk(KERN_WARNING +			       "md: cannot register extra attributes for %s\n", +			       mdname(mddev)); +		mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action"); +	}		 +	if (mddev->pers->sync_request != NULL && +	    pers->sync_request == NULL) { +		/* need to remove the md_redundancy_group */ +		if (mddev->to_remove == NULL) +			mddev->to_remove = &md_redundancy_group; +	} + +	if (mddev->pers->sync_request == NULL && +	    mddev->external) { +		/* We are converting from a no-redundancy array +		 * to a redundancy array and metadata is managed +		 * externally so we need to be sure that writes +		 * won't block due to a need to transition +		 *      clean->dirty +		 * until external management is started. +		 */ +		mddev->in_sync = 0; +		mddev->safemode_delay = 0; +		mddev->safemode = 0; +	} +  	module_put(mddev->pers->owner);  	/* Invalidate devices that are now superfluous */  	list_for_each_entry(rdev, &mddev->disks, same_set) @@ -2994,11 +3065,20 @@ level_store(mddev_t *mddev, const char *buf, size_t len)  	mddev->layout = mddev->new_layout;  	mddev->chunk_sectors = mddev->new_chunk_sectors;  	mddev->delta_disks = 0; +	if (mddev->pers->sync_request == NULL) { +		/* this is now an array without redundancy, so +		 * it must always be in_sync +		 */ +		mddev->in_sync = 1; +		del_timer_sync(&mddev->safemode_timer); +	}  	pers->run(mddev);  	mddev_resume(mddev);  	set_bit(MD_CHANGE_DEVS, &mddev->flags);  	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);  	md_wakeup_thread(mddev->thread); +	sysfs_notify(&mddev->kobj, NULL, "level"); +	md_new_event(mddev);  	return rv;  } @@ -3237,6 +3317,7 @@ array_state_show(mddev_t *mddev, char *page)  }  static int do_md_stop(mddev_t * mddev, int ro, int is_open); +static int md_set_readonly(mddev_t * mddev, int is_open);  static int do_md_run(mddev_t * mddev);  static int restart_array(mddev_t *mddev); @@ -3267,7 +3348,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)  		break; /* not supported yet */  	case readonly:  		if (mddev->pers) -			err = do_md_stop(mddev, 1, 0); +			err = md_set_readonly(mddev, 0);  		else {  			mddev->ro = 1;  			set_disk_ro(mddev->gendisk, 1); @@ -3277,7 +3358,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)  	case read_auto:  		if (mddev->pers) {  			if (mddev->ro == 0) -				err = do_md_stop(mddev, 1, 0); +				err = md_set_readonly(mddev, 0);  			else if (mddev->ro == 1)  				err = restart_array(mddev);  			if (err == 0) { @@ -4082,15 +4163,6 @@ static void mddev_delayed_delete(struct work_struct *ws)  {  	mddev_t *mddev = container_of(ws, mddev_t, del_work); -	if (mddev->private) { -		sysfs_remove_group(&mddev->kobj, &md_redundancy_group); -		if (mddev->private != (void*)1) -			sysfs_remove_group(&mddev->kobj, mddev->private); -		if (mddev->sysfs_action) -			sysfs_put(mddev->sysfs_action); -		mddev->sysfs_action = NULL; -		mddev->private = NULL; -	}  	sysfs_remove_group(&mddev->kobj, &md_bitmap_group);  	kobject_del(&mddev->kobj);  	kobject_put(&mddev->kobj); @@ -4189,7 +4261,7 @@ static int md_alloc(dev_t dev, char *name)  	mutex_unlock(&disks_mutex);  	if (!error) {  		kobject_uevent(&mddev->kobj, KOBJ_ADD); -		mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state"); +		mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, NULL, "array_state");  	}  	mddev_put(mddev);  	return error; @@ -4234,11 +4306,10 @@ static void md_safemode_timeout(unsigned long data)  static int start_dirty_degraded; -static int do_md_run(mddev_t * mddev) +static int md_run(mddev_t *mddev)  {  	int err;  	mdk_rdev_t *rdev; -	struct gendisk *disk;  	struct mdk_personality *pers;  	if (list_empty(&mddev->disks)) @@ -4248,6 +4319,13 @@ static int do_md_run(mddev_t * mddev)  	if (mddev->pers)  		return -EBUSY; +	/* These two calls synchronise us with the +	 * sysfs_remove_group calls in mddev_unlock, +	 * so they must have completed. +	 */ +	mutex_lock(&mddev->open_mutex); +	mutex_unlock(&mddev->open_mutex); +  	/*  	 * Analyze all RAID superblock(s)  	 */ @@ -4296,8 +4374,6 @@ static int do_md_run(mddev_t * mddev)  		sysfs_notify_dirent(rdev->sysfs_state);  	} -	disk = mddev->gendisk; -  	spin_lock(&pers_lock);  	pers = find_pers(mddev->level, mddev->clevel);  	if (!pers || !try_module_get(pers->owner)) { @@ -4398,7 +4474,7 @@ static int do_md_run(mddev_t * mddev)  			printk(KERN_WARNING  			       "md: cannot register extra attributes for %s\n",  			       mdname(mddev)); -		mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); +		mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action");  	} else if (mddev->ro == 2) /* auto-readonly not meaningful */  		mddev->ro = 0; @@ -4425,22 +4501,32 @@ static int do_md_run(mddev_t * mddev)  	if (mddev->flags)  		md_update_sb(mddev, 0); -	set_capacity(disk, mddev->array_sectors); -  	md_wakeup_thread(mddev->thread);  	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ -	revalidate_disk(mddev->gendisk); -	mddev->changed = 1;  	md_new_event(mddev);  	sysfs_notify_dirent(mddev->sysfs_state);  	if (mddev->sysfs_action)  		sysfs_notify_dirent(mddev->sysfs_action);  	sysfs_notify(&mddev->kobj, NULL, "degraded"); -	kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);  	return 0;  } +static int do_md_run(mddev_t *mddev) +{ +	int err; + +	err = md_run(mddev); +	if (err) +		goto out; + +	set_capacity(mddev->gendisk, mddev->array_sectors); +	revalidate_disk(mddev->gendisk); +	kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); +out: +	return err; +} +  static int restart_array(mddev_t *mddev)  {  	struct gendisk *disk = mddev->gendisk; @@ -4491,9 +4577,110 @@ void restore_bitmap_write_access(struct file *file)  	spin_unlock(&inode->i_lock);  } +static void md_clean(mddev_t *mddev) +{ +	mddev->array_sectors = 0; +	mddev->external_size = 0; +	mddev->dev_sectors = 0; +	mddev->raid_disks = 0; +	mddev->recovery_cp = 0; +	mddev->resync_min = 0; +	mddev->resync_max = MaxSector; +	mddev->reshape_position = MaxSector; +	mddev->external = 0; +	mddev->persistent = 0; +	mddev->level = LEVEL_NONE; +	mddev->clevel[0] = 0; +	mddev->flags = 0; +	mddev->ro = 0; +	mddev->metadata_type[0] = 0; +	mddev->chunk_sectors = 0; +	mddev->ctime = mddev->utime = 0; +	mddev->layout = 0; +	mddev->max_disks = 0; +	mddev->events = 0; +	mddev->can_decrease_events = 0; +	mddev->delta_disks = 0; +	mddev->new_level = LEVEL_NONE; +	mddev->new_layout = 0; +	mddev->new_chunk_sectors = 0; +	mddev->curr_resync = 0; +	mddev->resync_mismatches = 0; +	mddev->suspend_lo = mddev->suspend_hi = 0; +	mddev->sync_speed_min = mddev->sync_speed_max = 0; +	mddev->recovery = 0; +	mddev->in_sync = 0; +	mddev->degraded = 0; +	mddev->barriers_work = 0; +	mddev->safemode = 0; +	mddev->bitmap_info.offset = 0; +	mddev->bitmap_info.default_offset = 0; +	mddev->bitmap_info.chunksize = 0; +	mddev->bitmap_info.daemon_sleep = 0; +	mddev->bitmap_info.max_write_behind = 0; +} + +static void md_stop_writes(mddev_t *mddev) +{ +	if (mddev->sync_thread) { +		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); +		set_bit(MD_RECOVERY_INTR, &mddev->recovery); +		md_unregister_thread(mddev->sync_thread); +		mddev->sync_thread = NULL; +	} + +	del_timer_sync(&mddev->safemode_timer); + +	bitmap_flush(mddev); +	md_super_wait(mddev); + +	if (!mddev->in_sync || mddev->flags) { +		/* mark array as shutdown cleanly */ +		mddev->in_sync = 1; +		md_update_sb(mddev, 1); +	} +} + +static void md_stop(mddev_t *mddev) +{ +	md_stop_writes(mddev); + +	mddev->pers->stop(mddev); +	if (mddev->pers->sync_request && mddev->to_remove == NULL) +		mddev->to_remove = &md_redundancy_group; +	module_put(mddev->pers->owner); +	mddev->pers = NULL; +	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); +} + +static int md_set_readonly(mddev_t *mddev, int is_open) +{ +	int err = 0; +	mutex_lock(&mddev->open_mutex); +	if (atomic_read(&mddev->openers) > is_open) { +		printk("md: %s still in use.\n",mdname(mddev)); +		err = -EBUSY; +		goto out; +	} +	if (mddev->pers) { +		md_stop_writes(mddev); + +		err  = -ENXIO; +		if (mddev->ro==1) +			goto out; +		mddev->ro = 1; +		set_disk_ro(mddev->gendisk, 1); +		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); +		sysfs_notify_dirent(mddev->sysfs_state); +		err = 0;	 +	} +out: +	mutex_unlock(&mddev->open_mutex); +	return err; +} +  /* mode:   *   0 - completely stop and dis-assemble array - *   1 - switch to readonly   *   2 - stop but do not disassemble array   */  static int do_md_stop(mddev_t * mddev, int mode, int is_open) @@ -4508,64 +4695,32 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)  		err = -EBUSY;  	} else if (mddev->pers) { -		if (mddev->sync_thread) { -			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); -			set_bit(MD_RECOVERY_INTR, &mddev->recovery); -			md_unregister_thread(mddev->sync_thread); -			mddev->sync_thread = NULL; -		} - -		del_timer_sync(&mddev->safemode_timer); +		if (mddev->ro) +			set_disk_ro(disk, 0); -		switch(mode) { -		case 1: /* readonly */ -			err  = -ENXIO; -			if (mddev->ro==1) -				goto out; -			mddev->ro = 1; -			break; -		case 0: /* disassemble */ -		case 2: /* stop */ -			bitmap_flush(mddev); -			md_super_wait(mddev); -			if (mddev->ro) -				set_disk_ro(disk, 0); +		md_stop(mddev); +		mddev->queue->merge_bvec_fn = NULL; +		mddev->queue->unplug_fn = NULL; +		mddev->queue->backing_dev_info.congested_fn = NULL; -			mddev->pers->stop(mddev); -			mddev->queue->merge_bvec_fn = NULL; -			mddev->queue->unplug_fn = NULL; -			mddev->queue->backing_dev_info.congested_fn = NULL; -			module_put(mddev->pers->owner); -			if (mddev->pers->sync_request && mddev->private == NULL) -				mddev->private = (void*)1; -			mddev->pers = NULL; -			/* tell userspace to handle 'inactive' */ -			sysfs_notify_dirent(mddev->sysfs_state); +		/* tell userspace to handle 'inactive' */ +		sysfs_notify_dirent(mddev->sysfs_state); -			list_for_each_entry(rdev, &mddev->disks, same_set) -				if (rdev->raid_disk >= 0) { -					char nm[20]; -					sprintf(nm, "rd%d", rdev->raid_disk); -					sysfs_remove_link(&mddev->kobj, nm); -				} +		list_for_each_entry(rdev, &mddev->disks, same_set) +			if (rdev->raid_disk >= 0) { +				char nm[20]; +				sprintf(nm, "rd%d", rdev->raid_disk); +				sysfs_remove_link(&mddev->kobj, nm); +			} -			set_capacity(disk, 0); -			mddev->changed = 1; +		set_capacity(disk, 0); +		revalidate_disk(disk); -			if (mddev->ro) -				mddev->ro = 0; -		} -		if (!mddev->in_sync || mddev->flags) { -			/* mark array as shutdown cleanly */ -			mddev->in_sync = 1; -			md_update_sb(mddev, 1); -		} -		if (mode == 1) -			set_disk_ro(disk, 1); -		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); +		if (mddev->ro) +			mddev->ro = 0; +		  		err = 0;  	} -out:  	mutex_unlock(&mddev->open_mutex);  	if (err)  		return err; @@ -4586,52 +4741,12 @@ out:  		export_array(mddev); -		mddev->array_sectors = 0; -		mddev->external_size = 0; -		mddev->dev_sectors = 0; -		mddev->raid_disks = 0; -		mddev->recovery_cp = 0; -		mddev->resync_min = 0; -		mddev->resync_max = MaxSector; -		mddev->reshape_position = MaxSector; -		mddev->external = 0; -		mddev->persistent = 0; -		mddev->level = LEVEL_NONE; -		mddev->clevel[0] = 0; -		mddev->flags = 0; -		mddev->ro = 0; -		mddev->metadata_type[0] = 0; -		mddev->chunk_sectors = 0; -		mddev->ctime = mddev->utime = 0; -		mddev->layout = 0; -		mddev->max_disks = 0; -		mddev->events = 0; -		mddev->delta_disks = 0; -		mddev->new_level = LEVEL_NONE; -		mddev->new_layout = 0; -		mddev->new_chunk_sectors = 0; -		mddev->curr_resync = 0; -		mddev->resync_mismatches = 0; -		mddev->suspend_lo = mddev->suspend_hi = 0; -		mddev->sync_speed_min = mddev->sync_speed_max = 0; -		mddev->recovery = 0; -		mddev->in_sync = 0; -		mddev->changed = 0; -		mddev->degraded = 0; -		mddev->barriers_work = 0; -		mddev->safemode = 0; -		mddev->bitmap_info.offset = 0; -		mddev->bitmap_info.default_offset = 0; -		mddev->bitmap_info.chunksize = 0; -		mddev->bitmap_info.daemon_sleep = 0; -		mddev->bitmap_info.max_write_behind = 0; +		md_clean(mddev);  		kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);  		if (mddev->hold_active == UNTIL_STOP)  			mddev->hold_active = 0; -	} else if (mddev->pers) -		printk(KERN_INFO "md: %s switched to read-only mode.\n", -			mdname(mddev)); +	}  	err = 0;  	blk_integrity_unregister(disk);  	md_new_event(mddev); @@ -5349,7 +5464,7 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks)  	if (mddev->pers->check_reshape == NULL)  		return -EINVAL;  	if (raid_disks <= 0 || -	    raid_disks >= mddev->max_disks) +	    (mddev->max_disks && raid_disks >= mddev->max_disks))  		return -EINVAL;  	if (mddev->sync_thread || mddev->reshape_position != MaxSector)  		return -EBUSY; @@ -5486,7 +5601,7 @@ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)  	geo->heads = 2;  	geo->sectors = 4; -	geo->cylinders = get_capacity(mddev->gendisk) / 8; +	geo->cylinders = mddev->array_sectors / 8;  	return 0;  } @@ -5496,6 +5611,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,  	int err = 0;  	void __user *argp = (void __user *)arg;  	mddev_t *mddev = NULL; +	int ro;  	if (!capable(CAP_SYS_ADMIN))  		return -EACCES; @@ -5628,9 +5744,37 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,  			goto done_unlock;  		case STOP_ARRAY_RO: -			err = do_md_stop(mddev, 1, 1); +			err = md_set_readonly(mddev, 1);  			goto done_unlock; +		case BLKROSET: +			if (get_user(ro, (int __user *)(arg))) { +				err = -EFAULT; +				goto done_unlock; +			} +			err = -EINVAL; + +			/* if the bdev is going readonly the value of mddev->ro +			 * does not matter, no writes are coming +			 */ +			if (ro) +				goto done_unlock; + +			/* are we are already prepared for writes? */ +			if (mddev->ro != 1) +				goto done_unlock; + +			/* transitioning to readauto need only happen for +			 * arrays that call md_write_start +			 */ +			if (mddev->pers) { +				err = restart_array(mddev); +				if (err == 0) { +					mddev->ro = 2; +					set_disk_ro(mddev->gendisk, 0); +				} +			} +			goto done_unlock;  	}  	/* @@ -5751,7 +5895,6 @@ static int md_open(struct block_device *bdev, fmode_t mode)  	atomic_inc(&mddev->openers);  	mutex_unlock(&mddev->open_mutex); -	check_disk_change(bdev);   out:  	return err;  } @@ -5766,21 +5909,6 @@ static int md_release(struct gendisk *disk, fmode_t mode)  	return 0;  } - -static int md_media_changed(struct gendisk *disk) -{ -	mddev_t *mddev = disk->private_data; - -	return mddev->changed; -} - -static int md_revalidate(struct gendisk *disk) -{ -	mddev_t *mddev = disk->private_data; - -	mddev->changed = 0; -	return 0; -}  static const struct block_device_operations md_fops =  {  	.owner		= THIS_MODULE, @@ -5791,8 +5919,6 @@ static const struct block_device_operations md_fops =  	.compat_ioctl	= md_compat_ioctl,  #endif  	.getgeo		= md_getgeo, -	.media_changed	= md_media_changed, -	.revalidate_disk= md_revalidate,  };  static int md_thread(void * arg) @@ -5906,7 +6032,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)  	mddev->pers->error_handler(mddev,rdev);  	if (mddev->degraded)  		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); -	set_bit(StateChanged, &rdev->flags); +	sysfs_notify_dirent(rdev->sysfs_state);  	set_bit(MD_RECOVERY_INTR, &mddev->recovery);  	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);  	md_wakeup_thread(mddev->thread); @@ -6898,11 +7024,6 @@ void md_check_recovery(mddev_t *mddev)  		if (mddev->flags)  			md_update_sb(mddev, 0); -		list_for_each_entry(rdev, &mddev->disks, same_set) -			if (test_and_clear_bit(StateChanged, &rdev->flags)) -				sysfs_notify_dirent(rdev->sysfs_state); - -  		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&  		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {  			/* resync/recovery still happening */ @@ -7039,7 +7160,7 @@ static int md_notify_reboot(struct notifier_block *this,  				 * appears to still be in use.  Hence  				 * the '100'.  				 */ -				do_md_stop(mddev, 1, 100); +				md_set_readonly(mddev, 100);  				mddev_unlock(mddev);  			}  		/* diff --git a/drivers/md/md.h b/drivers/md/md.h index 8e4c75c00d46..7ab5ea155452 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -74,9 +74,6 @@ struct mdk_rdev_s  #define Blocked		8		/* An error occured on an externally  					 * managed array, don't allow writes  					 * until it is cleared */ -#define StateChanged	9		/* Faulty or Blocked has changed during -					 * interrupt, so it needs to be -					 * notified by the thread */  	wait_queue_head_t blocked_wait;  	int desc_nr;			/* descriptor index in the superblock */ @@ -153,6 +150,12 @@ struct mddev_s  	int				external_size; /* size managed  							* externally */  	__u64				events; +	/* If the last 'event' was simply a clean->dirty transition, and +	 * we didn't write it to the spares, then it is safe and simple +	 * to just decrement the event count on a dirty->clean transition. +	 * So we record that possibility here. +	 */ +	int				can_decrease_events;  	char				uuid[16]; @@ -240,7 +243,6 @@ struct mddev_s  	atomic_t			active;		/* general refcount */  	atomic_t			openers;	/* number of active opens */ -	int				changed;	/* true if we might need to reread partition info */  	int				degraded;	/* whether md should consider  							 * adding a spare  							 */ @@ -279,9 +281,6 @@ struct mddev_s  	atomic_t			writes_pending;   	struct request_queue		*queue;	/* for plugging ... */ -	atomic_t                        write_behind; /* outstanding async IO */ -	unsigned int                    max_write_behind; /* 0 = sync */ -  	struct bitmap                   *bitmap; /* the bitmap for the device */  	struct {  		struct file		*file; /* the bitmap file */ @@ -305,6 +304,7 @@ struct mddev_s  	atomic_t 			max_corr_read_errors; /* max read retries */  	struct list_head		all_mddevs; +	struct attribute_group		*to_remove;  	/* Generic barrier handling.  	 * If there is a pending barrier request, all other  	 * writes are blocked while the devices are flushed. @@ -336,7 +336,7 @@ struct mdk_personality  	int level;  	struct list_head list;  	struct module *owner; -	int (*make_request)(struct request_queue *q, struct bio *bio); +	int (*make_request)(mddev_t *mddev, struct bio *bio);  	int (*run)(mddev_t *mddev);  	int (*stop)(mddev_t *mddev);  	void (*status)(struct seq_file *seq, mddev_t *mddev); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 789bf535d29c..410fb60699ac 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -85,7 +85,7 @@ static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)  static void multipath_end_request(struct bio *bio, int error)  {  	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); -	struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private); +	struct multipath_bh *mp_bh = bio->bi_private;  	multipath_conf_t *conf = mp_bh->mddev->private;  	mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev; @@ -136,14 +136,11 @@ static void multipath_unplug(struct request_queue *q)  } -static int multipath_make_request (struct request_queue *q, struct bio * bio) +static int multipath_make_request(mddev_t *mddev, struct bio * bio)  { -	mddev_t *mddev = q->queuedata;  	multipath_conf_t *conf = mddev->private;  	struct multipath_bh * mp_bh;  	struct multipath_info *multipath; -	const int rw = bio_data_dir(bio); -	int cpu;  	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {  		md_barrier_request(mddev, bio); @@ -155,12 +152,6 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio)  	mp_bh->master_bio = bio;  	mp_bh->mddev = mddev; -	cpu = part_stat_lock(); -	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); -	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], -		      bio_sectors(bio)); -	part_stat_unlock(); -  	mp_bh->path = multipath_map(conf);  	if (mp_bh->path < 0) {  		bio_endio(bio, -EIO); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c3bec024612e..e70f004c99e8 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -23,15 +23,17 @@  #include <linux/slab.h>  #include "md.h"  #include "raid0.h" +#include "raid5.h"  static void raid0_unplug(struct request_queue *q)  {  	mddev_t *mddev = q->queuedata;  	raid0_conf_t *conf = mddev->private;  	mdk_rdev_t **devlist = conf->devlist; +	int raid_disks = conf->strip_zone[0].nb_dev;  	int i; -	for (i=0; i<mddev->raid_disks; i++) { +	for (i=0; i < raid_disks; i++) {  		struct request_queue *r_queue = bdev_get_queue(devlist[i]->bdev);  		blk_unplug(r_queue); @@ -43,12 +45,13 @@ static int raid0_congested(void *data, int bits)  	mddev_t *mddev = data;  	raid0_conf_t *conf = mddev->private;  	mdk_rdev_t **devlist = conf->devlist; +	int raid_disks = conf->strip_zone[0].nb_dev;  	int i, ret = 0;  	if (mddev_congested(mddev, bits))  		return 1; -	for (i = 0; i < mddev->raid_disks && !ret ; i++) { +	for (i = 0; i < raid_disks && !ret ; i++) {  		struct request_queue *q = bdev_get_queue(devlist[i]->bdev);  		ret |= bdi_congested(&q->backing_dev_info, bits); @@ -66,16 +69,17 @@ static void dump_zones(mddev_t *mddev)  	sector_t zone_start = 0;  	char b[BDEVNAME_SIZE];  	raid0_conf_t *conf = mddev->private; +	int raid_disks = conf->strip_zone[0].nb_dev;  	printk(KERN_INFO "******* %s configuration *********\n",  		mdname(mddev));  	h = 0;  	for (j = 0; j < conf->nr_strip_zones; j++) {  		printk(KERN_INFO "zone%d=[", j);  		for (k = 0; k < conf->strip_zone[j].nb_dev; k++) -			printk("%s/", -			bdevname(conf->devlist[j*mddev->raid_disks +			printk(KERN_CONT "%s/", +			bdevname(conf->devlist[j*raid_disks  						+ k]->bdev, b)); -		printk("]\n"); +		printk(KERN_CONT "]\n");  		zone_size  = conf->strip_zone[j].zone_end - zone_start;  		printk(KERN_INFO "        zone offset=%llukb " @@ -88,7 +92,7 @@ static void dump_zones(mddev_t *mddev)  	printk(KERN_INFO "**********************************\n\n");  } -static int create_strip_zones(mddev_t *mddev) +static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)  {  	int i, c, err;  	sector_t curr_zone_end, sectors; @@ -101,8 +105,9 @@ static int create_strip_zones(mddev_t *mddev)  	if (!conf)  		return -ENOMEM;  	list_for_each_entry(rdev1, &mddev->disks, same_set) { -		printk(KERN_INFO "raid0: looking at %s\n", -			bdevname(rdev1->bdev,b)); +		printk(KERN_INFO "md/raid0:%s: looking at %s\n", +		       mdname(mddev), +		       bdevname(rdev1->bdev, b));  		c = 0;  		/* round size to chunk_size */ @@ -111,14 +116,16 @@ static int create_strip_zones(mddev_t *mddev)  		rdev1->sectors = sectors * mddev->chunk_sectors;  		list_for_each_entry(rdev2, &mddev->disks, same_set) { -			printk(KERN_INFO "raid0:   comparing %s(%llu)", +			printk(KERN_INFO "md/raid0:%s:   comparing %s(%llu)", +			       mdname(mddev),  			       bdevname(rdev1->bdev,b),  			       (unsigned long long)rdev1->sectors); -			printk(KERN_INFO " with %s(%llu)\n", +			printk(KERN_CONT " with %s(%llu)\n",  			       bdevname(rdev2->bdev,b),  			       (unsigned long long)rdev2->sectors);  			if (rdev2 == rdev1) { -				printk(KERN_INFO "raid0:   END\n"); +				printk(KERN_INFO "md/raid0:%s:   END\n", +				       mdname(mddev));  				break;  			}  			if (rdev2->sectors == rdev1->sectors) { @@ -126,20 +133,24 @@ static int create_strip_zones(mddev_t *mddev)  				 * Not unique, don't count it as a new  				 * group  				 */ -				printk(KERN_INFO "raid0:   EQUAL\n"); +				printk(KERN_INFO "md/raid0:%s:   EQUAL\n", +				       mdname(mddev));  				c = 1;  				break;  			} -			printk(KERN_INFO "raid0:   NOT EQUAL\n"); +			printk(KERN_INFO "md/raid0:%s:   NOT EQUAL\n", +			       mdname(mddev));  		}  		if (!c) { -			printk(KERN_INFO "raid0:   ==> UNIQUE\n"); +			printk(KERN_INFO "md/raid0:%s:   ==> UNIQUE\n", +			       mdname(mddev));  			conf->nr_strip_zones++; -			printk(KERN_INFO "raid0: %d zones\n", -				conf->nr_strip_zones); +			printk(KERN_INFO "md/raid0:%s: %d zones\n", +			       mdname(mddev), conf->nr_strip_zones);  		}  	} -	printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones); +	printk(KERN_INFO "md/raid0:%s: FINAL %d zones\n", +	       mdname(mddev), conf->nr_strip_zones);  	err = -ENOMEM;  	conf->strip_zone = kzalloc(sizeof(struct strip_zone)*  				conf->nr_strip_zones, GFP_KERNEL); @@ -162,14 +173,18 @@ static int create_strip_zones(mddev_t *mddev)  	list_for_each_entry(rdev1, &mddev->disks, same_set) {  		int j = rdev1->raid_disk; +		if (mddev->level == 10) +			/* taking over a raid10-n2 array */ +			j /= 2; +  		if (j < 0 || j >= mddev->raid_disks) { -			printk(KERN_ERR "raid0: bad disk number %d - " -				"aborting!\n", j); +			printk(KERN_ERR "md/raid0:%s: bad disk number %d - " +			       "aborting!\n", mdname(mddev), j);  			goto abort;  		}  		if (dev[j]) { -			printk(KERN_ERR "raid0: multiple devices for %d - " -				"aborting!\n", j); +			printk(KERN_ERR "md/raid0:%s: multiple devices for %d - " +			       "aborting!\n", mdname(mddev), j);  			goto abort;  		}  		dev[j] = rdev1; @@ -191,8 +206,8 @@ static int create_strip_zones(mddev_t *mddev)  		cnt++;  	}  	if (cnt != mddev->raid_disks) { -		printk(KERN_ERR "raid0: too few disks (%d of %d) - " -			"aborting!\n", cnt, mddev->raid_disks); +		printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - " +		       "aborting!\n", mdname(mddev), cnt, mddev->raid_disks);  		goto abort;  	}  	zone->nb_dev = cnt; @@ -208,39 +223,44 @@ static int create_strip_zones(mddev_t *mddev)  		zone = conf->strip_zone + i;  		dev = conf->devlist + i * mddev->raid_disks; -		printk(KERN_INFO "raid0: zone %d\n", i); +		printk(KERN_INFO "md/raid0:%s: zone %d\n", +		       mdname(mddev), i);  		zone->dev_start = smallest->sectors;  		smallest = NULL;  		c = 0;  		for (j=0; j<cnt; j++) {  			rdev = conf->devlist[j]; -			printk(KERN_INFO "raid0: checking %s ...", -				bdevname(rdev->bdev, b)); +			printk(KERN_INFO "md/raid0:%s: checking %s ...", +			       mdname(mddev), +			       bdevname(rdev->bdev, b));  			if (rdev->sectors <= zone->dev_start) { -				printk(KERN_INFO " nope.\n"); +				printk(KERN_CONT " nope.\n");  				continue;  			} -			printk(KERN_INFO " contained as device %d\n", c); +			printk(KERN_CONT " contained as device %d\n", c);  			dev[c] = rdev;  			c++;  			if (!smallest || rdev->sectors < smallest->sectors) {  				smallest = rdev; -				printk(KERN_INFO "  (%llu) is smallest!.\n", -					(unsigned long long)rdev->sectors); +				printk(KERN_INFO "md/raid0:%s:  (%llu) is smallest!.\n", +				       mdname(mddev), +				       (unsigned long long)rdev->sectors);  			}  		}  		zone->nb_dev = c;  		sectors = (smallest->sectors - zone->dev_start) * c; -		printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n", -			zone->nb_dev, (unsigned long long)sectors); +		printk(KERN_INFO "md/raid0:%s: zone->nb_dev: %d, sectors: %llu\n", +		       mdname(mddev), +		       zone->nb_dev, (unsigned long long)sectors);  		curr_zone_end += sectors;  		zone->zone_end = curr_zone_end; -		printk(KERN_INFO "raid0: current zone start: %llu\n", -			(unsigned long long)smallest->sectors); +		printk(KERN_INFO "md/raid0:%s: current zone start: %llu\n", +		       mdname(mddev), +		       (unsigned long long)smallest->sectors);  	}  	mddev->queue->unplug_fn = raid0_unplug;  	mddev->queue->backing_dev_info.congested_fn = raid0_congested; @@ -251,7 +271,7 @@ static int create_strip_zones(mddev_t *mddev)  	 * chunk size is a multiple of that sector size  	 */  	if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) { -		printk(KERN_ERR "%s chunk_size of %d not valid\n", +		printk(KERN_ERR "md/raid0:%s: chunk_size of %d not valid\n",  		       mdname(mddev),  		       mddev->chunk_sectors << 9);  		goto abort; @@ -261,14 +281,15 @@ static int create_strip_zones(mddev_t *mddev)  	blk_queue_io_opt(mddev->queue,  			 (mddev->chunk_sectors << 9) * mddev->raid_disks); -	printk(KERN_INFO "raid0: done.\n"); -	mddev->private = conf; +	printk(KERN_INFO "md/raid0:%s: done.\n", mdname(mddev)); +	*private_conf = conf; +  	return 0;  abort:  	kfree(conf->strip_zone);  	kfree(conf->devlist);  	kfree(conf); -	mddev->private = NULL; +	*private_conf = NULL;  	return err;  } @@ -319,10 +340,12 @@ static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks)  static int raid0_run(mddev_t *mddev)  { +	raid0_conf_t *conf;  	int ret;  	if (mddev->chunk_sectors == 0) { -		printk(KERN_ERR "md/raid0: chunk size must be set.\n"); +		printk(KERN_ERR "md/raid0:%s: chunk size must be set.\n", +		       mdname(mddev));  		return -EINVAL;  	}  	if (md_check_no_bitmap(mddev)) @@ -330,15 +353,27 @@ static int raid0_run(mddev_t *mddev)  	blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);  	mddev->queue->queue_lock = &mddev->queue->__queue_lock; -	ret = create_strip_zones(mddev); -	if (ret < 0) -		return ret; +	/* if private is not null, we are here after takeover */ +	if (mddev->private == NULL) { +		ret = create_strip_zones(mddev, &conf); +		if (ret < 0) +			return ret; +		mddev->private = conf; +	} +	conf = mddev->private; +	if (conf->scale_raid_disks) { +		int i; +		for (i=0; i < conf->strip_zone[0].nb_dev; i++) +			conf->devlist[i]->raid_disk /= conf->scale_raid_disks; +		/* FIXME update sysfs rd links */ +	}  	/* calculate array device size */  	md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); -	printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", -		(unsigned long long)mddev->array_sectors); +	printk(KERN_INFO "md/raid0:%s: md_size is %llu sectors.\n", +	       mdname(mddev), +	       (unsigned long long)mddev->array_sectors);  	/* calculate the max read-ahead size.  	 * For read-ahead of large files to be effective, we need to  	 * readahead at least twice a whole stripe. i.e. number of devices @@ -402,6 +437,7 @@ static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone,  	unsigned int sect_in_chunk;  	sector_t chunk;  	raid0_conf_t *conf = mddev->private; +	int raid_disks = conf->strip_zone[0].nb_dev;  	unsigned int chunk_sects = mddev->chunk_sectors;  	if (is_power_of_2(chunk_sects)) { @@ -424,7 +460,7 @@ static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone,  	*	+ the position in the chunk  	*/  	*sector_offset = (chunk * chunk_sects) + sect_in_chunk; -	return conf->devlist[(zone - conf->strip_zone)*mddev->raid_disks +	return conf->devlist[(zone - conf->strip_zone)*raid_disks  			     + sector_div(sector, zone->nb_dev)];  } @@ -444,27 +480,18 @@ static inline int is_io_in_chunk_boundary(mddev_t *mddev,  	}  } -static int raid0_make_request(struct request_queue *q, struct bio *bio) +static int raid0_make_request(mddev_t *mddev, struct bio *bio)  { -	mddev_t *mddev = q->queuedata;  	unsigned int chunk_sects;  	sector_t sector_offset;  	struct strip_zone *zone;  	mdk_rdev_t *tmp_dev; -	const int rw = bio_data_dir(bio); -	int cpu;  	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {  		md_barrier_request(mddev, bio);  		return 0;  	} -	cpu = part_stat_lock(); -	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); -	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], -		      bio_sectors(bio)); -	part_stat_unlock(); -  	chunk_sects = mddev->chunk_sectors;  	if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) {  		sector_t sector = bio->bi_sector; @@ -482,9 +509,9 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio)  		else  			bp = bio_split(bio, chunk_sects -  				       sector_div(sector, chunk_sects)); -		if (raid0_make_request(q, &bp->bio1)) +		if (raid0_make_request(mddev, &bp->bio1))  			generic_make_request(&bp->bio1); -		if (raid0_make_request(q, &bp->bio2)) +		if (raid0_make_request(mddev, &bp->bio2))  			generic_make_request(&bp->bio2);  		bio_pair_release(bp); @@ -504,9 +531,10 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio)  	return 1;  bad_map: -	printk("raid0_make_request bug: can't convert block across chunks" -		" or bigger than %dk %llu %d\n", chunk_sects / 2, -		(unsigned long long)bio->bi_sector, bio->bi_size >> 10); +	printk("md/raid0:%s: make_request bug: can't convert block across chunks" +	       " or bigger than %dk %llu %d\n", +	       mdname(mddev), chunk_sects / 2, +	       (unsigned long long)bio->bi_sector, bio->bi_size >> 10);  	bio_io_error(bio);  	return 0; @@ -519,6 +547,7 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev)  	int j, k, h;  	char b[BDEVNAME_SIZE];  	raid0_conf_t *conf = mddev->private; +	int raid_disks = conf->strip_zone[0].nb_dev;  	sector_t zone_size;  	sector_t zone_start = 0; @@ -529,7 +558,7 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev)  		seq_printf(seq, "=[");  		for (k = 0; k < conf->strip_zone[j].nb_dev; k++)  			seq_printf(seq, "%s/", bdevname( -				conf->devlist[j*mddev->raid_disks + k] +				conf->devlist[j*raid_disks + k]  						->bdev, b));  		zone_size  = conf->strip_zone[j].zone_end - zone_start; @@ -544,6 +573,104 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev)  	return;  } +static void *raid0_takeover_raid5(mddev_t *mddev) +{ +	mdk_rdev_t *rdev; +	raid0_conf_t *priv_conf; + +	if (mddev->degraded != 1) { +		printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n", +		       mdname(mddev), +		       mddev->degraded); +		return ERR_PTR(-EINVAL); +	} + +	list_for_each_entry(rdev, &mddev->disks, same_set) { +		/* check slot number for a disk */ +		if (rdev->raid_disk == mddev->raid_disks-1) { +			printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n", +			       mdname(mddev)); +			return ERR_PTR(-EINVAL); +		} +	} + +	/* Set new parameters */ +	mddev->new_level = 0; +	mddev->new_chunk_sectors = mddev->chunk_sectors; +	mddev->raid_disks--; +	mddev->delta_disks = -1; +	/* make sure it will be not marked as dirty */ +	mddev->recovery_cp = MaxSector; + +	create_strip_zones(mddev, &priv_conf); +	return priv_conf; +} + +static void *raid0_takeover_raid10(mddev_t *mddev) +{ +	raid0_conf_t *priv_conf; + +	/* Check layout: +	 *  - far_copies must be 1 +	 *  - near_copies must be 2 +	 *  - disks number must be even +	 *  - all mirrors must be already degraded +	 */ +	if (mddev->layout != ((1 << 8) + 2)) { +		printk(KERN_ERR "md/raid0:%s:: Raid0 cannot takover layout: 0x%x\n", +		       mdname(mddev), +		       mddev->layout); +		return ERR_PTR(-EINVAL); +	} +	if (mddev->raid_disks & 1) { +		printk(KERN_ERR "md/raid0:%s: Raid0 cannot takover Raid10 with odd disk number.\n", +		       mdname(mddev)); +		return ERR_PTR(-EINVAL); +	} +	if (mddev->degraded != (mddev->raid_disks>>1)) { +		printk(KERN_ERR "md/raid0:%s: All mirrors must be already degraded!\n", +		       mdname(mddev)); +		return ERR_PTR(-EINVAL); +	} + +	/* Set new parameters */ +	mddev->new_level = 0; +	mddev->new_chunk_sectors = mddev->chunk_sectors; +	mddev->delta_disks = - mddev->raid_disks / 2; +	mddev->raid_disks += mddev->delta_disks; +	mddev->degraded = 0; +	/* make sure it will be not marked as dirty */ +	mddev->recovery_cp = MaxSector; + +	create_strip_zones(mddev, &priv_conf); +	priv_conf->scale_raid_disks = 2; +	return priv_conf; +} + +static void *raid0_takeover(mddev_t *mddev) +{ +	/* raid0 can take over: +	 *  raid5 - providing it is Raid4 layout and one disk is faulty +	 *  raid10 - assuming we have all necessary active disks +	 */ +	if (mddev->level == 5) { +		if (mddev->layout == ALGORITHM_PARITY_N) +			return raid0_takeover_raid5(mddev); + +		printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n", +		       mdname(mddev), ALGORITHM_PARITY_N); +	} + +	if (mddev->level == 10) +		return raid0_takeover_raid10(mddev); + +	return ERR_PTR(-EINVAL); +} + +static void raid0_quiesce(mddev_t *mddev, int state) +{ +} +  static struct mdk_personality raid0_personality=  {  	.name		= "raid0", @@ -554,6 +681,8 @@ static struct mdk_personality raid0_personality=  	.stop		= raid0_stop,  	.status		= raid0_status,  	.size		= raid0_size, +	.takeover	= raid0_takeover, +	.quiesce	= raid0_quiesce,  };  static int __init raid0_init (void) diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index 91f8e876ee64..d724e664ca4d 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h @@ -13,6 +13,9 @@ struct raid0_private_data  	struct strip_zone *strip_zone;  	mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */  	int nr_strip_zones; +	int scale_raid_disks; /* divide rdev->raid_disks by this in run() +			       * to handle conversion from raid10 +			       */  };  typedef struct raid0_private_data raid0_conf_t; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e59b10e66edb..a948da8012de 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -263,7 +263,7 @@ static inline void update_head_pos(int disk, r1bio_t *r1_bio)  static void raid1_end_read_request(struct bio *bio, int error)  {  	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); -	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); +	r1bio_t *r1_bio = bio->bi_private;  	int mirror;  	conf_t *conf = r1_bio->mddev->private; @@ -297,7 +297,8 @@ static void raid1_end_read_request(struct bio *bio, int error)  		 */  		char b[BDEVNAME_SIZE];  		if (printk_ratelimit()) -			printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n", +			printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n", +			       mdname(conf->mddev),  			       bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);  		reschedule_retry(r1_bio);  	} @@ -308,7 +309,7 @@ static void raid1_end_read_request(struct bio *bio, int error)  static void raid1_end_write_request(struct bio *bio, int error)  {  	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); -	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); +	r1bio_t *r1_bio = bio->bi_private;  	int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);  	conf_t *conf = r1_bio->mddev->private;  	struct bio *to_put = NULL; @@ -418,7 +419,7 @@ static void raid1_end_write_request(struct bio *bio, int error)   */  static int read_balance(conf_t *conf, r1bio_t *r1_bio)  { -	const unsigned long this_sector = r1_bio->sector; +	const sector_t this_sector = r1_bio->sector;  	int new_disk = conf->last_used, disk = new_disk;  	int wonly_disk = -1;  	const int sectors = r1_bio->sectors; @@ -434,7 +435,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)   retry:  	if (conf->mddev->recovery_cp < MaxSector &&  	    (this_sector + sectors >= conf->next_resync)) { -		/* Choose the first operation device, for consistancy */ +		/* Choose the first operational device, for consistancy */  		new_disk = 0;  		for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); @@ -774,9 +775,8 @@ do_sync_io:  	return NULL;  } -static int make_request(struct request_queue *q, struct bio * bio) +static int make_request(mddev_t *mddev, struct bio * bio)  { -	mddev_t *mddev = q->queuedata;  	conf_t *conf = mddev->private;  	mirror_info_t *mirror;  	r1bio_t *r1_bio; @@ -788,7 +788,6 @@ static int make_request(struct request_queue *q, struct bio * bio)  	struct page **behind_pages = NULL;  	const int rw = bio_data_dir(bio);  	const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); -	int cpu;  	bool do_barriers;  	mdk_rdev_t *blocked_rdev; @@ -834,12 +833,6 @@ static int make_request(struct request_queue *q, struct bio * bio)  	bitmap = mddev->bitmap; -	cpu = part_stat_lock(); -	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); -	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], -		      bio_sectors(bio)); -	part_stat_unlock(); -  	/*  	 * make_request() can abort the operation when READA is being  	 * used and no empty request is available. @@ -866,6 +859,15 @@ static int make_request(struct request_queue *q, struct bio * bio)  		}  		mirror = conf->mirrors + rdisk; +		if (test_bit(WriteMostly, &mirror->rdev->flags) && +		    bitmap) { +			/* Reading from a write-mostly device must +			 * take care not to over-take any writes +			 * that are 'behind' +			 */ +			wait_event(bitmap->behind_wait, +				   atomic_read(&bitmap->behind_writes) == 0); +		}  		r1_bio->read_disk = rdisk;  		read_bio = bio_clone(bio, GFP_NOIO); @@ -912,9 +914,10 @@ static int make_request(struct request_queue *q, struct bio * bio)  			if (test_bit(Faulty, &rdev->flags)) {  				rdev_dec_pending(rdev, mddev);  				r1_bio->bios[i] = NULL; -			} else +			} else {  				r1_bio->bios[i] = bio; -			targets++; +				targets++; +			}  		} else  			r1_bio->bios[i] = NULL;  	} @@ -942,10 +945,14 @@ static int make_request(struct request_queue *q, struct bio * bio)  		set_bit(R1BIO_Degraded, &r1_bio->state);  	} -	/* do behind I/O ? */ +	/* do behind I/O ? +	 * Not if there are too many, or cannot allocate memory, +	 * or a reader on WriteMostly is waiting for behind writes  +	 * to flush */  	if (bitmap &&  	    (atomic_read(&bitmap->behind_writes)  	     < mddev->bitmap_info.max_write_behind) && +	    !waitqueue_active(&bitmap->behind_wait) &&  	    (behind_pages = alloc_behind_pages(bio)) != NULL)  		set_bit(R1BIO_BehindIO, &r1_bio->state); @@ -1070,21 +1077,22 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)  	} else  		set_bit(Faulty, &rdev->flags);  	set_bit(MD_CHANGE_DEVS, &mddev->flags); -	printk(KERN_ALERT "raid1: Disk failure on %s, disabling device.\n" -		"raid1: Operation continuing on %d devices.\n", -		bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); +	printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" +	       KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n", +	       mdname(mddev), bdevname(rdev->bdev, b), +	       mdname(mddev), conf->raid_disks - mddev->degraded);  }  static void print_conf(conf_t *conf)  {  	int i; -	printk("RAID1 conf printout:\n"); +	printk(KERN_DEBUG "RAID1 conf printout:\n");  	if (!conf) { -		printk("(!conf)\n"); +		printk(KERN_DEBUG "(!conf)\n");  		return;  	} -	printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, +	printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,  		conf->raid_disks);  	rcu_read_lock(); @@ -1092,7 +1100,7 @@ static void print_conf(conf_t *conf)  		char b[BDEVNAME_SIZE];  		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);  		if (rdev) -			printk(" disk %d, wo:%d, o:%d, dev:%s\n", +			printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",  			       i, !test_bit(In_sync, &rdev->flags),  			       !test_bit(Faulty, &rdev->flags),  			       bdevname(rdev->bdev,b)); @@ -1223,7 +1231,7 @@ abort:  static void end_sync_read(struct bio *bio, int error)  { -	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); +	r1bio_t *r1_bio = bio->bi_private;  	int i;  	for (i=r1_bio->mddev->raid_disks; i--; ) @@ -1246,7 +1254,7 @@ static void end_sync_read(struct bio *bio, int error)  static void end_sync_write(struct bio *bio, int error)  {  	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); -	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); +	r1bio_t *r1_bio = bio->bi_private;  	mddev_t *mddev = r1_bio->mddev;  	conf_t *conf = mddev->private;  	int i; @@ -1453,9 +1461,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)  				char b[BDEVNAME_SIZE];  				/* Cannot read from anywhere, array is toast */  				md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); -				printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error" +				printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"  				       " for block %llu\n", -				       bdevname(bio->bi_bdev,b), +				       mdname(mddev), +				       bdevname(bio->bi_bdev, b),  				       (unsigned long long)r1_bio->sector);  				md_done_sync(mddev, r1_bio->sectors, 0);  				put_buf(r1_bio); @@ -1577,7 +1586,7 @@ static void fix_read_error(conf_t *conf, int read_disk,  				else {  					atomic_add(s, &rdev->corrected_errors);  					printk(KERN_INFO -					       "raid1:%s: read error corrected " +					       "md/raid1:%s: read error corrected "  					       "(%d sectors at %llu on %s)\n",  					       mdname(mddev), s,  					       (unsigned long long)(sect + @@ -1682,8 +1691,9 @@ static void raid1d(mddev_t *mddev)  			bio = r1_bio->bios[r1_bio->read_disk];  			if ((disk=read_balance(conf, r1_bio)) == -1) { -				printk(KERN_ALERT "raid1: %s: unrecoverable I/O" +				printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O"  				       " read error for block %llu\n", +				       mdname(mddev),  				       bdevname(bio->bi_bdev,b),  				       (unsigned long long)r1_bio->sector);  				raid_end_bio_io(r1_bio); @@ -1697,10 +1707,11 @@ static void raid1d(mddev_t *mddev)  				r1_bio->bios[r1_bio->read_disk] = bio;  				rdev = conf->mirrors[disk].rdev;  				if (printk_ratelimit()) -					printk(KERN_ERR "raid1: %s: redirecting sector %llu to" -					       " another mirror\n", -					       bdevname(rdev->bdev,b), -					       (unsigned long long)r1_bio->sector); +					printk(KERN_ERR "md/raid1:%s: redirecting sector %llu to" +					       " other mirror: %s\n", +					       mdname(mddev), +					       (unsigned long long)r1_bio->sector, +					       bdevname(rdev->bdev,b));  				bio->bi_sector = r1_bio->sector + rdev->data_offset;  				bio->bi_bdev = rdev->bdev;  				bio->bi_end_io = raid1_end_read_request; @@ -1755,13 +1766,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i  	int still_degraded = 0;  	if (!conf->r1buf_pool) -	{ -/* -		printk("sync start - bitmap %p\n", mddev->bitmap); -*/  		if (init_resync(conf))  			return 0; -	}  	max_sector = mddev->dev_sectors;  	if (sector_nr >= max_sector) { @@ -2042,7 +2048,7 @@ static conf_t *setup_conf(mddev_t *mddev)  	err = -EIO;  	if (conf->last_used < 0) { -		printk(KERN_ERR "raid1: no operational mirrors for %s\n", +		printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",  		       mdname(mddev));  		goto abort;  	} @@ -2050,7 +2056,7 @@ static conf_t *setup_conf(mddev_t *mddev)  	conf->thread = md_register_thread(raid1d, mddev, NULL);  	if (!conf->thread) {  		printk(KERN_ERR -		       "raid1: couldn't allocate thread for %s\n", +		       "md/raid1:%s: couldn't allocate thread\n",  		       mdname(mddev));  		goto abort;  	} @@ -2076,12 +2082,12 @@ static int run(mddev_t *mddev)  	mdk_rdev_t *rdev;  	if (mddev->level != 1) { -		printk("raid1: %s: raid level not set to mirroring (%d)\n", +		printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",  		       mdname(mddev), mddev->level);  		return -EIO;  	}  	if (mddev->reshape_position != MaxSector) { -		printk("raid1: %s: reshape_position set but not supported\n", +		printk(KERN_ERR "md/raid1:%s: reshape_position set but not supported\n",  		       mdname(mddev));  		return -EIO;  	} @@ -2124,11 +2130,11 @@ static int run(mddev_t *mddev)  		mddev->recovery_cp = MaxSector;  	if (mddev->recovery_cp != MaxSector) -		printk(KERN_NOTICE "raid1: %s is not clean" +		printk(KERN_NOTICE "md/raid1:%s: not clean"  		       " -- starting background reconstruction\n",  		       mdname(mddev));  	printk(KERN_INFO  -		"raid1: raid set %s active with %d out of %d mirrors\n", +		"md/raid1:%s: active with %d out of %d mirrors\n",  		mdname(mddev), mddev->raid_disks - mddev->degraded,   		mddev->raid_disks); @@ -2152,15 +2158,14 @@ static int stop(mddev_t *mddev)  {  	conf_t *conf = mddev->private;  	struct bitmap *bitmap = mddev->bitmap; -	int behind_wait = 0;  	/* wait for behind writes to complete */ -	while (bitmap && atomic_read(&bitmap->behind_writes) > 0) { -		behind_wait++; -		printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait); -		set_current_state(TASK_UNINTERRUPTIBLE); -		schedule_timeout(HZ); /* wait a second */ +	if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { +		printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n", +		       mdname(mddev));  		/* need to kick something here to make sure I/O goes? */ +		wait_event(bitmap->behind_wait, +			   atomic_read(&bitmap->behind_writes) == 0);  	}  	raise_barrier(conf); @@ -2191,7 +2196,6 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)  	if (mddev->array_sectors > raid1_size(mddev, sectors, 0))  		return -EINVAL;  	set_capacity(mddev->gendisk, mddev->array_sectors); -	mddev->changed = 1;  	revalidate_disk(mddev->gendisk);  	if (sectors > mddev->dev_sectors &&  	    mddev->recovery_cp == MaxSector) { @@ -2286,9 +2290,9 @@ static int raid1_reshape(mddev_t *mddev)  			if (sysfs_create_link(&mddev->kobj,  					      &rdev->kobj, nm))  				printk(KERN_WARNING -				       "md/raid1: cannot register " -				       "%s for %s\n", -				       nm, mdname(mddev)); +				       "md/raid1:%s: cannot register " +				       "%s\n", +				       mdname(mddev), nm);  		}  		if (rdev)  			newmirrors[d2++].rdev = rdev; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index e2766d8251a1..03724992cdf2 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -24,6 +24,7 @@  #include <linux/seq_file.h>  #include "md.h"  #include "raid10.h" +#include "raid0.h"  #include "bitmap.h"  /* @@ -255,7 +256,7 @@ static inline void update_head_pos(int slot, r10bio_t *r10_bio)  static void raid10_end_read_request(struct bio *bio, int error)  {  	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); -	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); +	r10bio_t *r10_bio = bio->bi_private;  	int slot, dev;  	conf_t *conf = r10_bio->mddev->private; @@ -285,7 +286,8 @@ static void raid10_end_read_request(struct bio *bio, int error)  		 */  		char b[BDEVNAME_SIZE];  		if (printk_ratelimit()) -			printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n", +			printk(KERN_ERR "md/raid10:%s: %s: rescheduling sector %llu\n", +			       mdname(conf->mddev),  			       bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);  		reschedule_retry(r10_bio);  	} @@ -296,7 +298,7 @@ static void raid10_end_read_request(struct bio *bio, int error)  static void raid10_end_write_request(struct bio *bio, int error)  {  	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); -	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); +	r10bio_t *r10_bio = bio->bi_private;  	int slot, dev;  	conf_t *conf = r10_bio->mddev->private; @@ -494,7 +496,7 @@ static int raid10_mergeable_bvec(struct request_queue *q,   */  static int read_balance(conf_t *conf, r10bio_t *r10_bio)  { -	const unsigned long this_sector = r10_bio->sector; +	const sector_t this_sector = r10_bio->sector;  	int disk, slot, nslot;  	const int sectors = r10_bio->sectors;  	sector_t new_distance, current_distance; @@ -601,7 +603,7 @@ static void unplug_slaves(mddev_t *mddev)  	int i;  	rcu_read_lock(); -	for (i=0; i<mddev->raid_disks; i++) { +	for (i=0; i < conf->raid_disks; i++) {  		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);  		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {  			struct request_queue *r_queue = bdev_get_queue(rdev->bdev); @@ -635,7 +637,7 @@ static int raid10_congested(void *data, int bits)  	if (mddev_congested(mddev, bits))  		return 1;  	rcu_read_lock(); -	for (i = 0; i < mddev->raid_disks && ret == 0; i++) { +	for (i = 0; i < conf->raid_disks && ret == 0; i++) {  		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);  		if (rdev && !test_bit(Faulty, &rdev->flags)) {  			struct request_queue *q = bdev_get_queue(rdev->bdev); @@ -788,14 +790,12 @@ static void unfreeze_array(conf_t *conf)  	spin_unlock_irq(&conf->resync_lock);  } -static int make_request(struct request_queue *q, struct bio * bio) +static int make_request(mddev_t *mddev, struct bio * bio)  { -	mddev_t *mddev = q->queuedata;  	conf_t *conf = mddev->private;  	mirror_info_t *mirror;  	r10bio_t *r10_bio;  	struct bio *read_bio; -	int cpu;  	int i;  	int chunk_sects = conf->chunk_mask + 1;  	const int rw = bio_data_dir(bio); @@ -825,16 +825,16 @@ static int make_request(struct request_queue *q, struct bio * bio)  		 */  		bp = bio_split(bio,  			       chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); -		if (make_request(q, &bp->bio1)) +		if (make_request(mddev, &bp->bio1))  			generic_make_request(&bp->bio1); -		if (make_request(q, &bp->bio2)) +		if (make_request(mddev, &bp->bio2))  			generic_make_request(&bp->bio2);  		bio_pair_release(bp);  		return 0;  	bad_map: -		printk("raid10_make_request bug: can't convert block across chunks" -		       " or bigger than %dk %llu %d\n", chunk_sects/2, +		printk("md/raid10:%s: make_request bug: can't convert block across chunks" +		       " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,  		       (unsigned long long)bio->bi_sector, bio->bi_size >> 10);  		bio_io_error(bio); @@ -850,12 +850,6 @@ static int make_request(struct request_queue *q, struct bio * bio)  	 */  	wait_barrier(conf); -	cpu = part_stat_lock(); -	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); -	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], -		      bio_sectors(bio)); -	part_stat_unlock(); -  	r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);  	r10_bio->master_bio = bio; @@ -1039,9 +1033,10 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)  	}  	set_bit(Faulty, &rdev->flags);  	set_bit(MD_CHANGE_DEVS, &mddev->flags); -	printk(KERN_ALERT "raid10: Disk failure on %s, disabling device.\n" -		"raid10: Operation continuing on %d devices.\n", -		bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); +	printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n" +	       KERN_ALERT "md/raid10:%s: Operation continuing on %d devices.\n", +	       mdname(mddev), bdevname(rdev->bdev, b), +	       mdname(mddev), conf->raid_disks - mddev->degraded);  }  static void print_conf(conf_t *conf) @@ -1049,19 +1044,19 @@ static void print_conf(conf_t *conf)  	int i;  	mirror_info_t *tmp; -	printk("RAID10 conf printout:\n"); +	printk(KERN_DEBUG "RAID10 conf printout:\n");  	if (!conf) { -		printk("(!conf)\n"); +		printk(KERN_DEBUG "(!conf)\n");  		return;  	} -	printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, +	printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,  		conf->raid_disks);  	for (i = 0; i < conf->raid_disks; i++) {  		char b[BDEVNAME_SIZE];  		tmp = conf->mirrors + i;  		if (tmp->rdev) -			printk(" disk %d, wo:%d, o:%d, dev:%s\n", +			printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",  				i, !test_bit(In_sync, &tmp->rdev->flags),  			        !test_bit(Faulty, &tmp->rdev->flags),  				bdevname(tmp->rdev->bdev,b)); @@ -1132,7 +1127,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)  	int mirror;  	mirror_info_t *p;  	int first = 0; -	int last = mddev->raid_disks - 1; +	int last = conf->raid_disks - 1;  	if (mddev->recovery_cp < MaxSector)  		/* only hot-add to in-sync arrays, as recovery is @@ -1224,7 +1219,7 @@ abort:  static void end_sync_read(struct bio *bio, int error)  { -	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); +	r10bio_t *r10_bio = bio->bi_private;  	conf_t *conf = r10_bio->mddev->private;  	int i,d; @@ -1261,7 +1256,7 @@ static void end_sync_read(struct bio *bio, int error)  static void end_sync_write(struct bio *bio, int error)  {  	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); -	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); +	r10bio_t *r10_bio = bio->bi_private;  	mddev_t *mddev = r10_bio->mddev;  	conf_t *conf = mddev->private;  	int i,d; @@ -1510,13 +1505,14 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)  		if (cur_read_error_count > max_read_errors) {  			rcu_read_unlock();  			printk(KERN_NOTICE -			       "raid10: %s: Raid device exceeded " +			       "md/raid10:%s: %s: Raid device exceeded "  			       "read_error threshold "  			       "[cur %d:max %d]\n", +			       mdname(mddev),  			       b, cur_read_error_count, max_read_errors);  			printk(KERN_NOTICE -			       "raid10: %s: Failing raid " -			       "device\n", b); +			       "md/raid10:%s: %s: Failing raid " +			       "device\n", mdname(mddev), b);  			md_error(mddev, conf->mirrors[d].rdev);  			return;  		} @@ -1586,15 +1582,16 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)  				    == 0) {  					/* Well, this device is dead */  					printk(KERN_NOTICE -					       "raid10:%s: read correction " +					       "md/raid10:%s: read correction "  					       "write failed"  					       " (%d sectors at %llu on %s)\n",  					       mdname(mddev), s,  					       (unsigned long long)(sect+  					       rdev->data_offset),  					       bdevname(rdev->bdev, b)); -					printk(KERN_NOTICE "raid10:%s: failing " +					printk(KERN_NOTICE "md/raid10:%s: %s: failing "  					       "drive\n", +					       mdname(mddev),  					       bdevname(rdev->bdev, b));  					md_error(mddev, rdev);  				} @@ -1622,20 +1619,21 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)  						 READ) == 0) {  					/* Well, this device is dead */  					printk(KERN_NOTICE -					       "raid10:%s: unable to read back " +					       "md/raid10:%s: unable to read back "  					       "corrected sectors"  					       " (%d sectors at %llu on %s)\n",  					       mdname(mddev), s,  					       (unsigned long long)(sect+  						    rdev->data_offset),  					       bdevname(rdev->bdev, b)); -					printk(KERN_NOTICE "raid10:%s: failing drive\n", +					printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n", +					       mdname(mddev),  					       bdevname(rdev->bdev, b));  					md_error(mddev, rdev);  				} else {  					printk(KERN_INFO -					       "raid10:%s: read error corrected" +					       "md/raid10:%s: read error corrected"  					       " (%d sectors at %llu on %s)\n",  					       mdname(mddev), s,  					       (unsigned long long)(sect+ @@ -1710,8 +1708,9 @@ static void raid10d(mddev_t *mddev)  				mddev->ro ? IO_BLOCKED : NULL;  			mirror = read_balance(conf, r10_bio);  			if (mirror == -1) { -				printk(KERN_ALERT "raid10: %s: unrecoverable I/O" +				printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"  				       " read error for block %llu\n", +				       mdname(mddev),  				       bdevname(bio->bi_bdev,b),  				       (unsigned long long)r10_bio->sector);  				raid_end_bio_io(r10_bio); @@ -1721,8 +1720,9 @@ static void raid10d(mddev_t *mddev)  				bio_put(bio);  				rdev = conf->mirrors[mirror].rdev;  				if (printk_ratelimit()) -					printk(KERN_ERR "raid10: %s: redirecting sector %llu to" +					printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to"  					       " another mirror\n", +					       mdname(mddev),  					       bdevname(rdev->bdev,b),  					       (unsigned long long)r10_bio->sector);  				bio = bio_clone(r10_bio->master_bio, GFP_NOIO); @@ -1980,7 +1980,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i  					r10_bio = rb2;  					if (!test_and_set_bit(MD_RECOVERY_INTR,  							      &mddev->recovery)) -						printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n", +						printk(KERN_INFO "md/raid10:%s: insufficient " +						       "working devices for recovery.\n",  						       mdname(mddev));  					break;  				} @@ -2140,9 +2141,9 @@ raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)  	conf_t *conf = mddev->private;  	if (!raid_disks) -		raid_disks = mddev->raid_disks; +		raid_disks = conf->raid_disks;  	if (!sectors) -		sectors = mddev->dev_sectors; +		sectors = conf->dev_sectors;  	size = sectors >> conf->chunk_shift;  	sector_div(size, conf->far_copies); @@ -2152,62 +2153,61 @@ raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)  	return size << conf->chunk_shift;  } -static int run(mddev_t *mddev) + +static conf_t *setup_conf(mddev_t *mddev)  { -	conf_t *conf; -	int i, disk_idx, chunk_size; -	mirror_info_t *disk; -	mdk_rdev_t *rdev; +	conf_t *conf = NULL;  	int nc, fc, fo;  	sector_t stride, size; +	int err = -EINVAL;  	if (mddev->chunk_sectors < (PAGE_SIZE >> 9) ||  	    !is_power_of_2(mddev->chunk_sectors)) { -		printk(KERN_ERR "md/raid10: chunk size must be " -		       "at least PAGE_SIZE(%ld) and be a power of 2.\n", PAGE_SIZE); -		return -EINVAL; +		printk(KERN_ERR "md/raid10:%s: chunk size must be " +		       "at least PAGE_SIZE(%ld) and be a power of 2.\n", +		       mdname(mddev), PAGE_SIZE); +		goto out;  	}  	nc = mddev->layout & 255;  	fc = (mddev->layout >> 8) & 255;  	fo = mddev->layout & (1<<16); +  	if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||  	    (mddev->layout >> 17)) { -		printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n", +		printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",  		       mdname(mddev), mddev->layout);  		goto out;  	} -	/* -	 * copy the already verified devices into our private RAID10 -	 * bookkeeping area. [whatever we allocate in run(), -	 * should be freed in stop()] -	 */ + +	err = -ENOMEM;  	conf = kzalloc(sizeof(conf_t), GFP_KERNEL); -	mddev->private = conf; -	if (!conf) { -		printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", -			mdname(mddev)); +	if (!conf)  		goto out; -	} +  	conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, -				 GFP_KERNEL); -	if (!conf->mirrors) { -		printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", -		       mdname(mddev)); -		goto out_free_conf; -	} +				GFP_KERNEL); +	if (!conf->mirrors) +		goto out;  	conf->tmppage = alloc_page(GFP_KERNEL);  	if (!conf->tmppage) -		goto out_free_conf; +		goto out; +  	conf->raid_disks = mddev->raid_disks;  	conf->near_copies = nc;  	conf->far_copies = fc;  	conf->copies = nc*fc;  	conf->far_offset = fo; -	conf->chunk_mask = mddev->chunk_sectors - 1; -	conf->chunk_shift = ffz(~mddev->chunk_sectors); +	conf->chunk_mask = mddev->new_chunk_sectors - 1; +	conf->chunk_shift = ffz(~mddev->new_chunk_sectors); + +	conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, +					   r10bio_pool_free, conf); +	if (!conf->r10bio_pool) +		goto out; +  	size = mddev->dev_sectors >> conf->chunk_shift;  	sector_div(size, fc);  	size = size * conf->raid_disks; @@ -2221,7 +2221,8 @@ static int run(mddev_t *mddev)  	 */  	stride += conf->raid_disks - 1;  	sector_div(stride, conf->raid_disks); -	mddev->dev_sectors = stride << conf->chunk_shift; + +	conf->dev_sectors = stride << conf->chunk_shift;  	if (fo)  		stride = 1; @@ -2229,18 +2230,63 @@ static int run(mddev_t *mddev)  		sector_div(stride, fc);  	conf->stride = stride << conf->chunk_shift; -	conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, -						r10bio_pool_free, conf); -	if (!conf->r10bio_pool) { -		printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", -			mdname(mddev)); -		goto out_free_conf; -	} -	conf->mddev = mddev;  	spin_lock_init(&conf->device_lock); +	INIT_LIST_HEAD(&conf->retry_list); + +	spin_lock_init(&conf->resync_lock); +	init_waitqueue_head(&conf->wait_barrier); + +	conf->thread = md_register_thread(raid10d, mddev, NULL); +	if (!conf->thread) +		goto out; + +	conf->scale_disks = 0; +	conf->mddev = mddev; +	return conf; + + out: +	printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", +	       mdname(mddev)); +	if (conf) { +		if (conf->r10bio_pool) +			mempool_destroy(conf->r10bio_pool); +		kfree(conf->mirrors); +		safe_put_page(conf->tmppage); +		kfree(conf); +	} +	return ERR_PTR(err); +} + +static int run(mddev_t *mddev) +{ +	conf_t *conf; +	int i, disk_idx, chunk_size; +	mirror_info_t *disk; +	mdk_rdev_t *rdev; +	sector_t size; + +	/* +	 * copy the already verified devices into our private RAID10 +	 * bookkeeping area. [whatever we allocate in run(), +	 * should be freed in stop()] +	 */ + +	if (mddev->private == NULL) { +		conf = setup_conf(mddev); +		if (IS_ERR(conf)) +			return PTR_ERR(conf); +		mddev->private = conf; +	} +	conf = mddev->private; +	if (!conf) +		goto out; +  	mddev->queue->queue_lock = &conf->device_lock; +	mddev->thread = conf->thread; +	conf->thread = NULL; +  	chunk_size = mddev->chunk_sectors << 9;  	blk_queue_io_min(mddev->queue, chunk_size);  	if (conf->raid_disks % conf->near_copies) @@ -2251,9 +2297,14 @@ static int run(mddev_t *mddev)  	list_for_each_entry(rdev, &mddev->disks, same_set) {  		disk_idx = rdev->raid_disk; -		if (disk_idx >= mddev->raid_disks +		if (disk_idx >= conf->raid_disks  		    || disk_idx < 0)  			continue; +		if (conf->scale_disks) { +			disk_idx *= conf->scale_disks; +			rdev->raid_disk = disk_idx; +			/* MOVE 'rd%d' link !! */ +		}  		disk = conf->mirrors + disk_idx;  		disk->rdev = rdev; @@ -2271,14 +2322,9 @@ static int run(mddev_t *mddev)  		disk->head_position = 0;  	} -	INIT_LIST_HEAD(&conf->retry_list); - -	spin_lock_init(&conf->resync_lock); -	init_waitqueue_head(&conf->wait_barrier); -  	/* need to check that every block has at least one working mirror */  	if (!enough(conf)) { -		printk(KERN_ERR "raid10: not enough operational mirrors for %s\n", +		printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",  		       mdname(mddev));  		goto out_free_conf;  	} @@ -2297,28 +2343,21 @@ static int run(mddev_t *mddev)  		}  	} - -	mddev->thread = md_register_thread(raid10d, mddev, NULL); -	if (!mddev->thread) { -		printk(KERN_ERR -		       "raid10: couldn't allocate thread for %s\n", -		       mdname(mddev)); -		goto out_free_conf; -	} -  	if (mddev->recovery_cp != MaxSector) -		printk(KERN_NOTICE "raid10: %s is not clean" +		printk(KERN_NOTICE "md/raid10:%s: not clean"  		       " -- starting background reconstruction\n",  		       mdname(mddev));  	printk(KERN_INFO -		"raid10: raid set %s active with %d out of %d devices\n", -		mdname(mddev), mddev->raid_disks - mddev->degraded, -		mddev->raid_disks); +		"md/raid10:%s: active with %d out of %d devices\n", +		mdname(mddev), conf->raid_disks - mddev->degraded, +		conf->raid_disks);  	/*  	 * Ok, everything is just fine now  	 */ -	md_set_array_sectors(mddev, raid10_size(mddev, 0, 0)); -	mddev->resync_max_sectors = raid10_size(mddev, 0, 0); +	mddev->dev_sectors = conf->dev_sectors; +	size = raid10_size(mddev, 0, 0); +	md_set_array_sectors(mddev, size); +	mddev->resync_max_sectors = size;  	mddev->queue->unplug_fn = raid10_unplug;  	mddev->queue->backing_dev_info.congested_fn = raid10_congested; @@ -2336,7 +2375,7 @@ static int run(mddev_t *mddev)  			mddev->queue->backing_dev_info.ra_pages = 2* stripe;  	} -	if (conf->near_copies < mddev->raid_disks) +	if (conf->near_copies < conf->raid_disks)  		blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);  	md_integrity_register(mddev);  	return 0; @@ -2348,6 +2387,7 @@ out_free_conf:  	kfree(conf->mirrors);  	kfree(conf);  	mddev->private = NULL; +	md_unregister_thread(mddev->thread);  out:  	return -EIO;  } @@ -2384,6 +2424,61 @@ static void raid10_quiesce(mddev_t *mddev, int state)  	}  } +static void *raid10_takeover_raid0(mddev_t *mddev) +{ +	mdk_rdev_t *rdev; +	conf_t *conf; + +	if (mddev->degraded > 0) { +		printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n", +		       mdname(mddev)); +		return ERR_PTR(-EINVAL); +	} + +	/* Update slot numbers to obtain +	 * degraded raid10 with missing mirrors +	 */ +	list_for_each_entry(rdev, &mddev->disks, same_set) { +		rdev->raid_disk *= 2; +	} + +	/* Set new parameters */ +	mddev->new_level = 10; +	/* new layout: far_copies = 1, near_copies = 2 */ +	mddev->new_layout = (1<<8) + 2; +	mddev->new_chunk_sectors = mddev->chunk_sectors; +	mddev->delta_disks = mddev->raid_disks; +	mddev->degraded = mddev->raid_disks; +	mddev->raid_disks *= 2; +	/* make sure it will be not marked as dirty */ +	mddev->recovery_cp = MaxSector; + +	conf = setup_conf(mddev); +	conf->scale_disks = 2; +	return conf; +} + +static void *raid10_takeover(mddev_t *mddev) +{ +	struct raid0_private_data *raid0_priv; + +	/* raid10 can take over: +	 *  raid0 - providing it has only two drives +	 */ +	if (mddev->level == 0) { +		/* for raid0 takeover only one zone is supported */ +		raid0_priv = mddev->private; +		if (raid0_priv->nr_strip_zones > 1) { +			printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0" +			       " with more than one zone.\n", +			       mdname(mddev)); +			return ERR_PTR(-EINVAL); +		} +		return raid10_takeover_raid0(mddev); +	} +	return ERR_PTR(-EINVAL); +} +  static struct mdk_personality raid10_personality =  {  	.name		= "raid10", @@ -2400,6 +2495,7 @@ static struct mdk_personality raid10_personality =  	.sync_request	= sync_request,  	.quiesce	= raid10_quiesce,  	.size		= raid10_size, +	.takeover	= raid10_takeover,  };  static int __init raid_init(void) diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 59cd1efb8d30..3824a087e17c 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -33,9 +33,16 @@ struct r10_private_data_s {  					       * 1 stripe.  					       */ +	sector_t		dev_sectors;  /* temp copy of mddev->dev_sectors */ +  	int chunk_shift; /* shift from chunks to sectors */  	sector_t chunk_mask; +	int			scale_disks;  /* When starting array, multiply +					       * each ->raid_disk by this. +					       * Need for raid0->raid10 migration +					       */ +  	struct list_head	retry_list;  	/* queue pending writes and submit them on unplug */  	struct bio_list		pending_bio_list; @@ -57,6 +64,11 @@ struct r10_private_data_s {  	mempool_t *r10bio_pool;  	mempool_t *r10buf_pool;  	struct page		*tmppage; + +	/* When taking over an array from a different personality, we store +	 * the new thread here until we fully activate the array. +	 */ +	struct mdk_thread_s	*thread;  };  typedef struct r10_private_data_s conf_t; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 15348c393b5d..d2c0f94fa37d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -53,6 +53,7 @@  #include <linux/slab.h>  #include "md.h"  #include "raid5.h" +#include "raid0.h"  #include "bitmap.h"  /* @@ -1509,7 +1510,7 @@ static void raid5_end_read_request(struct bio * bi, int error)  		set_bit(R5_UPTODATE, &sh->dev[i].flags);  		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {  			rdev = conf->disks[i].rdev; -			printk_rl(KERN_INFO "raid5:%s: read error corrected" +			printk_rl(KERN_INFO "md/raid:%s: read error corrected"  				  " (%lu sectors at %llu on %s)\n",  				  mdname(conf->mddev), STRIPE_SECTORS,  				  (unsigned long long)(sh->sector @@ -1529,7 +1530,7 @@ static void raid5_end_read_request(struct bio * bi, int error)  		atomic_inc(&rdev->read_errors);  		if (conf->mddev->degraded >= conf->max_degraded)  			printk_rl(KERN_WARNING -				  "raid5:%s: read error not correctable " +				  "md/raid:%s: read error not correctable "  				  "(sector %llu on %s).\n",  				  mdname(conf->mddev),  				  (unsigned long long)(sh->sector @@ -1538,7 +1539,7 @@ static void raid5_end_read_request(struct bio * bi, int error)  		else if (test_bit(R5_ReWrite, &sh->dev[i].flags))  			/* Oh, no!!! */  			printk_rl(KERN_WARNING -				  "raid5:%s: read error NOT corrected!! " +				  "md/raid:%s: read error NOT corrected!! "  				  "(sector %llu on %s).\n",  				  mdname(conf->mddev),  				  (unsigned long long)(sh->sector @@ -1547,7 +1548,7 @@ static void raid5_end_read_request(struct bio * bi, int error)  		else if (atomic_read(&rdev->read_errors)  			 > conf->max_nr_stripes)  			printk(KERN_WARNING -			       "raid5:%s: Too many read errors, failing device %s.\n", +			       "md/raid:%s: Too many read errors, failing device %s.\n",  			       mdname(conf->mddev), bdn);  		else  			retry = 1; @@ -1619,8 +1620,8 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous)  static void error(mddev_t *mddev, mdk_rdev_t *rdev)  {  	char b[BDEVNAME_SIZE]; -	raid5_conf_t *conf = (raid5_conf_t *) mddev->private; -	pr_debug("raid5: error called\n"); +	raid5_conf_t *conf = mddev->private; +	pr_debug("raid456: error called\n");  	if (!test_bit(Faulty, &rdev->flags)) {  		set_bit(MD_CHANGE_DEVS, &mddev->flags); @@ -1636,9 +1637,13 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)  		}  		set_bit(Faulty, &rdev->flags);  		printk(KERN_ALERT -		       "raid5: Disk failure on %s, disabling device.\n" -		       "raid5: Operation continuing on %d devices.\n", -		       bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); +		       "md/raid:%s: Disk failure on %s, disabling device.\n" +		       KERN_ALERT +		       "md/raid:%s: Operation continuing on %d devices.\n", +		       mdname(mddev), +		       bdevname(rdev->bdev, b), +		       mdname(mddev), +		       conf->raid_disks - mddev->degraded);  	}  } @@ -1714,8 +1719,6 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,  			pd_idx = data_disks;  			break;  		default: -			printk(KERN_ERR "raid5: unsupported algorithm %d\n", -				algorithm);  			BUG();  		}  		break; @@ -1832,10 +1835,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,  			qd_idx = raid_disks - 1;  			break; -  		default: -			printk(KERN_CRIT "raid6: unsupported algorithm %d\n", -			       algorithm);  			BUG();  		}  		break; @@ -1898,8 +1898,6 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)  		case ALGORITHM_PARITY_N:  			break;  		default: -			printk(KERN_ERR "raid5: unsupported algorithm %d\n", -			       algorithm);  			BUG();  		}  		break; @@ -1958,8 +1956,6 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)  			i -= 1;  			break;  		default: -			printk(KERN_CRIT "raid6: unsupported algorithm %d\n", -			       algorithm);  			BUG();  		}  		break; @@ -1972,7 +1968,8 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)  				     previous, &dummy1, &sh2);  	if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx  		|| sh2.qd_idx != sh->qd_idx) { -		printk(KERN_ERR "compute_blocknr: map not correct\n"); +		printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", +		       mdname(conf->mddev));  		return 0;  	}  	return r_sector; @@ -3709,10 +3706,10 @@ static void raid5_align_endio(struct bio *bi, int error)  	bio_put(bi); -	mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata; -	conf = mddev->private;  	rdev = (void*)raid_bi->bi_next;  	raid_bi->bi_next = NULL; +	mddev = rdev->mddev; +	conf = mddev->private;  	rdev_dec_pending(rdev, conf->mddev); @@ -3749,9 +3746,8 @@ static int bio_fits_rdev(struct bio *bi)  } -static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) +static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)  { -	mddev_t *mddev = q->queuedata;  	raid5_conf_t *conf = mddev->private;  	int dd_idx;  	struct bio* align_bi; @@ -3866,16 +3862,15 @@ static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf)  	return sh;  } -static int make_request(struct request_queue *q, struct bio * bi) +static int make_request(mddev_t *mddev, struct bio * bi)  { -	mddev_t *mddev = q->queuedata;  	raid5_conf_t *conf = mddev->private;  	int dd_idx;  	sector_t new_sector;  	sector_t logical_sector, last_sector;  	struct stripe_head *sh;  	const int rw = bio_data_dir(bi); -	int cpu, remaining; +	int remaining;  	if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) {  		/* Drain all pending writes.  We only really need @@ -3890,15 +3885,9 @@ static int make_request(struct request_queue *q, struct bio * bi)  	md_write_start(mddev, bi); -	cpu = part_stat_lock(); -	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); -	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], -		      bio_sectors(bi)); -	part_stat_unlock(); -  	if (rw == READ &&  	     mddev->reshape_position == MaxSector && -	     chunk_aligned_read(q,bi)) +	     chunk_aligned_read(mddev,bi))  		return 0;  	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); @@ -3946,7 +3935,7 @@ static int make_request(struct request_queue *q, struct bio * bi)  		new_sector = raid5_compute_sector(conf, logical_sector,  						  previous,  						  &dd_idx, NULL); -		pr_debug("raid5: make_request, sector %llu logical %llu\n", +		pr_debug("raid456: make_request, sector %llu logical %llu\n",  			(unsigned long long)new_sector,   			(unsigned long long)logical_sector); @@ -4054,7 +4043,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped  	 * As the reads complete, handle_stripe will copy the data  	 * into the destination stripe and release that stripe.  	 */ -	raid5_conf_t *conf = (raid5_conf_t *) mddev->private; +	raid5_conf_t *conf = mddev->private;  	struct stripe_head *sh;  	sector_t first_sector, last_sector;  	int raid_disks = conf->previous_raid_disks; @@ -4263,7 +4252,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped  /* FIXME go_faster isn't used */  static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)  { -	raid5_conf_t *conf = (raid5_conf_t *) mddev->private; +	raid5_conf_t *conf = mddev->private;  	struct stripe_head *sh;  	sector_t max_sector = mddev->dev_sectors;  	int sync_blocks; @@ -4656,7 +4645,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,  			kfree(percpu->scribble);  			pr_err("%s: failed memory allocation for cpu%ld\n",  			       __func__, cpu); -			return NOTIFY_BAD; +			return notifier_from_errno(-ENOMEM);  		}  		break;  	case CPU_DEAD: @@ -4725,7 +4714,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)  	if (mddev->new_level != 5  	    && mddev->new_level != 4  	    && mddev->new_level != 6) { -		printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n", +		printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n",  		       mdname(mddev), mddev->new_level);  		return ERR_PTR(-EIO);  	} @@ -4733,12 +4722,12 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)  	     && !algorithm_valid_raid5(mddev->new_layout)) ||  	    (mddev->new_level == 6  	     && !algorithm_valid_raid6(mddev->new_layout))) { -		printk(KERN_ERR "raid5: %s: layout %d not supported\n", +		printk(KERN_ERR "md/raid:%s: layout %d not supported\n",  		       mdname(mddev), mddev->new_layout);  		return ERR_PTR(-EIO);  	}  	if (mddev->new_level == 6 && mddev->raid_disks < 4) { -		printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", +		printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n",  		       mdname(mddev), mddev->raid_disks);  		return ERR_PTR(-EINVAL);  	} @@ -4746,8 +4735,8 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)  	if (!mddev->new_chunk_sectors ||  	    (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||  	    !is_power_of_2(mddev->new_chunk_sectors)) { -		printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", -		       mddev->new_chunk_sectors << 9, mdname(mddev)); +		printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", +		       mdname(mddev), mddev->new_chunk_sectors << 9);  		return ERR_PTR(-EINVAL);  	} @@ -4789,7 +4778,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)  	if (raid5_alloc_percpu(conf) != 0)  		goto abort; -	pr_debug("raid5: run(%s) called.\n", mdname(mddev)); +	pr_debug("raid456: run(%s) called.\n", mdname(mddev));  	list_for_each_entry(rdev, &mddev->disks, same_set) {  		raid_disk = rdev->raid_disk; @@ -4802,9 +4791,9 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)  		if (test_bit(In_sync, &rdev->flags)) {  			char b[BDEVNAME_SIZE]; -			printk(KERN_INFO "raid5: device %s operational as raid" -				" disk %d\n", bdevname(rdev->bdev,b), -				raid_disk); +			printk(KERN_INFO "md/raid:%s: device %s operational as raid" +			       " disk %d\n", +			       mdname(mddev), bdevname(rdev->bdev, b), raid_disk);  		} else  			/* Cannot rely on bitmap to complete recovery */  			conf->fullsync = 1; @@ -4828,16 +4817,17 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)  		 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;  	if (grow_stripes(conf, conf->max_nr_stripes)) {  		printk(KERN_ERR -			"raid5: couldn't allocate %dkB for buffers\n", memory); +		       "md/raid:%s: couldn't allocate %dkB for buffers\n", +		       mdname(mddev), memory);  		goto abort;  	} else -		printk(KERN_INFO "raid5: allocated %dkB for %s\n", -			memory, mdname(mddev)); +		printk(KERN_INFO "md/raid:%s: allocated %dkB\n", +		       mdname(mddev), memory);  	conf->thread = md_register_thread(raid5d, mddev, NULL);  	if (!conf->thread) {  		printk(KERN_ERR -		       "raid5: couldn't allocate thread for %s\n", +		       "md/raid:%s: couldn't allocate thread.\n",  		       mdname(mddev));  		goto abort;  	} @@ -4888,7 +4878,7 @@ static int run(mddev_t *mddev)  	sector_t reshape_offset = 0;  	if (mddev->recovery_cp != MaxSector) -		printk(KERN_NOTICE "raid5: %s is not clean" +		printk(KERN_NOTICE "md/raid:%s: not clean"  		       " -- starting background reconstruction\n",  		       mdname(mddev));  	if (mddev->reshape_position != MaxSector) { @@ -4902,7 +4892,7 @@ static int run(mddev_t *mddev)  		int max_degraded = (mddev->level == 6 ? 2 : 1);  		if (mddev->new_level != mddev->level) { -			printk(KERN_ERR "raid5: %s: unsupported reshape " +			printk(KERN_ERR "md/raid:%s: unsupported reshape "  			       "required - aborting.\n",  			       mdname(mddev));  			return -EINVAL; @@ -4915,8 +4905,8 @@ static int run(mddev_t *mddev)  		here_new = mddev->reshape_position;  		if (sector_div(here_new, mddev->new_chunk_sectors *  			       (mddev->raid_disks - max_degraded))) { -			printk(KERN_ERR "raid5: reshape_position not " -			       "on a stripe boundary\n"); +			printk(KERN_ERR "md/raid:%s: reshape_position not " +			       "on a stripe boundary\n", mdname(mddev));  			return -EINVAL;  		}  		reshape_offset = here_new * mddev->new_chunk_sectors; @@ -4937,8 +4927,9 @@ static int run(mddev_t *mddev)  			if ((here_new * mddev->new_chunk_sectors !=   			     here_old * mddev->chunk_sectors) ||  			    mddev->ro == 0) { -				printk(KERN_ERR "raid5: in-place reshape must be started" -				       " in read-only mode - aborting\n"); +				printk(KERN_ERR "md/raid:%s: in-place reshape must be started" +				       " in read-only mode - aborting\n", +				       mdname(mddev));  				return -EINVAL;  			}  		} else if (mddev->delta_disks < 0 @@ -4947,11 +4938,13 @@ static int run(mddev_t *mddev)  		    : (here_new * mddev->new_chunk_sectors >=  		       here_old * mddev->chunk_sectors)) {  			/* Reading from the same stripe as writing to - bad */ -			printk(KERN_ERR "raid5: reshape_position too early for " -			       "auto-recovery - aborting.\n"); +			printk(KERN_ERR "md/raid:%s: reshape_position too early for " +			       "auto-recovery - aborting.\n", +			       mdname(mddev));  			return -EINVAL;  		} -		printk(KERN_INFO "raid5: reshape will continue\n"); +		printk(KERN_INFO "md/raid:%s: reshape will continue\n", +		       mdname(mddev));  		/* OK, we should be able to continue; */  	} else {  		BUG_ON(mddev->level != mddev->new_level); @@ -4993,18 +4986,6 @@ static int run(mddev_t *mddev)  		    mddev->minor_version > 90)  			rdev->recovery_offset = reshape_offset; -		printk("%d: w=%d pa=%d pr=%d m=%d a=%d r=%d op1=%d op2=%d\n", -		       rdev->raid_disk, working_disks, conf->prev_algo, -		       conf->previous_raid_disks, conf->max_degraded, -		       conf->algorithm, conf->raid_disks,  -		       only_parity(rdev->raid_disk, -				   conf->prev_algo, -				   conf->previous_raid_disks, -				   conf->max_degraded), -		       only_parity(rdev->raid_disk, -				   conf->algorithm, -				   conf->raid_disks, -				   conf->max_degraded));  		if (rdev->recovery_offset < reshape_offset) {  			/* We need to check old and new layout */  			if (!only_parity(rdev->raid_disk, @@ -5025,7 +5006,7 @@ static int run(mddev_t *mddev)  			   - working_disks);  	if (mddev->degraded > conf->max_degraded) { -		printk(KERN_ERR "raid5: not enough operational devices for %s" +		printk(KERN_ERR "md/raid:%s: not enough operational devices"  			" (%d/%d failed)\n",  			mdname(mddev), mddev->degraded, conf->raid_disks);  		goto abort; @@ -5039,32 +5020,32 @@ static int run(mddev_t *mddev)  	    mddev->recovery_cp != MaxSector) {  		if (mddev->ok_start_degraded)  			printk(KERN_WARNING -			       "raid5: starting dirty degraded array: %s" -			       "- data corruption possible.\n", +			       "md/raid:%s: starting dirty degraded array" +			       " - data corruption possible.\n",  			       mdname(mddev));  		else {  			printk(KERN_ERR -			       "raid5: cannot start dirty degraded array for %s\n", +			       "md/raid:%s: cannot start dirty degraded array.\n",  			       mdname(mddev));  			goto abort;  		}  	}  	if (mddev->degraded == 0) -		printk("raid5: raid level %d set %s active with %d out of %d" -		       " devices, algorithm %d\n", conf->level, mdname(mddev), +		printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" +		       " devices, algorithm %d\n", mdname(mddev), conf->level,  		       mddev->raid_disks-mddev->degraded, mddev->raid_disks,  		       mddev->new_layout);  	else -		printk(KERN_ALERT "raid5: raid level %d set %s active with %d" -			" out of %d devices, algorithm %d\n", conf->level, -			mdname(mddev), mddev->raid_disks - mddev->degraded, -			mddev->raid_disks, mddev->new_layout); +		printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" +		       " out of %d devices, algorithm %d\n", +		       mdname(mddev), conf->level, +		       mddev->raid_disks - mddev->degraded, +		       mddev->raid_disks, mddev->new_layout);  	print_raid5_conf(conf);  	if (conf->reshape_progress != MaxSector) { -		printk("...ok start reshape thread\n");  		conf->reshape_safe = conf->reshape_progress;  		atomic_set(&conf->reshape_stripes, 0);  		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); @@ -5087,9 +5068,11 @@ static int run(mddev_t *mddev)  	}  	/* Ok, everything is just fine now */ -	if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) +	if (mddev->to_remove == &raid5_attrs_group) +		mddev->to_remove = NULL; +	else if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group))  		printk(KERN_WARNING -		       "raid5: failed to create sysfs attributes for %s\n", +		       "md/raid:%s: failed to create sysfs attributes.\n",  		       mdname(mddev));  	mddev->queue->queue_lock = &conf->device_lock; @@ -5119,22 +5102,21 @@ abort:  		free_conf(conf);  	}  	mddev->private = NULL; -	printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); +	printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev));  	return -EIO;  } - -  static int stop(mddev_t *mddev)  { -	raid5_conf_t *conf = (raid5_conf_t *) mddev->private; +	raid5_conf_t *conf = mddev->private;  	md_unregister_thread(mddev->thread);  	mddev->thread = NULL;  	mddev->queue->backing_dev_info.congested_fn = NULL;  	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/  	free_conf(conf); -	mddev->private = &raid5_attrs_group; +	mddev->private = NULL; +	mddev->to_remove = &raid5_attrs_group;  	return 0;  } @@ -5175,7 +5157,7 @@ static void printall(struct seq_file *seq, raid5_conf_t *conf)  static void status(struct seq_file *seq, mddev_t *mddev)  { -	raid5_conf_t *conf = (raid5_conf_t *) mddev->private; +	raid5_conf_t *conf = mddev->private;  	int i;  	seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, @@ -5197,21 +5179,22 @@ static void print_raid5_conf (raid5_conf_t *conf)  	int i;  	struct disk_info *tmp; -	printk("RAID5 conf printout:\n"); +	printk(KERN_DEBUG "RAID conf printout:\n");  	if (!conf) {  		printk("(conf==NULL)\n");  		return;  	} -	printk(" --- rd:%d wd:%d\n", conf->raid_disks, -		 conf->raid_disks - conf->mddev->degraded); +	printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, +	       conf->raid_disks, +	       conf->raid_disks - conf->mddev->degraded);  	for (i = 0; i < conf->raid_disks; i++) {  		char b[BDEVNAME_SIZE];  		tmp = conf->disks + i;  		if (tmp->rdev) -		printk(" disk %d, o:%d, dev:%s\n", -			i, !test_bit(Faulty, &tmp->rdev->flags), -			bdevname(tmp->rdev->bdev,b)); +			printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", +			       i, !test_bit(Faulty, &tmp->rdev->flags), +			       bdevname(tmp->rdev->bdev, b));  	}  } @@ -5334,7 +5317,6 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)  	    raid5_size(mddev, sectors, mddev->raid_disks))  		return -EINVAL;  	set_capacity(mddev->gendisk, mddev->array_sectors); -	mddev->changed = 1;  	revalidate_disk(mddev->gendisk);  	if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {  		mddev->recovery_cp = mddev->dev_sectors; @@ -5360,7 +5342,8 @@ static int check_stripe_cache(mddev_t *mddev)  	    > conf->max_nr_stripes ||  	    ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4  	    > conf->max_nr_stripes) { -		printk(KERN_WARNING "raid5: reshape: not enough stripes.  Needed %lu\n", +		printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes.  Needed %lu\n", +		       mdname(mddev),  		       ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)  			/ STRIPE_SIZE)*4);  		return 0; @@ -5431,7 +5414,7 @@ static int raid5_start_reshape(mddev_t *mddev)  	 */  	if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)  	    < mddev->array_sectors) { -		printk(KERN_ERR "md: %s: array size must be reduced " +		printk(KERN_ERR "md/raid:%s: array size must be reduced "  		       "before number of disks\n", mdname(mddev));  		return -EINVAL;  	} @@ -5469,9 +5452,9 @@ static int raid5_start_reshape(mddev_t *mddev)  				if (sysfs_create_link(&mddev->kobj,  						      &rdev->kobj, nm))  					printk(KERN_WARNING -					       "raid5: failed to create " -					       " link %s for %s\n", -					       nm, mdname(mddev)); +					       "md/raid:%s: failed to create " +					       " link %s\n", +					       mdname(mddev), nm);  			} else  				break;  		} @@ -5548,7 +5531,6 @@ static void raid5_finish_reshape(mddev_t *mddev)  		if (mddev->delta_disks > 0) {  			md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));  			set_capacity(mddev->gendisk, mddev->array_sectors); -			mddev->changed = 1;  			revalidate_disk(mddev->gendisk);  		} else {  			int d; @@ -5613,6 +5595,29 @@ static void raid5_quiesce(mddev_t *mddev, int state)  } +static void *raid45_takeover_raid0(mddev_t *mddev, int level) +{ +	struct raid0_private_data *raid0_priv = mddev->private; + +	/* for raid0 takeover only one zone is supported */ +	if (raid0_priv->nr_strip_zones > 1) { +		printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n", +		       mdname(mddev)); +		return ERR_PTR(-EINVAL); +	} + +	mddev->new_level = level; +	mddev->new_layout = ALGORITHM_PARITY_N; +	mddev->new_chunk_sectors = mddev->chunk_sectors; +	mddev->raid_disks += 1; +	mddev->delta_disks = 1; +	/* make sure it will be not marked as dirty */ +	mddev->recovery_cp = MaxSector; + +	return setup_conf(mddev); +} + +  static void *raid5_takeover_raid1(mddev_t *mddev)  {  	int chunksect; @@ -5737,12 +5742,13 @@ static int raid6_check_reshape(mddev_t *mddev)  static void *raid5_takeover(mddev_t *mddev)  {  	/* raid5 can take over: -	 *  raid0 - if all devices are the same - make it a raid4 layout +	 *  raid0 - if there is only one strip zone - make it a raid4 layout  	 *  raid1 - if there are two drives.  We need to know the chunk size  	 *  raid4 - trivial - just use a raid4 layout.  	 *  raid6 - Providing it is a *_6 layout  	 */ - +	if (mddev->level == 0) +		return raid45_takeover_raid0(mddev, 5);  	if (mddev->level == 1)  		return raid5_takeover_raid1(mddev);  	if (mddev->level == 4) { @@ -5756,6 +5762,22 @@ static void *raid5_takeover(mddev_t *mddev)  	return ERR_PTR(-EINVAL);  } +static void *raid4_takeover(mddev_t *mddev) +{ +	/* raid4 can take over: +	 *  raid0 - if there is only one strip zone +	 *  raid5 - if layout is right +	 */ +	if (mddev->level == 0) +		return raid45_takeover_raid0(mddev, 4); +	if (mddev->level == 5 && +	    mddev->layout == ALGORITHM_PARITY_N) { +		mddev->new_layout = 0; +		mddev->new_level = 4; +		return setup_conf(mddev); +	} +	return ERR_PTR(-EINVAL); +}  static struct mdk_personality raid5_personality; @@ -5871,6 +5893,7 @@ static struct mdk_personality raid4_personality =  	.start_reshape  = raid5_start_reshape,  	.finish_reshape = raid5_finish_reshape,  	.quiesce	= raid5_quiesce, +	.takeover	= raid4_takeover,  };  static int __init raid5_init(void) | 
