diff options
Diffstat (limited to 'drivers/block')
77 files changed, 4533 insertions, 8152 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 2a51dfb09c8f..a41145d52de9 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -33,6 +33,22 @@ config BLK_DEV_FD To compile this driver as a module, choose M here: the module will be called floppy. +config BLK_DEV_FD_RAWCMD + bool "Support for raw floppy disk commands (DEPRECATED)" + depends on BLK_DEV_FD + help + If you want to use actual physical floppies and expect to do + special low-level hardware accesses to them (access and use + non-standard formats, for example), then enable this. + + Note that the code enabled by this option is rarely used and + might be unstable or insecure, and distros should not enable it. + + Note: FDRAWCMD is deprecated and will be removed from the kernel + in the near future. + + If unsure, say N. + config AMIGA_FLOPPY tristate "Amiga floppy support" depends on AMIGA @@ -232,15 +248,6 @@ config BLK_DEV_NBD If unsure, say N. -config BLK_DEV_SX8 - tristate "Promise SATA SX8 support" - depends on PCI - help - Saying Y or M here will enable support for the - Promise SATA SX8 controllers. - - Use devices /dev/sx8/$N and /dev/sx8/$Np$M. - config BLK_DEV_RAM tristate "RAM block device support" help @@ -392,16 +399,20 @@ config BLK_DEV_RBD If unsure, say N. -config BLK_DEV_RSXX - tristate "IBM Flash Adapter 900GB Full Height PCIe Device Driver" - depends on PCI - select CRC32 +config BLK_DEV_UBLK + tristate "Userspace block driver (Experimental)" + select IO_URING help - Device driver for IBM's high speed PCIe SSD - storage device: Flash Adapter 900GB Full Height. - - To compile this driver as a module, choose M here: the - module will be called rsxx. + io_uring based userspace block driver. Together with ublk server, ublk + has been working well, but interface with userspace or command data + definition isn't finalized yet, and might change according to future + requirement, so mark is as experimental now. + + Say Y if you want to get better performance because task_work_add() + can be used in IO path for replacing io_uring cmd, which will become + shared between IO tasks and ubq daemon, meantime task_work_add() can + can handle batch more effectively, but task_work_add() isn't exported + for module, so ublk has to be built to kernel. source "drivers/block/rnbd/Kconfig" diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 11a74f17c9ad..101612cba303 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -26,18 +26,17 @@ obj-$(CONFIG_SUNVDC) += sunvdc.o obj-$(CONFIG_BLK_DEV_NBD) += nbd.o obj-$(CONFIG_VIRTIO_BLK) += virtio_blk.o -obj-$(CONFIG_BLK_DEV_SX8) += sx8.o - obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o obj-$(CONFIG_XEN_BLKDEV_BACKEND) += xen-blkback/ obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ obj-$(CONFIG_BLK_DEV_RBD) += rbd.o obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ -obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ obj-$(CONFIG_ZRAM) += zram/ obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/ obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/ +obj-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o + swim_mod-y := swim.o swim_asm.o diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index bf5c124c5452..4c8b2ba579ee 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1505,7 +1505,7 @@ static blk_status_t amiflop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *rq = bd->rq; - struct amiga_floppy_struct *floppy = rq->rq_disk->private_data; + struct amiga_floppy_struct *floppy = rq->q->disk->private_data; blk_status_t err; if (!spin_trylock_irq(&amiflop_lock)) @@ -1790,6 +1790,7 @@ static int fd_alloc_disk(int drive, int system) disk->first_minor = drive + system; disk->minors = 1; disk->fops = &floppy_fops; + disk->flags |= GENHD_FL_NO_PART; disk->events = DISK_EVENT_MEDIA_CHANGE; if (system) sprintf(disk->disk_name, "fd%d_msdos", drive); @@ -1801,7 +1802,7 @@ static int fd_alloc_disk(int drive, int system) unit[drive].gendisk[system] = disk; err = add_disk(disk); if (err) - blk_cleanup_disk(disk); + put_disk(disk); return err; } diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index 84d0fcebd6af..749ae1246f4c 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -244,3 +244,5 @@ void aoenet_exit(void); void aoenet_xmit(struct sk_buff_head *); int is_aoe_netif(struct net_device *ifp); int set_aoe_iflist(const char __user *str, size_t size); + +extern struct workqueue_struct *aoe_wq; diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 52484bcdedb9..128722cf6c3c 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -12,7 +12,6 @@ #include <linux/ioctl.h> #include <linux/slab.h> #include <linux/ratelimit.h> -#include <linux/genhd.h> #include <linux/netdevice.h> #include <linux/mutex.h> #include <linux/export.h> @@ -109,7 +108,7 @@ static ssize_t aoedisk_show_payload(struct device *dev, return sysfs_emit(page, "%lu\n", d->maxbcnt); } -static int aoedisk_debugfs_show(struct seq_file *s, void *ignored) +static int aoe_debugfs_show(struct seq_file *s, void *ignored) { struct aoedev *d; struct aoetgt **t, **te; @@ -152,11 +151,7 @@ static int aoedisk_debugfs_show(struct seq_file *s, void *ignored) return 0; } - -static int aoe_debugfs_open(struct inode *inode, struct file *file) -{ - return single_open(file, aoedisk_debugfs_show, inode->i_private); -} +DEFINE_SHOW_ATTRIBUTE(aoe_debugfs); static DEVICE_ATTR(state, 0444, aoedisk_show_state, NULL); static DEVICE_ATTR(mac, 0444, aoedisk_show_mac, NULL); @@ -185,13 +180,6 @@ static const struct attribute_group *aoe_attr_groups[] = { NULL, }; -static const struct file_operations aoe_debugfs_fops = { - .open = aoe_debugfs_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static void aoedisk_add_debugfs(struct aoedev *d) { @@ -428,7 +416,7 @@ aoeblk_gdalloc(void *vp) return; out_disk_cleanup: - blk_cleanup_disk(gd); + put_disk(gd); err_tagset: blk_mq_free_tag_set(set); err_mempool: @@ -436,7 +424,7 @@ err_mempool: err: spin_lock_irqsave(&d->lock, flags); d->flags &= ~DEVFL_GD_NOW; - schedule_work(&d->work); + queue_work(aoe_wq, &d->work); spin_unlock_irqrestore(&d->lock, flags); } diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 588889bea7c3..d7317425be51 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -10,7 +10,6 @@ #include <linux/blk-mq.h> #include <linux/skbuff.h> #include <linux/netdevice.h> -#include <linux/genhd.h> #include <linux/moduleparam.h> #include <linux/workqueue.h> #include <linux/kthread.h> @@ -122,7 +121,7 @@ newtag(struct aoedev *d) register ulong n; n = jiffies & 0xffff; - return n |= (++d->lasttag & 0x7fff) << 16; + return n | (++d->lasttag & 0x7fff) << 16; } static u32 @@ -969,7 +968,7 @@ ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id) d->flags |= DEVFL_NEWSIZE; else d->flags |= DEVFL_GDALLOC; - schedule_work(&d->work); + queue_work(aoe_wq, &d->work); } static void @@ -1019,9 +1018,9 @@ bvcpy(struct sk_buff *skb, struct bio *bio, struct bvec_iter iter, long cnt) iter.bi_size = cnt; __bio_for_each_segment(bv, bio, iter, iter) { - char *p = kmap_atomic(bv.bv_page) + bv.bv_offset; + char *p = bvec_kmap_local(&bv); skb_copy_bits(skb, soff, p, bv.bv_len); - kunmap_atomic(p); + kunmap_local(p); soff += bv.bv_len; } } diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index c5753c6bfe80..3523dd82d7a0 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -277,7 +277,7 @@ freedev(struct aoedev *d) if (d->gd) { aoedisk_rm_debugfs(d); del_gendisk(d->gd); - blk_cleanup_disk(d->gd); + put_disk(d->gd); blk_mq_free_tag_set(&d->tag_set); } t = d->targets; @@ -321,7 +321,7 @@ flush(const char __user *str, size_t cnt, int exiting) specified = 1; } - flush_scheduled_work(); + flush_workqueue(aoe_wq); /* pass one: do aoedev_downdev, which might sleep */ restart1: spin_lock_irqsave(&devlist_lock, flags); @@ -520,7 +520,7 @@ freetgt(struct aoedev *d, struct aoetgt *t) void aoedev_exit(void) { - flush_scheduled_work(); + flush_workqueue(aoe_wq); flush(NULL, 0, EXITING); } diff --git a/drivers/block/aoe/aoemain.c b/drivers/block/aoe/aoemain.c index 1e4e2971171c..6238c4c87cfc 100644 --- a/drivers/block/aoe/aoemain.c +++ b/drivers/block/aoe/aoemain.c @@ -16,6 +16,7 @@ MODULE_DESCRIPTION("AoE block/char driver for 2.6.2 and newer 2.6 kernels"); MODULE_VERSION(VERSION); static struct timer_list timer; +struct workqueue_struct *aoe_wq; static void discover_timer(struct timer_list *t) { @@ -35,6 +36,7 @@ aoe_exit(void) aoechr_exit(); aoedev_exit(); aoeblk_exit(); /* free cache after de-allocating bufs */ + destroy_workqueue(aoe_wq); } static int __init @@ -42,9 +44,13 @@ aoe_init(void) { int ret; + aoe_wq = alloc_workqueue("aoe_wq", 0, 0); + if (!aoe_wq) + return -ENOMEM; + ret = aoedev_init(); if (ret) - return ret; + goto dev_fail; ret = aoechr_init(); if (ret) goto chr_fail; @@ -77,6 +83,8 @@ aoe_init(void) aoechr_exit(); chr_fail: aoedev_exit(); + dev_fail: + destroy_workqueue(aoe_wq); printk(KERN_INFO "aoe: initialisation failure.\n"); return ret; diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index bf769e6e32fe..9deb4df6bdb8 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -303,6 +303,7 @@ static struct atari_floppy_struct { int ref; int type; struct blk_mq_tag_set tag_set; + int error_count; } unit[FD_MAX_UNITS]; #define UD unit[drive] @@ -705,14 +706,14 @@ static void fd_error( void ) if (!fd_request) return; - fd_request->error_count++; - if (fd_request->error_count >= MAX_ERRORS) { + unit[SelectedDrive].error_count++; + if (unit[SelectedDrive].error_count >= MAX_ERRORS) { printk(KERN_ERR "fd%d: too many errors.\n", SelectedDrive ); fd_end_request_cur(BLK_STS_IOERR); finish_fdc(); return; } - else if (fd_request->error_count == RECALIBRATE_ERRORS) { + else if (unit[SelectedDrive].error_count == RECALIBRATE_ERRORS) { printk(KERN_WARNING "fd%d: recalibrating\n", SelectedDrive ); if (SelectedDrive != -1) SUD.track = -1; @@ -1491,7 +1492,7 @@ static void setup_req_params( int drive ) ReqData = ReqBuffer + 512 * ReqCnt; if (UseTrackbuffer) - read_track = (ReqCmd == READ && fd_request->error_count == 0); + read_track = (ReqCmd == READ && unit[drive].error_count == 0); else read_track = 0; @@ -1502,7 +1503,7 @@ static void setup_req_params( int drive ) static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { - struct atari_floppy_struct *floppy = bd->rq->rq_disk->private_data; + struct atari_floppy_struct *floppy = bd->rq->q->disk->private_data; int drive = floppy - unit; int type = floppy->type; @@ -1520,6 +1521,7 @@ static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_RESOURCE; } fd_request = bd->rq; + unit[drive].error_count = 0; blk_mq_start_request(fd_request); atari_disable_irq( IRQ_MFP_FDC ); @@ -1538,7 +1540,7 @@ static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, if (!UDT) { Probing = 1; UDT = atari_disk_type + StartDiskType[DriveType]; - set_capacity(bd->rq->rq_disk, UDT->blocks); + set_capacity(bd->rq->q->disk, UDT->blocks); UD.autoprobe = 1; } } @@ -1558,7 +1560,7 @@ static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, } type = minor2disktype[type].index; UDT = &atari_disk_type[type]; - set_capacity(bd->rq->rq_disk, UDT->blocks); + set_capacity(bd->rq->q->disk, UDT->blocks); UD.autoprobe = 0; } @@ -2000,6 +2002,7 @@ static int ataflop_alloc_disk(unsigned int drive, unsigned int type) disk->minors = 1; sprintf(disk->disk_name, "fd%d", drive); disk->fops = &floppy_fops; + disk->flags |= GENHD_FL_NO_PART; disk->events = DISK_EVENT_MEDIA_CHANGE; disk->private_data = &unit[drive]; set_capacity(disk, MAX_DISK_SIZE * 2); @@ -2028,7 +2031,7 @@ static void ataflop_probe(dev_t dev) return; cleanup_disk: - blk_cleanup_disk(unit[drive].disk[type]); + put_disk(unit[drive].disk[type]); unit[drive].disk[type] = NULL; } @@ -2042,7 +2045,6 @@ static void atari_floppy_cleanup(void) if (!unit[i].disk[type]) continue; del_gendisk(unit[i].disk[type]); - blk_cleanup_queue(unit[i].disk[type]->queue); put_disk(unit[i].disk[type]); } blk_mq_free_tag_set(&unit[i].tag_set); @@ -2061,7 +2063,7 @@ static void atari_cleanup_floppy_disk(struct atari_floppy_struct *fs) continue; if (fs->registered[type]) del_gendisk(fs->disk[type]); - blk_cleanup_disk(fs->disk[type]); + put_disk(fs->disk[type]); } blk_mq_free_tag_set(&fs->tag_set); } diff --git a/drivers/block/brd.c b/drivers/block/brd.c index a896ee175d86..20acc4a1fd6d 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -256,7 +256,7 @@ static void copy_from_brd(void *dst, struct brd_device *brd, * Process a single bvec of a bio. */ static int brd_do_bvec(struct brd_device *brd, struct page *page, - unsigned int len, unsigned int off, unsigned int op, + unsigned int len, unsigned int off, enum req_op op, sector_t sector) { void *mem; @@ -310,7 +310,7 @@ static void brd_submit_bio(struct bio *bio) } static int brd_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, unsigned int op) + struct page *page, enum req_op op) { struct brd_device *brd = bdev->bd_disk->private_data; int err; @@ -362,7 +362,6 @@ __setup("ramdisk_size=", ramdisk_size); * (should share code eventually). */ static LIST_HEAD(brd_devices); -static DEFINE_MUTEX(brd_devices_mutex); static struct dentry *brd_debugfs_dir; static int brd_alloc(int i) @@ -372,21 +371,14 @@ static int brd_alloc(int i) char buf[DISK_NAME_LEN]; int err = -ENOMEM; - mutex_lock(&brd_devices_mutex); - list_for_each_entry(brd, &brd_devices, brd_list) { - if (brd->brd_number == i) { - mutex_unlock(&brd_devices_mutex); + list_for_each_entry(brd, &brd_devices, brd_list) + if (brd->brd_number == i) return -EEXIST; - } - } brd = kzalloc(sizeof(*brd), GFP_KERNEL); - if (!brd) { - mutex_unlock(&brd_devices_mutex); + if (!brd) return -ENOMEM; - } brd->brd_number = i; list_add_tail(&brd->brd_list, &brd_devices); - mutex_unlock(&brd_devices_mutex); spin_lock_init(&brd->brd_lock); INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC); @@ -405,8 +397,7 @@ static int brd_alloc(int i) disk->minors = max_part; disk->fops = &brd_fops; disk->private_data = brd; - disk->flags = GENHD_FL_EXT_DEVT; - strlcpy(disk->disk_name, buf, DISK_NAME_LEN); + strscpy(disk->disk_name, buf, DISK_NAME_LEN); set_capacity(disk, rd_size * 2); /* @@ -428,11 +419,9 @@ static int brd_alloc(int i) return 0; out_cleanup_disk: - blk_cleanup_disk(disk); + put_disk(disk); out_free_dev: - mutex_lock(&brd_devices_mutex); list_del(&brd->brd_list); - mutex_unlock(&brd_devices_mutex); kfree(brd); return err; } @@ -442,15 +431,19 @@ static void brd_probe(dev_t dev) brd_alloc(MINOR(dev) / max_part); } -static void brd_del_one(struct brd_device *brd) +static void brd_cleanup(void) { - del_gendisk(brd->brd_disk); - blk_cleanup_disk(brd->brd_disk); - brd_free_pages(brd); - mutex_lock(&brd_devices_mutex); - list_del(&brd->brd_list); - mutex_unlock(&brd_devices_mutex); - kfree(brd); + struct brd_device *brd, *next; + + debugfs_remove_recursive(brd_debugfs_dir); + + list_for_each_entry_safe(brd, next, &brd_devices, brd_list) { + del_gendisk(brd->brd_disk); + put_disk(brd->brd_disk); + brd_free_pages(brd); + list_del(&brd->brd_list); + kfree(brd); + } } static inline void brd_check_and_reset_par(void) @@ -474,9 +467,18 @@ static inline void brd_check_and_reset_par(void) static int __init brd_init(void) { - struct brd_device *brd, *next; int err, i; + brd_check_and_reset_par(); + + brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL); + + for (i = 0; i < rd_nr; i++) { + err = brd_alloc(i); + if (err) + goto out_free; + } + /* * brd module now has a feature to instantiate underlying device * structure on-demand, provided that there is an access dev node. @@ -492,28 +494,16 @@ static int __init brd_init(void) * dynamically. */ - if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe)) - return -EIO; - - brd_check_and_reset_par(); - - brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL); - - for (i = 0; i < rd_nr; i++) { - err = brd_alloc(i); - if (err) - goto out_free; + if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe)) { + err = -EIO; + goto out_free; } pr_info("brd: module loaded\n"); return 0; out_free: - unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); - debugfs_remove_recursive(brd_debugfs_dir); - - list_for_each_entry_safe(brd, next, &brd_devices, brd_list) - brd_del_one(brd); + brd_cleanup(); pr_info("brd: module NOT loaded !!!\n"); return err; @@ -521,13 +511,9 @@ out_free: static void __exit brd_exit(void) { - struct brd_device *brd, *next; unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); - debugfs_remove_recursive(brd_debugfs_dir); - - list_for_each_entry_safe(brd, next, &brd_devices, brd_list) - brd_del_one(brd); + brd_cleanup(); pr_info("brd: module unloaded\n"); } diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 72cf7603d51f..e27478ae579c 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -124,12 +124,13 @@ void wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_b static int _drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bdev, - sector_t sector, int op) + sector_t sector, enum req_op op) { struct bio *bio; /* we do all our meta data IO in aligned 4k blocks. */ const int size = 4096; - int err, op_flags = 0; + int err; + blk_opf_t op_flags = 0; device->md_io.done = 0; device->md_io.error = -ENODEV; @@ -138,15 +139,14 @@ static int _drbd_md_sync_page_io(struct drbd_device *device, op_flags |= REQ_FUA | REQ_PREFLUSH; op_flags |= REQ_SYNC; - bio = bio_alloc_bioset(GFP_NOIO, 1, &drbd_md_io_bio_set); - bio_set_dev(bio, bdev->md_bdev); + bio = bio_alloc_bioset(bdev->md_bdev, 1, op | op_flags, GFP_NOIO, + &drbd_md_io_bio_set); bio->bi_iter.bi_sector = sector; err = -EIO; if (bio_add_page(bio, device->md_io.page, size, 0) != size) goto out; bio->bi_private = device; bio->bi_end_io = drbd_md_endio; - bio_set_op_attrs(bio, op, op_flags); if (op != REQ_OP_WRITE && device->state.disk == D_DISKLESS && device->ldev == NULL) /* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */ @@ -175,7 +175,7 @@ static int _drbd_md_sync_page_io(struct drbd_device *device, } int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bdev, - sector_t sector, int op) + sector_t sector, enum req_op op) { int err; D_ASSERT(device, atomic_read(&device->md_io.in_use) == 1); @@ -386,7 +386,7 @@ static int __al_write_transaction(struct drbd_device *device, struct al_transact write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates; rcu_read_unlock(); if (write_al_updates) { - if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) { + if (drbd_md_sync_page_io(device, device->ldev, sector, REQ_OP_WRITE)) { err = -EIO; drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR); } else { diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index c1f816f896a8..7d9db33363de 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -683,7 +683,7 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi } } - want = ALIGN(words*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT; + want = PFN_UP(words*sizeof(long)); have = b->bm_number_of_pages; if (want == have) { D_ASSERT(device, b->bm_pages != NULL); @@ -974,24 +974,58 @@ static void drbd_bm_endio(struct bio *bio) } } +/* For the layout, see comment above drbd_md_set_sector_offsets(). */ +static inline sector_t drbd_md_last_bitmap_sector(struct drbd_backing_dev *bdev) +{ + switch (bdev->md.meta_dev_idx) { + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + return bdev->md.md_offset + bdev->md.al_offset -1; + case DRBD_MD_INDEX_FLEX_EXT: + default: + return bdev->md.md_offset + bdev->md.md_size_sect -1; + } +} + static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local) { - struct bio *bio = bio_alloc_bioset(GFP_NOIO, 1, &drbd_md_io_bio_set); struct drbd_device *device = ctx->device; + enum req_op op = ctx->flags & BM_AIO_READ ? REQ_OP_READ : REQ_OP_WRITE; struct drbd_bitmap *b = device->bitmap; + struct bio *bio; struct page *page; + sector_t last_bm_sect; + sector_t first_bm_sect; + sector_t on_disk_sector; unsigned int len; - unsigned int op = (ctx->flags & BM_AIO_READ) ? REQ_OP_READ : REQ_OP_WRITE; - sector_t on_disk_sector = - device->ldev->md.md_offset + device->ldev->md.bm_offset; - on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9); + first_bm_sect = device->ldev->md.md_offset + device->ldev->md.bm_offset; + on_disk_sector = first_bm_sect + (((sector_t)page_nr) << (PAGE_SHIFT-SECTOR_SHIFT)); /* this might happen with very small * flexible external meta data device, * or with PAGE_SIZE > 4k */ - len = min_t(unsigned int, PAGE_SIZE, - (drbd_md_last_sector(device->ldev) - on_disk_sector + 1)<<9); + last_bm_sect = drbd_md_last_bitmap_sector(device->ldev); + if (first_bm_sect <= on_disk_sector && last_bm_sect >= on_disk_sector) { + sector_t len_sect = last_bm_sect - on_disk_sector + 1; + if (len_sect < PAGE_SIZE/SECTOR_SIZE) + len = (unsigned int)len_sect*SECTOR_SIZE; + else + len = PAGE_SIZE; + } else { + if (__ratelimit(&drbd_ratelimit_state)) { + drbd_err(device, "Invalid offset during on-disk bitmap access: " + "page idx %u, sector %llu\n", page_nr, on_disk_sector); + } + ctx->error = -EIO; + bm_set_page_io_err(b->bm_pages[page_nr]); + if (atomic_dec_and_test(&ctx->in_flight)) { + ctx->done = 1; + wake_up(&device->misc_wait); + kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy); + } + return; + } /* serialize IO on this page */ bm_page_lock_io(device, page_nr); @@ -1006,14 +1040,14 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho bm_store_page_idx(page, page_nr); } else page = b->bm_pages[page_nr]; - bio_set_dev(bio, device->ldev->md_bdev); + bio = bio_alloc_bioset(device->ldev->md_bdev, 1, op, GFP_NOIO, + &drbd_md_io_bio_set); bio->bi_iter.bi_sector = on_disk_sector; /* bio_add_page of a single page to an empty bio will always succeed, * according to api. Do we want to assert that? */ bio_add_page(bio, page, len, 0); bio->bi_private = ctx; bio->bi_end_io = drbd_bm_endio; - bio_set_op_attrs(bio, op, 0); if (drbd_insert_fault(device, (op == REQ_OP_WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) { bio_io_error(bio); diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index f27d5b0f9a0b..4d661282ff41 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -27,7 +27,6 @@ #include <linux/major.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> -#include <linux/genhd.h> #include <linux/idr.h> #include <linux/dynamic_debug.h> #include <net/tcp.h> @@ -638,9 +637,6 @@ enum { STATE_SENT, /* Do not change state/UUIDs while this is set */ CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC) * pending, from drbd worker context. - * If set, bdi_write_congested() returns true, - * so shrink_page_list() would not recurse into, - * and potentially deadlock on, this drbd worker. */ DISCONNECT_SENT, @@ -1499,7 +1495,7 @@ extern int drbd_resync_finished(struct drbd_device *device); extern void *drbd_md_get_buffer(struct drbd_device *device, const char *intent); extern void drbd_md_put_buffer(struct drbd_device *device); extern int drbd_md_sync_page_io(struct drbd_device *device, - struct drbd_backing_dev *bdev, sector_t sector, int op); + struct drbd_backing_dev *bdev, sector_t sector, enum req_op op); extern void drbd_ov_out_of_sync_found(struct drbd_device *, sector_t, int); extern void wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_backing_dev *bdev, unsigned int *done); @@ -1533,7 +1529,6 @@ extern int w_send_read_req(struct drbd_work *, int); extern int w_e_reissue(struct drbd_work *, int); extern int w_restart_disk_io(struct drbd_work *, int); extern int w_send_out_of_sync(struct drbd_work *, int); -extern int w_start_resync(struct drbd_work *, int); extern void resync_timer_fn(struct timer_list *t); extern void start_resync_timer_fn(struct timer_list *t); @@ -1551,8 +1546,7 @@ extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, bool throttle_if_app_is_waiting); extern int drbd_submit_peer_request(struct drbd_device *, - struct drbd_peer_request *, const unsigned, - const unsigned, const int); + struct drbd_peer_request *, blk_opf_t, int); extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *); extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64, sector_t, unsigned int, @@ -1642,22 +1636,22 @@ struct sib_info { }; void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib); -extern void notify_resource_state(struct sk_buff *, +extern int notify_resource_state(struct sk_buff *, unsigned int, struct drbd_resource *, struct resource_info *, enum drbd_notification_type); -extern void notify_device_state(struct sk_buff *, +extern int notify_device_state(struct sk_buff *, unsigned int, struct drbd_device *, struct device_info *, enum drbd_notification_type); -extern void notify_connection_state(struct sk_buff *, +extern int notify_connection_state(struct sk_buff *, unsigned int, struct drbd_connection *, struct connection_info *, enum drbd_notification_type); -extern void notify_peer_device_state(struct sk_buff *, +extern int notify_peer_device_state(struct sk_buff *, unsigned int, struct drbd_peer_device *, struct peer_device_info *, diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 53ba2dddba6e..f3e4db16fd07 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -171,7 +171,7 @@ void tl_release(struct drbd_connection *connection, unsigned int barrier_nr, unsigned int set_size) { struct drbd_request *r; - struct drbd_request *req = NULL; + struct drbd_request *req = NULL, *tmp = NULL; int expect_epoch = 0; int expect_size = 0; @@ -225,8 +225,11 @@ void tl_release(struct drbd_connection *connection, unsigned int barrier_nr, * to catch requests being barrier-acked "unexpectedly". * It usually should find the same req again, or some READ preceding it. */ list_for_each_entry(req, &connection->transfer_log, tl_requests) - if (req->epoch == expect_epoch) + if (req->epoch == expect_epoch) { + tmp = req; break; + } + req = list_prepare_entry(tmp, &connection->transfer_log, tl_requests); list_for_each_entry_safe_from(req, r, &connection->transfer_log, tl_requests) { if (req->epoch != expect_epoch) break; @@ -729,7 +732,8 @@ int drbd_send_sync_param(struct drbd_peer_device *peer_device) cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM; /* initialize verify_alg and csums_alg */ - memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); + BUILD_BUG_ON(sizeof(p->algs) != 2 * SHARED_SECRET_MAX); + memset(&p->algs, 0, sizeof(p->algs)); if (get_ldev(peer_device->device)) { dc = rcu_dereference(peer_device->device->ldev->disk_conf); @@ -899,31 +903,6 @@ void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device) } } -/* communicated if (agreed_features & DRBD_FF_WSAME) */ -static void -assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, - struct request_queue *q) -{ - if (q) { - p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q)); - p->qlim->logical_block_size = cpu_to_be32(queue_logical_block_size(q)); - p->qlim->alignment_offset = cpu_to_be32(queue_alignment_offset(q)); - p->qlim->io_min = cpu_to_be32(queue_io_min(q)); - p->qlim->io_opt = cpu_to_be32(queue_io_opt(q)); - p->qlim->discard_enabled = blk_queue_discard(q); - p->qlim->write_same_capable = !!q->limits.max_write_same_sectors; - } else { - q = device->rq_queue; - p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q)); - p->qlim->logical_block_size = cpu_to_be32(queue_logical_block_size(q)); - p->qlim->alignment_offset = 0; - p->qlim->io_min = cpu_to_be32(queue_io_min(q)); - p->qlim->io_opt = cpu_to_be32(queue_io_opt(q)); - p->qlim->discard_enabled = 0; - p->qlim->write_same_capable = 0; - } -} - int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enum dds_flags flags) { struct drbd_device *device = peer_device->device; @@ -945,7 +924,9 @@ int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enu memset(p, 0, packet_size); if (get_ldev_if_state(device, D_NEGOTIATING)) { - struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev); + struct block_device *bdev = device->ldev->backing_bdev; + struct request_queue *q = bdev_get_queue(bdev); + d_size = drbd_get_max_capacity(device->ldev); rcu_read_lock(); u_size = rcu_dereference(device->ldev->disk_conf)->disk_size; @@ -953,14 +934,32 @@ int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enu q_order_type = drbd_queue_order_type(device); max_bio_size = queue_max_hw_sectors(q) << 9; max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE); - assign_p_sizes_qlim(device, p, q); + p->qlim->physical_block_size = + cpu_to_be32(bdev_physical_block_size(bdev)); + p->qlim->logical_block_size = + cpu_to_be32(bdev_logical_block_size(bdev)); + p->qlim->alignment_offset = + cpu_to_be32(bdev_alignment_offset(bdev)); + p->qlim->io_min = cpu_to_be32(bdev_io_min(bdev)); + p->qlim->io_opt = cpu_to_be32(bdev_io_opt(bdev)); + p->qlim->discard_enabled = !!bdev_max_discard_sectors(bdev); put_ldev(device); } else { + struct request_queue *q = device->rq_queue; + + p->qlim->physical_block_size = + cpu_to_be32(queue_physical_block_size(q)); + p->qlim->logical_block_size = + cpu_to_be32(queue_logical_block_size(q)); + p->qlim->alignment_offset = 0; + p->qlim->io_min = cpu_to_be32(queue_io_min(q)); + p->qlim->io_opt = cpu_to_be32(queue_io_opt(q)); + p->qlim->discard_enabled = 0; + d_size = 0; u_size = 0; q_order_type = QUEUE_ORDERED_NONE; max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */ - assign_p_sizes_qlim(device, p, NULL); } if (peer_device->connection->agreed_pro_version <= 94) @@ -1590,9 +1589,6 @@ static int _drbd_send_bio(struct drbd_peer_device *peer_device, struct bio *bio) ? 0 : MSG_MORE); if (err) return err; - /* REQ_OP_WRITE_SAME has only one segment */ - if (bio_op(bio) == REQ_OP_WRITE_SAME) - break; } return 0; } @@ -1611,9 +1607,6 @@ static int _drbd_send_zc_bio(struct drbd_peer_device *peer_device, struct bio *b bio_iter_last(bvec, iter) ? 0 : MSG_MORE); if (err) return err; - /* REQ_OP_WRITE_SAME has only one segment */ - if (bio_op(bio) == REQ_OP_WRITE_SAME) - break; } return 0; } @@ -1645,7 +1638,6 @@ static u32 bio_flags_to_wire(struct drbd_connection *connection, return (bio->bi_opf & REQ_SYNC ? DP_RW_SYNC : 0) | (bio->bi_opf & REQ_FUA ? DP_FUA : 0) | (bio->bi_opf & REQ_PREFLUSH ? DP_FLUSH : 0) | - (bio_op(bio) == REQ_OP_WRITE_SAME ? DP_WSAME : 0) | (bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0) | (bio_op(bio) == REQ_OP_WRITE_ZEROES ? ((connection->agreed_features & DRBD_FF_WZEROES) ? @@ -1664,7 +1656,6 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request * struct drbd_device *device = peer_device->device; struct drbd_socket *sock; struct p_data *p; - struct p_wsame *wsame = NULL; void *digest_out; unsigned int dp_flags = 0; int digest_size; @@ -1702,29 +1693,14 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request * err = __send_command(peer_device->connection, device->vnr, sock, cmd, sizeof(*t), NULL, 0); goto out; } - if (dp_flags & DP_WSAME) { - /* this will only work if DRBD_FF_WSAME is set AND the - * handshake agreed that all nodes and backend devices are - * WRITE_SAME capable and agree on logical_block_size */ - wsame = (struct p_wsame*)p; - digest_out = wsame + 1; - wsame->size = cpu_to_be32(req->i.size); - } else - digest_out = p + 1; + digest_out = p + 1; /* our digest is still only over the payload. * TRIM does not carry any payload. */ if (digest_size) drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, digest_out); - if (wsame) { - err = - __send_command(peer_device->connection, device->vnr, sock, P_WSAME, - sizeof(*wsame) + digest_size, NULL, - bio_iovec(req->master_bio).bv_len); - } else - err = - __send_command(peer_device->connection, device->vnr, sock, P_DATA, - sizeof(*p) + digest_size, NULL, req->i.size); + err = __send_command(peer_device->connection, device->vnr, sock, P_DATA, + sizeof(*p) + digest_size, NULL, req->i.size); if (!err) { /* For protocol A, we have to memcpy the payload into * socket buffers, as we may complete right away @@ -2231,7 +2207,7 @@ void drbd_destroy_device(struct kref *kref) if (device->bitmap) /* should no longer be there. */ drbd_bm_cleanup(device); __free_page(device->md_io.page); - blk_cleanup_disk(device->vdisk); + put_disk(device->vdisk); kfree(device->rs_plan_s); /* not for_each_connection(connection, resource): @@ -2734,9 +2710,11 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig disk->first_minor = minor; disk->minors = 1; disk->fops = &drbd_ops; + disk->flags |= GENHD_FL_NO_PART; sprintf(disk->disk_name, "drbd%d", minor); disk->private_data = device; + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); blk_queue_write_cache(disk->queue, true, true); /* Setting the max_hw_sectors to an odd value of 8kibyte here This triggers a max_bio_size message upon first attach or connect */ @@ -2791,12 +2769,12 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig if (init_submitter(device)) { err = ERR_NOMEM; - goto out_idr_remove_vol; + goto out_idr_remove_from_resource; } err = add_disk(disk); if (err) - goto out_idr_remove_vol; + goto out_idr_remove_from_resource; /* inherit the connection state */ device->state.conn = first_connection(resource)->cstate; @@ -2810,8 +2788,6 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig drbd_debugfs_device_add(device); return NO_ERROR; -out_idr_remove_vol: - idr_remove(&connection->peer_devices, vnr); out_idr_remove_from_resource: for_each_connection(connection, resource) { peer_device = idr_remove(&connection->peer_devices, vnr); @@ -2831,7 +2807,7 @@ out_no_minor_idr: out_no_bitmap: __free_page(device->md_io.page); out_no_io_page: - blk_cleanup_disk(disk); + put_disk(disk); out_no_disk: kref_put(&resource->kref, drbd_destroy_resource); kfree(device); @@ -3605,9 +3581,8 @@ const char *cmdname(enum drbd_packet cmd) * when we want to support more than * one PRO_VERSION */ static const char *cmdnames[] = { + [P_DATA] = "Data", - [P_WSAME] = "WriteSame", - [P_TRIM] = "Trim", [P_DATA_REPLY] = "DataReply", [P_RS_DATA_REPLY] = "RSDataReply", [P_BARRIER] = "Barrier", @@ -3618,7 +3593,6 @@ const char *cmdname(enum drbd_packet cmd) [P_DATA_REQUEST] = "DataRequest", [P_RS_DATA_REQUEST] = "RSDataRequest", [P_SYNC_PARAM] = "SyncParam", - [P_SYNC_PARAM89] = "SyncParam89", [P_PROTOCOL] = "ReportProtocol", [P_UUIDS] = "ReportUUIDs", [P_SIZES] = "ReportSizes", @@ -3626,6 +3600,7 @@ const char *cmdname(enum drbd_packet cmd) [P_SYNC_UUID] = "ReportSyncUUID", [P_AUTH_CHALLENGE] = "AuthChallenge", [P_AUTH_RESPONSE] = "AuthResponse", + [P_STATE_CHG_REQ] = "StateChgRequest", [P_PING] = "Ping", [P_PING_ACK] = "PingAck", [P_RECV_ACK] = "RecvAck", @@ -3636,23 +3611,25 @@ const char *cmdname(enum drbd_packet cmd) [P_NEG_DREPLY] = "NegDReply", [P_NEG_RS_DREPLY] = "NegRSDReply", [P_BARRIER_ACK] = "BarrierAck", - [P_STATE_CHG_REQ] = "StateChgRequest", [P_STATE_CHG_REPLY] = "StateChgReply", [P_OV_REQUEST] = "OVRequest", [P_OV_REPLY] = "OVReply", [P_OV_RESULT] = "OVResult", [P_CSUM_RS_REQUEST] = "CsumRSRequest", [P_RS_IS_IN_SYNC] = "CsumRSIsInSync", + [P_SYNC_PARAM89] = "SyncParam89", [P_COMPRESSED_BITMAP] = "CBitmap", [P_DELAY_PROBE] = "DelayProbe", [P_OUT_OF_SYNC] = "OutOfSync", - [P_RETRY_WRITE] = "RetryWrite", [P_RS_CANCEL] = "RSCancel", [P_CONN_ST_CHG_REQ] = "conn_st_chg_req", [P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply", [P_PROTOCOL_UPDATE] = "protocol_update", + [P_TRIM] = "Trim", [P_RS_THIN_REQ] = "rs_thin_req", [P_RS_DEALLOCATED] = "rs_deallocated", + [P_WSAME] = "WriteSame", + [P_ZEROES] = "Zeroes", /* enum drbd_packet, but not commands - obsoleted flags: * P_MAY_IGNORE diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 44ccf8b4f4b2..864c98e74875 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -770,6 +770,7 @@ int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) struct set_role_parms parms; int err; enum drbd_ret_code retcode; + enum drbd_state_rv rv; retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) @@ -790,14 +791,14 @@ int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) mutex_lock(&adm_ctx.resource->adm_mutex); if (info->genlhdr->cmd == DRBD_ADM_PRIMARY) - retcode = (enum drbd_ret_code)drbd_set_role(adm_ctx.device, - R_PRIMARY, parms.assume_uptodate); + rv = drbd_set_role(adm_ctx.device, R_PRIMARY, parms.assume_uptodate); else - retcode = (enum drbd_ret_code)drbd_set_role(adm_ctx.device, - R_SECONDARY, 0); + rv = drbd_set_role(adm_ctx.device, R_SECONDARY, 0); mutex_unlock(&adm_ctx.resource->adm_mutex); genl_lock(); + drbd_adm_finish(&adm_ctx, info, rv); + return 0; out: drbd_adm_finish(&adm_ctx, info, retcode); return 0; @@ -1204,50 +1205,40 @@ static unsigned int drbd_max_discard_sectors(struct drbd_connection *connection) } static void decide_on_discard_support(struct drbd_device *device, - struct request_queue *q, - struct request_queue *b, - bool discard_zeroes_if_aligned) + struct drbd_backing_dev *bdev) { - /* q = drbd device queue (device->rq_queue) - * b = backing device queue (device->ldev->backing_bdev->bd_disk->queue), - * or NULL if diskless - */ - struct drbd_connection *connection = first_peer_device(device)->connection; - bool can_do = b ? blk_queue_discard(b) : true; - - if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_TRIM)) { - can_do = false; - drbd_info(connection, "peer DRBD too old, does not support TRIM: disabling discards\n"); - } - if (can_do) { - /* We don't care for the granularity, really. - * Stacking limits below should fix it for the local - * device. Whether or not it is a suitable granularity - * on the remote device is not our problem, really. If - * you care, you need to use devices with similar - * topology on all peers. */ - blk_queue_discard_granularity(q, 512); - q->limits.max_discard_sectors = drbd_max_discard_sectors(connection); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); - q->limits.max_write_zeroes_sectors = drbd_max_discard_sectors(connection); - } else { - blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q); - blk_queue_discard_granularity(q, 0); - q->limits.max_discard_sectors = 0; - q->limits.max_write_zeroes_sectors = 0; - } -} + struct drbd_connection *connection = + first_peer_device(device)->connection; + struct request_queue *q = device->rq_queue; -static void fixup_discard_if_not_supported(struct request_queue *q) -{ - /* To avoid confusion, if this queue does not support discard, clear - * max_discard_sectors, which is what lsblk -D reports to the user. - * Older kernels got this wrong in "stack limits". - * */ - if (!blk_queue_discard(q)) { - blk_queue_max_discard_sectors(q, 0); - blk_queue_discard_granularity(q, 0); + if (bdev && !bdev_max_discard_sectors(bdev->backing_bdev)) + goto not_supported; + + if (connection->cstate >= C_CONNECTED && + !(connection->agreed_features & DRBD_FF_TRIM)) { + drbd_info(connection, + "peer DRBD too old, does not support TRIM: disabling discards\n"); + goto not_supported; } + + /* + * We don't care for the granularity, really. + * + * Stacking limits below should fix it for the local device. Whether or + * not it is a suitable granularity on the remote device is not our + * problem, really. If you care, you need to use devices with similar + * topology on all peers. + */ + blk_queue_discard_granularity(q, 512); + q->limits.max_discard_sectors = drbd_max_discard_sectors(connection); + q->limits.max_write_zeroes_sectors = + drbd_max_discard_sectors(connection); + return; + +not_supported: + blk_queue_discard_granularity(q, 0); + q->limits.max_discard_sectors = 0; + q->limits.max_write_zeroes_sectors = 0; } static void fixup_write_zeroes(struct drbd_device *device, struct request_queue *q) @@ -1265,71 +1256,6 @@ static void fixup_write_zeroes(struct drbd_device *device, struct request_queue q->limits.max_write_zeroes_sectors = 0; } -static void decide_on_write_same_support(struct drbd_device *device, - struct request_queue *q, - struct request_queue *b, struct o_qlim *o, - bool disable_write_same) -{ - struct drbd_peer_device *peer_device = first_peer_device(device); - struct drbd_connection *connection = peer_device->connection; - bool can_do = b ? b->limits.max_write_same_sectors : true; - - if (can_do && disable_write_same) { - can_do = false; - drbd_info(peer_device, "WRITE_SAME disabled by config\n"); - } - - if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_WSAME)) { - can_do = false; - drbd_info(peer_device, "peer does not support WRITE_SAME\n"); - } - - if (o) { - /* logical block size; queue_logical_block_size(NULL) is 512 */ - unsigned int peer_lbs = be32_to_cpu(o->logical_block_size); - unsigned int me_lbs_b = queue_logical_block_size(b); - unsigned int me_lbs = queue_logical_block_size(q); - - if (me_lbs_b != me_lbs) { - drbd_warn(device, - "logical block size of local backend does not match (drbd:%u, backend:%u); was this a late attach?\n", - me_lbs, me_lbs_b); - /* rather disable write same than trigger some BUG_ON later in the scsi layer. */ - can_do = false; - } - if (me_lbs_b != peer_lbs) { - drbd_warn(peer_device, "logical block sizes do not match (me:%u, peer:%u); this may cause problems.\n", - me_lbs, peer_lbs); - if (can_do) { - drbd_dbg(peer_device, "logical block size mismatch: WRITE_SAME disabled.\n"); - can_do = false; - } - me_lbs = max(me_lbs, me_lbs_b); - /* We cannot change the logical block size of an in-use queue. - * We can only hope that access happens to be properly aligned. - * If not, the peer will likely produce an IO error, and detach. */ - if (peer_lbs > me_lbs) { - if (device->state.role != R_PRIMARY) { - blk_queue_logical_block_size(q, peer_lbs); - drbd_warn(peer_device, "logical block size set to %u\n", peer_lbs); - } else { - drbd_warn(peer_device, - "current Primary must NOT adjust logical block size (%u -> %u); hope for the best.\n", - me_lbs, peer_lbs); - } - } - } - if (can_do && !o->write_same_capable) { - /* If we introduce an open-coded write-same loop on the receiving side, - * the peer would present itself as "capable". */ - drbd_dbg(peer_device, "WRITE_SAME disabled (peer device not capable)\n"); - can_do = false; - } - } - - blk_queue_max_write_same_sectors(q, can_do ? DRBD_MAX_BBIO_SECTORS : 0); -} - static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev, unsigned int max_bio_size, struct o_qlim *o) { @@ -1338,8 +1264,6 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi unsigned int max_segments = 0; struct request_queue *b = NULL; struct disk_conf *dc; - bool discard_zeroes_if_aligned = true; - bool disable_write_same = false; if (bdev) { b = bdev->backing_bdev->bd_disk->queue; @@ -1348,8 +1272,6 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi rcu_read_lock(); dc = rcu_dereference(device->ldev->disk_conf); max_segments = dc->max_bio_bvecs; - discard_zeroes_if_aligned = dc->discard_zeroes_if_aligned; - disable_write_same = dc->disable_write_same; rcu_read_unlock(); blk_set_stacking_limits(&q->limits); @@ -1359,14 +1281,12 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi /* This is the workaround for "bio would need to, but cannot, be split" */ blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS); blk_queue_segment_boundary(q, PAGE_SIZE-1); - decide_on_discard_support(device, q, b, discard_zeroes_if_aligned); - decide_on_write_same_support(device, q, b, o, disable_write_same); + decide_on_discard_support(device, bdev); if (b) { blk_stack_limits(&q->limits, &b->limits, 0); disk_update_readahead(device->vdisk); } - fixup_discard_if_not_supported(q); fixup_write_zeroes(device, q); } @@ -1505,14 +1425,14 @@ static bool write_ordering_changed(struct disk_conf *a, struct disk_conf *b) static void sanitize_disk_conf(struct drbd_device *device, struct disk_conf *disk_conf, struct drbd_backing_dev *nbc) { - struct request_queue * const q = nbc->backing_bdev->bd_disk->queue; + struct block_device *bdev = nbc->backing_bdev; if (disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) disk_conf->al_extents = DRBD_AL_EXTENTS_MIN; if (disk_conf->al_extents > drbd_al_extents_max(nbc)) disk_conf->al_extents = drbd_al_extents_max(nbc); - if (!blk_queue_discard(q)) { + if (!bdev_max_discard_sectors(bdev)) { if (disk_conf->rs_discard_granularity) { disk_conf->rs_discard_granularity = 0; /* disable feature */ drbd_info(device, "rs_discard_granularity feature disabled\n"); @@ -1521,16 +1441,19 @@ static void sanitize_disk_conf(struct drbd_device *device, struct disk_conf *dis if (disk_conf->rs_discard_granularity) { int orig_value = disk_conf->rs_discard_granularity; + sector_t discard_size = bdev_max_discard_sectors(bdev) << 9; + unsigned int discard_granularity = bdev_discard_granularity(bdev); int remainder; - if (q->limits.discard_granularity > disk_conf->rs_discard_granularity) - disk_conf->rs_discard_granularity = q->limits.discard_granularity; + if (discard_granularity > disk_conf->rs_discard_granularity) + disk_conf->rs_discard_granularity = discard_granularity; - remainder = disk_conf->rs_discard_granularity % q->limits.discard_granularity; + remainder = disk_conf->rs_discard_granularity % + discard_granularity; disk_conf->rs_discard_granularity += remainder; - if (disk_conf->rs_discard_granularity > q->limits.max_discard_sectors << 9) - disk_conf->rs_discard_granularity = q->limits.max_discard_sectors << 9; + if (disk_conf->rs_discard_granularity > discard_size) + disk_conf->rs_discard_granularity = discard_size; if (disk_conf->rs_discard_granularity != orig_value) drbd_info(device, "rs_discard_granularity changed to %d\n", @@ -1666,8 +1589,8 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) if (write_ordering_changed(old_disk_conf, new_disk_conf)) drbd_bump_write_ordering(device->resource, NULL, WO_BDEV_FLUSH); - if (old_disk_conf->discard_zeroes_if_aligned != new_disk_conf->discard_zeroes_if_aligned - || old_disk_conf->disable_write_same != new_disk_conf->disable_write_same) + if (old_disk_conf->discard_zeroes_if_aligned != + new_disk_conf->discard_zeroes_if_aligned) drbd_reconsider_queue_parameters(device, device->ldev, NULL); drbd_md_sync(device); @@ -1679,8 +1602,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) drbd_send_sync_param(peer_device); } - synchronize_rcu(); - kfree(old_disk_conf); + kvfree_rcu(old_disk_conf); kfree(old_plan); mod_timer(&device->request_timer, jiffies + HZ); goto success; @@ -2511,8 +2433,7 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) mutex_unlock(&connection->resource->conf_update); mutex_unlock(&connection->data.mutex); - synchronize_rcu(); - kfree(old_net_conf); + kvfree_rcu(old_net_conf); if (connection->cstate >= C_WF_REPORT_PARAMS) { struct drbd_peer_device *peer_device; @@ -2570,6 +2491,7 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) struct drbd_resource *resource; struct drbd_connection *connection; enum drbd_ret_code retcode; + enum drbd_state_rv rv; int i; int err; @@ -2689,12 +2611,11 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) } rcu_read_unlock(); - retcode = (enum drbd_ret_code)conn_request_state(connection, - NS(conn, C_UNCONNECTED), CS_VERBOSE); + rv = conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE); conn_reconfig_done(connection); mutex_unlock(&adm_ctx.resource->adm_mutex); - drbd_adm_finish(&adm_ctx, info, retcode); + drbd_adm_finish(&adm_ctx, info, rv); return 0; fail: @@ -2802,11 +2723,12 @@ int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info) mutex_lock(&adm_ctx.resource->adm_mutex); rv = conn_try_disconnect(connection, parms.force_disconnect); - if (rv < SS_SUCCESS) - retcode = (enum drbd_ret_code)rv; - else - retcode = NO_ERROR; mutex_unlock(&adm_ctx.resource->adm_mutex); + if (rv < SS_SUCCESS) { + drbd_adm_finish(&adm_ctx, info, rv); + return 0; + } + retcode = NO_ERROR; fail: drbd_adm_finish(&adm_ctx, info, retcode); return 0; @@ -2925,8 +2847,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) new_disk_conf->disk_size = (sector_t)rs.resize_size; rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); mutex_unlock(&device->resource->conf_update); - synchronize_rcu(); - kfree(old_disk_conf); + kvfree_rcu(old_disk_conf); new_disk_conf = NULL; } @@ -4617,7 +4538,7 @@ static int nla_put_notification_header(struct sk_buff *msg, return drbd_notification_header_to_skb(msg, &nh, true); } -void notify_resource_state(struct sk_buff *skb, +int notify_resource_state(struct sk_buff *skb, unsigned int seq, struct drbd_resource *resource, struct resource_info *resource_info, @@ -4659,16 +4580,17 @@ void notify_resource_state(struct sk_buff *skb, if (err && err != -ESRCH) goto failed; } - return; + return 0; nla_put_failure: nlmsg_free(skb); failed: drbd_err(resource, "Error %d while broadcasting event. Event seq:%u\n", err, seq); + return err; } -void notify_device_state(struct sk_buff *skb, +int notify_device_state(struct sk_buff *skb, unsigned int seq, struct drbd_device *device, struct device_info *device_info, @@ -4708,16 +4630,17 @@ void notify_device_state(struct sk_buff *skb, if (err && err != -ESRCH) goto failed; } - return; + return 0; nla_put_failure: nlmsg_free(skb); failed: drbd_err(device, "Error %d while broadcasting event. Event seq:%u\n", err, seq); + return err; } -void notify_connection_state(struct sk_buff *skb, +int notify_connection_state(struct sk_buff *skb, unsigned int seq, struct drbd_connection *connection, struct connection_info *connection_info, @@ -4757,16 +4680,17 @@ void notify_connection_state(struct sk_buff *skb, if (err && err != -ESRCH) goto failed; } - return; + return 0; nla_put_failure: nlmsg_free(skb); failed: drbd_err(connection, "Error %d while broadcasting event. Event seq:%u\n", err, seq); + return err; } -void notify_peer_device_state(struct sk_buff *skb, +int notify_peer_device_state(struct sk_buff *skb, unsigned int seq, struct drbd_peer_device *peer_device, struct peer_device_info *peer_device_info, @@ -4807,13 +4731,14 @@ void notify_peer_device_state(struct sk_buff *skb, if (err && err != -ESRCH) goto failed; } - return; + return 0; nla_put_failure: nlmsg_free(skb); failed: drbd_err(peer_device, "Error %d while broadcasting event. Event seq:%u\n", err, seq); + return err; } void notify_helper(enum drbd_notification_type type, @@ -4827,7 +4752,7 @@ void notify_helper(enum drbd_notification_type type, struct drbd_genlmsghdr *dh; int err; - strlcpy(helper_info.helper_name, name, sizeof(helper_info.helper_name)); + strscpy(helper_info.helper_name, name, sizeof(helper_info.helper_name)); helper_info.helper_name_len = min(strlen(name), sizeof(helper_info.helper_name)); helper_info.helper_status = status; @@ -4864,7 +4789,7 @@ fail: err, seq); } -static void notify_initial_state_done(struct sk_buff *skb, unsigned int seq) +static int notify_initial_state_done(struct sk_buff *skb, unsigned int seq) { struct drbd_genlmsghdr *dh; int err; @@ -4878,11 +4803,12 @@ static void notify_initial_state_done(struct sk_buff *skb, unsigned int seq) if (nla_put_notification_header(skb, NOTIFY_EXISTS)) goto nla_put_failure; genlmsg_end(skb, dh); - return; + return 0; nla_put_failure: nlmsg_free(skb); pr_err("Error %d sending event. Event seq:%u\n", err, seq); + return err; } static void free_state_changes(struct list_head *list) @@ -4909,6 +4835,7 @@ static int get_initial_state(struct sk_buff *skb, struct netlink_callback *cb) unsigned int seq = cb->args[2]; unsigned int n; enum drbd_notification_type flags = 0; + int err = 0; /* There is no need for taking notification_mutex here: it doesn't matter if the initial state events mix with later state chage @@ -4917,32 +4844,32 @@ static int get_initial_state(struct sk_buff *skb, struct netlink_callback *cb) cb->args[5]--; if (cb->args[5] == 1) { - notify_initial_state_done(skb, seq); + err = notify_initial_state_done(skb, seq); goto out; } n = cb->args[4]++; if (cb->args[4] < cb->args[3]) flags |= NOTIFY_CONTINUES; if (n < 1) { - notify_resource_state_change(skb, seq, state_change->resource, + err = notify_resource_state_change(skb, seq, state_change->resource, NOTIFY_EXISTS | flags); goto next; } n--; if (n < state_change->n_connections) { - notify_connection_state_change(skb, seq, &state_change->connections[n], + err = notify_connection_state_change(skb, seq, &state_change->connections[n], NOTIFY_EXISTS | flags); goto next; } n -= state_change->n_connections; if (n < state_change->n_devices) { - notify_device_state_change(skb, seq, &state_change->devices[n], + err = notify_device_state_change(skb, seq, &state_change->devices[n], NOTIFY_EXISTS | flags); goto next; } n -= state_change->n_devices; if (n < state_change->n_devices * state_change->n_connections) { - notify_peer_device_state_change(skb, seq, &state_change->peer_devices[n], + err = notify_peer_device_state_change(skb, seq, &state_change->peer_devices[n], NOTIFY_EXISTS | flags); goto next; } @@ -4957,7 +4884,10 @@ next: cb->args[4] = 0; } out: - return skb->len; + if (err) + return err; + else + return skb->len; } int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb) diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h index dea59c92ecc1..a882b65ab5d2 100644 --- a/drivers/block/drbd/drbd_protocol.h +++ b/drivers/block/drbd/drbd_protocol.h @@ -283,8 +283,10 @@ struct p_rs_param_89 { struct p_rs_param_95 { u32 resync_rate; - char verify_alg[SHARED_SECRET_MAX]; - char csums_alg[SHARED_SECRET_MAX]; + struct_group(algs, + char verify_alg[SHARED_SECRET_MAX]; + char csums_alg[SHARED_SECRET_MAX]; + ); u32 c_plan_ahead; u32 c_delay_target; u32 c_fill_target; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 1f740e42e457..ee69d50ba4fd 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -364,7 +364,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto struct drbd_device *device = peer_device->device; struct drbd_peer_request *peer_req; struct page *page = NULL; - unsigned nr_pages = (payload_size + PAGE_SIZE -1) >> PAGE_SHIFT; + unsigned int nr_pages = PFN_UP(payload_size); if (drbd_insert_fault(device, DRBD_FAULT_AL_EE)) return NULL; @@ -781,7 +781,7 @@ static struct socket *drbd_wait_for_connect(struct drbd_connection *connection, timeo = connect_int * HZ; /* 28.5% random jitter */ - timeo += (prandom_u32() & 1) ? timeo / 7 : -timeo / 7; + timeo += prandom_u32_max(2) ? timeo / 7 : -timeo / 7; err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo); if (err <= 0) @@ -1004,7 +1004,7 @@ retry: drbd_warn(connection, "Error receiving initial packet\n"); sock_release(s); randomize: - if (prandom_u32() & 1) + if (prandom_u32_max(2)) goto retry; } } @@ -1279,16 +1279,16 @@ static void one_flush_endio(struct bio *bio) static void submit_one_flush(struct drbd_device *device, struct issue_flush_context *ctx) { - struct bio *bio = bio_alloc(GFP_NOIO, 0); + struct bio *bio = bio_alloc(device->ldev->backing_bdev, 0, + REQ_OP_FLUSH | REQ_PREFLUSH, GFP_NOIO); struct one_flush_context *octx = kmalloc(sizeof(*octx), GFP_NOIO); - if (!bio || !octx) { - drbd_warn(device, "Could not allocate a bio, CANNOT ISSUE FLUSH\n"); + + if (!octx) { + drbd_warn(device, "Could not allocate a octx, CANNOT ISSUE FLUSH\n"); /* FIXME: what else can I do now? disconnecting or detaching * really does not help to improve the state of the world, either. */ - kfree(octx); - if (bio) - bio_put(bio); + bio_put(bio); ctx->error = -ENOMEM; put_ldev(device); @@ -1298,10 +1298,8 @@ static void submit_one_flush(struct drbd_device *device, struct issue_flush_cont octx->device = device; octx->ctx = ctx; - bio_set_dev(bio, device->ldev->backing_bdev); bio->bi_private = octx; bio->bi_end_io = one_flush_endio; - bio->bi_opf = REQ_OP_FLUSH | REQ_PREFLUSH; device->flush_jif = jiffies; set_bit(FLUSH_PENDING, &device->flags); @@ -1513,7 +1511,6 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, int flags) { struct block_device *bdev = device->ldev->backing_bdev; - struct request_queue *q = bdev_get_queue(bdev); sector_t tmp, nr; unsigned int max_discard_sectors, granularity; int alignment; @@ -1523,10 +1520,10 @@ int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, u goto zero_out; /* Zero-sector (unknown) and one-sector granularities are the same. */ - granularity = max(q->limits.discard_granularity >> 9, 1U); + granularity = max(bdev_discard_granularity(bdev) >> 9, 1U); alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; - max_discard_sectors = min(q->limits.max_discard_sectors, (1U << 22)); + max_discard_sectors = min(bdev_max_discard_sectors(bdev), (1U << 22)); max_discard_sectors -= max_discard_sectors % granularity; if (unlikely(!max_discard_sectors)) goto zero_out; @@ -1550,7 +1547,8 @@ int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, u start = tmp; } while (nr_sectors >= max_discard_sectors) { - err |= blkdev_issue_discard(bdev, start, max_discard_sectors, GFP_NOIO, 0); + err |= blkdev_issue_discard(bdev, start, max_discard_sectors, + GFP_NOIO); nr_sectors -= max_discard_sectors; start += max_discard_sectors; } @@ -1562,7 +1560,7 @@ int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, u nr = nr_sectors; nr -= (unsigned int)nr % granularity; if (nr) { - err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO, 0); + err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO); nr_sectors -= nr; start += nr; } @@ -1577,11 +1575,10 @@ int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, u static bool can_do_reliable_discards(struct drbd_device *device) { - struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev); struct disk_conf *dc; bool can_do; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(device->ldev->backing_bdev)) return false; rcu_read_lock(); @@ -1606,19 +1603,7 @@ static void drbd_issue_peer_discard_or_zero_out(struct drbd_device *device, stru drbd_endio_write_sec_final(peer_req); } -static void drbd_issue_peer_wsame(struct drbd_device *device, - struct drbd_peer_request *peer_req) -{ - struct block_device *bdev = device->ldev->backing_bdev; - sector_t s = peer_req->i.sector; - sector_t nr = peer_req->i.size >> 9; - if (blkdev_issue_write_same(bdev, s, nr, GFP_NOIO, peer_req->pages)) - peer_req->flags |= EE_WAS_ERROR; - drbd_endio_write_sec_final(peer_req); -} - - -/* +/** * drbd_submit_peer_request() * @device: DRBD device. * @peer_req: peer request @@ -1636,17 +1621,15 @@ static void drbd_issue_peer_wsame(struct drbd_device *device, /* TODO allocate from our own bio_set. */ int drbd_submit_peer_request(struct drbd_device *device, struct drbd_peer_request *peer_req, - const unsigned op, const unsigned op_flags, - const int fault_type) + const blk_opf_t opf, const int fault_type) { struct bio *bios = NULL; struct bio *bio; struct page *page = peer_req->pages; sector_t sector = peer_req->i.sector; - unsigned data_size = peer_req->i.size; - unsigned n_bios = 0; - unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; - int err = -ENOMEM; + unsigned int data_size = peer_req->i.size; + unsigned int n_bios = 0; + unsigned int nr_pages = PFN_UP(data_size); /* TRIM/DISCARD: for now, always use the helper function * blkdev_issue_zeroout(..., discard=true). @@ -1654,7 +1637,7 @@ int drbd_submit_peer_request(struct drbd_device *device, * Correctness first, performance later. Next step is to code an * asynchronous variant of the same. */ - if (peer_req->flags & (EE_TRIM|EE_WRITE_SAME|EE_ZEROOUT)) { + if (peer_req->flags & (EE_TRIM | EE_ZEROOUT)) { /* wait for all pending IO completions, before we start * zeroing things out. */ conn_wait_active_ee_empty(peer_req->peer_device->connection); @@ -1671,10 +1654,7 @@ int drbd_submit_peer_request(struct drbd_device *device, spin_unlock_irq(&device->resource->req_lock); } - if (peer_req->flags & (EE_TRIM|EE_ZEROOUT)) - drbd_issue_peer_discard_or_zero_out(device, peer_req); - else /* EE_WRITE_SAME */ - drbd_issue_peer_wsame(device, peer_req); + drbd_issue_peer_discard_or_zero_out(device, peer_req); return 0; } @@ -1687,15 +1667,9 @@ int drbd_submit_peer_request(struct drbd_device *device, * generated bio, but a bio allocated on behalf of the peer. */ next_bio: - bio = bio_alloc(GFP_NOIO, nr_pages); - if (!bio) { - drbd_err(device, "submit_ee: Allocation of a bio failed (nr_pages=%u)\n", nr_pages); - goto fail; - } + bio = bio_alloc(device->ldev->backing_bdev, nr_pages, opf, GFP_NOIO); /* > peer_req->i.sector, unless this is the first bio */ bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, device->ldev->backing_bdev); - bio_set_op_attrs(bio, op, op_flags); bio->bi_private = peer_req; bio->bi_end_io = drbd_peer_request_endio; @@ -1726,14 +1700,6 @@ next_bio: drbd_submit_bio_noacct(device, fault_type, bio); } while (bios); return 0; - -fail: - while (bios) { - bio = bios; - bios = bios->bi_next; - bio_put(bio); - } - return err; } static void drbd_remove_epoch_entry_interval(struct drbd_device *device, @@ -1870,7 +1836,6 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, unsigned long *data; struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL; struct p_trim *zeroes = (pi->cmd == P_ZEROES) ? pi->data : NULL; - struct p_trim *wsame = (pi->cmd == P_WSAME) ? pi->data : NULL; digest_size = 0; if (!trim && peer_device->connection->peer_integrity_tfm) { @@ -1885,7 +1850,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, data_size -= digest_size; } - /* assume request_size == data_size, but special case trim and wsame. */ + /* assume request_size == data_size, but special case trim. */ ds = data_size; if (trim) { if (!expect(data_size == 0)) @@ -1895,23 +1860,11 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, if (!expect(data_size == 0)) return NULL; ds = be32_to_cpu(zeroes->size); - } else if (wsame) { - if (data_size != queue_logical_block_size(device->rq_queue)) { - drbd_err(peer_device, "data size (%u) != drbd logical block size (%u)\n", - data_size, queue_logical_block_size(device->rq_queue)); - return NULL; - } - if (data_size != bdev_logical_block_size(device->ldev->backing_bdev)) { - drbd_err(peer_device, "data size (%u) != backend logical block size (%u)\n", - data_size, bdev_logical_block_size(device->ldev->backing_bdev)); - return NULL; - } - ds = be32_to_cpu(wsame->size); } if (!expect(IS_ALIGNED(ds, 512))) return NULL; - if (trim || wsame || zeroes) { + if (trim || zeroes) { if (!expect(ds <= (DRBD_MAX_BBIO_SECTORS << 9))) return NULL; } else if (!expect(ds <= DRBD_MAX_BIO_SIZE)) @@ -1943,8 +1896,6 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, peer_req->flags |= EE_ZEROOUT; return peer_req; } - if (wsame) - peer_req->flags |= EE_WRITE_SAME; /* receive payload size bytes into page chain */ ds = data_size; @@ -2033,10 +1984,10 @@ static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_req D_ASSERT(peer_device->device, sector == bio->bi_iter.bi_sector); bio_for_each_segment(bvec, bio, iter) { - void *mapped = kmap(bvec.bv_page) + bvec.bv_offset; + void *mapped = bvec_kmap_local(&bvec); expect = min_t(int, data_size, bvec.bv_len); err = drbd_recv_all_warn(peer_device->connection, mapped, expect); - kunmap(bvec.bv_page); + kunmap_local(mapped); if (err) return err; data_size -= expect; @@ -2107,7 +2058,7 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto spin_unlock_irq(&device->resource->req_lock); atomic_add(pi->size >> 9, &device->rs_sect_ev); - if (drbd_submit_peer_request(device, peer_req, REQ_OP_WRITE, 0, + if (drbd_submit_peer_request(device, peer_req, REQ_OP_WRITE, DRBD_FAULT_RS_WR) == 0) return 0; @@ -2162,9 +2113,6 @@ static int receive_DataReply(struct drbd_connection *connection, struct packet_i if (unlikely(!req)) return -EIO; - /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid - * special casing it there for the various failure cases. - * still no race with drbd_fail_pending_reads */ err = recv_dless_read(peer_device, req, sector, pi->size); if (!err) req_mod(req, DATA_RECEIVED); @@ -2430,21 +2378,19 @@ static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, co /* see also bio_flags_to_wire() * DRBD_REQ_*, because we need to semantically map the flags to data packet * flags and back. We may replicate to other kernel versions. */ -static unsigned long wire_flags_to_bio_flags(u32 dpf) +static blk_opf_t wire_flags_to_bio_flags(u32 dpf) { return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) | (dpf & DP_FUA ? REQ_FUA : 0) | (dpf & DP_FLUSH ? REQ_PREFLUSH : 0); } -static unsigned long wire_flags_to_bio_op(u32 dpf) +static enum req_op wire_flags_to_bio_op(u32 dpf) { if (dpf & DP_ZEROES) return REQ_OP_WRITE_ZEROES; if (dpf & DP_DISCARD) return REQ_OP_DISCARD; - if (dpf & DP_WSAME) - return REQ_OP_WRITE_SAME; else return REQ_OP_WRITE; } @@ -2592,7 +2538,8 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * struct drbd_peer_request *peer_req; struct p_data *p = pi->data; u32 peer_seq = be32_to_cpu(p->seq_num); - int op, op_flags; + enum req_op op; + blk_opf_t op_flags; u32 dp_flags; int err, tp; @@ -2711,11 +2658,11 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * update_peer_seq(peer_device, peer_seq); spin_lock_irq(&device->resource->req_lock); } - /* TRIM and WRITE_SAME are processed synchronously, + /* TRIM and is processed synchronously, * we wait for all pending requests, respectively wait for * active_ee to become empty in drbd_submit_peer_request(); * better not add ourselves here. */ - if ((peer_req->flags & (EE_TRIM|EE_WRITE_SAME|EE_ZEROOUT)) == 0) + if ((peer_req->flags & (EE_TRIM | EE_ZEROOUT)) == 0) list_add_tail(&peer_req->w.list, &device->active_ee); spin_unlock_irq(&device->resource->req_lock); @@ -2730,7 +2677,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * peer_req->flags |= EE_CALL_AL_COMPLETE_IO; } - err = drbd_submit_peer_request(device, peer_req, op, op_flags, + err = drbd_submit_peer_request(device, peer_req, op | op_flags, DRBD_FAULT_DT_WR); if (!err) return 0; @@ -3028,7 +2975,7 @@ submit_for_resync: submit: update_receiver_timing_details(connection, drbd_submit_peer_request); inc_unacked(device); - if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ, 0, + if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ, fault_type) == 0) return 0; @@ -3799,8 +3746,7 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in drbd_info(connection, "peer data-integrity-alg: %s\n", integrity_alg[0] ? integrity_alg : "(none)"); - synchronize_rcu(); - kfree(old_net_conf); + kvfree_rcu(old_net_conf); return 0; disconnect_rcu_unlock: @@ -3921,7 +3867,8 @@ static int receive_SyncParam(struct drbd_connection *connection, struct packet_i /* initialize verify_alg and csums_alg */ p = pi->data; - memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); + BUILD_BUG_ON(sizeof(p->algs) != 2 * SHARED_SECRET_MAX); + memset(&p->algs, 0, sizeof(p->algs)); err = drbd_recv_all(peer_device->connection, p, header_size); if (err) @@ -3950,7 +3897,6 @@ static int receive_SyncParam(struct drbd_connection *connection, struct packet_i drbd_err(device, "verify-alg of wrong size, " "peer wants %u, accepting only up to %u byte\n", data_size, SHARED_SECRET_MAX); - err = -EIO; goto reconnect; } @@ -4168,8 +4114,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); mutex_unlock(&connection->resource->conf_update); - synchronize_rcu(); - kfree(old_disk_conf); + kvfree_rcu(old_disk_conf); drbd_info(device, "Peer sets u_size to %lu sectors (old: %lu)\n", (unsigned long)p_usize, (unsigned long)my_usize); @@ -5002,7 +4947,7 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac if (get_ldev(device)) { struct drbd_peer_request *peer_req; - const int op = REQ_OP_WRITE_ZEROES; + const enum req_op op = REQ_OP_WRITE_ZEROES; peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector, size, 0, GFP_NOIO); @@ -5020,7 +4965,8 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac spin_unlock_irq(&device->resource->req_lock); atomic_add(pi->size >> 9, &device->rs_sect_ev); - err = drbd_submit_peer_request(device, peer_req, op, 0, DRBD_FAULT_RS_WR); + err = drbd_submit_peer_request(device, peer_req, op, + DRBD_FAULT_RS_WR); if (err) { spin_lock_irq(&device->resource->req_lock); @@ -5083,7 +5029,6 @@ static struct data_cmd drbd_cmd_handler[] = { [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data }, [P_ZEROES] = { 0, sizeof(struct p_trim), receive_Data }, [P_RS_DEALLOCATED] = { 0, sizeof(struct p_block_desc), receive_rs_deallocated }, - [P_WSAME] = { 1, sizeof(struct p_wsame), receive_Data }, }; static void drbdd(struct drbd_connection *connection) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 3235532ae077..7f9bcc82fc9c 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -30,12 +30,7 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio return NULL; memset(req, 0, sizeof(*req)); - req->private_bio = bio_clone_fast(bio_src, GFP_NOIO, &drbd_io_bio_set); - req->private_bio->bi_private = req; - req->private_bio->bi_end_io = drbd_request_endio; - req->rq_state = (bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0) - | (bio_op(bio_src) == REQ_OP_WRITE_SAME ? RQ_WSAME : 0) | (bio_op(bio_src) == REQ_OP_WRITE_ZEROES ? RQ_ZEROES : 0) | (bio_op(bio_src) == REQ_OP_DISCARD ? RQ_UNMAP : 0); req->device = device; @@ -180,7 +175,8 @@ void start_new_tl_epoch(struct drbd_connection *connection) void complete_master_bio(struct drbd_device *device, struct bio_and_error *m) { - m->bio->bi_status = errno_to_blk_status(m->error); + if (unlikely(m->error)) + m->bio->bi_status = errno_to_blk_status(m->error); bio_endio(m->bio); dec_ap_bio(device); } @@ -332,17 +328,21 @@ static void set_if_null_req_next(struct drbd_peer_device *peer_device, struct dr static void advance_conn_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req) { struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + struct drbd_request *iter = req; if (!connection) return; if (connection->req_next != req) return; - list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) { - const unsigned s = req->rq_state; - if (s & RQ_NET_QUEUED) + + req = NULL; + list_for_each_entry_continue(iter, &connection->transfer_log, tl_requests) { + const unsigned int s = iter->rq_state; + + if (s & RQ_NET_QUEUED) { + req = iter; break; + } } - if (&req->tl_requests == &connection->transfer_log) - req = NULL; connection->req_next = req; } @@ -358,17 +358,21 @@ static void set_if_null_req_ack_pending(struct drbd_peer_device *peer_device, st static void advance_conn_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req) { struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + struct drbd_request *iter = req; if (!connection) return; if (connection->req_ack_pending != req) return; - list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) { - const unsigned s = req->rq_state; - if ((s & RQ_NET_SENT) && (s & RQ_NET_PENDING)) + + req = NULL; + list_for_each_entry_continue(iter, &connection->transfer_log, tl_requests) { + const unsigned int s = iter->rq_state; + + if ((s & RQ_NET_SENT) && (s & RQ_NET_PENDING)) { + req = iter; break; + } } - if (&req->tl_requests == &connection->transfer_log) - req = NULL; connection->req_ack_pending = req; } @@ -384,17 +388,21 @@ static void set_if_null_req_not_net_done(struct drbd_peer_device *peer_device, s static void advance_conn_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req) { struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + struct drbd_request *iter = req; if (!connection) return; if (connection->req_not_net_done != req) return; - list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) { - const unsigned s = req->rq_state; - if ((s & RQ_NET_SENT) && !(s & RQ_NET_DONE)) + + req = NULL; + list_for_each_entry_continue(iter, &connection->transfer_log, tl_requests) { + const unsigned int s = iter->rq_state; + + if ((s & RQ_NET_SENT) && !(s & RQ_NET_DONE)) { + req = iter; break; + } } - if (&req->tl_requests == &connection->transfer_log) - req = NULL; connection->req_not_net_done = req; } @@ -510,16 +518,14 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, static void drbd_report_io_error(struct drbd_device *device, struct drbd_request *req) { - char b[BDEVNAME_SIZE]; - if (!__ratelimit(&drbd_ratelimit_state)) return; - drbd_warn(device, "local %s IO error sector %llu+%u on %s\n", + drbd_warn(device, "local %s IO error sector %llu+%u on %pg\n", (req->rq_state & RQ_WRITE) ? "WRITE" : "READ", (unsigned long long)req->i.sector, req->i.size >> 9, - bdevname(device->ldev->backing_bdev, b)); + device->ldev->backing_bdev); } /* Helper for HANDED_OVER_TO_NETWORK. @@ -909,8 +915,7 @@ static bool remote_due_to_read_balancing(struct drbd_device *device, sector_t se switch (rbm) { case RB_CONGESTED_REMOTE: - return bdi_read_congested( - device->ldev->backing_bdev->bd_disk->bdi); + return false; case RB_LEAST_PENDING: return atomic_read(&device->local_cnt) > atomic_read(&device->ap_pending_cnt) + atomic_read(&device->rs_pending_cnt); @@ -1151,8 +1156,6 @@ drbd_submit_req_private_bio(struct drbd_request *req) else type = DRBD_FAULT_DT_RD; - bio_set_dev(bio, device->ldev->backing_bdev); - /* State may have changed since we grabbed our reference on the * ->ldev member. Double check, and short-circuit to endio. * In case the last activity log transaction failed to get on @@ -1211,9 +1214,12 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio) /* Update disk stats */ req->start_jif = bio_start_io_acct(req->master_bio); - if (!get_ldev(device)) { - bio_put(req->private_bio); - req->private_bio = NULL; + if (get_ldev(device)) { + req->private_bio = bio_alloc_clone(device->ldev->backing_bdev, + bio, GFP_NOIO, + &drbd_io_bio_set); + req->private_bio->bi_private = req; + req->private_bio->bi_end_io = drbd_request_endio; } /* process discards always from our submitter thread */ @@ -1600,7 +1606,7 @@ void drbd_submit_bio(struct bio *bio) { struct drbd_device *device = bio->bi_bdev->bd_disk->private_data; - blk_queue_split(&bio); + bio = bio_split_to_limits(bio); /* * what we "blindly" assume: diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 511f39a08de4..6237fa1dcb0e 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -266,8 +266,6 @@ struct bio_and_error { extern void start_new_tl_epoch(struct drbd_connection *connection); extern void drbd_req_destroy(struct kref *kref); -extern void _req_may_be_done(struct drbd_request *req, - struct bio_and_error *m); extern int __req_mod(struct drbd_request *req, enum drbd_req_event what, struct bio_and_error *m); extern void complete_master_bio(struct drbd_device *device, diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index b8a27818ab3f..3f7bf9f2d874 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -1537,7 +1537,7 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device, return rv; } -void notify_resource_state_change(struct sk_buff *skb, +int notify_resource_state_change(struct sk_buff *skb, unsigned int seq, struct drbd_resource_state_change *resource_state_change, enum drbd_notification_type type) @@ -1550,10 +1550,10 @@ void notify_resource_state_change(struct sk_buff *skb, .res_susp_fen = resource_state_change->susp_fen[NEW], }; - notify_resource_state(skb, seq, resource, &resource_info, type); + return notify_resource_state(skb, seq, resource, &resource_info, type); } -void notify_connection_state_change(struct sk_buff *skb, +int notify_connection_state_change(struct sk_buff *skb, unsigned int seq, struct drbd_connection_state_change *connection_state_change, enum drbd_notification_type type) @@ -1564,10 +1564,10 @@ void notify_connection_state_change(struct sk_buff *skb, .conn_role = connection_state_change->peer_role[NEW], }; - notify_connection_state(skb, seq, connection, &connection_info, type); + return notify_connection_state(skb, seq, connection, &connection_info, type); } -void notify_device_state_change(struct sk_buff *skb, +int notify_device_state_change(struct sk_buff *skb, unsigned int seq, struct drbd_device_state_change *device_state_change, enum drbd_notification_type type) @@ -1577,10 +1577,10 @@ void notify_device_state_change(struct sk_buff *skb, .dev_disk_state = device_state_change->disk_state[NEW], }; - notify_device_state(skb, seq, device, &device_info, type); + return notify_device_state(skb, seq, device, &device_info, type); } -void notify_peer_device_state_change(struct sk_buff *skb, +int notify_peer_device_state_change(struct sk_buff *skb, unsigned int seq, struct drbd_peer_device_state_change *p, enum drbd_notification_type type) @@ -1594,7 +1594,7 @@ void notify_peer_device_state_change(struct sk_buff *skb, .peer_resync_susp_dependency = p->resync_susp_dependency[NEW], }; - notify_peer_device_state(skb, seq, peer_device, &peer_device_info, type); + return notify_peer_device_state(skb, seq, peer_device, &peer_device_info, type); } static void broadcast_state_change(struct drbd_state_change *state_change) @@ -1602,7 +1602,7 @@ static void broadcast_state_change(struct drbd_state_change *state_change) struct drbd_resource_state_change *resource_state_change = &state_change->resource[0]; bool resource_state_has_changed; unsigned int n_device, n_connection, n_peer_device, n_peer_devices; - void (*last_func)(struct sk_buff *, unsigned int, void *, + int (*last_func)(struct sk_buff *, unsigned int, void *, enum drbd_notification_type) = NULL; void *last_arg = NULL; @@ -2071,8 +2071,7 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused) conn_free_crypto(connection); mutex_unlock(&connection->resource->conf_update); - synchronize_rcu(); - kfree(old_conf); + kvfree_rcu(old_conf); } if (ns_max.susp_fen) { diff --git a/drivers/block/drbd/drbd_state_change.h b/drivers/block/drbd/drbd_state_change.h index ba80f612d6ab..d5b0479bc9a6 100644 --- a/drivers/block/drbd/drbd_state_change.h +++ b/drivers/block/drbd/drbd_state_change.h @@ -44,19 +44,19 @@ extern struct drbd_state_change *remember_old_state(struct drbd_resource *, gfp_ extern void copy_old_to_new_state_change(struct drbd_state_change *); extern void forget_state_change(struct drbd_state_change *); -extern void notify_resource_state_change(struct sk_buff *, +extern int notify_resource_state_change(struct sk_buff *, unsigned int, struct drbd_resource_state_change *, enum drbd_notification_type type); -extern void notify_connection_state_change(struct sk_buff *, +extern int notify_connection_state_change(struct sk_buff *, unsigned int, struct drbd_connection_state_change *, enum drbd_notification_type type); -extern void notify_device_state_change(struct sk_buff *, +extern int notify_device_state_change(struct sk_buff *, unsigned int, struct drbd_device_state_change *, enum drbd_notification_type type); -extern void notify_peer_device_state_change(struct sk_buff *, +extern int notify_peer_device_state_change(struct sk_buff *, unsigned int, struct drbd_peer_device_state_change *, enum drbd_notification_type type); diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 64563bfdf0da..0bb1a900c2d5 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -326,14 +326,9 @@ void drbd_csum_bio(struct crypto_shash *tfm, struct bio *bio, void *digest) bio_for_each_segment(bvec, bio, iter) { u8 *src; - src = kmap_atomic(bvec.bv_page); - crypto_shash_update(desc, src + bvec.bv_offset, bvec.bv_len); - kunmap_atomic(src); - - /* REQ_OP_WRITE_SAME has only one segment, - * checksum the payload only once. */ - if (bio_op(bio) == REQ_OP_WRITE_SAME) - break; + src = bvec_kmap_local(&bvec); + crypto_shash_update(desc, src, bvec.bv_len); + kunmap_local(src); } crypto_shash_final(desc, digest); shash_desc_zero(desc); @@ -410,7 +405,7 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, spin_unlock_irq(&device->resource->req_lock); atomic_add(size >> 9, &device->rs_sect_ev); - if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ, 0, + if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ, DRBD_FAULT_RS_RD) == 0) return 0; @@ -1035,7 +1030,7 @@ static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_ { if (drbd_peer_req_has_active_page(peer_req)) { /* This might happen if sendpage() has not finished */ - int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT; + int i = PFN_UP(peer_req->i.size); atomic_add(i, &device->pp_in_use_by_net); atomic_sub(i, &device->pp_in_use); spin_lock_irq(&device->resource->req_lock); @@ -1523,9 +1518,9 @@ int w_restart_disk_io(struct drbd_work *w, int cancel) if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) drbd_al_begin_io(device, &req->i); - req->private_bio = bio_clone_fast(req->master_bio, GFP_NOIO, + req->private_bio = bio_alloc_clone(device->ldev->backing_bdev, + req->master_bio, GFP_NOIO, &drbd_io_bio_set); - bio_set_dev(req->private_bio, device->ldev->backing_bdev); req->private_bio->bi_private = req; req->private_bio->bi_end_io = drbd_request_endio; submit_bio_noacct(req->private_bio); diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index c4267da716fe..ccad3d7b3ddd 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -509,8 +509,8 @@ static unsigned long fdc_busy; static DECLARE_WAIT_QUEUE_HEAD(fdc_wait); static DECLARE_WAIT_QUEUE_HEAD(command_done); -/* Errors during formatting are counted here. */ -static int format_errors; +/* errors encountered on the current (or last) request */ +static int floppy_errors; /* Format request descriptor. */ static struct format_descr format_req; @@ -530,7 +530,6 @@ static struct format_descr format_req; static char *floppy_track_buffer; static int max_buffer_sectors; -static int *errors; typedef void (*done_f)(int); static const struct cont_t { void (*interrupt)(void); @@ -1015,7 +1014,7 @@ static DECLARE_DELAYED_WORK(fd_timer, fd_timer_workfn); static void cancel_activity(void) { do_floppy = NULL; - cancel_delayed_work_sync(&fd_timer); + cancel_delayed_work(&fd_timer); cancel_work_sync(&floppy_work); } @@ -1455,7 +1454,7 @@ static int interpret_errors(void) if (drive_params[current_drive].flags & FTD_MSG) DPRINT("Over/Underrun - retrying\n"); bad = 0; - } else if (*errors >= drive_params[current_drive].max_errors.reporting) { + } else if (floppy_errors >= drive_params[current_drive].max_errors.reporting) { print_errors(); } if (reply_buffer[ST2] & ST2_WC || reply_buffer[ST2] & ST2_BC) @@ -2095,7 +2094,7 @@ static void bad_flp_intr(void) if (!next_valid_format(current_drive)) return; } - err_count = ++(*errors); + err_count = ++floppy_errors; INFBOUND(write_errors[current_drive].badness, err_count); if (err_count > drive_params[current_drive].max_errors.abort) cont->done(0); @@ -2241,9 +2240,8 @@ static int do_format(int drive, struct format_descr *tmp_format_req) return -EINVAL; } format_req = *tmp_format_req; - format_errors = 0; cont = &format_cont; - errors = &format_errors; + floppy_errors = 0; ret = wait_til_done(redo_format, true); if (ret == -EINTR) return -EINTR; @@ -2259,7 +2257,7 @@ static int do_format(int drive, struct format_descr *tmp_format_req) static void floppy_end_request(struct request *req, blk_status_t error) { unsigned int nr_sectors = current_count_sectors; - unsigned int drive = (unsigned long)req->rq_disk->private_data; + unsigned int drive = (unsigned long)req->q->disk->private_data; /* current_count_sectors can be zero if transfer failed */ if (error) @@ -2485,11 +2483,9 @@ static void copy_buffer(int ssize, int max_sector, int max_sector_2) } if (CT(raw_cmd->cmd[COMMAND]) == FD_READ) - memcpy_to_page(bv.bv_page, bv.bv_offset, dma_buffer, - size); + memcpy_to_bvec(&bv, dma_buffer); else - memcpy_from_page(dma_buffer, bv.bv_page, bv.bv_offset, - size); + memcpy_from_bvec(dma_buffer, &bv); remaining -= size; dma_buffer += size; @@ -2550,7 +2546,7 @@ static int make_raw_rw_request(void) if (WARN(max_buffer_sectors == 0, "VFS: Block I/O scheduled on unopened device\n")) return 0; - set_fdc((long)current_req->rq_disk->private_data); + set_fdc((long)current_req->q->disk->private_data); raw_cmd = &default_raw_cmd; raw_cmd->flags = FD_RAW_SPIN | FD_RAW_NEED_DISK | FD_RAW_NEED_SEEK; @@ -2761,10 +2757,11 @@ static int set_next_request(void) current_req = list_first_entry_or_null(&floppy_reqs, struct request, queuelist); if (current_req) { - current_req->error_count = 0; + floppy_errors = 0; list_del_init(¤t_req->queuelist); + return 1; } - return current_req != NULL; + return 0; } /* Starts or continues processing request. Will automatically unlock the @@ -2792,7 +2789,7 @@ do_request: return; } } - drive = (long)current_req->rq_disk->private_data; + drive = (long)current_req->q->disk->private_data; set_fdc(drive); reschedule_timeout(current_drive, "redo fd request"); @@ -2823,7 +2820,6 @@ do_request: _floppy = floppy_type + drive_params[current_drive].autodetect[drive_state[current_drive].probed_format]; } else probing = 0; - errors = &(current_req->error_count); tmp = make_raw_rw_request(); if (tmp < 2) { request_done(tmp); @@ -2863,7 +2859,7 @@ static blk_status_t floppy_queue_rq(struct blk_mq_hw_ctx *hctx, if (WARN(atomic_read(&usage_count) == 0, "warning: usage count=0, current_req=%p sect=%ld flags=%llx\n", current_req, (long)blk_rq_pos(current_req), - (unsigned long long) current_req->cmd_flags)) + (__force unsigned long long) current_req->cmd_flags)) return BLK_STS_IOERR; if (test_and_set_bit(0, &fdc_busy)) { @@ -2984,6 +2980,8 @@ static const char *drive_name(int type, int drive) return "(null)"; } +#ifdef CONFIG_BLK_DEV_FD_RAWCMD + /* raw commands */ static void raw_cmd_done(int flag) { @@ -3081,6 +3079,8 @@ static void raw_cmd_free(struct floppy_raw_cmd **ptr) } } +#define MAX_LEN (1UL << MAX_ORDER << PAGE_SHIFT) + static int raw_cmd_copyin(int cmd, void __user *param, struct floppy_raw_cmd **rcmd) { @@ -3108,7 +3108,7 @@ loop: ptr->resultcode = 0; if (ptr->flags & (FD_RAW_READ | FD_RAW_WRITE)) { - if (ptr->length <= 0) + if (ptr->length <= 0 || ptr->length >= MAX_LEN) return -EINVAL; ptr->kernel_data = (char *)fd_dma_mem_alloc(ptr->length); fallback_on_nodma_alloc(&ptr->kernel_data, ptr->length); @@ -3181,6 +3181,35 @@ static int raw_cmd_ioctl(int cmd, void __user *param) return ret; } +static int floppy_raw_cmd_ioctl(int type, int drive, int cmd, + void __user *param) +{ + int ret; + + pr_warn_once("Note: FDRAWCMD is deprecated and will be removed from the kernel in the near future.\n"); + + if (type) + return -EINVAL; + if (lock_fdc(drive)) + return -EINTR; + set_floppy(drive); + ret = raw_cmd_ioctl(cmd, param); + if (ret == -EINTR) + return -EINTR; + process_fd_request(); + return ret; +} + +#else /* CONFIG_BLK_DEV_FD_RAWCMD */ + +static int floppy_raw_cmd_ioctl(int type, int drive, int cmd, + void __user *param) +{ + return -EOPNOTSUPP; +} + +#endif + static int invalidate_drive(struct block_device *bdev) { /* invalidate the buffer track to force a reread */ @@ -3369,7 +3398,6 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int { int drive = (long)bdev->bd_disk->private_data; int type = ITYPE(drive_state[drive].fd_device); - int i; int ret; int size; union inparam { @@ -3520,16 +3548,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int outparam = &write_errors[drive]; break; case FDRAWCMD: - if (type) - return -EINVAL; - if (lock_fdc(drive)) - return -EINTR; - set_floppy(drive); - i = raw_cmd_ioctl(cmd, (void __user *)param); - if (i == -EINTR) - return -EINTR; - process_fd_request(); - return i; + return floppy_raw_cmd_ioctl(type, drive, cmd, (void __user *)param); case FDTWADDLE: if (lock_fdc(drive)) return -EINTR; @@ -4127,15 +4146,13 @@ static int __floppy_read_block_0(struct block_device *bdev, int drive) cbdata.drive = drive; - bio_init(&bio, &bio_vec, 1); - bio_set_dev(&bio, bdev); + bio_init(&bio, bdev, &bio_vec, 1, REQ_OP_READ); bio_add_page(&bio, page, block_size(bdev), 0); bio.bi_iter.bi_sector = 0; bio.bi_flags |= (1 << BIO_QUIET); bio.bi_private = &cbdata; bio.bi_end_io = floppy_rb0_cb; - bio_set_op_attrs(&bio, REQ_OP_READ, 0); init_completion(&cbdata.complete); @@ -4503,6 +4520,7 @@ static int floppy_alloc_disk(unsigned int drive, unsigned int type) disk->first_minor = TOMINOR(drive) | (type << 2); disk->minors = 1; disk->fops = &floppy_fops; + disk->flags |= GENHD_FL_NO_PART; disk->events = DISK_EVENT_MEDIA_CHANGE; if (type) sprintf(disk->disk_name, "fd%d_type%d", drive, type); @@ -4539,7 +4557,7 @@ out: return; cleanup_disk: - blk_cleanup_disk(disks[drive][type]); + put_disk(disks[drive][type]); disks[drive][type] = NULL; mutex_unlock(&floppy_probe_lock); } @@ -4735,7 +4753,7 @@ out_put_disk: if (!disks[drive][0]) break; del_timer_sync(&motor_off_timer[drive]); - blk_cleanup_disk(disks[drive][0]); + put_disk(disks[drive][0]); blk_mq_free_tag_set(&tag_sets[drive]); } return err; @@ -4967,7 +4985,7 @@ static void __exit floppy_module_exit(void) } for (i = 0; i < ARRAY_SIZE(floppy_type); i++) { if (disks[drive][i]) - blk_cleanup_disk(disks[drive][i]); + put_disk(disks[drive][i]); } blk_mq_free_tag_set(&tag_sets[drive]); } diff --git a/drivers/block/loop.c b/drivers/block/loop.c index c3a36cfaa855..ad92192c7d61 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1,54 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* - * linux/drivers/block/loop.c - * - * Written by Theodore Ts'o, 3/29/93 - * - * Copyright 1993 by Theodore Ts'o. Redistribution of this file is - * permitted under the GNU General Public License. - * - * DES encryption plus some minor changes by Werner Almesberger, 30-MAY-1993 - * more DES encryption plus IDEA encryption by Nicholas J. Leon, June 20, 1996 - * - * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994 - * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996 - * - * Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997 - * - * Added devfs support - Richard Gooch <rgooch@atnf.csiro.au> 16-Jan-1998 - * - * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998 - * - * Loadable modules and other fixes by AK, 1998 - * - * Make real block number available to downstream transfer functions, enables - * CBC (and relatives) mode encryption requiring unique IVs per data block. - * Reed H. Petty, rhp@draper.net - * - * Maximum number of loop devices now dynamic via max_loop module parameter. - * Russell Kroll <rkroll@exploits.org> 19990701 - * - * Maximum number of loop devices when compiled-in now selectable by passing - * max_loop=<1-255> to the kernel on boot. - * Erik I. Bolsø, <eriki@himolde.no>, Oct 31, 1999 - * - * Completely rewrite request handling to be make_request_fn style and - * non blocking, pushing work to a helper thread. Lots of fixes from - * Al Viro too. - * Jens Axboe <axboe@suse.de>, Nov 2000 - * - * Support up to 256 loop devices - * Heinz Mauelshagen <mge@sistina.com>, Feb 2002 - * - * Support for falling back on the write file operation when the address space - * operations write_begin is not available on the backing filesystem. - * Anton Altaparmakov, 16 Feb 2005 - * - * Still To Fix: - * - Advisory locking is ignored here. - * - Should use an own CAP_* category instead of CAP_SYS_ADMIN - * + * Copyright 1993 by Theodore Ts'o. */ - #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/sched.h> @@ -59,7 +12,6 @@ #include <linux/errno.h> #include <linux/major.h> #include <linux/wait.h> -#include <linux/blkdev.h> #include <linux/blkpg.h> #include <linux/init.h> #include <linux/swap.h> @@ -79,12 +31,66 @@ #include <linux/ioprio.h> #include <linux/blk-cgroup.h> #include <linux/sched/mm.h> +#include <linux/statfs.h> +#include <linux/uaccess.h> +#include <linux/blk-mq.h> +#include <linux/spinlock.h> +#include <uapi/linux/loop.h> + +/* Possible states of device */ +enum { + Lo_unbound, + Lo_bound, + Lo_rundown, + Lo_deleting, +}; -#include "loop.h" +struct loop_func_table; + +struct loop_device { + int lo_number; + loff_t lo_offset; + loff_t lo_sizelimit; + int lo_flags; + char lo_file_name[LO_NAME_SIZE]; + + struct file * lo_backing_file; + struct block_device *lo_device; + + gfp_t old_gfp_mask; + + spinlock_t lo_lock; + int lo_state; + spinlock_t lo_work_lock; + struct workqueue_struct *workqueue; + struct work_struct rootcg_work; + struct list_head rootcg_cmd_list; + struct list_head idle_worker_list; + struct rb_root worker_tree; + struct timer_list timer; + bool use_dio; + bool sysfs_inited; + + struct request_queue *lo_queue; + struct blk_mq_tag_set tag_set; + struct gendisk *lo_disk; + struct mutex lo_mutex; + bool idr_visible; +}; -#include <linux/uaccess.h> +struct loop_cmd { + struct list_head list_entry; + bool use_aio; /* use AIO interface to handle I/O */ + atomic_t ref; /* only for aio */ + long ret; + struct kiocb iocb; + struct bio_vec *bvec; + struct cgroup_subsys_state *blkcg_css; + struct cgroup_subsys_state *memcg_css; +}; #define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ) +#define LOOP_DEFAULT_HW_Q_DEPTH (128) static DEFINE_IDR(loop_index_idr); static DEFINE_MUTEX(loop_ctl_mutex); @@ -184,8 +190,8 @@ static void __loop_update_dio(struct loop_device *lo, bool dio) */ if (dio) { if (queue_logical_block_size(lo->lo_queue) >= sb_bsize && - !(lo->lo_offset & dio_align) && - mapping->a_ops->direct_IO) + !(lo->lo_offset & dio_align) && + (file->f_mode & FMODE_CAN_ODIRECT)) use_dio = true; else use_dio = false; @@ -308,27 +314,22 @@ static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos, * a.k.a. discard/zerorange. */ struct file *file = lo->lo_backing_file; - struct request_queue *q = lo->lo_queue; int ret; mode |= FALLOC_FL_KEEP_SIZE; - if (!blk_queue_discard(q)) { - ret = -EOPNOTSUPP; - goto out; - } + if (!bdev_max_discard_sectors(lo->lo_device)) + return -EOPNOTSUPP; ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq)); if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP)) - ret = -EIO; - out: + return -EIO; return ret; } static int lo_req_flush(struct loop_device *lo, struct request *rq) { - struct file *file = lo->lo_backing_file; - int ret = vfs_fsync(file, 0); + int ret = vfs_fsync(lo->lo_backing_file, 0); if (unlikely(ret && ret != -EINVAL)) ret = -EIO; @@ -572,6 +573,10 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, if (!file) return -EBADF; + + /* suppress uevents while reconfiguring the device */ + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1); + is_loop = is_loop_device(file); error = loop_global_lock_killable(lo, is_loop); if (error) @@ -626,13 +631,18 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, fput(old_file); if (partscan) loop_reread_partitions(lo); - return 0; + + error = 0; +done: + /* enable and uncork uevent now that we are done */ + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); + return error; out_err: loop_global_unlock(lo, is_loop); out_putf: fput(file); - return error; + goto done; } /* loop sysfs attributes */ @@ -680,33 +690,33 @@ static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf) static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf) { - return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_offset); + return sysfs_emit(buf, "%llu\n", (unsigned long long)lo->lo_offset); } static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf) { - return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit); + return sysfs_emit(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit); } static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf) { int autoclear = (lo->lo_flags & LO_FLAGS_AUTOCLEAR); - return sprintf(buf, "%s\n", autoclear ? "1" : "0"); + return sysfs_emit(buf, "%s\n", autoclear ? "1" : "0"); } static ssize_t loop_attr_partscan_show(struct loop_device *lo, char *buf) { int partscan = (lo->lo_flags & LO_FLAGS_PARTSCAN); - return sprintf(buf, "%s\n", partscan ? "1" : "0"); + return sysfs_emit(buf, "%s\n", partscan ? "1" : "0"); } static ssize_t loop_attr_dio_show(struct loop_device *lo, char *buf) { int dio = (lo->lo_flags & LO_FLAGS_DIRECT_IO); - return sprintf(buf, "%s\n", dio ? "1" : "0"); + return sysfs_emit(buf, "%s\n", dio ? "1" : "0"); } LOOP_ATTR_RO(backing_file); @@ -762,7 +772,7 @@ static void loop_config_discard(struct loop_device *lo) struct request_queue *backingq = bdev_get_queue(I_BDEV(inode)); max_discard_sectors = backingq->limits.max_write_zeroes_sectors; - granularity = backingq->limits.discard_granularity ?: + granularity = bdev_discard_granularity(I_BDEV(inode)) ?: queue_physical_block_size(backingq); /* @@ -774,22 +784,24 @@ static void loop_config_discard(struct loop_device *lo) granularity = 0; } else { + struct kstatfs sbuf; + max_discard_sectors = UINT_MAX >> 9; - granularity = inode->i_sb->s_blocksize; + if (!vfs_statfs(&file->f_path, &sbuf)) + granularity = sbuf.f_bsize; + else + max_discard_sectors = 0; } if (max_discard_sectors) { q->limits.discard_granularity = granularity; blk_queue_max_discard_sectors(q, max_discard_sectors); blk_queue_max_write_zeroes_sectors(q, max_discard_sectors); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); } else { q->limits.discard_granularity = 0; blk_queue_max_discard_sectors(q, 0); blk_queue_max_write_zeroes_sectors(q, 0); - blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q); } - q->limits.discard_alignment = 0; } struct loop_worker { @@ -803,8 +815,6 @@ struct loop_worker { }; static void loop_workfn(struct work_struct *work); -static void loop_rootcg_workfn(struct work_struct *work); -static void loop_free_idle_workers(struct timer_list *timer); #ifdef CONFIG_BLK_CGROUP static inline int queue_on_root_worker(struct cgroup_subsys_state *css) @@ -820,7 +830,7 @@ static inline int queue_on_root_worker(struct cgroup_subsys_state *css) static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd) { - struct rb_node **node = &(lo->worker_tree.rb_node), *parent = NULL; + struct rb_node **node, *parent = NULL; struct loop_worker *cur_worker, *worker = NULL; struct work_struct *work; struct list_head *cmd_list; @@ -888,6 +898,39 @@ queue_work: spin_unlock_irq(&lo->lo_work_lock); } +static void loop_set_timer(struct loop_device *lo) +{ + timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT); +} + +static void loop_free_idle_workers(struct loop_device *lo, bool delete_all) +{ + struct loop_worker *pos, *worker; + + spin_lock_irq(&lo->lo_work_lock); + list_for_each_entry_safe(worker, pos, &lo->idle_worker_list, + idle_list) { + if (!delete_all && + time_is_after_jiffies(worker->last_ran_at + + LOOP_IDLE_WORKER_TIMEOUT)) + break; + list_del(&worker->idle_list); + rb_erase(&worker->rb_node, &lo->worker_tree); + css_put(worker->blkcg_css); + kfree(worker); + } + if (!list_empty(&lo->idle_worker_list)) + loop_set_timer(lo); + spin_unlock_irq(&lo->lo_work_lock); +} + +static void loop_free_idle_workers_timer(struct timer_list *timer) +{ + struct loop_device *lo = container_of(timer, struct loop_device, timer); + + return loop_free_idle_workers(lo, false); +} + static void loop_update_rotational(struct loop_device *lo) { struct file *file = lo->lo_backing_file; @@ -898,7 +941,7 @@ static void loop_update_rotational(struct loop_device *lo) /* not all filesystems (e.g. tmpfs) have a sb->s_bdev */ if (file_bdev) - nonrot = blk_queue_nonrot(bdev_get_queue(file_bdev)); + nonrot = bdev_nonrot(file_bdev); if (nonrot) blk_queue_flag_set(QUEUE_FLAG_NONROT, q); @@ -936,6 +979,11 @@ loop_set_status_from_info(struct loop_device *lo, lo->lo_offset = info->lo_offset; lo->lo_sizelimit = info->lo_sizelimit; + + /* loff_t vars have been assigned __u64 */ + if (lo->lo_offset < 0 || lo->lo_sizelimit < 0) + return -EOVERFLOW; + memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE); lo->lo_file_name[LO_NAME_SIZE-1] = 0; lo->lo_flags = info->lo_flags; @@ -962,6 +1010,9 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, /* This is safe, since we have a reference from open(). */ __module_get(THIS_MODULE); + /* suppress uevents while reconfiguring the device */ + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1); + /* * If we don't hold exclusive handle for the device, upgrade to it * here to avoid changing device under exclusive owner. @@ -1006,24 +1057,19 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, !file->f_op->write_iter) lo->lo_flags |= LO_FLAGS_READ_ONLY; - lo->workqueue = alloc_workqueue("loop%d", - WQ_UNBOUND | WQ_FREEZABLE, - 0, - lo->lo_number); if (!lo->workqueue) { - error = -ENOMEM; - goto out_unlock; + lo->workqueue = alloc_workqueue("loop%d", + WQ_UNBOUND | WQ_FREEZABLE, + 0, lo->lo_number); + if (!lo->workqueue) { + error = -ENOMEM; + goto out_unlock; + } } disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0); - INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn); - INIT_LIST_HEAD(&lo->rootcg_cmd_list); - INIT_LIST_HEAD(&lo->idle_worker_list); - lo->worker_tree = RB_ROOT; - timer_setup(&lo->timer, loop_free_idle_workers, - TIMER_DEFERRABLE); lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO; lo->lo_device = bdev; lo->lo_backing_file = file; @@ -1061,14 +1107,19 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, lo->lo_flags |= LO_FLAGS_PARTSCAN; partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; if (partscan) - lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; + clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); loop_global_unlock(lo, is_loop); if (partscan) loop_reread_partitions(lo); if (!(mode & FMODE_EXCL)) bd_abort_claiming(bdev, loop_configure); - return 0; + + error = 0; +done: + /* enable and uncork uevent now that we are done */ + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); + return error; out_unlock: loop_global_unlock(lo, is_loop); @@ -1079,61 +1130,27 @@ out_putf: fput(file); /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); - return error; + goto done; } -static int __loop_clr_fd(struct loop_device *lo, bool release) +static void __loop_clr_fd(struct loop_device *lo, bool release) { - struct file *filp = NULL; + struct file *filp; gfp_t gfp = lo->old_gfp_mask; - int err = 0; - bool partscan = false; - int lo_number; - struct loop_worker *pos, *worker; - - /* - * Flush loop_configure() and loop_change_fd(). It is acceptable for - * loop_validate_file() to succeed, for actual clear operation has not - * started yet. - */ - mutex_lock(&loop_validate_mutex); - mutex_unlock(&loop_validate_mutex); - /* - * loop_validate_file() now fails because l->lo_state != Lo_bound - * became visible. - */ - - mutex_lock(&lo->lo_mutex); - if (WARN_ON_ONCE(lo->lo_state != Lo_rundown)) { - err = -ENXIO; - goto out_unlock; - } - - filp = lo->lo_backing_file; - if (filp == NULL) { - err = -EINVAL; - goto out_unlock; - } if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags)) blk_queue_write_cache(lo->lo_queue, false, false); - /* freeze request queue during the transition */ - blk_mq_freeze_queue(lo->lo_queue); - - destroy_workqueue(lo->workqueue); - spin_lock_irq(&lo->lo_work_lock); - list_for_each_entry_safe(worker, pos, &lo->idle_worker_list, - idle_list) { - list_del(&worker->idle_list); - rb_erase(&worker->rb_node, &lo->worker_tree); - css_put(worker->blkcg_css); - kfree(worker); - } - spin_unlock_irq(&lo->lo_work_lock); - del_timer_sync(&lo->timer); + /* + * Freeze the request queue when unbinding on a live file descriptor and + * thus an open device. When called from ->release we are guaranteed + * that there is no I/O in progress already. + */ + if (!release) + blk_mq_freeze_queue(lo->lo_queue); spin_lock_irq(&lo->lo_lock); + filp = lo->lo_backing_file; lo->lo_backing_file = NULL; spin_unlock_irq(&lo->lo_lock); @@ -1151,14 +1168,14 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) mapping_set_gfp_mask(filp->f_mapping, gfp); /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); - blk_mq_unfreeze_queue(lo->lo_queue); + if (!release) + blk_mq_unfreeze_queue(lo->lo_queue); - partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; - lo_number = lo->lo_number; disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); -out_unlock: - mutex_unlock(&lo->lo_mutex); - if (partscan) { + + if (lo->lo_flags & LO_FLAGS_PARTSCAN) { + int err; + /* * open_mutex has been held already in release path, so don't * acquire it if this function is called in such case. @@ -1174,24 +1191,20 @@ out_unlock: mutex_unlock(&lo->lo_disk->open_mutex); if (err) pr_warn("%s: partition scan of loop%d failed (rc=%d)\n", - __func__, lo_number, err); + __func__, lo->lo_number, err); /* Device is gone, no point in returning error */ - err = 0; } /* * lo->lo_state is set to Lo_unbound here after above partscan has - * finished. - * - * There cannot be anybody else entering __loop_clr_fd() as - * lo->lo_backing_file is already cleared and Lo_rundown state - * protects us from all the other places trying to change the 'lo' - * device. + * finished. There cannot be anybody else entering __loop_clr_fd() as + * Lo_rundown state protects us from all the other places trying to + * change the 'lo' device. */ - mutex_lock(&lo->lo_mutex); lo->lo_flags = 0; if (!part_shift) - lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; + set_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); + mutex_lock(&lo->lo_mutex); lo->lo_state = Lo_unbound; mutex_unlock(&lo->lo_mutex); @@ -1200,20 +1213,27 @@ out_unlock: * lo_mutex triggers a circular lock dependency possibility warning as * fput can take open_mutex which is usually taken before lo_mutex. */ - if (filp) - fput(filp); - return err; + fput(filp); } static int loop_clr_fd(struct loop_device *lo) { int err; - err = mutex_lock_killable(&lo->lo_mutex); + /* + * Since lo_ioctl() is called without locks held, it is possible that + * loop_configure()/loop_change_fd() and loop_clr_fd() run in parallel. + * + * Therefore, use global lock when setting Lo_rundown state in order to + * make sure that loop_validate_file() will fail if the "struct file" + * which loop_configure()/loop_change_fd() found via fget() was this + * loop device. + */ + err = loop_global_lock_killable(lo, true); if (err) return err; if (lo->lo_state != Lo_bound) { - mutex_unlock(&lo->lo_mutex); + loop_global_unlock(lo, true); return -ENXIO; } /* @@ -1226,15 +1246,16 @@ static int loop_clr_fd(struct loop_device *lo) * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d * command to fail with EBUSY. */ - if (atomic_read(&lo->lo_refcnt) > 1) { + if (disk_openers(lo->lo_disk) > 1) { lo->lo_flags |= LO_FLAGS_AUTOCLEAR; - mutex_unlock(&lo->lo_mutex); + loop_global_unlock(lo, true); return 0; } lo->lo_state = Lo_rundown; - mutex_unlock(&lo->lo_mutex); + loop_global_unlock(lo, true); - return __loop_clr_fd(lo, false); + __loop_clr_fd(lo, false); + return 0; } static int @@ -1263,15 +1284,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) /* I/O need to be drained during transfer transition */ blk_mq_freeze_queue(lo->lo_queue); - if (size_changed && lo->lo_device->bd_inode->i_mapping->nrpages) { - /* If any pages were dirtied after invalidate_bdev(), try again */ - err = -EAGAIN; - pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n", - __func__, lo->lo_number, lo->lo_file_name, - lo->lo_device->bd_inode->i_mapping->nrpages); - goto out_unfreeze; - } - prev_lo_flags = lo->lo_flags; err = loop_set_status_from_info(lo, info); @@ -1301,7 +1313,7 @@ out_unfreeze: if (!err && (lo->lo_flags & LO_FLAGS_PARTSCAN) && !(prev_lo_flags & LO_FLAGS_PARTSCAN)) { - lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; + clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); partscan = true; } out_unlock: @@ -1482,21 +1494,10 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg) invalidate_bdev(lo->lo_device); blk_mq_freeze_queue(lo->lo_queue); - - /* invalidate_bdev should have truncated all the pages */ - if (lo->lo_device->bd_inode->i_mapping->nrpages) { - err = -EAGAIN; - pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n", - __func__, lo->lo_number, lo->lo_file_name, - lo->lo_device->bd_inode->i_mapping->nrpages); - goto out_unfreeze; - } - blk_queue_logical_block_size(lo->lo_queue, arg); blk_queue_physical_block_size(lo->lo_queue, arg); blk_queue_io_min(lo->lo_queue, arg); loop_update_dio(lo); -out_unfreeze: blk_mq_unfreeze_queue(lo->lo_queue); return err; @@ -1597,6 +1598,7 @@ struct compat_loop_info { compat_ulong_t lo_inode; /* ioctl r/o */ compat_dev_t lo_rdevice; /* ioctl r/o */ compat_int_t lo_offset; + compat_int_t lo_encrypt_type; /* obsolete, ignored */ compat_int_t lo_encrypt_key_size; /* ioctl w/o */ compat_int_t lo_flags; /* ioctl r/o */ char lo_name[LO_NAME_SIZE]; @@ -1725,33 +1727,15 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, } #endif -static int lo_open(struct block_device *bdev, fmode_t mode) -{ - struct loop_device *lo = bdev->bd_disk->private_data; - int err; - - err = mutex_lock_killable(&lo->lo_mutex); - if (err) - return err; - if (lo->lo_state == Lo_deleting) - err = -ENXIO; - else - atomic_inc(&lo->lo_refcnt); - mutex_unlock(&lo->lo_mutex); - return err; -} - static void lo_release(struct gendisk *disk, fmode_t mode) { struct loop_device *lo = disk->private_data; - mutex_lock(&lo->lo_mutex); - if (atomic_dec_return(&lo->lo_refcnt)) - goto out_unlock; + if (disk_openers(disk) > 0) + return; - if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) { - if (lo->lo_state != Lo_bound) - goto out_unlock; + mutex_lock(&lo->lo_mutex); + if (lo->lo_state == Lo_bound && (lo->lo_flags & LO_FLAGS_AUTOCLEAR)) { lo->lo_state = Lo_rundown; mutex_unlock(&lo->lo_mutex); /* @@ -1760,27 +1744,30 @@ static void lo_release(struct gendisk *disk, fmode_t mode) */ __loop_clr_fd(lo, true); return; - } else if (lo->lo_state == Lo_bound) { - /* - * Otherwise keep thread (if running) and config, - * but flush possible ongoing bios in thread. - */ - blk_mq_freeze_queue(lo->lo_queue); - blk_mq_unfreeze_queue(lo->lo_queue); } - -out_unlock: mutex_unlock(&lo->lo_mutex); } +static void lo_free_disk(struct gendisk *disk) +{ + struct loop_device *lo = disk->private_data; + + if (lo->workqueue) + destroy_workqueue(lo->workqueue); + loop_free_idle_workers(lo, true); + del_timer_sync(&lo->timer); + mutex_destroy(&lo->lo_mutex); + kfree(lo); +} + static const struct block_device_operations lo_fops = { .owner = THIS_MODULE, - .open = lo_open, .release = lo_release, .ioctl = lo_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = lo_compat_ioctl, #endif + .free_disk = lo_free_disk, }; /* @@ -1791,6 +1778,24 @@ module_param(max_loop, int, 0444); MODULE_PARM_DESC(max_loop, "Maximum number of loop devices"); module_param(max_part, int, 0444); MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device"); + +static int hw_queue_depth = LOOP_DEFAULT_HW_Q_DEPTH; + +static int loop_set_hw_queue_depth(const char *s, const struct kernel_param *p) +{ + int ret = kstrtoint(s, 10, &hw_queue_depth); + + return (ret || (hw_queue_depth < 1)) ? -EINVAL : 0; +} + +static const struct kernel_param_ops loop_hw_qdepth_param_ops = { + .set = loop_set_hw_queue_depth, + .get = param_get_int, +}; + +device_param_cb(hw_queue_depth, &loop_hw_qdepth_param_ops, &hw_queue_depth, 0444); +MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 128"); + MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); @@ -1821,12 +1826,14 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, cmd->blkcg_css = NULL; cmd->memcg_css = NULL; #ifdef CONFIG_BLK_CGROUP - if (rq->bio && rq->bio->bi_blkg) { - cmd->blkcg_css = &bio_blkcg(rq->bio)->css; + if (rq->bio) { + cmd->blkcg_css = bio_blkcg_css(rq->bio); #ifdef CONFIG_MEMCG - cmd->memcg_css = - cgroup_get_e_css(cmd->blkcg_css->cgroup, - &memory_cgrp_subsys); + if (cmd->blkcg_css) { + cmd->memcg_css = + cgroup_get_e_css(cmd->blkcg_css->cgroup, + &memory_cgrp_subsys); + } #endif } #endif @@ -1875,11 +1882,6 @@ static void loop_handle_cmd(struct loop_cmd *cmd) } } -static void loop_set_timer(struct loop_device *lo) -{ - timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT); -} - static void loop_process_work(struct loop_worker *worker, struct list_head *cmd_list, struct loop_device *lo) { @@ -1928,27 +1930,6 @@ static void loop_rootcg_workfn(struct work_struct *work) loop_process_work(NULL, &lo->rootcg_cmd_list, lo); } -static void loop_free_idle_workers(struct timer_list *timer) -{ - struct loop_device *lo = container_of(timer, struct loop_device, timer); - struct loop_worker *pos, *worker; - - spin_lock_irq(&lo->lo_work_lock); - list_for_each_entry_safe(worker, pos, &lo->idle_worker_list, - idle_list) { - if (time_is_after_jiffies(worker->last_ran_at + - LOOP_IDLE_WORKER_TIMEOUT)) - break; - list_del(&worker->idle_list); - rb_erase(&worker->rb_node, &lo->worker_tree); - css_put(worker->blkcg_css); - kfree(worker); - } - if (!list_empty(&lo->idle_worker_list)) - loop_set_timer(lo); - spin_unlock_irq(&lo->lo_work_lock); -} - static const struct blk_mq_ops loop_mq_ops = { .queue_rq = loop_queue_rq, .complete = lo_complete_rq, @@ -1964,6 +1945,9 @@ static int loop_add(int i) lo = kzalloc(sizeof(*lo), GFP_KERNEL); if (!lo) goto out; + lo->worker_tree = RB_ROOT; + INIT_LIST_HEAD(&lo->idle_worker_list); + timer_setup(&lo->timer, loop_free_idle_workers_timer, TIMER_DEFERRABLE); lo->lo_state = Lo_unbound; err = mutex_lock_killable(&loop_ctl_mutex); @@ -1985,7 +1969,7 @@ static int loop_add(int i) lo->tag_set.ops = &loop_mq_ops; lo->tag_set.nr_hw_queues = 1; - lo->tag_set.queue_depth = 128; + lo->tag_set.queue_depth = hw_queue_depth; lo->tag_set.numa_node = NUMA_NO_NODE; lo->tag_set.cmd_size = sizeof(struct loop_cmd); lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING | @@ -2032,13 +2016,13 @@ static int loop_add(int i) * userspace tools. Parameters like this in general should be avoided. */ if (!part_shift) - disk->flags |= GENHD_FL_NO_PART_SCAN; - disk->flags |= GENHD_FL_EXT_DEVT; - atomic_set(&lo->lo_refcnt, 0); + set_bit(GD_SUPPRESS_PART_SCAN, &disk->state); mutex_init(&lo->lo_mutex); lo->lo_number = i; spin_lock_init(&lo->lo_lock); spin_lock_init(&lo->lo_work_lock); + INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn); + INIT_LIST_HEAD(&lo->rootcg_cmd_list); disk->major = LOOP_MAJOR; disk->first_minor = i << part_shift; disk->minors = 1 << part_shift; @@ -2061,7 +2045,7 @@ static int loop_add(int i) return i; out_cleanup_disk: - blk_cleanup_disk(disk); + put_disk(disk); out_cleanup_tags: blk_mq_free_tag_set(&lo->tag_set); out_free_idr: @@ -2078,14 +2062,13 @@ static void loop_remove(struct loop_device *lo) { /* Make this loop device unreachable from pathname. */ del_gendisk(lo->lo_disk); - blk_cleanup_disk(lo->lo_disk); blk_mq_free_tag_set(&lo->tag_set); + mutex_lock(&loop_ctl_mutex); idr_remove(&loop_index_idr, lo->lo_number); mutex_unlock(&loop_ctl_mutex); - /* There is no route which can find this loop device. */ - mutex_destroy(&lo->lo_mutex); - kfree(lo); + + put_disk(lo->lo_disk); } static void loop_probe(dev_t dev) @@ -2124,13 +2107,12 @@ static int loop_control_remove(int idx) ret = mutex_lock_killable(&lo->lo_mutex); if (ret) goto mark_visible; - if (lo->lo_state != Lo_unbound || - atomic_read(&lo->lo_refcnt) > 0) { + if (lo->lo_state != Lo_unbound || disk_openers(lo->lo_disk) > 0) { mutex_unlock(&lo->lo_mutex); ret = -EBUSY; goto mark_visible; } - /* Mark this loop device no longer open()-able. */ + /* Mark this loop device as no more bound, but not quite unbound yet */ lo->lo_state = Lo_deleting; mutex_unlock(&lo->lo_mutex); diff --git a/drivers/block/loop.h b/drivers/block/loop.h deleted file mode 100644 index 082d4b6bfc6a..000000000000 --- a/drivers/block/loop.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * loop.h - * - * Written by Theodore Ts'o, 3/29/93. - * - * Copyright 1993 by Theodore Ts'o. Redistribution of this file is - * permitted under the GNU General Public License. - */ -#ifndef _LINUX_LOOP_H -#define _LINUX_LOOP_H - -#include <linux/bio.h> -#include <linux/blkdev.h> -#include <linux/blk-mq.h> -#include <linux/spinlock.h> -#include <linux/mutex.h> -#include <uapi/linux/loop.h> - -/* Possible states of device */ -enum { - Lo_unbound, - Lo_bound, - Lo_rundown, - Lo_deleting, -}; - -struct loop_func_table; - -struct loop_device { - int lo_number; - atomic_t lo_refcnt; - loff_t lo_offset; - loff_t lo_sizelimit; - int lo_flags; - char lo_file_name[LO_NAME_SIZE]; - - struct file * lo_backing_file; - struct block_device *lo_device; - - gfp_t old_gfp_mask; - - spinlock_t lo_lock; - int lo_state; - spinlock_t lo_work_lock; - struct workqueue_struct *workqueue; - struct work_struct rootcg_work; - struct list_head rootcg_cmd_list; - struct list_head idle_worker_list; - struct rb_root worker_tree; - struct timer_list timer; - bool use_dio; - bool sysfs_inited; - - struct request_queue *lo_queue; - struct blk_mq_tag_set tag_set; - struct gendisk *lo_disk; - struct mutex lo_mutex; - bool idr_visible; -}; - -struct loop_cmd { - struct list_head list_entry; - bool use_aio; /* use AIO interface to handle I/O */ - atomic_t ref; /* only for aio */ - long ret; - struct kiocb iocb; - struct bio_vec *bvec; - struct cgroup_subsys_state *blkcg_css; - struct cgroup_subsys_state *memcg_css; -}; - -#endif diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index c91b9010c1a6..815d77ba6381 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -19,7 +19,6 @@ #include <linux/compat.h> #include <linux/fs.h> #include <linux/module.h> -#include <linux/genhd.h> #include <linux/blkdev.h> #include <linux/blk-mq.h> #include <linux/bio.h> @@ -95,17 +94,12 @@ /* Device instance number, incremented each time a device is probed. */ static int instance; -static LIST_HEAD(online_list); -static LIST_HEAD(removing_list); -static DEFINE_SPINLOCK(dev_lock); - /* * Global variable used to hold the major block device number * allocated in mtip_init(). */ static int mtip_major; static struct dentry *dfs_parent; -static struct dentry *dfs_device_status; static u32 cpu_use[NR_CPUS]; @@ -136,23 +130,19 @@ struct mtip_compat_ide_task_request_s { * return value * true if device removed, else false */ -static bool mtip_check_surprise_removal(struct pci_dev *pdev) +static bool mtip_check_surprise_removal(struct driver_data *dd) { u16 vendor_id = 0; - struct driver_data *dd = pci_get_drvdata(pdev); if (dd->sr) return true; /* Read the vendorID from the configuration space */ - pci_read_config_word(pdev, 0x00, &vendor_id); + pci_read_config_word(dd->pdev, 0x00, &vendor_id); if (vendor_id == 0xFFFF) { dd->sr = true; - if (dd->queue) - blk_queue_flag_set(QUEUE_FLAG_DEAD, dd->queue); - else - dev_warn(&dd->pdev->dev, - "%s: dd->queue is NULL\n", __func__); + if (dd->disk) + blk_mark_disk_dead(dd->disk); return true; /* device removed */ } @@ -162,9 +152,7 @@ static bool mtip_check_surprise_removal(struct pci_dev *pdev) static struct mtip_cmd *mtip_cmd_from_tag(struct driver_data *dd, unsigned int tag) { - struct blk_mq_hw_ctx *hctx = dd->queue->queue_hw_ctx[0]; - - return blk_mq_rq_to_pdu(blk_mq_tag_to_rq(hctx->tags, tag)); + return blk_mq_rq_to_pdu(blk_mq_tag_to_rq(dd->tags.tags[0], tag)); } /* @@ -447,7 +435,7 @@ static int mtip_device_reset(struct driver_data *dd) { int rv = 0; - if (mtip_check_surprise_removal(dd->pdev)) + if (mtip_check_surprise_removal(dd)) return 0; if (mtip_hba_reset(dd) < 0) @@ -727,7 +715,7 @@ static inline void mtip_process_errors(struct driver_data *dd, u32 port_stat) dev_warn(&dd->pdev->dev, "Port stat errors %x unhandled\n", (port_stat & ~PORT_IRQ_HANDLED)); - if (mtip_check_surprise_removal(dd->pdev)) + if (mtip_check_surprise_removal(dd)) return; } if (likely(port_stat & (PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR))) { @@ -752,7 +740,7 @@ static inline irqreturn_t mtip_handle_irq(struct driver_data *data) /* Acknowledge the interrupt status on the port.*/ port_stat = readl(port->mmio + PORT_IRQ_STAT); if (unlikely(port_stat == 0xFFFFFFFF)) { - mtip_check_surprise_removal(dd->pdev); + mtip_check_surprise_removal(dd); return IRQ_HANDLED; } writel(port_stat, port->mmio + PORT_IRQ_STAT); @@ -796,7 +784,7 @@ static inline irqreturn_t mtip_handle_irq(struct driver_data *data) } if (unlikely(port_stat & PORT_IRQ_ERR)) { - if (unlikely(mtip_check_surprise_removal(dd->pdev))) { + if (unlikely(mtip_check_surprise_removal(dd))) { /* don't proceed further */ return IRQ_HANDLED; } @@ -915,7 +903,7 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout) msleep(100); - if (mtip_check_surprise_removal(port->dd->pdev)) + if (mtip_check_surprise_removal(port->dd)) goto err_fault; active = mtip_commands_active(port); @@ -980,7 +968,7 @@ static int mtip_exec_internal_command(struct mtip_port *port, return -EFAULT; } - if (mtip_check_surprise_removal(dd->pdev)) + if (mtip_check_surprise_removal(dd)) return -EFAULT; rq = blk_mq_alloc_request(dd->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_RESERVED); @@ -1015,14 +1003,14 @@ static int mtip_exec_internal_command(struct mtip_port *port, rq->timeout = timeout; /* insert request and run queue */ - blk_execute_rq(NULL, rq, true); + blk_execute_rq(rq, true); if (int_cmd->status) { dev_err(&dd->pdev->dev, "Internal command [%02X] failed %d\n", fis->command, int_cmd->status); rv = -EIO; - if (mtip_check_surprise_removal(dd->pdev) || + if (mtip_check_surprise_removal(dd) || test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) { dev_err(&dd->pdev->dev, @@ -1409,15 +1397,15 @@ static void mtip_dump_identify(struct mtip_port *port) if (!port->identify_valid) return; - strlcpy(cbuf, (char *)(port->identify+10), 21); + strscpy(cbuf, (char *)(port->identify + 10), 21); dev_info(&port->dd->pdev->dev, "Serial No.: %s\n", cbuf); - strlcpy(cbuf, (char *)(port->identify+23), 9); + strscpy(cbuf, (char *)(port->identify + 23), 9); dev_info(&port->dd->pdev->dev, "Firmware Ver.: %s\n", cbuf); - strlcpy(cbuf, (char *)(port->identify+27), 41); + strscpy(cbuf, (char *)(port->identify + 27), 41); dev_info(&port->dd->pdev->dev, "Model: %s\n", cbuf); dev_info(&port->dd->pdev->dev, "Security: %04x %s\n", @@ -1433,13 +1421,13 @@ static void mtip_dump_identify(struct mtip_port *port) pci_read_config_word(port->dd->pdev, PCI_REVISION_ID, &revid); switch (revid & 0xFF) { case 0x1: - strlcpy(cbuf, "A0", 3); + strscpy(cbuf, "A0", 3); break; case 0x3: - strlcpy(cbuf, "A2", 3); + strscpy(cbuf, "A2", 3); break; default: - strlcpy(cbuf, "?", 2); + strscpy(cbuf, "?", 2); break; } dev_info(&port->dd->pdev->dev, @@ -2174,106 +2162,6 @@ static const struct attribute_group *mtip_disk_attr_groups[] = { NULL, }; -/* debugsfs entries */ - -static ssize_t show_device_status(struct device_driver *drv, char *buf) -{ - int size = 0; - struct driver_data *dd, *tmp; - unsigned long flags; - char id_buf[42]; - u16 status = 0; - - spin_lock_irqsave(&dev_lock, flags); - size += sprintf(&buf[size], "Devices Present:\n"); - list_for_each_entry_safe(dd, tmp, &online_list, online_list) { - if (dd->pdev) { - if (dd->port && - dd->port->identify && - dd->port->identify_valid) { - strlcpy(id_buf, - (char *) (dd->port->identify + 10), 21); - status = *(dd->port->identify + 141); - } else { - memset(id_buf, 0, 42); - status = 0; - } - - if (dd->port && - test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags)) { - size += sprintf(&buf[size], - " device %s %s (ftl rebuild %d %%)\n", - dev_name(&dd->pdev->dev), - id_buf, - status); - } else { - size += sprintf(&buf[size], - " device %s %s\n", - dev_name(&dd->pdev->dev), - id_buf); - } - } - } - - size += sprintf(&buf[size], "Devices Being Removed:\n"); - list_for_each_entry_safe(dd, tmp, &removing_list, remove_list) { - if (dd->pdev) { - if (dd->port && - dd->port->identify && - dd->port->identify_valid) { - strlcpy(id_buf, - (char *) (dd->port->identify+10), 21); - status = *(dd->port->identify + 141); - } else { - memset(id_buf, 0, 42); - status = 0; - } - - if (dd->port && - test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags)) { - size += sprintf(&buf[size], - " device %s %s (ftl rebuild %d %%)\n", - dev_name(&dd->pdev->dev), - id_buf, - status); - } else { - size += sprintf(&buf[size], - " device %s %s\n", - dev_name(&dd->pdev->dev), - id_buf); - } - } - } - spin_unlock_irqrestore(&dev_lock, flags); - - return size; -} - -static ssize_t mtip_hw_read_device_status(struct file *f, char __user *ubuf, - size_t len, loff_t *offset) -{ - int size = *offset; - char *buf; - int rv = 0; - - if (!len || *offset) - return 0; - - buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - size += show_device_status(NULL, buf); - - *offset = size <= len ? size : len; - size = copy_to_user(ubuf, buf, *offset); - if (size) - rv = -EFAULT; - - kfree(buf); - return rv ? rv : *offset; -} - static ssize_t mtip_hw_read_registers(struct file *f, char __user *ubuf, size_t len, loff_t *offset) { @@ -2367,13 +2255,6 @@ static ssize_t mtip_hw_read_flags(struct file *f, char __user *ubuf, return rv ? rv : *offset; } -static const struct file_operations mtip_device_status_fops = { - .owner = THIS_MODULE, - .open = simple_open, - .read = mtip_hw_read_device_status, - .llseek = no_llseek, -}; - static const struct file_operations mtip_regs_fops = { .owner = THIS_MODULE, .open = simple_open, @@ -2513,7 +2394,7 @@ static int mtip_ftl_rebuild_poll(struct driver_data *dd) if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) return -EFAULT; - if (mtip_check_surprise_removal(dd->pdev)) + if (mtip_check_surprise_removal(dd)) return -EFAULT; if (mtip_get_identify(dd->port, NULL) < 0) @@ -2560,7 +2441,7 @@ static void mtip_softirq_done_fn(struct request *rq) blk_mq_end_request(rq, cmd->status); } -static bool mtip_abort_cmd(struct request *req, void *data, bool reserved) +static bool mtip_abort_cmd(struct request *req, void *data) { struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req); struct driver_data *dd = data; @@ -2573,7 +2454,7 @@ static bool mtip_abort_cmd(struct request *req, void *data, bool reserved) return true; } -static bool mtip_queue_cmd(struct request *req, void *data, bool reserved) +static bool mtip_queue_cmd(struct request *req, void *data) { struct driver_data *dd = data; @@ -2733,7 +2614,7 @@ static int mtip_dma_alloc(struct driver_data *dd) { struct mtip_port *port = dd->port; - /* Allocate dma memory for RX Fis, Identify, and Sector Bufffer */ + /* Allocate dma memory for RX Fis, Identify, and Sector Buffer */ port->block1 = dma_alloc_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ, &port->block1_dma, GFP_KERNEL); @@ -2891,7 +2772,7 @@ static int mtip_hw_init(struct driver_data *dd) time_before(jiffies, timeout)) { mdelay(100); } - if (unlikely(mtip_check_surprise_removal(dd->pdev))) { + if (unlikely(mtip_check_surprise_removal(dd))) { timetaken = jiffies - timetaken; dev_warn(&dd->pdev->dev, "Surprise removal detected at %u ms\n", @@ -3301,26 +3182,12 @@ static int mtip_block_getgeo(struct block_device *dev, return 0; } -static int mtip_block_open(struct block_device *dev, fmode_t mode) +static void mtip_block_free_disk(struct gendisk *disk) { - struct driver_data *dd; - - if (dev && dev->bd_disk) { - dd = (struct driver_data *) dev->bd_disk->private_data; + struct driver_data *dd = disk->private_data; - if (dd) { - if (test_bit(MTIP_DDF_REMOVAL_BIT, - &dd->dd_flag)) { - return -ENODEV; - } - return 0; - } - } - return -ENODEV; -} - -static void mtip_block_release(struct gendisk *disk, fmode_t mode) -{ + ida_free(&rssd_index_ida, dd->index); + kfree(dd); } /* @@ -3330,13 +3197,12 @@ static void mtip_block_release(struct gendisk *disk, fmode_t mode) * layer. */ static const struct block_device_operations mtip_block_ops = { - .open = mtip_block_open, - .release = mtip_block_release, .ioctl = mtip_block_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = mtip_block_compat_ioctl, #endif .getgeo = mtip_block_getgeo, + .free_disk = mtip_block_free_disk, .owner = THIS_MODULE }; @@ -3491,12 +3357,11 @@ static int mtip_init_cmd(struct blk_mq_tag_set *set, struct request *rq, return 0; } -static enum blk_eh_timer_return mtip_cmd_timeout(struct request *req, - bool reserved) +static enum blk_eh_timer_return mtip_cmd_timeout(struct request *req) { struct driver_data *dd = req->q->queuedata; - if (reserved) { + if (blk_mq_is_reserved_rq(req)) { struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req); cmd->status = BLK_STS_TIMEOUT; @@ -3668,7 +3533,7 @@ init_hw_cmds_error: disk_index_error: ida_free(&rssd_index_ida, index); ida_get_error: - blk_cleanup_disk(dd->disk); + put_disk(dd->disk); block_queue_alloc_init_error: blk_mq_free_tag_set(&dd->tags); block_queue_alloc_tag_error: @@ -3677,72 +3542,6 @@ protocol_init_error: return rv; } -static bool mtip_no_dev_cleanup(struct request *rq, void *data, bool reserv) -{ - struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); - - cmd->status = BLK_STS_IOERR; - blk_mq_complete_request(rq); - return true; -} - -/* - * Block layer deinitialization function. - * - * Called by the PCI layer as each P320 device is removed. - * - * @dd Pointer to the driver data structure. - * - * return value - * 0 - */ -static int mtip_block_remove(struct driver_data *dd) -{ - mtip_hw_debugfs_exit(dd); - - if (dd->mtip_svc_handler) { - set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags); - wake_up_interruptible(&dd->port->svc_wait); - kthread_stop(dd->mtip_svc_handler); - } - - if (!dd->sr) { - /* - * Explicitly wait here for IOs to quiesce, - * as mtip_standby_drive usually won't wait for IOs. - */ - if (!mtip_quiesce_io(dd->port, MTIP_QUIESCE_IO_TIMEOUT_MS)) - mtip_standby_drive(dd); - } - else - dev_info(&dd->pdev->dev, "device %s surprise removal\n", - dd->disk->disk_name); - - blk_freeze_queue_start(dd->queue); - blk_mq_quiesce_queue(dd->queue); - blk_mq_tagset_busy_iter(&dd->tags, mtip_no_dev_cleanup, dd); - blk_mq_unquiesce_queue(dd->queue); - - if (dd->disk) { - if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) - del_gendisk(dd->disk); - if (dd->disk->queue) { - blk_cleanup_queue(dd->queue); - blk_mq_free_tag_set(&dd->tags); - dd->queue = NULL; - } - put_disk(dd->disk); - } - dd->disk = NULL; - - ida_free(&rssd_index_ida, dd->index); - - /* De-initialize the protocol layer. */ - mtip_hw_exit(dd); - - return 0; -} - /* * Function called by the PCI layer when just before the * machine shuts down. @@ -3759,23 +3558,14 @@ static int mtip_block_shutdown(struct driver_data *dd) { mtip_hw_shutdown(dd); - /* Delete our gendisk structure, and cleanup the blk queue. */ - if (dd->disk) { - dev_info(&dd->pdev->dev, - "Shutting down %s ...\n", dd->disk->disk_name); + dev_info(&dd->pdev->dev, + "Shutting down %s ...\n", dd->disk->disk_name); - if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) - del_gendisk(dd->disk); - if (dd->disk->queue) { - blk_cleanup_queue(dd->queue); - blk_mq_free_tag_set(&dd->tags); - } - put_disk(dd->disk); - dd->disk = NULL; - dd->queue = NULL; - } + if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) + del_gendisk(dd->disk); - ida_free(&rssd_index_ida, dd->index); + blk_mq_free_tag_set(&dd->tags); + put_disk(dd->disk); return 0; } @@ -3909,7 +3699,6 @@ static int mtip_pci_probe(struct pci_dev *pdev, const struct cpumask *node_mask; int cpu, i = 0, j = 0; int my_node = NUMA_NO_NODE; - unsigned long flags; /* Allocate memory for this devices private data. */ my_node = pcibus_to_node(pdev->bus); @@ -3956,9 +3745,6 @@ static int mtip_pci_probe(struct pci_dev *pdev, dd->pdev = pdev; dd->numa_node = my_node; - INIT_LIST_HEAD(&dd->online_list); - INIT_LIST_HEAD(&dd->remove_list); - memset(dd->workq_name, 0, 32); snprintf(dd->workq_name, 31, "mtipq%d", dd->instance); @@ -4051,11 +3837,6 @@ static int mtip_pci_probe(struct pci_dev *pdev, else rv = 0; /* device in rebuild state, return 0 from probe */ - /* Add to online list even if in ftl rebuild */ - spin_lock_irqsave(&dev_lock, flags); - list_add(&dd->online_list, &online_list); - spin_unlock_irqrestore(&dev_lock, flags); - goto done; block_initialize_err: @@ -4089,16 +3870,9 @@ done: static void mtip_pci_remove(struct pci_dev *pdev) { struct driver_data *dd = pci_get_drvdata(pdev); - unsigned long flags, to; - - set_bit(MTIP_DDF_REMOVAL_BIT, &dd->dd_flag); - - spin_lock_irqsave(&dev_lock, flags); - list_del_init(&dd->online_list); - list_add(&dd->remove_list, &removing_list); - spin_unlock_irqrestore(&dev_lock, flags); + unsigned long to; - mtip_check_surprise_removal(pdev); + mtip_check_surprise_removal(dd); synchronize_irq(dd->pdev->irq); /* Spin until workers are done */ @@ -4113,11 +3887,35 @@ static void mtip_pci_remove(struct pci_dev *pdev) "Completion workers still active!\n"); } - blk_set_queue_dying(dd->queue); set_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag); - /* Clean up the block layer. */ - mtip_block_remove(dd); + if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) + del_gendisk(dd->disk); + + mtip_hw_debugfs_exit(dd); + + if (dd->mtip_svc_handler) { + set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags); + wake_up_interruptible(&dd->port->svc_wait); + kthread_stop(dd->mtip_svc_handler); + } + + if (!dd->sr) { + /* + * Explicitly wait here for IOs to quiesce, + * as mtip_standby_drive usually won't wait for IOs. + */ + if (!mtip_quiesce_io(dd->port, MTIP_QUIESCE_IO_TIMEOUT_MS)) + mtip_standby_drive(dd); + } + else + dev_info(&dd->pdev->dev, "device %s surprise removal\n", + dd->disk->disk_name); + + blk_mq_free_tag_set(&dd->tags); + + /* De-initialize the protocol layer. */ + mtip_hw_exit(dd); if (dd->isr_workq) { destroy_workqueue(dd->isr_workq); @@ -4128,14 +3926,10 @@ static void mtip_pci_remove(struct pci_dev *pdev) pci_disable_msi(pdev); - spin_lock_irqsave(&dev_lock, flags); - list_del_init(&dd->remove_list); - spin_unlock_irqrestore(&dev_lock, flags); - - kfree(dd); - pcim_iounmap_regions(pdev, 1 << MTIP_ABAR); pci_set_drvdata(pdev, NULL); + + put_disk(dd->disk); } /* @@ -4145,36 +3939,17 @@ static void mtip_pci_remove(struct pci_dev *pdev) * 0 Success * <0 Error */ -static int mtip_pci_suspend(struct pci_dev *pdev, pm_message_t mesg) +static int __maybe_unused mtip_pci_suspend(struct device *dev) { int rv = 0; - struct driver_data *dd = pci_get_drvdata(pdev); - - if (!dd) { - dev_err(&pdev->dev, - "Driver private datastructure is NULL\n"); - return -EFAULT; - } + struct driver_data *dd = dev_get_drvdata(dev); set_bit(MTIP_DDF_RESUME_BIT, &dd->dd_flag); /* Disable ports & interrupts then send standby immediate */ rv = mtip_block_suspend(dd); - if (rv < 0) { - dev_err(&pdev->dev, - "Failed to suspend controller\n"); - return rv; - } - - /* - * Save the pci config space to pdev structure & - * disable the device - */ - pci_save_state(pdev); - pci_disable_device(pdev); - - /* Move to Low power state*/ - pci_set_power_state(pdev, PCI_D3hot); + if (rv < 0) + dev_err(dev, "Failed to suspend controller\n"); return rv; } @@ -4186,32 +3961,10 @@ static int mtip_pci_suspend(struct pci_dev *pdev, pm_message_t mesg) * 0 Success * <0 Error */ -static int mtip_pci_resume(struct pci_dev *pdev) +static int __maybe_unused mtip_pci_resume(struct device *dev) { int rv = 0; - struct driver_data *dd; - - dd = pci_get_drvdata(pdev); - if (!dd) { - dev_err(&pdev->dev, - "Driver private datastructure is NULL\n"); - return -EFAULT; - } - - /* Move the device to active State */ - pci_set_power_state(pdev, PCI_D0); - - /* Restore PCI configuration space */ - pci_restore_state(pdev); - - /* Enable the PCI device*/ - rv = pcim_enable_device(pdev); - if (rv < 0) { - dev_err(&pdev->dev, - "Failed to enable card during resume\n"); - goto err; - } - pci_set_master(pdev); + struct driver_data *dd = dev_get_drvdata(dev); /* * Calls hbaReset, initPort, & startPort function @@ -4219,9 +3972,8 @@ static int mtip_pci_resume(struct pci_dev *pdev) */ rv = mtip_block_resume(dd); if (rv < 0) - dev_err(&pdev->dev, "Unable to resume\n"); + dev_err(dev, "Unable to resume\n"); -err: clear_bit(MTIP_DDF_RESUME_BIT, &dd->dd_flag); return rv; @@ -4252,14 +4004,15 @@ static const struct pci_device_id mtip_pci_tbl[] = { { 0 } }; +static SIMPLE_DEV_PM_OPS(mtip_pci_pm_ops, mtip_pci_suspend, mtip_pci_resume); + /* Structure that describes the PCI driver functions. */ static struct pci_driver mtip_pci_driver = { .name = MTIP_DRV_NAME, .id_table = mtip_pci_tbl, .probe = mtip_pci_probe, .remove = mtip_pci_remove, - .suspend = mtip_pci_suspend, - .resume = mtip_pci_resume, + .driver.pm = &mtip_pci_pm_ops, .shutdown = mtip_pci_shutdown, }; @@ -4295,15 +4048,6 @@ static int __init mtip_init(void) pr_warn("Error creating debugfs parent\n"); dfs_parent = NULL; } - if (dfs_parent) { - dfs_device_status = debugfs_create_file("device_status", - 0444, dfs_parent, NULL, - &mtip_device_status_fops); - if (IS_ERR_OR_NULL(dfs_device_status)) { - pr_err("Error creating device_status node\n"); - dfs_device_status = NULL; - } - } /* Register our PCI operations. */ error = pci_register_driver(&mtip_pci_driver); diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index 88f4206310e4..f7328f19ac5c 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -15,7 +15,6 @@ #include <linux/rwsem.h> #include <linux/ata.h> #include <linux/interrupt.h> -#include <linux/genhd.h> /* Offset of Subsystem Device ID in pci confoguration space */ #define PCI_SUBSYSTEM_DEVICEID 0x2E @@ -150,7 +149,6 @@ enum { MTIP_DDF_RESUME_BIT = 6, MTIP_DDF_INIT_DONE_BIT = 7, MTIP_DDF_REBUILD_FAILED_BIT = 8, - MTIP_DDF_REMOVAL_BIT = 9, MTIP_DDF_STOP_IO = ((1 << MTIP_DDF_REMOVE_PENDING_BIT) | (1 << MTIP_DDF_SEC_LOCK_BIT) | @@ -463,10 +461,6 @@ struct driver_data { int isr_binding; - struct list_head online_list; /* linkage for online list */ - - struct list_head remove_list; /* linkage for removing list */ - int unal_qdepth; /* qdepth of unaligned IO queue */ }; diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c index 78282f01f581..d914156db2d8 100644 --- a/drivers/block/n64cart.c +++ b/drivers/block/n64cart.c @@ -88,7 +88,7 @@ static void n64cart_submit_bio(struct bio *bio) { struct bio_vec bvec; struct bvec_iter iter; - struct device *dev = bio->bi_disk->private_data; + struct device *dev = bio->bi_bdev->bd_disk->private_data; u32 pos = bio->bi_iter.bi_sector << SECTOR_SHIFT; bio_for_each_segment(bvec, bio, iter) { @@ -136,7 +136,7 @@ static int __init n64cart_probe(struct platform_device *pdev) goto out; disk->first_minor = 0; - disk->flags = GENHD_FL_NO_PART_SCAN; + disk->flags = GENHD_FL_NO_PART; disk->fops = &n64cart_fops; disk->private_data = &pdev->dev; strcpy(disk->disk_name, "n64cart"); @@ -157,7 +157,7 @@ static int __init n64cart_probe(struct platform_device *pdev) return 0; out_cleanup_disk: - blk_cleanup_disk(disk); + put_disk(disk); out: return err; } diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 5a1f98494ddd..5cffd96ef2d7 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -11,6 +11,8 @@ * (part of code stolen from loop.c) */ +#define pr_fmt(fmt) "nbd: " fmt + #include <linux/major.h> #include <linux/blkdev.h> @@ -155,8 +157,6 @@ static struct dentry *nbd_dbg_dir; #define nbd_name(nbd) ((nbd)->disk->disk_name) -#define NBD_MAGIC 0x68797548 - #define NBD_DEF_BLKSIZE_BITS 10 static unsigned int nbds_max = 16; @@ -250,7 +250,7 @@ static void nbd_dev_remove(struct nbd_device *nbd) struct gendisk *disk = nbd->disk; del_gendisk(disk); - blk_cleanup_disk(disk); + put_disk(disk); blk_mq_free_tag_set(&nbd->tag_set); /* @@ -333,7 +333,6 @@ static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, if (nbd->config->flags & NBD_FLAG_SEND_TRIM) { nbd->disk->queue->limits.discard_granularity = blksize; - nbd->disk->queue->limits.discard_alignment = blksize; blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX); } blk_queue_logical_block_size(nbd->disk->queue, blksize); @@ -394,8 +393,7 @@ static u32 req_to_nbd_cmd_type(struct request *req) } } -static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, - bool reserved) +static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req) { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); struct nbd_device *nbd = cmd->nbd; @@ -404,13 +402,14 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, if (!mutex_trylock(&cmd->lock)) return BLK_EH_RESET_TIMER; - if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { + if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { mutex_unlock(&cmd->lock); return BLK_EH_DONE; } if (!refcount_inc_not_zero(&nbd->config_refs)) { cmd->status = BLK_STS_TIMEOUT; + __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags); mutex_unlock(&cmd->lock); goto done; } @@ -479,6 +478,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out\n"); set_bit(NBD_RT_TIMEDOUT, &config->runtime_flags); cmd->status = BLK_STS_IOERR; + __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags); mutex_unlock(&cmd->lock); sock_shutdown(nbd); nbd_config_put(nbd); @@ -746,7 +746,7 @@ static struct nbd_cmd *nbd_handle_reply(struct nbd_device *nbd, int index, cmd = blk_mq_rq_to_pdu(req); mutex_lock(&cmd->lock); - if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { + if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { dev_err(disk_to_dev(nbd->disk), "Suspicious reply %d (status %u flags %lu)", tag, cmd->status, cmd->flags); ret = -ENOENT; @@ -855,8 +855,16 @@ static void recv_work(struct work_struct *work) } rq = blk_mq_rq_from_pdu(cmd); - if (likely(!blk_should_fake_timeout(rq->q))) - blk_mq_complete_request(rq); + if (likely(!blk_should_fake_timeout(rq->q))) { + bool complete; + + mutex_lock(&cmd->lock); + complete = __test_and_clear_bit(NBD_CMD_INFLIGHT, + &cmd->flags); + mutex_unlock(&cmd->lock); + if (complete) + blk_mq_complete_request(rq); + } percpu_ref_put(&q->q_usage_counter); } @@ -871,7 +879,7 @@ static void recv_work(struct work_struct *work) kfree(args); } -static bool nbd_clear_req(struct request *req, void *data, bool reserved) +static bool nbd_clear_req(struct request *req, void *data) { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); @@ -947,11 +955,15 @@ static int wait_for_reconnect(struct nbd_device *nbd) struct nbd_config *config = nbd->config; if (!config->dead_conn_timeout) return 0; - if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags)) + + if (!wait_event_timeout(config->conn_wait, + test_bit(NBD_RT_DISCONNECTED, + &config->runtime_flags) || + atomic_read(&config->live_connections) > 0, + config->dead_conn_timeout)) return 0; - return wait_event_timeout(config->conn_wait, - atomic_read(&config->live_connections) > 0, - config->dead_conn_timeout) > 0; + + return !test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags); } static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) @@ -1217,11 +1229,11 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg) return -ENOSPC; } -static void nbd_bdev_reset(struct block_device *bdev) +static void nbd_bdev_reset(struct nbd_device *nbd) { - if (bdev->bd_openers > 1) + if (disk_openers(nbd->disk) > 1) return; - set_capacity(bdev->bd_disk, 0); + set_capacity(nbd->disk, 0); } static void nbd_parse_flags(struct nbd_device *nbd) @@ -1231,8 +1243,6 @@ static void nbd_parse_flags(struct nbd_device *nbd) set_disk_ro(nbd->disk, true); else set_disk_ro(nbd->disk, false); - if (config->flags & NBD_FLAG_SEND_TRIM) - blk_queue_flag_set(QUEUE_FLAG_DISCARD, nbd->disk->queue); if (config->flags & NBD_FLAG_SEND_FLUSH) { if (config->flags & NBD_FLAG_SEND_FUA) blk_queue_write_cache(nbd->disk->queue, true, true); @@ -1318,9 +1328,7 @@ static void nbd_config_put(struct nbd_device *nbd) nbd->tag_set.timeout = 0; nbd->disk->queue->limits.discard_granularity = 0; - nbd->disk->queue->limits.discard_alignment = 0; - blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX); - blk_queue_flag_clear(QUEUE_FLAG_DISCARD, nbd->disk->queue); + blk_queue_max_discard_sectors(nbd->disk->queue, 0); mutex_unlock(&nbd->config_lock); nbd_put(nbd); @@ -1389,7 +1397,7 @@ static int nbd_start_device(struct nbd_device *nbd) return nbd_set_size(nbd, config->bytesize, nbd_blksize(config)); } -static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *bdev) +static int nbd_start_device_ioctl(struct nbd_device *nbd) { struct nbd_config *config = nbd->config; int ret; @@ -1403,12 +1411,14 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b mutex_unlock(&nbd->config_lock); ret = wait_event_interruptible(config->recv_wq, atomic_read(&config->recv_threads) == 0); - if (ret) + if (ret) { sock_shutdown(nbd); - flush_workqueue(nbd->recv_workq); + nbd_clear_que(nbd); + } + flush_workqueue(nbd->recv_workq); mutex_lock(&nbd->config_lock); - nbd_bdev_reset(bdev); + nbd_bdev_reset(nbd); /* user requested, ignore socket errors */ if (test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags)) ret = 0; @@ -1420,9 +1430,9 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b static void nbd_clear_sock_ioctl(struct nbd_device *nbd, struct block_device *bdev) { - sock_shutdown(nbd); + nbd_clear_sock(nbd); __invalidate_device(bdev, true); - nbd_bdev_reset(bdev); + nbd_bdev_reset(nbd); if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF, &nbd->config->runtime_flags)) nbd_config_put(nbd); @@ -1468,7 +1478,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, config->flags = arg; return 0; case NBD_DO_IT: - return nbd_start_device_ioctl(nbd, bdev); + return nbd_start_device_ioctl(nbd); case NBD_CLEAR_QUE: /* * This is for compatibility only. The queue is always cleared @@ -1519,15 +1529,20 @@ static struct nbd_config *nbd_alloc_config(void) { struct nbd_config *config; + if (!try_module_get(THIS_MODULE)) + return ERR_PTR(-ENODEV); + config = kzalloc(sizeof(struct nbd_config), GFP_NOFS); - if (!config) - return NULL; + if (!config) { + module_put(THIS_MODULE); + return ERR_PTR(-ENOMEM); + } + atomic_set(&config->recv_threads, 0); init_waitqueue_head(&config->recv_wq); init_waitqueue_head(&config->conn_wait); config->blksize_bits = NBD_DEF_BLKSIZE_BITS; atomic_set(&config->live_connections, 0); - try_module_get(THIS_MODULE); return config; } @@ -1554,12 +1569,13 @@ static int nbd_open(struct block_device *bdev, fmode_t mode) mutex_unlock(&nbd->config_lock); goto out; } - config = nbd->config = nbd_alloc_config(); - if (!config) { - ret = -ENOMEM; + config = nbd_alloc_config(); + if (IS_ERR(config)) { + ret = PTR_ERR(config); mutex_unlock(&nbd->config_lock); goto out; } + nbd->config = config; refcount_set(&nbd->config_refs, 1); refcount_inc(&nbd->refs); mutex_unlock(&nbd->config_lock); @@ -1579,7 +1595,7 @@ static void nbd_release(struct gendisk *disk, fmode_t mode) struct nbd_device *nbd = disk->private_data; if (test_bit(NBD_RT_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) && - disk->part0->bd_openers == 0) + disk_openers(disk) == 0) nbd_disconnect_and_put(nbd); nbd_config_put(nbd); @@ -1784,7 +1800,6 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue); disk->queue->limits.discard_granularity = 0; - disk->queue->limits.discard_alignment = 0; blk_queue_max_discard_sectors(disk->queue, 0); blk_queue_max_segment_size(disk->queue, UINT_MAX); blk_queue_max_segments(disk->queue, USHRT_MAX); @@ -1800,17 +1815,7 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) refcount_set(&nbd->refs, 0); INIT_LIST_HEAD(&nbd->list); disk->major = NBD_MAJOR; - - /* Too big first_minor can cause duplicate creation of - * sysfs files/links, since index << part_shift might overflow, or - * MKDEV() expect that the max bits of first_minor is 20. - */ disk->first_minor = index << part_shift; - if (disk->first_minor < index || disk->first_minor > MINORMASK) { - err = -EINVAL; - goto out_free_work; - } - disk->minors = 1 << part_shift; disk->fops = &nbd_fops; disk->private_data = nbd; @@ -1829,7 +1834,7 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) out_free_work: destroy_workqueue(nbd->recv_workq); out_err_disk: - blk_cleanup_disk(disk); + put_disk(disk); out_free_idr: mutex_lock(&nbd_index_mutex); idr_remove(&nbd_index_idr, index); @@ -1915,14 +1920,25 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) if (!netlink_capable(skb, CAP_SYS_ADMIN)) return -EPERM; - if (info->attrs[NBD_ATTR_INDEX]) + if (info->attrs[NBD_ATTR_INDEX]) { index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]); + + /* + * Too big first_minor can cause duplicate creation of + * sysfs files/links, since index << part_shift might overflow, or + * MKDEV() expect that the max bits of first_minor is 20. + */ + if (index < 0 || index > MINORMASK >> part_shift) { + pr_err("illegal input index %d\n", index); + return -EINVAL; + } + } if (!info->attrs[NBD_ATTR_SOCKETS]) { - printk(KERN_ERR "nbd: must specify at least one socket\n"); + pr_err("must specify at least one socket\n"); return -EINVAL; } if (!info->attrs[NBD_ATTR_SIZE_BYTES]) { - printk(KERN_ERR "nbd: must specify a size in bytes for the device\n"); + pr_err("must specify a size in bytes for the device\n"); return -EINVAL; } again: @@ -1936,7 +1952,7 @@ again: test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) || !refcount_inc_not_zero(&nbd->refs)) { mutex_unlock(&nbd_index_mutex); - pr_err("nbd: device at index %d is going down\n", + pr_err("device at index %d is going down\n", index); return -EINVAL; } @@ -1947,7 +1963,7 @@ again: if (!nbd) { nbd = nbd_dev_add(index, 2); if (IS_ERR(nbd)) { - pr_err("nbd: failed to add new device\n"); + pr_err("failed to add new device\n"); return PTR_ERR(nbd); } } @@ -1958,7 +1974,7 @@ again: nbd_put(nbd); if (index == -1) goto again; - printk(KERN_ERR "nbd: nbd%d already in use\n", index); + pr_err("nbd%d already in use\n", index); return -EBUSY; } if (WARN_ON(nbd->config)) { @@ -1966,13 +1982,14 @@ again: nbd_put(nbd); return -EINVAL; } - config = nbd->config = nbd_alloc_config(); - if (!nbd->config) { + config = nbd_alloc_config(); + if (IS_ERR(config)) { mutex_unlock(&nbd->config_lock); nbd_put(nbd); - printk(KERN_ERR "nbd: couldn't allocate config\n"); - return -ENOMEM; + pr_err("couldn't allocate config\n"); + return PTR_ERR(config); } + nbd->config = config; refcount_set(&nbd->config_refs, 1); set_bit(NBD_RT_BOUND, &config->runtime_flags); @@ -2025,7 +2042,7 @@ again: struct nlattr *socks[NBD_SOCK_MAX+1]; if (nla_type(attr) != NBD_SOCK_ITEM) { - printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n"); + pr_err("socks must be embedded in a SOCK_ITEM attr\n"); ret = -EINVAL; goto out; } @@ -2034,7 +2051,7 @@ again: nbd_sock_policy, info->extack); if (ret != 0) { - printk(KERN_ERR "nbd: error processing sock list\n"); + pr_err("error processing sock list\n"); ret = -EINVAL; goto out; } @@ -2082,6 +2099,7 @@ static void nbd_disconnect_and_put(struct nbd_device *nbd) mutex_lock(&nbd->config_lock); nbd_disconnect(nbd); sock_shutdown(nbd); + wake_up(&nbd->config->conn_wait); /* * Make sure recv thread has finished, we can safely call nbd_clear_que() * to cancel the inflight I/Os. @@ -2105,7 +2123,7 @@ static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info) return -EPERM; if (!info->attrs[NBD_ATTR_INDEX]) { - printk(KERN_ERR "nbd: must specify an index to disconnect\n"); + pr_err("must specify an index to disconnect\n"); return -EINVAL; } index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]); @@ -2113,14 +2131,12 @@ static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info) nbd = idr_find(&nbd_index_idr, index); if (!nbd) { mutex_unlock(&nbd_index_mutex); - printk(KERN_ERR "nbd: couldn't find device at index %d\n", - index); + pr_err("couldn't find device at index %d\n", index); return -EINVAL; } if (!refcount_inc_not_zero(&nbd->refs)) { mutex_unlock(&nbd_index_mutex); - printk(KERN_ERR "nbd: device at index %d is going down\n", - index); + pr_err("device at index %d is going down\n", index); return -EINVAL; } mutex_unlock(&nbd_index_mutex); @@ -2145,7 +2161,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) return -EPERM; if (!info->attrs[NBD_ATTR_INDEX]) { - printk(KERN_ERR "nbd: must specify a device to reconfigure\n"); + pr_err("must specify a device to reconfigure\n"); return -EINVAL; } index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]); @@ -2153,8 +2169,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) nbd = idr_find(&nbd_index_idr, index); if (!nbd) { mutex_unlock(&nbd_index_mutex); - printk(KERN_ERR "nbd: couldn't find a device at index %d\n", - index); + pr_err("couldn't find a device at index %d\n", index); return -EINVAL; } if (nbd->backend) { @@ -2175,8 +2190,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) } if (!refcount_inc_not_zero(&nbd->refs)) { mutex_unlock(&nbd_index_mutex); - printk(KERN_ERR "nbd: device at index %d is going down\n", - index); + pr_err("device at index %d is going down\n", index); return -EINVAL; } mutex_unlock(&nbd_index_mutex); @@ -2240,7 +2254,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) struct nlattr *socks[NBD_SOCK_MAX+1]; if (nla_type(attr) != NBD_SOCK_ITEM) { - printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n"); + pr_err("socks must be embedded in a SOCK_ITEM attr\n"); ret = -EINVAL; goto out; } @@ -2249,7 +2263,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) nbd_sock_policy, info->extack); if (ret != 0) { - printk(KERN_ERR "nbd: error processing sock list\n"); + pr_err("error processing sock list\n"); ret = -EINVAL; goto out; } @@ -2308,6 +2322,7 @@ static struct genl_family nbd_genl_family __ro_after_init = { .module = THIS_MODULE, .small_ops = nbd_connect_genl_ops, .n_small_ops = ARRAY_SIZE(nbd_connect_genl_ops), + .resv_start_op = NBD_CMD_STATUS + 1, .maxattr = NBD_ATTR_MAX, .policy = nbd_attr_policy, .mcgrps = nbd_mcast_grps, @@ -2466,7 +2481,7 @@ static int __init nbd_init(void) BUILD_BUG_ON(sizeof(struct nbd_request) != 28); if (max_part < 0) { - printk(KERN_ERR "nbd: max_part must be >= 0\n"); + pr_err("max_part must be >= 0\n"); return -EINVAL; } @@ -2529,6 +2544,12 @@ static void __exit nbd_cleanup(void) struct nbd_device *nbd; LIST_HEAD(del_list); + /* + * Unregister netlink interface prior to waiting + * for the completion of netlink commands. + */ + genl_unregister_family(&nbd_genl_family); + nbd_dbg_close(); mutex_lock(&nbd_index_mutex); @@ -2538,8 +2559,11 @@ static void __exit nbd_cleanup(void) while (!list_empty(&del_list)) { nbd = list_first_entry(&del_list, struct nbd_device, list); list_del_init(&nbd->list); + if (refcount_read(&nbd->config_refs)) + pr_err("possibly leaking nbd_config (ref %d)\n", + refcount_read(&nbd->config_refs)); if (refcount_read(&nbd->refs) != 1) - printk(KERN_ERR "nbd: possibly leaking a device\n"); + pr_err("possibly leaking a device\n"); nbd_put(nbd); } @@ -2547,7 +2571,6 @@ static void __exit nbd_cleanup(void) destroy_workqueue(nbd_del_wq); idr_destroy(&nbd_index_idr); - genl_unregister_family(&nbd_genl_family); unregister_blkdev(NBD_MAJOR, "nbd"); } diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 323af5c9c802..1f154f92f4c2 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -11,6 +11,9 @@ #include <linux/init.h> #include "null_blk.h" +#undef pr_fmt +#define pr_fmt(fmt) "null_blk: " fmt + #define FREE_BATCH 16 #define TICKS_PER_SEC 50ULL @@ -74,12 +77,6 @@ enum { NULL_IRQ_TIMER = 2, }; -enum { - NULL_Q_BIO = 0, - NULL_Q_RQ = 1, - NULL_Q_MQ = 2, -}; - static bool g_virt_boundary = false; module_param_named(virt_boundary, g_virt_boundary, bool, 0444); MODULE_PARM_DESC(virt_boundary, "Require a virtual boundary for the device. Default: False"); @@ -204,6 +201,22 @@ static bool g_use_per_node_hctx; module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444); MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false"); +static bool g_memory_backed; +module_param_named(memory_backed, g_memory_backed, bool, 0444); +MODULE_PARM_DESC(memory_backed, "Create a memory-backed block device. Default: false"); + +static bool g_discard; +module_param_named(discard, g_discard, bool, 0444); +MODULE_PARM_DESC(discard, "Support discard operations (requires memory-backed null_blk device). Default: false"); + +static unsigned long g_cache_size; +module_param_named(cache_size, g_cache_size, ulong, 0444); +MODULE_PARM_DESC(mbps, "Cache size in MiB for memory-backed device. Default: 0 (none)"); + +static unsigned int g_mbps; +module_param_named(mbps, g_mbps, uint, 0444); +MODULE_PARM_DESC(mbps, "Limit maximum bandwidth (in MiB/s). Default: 0 (no limit)"); + static bool g_zoned; module_param_named(zoned, g_zoned, bool, S_IRUGO); MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false"); @@ -232,6 +245,7 @@ static struct nullb_device *null_alloc_dev(void); static void null_free_dev(struct nullb_device *dev); static void null_del_dev(struct nullb *nullb); static int null_add_dev(struct nullb_device *dev); +static struct nullb *null_find_dev_by_name(const char *name); static void null_free_device_storage(struct nullb_device *dev, bool is_cache); static inline struct nullb_device *to_nullb_device(struct config_item *item) @@ -340,9 +354,9 @@ static int nullb_update_nr_hw_queues(struct nullb_device *dev, return 0; /* - * Make sure at least one queue exists for each of submit and poll. + * Make sure at least one submit queue exists. */ - if (!submit_queues || !poll_queues) + if (!submit_queues) return -EINVAL; /* @@ -411,6 +425,8 @@ NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); NULLB_DEVICE_ATTR(zone_max_open, uint, NULL); NULLB_DEVICE_ATTR(zone_max_active, uint, NULL); NULLB_DEVICE_ATTR(virt_boundary, bool, NULL); +NULLB_DEVICE_ATTR(no_sched, bool, NULL); +NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL); static ssize_t nullb_device_power_show(struct config_item *item, char *page) { @@ -431,9 +447,10 @@ static ssize_t nullb_device_power_store(struct config_item *item, if (!dev->power && newp) { if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags)) return count; - if (null_add_dev(dev)) { + ret = null_add_dev(dev); + if (ret) { clear_bit(NULLB_DEV_FL_UP, &dev->flags); - return -ENOMEM; + return ret; } set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); @@ -533,6 +550,8 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_zone_max_open, &nullb_device_attr_zone_max_active, &nullb_device_attr_virt_boundary, + &nullb_device_attr_no_sched, + &nullb_device_attr_shared_tag_bitmap, NULL, }; @@ -559,6 +578,9 @@ config_item *nullb_group_make_item(struct config_group *group, const char *name) { struct nullb_device *dev; + if (null_find_dev_by_name(name)) + return ERR_PTR(-EEXIST); + dev = null_alloc_dev(); if (!dev) return ERR_PTR(-ENOMEM); @@ -586,7 +608,13 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item) static ssize_t memb_group_features_show(struct config_item *item, char *page) { return snprintf(page, PAGE_SIZE, - "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active,blocksize,max_sectors,virt_boundary\n"); + "badblocks,blocking,blocksize,cache_size," + "completion_nsec,discard,home_node,hw_queue_depth," + "irqmode,max_sectors,mbps,memory_backed,no_sched," + "poll_queues,power,queue_mode,shared_tag_bitmap,size," + "submit_queues,use_per_node_hctx,virt_boundary,zoned," + "zone_capacity,zone_max_active,zone_max_open," + "zone_nr_conv,zone_size\n"); } CONFIGFS_ATTR_RO(memb_group_, features); @@ -648,6 +676,10 @@ static struct nullb_device *null_alloc_dev(void) dev->irqmode = g_irqmode; dev->hw_queue_depth = g_hw_queue_depth; dev->blocking = g_blocking; + dev->memory_backed = g_memory_backed; + dev->discard = g_discard; + dev->cache_size = g_cache_size; + dev->mbps = g_mbps; dev->use_per_node_hctx = g_use_per_node_hctx; dev->zoned = g_zoned; dev->zone_size = g_zone_size; @@ -656,6 +688,8 @@ static struct nullb_device *null_alloc_dev(void) dev->zone_max_open = g_zone_max_open; dev->zone_max_active = g_zone_max_active; dev->virt_boundary = g_virt_boundary; + dev->no_sched = g_no_sched; + dev->shared_tag_bitmap = g_shared_tag_bitmap; return dev; } @@ -719,26 +753,25 @@ static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq) return NULL; } -static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait) +static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, struct bio *bio) { struct nullb_cmd *cmd; DEFINE_WAIT(wait); - cmd = __alloc_cmd(nq); - if (cmd || !can_wait) - return cmd; - do { - prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE); + /* + * This avoids multiple return statements, multiple calls to + * __alloc_cmd() and a fast path call to prepare_to_wait(). + */ cmd = __alloc_cmd(nq); - if (cmd) - break; - + if (cmd) { + cmd->bio = bio; + return cmd; + } + prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE); io_schedule(); + finish_wait(&nq->wait, &wait); } while (1); - - finish_wait(&nq->wait, &wait); - return cmd; } static void end_cmd(struct nullb_cmd *cmd) @@ -777,24 +810,22 @@ static void null_complete_rq(struct request *rq) end_cmd(blk_mq_rq_to_pdu(rq)); } -static struct nullb_page *null_alloc_page(gfp_t gfp_flags) +static struct nullb_page *null_alloc_page(void) { struct nullb_page *t_page; - t_page = kmalloc(sizeof(struct nullb_page), gfp_flags); + t_page = kmalloc(sizeof(struct nullb_page), GFP_NOIO); if (!t_page) - goto out; + return NULL; - t_page->page = alloc_pages(gfp_flags, 0); - if (!t_page->page) - goto out_freepage; + t_page->page = alloc_pages(GFP_NOIO, 0); + if (!t_page->page) { + kfree(t_page); + return NULL; + } memset(t_page->bitmap, 0, sizeof(t_page->bitmap)); return t_page; -out_freepage: - kfree(t_page); -out: - return NULL; } static void null_free_page(struct nullb_page *t_page) @@ -932,7 +963,7 @@ static struct nullb_page *null_insert_page(struct nullb *nullb, spin_unlock_irq(&nullb->lock); - t_page = null_alloc_page(GFP_NOIO); + t_page = null_alloc_page(); if (!t_page) goto out_lock; @@ -1311,7 +1342,7 @@ static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, } static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, - enum req_opf op, + enum req_op op, sector_t sector, sector_t nr_sectors) { @@ -1382,9 +1413,8 @@ static inline void nullb_complete_cmd(struct nullb_cmd *cmd) } } -blk_status_t null_process_cmd(struct nullb_cmd *cmd, - enum req_opf op, sector_t sector, - unsigned int nr_sectors) +blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op, + sector_t sector, unsigned int nr_sectors) { struct nullb_device *dev = cmd->nq->dev; blk_status_t ret; @@ -1402,7 +1432,7 @@ blk_status_t null_process_cmd(struct nullb_cmd *cmd, } static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, - sector_t nr_sectors, enum req_opf op) + sector_t nr_sectors, enum req_op op) { struct nullb_device *dev = cmd->nq->dev; struct nullb *nullb = dev->nullb; @@ -1476,12 +1506,8 @@ static void null_submit_bio(struct bio *bio) sector_t nr_sectors = bio_sectors(bio); struct nullb *nullb = bio->bi_bdev->bd_disk->private_data; struct nullb_queue *nq = nullb_to_queue(nullb); - struct nullb_cmd *cmd; - - cmd = alloc_cmd(nq, 1); - cmd->bio = bio; - null_handle_cmd(cmd, sector, nr_sectors, bio_op(bio)); + null_handle_cmd(alloc_cmd(nq, bio), sector, nr_sectors, bio_op(bio)); } static bool should_timeout_request(struct request *rq) @@ -1502,7 +1528,7 @@ static bool should_requeue_request(struct request *rq) return false; } -static int null_map_queues(struct blk_mq_tag_set *set) +static void null_map_queues(struct blk_mq_tag_set *set) { struct nullb *nullb = set->driver_data; int i, qoff; @@ -1529,7 +1555,9 @@ static int null_map_queues(struct blk_mq_tag_set *set) } else { pr_warn("tag set has unexpected nr_hw_queues: %d\n", set->nr_hw_queues); - return -EINVAL; + WARN_ON_ONCE(true); + submit_queues = 1; + poll_queues = 0; } } @@ -1551,8 +1579,6 @@ static int null_map_queues(struct blk_mq_tag_set *set) qoff += map->nr_queues; blk_mq_map_queues(map); } - - return 0; } static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) @@ -1574,14 +1600,16 @@ static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) cmd = blk_mq_rq_to_pdu(req); cmd->error = null_process_cmd(cmd, req_op(req), blk_rq_pos(req), blk_rq_sectors(req)); - end_cmd(cmd); + if (!blk_mq_add_to_batch(req, iob, (__force int) cmd->error, + blk_mq_end_request_batch)) + end_cmd(cmd); nr++; } return nr; } -static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res) +static enum blk_eh_timer_return null_timeout_rq(struct request *rq) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); @@ -1604,7 +1632,7 @@ static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res) * Only fake timeouts need to execute blk_mq_complete_request() here. */ cmd->error = BLK_STS_TIMEOUT; - if (cmd->fake_timeout) + if (cmd->fake_timeout || hctx->type == HCTX_TYPE_POLL) blk_mq_complete_request(rq); return BLK_EH_DONE; } @@ -1659,7 +1687,7 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, static void cleanup_queue(struct nullb_queue *nq) { - kfree(nq->tag_map); + bitmap_free(nq->tag_map); kfree(nq->cmds); } @@ -1740,7 +1768,7 @@ static void null_del_dev(struct nullb *nullb) null_restart_queue_async(nullb); } - blk_cleanup_disk(nullb->disk); + put_disk(nullb->disk); if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) blk_mq_free_tag_set(nullb->tag_set); @@ -1769,9 +1797,7 @@ static void null_config_discard(struct nullb *nullb) } nullb->q->limits.discard_granularity = nullb->dev->blocksize; - nullb->q->limits.discard_alignment = nullb->dev->blocksize; blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, nullb->q); } static const struct block_device_operations null_bio_ops = { @@ -1788,14 +1814,13 @@ static const struct block_device_operations null_rq_ops = { static int setup_commands(struct nullb_queue *nq) { struct nullb_cmd *cmd; - int i, tag_size; + int i; nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL); if (!nq->cmds) return -ENOMEM; - tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG; - nq->tag_map = kcalloc(tag_size, sizeof(unsigned long), GFP_KERNEL); + nq->tag_map = bitmap_zalloc(nq->queue_depth, GFP_KERNEL); if (!nq->tag_map) { kfree(nq->cmds); return -ENOMEM; @@ -1850,7 +1875,6 @@ static int null_gendisk_register(struct nullb *nullb) set_capacity(disk, size); - disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; disk->major = null_major; disk->first_minor = nullb->index; disk->minors = 1; @@ -1873,31 +1897,48 @@ static int null_gendisk_register(struct nullb *nullb) static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) { + unsigned int flags = BLK_MQ_F_SHOULD_MERGE; + int hw_queues, numa_node; + unsigned int queue_depth; int poll_queues; + if (nullb) { + hw_queues = nullb->dev->submit_queues; + poll_queues = nullb->dev->poll_queues; + queue_depth = nullb->dev->hw_queue_depth; + numa_node = nullb->dev->home_node; + if (nullb->dev->no_sched) + flags |= BLK_MQ_F_NO_SCHED; + if (nullb->dev->shared_tag_bitmap) + flags |= BLK_MQ_F_TAG_HCTX_SHARED; + if (nullb->dev->blocking) + flags |= BLK_MQ_F_BLOCKING; + } else { + hw_queues = g_submit_queues; + poll_queues = g_poll_queues; + queue_depth = g_hw_queue_depth; + numa_node = g_home_node; + if (g_no_sched) + flags |= BLK_MQ_F_NO_SCHED; + if (g_shared_tag_bitmap) + flags |= BLK_MQ_F_TAG_HCTX_SHARED; + if (g_blocking) + flags |= BLK_MQ_F_BLOCKING; + } + set->ops = &null_mq_ops; - set->nr_hw_queues = nullb ? nullb->dev->submit_queues : - g_submit_queues; - poll_queues = nullb ? nullb->dev->poll_queues : g_poll_queues; - if (poll_queues) - set->nr_hw_queues += poll_queues; - set->queue_depth = nullb ? nullb->dev->hw_queue_depth : - g_hw_queue_depth; - set->numa_node = nullb ? nullb->dev->home_node : g_home_node; set->cmd_size = sizeof(struct nullb_cmd); - set->flags = BLK_MQ_F_SHOULD_MERGE; - if (g_no_sched) - set->flags |= BLK_MQ_F_NO_SCHED; - if (g_shared_tag_bitmap) - set->flags |= BLK_MQ_F_TAG_HCTX_SHARED; + set->flags = flags; set->driver_data = nullb; - if (g_poll_queues) + set->nr_hw_queues = hw_queues; + set->queue_depth = queue_depth; + set->numa_node = numa_node; + if (poll_queues) { + set->nr_hw_queues += poll_queues; set->nr_maps = 3; - else + } else { set->nr_maps = 1; - - if ((nullb && nullb->dev->blocking) || g_blocking) - set->flags |= BLK_MQ_F_BLOCKING; + } return blk_mq_alloc_tag_set(set); } @@ -1918,8 +1959,6 @@ static int null_validate_conf(struct nullb_device *dev) if (dev->poll_queues > g_poll_queues) dev->poll_queues = g_poll_queues; - else if (dev->poll_queues == 0) - dev->poll_queues = 1; dev->prev_poll_queues = dev->poll_queues; dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ); @@ -2051,8 +2090,13 @@ static int null_add_dev(struct nullb_device *dev) blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q); mutex_lock(&lock); - nullb->index = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL); - dev->index = nullb->index; + rv = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL); + if (rv < 0) { + mutex_unlock(&lock); + goto out_cleanup_zone; + } + nullb->index = rv; + dev->index = rv; mutex_unlock(&lock); blk_queue_logical_block_size(nullb->q, dev->blocksize); @@ -2068,21 +2112,32 @@ static int null_add_dev(struct nullb_device *dev) null_config_discard(nullb); - sprintf(nullb->disk_name, "nullb%d", nullb->index); + if (config_item_name(&dev->item)) { + /* Use configfs dir name as the device name */ + snprintf(nullb->disk_name, sizeof(nullb->disk_name), + "%s", config_item_name(&dev->item)); + } else { + sprintf(nullb->disk_name, "nullb%d", nullb->index); + } rv = null_gendisk_register(nullb); if (rv) - goto out_cleanup_zone; + goto out_ida_free; mutex_lock(&lock); list_add_tail(&nullb->list, &nullb_list); mutex_unlock(&lock); + pr_info("disk %s created\n", nullb->disk_name); + return 0; + +out_ida_free: + ida_free(&nullb_indexes, nullb->index); out_cleanup_zone: null_free_zoned_dev(dev); out_cleanup_disk: - blk_cleanup_disk(nullb->disk); + put_disk(nullb->disk); out_cleanup_tags: if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) blk_mq_free_tag_set(nullb->tag_set); @@ -2095,12 +2150,53 @@ out: return rv; } +static struct nullb *null_find_dev_by_name(const char *name) +{ + struct nullb *nullb = NULL, *nb; + + mutex_lock(&lock); + list_for_each_entry(nb, &nullb_list, list) { + if (strcmp(nb->disk_name, name) == 0) { + nullb = nb; + break; + } + } + mutex_unlock(&lock); + + return nullb; +} + +static int null_create_dev(void) +{ + struct nullb_device *dev; + int ret; + + dev = null_alloc_dev(); + if (!dev) + return -ENOMEM; + + ret = null_add_dev(dev); + if (ret) { + null_free_dev(dev); + return ret; + } + + return 0; +} + +static void null_destroy_dev(struct nullb *nullb) +{ + struct nullb_device *dev = nullb->dev; + + null_del_dev(nullb); + null_free_dev(dev); +} + static int __init null_init(void) { int ret = 0; unsigned int i; struct nullb *nullb; - struct nullb_device *dev; if (g_bs > PAGE_SIZE) { pr_warn("invalid block size\n"); @@ -2120,19 +2216,21 @@ static int __init null_init(void) } if (g_queue_mode == NULL_Q_RQ) { - pr_err("legacy IO path no longer available\n"); + pr_err("legacy IO path is no longer available\n"); return -EINVAL; } + if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { if (g_submit_queues != nr_online_nodes) { pr_warn("submit_queues param is set to %u.\n", - nr_online_nodes); + nr_online_nodes); g_submit_queues = nr_online_nodes; } - } else if (g_submit_queues > nr_cpu_ids) + } else if (g_submit_queues > nr_cpu_ids) { g_submit_queues = nr_cpu_ids; - else if (g_submit_queues <= 0) + } else if (g_submit_queues <= 0) { g_submit_queues = 1; + } if (g_queue_mode == NULL_Q_MQ && shared_tags) { ret = null_init_tag_set(NULL, &tag_set); @@ -2156,16 +2254,9 @@ static int __init null_init(void) } for (i = 0; i < nr_devices; i++) { - dev = null_alloc_dev(); - if (!dev) { - ret = -ENOMEM; - goto err_dev; - } - ret = null_add_dev(dev); - if (ret) { - null_free_dev(dev); + ret = null_create_dev(); + if (ret) goto err_dev; - } } pr_info("module loaded\n"); @@ -2174,9 +2265,7 @@ static int __init null_init(void) err_dev: while (!list_empty(&nullb_list)) { nullb = list_entry(nullb_list.next, struct nullb, list); - dev = nullb->dev; - null_del_dev(nullb); - null_free_dev(dev); + null_destroy_dev(nullb); } unregister_blkdev(null_major, "nullb"); err_conf: @@ -2197,12 +2286,8 @@ static void __exit null_exit(void) mutex_lock(&lock); while (!list_empty(&nullb_list)) { - struct nullb_device *dev; - nullb = list_entry(nullb_list.next, struct nullb, list); - dev = nullb->dev; - null_del_dev(nullb); - null_free_dev(dev); + null_destroy_dev(nullb); } mutex_unlock(&lock); diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index 78eb56b0ca55..94ff68052b1e 100644 --- a/drivers/block/null_blk/null_blk.h +++ b/drivers/block/null_blk/null_blk.h @@ -16,13 +16,15 @@ #include <linux/mutex.h> struct nullb_cmd { - struct request *rq; - struct bio *bio; + union { + struct request *rq; + struct bio *bio; + }; unsigned int tag; blk_status_t error; + bool fake_timeout; struct nullb_queue *nq; struct hrtimer timer; - bool fake_timeout; }; struct nullb_queue { @@ -58,6 +60,13 @@ struct nullb_zone { unsigned int capacity; }; +/* Queue modes */ +enum { + NULL_Q_BIO = 0, + NULL_Q_RQ = 1, + NULL_Q_MQ = 2, +}; + struct nullb_device { struct nullb *nullb; struct config_item item; @@ -104,6 +113,8 @@ struct nullb_device { bool discard; /* if support discard */ bool zoned; /* if device is zoned */ bool virt_boundary; /* virtual boundary on/off for the device */ + bool no_sched; /* no IO scheduler for the device */ + bool shared_tag_bitmap; /* use hostwide shared tags */ }; struct nullb { @@ -127,9 +138,8 @@ struct nullb { blk_status_t null_handle_discard(struct nullb_device *dev, sector_t sector, sector_t nr_sectors); -blk_status_t null_process_cmd(struct nullb_cmd *cmd, - enum req_opf op, sector_t sector, - unsigned int nr_sectors); +blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op, + sector_t sector, unsigned int nr_sectors); #ifdef CONFIG_BLK_DEV_ZONED int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q); @@ -137,9 +147,8 @@ int null_register_zoned_dev(struct nullb *nullb); void null_free_zoned_dev(struct nullb_device *dev); int null_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); -blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, - enum req_opf op, sector_t sector, - sector_t nr_sectors); +blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_op op, + sector_t sector, sector_t nr_sectors); size_t null_zone_valid_read_len(struct nullb *nullb, sector_t sector, unsigned int len); #else @@ -155,7 +164,7 @@ static inline int null_register_zoned_dev(struct nullb *nullb) } static inline void null_free_zoned_dev(struct nullb_device *dev) {} static inline blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, - enum req_opf op, sector_t sector, sector_t nr_sectors) + enum req_op op, sector_t sector, sector_t nr_sectors) { return BLK_STS_NOTSUPP; } diff --git a/drivers/block/null_blk/trace.h b/drivers/block/null_blk/trace.h index ce3b430e88c5..6b2b370e786f 100644 --- a/drivers/block/null_blk/trace.h +++ b/drivers/block/null_blk/trace.h @@ -36,7 +36,7 @@ TRACE_EVENT(nullb_zone_op, TP_ARGS(cmd, zone_no, zone_cond), TP_STRUCT__entry( __array(char, disk, DISK_NAME_LEN) - __field(enum req_opf, op) + __field(enum req_op, op) __field(unsigned int, zone_no) __field(unsigned int, zone_cond) ), @@ -44,7 +44,7 @@ TRACE_EVENT(nullb_zone_op, __entry->op = req_op(cmd->rq); __entry->zone_no = zone_no; __entry->zone_cond = zone_cond; - __assign_disk_name(__entry->disk, cmd->rq->rq_disk); + __assign_disk_name(__entry->disk, cmd->rq->q->disk); ), TP_printk("%s req=%-15s zone_no=%u zone_cond=%-10s", __print_disk_name(__entry->disk), diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index dae54dd1aeac..55a69e48ef8b 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -6,6 +6,9 @@ #define CREATE_TRACE_POINTS #include "trace.h" +#undef pr_fmt +#define pr_fmt(fmt) "null_blk: " fmt + static inline sector_t mb_to_sects(unsigned long mb) { return ((sector_t)mb * SZ_1M) >> SECTOR_SHIFT; @@ -75,8 +78,8 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) dev->zone_capacity = dev->zone_size; if (dev->zone_capacity > dev->zone_size) { - pr_err("null_blk: zone capacity (%lu MB) larger than zone size (%lu MB)\n", - dev->zone_capacity, dev->zone_size); + pr_err("zone capacity (%lu MB) larger than zone size (%lu MB)\n", + dev->zone_capacity, dev->zone_size); return -EINVAL; } @@ -156,7 +159,7 @@ int null_register_zoned_dev(struct nullb *nullb) struct nullb_device *dev = nullb->dev; struct request_queue *q = nullb->q; - blk_queue_set_zoned(nullb->disk, BLK_ZONED_HM); + disk_set_zoned(nullb->disk, BLK_ZONED_HM); blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); @@ -167,12 +170,12 @@ int null_register_zoned_dev(struct nullb *nullb) return ret; } else { blk_queue_chunk_sectors(q, dev->zone_size_sects); - q->nr_zones = blkdev_nr_zones(nullb->disk); + nullb->disk->nr_zones = bdev_nr_zones(nullb->disk->part0); } blk_queue_max_zone_append_sectors(q, dev->zone_size_sects); - blk_queue_max_open_zones(q, dev->zone_max_open); - blk_queue_max_active_zones(q, dev->zone_max_active); + disk_set_max_open_zones(nullb->disk, dev->zone_max_open); + disk_set_max_active_zones(nullb->disk, dev->zone_max_active); return 0; } @@ -395,10 +398,10 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, */ if (append) { sector = zone->wp; - if (cmd->bio) - cmd->bio->bi_iter.bi_sector = sector; - else + if (dev->queue_mode == NULL_Q_MQ) cmd->rq->__sector = sector; + else + cmd->bio->bi_iter.bi_sector = sector; } else if (sector != zone->wp) { ret = BLK_STS_IOERR; goto unlock; @@ -597,7 +600,7 @@ static blk_status_t null_reset_zone(struct nullb_device *dev, return BLK_STS_OK; } -static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, +static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_op op, sector_t sector) { struct nullb_device *dev = cmd->nq->dev; @@ -650,7 +653,7 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, return ret; } -blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op, +blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_op op, sector_t sector, sector_t nr_sectors) { struct nullb_device *dev; diff --git a/drivers/block/paride/bpck.c b/drivers/block/paride/bpck.c index f5f63ca2889d..d880a9465e9b 100644 --- a/drivers/block/paride/bpck.c +++ b/drivers/block/paride/bpck.c @@ -28,6 +28,7 @@ #undef r2 #undef w2 +#undef PC #define PC pi->private #define r2() (PC=(in_p(2) & 0xff)) diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index f6b1d63e96e1..a5ab40784119 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -690,7 +690,7 @@ static void pcd_request(void) if (!pcd_req && !set_next_request()) return; - cd = pcd_req->rq_disk->private_data; + cd = pcd_req->q->disk->private_data; if (cd != pcd_current) pcd_bufblk = -1; pcd_current = cd; @@ -928,8 +928,9 @@ static int pcd_init_unit(struct pcd_unit *cd, bool autoprobe, int port, disk->minors = 1; strcpy(disk->disk_name, cd->name); /* umm... */ disk->fops = &pcd_bdops; - disk->flags = GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; + disk->flags |= GENHD_FL_NO_PART; disk->events = DISK_EVENT_MEDIA_CHANGE; + disk->event_flags = DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE; if (!pi_init(cd->pi, autoprobe, port, mode, unit, protocol, delay, pcd_buffer, PI_PCD, verbose, cd->name)) { @@ -955,7 +956,7 @@ out_unreg_cdrom: out_pi_release: pi_release(cd->pi); out_free_disk: - blk_cleanup_disk(cd->disk); + put_disk(cd->disk); out_free_tag_set: blk_mq_free_tag_set(&cd->tag_set); return ret; @@ -1028,7 +1029,7 @@ static void __exit pcd_exit(void) unregister_cdrom(&cd->info); del_gendisk(cd->disk); pi_release(cd->pi); - blk_cleanup_disk(cd->disk); + put_disk(cd->disk); blk_mq_free_tag_set(&cd->tag_set); } diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index fba865058a17..f8a75bc90f70 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -430,7 +430,7 @@ static void run_fsm(void) int stop = 0; if (!phase) { - pd_current = pd_req->rq_disk->private_data; + pd_current = pd_req->q->disk->private_data; pi_current = pd_current->pi; phase = do_pd_io_start; } @@ -492,7 +492,7 @@ static enum action do_pd_io_start(void) case REQ_OP_WRITE: pd_block = blk_rq_pos(pd_req); pd_count = blk_rq_cur_sectors(pd_req); - if (pd_block + pd_count > get_capacity(pd_req->rq_disk)) + if (pd_block + pd_count > get_capacity(pd_req->q->disk)) return Fail; pd_run = blk_rq_sectors(pd_req); pd_buf = bio_data(pd_req->bio); @@ -501,6 +501,8 @@ static enum action do_pd_io_start(void) return do_pd_read_start(); else return do_pd_write_start(); + default: + break; } return Fail; } @@ -781,7 +783,7 @@ static int pd_special_command(struct pd_unit *disk, req = blk_mq_rq_to_pdu(rq); req->func = func; - blk_execute_rq(disk->gd, rq, 0); + blk_execute_rq(rq, false); blk_mq_free_request(rq); return 0; } @@ -943,7 +945,7 @@ static int pd_probe_drive(struct pd_unit *disk, int autoprobe, int port, goto cleanup_disk; return 0; cleanup_disk: - blk_cleanup_disk(disk->gd); + put_disk(disk->gd); put_disk: put_disk(p); disk->gd = NULL; @@ -1018,7 +1020,7 @@ static void __exit pd_exit(void) if (p) { disk->gd = NULL; del_gendisk(p); - blk_cleanup_disk(p); + put_disk(p); blk_mq_free_tag_set(&disk->tag_set); pi_release(disk->pi); } diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index bf8d0ef41a0a..eec1b9fde245 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -746,12 +746,12 @@ repeat: if (!pf_req && !set_next_request()) return; - pf_current = pf_req->rq_disk->private_data; + pf_current = pf_req->q->disk->private_data; pf_block = blk_rq_pos(pf_req); pf_run = blk_rq_sectors(pf_req); pf_count = blk_rq_cur_sectors(pf_req); - if (pf_block + pf_count > get_capacity(pf_req->rq_disk)) { + if (pf_block + pf_count > get_capacity(pf_req->q->disk)) { pf_end_request(BLK_STS_IOERR); goto repeat; } @@ -942,6 +942,7 @@ static int __init pf_init_unit(struct pf_unit *pf, bool autoprobe, int port, disk->minors = 1; strcpy(disk->disk_name, pf->name); disk->fops = &pf_fops; + disk->flags |= GENHD_FL_NO_PART; disk->events = DISK_EVENT_MEDIA_CHANGE; disk->private_data = pf; @@ -974,7 +975,7 @@ static int __init pf_init_unit(struct pf_unit *pf, bool autoprobe, int port, out_pi_release: pi_release(pf->pi); out_free_disk: - blk_cleanup_disk(pf->disk); + put_disk(pf->disk); out_free_tag_set: blk_mq_free_tag_set(&pf->tag_set); return ret; @@ -1043,7 +1044,7 @@ static void __exit pf_exit(void) if (!pf->present) continue; del_gendisk(pf->disk); - blk_cleanup_disk(pf->disk); + put_disk(pf->disk); blk_mq_free_tag_set(&pf->tag_set); pi_release(pf->pi); } diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index b53f648302c1..4cea3b08087e 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -12,7 +12,7 @@ * Theory of operation: * * At the lowest level, there is the standard driver for the CD/DVD device, - * typically ide-cd.c or sr.c. This driver can handle read and write requests, + * such as drivers/scsi/sr.c. This driver can handle read and write requests, * but it doesn't know anything about the special restrictions that apply to * packet writing. One restriction is that write requests must be aligned to * packet boundaries on the physical media, and the size of a write request @@ -113,57 +113,10 @@ static sector_t get_zone(sector_t sector, struct pktcdvd_device *pd) return (sector + pd->offset) & ~(sector_t)(pd->settings.size - 1); } -/* - * create and register a pktcdvd kernel object. - */ -static struct pktcdvd_kobj* pkt_kobj_create(struct pktcdvd_device *pd, - const char* name, - struct kobject* parent, - struct kobj_type* ktype) -{ - struct pktcdvd_kobj *p; - int error; - - p = kzalloc(sizeof(*p), GFP_KERNEL); - if (!p) - return NULL; - p->pd = pd; - error = kobject_init_and_add(&p->kobj, ktype, parent, "%s", name); - if (error) { - kobject_put(&p->kobj); - return NULL; - } - kobject_uevent(&p->kobj, KOBJ_ADD); - return p; -} -/* - * remove a pktcdvd kernel object. - */ -static void pkt_kobj_remove(struct pktcdvd_kobj *p) -{ - if (p) - kobject_put(&p->kobj); -} -/* - * default release function for pktcdvd kernel objects. - */ -static void pkt_kobj_release(struct kobject *kobj) -{ - kfree(to_pktcdvdkobj(kobj)); -} - - /********************************************************** - * * sysfs interface for pktcdvd * by (C) 2006 Thomas Maier <balagi@justmail.de> - * - **********************************************************/ - -#define DEF_ATTR(_obj,_name,_mode) \ - static struct attribute _obj = { .name = _name, .mode = _mode } - -/********************************************************** + /sys/class/pktcdvd/pktcdvd[0-7]/ stat/reset stat/packets_started @@ -176,75 +129,94 @@ static void pkt_kobj_release(struct kobject *kobj) write_queue/congestion_on **********************************************************/ -DEF_ATTR(kobj_pkt_attr_st1, "reset", 0200); -DEF_ATTR(kobj_pkt_attr_st2, "packets_started", 0444); -DEF_ATTR(kobj_pkt_attr_st3, "packets_finished", 0444); -DEF_ATTR(kobj_pkt_attr_st4, "kb_written", 0444); -DEF_ATTR(kobj_pkt_attr_st5, "kb_read", 0444); -DEF_ATTR(kobj_pkt_attr_st6, "kb_read_gather", 0444); - -static struct attribute *kobj_pkt_attrs_stat[] = { - &kobj_pkt_attr_st1, - &kobj_pkt_attr_st2, - &kobj_pkt_attr_st3, - &kobj_pkt_attr_st4, - &kobj_pkt_attr_st5, - &kobj_pkt_attr_st6, - NULL -}; +static ssize_t packets_started_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); -DEF_ATTR(kobj_pkt_attr_wq1, "size", 0444); -DEF_ATTR(kobj_pkt_attr_wq2, "congestion_off", 0644); -DEF_ATTR(kobj_pkt_attr_wq3, "congestion_on", 0644); + return sysfs_emit(buf, "%lu\n", pd->stats.pkt_started); +} +static DEVICE_ATTR_RO(packets_started); -static struct attribute *kobj_pkt_attrs_wqueue[] = { - &kobj_pkt_attr_wq1, - &kobj_pkt_attr_wq2, - &kobj_pkt_attr_wq3, - NULL -}; +static ssize_t packets_finished_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%lu\n", pd->stats.pkt_ended); +} +static DEVICE_ATTR_RO(packets_finished); -static ssize_t kobj_pkt_show(struct kobject *kobj, - struct attribute *attr, char *data) +static ssize_t kb_written_show(struct device *dev, + struct device_attribute *attr, char *buf) { - struct pktcdvd_device *pd = to_pktcdvdkobj(kobj)->pd; - int n = 0; - int v; - if (strcmp(attr->name, "packets_started") == 0) { - n = sprintf(data, "%lu\n", pd->stats.pkt_started); + struct pktcdvd_device *pd = dev_get_drvdata(dev); - } else if (strcmp(attr->name, "packets_finished") == 0) { - n = sprintf(data, "%lu\n", pd->stats.pkt_ended); + return sysfs_emit(buf, "%lu\n", pd->stats.secs_w >> 1); +} +static DEVICE_ATTR_RO(kb_written); - } else if (strcmp(attr->name, "kb_written") == 0) { - n = sprintf(data, "%lu\n", pd->stats.secs_w >> 1); +static ssize_t kb_read_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); - } else if (strcmp(attr->name, "kb_read") == 0) { - n = sprintf(data, "%lu\n", pd->stats.secs_r >> 1); + return sysfs_emit(buf, "%lu\n", pd->stats.secs_r >> 1); +} +static DEVICE_ATTR_RO(kb_read); - } else if (strcmp(attr->name, "kb_read_gather") == 0) { - n = sprintf(data, "%lu\n", pd->stats.secs_rg >> 1); +static ssize_t kb_read_gather_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); - } else if (strcmp(attr->name, "size") == 0) { - spin_lock(&pd->lock); - v = pd->bio_queue_size; - spin_unlock(&pd->lock); - n = sprintf(data, "%d\n", v); + return sysfs_emit(buf, "%lu\n", pd->stats.secs_rg >> 1); +} +static DEVICE_ATTR_RO(kb_read_gather); - } else if (strcmp(attr->name, "congestion_off") == 0) { - spin_lock(&pd->lock); - v = pd->write_congestion_off; - spin_unlock(&pd->lock); - n = sprintf(data, "%d\n", v); +static ssize_t reset_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); - } else if (strcmp(attr->name, "congestion_on") == 0) { - spin_lock(&pd->lock); - v = pd->write_congestion_on; - spin_unlock(&pd->lock); - n = sprintf(data, "%d\n", v); + if (len > 0) { + pd->stats.pkt_started = 0; + pd->stats.pkt_ended = 0; + pd->stats.secs_w = 0; + pd->stats.secs_rg = 0; + pd->stats.secs_r = 0; } + return len; +} +static DEVICE_ATTR_WO(reset); + +static struct attribute *pkt_stat_attrs[] = { + &dev_attr_packets_finished.attr, + &dev_attr_packets_started.attr, + &dev_attr_kb_read.attr, + &dev_attr_kb_written.attr, + &dev_attr_kb_read_gather.attr, + &dev_attr_reset.attr, + NULL, +}; + +static const struct attribute_group pkt_stat_group = { + .name = "stat", + .attrs = pkt_stat_attrs, +}; + +static ssize_t size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); + int n; + + spin_lock(&pd->lock); + n = sysfs_emit(buf, "%d\n", pd->bio_queue_size); + spin_unlock(&pd->lock); return n; } +static DEVICE_ATTR_RO(size); static void init_write_congestion_marks(int* lo, int* hi) { @@ -263,30 +235,56 @@ static void init_write_congestion_marks(int* lo, int* hi) } } -static ssize_t kobj_pkt_store(struct kobject *kobj, - struct attribute *attr, - const char *data, size_t len) +static ssize_t congestion_off_show(struct device *dev, + struct device_attribute *attr, char *buf) { - struct pktcdvd_device *pd = to_pktcdvdkobj(kobj)->pd; - int val; + struct pktcdvd_device *pd = dev_get_drvdata(dev); + int n; - if (strcmp(attr->name, "reset") == 0 && len > 0) { - pd->stats.pkt_started = 0; - pd->stats.pkt_ended = 0; - pd->stats.secs_w = 0; - pd->stats.secs_rg = 0; - pd->stats.secs_r = 0; + spin_lock(&pd->lock); + n = sysfs_emit(buf, "%d\n", pd->write_congestion_off); + spin_unlock(&pd->lock); + return n; +} - } else if (strcmp(attr->name, "congestion_off") == 0 - && sscanf(data, "%d", &val) == 1) { +static ssize_t congestion_off_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); + int val; + + if (sscanf(buf, "%d", &val) == 1) { spin_lock(&pd->lock); pd->write_congestion_off = val; init_write_congestion_marks(&pd->write_congestion_off, &pd->write_congestion_on); spin_unlock(&pd->lock); + } + return len; +} +static DEVICE_ATTR_RW(congestion_off); + +static ssize_t congestion_on_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); + int n; - } else if (strcmp(attr->name, "congestion_on") == 0 - && sscanf(data, "%d", &val) == 1) { + spin_lock(&pd->lock); + n = sysfs_emit(buf, "%d\n", pd->write_congestion_on); + spin_unlock(&pd->lock); + return n; +} + +static ssize_t congestion_on_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); + int val; + + if (sscanf(buf, "%d", &val) == 1) { spin_lock(&pd->lock); pd->write_congestion_on = val; init_write_congestion_marks(&pd->write_congestion_off, @@ -295,44 +293,39 @@ static ssize_t kobj_pkt_store(struct kobject *kobj, } return len; } +static DEVICE_ATTR_RW(congestion_on); -static const struct sysfs_ops kobj_pkt_ops = { - .show = kobj_pkt_show, - .store = kobj_pkt_store +static struct attribute *pkt_wq_attrs[] = { + &dev_attr_congestion_on.attr, + &dev_attr_congestion_off.attr, + &dev_attr_size.attr, + NULL, }; -static struct kobj_type kobj_pkt_type_stat = { - .release = pkt_kobj_release, - .sysfs_ops = &kobj_pkt_ops, - .default_attrs = kobj_pkt_attrs_stat + +static const struct attribute_group pkt_wq_group = { + .name = "write_queue", + .attrs = pkt_wq_attrs, }; -static struct kobj_type kobj_pkt_type_wqueue = { - .release = pkt_kobj_release, - .sysfs_ops = &kobj_pkt_ops, - .default_attrs = kobj_pkt_attrs_wqueue + +static const struct attribute_group *pkt_groups[] = { + &pkt_stat_group, + &pkt_wq_group, + NULL, }; static void pkt_sysfs_dev_new(struct pktcdvd_device *pd) { if (class_pktcdvd) { - pd->dev = device_create(class_pktcdvd, NULL, MKDEV(0, 0), NULL, - "%s", pd->name); + pd->dev = device_create_with_groups(class_pktcdvd, NULL, + MKDEV(0, 0), pd, pkt_groups, + "%s", pd->name); if (IS_ERR(pd->dev)) pd->dev = NULL; } - if (pd->dev) { - pd->kobj_stat = pkt_kobj_create(pd, "stat", - &pd->dev->kobj, - &kobj_pkt_type_stat); - pd->kobj_wqueue = pkt_kobj_create(pd, "write_queue", - &pd->dev->kobj, - &kobj_pkt_type_wqueue); - } } static void pkt_sysfs_dev_remove(struct pktcdvd_device *pd) { - pkt_kobj_remove(pd->kobj_stat); - pkt_kobj_remove(pd->kobj_wqueue); if (class_pktcdvd) device_unregister(pd->dev); } @@ -529,7 +522,7 @@ static struct packet_data *pkt_alloc_packet_data(int frames) goto no_pkt; pkt->frames = frames; - pkt->w_bio = bio_kmalloc(GFP_KERNEL, frames); + pkt->w_bio = bio_kmalloc(frames, GFP_KERNEL); if (!pkt->w_bio) goto no_bio; @@ -543,27 +536,21 @@ static struct packet_data *pkt_alloc_packet_data(int frames) bio_list_init(&pkt->orig_bios); for (i = 0; i < frames; i++) { - struct bio *bio = bio_kmalloc(GFP_KERNEL, 1); - if (!bio) + pkt->r_bios[i] = bio_kmalloc(1, GFP_KERNEL); + if (!pkt->r_bios[i]) goto no_rd_bio; - - pkt->r_bios[i] = bio; } return pkt; no_rd_bio: - for (i = 0; i < frames; i++) { - struct bio *bio = pkt->r_bios[i]; - if (bio) - bio_put(bio); - } - + for (i = 0; i < frames; i++) + kfree(pkt->r_bios[i]); no_page: for (i = 0; i < frames / FRAMES_PER_PAGE; i++) if (pkt->pages[i]) __free_page(pkt->pages[i]); - bio_put(pkt->w_bio); + kfree(pkt->w_bio); no_bio: kfree(pkt); no_pkt: @@ -577,14 +564,11 @@ static void pkt_free_packet_data(struct packet_data *pkt) { int i; - for (i = 0; i < pkt->frames; i++) { - struct bio *bio = pkt->r_bios[i]; - if (bio) - bio_put(bio); - } + for (i = 0; i < pkt->frames; i++) + kfree(pkt->r_bios[i]); for (i = 0; i < pkt->frames / FRAMES_PER_PAGE; i++) __free_page(pkt->pages[i]); - bio_put(pkt->w_bio); + kfree(pkt->w_bio); kfree(pkt); } @@ -700,6 +684,7 @@ static void pkt_rbtree_insert(struct pktcdvd_device *pd, struct pkt_rb_node *nod static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *cgc) { struct request_queue *q = bdev_get_queue(pd->bdev); + struct scsi_cmnd *scmd; struct request *rq; int ret = 0; @@ -707,6 +692,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); if (IS_ERR(rq)) return PTR_ERR(rq); + scmd = blk_mq_rq_to_pdu(rq); if (cgc->buflen) { ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, @@ -715,15 +701,15 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * goto out; } - scsi_req(rq)->cmd_len = COMMAND_SIZE(cgc->cmd[0]); - memcpy(scsi_req(rq)->cmd, cgc->cmd, CDROM_PACKET_SIZE); + scmd->cmd_len = COMMAND_SIZE(cgc->cmd[0]); + memcpy(scmd->cmnd, cgc->cmd, CDROM_PACKET_SIZE); rq->timeout = 60*HZ; if (cgc->quiet) rq->rq_flags |= RQF_QUIET; - blk_execute_rq(pd->bdev->bd_disk, rq, 0); - if (scsi_req(rq)->result) + blk_execute_rq(rq, false); + if (scmd->result) ret = -EIO; out: blk_mq_free_request(rq); @@ -956,6 +942,7 @@ static void pkt_end_io_read(struct bio *bio) if (bio->bi_status) atomic_inc(&pkt->io_errors); + bio_uninit(bio); if (atomic_dec_and_test(&pkt->io_wait)) { atomic_inc(&pkt->run_sm); wake_up(&pd->wqueue); @@ -973,6 +960,7 @@ static void pkt_end_io_packet_write(struct bio *bio) pd->stats.pkt_ended++; + bio_uninit(bio); pkt_bio_finished(pd); atomic_dec(&pkt->io_wait); atomic_inc(&pkt->run_sm); @@ -1027,9 +1015,8 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) continue; bio = pkt->r_bios[f]; - bio_reset(bio); + bio_init(bio, pd->bdev, bio->bi_inline_vecs, 1, REQ_OP_READ); bio->bi_iter.bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9); - bio_set_dev(bio, pd->bdev); bio->bi_end_io = pkt_end_io_read; bio->bi_private = pkt; @@ -1041,7 +1028,6 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) BUG(); atomic_inc(&pkt->io_wait); - bio_set_op_attrs(bio, REQ_OP_READ, 0); pkt_queue_bio(pd, bio); frames_read++; } @@ -1107,7 +1093,6 @@ static int pkt_handle_queue(struct pktcdvd_device *pd) sector_t zone = 0; /* Suppress gcc warning */ struct pkt_rb_node *node, *first_node; struct rb_node *n; - int wakeup; atomic_set(&pd->scan_queue, 0); @@ -1179,12 +1164,14 @@ try_next_bio: spin_unlock(&pkt->lock); } /* check write congestion marks, and if bio_queue_size is - below, wake up any waiters */ - wakeup = (pd->write_congestion_on > 0 - && pd->bio_queue_size <= pd->write_congestion_off); + * below, wake up any waiters + */ + if (pd->congested && + pd->bio_queue_size <= pd->write_congestion_off) { + pd->congested = false; + wake_up_var(&pd->congested); + } spin_unlock(&pd->lock); - if (wakeup) - clear_bdi_congested(pd->disk->bdi, BLK_RW_ASYNC); pkt->sleep_time = max(PACKET_WAIT_TIME, 1); pkt_set_state(pkt, PACKET_WAITING_STATE); @@ -1241,9 +1228,9 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) { int f; - bio_reset(pkt->w_bio); + bio_init(pkt->w_bio, pd->bdev, pkt->w_bio->bi_inline_vecs, pkt->frames, + REQ_OP_WRITE); pkt->w_bio->bi_iter.bi_sector = pkt->sector; - bio_set_dev(pkt->w_bio, pd->bdev); pkt->w_bio->bi_end_io = pkt_end_io_packet_write; pkt->w_bio->bi_private = pkt; @@ -1276,7 +1263,6 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) /* Start the write request */ atomic_set(&pkt->io_wait, 1); - bio_set_op_attrs(pkt->w_bio, REQ_OP_WRITE, 0); pkt_queue_bio(pd, pkt->w_bio); } @@ -2304,12 +2290,12 @@ static void pkt_end_io_read_cloned(struct bio *bio) static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio) { - struct bio *cloned_bio = bio_clone_fast(bio, GFP_NOIO, &pkt_bio_set); + struct bio *cloned_bio = + bio_alloc_clone(pd->bdev, bio, GFP_NOIO, &pkt_bio_set); struct packet_stacked_data *psd = mempool_alloc(&psd_pool, GFP_NOIO); psd->pd = pd; psd->bio = bio; - bio_set_dev(cloned_bio, pd->bdev); cloned_bio->bi_private = psd; cloned_bio->bi_end_io = pkt_end_io_read_cloned; pd->stats.secs_r += bio_sectors(bio); @@ -2356,7 +2342,7 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio) } spin_unlock(&pd->cdrw.active_list_lock); - /* + /* * Test if there is enough room left in the bio work queue * (queue size >= congestion on mark). * If not, wait till the work queue size is below the congestion off mark. @@ -2364,12 +2350,20 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio) spin_lock(&pd->lock); if (pd->write_congestion_on > 0 && pd->bio_queue_size >= pd->write_congestion_on) { - set_bdi_congested(bio->bi_bdev->bd_disk->bdi, BLK_RW_ASYNC); - do { + struct wait_bit_queue_entry wqe; + + init_wait_var_entry(&wqe, &pd->congested, 0); + for (;;) { + prepare_to_wait_event(__var_waitqueue(&pd->congested), + &wqe.wq_entry, + TASK_UNINTERRUPTIBLE); + if (pd->bio_queue_size <= pd->write_congestion_off) + break; + pd->congested = true; spin_unlock(&pd->lock); - congestion_wait(BLK_RW_ASYNC, HZ); + schedule(); spin_lock(&pd->lock); - } while(pd->bio_queue_size > pd->write_congestion_off); + } } spin_unlock(&pd->lock); @@ -2402,17 +2396,10 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio) static void pkt_submit_bio(struct bio *bio) { - struct pktcdvd_device *pd; - char b[BDEVNAME_SIZE]; + struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->queue->queuedata; struct bio *split; - blk_queue_split(&bio); - - pd = bio->bi_bdev->bd_disk->queue->queuedata; - if (!pd) { - pr_err("%s incorrect request queue\n", bio_devname(bio, b)); - goto end_io; - } + bio = bio_split_to_limits(bio); pkt_dbg(2, pd, "start = %6llx stop = %6llx\n", (unsigned long long)bio->bi_iter.bi_sector, @@ -2473,11 +2460,9 @@ static int pkt_seq_show(struct seq_file *m, void *p) { struct pktcdvd_device *pd = m->private; char *msg; - char bdev_buf[BDEVNAME_SIZE]; int states[PACKET_NUM_STATES]; - seq_printf(m, "Writer %s mapped to %s:\n", pd->name, - bdevname(pd->bdev, bdev_buf)); + seq_printf(m, "Writer %s mapped to %pg:\n", pd->name, pd->bdev); seq_printf(m, "\nSettings:\n"); seq_printf(m, "\tpacket size:\t\t%dkB\n", pd->settings.size / 2); @@ -2534,7 +2519,6 @@ static int pkt_seq_show(struct seq_file *m, void *p) static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) { int i; - char b[BDEVNAME_SIZE]; struct block_device *bdev; struct scsi_device *sdev; @@ -2547,8 +2531,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) if (!pd2) continue; if (pd2->bdev->bd_dev == dev) { - pkt_err(pd, "%s already setup\n", - bdevname(pd2->bdev, b)); + pkt_err(pd, "%pg already setup\n", pd2->bdev); return -EBUSY; } if (pd2->pkt_dev == dev) { @@ -2583,7 +2566,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) } proc_create_single_data(pd->name, 0, pkt_proc, pkt_seq_show, pd); - pkt_dbg(1, pd, "writer mapped to %s\n", bdevname(bdev, b)); + pkt_dbg(1, pd, "writer mapped to %pg\n", bdev); return 0; out_mem: @@ -2719,7 +2702,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) disk->first_minor = idx; disk->minors = 1; disk->fops = &pktcdvd_ops; - disk->flags = GENHD_FL_REMOVABLE; + disk->flags = GENHD_FL_REMOVABLE | GENHD_FL_NO_PART; strcpy(disk->disk_name, pd->name); disk->private_data = pd; @@ -2746,7 +2729,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) return 0; out_mem2: - blk_cleanup_disk(disk); + put_disk(disk); out_mem: mempool_exit(&pd->rb_pool); kfree(pd); @@ -2796,7 +2779,7 @@ static int pkt_remove_dev(dev_t pkt_dev) pkt_dbg(1, pd, "writer unmapped\n"); del_gendisk(pd->disk); - blk_cleanup_disk(pd->disk); + put_disk(pd->disk); mempool_exit(&pd->rb_pool); kfree(pd); diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index 3054adf77460..36d7b36c60c7 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -473,7 +473,7 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) return 0; fail_cleanup_disk: - blk_cleanup_disk(gendisk); + put_disk(gendisk); fail_free_tag_set: blk_mq_free_tag_set(&priv->tag_set); fail_teardown: @@ -500,7 +500,7 @@ static void ps3disk_remove(struct ps3_system_bus_device *_dev) &ps3disk_mask); mutex_unlock(&ps3disk_mask_mutex); del_gendisk(priv->gendisk); - blk_cleanup_disk(priv->gendisk); + put_disk(priv->gendisk); blk_mq_free_tag_set(&priv->tag_set); dev_notice(&dev->sbd.core, "Synchronizing disk cache\n"); ps3disk_sync_cache(dev); diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index c1876646a4cb..c76e0148eada 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -586,7 +586,7 @@ static void ps3vram_submit_bio(struct bio *bio) dev_dbg(&dev->core, "%s\n", __func__); - blk_queue_split(&bio); + bio = bio_split_to_limits(bio); spin_lock_irq(&priv->lock); busy = !bio_list_empty(&priv->list); @@ -742,9 +742,10 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev) priv->gendisk = gendisk; gendisk->major = ps3vram_major; gendisk->minors = 1; + gendisk->flags |= GENHD_FL_NO_PART; gendisk->fops = &ps3vram_fops; gendisk->private_data = dev; - strlcpy(gendisk->disk_name, DEVICE_NAME, sizeof(gendisk->disk_name)); + strscpy(gendisk->disk_name, DEVICE_NAME, sizeof(gendisk->disk_name)); set_capacity(gendisk, priv->size >> 9); blk_queue_max_segments(gendisk->queue, BLK_MAX_SEGMENTS); blk_queue_max_segment_size(gendisk->queue, BLK_MAX_SEGMENT_SIZE); @@ -760,7 +761,7 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev) return 0; out_cleanup_disk: - blk_cleanup_disk(gendisk); + put_disk(gendisk); out_cache_cleanup: remove_proc_entry(DEVICE_NAME, NULL); ps3vram_cache_cleanup(dev); @@ -791,7 +792,7 @@ static void ps3vram_remove(struct ps3_system_bus_device *dev) struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); del_gendisk(priv->gendisk); - blk_cleanup_disk(priv->gendisk); + put_disk(priv->gendisk); remove_proc_entry(DEVICE_NAME, NULL); ps3vram_cache_cleanup(dev); iounmap(priv->reports); diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 953fa134cd3d..04453f4a319c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -756,24 +756,23 @@ static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) */ static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) { - struct rbd_client *client_node; - bool found = false; + struct rbd_client *rbdc = NULL, *iter; if (ceph_opts->flags & CEPH_OPT_NOSHARE) return NULL; spin_lock(&rbd_client_list_lock); - list_for_each_entry(client_node, &rbd_client_list, node) { - if (!ceph_compare_options(ceph_opts, client_node->client)) { - __rbd_get_client(client_node); + list_for_each_entry(iter, &rbd_client_list, node) { + if (!ceph_compare_options(ceph_opts, iter->client)) { + __rbd_get_client(iter); - found = true; + rbdc = iter; break; } } spin_unlock(&rbd_client_list_lock); - return found ? client_node : NULL; + return rbdc; } /* @@ -1298,7 +1297,7 @@ static void rbd_osd_submit(struct ceph_osd_request *osd_req) dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n", __func__, osd_req, obj_req, obj_req->ex.oe_objno, obj_req->ex.oe_off, obj_req->ex.oe_len); - ceph_osdc_start_request(osd_req->r_osdc, osd_req, false); + ceph_osdc_start_request(osd_req->r_osdc, osd_req); } /* @@ -2082,7 +2081,7 @@ static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id, if (ret) return ret; - ceph_osdc_start_request(osdc, req, false); + ceph_osdc_start_request(osdc, req); return 0; } @@ -4730,7 +4729,7 @@ static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx, static void rbd_free_disk(struct rbd_device *rbd_dev) { - blk_cleanup_disk(rbd_dev->disk); + put_disk(rbd_dev->disk); blk_mq_free_tag_set(&rbd_dev->tag_set); rbd_dev->disk = NULL; } @@ -4769,7 +4768,7 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, if (ret) goto out_req; - ceph_osdc_start_request(osdc, req, false); + ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); if (ret >= 0) ceph_copy_from_page_vector(pages, buf, 0, ret); @@ -4924,12 +4923,10 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) rbd_dev->dev_id); disk->major = rbd_dev->major; disk->first_minor = rbd_dev->minor; - if (single_major) { + if (single_major) disk->minors = (1 << RBD_SINGLE_MAJOR_PART_SHIFT); - disk->flags |= GENHD_FL_EXT_DEVT; - } else { + else disk->minors = RBD_MINORS_PER_MAJOR; - } disk->fops = &rbd_bd_ops; disk->private_data = rbd_dev; @@ -4944,7 +4941,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) blk_queue_io_opt(q, rbd_dev->opts->alloc_size); if (rbd_dev->opts->trim) { - blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); q->limits.discard_granularity = rbd_dev->opts->alloc_size; blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT); blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT); @@ -6191,7 +6187,7 @@ static inline size_t next_token(const char **buf) * These are the characters that produce nonzero for * isspace() in the "C" and "POSIX" locales. */ - const char *spaces = " \f\n\r\t\v"; + static const char spaces[] = " \f\n\r\t\v"; *buf += strspn(*buf, spaces); /* Find start of token */ @@ -6497,7 +6493,8 @@ static int rbd_add_parse_args(const char *buf, pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT; pctx.opts->trim = RBD_TRIM_DEFAULT; - ret = ceph_parse_mon_ips(mon_addrs, mon_addrs_size, pctx.copts, NULL); + ret = ceph_parse_mon_ips(mon_addrs, mon_addrs_size, pctx.copts, NULL, + ','); if (ret) goto out_err; @@ -7186,7 +7183,7 @@ static ssize_t do_rbd_remove(struct bus_type *bus, * IO to complete/fail. */ blk_mq_freeze_queue(rbd_dev->disk->queue); - blk_set_queue_dying(rbd_dev->disk->queue); + blk_mark_disk_dead(rbd_dev->disk); } del_gendisk(rbd_dev->disk); @@ -7225,8 +7222,10 @@ static int __init rbd_sysfs_init(void) int ret; ret = device_register(&rbd_root_dev); - if (ret < 0) + if (ret < 0) { + put_device(&rbd_root_dev); return ret; + } ret = bus_register(&rbd_bus_type); if (ret < 0) diff --git a/drivers/block/rnbd/Makefile b/drivers/block/rnbd/Makefile index 5bb1a7ad1ada..40b31630822c 100644 --- a/drivers/block/rnbd/Makefile +++ b/drivers/block/rnbd/Makefile @@ -6,10 +6,12 @@ rnbd-client-y := rnbd-clt.o \ rnbd-clt-sysfs.o \ rnbd-common.o +CFLAGS_rnbd-srv-trace.o = -I$(src) + rnbd-server-y := rnbd-common.o \ rnbd-srv.o \ - rnbd-srv-dev.o \ - rnbd-srv-sysfs.o + rnbd-srv-sysfs.o \ + rnbd-srv-trace.o obj-$(CONFIG_BLK_DEV_RNBD_CLIENT) += rnbd-client.o obj-$(CONFIG_BLK_DEV_RNBD_SERVER) += rnbd-server.o diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index 44e45af00e83..e7c7d9a68168 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -376,7 +376,7 @@ static ssize_t rnbd_clt_resize_dev_store(struct kobject *kobj, if (ret) return ret; - ret = rnbd_clt_resize_disk(dev, (size_t)sectors); + ret = rnbd_clt_resize_disk(dev, sectors); if (ret) return ret; @@ -452,6 +452,7 @@ static struct attribute *rnbd_dev_attrs[] = { &rnbd_clt_nr_poll_queues.attr, NULL, }; +ATTRIBUTE_GROUPS(rnbd_dev); void rnbd_clt_remove_dev_symlink(struct rnbd_clt_dev *dev) { @@ -474,7 +475,7 @@ void rnbd_clt_remove_dev_symlink(struct rnbd_clt_dev *dev) static struct kobj_type rnbd_dev_ktype = { .sysfs_ops = &kobj_sysfs_ops, - .default_attrs = rnbd_dev_attrs, + .default_groups = rnbd_dev_groups, }; static int rnbd_clt_add_dev_kobj(struct rnbd_clt_dev *dev) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 2df0657cdf00..78334da74d8b 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -23,9 +23,9 @@ MODULE_LICENSE("GPL"); static int rnbd_client_major; static DEFINE_IDA(index_ida); -static DEFINE_MUTEX(ida_lock); static DEFINE_MUTEX(sess_lock); static LIST_HEAD(sess_list); +static struct workqueue_struct *rnbd_clt_wq; /* * Maximum number of partitions an instance can have. @@ -55,9 +55,7 @@ static void rnbd_clt_put_dev(struct rnbd_clt_dev *dev) if (!refcount_dec_and_test(&dev->refcount)) return; - mutex_lock(&ida_lock); - ida_simple_remove(&index_ida, dev->clt_device_id); - mutex_unlock(&ida_lock); + ida_free(&index_ida, dev->clt_device_id); kfree(dev->hw_queues); kfree(dev->pathname); rnbd_clt_put_sess(dev->sess); @@ -70,41 +68,18 @@ static inline bool rnbd_clt_get_dev(struct rnbd_clt_dev *dev) return refcount_inc_not_zero(&dev->refcount); } -static int rnbd_clt_set_dev_attr(struct rnbd_clt_dev *dev, - const struct rnbd_msg_open_rsp *rsp) +static void rnbd_clt_change_capacity(struct rnbd_clt_dev *dev, + sector_t new_nsectors) { - struct rnbd_clt_session *sess = dev->sess; - - if (!rsp->logical_block_size) - return -EINVAL; - - dev->device_id = le32_to_cpu(rsp->device_id); - dev->nsectors = le64_to_cpu(rsp->nsectors); - dev->logical_block_size = le16_to_cpu(rsp->logical_block_size); - dev->physical_block_size = le16_to_cpu(rsp->physical_block_size); - dev->max_write_same_sectors = le32_to_cpu(rsp->max_write_same_sectors); - dev->max_discard_sectors = le32_to_cpu(rsp->max_discard_sectors); - dev->discard_granularity = le32_to_cpu(rsp->discard_granularity); - dev->discard_alignment = le32_to_cpu(rsp->discard_alignment); - dev->secure_discard = le16_to_cpu(rsp->secure_discard); - dev->rotational = rsp->rotational; - dev->wc = !!(rsp->cache_policy & RNBD_WRITEBACK); - dev->fua = !!(rsp->cache_policy & RNBD_FUA); - - dev->max_hw_sectors = sess->max_io_size / SECTOR_SIZE; - dev->max_segments = sess->max_segments; - - return 0; -} + if (get_capacity(dev->gd) == new_nsectors) + return; -static int rnbd_clt_change_capacity(struct rnbd_clt_dev *dev, - size_t new_nsectors) -{ - rnbd_clt_info(dev, "Device size changed from %zu to %zu sectors\n", - dev->nsectors, new_nsectors); - dev->nsectors = new_nsectors; - set_capacity_and_notify(dev->gd, dev->nsectors); - return 0; + /* + * If the size changed, we need to revalidate it + */ + rnbd_clt_info(dev, "Device size changed from %llu to %llu sectors\n", + get_capacity(dev->gd), new_nsectors); + set_capacity_and_notify(dev->gd, new_nsectors); } static int process_msg_open_rsp(struct rnbd_clt_dev *dev, @@ -123,19 +98,16 @@ static int process_msg_open_rsp(struct rnbd_clt_dev *dev, if (dev->dev_state == DEV_STATE_MAPPED_DISCONNECTED) { u64 nsectors = le64_to_cpu(rsp->nsectors); - /* - * If the device was remapped and the size changed in the - * meantime we need to revalidate it - */ - if (dev->nsectors != nsectors) - rnbd_clt_change_capacity(dev, nsectors); + rnbd_clt_change_capacity(dev, nsectors); gd_kobj = &disk_to_dev(dev->gd)->kobj; kobject_uevent(gd_kobj, KOBJ_ONLINE); rnbd_clt_info(dev, "Device online, device remapped successfully\n"); } - err = rnbd_clt_set_dev_attr(dev, rsp); - if (err) + if (!rsp->logical_block_size) { + err = -EINVAL; goto out; + } + dev->device_id = le32_to_cpu(rsp->device_id); dev->dev_state = DEV_STATE_MAPPED; out: @@ -144,7 +116,7 @@ out: return err; } -int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, size_t newsize) +int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, sector_t newsize) { int ret = 0; @@ -154,7 +126,7 @@ int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, size_t newsize) ret = -ENOENT; goto out; } - ret = rnbd_clt_change_capacity(dev, newsize); + rnbd_clt_change_capacity(dev, newsize); out: mutex_unlock(&dev->lock); @@ -196,7 +168,7 @@ rnbd_get_cpu_qlist(struct rnbd_clt_session *sess, int cpu) return per_cpu_ptr(sess->cpu_queues, bit); } else if (cpu != 0) { /* Search from 0 to cpu */ - bit = find_next_bit(sess->cpu_queues_bm, cpu, 0); + bit = find_first_bit(sess->cpu_queues_bm, cpu); if (bit < cpu) return per_cpu_ptr(sess->cpu_queues, bit); } @@ -393,7 +365,7 @@ static void rnbd_put_iu(struct rnbd_clt_session *sess, struct rnbd_iu *iu) static void rnbd_softirq_done_fn(struct request *rq) { - struct rnbd_clt_dev *dev = rq->rq_disk->private_data; + struct rnbd_clt_dev *dev = rq->q->disk->private_data; struct rnbd_clt_session *sess = dev->sess; struct rnbd_iu *iu; @@ -433,7 +405,7 @@ static void msg_conf(void *priv, int errno) schedule_work(&iu->work); } -static int send_usr_msg(struct rtrs_clt *rtrs, int dir, +static int send_usr_msg(struct rtrs_clt_sess *rtrs, int dir, struct rnbd_iu *iu, struct kvec *vec, size_t len, struct scatterlist *sg, unsigned int sg_len, void (*conf)(struct work_struct *work), @@ -511,6 +483,11 @@ static void msg_open_conf(struct work_struct *work) struct rnbd_msg_open_rsp *rsp = iu->buf; struct rnbd_clt_dev *dev = iu->dev; int errno = iu->errno; + bool from_map = false; + + /* INIT state is only triggered from rnbd_clt_map_device */ + if (dev->dev_state == DEV_STATE_INIT) + from_map = true; if (errno) { rnbd_clt_err(dev, @@ -527,7 +504,9 @@ static void msg_open_conf(struct work_struct *work) send_msg_close(dev, device_id, RTRS_PERMIT_NOWAIT); } } - kfree(rsp); + /* We free rsp in rnbd_clt_map_device for map scenario */ + if (!from_map) + kfree(rsp); wake_up_iu_comp(iu, errno); rnbd_put_iu(dev->sess, iu); rnbd_clt_put_dev(dev); @@ -946,7 +925,7 @@ static int rnbd_client_open(struct block_device *block_device, fmode_t mode) { struct rnbd_clt_dev *dev = block_device->bd_disk->private_data; - if (dev->read_only && (mode & FMODE_WRITE)) + if (get_disk_ro(dev->gd) && (mode & FMODE_WRITE)) return -EPERM; if (dev->dev_state == DEV_STATE_UNMAPPED || @@ -967,10 +946,10 @@ static int rnbd_client_getgeo(struct block_device *block_device, struct hd_geometry *geo) { u64 size; - struct rnbd_clt_dev *dev; + struct rnbd_clt_dev *dev = block_device->bd_disk->private_data; + struct queue_limits *limit = &dev->queue->limits; - dev = block_device->bd_disk->private_data; - size = dev->size * (dev->logical_block_size / SECTOR_SIZE); + size = dev->size * (limit->logical_block_size / SECTOR_SIZE); geo->cylinders = size >> 6; /* size/64 */ geo->heads = 4; geo->sectors = 16; @@ -1010,7 +989,7 @@ static int rnbd_client_xfer_request(struct rnbd_clt_dev *dev, struct request *rq, struct rnbd_iu *iu) { - struct rtrs_clt *rtrs = dev->sess->rtrs; + struct rtrs_clt_sess *rtrs = dev->sess->rtrs; struct rtrs_permit *permit = iu->permit; struct rnbd_msg_io msg; struct rtrs_clt_req_ops req_ops; @@ -1133,7 +1112,7 @@ static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *rq = bd->rq; - struct rnbd_clt_dev *dev = rq->rq_disk->private_data; + struct rnbd_clt_dev *dev = rq->q->disk->private_data; struct rnbd_iu *iu = blk_mq_rq_to_pdu(rq); int err; blk_status_t ret = BLK_STS_IOERR; @@ -1180,13 +1159,11 @@ static int rnbd_rdma_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) { struct rnbd_queue *q = hctx->driver_data; struct rnbd_clt_dev *dev = q->dev; - int cnt; - cnt = rtrs_clt_rdma_cq_direct(dev->sess->rtrs, hctx->queue_num); - return cnt; + return rtrs_clt_rdma_cq_direct(dev->sess->rtrs, hctx->queue_num); } -static int rnbd_rdma_map_queues(struct blk_mq_tag_set *set) +static void rnbd_rdma_map_queues(struct blk_mq_tag_set *set) { struct rnbd_clt_session *sess = set->driver_data; @@ -1215,8 +1192,6 @@ static int rnbd_rdma_map_queues(struct blk_mq_tag_set *set) set->map[HCTX_TYPE_DEFAULT].nr_queues, set->map[HCTX_TYPE_READ].nr_queues); } - - return 0; } static struct blk_mq_ops rnbd_mq_ops = { @@ -1262,9 +1237,9 @@ find_and_get_or_create_sess(const char *sessname, struct rtrs_clt_ops rtrs_ops; sess = find_or_create_sess(sessname, &first); - if (sess == ERR_PTR(-ENOMEM)) + if (sess == ERR_PTR(-ENOMEM)) { return ERR_PTR(-ENOMEM); - else if ((nr_poll_queues && !first) || (!nr_poll_queues && sess->nr_poll_queues)) { + } else if ((nr_poll_queues && !first) || (!nr_poll_queues && sess->nr_poll_queues)) { /* * A device MUST have its own session to use the polling-mode. * It must fail to map new device with the same session. @@ -1343,7 +1318,7 @@ static inline void rnbd_init_hw_queue(struct rnbd_clt_dev *dev, static void rnbd_init_mq_hw_queues(struct rnbd_clt_dev *dev) { - int i; + unsigned long i; struct blk_mq_hw_ctx *hctx; struct rnbd_queue *q; @@ -1354,13 +1329,15 @@ static void rnbd_init_mq_hw_queues(struct rnbd_clt_dev *dev) } } -static void setup_request_queue(struct rnbd_clt_dev *dev) +static void setup_request_queue(struct rnbd_clt_dev *dev, + struct rnbd_msg_open_rsp *rsp) { - blk_queue_logical_block_size(dev->queue, dev->logical_block_size); - blk_queue_physical_block_size(dev->queue, dev->physical_block_size); - blk_queue_max_hw_sectors(dev->queue, dev->max_hw_sectors); - blk_queue_max_write_same_sectors(dev->queue, - dev->max_write_same_sectors); + blk_queue_logical_block_size(dev->queue, + le16_to_cpu(rsp->logical_block_size)); + blk_queue_physical_block_size(dev->queue, + le16_to_cpu(rsp->physical_block_size)); + blk_queue_max_hw_sectors(dev->queue, + dev->sess->max_io_size / SECTOR_SIZE); /* * we don't support discards to "discontiguous" segments @@ -1368,23 +1345,27 @@ static void setup_request_queue(struct rnbd_clt_dev *dev) */ blk_queue_max_discard_segments(dev->queue, 1); - blk_queue_max_discard_sectors(dev->queue, dev->max_discard_sectors); - dev->queue->limits.discard_granularity = dev->discard_granularity; - dev->queue->limits.discard_alignment = dev->discard_alignment; - if (dev->max_discard_sectors) - blk_queue_flag_set(QUEUE_FLAG_DISCARD, dev->queue); - if (dev->secure_discard) - blk_queue_flag_set(QUEUE_FLAG_SECERASE, dev->queue); - + blk_queue_max_discard_sectors(dev->queue, + le32_to_cpu(rsp->max_discard_sectors)); + dev->queue->limits.discard_granularity = + le32_to_cpu(rsp->discard_granularity); + dev->queue->limits.discard_alignment = + le32_to_cpu(rsp->discard_alignment); + if (le16_to_cpu(rsp->secure_discard)) + blk_queue_max_secure_erase_sectors(dev->queue, + le32_to_cpu(rsp->max_discard_sectors)); blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue); blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue); - blk_queue_max_segments(dev->queue, dev->max_segments); + blk_queue_max_segments(dev->queue, dev->sess->max_segments); blk_queue_io_opt(dev->queue, dev->sess->max_io_size); blk_queue_virt_boundary(dev->queue, SZ_4K - 1); - blk_queue_write_cache(dev->queue, dev->wc, dev->fua); + blk_queue_write_cache(dev->queue, + !!(rsp->cache_policy & RNBD_WRITEBACK), + !!(rsp->cache_policy & RNBD_FUA)); } -static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx) +static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, + struct rnbd_msg_open_rsp *rsp, int idx) { int err; @@ -1396,34 +1377,34 @@ static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx) dev->gd->private_data = dev; snprintf(dev->gd->disk_name, sizeof(dev->gd->disk_name), "rnbd%d", idx); - pr_debug("disk_name=%s, capacity=%zu\n", + pr_debug("disk_name=%s, capacity=%llu\n", dev->gd->disk_name, - dev->nsectors * (dev->logical_block_size / SECTOR_SIZE) - ); + le64_to_cpu(rsp->nsectors) * + (le16_to_cpu(rsp->logical_block_size) / SECTOR_SIZE)); - set_capacity(dev->gd, dev->nsectors); + set_capacity(dev->gd, le64_to_cpu(rsp->nsectors)); - if (dev->access_mode == RNBD_ACCESS_RO) { - dev->read_only = true; + if (dev->access_mode == RNBD_ACCESS_RO) set_disk_ro(dev->gd, true); - } else { - dev->read_only = false; - } - if (!dev->rotational) - blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue); + /* + * Network device does not need rotational + */ + blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue); err = add_disk(dev->gd); if (err) - blk_cleanup_disk(dev->gd); + put_disk(dev->gd); return err; } -static int rnbd_client_setup_device(struct rnbd_clt_dev *dev) +static int rnbd_client_setup_device(struct rnbd_clt_dev *dev, + struct rnbd_msg_open_rsp *rsp) { int idx = dev->clt_device_id; - dev->size = dev->nsectors * dev->logical_block_size; + dev->size = le64_to_cpu(rsp->nsectors) * + le16_to_cpu(rsp->logical_block_size); dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, dev); if (IS_ERR(dev->gd)) @@ -1431,8 +1412,8 @@ static int rnbd_client_setup_device(struct rnbd_clt_dev *dev) dev->queue = dev->gd->queue; rnbd_init_mq_hw_queues(dev); - setup_request_queue(dev); - return rnbd_clt_setup_gen_disk(dev, idx); + setup_request_queue(dev, rsp); + return rnbd_clt_setup_gen_disk(dev, rsp, idx); } static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess, @@ -1459,10 +1440,8 @@ static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess, goto out_alloc; } - mutex_lock(&ida_lock); - ret = ida_simple_get(&index_ida, 0, 1 << (MINORBITS - RNBD_PART_BITS), - GFP_KERNEL); - mutex_unlock(&ida_lock); + ret = ida_alloc_max(&index_ida, 1 << (MINORBITS - RNBD_PART_BITS), + GFP_KERNEL); if (ret < 0) { pr_err("Failed to initialize device '%s' from session %s, allocating idr failed, err: %d\n", pathname, sess->sessname, ret); @@ -1570,7 +1549,14 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, { struct rnbd_clt_session *sess; struct rnbd_clt_dev *dev; - int ret; + int ret, errno; + struct rnbd_msg_open_rsp *rsp; + struct rnbd_msg_open msg; + struct rnbd_iu *iu; + struct kvec vec = { + .iov_base = &msg, + .iov_len = sizeof(msg) + }; if (exists_devpath(pathname, sessname)) return ERR_PTR(-EEXIST); @@ -1590,17 +1576,47 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, ret = -EEXIST; goto put_dev; } - ret = send_msg_open(dev, RTRS_PERMIT_WAIT); + + rsp = kzalloc(sizeof(*rsp), GFP_KERNEL); + if (!rsp) { + ret = -ENOMEM; + goto del_dev; + } + + iu = rnbd_get_iu(sess, RTRS_ADMIN_CON, RTRS_PERMIT_WAIT); + if (!iu) { + ret = -ENOMEM; + kfree(rsp); + goto del_dev; + } + iu->buf = rsp; + iu->dev = dev; + sg_init_one(iu->sgt.sgl, rsp, sizeof(*rsp)); + + msg.hdr.type = cpu_to_le16(RNBD_MSG_OPEN); + msg.access_mode = dev->access_mode; + strscpy(msg.dev_name, dev->pathname, sizeof(msg.dev_name)); + + WARN_ON(!rnbd_clt_get_dev(dev)); + ret = send_usr_msg(sess->rtrs, READ, iu, + &vec, sizeof(*rsp), iu->sgt.sgl, 1, + msg_open_conf, &errno, RTRS_PERMIT_WAIT); + if (ret) { + rnbd_clt_put_dev(dev); + rnbd_put_iu(sess, iu); + } else { + ret = errno; + } if (ret) { rnbd_clt_err(dev, "map_device: failed, can't open remote device, err: %d\n", ret); - goto del_dev; + goto put_iu; } mutex_lock(&dev->lock); pr_debug("Opened remote device: session=%s, path='%s'\n", sess->sessname, pathname); - ret = rnbd_client_setup_device(dev); + ret = rnbd_client_setup_device(dev, rsp); if (ret) { rnbd_clt_err(dev, "map_device: Failed to configure device, err: %d\n", @@ -1610,21 +1626,30 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, } rnbd_clt_info(dev, - "map_device: Device mapped as %s (nsectors: %zu, logical_block_size: %d, physical_block_size: %d, max_write_same_sectors: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, rotational: %d, wc: %d, fua: %d)\n", - dev->gd->disk_name, dev->nsectors, - dev->logical_block_size, dev->physical_block_size, - dev->max_write_same_sectors, dev->max_discard_sectors, - dev->discard_granularity, dev->discard_alignment, - dev->secure_discard, dev->max_segments, - dev->max_hw_sectors, dev->rotational, dev->wc, dev->fua); + "map_device: Device mapped as %s (nsectors: %llu, logical_block_size: %d, physical_block_size: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, wc: %d, fua: %d)\n", + dev->gd->disk_name, le64_to_cpu(rsp->nsectors), + le16_to_cpu(rsp->logical_block_size), + le16_to_cpu(rsp->physical_block_size), + le32_to_cpu(rsp->max_discard_sectors), + le32_to_cpu(rsp->discard_granularity), + le32_to_cpu(rsp->discard_alignment), + le16_to_cpu(rsp->secure_discard), + sess->max_segments, sess->max_io_size / SECTOR_SIZE, + !!(rsp->cache_policy & RNBD_WRITEBACK), + !!(rsp->cache_policy & RNBD_FUA)); mutex_unlock(&dev->lock); + kfree(rsp); + rnbd_put_iu(sess, iu); rnbd_clt_put_sess(sess); return dev; send_close: send_msg_close(dev, dev->device_id, RTRS_PERMIT_WAIT); +put_iu: + kfree(rsp); + rnbd_put_iu(sess, iu); del_dev: delete_dev(dev); put_dev: @@ -1638,7 +1663,7 @@ put_sess: static void destroy_gen_disk(struct rnbd_clt_dev *dev) { del_gendisk(dev->gd); - blk_cleanup_disk(dev->gd); + put_disk(dev->gd); } static void destroy_sysfs(struct rnbd_clt_dev *dev, @@ -1763,17 +1788,17 @@ static void rnbd_destroy_sessions(void) list_for_each_entry_safe(dev, tn, &sess->devs_list, list) { /* * Here unmap happens in parallel for only one reason: - * blk_cleanup_queue() takes around half a second, so + * del_gendisk() takes around half a second, so * on huge amount of devices the whole module unload * procedure takes minutes. */ INIT_WORK(&dev->unmap_on_rmmod_work, unmap_device_work); - queue_work(system_long_wq, &dev->unmap_on_rmmod_work); + queue_work(rnbd_clt_wq, &dev->unmap_on_rmmod_work); } rnbd_clt_put_sess(sess); } /* Wait for all scheduled unmap works */ - flush_workqueue(system_long_wq); + flush_workqueue(rnbd_clt_wq); WARN_ON(!list_empty(&sess_list)); } @@ -1798,6 +1823,14 @@ static int __init rnbd_client_init(void) pr_err("Failed to load module, creating sysfs device files failed, err: %d\n", err); unregister_blkdev(rnbd_client_major, "rnbd"); + return err; + } + rnbd_clt_wq = alloc_workqueue("rnbd_clt_wq", 0, 0); + if (!rnbd_clt_wq) { + pr_err("Failed to load module, alloc_workqueue failed.\n"); + rnbd_clt_destroy_sysfs_files(); + unregister_blkdev(rnbd_client_major, "rnbd"); + err = -ENOMEM; } return err; @@ -1808,6 +1841,7 @@ static void __exit rnbd_client_exit(void) rnbd_destroy_sessions(); unregister_blkdev(rnbd_client_major, "rnbd"); ida_destroy(&index_ida); + destroy_workqueue(rnbd_clt_wq); } module_init(rnbd_client_init); diff --git a/drivers/block/rnbd/rnbd-clt.h b/drivers/block/rnbd/rnbd-clt.h index 9ef8c4f306f2..a48e040abe63 100644 --- a/drivers/block/rnbd/rnbd-clt.h +++ b/drivers/block/rnbd/rnbd-clt.h @@ -75,7 +75,7 @@ struct rnbd_cpu_qlist { struct rnbd_clt_session { struct list_head list; - struct rtrs_clt *rtrs; + struct rtrs_clt_sess *rtrs; wait_queue_head_t rtrs_waitq; bool rtrs_ready; struct rnbd_cpu_qlist __percpu @@ -106,6 +106,7 @@ struct rnbd_queue { }; struct rnbd_clt_dev { + struct kobject kobj; struct rnbd_clt_session *sess; struct request_queue *queue; struct rnbd_queue *hw_queues; @@ -114,29 +115,14 @@ struct rnbd_clt_dev { u32 clt_device_id; struct mutex lock; enum rnbd_clt_dev_state dev_state; + refcount_t refcount; char *pathname; enum rnbd_access_mode access_mode; u32 nr_poll_queues; - bool read_only; - bool rotational; - bool wc; - bool fua; - u32 max_hw_sectors; - u32 max_write_same_sectors; - u32 max_discard_sectors; - u32 discard_granularity; - u32 discard_alignment; - u16 secure_discard; - u16 physical_block_size; - u16 logical_block_size; - u16 max_segments; - size_t nsectors; u64 size; /* device size in bytes */ struct list_head list; struct gendisk *gd; - struct kobject kobj; char *blk_symlink_name; - refcount_t refcount; struct work_struct unmap_on_rmmod_work; }; @@ -152,7 +138,7 @@ int rnbd_clt_unmap_device(struct rnbd_clt_dev *dev, bool force, const struct attribute *sysfs_self); int rnbd_clt_remap_device(struct rnbd_clt_dev *dev); -int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, size_t newsize); +int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, sector_t newsize); /* rnbd-clt-sysfs.c */ diff --git a/drivers/block/rnbd/rnbd-proto.h b/drivers/block/rnbd/rnbd-proto.h index de5d5a8df81d..ea7ac8bca63c 100644 --- a/drivers/block/rnbd/rnbd-proto.h +++ b/drivers/block/rnbd/rnbd-proto.h @@ -128,7 +128,7 @@ enum rnbd_cache_policy { * @logical_block_size: logical block size device supports in bytes * @max_segments: max segments hardware support in one transfer * @secure_discard: supports secure discard - * @rotation: is a rotational disc? + * @obsolete_rotational: obsolete, not in used. * @cache_policy: support write-back caching or FUA? */ struct rnbd_msg_open_rsp { @@ -144,7 +144,7 @@ struct rnbd_msg_open_rsp { __le16 logical_block_size; __le16 max_segments; __le16 secure_discard; - u8 rotational; + u8 obsolete_rotational; u8 cache_policy; u8 reserved[10]; }; @@ -229,9 +229,9 @@ static inline bool rnbd_flags_supported(u32 flags) return true; } -static inline u32 rnbd_to_bio_flags(u32 rnbd_opf) +static inline blk_opf_t rnbd_to_bio_flags(u32 rnbd_opf) { - u32 bio_opf; + blk_opf_t bio_opf; switch (rnbd_op(rnbd_opf)) { case RNBD_OP_READ: @@ -249,9 +249,6 @@ static inline u32 rnbd_to_bio_flags(u32 rnbd_opf) case RNBD_OP_SECURE_ERASE: bio_opf = REQ_OP_SECURE_ERASE; break; - case RNBD_OP_WRITE_SAME: - bio_opf = REQ_OP_WRITE_SAME; - break; default: WARN(1, "Unknown RNBD type: %d (flags %d)\n", rnbd_op(rnbd_opf), rnbd_opf); @@ -284,15 +281,13 @@ static inline u32 rq_to_rnbd_flags(struct request *rq) case REQ_OP_SECURE_ERASE: rnbd_opf = RNBD_OP_SECURE_ERASE; break; - case REQ_OP_WRITE_SAME: - rnbd_opf = RNBD_OP_WRITE_SAME; - break; case REQ_OP_FLUSH: rnbd_opf = RNBD_OP_FLUSH; break; default: WARN(1, "Unknown request type %d (flags %llu)\n", - req_op(rq), (unsigned long long)rq->cmd_flags); + (__force u32)req_op(rq), + (__force unsigned long long)rq->cmd_flags); rnbd_opf = 0; } diff --git a/drivers/block/rnbd/rnbd-srv-dev.c b/drivers/block/rnbd/rnbd-srv-dev.c deleted file mode 100644 index b241a099aeae..000000000000 --- a/drivers/block/rnbd/rnbd-srv-dev.c +++ /dev/null @@ -1,103 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * RDMA Network Block Driver - * - * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. - * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. - * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. - */ -#undef pr_fmt -#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt - -#include "rnbd-srv-dev.h" -#include "rnbd-log.h" - -struct rnbd_dev *rnbd_dev_open(const char *path, fmode_t flags, - struct bio_set *bs) -{ - struct rnbd_dev *dev; - int ret; - - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) - return ERR_PTR(-ENOMEM); - - dev->blk_open_flags = flags; - dev->bdev = blkdev_get_by_path(path, flags, THIS_MODULE); - ret = PTR_ERR_OR_ZERO(dev->bdev); - if (ret) - goto err; - - dev->blk_open_flags = flags; - bdevname(dev->bdev, dev->name); - dev->ibd_bio_set = bs; - - return dev; - -err: - kfree(dev); - return ERR_PTR(ret); -} - -void rnbd_dev_close(struct rnbd_dev *dev) -{ - blkdev_put(dev->bdev, dev->blk_open_flags); - kfree(dev); -} - -void rnbd_dev_bi_end_io(struct bio *bio) -{ - struct rnbd_dev_blk_io *io = bio->bi_private; - - rnbd_endio(io->priv, blk_status_to_errno(bio->bi_status)); - bio_put(bio); -} - -/** - * rnbd_bio_map_kern - map kernel address into bio - * @data: pointer to buffer to map - * @bs: bio_set to use. - * @len: length in bytes - * @gfp_mask: allocation flags for bio allocation - * - * Map the kernel address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *rnbd_bio_map_kern(void *data, struct bio_set *bs, - unsigned int len, gfp_t gfp_mask) -{ - unsigned long kaddr = (unsigned long)data; - unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = kaddr >> PAGE_SHIFT; - const int nr_pages = end - start; - int offset, i; - struct bio *bio; - - bio = bio_alloc_bioset(gfp_mask, nr_pages, bs); - if (!bio) - return ERR_PTR(-ENOMEM); - - offset = offset_in_page(kaddr); - for (i = 0; i < nr_pages; i++) { - unsigned int bytes = PAGE_SIZE - offset; - - if (len <= 0) - break; - - if (bytes > len) - bytes = len; - - if (bio_add_page(bio, virt_to_page(data), bytes, - offset) < bytes) { - /* we don't support partial mappings */ - bio_put(bio); - return ERR_PTR(-EINVAL); - } - - data += bytes; - len -= bytes; - offset = 0; - } - - return bio; -} diff --git a/drivers/block/rnbd/rnbd-srv-dev.h b/drivers/block/rnbd/rnbd-srv-dev.h deleted file mode 100644 index 0eb23850afb9..000000000000 --- a/drivers/block/rnbd/rnbd-srv-dev.h +++ /dev/null @@ -1,83 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * RDMA Network Block Driver - * - * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. - * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. - * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. - */ -#ifndef RNBD_SRV_DEV_H -#define RNBD_SRV_DEV_H - -#include <linux/fs.h> -#include "rnbd-proto.h" - -struct rnbd_dev { - struct block_device *bdev; - struct bio_set *ibd_bio_set; - fmode_t blk_open_flags; - char name[BDEVNAME_SIZE]; -}; - -struct rnbd_dev_blk_io { - struct rnbd_dev *dev; - void *priv; - /* have to be last member for front_pad usage of bioset_init */ - struct bio bio; -}; - -/** - * rnbd_dev_open() - Open a device - * @flags: open flags - * @bs: bio_set to use during block io, - */ -struct rnbd_dev *rnbd_dev_open(const char *path, fmode_t flags, - struct bio_set *bs); - -/** - * rnbd_dev_close() - Close a device - */ -void rnbd_dev_close(struct rnbd_dev *dev); - -void rnbd_endio(void *priv, int error); - -void rnbd_dev_bi_end_io(struct bio *bio); - -struct bio *rnbd_bio_map_kern(void *data, struct bio_set *bs, - unsigned int len, gfp_t gfp_mask); - -static inline int rnbd_dev_get_max_segs(const struct rnbd_dev *dev) -{ - return queue_max_segments(bdev_get_queue(dev->bdev)); -} - -static inline int rnbd_dev_get_max_hw_sects(const struct rnbd_dev *dev) -{ - return queue_max_hw_sectors(bdev_get_queue(dev->bdev)); -} - -static inline int rnbd_dev_get_secure_discard(const struct rnbd_dev *dev) -{ - return blk_queue_secure_erase(bdev_get_queue(dev->bdev)); -} - -static inline int rnbd_dev_get_max_discard_sects(const struct rnbd_dev *dev) -{ - if (!blk_queue_discard(bdev_get_queue(dev->bdev))) - return 0; - - return blk_queue_get_max_sectors(bdev_get_queue(dev->bdev), - REQ_OP_DISCARD); -} - -static inline int rnbd_dev_get_discard_granularity(const struct rnbd_dev *dev) -{ - return bdev_get_queue(dev->bdev)->limits.discard_granularity; -} - -static inline int rnbd_dev_get_discard_alignment(const struct rnbd_dev *dev) -{ - return bdev_get_queue(dev->bdev)->limits.discard_alignment; -} - -#endif /* RNBD_SRV_DEV_H */ diff --git a/drivers/block/rnbd/rnbd-srv-sysfs.c b/drivers/block/rnbd/rnbd-srv-sysfs.c index 4db98e0e76f0..297a6924ff4e 100644 --- a/drivers/block/rnbd/rnbd-srv-sysfs.c +++ b/drivers/block/rnbd/rnbd-srv-sysfs.c @@ -13,7 +13,6 @@ #include <linux/kobject.h> #include <linux/sysfs.h> #include <linux/stat.h> -#include <linux/genhd.h> #include <linux/list.h> #include <linux/moduleparam.h> #include <linux/device.h> @@ -39,14 +38,13 @@ static struct kobj_type dev_ktype = { }; int rnbd_srv_create_dev_sysfs(struct rnbd_srv_dev *dev, - struct block_device *bdev, - const char *dev_name) + struct block_device *bdev) { struct kobject *bdev_kobj; int ret; ret = kobject_init_and_add(&dev->dev_kobj, &dev_ktype, - rnbd_devs_kobj, dev_name); + rnbd_devs_kobj, "%pg", bdev); if (ret) { kobject_put(&dev->dev_kobj); return ret; diff --git a/drivers/block/rnbd/rnbd-srv-trace.c b/drivers/block/rnbd/rnbd-srv-trace.c new file mode 100644 index 000000000000..30f0895c18f5 --- /dev/null +++ b/drivers/block/rnbd/rnbd-srv-trace.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * RDMA Network Block Driver + * + * Copyright (c) 2022 1&1 IONOS SE. All rights reserved. + */ +#include "rtrs.h" +#include "rtrs-srv.h" +#include "rnbd-srv.h" +#include "rnbd-proto.h" + +/* + * We include this last to have the helpers above available for the trace + * event implementations. + */ +#define CREATE_TRACE_POINTS +#include "rnbd-srv-trace.h" diff --git a/drivers/block/rnbd/rnbd-srv-trace.h b/drivers/block/rnbd/rnbd-srv-trace.h new file mode 100644 index 000000000000..8dedf73bdd28 --- /dev/null +++ b/drivers/block/rnbd/rnbd-srv-trace.h @@ -0,0 +1,207 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * RDMA Network Block Driver + * + * Copyright (c) 2022 1&1 IONOS SE. All rights reserved. + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rnbd_srv + +#if !defined(_TRACE_RNBD_SRV_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_RNBD_SRV_H + +#include <linux/tracepoint.h> + +struct rnbd_srv_session; +struct rtrs_srv_op; + +DECLARE_EVENT_CLASS(rnbd_srv_link_class, + TP_PROTO(struct rnbd_srv_session *srv), + + TP_ARGS(srv), + + TP_STRUCT__entry( + __field(int, qdepth) + __string(sessname, srv->sessname) + ), + + TP_fast_assign( + __entry->qdepth = srv->queue_depth; + __assign_str(sessname, srv->sessname); + ), + + TP_printk("sessname: %s qdepth: %d", + __get_str(sessname), + __entry->qdepth + ) +); + +#define DEFINE_LINK_EVENT(name) \ +DEFINE_EVENT(rnbd_srv_link_class, name, \ + TP_PROTO(struct rnbd_srv_session *srv), \ + TP_ARGS(srv)) + +DEFINE_LINK_EVENT(create_sess); +DEFINE_LINK_EVENT(destroy_sess); + +TRACE_DEFINE_ENUM(RNBD_OP_READ); +TRACE_DEFINE_ENUM(RNBD_OP_WRITE); +TRACE_DEFINE_ENUM(RNBD_OP_FLUSH); +TRACE_DEFINE_ENUM(RNBD_OP_DISCARD); +TRACE_DEFINE_ENUM(RNBD_OP_SECURE_ERASE); +TRACE_DEFINE_ENUM(RNBD_F_SYNC); +TRACE_DEFINE_ENUM(RNBD_F_FUA); + +#define show_rnbd_rw_flags(x) \ + __print_flags(x, "|", \ + { RNBD_OP_READ, "READ" }, \ + { RNBD_OP_WRITE, "WRITE" }, \ + { RNBD_OP_FLUSH, "FLUSH" }, \ + { RNBD_OP_DISCARD, "DISCARD" }, \ + { RNBD_OP_SECURE_ERASE, "SECURE_ERASE" }, \ + { RNBD_F_SYNC, "SYNC" }, \ + { RNBD_F_FUA, "FUA" }) + +TRACE_EVENT(process_rdma, + TP_PROTO(struct rnbd_srv_session *srv, + const struct rnbd_msg_io *msg, + struct rtrs_srv_op *id, + u32 datalen, + size_t usrlen), + + TP_ARGS(srv, msg, id, datalen, usrlen), + + TP_STRUCT__entry( + __string(sessname, srv->sessname) + __field(u8, dir) + __field(u8, ver) + __field(u32, device_id) + __field(u64, sector) + __field(u32, flags) + __field(u32, bi_size) + __field(u16, ioprio) + __field(u32, datalen) + __field(size_t, usrlen) + ), + + TP_fast_assign( + __assign_str(sessname, srv->sessname); + __entry->dir = id->dir; + __entry->ver = srv->ver; + __entry->device_id = le32_to_cpu(msg->device_id); + __entry->sector = le64_to_cpu(msg->sector); + __entry->bi_size = le32_to_cpu(msg->bi_size); + __entry->flags = le32_to_cpu(msg->rw); + __entry->ioprio = le16_to_cpu(msg->prio); + __entry->datalen = datalen; + __entry->usrlen = usrlen; + ), + + TP_printk("I/O req: sess: %s, type: %s, ver: %d, devid: %u, sector: %llu, bsize: %u, flags: %s, ioprio: %d, datalen: %u, usrlen: %zu", + __get_str(sessname), + __print_symbolic(__entry->dir, + { READ, "READ" }, + { WRITE, "WRITE" }), + __entry->ver, + __entry->device_id, + __entry->sector, + __entry->bi_size, + show_rnbd_rw_flags(__entry->flags), + __entry->ioprio, + __entry->datalen, + __entry->usrlen + ) +); + +TRACE_EVENT(process_msg_sess_info, + TP_PROTO(struct rnbd_srv_session *srv, + const struct rnbd_msg_sess_info *msg), + + TP_ARGS(srv, msg), + + TP_STRUCT__entry( + __field(u8, proto_ver) + __field(u8, clt_ver) + __field(u8, srv_ver) + __string(sessname, srv->sessname) + ), + + TP_fast_assign( + __entry->proto_ver = srv->ver; + __entry->clt_ver = msg->ver; + __entry->srv_ver = RNBD_PROTO_VER_MAJOR; + __assign_str(sessname, srv->sessname); + ), + + TP_printk("Session %s using proto-ver %d (clt-ver: %d, srv-ver: %d)", + __get_str(sessname), + __entry->proto_ver, + __entry->clt_ver, + __entry->srv_ver + ) +); + +TRACE_DEFINE_ENUM(RNBD_ACCESS_RO); +TRACE_DEFINE_ENUM(RNBD_ACCESS_RW); +TRACE_DEFINE_ENUM(RNBD_ACCESS_MIGRATION); + +#define show_rnbd_access_mode(x) \ + __print_symbolic(x, \ + { RNBD_ACCESS_RO, "RO" }, \ + { RNBD_ACCESS_RW, "RW" }, \ + { RNBD_ACCESS_MIGRATION, "MIGRATION" }) + +TRACE_EVENT(process_msg_open, + TP_PROTO(struct rnbd_srv_session *srv, + const struct rnbd_msg_open *msg), + + TP_ARGS(srv, msg), + + TP_STRUCT__entry( + __field(u8, access_mode) + __string(sessname, srv->sessname) + __string(dev_name, msg->dev_name) + ), + + TP_fast_assign( + __entry->access_mode = msg->access_mode; + __assign_str(sessname, srv->sessname); + __assign_str(dev_name, msg->dev_name); + ), + + TP_printk("Open message received: session='%s' path='%s' access_mode=%s", + __get_str(sessname), + __get_str(dev_name), + show_rnbd_access_mode(__entry->access_mode) + ) +); + +TRACE_EVENT(process_msg_close, + TP_PROTO(struct rnbd_srv_session *srv, + const struct rnbd_msg_close *msg), + + TP_ARGS(srv, msg), + + TP_STRUCT__entry( + __field(u32, device_id) + __string(sessname, srv->sessname) + ), + + TP_fast_assign( + __entry->device_id = le32_to_cpu(msg->device_id); + __assign_str(sessname, srv->sessname); + ), + + TP_printk("Close message received: session='%s' device id='%d'", + __get_str(sessname), + __entry->device_id + ) +); + +#endif /* _TRACE_RNBD_SRV_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE rnbd-srv-trace +#include <trace/define_trace.h> + diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index aafecfe97055..2cfed2e58d64 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -13,7 +13,7 @@ #include <linux/blkdev.h> #include "rnbd-srv.h" -#include "rnbd-srv-dev.h" +#include "rnbd-srv-trace.h" MODULE_DESCRIPTION("RDMA Network Block Device Server"); MODULE_LICENSE("GPL"); @@ -84,18 +84,6 @@ static inline void rnbd_put_sess_dev(struct rnbd_srv_sess_dev *sess_dev) kref_put(&sess_dev->kref, rnbd_sess_dev_release); } -void rnbd_endio(void *priv, int error) -{ - struct rnbd_io_private *rnbd_priv = priv; - struct rnbd_srv_sess_dev *sess_dev = rnbd_priv->sess_dev; - - rnbd_put_sess_dev(sess_dev); - - rtrs_srv_resp_rdma(rnbd_priv->id, error); - - kfree(priv); -} - static struct rnbd_srv_sess_dev * rnbd_get_sess_dev(int dev_id, struct rnbd_srv_session *srv_sess) { @@ -114,6 +102,18 @@ rnbd_get_sess_dev(int dev_id, struct rnbd_srv_session *srv_sess) return sess_dev; } +static void rnbd_dev_bi_end_io(struct bio *bio) +{ + struct rnbd_io_private *rnbd_priv = bio->bi_private; + struct rnbd_srv_sess_dev *sess_dev = rnbd_priv->sess_dev; + + rnbd_put_sess_dev(sess_dev); + rtrs_srv_resp_rdma(rnbd_priv->id, blk_status_to_errno(bio->bi_status)); + + kfree(rnbd_priv); + bio_put(bio); +} + static int process_rdma(struct rnbd_srv_session *srv_sess, struct rtrs_srv_op *id, void *data, u32 datalen, const void *usr, size_t usrlen) @@ -123,10 +123,11 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, struct rnbd_srv_sess_dev *sess_dev; u32 dev_id; int err; - struct rnbd_dev_blk_io *io; struct bio *bio; short prio; + trace_process_rdma(srv_sess, msg, id, datalen, usrlen); + priv = kmalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; @@ -144,33 +145,29 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, priv->sess_dev = sess_dev; priv->id = id; - /* Generate bio with pages pointing to the rdma buffer */ - bio = rnbd_bio_map_kern(data, sess_dev->rnbd_dev->ibd_bio_set, datalen, GFP_KERNEL); - if (IS_ERR(bio)) { - err = PTR_ERR(bio); - rnbd_srv_err(sess_dev, "Failed to generate bio, err: %d\n", err); - goto sess_dev_put; + bio = bio_alloc(sess_dev->bdev, 1, + rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL); + if (bio_add_page(bio, virt_to_page(data), datalen, + offset_in_page(data)) != datalen) { + rnbd_srv_err(sess_dev, "Failed to map data to bio\n"); + err = -EINVAL; + goto bio_put; } - io = container_of(bio, struct rnbd_dev_blk_io, bio); - io->dev = sess_dev->rnbd_dev; - io->priv = priv; - bio->bi_end_io = rnbd_dev_bi_end_io; - bio->bi_private = io; - bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw)); + bio->bi_private = priv; bio->bi_iter.bi_sector = le64_to_cpu(msg->sector); bio->bi_iter.bi_size = le32_to_cpu(msg->bi_size); prio = srv_sess->ver < RNBD_PROTO_VER_MAJOR || usrlen < sizeof(*msg) ? 0 : le16_to_cpu(msg->prio); bio_set_prio(bio, prio); - bio_set_dev(bio, sess_dev->rnbd_dev->bdev); submit_bio(bio); return 0; -sess_dev_put: +bio_put: + bio_put(bio); rnbd_put_sess_dev(sess_dev); err: kfree(priv); @@ -222,8 +219,7 @@ void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id) rnbd_put_sess_dev(sess_dev); wait_for_completion(&dc); /* wait for inflights to drop to zero */ - rnbd_dev_close(sess_dev->rnbd_dev); - list_del(&sess_dev->sess_list); + blkdev_put(sess_dev->bdev, sess_dev->open_flags); mutex_lock(&sess_dev->dev->lock); list_del(&sess_dev->dev_list); if (sess_dev->open_flags & FMODE_WRITE) @@ -238,20 +234,21 @@ void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id) static void destroy_sess(struct rnbd_srv_session *srv_sess) { - struct rnbd_srv_sess_dev *sess_dev, *tmp; + struct rnbd_srv_sess_dev *sess_dev; + unsigned long index; - if (list_empty(&srv_sess->sess_dev_list)) + if (xa_empty(&srv_sess->index_idr)) goto out; + trace_destroy_sess(srv_sess); + mutex_lock(&srv_sess->lock); - list_for_each_entry_safe(sess_dev, tmp, &srv_sess->sess_dev_list, - sess_list) + xa_for_each(&srv_sess->index_idr, index, sess_dev) rnbd_srv_destroy_dev_session_sysfs(sess_dev); mutex_unlock(&srv_sess->lock); out: xa_destroy(&srv_sess->index_idr); - bioset_exit(&srv_sess->sess_bio_set); pr_info("RTRS Session %s disconnected\n", srv_sess->sessname); @@ -263,15 +260,15 @@ out: kfree(srv_sess); } -static int create_sess(struct rtrs_srv *rtrs) +static int create_sess(struct rtrs_srv_sess *rtrs) { struct rnbd_srv_session *srv_sess; - char sessname[NAME_MAX]; + char pathname[NAME_MAX]; int err; - err = rtrs_srv_get_sess_name(rtrs, sessname, sizeof(sessname)); + err = rtrs_srv_get_path_name(rtrs, pathname, sizeof(pathname)); if (err) { - pr_err("rtrs_srv_get_sess_name(%s): %d\n", sessname, err); + pr_err("rtrs_srv_get_path_name(%s): %d\n", pathname, err); return err; } @@ -280,32 +277,23 @@ static int create_sess(struct rtrs_srv *rtrs) return -ENOMEM; srv_sess->queue_depth = rtrs_srv_get_queue_depth(rtrs); - err = bioset_init(&srv_sess->sess_bio_set, srv_sess->queue_depth, - offsetof(struct rnbd_dev_blk_io, bio), - BIOSET_NEED_BVECS); - if (err) { - pr_err("Allocating srv_session for session %s failed\n", - sessname); - kfree(srv_sess); - return err; - } - xa_init_flags(&srv_sess->index_idr, XA_FLAGS_ALLOC); - INIT_LIST_HEAD(&srv_sess->sess_dev_list); mutex_init(&srv_sess->lock); mutex_lock(&sess_lock); list_add(&srv_sess->list, &sess_list); mutex_unlock(&sess_lock); srv_sess->rtrs = rtrs; - strscpy(srv_sess->sessname, sessname, sizeof(srv_sess->sessname)); + strscpy(srv_sess->sessname, pathname, sizeof(srv_sess->sessname)); rtrs_srv_set_sess_priv(rtrs, srv_sess); + trace_create_sess(srv_sess); + return 0; } -static int rnbd_srv_link_ev(struct rtrs_srv *rtrs, +static int rnbd_srv_link_ev(struct rtrs_srv_sess *rtrs, enum rtrs_srv_link_ev ev, void *priv) { struct rnbd_srv_session *srv_sess = priv; @@ -333,33 +321,35 @@ void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev, { struct rnbd_srv_session *sess = sess_dev->sess; - sess_dev->keep_id = true; /* It is already started to close by client's close message. */ if (!mutex_trylock(&sess->lock)) return; + + sess_dev->keep_id = true; /* first remove sysfs itself to avoid deadlock */ sysfs_remove_file_self(&sess_dev->kobj, &attr->attr); rnbd_srv_destroy_dev_session_sysfs(sess_dev); mutex_unlock(&sess->lock); } -static int process_msg_close(struct rnbd_srv_session *srv_sess, +static void process_msg_close(struct rnbd_srv_session *srv_sess, void *data, size_t datalen, const void *usr, size_t usrlen) { const struct rnbd_msg_close *close_msg = usr; struct rnbd_srv_sess_dev *sess_dev; + trace_process_msg_close(srv_sess, close_msg); + sess_dev = rnbd_get_sess_dev(le32_to_cpu(close_msg->device_id), srv_sess); if (IS_ERR(sess_dev)) - return 0; + return; rnbd_put_sess_dev(sess_dev); mutex_lock(&srv_sess->lock); rnbd_srv_destroy_dev_session_sysfs(sess_dev); mutex_unlock(&srv_sess->lock); - return 0; } static int process_msg_open(struct rnbd_srv_session *srv_sess, @@ -370,10 +360,9 @@ static int process_msg_sess_info(struct rnbd_srv_session *srv_sess, const void *msg, size_t len, void *data, size_t datalen); -static int rnbd_srv_rdma_ev(void *priv, - struct rtrs_srv_op *id, int dir, - void *data, size_t datalen, const void *usr, - size_t usrlen) +static int rnbd_srv_rdma_ev(void *priv, struct rtrs_srv_op *id, + void *data, size_t datalen, + const void *usr, size_t usrlen) { struct rnbd_srv_session *srv_sess = priv; const struct rnbd_msg_hdr *hdr = usr; @@ -389,7 +378,7 @@ static int rnbd_srv_rdma_ev(void *priv, case RNBD_MSG_IO: return process_rdma(srv_sess, id, data, datalen, usr, usrlen); case RNBD_MSG_CLOSE: - ret = process_msg_close(srv_sess, data, datalen, usr, usrlen); + process_msg_close(srv_sess, data, datalen, usr, usrlen); break; case RNBD_MSG_OPEN: ret = process_msg_open(srv_sess, usr, usrlen, data, datalen); @@ -399,11 +388,16 @@ static int rnbd_srv_rdma_ev(void *priv, datalen); break; default: - pr_warn("Received unexpected message type %d with dir %d from session %s\n", - type, dir, srv_sess->sessname); + pr_warn("Received unexpected message type %d from session %s\n", + type, srv_sess->sessname); return -EINVAL; } + /* + * Since ret is passed to rtrs to handle the failure case, we + * just return 0 at the end otherwise callers in rtrs would call + * send_io_resp_imm again to print redundant err message. + */ rtrs_srv_resp_rdma(id, ret); return 0; } @@ -429,7 +423,7 @@ static struct rnbd_srv_sess_dev return sess_dev; } -static struct rnbd_srv_dev *rnbd_srv_init_srv_dev(const char *id) +static struct rnbd_srv_dev *rnbd_srv_init_srv_dev(struct block_device *bdev) { struct rnbd_srv_dev *dev; @@ -437,7 +431,7 @@ static struct rnbd_srv_dev *rnbd_srv_init_srv_dev(const char *id) if (!dev) return ERR_PTR(-ENOMEM); - strscpy(dev->id, id, sizeof(dev->id)); + snprintf(dev->id, sizeof(dev->id), "%pg", bdev); kref_init(&dev->kref); INIT_LIST_HEAD(&dev->sess_dev_list); mutex_init(&dev->lock); @@ -515,14 +509,14 @@ static int rnbd_srv_check_update_open_perm(struct rnbd_srv_dev *srv_dev, } static struct rnbd_srv_dev * -rnbd_srv_get_or_create_srv_dev(struct rnbd_dev *rnbd_dev, +rnbd_srv_get_or_create_srv_dev(struct block_device *bdev, struct rnbd_srv_session *srv_sess, enum rnbd_access_mode access_mode) { int ret; struct rnbd_srv_dev *new_dev, *dev; - new_dev = rnbd_srv_init_srv_dev(rnbd_dev->name); + new_dev = rnbd_srv_init_srv_dev(bdev); if (IS_ERR(new_dev)) return new_dev; @@ -542,44 +536,32 @@ rnbd_srv_get_or_create_srv_dev(struct rnbd_dev *rnbd_dev, static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp, struct rnbd_srv_sess_dev *sess_dev) { - struct rnbd_dev *rnbd_dev = sess_dev->rnbd_dev; - struct request_queue *q = bdev_get_queue(rnbd_dev->bdev); + struct block_device *bdev = sess_dev->bdev; rsp->hdr.type = cpu_to_le16(RNBD_MSG_OPEN_RSP); - rsp->device_id = - cpu_to_le32(sess_dev->device_id); - rsp->nsectors = - cpu_to_le64(get_capacity(rnbd_dev->bdev->bd_disk)); - rsp->logical_block_size = - cpu_to_le16(bdev_logical_block_size(rnbd_dev->bdev)); - rsp->physical_block_size = - cpu_to_le16(bdev_physical_block_size(rnbd_dev->bdev)); - rsp->max_segments = - cpu_to_le16(rnbd_dev_get_max_segs(rnbd_dev)); + rsp->device_id = cpu_to_le32(sess_dev->device_id); + rsp->nsectors = cpu_to_le64(bdev_nr_sectors(bdev)); + rsp->logical_block_size = cpu_to_le16(bdev_logical_block_size(bdev)); + rsp->physical_block_size = cpu_to_le16(bdev_physical_block_size(bdev)); + rsp->max_segments = cpu_to_le16(bdev_max_segments(bdev)); rsp->max_hw_sectors = - cpu_to_le32(rnbd_dev_get_max_hw_sects(rnbd_dev)); - rsp->max_write_same_sectors = - cpu_to_le32(bdev_write_same(rnbd_dev->bdev)); - rsp->max_discard_sectors = - cpu_to_le32(rnbd_dev_get_max_discard_sects(rnbd_dev)); - rsp->discard_granularity = - cpu_to_le32(rnbd_dev_get_discard_granularity(rnbd_dev)); - rsp->discard_alignment = - cpu_to_le32(rnbd_dev_get_discard_alignment(rnbd_dev)); - rsp->secure_discard = - cpu_to_le16(rnbd_dev_get_secure_discard(rnbd_dev)); - rsp->rotational = !blk_queue_nonrot(q); + cpu_to_le32(queue_max_hw_sectors(bdev_get_queue(bdev))); + rsp->max_write_same_sectors = 0; + rsp->max_discard_sectors = cpu_to_le32(bdev_max_discard_sectors(bdev)); + rsp->discard_granularity = cpu_to_le32(bdev_discard_granularity(bdev)); + rsp->discard_alignment = cpu_to_le32(bdev_discard_alignment(bdev)); + rsp->secure_discard = cpu_to_le16(bdev_max_secure_erase_sectors(bdev)); rsp->cache_policy = 0; - if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) + if (bdev_write_cache(bdev)) rsp->cache_policy |= RNBD_WRITEBACK; - if (blk_queue_fua(q)) + if (bdev_fua(bdev)) rsp->cache_policy |= RNBD_FUA; } static struct rnbd_srv_sess_dev * rnbd_srv_create_set_sess_dev(struct rnbd_srv_session *srv_sess, const struct rnbd_msg_open *open_msg, - struct rnbd_dev *rnbd_dev, fmode_t open_flags, + struct block_device *bdev, fmode_t open_flags, struct rnbd_srv_dev *srv_dev) { struct rnbd_srv_sess_dev *sdev = rnbd_sess_dev_alloc(srv_sess); @@ -591,7 +573,7 @@ rnbd_srv_create_set_sess_dev(struct rnbd_srv_session *srv_sess, strscpy(sdev->pathname, open_msg->dev_name, sizeof(sdev->pathname)); - sdev->rnbd_dev = rnbd_dev; + sdev->bdev = bdev; sdev->sess = srv_sess; sdev->dev = srv_dev; sdev->open_flags = open_flags; @@ -657,9 +639,8 @@ static int process_msg_sess_info(struct rnbd_srv_session *srv_sess, struct rnbd_msg_sess_info_rsp *rsp = data; srv_sess->ver = min_t(u8, sess_info_msg->ver, RNBD_PROTO_VER_MAJOR); - pr_debug("Session %s using protocol version %d (client version: %d, server version: %d)\n", - srv_sess->sessname, srv_sess->ver, - sess_info_msg->ver, RNBD_PROTO_VER_MAJOR); + + trace_process_msg_sess_info(srv_sess, sess_info_msg); rsp->hdr.type = cpu_to_le16(RNBD_MSG_SESS_INFO_RSP); rsp->ver = srv_sess->ver; @@ -679,11 +660,12 @@ static struct rnbd_srv_sess_dev * find_srv_sess_dev(struct rnbd_srv_session *srv_sess, const char *dev_name) { struct rnbd_srv_sess_dev *sess_dev; + unsigned long index; - if (list_empty(&srv_sess->sess_dev_list)) + if (xa_empty(&srv_sess->index_idr)) return NULL; - list_for_each_entry(sess_dev, &srv_sess->sess_dev_list, sess_list) + xa_for_each(&srv_sess->index_idr, index, sess_dev) if (!strcmp(sess_dev->pathname, dev_name)) return sess_dev; @@ -698,14 +680,13 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, struct rnbd_srv_dev *srv_dev; struct rnbd_srv_sess_dev *srv_sess_dev; const struct rnbd_msg_open *open_msg = msg; + struct block_device *bdev; fmode_t open_flags; char *full_path; - struct rnbd_dev *rnbd_dev; struct rnbd_msg_open_rsp *rsp = data; - pr_debug("Open message received: session='%s' path='%s' access_mode=%d\n", - srv_sess->sessname, open_msg->dev_name, - open_msg->access_mode); + trace_process_msg_open(srv_sess, open_msg); + open_flags = FMODE_READ; if (open_msg->access_mode != RNBD_ACCESS_RO) open_flags |= FMODE_WRITE; @@ -738,26 +719,25 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, goto reject; } - rnbd_dev = rnbd_dev_open(full_path, open_flags, - &srv_sess->sess_bio_set); - if (IS_ERR(rnbd_dev)) { - pr_err("Opening device '%s' on session %s failed, failed to open the block device, err: %ld\n", - full_path, srv_sess->sessname, PTR_ERR(rnbd_dev)); - ret = PTR_ERR(rnbd_dev); + bdev = blkdev_get_by_path(full_path, open_flags, THIS_MODULE); + if (IS_ERR(bdev)) { + ret = PTR_ERR(bdev); + pr_err("Opening device '%s' on session %s failed, failed to open the block device, err: %d\n", + full_path, srv_sess->sessname, ret); goto free_path; } - srv_dev = rnbd_srv_get_or_create_srv_dev(rnbd_dev, srv_sess, + srv_dev = rnbd_srv_get_or_create_srv_dev(bdev, srv_sess, open_msg->access_mode); if (IS_ERR(srv_dev)) { pr_err("Opening device '%s' on session %s failed, creating srv_dev failed, err: %ld\n", full_path, srv_sess->sessname, PTR_ERR(srv_dev)); ret = PTR_ERR(srv_dev); - goto rnbd_dev_close; + goto blkdev_put; } srv_sess_dev = rnbd_srv_create_set_sess_dev(srv_sess, open_msg, - rnbd_dev, open_flags, + bdev, open_flags, srv_dev); if (IS_ERR(srv_sess_dev)) { pr_err("Opening device '%s' on session %s failed, creating sess_dev failed, err: %ld\n", @@ -772,8 +752,7 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, */ mutex_lock(&srv_dev->lock); if (!srv_dev->dev_kobj.state_in_sysfs) { - ret = rnbd_srv_create_dev_sysfs(srv_dev, rnbd_dev->bdev, - rnbd_dev->name); + ret = rnbd_srv_create_dev_sysfs(srv_dev, bdev); if (ret) { mutex_unlock(&srv_dev->lock); rnbd_srv_err(srv_sess_dev, @@ -795,8 +774,6 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, list_add(&srv_sess_dev->dev_list, &srv_dev->sess_dev_list); mutex_unlock(&srv_dev->lock); - list_add(&srv_sess_dev->sess_list, &srv_sess->sess_dev_list); - rnbd_srv_info(srv_sess_dev, "Opened device '%s'\n", srv_dev->id); kfree(full_path); @@ -817,8 +794,8 @@ srv_dev_put: mutex_unlock(&srv_dev->lock); } rnbd_put_srv_dev(srv_dev); -rnbd_dev_close: - rnbd_dev_close(rnbd_dev); +blkdev_put: + blkdev_put(bdev, open_flags); free_path: kfree(full_path); reject: diff --git a/drivers/block/rnbd/rnbd-srv.h b/drivers/block/rnbd/rnbd-srv.h index 98ddc31eb408..f5962fd31d62 100644 --- a/drivers/block/rnbd/rnbd-srv.h +++ b/drivers/block/rnbd/rnbd-srv.h @@ -20,14 +20,11 @@ struct rnbd_srv_session { /* Entry inside global sess_list */ struct list_head list; - struct rtrs_srv *rtrs; + struct rtrs_srv_sess *rtrs; char sessname[NAME_MAX]; int queue_depth; - struct bio_set sess_bio_set; struct xarray index_idr; - /* List of struct rnbd_srv_sess_dev */ - struct list_head sess_dev_list; struct mutex lock; u8 ver; }; @@ -49,9 +46,7 @@ struct rnbd_srv_dev { struct rnbd_srv_sess_dev { /* Entry inside rnbd_srv_dev struct */ struct list_head dev_list; - /* Entry inside rnbd_srv_session struct */ - struct list_head sess_list; - struct rnbd_dev *rnbd_dev; + struct block_device *bdev; struct rnbd_srv_session *sess; struct rnbd_srv_dev *dev; struct kobject kobj; @@ -69,8 +64,7 @@ void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev, /* rnbd-srv-sysfs.c */ int rnbd_srv_create_dev_sysfs(struct rnbd_srv_dev *dev, - struct block_device *bdev, - const char *dir_name); + struct block_device *bdev); void rnbd_srv_destroy_dev_sysfs(struct rnbd_srv_dev *dev); int rnbd_srv_create_dev_session_sysfs(struct rnbd_srv_sess_dev *sess_dev); void rnbd_srv_destroy_dev_session_sysfs(struct rnbd_srv_sess_dev *sess_dev); diff --git a/drivers/block/rsxx/Makefile b/drivers/block/rsxx/Makefile deleted file mode 100644 index 7ef158099d33..000000000000 --- a/drivers/block/rsxx/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_BLK_DEV_RSXX) += rsxx.o -rsxx-objs := config.o core.o cregs.o dev.o dma.o diff --git a/drivers/block/rsxx/config.c b/drivers/block/rsxx/config.c deleted file mode 100644 index 11ed1d9646b9..000000000000 --- a/drivers/block/rsxx/config.c +++ /dev/null @@ -1,197 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* -* Filename: config.c -* -* Authors: Joshua Morris <josh.h.morris@us.ibm.com> -* Philip Kelleher <pjk1939@linux.vnet.ibm.com> -* -* (C) Copyright 2013 IBM Corporation -*/ - -#include <linux/types.h> -#include <linux/crc32.h> -#include <linux/swab.h> - -#include "rsxx_priv.h" -#include "rsxx_cfg.h" - -static void initialize_config(struct rsxx_card_cfg *cfg) -{ - cfg->hdr.version = RSXX_CFG_VERSION; - - cfg->data.block_size = RSXX_HW_BLK_SIZE; - cfg->data.stripe_size = RSXX_HW_BLK_SIZE; - cfg->data.vendor_id = RSXX_VENDOR_ID_IBM; - cfg->data.cache_order = (-1); - cfg->data.intr_coal.mode = RSXX_INTR_COAL_DISABLED; - cfg->data.intr_coal.count = 0; - cfg->data.intr_coal.latency = 0; -} - -static u32 config_data_crc32(struct rsxx_card_cfg *cfg) -{ - /* - * Return the compliment of the CRC to ensure compatibility - * (i.e. this is how early rsxx drivers did it.) - */ - - return ~crc32(~0, &cfg->data, sizeof(cfg->data)); -} - - -/*----------------- Config Byte Swap Functions -------------------*/ -static void config_hdr_be_to_cpu(struct card_cfg_hdr *hdr) -{ - hdr->version = be32_to_cpu((__force __be32) hdr->version); - hdr->crc = be32_to_cpu((__force __be32) hdr->crc); -} - -static void config_hdr_cpu_to_be(struct card_cfg_hdr *hdr) -{ - hdr->version = (__force u32) cpu_to_be32(hdr->version); - hdr->crc = (__force u32) cpu_to_be32(hdr->crc); -} - -static void config_data_swab(struct rsxx_card_cfg *cfg) -{ - u32 *data = (u32 *) &cfg->data; - int i; - - for (i = 0; i < (sizeof(cfg->data) / 4); i++) - data[i] = swab32(data[i]); -} - -static void config_data_le_to_cpu(struct rsxx_card_cfg *cfg) -{ - u32 *data = (u32 *) &cfg->data; - int i; - - for (i = 0; i < (sizeof(cfg->data) / 4); i++) - data[i] = le32_to_cpu((__force __le32) data[i]); -} - -static void config_data_cpu_to_le(struct rsxx_card_cfg *cfg) -{ - u32 *data = (u32 *) &cfg->data; - int i; - - for (i = 0; i < (sizeof(cfg->data) / 4); i++) - data[i] = (__force u32) cpu_to_le32(data[i]); -} - - -/*----------------- Config Operations ------------------*/ -static int rsxx_save_config(struct rsxx_cardinfo *card) -{ - struct rsxx_card_cfg cfg; - int st; - - memcpy(&cfg, &card->config, sizeof(cfg)); - - if (unlikely(cfg.hdr.version != RSXX_CFG_VERSION)) { - dev_err(CARD_TO_DEV(card), - "Cannot save config with invalid version %d\n", - cfg.hdr.version); - return -EINVAL; - } - - /* Convert data to little endian for the CRC calculation. */ - config_data_cpu_to_le(&cfg); - - cfg.hdr.crc = config_data_crc32(&cfg); - - /* - * Swap the data from little endian to big endian so it can be - * stored. - */ - config_data_swab(&cfg); - config_hdr_cpu_to_be(&cfg.hdr); - - st = rsxx_creg_write(card, CREG_ADD_CONFIG, sizeof(cfg), &cfg, 1); - if (st) - return st; - - return 0; -} - -int rsxx_load_config(struct rsxx_cardinfo *card) -{ - int st; - u32 crc; - - st = rsxx_creg_read(card, CREG_ADD_CONFIG, sizeof(card->config), - &card->config, 1); - if (st) { - dev_err(CARD_TO_DEV(card), - "Failed reading card config.\n"); - return st; - } - - config_hdr_be_to_cpu(&card->config.hdr); - - if (card->config.hdr.version == RSXX_CFG_VERSION) { - /* - * We calculate the CRC with the data in little endian, because - * early drivers did not take big endian CPUs into account. - * The data is always stored in big endian, so we need to byte - * swap it before calculating the CRC. - */ - - config_data_swab(&card->config); - - /* Check the CRC */ - crc = config_data_crc32(&card->config); - if (crc != card->config.hdr.crc) { - dev_err(CARD_TO_DEV(card), - "Config corruption detected!\n"); - dev_info(CARD_TO_DEV(card), - "CRC (sb x%08x is x%08x)\n", - card->config.hdr.crc, crc); - return -EIO; - } - - /* Convert the data to CPU byteorder */ - config_data_le_to_cpu(&card->config); - - } else if (card->config.hdr.version != 0) { - dev_err(CARD_TO_DEV(card), - "Invalid config version %d.\n", - card->config.hdr.version); - /* - * Config version changes require special handling from the - * user - */ - return -EINVAL; - } else { - dev_info(CARD_TO_DEV(card), - "Initializing card configuration.\n"); - initialize_config(&card->config); - st = rsxx_save_config(card); - if (st) - return st; - } - - card->config_valid = 1; - - dev_dbg(CARD_TO_DEV(card), "version: x%08x\n", - card->config.hdr.version); - dev_dbg(CARD_TO_DEV(card), "crc: x%08x\n", - card->config.hdr.crc); - dev_dbg(CARD_TO_DEV(card), "block_size: x%08x\n", - card->config.data.block_size); - dev_dbg(CARD_TO_DEV(card), "stripe_size: x%08x\n", - card->config.data.stripe_size); - dev_dbg(CARD_TO_DEV(card), "vendor_id: x%08x\n", - card->config.data.vendor_id); - dev_dbg(CARD_TO_DEV(card), "cache_order: x%08x\n", - card->config.data.cache_order); - dev_dbg(CARD_TO_DEV(card), "mode: x%08x\n", - card->config.data.intr_coal.mode); - dev_dbg(CARD_TO_DEV(card), "count: x%08x\n", - card->config.data.intr_coal.count); - dev_dbg(CARD_TO_DEV(card), "latency: x%08x\n", - card->config.data.intr_coal.latency); - - return 0; -} - diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c deleted file mode 100644 index 8d9d69f5dfbc..000000000000 --- a/drivers/block/rsxx/core.c +++ /dev/null @@ -1,1126 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* -* Filename: core.c -* -* Authors: Joshua Morris <josh.h.morris@us.ibm.com> -* Philip Kelleher <pjk1939@linux.vnet.ibm.com> -* -* (C) Copyright 2013 IBM Corporation -*/ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/module.h> -#include <linux/pci.h> -#include <linux/reboot.h> -#include <linux/slab.h> -#include <linux/bitops.h> -#include <linux/delay.h> -#include <linux/debugfs.h> -#include <linux/seq_file.h> - -#include <linux/genhd.h> -#include <linux/idr.h> - -#include "rsxx_priv.h" -#include "rsxx_cfg.h" - -#define NO_LEGACY 0 -#define SYNC_START_TIMEOUT (10 * 60) /* 10 minutes */ - -MODULE_DESCRIPTION("IBM Flash Adapter 900GB Full Height Device Driver"); -MODULE_AUTHOR("Joshua Morris/Philip Kelleher, IBM"); -MODULE_LICENSE("GPL"); -MODULE_VERSION(DRIVER_VERSION); - -static unsigned int force_legacy = NO_LEGACY; -module_param(force_legacy, uint, 0444); -MODULE_PARM_DESC(force_legacy, "Force the use of legacy type PCI interrupts"); - -static unsigned int sync_start = 1; -module_param(sync_start, uint, 0444); -MODULE_PARM_DESC(sync_start, "On by Default: Driver load will not complete " - "until the card startup has completed."); - -static DEFINE_IDA(rsxx_disk_ida); - -/* --------------------Debugfs Setup ------------------- */ - -static int rsxx_attr_pci_regs_show(struct seq_file *m, void *p) -{ - struct rsxx_cardinfo *card = m->private; - - seq_printf(m, "HWID 0x%08x\n", - ioread32(card->regmap + HWID)); - seq_printf(m, "SCRATCH 0x%08x\n", - ioread32(card->regmap + SCRATCH)); - seq_printf(m, "IER 0x%08x\n", - ioread32(card->regmap + IER)); - seq_printf(m, "IPR 0x%08x\n", - ioread32(card->regmap + IPR)); - seq_printf(m, "CREG_CMD 0x%08x\n", - ioread32(card->regmap + CREG_CMD)); - seq_printf(m, "CREG_ADD 0x%08x\n", - ioread32(card->regmap + CREG_ADD)); - seq_printf(m, "CREG_CNT 0x%08x\n", - ioread32(card->regmap + CREG_CNT)); - seq_printf(m, "CREG_STAT 0x%08x\n", - ioread32(card->regmap + CREG_STAT)); - seq_printf(m, "CREG_DATA0 0x%08x\n", - ioread32(card->regmap + CREG_DATA0)); - seq_printf(m, "CREG_DATA1 0x%08x\n", - ioread32(card->regmap + CREG_DATA1)); - seq_printf(m, "CREG_DATA2 0x%08x\n", - ioread32(card->regmap + CREG_DATA2)); - seq_printf(m, "CREG_DATA3 0x%08x\n", - ioread32(card->regmap + CREG_DATA3)); - seq_printf(m, "CREG_DATA4 0x%08x\n", - ioread32(card->regmap + CREG_DATA4)); - seq_printf(m, "CREG_DATA5 0x%08x\n", - ioread32(card->regmap + CREG_DATA5)); - seq_printf(m, "CREG_DATA6 0x%08x\n", - ioread32(card->regmap + CREG_DATA6)); - seq_printf(m, "CREG_DATA7 0x%08x\n", - ioread32(card->regmap + CREG_DATA7)); - seq_printf(m, "INTR_COAL 0x%08x\n", - ioread32(card->regmap + INTR_COAL)); - seq_printf(m, "HW_ERROR 0x%08x\n", - ioread32(card->regmap + HW_ERROR)); - seq_printf(m, "DEBUG0 0x%08x\n", - ioread32(card->regmap + PCI_DEBUG0)); - seq_printf(m, "DEBUG1 0x%08x\n", - ioread32(card->regmap + PCI_DEBUG1)); - seq_printf(m, "DEBUG2 0x%08x\n", - ioread32(card->regmap + PCI_DEBUG2)); - seq_printf(m, "DEBUG3 0x%08x\n", - ioread32(card->regmap + PCI_DEBUG3)); - seq_printf(m, "DEBUG4 0x%08x\n", - ioread32(card->regmap + PCI_DEBUG4)); - seq_printf(m, "DEBUG5 0x%08x\n", - ioread32(card->regmap + PCI_DEBUG5)); - seq_printf(m, "DEBUG6 0x%08x\n", - ioread32(card->regmap + PCI_DEBUG6)); - seq_printf(m, "DEBUG7 0x%08x\n", - ioread32(card->regmap + PCI_DEBUG7)); - seq_printf(m, "RECONFIG 0x%08x\n", - ioread32(card->regmap + PCI_RECONFIG)); - - return 0; -} - -static int rsxx_attr_stats_show(struct seq_file *m, void *p) -{ - struct rsxx_cardinfo *card = m->private; - int i; - - for (i = 0; i < card->n_targets; i++) { - seq_printf(m, "Ctrl %d CRC Errors = %d\n", - i, card->ctrl[i].stats.crc_errors); - seq_printf(m, "Ctrl %d Hard Errors = %d\n", - i, card->ctrl[i].stats.hard_errors); - seq_printf(m, "Ctrl %d Soft Errors = %d\n", - i, card->ctrl[i].stats.soft_errors); - seq_printf(m, "Ctrl %d Writes Issued = %d\n", - i, card->ctrl[i].stats.writes_issued); - seq_printf(m, "Ctrl %d Writes Failed = %d\n", - i, card->ctrl[i].stats.writes_failed); - seq_printf(m, "Ctrl %d Reads Issued = %d\n", - i, card->ctrl[i].stats.reads_issued); - seq_printf(m, "Ctrl %d Reads Failed = %d\n", - i, card->ctrl[i].stats.reads_failed); - seq_printf(m, "Ctrl %d Reads Retried = %d\n", - i, card->ctrl[i].stats.reads_retried); - seq_printf(m, "Ctrl %d Discards Issued = %d\n", - i, card->ctrl[i].stats.discards_issued); - seq_printf(m, "Ctrl %d Discards Failed = %d\n", - i, card->ctrl[i].stats.discards_failed); - seq_printf(m, "Ctrl %d DMA SW Errors = %d\n", - i, card->ctrl[i].stats.dma_sw_err); - seq_printf(m, "Ctrl %d DMA HW Faults = %d\n", - i, card->ctrl[i].stats.dma_hw_fault); - seq_printf(m, "Ctrl %d DMAs Cancelled = %d\n", - i, card->ctrl[i].stats.dma_cancelled); - seq_printf(m, "Ctrl %d SW Queue Depth = %d\n", - i, card->ctrl[i].stats.sw_q_depth); - seq_printf(m, "Ctrl %d HW Queue Depth = %d\n", - i, atomic_read(&card->ctrl[i].stats.hw_q_depth)); - } - - return 0; -} - -static int rsxx_attr_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, rsxx_attr_stats_show, inode->i_private); -} - -static int rsxx_attr_pci_regs_open(struct inode *inode, struct file *file) -{ - return single_open(file, rsxx_attr_pci_regs_show, inode->i_private); -} - -static ssize_t rsxx_cram_read(struct file *fp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - struct rsxx_cardinfo *card = file_inode(fp)->i_private; - char *buf; - int st; - - buf = kzalloc(cnt, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - st = rsxx_creg_read(card, CREG_ADD_CRAM + (u32)*ppos, cnt, buf, 1); - if (!st) { - if (copy_to_user(ubuf, buf, cnt)) - st = -EFAULT; - } - kfree(buf); - if (st) - return st; - *ppos += cnt; - return cnt; -} - -static ssize_t rsxx_cram_write(struct file *fp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - struct rsxx_cardinfo *card = file_inode(fp)->i_private; - char *buf; - ssize_t st; - - buf = memdup_user(ubuf, cnt); - if (IS_ERR(buf)) - return PTR_ERR(buf); - - st = rsxx_creg_write(card, CREG_ADD_CRAM + (u32)*ppos, cnt, buf, 1); - kfree(buf); - if (st) - return st; - *ppos += cnt; - return cnt; -} - -static const struct file_operations debugfs_cram_fops = { - .owner = THIS_MODULE, - .read = rsxx_cram_read, - .write = rsxx_cram_write, -}; - -static const struct file_operations debugfs_stats_fops = { - .owner = THIS_MODULE, - .open = rsxx_attr_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static const struct file_operations debugfs_pci_regs_fops = { - .owner = THIS_MODULE, - .open = rsxx_attr_pci_regs_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static void rsxx_debugfs_dev_new(struct rsxx_cardinfo *card) -{ - struct dentry *debugfs_stats; - struct dentry *debugfs_pci_regs; - struct dentry *debugfs_cram; - - card->debugfs_dir = debugfs_create_dir(card->gendisk->disk_name, NULL); - if (IS_ERR_OR_NULL(card->debugfs_dir)) - goto failed_debugfs_dir; - - debugfs_stats = debugfs_create_file("stats", 0444, - card->debugfs_dir, card, - &debugfs_stats_fops); - if (IS_ERR_OR_NULL(debugfs_stats)) - goto failed_debugfs_stats; - - debugfs_pci_regs = debugfs_create_file("pci_regs", 0444, - card->debugfs_dir, card, - &debugfs_pci_regs_fops); - if (IS_ERR_OR_NULL(debugfs_pci_regs)) - goto failed_debugfs_pci_regs; - - debugfs_cram = debugfs_create_file("cram", 0644, - card->debugfs_dir, card, - &debugfs_cram_fops); - if (IS_ERR_OR_NULL(debugfs_cram)) - goto failed_debugfs_cram; - - return; -failed_debugfs_cram: - debugfs_remove(debugfs_pci_regs); -failed_debugfs_pci_regs: - debugfs_remove(debugfs_stats); -failed_debugfs_stats: - debugfs_remove(card->debugfs_dir); -failed_debugfs_dir: - card->debugfs_dir = NULL; -} - -/*----------------- Interrupt Control & Handling -------------------*/ - -static void rsxx_mask_interrupts(struct rsxx_cardinfo *card) -{ - card->isr_mask = 0; - card->ier_mask = 0; -} - -static void __enable_intr(unsigned int *mask, unsigned int intr) -{ - *mask |= intr; -} - -static void __disable_intr(unsigned int *mask, unsigned int intr) -{ - *mask &= ~intr; -} - -/* - * NOTE: Disabling the IER will disable the hardware interrupt. - * Disabling the ISR will disable the software handling of the ISR bit. - * - * Enable/Disable interrupt functions assume the card->irq_lock - * is held by the caller. - */ -void rsxx_enable_ier(struct rsxx_cardinfo *card, unsigned int intr) -{ - if (unlikely(card->halt) || - unlikely(card->eeh_state)) - return; - - __enable_intr(&card->ier_mask, intr); - iowrite32(card->ier_mask, card->regmap + IER); -} - -void rsxx_disable_ier(struct rsxx_cardinfo *card, unsigned int intr) -{ - if (unlikely(card->eeh_state)) - return; - - __disable_intr(&card->ier_mask, intr); - iowrite32(card->ier_mask, card->regmap + IER); -} - -void rsxx_enable_ier_and_isr(struct rsxx_cardinfo *card, - unsigned int intr) -{ - if (unlikely(card->halt) || - unlikely(card->eeh_state)) - return; - - __enable_intr(&card->isr_mask, intr); - __enable_intr(&card->ier_mask, intr); - iowrite32(card->ier_mask, card->regmap + IER); -} -void rsxx_disable_ier_and_isr(struct rsxx_cardinfo *card, - unsigned int intr) -{ - if (unlikely(card->eeh_state)) - return; - - __disable_intr(&card->isr_mask, intr); - __disable_intr(&card->ier_mask, intr); - iowrite32(card->ier_mask, card->regmap + IER); -} - -static irqreturn_t rsxx_isr(int irq, void *pdata) -{ - struct rsxx_cardinfo *card = pdata; - unsigned int isr; - int handled = 0; - int reread_isr; - int i; - - spin_lock(&card->irq_lock); - - do { - reread_isr = 0; - - if (unlikely(card->eeh_state)) - break; - - isr = ioread32(card->regmap + ISR); - if (isr == 0xffffffff) { - /* - * A few systems seem to have an intermittent issue - * where PCI reads return all Fs, but retrying the read - * a little later will return as expected. - */ - dev_info(CARD_TO_DEV(card), - "ISR = 0xFFFFFFFF, retrying later\n"); - break; - } - - isr &= card->isr_mask; - if (!isr) - break; - - for (i = 0; i < card->n_targets; i++) { - if (isr & CR_INTR_DMA(i)) { - if (card->ier_mask & CR_INTR_DMA(i)) { - rsxx_disable_ier(card, CR_INTR_DMA(i)); - reread_isr = 1; - } - queue_work(card->ctrl[i].done_wq, - &card->ctrl[i].dma_done_work); - handled++; - } - } - - if (isr & CR_INTR_CREG) { - queue_work(card->creg_ctrl.creg_wq, - &card->creg_ctrl.done_work); - handled++; - } - - if (isr & CR_INTR_EVENT) { - queue_work(card->event_wq, &card->event_work); - rsxx_disable_ier_and_isr(card, CR_INTR_EVENT); - handled++; - } - } while (reread_isr); - - spin_unlock(&card->irq_lock); - - return handled ? IRQ_HANDLED : IRQ_NONE; -} - -/*----------------- Card Event Handler -------------------*/ -static const char *rsxx_card_state_to_str(unsigned int state) -{ - static const char * const state_strings[] = { - "Unknown", "Shutdown", "Starting", "Formatting", - "Uninitialized", "Good", "Shutting Down", - "Fault", "Read Only Fault", "dStroying" - }; - - return state_strings[ffs(state)]; -} - -static void card_state_change(struct rsxx_cardinfo *card, - unsigned int new_state) -{ - int st; - - dev_info(CARD_TO_DEV(card), - "card state change detected.(%s -> %s)\n", - rsxx_card_state_to_str(card->state), - rsxx_card_state_to_str(new_state)); - - card->state = new_state; - - /* Don't attach DMA interfaces if the card has an invalid config */ - if (!card->config_valid) - return; - - switch (new_state) { - case CARD_STATE_RD_ONLY_FAULT: - dev_crit(CARD_TO_DEV(card), - "Hardware has entered read-only mode!\n"); - /* - * Fall through so the DMA devices can be attached and - * the user can attempt to pull off their data. - */ - fallthrough; - case CARD_STATE_GOOD: - st = rsxx_get_card_size8(card, &card->size8); - if (st) - dev_err(CARD_TO_DEV(card), - "Failed attaching DMA devices\n"); - - if (card->config_valid) - set_capacity(card->gendisk, card->size8 >> 9); - break; - - case CARD_STATE_FAULT: - dev_crit(CARD_TO_DEV(card), - "Hardware Fault reported!\n"); - fallthrough; - - /* Everything else, detach DMA interface if it's attached. */ - case CARD_STATE_SHUTDOWN: - case CARD_STATE_STARTING: - case CARD_STATE_FORMATTING: - case CARD_STATE_UNINITIALIZED: - case CARD_STATE_SHUTTING_DOWN: - /* - * dStroy is a term coined by marketing to represent the low level - * secure erase. - */ - case CARD_STATE_DSTROYING: - set_capacity(card->gendisk, 0); - break; - } -} - -static void card_event_handler(struct work_struct *work) -{ - struct rsxx_cardinfo *card; - unsigned int state; - unsigned long flags; - int st; - - card = container_of(work, struct rsxx_cardinfo, event_work); - - if (unlikely(card->halt)) - return; - - /* - * Enable the interrupt now to avoid any weird race conditions where a - * state change might occur while rsxx_get_card_state() is - * processing a returned creg cmd. - */ - spin_lock_irqsave(&card->irq_lock, flags); - rsxx_enable_ier_and_isr(card, CR_INTR_EVENT); - spin_unlock_irqrestore(&card->irq_lock, flags); - - st = rsxx_get_card_state(card, &state); - if (st) { - dev_info(CARD_TO_DEV(card), - "Failed reading state after event.\n"); - return; - } - - if (card->state != state) - card_state_change(card, state); - - if (card->creg_ctrl.creg_stats.stat & CREG_STAT_LOG_PENDING) - rsxx_read_hw_log(card); -} - -/*----------------- Card Operations -------------------*/ -static int card_shutdown(struct rsxx_cardinfo *card) -{ - unsigned int state; - signed long start; - const int timeout = msecs_to_jiffies(120000); - int st; - - /* We can't issue a shutdown if the card is in a transition state */ - start = jiffies; - do { - st = rsxx_get_card_state(card, &state); - if (st) - return st; - } while (state == CARD_STATE_STARTING && - (jiffies - start < timeout)); - - if (state == CARD_STATE_STARTING) - return -ETIMEDOUT; - - /* Only issue a shutdown if we need to */ - if ((state != CARD_STATE_SHUTTING_DOWN) && - (state != CARD_STATE_SHUTDOWN)) { - st = rsxx_issue_card_cmd(card, CARD_CMD_SHUTDOWN); - if (st) - return st; - } - - start = jiffies; - do { - st = rsxx_get_card_state(card, &state); - if (st) - return st; - } while (state != CARD_STATE_SHUTDOWN && - (jiffies - start < timeout)); - - if (state != CARD_STATE_SHUTDOWN) - return -ETIMEDOUT; - - return 0; -} - -static int rsxx_eeh_frozen(struct pci_dev *dev) -{ - struct rsxx_cardinfo *card = pci_get_drvdata(dev); - int i; - int st; - - dev_warn(&dev->dev, "IBM Flash Adapter PCI: preparing for slot reset.\n"); - - card->eeh_state = 1; - rsxx_mask_interrupts(card); - - /* - * We need to guarantee that the write for eeh_state and masking - * interrupts does not become reordered. This will prevent a possible - * race condition with the EEH code. - */ - wmb(); - - pci_disable_device(dev); - - st = rsxx_eeh_save_issued_dmas(card); - if (st) - return st; - - rsxx_eeh_save_issued_creg(card); - - for (i = 0; i < card->n_targets; i++) { - if (card->ctrl[i].status.buf) - dma_free_coherent(&card->dev->dev, - STATUS_BUFFER_SIZE8, - card->ctrl[i].status.buf, - card->ctrl[i].status.dma_addr); - if (card->ctrl[i].cmd.buf) - dma_free_coherent(&card->dev->dev, - COMMAND_BUFFER_SIZE8, - card->ctrl[i].cmd.buf, - card->ctrl[i].cmd.dma_addr); - } - - return 0; -} - -static void rsxx_eeh_failure(struct pci_dev *dev) -{ - struct rsxx_cardinfo *card = pci_get_drvdata(dev); - int i; - int cnt = 0; - - dev_err(&dev->dev, "IBM Flash Adapter PCI: disabling failed card.\n"); - - card->eeh_state = 1; - card->halt = 1; - - for (i = 0; i < card->n_targets; i++) { - spin_lock_bh(&card->ctrl[i].queue_lock); - cnt = rsxx_cleanup_dma_queue(&card->ctrl[i], - &card->ctrl[i].queue, - COMPLETE_DMA); - spin_unlock_bh(&card->ctrl[i].queue_lock); - - cnt += rsxx_dma_cancel(&card->ctrl[i]); - - if (cnt) - dev_info(CARD_TO_DEV(card), - "Freed %d queued DMAs on channel %d\n", - cnt, card->ctrl[i].id); - } -} - -static int rsxx_eeh_fifo_flush_poll(struct rsxx_cardinfo *card) -{ - unsigned int status; - int iter = 0; - - /* We need to wait for the hardware to reset */ - while (iter++ < 10) { - status = ioread32(card->regmap + PCI_RECONFIG); - - if (status & RSXX_FLUSH_BUSY) { - ssleep(1); - continue; - } - - if (status & RSXX_FLUSH_TIMEOUT) - dev_warn(CARD_TO_DEV(card), "HW: flash controller timeout\n"); - return 0; - } - - /* Hardware failed resetting itself. */ - return -1; -} - -static pci_ers_result_t rsxx_error_detected(struct pci_dev *dev, - pci_channel_state_t error) -{ - int st; - - if (dev->revision < RSXX_EEH_SUPPORT) - return PCI_ERS_RESULT_NONE; - - if (error == pci_channel_io_perm_failure) { - rsxx_eeh_failure(dev); - return PCI_ERS_RESULT_DISCONNECT; - } - - st = rsxx_eeh_frozen(dev); - if (st) { - dev_err(&dev->dev, "Slot reset setup failed\n"); - rsxx_eeh_failure(dev); - return PCI_ERS_RESULT_DISCONNECT; - } - - return PCI_ERS_RESULT_NEED_RESET; -} - -static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev) -{ - struct rsxx_cardinfo *card = pci_get_drvdata(dev); - unsigned long flags; - int i; - int st; - - dev_warn(&dev->dev, - "IBM Flash Adapter PCI: recovering from slot reset.\n"); - - st = pci_enable_device(dev); - if (st) - goto failed_hw_setup; - - pci_set_master(dev); - - st = rsxx_eeh_fifo_flush_poll(card); - if (st) - goto failed_hw_setup; - - rsxx_dma_queue_reset(card); - - for (i = 0; i < card->n_targets; i++) { - st = rsxx_hw_buffers_init(dev, &card->ctrl[i]); - if (st) - goto failed_hw_buffers_init; - } - - if (card->config_valid) - rsxx_dma_configure(card); - - /* Clears the ISR register from spurious interrupts */ - st = ioread32(card->regmap + ISR); - - card->eeh_state = 0; - - spin_lock_irqsave(&card->irq_lock, flags); - if (card->n_targets & RSXX_MAX_TARGETS) - rsxx_enable_ier_and_isr(card, CR_INTR_ALL_G); - else - rsxx_enable_ier_and_isr(card, CR_INTR_ALL_C); - spin_unlock_irqrestore(&card->irq_lock, flags); - - rsxx_kick_creg_queue(card); - - for (i = 0; i < card->n_targets; i++) { - spin_lock(&card->ctrl[i].queue_lock); - if (list_empty(&card->ctrl[i].queue)) { - spin_unlock(&card->ctrl[i].queue_lock); - continue; - } - spin_unlock(&card->ctrl[i].queue_lock); - - queue_work(card->ctrl[i].issue_wq, - &card->ctrl[i].issue_dma_work); - } - - dev_info(&dev->dev, "IBM Flash Adapter PCI: recovery complete.\n"); - - return PCI_ERS_RESULT_RECOVERED; - -failed_hw_buffers_init: - for (i = 0; i < card->n_targets; i++) { - if (card->ctrl[i].status.buf) - dma_free_coherent(&card->dev->dev, - STATUS_BUFFER_SIZE8, - card->ctrl[i].status.buf, - card->ctrl[i].status.dma_addr); - if (card->ctrl[i].cmd.buf) - dma_free_coherent(&card->dev->dev, - COMMAND_BUFFER_SIZE8, - card->ctrl[i].cmd.buf, - card->ctrl[i].cmd.dma_addr); - } -failed_hw_setup: - rsxx_eeh_failure(dev); - return PCI_ERS_RESULT_DISCONNECT; - -} - -/*----------------- Driver Initialization & Setup -------------------*/ -/* Returns: 0 if the driver is compatible with the device - -1 if the driver is NOT compatible with the device */ -static int rsxx_compatibility_check(struct rsxx_cardinfo *card) -{ - unsigned char pci_rev; - - pci_read_config_byte(card->dev, PCI_REVISION_ID, &pci_rev); - - if (pci_rev > RS70_PCI_REV_SUPPORTED) - return -1; - return 0; -} - -static int rsxx_pci_probe(struct pci_dev *dev, - const struct pci_device_id *id) -{ - struct rsxx_cardinfo *card; - int st; - unsigned int sync_timeout; - - dev_info(&dev->dev, "PCI-Flash SSD discovered\n"); - - card = kzalloc(sizeof(*card), GFP_KERNEL); - if (!card) - return -ENOMEM; - - card->dev = dev; - pci_set_drvdata(dev, card); - - st = ida_alloc(&rsxx_disk_ida, GFP_KERNEL); - if (st < 0) - goto failed_ida_get; - card->disk_id = st; - - st = pci_enable_device(dev); - if (st) - goto failed_enable; - - pci_set_master(dev); - - st = dma_set_mask(&dev->dev, DMA_BIT_MASK(64)); - if (st) { - dev_err(CARD_TO_DEV(card), - "No usable DMA configuration,aborting\n"); - goto failed_dma_mask; - } - - st = pci_request_regions(dev, DRIVER_NAME); - if (st) { - dev_err(CARD_TO_DEV(card), - "Failed to request memory region\n"); - goto failed_request_regions; - } - - if (pci_resource_len(dev, 0) == 0) { - dev_err(CARD_TO_DEV(card), "BAR0 has length 0!\n"); - st = -ENOMEM; - goto failed_iomap; - } - - card->regmap = pci_iomap(dev, 0, 0); - if (!card->regmap) { - dev_err(CARD_TO_DEV(card), "Failed to map BAR0\n"); - st = -ENOMEM; - goto failed_iomap; - } - - spin_lock_init(&card->irq_lock); - card->halt = 0; - card->eeh_state = 0; - - spin_lock_irq(&card->irq_lock); - rsxx_disable_ier_and_isr(card, CR_INTR_ALL); - spin_unlock_irq(&card->irq_lock); - - if (!force_legacy) { - st = pci_enable_msi(dev); - if (st) - dev_warn(CARD_TO_DEV(card), - "Failed to enable MSI\n"); - } - - st = request_irq(dev->irq, rsxx_isr, IRQF_SHARED, - DRIVER_NAME, card); - if (st) { - dev_err(CARD_TO_DEV(card), - "Failed requesting IRQ%d\n", dev->irq); - goto failed_irq; - } - - /************* Setup Processor Command Interface *************/ - st = rsxx_creg_setup(card); - if (st) { - dev_err(CARD_TO_DEV(card), "Failed to setup creg interface.\n"); - goto failed_creg_setup; - } - - spin_lock_irq(&card->irq_lock); - rsxx_enable_ier_and_isr(card, CR_INTR_CREG); - spin_unlock_irq(&card->irq_lock); - - st = rsxx_compatibility_check(card); - if (st) { - dev_warn(CARD_TO_DEV(card), - "Incompatible driver detected. Please update the driver.\n"); - st = -EINVAL; - goto failed_compatiblity_check; - } - - /************* Load Card Config *************/ - st = rsxx_load_config(card); - if (st) - dev_err(CARD_TO_DEV(card), - "Failed loading card config\n"); - - /************* Setup DMA Engine *************/ - st = rsxx_get_num_targets(card, &card->n_targets); - if (st) - dev_info(CARD_TO_DEV(card), - "Failed reading the number of DMA targets\n"); - - card->ctrl = kcalloc(card->n_targets, sizeof(*card->ctrl), - GFP_KERNEL); - if (!card->ctrl) { - st = -ENOMEM; - goto failed_dma_setup; - } - - st = rsxx_dma_setup(card); - if (st) { - dev_info(CARD_TO_DEV(card), - "Failed to setup DMA engine\n"); - goto failed_dma_setup; - } - - /************* Setup Card Event Handler *************/ - card->event_wq = create_singlethread_workqueue(DRIVER_NAME"_event"); - if (!card->event_wq) { - dev_err(CARD_TO_DEV(card), "Failed card event setup.\n"); - st = -ENOMEM; - goto failed_event_handler; - } - - INIT_WORK(&card->event_work, card_event_handler); - - st = rsxx_setup_dev(card); - if (st) - goto failed_create_dev; - - rsxx_get_card_state(card, &card->state); - - dev_info(CARD_TO_DEV(card), - "card state: %s\n", - rsxx_card_state_to_str(card->state)); - - /* - * Now that the DMA Engine and devices have been setup, - * we can enable the event interrupt(it kicks off actions in - * those layers so we couldn't enable it right away.) - */ - spin_lock_irq(&card->irq_lock); - rsxx_enable_ier_and_isr(card, CR_INTR_EVENT); - spin_unlock_irq(&card->irq_lock); - - if (card->state == CARD_STATE_SHUTDOWN) { - st = rsxx_issue_card_cmd(card, CARD_CMD_STARTUP); - if (st) - dev_crit(CARD_TO_DEV(card), - "Failed issuing card startup\n"); - if (sync_start) { - sync_timeout = SYNC_START_TIMEOUT; - - dev_info(CARD_TO_DEV(card), - "Waiting for card to startup\n"); - - do { - ssleep(1); - sync_timeout--; - - rsxx_get_card_state(card, &card->state); - } while (sync_timeout && - (card->state == CARD_STATE_STARTING)); - - if (card->state == CARD_STATE_STARTING) { - dev_warn(CARD_TO_DEV(card), - "Card startup timed out\n"); - card->size8 = 0; - } else { - dev_info(CARD_TO_DEV(card), - "card state: %s\n", - rsxx_card_state_to_str(card->state)); - st = rsxx_get_card_size8(card, &card->size8); - if (st) - card->size8 = 0; - } - } - } else if (card->state == CARD_STATE_GOOD || - card->state == CARD_STATE_RD_ONLY_FAULT) { - st = rsxx_get_card_size8(card, &card->size8); - if (st) - card->size8 = 0; - } - - st = rsxx_attach_dev(card); - if (st) - goto failed_create_dev; - - /************* Setup Debugfs *************/ - rsxx_debugfs_dev_new(card); - - return 0; - -failed_create_dev: - destroy_workqueue(card->event_wq); - card->event_wq = NULL; -failed_event_handler: - rsxx_dma_destroy(card); -failed_dma_setup: -failed_compatiblity_check: - destroy_workqueue(card->creg_ctrl.creg_wq); - card->creg_ctrl.creg_wq = NULL; -failed_creg_setup: - spin_lock_irq(&card->irq_lock); - rsxx_disable_ier_and_isr(card, CR_INTR_ALL); - spin_unlock_irq(&card->irq_lock); - free_irq(dev->irq, card); - if (!force_legacy) - pci_disable_msi(dev); -failed_irq: - pci_iounmap(dev, card->regmap); -failed_iomap: - pci_release_regions(dev); -failed_request_regions: -failed_dma_mask: - pci_disable_device(dev); -failed_enable: - ida_free(&rsxx_disk_ida, card->disk_id); -failed_ida_get: - kfree(card); - - return st; -} - -static void rsxx_pci_remove(struct pci_dev *dev) -{ - struct rsxx_cardinfo *card = pci_get_drvdata(dev); - unsigned long flags; - int st; - int i; - - if (!card) - return; - - dev_info(CARD_TO_DEV(card), - "Removing PCI-Flash SSD.\n"); - - rsxx_detach_dev(card); - - for (i = 0; i < card->n_targets; i++) { - spin_lock_irqsave(&card->irq_lock, flags); - rsxx_disable_ier_and_isr(card, CR_INTR_DMA(i)); - spin_unlock_irqrestore(&card->irq_lock, flags); - } - - st = card_shutdown(card); - if (st) - dev_crit(CARD_TO_DEV(card), "Shutdown failed!\n"); - - /* Sync outstanding event handlers. */ - spin_lock_irqsave(&card->irq_lock, flags); - rsxx_disable_ier_and_isr(card, CR_INTR_EVENT); - spin_unlock_irqrestore(&card->irq_lock, flags); - - cancel_work_sync(&card->event_work); - - destroy_workqueue(card->event_wq); - rsxx_destroy_dev(card); - rsxx_dma_destroy(card); - destroy_workqueue(card->creg_ctrl.creg_wq); - - spin_lock_irqsave(&card->irq_lock, flags); - rsxx_disable_ier_and_isr(card, CR_INTR_ALL); - spin_unlock_irqrestore(&card->irq_lock, flags); - - /* Prevent work_structs from re-queuing themselves. */ - card->halt = 1; - - debugfs_remove_recursive(card->debugfs_dir); - - free_irq(dev->irq, card); - - if (!force_legacy) - pci_disable_msi(dev); - - rsxx_creg_destroy(card); - - pci_iounmap(dev, card->regmap); - - pci_disable_device(dev); - pci_release_regions(dev); - - ida_free(&rsxx_disk_ida, card->disk_id); - kfree(card); -} - -static int rsxx_pci_suspend(struct pci_dev *dev, pm_message_t state) -{ - /* We don't support suspend at this time. */ - return -ENOSYS; -} - -static void rsxx_pci_shutdown(struct pci_dev *dev) -{ - struct rsxx_cardinfo *card = pci_get_drvdata(dev); - unsigned long flags; - int i; - - if (!card) - return; - - dev_info(CARD_TO_DEV(card), "Shutting down PCI-Flash SSD.\n"); - - rsxx_detach_dev(card); - - for (i = 0; i < card->n_targets; i++) { - spin_lock_irqsave(&card->irq_lock, flags); - rsxx_disable_ier_and_isr(card, CR_INTR_DMA(i)); - spin_unlock_irqrestore(&card->irq_lock, flags); - } - - card_shutdown(card); -} - -static const struct pci_error_handlers rsxx_err_handler = { - .error_detected = rsxx_error_detected, - .slot_reset = rsxx_slot_reset, -}; - -static const struct pci_device_id rsxx_pci_ids[] = { - {PCI_DEVICE(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_FS70_FLASH)}, - {PCI_DEVICE(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_FS80_FLASH)}, - {0,}, -}; - -MODULE_DEVICE_TABLE(pci, rsxx_pci_ids); - -static struct pci_driver rsxx_pci_driver = { - .name = DRIVER_NAME, - .id_table = rsxx_pci_ids, - .probe = rsxx_pci_probe, - .remove = rsxx_pci_remove, - .suspend = rsxx_pci_suspend, - .shutdown = rsxx_pci_shutdown, - .err_handler = &rsxx_err_handler, -}; - -static int __init rsxx_core_init(void) -{ - int st; - - st = rsxx_dev_init(); - if (st) - return st; - - st = rsxx_dma_init(); - if (st) - goto dma_init_failed; - - st = rsxx_creg_init(); - if (st) - goto creg_init_failed; - - return pci_register_driver(&rsxx_pci_driver); - -creg_init_failed: - rsxx_dma_cleanup(); -dma_init_failed: - rsxx_dev_cleanup(); - - return st; -} - -static void __exit rsxx_core_cleanup(void) -{ - pci_unregister_driver(&rsxx_pci_driver); - rsxx_creg_cleanup(); - rsxx_dma_cleanup(); - rsxx_dev_cleanup(); -} - -module_init(rsxx_core_init); -module_exit(rsxx_core_cleanup); diff --git a/drivers/block/rsxx/cregs.c b/drivers/block/rsxx/cregs.c deleted file mode 100644 index 60ecd3f7cbd2..000000000000 --- a/drivers/block/rsxx/cregs.c +++ /dev/null @@ -1,789 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* -* Filename: cregs.c -* -* Authors: Joshua Morris <josh.h.morris@us.ibm.com> -* Philip Kelleher <pjk1939@linux.vnet.ibm.com> -* -* (C) Copyright 2013 IBM Corporation -*/ - -#include <linux/completion.h> -#include <linux/slab.h> - -#include "rsxx_priv.h" - -#define CREG_TIMEOUT_MSEC 10000 - -typedef void (*creg_cmd_cb)(struct rsxx_cardinfo *card, - struct creg_cmd *cmd, - int st); - -struct creg_cmd { - struct list_head list; - creg_cmd_cb cb; - void *cb_private; - unsigned int op; - unsigned int addr; - int cnt8; - void *buf; - unsigned int stream; - unsigned int status; -}; - -static struct kmem_cache *creg_cmd_pool; - - -/*------------ Private Functions --------------*/ - -#if defined(__LITTLE_ENDIAN) -#define LITTLE_ENDIAN 1 -#elif defined(__BIG_ENDIAN) -#define LITTLE_ENDIAN 0 -#else -#error Unknown endianess!!! Aborting... -#endif - -static int copy_to_creg_data(struct rsxx_cardinfo *card, - int cnt8, - void *buf, - unsigned int stream) -{ - int i = 0; - u32 *data = buf; - - if (unlikely(card->eeh_state)) - return -EIO; - - for (i = 0; cnt8 > 0; i++, cnt8 -= 4) { - /* - * Firmware implementation makes it necessary to byte swap on - * little endian processors. - */ - if (LITTLE_ENDIAN && stream) - iowrite32be(data[i], card->regmap + CREG_DATA(i)); - else - iowrite32(data[i], card->regmap + CREG_DATA(i)); - } - - return 0; -} - - -static int copy_from_creg_data(struct rsxx_cardinfo *card, - int cnt8, - void *buf, - unsigned int stream) -{ - int i = 0; - u32 *data = buf; - - if (unlikely(card->eeh_state)) - return -EIO; - - for (i = 0; cnt8 > 0; i++, cnt8 -= 4) { - /* - * Firmware implementation makes it necessary to byte swap on - * little endian processors. - */ - if (LITTLE_ENDIAN && stream) - data[i] = ioread32be(card->regmap + CREG_DATA(i)); - else - data[i] = ioread32(card->regmap + CREG_DATA(i)); - } - - return 0; -} - -static void creg_issue_cmd(struct rsxx_cardinfo *card, struct creg_cmd *cmd) -{ - int st; - - if (unlikely(card->eeh_state)) - return; - - iowrite32(cmd->addr, card->regmap + CREG_ADD); - iowrite32(cmd->cnt8, card->regmap + CREG_CNT); - - if (cmd->op == CREG_OP_WRITE) { - if (cmd->buf) { - st = copy_to_creg_data(card, cmd->cnt8, - cmd->buf, cmd->stream); - if (st) - return; - } - } - - if (unlikely(card->eeh_state)) - return; - - /* Setting the valid bit will kick off the command. */ - iowrite32(cmd->op, card->regmap + CREG_CMD); -} - -static void creg_kick_queue(struct rsxx_cardinfo *card) -{ - if (card->creg_ctrl.active || list_empty(&card->creg_ctrl.queue)) - return; - - card->creg_ctrl.active = 1; - card->creg_ctrl.active_cmd = list_first_entry(&card->creg_ctrl.queue, - struct creg_cmd, list); - list_del(&card->creg_ctrl.active_cmd->list); - card->creg_ctrl.q_depth--; - - /* - * We have to set the timer before we push the new command. Otherwise, - * we could create a race condition that would occur if the timer - * was not canceled, and expired after the new command was pushed, - * but before the command was issued to hardware. - */ - mod_timer(&card->creg_ctrl.cmd_timer, - jiffies + msecs_to_jiffies(CREG_TIMEOUT_MSEC)); - - creg_issue_cmd(card, card->creg_ctrl.active_cmd); -} - -static int creg_queue_cmd(struct rsxx_cardinfo *card, - unsigned int op, - unsigned int addr, - unsigned int cnt8, - void *buf, - int stream, - creg_cmd_cb callback, - void *cb_private) -{ - struct creg_cmd *cmd; - - /* Don't queue stuff up if we're halted. */ - if (unlikely(card->halt)) - return -EINVAL; - - if (card->creg_ctrl.reset) - return -EAGAIN; - - if (cnt8 > MAX_CREG_DATA8) - return -EINVAL; - - cmd = kmem_cache_alloc(creg_cmd_pool, GFP_KERNEL); - if (!cmd) - return -ENOMEM; - - INIT_LIST_HEAD(&cmd->list); - - cmd->op = op; - cmd->addr = addr; - cmd->cnt8 = cnt8; - cmd->buf = buf; - cmd->stream = stream; - cmd->cb = callback; - cmd->cb_private = cb_private; - cmd->status = 0; - - spin_lock_bh(&card->creg_ctrl.lock); - list_add_tail(&cmd->list, &card->creg_ctrl.queue); - card->creg_ctrl.q_depth++; - creg_kick_queue(card); - spin_unlock_bh(&card->creg_ctrl.lock); - - return 0; -} - -static void creg_cmd_timed_out(struct timer_list *t) -{ - struct rsxx_cardinfo *card = from_timer(card, t, creg_ctrl.cmd_timer); - struct creg_cmd *cmd; - - spin_lock(&card->creg_ctrl.lock); - cmd = card->creg_ctrl.active_cmd; - card->creg_ctrl.active_cmd = NULL; - spin_unlock(&card->creg_ctrl.lock); - - if (cmd == NULL) { - card->creg_ctrl.creg_stats.creg_timeout++; - dev_warn(CARD_TO_DEV(card), - "No active command associated with timeout!\n"); - return; - } - - if (cmd->cb) - cmd->cb(card, cmd, -ETIMEDOUT); - - kmem_cache_free(creg_cmd_pool, cmd); - - - spin_lock(&card->creg_ctrl.lock); - card->creg_ctrl.active = 0; - creg_kick_queue(card); - spin_unlock(&card->creg_ctrl.lock); -} - - -static void creg_cmd_done(struct work_struct *work) -{ - struct rsxx_cardinfo *card; - struct creg_cmd *cmd; - int st = 0; - - card = container_of(work, struct rsxx_cardinfo, - creg_ctrl.done_work); - - /* - * The timer could not be cancelled for some reason, - * race to pop the active command. - */ - if (del_timer_sync(&card->creg_ctrl.cmd_timer) == 0) - card->creg_ctrl.creg_stats.failed_cancel_timer++; - - spin_lock_bh(&card->creg_ctrl.lock); - cmd = card->creg_ctrl.active_cmd; - card->creg_ctrl.active_cmd = NULL; - spin_unlock_bh(&card->creg_ctrl.lock); - - if (cmd == NULL) { - dev_err(CARD_TO_DEV(card), - "Spurious creg interrupt!\n"); - return; - } - - card->creg_ctrl.creg_stats.stat = ioread32(card->regmap + CREG_STAT); - cmd->status = card->creg_ctrl.creg_stats.stat; - if ((cmd->status & CREG_STAT_STATUS_MASK) == 0) { - dev_err(CARD_TO_DEV(card), - "Invalid status on creg command\n"); - /* - * At this point we're probably reading garbage from HW. Don't - * do anything else that could mess up the system and let - * the sync function return an error. - */ - st = -EIO; - goto creg_done; - } else if (cmd->status & CREG_STAT_ERROR) { - st = -EIO; - } - - if (cmd->op == CREG_OP_READ) { - unsigned int cnt8 = ioread32(card->regmap + CREG_CNT); - - /* Paranoid Sanity Checks */ - if (!cmd->buf) { - dev_err(CARD_TO_DEV(card), - "Buffer not given for read.\n"); - st = -EIO; - goto creg_done; - } - if (cnt8 != cmd->cnt8) { - dev_err(CARD_TO_DEV(card), - "count mismatch\n"); - st = -EIO; - goto creg_done; - } - - st = copy_from_creg_data(card, cnt8, cmd->buf, cmd->stream); - } - -creg_done: - if (cmd->cb) - cmd->cb(card, cmd, st); - - kmem_cache_free(creg_cmd_pool, cmd); - - spin_lock_bh(&card->creg_ctrl.lock); - card->creg_ctrl.active = 0; - creg_kick_queue(card); - spin_unlock_bh(&card->creg_ctrl.lock); -} - -static void creg_reset(struct rsxx_cardinfo *card) -{ - struct creg_cmd *cmd = NULL; - struct creg_cmd *tmp; - unsigned long flags; - - /* - * mutex_trylock is used here because if reset_lock is taken then a - * reset is already happening. So, we can just go ahead and return. - */ - if (!mutex_trylock(&card->creg_ctrl.reset_lock)) - return; - - card->creg_ctrl.reset = 1; - spin_lock_irqsave(&card->irq_lock, flags); - rsxx_disable_ier_and_isr(card, CR_INTR_CREG | CR_INTR_EVENT); - spin_unlock_irqrestore(&card->irq_lock, flags); - - dev_warn(CARD_TO_DEV(card), - "Resetting creg interface for recovery\n"); - - /* Cancel outstanding commands */ - spin_lock_bh(&card->creg_ctrl.lock); - list_for_each_entry_safe(cmd, tmp, &card->creg_ctrl.queue, list) { - list_del(&cmd->list); - card->creg_ctrl.q_depth--; - if (cmd->cb) - cmd->cb(card, cmd, -ECANCELED); - kmem_cache_free(creg_cmd_pool, cmd); - } - - cmd = card->creg_ctrl.active_cmd; - card->creg_ctrl.active_cmd = NULL; - if (cmd) { - if (timer_pending(&card->creg_ctrl.cmd_timer)) - del_timer_sync(&card->creg_ctrl.cmd_timer); - - if (cmd->cb) - cmd->cb(card, cmd, -ECANCELED); - kmem_cache_free(creg_cmd_pool, cmd); - - card->creg_ctrl.active = 0; - } - spin_unlock_bh(&card->creg_ctrl.lock); - - card->creg_ctrl.reset = 0; - spin_lock_irqsave(&card->irq_lock, flags); - rsxx_enable_ier_and_isr(card, CR_INTR_CREG | CR_INTR_EVENT); - spin_unlock_irqrestore(&card->irq_lock, flags); - - mutex_unlock(&card->creg_ctrl.reset_lock); -} - -/* Used for synchronous accesses */ -struct creg_completion { - struct completion *cmd_done; - int st; - u32 creg_status; -}; - -static void creg_cmd_done_cb(struct rsxx_cardinfo *card, - struct creg_cmd *cmd, - int st) -{ - struct creg_completion *cmd_completion; - - cmd_completion = cmd->cb_private; - BUG_ON(!cmd_completion); - - cmd_completion->st = st; - cmd_completion->creg_status = cmd->status; - complete(cmd_completion->cmd_done); -} - -static int __issue_creg_rw(struct rsxx_cardinfo *card, - unsigned int op, - unsigned int addr, - unsigned int cnt8, - void *buf, - int stream, - unsigned int *hw_stat) -{ - DECLARE_COMPLETION_ONSTACK(cmd_done); - struct creg_completion completion; - unsigned long timeout; - int st; - - completion.cmd_done = &cmd_done; - completion.st = 0; - completion.creg_status = 0; - - st = creg_queue_cmd(card, op, addr, cnt8, buf, stream, creg_cmd_done_cb, - &completion); - if (st) - return st; - - /* - * This timeout is necessary for unresponsive hardware. The additional - * 20 seconds to used to guarantee that each cregs requests has time to - * complete. - */ - timeout = msecs_to_jiffies(CREG_TIMEOUT_MSEC * - card->creg_ctrl.q_depth + 20000); - - /* - * The creg interface is guaranteed to complete. It has a timeout - * mechanism that will kick in if hardware does not respond. - */ - st = wait_for_completion_timeout(completion.cmd_done, timeout); - if (st == 0) { - /* - * This is really bad, because the kernel timer did not - * expire and notify us of a timeout! - */ - dev_crit(CARD_TO_DEV(card), - "cregs timer failed\n"); - creg_reset(card); - return -EIO; - } - - *hw_stat = completion.creg_status; - - if (completion.st) { - /* - * This read is needed to verify that there has not been any - * extreme errors that might have occurred, i.e. EEH. The - * function iowrite32 will not detect EEH errors, so it is - * necessary that we recover if such an error is the reason - * for the timeout. This is a dummy read. - */ - ioread32(card->regmap + SCRATCH); - - dev_warn(CARD_TO_DEV(card), - "creg command failed(%d x%08x)\n", - completion.st, addr); - return completion.st; - } - - return 0; -} - -static int issue_creg_rw(struct rsxx_cardinfo *card, - u32 addr, - unsigned int size8, - void *data, - int stream, - int read) -{ - unsigned int hw_stat; - unsigned int xfer; - unsigned int op; - int st; - - op = read ? CREG_OP_READ : CREG_OP_WRITE; - - do { - xfer = min_t(unsigned int, size8, MAX_CREG_DATA8); - - st = __issue_creg_rw(card, op, addr, xfer, - data, stream, &hw_stat); - if (st) - return st; - - data = (char *)data + xfer; - addr += xfer; - size8 -= xfer; - } while (size8); - - return 0; -} - -/* ---------------------------- Public API ---------------------------------- */ -int rsxx_creg_write(struct rsxx_cardinfo *card, - u32 addr, - unsigned int size8, - void *data, - int byte_stream) -{ - return issue_creg_rw(card, addr, size8, data, byte_stream, 0); -} - -int rsxx_creg_read(struct rsxx_cardinfo *card, - u32 addr, - unsigned int size8, - void *data, - int byte_stream) -{ - return issue_creg_rw(card, addr, size8, data, byte_stream, 1); -} - -int rsxx_get_card_state(struct rsxx_cardinfo *card, unsigned int *state) -{ - return rsxx_creg_read(card, CREG_ADD_CARD_STATE, - sizeof(*state), state, 0); -} - -int rsxx_get_card_size8(struct rsxx_cardinfo *card, u64 *size8) -{ - unsigned int size; - int st; - - st = rsxx_creg_read(card, CREG_ADD_CARD_SIZE, - sizeof(size), &size, 0); - if (st) - return st; - - *size8 = (u64)size * RSXX_HW_BLK_SIZE; - return 0; -} - -int rsxx_get_num_targets(struct rsxx_cardinfo *card, - unsigned int *n_targets) -{ - return rsxx_creg_read(card, CREG_ADD_NUM_TARGETS, - sizeof(*n_targets), n_targets, 0); -} - -int rsxx_get_card_capabilities(struct rsxx_cardinfo *card, - u32 *capabilities) -{ - return rsxx_creg_read(card, CREG_ADD_CAPABILITIES, - sizeof(*capabilities), capabilities, 0); -} - -int rsxx_issue_card_cmd(struct rsxx_cardinfo *card, u32 cmd) -{ - return rsxx_creg_write(card, CREG_ADD_CARD_CMD, - sizeof(cmd), &cmd, 0); -} - - -/*----------------- HW Log Functions -------------------*/ -static void hw_log_msg(struct rsxx_cardinfo *card, const char *str, int len) -{ - static char level; - - /* - * New messages start with "<#>", where # is the log level. Messages - * that extend past the log buffer will use the previous level - */ - if ((len > 3) && (str[0] == '<') && (str[2] == '>')) { - level = str[1]; - str += 3; /* Skip past the log level. */ - len -= 3; - } - - switch (level) { - case '0': - dev_emerg(CARD_TO_DEV(card), "HW: %.*s", len, str); - break; - case '1': - dev_alert(CARD_TO_DEV(card), "HW: %.*s", len, str); - break; - case '2': - dev_crit(CARD_TO_DEV(card), "HW: %.*s", len, str); - break; - case '3': - dev_err(CARD_TO_DEV(card), "HW: %.*s", len, str); - break; - case '4': - dev_warn(CARD_TO_DEV(card), "HW: %.*s", len, str); - break; - case '5': - dev_notice(CARD_TO_DEV(card), "HW: %.*s", len, str); - break; - case '6': - dev_info(CARD_TO_DEV(card), "HW: %.*s", len, str); - break; - case '7': - dev_dbg(CARD_TO_DEV(card), "HW: %.*s", len, str); - break; - default: - dev_info(CARD_TO_DEV(card), "HW: %.*s", len, str); - break; - } -} - -/* - * The substrncpy function copies the src string (which includes the - * terminating '\0' character), up to the count into the dest pointer. - * Returns the number of bytes copied to dest. - */ -static int substrncpy(char *dest, const char *src, int count) -{ - int max_cnt = count; - - while (count) { - count--; - *dest = *src; - if (*dest == '\0') - break; - src++; - dest++; - } - return max_cnt - count; -} - - -static void read_hw_log_done(struct rsxx_cardinfo *card, - struct creg_cmd *cmd, - int st) -{ - char *buf; - char *log_str; - int cnt; - int len; - int off; - - buf = cmd->buf; - off = 0; - - /* Failed getting the log message */ - if (st) - return; - - while (off < cmd->cnt8) { - log_str = &card->log.buf[card->log.buf_len]; - cnt = min(cmd->cnt8 - off, LOG_BUF_SIZE8 - card->log.buf_len); - len = substrncpy(log_str, &buf[off], cnt); - - off += len; - card->log.buf_len += len; - - /* - * Flush the log if we've hit the end of a message or if we've - * run out of buffer space. - */ - if ((log_str[len - 1] == '\0') || - (card->log.buf_len == LOG_BUF_SIZE8)) { - if (card->log.buf_len != 1) /* Don't log blank lines. */ - hw_log_msg(card, card->log.buf, - card->log.buf_len); - card->log.buf_len = 0; - } - - } - - if (cmd->status & CREG_STAT_LOG_PENDING) - rsxx_read_hw_log(card); -} - -int rsxx_read_hw_log(struct rsxx_cardinfo *card) -{ - int st; - - st = creg_queue_cmd(card, CREG_OP_READ, CREG_ADD_LOG, - sizeof(card->log.tmp), card->log.tmp, - 1, read_hw_log_done, NULL); - if (st) - dev_err(CARD_TO_DEV(card), - "Failed getting log text\n"); - - return st; -} - -/*-------------- IOCTL REG Access ------------------*/ -static int issue_reg_cmd(struct rsxx_cardinfo *card, - struct rsxx_reg_access *cmd, - int read) -{ - unsigned int op = read ? CREG_OP_READ : CREG_OP_WRITE; - - return __issue_creg_rw(card, op, cmd->addr, cmd->cnt, cmd->data, - cmd->stream, &cmd->stat); -} - -int rsxx_reg_access(struct rsxx_cardinfo *card, - struct rsxx_reg_access __user *ucmd, - int read) -{ - struct rsxx_reg_access cmd; - int st; - - st = copy_from_user(&cmd, ucmd, sizeof(cmd)); - if (st) - return -EFAULT; - - if (cmd.cnt > RSXX_MAX_REG_CNT) - return -EFAULT; - - st = issue_reg_cmd(card, &cmd, read); - if (st) - return st; - - st = put_user(cmd.stat, &ucmd->stat); - if (st) - return -EFAULT; - - if (read) { - st = copy_to_user(ucmd->data, cmd.data, cmd.cnt); - if (st) - return -EFAULT; - } - - return 0; -} - -void rsxx_eeh_save_issued_creg(struct rsxx_cardinfo *card) -{ - struct creg_cmd *cmd = NULL; - - cmd = card->creg_ctrl.active_cmd; - card->creg_ctrl.active_cmd = NULL; - - if (cmd) { - del_timer_sync(&card->creg_ctrl.cmd_timer); - - spin_lock_bh(&card->creg_ctrl.lock); - list_add(&cmd->list, &card->creg_ctrl.queue); - card->creg_ctrl.q_depth++; - card->creg_ctrl.active = 0; - spin_unlock_bh(&card->creg_ctrl.lock); - } -} - -void rsxx_kick_creg_queue(struct rsxx_cardinfo *card) -{ - spin_lock_bh(&card->creg_ctrl.lock); - if (!list_empty(&card->creg_ctrl.queue)) - creg_kick_queue(card); - spin_unlock_bh(&card->creg_ctrl.lock); -} - -/*------------ Initialization & Setup --------------*/ -int rsxx_creg_setup(struct rsxx_cardinfo *card) -{ - card->creg_ctrl.active_cmd = NULL; - - card->creg_ctrl.creg_wq = - create_singlethread_workqueue(DRIVER_NAME"_creg"); - if (!card->creg_ctrl.creg_wq) - return -ENOMEM; - - INIT_WORK(&card->creg_ctrl.done_work, creg_cmd_done); - mutex_init(&card->creg_ctrl.reset_lock); - INIT_LIST_HEAD(&card->creg_ctrl.queue); - spin_lock_init(&card->creg_ctrl.lock); - timer_setup(&card->creg_ctrl.cmd_timer, creg_cmd_timed_out, 0); - - return 0; -} - -void rsxx_creg_destroy(struct rsxx_cardinfo *card) -{ - struct creg_cmd *cmd; - struct creg_cmd *tmp; - int cnt = 0; - - /* Cancel outstanding commands */ - spin_lock_bh(&card->creg_ctrl.lock); - list_for_each_entry_safe(cmd, tmp, &card->creg_ctrl.queue, list) { - list_del(&cmd->list); - if (cmd->cb) - cmd->cb(card, cmd, -ECANCELED); - kmem_cache_free(creg_cmd_pool, cmd); - cnt++; - } - - if (cnt) - dev_info(CARD_TO_DEV(card), - "Canceled %d queue creg commands\n", cnt); - - cmd = card->creg_ctrl.active_cmd; - card->creg_ctrl.active_cmd = NULL; - if (cmd) { - if (timer_pending(&card->creg_ctrl.cmd_timer)) - del_timer_sync(&card->creg_ctrl.cmd_timer); - - if (cmd->cb) - cmd->cb(card, cmd, -ECANCELED); - dev_info(CARD_TO_DEV(card), - "Canceled active creg command\n"); - kmem_cache_free(creg_cmd_pool, cmd); - } - spin_unlock_bh(&card->creg_ctrl.lock); - - cancel_work_sync(&card->creg_ctrl.done_work); -} - - -int rsxx_creg_init(void) -{ - creg_cmd_pool = KMEM_CACHE(creg_cmd, SLAB_HWCACHE_ALIGN); - if (!creg_cmd_pool) - return -ENOMEM; - - return 0; -} - -void rsxx_creg_cleanup(void) -{ - kmem_cache_destroy(creg_cmd_pool); -} diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c deleted file mode 100644 index dd33f1bdf3b8..000000000000 --- a/drivers/block/rsxx/dev.c +++ /dev/null @@ -1,306 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* -* Filename: dev.c -* -* Authors: Joshua Morris <josh.h.morris@us.ibm.com> -* Philip Kelleher <pjk1939@linux.vnet.ibm.com> -* -* (C) Copyright 2013 IBM Corporation -*/ - -#include <linux/kernel.h> -#include <linux/interrupt.h> -#include <linux/module.h> -#include <linux/pci.h> -#include <linux/slab.h> - -#include <linux/hdreg.h> -#include <linux/genhd.h> -#include <linux/blkdev.h> -#include <linux/bio.h> - -#include <linux/fs.h> - -#include "rsxx_priv.h" - -static unsigned int blkdev_minors = 64; -module_param(blkdev_minors, uint, 0444); -MODULE_PARM_DESC(blkdev_minors, "Number of minors(partitions)"); - -/* - * For now I'm making this tweakable in case any applications hit this limit. - * If you see a "bio too big" error in the log you will need to raise this - * value. - */ -static unsigned int blkdev_max_hw_sectors = 1024; -module_param(blkdev_max_hw_sectors, uint, 0444); -MODULE_PARM_DESC(blkdev_max_hw_sectors, "Max hw sectors for a single BIO"); - -static unsigned int enable_blkdev = 1; -module_param(enable_blkdev , uint, 0444); -MODULE_PARM_DESC(enable_blkdev, "Enable block device interfaces"); - - -struct rsxx_bio_meta { - struct bio *bio; - atomic_t pending_dmas; - atomic_t error; - unsigned long start_time; -}; - -static struct kmem_cache *bio_meta_pool; - -static void rsxx_submit_bio(struct bio *bio); - -/*----------------- Block Device Operations -----------------*/ -static int rsxx_blkdev_ioctl(struct block_device *bdev, - fmode_t mode, - unsigned int cmd, - unsigned long arg) -{ - struct rsxx_cardinfo *card = bdev->bd_disk->private_data; - - switch (cmd) { - case RSXX_GETREG: - return rsxx_reg_access(card, (void __user *)arg, 1); - case RSXX_SETREG: - return rsxx_reg_access(card, (void __user *)arg, 0); - } - - return -ENOTTY; -} - -static int rsxx_getgeo(struct block_device *bdev, struct hd_geometry *geo) -{ - struct rsxx_cardinfo *card = bdev->bd_disk->private_data; - u64 blocks = card->size8 >> 9; - - /* - * get geometry: Fake it. I haven't found any drivers that set - * geo->start, so we won't either. - */ - if (card->size8) { - geo->heads = 64; - geo->sectors = 16; - do_div(blocks, (geo->heads * geo->sectors)); - geo->cylinders = blocks; - } else { - geo->heads = 0; - geo->sectors = 0; - geo->cylinders = 0; - } - return 0; -} - -static const struct block_device_operations rsxx_fops = { - .owner = THIS_MODULE, - .submit_bio = rsxx_submit_bio, - .getgeo = rsxx_getgeo, - .ioctl = rsxx_blkdev_ioctl, -}; - -static void bio_dma_done_cb(struct rsxx_cardinfo *card, - void *cb_data, - unsigned int error) -{ - struct rsxx_bio_meta *meta = cb_data; - - if (error) - atomic_set(&meta->error, 1); - - if (atomic_dec_and_test(&meta->pending_dmas)) { - if (!card->eeh_state && card->gendisk) - bio_end_io_acct(meta->bio, meta->start_time); - - if (atomic_read(&meta->error)) - bio_io_error(meta->bio); - else - bio_endio(meta->bio); - kmem_cache_free(bio_meta_pool, meta); - } -} - -static void rsxx_submit_bio(struct bio *bio) -{ - struct rsxx_cardinfo *card = bio->bi_bdev->bd_disk->private_data; - struct rsxx_bio_meta *bio_meta; - blk_status_t st = BLK_STS_IOERR; - - blk_queue_split(&bio); - - might_sleep(); - - if (!card) - goto req_err; - - if (bio_end_sector(bio) > get_capacity(card->gendisk)) - goto req_err; - - if (unlikely(card->halt)) - goto req_err; - - if (unlikely(card->dma_fault)) - goto req_err; - - if (bio->bi_iter.bi_size == 0) { - dev_err(CARD_TO_DEV(card), "size zero BIO!\n"); - goto req_err; - } - - bio_meta = kmem_cache_alloc(bio_meta_pool, GFP_KERNEL); - if (!bio_meta) { - st = BLK_STS_RESOURCE; - goto req_err; - } - - bio_meta->bio = bio; - atomic_set(&bio_meta->error, 0); - atomic_set(&bio_meta->pending_dmas, 0); - - if (!unlikely(card->halt)) - bio_meta->start_time = bio_start_io_acct(bio); - - dev_dbg(CARD_TO_DEV(card), "BIO[%c]: meta: %p addr8: x%llx size: %d\n", - bio_data_dir(bio) ? 'W' : 'R', bio_meta, - (u64)bio->bi_iter.bi_sector << 9, bio->bi_iter.bi_size); - - st = rsxx_dma_queue_bio(card, bio, &bio_meta->pending_dmas, - bio_dma_done_cb, bio_meta); - if (st) - goto queue_err; - - return; - -queue_err: - kmem_cache_free(bio_meta_pool, bio_meta); -req_err: - if (st) - bio->bi_status = st; - bio_endio(bio); -} - -/*----------------- Device Setup -------------------*/ -static bool rsxx_discard_supported(struct rsxx_cardinfo *card) -{ - unsigned char pci_rev; - - pci_read_config_byte(card->dev, PCI_REVISION_ID, &pci_rev); - - return (pci_rev >= RSXX_DISCARD_SUPPORT); -} - -int rsxx_attach_dev(struct rsxx_cardinfo *card) -{ - int err = 0; - - mutex_lock(&card->dev_lock); - - /* The block device requires the stripe size from the config. */ - if (enable_blkdev) { - if (card->config_valid) - set_capacity(card->gendisk, card->size8 >> 9); - else - set_capacity(card->gendisk, 0); - err = device_add_disk(CARD_TO_DEV(card), card->gendisk, NULL); - if (err == 0) - card->bdev_attached = 1; - } - - mutex_unlock(&card->dev_lock); - - if (err) - blk_cleanup_disk(card->gendisk); - - return err; -} - -void rsxx_detach_dev(struct rsxx_cardinfo *card) -{ - mutex_lock(&card->dev_lock); - - if (card->bdev_attached) { - del_gendisk(card->gendisk); - card->bdev_attached = 0; - } - - mutex_unlock(&card->dev_lock); -} - -int rsxx_setup_dev(struct rsxx_cardinfo *card) -{ - unsigned short blk_size; - - mutex_init(&card->dev_lock); - - if (!enable_blkdev) - return 0; - - card->major = register_blkdev(0, DRIVER_NAME); - if (card->major < 0) { - dev_err(CARD_TO_DEV(card), "Failed to get major number\n"); - return -ENOMEM; - } - - card->gendisk = blk_alloc_disk(blkdev_minors); - if (!card->gendisk) { - dev_err(CARD_TO_DEV(card), "Failed disk alloc\n"); - unregister_blkdev(card->major, DRIVER_NAME); - return -ENOMEM; - } - - if (card->config_valid) { - blk_size = card->config.data.block_size; - blk_queue_dma_alignment(card->gendisk->queue, blk_size - 1); - blk_queue_logical_block_size(card->gendisk->queue, blk_size); - } - - blk_queue_max_hw_sectors(card->gendisk->queue, blkdev_max_hw_sectors); - blk_queue_physical_block_size(card->gendisk->queue, RSXX_HW_BLK_SIZE); - - blk_queue_flag_set(QUEUE_FLAG_NONROT, card->gendisk->queue); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, card->gendisk->queue); - if (rsxx_discard_supported(card)) { - blk_queue_flag_set(QUEUE_FLAG_DISCARD, card->gendisk->queue); - blk_queue_max_discard_sectors(card->gendisk->queue, - RSXX_HW_BLK_SIZE >> 9); - card->gendisk->queue->limits.discard_granularity = - RSXX_HW_BLK_SIZE; - card->gendisk->queue->limits.discard_alignment = - RSXX_HW_BLK_SIZE; - } - - snprintf(card->gendisk->disk_name, sizeof(card->gendisk->disk_name), - "rsxx%d", card->disk_id); - card->gendisk->major = card->major; - card->gendisk->minors = blkdev_minors; - card->gendisk->fops = &rsxx_fops; - card->gendisk->private_data = card; - - return 0; -} - -void rsxx_destroy_dev(struct rsxx_cardinfo *card) -{ - if (!enable_blkdev) - return; - - blk_cleanup_disk(card->gendisk); - card->gendisk = NULL; - unregister_blkdev(card->major, DRIVER_NAME); -} - -int rsxx_dev_init(void) -{ - bio_meta_pool = KMEM_CACHE(rsxx_bio_meta, SLAB_HWCACHE_ALIGN); - if (!bio_meta_pool) - return -ENOMEM; - - return 0; -} - -void rsxx_dev_cleanup(void) -{ - kmem_cache_destroy(bio_meta_pool); -} - - diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c deleted file mode 100644 index ed182f3dd054..000000000000 --- a/drivers/block/rsxx/dma.c +++ /dev/null @@ -1,1085 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* -* Filename: dma.c -* -* Authors: Joshua Morris <josh.h.morris@us.ibm.com> -* Philip Kelleher <pjk1939@linux.vnet.ibm.com> -* -* (C) Copyright 2013 IBM Corporation -*/ - -#include <linux/slab.h> -#include "rsxx_priv.h" - -struct rsxx_dma { - struct list_head list; - u8 cmd; - unsigned int laddr; /* Logical address */ - struct { - u32 off; - u32 cnt; - } sub_page; - dma_addr_t dma_addr; - struct page *page; - unsigned int pg_off; /* Page Offset */ - rsxx_dma_cb cb; - void *cb_data; -}; - -/* This timeout is used to detect a stalled DMA channel */ -#define DMA_ACTIVITY_TIMEOUT msecs_to_jiffies(10000) - -struct hw_status { - u8 status; - u8 tag; - __le16 count; - __le32 _rsvd2; - __le64 _rsvd3; -} __packed; - -enum rsxx_dma_status { - DMA_SW_ERR = 0x1, - DMA_HW_FAULT = 0x2, - DMA_CANCELLED = 0x4, -}; - -struct hw_cmd { - u8 command; - u8 tag; - u8 _rsvd; - u8 sub_page; /* Bit[0:2]: 512byte offset */ - /* Bit[4:6]: 512byte count */ - __le32 device_addr; - __le64 host_addr; -} __packed; - -enum rsxx_hw_cmd { - HW_CMD_BLK_DISCARD = 0x70, - HW_CMD_BLK_WRITE = 0x80, - HW_CMD_BLK_READ = 0xC0, - HW_CMD_BLK_RECON_READ = 0xE0, -}; - -enum rsxx_hw_status { - HW_STATUS_CRC = 0x01, - HW_STATUS_HARD_ERR = 0x02, - HW_STATUS_SOFT_ERR = 0x04, - HW_STATUS_FAULT = 0x08, -}; - -static struct kmem_cache *rsxx_dma_pool; - -struct dma_tracker { - int next_tag; - struct rsxx_dma *dma; -}; - -struct dma_tracker_list { - spinlock_t lock; - int head; - struct dma_tracker list[]; -}; - - -/*----------------- Misc Utility Functions -------------------*/ -static unsigned int rsxx_addr8_to_laddr(u64 addr8, struct rsxx_cardinfo *card) -{ - unsigned long long tgt_addr8; - - tgt_addr8 = ((addr8 >> card->_stripe.upper_shift) & - card->_stripe.upper_mask) | - ((addr8) & card->_stripe.lower_mask); - do_div(tgt_addr8, RSXX_HW_BLK_SIZE); - return tgt_addr8; -} - -static unsigned int rsxx_get_dma_tgt(struct rsxx_cardinfo *card, u64 addr8) -{ - unsigned int tgt; - - tgt = (addr8 >> card->_stripe.target_shift) & card->_stripe.target_mask; - - return tgt; -} - -void rsxx_dma_queue_reset(struct rsxx_cardinfo *card) -{ - /* Reset all DMA Command/Status Queues */ - iowrite32(DMA_QUEUE_RESET, card->regmap + RESET); -} - -static unsigned int get_dma_size(struct rsxx_dma *dma) -{ - if (dma->sub_page.cnt) - return dma->sub_page.cnt << 9; - else - return RSXX_HW_BLK_SIZE; -} - - -/*----------------- DMA Tracker -------------------*/ -static void set_tracker_dma(struct dma_tracker_list *trackers, - int tag, - struct rsxx_dma *dma) -{ - trackers->list[tag].dma = dma; -} - -static struct rsxx_dma *get_tracker_dma(struct dma_tracker_list *trackers, - int tag) -{ - return trackers->list[tag].dma; -} - -static int pop_tracker(struct dma_tracker_list *trackers) -{ - int tag; - - spin_lock(&trackers->lock); - tag = trackers->head; - if (tag != -1) { - trackers->head = trackers->list[tag].next_tag; - trackers->list[tag].next_tag = -1; - } - spin_unlock(&trackers->lock); - - return tag; -} - -static void push_tracker(struct dma_tracker_list *trackers, int tag) -{ - spin_lock(&trackers->lock); - trackers->list[tag].next_tag = trackers->head; - trackers->head = tag; - trackers->list[tag].dma = NULL; - spin_unlock(&trackers->lock); -} - - -/*----------------- Interrupt Coalescing -------------*/ -/* - * Interrupt Coalescing Register Format: - * Interrupt Timer (64ns units) [15:0] - * Interrupt Count [24:16] - * Reserved [31:25] -*/ -#define INTR_COAL_LATENCY_MASK (0x0000ffff) - -#define INTR_COAL_COUNT_SHIFT 16 -#define INTR_COAL_COUNT_BITS 9 -#define INTR_COAL_COUNT_MASK (((1 << INTR_COAL_COUNT_BITS) - 1) << \ - INTR_COAL_COUNT_SHIFT) -#define INTR_COAL_LATENCY_UNITS_NS 64 - - -static u32 dma_intr_coal_val(u32 mode, u32 count, u32 latency) -{ - u32 latency_units = latency / INTR_COAL_LATENCY_UNITS_NS; - - if (mode == RSXX_INTR_COAL_DISABLED) - return 0; - - return ((count << INTR_COAL_COUNT_SHIFT) & INTR_COAL_COUNT_MASK) | - (latency_units & INTR_COAL_LATENCY_MASK); - -} - -static void dma_intr_coal_auto_tune(struct rsxx_cardinfo *card) -{ - int i; - u32 q_depth = 0; - u32 intr_coal; - - if (card->config.data.intr_coal.mode != RSXX_INTR_COAL_AUTO_TUNE || - unlikely(card->eeh_state)) - return; - - for (i = 0; i < card->n_targets; i++) - q_depth += atomic_read(&card->ctrl[i].stats.hw_q_depth); - - intr_coal = dma_intr_coal_val(card->config.data.intr_coal.mode, - q_depth / 2, - card->config.data.intr_coal.latency); - iowrite32(intr_coal, card->regmap + INTR_COAL); -} - -/*----------------- RSXX DMA Handling -------------------*/ -static void rsxx_free_dma(struct rsxx_dma_ctrl *ctrl, struct rsxx_dma *dma) -{ - if (dma->cmd != HW_CMD_BLK_DISCARD) { - if (!dma_mapping_error(&ctrl->card->dev->dev, dma->dma_addr)) { - dma_unmap_page(&ctrl->card->dev->dev, dma->dma_addr, - get_dma_size(dma), - dma->cmd == HW_CMD_BLK_WRITE ? - DMA_TO_DEVICE : - DMA_FROM_DEVICE); - } - } - - kmem_cache_free(rsxx_dma_pool, dma); -} - -static void rsxx_complete_dma(struct rsxx_dma_ctrl *ctrl, - struct rsxx_dma *dma, - unsigned int status) -{ - if (status & DMA_SW_ERR) - ctrl->stats.dma_sw_err++; - if (status & DMA_HW_FAULT) - ctrl->stats.dma_hw_fault++; - if (status & DMA_CANCELLED) - ctrl->stats.dma_cancelled++; - - if (dma->cb) - dma->cb(ctrl->card, dma->cb_data, status ? 1 : 0); - - rsxx_free_dma(ctrl, dma); -} - -int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, - struct list_head *q, unsigned int done) -{ - struct rsxx_dma *dma; - struct rsxx_dma *tmp; - int cnt = 0; - - list_for_each_entry_safe(dma, tmp, q, list) { - list_del(&dma->list); - if (done & COMPLETE_DMA) - rsxx_complete_dma(ctrl, dma, DMA_CANCELLED); - else - rsxx_free_dma(ctrl, dma); - cnt++; - } - - return cnt; -} - -static void rsxx_requeue_dma(struct rsxx_dma_ctrl *ctrl, - struct rsxx_dma *dma) -{ - /* - * Requeued DMAs go to the front of the queue so they are issued - * first. - */ - spin_lock_bh(&ctrl->queue_lock); - ctrl->stats.sw_q_depth++; - list_add(&dma->list, &ctrl->queue); - spin_unlock_bh(&ctrl->queue_lock); -} - -static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl, - struct rsxx_dma *dma, - u8 hw_st) -{ - unsigned int status = 0; - int requeue_cmd = 0; - - dev_dbg(CARD_TO_DEV(ctrl->card), - "Handling DMA error(cmd x%02x, laddr x%08x st:x%02x)\n", - dma->cmd, dma->laddr, hw_st); - - if (hw_st & HW_STATUS_CRC) - ctrl->stats.crc_errors++; - if (hw_st & HW_STATUS_HARD_ERR) - ctrl->stats.hard_errors++; - if (hw_st & HW_STATUS_SOFT_ERR) - ctrl->stats.soft_errors++; - - switch (dma->cmd) { - case HW_CMD_BLK_READ: - if (hw_st & (HW_STATUS_CRC | HW_STATUS_HARD_ERR)) { - if (ctrl->card->scrub_hard) { - dma->cmd = HW_CMD_BLK_RECON_READ; - requeue_cmd = 1; - ctrl->stats.reads_retried++; - } else { - status |= DMA_HW_FAULT; - ctrl->stats.reads_failed++; - } - } else if (hw_st & HW_STATUS_FAULT) { - status |= DMA_HW_FAULT; - ctrl->stats.reads_failed++; - } - - break; - case HW_CMD_BLK_RECON_READ: - if (hw_st & (HW_STATUS_CRC | HW_STATUS_HARD_ERR)) { - /* Data could not be reconstructed. */ - status |= DMA_HW_FAULT; - ctrl->stats.reads_failed++; - } - - break; - case HW_CMD_BLK_WRITE: - status |= DMA_HW_FAULT; - ctrl->stats.writes_failed++; - - break; - case HW_CMD_BLK_DISCARD: - status |= DMA_HW_FAULT; - ctrl->stats.discards_failed++; - - break; - default: - dev_err(CARD_TO_DEV(ctrl->card), - "Unknown command in DMA!(cmd: x%02x " - "laddr x%08x st: x%02x\n", - dma->cmd, dma->laddr, hw_st); - status |= DMA_SW_ERR; - - break; - } - - if (requeue_cmd) - rsxx_requeue_dma(ctrl, dma); - else - rsxx_complete_dma(ctrl, dma, status); -} - -static void dma_engine_stalled(struct timer_list *t) -{ - struct rsxx_dma_ctrl *ctrl = from_timer(ctrl, t, activity_timer); - int cnt; - - if (atomic_read(&ctrl->stats.hw_q_depth) == 0 || - unlikely(ctrl->card->eeh_state)) - return; - - if (ctrl->cmd.idx != ioread32(ctrl->regmap + SW_CMD_IDX)) { - /* - * The dma engine was stalled because the SW_CMD_IDX write - * was lost. Issue it again to recover. - */ - dev_warn(CARD_TO_DEV(ctrl->card), - "SW_CMD_IDX write was lost, re-writing...\n"); - iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX); - mod_timer(&ctrl->activity_timer, - jiffies + DMA_ACTIVITY_TIMEOUT); - } else { - dev_warn(CARD_TO_DEV(ctrl->card), - "DMA channel %d has stalled, faulting interface.\n", - ctrl->id); - ctrl->card->dma_fault = 1; - - /* Clean up the DMA queue */ - spin_lock(&ctrl->queue_lock); - cnt = rsxx_cleanup_dma_queue(ctrl, &ctrl->queue, COMPLETE_DMA); - spin_unlock(&ctrl->queue_lock); - - cnt += rsxx_dma_cancel(ctrl); - - if (cnt) - dev_info(CARD_TO_DEV(ctrl->card), - "Freed %d queued DMAs on channel %d\n", - cnt, ctrl->id); - } -} - -static void rsxx_issue_dmas(struct rsxx_dma_ctrl *ctrl) -{ - struct rsxx_dma *dma; - int tag; - int cmds_pending = 0; - struct hw_cmd *hw_cmd_buf; - int dir; - - hw_cmd_buf = ctrl->cmd.buf; - - if (unlikely(ctrl->card->halt) || - unlikely(ctrl->card->eeh_state)) - return; - - while (1) { - spin_lock_bh(&ctrl->queue_lock); - if (list_empty(&ctrl->queue)) { - spin_unlock_bh(&ctrl->queue_lock); - break; - } - spin_unlock_bh(&ctrl->queue_lock); - - tag = pop_tracker(ctrl->trackers); - if (tag == -1) - break; - - spin_lock_bh(&ctrl->queue_lock); - dma = list_entry(ctrl->queue.next, struct rsxx_dma, list); - list_del(&dma->list); - ctrl->stats.sw_q_depth--; - spin_unlock_bh(&ctrl->queue_lock); - - /* - * This will catch any DMAs that slipped in right before the - * fault, but was queued after all the other DMAs were - * cancelled. - */ - if (unlikely(ctrl->card->dma_fault)) { - push_tracker(ctrl->trackers, tag); - rsxx_complete_dma(ctrl, dma, DMA_CANCELLED); - continue; - } - - if (dma->cmd != HW_CMD_BLK_DISCARD) { - if (dma->cmd == HW_CMD_BLK_WRITE) - dir = DMA_TO_DEVICE; - else - dir = DMA_FROM_DEVICE; - - /* - * The function dma_map_page is placed here because we - * can only, by design, issue up to 255 commands to the - * hardware at one time per DMA channel. So the maximum - * amount of mapped memory would be 255 * 4 channels * - * 4096 Bytes which is less than 2GB, the limit of a x8 - * Non-HWWD PCIe slot. This way the dma_map_page - * function should never fail because of a lack of - * mappable memory. - */ - dma->dma_addr = dma_map_page(&ctrl->card->dev->dev, dma->page, - dma->pg_off, dma->sub_page.cnt << 9, dir); - if (dma_mapping_error(&ctrl->card->dev->dev, dma->dma_addr)) { - push_tracker(ctrl->trackers, tag); - rsxx_complete_dma(ctrl, dma, DMA_CANCELLED); - continue; - } - } - - set_tracker_dma(ctrl->trackers, tag, dma); - hw_cmd_buf[ctrl->cmd.idx].command = dma->cmd; - hw_cmd_buf[ctrl->cmd.idx].tag = tag; - hw_cmd_buf[ctrl->cmd.idx]._rsvd = 0; - hw_cmd_buf[ctrl->cmd.idx].sub_page = - ((dma->sub_page.cnt & 0x7) << 4) | - (dma->sub_page.off & 0x7); - - hw_cmd_buf[ctrl->cmd.idx].device_addr = - cpu_to_le32(dma->laddr); - - hw_cmd_buf[ctrl->cmd.idx].host_addr = - cpu_to_le64(dma->dma_addr); - - dev_dbg(CARD_TO_DEV(ctrl->card), - "Issue DMA%d(laddr %d tag %d) to idx %d\n", - ctrl->id, dma->laddr, tag, ctrl->cmd.idx); - - ctrl->cmd.idx = (ctrl->cmd.idx + 1) & RSXX_CS_IDX_MASK; - cmds_pending++; - - if (dma->cmd == HW_CMD_BLK_WRITE) - ctrl->stats.writes_issued++; - else if (dma->cmd == HW_CMD_BLK_DISCARD) - ctrl->stats.discards_issued++; - else - ctrl->stats.reads_issued++; - } - - /* Let HW know we've queued commands. */ - if (cmds_pending) { - atomic_add(cmds_pending, &ctrl->stats.hw_q_depth); - mod_timer(&ctrl->activity_timer, - jiffies + DMA_ACTIVITY_TIMEOUT); - - if (unlikely(ctrl->card->eeh_state)) { - del_timer_sync(&ctrl->activity_timer); - return; - } - - iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX); - } -} - -static void rsxx_dma_done(struct rsxx_dma_ctrl *ctrl) -{ - struct rsxx_dma *dma; - unsigned long flags; - u16 count; - u8 status; - u8 tag; - struct hw_status *hw_st_buf; - - hw_st_buf = ctrl->status.buf; - - if (unlikely(ctrl->card->halt) || - unlikely(ctrl->card->dma_fault) || - unlikely(ctrl->card->eeh_state)) - return; - - count = le16_to_cpu(hw_st_buf[ctrl->status.idx].count); - - while (count == ctrl->e_cnt) { - /* - * The read memory-barrier is necessary to keep aggressive - * processors/optimizers (such as the PPC Apple G5) from - * reordering the following status-buffer tag & status read - * *before* the count read on subsequent iterations of the - * loop! - */ - rmb(); - - status = hw_st_buf[ctrl->status.idx].status; - tag = hw_st_buf[ctrl->status.idx].tag; - - dma = get_tracker_dma(ctrl->trackers, tag); - if (dma == NULL) { - spin_lock_irqsave(&ctrl->card->irq_lock, flags); - rsxx_disable_ier(ctrl->card, CR_INTR_DMA_ALL); - spin_unlock_irqrestore(&ctrl->card->irq_lock, flags); - - dev_err(CARD_TO_DEV(ctrl->card), - "No tracker for tag %d " - "(idx %d id %d)\n", - tag, ctrl->status.idx, ctrl->id); - return; - } - - dev_dbg(CARD_TO_DEV(ctrl->card), - "Completing DMA%d" - "(laddr x%x tag %d st: x%x cnt: x%04x) from idx %d.\n", - ctrl->id, dma->laddr, tag, status, count, - ctrl->status.idx); - - atomic_dec(&ctrl->stats.hw_q_depth); - - mod_timer(&ctrl->activity_timer, - jiffies + DMA_ACTIVITY_TIMEOUT); - - if (status) - rsxx_handle_dma_error(ctrl, dma, status); - else - rsxx_complete_dma(ctrl, dma, 0); - - push_tracker(ctrl->trackers, tag); - - ctrl->status.idx = (ctrl->status.idx + 1) & - RSXX_CS_IDX_MASK; - ctrl->e_cnt++; - - count = le16_to_cpu(hw_st_buf[ctrl->status.idx].count); - } - - dma_intr_coal_auto_tune(ctrl->card); - - if (atomic_read(&ctrl->stats.hw_q_depth) == 0) - del_timer_sync(&ctrl->activity_timer); - - spin_lock_irqsave(&ctrl->card->irq_lock, flags); - rsxx_enable_ier(ctrl->card, CR_INTR_DMA(ctrl->id)); - spin_unlock_irqrestore(&ctrl->card->irq_lock, flags); - - spin_lock_bh(&ctrl->queue_lock); - if (ctrl->stats.sw_q_depth) - queue_work(ctrl->issue_wq, &ctrl->issue_dma_work); - spin_unlock_bh(&ctrl->queue_lock); -} - -static void rsxx_schedule_issue(struct work_struct *work) -{ - struct rsxx_dma_ctrl *ctrl; - - ctrl = container_of(work, struct rsxx_dma_ctrl, issue_dma_work); - - mutex_lock(&ctrl->work_lock); - rsxx_issue_dmas(ctrl); - mutex_unlock(&ctrl->work_lock); -} - -static void rsxx_schedule_done(struct work_struct *work) -{ - struct rsxx_dma_ctrl *ctrl; - - ctrl = container_of(work, struct rsxx_dma_ctrl, dma_done_work); - - mutex_lock(&ctrl->work_lock); - rsxx_dma_done(ctrl); - mutex_unlock(&ctrl->work_lock); -} - -static blk_status_t rsxx_queue_discard(struct rsxx_cardinfo *card, - struct list_head *q, - unsigned int laddr, - rsxx_dma_cb cb, - void *cb_data) -{ - struct rsxx_dma *dma; - - dma = kmem_cache_alloc(rsxx_dma_pool, GFP_KERNEL); - if (!dma) - return BLK_STS_RESOURCE; - - dma->cmd = HW_CMD_BLK_DISCARD; - dma->laddr = laddr; - dma->dma_addr = 0; - dma->sub_page.off = 0; - dma->sub_page.cnt = 0; - dma->page = NULL; - dma->pg_off = 0; - dma->cb = cb; - dma->cb_data = cb_data; - - dev_dbg(CARD_TO_DEV(card), "Queuing[D] laddr %x\n", dma->laddr); - - list_add_tail(&dma->list, q); - - return 0; -} - -static blk_status_t rsxx_queue_dma(struct rsxx_cardinfo *card, - struct list_head *q, - int dir, - unsigned int dma_off, - unsigned int dma_len, - unsigned int laddr, - struct page *page, - unsigned int pg_off, - rsxx_dma_cb cb, - void *cb_data) -{ - struct rsxx_dma *dma; - - dma = kmem_cache_alloc(rsxx_dma_pool, GFP_KERNEL); - if (!dma) - return BLK_STS_RESOURCE; - - dma->cmd = dir ? HW_CMD_BLK_WRITE : HW_CMD_BLK_READ; - dma->laddr = laddr; - dma->sub_page.off = (dma_off >> 9); - dma->sub_page.cnt = (dma_len >> 9); - dma->page = page; - dma->pg_off = pg_off; - dma->cb = cb; - dma->cb_data = cb_data; - - dev_dbg(CARD_TO_DEV(card), - "Queuing[%c] laddr %x off %d cnt %d page %p pg_off %d\n", - dir ? 'W' : 'R', dma->laddr, dma->sub_page.off, - dma->sub_page.cnt, dma->page, dma->pg_off); - - /* Queue the DMA */ - list_add_tail(&dma->list, q); - - return 0; -} - -blk_status_t rsxx_dma_queue_bio(struct rsxx_cardinfo *card, - struct bio *bio, - atomic_t *n_dmas, - rsxx_dma_cb cb, - void *cb_data) -{ - struct list_head dma_list[RSXX_MAX_TARGETS]; - struct bio_vec bvec; - struct bvec_iter iter; - unsigned long long addr8; - unsigned int laddr; - unsigned int bv_len; - unsigned int bv_off; - unsigned int dma_off; - unsigned int dma_len; - int dma_cnt[RSXX_MAX_TARGETS]; - int tgt; - blk_status_t st; - int i; - - addr8 = bio->bi_iter.bi_sector << 9; /* sectors are 512 bytes */ - atomic_set(n_dmas, 0); - - for (i = 0; i < card->n_targets; i++) { - INIT_LIST_HEAD(&dma_list[i]); - dma_cnt[i] = 0; - } - - if (bio_op(bio) == REQ_OP_DISCARD) { - bv_len = bio->bi_iter.bi_size; - - while (bv_len > 0) { - tgt = rsxx_get_dma_tgt(card, addr8); - laddr = rsxx_addr8_to_laddr(addr8, card); - - st = rsxx_queue_discard(card, &dma_list[tgt], laddr, - cb, cb_data); - if (st) - goto bvec_err; - - dma_cnt[tgt]++; - atomic_inc(n_dmas); - addr8 += RSXX_HW_BLK_SIZE; - bv_len -= RSXX_HW_BLK_SIZE; - } - } else { - bio_for_each_segment(bvec, bio, iter) { - bv_len = bvec.bv_len; - bv_off = bvec.bv_offset; - - while (bv_len > 0) { - tgt = rsxx_get_dma_tgt(card, addr8); - laddr = rsxx_addr8_to_laddr(addr8, card); - dma_off = addr8 & RSXX_HW_BLK_MASK; - dma_len = min(bv_len, - RSXX_HW_BLK_SIZE - dma_off); - - st = rsxx_queue_dma(card, &dma_list[tgt], - bio_data_dir(bio), - dma_off, dma_len, - laddr, bvec.bv_page, - bv_off, cb, cb_data); - if (st) - goto bvec_err; - - dma_cnt[tgt]++; - atomic_inc(n_dmas); - addr8 += dma_len; - bv_off += dma_len; - bv_len -= dma_len; - } - } - } - - for (i = 0; i < card->n_targets; i++) { - if (!list_empty(&dma_list[i])) { - spin_lock_bh(&card->ctrl[i].queue_lock); - card->ctrl[i].stats.sw_q_depth += dma_cnt[i]; - list_splice_tail(&dma_list[i], &card->ctrl[i].queue); - spin_unlock_bh(&card->ctrl[i].queue_lock); - - queue_work(card->ctrl[i].issue_wq, - &card->ctrl[i].issue_dma_work); - } - } - - return 0; - -bvec_err: - for (i = 0; i < card->n_targets; i++) - rsxx_cleanup_dma_queue(&card->ctrl[i], &dma_list[i], - FREE_DMA); - return st; -} - - -/*----------------- DMA Engine Initialization & Setup -------------------*/ -int rsxx_hw_buffers_init(struct pci_dev *dev, struct rsxx_dma_ctrl *ctrl) -{ - ctrl->status.buf = dma_alloc_coherent(&dev->dev, STATUS_BUFFER_SIZE8, - &ctrl->status.dma_addr, GFP_KERNEL); - ctrl->cmd.buf = dma_alloc_coherent(&dev->dev, COMMAND_BUFFER_SIZE8, - &ctrl->cmd.dma_addr, GFP_KERNEL); - if (ctrl->status.buf == NULL || ctrl->cmd.buf == NULL) - return -ENOMEM; - - memset(ctrl->status.buf, 0xac, STATUS_BUFFER_SIZE8); - iowrite32(lower_32_bits(ctrl->status.dma_addr), - ctrl->regmap + SB_ADD_LO); - iowrite32(upper_32_bits(ctrl->status.dma_addr), - ctrl->regmap + SB_ADD_HI); - - memset(ctrl->cmd.buf, 0x83, COMMAND_BUFFER_SIZE8); - iowrite32(lower_32_bits(ctrl->cmd.dma_addr), ctrl->regmap + CB_ADD_LO); - iowrite32(upper_32_bits(ctrl->cmd.dma_addr), ctrl->regmap + CB_ADD_HI); - - ctrl->status.idx = ioread32(ctrl->regmap + HW_STATUS_CNT); - if (ctrl->status.idx > RSXX_MAX_OUTSTANDING_CMDS) { - dev_crit(&dev->dev, "Failed reading status cnt x%x\n", - ctrl->status.idx); - return -EINVAL; - } - iowrite32(ctrl->status.idx, ctrl->regmap + HW_STATUS_CNT); - iowrite32(ctrl->status.idx, ctrl->regmap + SW_STATUS_CNT); - - ctrl->cmd.idx = ioread32(ctrl->regmap + HW_CMD_IDX); - if (ctrl->cmd.idx > RSXX_MAX_OUTSTANDING_CMDS) { - dev_crit(&dev->dev, "Failed reading cmd cnt x%x\n", - ctrl->status.idx); - return -EINVAL; - } - iowrite32(ctrl->cmd.idx, ctrl->regmap + HW_CMD_IDX); - iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX); - - return 0; -} - -static int rsxx_dma_ctrl_init(struct pci_dev *dev, - struct rsxx_dma_ctrl *ctrl) -{ - int i; - int st; - - memset(&ctrl->stats, 0, sizeof(ctrl->stats)); - - ctrl->trackers = vmalloc(struct_size(ctrl->trackers, list, - RSXX_MAX_OUTSTANDING_CMDS)); - if (!ctrl->trackers) - return -ENOMEM; - - ctrl->trackers->head = 0; - for (i = 0; i < RSXX_MAX_OUTSTANDING_CMDS; i++) { - ctrl->trackers->list[i].next_tag = i + 1; - ctrl->trackers->list[i].dma = NULL; - } - ctrl->trackers->list[RSXX_MAX_OUTSTANDING_CMDS-1].next_tag = -1; - spin_lock_init(&ctrl->trackers->lock); - - spin_lock_init(&ctrl->queue_lock); - mutex_init(&ctrl->work_lock); - INIT_LIST_HEAD(&ctrl->queue); - - timer_setup(&ctrl->activity_timer, dma_engine_stalled, 0); - - ctrl->issue_wq = alloc_ordered_workqueue(DRIVER_NAME"_issue", 0); - if (!ctrl->issue_wq) - return -ENOMEM; - - ctrl->done_wq = alloc_ordered_workqueue(DRIVER_NAME"_done", 0); - if (!ctrl->done_wq) - return -ENOMEM; - - INIT_WORK(&ctrl->issue_dma_work, rsxx_schedule_issue); - INIT_WORK(&ctrl->dma_done_work, rsxx_schedule_done); - - st = rsxx_hw_buffers_init(dev, ctrl); - if (st) - return st; - - return 0; -} - -static int rsxx_dma_stripe_setup(struct rsxx_cardinfo *card, - unsigned int stripe_size8) -{ - if (!is_power_of_2(stripe_size8)) { - dev_err(CARD_TO_DEV(card), - "stripe_size is NOT a power of 2!\n"); - return -EINVAL; - } - - card->_stripe.lower_mask = stripe_size8 - 1; - - card->_stripe.upper_mask = ~(card->_stripe.lower_mask); - card->_stripe.upper_shift = ffs(card->n_targets) - 1; - - card->_stripe.target_mask = card->n_targets - 1; - card->_stripe.target_shift = ffs(stripe_size8) - 1; - - dev_dbg(CARD_TO_DEV(card), "_stripe.lower_mask = x%016llx\n", - card->_stripe.lower_mask); - dev_dbg(CARD_TO_DEV(card), "_stripe.upper_shift = x%016llx\n", - card->_stripe.upper_shift); - dev_dbg(CARD_TO_DEV(card), "_stripe.upper_mask = x%016llx\n", - card->_stripe.upper_mask); - dev_dbg(CARD_TO_DEV(card), "_stripe.target_mask = x%016llx\n", - card->_stripe.target_mask); - dev_dbg(CARD_TO_DEV(card), "_stripe.target_shift = x%016llx\n", - card->_stripe.target_shift); - - return 0; -} - -int rsxx_dma_configure(struct rsxx_cardinfo *card) -{ - u32 intr_coal; - - intr_coal = dma_intr_coal_val(card->config.data.intr_coal.mode, - card->config.data.intr_coal.count, - card->config.data.intr_coal.latency); - iowrite32(intr_coal, card->regmap + INTR_COAL); - - return rsxx_dma_stripe_setup(card, card->config.data.stripe_size); -} - -int rsxx_dma_setup(struct rsxx_cardinfo *card) -{ - unsigned long flags; - int st; - int i; - - dev_info(CARD_TO_DEV(card), - "Initializing %d DMA targets\n", - card->n_targets); - - /* Regmap is divided up into 4K chunks. One for each DMA channel */ - for (i = 0; i < card->n_targets; i++) - card->ctrl[i].regmap = card->regmap + (i * 4096); - - card->dma_fault = 0; - - /* Reset the DMA queues */ - rsxx_dma_queue_reset(card); - - /************* Setup DMA Control *************/ - for (i = 0; i < card->n_targets; i++) { - st = rsxx_dma_ctrl_init(card->dev, &card->ctrl[i]); - if (st) - goto failed_dma_setup; - - card->ctrl[i].card = card; - card->ctrl[i].id = i; - } - - card->scrub_hard = 1; - - if (card->config_valid) - rsxx_dma_configure(card); - - /* Enable the interrupts after all setup has completed. */ - for (i = 0; i < card->n_targets; i++) { - spin_lock_irqsave(&card->irq_lock, flags); - rsxx_enable_ier_and_isr(card, CR_INTR_DMA(i)); - spin_unlock_irqrestore(&card->irq_lock, flags); - } - - return 0; - -failed_dma_setup: - for (i = 0; i < card->n_targets; i++) { - struct rsxx_dma_ctrl *ctrl = &card->ctrl[i]; - - if (ctrl->issue_wq) { - destroy_workqueue(ctrl->issue_wq); - ctrl->issue_wq = NULL; - } - - if (ctrl->done_wq) { - destroy_workqueue(ctrl->done_wq); - ctrl->done_wq = NULL; - } - - vfree(ctrl->trackers); - - if (ctrl->status.buf) - dma_free_coherent(&card->dev->dev, STATUS_BUFFER_SIZE8, - ctrl->status.buf, - ctrl->status.dma_addr); - if (ctrl->cmd.buf) - dma_free_coherent(&card->dev->dev, COMMAND_BUFFER_SIZE8, - ctrl->cmd.buf, ctrl->cmd.dma_addr); - } - - return st; -} - -int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl) -{ - struct rsxx_dma *dma; - int i; - int cnt = 0; - - /* Clean up issued DMAs */ - for (i = 0; i < RSXX_MAX_OUTSTANDING_CMDS; i++) { - dma = get_tracker_dma(ctrl->trackers, i); - if (dma) { - atomic_dec(&ctrl->stats.hw_q_depth); - rsxx_complete_dma(ctrl, dma, DMA_CANCELLED); - push_tracker(ctrl->trackers, i); - cnt++; - } - } - - return cnt; -} - -void rsxx_dma_destroy(struct rsxx_cardinfo *card) -{ - struct rsxx_dma_ctrl *ctrl; - int i; - - for (i = 0; i < card->n_targets; i++) { - ctrl = &card->ctrl[i]; - - if (ctrl->issue_wq) { - destroy_workqueue(ctrl->issue_wq); - ctrl->issue_wq = NULL; - } - - if (ctrl->done_wq) { - destroy_workqueue(ctrl->done_wq); - ctrl->done_wq = NULL; - } - - if (timer_pending(&ctrl->activity_timer)) - del_timer_sync(&ctrl->activity_timer); - - /* Clean up the DMA queue */ - spin_lock_bh(&ctrl->queue_lock); - rsxx_cleanup_dma_queue(ctrl, &ctrl->queue, COMPLETE_DMA); - spin_unlock_bh(&ctrl->queue_lock); - - rsxx_dma_cancel(ctrl); - - vfree(ctrl->trackers); - - dma_free_coherent(&card->dev->dev, STATUS_BUFFER_SIZE8, - ctrl->status.buf, ctrl->status.dma_addr); - dma_free_coherent(&card->dev->dev, COMMAND_BUFFER_SIZE8, - ctrl->cmd.buf, ctrl->cmd.dma_addr); - } -} - -int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card) -{ - int i; - int j; - int cnt; - struct rsxx_dma *dma; - struct list_head *issued_dmas; - - issued_dmas = kcalloc(card->n_targets, sizeof(*issued_dmas), - GFP_KERNEL); - if (!issued_dmas) - return -ENOMEM; - - for (i = 0; i < card->n_targets; i++) { - INIT_LIST_HEAD(&issued_dmas[i]); - cnt = 0; - for (j = 0; j < RSXX_MAX_OUTSTANDING_CMDS; j++) { - dma = get_tracker_dma(card->ctrl[i].trackers, j); - if (dma == NULL) - continue; - - if (dma->cmd == HW_CMD_BLK_WRITE) - card->ctrl[i].stats.writes_issued--; - else if (dma->cmd == HW_CMD_BLK_DISCARD) - card->ctrl[i].stats.discards_issued--; - else - card->ctrl[i].stats.reads_issued--; - - if (dma->cmd != HW_CMD_BLK_DISCARD) { - dma_unmap_page(&card->dev->dev, dma->dma_addr, - get_dma_size(dma), - dma->cmd == HW_CMD_BLK_WRITE ? - DMA_TO_DEVICE : - DMA_FROM_DEVICE); - } - - list_add_tail(&dma->list, &issued_dmas[i]); - push_tracker(card->ctrl[i].trackers, j); - cnt++; - } - - spin_lock_bh(&card->ctrl[i].queue_lock); - list_splice(&issued_dmas[i], &card->ctrl[i].queue); - - atomic_sub(cnt, &card->ctrl[i].stats.hw_q_depth); - card->ctrl[i].stats.sw_q_depth += cnt; - card->ctrl[i].e_cnt = 0; - spin_unlock_bh(&card->ctrl[i].queue_lock); - } - - kfree(issued_dmas); - - return 0; -} - -int rsxx_dma_init(void) -{ - rsxx_dma_pool = KMEM_CACHE(rsxx_dma, SLAB_HWCACHE_ALIGN); - if (!rsxx_dma_pool) - return -ENOMEM; - - return 0; -} - - -void rsxx_dma_cleanup(void) -{ - kmem_cache_destroy(rsxx_dma_pool); -} - diff --git a/drivers/block/rsxx/rsxx.h b/drivers/block/rsxx/rsxx.h deleted file mode 100644 index 4f84905a6fd2..000000000000 --- a/drivers/block/rsxx/rsxx.h +++ /dev/null @@ -1,33 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -* Filename: rsxx.h -* -* Authors: Joshua Morris <josh.h.morris@us.ibm.com> -* Philip Kelleher <pjk1939@linux.vnet.ibm.com> -* -* (C) Copyright 2013 IBM Corporation -*/ - -#ifndef __RSXX_H__ -#define __RSXX_H__ - -/*----------------- IOCTL Definitions -------------------*/ - -#define RSXX_MAX_DATA 8 - -struct rsxx_reg_access { - __u32 addr; - __u32 cnt; - __u32 stat; - __u32 stream; - __u32 data[RSXX_MAX_DATA]; -}; - -#define RSXX_MAX_REG_CNT (RSXX_MAX_DATA * (sizeof(__u32))) - -#define RSXX_IOC_MAGIC 'r' - -#define RSXX_GETREG _IOWR(RSXX_IOC_MAGIC, 0x20, struct rsxx_reg_access) -#define RSXX_SETREG _IOWR(RSXX_IOC_MAGIC, 0x21, struct rsxx_reg_access) - -#endif /* __RSXX_H_ */ diff --git a/drivers/block/rsxx/rsxx_cfg.h b/drivers/block/rsxx/rsxx_cfg.h deleted file mode 100644 index 2b79015f5849..000000000000 --- a/drivers/block/rsxx/rsxx_cfg.h +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -* Filename: rsXX_cfg.h -* -* Authors: Joshua Morris <josh.h.morris@us.ibm.com> -* Philip Kelleher <pjk1939@linux.vnet.ibm.com> -* -* (C) Copyright 2013 IBM Corporation -*/ - -#ifndef __RSXX_CFG_H__ -#define __RSXX_CFG_H__ - -/* NOTE: Config values will be saved in network byte order (i.e. Big endian) */ -#include <linux/types.h> - -/* - * The card config version must match the driver's expected version. If it does - * not, the DMA interfaces will not be attached and the user will need to - * initialize/upgrade the card configuration using the card config utility. - */ -#define RSXX_CFG_VERSION 4 - -struct card_cfg_hdr { - __u32 version; - __u32 crc; -}; - -struct card_cfg_data { - __u32 block_size; - __u32 stripe_size; - __u32 vendor_id; - __u32 cache_order; - struct { - __u32 mode; /* Disabled, manual, auto-tune... */ - __u32 count; /* Number of intr to coalesce */ - __u32 latency;/* Max wait time (in ns) */ - } intr_coal; -}; - -struct rsxx_card_cfg { - struct card_cfg_hdr hdr; - struct card_cfg_data data; -}; - -/* Vendor ID Values */ -#define RSXX_VENDOR_ID_IBM 0 -#define RSXX_VENDOR_ID_DSI 1 -#define RSXX_VENDOR_COUNT 2 - -/* Interrupt Coalescing Values */ -#define RSXX_INTR_COAL_DISABLED 0 -#define RSXX_INTR_COAL_EXPLICIT 1 -#define RSXX_INTR_COAL_AUTO_TUNE 2 - - -#endif /* __RSXX_CFG_H__ */ - diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h deleted file mode 100644 index 26c320c0d924..000000000000 --- a/drivers/block/rsxx/rsxx_priv.h +++ /dev/null @@ -1,418 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -* Filename: rsxx_priv.h -* -* Authors: Joshua Morris <josh.h.morris@us.ibm.com> -* Philip Kelleher <pjk1939@linux.vnet.ibm.com> -* -* (C) Copyright 2013 IBM Corporation -*/ - -#ifndef __RSXX_PRIV_H__ -#define __RSXX_PRIV_H__ - -#include <linux/semaphore.h> - -#include <linux/fs.h> -#include <linux/interrupt.h> -#include <linux/mutex.h> -#include <linux/pci.h> -#include <linux/spinlock.h> -#include <linux/sysfs.h> -#include <linux/workqueue.h> -#include <linux/bio.h> -#include <linux/vmalloc.h> -#include <linux/timer.h> -#include <linux/ioctl.h> -#include <linux/delay.h> - -#include "rsxx.h" -#include "rsxx_cfg.h" - -struct proc_cmd; - -#define PCI_DEVICE_ID_FS70_FLASH 0x04A9 -#define PCI_DEVICE_ID_FS80_FLASH 0x04AA - -#define RS70_PCI_REV_SUPPORTED 4 - -#define DRIVER_NAME "rsxx" -#define DRIVER_VERSION "4.0.3.2516" - -/* Block size is 4096 */ -#define RSXX_HW_BLK_SHIFT 12 -#define RSXX_HW_BLK_SIZE (1 << RSXX_HW_BLK_SHIFT) -#define RSXX_HW_BLK_MASK (RSXX_HW_BLK_SIZE - 1) - -#define MAX_CREG_DATA8 32 -#define LOG_BUF_SIZE8 128 - -#define RSXX_MAX_OUTSTANDING_CMDS 255 -#define RSXX_CS_IDX_MASK 0xff - -#define STATUS_BUFFER_SIZE8 4096 -#define COMMAND_BUFFER_SIZE8 4096 - -#define RSXX_MAX_TARGETS 8 - -struct dma_tracker_list; - -/* DMA Command/Status Buffer structure */ -struct rsxx_cs_buffer { - dma_addr_t dma_addr; - void *buf; - u32 idx; -}; - -struct rsxx_dma_stats { - u32 crc_errors; - u32 hard_errors; - u32 soft_errors; - u32 writes_issued; - u32 writes_failed; - u32 reads_issued; - u32 reads_failed; - u32 reads_retried; - u32 discards_issued; - u32 discards_failed; - u32 done_rescheduled; - u32 issue_rescheduled; - u32 dma_sw_err; - u32 dma_hw_fault; - u32 dma_cancelled; - u32 sw_q_depth; /* Number of DMAs on the SW queue. */ - atomic_t hw_q_depth; /* Number of DMAs queued to HW. */ -}; - -struct rsxx_dma_ctrl { - struct rsxx_cardinfo *card; - int id; - void __iomem *regmap; - struct rsxx_cs_buffer status; - struct rsxx_cs_buffer cmd; - u16 e_cnt; - spinlock_t queue_lock; - struct list_head queue; - struct workqueue_struct *issue_wq; - struct work_struct issue_dma_work; - struct workqueue_struct *done_wq; - struct work_struct dma_done_work; - struct timer_list activity_timer; - struct dma_tracker_list *trackers; - struct rsxx_dma_stats stats; - struct mutex work_lock; -}; - -struct rsxx_cardinfo { - struct pci_dev *dev; - unsigned int halt; - unsigned int eeh_state; - - void __iomem *regmap; - spinlock_t irq_lock; - unsigned int isr_mask; - unsigned int ier_mask; - - struct rsxx_card_cfg config; - int config_valid; - - /* Embedded CPU Communication */ - struct { - spinlock_t lock; - bool active; - struct creg_cmd *active_cmd; - struct workqueue_struct *creg_wq; - struct work_struct done_work; - struct list_head queue; - unsigned int q_depth; - /* Cache the creg status to prevent ioreads */ - struct { - u32 stat; - u32 failed_cancel_timer; - u32 creg_timeout; - } creg_stats; - struct timer_list cmd_timer; - struct mutex reset_lock; - int reset; - } creg_ctrl; - - struct { - char tmp[MAX_CREG_DATA8]; - char buf[LOG_BUF_SIZE8]; /* terminated */ - int buf_len; - } log; - - struct workqueue_struct *event_wq; - struct work_struct event_work; - unsigned int state; - u64 size8; - - /* Lock the device attach/detach function */ - struct mutex dev_lock; - - /* Block Device Variables */ - bool bdev_attached; - int disk_id; - int major; - struct gendisk *gendisk; - struct { - /* Used to convert a byte address to a device address. */ - u64 lower_mask; - u64 upper_shift; - u64 upper_mask; - u64 target_mask; - u64 target_shift; - } _stripe; - unsigned int dma_fault; - - int scrub_hard; - - int n_targets; - struct rsxx_dma_ctrl *ctrl; - - struct dentry *debugfs_dir; -}; - -enum rsxx_pci_regmap { - HWID = 0x00, /* Hardware Identification Register */ - SCRATCH = 0x04, /* Scratch/Debug Register */ - RESET = 0x08, /* Reset Register */ - ISR = 0x10, /* Interrupt Status Register */ - IER = 0x14, /* Interrupt Enable Register */ - IPR = 0x18, /* Interrupt Poll Register */ - CB_ADD_LO = 0x20, /* Command Host Buffer Address [31:0] */ - CB_ADD_HI = 0x24, /* Command Host Buffer Address [63:32]*/ - HW_CMD_IDX = 0x28, /* Hardware Processed Command Index */ - SW_CMD_IDX = 0x2C, /* Software Processed Command Index */ - SB_ADD_LO = 0x30, /* Status Host Buffer Address [31:0] */ - SB_ADD_HI = 0x34, /* Status Host Buffer Address [63:32] */ - HW_STATUS_CNT = 0x38, /* Hardware Status Counter */ - SW_STATUS_CNT = 0x3C, /* Deprecated */ - CREG_CMD = 0x40, /* CPU Command Register */ - CREG_ADD = 0x44, /* CPU Address Register */ - CREG_CNT = 0x48, /* CPU Count Register */ - CREG_STAT = 0x4C, /* CPU Status Register */ - CREG_DATA0 = 0x50, /* CPU Data Registers */ - CREG_DATA1 = 0x54, - CREG_DATA2 = 0x58, - CREG_DATA3 = 0x5C, - CREG_DATA4 = 0x60, - CREG_DATA5 = 0x64, - CREG_DATA6 = 0x68, - CREG_DATA7 = 0x6c, - INTR_COAL = 0x70, /* Interrupt Coalescing Register */ - HW_ERROR = 0x74, /* Card Error Register */ - PCI_DEBUG0 = 0x78, /* PCI Debug Registers */ - PCI_DEBUG1 = 0x7C, - PCI_DEBUG2 = 0x80, - PCI_DEBUG3 = 0x84, - PCI_DEBUG4 = 0x88, - PCI_DEBUG5 = 0x8C, - PCI_DEBUG6 = 0x90, - PCI_DEBUG7 = 0x94, - PCI_POWER_THROTTLE = 0x98, - PERF_CTRL = 0x9c, - PERF_TIMER_LO = 0xa0, - PERF_TIMER_HI = 0xa4, - PERF_RD512_LO = 0xa8, - PERF_RD512_HI = 0xac, - PERF_WR512_LO = 0xb0, - PERF_WR512_HI = 0xb4, - PCI_RECONFIG = 0xb8, -}; - -enum rsxx_intr { - CR_INTR_DMA0 = 0x00000001, - CR_INTR_CREG = 0x00000002, - CR_INTR_DMA1 = 0x00000004, - CR_INTR_EVENT = 0x00000008, - CR_INTR_DMA2 = 0x00000010, - CR_INTR_DMA3 = 0x00000020, - CR_INTR_DMA4 = 0x00000040, - CR_INTR_DMA5 = 0x00000080, - CR_INTR_DMA6 = 0x00000100, - CR_INTR_DMA7 = 0x00000200, - CR_INTR_ALL_C = 0x0000003f, - CR_INTR_ALL_G = 0x000003ff, - CR_INTR_DMA_ALL = 0x000003f5, - CR_INTR_ALL = 0xffffffff, -}; - -static inline int CR_INTR_DMA(int N) -{ - static const unsigned int _CR_INTR_DMA[] = { - CR_INTR_DMA0, CR_INTR_DMA1, CR_INTR_DMA2, CR_INTR_DMA3, - CR_INTR_DMA4, CR_INTR_DMA5, CR_INTR_DMA6, CR_INTR_DMA7 - }; - return _CR_INTR_DMA[N]; -} -enum rsxx_pci_reset { - DMA_QUEUE_RESET = 0x00000001, -}; - -enum rsxx_hw_fifo_flush { - RSXX_FLUSH_BUSY = 0x00000002, - RSXX_FLUSH_TIMEOUT = 0x00000004, -}; - -enum rsxx_pci_revision { - RSXX_DISCARD_SUPPORT = 2, - RSXX_EEH_SUPPORT = 3, -}; - -enum rsxx_creg_cmd { - CREG_CMD_TAG_MASK = 0x0000FF00, - CREG_OP_WRITE = 0x000000C0, - CREG_OP_READ = 0x000000E0, -}; - -enum rsxx_creg_addr { - CREG_ADD_CARD_CMD = 0x80001000, - CREG_ADD_CARD_STATE = 0x80001004, - CREG_ADD_CARD_SIZE = 0x8000100c, - CREG_ADD_CAPABILITIES = 0x80001050, - CREG_ADD_LOG = 0x80002000, - CREG_ADD_NUM_TARGETS = 0x80003000, - CREG_ADD_CRAM = 0xA0000000, - CREG_ADD_CONFIG = 0xB0000000, -}; - -enum rsxx_creg_card_cmd { - CARD_CMD_STARTUP = 1, - CARD_CMD_SHUTDOWN = 2, - CARD_CMD_LOW_LEVEL_FORMAT = 3, - CARD_CMD_FPGA_RECONFIG_BR = 4, - CARD_CMD_FPGA_RECONFIG_MAIN = 5, - CARD_CMD_BACKUP = 6, - CARD_CMD_RESET = 7, - CARD_CMD_deprecated = 8, - CARD_CMD_UNINITIALIZE = 9, - CARD_CMD_DSTROY_EMERGENCY = 10, - CARD_CMD_DSTROY_NORMAL = 11, - CARD_CMD_DSTROY_EXTENDED = 12, - CARD_CMD_DSTROY_ABORT = 13, -}; - -enum rsxx_card_state { - CARD_STATE_SHUTDOWN = 0x00000001, - CARD_STATE_STARTING = 0x00000002, - CARD_STATE_FORMATTING = 0x00000004, - CARD_STATE_UNINITIALIZED = 0x00000008, - CARD_STATE_GOOD = 0x00000010, - CARD_STATE_SHUTTING_DOWN = 0x00000020, - CARD_STATE_FAULT = 0x00000040, - CARD_STATE_RD_ONLY_FAULT = 0x00000080, - CARD_STATE_DSTROYING = 0x00000100, -}; - -enum rsxx_led { - LED_DEFAULT = 0x0, - LED_IDENTIFY = 0x1, - LED_SOAK = 0x2, -}; - -enum rsxx_creg_flash_lock { - CREG_FLASH_LOCK = 1, - CREG_FLASH_UNLOCK = 2, -}; - -enum rsxx_card_capabilities { - CARD_CAP_SUBPAGE_WRITES = 0x00000080, -}; - -enum rsxx_creg_stat { - CREG_STAT_STATUS_MASK = 0x00000003, - CREG_STAT_SUCCESS = 0x1, - CREG_STAT_ERROR = 0x2, - CREG_STAT_CHAR_PENDING = 0x00000004, /* Character I/O pending bit */ - CREG_STAT_LOG_PENDING = 0x00000008, /* HW log message pending bit */ - CREG_STAT_TAG_MASK = 0x0000ff00, -}; - -enum rsxx_dma_finish { - FREE_DMA = 0x0, - COMPLETE_DMA = 0x1, -}; - -static inline unsigned int CREG_DATA(int N) -{ - return CREG_DATA0 + (N << 2); -} - -/*----------------- Convenient Log Wrappers -------------------*/ -#define CARD_TO_DEV(__CARD) (&(__CARD)->dev->dev) - -/***** config.c *****/ -int rsxx_load_config(struct rsxx_cardinfo *card); - -/***** core.c *****/ -void rsxx_enable_ier(struct rsxx_cardinfo *card, unsigned int intr); -void rsxx_disable_ier(struct rsxx_cardinfo *card, unsigned int intr); -void rsxx_enable_ier_and_isr(struct rsxx_cardinfo *card, - unsigned int intr); -void rsxx_disable_ier_and_isr(struct rsxx_cardinfo *card, - unsigned int intr); - -/***** dev.c *****/ -int rsxx_attach_dev(struct rsxx_cardinfo *card); -void rsxx_detach_dev(struct rsxx_cardinfo *card); -int rsxx_setup_dev(struct rsxx_cardinfo *card); -void rsxx_destroy_dev(struct rsxx_cardinfo *card); -int rsxx_dev_init(void); -void rsxx_dev_cleanup(void); - -/***** dma.c ****/ -typedef void (*rsxx_dma_cb)(struct rsxx_cardinfo *card, - void *cb_data, - unsigned int status); -int rsxx_dma_setup(struct rsxx_cardinfo *card); -void rsxx_dma_destroy(struct rsxx_cardinfo *card); -int rsxx_dma_init(void); -int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, - struct list_head *q, - unsigned int done); -int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl); -void rsxx_dma_cleanup(void); -void rsxx_dma_queue_reset(struct rsxx_cardinfo *card); -int rsxx_dma_configure(struct rsxx_cardinfo *card); -blk_status_t rsxx_dma_queue_bio(struct rsxx_cardinfo *card, - struct bio *bio, - atomic_t *n_dmas, - rsxx_dma_cb cb, - void *cb_data); -int rsxx_hw_buffers_init(struct pci_dev *dev, struct rsxx_dma_ctrl *ctrl); -int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card); -int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card); - -/***** cregs.c *****/ -int rsxx_creg_write(struct rsxx_cardinfo *card, u32 addr, - unsigned int size8, - void *data, - int byte_stream); -int rsxx_creg_read(struct rsxx_cardinfo *card, - u32 addr, - unsigned int size8, - void *data, - int byte_stream); -int rsxx_read_hw_log(struct rsxx_cardinfo *card); -int rsxx_get_card_state(struct rsxx_cardinfo *card, - unsigned int *state); -int rsxx_get_card_size8(struct rsxx_cardinfo *card, u64 *size8); -int rsxx_get_num_targets(struct rsxx_cardinfo *card, - unsigned int *n_targets); -int rsxx_get_card_capabilities(struct rsxx_cardinfo *card, - u32 *capabilities); -int rsxx_issue_card_cmd(struct rsxx_cardinfo *card, u32 cmd); -int rsxx_creg_setup(struct rsxx_cardinfo *card); -void rsxx_creg_destroy(struct rsxx_cardinfo *card); -int rsxx_creg_init(void); -void rsxx_creg_cleanup(void); -int rsxx_reg_access(struct rsxx_cardinfo *card, - struct rsxx_reg_access __user *ucmd, - int read); -void rsxx_eeh_save_issued_creg(struct rsxx_cardinfo *card); -void rsxx_kick_creg_queue(struct rsxx_cardinfo *card); - - - -#endif /* __DRIVERS_BLOCK_RSXX_H__ */ diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 6f45a53f7cbf..fb855da971ee 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -9,7 +9,6 @@ #include <linux/types.h> #include <linux/blk-mq.h> #include <linux/hdreg.h> -#include <linux/genhd.h> #include <linux/cdrom.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -143,8 +142,8 @@ static int vdc_getgeo(struct block_device *bdev, struct hd_geometry *geo) static int vdc_ioctl(struct block_device *bdev, fmode_t mode, unsigned command, unsigned long argument) { + struct vdc_port *port = bdev->bd_disk->private_data; int i; - struct gendisk *disk; switch (command) { case CDROMMULTISESSION: @@ -155,12 +154,15 @@ static int vdc_ioctl(struct block_device *bdev, fmode_t mode, return 0; case CDROM_GET_CAPABILITY: - disk = bdev->bd_disk; - - if (bdev->bd_disk && (disk->flags & GENHD_FL_CD)) + if (!vdc_version_supported(port, 1, 1)) + return -EINVAL; + switch (port->vdisk_mtype) { + case VD_MEDIA_TYPE_CD: + case VD_MEDIA_TYPE_DVD: return 0; - return -EINVAL; - + default: + return -EINVAL; + } default: pr_debug(PFX "ioctl %08x not supported\n", command); return -EINVAL; @@ -459,7 +461,7 @@ static int __vdc_tx_trigger(struct vdc_port *port) static int __send_request(struct request *req) { - struct vdc_port *port = req->rq_disk->private_data; + struct vdc_port *port = req->q->disk->private_data; struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING]; struct scatterlist sg[MAX_RING_COOKIES]; struct vdc_req_entry *rqe; @@ -854,14 +856,12 @@ static int probe_disk(struct vdc_port *port) switch (port->vdisk_mtype) { case VD_MEDIA_TYPE_CD: pr_info(PFX "Virtual CDROM %s\n", port->disk_name); - g->flags |= GENHD_FL_CD; g->flags |= GENHD_FL_REMOVABLE; set_disk_ro(g, 1); break; case VD_MEDIA_TYPE_DVD: pr_info(PFX "Virtual DVD %s\n", port->disk_name); - g->flags |= GENHD_FL_CD; g->flags |= GENHD_FL_REMOVABLE; set_disk_ro(g, 1); break; @@ -886,7 +886,7 @@ static int probe_disk(struct vdc_port *port) return 0; out_cleanup_disk: - blk_cleanup_disk(g); + put_disk(g); out_free_tag: blk_mq_free_tag_set(&port->tag_set); return err; @@ -1070,7 +1070,7 @@ static void vdc_port_remove(struct vio_dev *vdev) del_timer_sync(&port->vio.timer); del_gendisk(port->disk); - blk_cleanup_disk(port->disk); + put_disk(port->disk); blk_mq_free_tag_set(&port->tag_set); vdc_free_tx_ring(port); diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 821594cd1315..42b4b6828690 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -783,7 +783,7 @@ static void swim_cleanup_floppy_disk(struct floppy_state *fs) if (fs->registered) del_gendisk(fs->disk); - blk_cleanup_disk(disk); + put_disk(disk); blk_mq_free_tag_set(&fs->tag_set); } @@ -840,6 +840,7 @@ static int swim_floppy_init(struct swim_priv *swd) swd->unit[drive].disk->minors = 1; sprintf(swd->unit[drive].disk->disk_name, "fd%d", drive); swd->unit[drive].disk->fops = &floppy_fops; + swd->unit[drive].disk->flags |= GENHD_FL_NO_PART; swd->unit[drive].disk->events = DISK_EVENT_MEDIA_CHANGE; swd->unit[drive].disk->private_data = &swd->unit[drive]; set_capacity(swd->unit[drive].disk, 2880); diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index 4b91c9aa5892..da811a7da03f 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -1227,7 +1227,7 @@ static int swim3_attach(struct macio_dev *mdev, disk->fops = &floppy_fops; disk->private_data = fs; disk->events = DISK_EVENT_MEDIA_CHANGE; - disk->flags |= GENHD_FL_REMOVABLE; + disk->flags |= GENHD_FL_REMOVABLE | GENHD_FL_NO_PART; sprintf(disk->disk_name, "fd%d", floppy_count); set_capacity(disk, 2880); rc = add_disk(disk); @@ -1238,7 +1238,7 @@ static int swim3_attach(struct macio_dev *mdev, return 0; out_cleanup_disk: - blk_cleanup_disk(disk); + put_disk(disk); out_free_tag_set: blk_mq_free_tag_set(&fs->tag_set); out_unregister: diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c deleted file mode 100644 index d1676fe0da1a..000000000000 --- a/drivers/block/sx8.c +++ /dev/null @@ -1,1582 +0,0 @@ -/* - * sx8.c: Driver for Promise SATA SX8 looks-like-I2O hardware - * - * Copyright 2004-2005 Red Hat, Inc. - * - * Author/maintainer: Jeff Garzik <jgarzik@pobox.com> - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/pci.h> -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/blk-mq.h> -#include <linux/sched.h> -#include <linux/interrupt.h> -#include <linux/compiler.h> -#include <linux/workqueue.h> -#include <linux/bitops.h> -#include <linux/delay.h> -#include <linux/ktime.h> -#include <linux/hdreg.h> -#include <linux/dma-mapping.h> -#include <linux/completion.h> -#include <linux/scatterlist.h> -#include <asm/io.h> -#include <linux/uaccess.h> - -#if 0 -#define CARM_DEBUG -#define CARM_VERBOSE_DEBUG -#else -#undef CARM_DEBUG -#undef CARM_VERBOSE_DEBUG -#endif -#undef CARM_NDEBUG - -#define DRV_NAME "sx8" -#define DRV_VERSION "1.0" -#define PFX DRV_NAME ": " - -MODULE_AUTHOR("Jeff Garzik"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Promise SATA SX8 block driver"); -MODULE_VERSION(DRV_VERSION); - -/* - * SX8 hardware has a single message queue for all ATA ports. - * When this driver was written, the hardware (firmware?) would - * corrupt data eventually, if more than one request was outstanding. - * As one can imagine, having 8 ports bottlenecking on a single - * command hurts performance. - * - * Based on user reports, later versions of the hardware (firmware?) - * seem to be able to survive with more than one command queued. - * - * Therefore, we default to the safe option -- 1 command -- but - * allow the user to increase this. - * - * SX8 should be able to support up to ~60 queued commands (CARM_MAX_REQ), - * but problems seem to occur when you exceed ~30, even on newer hardware. - */ -static int max_queue = 1; -module_param(max_queue, int, 0444); -MODULE_PARM_DESC(max_queue, "Maximum number of queued commands. (min==1, max==30, safe==1)"); - - -#define NEXT_RESP(idx) ((idx + 1) % RMSG_Q_LEN) - -/* 0xf is just arbitrary, non-zero noise; this is sorta like poisoning */ -#define TAG_ENCODE(tag) (((tag) << 16) | 0xf) -#define TAG_DECODE(tag) (((tag) >> 16) & 0x1f) -#define TAG_VALID(tag) ((((tag) & 0xf) == 0xf) && (TAG_DECODE(tag) < 32)) - -/* note: prints function name for you */ -#ifdef CARM_DEBUG -#define DPRINTK(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ## args) -#ifdef CARM_VERBOSE_DEBUG -#define VPRINTK(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ## args) -#else -#define VPRINTK(fmt, args...) -#endif /* CARM_VERBOSE_DEBUG */ -#else -#define DPRINTK(fmt, args...) -#define VPRINTK(fmt, args...) -#endif /* CARM_DEBUG */ - -#ifdef CARM_NDEBUG -#define assert(expr) -#else -#define assert(expr) \ - if(unlikely(!(expr))) { \ - printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \ - #expr, __FILE__, __func__, __LINE__); \ - } -#endif - -/* defines only for the constants which don't work well as enums */ -struct carm_host; - -enum { - /* adapter-wide limits */ - CARM_MAX_PORTS = 8, - CARM_SHM_SIZE = (4096 << 7), - CARM_MINORS_PER_MAJOR = 256 / CARM_MAX_PORTS, - CARM_MAX_WAIT_Q = CARM_MAX_PORTS + 1, - - /* command message queue limits */ - CARM_MAX_REQ = 64, /* max command msgs per host */ - CARM_MSG_LOW_WATER = (CARM_MAX_REQ / 4), /* refill mark */ - - /* S/G limits, host-wide and per-request */ - CARM_MAX_REQ_SG = 32, /* max s/g entries per request */ - CARM_MAX_HOST_SG = 600, /* max s/g entries per host */ - CARM_SG_LOW_WATER = (CARM_MAX_HOST_SG / 4), /* re-fill mark */ - - /* hardware registers */ - CARM_IHQP = 0x1c, - CARM_INT_STAT = 0x10, /* interrupt status */ - CARM_INT_MASK = 0x14, /* interrupt mask */ - CARM_HMUC = 0x18, /* host message unit control */ - RBUF_ADDR_LO = 0x20, /* response msg DMA buf low 32 bits */ - RBUF_ADDR_HI = 0x24, /* response msg DMA buf high 32 bits */ - RBUF_BYTE_SZ = 0x28, - CARM_RESP_IDX = 0x2c, - CARM_CMS0 = 0x30, /* command message size reg 0 */ - CARM_LMUC = 0x48, - CARM_HMPHA = 0x6c, - CARM_INITC = 0xb5, - - /* bits in CARM_INT_{STAT,MASK} */ - INT_RESERVED = 0xfffffff0, - INT_WATCHDOG = (1 << 3), /* watchdog timer */ - INT_Q_OVERFLOW = (1 << 2), /* cmd msg q overflow */ - INT_Q_AVAILABLE = (1 << 1), /* cmd msg q has free space */ - INT_RESPONSE = (1 << 0), /* response msg available */ - INT_ACK_MASK = INT_WATCHDOG | INT_Q_OVERFLOW, - INT_DEF_MASK = INT_RESERVED | INT_Q_OVERFLOW | - INT_RESPONSE, - - /* command messages, and related register bits */ - CARM_HAVE_RESP = 0x01, - CARM_MSG_READ = 1, - CARM_MSG_WRITE = 2, - CARM_MSG_VERIFY = 3, - CARM_MSG_GET_CAPACITY = 4, - CARM_MSG_FLUSH = 5, - CARM_MSG_IOCTL = 6, - CARM_MSG_ARRAY = 8, - CARM_MSG_MISC = 9, - CARM_CME = (1 << 2), - CARM_RME = (1 << 1), - CARM_WZBC = (1 << 0), - CARM_RMI = (1 << 0), - CARM_Q_FULL = (1 << 3), - CARM_MSG_SIZE = 288, - CARM_Q_LEN = 48, - - /* CARM_MSG_IOCTL messages */ - CARM_IOC_SCAN_CHAN = 5, /* scan channels for devices */ - CARM_IOC_GET_TCQ = 13, /* get tcq/ncq depth */ - CARM_IOC_SET_TCQ = 14, /* set tcq/ncq depth */ - - IOC_SCAN_CHAN_NODEV = 0x1f, - IOC_SCAN_CHAN_OFFSET = 0x40, - - /* CARM_MSG_ARRAY messages */ - CARM_ARRAY_INFO = 0, - - ARRAY_NO_EXIST = (1 << 31), - - /* response messages */ - RMSG_SZ = 8, /* sizeof(struct carm_response) */ - RMSG_Q_LEN = 48, /* resp. msg list length */ - RMSG_OK = 1, /* bit indicating msg was successful */ - /* length of entire resp. msg buffer */ - RBUF_LEN = RMSG_SZ * RMSG_Q_LEN, - - PDC_SHM_SIZE = (4096 << 7), /* length of entire h/w buffer */ - - /* CARM_MSG_MISC messages */ - MISC_GET_FW_VER = 2, - MISC_ALLOC_MEM = 3, - MISC_SET_TIME = 5, - - /* MISC_GET_FW_VER feature bits */ - FW_VER_4PORT = (1 << 2), /* 1=4 ports, 0=8 ports */ - FW_VER_NON_RAID = (1 << 1), /* 1=non-RAID firmware, 0=RAID */ - FW_VER_ZCR = (1 << 0), /* zero channel RAID (whatever that is) */ - - /* carm_host flags */ - FL_NON_RAID = FW_VER_NON_RAID, - FL_4PORT = FW_VER_4PORT, - FL_FW_VER_MASK = (FW_VER_NON_RAID | FW_VER_4PORT), - FL_DYN_MAJOR = (1 << 17), -}; - -enum { - CARM_SG_BOUNDARY = 0xffffUL, /* s/g segment boundary */ -}; - -enum scatter_gather_types { - SGT_32BIT = 0, - SGT_64BIT = 1, -}; - -enum host_states { - HST_INVALID, /* invalid state; never used */ - HST_ALLOC_BUF, /* setting up master SHM area */ - HST_ERROR, /* we never leave here */ - HST_PORT_SCAN, /* start dev scan */ - HST_DEV_SCAN_START, /* start per-device probe */ - HST_DEV_SCAN, /* continue per-device probe */ - HST_DEV_ACTIVATE, /* activate devices we found */ - HST_PROBE_FINISHED, /* probe is complete */ - HST_PROBE_START, /* initiate probe */ - HST_SYNC_TIME, /* tell firmware what time it is */ - HST_GET_FW_VER, /* get firmware version, adapter port cnt */ -}; - -#ifdef CARM_DEBUG -static const char *state_name[] = { - "HST_INVALID", - "HST_ALLOC_BUF", - "HST_ERROR", - "HST_PORT_SCAN", - "HST_DEV_SCAN_START", - "HST_DEV_SCAN", - "HST_DEV_ACTIVATE", - "HST_PROBE_FINISHED", - "HST_PROBE_START", - "HST_SYNC_TIME", - "HST_GET_FW_VER", -}; -#endif - -struct carm_port { - unsigned int port_no; - struct gendisk *disk; - struct carm_host *host; - - /* attached device characteristics */ - u64 capacity; - char name[41]; - u16 dev_geom_head; - u16 dev_geom_sect; - u16 dev_geom_cyl; -}; - -struct carm_request { - int n_elem; - unsigned int msg_type; - unsigned int msg_subtype; - unsigned int msg_bucket; - struct scatterlist sg[CARM_MAX_REQ_SG]; -}; - -struct carm_host { - unsigned long flags; - void __iomem *mmio; - void *shm; - dma_addr_t shm_dma; - - int major; - int id; - char name[32]; - - spinlock_t lock; - struct pci_dev *pdev; - unsigned int state; - u32 fw_ver; - - struct blk_mq_tag_set tag_set; - struct request_queue *oob_q; - unsigned int n_oob; - - unsigned int hw_sg_used; - - unsigned int resp_idx; - - unsigned int wait_q_prod; - unsigned int wait_q_cons; - struct request_queue *wait_q[CARM_MAX_WAIT_Q]; - - void *msg_base; - dma_addr_t msg_dma; - - int cur_scan_dev; - unsigned long dev_active; - unsigned long dev_present; - struct carm_port port[CARM_MAX_PORTS]; - - struct work_struct fsm_task; - - int probe_err; - struct completion probe_comp; -}; - -struct carm_response { - __le32 ret_handle; - __le32 status; -} __attribute__((packed)); - -struct carm_msg_sg { - __le32 start; - __le32 len; -} __attribute__((packed)); - -struct carm_msg_rw { - u8 type; - u8 id; - u8 sg_count; - u8 sg_type; - __le32 handle; - __le32 lba; - __le16 lba_count; - __le16 lba_high; - struct carm_msg_sg sg[32]; -} __attribute__((packed)); - -struct carm_msg_allocbuf { - u8 type; - u8 subtype; - u8 n_sg; - u8 sg_type; - __le32 handle; - __le32 addr; - __le32 len; - __le32 evt_pool; - __le32 n_evt; - __le32 rbuf_pool; - __le32 n_rbuf; - __le32 msg_pool; - __le32 n_msg; - struct carm_msg_sg sg[8]; -} __attribute__((packed)); - -struct carm_msg_ioctl { - u8 type; - u8 subtype; - u8 array_id; - u8 reserved1; - __le32 handle; - __le32 data_addr; - u32 reserved2; -} __attribute__((packed)); - -struct carm_msg_sync_time { - u8 type; - u8 subtype; - u16 reserved1; - __le32 handle; - u32 reserved2; - __le32 timestamp; -} __attribute__((packed)); - -struct carm_msg_get_fw_ver { - u8 type; - u8 subtype; - u16 reserved1; - __le32 handle; - __le32 data_addr; - u32 reserved2; -} __attribute__((packed)); - -struct carm_fw_ver { - __le32 version; - u8 features; - u8 reserved1; - u16 reserved2; -} __attribute__((packed)); - -struct carm_array_info { - __le32 size; - - __le16 size_hi; - __le16 stripe_size; - - __le32 mode; - - __le16 stripe_blk_sz; - __le16 reserved1; - - __le16 cyl; - __le16 head; - - __le16 sect; - u8 array_id; - u8 reserved2; - - char name[40]; - - __le32 array_status; - - /* device list continues beyond this point? */ -} __attribute__((packed)); - -static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent); -static void carm_remove_one (struct pci_dev *pdev); -static int carm_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo); - -static const struct pci_device_id carm_pci_tbl[] = { - { PCI_VENDOR_ID_PROMISE, 0x8000, PCI_ANY_ID, PCI_ANY_ID, 0, 0, }, - { PCI_VENDOR_ID_PROMISE, 0x8002, PCI_ANY_ID, PCI_ANY_ID, 0, 0, }, - { } /* terminate list */ -}; -MODULE_DEVICE_TABLE(pci, carm_pci_tbl); - -static struct pci_driver carm_driver = { - .name = DRV_NAME, - .id_table = carm_pci_tbl, - .probe = carm_init_one, - .remove = carm_remove_one, -}; - -static const struct block_device_operations carm_bd_ops = { - .owner = THIS_MODULE, - .getgeo = carm_bdev_getgeo, -}; - -static unsigned int carm_host_id; -static unsigned long carm_major_alloc; - - - -static int carm_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo) -{ - struct carm_port *port = bdev->bd_disk->private_data; - - geo->heads = (u8) port->dev_geom_head; - geo->sectors = (u8) port->dev_geom_sect; - geo->cylinders = port->dev_geom_cyl; - return 0; -} - -static const u32 msg_sizes[] = { 32, 64, 128, CARM_MSG_SIZE }; - -static inline int carm_lookup_bucket(u32 msg_size) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(msg_sizes); i++) - if (msg_size <= msg_sizes[i]) - return i; - - return -ENOENT; -} - -static void carm_init_buckets(void __iomem *mmio) -{ - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(msg_sizes); i++) - writel(msg_sizes[i], mmio + CARM_CMS0 + (4 * i)); -} - -static inline void *carm_ref_msg(struct carm_host *host, - unsigned int msg_idx) -{ - return host->msg_base + (msg_idx * CARM_MSG_SIZE); -} - -static inline dma_addr_t carm_ref_msg_dma(struct carm_host *host, - unsigned int msg_idx) -{ - return host->msg_dma + (msg_idx * CARM_MSG_SIZE); -} - -static int carm_send_msg(struct carm_host *host, - struct carm_request *crq, unsigned tag) -{ - void __iomem *mmio = host->mmio; - u32 msg = (u32) carm_ref_msg_dma(host, tag); - u32 cm_bucket = crq->msg_bucket; - u32 tmp; - int rc = 0; - - VPRINTK("ENTER\n"); - - tmp = readl(mmio + CARM_HMUC); - if (tmp & CARM_Q_FULL) { -#if 0 - tmp = readl(mmio + CARM_INT_MASK); - tmp |= INT_Q_AVAILABLE; - writel(tmp, mmio + CARM_INT_MASK); - readl(mmio + CARM_INT_MASK); /* flush */ -#endif - DPRINTK("host msg queue full\n"); - rc = -EBUSY; - } else { - writel(msg | (cm_bucket << 1), mmio + CARM_IHQP); - readl(mmio + CARM_IHQP); /* flush */ - } - - return rc; -} - -static int carm_array_info (struct carm_host *host, unsigned int array_idx) -{ - struct carm_msg_ioctl *ioc; - u32 msg_data; - dma_addr_t msg_dma; - struct carm_request *crq; - struct request *rq; - int rc; - - rq = blk_mq_alloc_request(host->oob_q, REQ_OP_DRV_OUT, 0); - if (IS_ERR(rq)) { - rc = -ENOMEM; - goto err_out; - } - crq = blk_mq_rq_to_pdu(rq); - - ioc = carm_ref_msg(host, rq->tag); - msg_dma = carm_ref_msg_dma(host, rq->tag); - msg_data = (u32) (msg_dma + sizeof(struct carm_array_info)); - - crq->msg_type = CARM_MSG_ARRAY; - crq->msg_subtype = CARM_ARRAY_INFO; - rc = carm_lookup_bucket(sizeof(struct carm_msg_ioctl) + - sizeof(struct carm_array_info)); - BUG_ON(rc < 0); - crq->msg_bucket = (u32) rc; - - memset(ioc, 0, sizeof(*ioc)); - ioc->type = CARM_MSG_ARRAY; - ioc->subtype = CARM_ARRAY_INFO; - ioc->array_id = (u8) array_idx; - ioc->handle = cpu_to_le32(TAG_ENCODE(rq->tag)); - ioc->data_addr = cpu_to_le32(msg_data); - - spin_lock_irq(&host->lock); - assert(host->state == HST_DEV_SCAN_START || - host->state == HST_DEV_SCAN); - spin_unlock_irq(&host->lock); - - DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag); - blk_execute_rq_nowait(NULL, rq, true, NULL); - - return 0; - -err_out: - spin_lock_irq(&host->lock); - host->state = HST_ERROR; - spin_unlock_irq(&host->lock); - return rc; -} - -typedef unsigned int (*carm_sspc_t)(struct carm_host *, unsigned int, void *); - -static int carm_send_special (struct carm_host *host, carm_sspc_t func) -{ - struct request *rq; - struct carm_request *crq; - struct carm_msg_ioctl *ioc; - void *mem; - unsigned int msg_size; - int rc; - - rq = blk_mq_alloc_request(host->oob_q, REQ_OP_DRV_OUT, 0); - if (IS_ERR(rq)) - return -ENOMEM; - crq = blk_mq_rq_to_pdu(rq); - - mem = carm_ref_msg(host, rq->tag); - - msg_size = func(host, rq->tag, mem); - - ioc = mem; - crq->msg_type = ioc->type; - crq->msg_subtype = ioc->subtype; - rc = carm_lookup_bucket(msg_size); - BUG_ON(rc < 0); - crq->msg_bucket = (u32) rc; - - DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag); - blk_execute_rq_nowait(NULL, rq, true, NULL); - - return 0; -} - -static unsigned int carm_fill_sync_time(struct carm_host *host, - unsigned int idx, void *mem) -{ - struct carm_msg_sync_time *st = mem; - - time64_t tv = ktime_get_real_seconds(); - - memset(st, 0, sizeof(*st)); - st->type = CARM_MSG_MISC; - st->subtype = MISC_SET_TIME; - st->handle = cpu_to_le32(TAG_ENCODE(idx)); - st->timestamp = cpu_to_le32(tv); - - return sizeof(struct carm_msg_sync_time); -} - -static unsigned int carm_fill_alloc_buf(struct carm_host *host, - unsigned int idx, void *mem) -{ - struct carm_msg_allocbuf *ab = mem; - - memset(ab, 0, sizeof(*ab)); - ab->type = CARM_MSG_MISC; - ab->subtype = MISC_ALLOC_MEM; - ab->handle = cpu_to_le32(TAG_ENCODE(idx)); - ab->n_sg = 1; - ab->sg_type = SGT_32BIT; - ab->addr = cpu_to_le32(host->shm_dma + (PDC_SHM_SIZE >> 1)); - ab->len = cpu_to_le32(PDC_SHM_SIZE >> 1); - ab->evt_pool = cpu_to_le32(host->shm_dma + (16 * 1024)); - ab->n_evt = cpu_to_le32(1024); - ab->rbuf_pool = cpu_to_le32(host->shm_dma); - ab->n_rbuf = cpu_to_le32(RMSG_Q_LEN); - ab->msg_pool = cpu_to_le32(host->shm_dma + RBUF_LEN); - ab->n_msg = cpu_to_le32(CARM_Q_LEN); - ab->sg[0].start = cpu_to_le32(host->shm_dma + (PDC_SHM_SIZE >> 1)); - ab->sg[0].len = cpu_to_le32(65536); - - return sizeof(struct carm_msg_allocbuf); -} - -static unsigned int carm_fill_scan_channels(struct carm_host *host, - unsigned int idx, void *mem) -{ - struct carm_msg_ioctl *ioc = mem; - u32 msg_data = (u32) (carm_ref_msg_dma(host, idx) + - IOC_SCAN_CHAN_OFFSET); - - memset(ioc, 0, sizeof(*ioc)); - ioc->type = CARM_MSG_IOCTL; - ioc->subtype = CARM_IOC_SCAN_CHAN; - ioc->handle = cpu_to_le32(TAG_ENCODE(idx)); - ioc->data_addr = cpu_to_le32(msg_data); - - /* fill output data area with "no device" default values */ - mem += IOC_SCAN_CHAN_OFFSET; - memset(mem, IOC_SCAN_CHAN_NODEV, CARM_MAX_PORTS); - - return IOC_SCAN_CHAN_OFFSET + CARM_MAX_PORTS; -} - -static unsigned int carm_fill_get_fw_ver(struct carm_host *host, - unsigned int idx, void *mem) -{ - struct carm_msg_get_fw_ver *ioc = mem; - u32 msg_data = (u32) (carm_ref_msg_dma(host, idx) + sizeof(*ioc)); - - memset(ioc, 0, sizeof(*ioc)); - ioc->type = CARM_MSG_MISC; - ioc->subtype = MISC_GET_FW_VER; - ioc->handle = cpu_to_le32(TAG_ENCODE(idx)); - ioc->data_addr = cpu_to_le32(msg_data); - - return sizeof(struct carm_msg_get_fw_ver) + - sizeof(struct carm_fw_ver); -} - -static inline void carm_push_q (struct carm_host *host, struct request_queue *q) -{ - unsigned int idx = host->wait_q_prod % CARM_MAX_WAIT_Q; - - blk_mq_stop_hw_queues(q); - VPRINTK("STOPPED QUEUE %p\n", q); - - host->wait_q[idx] = q; - host->wait_q_prod++; - BUG_ON(host->wait_q_prod == host->wait_q_cons); /* overrun */ -} - -static inline struct request_queue *carm_pop_q(struct carm_host *host) -{ - unsigned int idx; - - if (host->wait_q_prod == host->wait_q_cons) - return NULL; - - idx = host->wait_q_cons % CARM_MAX_WAIT_Q; - host->wait_q_cons++; - - return host->wait_q[idx]; -} - -static inline void carm_round_robin(struct carm_host *host) -{ - struct request_queue *q = carm_pop_q(host); - if (q) { - blk_mq_start_hw_queues(q); - VPRINTK("STARTED QUEUE %p\n", q); - } -} - -static inline enum dma_data_direction carm_rq_dir(struct request *rq) -{ - return op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; -} - -static blk_status_t carm_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *bd) -{ - struct request_queue *q = hctx->queue; - struct request *rq = bd->rq; - struct carm_port *port = q->queuedata; - struct carm_host *host = port->host; - struct carm_request *crq = blk_mq_rq_to_pdu(rq); - struct carm_msg_rw *msg; - struct scatterlist *sg; - int i, n_elem = 0, rc; - unsigned int msg_size; - u32 tmp; - - crq->n_elem = 0; - sg_init_table(crq->sg, CARM_MAX_REQ_SG); - - blk_mq_start_request(rq); - - spin_lock_irq(&host->lock); - if (req_op(rq) == REQ_OP_DRV_OUT) - goto send_msg; - - /* get scatterlist from block layer */ - sg = &crq->sg[0]; - n_elem = blk_rq_map_sg(q, rq, sg); - if (n_elem <= 0) - goto out_ioerr; - - /* map scatterlist to PCI bus addresses */ - n_elem = dma_map_sg(&host->pdev->dev, sg, n_elem, carm_rq_dir(rq)); - if (n_elem <= 0) - goto out_ioerr; - - /* obey global hardware limit on S/G entries */ - if (host->hw_sg_used >= CARM_MAX_HOST_SG - n_elem) - goto out_resource; - - crq->n_elem = n_elem; - host->hw_sg_used += n_elem; - - /* - * build read/write message - */ - - VPRINTK("build msg\n"); - msg = (struct carm_msg_rw *) carm_ref_msg(host, rq->tag); - - if (rq_data_dir(rq) == WRITE) { - msg->type = CARM_MSG_WRITE; - crq->msg_type = CARM_MSG_WRITE; - } else { - msg->type = CARM_MSG_READ; - crq->msg_type = CARM_MSG_READ; - } - - msg->id = port->port_no; - msg->sg_count = n_elem; - msg->sg_type = SGT_32BIT; - msg->handle = cpu_to_le32(TAG_ENCODE(rq->tag)); - msg->lba = cpu_to_le32(blk_rq_pos(rq) & 0xffffffff); - tmp = (blk_rq_pos(rq) >> 16) >> 16; - msg->lba_high = cpu_to_le16( (u16) tmp ); - msg->lba_count = cpu_to_le16(blk_rq_sectors(rq)); - - msg_size = sizeof(struct carm_msg_rw) - sizeof(msg->sg); - for (i = 0; i < n_elem; i++) { - struct carm_msg_sg *carm_sg = &msg->sg[i]; - carm_sg->start = cpu_to_le32(sg_dma_address(&crq->sg[i])); - carm_sg->len = cpu_to_le32(sg_dma_len(&crq->sg[i])); - msg_size += sizeof(struct carm_msg_sg); - } - - rc = carm_lookup_bucket(msg_size); - BUG_ON(rc < 0); - crq->msg_bucket = (u32) rc; -send_msg: - /* - * queue read/write message to hardware - */ - VPRINTK("send msg, tag == %u\n", rq->tag); - rc = carm_send_msg(host, crq, rq->tag); - if (rc) { - host->hw_sg_used -= n_elem; - goto out_resource; - } - - spin_unlock_irq(&host->lock); - return BLK_STS_OK; -out_resource: - dma_unmap_sg(&host->pdev->dev, &crq->sg[0], n_elem, carm_rq_dir(rq)); - carm_push_q(host, q); - spin_unlock_irq(&host->lock); - return BLK_STS_DEV_RESOURCE; -out_ioerr: - carm_round_robin(host); - spin_unlock_irq(&host->lock); - return BLK_STS_IOERR; -} - -static void carm_handle_array_info(struct carm_host *host, - struct carm_request *crq, u8 *mem, - blk_status_t error) -{ - struct carm_port *port; - u8 *msg_data = mem + sizeof(struct carm_array_info); - struct carm_array_info *desc = (struct carm_array_info *) msg_data; - u64 lo, hi; - int cur_port; - size_t slen; - - DPRINTK("ENTER\n"); - - if (error) - goto out; - if (le32_to_cpu(desc->array_status) & ARRAY_NO_EXIST) - goto out; - - cur_port = host->cur_scan_dev; - - /* should never occur */ - if ((cur_port < 0) || (cur_port >= CARM_MAX_PORTS)) { - printk(KERN_ERR PFX "BUG: cur_scan_dev==%d, array_id==%d\n", - cur_port, (int) desc->array_id); - goto out; - } - - port = &host->port[cur_port]; - - lo = (u64) le32_to_cpu(desc->size); - hi = (u64) le16_to_cpu(desc->size_hi); - - port->capacity = lo | (hi << 32); - port->dev_geom_head = le16_to_cpu(desc->head); - port->dev_geom_sect = le16_to_cpu(desc->sect); - port->dev_geom_cyl = le16_to_cpu(desc->cyl); - - host->dev_active |= (1 << cur_port); - - strncpy(port->name, desc->name, sizeof(port->name)); - port->name[sizeof(port->name) - 1] = 0; - slen = strlen(port->name); - while (slen && (port->name[slen - 1] == ' ')) { - port->name[slen - 1] = 0; - slen--; - } - - printk(KERN_INFO DRV_NAME "(%s): port %u device %Lu sectors\n", - pci_name(host->pdev), port->port_no, - (unsigned long long) port->capacity); - printk(KERN_INFO DRV_NAME "(%s): port %u device \"%s\"\n", - pci_name(host->pdev), port->port_no, port->name); - -out: - assert(host->state == HST_DEV_SCAN); - schedule_work(&host->fsm_task); -} - -static void carm_handle_scan_chan(struct carm_host *host, - struct carm_request *crq, u8 *mem, - blk_status_t error) -{ - u8 *msg_data = mem + IOC_SCAN_CHAN_OFFSET; - unsigned int i, dev_count = 0; - int new_state = HST_DEV_SCAN_START; - - DPRINTK("ENTER\n"); - - if (error) { - new_state = HST_ERROR; - goto out; - } - - /* TODO: scan and support non-disk devices */ - for (i = 0; i < 8; i++) - if (msg_data[i] == 0) { /* direct-access device (disk) */ - host->dev_present |= (1 << i); - dev_count++; - } - - printk(KERN_INFO DRV_NAME "(%s): found %u interesting devices\n", - pci_name(host->pdev), dev_count); - -out: - assert(host->state == HST_PORT_SCAN); - host->state = new_state; - schedule_work(&host->fsm_task); -} - -static void carm_handle_generic(struct carm_host *host, - struct carm_request *crq, blk_status_t error, - int cur_state, int next_state) -{ - DPRINTK("ENTER\n"); - - assert(host->state == cur_state); - if (error) - host->state = HST_ERROR; - else - host->state = next_state; - schedule_work(&host->fsm_task); -} - -static inline void carm_handle_resp(struct carm_host *host, - __le32 ret_handle_le, u32 status) -{ - u32 handle = le32_to_cpu(ret_handle_le); - unsigned int msg_idx; - struct request *rq; - struct carm_request *crq; - blk_status_t error = (status == RMSG_OK) ? 0 : BLK_STS_IOERR; - u8 *mem; - - VPRINTK("ENTER, handle == 0x%x\n", handle); - - if (unlikely(!TAG_VALID(handle))) { - printk(KERN_ERR DRV_NAME "(%s): BUG: invalid tag 0x%x\n", - pci_name(host->pdev), handle); - return; - } - - msg_idx = TAG_DECODE(handle); - VPRINTK("tag == %u\n", msg_idx); - - rq = blk_mq_tag_to_rq(host->tag_set.tags[0], msg_idx); - crq = blk_mq_rq_to_pdu(rq); - - /* fast path */ - if (likely(crq->msg_type == CARM_MSG_READ || - crq->msg_type == CARM_MSG_WRITE)) { - dma_unmap_sg(&host->pdev->dev, &crq->sg[0], crq->n_elem, - carm_rq_dir(rq)); - goto done; - } - - mem = carm_ref_msg(host, msg_idx); - - switch (crq->msg_type) { - case CARM_MSG_IOCTL: { - switch (crq->msg_subtype) { - case CARM_IOC_SCAN_CHAN: - carm_handle_scan_chan(host, crq, mem, error); - goto done; - default: - /* unknown / invalid response */ - goto err_out; - } - break; - } - - case CARM_MSG_MISC: { - switch (crq->msg_subtype) { - case MISC_ALLOC_MEM: - carm_handle_generic(host, crq, error, - HST_ALLOC_BUF, HST_SYNC_TIME); - goto done; - case MISC_SET_TIME: - carm_handle_generic(host, crq, error, - HST_SYNC_TIME, HST_GET_FW_VER); - goto done; - case MISC_GET_FW_VER: { - struct carm_fw_ver *ver = (struct carm_fw_ver *) - (mem + sizeof(struct carm_msg_get_fw_ver)); - if (!error) { - host->fw_ver = le32_to_cpu(ver->version); - host->flags |= (ver->features & FL_FW_VER_MASK); - } - carm_handle_generic(host, crq, error, - HST_GET_FW_VER, HST_PORT_SCAN); - goto done; - } - default: - /* unknown / invalid response */ - goto err_out; - } - break; - } - - case CARM_MSG_ARRAY: { - switch (crq->msg_subtype) { - case CARM_ARRAY_INFO: - carm_handle_array_info(host, crq, mem, error); - break; - default: - /* unknown / invalid response */ - goto err_out; - } - break; - } - - default: - /* unknown / invalid response */ - goto err_out; - } - - return; - -err_out: - printk(KERN_WARNING DRV_NAME "(%s): BUG: unhandled message type %d/%d\n", - pci_name(host->pdev), crq->msg_type, crq->msg_subtype); - error = BLK_STS_IOERR; -done: - host->hw_sg_used -= crq->n_elem; - blk_mq_end_request(blk_mq_rq_from_pdu(crq), error); - - if (host->hw_sg_used <= CARM_SG_LOW_WATER) - carm_round_robin(host); -} - -static inline void carm_handle_responses(struct carm_host *host) -{ - void __iomem *mmio = host->mmio; - struct carm_response *resp = (struct carm_response *) host->shm; - unsigned int work = 0; - unsigned int idx = host->resp_idx % RMSG_Q_LEN; - - while (1) { - u32 status = le32_to_cpu(resp[idx].status); - - if (status == 0xffffffff) { - VPRINTK("ending response on index %u\n", idx); - writel(idx << 3, mmio + CARM_RESP_IDX); - break; - } - - /* response to a message we sent */ - else if ((status & (1 << 31)) == 0) { - VPRINTK("handling msg response on index %u\n", idx); - carm_handle_resp(host, resp[idx].ret_handle, status); - resp[idx].status = cpu_to_le32(0xffffffff); - } - - /* asynchronous events the hardware throws our way */ - else if ((status & 0xff000000) == (1 << 31)) { - u8 *evt_type_ptr = (u8 *) &resp[idx]; - u8 evt_type = *evt_type_ptr; - printk(KERN_WARNING DRV_NAME "(%s): unhandled event type %d\n", - pci_name(host->pdev), (int) evt_type); - resp[idx].status = cpu_to_le32(0xffffffff); - } - - idx = NEXT_RESP(idx); - work++; - } - - VPRINTK("EXIT, work==%u\n", work); - host->resp_idx += work; -} - -static irqreturn_t carm_interrupt(int irq, void *__host) -{ - struct carm_host *host = __host; - void __iomem *mmio; - u32 mask; - int handled = 0; - unsigned long flags; - - if (!host) { - VPRINTK("no host\n"); - return IRQ_NONE; - } - - spin_lock_irqsave(&host->lock, flags); - - mmio = host->mmio; - - /* reading should also clear interrupts */ - mask = readl(mmio + CARM_INT_STAT); - - if (mask == 0 || mask == 0xffffffff) { - VPRINTK("no work, mask == 0x%x\n", mask); - goto out; - } - - if (mask & INT_ACK_MASK) - writel(mask, mmio + CARM_INT_STAT); - - if (unlikely(host->state == HST_INVALID)) { - VPRINTK("not initialized yet, mask = 0x%x\n", mask); - goto out; - } - - if (mask & CARM_HAVE_RESP) { - handled = 1; - carm_handle_responses(host); - } - -out: - spin_unlock_irqrestore(&host->lock, flags); - VPRINTK("EXIT\n"); - return IRQ_RETVAL(handled); -} - -static void carm_fsm_task (struct work_struct *work) -{ - struct carm_host *host = - container_of(work, struct carm_host, fsm_task); - unsigned long flags; - unsigned int state; - int rc, i, next_dev; - int reschedule = 0; - int new_state = HST_INVALID; - - spin_lock_irqsave(&host->lock, flags); - state = host->state; - spin_unlock_irqrestore(&host->lock, flags); - - DPRINTK("ENTER, state == %s\n", state_name[state]); - - switch (state) { - case HST_PROBE_START: - new_state = HST_ALLOC_BUF; - reschedule = 1; - break; - - case HST_ALLOC_BUF: - rc = carm_send_special(host, carm_fill_alloc_buf); - if (rc) { - new_state = HST_ERROR; - reschedule = 1; - } - break; - - case HST_SYNC_TIME: - rc = carm_send_special(host, carm_fill_sync_time); - if (rc) { - new_state = HST_ERROR; - reschedule = 1; - } - break; - - case HST_GET_FW_VER: - rc = carm_send_special(host, carm_fill_get_fw_ver); - if (rc) { - new_state = HST_ERROR; - reschedule = 1; - } - break; - - case HST_PORT_SCAN: - rc = carm_send_special(host, carm_fill_scan_channels); - if (rc) { - new_state = HST_ERROR; - reschedule = 1; - } - break; - - case HST_DEV_SCAN_START: - host->cur_scan_dev = -1; - new_state = HST_DEV_SCAN; - reschedule = 1; - break; - - case HST_DEV_SCAN: - next_dev = -1; - for (i = host->cur_scan_dev + 1; i < CARM_MAX_PORTS; i++) - if (host->dev_present & (1 << i)) { - next_dev = i; - break; - } - - if (next_dev >= 0) { - host->cur_scan_dev = next_dev; - rc = carm_array_info(host, next_dev); - if (rc) { - new_state = HST_ERROR; - reschedule = 1; - } - } else { - new_state = HST_DEV_ACTIVATE; - reschedule = 1; - } - break; - - case HST_DEV_ACTIVATE: { - int activated = 0; - for (i = 0; i < CARM_MAX_PORTS; i++) - if (host->dev_active & (1 << i)) { - struct carm_port *port = &host->port[i]; - struct gendisk *disk = port->disk; - - set_capacity(disk, port->capacity); - host->probe_err = add_disk(disk); - if (!host->probe_err) - activated++; - else - break; - } - - printk(KERN_INFO DRV_NAME "(%s): %d ports activated\n", - pci_name(host->pdev), activated); - - new_state = HST_PROBE_FINISHED; - reschedule = 1; - break; - } - case HST_PROBE_FINISHED: - complete(&host->probe_comp); - break; - case HST_ERROR: - /* FIXME: TODO */ - break; - - default: - /* should never occur */ - printk(KERN_ERR PFX "BUG: unknown state %d\n", state); - assert(0); - break; - } - - if (new_state != HST_INVALID) { - spin_lock_irqsave(&host->lock, flags); - host->state = new_state; - spin_unlock_irqrestore(&host->lock, flags); - } - if (reschedule) - schedule_work(&host->fsm_task); -} - -static int carm_init_wait(void __iomem *mmio, u32 bits, unsigned int test_bit) -{ - unsigned int i; - - for (i = 0; i < 50000; i++) { - u32 tmp = readl(mmio + CARM_LMUC); - udelay(100); - - if (test_bit) { - if ((tmp & bits) == bits) - return 0; - } else { - if ((tmp & bits) == 0) - return 0; - } - - cond_resched(); - } - - printk(KERN_ERR PFX "carm_init_wait timeout, bits == 0x%x, test_bit == %s\n", - bits, test_bit ? "yes" : "no"); - return -EBUSY; -} - -static void carm_init_responses(struct carm_host *host) -{ - void __iomem *mmio = host->mmio; - unsigned int i; - struct carm_response *resp = (struct carm_response *) host->shm; - - for (i = 0; i < RMSG_Q_LEN; i++) - resp[i].status = cpu_to_le32(0xffffffff); - - writel(0, mmio + CARM_RESP_IDX); -} - -static int carm_init_host(struct carm_host *host) -{ - void __iomem *mmio = host->mmio; - u32 tmp; - u8 tmp8; - int rc; - - DPRINTK("ENTER\n"); - - writel(0, mmio + CARM_INT_MASK); - - tmp8 = readb(mmio + CARM_INITC); - if (tmp8 & 0x01) { - tmp8 &= ~0x01; - writeb(tmp8, mmio + CARM_INITC); - readb(mmio + CARM_INITC); /* flush */ - - DPRINTK("snooze...\n"); - msleep(5000); - } - - tmp = readl(mmio + CARM_HMUC); - if (tmp & CARM_CME) { - DPRINTK("CME bit present, waiting\n"); - rc = carm_init_wait(mmio, CARM_CME, 1); - if (rc) { - DPRINTK("EXIT, carm_init_wait 1 failed\n"); - return rc; - } - } - if (tmp & CARM_RME) { - DPRINTK("RME bit present, waiting\n"); - rc = carm_init_wait(mmio, CARM_RME, 1); - if (rc) { - DPRINTK("EXIT, carm_init_wait 2 failed\n"); - return rc; - } - } - - tmp &= ~(CARM_RME | CARM_CME); - writel(tmp, mmio + CARM_HMUC); - readl(mmio + CARM_HMUC); /* flush */ - - rc = carm_init_wait(mmio, CARM_RME | CARM_CME, 0); - if (rc) { - DPRINTK("EXIT, carm_init_wait 3 failed\n"); - return rc; - } - - carm_init_buckets(mmio); - - writel(host->shm_dma & 0xffffffff, mmio + RBUF_ADDR_LO); - writel((host->shm_dma >> 16) >> 16, mmio + RBUF_ADDR_HI); - writel(RBUF_LEN, mmio + RBUF_BYTE_SZ); - - tmp = readl(mmio + CARM_HMUC); - tmp |= (CARM_RME | CARM_CME | CARM_WZBC); - writel(tmp, mmio + CARM_HMUC); - readl(mmio + CARM_HMUC); /* flush */ - - rc = carm_init_wait(mmio, CARM_RME | CARM_CME, 1); - if (rc) { - DPRINTK("EXIT, carm_init_wait 4 failed\n"); - return rc; - } - - writel(0, mmio + CARM_HMPHA); - writel(INT_DEF_MASK, mmio + CARM_INT_MASK); - - carm_init_responses(host); - - /* start initialization, probing state machine */ - spin_lock_irq(&host->lock); - assert(host->state == HST_INVALID); - host->state = HST_PROBE_START; - spin_unlock_irq(&host->lock); - schedule_work(&host->fsm_task); - - DPRINTK("EXIT\n"); - return 0; -} - -static const struct blk_mq_ops carm_mq_ops = { - .queue_rq = carm_queue_rq, -}; - -static int carm_init_disk(struct carm_host *host, unsigned int port_no) -{ - struct carm_port *port = &host->port[port_no]; - struct gendisk *disk; - - port->host = host; - port->port_no = port_no; - - disk = blk_mq_alloc_disk(&host->tag_set, port); - if (IS_ERR(disk)) - return PTR_ERR(disk); - - port->disk = disk; - sprintf(disk->disk_name, DRV_NAME "/%u", - (unsigned int)host->id * CARM_MAX_PORTS + port_no); - disk->major = host->major; - disk->first_minor = port_no * CARM_MINORS_PER_MAJOR; - disk->minors = CARM_MINORS_PER_MAJOR; - disk->fops = &carm_bd_ops; - disk->private_data = port; - - blk_queue_max_segments(disk->queue, CARM_MAX_REQ_SG); - blk_queue_segment_boundary(disk->queue, CARM_SG_BOUNDARY); - return 0; -} - -static void carm_free_disk(struct carm_host *host, unsigned int port_no) -{ - struct carm_port *port = &host->port[port_no]; - struct gendisk *disk = port->disk; - - if (!disk) - return; - - if (host->state > HST_DEV_ACTIVATE) - del_gendisk(disk); - blk_cleanup_disk(disk); -} - -static int carm_init_shm(struct carm_host *host) -{ - host->shm = dma_alloc_coherent(&host->pdev->dev, CARM_SHM_SIZE, - &host->shm_dma, GFP_KERNEL); - if (!host->shm) - return -ENOMEM; - - host->msg_base = host->shm + RBUF_LEN; - host->msg_dma = host->shm_dma + RBUF_LEN; - - memset(host->shm, 0xff, RBUF_LEN); - memset(host->msg_base, 0, PDC_SHM_SIZE - RBUF_LEN); - - return 0; -} - -static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) -{ - struct carm_host *host; - int rc; - struct request_queue *q; - unsigned int i; - - printk_once(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n"); - - rc = pci_enable_device(pdev); - if (rc) - return rc; - - rc = pci_request_regions(pdev, DRV_NAME); - if (rc) - goto err_out; - - rc = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32)); - if (rc) { - printk(KERN_ERR DRV_NAME "(%s): DMA mask failure\n", - pci_name(pdev)); - goto err_out_regions; - } - - host = kzalloc(sizeof(*host), GFP_KERNEL); - if (!host) { - rc = -ENOMEM; - goto err_out_regions; - } - - host->pdev = pdev; - spin_lock_init(&host->lock); - INIT_WORK(&host->fsm_task, carm_fsm_task); - init_completion(&host->probe_comp); - - host->mmio = ioremap(pci_resource_start(pdev, 0), - pci_resource_len(pdev, 0)); - if (!host->mmio) { - printk(KERN_ERR DRV_NAME "(%s): MMIO alloc failure\n", - pci_name(pdev)); - rc = -ENOMEM; - goto err_out_kfree; - } - - rc = carm_init_shm(host); - if (rc) { - printk(KERN_ERR DRV_NAME "(%s): DMA SHM alloc failure\n", - pci_name(pdev)); - goto err_out_iounmap; - } - - memset(&host->tag_set, 0, sizeof(host->tag_set)); - host->tag_set.ops = &carm_mq_ops; - host->tag_set.cmd_size = sizeof(struct carm_request); - host->tag_set.nr_hw_queues = 1; - host->tag_set.nr_maps = 1; - host->tag_set.queue_depth = max_queue; - host->tag_set.numa_node = NUMA_NO_NODE; - host->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; - - rc = blk_mq_alloc_tag_set(&host->tag_set); - if (rc) - goto err_out_dma_free; - - q = blk_mq_init_queue(&host->tag_set); - if (IS_ERR(q)) { - rc = PTR_ERR(q); - blk_mq_free_tag_set(&host->tag_set); - goto err_out_dma_free; - } - - host->oob_q = q; - q->queuedata = host; - - /* - * Figure out which major to use: 160, 161, or dynamic - */ - if (!test_and_set_bit(0, &carm_major_alloc)) - host->major = 160; - else if (!test_and_set_bit(1, &carm_major_alloc)) - host->major = 161; - else - host->flags |= FL_DYN_MAJOR; - - host->id = carm_host_id; - sprintf(host->name, DRV_NAME "%d", carm_host_id); - - rc = register_blkdev(host->major, host->name); - if (rc < 0) - goto err_out_free_majors; - if (host->flags & FL_DYN_MAJOR) - host->major = rc; - - for (i = 0; i < CARM_MAX_PORTS; i++) { - rc = carm_init_disk(host, i); - if (rc) - goto err_out_blkdev_disks; - } - - pci_set_master(pdev); - - rc = request_irq(pdev->irq, carm_interrupt, IRQF_SHARED, DRV_NAME, host); - if (rc) { - printk(KERN_ERR DRV_NAME "(%s): irq alloc failure\n", - pci_name(pdev)); - goto err_out_blkdev_disks; - } - - rc = carm_init_host(host); - if (rc) - goto err_out_free_irq; - - DPRINTK("waiting for probe_comp\n"); - host->probe_err = -ENODEV; - wait_for_completion(&host->probe_comp); - if (host->probe_err) { - rc = host->probe_err; - goto err_out_free_irq; - } - - printk(KERN_INFO "%s: pci %s, ports %d, io %llx, irq %u, major %d\n", - host->name, pci_name(pdev), (int) CARM_MAX_PORTS, - (unsigned long long)pci_resource_start(pdev, 0), - pdev->irq, host->major); - - carm_host_id++; - pci_set_drvdata(pdev, host); - return 0; - -err_out_free_irq: - free_irq(pdev->irq, host); -err_out_blkdev_disks: - for (i = 0; i < CARM_MAX_PORTS; i++) - carm_free_disk(host, i); - unregister_blkdev(host->major, host->name); -err_out_free_majors: - if (host->major == 160) - clear_bit(0, &carm_major_alloc); - else if (host->major == 161) - clear_bit(1, &carm_major_alloc); - blk_cleanup_queue(host->oob_q); - blk_mq_free_tag_set(&host->tag_set); -err_out_dma_free: - dma_free_coherent(&pdev->dev, CARM_SHM_SIZE, host->shm, host->shm_dma); -err_out_iounmap: - iounmap(host->mmio); -err_out_kfree: - kfree(host); -err_out_regions: - pci_release_regions(pdev); -err_out: - pci_disable_device(pdev); - return rc; -} - -static void carm_remove_one (struct pci_dev *pdev) -{ - struct carm_host *host = pci_get_drvdata(pdev); - unsigned int i; - - if (!host) { - printk(KERN_ERR PFX "BUG: no host data for PCI(%s)\n", - pci_name(pdev)); - return; - } - - free_irq(pdev->irq, host); - for (i = 0; i < CARM_MAX_PORTS; i++) - carm_free_disk(host, i); - unregister_blkdev(host->major, host->name); - if (host->major == 160) - clear_bit(0, &carm_major_alloc); - else if (host->major == 161) - clear_bit(1, &carm_major_alloc); - blk_cleanup_queue(host->oob_q); - blk_mq_free_tag_set(&host->tag_set); - dma_free_coherent(&pdev->dev, CARM_SHM_SIZE, host->shm, host->shm_dma); - iounmap(host->mmio); - kfree(host); - pci_release_regions(pdev); - pci_disable_device(pdev); -} - -module_pci_driver(carm_driver); diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c new file mode 100644 index 000000000000..f96cb01e9604 --- /dev/null +++ b/drivers/block/ublk_drv.c @@ -0,0 +1,2113 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Userspace block device - block device which IO is handled from userspace + * + * Take full use of io_uring passthrough command for communicating with + * ublk userspace daemon(ublksrvd) for handling basic IO request. + * + * Copyright 2022 Ming Lei <ming.lei@redhat.com> + * + * (part of code stolen from loop.c) + */ +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/file.h> +#include <linux/stat.h> +#include <linux/errno.h> +#include <linux/major.h> +#include <linux/wait.h> +#include <linux/blkdev.h> +#include <linux/init.h> +#include <linux/swap.h> +#include <linux/slab.h> +#include <linux/compat.h> +#include <linux/mutex.h> +#include <linux/writeback.h> +#include <linux/completion.h> +#include <linux/highmem.h> +#include <linux/sysfs.h> +#include <linux/miscdevice.h> +#include <linux/falloc.h> +#include <linux/uio.h> +#include <linux/ioprio.h> +#include <linux/sched/mm.h> +#include <linux/uaccess.h> +#include <linux/cdev.h> +#include <linux/io_uring.h> +#include <linux/blk-mq.h> +#include <linux/delay.h> +#include <linux/mm.h> +#include <asm/page.h> +#include <linux/task_work.h> +#include <uapi/linux/ublk_cmd.h> + +#define UBLK_MINORS (1U << MINORBITS) + +/* All UBLK_F_* have to be included into UBLK_F_ALL */ +#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \ + | UBLK_F_URING_CMD_COMP_IN_TASK \ + | UBLK_F_NEED_GET_DATA \ + | UBLK_F_USER_RECOVERY \ + | UBLK_F_USER_RECOVERY_REISSUE) + +/* All UBLK_PARAM_TYPE_* should be included here */ +#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD) + +struct ublk_rq_data { + union { + struct callback_head work; + struct llist_node node; + }; +}; + +struct ublk_uring_cmd_pdu { + struct ublk_queue *ubq; +}; + +/* + * io command is active: sqe cmd is received, and its cqe isn't done + * + * If the flag is set, the io command is owned by ublk driver, and waited + * for incoming blk-mq request from the ublk block device. + * + * If the flag is cleared, the io command will be completed, and owned by + * ublk server. + */ +#define UBLK_IO_FLAG_ACTIVE 0x01 + +/* + * IO command is completed via cqe, and it is being handled by ublksrv, and + * not committed yet + * + * Basically exclusively with UBLK_IO_FLAG_ACTIVE, so can be served for + * cross verification + */ +#define UBLK_IO_FLAG_OWNED_BY_SRV 0x02 + +/* + * IO command is aborted, so this flag is set in case of + * !UBLK_IO_FLAG_ACTIVE. + * + * After this flag is observed, any pending or new incoming request + * associated with this io command will be failed immediately + */ +#define UBLK_IO_FLAG_ABORTED 0x04 + +/* + * UBLK_IO_FLAG_NEED_GET_DATA is set because IO command requires + * get data buffer address from ublksrv. + * + * Then, bio data could be copied into this data buffer for a WRITE request + * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset. + */ +#define UBLK_IO_FLAG_NEED_GET_DATA 0x08 + +struct ublk_io { + /* userspace buffer address from io cmd */ + __u64 addr; + unsigned int flags; + int res; + + struct io_uring_cmd *cmd; +}; + +struct ublk_queue { + int q_id; + int q_depth; + + unsigned long flags; + struct task_struct *ubq_daemon; + char *io_cmd_buf; + + struct llist_head io_cmds; + + unsigned long io_addr; /* mapped vm address */ + unsigned int max_io_sz; + bool force_abort; + unsigned short nr_io_ready; /* how many ios setup */ + struct ublk_device *dev; + struct ublk_io ios[]; +}; + +#define UBLK_DAEMON_MONITOR_PERIOD (5 * HZ) + +struct ublk_device { + struct gendisk *ub_disk; + + char *__queues; + + unsigned short queue_size; + struct ublksrv_ctrl_dev_info dev_info; + + struct blk_mq_tag_set tag_set; + + struct cdev cdev; + struct device cdev_dev; + +#define UB_STATE_OPEN 0 +#define UB_STATE_USED 1 + unsigned long state; + int ub_number; + + struct mutex mutex; + + spinlock_t mm_lock; + struct mm_struct *mm; + + struct ublk_params params; + + struct completion completion; + unsigned int nr_queues_ready; + atomic_t nr_aborted_queues; + + /* + * Our ubq->daemon may be killed without any notification, so + * monitor each queue's daemon periodically + */ + struct delayed_work monitor_work; + struct work_struct quiesce_work; + struct work_struct stop_work; +}; + +/* header of ublk_params */ +struct ublk_params_header { + __u32 len; + __u32 types; +}; + +static dev_t ublk_chr_devt; +static struct class *ublk_chr_class; + +static DEFINE_IDR(ublk_index_idr); +static DEFINE_SPINLOCK(ublk_idr_lock); +static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */ + +static DEFINE_MUTEX(ublk_ctl_mutex); + +static struct miscdevice ublk_misc; + +static void ublk_dev_param_basic_apply(struct ublk_device *ub) +{ + struct request_queue *q = ub->ub_disk->queue; + const struct ublk_param_basic *p = &ub->params.basic; + + blk_queue_logical_block_size(q, 1 << p->logical_bs_shift); + blk_queue_physical_block_size(q, 1 << p->physical_bs_shift); + blk_queue_io_min(q, 1 << p->io_min_shift); + blk_queue_io_opt(q, 1 << p->io_opt_shift); + + blk_queue_write_cache(q, p->attrs & UBLK_ATTR_VOLATILE_CACHE, + p->attrs & UBLK_ATTR_FUA); + if (p->attrs & UBLK_ATTR_ROTATIONAL) + blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); + else + blk_queue_flag_set(QUEUE_FLAG_NONROT, q); + + blk_queue_max_hw_sectors(q, p->max_sectors); + blk_queue_chunk_sectors(q, p->chunk_sectors); + blk_queue_virt_boundary(q, p->virt_boundary_mask); + + if (p->attrs & UBLK_ATTR_READ_ONLY) + set_disk_ro(ub->ub_disk, true); + + set_capacity(ub->ub_disk, p->dev_sectors); +} + +static void ublk_dev_param_discard_apply(struct ublk_device *ub) +{ + struct request_queue *q = ub->ub_disk->queue; + const struct ublk_param_discard *p = &ub->params.discard; + + q->limits.discard_alignment = p->discard_alignment; + q->limits.discard_granularity = p->discard_granularity; + blk_queue_max_discard_sectors(q, p->max_discard_sectors); + blk_queue_max_write_zeroes_sectors(q, + p->max_write_zeroes_sectors); + blk_queue_max_discard_segments(q, p->max_discard_segments); +} + +static int ublk_validate_params(const struct ublk_device *ub) +{ + /* basic param is the only one which must be set */ + if (ub->params.types & UBLK_PARAM_TYPE_BASIC) { + const struct ublk_param_basic *p = &ub->params.basic; + + if (p->logical_bs_shift > PAGE_SHIFT) + return -EINVAL; + + if (p->logical_bs_shift > p->physical_bs_shift) + return -EINVAL; + + if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9)) + return -EINVAL; + } else + return -EINVAL; + + if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) { + const struct ublk_param_discard *p = &ub->params.discard; + + /* So far, only support single segment discard */ + if (p->max_discard_sectors && p->max_discard_segments != 1) + return -EINVAL; + + if (!p->discard_granularity) + return -EINVAL; + } + + return 0; +} + +static int ublk_apply_params(struct ublk_device *ub) +{ + if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC)) + return -EINVAL; + + ublk_dev_param_basic_apply(ub); + + if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) + ublk_dev_param_discard_apply(ub); + + return 0; +} + +static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq) +{ + if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) && + !(ubq->flags & UBLK_F_URING_CMD_COMP_IN_TASK)) + return true; + return false; +} + +static inline bool ublk_need_get_data(const struct ublk_queue *ubq) +{ + if (ubq->flags & UBLK_F_NEED_GET_DATA) + return true; + return false; +} + +static struct ublk_device *ublk_get_device(struct ublk_device *ub) +{ + if (kobject_get_unless_zero(&ub->cdev_dev.kobj)) + return ub; + return NULL; +} + +static void ublk_put_device(struct ublk_device *ub) +{ + put_device(&ub->cdev_dev); +} + +static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev, + int qid) +{ + return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]); +} + +static inline bool ublk_rq_has_data(const struct request *rq) +{ + return rq->bio && bio_has_data(rq->bio); +} + +static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, + int tag) +{ + return (struct ublksrv_io_desc *) + &(ubq->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]); +} + +static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id) +{ + return ublk_get_queue(ub, q_id)->io_cmd_buf; +} + +static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id) +{ + struct ublk_queue *ubq = ublk_get_queue(ub, q_id); + + return round_up(ubq->q_depth * sizeof(struct ublksrv_io_desc), + PAGE_SIZE); +} + +static inline bool ublk_queue_can_use_recovery_reissue( + struct ublk_queue *ubq) +{ + if ((ubq->flags & UBLK_F_USER_RECOVERY) && + (ubq->flags & UBLK_F_USER_RECOVERY_REISSUE)) + return true; + return false; +} + +static inline bool ublk_queue_can_use_recovery( + struct ublk_queue *ubq) +{ + if (ubq->flags & UBLK_F_USER_RECOVERY) + return true; + return false; +} + +static inline bool ublk_can_use_recovery(struct ublk_device *ub) +{ + if (ub->dev_info.flags & UBLK_F_USER_RECOVERY) + return true; + return false; +} + +static void ublk_free_disk(struct gendisk *disk) +{ + struct ublk_device *ub = disk->private_data; + + clear_bit(UB_STATE_USED, &ub->state); + put_device(&ub->cdev_dev); +} + +static const struct block_device_operations ub_fops = { + .owner = THIS_MODULE, + .free_disk = ublk_free_disk, +}; + +#define UBLK_MAX_PIN_PAGES 32 + +struct ublk_map_data { + const struct ublk_queue *ubq; + const struct request *rq; + const struct ublk_io *io; + unsigned max_bytes; +}; + +struct ublk_io_iter { + struct page *pages[UBLK_MAX_PIN_PAGES]; + unsigned pg_off; /* offset in the 1st page in pages */ + int nr_pages; /* how many page pointers in pages */ + struct bio *bio; + struct bvec_iter iter; +}; + +static inline unsigned ublk_copy_io_pages(struct ublk_io_iter *data, + unsigned max_bytes, bool to_vm) +{ + const unsigned total = min_t(unsigned, max_bytes, + PAGE_SIZE - data->pg_off + + ((data->nr_pages - 1) << PAGE_SHIFT)); + unsigned done = 0; + unsigned pg_idx = 0; + + while (done < total) { + struct bio_vec bv = bio_iter_iovec(data->bio, data->iter); + const unsigned int bytes = min3(bv.bv_len, total - done, + (unsigned)(PAGE_SIZE - data->pg_off)); + void *bv_buf = bvec_kmap_local(&bv); + void *pg_buf = kmap_local_page(data->pages[pg_idx]); + + if (to_vm) + memcpy(pg_buf + data->pg_off, bv_buf, bytes); + else + memcpy(bv_buf, pg_buf + data->pg_off, bytes); + + kunmap_local(pg_buf); + kunmap_local(bv_buf); + + /* advance page array */ + data->pg_off += bytes; + if (data->pg_off == PAGE_SIZE) { + pg_idx += 1; + data->pg_off = 0; + } + + done += bytes; + + /* advance bio */ + bio_advance_iter_single(data->bio, &data->iter, bytes); + if (!data->iter.bi_size) { + data->bio = data->bio->bi_next; + if (data->bio == NULL) + break; + data->iter = data->bio->bi_iter; + } + } + + return done; +} + +static inline int ublk_copy_user_pages(struct ublk_map_data *data, + bool to_vm) +{ + const unsigned int gup_flags = to_vm ? FOLL_WRITE : 0; + const unsigned long start_vm = data->io->addr; + unsigned int done = 0; + struct ublk_io_iter iter = { + .pg_off = start_vm & (PAGE_SIZE - 1), + .bio = data->rq->bio, + .iter = data->rq->bio->bi_iter, + }; + const unsigned int nr_pages = round_up(data->max_bytes + + (start_vm & (PAGE_SIZE - 1)), PAGE_SIZE) >> PAGE_SHIFT; + + while (done < nr_pages) { + const unsigned to_pin = min_t(unsigned, UBLK_MAX_PIN_PAGES, + nr_pages - done); + unsigned i, len; + + iter.nr_pages = get_user_pages_fast(start_vm + + (done << PAGE_SHIFT), to_pin, gup_flags, + iter.pages); + if (iter.nr_pages <= 0) + return done == 0 ? iter.nr_pages : done; + len = ublk_copy_io_pages(&iter, data->max_bytes, to_vm); + for (i = 0; i < iter.nr_pages; i++) { + if (to_vm) + set_page_dirty(iter.pages[i]); + put_page(iter.pages[i]); + } + data->max_bytes -= len; + done += iter.nr_pages; + } + + return done; +} + +static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, + struct ublk_io *io) +{ + const unsigned int rq_bytes = blk_rq_bytes(req); + /* + * no zero copy, we delay copy WRITE request data into ublksrv + * context and the big benefit is that pinning pages in current + * context is pretty fast, see ublk_pin_user_pages + */ + if (req_op(req) != REQ_OP_WRITE && req_op(req) != REQ_OP_FLUSH) + return rq_bytes; + + if (ublk_rq_has_data(req)) { + struct ublk_map_data data = { + .ubq = ubq, + .rq = req, + .io = io, + .max_bytes = rq_bytes, + }; + + ublk_copy_user_pages(&data, true); + + return rq_bytes - data.max_bytes; + } + return rq_bytes; +} + +static int ublk_unmap_io(const struct ublk_queue *ubq, + const struct request *req, + struct ublk_io *io) +{ + const unsigned int rq_bytes = blk_rq_bytes(req); + + if (req_op(req) == REQ_OP_READ && ublk_rq_has_data(req)) { + struct ublk_map_data data = { + .ubq = ubq, + .rq = req, + .io = io, + .max_bytes = io->res, + }; + + WARN_ON_ONCE(io->res > rq_bytes); + + ublk_copy_user_pages(&data, false); + + return io->res - data.max_bytes; + } + return rq_bytes; +} + +static inline unsigned int ublk_req_build_flags(struct request *req) +{ + unsigned flags = 0; + + if (req->cmd_flags & REQ_FAILFAST_DEV) + flags |= UBLK_IO_F_FAILFAST_DEV; + + if (req->cmd_flags & REQ_FAILFAST_TRANSPORT) + flags |= UBLK_IO_F_FAILFAST_TRANSPORT; + + if (req->cmd_flags & REQ_FAILFAST_DRIVER) + flags |= UBLK_IO_F_FAILFAST_DRIVER; + + if (req->cmd_flags & REQ_META) + flags |= UBLK_IO_F_META; + + if (req->cmd_flags & REQ_FUA) + flags |= UBLK_IO_F_FUA; + + if (req->cmd_flags & REQ_NOUNMAP) + flags |= UBLK_IO_F_NOUNMAP; + + if (req->cmd_flags & REQ_SWAP) + flags |= UBLK_IO_F_SWAP; + + return flags; +} + +static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) +{ + struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); + struct ublk_io *io = &ubq->ios[req->tag]; + u32 ublk_op; + + switch (req_op(req)) { + case REQ_OP_READ: + ublk_op = UBLK_IO_OP_READ; + break; + case REQ_OP_WRITE: + ublk_op = UBLK_IO_OP_WRITE; + break; + case REQ_OP_FLUSH: + ublk_op = UBLK_IO_OP_FLUSH; + break; + case REQ_OP_DISCARD: + ublk_op = UBLK_IO_OP_DISCARD; + break; + case REQ_OP_WRITE_ZEROES: + ublk_op = UBLK_IO_OP_WRITE_ZEROES; + break; + default: + return BLK_STS_IOERR; + } + + /* need to translate since kernel may change */ + iod->op_flags = ublk_op | ublk_req_build_flags(req); + iod->nr_sectors = blk_rq_sectors(req); + iod->start_sector = blk_rq_pos(req); + iod->addr = io->addr; + + return BLK_STS_OK; +} + +static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu( + struct io_uring_cmd *ioucmd) +{ + return (struct ublk_uring_cmd_pdu *)&ioucmd->pdu; +} + +static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq) +{ + return ubq->ubq_daemon->flags & PF_EXITING; +} + +/* todo: handle partial completion */ +static void ublk_complete_rq(struct request *req) +{ + struct ublk_queue *ubq = req->mq_hctx->driver_data; + struct ublk_io *io = &ubq->ios[req->tag]; + unsigned int unmapped_bytes; + + /* failed read IO if nothing is read */ + if (!io->res && req_op(req) == REQ_OP_READ) + io->res = -EIO; + + if (io->res < 0) { + blk_mq_end_request(req, errno_to_blk_status(io->res)); + return; + } + + /* + * FLUSH or DISCARD usually won't return bytes returned, so end them + * directly. + * + * Both the two needn't unmap. + */ + if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) { + blk_mq_end_request(req, BLK_STS_OK); + return; + } + + /* for READ request, writing data in iod->addr to rq buffers */ + unmapped_bytes = ublk_unmap_io(ubq, req, io); + + /* + * Extremely impossible since we got data filled in just before + * + * Re-read simply for this unlikely case. + */ + if (unlikely(unmapped_bytes < io->res)) + io->res = unmapped_bytes; + + if (blk_update_request(req, BLK_STS_OK, io->res)) + blk_mq_requeue_request(req, true); + else + __blk_mq_end_request(req, BLK_STS_OK); +} + +/* + * Since __ublk_rq_task_work always fails requests immediately during + * exiting, __ublk_fail_req() is only called from abort context during + * exiting. So lock is unnecessary. + * + * Also aborting may not be started yet, keep in mind that one failed + * request may be issued by block layer again. + */ +static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io, + struct request *req) +{ + WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE); + + if (!(io->flags & UBLK_IO_FLAG_ABORTED)) { + io->flags |= UBLK_IO_FLAG_ABORTED; + if (ublk_queue_can_use_recovery_reissue(ubq)) + blk_mq_requeue_request(req, false); + else + blk_mq_end_request(req, BLK_STS_IOERR); + } +} + +static void ubq_complete_io_cmd(struct ublk_io *io, int res) +{ + /* mark this cmd owned by ublksrv */ + io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV; + + /* + * clear ACTIVE since we are done with this sqe/cmd slot + * We can only accept io cmd in case of being not active. + */ + io->flags &= ~UBLK_IO_FLAG_ACTIVE; + + /* tell ublksrv one io request is coming */ + io_uring_cmd_done(io->cmd, res, 0); +} + +#define UBLK_REQUEUE_DELAY_MS 3 + +static inline void __ublk_abort_rq(struct ublk_queue *ubq, + struct request *rq) +{ + /* We cannot process this rq so just requeue it. */ + if (ublk_queue_can_use_recovery(ubq)) + blk_mq_requeue_request(rq, false); + else + blk_mq_end_request(rq, BLK_STS_IOERR); + + mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0); +} + +static inline void __ublk_rq_task_work(struct request *req) +{ + struct ublk_queue *ubq = req->mq_hctx->driver_data; + int tag = req->tag; + struct ublk_io *io = &ubq->ios[tag]; + unsigned int mapped_bytes; + + pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n", + __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags, + ublk_get_iod(ubq, req->tag)->addr); + + /* + * Task is exiting if either: + * + * (1) current != ubq_daemon. + * io_uring_cmd_complete_in_task() tries to run task_work + * in a workqueue if ubq_daemon(cmd's task) is PF_EXITING. + * + * (2) current->flags & PF_EXITING. + */ + if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) { + __ublk_abort_rq(ubq, req); + return; + } + + if (ublk_need_get_data(ubq) && + (req_op(req) == REQ_OP_WRITE || + req_op(req) == REQ_OP_FLUSH)) { + /* + * We have not handled UBLK_IO_NEED_GET_DATA command yet, + * so immepdately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv + * and notify it. + */ + if (!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) { + io->flags |= UBLK_IO_FLAG_NEED_GET_DATA; + pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n", + __func__, io->cmd->cmd_op, ubq->q_id, + req->tag, io->flags); + ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA); + return; + } + /* + * We have handled UBLK_IO_NEED_GET_DATA command, + * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just + * do the copy work. + */ + io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA; + /* update iod->addr because ublksrv may have passed a new io buffer */ + ublk_get_iod(ubq, req->tag)->addr = io->addr; + pr_devel("%s: update iod->addr: op %d, qid %d tag %d io_flags %x addr %llx\n", + __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags, + ublk_get_iod(ubq, req->tag)->addr); + } + + mapped_bytes = ublk_map_io(ubq, req, io); + + /* partially mapped, update io descriptor */ + if (unlikely(mapped_bytes != blk_rq_bytes(req))) { + /* + * Nothing mapped, retry until we succeed. + * + * We may never succeed in mapping any bytes here because + * of OOM. TODO: reserve one buffer with single page pinned + * for providing forward progress guarantee. + */ + if (unlikely(!mapped_bytes)) { + blk_mq_requeue_request(req, false); + blk_mq_delay_kick_requeue_list(req->q, + UBLK_REQUEUE_DELAY_MS); + return; + } + + ublk_get_iod(ubq, req->tag)->nr_sectors = + mapped_bytes >> 9; + } + + ubq_complete_io_cmd(io, UBLK_IO_RES_OK); +} + +static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd) +{ + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + struct ublk_queue *ubq = pdu->ubq; + struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds); + struct ublk_rq_data *data; + + llist_for_each_entry(data, io_cmds, node) + __ublk_rq_task_work(blk_mq_rq_from_pdu(data)); +} + +static void ublk_rq_task_work_fn(struct callback_head *work) +{ + struct ublk_rq_data *data = container_of(work, + struct ublk_rq_data, work); + struct request *req = blk_mq_rq_from_pdu(data); + + __ublk_rq_task_work(req); +} + +static void ublk_submit_cmd(struct ublk_queue *ubq, const struct request *rq) +{ + struct ublk_io *io = &ubq->ios[rq->tag]; + + /* + * If the check pass, we know that this is a re-issued request aborted + * previously in monitor_work because the ubq_daemon(cmd's task) is + * PF_EXITING. We cannot call io_uring_cmd_complete_in_task() anymore + * because this ioucmd's io_uring context may be freed now if no inflight + * ioucmd exists. Otherwise we may cause null-deref in ctx->fallback_work. + * + * Note: monitor_work sets UBLK_IO_FLAG_ABORTED and ends this request(releasing + * the tag). Then the request is re-started(allocating the tag) and we are here. + * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED + * guarantees that here is a re-issued request aborted previously. + */ + if (unlikely(io->flags & UBLK_IO_FLAG_ABORTED)) { + struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds); + struct ublk_rq_data *data; + + llist_for_each_entry(data, io_cmds, node) + __ublk_abort_rq(ubq, blk_mq_rq_from_pdu(data)); + } else { + struct io_uring_cmd *cmd = io->cmd; + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + + pdu->ubq = ubq; + io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb); + } +} + +static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq, + bool last) +{ + struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq); + + if (ublk_can_use_task_work(ubq)) { + enum task_work_notify_mode notify_mode = last ? + TWA_SIGNAL_NO_IPI : TWA_NONE; + + if (task_work_add(ubq->ubq_daemon, &data->work, notify_mode)) + __ublk_abort_rq(ubq, rq); + } else { + if (llist_add(&data->node, &ubq->io_cmds)) + ublk_submit_cmd(ubq, rq); + } +} + +static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct ublk_queue *ubq = hctx->driver_data; + struct request *rq = bd->rq; + blk_status_t res; + + /* fill iod to slot in io cmd buffer */ + res = ublk_setup_iod(ubq, rq); + if (unlikely(res != BLK_STS_OK)) + return BLK_STS_IOERR; + + /* With recovery feature enabled, force_abort is set in + * ublk_stop_dev() before calling del_gendisk(). We have to + * abort all requeued and new rqs here to let del_gendisk() + * move on. Besides, we cannot not call io_uring_cmd_complete_in_task() + * to avoid UAF on io_uring ctx. + * + * Note: force_abort is guaranteed to be seen because it is set + * before request queue is unqiuesced. + */ + if (ublk_queue_can_use_recovery(ubq) && unlikely(ubq->force_abort)) + return BLK_STS_IOERR; + + blk_mq_start_request(bd->rq); + + if (unlikely(ubq_daemon_is_dying(ubq))) { + __ublk_abort_rq(ubq, rq); + return BLK_STS_OK; + } + + ublk_queue_cmd(ubq, rq, bd->last); + + return BLK_STS_OK; +} + +static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx) +{ + struct ublk_queue *ubq = hctx->driver_data; + + if (ublk_can_use_task_work(ubq)) + __set_notify_signal(ubq->ubq_daemon); +} + +static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, + unsigned int hctx_idx) +{ + struct ublk_device *ub = driver_data; + struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num); + + hctx->driver_data = ubq; + return 0; +} + +static int ublk_init_rq(struct blk_mq_tag_set *set, struct request *req, + unsigned int hctx_idx, unsigned int numa_node) +{ + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); + + init_task_work(&data->work, ublk_rq_task_work_fn); + return 0; +} + +static const struct blk_mq_ops ublk_mq_ops = { + .queue_rq = ublk_queue_rq, + .commit_rqs = ublk_commit_rqs, + .init_hctx = ublk_init_hctx, + .init_request = ublk_init_rq, +}; + +static int ublk_ch_open(struct inode *inode, struct file *filp) +{ + struct ublk_device *ub = container_of(inode->i_cdev, + struct ublk_device, cdev); + + if (test_and_set_bit(UB_STATE_OPEN, &ub->state)) + return -EBUSY; + filp->private_data = ub; + return 0; +} + +static int ublk_ch_release(struct inode *inode, struct file *filp) +{ + struct ublk_device *ub = filp->private_data; + + clear_bit(UB_STATE_OPEN, &ub->state); + return 0; +} + +/* map pre-allocated per-queue cmd buffer to ublksrv daemon */ +static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct ublk_device *ub = filp->private_data; + size_t sz = vma->vm_end - vma->vm_start; + unsigned max_sz = UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc); + unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT; + int q_id, ret = 0; + + spin_lock(&ub->mm_lock); + if (!ub->mm) + ub->mm = current->mm; + if (current->mm != ub->mm) + ret = -EINVAL; + spin_unlock(&ub->mm_lock); + + if (ret) + return ret; + + if (vma->vm_flags & VM_WRITE) + return -EPERM; + + end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz; + if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end) + return -EINVAL; + + q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz; + pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n", + __func__, q_id, current->pid, vma->vm_start, + phys_off, (unsigned long)sz); + + if (sz != ublk_queue_cmd_buf_size(ub, q_id)) + return -EINVAL; + + pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT; + return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); +} + +static void ublk_commit_completion(struct ublk_device *ub, + struct ublksrv_io_cmd *ub_cmd) +{ + u32 qid = ub_cmd->q_id, tag = ub_cmd->tag; + struct ublk_queue *ubq = ublk_get_queue(ub, qid); + struct ublk_io *io = &ubq->ios[tag]; + struct request *req; + + /* now this cmd slot is owned by nbd driver */ + io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV; + io->res = ub_cmd->result; + + /* find the io request and complete */ + req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag); + + if (req && likely(!blk_should_fake_timeout(req->q))) + ublk_complete_rq(req); +} + +/* + * When ->ubq_daemon is exiting, either new request is ended immediately, + * or any queued io command is drained, so it is safe to abort queue + * lockless + */ +static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) +{ + int i; + + if (!ublk_get_device(ub)) + return; + + for (i = 0; i < ubq->q_depth; i++) { + struct ublk_io *io = &ubq->ios[i]; + + if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) { + struct request *rq; + + /* + * Either we fail the request or ublk_rq_task_work_fn + * will do it + */ + rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i); + if (rq) + __ublk_fail_req(ubq, io, rq); + } + } + ublk_put_device(ub); +} + +static void ublk_daemon_monitor_work(struct work_struct *work) +{ + struct ublk_device *ub = + container_of(work, struct ublk_device, monitor_work.work); + int i; + + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { + struct ublk_queue *ubq = ublk_get_queue(ub, i); + + if (ubq_daemon_is_dying(ubq)) { + if (ublk_queue_can_use_recovery(ubq)) + schedule_work(&ub->quiesce_work); + else + schedule_work(&ub->stop_work); + + /* abort queue is for making forward progress */ + ublk_abort_queue(ub, ubq); + } + } + + /* + * We can't schedule monitor work after ub's state is not UBLK_S_DEV_LIVE. + * after ublk_remove() or __ublk_quiesce_dev() is started. + * + * No need ub->mutex, monitor work are canceled after state is marked + * as not LIVE, so new state is observed reliably. + */ + if (ub->dev_info.state == UBLK_S_DEV_LIVE) + schedule_delayed_work(&ub->monitor_work, + UBLK_DAEMON_MONITOR_PERIOD); +} + +static inline bool ublk_queue_ready(struct ublk_queue *ubq) +{ + return ubq->nr_io_ready == ubq->q_depth; +} + +static void ublk_cancel_queue(struct ublk_queue *ubq) +{ + int i; + + if (!ublk_queue_ready(ubq)) + return; + + for (i = 0; i < ubq->q_depth; i++) { + struct ublk_io *io = &ubq->ios[i]; + + if (io->flags & UBLK_IO_FLAG_ACTIVE) + io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0); + } + + /* all io commands are canceled */ + ubq->nr_io_ready = 0; +} + +/* Cancel all pending commands, must be called after del_gendisk() returns */ +static void ublk_cancel_dev(struct ublk_device *ub) +{ + int i; + + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) + ublk_cancel_queue(ublk_get_queue(ub, i)); +} + +static bool ublk_check_inflight_rq(struct request *rq, void *data) +{ + bool *idle = data; + + if (blk_mq_request_started(rq)) { + *idle = false; + return false; + } + return true; +} + +static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub) +{ + bool idle; + + WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue)); + while (true) { + idle = true; + blk_mq_tagset_busy_iter(&ub->tag_set, + ublk_check_inflight_rq, &idle); + if (idle) + break; + msleep(UBLK_REQUEUE_DELAY_MS); + } +} + +static void __ublk_quiesce_dev(struct ublk_device *ub) +{ + pr_devel("%s: quiesce ub: dev_id %d state %s\n", + __func__, ub->dev_info.dev_id, + ub->dev_info.state == UBLK_S_DEV_LIVE ? + "LIVE" : "QUIESCED"); + blk_mq_quiesce_queue(ub->ub_disk->queue); + ublk_wait_tagset_rqs_idle(ub); + ub->dev_info.state = UBLK_S_DEV_QUIESCED; + ublk_cancel_dev(ub); + /* we are going to release task_struct of ubq_daemon and resets + * ->ubq_daemon to NULL. So in monitor_work, check on ubq_daemon causes UAF. + * Besides, monitor_work is not necessary in QUIESCED state since we have + * already scheduled quiesce_work and quiesced all ubqs. + * + * Do not let monitor_work schedule itself if state it QUIESCED. And we cancel + * it here and re-schedule it in END_USER_RECOVERY to avoid UAF. + */ + cancel_delayed_work_sync(&ub->monitor_work); +} + +static void ublk_quiesce_work_fn(struct work_struct *work) +{ + struct ublk_device *ub = + container_of(work, struct ublk_device, quiesce_work); + + mutex_lock(&ub->mutex); + if (ub->dev_info.state != UBLK_S_DEV_LIVE) + goto unlock; + __ublk_quiesce_dev(ub); + unlock: + mutex_unlock(&ub->mutex); +} + +static void ublk_unquiesce_dev(struct ublk_device *ub) +{ + int i; + + pr_devel("%s: unquiesce ub: dev_id %d state %s\n", + __func__, ub->dev_info.dev_id, + ub->dev_info.state == UBLK_S_DEV_LIVE ? + "LIVE" : "QUIESCED"); + /* quiesce_work has run. We let requeued rqs be aborted + * before running fallback_wq. "force_abort" must be seen + * after request queue is unqiuesced. Then del_gendisk() + * can move on. + */ + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) + ublk_get_queue(ub, i)->force_abort = true; + + blk_mq_unquiesce_queue(ub->ub_disk->queue); + /* We may have requeued some rqs in ublk_quiesce_queue() */ + blk_mq_kick_requeue_list(ub->ub_disk->queue); +} + +static void ublk_stop_dev(struct ublk_device *ub) +{ + mutex_lock(&ub->mutex); + if (ub->dev_info.state == UBLK_S_DEV_DEAD) + goto unlock; + if (ublk_can_use_recovery(ub)) { + if (ub->dev_info.state == UBLK_S_DEV_LIVE) + __ublk_quiesce_dev(ub); + ublk_unquiesce_dev(ub); + } + del_gendisk(ub->ub_disk); + ub->dev_info.state = UBLK_S_DEV_DEAD; + ub->dev_info.ublksrv_pid = -1; + put_disk(ub->ub_disk); + ub->ub_disk = NULL; + unlock: + ublk_cancel_dev(ub); + mutex_unlock(&ub->mutex); + cancel_delayed_work_sync(&ub->monitor_work); +} + +/* device can only be started after all IOs are ready */ +static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq) +{ + mutex_lock(&ub->mutex); + ubq->nr_io_ready++; + if (ublk_queue_ready(ubq)) { + ubq->ubq_daemon = current; + get_task_struct(ubq->ubq_daemon); + ub->nr_queues_ready++; + } + if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) + complete_all(&ub->completion); + mutex_unlock(&ub->mutex); +} + +static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id, + int tag) +{ + struct ublk_queue *ubq = ublk_get_queue(ub, q_id); + struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag); + + ublk_queue_cmd(ubq, req, true); +} + +static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct ublksrv_io_cmd *ub_cmd = (struct ublksrv_io_cmd *)cmd->cmd; + struct ublk_device *ub = cmd->file->private_data; + struct ublk_queue *ubq; + struct ublk_io *io; + u32 cmd_op = cmd->cmd_op; + unsigned tag = ub_cmd->tag; + int ret = -EINVAL; + + pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n", + __func__, cmd->cmd_op, ub_cmd->q_id, tag, + ub_cmd->result); + + if (!(issue_flags & IO_URING_F_SQE128)) + goto out; + + if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues) + goto out; + + ubq = ublk_get_queue(ub, ub_cmd->q_id); + if (!ubq || ub_cmd->q_id != ubq->q_id) + goto out; + + if (ubq->ubq_daemon && ubq->ubq_daemon != current) + goto out; + + if (tag >= ubq->q_depth) + goto out; + + io = &ubq->ios[tag]; + + /* there is pending io cmd, something must be wrong */ + if (io->flags & UBLK_IO_FLAG_ACTIVE) { + ret = -EBUSY; + goto out; + } + + /* + * ensure that the user issues UBLK_IO_NEED_GET_DATA + * iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA. + */ + if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) + ^ (cmd_op == UBLK_IO_NEED_GET_DATA)) + goto out; + + switch (cmd_op) { + case UBLK_IO_FETCH_REQ: + /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */ + if (ublk_queue_ready(ubq)) { + ret = -EBUSY; + goto out; + } + /* + * The io is being handled by server, so COMMIT_RQ is expected + * instead of FETCH_REQ + */ + if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) + goto out; + /* FETCH_RQ has to provide IO buffer */ + if (!ub_cmd->addr) + goto out; + io->cmd = cmd; + io->flags |= UBLK_IO_FLAG_ACTIVE; + io->addr = ub_cmd->addr; + + ublk_mark_io_ready(ub, ubq); + break; + case UBLK_IO_COMMIT_AND_FETCH_REQ: + /* FETCH_RQ has to provide IO buffer */ + if (!ub_cmd->addr) + goto out; + if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) + goto out; + io->addr = ub_cmd->addr; + io->flags |= UBLK_IO_FLAG_ACTIVE; + io->cmd = cmd; + ublk_commit_completion(ub, ub_cmd); + break; + case UBLK_IO_NEED_GET_DATA: + if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) + goto out; + io->addr = ub_cmd->addr; + io->cmd = cmd; + io->flags |= UBLK_IO_FLAG_ACTIVE; + ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag); + break; + default: + goto out; + } + return -EIOCBQUEUED; + + out: + io_uring_cmd_done(cmd, ret, 0); + pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n", + __func__, cmd_op, tag, ret, io->flags); + return -EIOCBQUEUED; +} + +static const struct file_operations ublk_ch_fops = { + .owner = THIS_MODULE, + .open = ublk_ch_open, + .release = ublk_ch_release, + .llseek = no_llseek, + .uring_cmd = ublk_ch_uring_cmd, + .mmap = ublk_ch_mmap, +}; + +static void ublk_deinit_queue(struct ublk_device *ub, int q_id) +{ + int size = ublk_queue_cmd_buf_size(ub, q_id); + struct ublk_queue *ubq = ublk_get_queue(ub, q_id); + + if (ubq->ubq_daemon) + put_task_struct(ubq->ubq_daemon); + if (ubq->io_cmd_buf) + free_pages((unsigned long)ubq->io_cmd_buf, get_order(size)); +} + +static int ublk_init_queue(struct ublk_device *ub, int q_id) +{ + struct ublk_queue *ubq = ublk_get_queue(ub, q_id); + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO; + void *ptr; + int size; + + ubq->flags = ub->dev_info.flags; + ubq->q_id = q_id; + ubq->q_depth = ub->dev_info.queue_depth; + size = ublk_queue_cmd_buf_size(ub, q_id); + + ptr = (void *) __get_free_pages(gfp_flags, get_order(size)); + if (!ptr) + return -ENOMEM; + + ubq->io_cmd_buf = ptr; + ubq->dev = ub; + return 0; +} + +static void ublk_deinit_queues(struct ublk_device *ub) +{ + int nr_queues = ub->dev_info.nr_hw_queues; + int i; + + if (!ub->__queues) + return; + + for (i = 0; i < nr_queues; i++) + ublk_deinit_queue(ub, i); + kfree(ub->__queues); +} + +static int ublk_init_queues(struct ublk_device *ub) +{ + int nr_queues = ub->dev_info.nr_hw_queues; + int depth = ub->dev_info.queue_depth; + int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io); + int i, ret = -ENOMEM; + + ub->queue_size = ubq_size; + ub->__queues = kcalloc(nr_queues, ubq_size, GFP_KERNEL); + if (!ub->__queues) + return ret; + + for (i = 0; i < nr_queues; i++) { + if (ublk_init_queue(ub, i)) + goto fail; + } + + init_completion(&ub->completion); + return 0; + + fail: + ublk_deinit_queues(ub); + return ret; +} + +static int ublk_alloc_dev_number(struct ublk_device *ub, int idx) +{ + int i = idx; + int err; + + spin_lock(&ublk_idr_lock); + /* allocate id, if @id >= 0, we're requesting that specific id */ + if (i >= 0) { + err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT); + if (err == -ENOSPC) + err = -EEXIST; + } else { + err = idr_alloc(&ublk_index_idr, ub, 0, 0, GFP_NOWAIT); + } + spin_unlock(&ublk_idr_lock); + + if (err >= 0) + ub->ub_number = err; + + return err; +} + +static void ublk_free_dev_number(struct ublk_device *ub) +{ + spin_lock(&ublk_idr_lock); + idr_remove(&ublk_index_idr, ub->ub_number); + wake_up_all(&ublk_idr_wq); + spin_unlock(&ublk_idr_lock); +} + +static void ublk_cdev_rel(struct device *dev) +{ + struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev); + + blk_mq_free_tag_set(&ub->tag_set); + ublk_deinit_queues(ub); + ublk_free_dev_number(ub); + mutex_destroy(&ub->mutex); + kfree(ub); +} + +static int ublk_add_chdev(struct ublk_device *ub) +{ + struct device *dev = &ub->cdev_dev; + int minor = ub->ub_number; + int ret; + + dev->parent = ublk_misc.this_device; + dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor); + dev->class = ublk_chr_class; + dev->release = ublk_cdev_rel; + device_initialize(dev); + + ret = dev_set_name(dev, "ublkc%d", minor); + if (ret) + goto fail; + + cdev_init(&ub->cdev, &ublk_ch_fops); + ret = cdev_device_add(&ub->cdev, dev); + if (ret) + goto fail; + return 0; + fail: + put_device(dev); + return ret; +} + +static void ublk_stop_work_fn(struct work_struct *work) +{ + struct ublk_device *ub = + container_of(work, struct ublk_device, stop_work); + + ublk_stop_dev(ub); +} + +/* align max io buffer size with PAGE_SIZE */ +static void ublk_align_max_io_size(struct ublk_device *ub) +{ + unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes; + + ub->dev_info.max_io_buf_bytes = + round_down(max_io_bytes, PAGE_SIZE); +} + +static int ublk_add_tag_set(struct ublk_device *ub) +{ + ub->tag_set.ops = &ublk_mq_ops; + ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues; + ub->tag_set.queue_depth = ub->dev_info.queue_depth; + ub->tag_set.numa_node = NUMA_NO_NODE; + ub->tag_set.cmd_size = sizeof(struct ublk_rq_data); + ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + ub->tag_set.driver_data = ub; + return blk_mq_alloc_tag_set(&ub->tag_set); +} + +static void ublk_remove(struct ublk_device *ub) +{ + ublk_stop_dev(ub); + cancel_work_sync(&ub->stop_work); + cancel_work_sync(&ub->quiesce_work); + cdev_device_del(&ub->cdev, &ub->cdev_dev); + put_device(&ub->cdev_dev); +} + +static struct ublk_device *ublk_get_device_from_id(int idx) +{ + struct ublk_device *ub = NULL; + + if (idx < 0) + return NULL; + + spin_lock(&ublk_idr_lock); + ub = idr_find(&ublk_index_idr, idx); + if (ub) + ub = ublk_get_device(ub); + spin_unlock(&ublk_idr_lock); + + return ub; +} + +static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + int ublksrv_pid = (int)header->data[0]; + struct ublk_device *ub; + struct gendisk *disk; + int ret = -EINVAL; + + if (ublksrv_pid <= 0) + return -EINVAL; + + ub = ublk_get_device_from_id(header->dev_id); + if (!ub) + return -EINVAL; + + wait_for_completion_interruptible(&ub->completion); + + schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD); + + mutex_lock(&ub->mutex); + if (ub->dev_info.state == UBLK_S_DEV_LIVE || + test_bit(UB_STATE_USED, &ub->state)) { + ret = -EEXIST; + goto out_unlock; + } + + disk = blk_mq_alloc_disk(&ub->tag_set, ub); + if (IS_ERR(disk)) { + ret = PTR_ERR(disk); + goto out_unlock; + } + sprintf(disk->disk_name, "ublkb%d", ub->ub_number); + disk->fops = &ub_fops; + disk->private_data = ub; + + ub->dev_info.ublksrv_pid = ublksrv_pid; + ub->ub_disk = disk; + + ret = ublk_apply_params(ub); + if (ret) + goto out_put_disk; + + get_device(&ub->cdev_dev); + ret = add_disk(disk); + if (ret) { + /* + * Has to drop the reference since ->free_disk won't be + * called in case of add_disk failure. + */ + ublk_put_device(ub); + goto out_put_disk; + } + set_bit(UB_STATE_USED, &ub->state); + ub->dev_info.state = UBLK_S_DEV_LIVE; +out_put_disk: + if (ret) + put_disk(disk); +out_unlock: + mutex_unlock(&ub->mutex); + ublk_put_device(ub); + return ret; +} + +static int ublk_ctrl_get_queue_affinity(struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + void __user *argp = (void __user *)(unsigned long)header->addr; + struct ublk_device *ub; + cpumask_var_t cpumask; + unsigned long queue; + unsigned int retlen; + unsigned int i; + int ret = -EINVAL; + + if (header->len * BITS_PER_BYTE < nr_cpu_ids) + return -EINVAL; + if (header->len & (sizeof(unsigned long)-1)) + return -EINVAL; + if (!header->addr) + return -EINVAL; + + ub = ublk_get_device_from_id(header->dev_id); + if (!ub) + return -EINVAL; + + queue = header->data[0]; + if (queue >= ub->dev_info.nr_hw_queues) + goto out_put_device; + + ret = -ENOMEM; + if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) + goto out_put_device; + + for_each_possible_cpu(i) { + if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue) + cpumask_set_cpu(i, cpumask); + } + + ret = -EFAULT; + retlen = min_t(unsigned short, header->len, cpumask_size()); + if (copy_to_user(argp, cpumask, retlen)) + goto out_free_cpumask; + if (retlen != header->len && + clear_user(argp + retlen, header->len - retlen)) + goto out_free_cpumask; + + ret = 0; +out_free_cpumask: + free_cpumask_var(cpumask); +out_put_device: + ublk_put_device(ub); + return ret; +} + +static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info) +{ + pr_devel("%s: dev id %d flags %llx\n", __func__, + info->dev_id, info->flags); + pr_devel("\t nr_hw_queues %d queue_depth %d\n", + info->nr_hw_queues, info->queue_depth); +} + +static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + void __user *argp = (void __user *)(unsigned long)header->addr; + struct ublksrv_ctrl_dev_info info; + struct ublk_device *ub; + int ret = -EINVAL; + + if (header->len < sizeof(info) || !header->addr) + return -EINVAL; + if (header->queue_id != (u16)-1) { + pr_warn("%s: queue_id is wrong %x\n", + __func__, header->queue_id); + return -EINVAL; + } + if (copy_from_user(&info, argp, sizeof(info))) + return -EFAULT; + ublk_dump_dev_info(&info); + if (header->dev_id != info.dev_id) { + pr_warn("%s: dev id not match %u %u\n", + __func__, header->dev_id, info.dev_id); + return -EINVAL; + } + + ret = mutex_lock_killable(&ublk_ctl_mutex); + if (ret) + return ret; + + ret = -ENOMEM; + ub = kzalloc(sizeof(*ub), GFP_KERNEL); + if (!ub) + goto out_unlock; + mutex_init(&ub->mutex); + spin_lock_init(&ub->mm_lock); + INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn); + INIT_WORK(&ub->stop_work, ublk_stop_work_fn); + INIT_DELAYED_WORK(&ub->monitor_work, ublk_daemon_monitor_work); + + ret = ublk_alloc_dev_number(ub, header->dev_id); + if (ret < 0) + goto out_free_ub; + + memcpy(&ub->dev_info, &info, sizeof(info)); + + /* update device id */ + ub->dev_info.dev_id = ub->ub_number; + + /* + * 64bit flags will be copied back to userspace as feature + * negotiation result, so have to clear flags which driver + * doesn't support yet, then userspace can get correct flags + * (features) to handle. + */ + ub->dev_info.flags &= UBLK_F_ALL; + + if (!IS_BUILTIN(CONFIG_BLK_DEV_UBLK)) + ub->dev_info.flags |= UBLK_F_URING_CMD_COMP_IN_TASK; + + /* We are not ready to support zero copy */ + ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; + + ub->dev_info.nr_hw_queues = min_t(unsigned int, + ub->dev_info.nr_hw_queues, nr_cpu_ids); + ublk_align_max_io_size(ub); + + ret = ublk_init_queues(ub); + if (ret) + goto out_free_dev_number; + + ret = ublk_add_tag_set(ub); + if (ret) + goto out_deinit_queues; + + ret = -EFAULT; + if (copy_to_user(argp, &ub->dev_info, sizeof(info))) + goto out_free_tag_set; + + /* + * Add the char dev so that ublksrv daemon can be setup. + * ublk_add_chdev() will cleanup everything if it fails. + */ + ret = ublk_add_chdev(ub); + goto out_unlock; + +out_free_tag_set: + blk_mq_free_tag_set(&ub->tag_set); +out_deinit_queues: + ublk_deinit_queues(ub); +out_free_dev_number: + ublk_free_dev_number(ub); +out_free_ub: + mutex_destroy(&ub->mutex); + kfree(ub); +out_unlock: + mutex_unlock(&ublk_ctl_mutex); + return ret; +} + +static inline bool ublk_idr_freed(int id) +{ + void *ptr; + + spin_lock(&ublk_idr_lock); + ptr = idr_find(&ublk_index_idr, id); + spin_unlock(&ublk_idr_lock); + + return ptr == NULL; +} + +static int ublk_ctrl_del_dev(int idx) +{ + struct ublk_device *ub; + int ret; + + ret = mutex_lock_killable(&ublk_ctl_mutex); + if (ret) + return ret; + + ub = ublk_get_device_from_id(idx); + if (ub) { + ublk_remove(ub); + ublk_put_device(ub); + ret = 0; + } else { + ret = -ENODEV; + } + + /* + * Wait until the idr is removed, then it can be reused after + * DEL_DEV command is returned. + */ + if (!ret) + wait_event(ublk_idr_wq, ublk_idr_freed(idx)); + mutex_unlock(&ublk_ctl_mutex); + + return ret; +} + +static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + + pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n", + __func__, cmd->cmd_op, header->dev_id, header->queue_id, + header->data[0], header->addr, header->len); +} + +static int ublk_ctrl_stop_dev(struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + struct ublk_device *ub; + + ub = ublk_get_device_from_id(header->dev_id); + if (!ub) + return -EINVAL; + + ublk_stop_dev(ub); + cancel_work_sync(&ub->stop_work); + cancel_work_sync(&ub->quiesce_work); + + ublk_put_device(ub); + return 0; +} + +static int ublk_ctrl_get_dev_info(struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + void __user *argp = (void __user *)(unsigned long)header->addr; + struct ublk_device *ub; + int ret = 0; + + if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr) + return -EINVAL; + + ub = ublk_get_device_from_id(header->dev_id); + if (!ub) + return -EINVAL; + + if (copy_to_user(argp, &ub->dev_info, sizeof(ub->dev_info))) + ret = -EFAULT; + ublk_put_device(ub); + + return ret; +} + +static int ublk_ctrl_get_params(struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + void __user *argp = (void __user *)(unsigned long)header->addr; + struct ublk_params_header ph; + struct ublk_device *ub; + int ret; + + if (header->len <= sizeof(ph) || !header->addr) + return -EINVAL; + + if (copy_from_user(&ph, argp, sizeof(ph))) + return -EFAULT; + + if (ph.len > header->len || !ph.len) + return -EINVAL; + + if (ph.len > sizeof(struct ublk_params)) + ph.len = sizeof(struct ublk_params); + + ub = ublk_get_device_from_id(header->dev_id); + if (!ub) + return -EINVAL; + + mutex_lock(&ub->mutex); + if (copy_to_user(argp, &ub->params, ph.len)) + ret = -EFAULT; + else + ret = 0; + mutex_unlock(&ub->mutex); + + ublk_put_device(ub); + return ret; +} + +static int ublk_ctrl_set_params(struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + void __user *argp = (void __user *)(unsigned long)header->addr; + struct ublk_params_header ph; + struct ublk_device *ub; + int ret = -EFAULT; + + if (header->len <= sizeof(ph) || !header->addr) + return -EINVAL; + + if (copy_from_user(&ph, argp, sizeof(ph))) + return -EFAULT; + + if (ph.len > header->len || !ph.len || !ph.types) + return -EINVAL; + + if (ph.len > sizeof(struct ublk_params)) + ph.len = sizeof(struct ublk_params); + + ub = ublk_get_device_from_id(header->dev_id); + if (!ub) + return -EINVAL; + + /* parameters can only be changed when device isn't live */ + mutex_lock(&ub->mutex); + if (ub->dev_info.state == UBLK_S_DEV_LIVE) { + ret = -EACCES; + } else if (copy_from_user(&ub->params, argp, ph.len)) { + ret = -EFAULT; + } else { + /* clear all we don't support yet */ + ub->params.types &= UBLK_PARAM_TYPE_ALL; + ret = ublk_validate_params(ub); + } + mutex_unlock(&ub->mutex); + ublk_put_device(ub); + + return ret; +} + +static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) +{ + int i; + + WARN_ON_ONCE(!(ubq->ubq_daemon && ubq_daemon_is_dying(ubq))); + /* All old ioucmds have to be completed */ + WARN_ON_ONCE(ubq->nr_io_ready); + /* old daemon is PF_EXITING, put it now */ + put_task_struct(ubq->ubq_daemon); + /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */ + ubq->ubq_daemon = NULL; + + for (i = 0; i < ubq->q_depth; i++) { + struct ublk_io *io = &ubq->ios[i]; + + /* forget everything now and be ready for new FETCH_REQ */ + io->flags = 0; + io->cmd = NULL; + io->addr = 0; + } +} + +static int ublk_ctrl_start_recovery(struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + struct ublk_device *ub; + int ret = -EINVAL; + int i; + + ub = ublk_get_device_from_id(header->dev_id); + if (!ub) + return ret; + + mutex_lock(&ub->mutex); + if (!ublk_can_use_recovery(ub)) + goto out_unlock; + /* + * START_RECOVERY is only allowd after: + * + * (1) UB_STATE_OPEN is not set, which means the dying process is exited + * and related io_uring ctx is freed so file struct of /dev/ublkcX is + * released. + * + * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work: + * (a)has quiesced request queue + * (b)has requeued every inflight rqs whose io_flags is ACTIVE + * (c)has requeued/aborted every inflight rqs whose io_flags is NOT ACTIVE + * (d)has completed/camceled all ioucmds owned by ther dying process + */ + if (test_bit(UB_STATE_OPEN, &ub->state) || + ub->dev_info.state != UBLK_S_DEV_QUIESCED) { + ret = -EBUSY; + goto out_unlock; + } + pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id); + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) + ublk_queue_reinit(ub, ublk_get_queue(ub, i)); + /* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */ + ub->mm = NULL; + ub->nr_queues_ready = 0; + init_completion(&ub->completion); + ret = 0; + out_unlock: + mutex_unlock(&ub->mutex); + ublk_put_device(ub); + return ret; +} + +static int ublk_ctrl_end_recovery(struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + int ublksrv_pid = (int)header->data[0]; + struct ublk_device *ub; + int ret = -EINVAL; + + ub = ublk_get_device_from_id(header->dev_id); + if (!ub) + return ret; + + pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n", + __func__, ub->dev_info.nr_hw_queues, header->dev_id); + /* wait until new ubq_daemon sending all FETCH_REQ */ + wait_for_completion_interruptible(&ub->completion); + pr_devel("%s: All new ubq_daemons(nr: %d) are ready, dev id %d\n", + __func__, ub->dev_info.nr_hw_queues, header->dev_id); + + mutex_lock(&ub->mutex); + if (!ublk_can_use_recovery(ub)) + goto out_unlock; + + if (ub->dev_info.state != UBLK_S_DEV_QUIESCED) { + ret = -EBUSY; + goto out_unlock; + } + ub->dev_info.ublksrv_pid = ublksrv_pid; + pr_devel("%s: new ublksrv_pid %d, dev id %d\n", + __func__, ublksrv_pid, header->dev_id); + blk_mq_unquiesce_queue(ub->ub_disk->queue); + pr_devel("%s: queue unquiesced, dev id %d.\n", + __func__, header->dev_id); + blk_mq_kick_requeue_list(ub->ub_disk->queue); + ub->dev_info.state = UBLK_S_DEV_LIVE; + schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD); + ret = 0; + out_unlock: + mutex_unlock(&ub->mutex); + ublk_put_device(ub); + return ret; +} + +static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + int ret = -EINVAL; + + ublk_ctrl_cmd_dump(cmd); + + if (!(issue_flags & IO_URING_F_SQE128)) + goto out; + + ret = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto out; + + ret = -ENODEV; + switch (cmd->cmd_op) { + case UBLK_CMD_START_DEV: + ret = ublk_ctrl_start_dev(cmd); + break; + case UBLK_CMD_STOP_DEV: + ret = ublk_ctrl_stop_dev(cmd); + break; + case UBLK_CMD_GET_DEV_INFO: + ret = ublk_ctrl_get_dev_info(cmd); + break; + case UBLK_CMD_ADD_DEV: + ret = ublk_ctrl_add_dev(cmd); + break; + case UBLK_CMD_DEL_DEV: + ret = ublk_ctrl_del_dev(header->dev_id); + break; + case UBLK_CMD_GET_QUEUE_AFFINITY: + ret = ublk_ctrl_get_queue_affinity(cmd); + break; + case UBLK_CMD_GET_PARAMS: + ret = ublk_ctrl_get_params(cmd); + break; + case UBLK_CMD_SET_PARAMS: + ret = ublk_ctrl_set_params(cmd); + break; + case UBLK_CMD_START_USER_RECOVERY: + ret = ublk_ctrl_start_recovery(cmd); + break; + case UBLK_CMD_END_USER_RECOVERY: + ret = ublk_ctrl_end_recovery(cmd); + break; + default: + break; + } + out: + io_uring_cmd_done(cmd, ret, 0); + pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n", + __func__, ret, cmd->cmd_op, header->dev_id, header->queue_id); + return -EIOCBQUEUED; +} + +static const struct file_operations ublk_ctl_fops = { + .open = nonseekable_open, + .uring_cmd = ublk_ctrl_uring_cmd, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice ublk_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "ublk-control", + .fops = &ublk_ctl_fops, +}; + +static int __init ublk_init(void) +{ + int ret; + + init_waitqueue_head(&ublk_idr_wq); + + ret = misc_register(&ublk_misc); + if (ret) + return ret; + + ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char"); + if (ret) + goto unregister_mis; + + ublk_chr_class = class_create(THIS_MODULE, "ublk-char"); + if (IS_ERR(ublk_chr_class)) { + ret = PTR_ERR(ublk_chr_class); + goto free_chrdev_region; + } + return 0; + +free_chrdev_region: + unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS); +unregister_mis: + misc_deregister(&ublk_misc); + return ret; +} + +static void __exit ublk_exit(void) +{ + struct ublk_device *ub; + int id; + + class_destroy(ublk_chr_class); + + misc_deregister(&ublk_misc); + + idr_for_each_entry(&ublk_index_idr, ub, id) + ublk_remove(ub); + + idr_destroy(&ublk_index_idr); + unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS); +} + +module_init(ublk_init); +module_exit(ublk_exit); + +MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 6ae38776e30e..19da5defd734 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -37,6 +37,10 @@ MODULE_PARM_DESC(num_request_queues, "0 for no limit. " "Values > nr_cpu_ids truncated to nr_cpu_ids."); +static unsigned int poll_queues; +module_param(poll_queues, uint, 0644); +MODULE_PARM_DESC(poll_queues, "The number of dedicated virtqueues for polling I/O"); + static int major; static DEFINE_IDA(vd_index_ida); @@ -69,21 +73,12 @@ struct virtio_blk { /* Process context for config space updates */ struct work_struct config_work; - /* - * Tracks references from block_device_operations open/release and - * virtio_driver probe/remove so this object can be freed once no - * longer in use. - */ - refcount_t refs; - - /* What host tells us, plus 2 for header & tailer. */ - unsigned int sg_elems; - /* Ida index - used to track minor number allocations. */ int index; /* num of vqs */ int num_vqs; + int io_queues[HCTX_MAX_TYPES]; struct virtio_blk_vq *vqs; }; @@ -106,8 +101,15 @@ static inline blk_status_t virtblk_result(struct virtblk_req *vbr) } } -static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr, - struct scatterlist *data_sg, bool have_data) +static inline struct virtio_blk_vq *get_virtio_blk_vq(struct blk_mq_hw_ctx *hctx) +{ + struct virtio_blk *vblk = hctx->queue->queuedata; + struct virtio_blk_vq *vq = &vblk->vqs[hctx->queue_num]; + + return vq; +} + +static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr) { struct scatterlist hdr, status, *sgs[3]; unsigned int num_out = 0, num_in = 0; @@ -115,11 +117,11 @@ static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr, sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr)); sgs[num_out++] = &hdr; - if (have_data) { + if (vbr->sg_table.nents) { if (vbr->out_hdr.type & cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT)) - sgs[num_out++] = data_sg; + sgs[num_out++] = vbr->sg_table.sgl; else - sgs[num_out + num_in++] = data_sg; + sgs[num_out + num_in++] = vbr->sg_table.sgl; } sg_init_one(&status, &vbr->status, sizeof(vbr->status)); @@ -128,7 +130,7 @@ static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr, return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); } -static int virtblk_setup_discard_write_zeroes(struct request *req, bool unmap) +static int virtblk_setup_discard_write_zeroes_erase(struct request *req, bool unmap) { unsigned short segments = blk_rq_nr_discard_segments(req); unsigned short n = 0; @@ -238,6 +240,9 @@ static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev, type = VIRTIO_BLK_T_WRITE_ZEROES; unmap = !(req->cmd_flags & REQ_NOUNMAP); break; + case REQ_OP_SECURE_ERASE: + type = VIRTIO_BLK_T_SECURE_ERASE; + break; case REQ_OP_DRV_IN: type = VIRTIO_BLK_T_GET_ID; break; @@ -249,8 +254,9 @@ static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev, vbr->out_hdr.type = cpu_to_virtio32(vdev, type); vbr->out_hdr.ioprio = cpu_to_virtio32(vdev, req_get_ioprio(req)); - if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES) { - if (virtblk_setup_discard_write_zeroes(req, unmap)) + if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES || + type == VIRTIO_BLK_T_SECURE_ERASE) { + if (virtblk_setup_discard_write_zeroes_erase(req, unmap)) return BLK_STS_RESOURCE; } @@ -309,6 +315,28 @@ static void virtio_commit_rqs(struct blk_mq_hw_ctx *hctx) virtqueue_notify(vq->vq); } +static blk_status_t virtblk_prep_rq(struct blk_mq_hw_ctx *hctx, + struct virtio_blk *vblk, + struct request *req, + struct virtblk_req *vbr) +{ + blk_status_t status; + + status = virtblk_setup_cmd(vblk->vdev, req, vbr); + if (unlikely(status)) + return status; + + vbr->sg_table.nents = virtblk_map_data(hctx, req, vbr); + if (unlikely(vbr->sg_table.nents < 0)) { + virtblk_cleanup_cmd(req); + return BLK_STS_RESOURCE; + } + + blk_mq_start_request(req); + + return BLK_STS_OK; +} + static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -316,28 +344,17 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req = bd->rq; struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); unsigned long flags; - int num; int qid = hctx->queue_num; bool notify = false; blk_status_t status; int err; - BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems); - - status = virtblk_setup_cmd(vblk->vdev, req, vbr); + status = virtblk_prep_rq(hctx, vblk, req, vbr); if (unlikely(status)) return status; - blk_mq_start_request(req); - - num = virtblk_map_data(hctx, req, vbr); - if (unlikely(num < 0)) { - virtblk_cleanup_cmd(req); - return BLK_STS_RESOURCE; - } - spin_lock_irqsave(&vblk->vqs[qid].lock, flags); - err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg_table.sgl, num); + err = virtblk_add_req(vblk->vqs[qid].vq, vbr); if (err) { virtqueue_kick(vblk->vqs[qid].vq); /* Don't stop the queue if -ENOMEM: we may have failed to @@ -367,6 +384,74 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } +static bool virtblk_prep_rq_batch(struct request *req) +{ + struct virtio_blk *vblk = req->mq_hctx->queue->queuedata; + struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); + + req->mq_hctx->tags->rqs[req->tag] = req; + + return virtblk_prep_rq(req->mq_hctx, vblk, req, vbr) == BLK_STS_OK; +} + +static bool virtblk_add_req_batch(struct virtio_blk_vq *vq, + struct request **rqlist) +{ + unsigned long flags; + int err; + bool kick; + + spin_lock_irqsave(&vq->lock, flags); + + while (!rq_list_empty(*rqlist)) { + struct request *req = rq_list_pop(rqlist); + struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); + + err = virtblk_add_req(vq->vq, vbr); + if (err) { + virtblk_unmap_data(req, vbr); + virtblk_cleanup_cmd(req); + blk_mq_requeue_request(req, true); + } + } + + kick = virtqueue_kick_prepare(vq->vq); + spin_unlock_irqrestore(&vq->lock, flags); + + return kick; +} + +static void virtio_queue_rqs(struct request **rqlist) +{ + struct request *req, *next, *prev = NULL; + struct request *requeue_list = NULL; + + rq_list_for_each_safe(rqlist, req, next) { + struct virtio_blk_vq *vq = get_virtio_blk_vq(req->mq_hctx); + bool kick; + + if (!virtblk_prep_rq_batch(req)) { + rq_list_move(rqlist, &requeue_list, req, prev); + req = prev; + if (!req) + continue; + } + + if (!next || req->mq_hctx != next->mq_hctx) { + req->rq_next = NULL; + kick = virtblk_add_req_batch(vq, rqlist); + if (kick) + virtqueue_notify(vq->vq); + + *rqlist = next; + prev = NULL; + } else + prev = req; + } + + *rqlist = requeue_list; +} + /* return id (s/n) string for *disk to *id_str */ static int virtblk_get_id(struct gendisk *disk, char *id_str) @@ -384,50 +469,13 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str) if (err) goto out; - blk_execute_rq(vblk->disk, req, false); + blk_execute_rq(req, false); err = blk_status_to_errno(virtblk_result(blk_mq_rq_to_pdu(req))); out: blk_mq_free_request(req); return err; } -static void virtblk_get(struct virtio_blk *vblk) -{ - refcount_inc(&vblk->refs); -} - -static void virtblk_put(struct virtio_blk *vblk) -{ - if (refcount_dec_and_test(&vblk->refs)) { - ida_simple_remove(&vd_index_ida, vblk->index); - mutex_destroy(&vblk->vdev_mutex); - kfree(vblk); - } -} - -static int virtblk_open(struct block_device *bd, fmode_t mode) -{ - struct virtio_blk *vblk = bd->bd_disk->private_data; - int ret = 0; - - mutex_lock(&vblk->vdev_mutex); - - if (vblk->vdev) - virtblk_get(vblk); - else - ret = -ENXIO; - - mutex_unlock(&vblk->vdev_mutex); - return ret; -} - -static void virtblk_release(struct gendisk *disk, fmode_t mode) -{ - struct virtio_blk *vblk = disk->private_data; - - virtblk_put(vblk); -} - /* We provide getgeo only to please some old bootloader/partitioning tools */ static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) { @@ -460,11 +508,19 @@ out: return ret; } +static void virtblk_free_disk(struct gendisk *disk) +{ + struct virtio_blk *vblk = disk->private_data; + + ida_simple_remove(&vd_index_ida, vblk->index); + mutex_destroy(&vblk->vdev_mutex); + kfree(vblk); +} + static const struct block_device_operations virtblk_fops = { - .owner = THIS_MODULE, - .open = virtblk_open, - .release = virtblk_release, - .getgeo = virtblk_getgeo, + .owner = THIS_MODULE, + .getgeo = virtblk_getgeo, + .free_disk = virtblk_free_disk, }; static int index_to_minor(int index) @@ -553,6 +609,7 @@ static int init_vq(struct virtio_blk *vblk) const char **names; struct virtqueue **vqs; unsigned short num_vqs; + unsigned int num_poll_vqs; struct virtio_device *vdev = vblk->vdev; struct irq_affinity desc = { 0, }; @@ -561,6 +618,7 @@ static int init_vq(struct virtio_blk *vblk) &num_vqs); if (err) num_vqs = 1; + if (!err && !num_vqs) { dev_err(&vdev->dev, "MQ advertised but zero queues reported\n"); return -EINVAL; @@ -570,6 +628,17 @@ static int init_vq(struct virtio_blk *vblk) min_not_zero(num_request_queues, nr_cpu_ids), num_vqs); + num_poll_vqs = min_t(unsigned int, poll_queues, num_vqs - 1); + + vblk->io_queues[HCTX_TYPE_DEFAULT] = num_vqs - num_poll_vqs; + vblk->io_queues[HCTX_TYPE_READ] = 0; + vblk->io_queues[HCTX_TYPE_POLL] = num_poll_vqs; + + dev_info(&vdev->dev, "%d/%d/%d default/read/poll queues\n", + vblk->io_queues[HCTX_TYPE_DEFAULT], + vblk->io_queues[HCTX_TYPE_READ], + vblk->io_queues[HCTX_TYPE_POLL]); + vblk->vqs = kmalloc_array(num_vqs, sizeof(*vblk->vqs), GFP_KERNEL); if (!vblk->vqs) return -ENOMEM; @@ -582,12 +651,18 @@ static int init_vq(struct virtio_blk *vblk) goto out; } - for (i = 0; i < num_vqs; i++) { + for (i = 0; i < num_vqs - num_poll_vqs; i++) { callbacks[i] = virtblk_done; snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i); names[i] = vblk->vqs[i].name; } + for (; i < num_vqs; i++) { + callbacks[i] = NULL; + snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%d", i); + names[i] = vblk->vqs[i].name; + } + /* Discover virtqueues and write information to configuration. */ err = virtio_find_vqs(vdev, num_vqs, vqs, callbacks, names, &desc); if (err) @@ -730,19 +805,79 @@ static const struct attribute_group *virtblk_attr_groups[] = { NULL, }; -static int virtblk_map_queues(struct blk_mq_tag_set *set) +static void virtblk_map_queues(struct blk_mq_tag_set *set) { struct virtio_blk *vblk = set->driver_data; + int i, qoff; - return blk_mq_virtio_map_queues(&set->map[HCTX_TYPE_DEFAULT], - vblk->vdev, 0); + for (i = 0, qoff = 0; i < set->nr_maps; i++) { + struct blk_mq_queue_map *map = &set->map[i]; + + map->nr_queues = vblk->io_queues[i]; + map->queue_offset = qoff; + qoff += map->nr_queues; + + if (map->nr_queues == 0) + continue; + + /* + * Regular queues have interrupts and hence CPU affinity is + * defined by the core virtio code, but polling queues have + * no interrupts so we let the block layer assign CPU affinity. + */ + if (i == HCTX_TYPE_POLL) + blk_mq_map_queues(&set->map[i]); + else + blk_mq_virtio_map_queues(&set->map[i], vblk->vdev, 0); + } +} + +static void virtblk_complete_batch(struct io_comp_batch *iob) +{ + struct request *req; + + rq_list_for_each(&iob->req_list, req) { + virtblk_unmap_data(req, blk_mq_rq_to_pdu(req)); + virtblk_cleanup_cmd(req); + } + blk_mq_end_request_batch(iob); +} + +static int virtblk_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) +{ + struct virtio_blk *vblk = hctx->queue->queuedata; + struct virtio_blk_vq *vq = get_virtio_blk_vq(hctx); + struct virtblk_req *vbr; + unsigned long flags; + unsigned int len; + int found = 0; + + spin_lock_irqsave(&vq->lock, flags); + + while ((vbr = virtqueue_get_buf(vq->vq, &len)) != NULL) { + struct request *req = blk_mq_rq_from_pdu(vbr); + + found++; + if (!blk_mq_add_to_batch(req, iob, vbr->status, + virtblk_complete_batch)) + blk_mq_complete_request(req); + } + + if (found) + blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); + + spin_unlock_irqrestore(&vq->lock, flags); + + return found; } static const struct blk_mq_ops virtio_mq_ops = { .queue_rq = virtio_queue_rq, + .queue_rqs = virtio_queue_rqs, .commit_rqs = virtio_commit_rqs, .complete = virtblk_request_done, .map_queues = virtblk_map_queues, + .poll = virtblk_poll, }; static unsigned int virtblk_queue_depth; @@ -755,6 +890,8 @@ static int virtblk_probe(struct virtio_device *vdev) int err, index; u32 v, blk_size, max_size, sg_elems, opt_io_size; + u32 max_discard_segs = 0; + u32 discard_granularity = 0; u16 min_io_size; u8 physical_block_exp, alignment_offset; unsigned int queue_depth; @@ -783,20 +920,15 @@ static int virtblk_probe(struct virtio_device *vdev) /* Prevent integer overflows and honor max vq size */ sg_elems = min_t(u32, sg_elems, VIRTIO_BLK_MAX_SG_ELEMS - 2); - /* We need extra sg elements at head and tail. */ - sg_elems += 2; vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL); if (!vblk) { err = -ENOMEM; goto out_free_index; } - /* This reference is dropped in virtblk_remove(). */ - refcount_set(&vblk->refs, 1); mutex_init(&vblk->vdev_mutex); vblk->vdev = vdev; - vblk->sg_elems = sg_elems; INIT_WORK(&vblk->config_work, virtblk_config_changed_work); @@ -824,6 +956,9 @@ static int virtblk_probe(struct virtio_device *vdev) sizeof(struct scatterlist) * VIRTIO_BLK_INLINE_SG_CNT; vblk->tag_set.driver_data = vblk; vblk->tag_set.nr_hw_queues = vblk->num_vqs; + vblk->tag_set.nr_maps = 1; + if (vblk->io_queues[HCTX_TYPE_POLL]) + vblk->tag_set.nr_maps = 3; err = blk_mq_alloc_tag_set(&vblk->tag_set); if (err) @@ -843,7 +978,6 @@ static int virtblk_probe(struct virtio_device *vdev) vblk->disk->minors = 1 << PART_BITS; vblk->disk->private_data = vblk; vblk->disk->fops = &virtblk_fops; - vblk->disk->flags |= GENHD_FL_EXT_DEVT; vblk->index = index; /* configure queue flush support */ @@ -854,7 +988,7 @@ static int virtblk_probe(struct virtio_device *vdev) set_disk_ro(vblk->disk, 1); /* We can handle whatever the host told us to handle. */ - blk_queue_max_segments(q, vblk->sg_elems-2); + blk_queue_max_segments(q, sg_elems); /* No real sector limit. */ blk_queue_max_hw_sectors(q, -1U); @@ -914,23 +1048,15 @@ static int virtblk_probe(struct virtio_device *vdev) blk_queue_io_opt(q, blk_size * opt_io_size); if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) { - q->limits.discard_granularity = blk_size; - virtio_cread(vdev, struct virtio_blk_config, - discard_sector_alignment, &v); - q->limits.discard_alignment = v ? v << SECTOR_SHIFT : 0; + discard_sector_alignment, &discard_granularity); virtio_cread(vdev, struct virtio_blk_config, max_discard_sectors, &v); blk_queue_max_discard_sectors(q, v ? v : UINT_MAX); virtio_cread(vdev, struct virtio_blk_config, max_discard_seg, - &v); - blk_queue_max_discard_segments(q, - min_not_zero(v, - MAX_DISCARD_SEGMENTS)); - - blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); + &max_discard_segs); } if (virtio_has_feature(vdev, VIRTIO_BLK_F_WRITE_ZEROES)) { @@ -939,6 +1065,85 @@ static int virtblk_probe(struct virtio_device *vdev) blk_queue_max_write_zeroes_sectors(q, v ? v : UINT_MAX); } + /* The discard and secure erase limits are combined since the Linux + * block layer uses the same limit for both commands. + * + * If both VIRTIO_BLK_F_SECURE_ERASE and VIRTIO_BLK_F_DISCARD features + * are negotiated, we will use the minimum between the limits. + * + * discard sector alignment is set to the minimum between discard_sector_alignment + * and secure_erase_sector_alignment. + * + * max discard sectors is set to the minimum between max_discard_seg and + * max_secure_erase_seg. + */ + if (virtio_has_feature(vdev, VIRTIO_BLK_F_SECURE_ERASE)) { + + virtio_cread(vdev, struct virtio_blk_config, + secure_erase_sector_alignment, &v); + + /* secure_erase_sector_alignment should not be zero, the device should set a + * valid number of sectors. + */ + if (!v) { + dev_err(&vdev->dev, + "virtio_blk: secure_erase_sector_alignment can't be 0\n"); + err = -EINVAL; + goto out_cleanup_disk; + } + + discard_granularity = min_not_zero(discard_granularity, v); + + virtio_cread(vdev, struct virtio_blk_config, + max_secure_erase_sectors, &v); + + /* max_secure_erase_sectors should not be zero, the device should set a + * valid number of sectors. + */ + if (!v) { + dev_err(&vdev->dev, + "virtio_blk: max_secure_erase_sectors can't be 0\n"); + err = -EINVAL; + goto out_cleanup_disk; + } + + blk_queue_max_secure_erase_sectors(q, v); + + virtio_cread(vdev, struct virtio_blk_config, + max_secure_erase_seg, &v); + + /* max_secure_erase_seg should not be zero, the device should set a + * valid number of segments + */ + if (!v) { + dev_err(&vdev->dev, + "virtio_blk: max_secure_erase_seg can't be 0\n"); + err = -EINVAL; + goto out_cleanup_disk; + } + + max_discard_segs = min_not_zero(max_discard_segs, v); + } + + if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD) || + virtio_has_feature(vdev, VIRTIO_BLK_F_SECURE_ERASE)) { + /* max_discard_seg and discard_granularity will be 0 only + * if max_discard_seg and discard_sector_alignment fields in the virtio + * config are 0 and VIRTIO_BLK_F_SECURE_ERASE feature is not negotiated. + * In this case, we use default values. + */ + if (!max_discard_segs) + max_discard_segs = sg_elems; + + blk_queue_max_discard_segments(q, + min(max_discard_segs, MAX_DISCARD_SEGMENTS)); + + if (discard_granularity) + q->limits.discard_granularity = discard_granularity << SECTOR_SHIFT; + else + q->limits.discard_granularity = blk_size; + } + virtblk_update_capacity(vblk, false); virtio_device_ready(vdev); @@ -949,7 +1154,7 @@ static int virtblk_probe(struct virtio_device *vdev) return 0; out_cleanup_disk: - blk_cleanup_disk(vblk->disk); + put_disk(vblk->disk); out_free_tags: blk_mq_free_tag_set(&vblk->tag_set); out_free_vq: @@ -971,13 +1176,12 @@ static void virtblk_remove(struct virtio_device *vdev) flush_work(&vblk->config_work); del_gendisk(vblk->disk); - blk_cleanup_disk(vblk->disk); blk_mq_free_tag_set(&vblk->tag_set); mutex_lock(&vblk->vdev_mutex); /* Stop all the virtqueues. */ - vdev->config->reset(vdev); + virtio_reset_device(vdev); /* Virtqueues are stopped, nothing can use vblk->vdev anymore. */ vblk->vdev = NULL; @@ -987,7 +1191,7 @@ static void virtblk_remove(struct virtio_device *vdev) mutex_unlock(&vblk->vdev_mutex); - virtblk_put(vblk); + put_disk(vblk->disk); } #ifdef CONFIG_PM_SLEEP @@ -996,7 +1200,7 @@ static int virtblk_freeze(struct virtio_device *vdev) struct virtio_blk *vblk = vdev->priv; /* Ensure we don't receive any more interrupts */ - vdev->config->reset(vdev); + virtio_reset_device(vdev); /* Make sure no work handler is accessing the device. */ flush_work(&vblk->config_work); @@ -1035,6 +1239,7 @@ static unsigned int features_legacy[] = { VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE, VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES, + VIRTIO_BLK_F_SECURE_ERASE, } ; static unsigned int features[] = { @@ -1042,6 +1247,7 @@ static unsigned int features[] = { VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE, VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES, + VIRTIO_BLK_F_SECURE_ERASE, }; static struct virtio_driver virtio_blk = { @@ -1061,7 +1267,7 @@ static struct virtio_driver virtio_blk = { #endif }; -static int __init init(void) +static int __init virtio_blk_init(void) { int error; @@ -1087,14 +1293,14 @@ out_destroy_workqueue: return error; } -static void __exit fini(void) +static void __exit virtio_blk_fini(void) { unregister_virtio_driver(&virtio_blk); unregister_blkdev(major, "virtblk"); destroy_workqueue(virtblk_wq); } -module_init(init); -module_exit(fini); +module_init(virtio_blk_init); +module_exit(virtio_blk_fini); MODULE_DEVICE_TABLE(virtio, id_table); MODULE_DESCRIPTION("Virtio block driver"); diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 14e452896d04..a5cf7f1e871c 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -442,7 +442,7 @@ static void free_req(struct xen_blkif_ring *ring, struct pending_req *req) * Routines for managing virtual block devices (vbds). */ static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif, - int operation) + enum req_op operation) { struct xen_vbd *vbd = &blkif->vbd; int rc = -EACCES; @@ -931,7 +931,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, if (rc) goto unmap; - for (n = 0, i = 0; n < nseg; n++) { + for (n = 0; n < nseg; n++) { uint8_t first_sect, last_sect; if ((n % SEGS_PER_INDIRECT_FRAME) == 0) { @@ -970,7 +970,6 @@ static int dispatch_discard_io(struct xen_blkif_ring *ring, int status = BLKIF_RSP_OKAY; struct xen_blkif *blkif = ring->blkif; struct block_device *bdev = blkif->vbd.bdev; - unsigned long secure; struct phys_req preq; xen_blkif_get(blkif); @@ -987,13 +986,15 @@ static int dispatch_discard_io(struct xen_blkif_ring *ring, } ring->st_ds_req++; - secure = (blkif->vbd.discard_secure && - (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ? - BLKDEV_DISCARD_SECURE : 0; + if (blkif->vbd.discard_secure && + (req->u.discard.flag & BLKIF_DISCARD_SECURE)) + err = blkdev_issue_secure_erase(bdev, + req->u.discard.sector_number, + req->u.discard.nr_sectors, GFP_KERNEL); + else + err = blkdev_issue_discard(bdev, req->u.discard.sector_number, + req->u.discard.nr_sectors, GFP_KERNEL); - err = blkdev_issue_discard(bdev, req->u.discard.sector_number, - req->u.discard.nr_sectors, - GFP_KERNEL, secure); fail_response: if (err == -EOPNOTSUPP) { pr_debug("discard op failed, not supported\n"); @@ -1192,8 +1193,8 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, struct bio *bio = NULL; struct bio **biolist = pending_req->biolist; int i, nbio = 0; - int operation; - int operation_flags = 0; + enum req_op operation; + blk_opf_t operation_flags = 0; struct blk_plug plug; bool drain = false; struct grant_page **pages = pending_req->segments; @@ -1326,16 +1327,13 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, pages[i]->page, seg[i].nsec << 9, seg[i].offset) == 0)) { - bio = bio_alloc(GFP_KERNEL, bio_max_segs(nseg - i)); - if (unlikely(bio == NULL)) - goto fail_put_bio; - + bio = bio_alloc(preq.bdev, bio_max_segs(nseg - i), + operation | operation_flags, + GFP_KERNEL); biolist[nbio++] = bio; - bio_set_dev(bio, preq.bdev); bio->bi_private = pending_req; bio->bi_end_io = end_block_io_op; bio->bi_iter.bi_sector = preq.sector_number; - bio_set_op_attrs(bio, operation, operation_flags); } preq.sector_number += seg[i].nsec; @@ -1345,15 +1343,11 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, if (!bio) { BUG_ON(operation_flags != REQ_PREFLUSH); - bio = bio_alloc(GFP_KERNEL, 0); - if (unlikely(bio == NULL)) - goto fail_put_bio; - + bio = bio_alloc(preq.bdev, 0, operation | operation_flags, + GFP_KERNEL); biolist[nbio++] = bio; - bio_set_dev(bio, preq.bdev); bio->bi_private = pending_req; bio->bi_end_io = end_block_io_op; - bio_set_op_attrs(bio, operation, operation_flags); } atomic_set(&pending_req->pendcnt, nbio); @@ -1381,14 +1375,6 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, free_req(ring, pending_req); msleep(1); /* back off a bit */ return -EIO; - - fail_put_bio: - for (i = 0; i < nbio; i++) - bio_put(biolist[i]); - atomic_set(&pending_req->pendcnt, 1); - __end_block_io_op(pending_req, BLK_STS_RESOURCE); - msleep(1); /* back off a bit */ - return -EIO; } diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index bda5c815e441..a28473470e66 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -226,6 +226,9 @@ struct xen_vbd { sector_t size; unsigned int flush_support:1; unsigned int discard_secure:1; + /* Connect-time cached feature_persistent parameter value */ + unsigned int feature_gnt_persistent_parm:1; + /* Persistent grants feature negotiation result */ unsigned int feature_gnt_persistent:1; unsigned int overflow_max_grants:1; }; diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 914587aabca0..c0227dfa4688 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/kthread.h> +#include <linux/pagemap.h> #include <xen/events.h> #include <xen/grant_table.h> #include "common.h" @@ -156,6 +157,11 @@ static int xen_blkif_alloc_rings(struct xen_blkif *blkif) return 0; } +/* Enable the persistent grants feature. */ +static bool feature_persistent = true; +module_param(feature_persistent, bool, 0644); +MODULE_PARM_DESC(feature_persistent, "Enables the persistent grants feature"); + static struct xen_blkif *xen_blkif_alloc(domid_t domid) { struct xen_blkif *blkif; @@ -471,19 +477,12 @@ static void xen_vbd_free(struct xen_vbd *vbd) vbd->bdev = NULL; } -/* Enable the persistent grants feature. */ -static bool feature_persistent = true; -module_param(feature_persistent, bool, 0644); -MODULE_PARM_DESC(feature_persistent, - "Enables the persistent grants feature"); - static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, unsigned major, unsigned minor, int readonly, int cdrom) { struct xen_vbd *vbd; struct block_device *bdev; - struct request_queue *q; vbd = &blkif->vbd; vbd->handle = handle; @@ -510,20 +509,16 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, } vbd->size = vbd_sz(vbd); - if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom) + if (cdrom || disk_to_cdi(vbd->bdev->bd_disk)) vbd->type |= VDISK_CDROM; if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE) vbd->type |= VDISK_REMOVABLE; - q = bdev_get_queue(bdev); - if (q && test_bit(QUEUE_FLAG_WC, &q->queue_flags)) + if (bdev_write_cache(bdev)) vbd->flush_support = true; - - if (q && blk_queue_secure_erase(q)) + if (bdev_max_secure_erase_sectors(bdev)) vbd->discard_secure = true; - vbd->feature_gnt_persistent = feature_persistent; - pr_debug("Successful creation of handle=%04x (dom=%u)\n", handle, blkif->domid); return 0; @@ -577,22 +572,21 @@ static void xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info int err; int state = 0; struct block_device *bdev = be->blkif->vbd.bdev; - struct request_queue *q = bdev_get_queue(bdev); if (!xenbus_read_unsigned(dev->nodename, "discard-enable", 1)) return; - if (blk_queue_discard(q)) { + if (bdev_max_discard_sectors(bdev)) { err = xenbus_printf(xbt, dev->nodename, "discard-granularity", "%u", - q->limits.discard_granularity); + bdev_discard_granularity(bdev)); if (err) { dev_warn(&dev->dev, "writing discard-granularity (%d)", err); return; } err = xenbus_printf(xbt, dev->nodename, "discard-alignment", "%u", - q->limits.discard_alignment); + bdev_discard_alignment(bdev)); if (err) { dev_warn(&dev->dev, "writing discard-alignment (%d)", err); return; @@ -913,7 +907,7 @@ again: xen_blkbk_barrier(xbt, be, be->blkif->vbd.flush_support); err = xenbus_printf(xbt, dev->nodename, "feature-persistent", "%u", - be->blkif->vbd.feature_gnt_persistent); + be->blkif->vbd.feature_gnt_persistent_parm); if (err) { xenbus_dev_fatal(dev, err, "writing %s/feature-persistent", dev->nodename); @@ -1090,10 +1084,11 @@ static int connect_ring(struct backend_info *be) xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); return -ENOSYS; } - if (blkif->vbd.feature_gnt_persistent) - blkif->vbd.feature_gnt_persistent = - xenbus_read_unsigned(dev->otherend, - "feature-persistent", 0); + + blkif->vbd.feature_gnt_persistent_parm = feature_persistent; + blkif->vbd.feature_gnt_persistent = + blkif->vbd.feature_gnt_persistent_parm && + xenbus_read_unsigned(dev->otherend, "feature-persistent", 0); blkif->vbd.overflow_max_grants = 0; diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 8e3983e456f3..35b9bcad9db9 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -152,6 +152,10 @@ static unsigned int xen_blkif_max_ring_order; module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, 0444); MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring"); +static bool __read_mostly xen_blkif_trusted = true; +module_param_named(trusted, xen_blkif_trusted, bool, 0644); +MODULE_PARM_DESC(trusted, "Is the backend trusted"); + #define BLK_RING_SIZE(info) \ __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * (info)->nr_ring_pages) @@ -198,6 +202,7 @@ struct blkfront_info struct gendisk *gd; u16 sector_size; unsigned int physical_sector_size; + unsigned long vdisk_info; int vdevice; blkif_vdev_t handle; enum blkif_state connected; @@ -208,7 +213,11 @@ struct blkfront_info unsigned int feature_fua:1; unsigned int feature_discard:1; unsigned int feature_secdiscard:1; + /* Connect-time cached feature_persistent parameter */ + unsigned int feature_persistent_parm:1; + /* Persistent grants feature negotiation result */ unsigned int feature_persistent:1; + unsigned int bounce:1; unsigned int discard_granularity; unsigned int discard_alignment; /* Number of 4KB segments handled */ @@ -228,8 +237,6 @@ static unsigned int nr_minors; static unsigned long *minors; static DEFINE_SPINLOCK(minor_lock); -#define GRANT_INVALID_REF 0 - #define PARTS_PER_DISK 16 #define PARTS_PER_EXT_DISK 256 @@ -311,8 +318,8 @@ static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num) if (!gnt_list_entry) goto out_of_memory; - if (info->feature_persistent) { - granted_page = alloc_page(GFP_NOIO); + if (info->bounce) { + granted_page = alloc_page(GFP_NOIO | __GFP_ZERO); if (!granted_page) { kfree(gnt_list_entry); goto out_of_memory; @@ -320,7 +327,7 @@ static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num) gnt_list_entry->page = granted_page; } - gnt_list_entry->gref = GRANT_INVALID_REF; + gnt_list_entry->gref = INVALID_GRANT_REF; list_add(&gnt_list_entry->node, &rinfo->grants); i++; } @@ -331,7 +338,7 @@ out_of_memory: list_for_each_entry_safe(gnt_list_entry, n, &rinfo->grants, node) { list_del(&gnt_list_entry->node); - if (info->feature_persistent) + if (info->bounce) __free_page(gnt_list_entry->page); kfree(gnt_list_entry); i--; @@ -349,7 +356,7 @@ static struct grant *get_free_grant(struct blkfront_ring_info *rinfo) node); list_del(&gnt_list_entry->node); - if (gnt_list_entry->gref != GRANT_INVALID_REF) + if (gnt_list_entry->gref != INVALID_GRANT_REF) rinfo->persistent_gnts_c--; return gnt_list_entry; @@ -371,13 +378,13 @@ static struct grant *get_grant(grant_ref_t *gref_head, struct grant *gnt_list_entry = get_free_grant(rinfo); struct blkfront_info *info = rinfo->dev_info; - if (gnt_list_entry->gref != GRANT_INVALID_REF) + if (gnt_list_entry->gref != INVALID_GRANT_REF) return gnt_list_entry; /* Assign a gref to this page */ gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head); BUG_ON(gnt_list_entry->gref == -ENOSPC); - if (info->feature_persistent) + if (info->bounce) grant_foreign_access(gnt_list_entry, info); else { /* Grant access to the GFN passed by the caller */ @@ -395,13 +402,13 @@ static struct grant *get_indirect_grant(grant_ref_t *gref_head, struct grant *gnt_list_entry = get_free_grant(rinfo); struct blkfront_info *info = rinfo->dev_info; - if (gnt_list_entry->gref != GRANT_INVALID_REF) + if (gnt_list_entry->gref != INVALID_GRANT_REF) return gnt_list_entry; /* Assign a gref to this page */ gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head); BUG_ON(gnt_list_entry->gref == -ENOSPC); - if (!info->feature_persistent) { + if (!info->bounce) { struct page *indirect_page; /* Fetch a pre-allocated page to use for indirect grefs */ @@ -505,6 +512,7 @@ static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) static int blkif_ioctl(struct block_device *bdev, fmode_t mode, unsigned command, unsigned long argument) { + struct blkfront_info *info = bdev->bd_disk->private_data; int i; switch (command) { @@ -514,9 +522,9 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode, return -EFAULT; return 0; case CDROM_GET_CAPABILITY: - if (bdev->bd_disk->flags & GENHD_FL_CD) - return 0; - return -EINVAL; + if (!(info->vdisk_info & VDISK_CDROM)) + return -EINVAL; + return 0; default: return -EINVAL; } @@ -574,7 +582,7 @@ struct setup_rw_req { struct blkif_request *ring_req; grant_ref_t gref_head; unsigned int id; - /* Only used when persistent grant is used and it's a read request */ + /* Only used when persistent grant is used and it's a write request */ bool need_copy; unsigned int bvec_off; char *bvec_data; @@ -703,7 +711,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri .grant_idx = 0, .segments = NULL, .rinfo = rinfo, - .need_copy = rq_data_dir(req) && info->feature_persistent, + .need_copy = rq_data_dir(req) && info->bounce, }; /* @@ -942,13 +950,13 @@ static void blkif_set_queue_limits(struct blkfront_info *info) blk_queue_flag_set(QUEUE_FLAG_VIRT, rq); if (info->feature_discard) { - blk_queue_flag_set(QUEUE_FLAG_DISCARD, rq); blk_queue_max_discard_sectors(rq, get_capacity(gd)); rq->limits.discard_granularity = info->discard_granularity ?: info->physical_sector_size; rq->limits.discard_alignment = info->discard_alignment; if (info->feature_secdiscard) - blk_queue_flag_set(QUEUE_FLAG_SECERASE, rq); + blk_queue_max_secure_erase_sectors(rq, + get_capacity(gd)); } /* Hard sector size and max sectors impersonate the equiv. hardware. */ @@ -981,11 +989,12 @@ static void xlvbd_flush(struct blkfront_info *info) { blk_queue_write_cache(info->rq, info->feature_flush ? true : false, info->feature_fua ? true : false); - pr_info("blkfront: %s: %s %s %s %s %s\n", + pr_info("blkfront: %s: %s %s %s %s %s %s %s\n", info->gd->disk_name, flush_info(info), "persistent grants:", info->feature_persistent ? "enabled;" : "disabled;", "indirect descriptors:", - info->max_indirect_segments ? "enabled;" : "disabled;"); + info->max_indirect_segments ? "enabled;" : "disabled;", + "bounce buffer:", info->bounce ? "enabled" : "disabled;"); } static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) @@ -1057,9 +1066,8 @@ static char *encode_disk_name(char *ptr, unsigned int n) } static int xlvbd_alloc_gendisk(blkif_sector_t capacity, - struct blkfront_info *info, - u16 vdisk_info, u16 sector_size, - unsigned int physical_sector_size) + struct blkfront_info *info, u16 sector_size, + unsigned int physical_sector_size) { struct gendisk *gd; int nr_minors = 1; @@ -1157,15 +1165,11 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, xlvbd_flush(info); - if (vdisk_info & VDISK_READONLY) + if (info->vdisk_info & VDISK_READONLY) set_disk_ro(gd, 1); - - if (vdisk_info & VDISK_REMOVABLE) + if (info->vdisk_info & VDISK_REMOVABLE) gd->flags |= GENHD_FL_REMOVABLE; - if (vdisk_info & VDISK_CDROM) - gd->flags |= GENHD_FL_CD; - return 0; out_free_tag_set: @@ -1212,7 +1216,7 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo) if (!list_empty(&rinfo->indirect_pages)) { struct page *indirect_page, *n; - BUG_ON(info->feature_persistent); + BUG_ON(info->bounce); list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) { list_del(&indirect_page->lru); __free_page(indirect_page); @@ -1224,12 +1228,12 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo) list_for_each_entry_safe(persistent_gnt, n, &rinfo->grants, node) { list_del(&persistent_gnt->node); - if (persistent_gnt->gref != GRANT_INVALID_REF) { + if (persistent_gnt->gref != INVALID_GRANT_REF) { gnttab_end_foreign_access(persistent_gnt->gref, - 0, 0UL); + NULL); rinfo->persistent_gnts_c--; } - if (info->feature_persistent) + if (info->bounce) __free_page(persistent_gnt->page); kfree(persistent_gnt); } @@ -1249,8 +1253,8 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo) rinfo->shadow[i].req.u.rw.nr_segments; for (j = 0; j < segs; j++) { persistent_gnt = rinfo->shadow[i].grants_used[j]; - gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); - if (info->feature_persistent) + gnttab_end_foreign_access(persistent_gnt->gref, NULL); + if (info->bounce) __free_page(persistent_gnt->page); kfree(persistent_gnt); } @@ -1264,7 +1268,7 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo) for (j = 0; j < INDIRECT_GREFS(segs); j++) { persistent_gnt = rinfo->shadow[i].indirect_grants[j]; - gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); + gnttab_end_foreign_access(persistent_gnt->gref, NULL); __free_page(persistent_gnt->page); kfree(persistent_gnt); } @@ -1285,14 +1289,8 @@ free_shadow: flush_work(&rinfo->work); /* Free resources associated with old device channel. */ - for (i = 0; i < info->nr_ring_pages; i++) { - if (rinfo->ring_ref[i] != GRANT_INVALID_REF) { - gnttab_end_foreign_access(rinfo->ring_ref[i], 0, 0); - rinfo->ring_ref[i] = GRANT_INVALID_REF; - } - } - free_pages((unsigned long)rinfo->ring.sring, get_order(info->nr_ring_pages * XEN_PAGE_SIZE)); - rinfo->ring.sring = NULL; + xenbus_teardown_ring((void **)&rinfo->ring.sring, info->nr_ring_pages, + rinfo->ring_ref); if (rinfo->irq) unbind_from_irqhandler(rinfo->irq, rinfo); @@ -1375,9 +1373,15 @@ static int blkif_get_final_status(enum blk_req_status s1, return BLKIF_RSP_OKAY; } -static bool blkif_completion(unsigned long *id, - struct blkfront_ring_info *rinfo, - struct blkif_response *bret) +/* + * Return values: + * 1 response processed. + * 0 missing further responses. + * -1 error while processing. + */ +static int blkif_completion(unsigned long *id, + struct blkfront_ring_info *rinfo, + struct blkif_response *bret) { int i = 0; struct scatterlist *sg; @@ -1400,7 +1404,7 @@ static bool blkif_completion(unsigned long *id, /* Wait the second response if not yet here. */ if (s2->status < REQ_DONE) - return false; + return 0; bret->status = blkif_get_final_status(s->status, s2->status); @@ -1433,7 +1437,7 @@ static bool blkif_completion(unsigned long *id, data.s = s; num_sg = s->num_sg; - if (bret->operation == BLKIF_OP_READ && info->feature_persistent) { + if (bret->operation == BLKIF_OP_READ && info->bounce) { for_each_sg(s->sg, sg, num_sg, i) { BUG_ON(sg->offset + sg->length > PAGE_SIZE); @@ -1451,57 +1455,58 @@ static bool blkif_completion(unsigned long *id, } /* Add the persistent grant into the list of free grants */ for (i = 0; i < num_grant; i++) { - if (gnttab_query_foreign_access(s->grants_used[i]->gref)) { + if (!gnttab_try_end_foreign_access(s->grants_used[i]->gref)) { /* * If the grant is still mapped by the backend (the * backend has chosen to make this grant persistent) * we add it at the head of the list, so it will be * reused first. */ - if (!info->feature_persistent) - pr_alert_ratelimited("backed has not unmapped grant: %u\n", - s->grants_used[i]->gref); + if (!info->feature_persistent) { + pr_alert("backed has not unmapped grant: %u\n", + s->grants_used[i]->gref); + return -1; + } list_add(&s->grants_used[i]->node, &rinfo->grants); rinfo->persistent_gnts_c++; } else { /* - * If the grant is not mapped by the backend we end the - * foreign access and add it to the tail of the list, - * so it will not be picked again unless we run out of - * persistent grants. + * If the grant is not mapped by the backend we add it + * to the tail of the list, so it will not be picked + * again unless we run out of persistent grants. */ - gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL); - s->grants_used[i]->gref = GRANT_INVALID_REF; + s->grants_used[i]->gref = INVALID_GRANT_REF; list_add_tail(&s->grants_used[i]->node, &rinfo->grants); } } if (s->req.operation == BLKIF_OP_INDIRECT) { for (i = 0; i < INDIRECT_GREFS(num_grant); i++) { - if (gnttab_query_foreign_access(s->indirect_grants[i]->gref)) { - if (!info->feature_persistent) - pr_alert_ratelimited("backed has not unmapped grant: %u\n", - s->indirect_grants[i]->gref); + if (!gnttab_try_end_foreign_access(s->indirect_grants[i]->gref)) { + if (!info->feature_persistent) { + pr_alert("backed has not unmapped grant: %u\n", + s->indirect_grants[i]->gref); + return -1; + } list_add(&s->indirect_grants[i]->node, &rinfo->grants); rinfo->persistent_gnts_c++; } else { struct page *indirect_page; - gnttab_end_foreign_access(s->indirect_grants[i]->gref, 0, 0UL); /* * Add the used indirect page back to the list of * available pages for indirect grefs. */ - if (!info->feature_persistent) { + if (!info->bounce) { indirect_page = s->indirect_grants[i]->page; list_add(&indirect_page->lru, &rinfo->indirect_pages); } - s->indirect_grants[i]->gref = GRANT_INVALID_REF; + s->indirect_grants[i]->gref = INVALID_GRANT_REF; list_add_tail(&s->indirect_grants[i]->node, &rinfo->grants); } } } - return true; + return 1; } static irqreturn_t blkif_interrupt(int irq, void *dev_id) @@ -1512,9 +1517,12 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) unsigned long flags; struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)dev_id; struct blkfront_info *info = rinfo->dev_info; + unsigned int eoiflag = XEN_EOI_FLAG_SPURIOUS; - if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) + if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) { + xen_irq_lateeoi(irq, XEN_EOI_FLAG_SPURIOUS); return IRQ_HANDLED; + } spin_lock_irqsave(&rinfo->ring_lock, flags); again: @@ -1530,6 +1538,8 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) unsigned long id; unsigned int op; + eoiflag = 0; + RING_COPY_RESPONSE(&rinfo->ring, i, &bret); id = bret.id; @@ -1562,12 +1572,17 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) } if (bret.operation != BLKIF_OP_DISCARD) { + int ret; + /* * We may need to wait for an extra response if the * I/O request is split in 2 */ - if (!blkif_completion(&id, rinfo, &bret)) + ret = blkif_completion(&id, rinfo, &bret); + if (!ret) continue; + if (unlikely(ret < 0)) + goto err; } if (add_id_to_freelist(rinfo, id)) { @@ -1591,8 +1606,8 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) blkif_req(req)->error = BLK_STS_NOTSUPP; info->feature_discard = 0; info->feature_secdiscard = 0; - blk_queue_flag_clear(QUEUE_FLAG_DISCARD, rq); - blk_queue_flag_clear(QUEUE_FLAG_SECERASE, rq); + blk_queue_max_discard_sectors(rq, 0); + blk_queue_max_secure_erase_sectors(rq, 0); } break; case BLKIF_OP_FLUSH_DISKCACHE: @@ -1646,6 +1661,8 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) spin_unlock_irqrestore(&rinfo->ring_lock, flags); + xen_irq_lateeoi(irq, eoiflag); + return IRQ_HANDLED; err: @@ -1653,6 +1670,8 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) spin_unlock_irqrestore(&rinfo->ring_lock, flags); + /* No EOI in order to avoid further interrupts. */ + pr_alert("%s disabled for further use\n", info->gd->disk_name); return IRQ_HANDLED; } @@ -1662,38 +1681,23 @@ static int setup_blkring(struct xenbus_device *dev, struct blkfront_ring_info *rinfo) { struct blkif_sring *sring; - int err, i; + int err; struct blkfront_info *info = rinfo->dev_info; unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE; - grant_ref_t gref[XENBUS_MAX_RING_GRANTS]; - - for (i = 0; i < info->nr_ring_pages; i++) - rinfo->ring_ref[i] = GRANT_INVALID_REF; - - sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH, - get_order(ring_size)); - if (!sring) { - xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); - return -ENOMEM; - } - SHARED_RING_INIT(sring); - FRONT_RING_INIT(&rinfo->ring, sring, ring_size); - err = xenbus_grant_ring(dev, rinfo->ring.sring, info->nr_ring_pages, gref); - if (err < 0) { - free_pages((unsigned long)sring, get_order(ring_size)); - rinfo->ring.sring = NULL; + err = xenbus_setup_ring(dev, GFP_NOIO, (void **)&sring, + info->nr_ring_pages, rinfo->ring_ref); + if (err) goto fail; - } - for (i = 0; i < info->nr_ring_pages; i++) - rinfo->ring_ref[i] = gref[i]; + + XEN_FRONT_RING_INIT(&rinfo->ring, sring, ring_size); err = xenbus_alloc_evtchn(dev, &rinfo->evtchn); if (err) goto fail; - err = bind_evtchn_to_irqhandler(rinfo->evtchn, blkif_interrupt, 0, - "blkif", rinfo); + err = bind_evtchn_to_irqhandler_lateeoi(rinfo->evtchn, blkif_interrupt, + 0, "blkif", rinfo); if (err <= 0) { xenbus_dev_fatal(dev, err, "bind_evtchn_to_irqhandler failed"); @@ -1755,6 +1759,12 @@ abort_transaction: return err; } +/* Enable the persistent grants feature. */ +static bool feature_persistent = true; +module_param(feature_persistent, bool, 0644); +MODULE_PARM_DESC(feature_persistent, + "Enables the persistent grants feature"); + /* Common code used when first setting up, and when resuming. */ static int talk_to_blkback(struct xenbus_device *dev, struct blkfront_info *info) @@ -1769,6 +1779,10 @@ static int talk_to_blkback(struct xenbus_device *dev, if (!info) return -ENODEV; + /* Check if backend is trusted. */ + info->bounce = !xen_blkif_trusted || + !xenbus_read_unsigned(dev->nodename, "trusted", 1); + max_page_order = xenbus_read_unsigned(info->xbdev->otherend, "max-ring-page-order", 0); ring_page_order = min(xen_blkif_max_ring_order, max_page_order); @@ -1842,8 +1856,9 @@ again: message = "writing protocol"; goto abort_transaction; } + info->feature_persistent_parm = feature_persistent; err = xenbus_printf(xbt, dev->nodename, "feature-persistent", "%u", - info->feature_persistent); + info->feature_persistent_parm); if (err) dev_warn(&dev->dev, "writing persistent grants feature to xenbus"); @@ -1911,12 +1926,6 @@ static int negotiate_mq(struct blkfront_info *info) return 0; } -/* Enable the persistent grants feature. */ -static bool feature_persistent = true; -module_param(feature_persistent, bool, 0644); -MODULE_PARM_DESC(feature_persistent, - "Enables the persistent grants feature"); - /* * Entry point to this code when a new device is created. Allocate the basic * structures and the ring buffer for communication with the backend, and @@ -1983,8 +1992,6 @@ static int blkfront_probe(struct xenbus_device *dev, info->vdevice = vdevice; info->connected = BLKIF_STATE_DISCONNECTED; - info->feature_persistent = feature_persistent; - /* Front end dir is a number, which is used as the id. */ info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0); dev_set_drvdata(&dev->dev, info); @@ -2119,9 +2126,11 @@ static void blkfront_closing(struct blkfront_info *info) return; /* No more blkif_request(). */ - blk_mq_stop_hw_queues(info->rq); - blk_set_queue_dying(info->rq); - set_capacity(info->gd, 0); + if (info->rq && info->gd) { + blk_mq_stop_hw_queues(info->rq); + blk_mark_disk_dead(info->gd); + set_capacity(info->gd, 0); + } for_each_rinfo(info, rinfo, i) { /* No more gnttab callback work. */ @@ -2176,17 +2185,18 @@ static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo) if (err) goto out_of_memory; - if (!info->feature_persistent && info->max_indirect_segments) { + if (!info->bounce && info->max_indirect_segments) { /* - * We are using indirect descriptors but not persistent - * grants, we need to allocate a set of pages that can be + * We are using indirect descriptors but don't have a bounce + * buffer, we need to allocate a set of pages that can be * used for mapping indirect grefs */ int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info); BUG_ON(!list_empty(&rinfo->indirect_pages)); for (i = 0; i < num; i++) { - struct page *indirect_page = alloc_page(GFP_KERNEL); + struct page *indirect_page = alloc_page(GFP_KERNEL | + __GFP_ZERO); if (!indirect_page) goto out_of_memory; list_add(&indirect_page->lru, &rinfo->indirect_pages); @@ -2275,10 +2285,12 @@ static void blkfront_gather_backend_features(struct blkfront_info *info) if (xenbus_read_unsigned(info->xbdev->otherend, "feature-discard", 0)) blkfront_setup_discard(info); - if (info->feature_persistent) + if (info->feature_persistent_parm) info->feature_persistent = !!xenbus_read_unsigned(info->xbdev->otherend, "feature-persistent", 0); + if (info->feature_persistent) + info->bounce = true; indirect_segments = xenbus_read_unsigned(info->xbdev->otherend, "feature-max-indirect-segments", 0); @@ -2304,7 +2316,6 @@ static void blkfront_connect(struct blkfront_info *info) unsigned long long sectors; unsigned long sector_size; unsigned int physical_sector_size; - unsigned int binfo; int err, i; struct blkfront_ring_info *rinfo; @@ -2342,7 +2353,7 @@ static void blkfront_connect(struct blkfront_info *info) err = xenbus_gather(XBT_NIL, info->xbdev->otherend, "sectors", "%llu", §ors, - "info", "%u", &binfo, + "info", "%u", &info->vdisk_info, "sector-size", "%lu", §or_size, NULL); if (err) { @@ -2371,7 +2382,7 @@ static void blkfront_connect(struct blkfront_info *info) } } - err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size, + err = xlvbd_alloc_gendisk(sectors, info, sector_size, physical_sector_size); if (err) { xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", @@ -2388,7 +2399,7 @@ static void blkfront_connect(struct blkfront_info *info) err = device_add_disk(&info->xbdev->dev, info->gd, NULL); if (err) { - blk_cleanup_disk(info->gd); + put_disk(info->gd); blk_mq_free_tag_set(&info->tag_set); info->rq = NULL; goto fail; @@ -2463,16 +2474,19 @@ static int blkfront_remove(struct xenbus_device *xbdev) dev_dbg(&xbdev->dev, "%s removed", xbdev->nodename); - del_gendisk(info->gd); + if (info->gd) + del_gendisk(info->gd); mutex_lock(&blkfront_mutex); list_del(&info->info_list); mutex_unlock(&blkfront_mutex); blkif_free(info, 0); - xlbd_release_minors(info->gd->first_minor, info->gd->minors); - blk_cleanup_disk(info->gd); - blk_mq_free_tag_set(&info->tag_set); + if (info->gd) { + xlbd_release_minors(info->gd->first_minor, info->gd->minors); + put_disk(info->gd); + blk_mq_free_tag_set(&info->tag_set); + } kfree(info); return 0; @@ -2516,6 +2530,7 @@ static void purge_persistent_grants(struct blkfront_info *info) for_each_rinfo(info, rinfo, i) { struct grant *gnt_list_entry, *tmp; + LIST_HEAD(grants); spin_lock_irqsave(&rinfo->ring_lock, flags); @@ -2526,17 +2541,18 @@ static void purge_persistent_grants(struct blkfront_info *info) list_for_each_entry_safe(gnt_list_entry, tmp, &rinfo->grants, node) { - if (gnt_list_entry->gref == GRANT_INVALID_REF || - gnttab_query_foreign_access(gnt_list_entry->gref)) + if (gnt_list_entry->gref == INVALID_GRANT_REF || + !gnttab_try_end_foreign_access(gnt_list_entry->gref)) continue; list_del(&gnt_list_entry->node); - gnttab_end_foreign_access(gnt_list_entry->gref, 0, 0UL); rinfo->persistent_gnts_c--; - gnt_list_entry->gref = GRANT_INVALID_REF; - list_add_tail(&gnt_list_entry->node, &rinfo->grants); + gnt_list_entry->gref = INVALID_GRANT_REF; + list_add_tail(&gnt_list_entry->node, &grants); } + list_splice_tail(&grants, &rinfo->grants); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); } } @@ -2546,6 +2562,13 @@ static void blkfront_delay_work(struct work_struct *work) struct blkfront_info *info; bool need_schedule_work = false; + /* + * Note that when using bounce buffers but not persistent grants + * there's no need to run blkfront_delay_work because grants are + * revoked in blkif_completion or else an error is reported and the + * connection is closed. + */ + mutex_lock(&blkfront_mutex); list_for_each_entry(info, &info_list, info_list) { diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index ccc52c935faf..c1e85f356e4d 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -327,6 +327,7 @@ static int z2ram_register_disk(int minor) disk->major = Z2RAM_MAJOR; disk->first_minor = minor; disk->minors = 1; + disk->flags |= GENHD_FL_NO_PART; disk->fops = &z2_fops; if (minor) sprintf(disk->disk_name, "z2ram%d", minor); @@ -336,7 +337,7 @@ static int z2ram_register_disk(int minor) z2ram_gendisk[minor] = disk; err = add_disk(disk); if (err) - blk_cleanup_disk(disk); + put_disk(disk); return err; } @@ -383,7 +384,6 @@ static void __exit z2_exit(void) for (i = 0; i < Z2MINOR_COUNT; i++) { del_gendisk(z2ram_gendisk[i]); - blk_cleanup_queue(z2ram_gendisk[i]->queue); put_disk(z2ram_gendisk[i]); } blk_mq_free_tag_set(&tag_set); diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index 668c6bf2554d..d4100b0c083e 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -1,8 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 config ZRAM tristate "Compressed RAM block device support" - depends on BLOCK && SYSFS && ZSMALLOC && CRYPTO + depends on BLOCK && SYSFS && MMU depends on CRYPTO_LZO || CRYPTO_ZSTD || CRYPTO_LZ4 || CRYPTO_LZ4HC || CRYPTO_842 + select ZSMALLOC help Creates virtual block devices called /dev/zramX (X = 0, 1, ...). Pages written to these disks are compressed and stored in memory diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 052aa3f65514..0916de952e09 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -63,12 +63,6 @@ static int zcomp_strm_init(struct zcomp_strm *zstrm, struct zcomp *comp) bool zcomp_available_algorithm(const char *comp) { - int i; - - i = sysfs_match_string(backends, comp); - if (i >= 0) - return true; - /* * Crypto does not ignore a trailing new line symbol, * so make sure you don't supply a string containing @@ -217,6 +211,11 @@ struct zcomp *zcomp_create(const char *compress) struct zcomp *comp; int error; + /* + * Crypto API will execute /sbin/modprobe if the compression module + * is not loaded yet. We must do it here, otherwise we are about to + * call /sbin/modprobe under CPU hot-plug lock. + */ if (!zcomp_available_algorithm(compress)) return ERR_PTR(-EINVAL); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 25071126995b..966aab902d19 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -22,7 +22,6 @@ #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/device.h> -#include <linux/genhd.h> #include <linux/highmem.h> #include <linux/slab.h> #include <linux/backing-dev.h> @@ -53,7 +52,6 @@ static unsigned int num_devices = 1; static size_t huge_class_size; static const struct block_device_operations zram_devops; -static const struct block_device_operations zram_wb_devops; static void zram_free_page(struct zram *zram, size_t index); static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, @@ -328,8 +326,8 @@ static ssize_t idle_store(struct device *dev, if (!sysfs_streq(buf, "all")) { /* - * If it did not parse as 'all' try to treat it as an integer when - * we have memory tracking enabled. + * If it did not parse as 'all' try to treat it as an integer + * when we have memory tracking enabled. */ u64 age_sec; @@ -344,7 +342,10 @@ static ssize_t idle_store(struct device *dev, if (!init_done(zram)) goto out_unlock; - /* A cutoff_time of 0 marks everything as idle, this is the "all" behavior */ + /* + * A cutoff_time of 0 marks everything as idle, this is the + * "all" behavior. + */ mark_idle(zram, cutoff_time); rv = len; @@ -498,7 +499,7 @@ static ssize_t backing_dev_store(struct device *dev, goto out; } - strlcpy(file_name, buf, PATH_MAX); + strscpy(file_name, buf, PATH_MAX); /* ignore trailing newline */ sz = strlen(file_name); if (sz > 0 && file_name[sz - 1] == '\n') @@ -542,17 +543,6 @@ static ssize_t backing_dev_store(struct device *dev, zram->backing_dev = backing_dev; zram->bitmap = bitmap; zram->nr_pages = nr_pages; - /* - * With writeback feature, zram does asynchronous IO so it's no longer - * synchronous device so let's remove synchronous io flag. Othewise, - * upper layer(e.g., swap) could wait IO completion rather than - * (submit and return), which will cause system sluggish. - * Furthermore, when the IO function returns(e.g., swap_readpage), - * upper layer expects IO was done so it could deallocate the page - * freely but in fact, IO is going on so finally could cause - * use-after-free when the IO is really done. - */ - zram->disk->fops = &zram_wb_devops; up_write(&zram->init_lock); pr_info("setup backing device %s\n", file_name); @@ -617,24 +607,21 @@ static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec, { struct bio *bio; - bio = bio_alloc(GFP_NOIO, 1); + bio = bio_alloc(zram->bdev, 1, parent ? parent->bi_opf : REQ_OP_READ, + GFP_NOIO); if (!bio) return -ENOMEM; bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9); - bio_set_dev(bio, zram->bdev); if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len, bvec->bv_offset)) { bio_put(bio); return -EIO; } - if (!parent) { - bio->bi_opf = REQ_OP_READ; + if (!parent) bio->bi_end_io = zram_page_end_io; - } else { - bio->bi_opf = parent->bi_opf; + else bio_chain(bio, parent); - } submit_bio(bio); return 1; @@ -643,8 +630,8 @@ static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec, #define PAGE_WB_SIG "page_index=" #define PAGE_WRITEBACK 0 -#define HUGE_WRITEBACK 1 -#define IDLE_WRITEBACK 2 +#define HUGE_WRITEBACK (1<<0) +#define IDLE_WRITEBACK (1<<1) static ssize_t writeback_store(struct device *dev, @@ -664,6 +651,8 @@ static ssize_t writeback_store(struct device *dev, mode = IDLE_WRITEBACK; else if (sysfs_streq(buf, "huge")) mode = HUGE_WRITEBACK; + else if (sysfs_streq(buf, "huge_idle")) + mode = IDLE_WRITEBACK | HUGE_WRITEBACK; else { if (strncmp(buf, PAGE_WB_SIG, sizeof(PAGE_WB_SIG) - 1)) return -EINVAL; @@ -725,10 +714,10 @@ static ssize_t writeback_store(struct device *dev, zram_test_flag(zram, index, ZRAM_UNDER_WB)) goto next; - if (mode == IDLE_WRITEBACK && + if (mode & IDLE_WRITEBACK && !zram_test_flag(zram, index, ZRAM_IDLE)) goto next; - if (mode == HUGE_WRITEBACK && + if (mode & HUGE_WRITEBACK && !zram_test_flag(zram, index, ZRAM_HUGE)) goto next; /* @@ -747,10 +736,9 @@ static ssize_t writeback_store(struct device *dev, continue; } - bio_init(&bio, &bio_vec, 1); - bio_set_dev(&bio, zram->bdev); + bio_init(&bio, zram->bdev, &bio_vec, 1, + REQ_OP_WRITE | REQ_SYNC); bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9); - bio.bi_opf = REQ_OP_WRITE | REQ_SYNC; bio_add_page(&bio, bvec.bv_page, bvec.bv_len, bvec.bv_offset); @@ -1032,7 +1020,7 @@ static ssize_t comp_algorithm_store(struct device *dev, char compressor[ARRAY_SIZE(zram->compressor)]; size_t sz; - strlcpy(compressor, buf, sizeof(compressor)); + strscpy(compressor, buf, sizeof(compressor)); /* ignore trailing newline */ sz = strlen(compressor); if (sz > 0 && compressor[sz - 1] == '\n') @@ -1268,6 +1256,9 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, struct bio_vec bvec; zram_slot_unlock(zram, index); + /* A null bio means rw_page was used, we must fallback to bio */ + if (!bio) + return -EOPNOTSUPP; bvec.bv_page = page; bvec.bv_len = PAGE_SIZE; @@ -1336,12 +1327,10 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, goto out; if (is_partial_io(bvec)) { - void *dst = kmap_atomic(bvec->bv_page); void *src = kmap_atomic(page); - memcpy(dst + bvec->bv_offset, src + offset, bvec->bv_len); + memcpy_to_bvec(bvec, src + offset); kunmap_atomic(src); - kunmap_atomic(dst); } out: if (is_partial_io(bvec)) @@ -1355,7 +1344,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, { int ret = 0; unsigned long alloced_pages; - unsigned long handle = 0; + unsigned long handle = -ENOMEM; unsigned int comp_len = 0; void *src, *dst, *mem; struct zcomp_strm *zstrm; @@ -1401,21 +1390,31 @@ compress_again: * if we have a 'non-null' handle here then we are coming * from the slow path and handle has already been allocated. */ - if (!handle) + if (IS_ERR((void *)handle)) handle = zs_malloc(zram->mem_pool, comp_len, __GFP_KSWAPD_RECLAIM | __GFP_NOWARN | __GFP_HIGHMEM | __GFP_MOVABLE); - if (!handle) { + if (IS_ERR((void *)handle)) { zcomp_stream_put(zram->comp); atomic64_inc(&zram->stats.writestall); handle = zs_malloc(zram->mem_pool, comp_len, GFP_NOIO | __GFP_HIGHMEM | __GFP_MOVABLE); - if (handle) + if (IS_ERR((void *)handle)) + return PTR_ERR((void *)handle); + + if (comp_len != PAGE_SIZE) goto compress_again; - return -ENOMEM; + /* + * If the page is not compressible, you need to acquire the + * lock and execute the code below. The zcomp_stream_get() + * call is needed to disable the cpu hotplug and grab the + * zstrm buffer back. It is necessary that the dereferencing + * of the zstrm variable below occurs correctly. + */ + zstrm = zcomp_stream_get(zram->comp); } alloced_pages = zs_get_total_pages(zram->mem_pool); @@ -1472,7 +1471,6 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, { int ret; struct page *page = NULL; - void *src; struct bio_vec vec; vec = *bvec; @@ -1490,11 +1488,9 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, if (ret) goto out; - src = kmap_atomic(bvec->bv_page); dst = kmap_atomic(page); - memcpy(dst + offset, src + bvec->bv_offset, bvec->bv_len); + memcpy_from_bvec(dst + offset, bvec); kunmap_atomic(dst); - kunmap_atomic(src); vec.bv_page = page; vec.bv_len = PAGE_SIZE; @@ -1552,7 +1548,7 @@ static void zram_bio_discard(struct zram *zram, u32 index, * Returns 1 if IO request was successfully submitted. */ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset, unsigned int op, struct bio *bio) + int offset, enum req_op op, struct bio *bio) { int ret; @@ -1660,7 +1656,7 @@ static void zram_slot_free_notify(struct block_device *bdev, } static int zram_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, unsigned int op) + struct page *page, enum req_op op) { int offset, ret; u32 index; @@ -1685,9 +1681,10 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector, bv.bv_len = PAGE_SIZE; bv.bv_offset = 0; - start_time = disk_start_io_acct(bdev->bd_disk, SECTORS_PER_PAGE, op); + start_time = bdev_start_io_acct(bdev->bd_disk->part0, + SECTORS_PER_PAGE, op, jiffies); ret = zram_bvec_rw(zram, &bv, index, offset, op, NULL); - disk_end_io_acct(bdev->bd_disk, op, start_time); + bdev_end_io_acct(bdev->bd_disk->part0, op, start_time); out: /* * If I/O fails, just return error(ie, non-zero) without @@ -1715,9 +1712,6 @@ out: static void zram_reset_device(struct zram *zram) { - struct zcomp *comp; - u64 disksize; - down_write(&zram->init_lock); zram->limit_pages = 0; @@ -1727,17 +1721,15 @@ static void zram_reset_device(struct zram *zram) return; } - comp = zram->comp; - disksize = zram->disksize; - zram->disksize = 0; - set_capacity_and_notify(zram->disk, 0); part_stat_set_all(zram->disk->part0, 0); /* I/O operation under all of CPU are done so let's free */ - zram_meta_free(zram, disksize); + zram_meta_free(zram, zram->disksize); + zram->disksize = 0; memset(&zram->stats, 0, sizeof(zram->stats)); - zcomp_destroy(comp); + zcomp_destroy(zram->comp); + zram->comp = NULL; reset_bdev(zram); up_write(&zram->init_lock); @@ -1796,7 +1788,7 @@ static ssize_t reset_store(struct device *dev, int ret; unsigned short do_reset; struct zram *zram; - struct block_device *bdev; + struct gendisk *disk; ret = kstrtou16(buf, 10, &do_reset); if (ret) @@ -1806,26 +1798,26 @@ static ssize_t reset_store(struct device *dev, return -EINVAL; zram = dev_to_zram(dev); - bdev = zram->disk->part0; + disk = zram->disk; - mutex_lock(&bdev->bd_disk->open_mutex); + mutex_lock(&disk->open_mutex); /* Do not reset an active device or claimed device */ - if (bdev->bd_openers || zram->claim) { - mutex_unlock(&bdev->bd_disk->open_mutex); + if (disk_openers(disk) || zram->claim) { + mutex_unlock(&disk->open_mutex); return -EBUSY; } /* From now on, anyone can't open /dev/zram[0-9] */ zram->claim = true; - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_unlock(&disk->open_mutex); /* Make sure all the pending I/O are finished */ - sync_blockdev(bdev); + sync_blockdev(disk->part0); zram_reset_device(zram); - mutex_lock(&bdev->bd_disk->open_mutex); + mutex_lock(&disk->open_mutex); zram->claim = false; - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_unlock(&disk->open_mutex); return len; } @@ -1853,15 +1845,6 @@ static const struct block_device_operations zram_devops = { .owner = THIS_MODULE }; -#ifdef CONFIG_ZRAM_WRITEBACK -static const struct block_device_operations zram_wb_devops = { - .open = zram_open, - .submit_bio = zram_submit_bio, - .swap_slot_free_notify = zram_slot_free_notify, - .owner = THIS_MODULE -}; -#endif - static DEVICE_ATTR_WO(compact); static DEVICE_ATTR_RW(disksize); static DEVICE_ATTR_RO(initstate); @@ -1903,14 +1886,7 @@ static struct attribute *zram_disk_attrs[] = { NULL, }; -static const struct attribute_group zram_disk_attr_group = { - .attrs = zram_disk_attrs, -}; - -static const struct attribute_group *zram_disk_attr_groups[] = { - &zram_disk_attr_group, - NULL, -}; +ATTRIBUTE_GROUPS(zram_disk); /* * Allocate and initialize new zram device. the function returns @@ -1947,6 +1923,7 @@ static int zram_add(void) zram->disk->major = zram_major; zram->disk->first_minor = device_id; zram->disk->minors = 1; + zram->disk->flags |= GENHD_FL_NO_PART; zram->disk->fops = &zram_devops; zram->disk->private_data = zram; snprintf(zram->disk->disk_name, 16, "zram%d", device_id); @@ -1968,7 +1945,6 @@ static int zram_add(void) blk_queue_io_opt(zram->disk->queue, PAGE_SIZE); zram->disk->queue->limits.discard_granularity = PAGE_SIZE; blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, zram->disk->queue); /* * zram_bio_discard() will clear all logical blocks if logical block @@ -1982,18 +1958,18 @@ static int zram_add(void) blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX); blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue); - ret = device_add_disk(NULL, zram->disk, zram_disk_attr_groups); + ret = device_add_disk(NULL, zram->disk, zram_disk_groups); if (ret) goto out_cleanup_disk; - strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); + strscpy(zram->compressor, default_compressor, sizeof(zram->compressor)); zram_debugfs_register(zram); pr_info("Added device: %s\n", zram->disk->disk_name); return device_id; out_cleanup_disk: - blk_cleanup_disk(zram->disk); + put_disk(zram->disk); out_free_idr: idr_remove(&zram_index_idr, device_id); out_free_dev: @@ -2003,19 +1979,18 @@ out_free_dev: static int zram_remove(struct zram *zram) { - struct block_device *bdev = zram->disk->part0; bool claimed; - mutex_lock(&bdev->bd_disk->open_mutex); - if (bdev->bd_openers) { - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_lock(&zram->disk->open_mutex); + if (disk_openers(zram->disk)) { + mutex_unlock(&zram->disk->open_mutex); return -EBUSY; } claimed = zram->claim; if (!claimed) zram->claim = true; - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_unlock(&zram->disk->open_mutex); zram_debugfs_unregister(zram); @@ -2027,7 +2002,7 @@ static int zram_remove(struct zram *zram) ; } else { /* Make sure all the pending I/O are finished */ - sync_blockdev(bdev); + sync_blockdev(zram->disk->part0); zram_reset_device(zram); } @@ -2045,7 +2020,7 @@ static int zram_remove(struct zram *zram) */ zram_reset_device(zram); - blk_cleanup_disk(zram->disk); + put_disk(zram->disk); kfree(zram); return 0; } @@ -2139,6 +2114,8 @@ static int __init zram_init(void) { int ret; + BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > BITS_PER_LONG); + ret = cpuhp_setup_state_multi(CPUHP_ZCOMP_PREPARE, "block/zram:prepare", zcomp_cpu_up_prepare, zcomp_cpu_dead); if (ret < 0) diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 80c3b43b4828..a2bda53020fd 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -30,16 +30,15 @@ /* - * The lower ZRAM_FLAG_SHIFT bits of table.flags is for - * object size (excluding header), the higher bits is for - * zram_pageflags. + * ZRAM is mainly used for memory efficiency so we want to keep memory + * footprint small and thus squeeze size and zram pageflags into a flags + * member. The lower ZRAM_FLAG_SHIFT bits is for object size (excluding + * header), which cannot be larger than PAGE_SIZE (requiring PAGE_SHIFT + * bits), the higher bits are for zram_pageflags. * - * zram is mainly used for memory efficiency so we want to keep memory - * footprint small so we can squeeze size and flags into a field. - * The lower ZRAM_FLAG_SHIFT bits is for object size (excluding header), - * the higher bits is for zram_pageflags. + * We use BUILD_BUG_ON() to make sure that zram pageflags don't overflow. */ -#define ZRAM_FLAG_SHIFT 24 +#define ZRAM_FLAG_SHIFT (PAGE_SHIFT + 1) /* Flags for zram pages (table[page_no].flags) */ enum zram_pageflags { |