From b5dd2f6047ca108001328aac0e8588edd15f1778 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 31 Dec 2014 13:22:57 +0000 Subject: block: loop: improve performance via blk-mq The conversion is fairly straightforward: a workqueue is used to dispatch the loop block driver's requests. One big change is that requests are submitted to the backing file/device concurrently via the workqueue, so throughput may improve considerably. Since write requests over the same file often run exclusively, don't handle them concurrently, to avoid extra context-switch cost, possible lock contention and work-scheduling cost. Also, with blk-mq there is an opportunity to get loop I/O merged before it is submitted to the backing file/device. In the following test: - base: v3.19-rc2-2041231 - loop over file in ext4 file system on SSD disk - bs: 4k, libaio, io depth: 64, O_DIRECT, num of jobs: 1 - throughput: IOPS ------------------------------------------------------ | | base | base with loop-mq | delta | ------------------------------------------------------ | randread | 1740 | 25318 | +1355%| ------------------------------------------------------ | read | 42196 | 51771 | +22.6%| ------------------------------------------------------ | randwrite | 35709 | 34624 | -3% | ------------------------------------------------------ | write | 39137 | 40326 | +3% | ------------------------------------------------------ So loop-mq improves throughput for both read and randread; meanwhile, write and randwrite performance is basically not hurt. Another benefit is that the loop driver code gets much simpler after the blk-mq conversion, so the patch can be thought of as a cleanup too. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/loop.c | 327 +++++++++++++++++++++++++-------------------------- drivers/block/loop.h | 17 ++- 2 files changed, 174 insertions(+), 170 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 6cb1beb47c25..c678eb24a7b5 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -75,6 +75,7 @@ #include #include #include +#include #include "loop.h" #include @@ -85,6 +86,8 @@ static DEFINE_MUTEX(loop_index_mutex); static int max_part; static int part_shift; +static struct workqueue_struct *loop_wq; + /* * Transfer functions */ @@ -466,109 +469,36 @@ out: return ret; } -/* - * Add bio to back of pending list - */ -static void loop_add_bio(struct loop_device *lo, struct bio *bio) -{ - lo->lo_bio_count++; - bio_list_add(&lo->lo_bio_list, bio); - } - -/* - * Grab first pending buffer - */ -static struct bio *loop_get_bio(struct loop_device *lo) -{ - lo->lo_bio_count--; - return bio_list_pop(&lo->lo_bio_list); -} - -static void loop_make_request(struct request_queue *q, struct bio *old_bio) -{ - struct loop_device *lo = q->queuedata; - int rw = bio_rw(old_bio); - - if (rw == READA) - rw = READ; - - BUG_ON(!lo || (rw != READ && rw != WRITE)); - - spin_lock_irq(&lo->lo_lock); - if (lo->lo_state != Lo_bound) - goto out; - if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY))) - goto out; - if (lo->lo_bio_count >= q->nr_congestion_on) - wait_event_lock_irq(lo->lo_req_wait, - lo->lo_bio_count < q->nr_congestion_off, - lo->lo_lock); - loop_add_bio(lo, old_bio); - wake_up(&lo->lo_event); - spin_unlock_irq(&lo->lo_lock); - return; - -out: - spin_unlock_irq(&lo->lo_lock); - bio_io_error(old_bio); -} - struct switch_request { struct file *file; struct completion wait; }; -static void do_loop_switch(struct loop_device *, struct switch_request *); - -static inline void loop_handle_bio(struct 
loop_device *lo, struct bio *bio) +static inline int loop_handle_bio(struct loop_device *lo, struct bio *bio) { - if (unlikely(!bio->bi_bdev)) { - do_loop_switch(lo, bio->bi_private); - bio_put(bio); - } else { - int ret = do_bio_filebacked(lo, bio); - bio_endio(bio, ret); - } + return do_bio_filebacked(lo, bio); } /* - * worker thread that handles reads/writes to file backed loop devices, - * to avoid blocking in our make_request_fn. it also does loop decrypting - * on reads for block backed loop, as that is too heavy to do from - * b_end_io context where irqs may be disabled. - * - * Loop explanation: loop_clr_fd() sets lo_state to Lo_rundown before - * calling kthread_stop(). Therefore once kthread_should_stop() is - * true, make_request will not place any more requests. Therefore - * once kthread_should_stop() is true and lo_bio is NULL, we are - * done with the loop. + * Do the actual switch; called from the BIO completion routine */ -static int loop_thread(void *data) +static void do_loop_switch(struct loop_device *lo, struct switch_request *p) { - struct loop_device *lo = data; - struct bio *bio; - - set_user_nice(current, MIN_NICE); - - while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) { - - wait_event_interruptible(lo->lo_event, - !bio_list_empty(&lo->lo_bio_list) || - kthread_should_stop()); - - if (bio_list_empty(&lo->lo_bio_list)) - continue; - spin_lock_irq(&lo->lo_lock); - bio = loop_get_bio(lo); - if (lo->lo_bio_count < lo->lo_queue->nr_congestion_off) - wake_up(&lo->lo_req_wait); - spin_unlock_irq(&lo->lo_lock); + struct file *file = p->file; + struct file *old_file = lo->lo_backing_file; + struct address_space *mapping; - BUG_ON(!bio); - loop_handle_bio(lo, bio); - } + /* if no new file, only flush of queued bios requested */ + if (!file) + return; - return 0; + mapping = file->f_mapping; + mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); + lo->lo_backing_file = file; + lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ? 
+ mapping->host->i_bdev->bd_block_size : PAGE_SIZE; + lo->old_gfp_mask = mapping_gfp_mask(mapping); + mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); } /* @@ -579,15 +509,18 @@ static int loop_thread(void *data) static int loop_switch(struct loop_device *lo, struct file *file) { struct switch_request w; - struct bio *bio = bio_alloc(GFP_KERNEL, 0); - if (!bio) - return -ENOMEM; - init_completion(&w.wait); + w.file = file; - bio->bi_private = &w; - bio->bi_bdev = NULL; - loop_make_request(lo->lo_queue, bio); - wait_for_completion(&w.wait); + + /* freeze queue and wait for completion of scheduled requests */ + blk_mq_freeze_queue(lo->lo_queue); + + /* do the switch action */ + do_loop_switch(lo, &w); + + /* unfreeze */ + blk_mq_unfreeze_queue(lo->lo_queue); + return 0; } @@ -596,38 +529,9 @@ static int loop_switch(struct loop_device *lo, struct file *file) */ static int loop_flush(struct loop_device *lo) { - /* loop not yet configured, no running thread, nothing to flush */ - if (!lo->lo_thread) - return 0; - return loop_switch(lo, NULL); } -/* - * Do the actual switch; called from the BIO completion routine - */ -static void do_loop_switch(struct loop_device *lo, struct switch_request *p) -{ - struct file *file = p->file; - struct file *old_file = lo->lo_backing_file; - struct address_space *mapping; - - /* if no new file, only flush of queued bios requested */ - if (!file) - goto out; - - mapping = file->f_mapping; - mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); - lo->lo_backing_file = file; - lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ? - mapping->host->i_bdev->bd_block_size : PAGE_SIZE; - lo->old_gfp_mask = mapping_gfp_mask(mapping); - mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); -out: - complete(&p->wait); -} - - /* * loop_change_fd switched the backing store of a loopback device to * a new file. 
This is useful for operating system installers to free up @@ -889,12 +793,9 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, lo->transfer = transfer_none; lo->ioctl = NULL; lo->lo_sizelimit = 0; - lo->lo_bio_count = 0; lo->old_gfp_mask = mapping_gfp_mask(mapping); mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); - bio_list_init(&lo->lo_bio_list); - if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) blk_queue_flush(lo->lo_queue, REQ_FLUSH); @@ -906,14 +807,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, set_blocksize(bdev, lo_blocksize); - lo->lo_thread = kthread_create(loop_thread, lo, "loop%d", - lo->lo_number); - if (IS_ERR(lo->lo_thread)) { - error = PTR_ERR(lo->lo_thread); - goto out_clr; - } lo->lo_state = Lo_bound; - wake_up_process(lo->lo_thread); if (part_shift) lo->lo_flags |= LO_FLAGS_PARTSCAN; if (lo->lo_flags & LO_FLAGS_PARTSCAN) @@ -925,18 +819,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, bdgrab(bdev); return 0; -out_clr: - loop_sysfs_exit(lo); - lo->lo_thread = NULL; - lo->lo_device = NULL; - lo->lo_backing_file = NULL; - lo->lo_flags = 0; - set_capacity(lo->lo_disk, 0); - invalidate_bdev(bdev); - bd_set_size(bdev, 0); - kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); - mapping_set_gfp_mask(mapping, lo->old_gfp_mask); - lo->lo_state = Lo_unbound; out_putf: fput(file); out: @@ -1012,11 +894,6 @@ static int loop_clr_fd(struct loop_device *lo) spin_lock_irq(&lo->lo_lock); lo->lo_state = Lo_rundown; - spin_unlock_irq(&lo->lo_lock); - - kthread_stop(lo->lo_thread); - - spin_lock_irq(&lo->lo_lock); lo->lo_backing_file = NULL; spin_unlock_irq(&lo->lo_lock); @@ -1028,7 +905,6 @@ static int loop_clr_fd(struct loop_device *lo) lo->lo_offset = 0; lo->lo_sizelimit = 0; lo->lo_encrypt_key_size = 0; - lo->lo_thread = NULL; memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE); memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); memset(lo->lo_file_name, 0, LO_NAME_SIZE); @@ -1601,6 +1477,108 @@ int loop_unregister_transfer(int number) EXPORT_SYMBOL(loop_register_transfer); EXPORT_SYMBOL(loop_unregister_transfer); +static int loop_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct loop_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); + + blk_mq_start_request(bd->rq); + + if (cmd->rq->cmd_flags & REQ_WRITE) { + struct loop_device *lo = cmd->rq->q->queuedata; + bool need_sched = true; + + spin_lock_irq(&lo->lo_lock); + if (lo->write_started) + need_sched = false; + else + lo->write_started = true; + list_add_tail(&cmd->list, &lo->write_cmd_head); + spin_unlock_irq(&lo->lo_lock); + + if (need_sched) + queue_work(loop_wq, &lo->write_work); + } else { + queue_work(loop_wq, &cmd->read_work); + } + + return BLK_MQ_RQ_QUEUE_OK; +} + +static void loop_handle_cmd(struct loop_cmd *cmd) +{ + const bool write = cmd->rq->cmd_flags & REQ_WRITE; + struct loop_device *lo = cmd->rq->q->queuedata; + int ret = -EIO; + struct bio *bio; + + if (lo->lo_state != Lo_bound) + goto failed; + + if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) + goto failed; + + ret = 0; + __rq_for_each_bio(bio, cmd->rq) + ret |= loop_handle_bio(lo, bio); + + failed: + if (ret) + cmd->rq->errors = -EIO; + blk_mq_complete_request(cmd->rq); +} + +static void loop_queue_write_work(struct work_struct *work) +{ + struct loop_device *lo = + container_of(work, struct loop_device, write_work); + LIST_HEAD(cmd_list); + + spin_lock_irq(&lo->lo_lock); + repeat: + list_splice_init(&lo->write_cmd_head, &cmd_list); + 
spin_unlock_irq(&lo->lo_lock); + + while (!list_empty(&cmd_list)) { + struct loop_cmd *cmd = list_first_entry(&cmd_list, + struct loop_cmd, list); + list_del_init(&cmd->list); + loop_handle_cmd(cmd); + } + + spin_lock_irq(&lo->lo_lock); + if (!list_empty(&lo->write_cmd_head)) + goto repeat; + lo->write_started = false; + spin_unlock_irq(&lo->lo_lock); +} + +static void loop_queue_read_work(struct work_struct *work) +{ + struct loop_cmd *cmd = + container_of(work, struct loop_cmd, read_work); + + loop_handle_cmd(cmd); +} + +static int loop_init_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int request_idx, + unsigned int numa_node) +{ + struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); + + cmd->rq = rq; + INIT_WORK(&cmd->read_work, loop_queue_read_work); + + return 0; +} + +static struct blk_mq_ops loop_mq_ops = { + .queue_rq = loop_queue_rq, + .map_queue = blk_mq_map_queue, + .init_request = loop_init_request, +}; + static int loop_add(struct loop_device **l, int i) { struct loop_device *lo; @@ -1627,16 +1605,28 @@ static int loop_add(struct loop_device **l, int i) i = err; err = -ENOMEM; - lo->lo_queue = blk_alloc_queue(GFP_KERNEL); - if (!lo->lo_queue) + lo->tag_set.ops = &loop_mq_ops; + lo->tag_set.nr_hw_queues = 1; + lo->tag_set.queue_depth = 128; + lo->tag_set.numa_node = NUMA_NO_NODE; + lo->tag_set.cmd_size = sizeof(struct loop_cmd); + lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + lo->tag_set.driver_data = lo; + + err = blk_mq_alloc_tag_set(&lo->tag_set); + if (err) goto out_free_idr; - /* - * set queue make_request_fn - */ - blk_queue_make_request(lo->lo_queue, loop_make_request); + lo->lo_queue = blk_mq_init_queue(&lo->tag_set); + if (IS_ERR_OR_NULL(lo->lo_queue)) { + err = PTR_ERR(lo->lo_queue); + goto out_cleanup_tags; + } lo->lo_queue->queuedata = lo; + INIT_LIST_HEAD(&lo->write_cmd_head); + INIT_WORK(&lo->write_work, loop_queue_write_work); + disk = lo->lo_disk = alloc_disk(1 << part_shift); if (!disk) goto out_free_queue; @@ -1664,9 +1654,6 @@ static int loop_add(struct loop_device **l, int i) disk->flags |= GENHD_FL_EXT_DEVT; mutex_init(&lo->lo_ctl_mutex); lo->lo_number = i; - lo->lo_thread = NULL; - init_waitqueue_head(&lo->lo_event); - init_waitqueue_head(&lo->lo_req_wait); spin_lock_init(&lo->lo_lock); disk->major = LOOP_MAJOR; disk->first_minor = i << part_shift; @@ -1680,6 +1667,8 @@ static int loop_add(struct loop_device **l, int i) out_free_queue: blk_cleanup_queue(lo->lo_queue); +out_cleanup_tags: + blk_mq_free_tag_set(&lo->tag_set); out_free_idr: idr_remove(&loop_index_idr, i); out_free_dev: @@ -1692,6 +1681,7 @@ static void loop_remove(struct loop_device *lo) { del_gendisk(lo->lo_disk); blk_cleanup_queue(lo->lo_queue); + blk_mq_free_tag_set(&lo->tag_set); put_disk(lo->lo_disk); kfree(lo); } @@ -1875,6 +1865,13 @@ static int __init loop_init(void) goto misc_out; } + loop_wq = alloc_workqueue("kloopd", + WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 0); + if (!loop_wq) { + err = -ENOMEM; + goto misc_out; + } + blk_register_region(MKDEV(LOOP_MAJOR, 0), range, THIS_MODULE, loop_probe, NULL, NULL); @@ -1912,6 +1909,8 @@ static void __exit loop_exit(void) blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range); unregister_blkdev(LOOP_MAJOR, "loop"); + destroy_workqueue(loop_wq); + misc_deregister(&loop_misc); } diff --git a/drivers/block/loop.h b/drivers/block/loop.h index 90df5d6485b6..e20cdbbff7d5 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -13,6 +13,7 @@ #include #include #include +#include #include /* Possible 
states of device */ @@ -52,19 +53,23 @@ struct loop_device { gfp_t old_gfp_mask; spinlock_t lo_lock; - struct bio_list lo_bio_list; - unsigned int lo_bio_count; + struct list_head write_cmd_head; + struct work_struct write_work; + bool write_started; int lo_state; struct mutex lo_ctl_mutex; - struct task_struct *lo_thread; - wait_queue_head_t lo_event; - /* wait queue for incoming requests */ - wait_queue_head_t lo_req_wait; struct request_queue *lo_queue; + struct blk_mq_tag_set tag_set; struct gendisk *lo_disk; }; +struct loop_cmd { + struct work_struct read_work; + struct request *rq; + struct list_head list; +}; + /* Support for loadable transfer modules */ struct loop_func_table { int number; /* filter type */ -- cgit v1.2.3-59-g8ed1b From 301120134628c49346e5f23f7c00e7377013d5e0 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 31 Dec 2014 13:22:58 +0000 Subject: block: loop: say goodbye to bio Switch completely to block requests. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/loop.c | 45 ++++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index c678eb24a7b5..d9589017cc1e 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -287,12 +287,12 @@ static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec, return ret; } -static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos) +static int lo_send(struct loop_device *lo, struct request *rq, loff_t pos) { int (*do_lo_send)(struct loop_device *, struct bio_vec *, loff_t, struct page *page); struct bio_vec bvec; - struct bvec_iter iter; + struct req_iterator iter; struct page *page = NULL; int ret = 0; @@ -306,7 +306,7 @@ static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos) do_lo_send = do_lo_send_direct_write; } - bio_for_each_segment(bvec, bio, iter) { + rq_for_each_segment(bvec, rq, iter) { ret = do_lo_send(lo, &bvec, pos, page); if (ret < 0) break; @@ -394,19 +394,22 @@ do_lo_receive(struct loop_device *lo, } static int -lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos) +lo_receive(struct loop_device *lo, struct request *rq, int bsize, loff_t pos) { struct bio_vec bvec; - struct bvec_iter iter; + struct req_iterator iter; ssize_t s; - bio_for_each_segment(bvec, bio, iter) { + rq_for_each_segment(bvec, rq, iter) { s = do_lo_receive(lo, &bvec, bsize, pos); if (s < 0) return s; if (s != bvec.bv_len) { - zero_fill_bio(bio); + struct bio *bio; + + __rq_for_each_bio(bio, rq) + zero_fill_bio(bio); break; } pos += bvec.bv_len; @@ -414,17 +417,17 @@ lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos) return 0; } -static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) +static int do_req_filebacked(struct loop_device *lo, struct request *rq) { loff_t pos; int ret; - pos = ((loff_t) bio->bi_iter.bi_sector << 9) + lo->lo_offset; + pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; - if (bio_rw(bio) == WRITE) { + if (rq->cmd_flags & REQ_WRITE) { struct file *file = lo->lo_backing_file; - if (bio->bi_rw & REQ_FLUSH) { + if (rq->cmd_flags & REQ_FLUSH) { ret = vfs_fsync(file, 0); if (unlikely(ret && ret != -EINVAL)) { ret = -EIO; @@ -438,7 +441,7 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) * encryption is enabled, because it may give an attacker * useful information. 
*/ - if (bio->bi_rw & REQ_DISCARD) { + if (rq->cmd_flags & REQ_DISCARD) { struct file *file = lo->lo_backing_file; int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; @@ -448,22 +451,22 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) goto out; } ret = file->f_op->fallocate(file, mode, pos, - bio->bi_iter.bi_size); + blk_rq_bytes(rq)); if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP)) ret = -EIO; goto out; } - ret = lo_send(lo, bio, pos); + ret = lo_send(lo, rq, pos); - if ((bio->bi_rw & REQ_FUA) && !ret) { + if ((rq->cmd_flags & REQ_FUA) && !ret) { ret = vfs_fsync(file, 0); if (unlikely(ret && ret != -EINVAL)) ret = -EIO; } } else - ret = lo_receive(lo, bio, lo->lo_blocksize, pos); + ret = lo_receive(lo, rq, lo->lo_blocksize, pos); out: return ret; @@ -474,11 +477,6 @@ struct switch_request { struct completion wait; }; -static inline int loop_handle_bio(struct loop_device *lo, struct bio *bio) -{ - return do_bio_filebacked(lo, bio); -} - /* * Do the actual switch; called from the BIO completion routine */ @@ -1510,7 +1508,6 @@ static void loop_handle_cmd(struct loop_cmd *cmd) const bool write = cmd->rq->cmd_flags & REQ_WRITE; struct loop_device *lo = cmd->rq->q->queuedata; int ret = -EIO; - struct bio *bio; if (lo->lo_state != Lo_bound) goto failed; @@ -1518,9 +1515,7 @@ static void loop_handle_cmd(struct loop_cmd *cmd) if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) goto failed; - ret = 0; - __rq_for_each_bio(bio, cmd->rq) - ret |= loop_handle_bio(lo, bio); + ret = do_req_filebacked(lo, cmd->rq); failed: if (ret) -- cgit v1.2.3-59-g8ed1b From cf655d953422c846558a320ac9b3c8e659b68275 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 31 Dec 2014 13:22:59 +0000 Subject: block: loop: introduce lo_discard() and lo_req_flush() No behaviour change, just move the handling for REQ_DISCARD and REQ_FLUSH in these two functions. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/loop.c | 73 ++++++++++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index d9589017cc1e..018af27256c5 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -417,6 +417,40 @@ lo_receive(struct loop_device *lo, struct request *rq, int bsize, loff_t pos) return 0; } +static int lo_discard(struct loop_device *lo, struct request *rq, loff_t pos) +{ + /* + * We use punch hole to reclaim the free space used by the + * image a.k.a. discard. However we do not support discard if + * encryption is enabled, because it may give an attacker + * useful information. 
+ */ + struct file *file = lo->lo_backing_file; + int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; + int ret; + + if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) { + ret = -EOPNOTSUPP; + goto out; + } + + ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq)); + if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP)) + ret = -EIO; + out: + return ret; +} + +static int lo_req_flush(struct loop_device *lo, struct request *rq) +{ + struct file *file = lo->lo_backing_file; + int ret = vfs_fsync(file, 0); + if (unlikely(ret && ret != -EINVAL)) + ret = -EIO; + + return ret; +} + static int do_req_filebacked(struct loop_device *lo, struct request *rq) { loff_t pos; @@ -425,46 +459,19 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq) pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; if (rq->cmd_flags & REQ_WRITE) { - struct file *file = lo->lo_backing_file; - - if (rq->cmd_flags & REQ_FLUSH) { - ret = vfs_fsync(file, 0); - if (unlikely(ret && ret != -EINVAL)) { - ret = -EIO; - goto out; - } - } - /* - * We use punch hole to reclaim the free space used by the - * image a.k.a. discard. However we do not support discard if - * encryption is enabled, because it may give an attacker - * useful information. - */ + if (rq->cmd_flags & REQ_FLUSH) + ret = lo_req_flush(lo, rq); + if (rq->cmd_flags & REQ_DISCARD) { - struct file *file = lo->lo_backing_file; - int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; - - if ((!file->f_op->fallocate) || - lo->lo_encrypt_key_size) { - ret = -EOPNOTSUPP; - goto out; - } - ret = file->f_op->fallocate(file, mode, pos, - blk_rq_bytes(rq)); - if (unlikely(ret && ret != -EINVAL && - ret != -EOPNOTSUPP)) - ret = -EIO; + ret = lo_discard(lo, rq, pos); goto out; } ret = lo_send(lo, rq, pos); - if ((rq->cmd_flags & REQ_FUA) && !ret) { - ret = vfs_fsync(file, 0); - if (unlikely(ret && ret != -EINVAL)) - ret = -EIO; - } + if ((rq->cmd_flags & REQ_FUA) && !ret) + ret = lo_req_flush(lo, rq); } else ret = lo_receive(lo, rq, lo->lo_blocksize, pos); -- cgit v1.2.3-59-g8ed1b From af65aa8ea78b296857f257bdc52338d03101813b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 31 Dec 2014 13:23:00 +0000 Subject: block: loop: don't handle REQ_FUA explicitly The block core handles REQ_FUA via its flush state machine, so loop doesn't need to handle it explicitly.
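For context, a minimal sketch of why this is safe (an assumption drawn from the loop setup path shown earlier in this series, not a new change):

    /* loop_set_fd() already declares FLUSH-only capability; because
     * REQ_FUA is not advertised, the blk-flush state machine
     * decomposes a FUA write into WRITE followed by FLUSH before the
     * request ever reaches loop. */
    blk_queue_flush(lo->lo_queue, REQ_FLUSH);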
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/loop.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 018af27256c5..185a86d73c3f 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -459,23 +459,15 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq) pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; if (rq->cmd_flags & REQ_WRITE) { - if (rq->cmd_flags & REQ_FLUSH) ret = lo_req_flush(lo, rq); - - if (rq->cmd_flags & REQ_DISCARD) { + else if (rq->cmd_flags & REQ_DISCARD) ret = lo_discard(lo, rq, pos); - goto out; - } - - ret = lo_send(lo, rq, pos); - - if ((rq->cmd_flags & REQ_FUA) && !ret) - ret = lo_req_flush(lo, rq); + else + ret = lo_send(lo, rq, pos); } else ret = lo_receive(lo, rq, lo->lo_blocksize, pos); -out: return ret; } -- cgit v1.2.3-59-g8ed1b From 78e367a3601f35ea811e7f5660b7362afa2401fa Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 2 Jan 2015 15:20:25 -0700 Subject: loop: add blk-mq.h include Looks like we pull it in through other ways on x86, but we fail on sparc: In file included from drivers/block/cryptoloop.c:30:0: drivers/block/loop.h:63:24: error: field 'tag_set' has incomplete type struct blk_mq_tag_set tag_set; Add the include to loop.h, kill it from loop.c. Signed-off-by: Jens Axboe --- drivers/block/loop.c | 1 - drivers/block/loop.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 185a86d73c3f..d1f168b73634 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -75,7 +75,6 @@ #include #include #include -#include #include "loop.h" #include diff --git a/drivers/block/loop.h b/drivers/block/loop.h index e20cdbbff7d5..301c27f8323f 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -11,6 +11,7 @@ #include #include +#include #include #include #include -- cgit v1.2.3-59-g8ed1b From 91117a20245b59f70b563523edbf998a62fc6383 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 7 Jan 2015 18:04:18 +0200 Subject: axonram: Fix bug in direct_access The 'pfn' returned by axonram was completely bogus, and has been since 2008. Signed-off-by: Matthew Wilcox Reviewed-by: Jan Kara Reviewed-by: Mathieu Desnoyers Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- arch/powerpc/sysdev/axonram.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index f532c92bf99d..367533bb3d48 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -156,7 +156,7 @@ axon_ram_direct_access(struct block_device *device, sector_t sector, } *kaddr = (void *)(bank->ph_addr + offset); - *pfn = virt_to_phys(kaddr) >> PAGE_SHIFT; + *pfn = virt_to_phys(*kaddr) >> PAGE_SHIFT; return 0; } -- cgit v1.2.3-59-g8ed1b From 937af5ecd0591e84ee54180fa97dcbe9bbe5fed6 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 7 Jan 2015 18:07:56 +0200 Subject: brd: Fix all partitions BUGs This patch fixes up brd's partition scheme, now enjoying the best of all worlds. The MAIN fix here is that currently, if one fdisks some partitions, a BAD bug will make all partitions point to the same start/end sectors, i.e. 0 - brd_size, and an mkfs of any partition would trash the partition table and the other partitions. Another fix is that "mount -U uuid" will not work if show_part was not specified, because of the GENHD_FL_SUPPRESS_PARTITION_INFO flag. 
We now always load without it and remove the show_part parameter. [We remove Dmitry's new module-param part_show; it is now always shown] So NOW the logic goes like this: * max_part - Just says how many minors to reserve between ramX devices. Either way, there can be as many partitions as requested. If the minors between devices run out, dynamic 259-major ids will be allocated on the fly. The default is now max_part=1, which means all partition devt(s) will be from the dynamic (259) major range. (If persistent partition minors are needed, use max_part=X.) For example, with /dev/sdX max_part is hard coded to 16. * Creation of new devices on the fly still/always works: mknod /path/devnod b 1 X fdisk -l /path/devnod This will create a new device if [X / max_part] was not already created before (just as before). Partitions on the dynamically created device will work as well; the same minor logic applies as with the pre-created ones. TODO: dynamic growth of device size, so each device can have its own size. CC: Dmitry Monakhov Tested-by: Ross Zwisler Signed-off-by: Boaz Harrosh Signed-off-by: Jens Axboe --- drivers/block/brd.c | 100 ++++++++++++++++++++-------------------------------- 1 file changed, 38 insertions(+), 62 deletions(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 89e90ec52f28..a7463c9595e7 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -438,19 +438,18 @@ static const struct block_device_operations brd_fops = { /* * And now the modules code and kernel interface. */ -static int rd_nr; -int rd_size = CONFIG_BLK_DEV_RAM_SIZE; -static int max_part; -static int part_shift; -static int part_show = 0; +static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT; module_param(rd_nr, int, S_IRUGO); MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices"); + +int rd_size = CONFIG_BLK_DEV_RAM_SIZE; module_param(rd_size, int, S_IRUGO); MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes."); + +static int max_part = 1; module_param(max_part, int, S_IRUGO); -MODULE_PARM_DESC(max_part, "Maximum number of partitions per RAM disk"); -module_param(part_show, int, S_IRUGO); -MODULE_PARM_DESC(part_show, "Control RAM disk visibility in /proc/partitions"); +MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices"); + MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR); MODULE_ALIAS("rd"); @@ -496,16 +495,15 @@ static struct brd_device *brd_alloc(int i) brd->brd_queue->limits.discard_zeroes_data = 1; queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue); - disk = brd->brd_disk = alloc_disk(1 << part_shift); + disk = brd->brd_disk = alloc_disk(max_part); if (!disk) goto out_free_queue; disk->major = RAMDISK_MAJOR; - disk->first_minor = i << part_shift; + disk->first_minor = i * max_part; disk->fops = &brd_fops; disk->private_data = brd; disk->queue = brd->brd_queue; - if (!part_show) - disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; + disk->flags = GENHD_FL_EXT_DEVT; sprintf(disk->disk_name, "ram%d", i); set_capacity(disk, rd_size * 2); @@ -527,10 +525,11 @@ static void brd_free(struct brd_device *brd) kfree(brd); } -static struct brd_device *brd_init_one(int i) +static struct brd_device *brd_init_one(int i, bool *new) { struct brd_device *brd; + *new = false; list_for_each_entry(brd, &brd_devices, brd_list) { if (brd->brd_number == i) goto out; @@ -541,6 +540,7 @@ static struct brd_device *brd_init_one(int i) add_disk(brd->brd_disk); list_add_tail(&brd->brd_list, &brd_devices); } + *new = true; out: return brd; } @@ -556,70 +556,46 @@ static struct kobject 
*brd_probe(dev_t dev, int *part, void *data) { struct brd_device *brd; struct kobject *kobj; + bool new; mutex_lock(&brd_devices_mutex); - brd = brd_init_one(MINOR(dev) >> part_shift); + brd = brd_init_one(MINOR(dev) / max_part, &new); kobj = brd ? get_disk(brd->brd_disk) : NULL; mutex_unlock(&brd_devices_mutex); - *part = 0; + if (new) + *part = 0; + return kobj; } static int __init brd_init(void) { - int i, nr; - unsigned long range; struct brd_device *brd, *next; + int i; /* * brd module now has a feature to instantiate underlying device * structure on-demand, provided that there is an access dev node. - * However, this will not work well with user space tool that doesn't - * know about such "feature". In order to not break any existing - * tool, we do the following: * - * (1) if rd_nr is specified, create that many upfront, and this - * also becomes a hard limit. - * (2) if rd_nr is not specified, create CONFIG_BLK_DEV_RAM_COUNT - * (default 16) rd device on module load, user can further - * extend brd device by create dev node themselves and have - * kernel automatically instantiate actual device on-demand. + * (1) if rd_nr is specified, create that many upfront. else + * it defaults to CONFIG_BLK_DEV_RAM_COUNT + * (2) User can further extend brd devices by create dev node themselves + * and have kernel automatically instantiate actual device + * on-demand. Example: + * mknod /path/devnod_name b 1 X # 1 is the rd major + * fdisk -l /path/devnod_name + * If (X / max_part) was not already created it will be created + * dynamically. */ - part_shift = 0; - if (max_part > 0) { - part_shift = fls(max_part); - - /* - * Adjust max_part according to part_shift as it is exported - * to user space so that user can decide correct minor number - * if [s]he want to create more devices. - * - * Note that -1 is required because partition 0 is reserved - * for the whole disk. - */ - max_part = (1UL << part_shift) - 1; - } - - if ((1UL << part_shift) > DISK_MAX_PARTS) - return -EINVAL; - - if (rd_nr > 1UL << (MINORBITS - part_shift)) - return -EINVAL; - - if (rd_nr) { - nr = rd_nr; - range = rd_nr << part_shift; - } else { - nr = CONFIG_BLK_DEV_RAM_COUNT; - range = 1UL << MINORBITS; - } - if (register_blkdev(RAMDISK_MAJOR, "ramdisk")) return -EIO; - for (i = 0; i < nr; i++) { + if (unlikely(!max_part)) + max_part = 1; + + for (i = 0; i < rd_nr; i++) { brd = brd_alloc(i); if (!brd) goto out_free; @@ -631,10 +607,10 @@ static int __init brd_init(void) list_for_each_entry(brd, &brd_devices, brd_list) add_disk(brd->brd_disk); - blk_register_region(MKDEV(RAMDISK_MAJOR, 0), range, + blk_register_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS, THIS_MODULE, brd_probe, NULL, NULL); - printk(KERN_INFO "brd: module loaded\n"); + pr_info("brd: module loaded\n"); return 0; out_free: @@ -644,21 +620,21 @@ out_free: } unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); + pr_info("brd: module NOT loaded !!!\n"); return -ENOMEM; } static void __exit brd_exit(void) { - unsigned long range; struct brd_device *brd, *next; - range = rd_nr ? 
rd_nr << part_shift : 1UL << MINORBITS; - list_for_each_entry_safe(brd, next, &brd_devices, brd_list) brd_del_one(brd); - blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), range); + blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS); unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); + + pr_info("brd: module unloaded\n"); } module_init(brd_init); -- cgit v1.2.3-59-g8ed1b From c8fa31730fc72a83bad762112a872675b3e76886 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 7 Jan 2015 18:09:38 +0200 Subject: brd: Request from fdisk 4k alignment Because the direct_access() API returns a PFN, partitions had better start on a 4K boundary, else offset ZERO of a partition will not be aligned and blk_direct_access() will fail the call. By setting blk_queue_physical_block_size(PAGE_SIZE) we can communicate this to fdisk and friends. The call to blk_queue_physical_block_size() is harmless and will not affect the Kernel behavior in any way. It is only for communication to user-mode. Before this patch, running fdisk on a default-size brd of 4M, the first sector offered is 34 (BAD); after this patch it will be 40, i.e. 8-sector aligned. Also, when entering some random partition sizes, the next partition-start sector offered is 8-sector aligned after this patch. (Please note that with fdisk the user can still enter bad values; only the offered default values will be correct.) Note that with bdev-size > 4M, fdisk will try to align on a 1M boundary (the above first sector will be 2048) in any case. CC: Martin K. Petersen Signed-off-by: Boaz Harrosh Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- drivers/block/brd.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index a7463c9595e7..c01b921b1b4a 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -486,10 +486,19 @@ static struct brd_device *brd_alloc(int i) brd->brd_queue = blk_alloc_queue(GFP_KERNEL); if (!brd->brd_queue) goto out_free_dev; + blk_queue_make_request(brd->brd_queue, brd_make_request); blk_queue_max_hw_sectors(brd->brd_queue, 1024); blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY); + /* This is so fdisk will align partitions on 4k, because of + * direct_access API needing 4k alignment, returning a PFN + * (This is only a problem on very small devices <= 4M, + * otherwise fdisk will align on 1M. Regardless this call + * is harmless) + */ + blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE); + brd->brd_queue->limits.discard_granularity = PAGE_SIZE; brd->brd_queue->limits.max_discard_sectors = UINT_MAX; brd->brd_queue->limits.discard_zeroes_data = 1; -- cgit v1.2.3-59-g8ed1b From 227290b4690510148d1f40f277c6c3a181fcb440 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 Jan 2015 16:02:24 -0700 Subject: null_blk: suppress invalid partition info null_blk is partitionable, but it doesn't store any of the partition info. When it is loaded, you would normally see: [1226739.343608] nullb0: unknown partition table [1226739.343746] nullb1: unknown partition table which can confuse some people. Add the appropriate gendisk flag to suppress this info. 
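As a hypothetical sanity check (an illustrative session in the spirit of the mknod/fdisk examples above, not part of the patch), loading the module after this change should no longer print those per-disk lines:

    modprobe null_blk nr_devices=2
    dmesg | grep nullb    # no "nullbN: unknown partition table" lines expected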
Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index aa2224aa7caa..65cd61a4145e 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -579,7 +579,7 @@ static int null_add_dev(void) sector_div(size, bs); set_capacity(disk, size); - disk->flags |= GENHD_FL_EXT_DEVT; + disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; disk->major = null_major; disk->first_minor = nullb->index; disk->fops = &null_fops; -- cgit v1.2.3-59-g8ed1b From 121c7ad4ef3f3a9e697c4b8b78d74e4d6847d9e4 Mon Sep 17 00:00:00 2001 From: kaoudis Date: Wed, 14 Jan 2015 21:01:58 -0700 Subject: NVMe: within nvme_free_queues(), delete RCU synchro/deferred free Converting to blk-mq got rid of the driver's RCU locking-on-queue, so remove the now-unnecessary RCU locking-on-queue artefacts. Reviewed-by: Keith Busch Signed-off-by: Kelly Nicole Kaoudis Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index f7d083bb3bd5..f4aa64160838 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1108,21 +1108,14 @@ static void nvme_free_queue(struct nvme_queue *nvmeq) static void nvme_free_queues(struct nvme_dev *dev, int lowest) { - LLIST_HEAD(q_list); - struct nvme_queue *nvmeq, *next; - struct llist_node *entry; int i; for (i = dev->queue_count - 1; i >= lowest; i--) { struct nvme_queue *nvmeq = dev->queues[i]; - llist_add(&nvmeq->node, &q_list); dev->queue_count--; dev->queues[i] = NULL; - } - synchronize_rcu(); - entry = llist_del_all(&q_list); - llist_for_each_entry_safe(nvmeq, next, entry, node) nvme_free_queue(nvmeq); + } } /** -- cgit v1.2.3-59-g8ed1b From 12cb5ce101abfaf74421f8cc9f196e708209eb79 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 15 Jan 2015 17:32:27 -0800 Subject: libata: use blk tagging libata uses its own tag management, which is duplicated effort, and the implementation is poor. And if we switch to blk-mq, tagging is built in. It's time to switch to generic tagging. The SAS driver has its own tag management, and it looks like we can't directly map the host controller tag to a SATA tag, so I just bypassed the SAS case. I changed the code/variable names for libata's tag management to make it self-contained. Only SAS will use it. Later, if libsas implements its own tag management, the tag management code in libata can be deleted easily. 
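A minimal sketch of the resulting tag flow, distilled from the diff below (no code beyond what the patch itself adds):

    /* SATA via SCSI: the block layer owns the tag, so the qc is
     * looked up directly from the request's tag. */
    qc = ata_qc_new_init(dev, cmd->request->tag);

    /* SAS (no scsi_host-backed queue): fall back to libata's own
     * bitmap allocator, ap->sas_tag_allocated. */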
Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Shaohua Li Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- drivers/ata/libata-core.c | 54 +++++++++++++++++++++++++++++++++++------------ drivers/ata/libata-scsi.c | 4 +++- drivers/ata/libata.h | 2 +- include/linux/libata.h | 5 +++-- 4 files changed, 48 insertions(+), 17 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 5c84fb5c3372..695d33df3df5 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -1525,6 +1525,15 @@ static void ata_qc_complete_internal(struct ata_queued_cmd *qc) complete(waiting); } +static bool ata_valid_internal_tag(struct ata_port *ap, struct ata_device *dev, + unsigned int tag) +{ + if (!ap->scsi_host) + return !test_and_set_bit(tag, &ap->sas_tag_allocated); + return !dev->sdev || + !blk_queue_find_tag(dev->sdev->request_queue, tag); +} + /** * ata_exec_internal_sg - execute libata internal command * @dev: Device to which the command is sent @@ -1585,8 +1594,7 @@ unsigned ata_exec_internal_sg(struct ata_device *dev, else tag = 0; - if (test_and_set_bit(tag, &ap->qc_allocated)) - BUG(); + BUG_ON(!ata_valid_internal_tag(ap, dev, tag)); qc = __ata_qc_from_tag(ap, tag); qc->tag = tag; @@ -4737,27 +4745,23 @@ void swap_buf_le16(u16 *buf, unsigned int buf_words) * None. */ -static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap) +static struct ata_queued_cmd *sas_ata_qc_new(struct ata_port *ap) { struct ata_queued_cmd *qc = NULL; unsigned int max_queue = ap->host->n_tags; unsigned int i, tag; - /* no command while frozen */ - if (unlikely(ap->pflags & ATA_PFLAG_FROZEN)) - return NULL; - - for (i = 0, tag = ap->last_tag + 1; i < max_queue; i++, tag++) { + for (i = 0, tag = ap->sas_last_tag + 1; i < max_queue; i++, tag++) { tag = tag < max_queue ? tag : 0; /* the last tag is reserved for internal command. */ if (tag == ATA_TAG_INTERNAL) continue; - if (!test_and_set_bit(tag, &ap->qc_allocated)) { + if (!test_and_set_bit(tag, &ap->sas_tag_allocated)) { qc = __ata_qc_from_tag(ap, tag); qc->tag = tag; - ap->last_tag = tag; + ap->sas_last_tag = tag; break; } } @@ -4765,6 +4769,24 @@ static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap) return qc; } +static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap, int blktag) +{ + struct ata_queued_cmd *qc; + + /* no command while frozen */ + if (unlikely(ap->pflags & ATA_PFLAG_FROZEN)) + return NULL; + + /* SATA will directly use block tag. libsas need its own tag management */ + if (ap->scsi_host) { + qc = __ata_qc_from_tag(ap, blktag); + qc->tag = blktag; + return qc; + } + + return sas_ata_qc_new(ap); +} + /** * ata_qc_new_init - Request an available ATA command, and initialize it * @dev: Device from whom we request an available command structure @@ -4773,12 +4795,12 @@ static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap) * None. 
*/ -struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev) +struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int blktag) { struct ata_port *ap = dev->link->ap; struct ata_queued_cmd *qc; - qc = ata_qc_new(ap); + qc = ata_qc_new(ap, blktag); if (qc) { qc->scsicmd = NULL; qc->ap = ap; @@ -4800,6 +4822,12 @@ struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev) * LOCKING: * spin_lock_irqsave(host lock) */ +static void sas_ata_qc_free(unsigned int tag, struct ata_port *ap) +{ + if (!ap->scsi_host) + clear_bit(tag, &ap->sas_tag_allocated); +} + void ata_qc_free(struct ata_queued_cmd *qc) { struct ata_port *ap; @@ -4812,7 +4840,7 @@ void ata_qc_free(struct ata_queued_cmd *qc) tag = qc->tag; if (likely(ata_tag_valid(tag))) { qc->tag = ATA_TAG_POISON; - clear_bit(tag, &ap->qc_allocated); + sas_ata_qc_free(tag, ap); } } diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index e364e86e84d7..94339c2aed1b 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -756,7 +756,7 @@ static struct ata_queued_cmd *ata_scsi_qc_new(struct ata_device *dev, { struct ata_queued_cmd *qc; - qc = ata_qc_new_init(dev); + qc = ata_qc_new_init(dev, cmd->request->tag); if (qc) { qc->scsicmd = cmd; qc->scsidone = cmd->scsi_done; @@ -3666,6 +3666,8 @@ int ata_scsi_add_hosts(struct ata_host *host, struct scsi_host_template *sht) */ shost->max_host_blocked = 1; + scsi_init_shared_tag_map(shost, host->n_tags); + rc = scsi_add_host_with_dma(ap->scsi_host, &ap->tdev, ap->host->dev); if (rc) diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h index 5f4e0cca56ec..40405135bbb6 100644 --- a/drivers/ata/libata.h +++ b/drivers/ata/libata.h @@ -63,7 +63,7 @@ extern struct ata_link *ata_dev_phys_link(struct ata_device *dev); extern void ata_force_cbl(struct ata_port *ap); extern u64 ata_tf_to_lba(const struct ata_taskfile *tf); extern u64 ata_tf_to_lba48(const struct ata_taskfile *tf); -extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev); +extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag); extern int ata_build_rw_tf(struct ata_taskfile *tf, struct ata_device *dev, u64 block, u32 n_block, unsigned int tf_flags, unsigned int tag); diff --git a/include/linux/libata.h b/include/linux/libata.h index 2d182413b1db..f23454762717 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -821,10 +821,10 @@ struct ata_port { unsigned int cbl; /* cable type; ATA_CBL_xxx */ struct ata_queued_cmd qcmd[ATA_MAX_QUEUE]; - unsigned long qc_allocated; + unsigned long sas_tag_allocated; /* for sas tag allocation only */ unsigned int qc_active; int nr_active_links; /* #links with active qcs */ - unsigned int last_tag; /* track next tag hw expects */ + unsigned int sas_last_tag; /* track next tag hw expects */ struct ata_link link; /* host default link */ struct ata_link *slave_link; /* see ata_slave_link_init() */ @@ -1344,6 +1344,7 @@ extern struct device_attribute *ata_common_sdev_attrs[]; .ioctl = ata_scsi_ioctl, \ .queuecommand = ata_scsi_queuecmd, \ .can_queue = ATA_DEF_QUEUE, \ + .tag_alloc_policy = BLK_TAG_ALLOC_RR, \ .this_id = ATA_SHT_THIS_ID, \ .cmd_per_lun = ATA_SHT_CMD_PER_LUN, \ .emulated = ATA_SHT_EMULATED, \ -- cgit v1.2.3-59-g8ed1b From 98bd4be1ba95f2fe7f543910792b7163a5de06eb Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 23 Jan 2015 19:52:07 -0800 Subject: libata: move sas ata tag allocation to libata-scsi.c Basically move the SAS ATA tag allocation to libata-scsi.c, to make it clear this stuff is just for 
sas. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/ata/libata-core.c | 93 +++++++++-------------------------------------- drivers/ata/libata-scsi.c | 28 +++++++++++++- drivers/ata/libata.h | 2 + 3 files changed, 46 insertions(+), 77 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 695d33df3df5..d626605ac8f7 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -1525,15 +1525,6 @@ static void ata_qc_complete_internal(struct ata_queued_cmd *qc) complete(waiting); } -static bool ata_valid_internal_tag(struct ata_port *ap, struct ata_device *dev, - unsigned int tag) -{ - if (!ap->scsi_host) - return !test_and_set_bit(tag, &ap->sas_tag_allocated); - return !dev->sdev || - !blk_queue_find_tag(dev->sdev->request_queue, tag); -} - /** * ata_exec_internal_sg - execute libata internal command * @dev: Device to which the command is sent @@ -1594,7 +1585,6 @@ unsigned ata_exec_internal_sg(struct ata_device *dev, else tag = 0; - BUG_ON(!ata_valid_internal_tag(ap, dev, tag)); qc = __ata_qc_from_tag(ap, tag); qc->tag = tag; @@ -4734,80 +4724,36 @@ void swap_buf_le16(u16 *buf, unsigned int buf_words) } /** - * ata_qc_new - Request an available ATA command, for queueing - * @ap: target port - * - * Some ATA host controllers may implement a queue depth which is less - * than ATA_MAX_QUEUE. So we shouldn't allocate a tag which is beyond - * the hardware limitation. + * ata_qc_new_init - Request an available ATA command, and initialize it + * @dev: Device from whom we request an available command structure * * LOCKING: * None. */ -static struct ata_queued_cmd *sas_ata_qc_new(struct ata_port *ap) -{ - struct ata_queued_cmd *qc = NULL; - unsigned int max_queue = ap->host->n_tags; - unsigned int i, tag; - - for (i = 0, tag = ap->sas_last_tag + 1; i < max_queue; i++, tag++) { - tag = tag < max_queue ? tag : 0; - - /* the last tag is reserved for internal command. */ - if (tag == ATA_TAG_INTERNAL) - continue; - - if (!test_and_set_bit(tag, &ap->sas_tag_allocated)) { - qc = __ata_qc_from_tag(ap, tag); - qc->tag = tag; - ap->sas_last_tag = tag; - break; - } - } - - return qc; -} - -static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap, int blktag) +struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag) { + struct ata_port *ap = dev->link->ap; struct ata_queued_cmd *qc; /* no command while frozen */ if (unlikely(ap->pflags & ATA_PFLAG_FROZEN)) return NULL; - /* SATA will directly use block tag. libsas need its own tag management */ - if (ap->scsi_host) { - qc = __ata_qc_from_tag(ap, blktag); - qc->tag = blktag; - return qc; + /* libsas case */ + if (!ap->scsi_host) { + tag = ata_sas_allocate_tag(ap); + if (tag < 0) + return NULL; } - return sas_ata_qc_new(ap); -} - -/** - * ata_qc_new_init - Request an available ATA command, and initialize it - * @dev: Device from whom we request an available command structure - * - * LOCKING: - * None. 
- */ - -struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int blktag) -{ - struct ata_port *ap = dev->link->ap; - struct ata_queued_cmd *qc; - - qc = ata_qc_new(ap, blktag); - if (qc) { - qc->scsicmd = NULL; - qc->ap = ap; - qc->dev = dev; + qc = __ata_qc_from_tag(ap, tag); + qc->tag = tag; + qc->scsicmd = NULL; + qc->ap = ap; + qc->dev = dev; - ata_qc_reinit(qc); - } + ata_qc_reinit(qc); return qc; } @@ -4822,12 +4768,6 @@ struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int blktag) * LOCKING: * spin_lock_irqsave(host lock) */ -static void sas_ata_qc_free(unsigned int tag, struct ata_port *ap) -{ - if (!ap->scsi_host) - clear_bit(tag, &ap->sas_tag_allocated); -} - void ata_qc_free(struct ata_queued_cmd *qc) { struct ata_port *ap; @@ -4840,7 +4780,8 @@ void ata_qc_free(struct ata_queued_cmd *qc) tag = qc->tag; if (likely(ata_tag_valid(tag))) { qc->tag = ATA_TAG_POISON; - sas_ata_qc_free(tag, ap); + if (!ap->scsi_host) + ata_sas_free_tag(tag, ap); } } diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 94339c2aed1b..59c9d721b347 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -3666,7 +3666,8 @@ int ata_scsi_add_hosts(struct ata_host *host, struct scsi_host_template *sht) */ shost->max_host_blocked = 1; - scsi_init_shared_tag_map(shost, host->n_tags); + if (scsi_init_shared_tag_map(shost, host->n_tags)) + goto err_add; rc = scsi_add_host_with_dma(ap->scsi_host, &ap->tdev, ap->host->dev); @@ -4230,3 +4231,28 @@ int ata_sas_queuecmd(struct scsi_cmnd *cmd, struct ata_port *ap) return rc; } EXPORT_SYMBOL_GPL(ata_sas_queuecmd); + +int ata_sas_allocate_tag(struct ata_port *ap) +{ + unsigned int max_queue = ap->host->n_tags; + unsigned int i, tag; + + for (i = 0, tag = ap->sas_last_tag + 1; i < max_queue; i++, tag++) { + tag = tag < max_queue ? tag : 0; + + /* the last tag is reserved for internal command. */ + if (tag == ATA_TAG_INTERNAL) + continue; + + if (!test_and_set_bit(tag, &ap->sas_tag_allocated)) { + ap->sas_last_tag = tag; + return tag; + } + } + return -1; +} + +void ata_sas_free_tag(unsigned int tag, struct ata_port *ap) +{ + clear_bit(tag, &ap->sas_tag_allocated); +} diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h index 40405135bbb6..8c491cd8805b 100644 --- a/drivers/ata/libata.h +++ b/drivers/ata/libata.h @@ -145,6 +145,8 @@ extern void ata_scsi_dev_rescan(struct work_struct *work); extern int ata_bus_probe(struct ata_port *ap); extern int ata_scsi_user_scan(struct Scsi_Host *shost, unsigned int channel, unsigned int id, u64 lun); +int ata_sas_allocate_tag(struct ata_port *ap); +void ata_sas_free_tag(unsigned int tag, struct ata_port *ap); /* libata-eh.c */ -- cgit v1.2.3-59-g8ed1b From 9269e23496ddd83b974536019cf420520df0ee7f Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 23 Jan 2015 20:17:59 -0800 Subject: libata: make sata_sil24 use fifo tag allocator libata uses block-layer tags now, so we can use BLK_TAG_ALLOC_FIFO to solve the sata_sil24 tag bug. 
https://bugzilla.kernel.org/show_bug.cgi?id=87101 Cc: Christoph Hellwig Signed-off-by: Shaohua Li Acked-by: Tejun Heo Acked-by: Dan Williams Signed-off-by: Jens Axboe --- drivers/ata/sata_sil24.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c index d81b20ddb527..52428971ba94 100644 --- a/drivers/ata/sata_sil24.c +++ b/drivers/ata/sata_sil24.c @@ -388,6 +388,7 @@ static struct scsi_host_template sil24_sht = { .can_queue = SIL24_MAX_CMDS, .sg_tablesize = SIL24_MAX_SGE, .dma_boundary = ATA_DMA_BOUNDARY, + .tag_alloc_policy = BLK_TAG_ALLOC_FIFO, }; static struct ata_port_operations sil24_ops = { -- cgit v1.2.3-59-g8ed1b From 4ca5829ac8b1297715bf609443ade2c332f3fd0c Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Wed, 28 Jan 2015 19:35:38 +0100 Subject: MAINTAINERS: Update NBD maintainer Paul stops maintaining NBD and I will take his place from now on. Signed-off-by: Markus Pargmann Signed-off-by: Jens Axboe --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index ddb9ac8d32b3..bb5e510e977e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6537,9 +6537,10 @@ F: include/uapi/linux/netrom.h F: net/netrom/ NETWORK BLOCK DEVICE (NBD) -M: Paul Clements +M: Markus Pargmann S: Maintained L: nbd-general@lists.sourceforge.net +T: git git://git.pengutronix.de/git/mpa/linux-nbd.git F: Documentation/blockdev/nbd.txt F: drivers/block/nbd.c F: include/linux/nbd.h -- cgit v1.2.3-59-g8ed1b From ac3dd5bd128b1d1ce2a037775766f39d06a4848a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 22 Jan 2015 12:07:58 -0700 Subject: NVMe: avoid kmalloc/kfree for smaller IO Currently we allocate an nvme_iod for each IO, which holds the sg list, prps, and other IO related info. Set a threshold of 2 pages and/or 8KB of data, below which we can just embed this in the per-command pdu in blk-mq. For any IO at or below NVME_INT_PAGES and NVME_INT_BYTES, we save a kmalloc and kfree. For higher IOPS, this saves up to 1% of CPU time. Signed-off-by: Jens Axboe Reviewed-by: Keith Busch --- drivers/block/nvme-core.c | 119 ++++++++++++++++++++++++++++++++++------------ include/linux/nvme.h | 3 +- 2 files changed, 89 insertions(+), 33 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index f4aa64160838..3eaa0becc52d 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -144,8 +144,37 @@ struct nvme_cmd_info { void *ctx; int aborted; struct nvme_queue *nvmeq; + struct nvme_iod iod[0]; }; +/* + * Max size of iod being embedded in the request payload + */ +#define NVME_INT_PAGES 2 +#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->page_size) + +/* + * Will slightly overestimate the number of pages needed. This is OK + * as it only leads to a small amount of wasted memory for the lifetime of + * the I/O. 
+ */
+static int nvme_npages(unsigned size, struct nvme_dev *dev)
+{
+	unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
+	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
+}
+
+static unsigned int nvme_cmd_size(struct nvme_dev *dev)
+{
+	unsigned int ret = sizeof(struct nvme_cmd_info);
+
+	ret += sizeof(struct nvme_iod);
+	ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev);
+	ret += sizeof(struct scatterlist) * NVME_INT_PAGES;
+
+	return ret;
+}
+
 static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 				unsigned int hctx_idx)
 {
@@ -217,6 +246,19 @@ static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
 	cmd->aborted = 0;
 }
 
+static void *iod_get_private(struct nvme_iod *iod)
+{
+	return (void *) (iod->private & ~0x1UL);
+}
+
+/*
+ * If bit 0 is set, the iod is embedded in the request payload.
+ */
+static bool iod_should_kfree(struct nvme_iod *iod)
+{
+	return (iod->private & 0x01) == 0;
+}
+
 /* Special values must be less than 0x1000 */
 #define CMD_CTX_BASE		((void *)POISON_POINTER_DELTA)
 #define CMD_CTX_CANCELLED	(0x30C + CMD_CTX_BASE)
@@ -360,35 +402,51 @@ static __le64 **iod_list(struct nvme_iod *iod)
 	return ((void *)iod) + iod->offset;
 }
 
-/*
- * Will slightly overestimate the number of pages needed. This is OK
- * as it only leads to a small amount of wasted memory for the lifetime of
- * the I/O.
- */
-static int nvme_npages(unsigned size, struct nvme_dev *dev)
+static inline void iod_init(struct nvme_iod *iod, unsigned nbytes,
+			    unsigned nseg, unsigned long private)
 {
-	unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
-	return DIV_ROUND_UP(8 * nprps, dev->page_size - 8);
+	iod->private = private;
+	iod->offset = offsetof(struct nvme_iod, sg[nseg]);
+	iod->npages = -1;
+	iod->length = nbytes;
+	iod->nents = 0;
 }
 
 static struct nvme_iod *
-nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp)
+__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev,
+		 unsigned long priv, gfp_t gfp)
 {
 	struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
-				sizeof(__le64 *) * nvme_npages(nbytes, dev) +
+				sizeof(__le64 *) * nvme_npages(bytes, dev) +
 				sizeof(struct scatterlist) * nseg, gfp);
 
-	if (iod) {
-		iod->offset = offsetof(struct nvme_iod, sg[nseg]);
-		iod->npages = -1;
-		iod->length = nbytes;
-		iod->nents = 0;
-		iod->first_dma = 0ULL;
-	}
+	if (iod)
+		iod_init(iod, bytes, nseg, priv);
 
 	return iod;
 }
 
+static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
+				       gfp_t gfp)
+{
+	unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) :
+				sizeof(struct nvme_dsm_range);
+	struct nvme_iod *iod;
+
+	if (rq->nr_phys_segments <= NVME_INT_PAGES &&
+	    size <= NVME_INT_BYTES(dev)) {
+		struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq);
+
+		iod = cmd->iod;
+		iod_init(iod, size, rq->nr_phys_segments,
+			 (unsigned long) rq | 0x01);
+		return iod;
+	}
+
+	return __nvme_alloc_iod(rq->nr_phys_segments, size, dev,
+				(unsigned long) rq, gfp);
+}
+
 void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
 {
 	const int last_prp = dev->page_size / 8 - 1;
@@ -404,7 +462,9 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
 		dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
 		prp_dma = next_prp_dma;
 	}
-	kfree(iod);
+
+	if (iod_should_kfree(iod))
+		kfree(iod);
 }
 
 static int nvme_error_status(u16 status)
@@ -423,7 +483,7 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 						struct nvme_completion *cqe)
 {
 	struct nvme_iod *iod = ctx;
-	struct request *req = iod->private;
+	struct request *req = iod_get_private(iod);
 	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
 
 	u16 status = le16_to_cpup(&cqe->status) >> 1;
@@ -579,7 +639,7 @@ static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
 static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
 							struct nvme_ns *ns)
 {
-	struct request *req = iod->private;
+	struct request *req = iod_get_private(iod);
 	struct nvme_command *cmnd;
 	u16 control = 0;
 	u32 dsmgmt = 0;
@@ -620,17 +680,12 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct request *req = bd->rq;
 	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
 	struct nvme_iod *iod;
-	int psegs = req->nr_phys_segments;
 	enum dma_data_direction dma_dir;
-	unsigned size = !(req->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(req) :
-						sizeof(struct nvme_dsm_range);
 
-	iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC);
+	iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC);
 	if (!iod)
 		return BLK_MQ_RQ_QUEUE_BUSY;
 
-	iod->private = req;
-
 	if (req->cmd_flags & REQ_DISCARD) {
 		void *range;
 		/*
@@ -645,10 +700,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 			goto retry_cmd;
 		iod_list(iod)[0] = (__le64 *)range;
 		iod->npages = 0;
-	} else if (psegs) {
+	} else if (req->nr_phys_segments) {
 		dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
 
-		sg_init_table(iod->sg, psegs);
+		sg_init_table(iod->sg, req->nr_phys_segments);
 		iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
 		if (!iod->nents)
 			goto error_cmd;
@@ -1362,7 +1417,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
 		dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
 		dev->admin_tagset.timeout = ADMIN_TIMEOUT;
 		dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
-		dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info);
+		dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
 		dev->admin_tagset.driver_data = dev;
 
 		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
@@ -1483,7 +1538,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
 	}
 
 	err = -ENOMEM;
-	iod = nvme_alloc_iod(count, length, dev, GFP_KERNEL);
+	iod = __nvme_alloc_iod(count, length, dev, 0, GFP_KERNEL);
 	if (!iod)
 		goto put_pages;
 
@@ -2109,7 +2164,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
 	dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
 	dev->tagset.queue_depth =
 				min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
-	dev->tagset.cmd_size = sizeof(struct nvme_cmd_info);
+	dev->tagset.cmd_size = nvme_cmd_size(dev);
 	dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
 	dev->tagset.driver_data = dev;
 
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 258945fcabf1..19a5d4b23209 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -132,13 +132,12 @@ struct nvme_ns {
  * allocated to store the PRP list.
  */
 struct nvme_iod {
-	void *private;		/* For the use of the submitter of the I/O */
+	unsigned long private;	/* For the use of the submitter of the I/O */
 	int npages;		/* In the PRP list. 0 means small pool in use */
 	int offset;		/* Of PRP list */
 	int nents;		/* Used in scatterlist */
 	int length;		/* Of data, in bytes */
 	dma_addr_t first_dma;
-	struct list_head node;
 	struct scatterlist sg[0];
 };
-- 
cgit v1.2.3-59-g8ed1b

From b7f120b211510b80cb72c1d790d9a4531271edfa Mon Sep 17 00:00:00 2001
From: Takashi Iwai
Date: Mon, 2 Feb 2015 17:08:45 +0100
Subject: floppy: Avoid manual call of device_create_file()

Use the static attribute groups assigned to the device instead of
calling device_create_file() after device registration.

Signed-off-by: Takashi Iwai
Signed-off-by: Jiri Kosina
---
 drivers/block/floppy.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 56d46ffb08e1..a08cda955285 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -4112,6 +4112,13 @@ static ssize_t floppy_cmos_show(struct device *dev,
 
 static DEVICE_ATTR(cmos, S_IRUGO, floppy_cmos_show, NULL);
 
+static struct attribute *floppy_dev_attrs[] = {
+	&dev_attr_cmos.attr,
+	NULL
+};
+
+ATTRIBUTE_GROUPS(floppy_dev);
+
 static void floppy_device_release(struct device *dev)
 {
 }
@@ -4324,16 +4331,12 @@ static int __init do_floppy_init(void)
 		floppy_device[drive].name = floppy_device_name;
 		floppy_device[drive].id = drive;
 		floppy_device[drive].dev.release = floppy_device_release;
+		floppy_device[drive].dev.groups = floppy_dev_groups;
 
 		err = platform_device_register(&floppy_device[drive]);
 		if (err)
 			goto out_remove_drives;
 
-		err = device_create_file(&floppy_device[drive].dev,
-					 &dev_attr_cmos);
-		if (err)
-			goto out_unreg_platform_dev;
-
 		/* to be cleaned up... */
 		disks[drive]->private_data = (void *)(long)drive;
 		disks[drive]->flags |= GENHD_FL_REMOVABLE;
@@ -4343,13 +4346,10 @@ static int __init do_floppy_init(void)
 
 	return 0;
 
-out_unreg_platform_dev:
-	platform_device_unregister(&floppy_device[drive]);
 out_remove_drives:
 	while (drive--) {
 		if (floppy_available(drive)) {
 			del_gendisk(disks[drive]);
-			device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos);
 			platform_device_unregister(&floppy_device[drive]);
 		}
 	}
@@ -4594,7 +4594,6 @@ static void __exit floppy_module_exit(void)
 
 		if (floppy_available(drive)) {
 			del_gendisk(disks[drive]);
-			device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos);
 			platform_device_unregister(&floppy_device[drive]);
 		}
 		blk_cleanup_queue(disks[drive]->queue);
-- 
cgit v1.2.3-59-g8ed1b

From 2c561246524c3319473bf47b558354f7ff47f0cf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 3 Feb 2015 12:55:31 +0100
Subject: block: Simplify bsg complete all

It took me a few tries to figure out what this code did; let's rewrite
it into a more regular form.

The thing that makes this one 'special' is the BSG_F_BLOCK flag: if it
is not set, we're not supposed (or allowed) to block and should
spin-wait for completion.

In the spinning case, the (new) io_wait_event() will never see a false
condition, and we will therefore not block.

Cc: Linus Torvalds
Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Jens Axboe
---
 block/bsg.c          | 72 ++++++++++++++++++----------------------------------
 include/linux/wait.h | 15 +++++++++++
 2 files changed, 40 insertions(+), 47 deletions(-)

diff --git a/block/bsg.c b/block/bsg.c
index 276e869e686c..d214e929ce18 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -136,42 +136,6 @@ static inline struct hlist_head *bsg_dev_idx_hash(int index)
 	return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)];
 }
 
-static int bsg_io_schedule(struct bsg_device *bd)
-{
-	DEFINE_WAIT(wait);
-	int ret = 0;
-
-	spin_lock_irq(&bd->lock);
-
-	BUG_ON(bd->done_cmds > bd->queued_cmds);
-
-	/*
-	 * -ENOSPC or -ENODATA? I'm going for -ENODATA, meaning "I have no
-	 * work to do", even though we return -ENOSPC after this same test
-	 * during bsg_write() -- there, it means our buffer can't have more
-	 * bsg_commands added to it, thus has no space left.
-	 */
-	if (bd->done_cmds == bd->queued_cmds) {
-		ret = -ENODATA;
-		goto unlock;
-	}
-
-	if (!test_bit(BSG_F_BLOCK, &bd->flags)) {
-		ret = -EAGAIN;
-		goto unlock;
-	}
-
-	prepare_to_wait(&bd->wq_done, &wait, TASK_UNINTERRUPTIBLE);
-	spin_unlock_irq(&bd->lock);
-	io_schedule();
-	finish_wait(&bd->wq_done, &wait);
-
-	return ret;
-unlock:
-	spin_unlock_irq(&bd->lock);
-	return ret;
-}
-
 static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
 				struct sg_io_v4 *hdr, struct bsg_device *bd,
 				fmode_t has_write_perm)
@@ -482,6 +446,30 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
 	return ret;
 }
 
+static bool bsg_complete(struct bsg_device *bd)
+{
+	bool ret = false;
+	bool spin;
+
+	do {
+		spin_lock_irq(&bd->lock);
+
+		BUG_ON(bd->done_cmds > bd->queued_cmds);
+
+		/*
+		 * All commands consumed.
+		 */
+		if (bd->done_cmds == bd->queued_cmds)
+			ret = true;
+
+		spin = !test_bit(BSG_F_BLOCK, &bd->flags);
+
+		spin_unlock_irq(&bd->lock);
+	} while (!ret && spin);
+
+	return ret;
+}
+
 static int bsg_complete_all_commands(struct bsg_device *bd)
 {
 	struct bsg_command *bc;
@@ -492,17 +480,7 @@ static int bsg_complete_all_commands(struct bsg_device *bd)
 	/*
 	 * wait for all commands to complete
 	 */
-	ret = 0;
-	do {
-		ret = bsg_io_schedule(bd);
-		/*
-		 * look for -ENODATA specifically -- we'll sometimes get
-		 * -ERESTARTSYS when we've taken a signal, but we can't
-		 * return until we're done freeing the queue, so ignore
-		 * it. The signal will get handled when we're done freeing
-		 * the bsg_device.
-		 */
-	} while (ret != -ENODATA);
+	io_wait_event(bd->wq_done, bsg_complete(bd));
 
 	/*
 	 * discard done commands
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 2232ed16635a..71fc1d31e48d 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -267,6 +267,21 @@ do {									\
 		__wait_event(wq, condition);				\
 } while (0)
 
+#define __io_wait_event(wq, condition)					\
+	(void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0,	\
+			    io_schedule())
+
+/*
+ * io_wait_event() -- like wait_event() but with io_schedule()
+ */
+#define io_wait_event(wq, condition)					\
+do {									\
+	might_sleep();							\
+	if (condition)							\
+		break;							\
+	__io_wait_event(wq, condition);					\
+} while (0)
+
 #define __wait_event_freezable(wq, condition)				\
 	___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0,		\
 		      schedule(); try_to_freeze())
-- 
cgit v1.2.3-59-g8ed1b

From bcadb699b05b828fb43c54abb75607db7e8c3e29 Mon Sep 17 00:00:00 2001
From: Roger Pau Monne
Date: Fri, 23 Jan 2015 17:14:29 +0100
Subject: xen-blkback,xen-blkfront: add myself as maintainer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I've done quite a lot of work in blkfront/blkback, and I usually end up
looking at the patches, so add myself as maintainer together with
Konrad.

Signed-off-by: Roger Pau Monné
Acked-by: Konrad Rzeszutek Wilk
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index bb5e510e977e..6660beb96e61 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10574,6 +10574,7 @@ F:	drivers/pci/*xen*
 
 XEN BLOCK SUBSYSTEM
 M:	Konrad Rzeszutek Wilk
+M:	Roger Pau Monné
 L:	xen-devel@lists.xenproject.org (moderated for non-subscribers)
 S:	Supported
 F:	drivers/block/xen-blkback/*
-- 
cgit v1.2.3-59-g8ed1b

From 3bb8c98e5612f069010ad04e5f463389e2eb6563 Mon Sep 17 00:00:00 2001
From: Roger Pau Monne
Date: Mon, 2 Feb 2015 11:28:21 +0000
Subject: xen-blkfront: fix accounting of reqs when migrating
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The current migration code uses blk_put_request() to finish a request
before requeuing it. That function doesn't update the queue statistics,
which completely breaks the accounting. Use blk_end_request_all()
instead, which updates the statistics properly.
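For illustration, a minimal sketch of the salvage-and-complete pattern at
issue, assuming the v3.19-era block API; the helper name and the bio_list
bookkeeping here are made up for the example, not part of the patch:

#include <linux/blkdev.h>
#include <linux/bio.h>

/*
 * Hypothetical helper: detach the bios from a request so they can be
 * resubmitted later, then complete the request through the block layer.
 * blk_end_request_all() finishes the request via the normal completion
 * path, so the queue's in-flight statistics are updated;
 * blk_put_request() would only drop the reference and leave the
 * accounting inconsistent.
 */
static void salvage_and_finish(struct request *req, struct bio_list *bios)
{
	struct bio_list merge;

	if (req->bio) {
		merge.head = req->bio;
		merge.tail = req->biotail;
		bio_list_merge(bios, &merge);
		req->bio = NULL;
	}

	/* complete with success; the detached bios live on in *bios */
	blk_end_request_all(req, 0);
}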
Signed-off-by: Roger Pau Monné
Reported-and-Tested-by: Ouyang Zhaowei (Charles)
Cc: Konrad Rzeszutek Wilk
Cc: Boris Ostrovsky
Cc: David Vrabel
Cc: xen-devel@lists.xenproject.org
---
 drivers/block/xen-blkfront.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 2236c6f31608..7f66d2e08f19 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1511,7 +1511,7 @@ static int blkif_recover(struct blkfront_info *info)
 		merge_bio.tail = copy[i].request->biotail;
 		bio_list_merge(&bio_list, &merge_bio);
 		copy[i].request->bio = NULL;
-		blk_put_request(copy[i].request);
+		blk_end_request_all(copy[i].request, 0);
 	}
 
 	kfree(copy);
@@ -1534,7 +1534,7 @@ static int blkif_recover(struct blkfront_info *info)
 		req->bio = NULL;
 		if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
 			pr_alert("diskcache flush request found!\n");
-		__blk_put_request(info->rq, req);
+		__blk_end_request_all(req, 0);
 	}
 
 	spin_unlock_irq(&info->io_lock);
-- 
cgit v1.2.3-59-g8ed1b

From b042a3ca949053231950a1b15f31cccca9e305f3 Mon Sep 17 00:00:00 2001
From: David Vrabel
Date: Thu, 5 Feb 2015 17:09:56 +0000
Subject: xen-blkback: default to X86_32 ABI on x86
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prior to the existence of 64-bit backends using the X86_64 ABI,
frontends used the X86_32 ABI. These old frontends do not specify the
ABI and, when used with a 64-bit backend, do not work.

On x86, default to the X86_32 ABI if one is not specified. Backends on
ARM continue to default to their NATIVE ABI.

Signed-off-by: David Vrabel
Acked-by: Roger Pau Monné
---
 drivers/block/xen-blkback/common.h | 9 +++++++++
 drivers/block/xen-blkback/xenbus.c | 4 ++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index f65b807e3236..78b0411e5ed5 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -214,6 +214,15 @@ enum blkif_protocol {
 	BLKIF_PROTOCOL_X86_64 = 3,
 };
 
+/*
+ * Default protocol if the frontend doesn't specify one.
+ */
+#ifdef CONFIG_X86
+# define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_X86_32
+#else
+# define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_NATIVE
+#endif
+
 struct xen_vbd {
 	/* What the domain refers to this vbd as. */
 	blkif_vdev_t		handle;
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 630a489e757d..e3afe97280b1 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -868,11 +868,11 @@ static int connect_ring(struct backend_info *be)
 		return err;
 	}
 
-	be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+	be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
 	err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
 			    "%63s", protocol, NULL);
 	if (err)
-		strcpy(protocol, "unspecified, assuming native");
+		strcpy(protocol, "unspecified, assuming default");
 	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
 		be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
 	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
-- 
cgit v1.2.3-59-g8ed1b
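As a closing usage note on the io_wait_event() helper introduced in the
bsg patch above, here is a hedged sketch of the intended pattern; the
device structure and function names are hypothetical:

#include <linux/wait.h>
#include <linux/spinlock.h>

/* Hypothetical device state, for illustration only. */
struct frob_dev {
	wait_queue_head_t wq;
	spinlock_t lock;
	unsigned int queued;
	unsigned int done;
};

/* Completion side: account the finished command and wake the waiter. */
static void frob_complete_one(struct frob_dev *d)
{
	unsigned long flags;

	spin_lock_irqsave(&d->lock, flags);
	d->done++;
	spin_unlock_irqrestore(&d->lock, flags);

	wake_up(&d->wq);
}

/*
 * Waiting side: like wait_event(), but the sleep goes through
 * io_schedule(), so the time is charged as iowait. The condition is
 * re-evaluated after every wakeup; an unlocked read is acceptable
 * here, as with any wait_event() condition paired with a wake_up().
 */
static void frob_drain(struct frob_dev *d)
{
	io_wait_event(d->wq, d->done == d->queued);
}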