aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/block/loop.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block/loop.c')
-rw-r--r--drivers/block/loop.c274
1 files changed, 218 insertions, 56 deletions
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 674f800a3b57..423f4ca7d712 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -164,6 +164,62 @@ static loff_t get_loop_size(struct loop_device *lo, struct file *file)
return get_size(lo->lo_offset, lo->lo_sizelimit, file);
}
+static void __loop_update_dio(struct loop_device *lo, bool dio)
+{
+ struct file *file = lo->lo_backing_file;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ unsigned short sb_bsize = 0;
+ unsigned dio_align = 0;
+ bool use_dio;
+
+ if (inode->i_sb->s_bdev) {
+ sb_bsize = bdev_logical_block_size(inode->i_sb->s_bdev);
+ dio_align = sb_bsize - 1;
+ }
+
+ /*
+ * We support direct I/O only if lo_offset is aligned with the
+ * logical I/O size of backing device, and the logical block
+ * size of loop is bigger than the backing device's and the loop
+ * needn't transform transfer.
+ *
+ * TODO: the above condition may be loosed in the future, and
+ * direct I/O may be switched runtime at that time because most
+ * of requests in sane appplications should be PAGE_SIZE algined
+ */
+ if (dio) {
+ if (queue_logical_block_size(lo->lo_queue) >= sb_bsize &&
+ !(lo->lo_offset & dio_align) &&
+ mapping->a_ops->direct_IO &&
+ !lo->transfer)
+ use_dio = true;
+ else
+ use_dio = false;
+ } else {
+ use_dio = false;
+ }
+
+ if (lo->use_dio == use_dio)
+ return;
+
+ /* flush dirty pages before changing direct IO */
+ vfs_fsync(file, 0);
+
+ /*
+ * The flag of LO_FLAGS_DIRECT_IO is handled similarly with
+ * LO_FLAGS_READ_ONLY, both are set from kernel, and losetup
+ * will get updated by ioctl(LOOP_GET_STATUS)
+ */
+ blk_mq_freeze_queue(lo->lo_queue);
+ lo->use_dio = use_dio;
+ if (use_dio)
+ lo->lo_flags |= LO_FLAGS_DIRECT_IO;
+ else
+ lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
+ blk_mq_unfreeze_queue(lo->lo_queue);
+}
+
static int
figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit)
{
@@ -389,6 +445,89 @@ static int lo_req_flush(struct loop_device *lo, struct request *rq)
return ret;
}
+static inline void handle_partial_read(struct loop_cmd *cmd, long bytes)
+{
+ if (bytes < 0 || (cmd->rq->cmd_flags & REQ_WRITE))
+ return;
+
+ if (unlikely(bytes < blk_rq_bytes(cmd->rq))) {
+ struct bio *bio = cmd->rq->bio;
+
+ bio_advance(bio, bytes);
+ zero_fill_bio(bio);
+ }
+}
+
+static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
+{
+ struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
+ struct request *rq = cmd->rq;
+
+ handle_partial_read(cmd, ret);
+
+ if (ret > 0)
+ ret = 0;
+ else if (ret < 0)
+ ret = -EIO;
+
+ blk_mq_complete_request(rq, ret);
+}
+
+static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
+ loff_t pos, bool rw)
+{
+ struct iov_iter iter;
+ struct bio_vec *bvec;
+ struct bio *bio = cmd->rq->bio;
+ struct file *file = lo->lo_backing_file;
+ int ret;
+
+ /* nomerge for loop request queue */
+ WARN_ON(cmd->rq->bio != cmd->rq->biotail);
+
+ bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+ iov_iter_bvec(&iter, ITER_BVEC | rw, bvec,
+ bio_segments(bio), blk_rq_bytes(cmd->rq));
+
+ cmd->iocb.ki_pos = pos;
+ cmd->iocb.ki_filp = file;
+ cmd->iocb.ki_complete = lo_rw_aio_complete;
+ cmd->iocb.ki_flags = IOCB_DIRECT;
+
+ if (rw == WRITE)
+ ret = file->f_op->write_iter(&cmd->iocb, &iter);
+ else
+ ret = file->f_op->read_iter(&cmd->iocb, &iter);
+
+ if (ret != -EIOCBQUEUED)
+ cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
+ return 0;
+}
+
+
+static inline int lo_rw_simple(struct loop_device *lo,
+ struct request *rq, loff_t pos, bool rw)
+{
+ struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+ if (cmd->use_aio)
+ return lo_rw_aio(lo, cmd, pos, rw);
+
+ /*
+ * lo_write_simple and lo_read_simple should have been covered
+ * by io submit style function like lo_rw_aio(), one blocker
+ * is that lo_read_simple() need to call flush_dcache_page after
+ * the page is written from kernel, and it isn't easy to handle
+ * this in io submit style function which submits all segments
+ * of the req at one time. And direct read IO doesn't need to
+ * run flush_dcache_page().
+ */
+ if (rw == WRITE)
+ return lo_write_simple(lo, rq, pos);
+ else
+ return lo_read_simple(lo, rq, pos);
+}
+
static int do_req_filebacked(struct loop_device *lo, struct request *rq)
{
loff_t pos;
@@ -404,13 +543,13 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
else if (lo->transfer)
ret = lo_write_transfer(lo, rq, pos);
else
- ret = lo_write_simple(lo, rq, pos);
+ ret = lo_rw_simple(lo, rq, pos, WRITE);
} else {
if (lo->transfer)
ret = lo_read_transfer(lo, rq, pos);
else
- ret = lo_read_simple(lo, rq, pos);
+ ret = lo_rw_simple(lo, rq, pos, READ);
}
return ret;
@@ -421,6 +560,12 @@ struct switch_request {
struct completion wait;
};
+static inline void loop_update_dio(struct loop_device *lo)
+{
+ __loop_update_dio(lo, io_is_direct(lo->lo_backing_file) |
+ lo->use_dio);
+}
+
/*
* Do the actual switch; called from the BIO completion routine
*/
@@ -441,6 +586,7 @@ static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
lo->old_gfp_mask = mapping_gfp_mask(mapping);
mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
+ loop_update_dio(lo);
}
/*
@@ -627,11 +773,19 @@ static ssize_t loop_attr_partscan_show(struct loop_device *lo, char *buf)
return sprintf(buf, "%s\n", partscan ? "1" : "0");
}
+static ssize_t loop_attr_dio_show(struct loop_device *lo, char *buf)
+{
+ int dio = (lo->lo_flags & LO_FLAGS_DIRECT_IO);
+
+ return sprintf(buf, "%s\n", dio ? "1" : "0");
+}
+
LOOP_ATTR_RO(backing_file);
LOOP_ATTR_RO(offset);
LOOP_ATTR_RO(sizelimit);
LOOP_ATTR_RO(autoclear);
LOOP_ATTR_RO(partscan);
+LOOP_ATTR_RO(dio);
static struct attribute *loop_attrs[] = {
&loop_attr_backing_file.attr,
@@ -639,6 +793,7 @@ static struct attribute *loop_attrs[] = {
&loop_attr_sizelimit.attr,
&loop_attr_autoclear.attr,
&loop_attr_partscan.attr,
+ &loop_attr_dio.attr,
NULL,
};
@@ -688,6 +843,23 @@ static void loop_config_discard(struct loop_device *lo)
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
}
+static void loop_unprepare_queue(struct loop_device *lo)
+{
+ flush_kthread_worker(&lo->worker);
+ kthread_stop(lo->worker_task);
+}
+
+static int loop_prepare_queue(struct loop_device *lo)
+{
+ init_kthread_worker(&lo->worker);
+ lo->worker_task = kthread_run(kthread_worker_fn,
+ &lo->worker, "loop%d", lo->lo_number);
+ if (IS_ERR(lo->worker_task))
+ return -ENOMEM;
+ set_user_nice(lo->worker_task, MIN_NICE);
+ return 0;
+}
+
static int loop_set_fd(struct loop_device *lo, fmode_t mode,
struct block_device *bdev, unsigned int arg)
{
@@ -745,17 +917,15 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
size = get_loop_size(lo, file);
if ((loff_t)(sector_t)size != size)
goto out_putf;
- error = -ENOMEM;
- lo->wq = alloc_workqueue("kloopd%d",
- WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 16,
- lo->lo_number);
- if (!lo->wq)
+ error = loop_prepare_queue(lo);
+ if (error)
goto out_putf;
error = 0;
set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);
+ lo->use_dio = false;
lo->lo_blocksize = lo_blocksize;
lo->lo_device = bdev;
lo->lo_flags = lo_flags;
@@ -769,6 +939,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
blk_queue_flush(lo->lo_queue, REQ_FLUSH);
+ loop_update_dio(lo);
set_capacity(lo->lo_disk, size);
bd_set_size(bdev, size << 9);
loop_sysfs_init(lo);
@@ -903,8 +1074,7 @@ static int loop_clr_fd(struct loop_device *lo)
lo->lo_flags = 0;
if (!part_shift)
lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
- destroy_workqueue(lo->wq);
- lo->wq = NULL;
+ loop_unprepare_queue(lo);
mutex_unlock(&lo->lo_ctl_mutex);
/*
* Need not hold lo_ctl_mutex to fput backing file.
@@ -988,6 +1158,9 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
lo->lo_key_owner = uid;
}
+ /* update dio if lo_offset or transfer is changed */
+ __loop_update_dio(lo, lo->use_dio);
+
return 0;
}
@@ -1138,6 +1311,20 @@ static int loop_set_capacity(struct loop_device *lo, struct block_device *bdev)
return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit);
}
+static int loop_set_dio(struct loop_device *lo, unsigned long arg)
+{
+ int error = -ENXIO;
+ if (lo->lo_state != Lo_bound)
+ goto out;
+
+ __loop_update_dio(lo, !!arg);
+ if (lo->use_dio == !!arg)
+ return 0;
+ error = -EINVAL;
+ out:
+ return error;
+}
+
static int lo_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
@@ -1181,6 +1368,11 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode,
if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
err = loop_set_capacity(lo, bdev);
break;
+ case LOOP_SET_DIRECT_IO:
+ err = -EPERM;
+ if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
+ err = loop_set_dio(lo, arg);
+ break;
default:
err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
}
@@ -1461,23 +1653,13 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
if (lo->lo_state != Lo_bound)
return -EIO;
- if (cmd->rq->cmd_flags & REQ_WRITE) {
- struct loop_device *lo = cmd->rq->q->queuedata;
- bool need_sched = true;
-
- spin_lock_irq(&lo->lo_lock);
- if (lo->write_started)
- need_sched = false;
- else
- lo->write_started = true;
- list_add_tail(&cmd->list, &lo->write_cmd_head);
- spin_unlock_irq(&lo->lo_lock);
+ if (lo->use_dio && !(cmd->rq->cmd_flags & (REQ_FLUSH |
+ REQ_DISCARD)))
+ cmd->use_aio = true;
+ else
+ cmd->use_aio = false;
- if (need_sched)
- queue_work(lo->wq, &lo->write_work);
- } else {
- queue_work(lo->wq, &cmd->read_work);
- }
+ queue_kthread_work(&lo->worker, &cmd->work);
return BLK_MQ_RQ_QUEUE_OK;
}
@@ -1495,38 +1677,15 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
ret = do_req_filebacked(lo, cmd->rq);
failed:
- blk_mq_complete_request(cmd->rq, ret ? -EIO : 0);
+ /* complete non-aio request */
+ if (!cmd->use_aio || ret)
+ blk_mq_complete_request(cmd->rq, ret ? -EIO : 0);
}
-static void loop_queue_write_work(struct work_struct *work)
-{
- struct loop_device *lo =
- container_of(work, struct loop_device, write_work);
- LIST_HEAD(cmd_list);
-
- spin_lock_irq(&lo->lo_lock);
- repeat:
- list_splice_init(&lo->write_cmd_head, &cmd_list);
- spin_unlock_irq(&lo->lo_lock);
-
- while (!list_empty(&cmd_list)) {
- struct loop_cmd *cmd = list_first_entry(&cmd_list,
- struct loop_cmd, list);
- list_del_init(&cmd->list);
- loop_handle_cmd(cmd);
- }
-
- spin_lock_irq(&lo->lo_lock);
- if (!list_empty(&lo->write_cmd_head))
- goto repeat;
- lo->write_started = false;
- spin_unlock_irq(&lo->lo_lock);
-}
-
-static void loop_queue_read_work(struct work_struct *work)
+static void loop_queue_work(struct kthread_work *work)
{
struct loop_cmd *cmd =
- container_of(work, struct loop_cmd, read_work);
+ container_of(work, struct loop_cmd, work);
loop_handle_cmd(cmd);
}
@@ -1538,7 +1697,7 @@ static int loop_init_request(void *data, struct request *rq,
struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
cmd->rq = rq;
- INIT_WORK(&cmd->read_work, loop_queue_read_work);
+ init_kthread_work(&cmd->work, loop_queue_work);
return 0;
}
@@ -1594,8 +1753,11 @@ static int loop_add(struct loop_device **l, int i)
}
lo->lo_queue->queuedata = lo;
- INIT_LIST_HEAD(&lo->write_cmd_head);
- INIT_WORK(&lo->write_work, loop_queue_write_work);
+ /*
+ * It doesn't make sense to enable merge because the I/O
+ * submitted to backing file is handled page by page.
+ */
+ queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue);
disk = lo->lo_disk = alloc_disk(1 << part_shift);
if (!disk)