aboutsummaryrefslogtreecommitdiffstats
path: root/fs/iomap.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/iomap.c')
-rw-r--r--fs/iomap.c408
1 files changed, 384 insertions, 24 deletions
diff --git a/fs/iomap.c b/fs/iomap.c
index 65aae194aeca..7d1e9f45f098 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -20,6 +20,7 @@
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
+#include <linux/pagevec.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/backing-dev.h>
@@ -27,6 +28,7 @@
#include <linux/task_io_accounting_ops.h>
#include <linux/dax.h>
#include <linux/sched/signal.h>
+#include <linux/swap.h>
#include "internal.h"
@@ -95,6 +97,12 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
return written ? written : ret;
}
+static sector_t
+iomap_sector(struct iomap *iomap, loff_t pos)
+{
+ return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
+}
+
static void
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
{
@@ -352,11 +360,8 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
struct iomap *iomap)
{
- sector_t sector = (iomap->addr +
- (pos & PAGE_MASK) - iomap->offset) >> 9;
-
- return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector,
- offset, bytes);
+ return __dax_zero_page_range(iomap->bdev, iomap->dax_dev,
+ iomap_sector(iomap, pos & PAGE_MASK), offset, bytes);
}
static loff_t
@@ -501,10 +506,13 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
case IOMAP_DELALLOC:
flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
break;
+ case IOMAP_MAPPED:
+ break;
case IOMAP_UNWRITTEN:
flags |= FIEMAP_EXTENT_UNWRITTEN;
break;
- case IOMAP_MAPPED:
+ case IOMAP_INLINE:
+ flags |= FIEMAP_EXTENT_DATA_INLINE;
break;
}
@@ -512,8 +520,6 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
flags |= FIEMAP_EXTENT_MERGED;
if (iomap->flags & IOMAP_F_SHARED)
flags |= FIEMAP_EXTENT_SHARED;
- if (iomap->flags & IOMAP_F_DATA_INLINE)
- flags |= FIEMAP_EXTENT_DATA_INLINE;
return fiemap_fill_next_extent(fi, iomap->offset,
iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0,
@@ -587,6 +593,113 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
}
EXPORT_SYMBOL_GPL(iomap_fiemap);
+/*
+ * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff.
+ * Returns true if found and updates @lastoff to the offset in file.
+ */
+static bool
+page_seek_hole_data(struct inode *inode, struct page *page, loff_t *lastoff,
+ int whence)
+{
+ const struct address_space_operations *ops = inode->i_mapping->a_ops;
+ unsigned int bsize = i_blocksize(inode), off;
+ bool seek_data = whence == SEEK_DATA;
+ loff_t poff = page_offset(page);
+
+ if (WARN_ON_ONCE(*lastoff >= poff + PAGE_SIZE))
+ return false;
+
+ if (*lastoff < poff) {
+ /*
+ * Last offset smaller than the start of the page means we found
+ * a hole:
+ */
+ if (whence == SEEK_HOLE)
+ return true;
+ *lastoff = poff;
+ }
+
+ /*
+ * Just check the page unless we can and should check block ranges:
+ */
+ if (bsize == PAGE_SIZE || !ops->is_partially_uptodate)
+ return PageUptodate(page) == seek_data;
+
+ lock_page(page);
+ if (unlikely(page->mapping != inode->i_mapping))
+ goto out_unlock_not_found;
+
+ for (off = 0; off < PAGE_SIZE; off += bsize) {
+ if ((*lastoff & ~PAGE_MASK) >= off + bsize)
+ continue;
+ if (ops->is_partially_uptodate(page, off, bsize) == seek_data) {
+ unlock_page(page);
+ return true;
+ }
+ *lastoff = poff + off + bsize;
+ }
+
+out_unlock_not_found:
+ unlock_page(page);
+ return false;
+}
+
+/*
+ * Seek for SEEK_DATA / SEEK_HOLE in the page cache.
+ *
+ * Within unwritten extents, the page cache determines which parts are holes
+ * and which are data: uptodate buffer heads count as data; everything else
+ * counts as a hole.
+ *
+ * Returns the resulting offset on successs, and -ENOENT otherwise.
+ */
+static loff_t
+page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
+ int whence)
+{
+ pgoff_t index = offset >> PAGE_SHIFT;
+ pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE);
+ loff_t lastoff = offset;
+ struct pagevec pvec;
+
+ if (length <= 0)
+ return -ENOENT;
+
+ pagevec_init(&pvec);
+
+ do {
+ unsigned nr_pages, i;
+
+ nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
+ end - 1);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ if (page_seek_hole_data(inode, page, &lastoff, whence))
+ goto check_range;
+ lastoff = page_offset(page) + PAGE_SIZE;
+ }
+ pagevec_release(&pvec);
+ } while (index < end);
+
+ /* When no page at lastoff and we are not done, we found a hole. */
+ if (whence != SEEK_HOLE)
+ goto not_found;
+
+check_range:
+ if (lastoff < offset + length)
+ goto out;
+not_found:
+ lastoff = -ENOENT;
+out:
+ pagevec_release(&pvec);
+ return lastoff;
+}
+
+
static loff_t
iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length,
void *data, struct iomap *iomap)
@@ -685,6 +798,8 @@ EXPORT_SYMBOL_GPL(iomap_seek_data);
* Private flags for iomap_dio, must not overlap with the public ones in
* iomap.h:
*/
+#define IOMAP_DIO_WRITE_FUA (1 << 28)
+#define IOMAP_DIO_NEED_SYNC (1 << 29)
#define IOMAP_DIO_WRITE (1 << 30)
#define IOMAP_DIO_DIRTY (1 << 31)
@@ -759,6 +874,13 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
dio_warn_stale_pagecache(iocb->ki_filp);
}
+ /*
+ * If this is a DSYNC write, make sure we push it to stable storage now
+ * that we've written data.
+ */
+ if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
+ ret = generic_write_sync(iocb, ret);
+
inode_dio_end(file_inode(iocb->ki_filp));
kfree(dio);
@@ -769,13 +891,8 @@ static void iomap_dio_complete_work(struct work_struct *work)
{
struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
struct kiocb *iocb = dio->iocb;
- bool is_write = (dio->flags & IOMAP_DIO_WRITE);
- ssize_t ret;
- ret = iomap_dio_complete(dio);
- if (is_write && ret > 0)
- ret = generic_write_sync(iocb, ret);
- iocb->ki_complete(iocb, ret, 0);
+ iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
}
/*
@@ -833,14 +950,12 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
bio = bio_alloc(GFP_KERNEL, 1);
bio_set_dev(bio, iomap->bdev);
- bio->bi_iter.bi_sector =
- (iomap->addr + pos - iomap->offset) >> 9;
+ bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
get_page(page);
- if (bio_add_page(bio, page, len, 0) != len)
- BUG();
+ __bio_add_page(bio, page, len, 0);
bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
atomic_inc(&dio->ref);
@@ -858,6 +973,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
struct iov_iter iter;
struct bio *bio;
bool need_zeroout = false;
+ bool use_fua = false;
int nr_pages, ret;
size_t copied = 0;
@@ -881,8 +997,20 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
case IOMAP_MAPPED:
if (iomap->flags & IOMAP_F_SHARED)
dio->flags |= IOMAP_DIO_COW;
- if (iomap->flags & IOMAP_F_NEW)
+ if (iomap->flags & IOMAP_F_NEW) {
need_zeroout = true;
+ } else {
+ /*
+ * Use a FUA write if we need datasync semantics, this
+ * is a pure data IO that doesn't require any metadata
+ * updates and the underlying device supports FUA. This
+ * allows us to avoid cache flushes on IO completion.
+ */
+ if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
+ (dio->flags & IOMAP_DIO_WRITE_FUA) &&
+ blk_queue_fua(bdev_get_queue(iomap->bdev)))
+ use_fua = true;
+ }
break;
default:
WARN_ON_ONCE(1);
@@ -916,8 +1044,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
bio = bio_alloc(GFP_KERNEL, nr_pages);
bio_set_dev(bio, iomap->bdev);
- bio->bi_iter.bi_sector =
- (iomap->addr + pos - iomap->offset) >> 9;
+ bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
bio->bi_write_hint = dio->iocb->ki_hint;
bio->bi_ioprio = dio->iocb->ki_ioprio;
bio->bi_private = dio;
@@ -931,10 +1058,14 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
n = bio->bi_iter.bi_size;
if (dio->flags & IOMAP_DIO_WRITE) {
- bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
+ bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
+ if (use_fua)
+ bio->bi_opf |= REQ_FUA;
+ else
+ dio->flags &= ~IOMAP_DIO_WRITE_FUA;
task_io_account_write(n);
} else {
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ bio->bi_opf = REQ_OP_READ;
if (dio->flags & IOMAP_DIO_DIRTY)
bio_set_pages_dirty(bio);
}
@@ -962,6 +1093,15 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
return copied;
}
+/*
+ * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
+ * is being issued as AIO or not. This allows us to optimise pure data writes
+ * to use REQ_FUA rather than requiring generic_write_sync() to issue a
+ * REQ_FLUSH post write. This is slightly tricky because a single request here
+ * can be mapped into multiple disjoint IOs and only a subset of the IOs issued
+ * may be pure data writes. In that case, we still need to do a full data sync
+ * completion.
+ */
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, iomap_dio_end_io_t end_io)
@@ -1006,8 +1146,21 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iter->type == ITER_IOVEC)
dio->flags |= IOMAP_DIO_DIRTY;
} else {
- dio->flags |= IOMAP_DIO_WRITE;
flags |= IOMAP_WRITE;
+ dio->flags |= IOMAP_DIO_WRITE;
+
+ /* for data sync or sync, we need sync completion processing */
+ if (iocb->ki_flags & IOCB_DSYNC)
+ dio->flags |= IOMAP_DIO_NEED_SYNC;
+
+ /*
+ * For datasync only writes, we optimistically try using FUA for
+ * this IO. Any non-FUA write that occurs will clear this flag,
+ * hence we know before completion whether a cache flush is
+ * necessary.
+ */
+ if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
+ dio->flags |= IOMAP_DIO_WRITE_FUA;
}
if (iocb->ki_flags & IOCB_NOWAIT) {
@@ -1063,6 +1216,13 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (ret < 0)
iomap_dio_set_error(dio, ret);
+ /*
+ * If all the writes we issued were FUA, we don't need to flush the
+ * cache on IO completion. Clear the sync flag for this case.
+ */
+ if (dio->flags & IOMAP_DIO_WRITE_FUA)
+ dio->flags &= ~IOMAP_DIO_NEED_SYNC;
+
if (!atomic_dec_and_test(&dio->ref)) {
if (!is_sync_kiocb(iocb))
return -EIOCBQUEUED;
@@ -1090,3 +1250,203 @@ out_free_dio:
return ret;
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);
+
+/* Swapfile activation */
+
+#ifdef CONFIG_SWAP
+struct iomap_swapfile_info {
+ struct iomap iomap; /* accumulated iomap */
+ struct swap_info_struct *sis;
+ uint64_t lowest_ppage; /* lowest physical addr seen (pages) */
+ uint64_t highest_ppage; /* highest physical addr seen (pages) */
+ unsigned long nr_pages; /* number of pages collected */
+ int nr_extents; /* extent count */
+};
+
+/*
+ * Collect physical extents for this swap file. Physical extents reported to
+ * the swap code must be trimmed to align to a page boundary. The logical
+ * offset within the file is irrelevant since the swapfile code maps logical
+ * page numbers of the swap device to the physical page-aligned extents.
+ */
+static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi)
+{
+ struct iomap *iomap = &isi->iomap;
+ unsigned long nr_pages;
+ uint64_t first_ppage;
+ uint64_t first_ppage_reported;
+ uint64_t next_ppage;
+ int error;
+
+ /*
+ * Round the start up and the end down so that the physical
+ * extent aligns to a page boundary.
+ */
+ first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT;
+ next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >>
+ PAGE_SHIFT;
+
+ /* Skip too-short physical extents. */
+ if (first_ppage >= next_ppage)
+ return 0;
+ nr_pages = next_ppage - first_ppage;
+
+ /*
+ * Calculate how much swap space we're adding; the first page contains
+ * the swap header and doesn't count. The mm still wants that first
+ * page fed to add_swap_extent, however.
+ */
+ first_ppage_reported = first_ppage;
+ if (iomap->offset == 0)
+ first_ppage_reported++;
+ if (isi->lowest_ppage > first_ppage_reported)
+ isi->lowest_ppage = first_ppage_reported;
+ if (isi->highest_ppage < (next_ppage - 1))
+ isi->highest_ppage = next_ppage - 1;
+
+ /* Add extent, set up for the next call. */
+ error = add_swap_extent(isi->sis, isi->nr_pages, nr_pages, first_ppage);
+ if (error < 0)
+ return error;
+ isi->nr_extents += error;
+ isi->nr_pages += nr_pages;
+ return 0;
+}
+
+/*
+ * Accumulate iomaps for this swap file. We have to accumulate iomaps because
+ * swap only cares about contiguous page-aligned physical extents and makes no
+ * distinction between written and unwritten extents.
+ */
+static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos,
+ loff_t count, void *data, struct iomap *iomap)
+{
+ struct iomap_swapfile_info *isi = data;
+ int error;
+
+ switch (iomap->type) {
+ case IOMAP_MAPPED:
+ case IOMAP_UNWRITTEN:
+ /* Only real or unwritten extents. */
+ break;
+ case IOMAP_INLINE:
+ /* No inline data. */
+ pr_err("swapon: file is inline\n");
+ return -EINVAL;
+ default:
+ pr_err("swapon: file has unallocated extents\n");
+ return -EINVAL;
+ }
+
+ /* No uncommitted metadata or shared blocks. */
+ if (iomap->flags & IOMAP_F_DIRTY) {
+ pr_err("swapon: file is not committed\n");
+ return -EINVAL;
+ }
+ if (iomap->flags & IOMAP_F_SHARED) {
+ pr_err("swapon: file has shared extents\n");
+ return -EINVAL;
+ }
+
+ /* Only one bdev per swap file. */
+ if (iomap->bdev != isi->sis->bdev) {
+ pr_err("swapon: file is on multiple devices\n");
+ return -EINVAL;
+ }
+
+ if (isi->iomap.length == 0) {
+ /* No accumulated extent, so just store it. */
+ memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
+ } else if (isi->iomap.addr + isi->iomap.length == iomap->addr) {
+ /* Append this to the accumulated extent. */
+ isi->iomap.length += iomap->length;
+ } else {
+ /* Otherwise, add the retained iomap and store this one. */
+ error = iomap_swapfile_add_extent(isi);
+ if (error)
+ return error;
+ memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
+ }
+ return count;
+}
+
+/*
+ * Iterate a swap file's iomaps to construct physical extents that can be
+ * passed to the swapfile subsystem.
+ */
+int iomap_swapfile_activate(struct swap_info_struct *sis,
+ struct file *swap_file, sector_t *pagespan,
+ const struct iomap_ops *ops)
+{
+ struct iomap_swapfile_info isi = {
+ .sis = sis,
+ .lowest_ppage = (sector_t)-1ULL,
+ };
+ struct address_space *mapping = swap_file->f_mapping;
+ struct inode *inode = mapping->host;
+ loff_t pos = 0;
+ loff_t len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE);
+ loff_t ret;
+
+ ret = filemap_write_and_wait(inode->i_mapping);
+ if (ret)
+ return ret;
+
+ while (len > 0) {
+ ret = iomap_apply(inode, pos, len, IOMAP_REPORT,
+ ops, &isi, iomap_swapfile_activate_actor);
+ if (ret <= 0)
+ return ret;
+
+ pos += ret;
+ len -= ret;
+ }
+
+ if (isi.iomap.length) {
+ ret = iomap_swapfile_add_extent(&isi);
+ if (ret)
+ return ret;
+ }
+
+ *pagespan = 1 + isi.highest_ppage - isi.lowest_ppage;
+ sis->max = isi.nr_pages;
+ sis->pages = isi.nr_pages - 1;
+ sis->highest_bit = isi.nr_pages - 1;
+ return isi.nr_extents;
+}
+EXPORT_SYMBOL_GPL(iomap_swapfile_activate);
+#endif /* CONFIG_SWAP */
+
+static loff_t
+iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length,
+ void *data, struct iomap *iomap)
+{
+ sector_t *bno = data, addr;
+
+ if (iomap->type == IOMAP_MAPPED) {
+ addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits;
+ if (addr > INT_MAX)
+ WARN(1, "would truncate bmap result\n");
+ else
+ *bno = addr;
+ }
+ return 0;
+}
+
+/* legacy ->bmap interface. 0 is the error return (!) */
+sector_t
+iomap_bmap(struct address_space *mapping, sector_t bno,
+ const struct iomap_ops *ops)
+{
+ struct inode *inode = mapping->host;
+ loff_t pos = bno >> inode->i_blkbits;
+ unsigned blocksize = i_blocksize(inode);
+
+ if (filemap_write_and_wait(mapping))
+ return 0;
+
+ bno = 0;
+ iomap_apply(inode, pos, blocksize, 0, ops, &bno, iomap_bmap_actor);
+ return bno;
+}
+EXPORT_SYMBOL_GPL(iomap_bmap);