From 5fac7408d828719db6d3fdba63e3c3726a6d1ee5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 9 Mar 2018 17:44:31 -0800 Subject: mm, fs, dax: handle layout changes to pinned dax mappings Background: get_user_pages() in the filesystem pins file backed memory pages for access by devices performing dma. However, it only pins the memory pages not the page-to-file offset association. If a file is truncated the pages are mapped out of the file and dma may continue indefinitely into a page that is owned by a device driver. This breaks coherency of the file vs dma, but the assumption is that if userspace wants the file-space truncated it does not matter what data is inbound from the device, it is not relevant anymore. The only expectation is that dma can safely continue while the filesystem reallocates the block(s). Problem: This expectation that dma can safely continue while the filesystem changes the block map is broken by dax. With dax the target dma page *is* the filesystem block. The model of leaving the page pinned for dma, but truncating the file block out of the file, means that the filesytem is free to reallocate a block under active dma to another file and now the expected data-incoherency situation has turned into active data-corruption. Solution: Defer all filesystem operations (fallocate(), truncate()) on a dax mode file while any page/block in the file is under active dma. This solution assumes that dma is transient. Cases where dma operations are known to not be transient, like RDMA, have been explicitly disabled via commits like 5f1d43de5416 "IB/core: disable memory registration of filesystem-dax vmas". The dax_layout_busy_page() routine is called by filesystems with a lock held against mm faults (i_mmap_lock) to find pinned / busy dax pages. The process of looking up a busy page invalidates all mappings to trigger any subsequent get_user_pages() to block on i_mmap_lock. The filesystem continues to call dax_layout_busy_page() until it finally returns no more active pages. This approach assumes that the page pinning is transient, if that assumption is violated the system would have likely hung from the uncompleted I/O. Cc: Jeff Moyer Cc: Dave Chinner Cc: Matthew Wilcox Cc: Alexander Viro Cc: "Darrick J. Wong" Cc: Ross Zwisler Cc: Dave Hansen Cc: Andrew Morton Reported-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Dan Williams --- include/linux/dax.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux/dax.h') diff --git a/include/linux/dax.h b/include/linux/dax.h index f9eb22ad341e..25bab6abb695 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -83,6 +83,8 @@ static inline void fs_put_dax(struct dax_device *dax_dev) struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev); int dax_writeback_mapping_range(struct address_space *mapping, struct block_device *bdev, struct writeback_control *wbc); + +struct page *dax_layout_busy_page(struct address_space *mapping); #else static inline int bdev_dax_supported(struct super_block *sb, int blocksize) { @@ -103,6 +105,11 @@ static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev) return NULL; } +static inline struct page *dax_layout_busy_page(struct address_space *mapping) +{ + return NULL; +} + static inline int dax_writeback_mapping_range(struct address_space *mapping, struct block_device *bdev, struct writeback_control *wbc) { -- cgit v1.3-8-gc7d7 From b3a9a0c36e1f7b9e2e6cf965c2bb973624f2b3b9 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 2 May 2018 06:46:33 -0700 Subject: dax: Introduce a ->copy_to_iter dax operation Similar to the ->copy_from_iter() operation, a platform may want to deploy an architecture or device specific routine for handling reads from a dax_device like /dev/pmemX. On x86 this routine will point to a machine check safe version of copy_to_iter(). For now, add the plumbing to device-mapper and the dax core. Cc: Ross Zwisler Cc: Mike Snitzer Cc: Christoph Hellwig Signed-off-by: Dan Williams --- drivers/dax/super.c | 10 ++++++++++ drivers/md/dm-linear.c | 16 ++++++++++++++++ drivers/md/dm-log-writes.c | 15 +++++++++++++++ drivers/md/dm-stripe.c | 21 +++++++++++++++++++++ drivers/md/dm.c | 25 +++++++++++++++++++++++++ drivers/nvdimm/pmem.c | 7 +++++++ drivers/s390/block/dcssblk.c | 7 +++++++ fs/dax.c | 3 ++- include/linux/dax.h | 5 +++++ include/linux/device-mapper.h | 5 +++-- 10 files changed, 111 insertions(+), 3 deletions(-) (limited to 'include/linux/dax.h') diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 2b2332b605e4..31b839113399 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -282,6 +282,16 @@ size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, } EXPORT_SYMBOL_GPL(dax_copy_from_iter); +size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, + size_t bytes, struct iov_iter *i) +{ + if (!dax_alive(dax_dev)) + return 0; + + return dax_dev->ops->copy_to_iter(dax_dev, pgoff, addr, bytes, i); +} +EXPORT_SYMBOL_GPL(dax_copy_to_iter); + #ifdef CONFIG_ARCH_HAS_PMEM_API void arch_wb_cache_pmem(void *addr, size_t size); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 775c06d953b7..d10964d41fd7 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -185,9 +185,24 @@ static size_t linear_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff, return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i); } +static size_t linear_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + struct linear_c *lc = ti->private; + struct block_device *bdev = lc->dev->bdev; + struct dax_device *dax_dev = lc->dev->dax_dev; + sector_t dev_sector, sector = pgoff * PAGE_SECTORS; + + dev_sector = linear_map_sector(ti, sector); + if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff)) + return 0; + return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i); +} + #else #define linear_dax_direct_access NULL #define linear_dax_copy_from_iter NULL +#define linear_dax_copy_to_iter NULL #endif static struct target_type linear_target = { @@ -204,6 +219,7 @@ static struct target_type linear_target = { .iterate_devices = linear_iterate_devices, .direct_access = linear_dax_direct_access, .dax_copy_from_iter = linear_dax_copy_from_iter, + .dax_copy_to_iter = linear_dax_copy_to_iter, }; int __init dm_linear_init(void) diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index c90c7c08a77f..9ea2b0291f20 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -962,9 +962,23 @@ static size_t log_writes_dax_copy_from_iter(struct dm_target *ti, dax_copy: return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i); } + +static size_t log_writes_dax_copy_to_iter(struct dm_target *ti, + pgoff_t pgoff, void *addr, size_t bytes, + struct iov_iter *i) +{ + struct log_writes_c *lc = ti->private; + sector_t sector = pgoff * PAGE_SECTORS; + + if (bdev_dax_pgoff(lc->dev->bdev, sector, ALIGN(bytes, PAGE_SIZE), &pgoff)) + return 0; + return dax_copy_to_iter(lc->dev->dax_dev, pgoff, addr, bytes, i); +} + #else #define log_writes_dax_direct_access NULL #define log_writes_dax_copy_from_iter NULL +#define log_writes_dax_copy_to_iter NULL #endif static struct target_type log_writes_target = { @@ -982,6 +996,7 @@ static struct target_type log_writes_target = { .io_hints = log_writes_io_hints, .direct_access = log_writes_dax_direct_access, .dax_copy_from_iter = log_writes_dax_copy_from_iter, + .dax_copy_to_iter = log_writes_dax_copy_to_iter, }; static int __init dm_log_writes_init(void) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index fe7fb9b1aec3..8547d7594338 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -354,9 +354,29 @@ static size_t stripe_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff, return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i); } +static size_t stripe_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + sector_t dev_sector, sector = pgoff * PAGE_SECTORS; + struct stripe_c *sc = ti->private; + struct dax_device *dax_dev; + struct block_device *bdev; + uint32_t stripe; + + stripe_map_sector(sc, sector, &stripe, &dev_sector); + dev_sector += sc->stripe[stripe].physical_start; + dax_dev = sc->stripe[stripe].dev->dax_dev; + bdev = sc->stripe[stripe].dev->bdev; + + if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff)) + return 0; + return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i); +} + #else #define stripe_dax_direct_access NULL #define stripe_dax_copy_from_iter NULL +#define stripe_dax_copy_to_iter NULL #endif /* @@ -478,6 +498,7 @@ static struct target_type stripe_target = { .io_hints = stripe_io_hints, .direct_access = stripe_dax_direct_access, .dax_copy_from_iter = stripe_dax_copy_from_iter, + .dax_copy_to_iter = stripe_dax_copy_to_iter, }; int __init dm_stripe_init(void) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0a7b0107ca78..6752f1c25258 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1089,6 +1089,30 @@ static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, return ret; } +static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + struct mapped_device *md = dax_get_private(dax_dev); + sector_t sector = pgoff * PAGE_SECTORS; + struct dm_target *ti; + long ret = 0; + int srcu_idx; + + ti = dm_dax_get_live_target(md, sector, &srcu_idx); + + if (!ti) + goto out; + if (!ti->type->dax_copy_to_iter) { + ret = copy_to_iter(addr, bytes, i); + goto out; + } + ret = ti->type->dax_copy_to_iter(ti, pgoff, addr, bytes, i); + out: + dm_put_live_table(md, srcu_idx); + + return ret; +} + /* * A target may call dm_accept_partial_bio only from the map routine. It is * allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET. @@ -3134,6 +3158,7 @@ static const struct block_device_operations dm_blk_dops = { static const struct dax_operations dm_dax_ops = { .direct_access = dm_dax_direct_access, .copy_from_iter = dm_dax_copy_from_iter, + .copy_to_iter = dm_dax_copy_to_iter, }; /* diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index e023d6aa22b5..1b8ab48365de 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -264,9 +264,16 @@ static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, return copy_from_iter_flushcache(addr, bytes, i); } +static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + return copy_to_iter(addr, bytes, i); +} + static const struct dax_operations pmem_dax_ops = { .direct_access = pmem_dax_direct_access, .copy_from_iter = pmem_copy_from_iter, + .copy_to_iter = pmem_copy_to_iter, }; static const struct attribute_group *pmem_attribute_groups[] = { diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 0a312e450207..29024492b8ed 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -51,9 +51,16 @@ static size_t dcssblk_dax_copy_from_iter(struct dax_device *dax_dev, return copy_from_iter(addr, bytes, i); } +static size_t dcssblk_dax_copy_to_iter(struct dax_device *dax_dev, + pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) +{ + return copy_to_iter(addr, bytes, i); +} + static const struct dax_operations dcssblk_dax_ops = { .direct_access = dcssblk_dax_direct_access, .copy_from_iter = dcssblk_dax_copy_from_iter, + .copy_to_iter = dcssblk_dax_copy_to_iter, }; struct dcssblk_dev_info { diff --git a/fs/dax.c b/fs/dax.c index aaec72ded1b6..a64afdf7ec0d 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1057,7 +1057,8 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr, map_len, iter); else - map_len = copy_to_iter(kaddr, map_len, iter); + map_len = dax_copy_to_iter(dax_dev, pgoff, kaddr, + map_len, iter); if (map_len <= 0) { ret = map_len ? map_len : -EFAULT; break; diff --git a/include/linux/dax.h b/include/linux/dax.h index f9eb22ad341e..a43b396fb336 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -20,6 +20,9 @@ struct dax_operations { /* copy_from_iter: required operation for fs-dax direct-i/o */ size_t (*copy_from_iter)(struct dax_device *, pgoff_t, void *, size_t, struct iov_iter *); + /* copy_to_iter: required operation for fs-dax direct-i/o */ + size_t (*copy_to_iter)(struct dax_device *, pgoff_t, void *, size_t, + struct iov_iter *); }; extern struct attribute_group dax_attribute_group; @@ -118,6 +121,8 @@ long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn); size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); +size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, + size_t bytes, struct iov_iter *i); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size); ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 31fef7c34185..6fb0808e87c8 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -133,7 +133,7 @@ typedef int (*dm_busy_fn) (struct dm_target *ti); */ typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn); -typedef size_t (*dm_dax_copy_from_iter_fn)(struct dm_target *ti, pgoff_t pgoff, +typedef size_t (*dm_dax_copy_iter_fn)(struct dm_target *ti, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); #define PAGE_SECTORS (PAGE_SIZE / 512) @@ -184,7 +184,8 @@ struct target_type { dm_iterate_devices_fn iterate_devices; dm_io_hints_fn io_hints; dm_dax_direct_access_fn direct_access; - dm_dax_copy_from_iter_fn dax_copy_from_iter; + dm_dax_copy_iter_fn dax_copy_from_iter; + dm_dax_copy_iter_fn dax_copy_to_iter; /* For internal device-mapper use. */ struct list_head list; -- cgit v1.3-8-gc7d7 From f77bc3a82ce58fe7977a11f28e0b6cac8a9e087c Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Wed, 27 Jun 2018 23:26:17 -0700 Subject: include/linux/dax.h: dax_iomap_fault() returns vm_fault_t Commit 1c8f422059ae ("mm: change return type to vm_fault_t") missed a conversion. It's not a big problem at present because mainline is still using typedef int vm_fault_t; Fixes: 1c8f422059ae ("mm: change return type to vm_fault_t") Link: http://lkml.kernel.org/r/20180620172046.GA27894@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Reviewed-by: Andrew Morton Cc: Matthew Wilcox Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dax.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/dax.h') diff --git a/include/linux/dax.h b/include/linux/dax.h index 3855e3800f48..deb0f663252f 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -135,7 +135,7 @@ void dax_flush(struct dax_device *dax_dev, void *addr, size_t size); ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops); -int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, +vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, pfn_t *pfnp, int *errp, const struct iomap_ops *ops); vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size, pfn_t pfn); -- cgit v1.3-8-gc7d7