aboutsummaryrefslogtreecommitdiffstats
path: root/fs/dax.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/dax.c')
-rw-r--r--fs/dax.c660
1 files changed, 452 insertions, 208 deletions
diff --git a/fs/dax.c b/fs/dax.c
index 4e3e5a283a91..1c6867810cbd 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -11,7 +11,6 @@
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
-#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
@@ -25,6 +24,7 @@
#include <linux/sizes.h>
#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
+#include <linux/rmap.h>
#include <asm/pgalloc.h>
#define CREATE_TRACE_POINTS
@@ -334,13 +334,35 @@ static unsigned long dax_end_pfn(void *entry)
for (pfn = dax_to_pfn(entry); \
pfn < dax_end_pfn(entry); pfn++)
+static inline bool dax_mapping_is_cow(struct address_space *mapping)
+{
+ return (unsigned long)mapping == PAGE_MAPPING_DAX_COW;
+}
+
+/*
+ * Set the page->mapping with FS_DAX_MAPPING_COW flag, increase the refcount.
+ */
+static inline void dax_mapping_set_cow(struct page *page)
+{
+ if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) {
+ /*
+ * Reset the index if the page was already mapped
+ * regularly before.
+ */
+ if (page->mapping)
+ page->index = 1;
+ page->mapping = (void *)PAGE_MAPPING_DAX_COW;
+ }
+ page->index++;
+}
+
/*
- * TODO: for reflink+dax we need a way to associate a single page with
- * multiple address_space instances at different linear_page_index()
- * offsets.
+ * When it is called in dax_insert_entry(), the cow flag will indicate that
+ * whether this entry is shared by multiple files. If so, set the page->mapping
+ * FS_DAX_MAPPING_COW, and use page->index as refcount.
*/
static void dax_associate_entry(void *entry, struct address_space *mapping,
- struct vm_area_struct *vma, unsigned long address)
+ struct vm_area_struct *vma, unsigned long address, bool cow)
{
unsigned long size = dax_entry_size(entry), pfn, index;
int i = 0;
@@ -352,9 +374,13 @@ static void dax_associate_entry(void *entry, struct address_space *mapping,
for_each_mapped_pfn(entry, pfn) {
struct page *page = pfn_to_page(pfn);
- WARN_ON_ONCE(page->mapping);
- page->mapping = mapping;
- page->index = index + i++;
+ if (cow) {
+ dax_mapping_set_cow(page);
+ } else {
+ WARN_ON_ONCE(page->mapping);
+ page->mapping = mapping;
+ page->index = index + i++;
+ }
}
}
@@ -370,7 +396,12 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
struct page *page = pfn_to_page(pfn);
WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
- WARN_ON_ONCE(page->mapping && page->mapping != mapping);
+ if (dax_mapping_is_cow(page->mapping)) {
+ /* keep the CoW flag if this page is still shared */
+ if (page->index-- > 0)
+ continue;
+ } else
+ WARN_ON_ONCE(page->mapping && page->mapping != mapping);
page->mapping = NULL;
page->index = 0;
}
@@ -390,7 +421,7 @@ static struct page *dax_busy_page(void *entry)
}
/*
- * dax_lock_mapping_entry - Lock the DAX entry corresponding to a page
+ * dax_lock_page - Lock the DAX entry corresponding to a page
* @page: The page whose entry we want to lock
*
* Context: Process context.
@@ -456,6 +487,69 @@ void dax_unlock_page(struct page *page, dax_entry_t cookie)
}
/*
+ * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping
+ * @mapping: the file's mapping whose entry we want to lock
+ * @index: the offset within this file
+ * @page: output the dax page corresponding to this dax entry
+ *
+ * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry
+ * could not be locked.
+ */
+dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index,
+ struct page **page)
+{
+ XA_STATE(xas, NULL, 0);
+ void *entry;
+
+ rcu_read_lock();
+ for (;;) {
+ entry = NULL;
+ if (!dax_mapping(mapping))
+ break;
+
+ xas.xa = &mapping->i_pages;
+ xas_lock_irq(&xas);
+ xas_set(&xas, index);
+ entry = xas_load(&xas);
+ if (dax_is_locked(entry)) {
+ rcu_read_unlock();
+ wait_entry_unlocked(&xas, entry);
+ rcu_read_lock();
+ continue;
+ }
+ if (!entry ||
+ dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+ /*
+ * Because we are looking for entry from file's mapping
+ * and index, so the entry may not be inserted for now,
+ * or even a zero/empty entry. We don't think this is
+ * an error case. So, return a special value and do
+ * not output @page.
+ */
+ entry = (void *)~0UL;
+ } else {
+ *page = pfn_to_page(dax_to_pfn(entry));
+ dax_lock_entry(&xas, entry);
+ }
+ xas_unlock_irq(&xas);
+ break;
+ }
+ rcu_read_unlock();
+ return (dax_entry_t)entry;
+}
+
+void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index,
+ dax_entry_t cookie)
+{
+ XA_STATE(xas, &mapping->i_pages, index);
+
+ if (cookie == ~0UL)
+ return;
+
+ dax_unlock_entry(&xas, (void *)cookie);
+}
+
+/*
* Find page cache entry at given index. If it is a DAX entry, return it
* with the entry locked. If the page cache doesn't contain an entry at
* that index, add a locked empty entry.
@@ -709,48 +803,69 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
return __dax_invalidate_entry(mapping, index, false);
}
-static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_dev,
- sector_t sector, struct page *to, unsigned long vaddr)
+static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos)
+{
+ return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset);
+}
+
+static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter)
{
+ pgoff_t pgoff = dax_iomap_pgoff(&iter->iomap, iter->pos);
void *vto, *kaddr;
- pgoff_t pgoff;
long rc;
int id;
- rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
- if (rc)
- return rc;
-
id = dax_read_lock();
- rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
+ rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS,
+ &kaddr, NULL);
if (rc < 0) {
dax_read_unlock(id);
return rc;
}
- vto = kmap_atomic(to);
- copy_user_page(vto, (void __force *)kaddr, vaddr, to);
+ vto = kmap_atomic(vmf->cow_page);
+ copy_user_page(vto, kaddr, vmf->address, vmf->cow_page);
kunmap_atomic(vto);
dax_read_unlock(id);
return 0;
}
/*
+ * MAP_SYNC on a dax mapping guarantees dirty metadata is
+ * flushed on write-faults (non-cow), but not read-faults.
+ */
+static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
+ struct vm_area_struct *vma)
+{
+ return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) &&
+ (iter->iomap.flags & IOMAP_F_DIRTY);
+}
+
+static bool dax_fault_is_cow(const struct iomap_iter *iter)
+{
+ return (iter->flags & IOMAP_WRITE) &&
+ (iter->iomap.flags & IOMAP_F_SHARED);
+}
+
+/*
* By this point grab_mapping_entry() has ensured that we have a locked entry
* of the appropriate size so we don't have to worry about downgrading PMDs to
* PTEs. If we happen to be trying to insert a PTE and there is a PMD
* already in the tree, we will skip the insertion and just dirty the PMD as
* appropriate.
*/
-static void *dax_insert_entry(struct xa_state *xas,
- struct address_space *mapping, struct vm_fault *vmf,
- void *entry, pfn_t pfn, unsigned long flags, bool dirty)
+static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
+ const struct iomap_iter *iter, void *entry, pfn_t pfn,
+ unsigned long flags)
{
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
void *new_entry = dax_make_entry(pfn, flags);
+ bool dirty = !dax_fault_is_synchronous(iter, vmf->vma);
+ bool cow = dax_fault_is_cow(iter);
if (dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
- if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
+ if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
unsigned long index = xas->xa_index;
/* we are replacing a zero page with block mapping */
if (dax_is_pmd_entry(entry))
@@ -762,11 +877,12 @@ static void *dax_insert_entry(struct xa_state *xas,
xas_reset(xas);
xas_lock_irq(xas);
- if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+ if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
void *old;
dax_disassociate_entry(entry, mapping, false);
- dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
+ dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
+ cow);
/*
* Only swap our new entry into the page cache if the current
* entry is a zero page or an empty entry. If a normal PTE or
@@ -786,99 +902,19 @@ static void *dax_insert_entry(struct xa_state *xas,
if (dirty)
xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
+ if (cow)
+ xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);
+
xas_unlock_irq(xas);
return entry;
}
-static inline
-unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
-{
- unsigned long address;
-
- address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
- VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
- return address;
-}
-
-/* Walk all mappings of a given index of a file and writeprotect them */
-static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
- unsigned long pfn)
-{
- struct vm_area_struct *vma;
- pte_t pte, *ptep = NULL;
- pmd_t *pmdp = NULL;
- spinlock_t *ptl;
-
- i_mmap_lock_read(mapping);
- vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
- struct mmu_notifier_range range;
- unsigned long address;
-
- cond_resched();
-
- if (!(vma->vm_flags & VM_SHARED))
- continue;
-
- address = pgoff_address(index, vma);
-
- /*
- * follow_invalidate_pte() will use the range to call
- * mmu_notifier_invalidate_range_start() on our behalf before
- * taking any lock.
- */
- if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep,
- &pmdp, &ptl))
- continue;
-
- /*
- * No need to call mmu_notifier_invalidate_range() as we are
- * downgrading page table protection not changing it to point
- * to a new page.
- *
- * See Documentation/vm/mmu_notifier.rst
- */
- if (pmdp) {
-#ifdef CONFIG_FS_DAX_PMD
- pmd_t pmd;
-
- if (pfn != pmd_pfn(*pmdp))
- goto unlock_pmd;
- if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
- goto unlock_pmd;
-
- flush_cache_page(vma, address, pfn);
- pmd = pmdp_invalidate(vma, address, pmdp);
- pmd = pmd_wrprotect(pmd);
- pmd = pmd_mkclean(pmd);
- set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-unlock_pmd:
-#endif
- spin_unlock(ptl);
- } else {
- if (pfn != pte_pfn(*ptep))
- goto unlock_pte;
- if (!pte_dirty(*ptep) && !pte_write(*ptep))
- goto unlock_pte;
-
- flush_cache_page(vma, address, pfn);
- pte = ptep_clear_flush(vma, address, ptep);
- pte = pte_wrprotect(pte);
- pte = pte_mkclean(pte);
- set_pte_at(vma->vm_mm, address, ptep, pte);
-unlock_pte:
- pte_unmap_unlock(ptep, ptl);
- }
-
- mmu_notifier_invalidate_range_end(&range);
- }
- i_mmap_unlock_read(mapping);
-}
-
static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
struct address_space *mapping, void *entry)
{
- unsigned long pfn, index, count;
+ unsigned long pfn, index, count, end;
long ret = 0;
+ struct vm_area_struct *vma;
/*
* A page got tagged dirty in DAX mapping? Something is seriously
@@ -936,8 +972,16 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
pfn = dax_to_pfn(entry);
count = 1UL << dax_entry_order(entry);
index = xas->xa_index & ~(count - 1);
+ end = index + count - 1;
+
+ /* Walk all mappings of a given index of a file and writeprotect them */
+ i_mmap_lock_read(mapping);
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) {
+ pfn_mkclean_range(pfn, count, index, vma);
+ cond_resched();
+ }
+ i_mmap_unlock_read(mapping);
- dax_entry_mkclean(mapping, index, pfn);
dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
/*
* After we have flushed the cache, we can clear the dirty tag. There
@@ -1005,29 +1049,22 @@ int dax_writeback_mapping_range(struct address_space *mapping,
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
-static sector_t dax_iomap_sector(const struct iomap *iomap, loff_t pos)
-{
- return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
-}
-
-static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size,
- pfn_t *pfnp)
+static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
+ size_t size, void **kaddr, pfn_t *pfnp)
{
- const sector_t sector = dax_iomap_sector(iomap, pos);
- pgoff_t pgoff;
- int id, rc;
+ pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
+ int id, rc = 0;
long length;
- rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff);
- if (rc)
- return rc;
id = dax_read_lock();
length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
- NULL, pfnp);
+ DAX_ACCESS, kaddr, pfnp);
if (length < 0) {
rc = length;
goto out;
}
+ if (!pfnp)
+ goto out_check_addr;
rc = -EINVAL;
if (PFN_PHYS(length) < size)
goto out;
@@ -1037,11 +1074,71 @@ static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size,
if (length > 1 && !pfn_t_devmap(*pfnp))
goto out;
rc = 0;
+
+out_check_addr:
+ if (!kaddr)
+ goto out;
+ if (!*kaddr)
+ rc = -EFAULT;
out:
dax_read_unlock(id);
return rc;
}
+/**
+ * dax_iomap_cow_copy - Copy the data from source to destination before write
+ * @pos: address to do copy from.
+ * @length: size of copy operation.
+ * @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE)
+ * @srcmap: iomap srcmap
+ * @daddr: destination address to copy to.
+ *
+ * This can be called from two places. Either during DAX write fault (page
+ * aligned), to copy the length size data to daddr. Or, while doing normal DAX
+ * write operation, dax_iomap_actor() might call this to do the copy of either
+ * start or end unaligned address. In the latter case the rest of the copy of
+ * aligned ranges is taken care by dax_iomap_actor() itself.
+ */
+static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size,
+ const struct iomap *srcmap, void *daddr)
+{
+ loff_t head_off = pos & (align_size - 1);
+ size_t size = ALIGN(head_off + length, align_size);
+ loff_t end = pos + length;
+ loff_t pg_end = round_up(end, align_size);
+ bool copy_all = head_off == 0 && end == pg_end;
+ void *saddr = 0;
+ int ret = 0;
+
+ ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
+ if (ret)
+ return ret;
+
+ if (copy_all) {
+ ret = copy_mc_to_kernel(daddr, saddr, length);
+ return ret ? -EIO : 0;
+ }
+
+ /* Copy the head part of the range */
+ if (head_off) {
+ ret = copy_mc_to_kernel(daddr, saddr, head_off);
+ if (ret)
+ return -EIO;
+ }
+
+ /* Copy the tail part of the range */
+ if (end < pg_end) {
+ loff_t tail_off = head_off + length;
+ loff_t tail_len = pg_end - end;
+
+ ret = copy_mc_to_kernel(daddr + tail_off, saddr + tail_off,
+ tail_len);
+ if (ret)
+ return -EIO;
+ }
+ return 0;
+}
+
/*
* The user has performed a load from a hole in the file. Allocating a new
* page in the file would cause excessive storage usage for workloads with
@@ -1049,17 +1146,15 @@ out:
* If this page is ever written to we will re-fault and change the mapping to
* point to real DAX storage instead.
*/
-static vm_fault_t dax_load_hole(struct xa_state *xas,
- struct address_space *mapping, void **entry,
- struct vm_fault *vmf)
+static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
+ const struct iomap_iter *iter, void **entry)
{
- struct inode *inode = mapping->host;
+ struct inode *inode = iter->inode;
unsigned long vaddr = vmf->address;
pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
vm_fault_t ret;
- *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
- DAX_ZERO_PAGE, false);
+ *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);
ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
trace_dax_load_hole(inode, vmf, ret);
@@ -1068,7 +1163,7 @@ static vm_fault_t dax_load_hole(struct xa_state *xas,
#ifdef CONFIG_FS_DAX_PMD
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
- const struct iomap *iomap, void **entry)
+ const struct iomap_iter *iter, void **entry)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
unsigned long pmd_addr = vmf->address & PMD_MASK;
@@ -1086,8 +1181,8 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
goto fallback;
pfn = page_to_pfn_t(zero_page);
- *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
- DAX_PMD | DAX_ZERO_PAGE, false);
+ *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
+ DAX_PMD | DAX_ZERO_PAGE);
if (arch_needs_pgtable_deposit()) {
pgtable = pte_alloc_one(vma->vm_mm);
@@ -1120,63 +1215,120 @@ fallback:
}
#else
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
- const struct iomap *iomap, void **entry)
+ const struct iomap_iter *iter, void **entry)
{
return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */
-s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
+static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
{
- sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
- pgoff_t pgoff;
- long rc, id;
- void *kaddr;
- bool page_aligned = false;
+ const struct iomap *iomap = &iter->iomap;
+ const struct iomap *srcmap = iomap_iter_srcmap(iter);
unsigned offset = offset_in_page(pos);
- unsigned size = min_t(u64, PAGE_SIZE - offset, length);
+ pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
+ void *kaddr;
+ long ret;
+
+ ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr,
+ NULL);
+ if (ret < 0)
+ return ret;
+ memset(kaddr + offset, 0, size);
+ if (srcmap->addr != iomap->addr) {
+ ret = dax_iomap_cow_copy(pos, size, PAGE_SIZE, srcmap,
+ kaddr);
+ if (ret < 0)
+ return ret;
+ dax_flush(iomap->dax_dev, kaddr, PAGE_SIZE);
+ } else
+ dax_flush(iomap->dax_dev, kaddr + offset, size);
+ return ret;
+}
- if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) &&
- (size == PAGE_SIZE))
- page_aligned = true;
+static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
+{
+ const struct iomap *iomap = &iter->iomap;
+ const struct iomap *srcmap = iomap_iter_srcmap(iter);
+ loff_t pos = iter->pos;
+ u64 length = iomap_length(iter);
+ s64 written = 0;
+
+ /* already zeroed? we're done. */
+ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
+ return length;
+
+ do {
+ unsigned offset = offset_in_page(pos);
+ unsigned size = min_t(u64, PAGE_SIZE - offset, length);
+ pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
+ long rc;
+ int id;
+
+ id = dax_read_lock();
+ if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
+ rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
+ else
+ rc = dax_memzero(iter, pos, size);
+ dax_read_unlock(id);
- rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff);
- if (rc)
- return rc;
+ if (rc < 0)
+ return rc;
+ pos += size;
+ length -= size;
+ written += size;
+ } while (length > 0);
- id = dax_read_lock();
+ if (did_zero)
+ *did_zero = true;
+ return written;
+}
- if (page_aligned)
- rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
- else
- rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL);
- if (rc < 0) {
- dax_read_unlock(id);
- return rc;
- }
+int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
+ const struct iomap_ops *ops)
+{
+ struct iomap_iter iter = {
+ .inode = inode,
+ .pos = pos,
+ .len = len,
+ .flags = IOMAP_DAX | IOMAP_ZERO,
+ };
+ int ret;
- if (!page_aligned) {
- memset(kaddr + offset, 0, size);
- dax_flush(iomap->dax_dev, kaddr + offset, size);
- }
- dax_read_unlock(id);
- return size;
+ while ((ret = iomap_iter(&iter, ops)) > 0)
+ iter.processed = dax_zero_iter(&iter, did_zero);
+ return ret;
}
+EXPORT_SYMBOL_GPL(dax_zero_range);
+
+int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
+ const struct iomap_ops *ops)
+{
+ unsigned int blocksize = i_blocksize(inode);
+ unsigned int off = pos & (blocksize - 1);
+
+ /* Block boundary? Nothing to do */
+ if (!off)
+ return 0;
+ return dax_zero_range(inode, pos, blocksize - off, did_zero, ops);
+}
+EXPORT_SYMBOL_GPL(dax_truncate_page);
static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
struct iov_iter *iter)
{
const struct iomap *iomap = &iomi->iomap;
+ const struct iomap *srcmap = &iomi->srcmap;
loff_t length = iomap_length(iomi);
loff_t pos = iomi->pos;
- struct block_device *bdev = iomap->bdev;
struct dax_device *dax_dev = iomap->dax_dev;
loff_t end = pos + length, done = 0;
+ bool write = iov_iter_rw(iter) == WRITE;
ssize_t ret = 0;
size_t xfer;
int id;
- if (iov_iter_rw(iter) == READ) {
+ if (!write) {
end = min(end, i_size_read(iomi->inode));
if (pos >= end)
return 0;
@@ -1185,7 +1337,12 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
return iov_iter_zero(min(length, end - pos), iter);
}
- if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
+ /*
+ * In DAX mode, enforce either pure overwrites of written extents, or
+ * writes to unwritten extents as part of a copy-on-write operation.
+ */
+ if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED &&
+ !(iomap->flags & IOMAP_F_SHARED)))
return -EIO;
/*
@@ -1203,9 +1360,9 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
while (pos < end) {
unsigned offset = pos & (PAGE_SIZE - 1);
const size_t size = ALIGN(length + offset, PAGE_SIZE);
- const sector_t sector = dax_iomap_sector(iomap, pos);
+ pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
ssize_t map_len;
- pgoff_t pgoff;
+ bool recovery = false;
void *kaddr;
if (fatal_signal_pending(current)) {
@@ -1213,29 +1370,38 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
break;
}
- ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
- if (ret)
- break;
-
map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
- &kaddr, NULL);
+ DAX_ACCESS, &kaddr, NULL);
+ if (map_len == -EIO && iov_iter_rw(iter) == WRITE) {
+ map_len = dax_direct_access(dax_dev, pgoff,
+ PHYS_PFN(size), DAX_RECOVERY_WRITE,
+ &kaddr, NULL);
+ if (map_len > 0)
+ recovery = true;
+ }
if (map_len < 0) {
ret = map_len;
break;
}
+ if (write &&
+ srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) {
+ ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap,
+ kaddr);
+ if (ret)
+ break;
+ }
+
map_len = PFN_PHYS(map_len);
kaddr += offset;
map_len -= offset;
if (map_len > end - pos)
map_len = end - pos;
- /*
- * The userspace address for the memory copy has already been
- * validated via access_ok() in either vfs_read() or
- * vfs_write(), depending on which operation we are doing.
- */
- if (iov_iter_rw(iter) == WRITE)
+ if (recovery)
+ xfer = dax_recovery_write(dax_dev, pgoff, kaddr,
+ map_len, iter);
+ else if (write)
xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
map_len, iter);
else
@@ -1274,10 +1440,14 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
.inode = iocb->ki_filp->f_mapping->host,
.pos = iocb->ki_pos,
.len = iov_iter_count(iter),
+ .flags = IOMAP_DAX,
};
loff_t done = 0;
int ret;
+ if (!iomi.len)
+ return 0;
+
if (iov_iter_rw(iter) == WRITE) {
lockdep_assert_held_write(&iomi.inode->i_rwsem);
iomi.flags |= IOMAP_WRITE;
@@ -1305,17 +1475,6 @@ static vm_fault_t dax_fault_return(int error)
}
/*
- * MAP_SYNC on a dax mapping guarantees dirty metadata is
- * flushed on write-faults (non-cow), but not read-faults.
- */
-static bool dax_fault_is_synchronous(unsigned long flags,
- struct vm_area_struct *vma, const struct iomap *iomap)
-{
- return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
- && (iomap->flags & IOMAP_F_DIRTY);
-}
-
-/*
* When handling a synchronous page fault and the inode need a fsync, we can
* insert the PTE/PMD into page tables only after that fsync happened. Skip
* insertion for now and return the pfn so that caller can insert it after the
@@ -1332,19 +1491,16 @@ static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
const struct iomap_iter *iter)
{
- sector_t sector = dax_iomap_sector(&iter->iomap, iter->pos);
- unsigned long vaddr = vmf->address;
vm_fault_t ret;
int error = 0;
switch (iter->iomap.type) {
case IOMAP_HOLE:
case IOMAP_UNWRITTEN:
- clear_user_highpage(vmf->cow_page, vaddr);
+ clear_user_highpage(vmf->cow_page, vmf->address);
break;
case IOMAP_MAPPED:
- error = copy_cow_page_dax(iter->iomap.bdev, iter->iomap.dax_dev,
- sector, vmf->cow_page, vaddr);
+ error = copy_cow_page_dax(vmf, iter);
break;
default:
WARN_ON_ONCE(1);
@@ -1375,15 +1531,15 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
const struct iomap_iter *iter, pfn_t *pfnp,
struct xa_state *xas, void **entry, bool pmd)
{
- struct address_space *mapping = vmf->vma->vm_file->f_mapping;
const struct iomap *iomap = &iter->iomap;
+ const struct iomap *srcmap = &iter->srcmap;
size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
- bool write = vmf->flags & FAULT_FLAG_WRITE;
- bool sync = dax_fault_is_synchronous(iter->flags, vmf->vma, iomap);
+ bool write = iter->flags & IOMAP_WRITE;
unsigned long entry_flags = pmd ? DAX_PMD : 0;
int err = 0;
pfn_t pfn;
+ void *kaddr;
if (!pmd && vmf->cow_page)
return dax_fault_cow_page(vmf, iter);
@@ -1392,23 +1548,29 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
if (!write &&
(iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) {
if (!pmd)
- return dax_load_hole(xas, mapping, entry, vmf);
- return dax_pmd_load_hole(xas, vmf, iomap, entry);
+ return dax_load_hole(xas, vmf, iter, entry);
+ return dax_pmd_load_hole(xas, vmf, iter, entry);
}
- if (iomap->type != IOMAP_MAPPED) {
+ if (iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED)) {
WARN_ON_ONCE(1);
return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
}
- err = dax_iomap_pfn(&iter->iomap, pos, size, &pfn);
+ err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn);
if (err)
return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);
- *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, entry_flags,
- write && !sync);
+ *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags);
+
+ if (write &&
+ srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) {
+ err = dax_iomap_cow_copy(pos, size, size, srcmap, kaddr);
+ if (err)
+ return dax_fault_return(err);
+ }
- if (sync)
+ if (dax_fault_is_synchronous(iter, vmf->vma))
return dax_fault_synchronous_pfnp(pfnp, pfn);
/* insert PMD pfn */
@@ -1430,7 +1592,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
.inode = mapping->host,
.pos = (loff_t)vmf->pgoff << PAGE_SHIFT,
.len = PAGE_SIZE,
- .flags = IOMAP_FAULT,
+ .flags = IOMAP_DAX | IOMAP_FAULT,
};
vm_fault_t ret = 0;
void *entry;
@@ -1539,7 +1701,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
struct iomap_iter iter = {
.inode = mapping->host,
.len = PMD_SIZE,
- .flags = IOMAP_FAULT,
+ .flags = IOMAP_DAX | IOMAP_FAULT,
};
vm_fault_t ret = VM_FAULT_FALLBACK;
pgoff_t max_pgoff;
@@ -1714,3 +1876,85 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
return dax_insert_pfn_mkwrite(vmf, pfn, order);
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
+
+static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
+ struct iomap_iter *it_dest, u64 len, bool *same)
+{
+ const struct iomap *smap = &it_src->iomap;
+ const struct iomap *dmap = &it_dest->iomap;
+ loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
+ void *saddr, *daddr;
+ int id, ret;
+
+ len = min(len, min(smap->length, dmap->length));
+
+ if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
+ *same = true;
+ return len;
+ }
+
+ if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
+ *same = false;
+ return 0;
+ }
+
+ id = dax_read_lock();
+ ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE),
+ &saddr, NULL);
+ if (ret < 0)
+ goto out_unlock;
+
+ ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE),
+ &daddr, NULL);
+ if (ret < 0)
+ goto out_unlock;
+
+ *same = !memcmp(saddr, daddr, len);
+ if (!*same)
+ len = 0;
+ dax_read_unlock(id);
+ return len;
+
+out_unlock:
+ dax_read_unlock(id);
+ return -EIO;
+}
+
+int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+ struct inode *dst, loff_t dstoff, loff_t len, bool *same,
+ const struct iomap_ops *ops)
+{
+ struct iomap_iter src_iter = {
+ .inode = src,
+ .pos = srcoff,
+ .len = len,
+ .flags = IOMAP_DAX,
+ };
+ struct iomap_iter dst_iter = {
+ .inode = dst,
+ .pos = dstoff,
+ .len = len,
+ .flags = IOMAP_DAX,
+ };
+ int ret;
+
+ while ((ret = iomap_iter(&src_iter, ops)) > 0) {
+ while ((ret = iomap_iter(&dst_iter, ops)) > 0) {
+ dst_iter.processed = dax_range_compare_iter(&src_iter,
+ &dst_iter, len, same);
+ }
+ if (ret <= 0)
+ src_iter.processed = ret;
+ }
+ return ret;
+}
+
+int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *len, unsigned int remap_flags,
+ const struct iomap_ops *ops)
+{
+ return __generic_remap_file_range_prep(file_in, pos_in, file_out,
+ pos_out, len, remap_flags, ops);
+}
+EXPORT_SYMBOL_GPL(dax_remap_file_range_prep);