From bb523b406c849eef8f265a07cd7f320f1f177743 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 2 Aug 2021 13:44:20 +0200 Subject: gup: Turn fault_in_pages_{readable,writeable} into fault_in_{readable,writeable} Turn fault_in_pages_{readable,writeable} into versions that return the number of bytes not faulted in, similar to copy_to_user, instead of returning a non-zero value when any of the requested pages couldn't be faulted in. This supports the existing users that require all pages to be faulted in as well as new users that are happy if any pages can be faulted in. Rename the functions to fault_in_{readable,writeable} to make sure this change doesn't silently break things. Neither of these functions is entirely trivial and it doesn't seem useful to inline them, so move them to mm/gup.c. Signed-off-by: Andreas Gruenbacher --- include/linux/pagemap.h | 57 +++---------------------------------------------- 1 file changed, 3 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 62db6b0176b9..9fe94f7a4f7e 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -733,61 +733,10 @@ int wait_on_page_private_2_killable(struct page *page); extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter); /* - * Fault everything in given userspace address range in. + * Fault in userspace address range. */ -static inline int fault_in_pages_writeable(char __user *uaddr, size_t size) -{ - char __user *end = uaddr + size - 1; - - if (unlikely(size == 0)) - return 0; - - if (unlikely(uaddr > end)) - return -EFAULT; - /* - * Writing zeroes into userspace here is OK, because we know that if - * the zero gets there, we'll be overwriting it. - */ - do { - if (unlikely(__put_user(0, uaddr) != 0)) - return -EFAULT; - uaddr += PAGE_SIZE; - } while (uaddr <= end); - - /* Check whether the range spilled into the next page. */ - if (((unsigned long)uaddr & PAGE_MASK) == - ((unsigned long)end & PAGE_MASK)) - return __put_user(0, end); - - return 0; -} - -static inline int fault_in_pages_readable(const char __user *uaddr, size_t size) -{ - volatile char c; - const char __user *end = uaddr + size - 1; - - if (unlikely(size == 0)) - return 0; - - if (unlikely(uaddr > end)) - return -EFAULT; - - do { - if (unlikely(__get_user(c, uaddr) != 0)) - return -EFAULT; - uaddr += PAGE_SIZE; - } while (uaddr <= end); - - /* Check whether the range spilled into the next page. */ - if (((unsigned long)uaddr & PAGE_MASK) == - ((unsigned long)end & PAGE_MASK)) { - return __get_user(c, end); - } - - (void)c; - return 0; -} +size_t fault_in_writeable(char __user *uaddr, size_t size); +size_t fault_in_readable(const char __user *uaddr, size_t size); int add_to_page_cache_locked(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); -- cgit v1.2.3-59-g8ed1b From a6294593e8a1290091d0b078d5d33da5e0cd3dfe Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 2 Aug 2021 14:54:16 +0200 Subject: iov_iter: Turn iov_iter_fault_in_readable into fault_in_iov_iter_readable Turn iov_iter_fault_in_readable into a function that returns the number of bytes not faulted in, similar to copy_to_user, instead of returning a non-zero value when any of the requested pages couldn't be faulted in. This supports the existing users that require all pages to be faulted in as well as new users that are happy if any pages can be faulted in. 
Rename iov_iter_fault_in_readable to fault_in_iov_iter_readable to make sure this change doesn't silently break things. Signed-off-by: Andreas Gruenbacher --- fs/btrfs/file.c | 2 +- fs/f2fs/file.c | 2 +- fs/fuse/file.c | 2 +- fs/iomap/buffered-io.c | 2 +- fs/ntfs/file.c | 2 +- fs/ntfs3/file.c | 2 +- include/linux/uio.h | 2 +- lib/iov_iter.c | 33 +++++++++++++++++++++------------ mm/filemap.c | 2 +- 9 files changed, 29 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7ff577005d0f..f37211d3bb69 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1710,7 +1710,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, * Fault pages before locking them in prepare_pages * to avoid recursive lock */ - if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) { + if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) { ret = -EFAULT; break; } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9c8ef33bd8d3..eb971e1e7227 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4276,7 +4276,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) size_t target_size = 0; int err; - if (iov_iter_fault_in_readable(from, iov_iter_count(from))) + if (fault_in_iov_iter_readable(from, iov_iter_count(from))) set_inode_flag(inode, FI_NO_PREALLOC); if ((iocb->ki_flags & IOCB_NOWAIT)) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 11404f8c21c7..4b6d8e13322d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1164,7 +1164,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, again: err = -EFAULT; - if (iov_iter_fault_in_readable(ii, bytes)) + if (fault_in_iov_iter_readable(ii, bytes)) break; err = -ENOMEM; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 9cc5798423d1..1753c26c8e76 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -750,7 +750,7 @@ again: * same page as we're writing to, without it being marked * up-to-date. */ - if (unlikely(iov_iter_fault_in_readable(i, bytes))) { + if (unlikely(fault_in_iov_iter_readable(i, bytes))) { status = -EFAULT; break; } diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index ab4f3362466d..a43adeacd930 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1829,7 +1829,7 @@ again: * pages being swapped out between us bringing them into memory * and doing the actual copying. 
*/ - if (unlikely(iov_iter_fault_in_readable(i, bytes))) { + if (unlikely(fault_in_iov_iter_readable(i, bytes))) { status = -EFAULT; break; } diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 424450e77ad5..a52388387175 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -987,7 +987,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) frame_vbo = pos & ~(frame_size - 1); index = frame_vbo >> PAGE_SHIFT; - if (unlikely(iov_iter_fault_in_readable(from, bytes))) { + if (unlikely(fault_in_iov_iter_readable(from, bytes))) { err = -EFAULT; goto out; } diff --git a/include/linux/uio.h b/include/linux/uio.h index 207101a9c5c3..d18458af6681 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -133,7 +133,7 @@ size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes, struct iov_iter *i); void iov_iter_advance(struct iov_iter *i, size_t bytes); void iov_iter_revert(struct iov_iter *i, size_t bytes); -int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes); +size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes); size_t iov_iter_single_seg_count(const struct iov_iter *i); size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index c88908f0f138..ce3d4f610626 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -430,33 +430,42 @@ out: } /* + * fault_in_iov_iter_readable - fault in iov iterator for reading + * @i: iterator + * @size: maximum length + * * Fault in one or more iovecs of the given iov_iter, to a maximum length of - * bytes. For each iovec, fault in each page that constitutes the iovec. + * @size. For each iovec, fault in each page that constitutes the iovec. + * + * Returns the number of bytes not faulted in (like copy_to_user() and + * copy_from_user()). * - * Return 0 on success, or non-zero if the memory could not be accessed (i.e. - * because it is an invalid address). + * Always returns 0 for non-userspace iterators. */ -int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes) +size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size) { if (iter_is_iovec(i)) { + size_t count = min(size, iov_iter_count(i)); const struct iovec *p; size_t skip; - if (bytes > i->count) - bytes = i->count; - for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) { - size_t len = min(bytes, p->iov_len - skip); + size -= count; + for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) { + size_t len = min(count, p->iov_len - skip); + size_t ret; if (unlikely(!len)) continue; - if (fault_in_readable(p->iov_base + skip, len)) - return -EFAULT; - bytes -= len; + ret = fault_in_readable(p->iov_base + skip, len); + count -= len - ret; + if (ret) + break; } + return count + size; } return 0; } -EXPORT_SYMBOL(iov_iter_fault_in_readable); +EXPORT_SYMBOL(fault_in_iov_iter_readable); void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, diff --git a/mm/filemap.c b/mm/filemap.c index ff34f4087f87..4dd5edcd39fd 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3757,7 +3757,7 @@ again: * same page as we're writing to, without it being marked * up-to-date. 
*/ - if (unlikely(iov_iter_fault_in_readable(i, bytes))) { + if (unlikely(fault_in_iov_iter_readable(i, bytes))) { status = -EFAULT; break; } -- cgit v1.2.3-59-g8ed1b
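To illustrate the new calling convention: existing callers still treat any nonzero return as -EFAULT, but a caller that can make partial progress may instead trim its transfer to what was actually faulted in. A minimal sketch (the surrounding copy loop and the variable names are assumptions for illustration, not code from this series):

	size_t bytes = iov_iter_count(i);
	size_t not_faulted;

	/* Now returns the number of bytes NOT faulted in, like
	 * copy_from_user(), rather than a simple pass/fail value. */
	not_faulted = fault_in_iov_iter_readable(i, bytes);
	if (not_faulted == bytes)
		return -EFAULT;		/* nothing could be faulted in */
	bytes -= not_faulted;		/* proceed with the resident prefix */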
From cdd591fc86e38ad3899196066219fbbd845f3162 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 5 Jul 2021 17:26:28 +0200 Subject: iov_iter: Introduce fault_in_iov_iter_writeable Introduce a new fault_in_iov_iter_writeable helper for safely faulting in an iterator for writing. It uses get_user_pages() to fault in the pages without actually writing to them, since writing would be destructive. We'll use fault_in_iov_iter_writeable in gfs2 once we've determined that the buffer behind the iterator passed to ->read_iter isn't in memory. Signed-off-by: Andreas Gruenbacher --- include/linux/pagemap.h | 1 + include/linux/uio.h | 1 + lib/iov_iter.c | 39 ++++++++++++++++++++++++++++++ mm/gup.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 104 insertions(+) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 9fe94f7a4f7e..2f7dd14083d9 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -736,6 +736,7 @@ extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter); * Fault in userspace address range. */ size_t fault_in_writeable(char __user *uaddr, size_t size); +size_t fault_in_safe_writeable(const char __user *uaddr, size_t size); size_t fault_in_readable(const char __user *uaddr, size_t size); int add_to_page_cache_locked(struct page *page, struct address_space *mapping, diff --git a/include/linux/uio.h b/include/linux/uio.h index d18458af6681..25d1c24fd829 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -134,6 +134,7 @@ size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, void iov_iter_advance(struct iov_iter *i, size_t bytes); void iov_iter_revert(struct iov_iter *i, size_t bytes); size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes); +size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t bytes); size_t iov_iter_single_seg_count(const struct iov_iter *i); size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index ce3d4f610626..ac9a87e727a3 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -467,6 +467,45 @@ size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size) } EXPORT_SYMBOL(fault_in_iov_iter_readable); +/* + * fault_in_iov_iter_writeable - fault in iov iterator for writing + * @i: iterator + * @size: maximum length + * + * Faults in the iterator using get_user_pages(), i.e., without triggering + * hardware page faults. This is primarily useful when we already know that + * some or all of the pages in @i aren't in memory. + * + * Returns the number of bytes not faulted in, like copy_to_user() and + * copy_from_user(). + * + * Always returns 0 for non-userspace iterators. + */ +size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size) +{ + if (iter_is_iovec(i)) { + size_t count = min(size, iov_iter_count(i)); + const struct iovec *p; + size_t skip; + + size -= count; + for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) { + size_t len = min(count, p->iov_len - skip); + size_t ret; + + if (unlikely(!len)) + continue; + ret = fault_in_safe_writeable(p->iov_base + skip, len); + count -= len - ret; + if (ret) + break; + } + return count + size; + } + return 0; +} +EXPORT_SYMBOL(fault_in_iov_iter_writeable); + void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, size_t count) diff --git a/mm/gup.c b/mm/gup.c index a7efb027d6cf..795f15c410cc 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1691,6 +1691,69 @@ out: } EXPORT_SYMBOL(fault_in_writeable); +/* + * fault_in_safe_writeable - fault in an address range for writing + * @uaddr: start of address range + * @size: length of address range + * + * Faults in an address range using get_user_pages, i.e., without triggering + * hardware page faults. This is primarily useful when we already know that + * some or all of the pages in the address range aren't in memory. + * + * Unlike fault_in_writeable(), this function is non-destructive. + * + * Note that we don't pin or otherwise hold the pages we fault in. There's + * no guarantee that they'll stay in memory for any duration of time. + * + * Returns the number of bytes not faulted in, like copy_to_user() and + * copy_from_user(). + */ +size_t fault_in_safe_writeable(const char __user *uaddr, size_t size) +{ + unsigned long start = (unsigned long)untagged_addr(uaddr); + unsigned long end, nstart, nend; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = NULL; + int locked = 0; + + nstart = start & PAGE_MASK; + end = PAGE_ALIGN(start + size); + if (end < nstart) + end = 0; + for (; nstart != end; nstart = nend) { + unsigned long nr_pages; + long ret; + + if (!locked) { + locked = 1; + mmap_read_lock(mm); + vma = find_vma(mm, nstart); + } else if (nstart >= vma->vm_end) + vma = vma->vm_next; + if (!vma || vma->vm_start >= end) + break; + nend = end ? min(end, vma->vm_end) : vma->vm_end; + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + continue; + if (nstart < vma->vm_start) + nstart = vma->vm_start; + nr_pages = (nend - nstart) / PAGE_SIZE; + ret = __get_user_pages_locked(mm, nstart, nr_pages, + NULL, NULL, &locked, + FOLL_TOUCH | FOLL_WRITE); + if (ret <= 0) + break; + nend = nstart + ret * PAGE_SIZE; + } + if (locked) + mmap_read_unlock(mm); + if (nstart == end) + return 0; + return size - min_t(size_t, nstart - start, size); +} +EXPORT_SYMBOL(fault_in_safe_writeable); + /** * fault_in_readable - fault in userspace address range for reading * @uaddr: start of user address range -- cgit v1.2.3-59-g8ed1b
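As a sketch of the intended use (the fast-path helper, its caller, and the retry policy are assumptions for illustration; only the fault-in call is from this series): a read path that copies to user memory with page faults disabled can fall back to fault_in_iov_iter_writeable() and retry, without zeroing the destination buffer the way fault_in_writeable() would:

	ssize_t ret;

retry:
	pagefault_disable();
	ret = copy_data_to_iter(to);		/* hypothetical fast path */
	pagefault_enable();
	if (ret == -EFAULT) {
		size_t left = iov_iter_count(to);

		/* Faults the pages in via get_user_pages() instead of
		 * writing zeroes into them. */
		if (fault_in_iov_iter_writeable(to, left) < left)
			goto retry;	/* at least one byte is now resident */
	}
	return ret;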
From 97308f8b0d867e9ef59528cd97f0db55ffdf5651 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 23 Jul 2021 01:59:41 +0200 Subject: iomap: Support partial direct I/O on user copy failures In iomap_dio_rw, when iomap_apply returns an -EFAULT error and the IOMAP_DIO_PARTIAL flag is set, complete the request synchronously and return a partial result. This allows the caller to deal with the page fault and retry the remainder of the request. Signed-off-by: Andreas Gruenbacher Reviewed-by: Darrick J. Wong --- fs/iomap/direct-io.c | 6 ++++++ include/linux/iomap.h | 7 +++++++ 2 files changed, 13 insertions(+) (limited to 'include') diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index a2a368e824c0..a434fb7887b2 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -581,6 +581,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (iov_iter_rw(iter) == READ && iomi.pos >= dio->i_size) iov_iter_revert(iter, iomi.pos - dio->i_size); + if (ret == -EFAULT && dio->size && (dio_flags & IOMAP_DIO_PARTIAL)) { + if (!(iocb->ki_flags & IOCB_NOWAIT)) + wait_for_completion = true; + ret = 0; + } + /* magic error code to fall back to buffered I/O */ if (ret == -ENOTBLK) { wait_for_completion = true; diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 24f8489583ca..2a213b0d1e1f 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -330,6 +330,13 @@ struct iomap_dio_ops { */ #define IOMAP_DIO_OVERWRITE_ONLY (1 << 1) +/* + * When a page fault occurs, return a partial synchronous result and allow + * the caller to retry the rest of the operation after dealing with the page + * fault. + */ +#define IOMAP_DIO_PARTIAL (1 << 2) + ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags); -- cgit v1.2.3-59-g8ed1b
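A caller opting in might look roughly like this (a sketch; my_iomap_ops and the locking a real filesystem needs around the retry are placeholders, not code from this series). Note that at this point in the series a resumed request still reports only the bytes of the final submission; the next patch adds the missing piece:

	ssize_t ret;
	size_t left;

	ret = iomap_dio_rw(iocb, from, &my_iomap_ops, NULL, IOMAP_DIO_PARTIAL);
	/* With IOMAP_DIO_PARTIAL, a page fault mid-request completes the
	 * submitted part synchronously: ret > 0 with a non-empty iterator.
	 * -EFAULT still means that no progress was made at all. */
	if (ret == -EFAULT) {
		left = iov_iter_count(from);
		if (fault_in_iov_iter_readable(from, left) < left)
			ret = iomap_dio_rw(iocb, from, &my_iomap_ops, NULL,
					   IOMAP_DIO_PARTIAL);
	}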
From 4fdccaa0d184c202f98d73b24e3ec8eeee88ab8d Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sat, 24 Jul 2021 12:26:41 +0200 Subject: iomap: Add done_before argument to iomap_dio_rw Add a done_before argument to iomap_dio_rw that indicates how much of the request has already been transferred. When the request succeeds, we report that done_before additional bytes were transferred. This is useful for finishing a request asynchronously when part of the request has already been completed synchronously. We'll use that to allow iomap_dio_rw to be used with page faults disabled: when a page fault occurs while submitting a request, we synchronously complete the part of the request that has already been submitted. The caller can then take care of the page fault and call iomap_dio_rw again for the rest of the request, passing in the number of bytes already transferred. Signed-off-by: Andreas Gruenbacher Reviewed-by: Darrick J. Wong --- fs/btrfs/file.c | 5 +++-- fs/erofs/data.c | 2 +- fs/ext4/file.c | 5 +++-- fs/gfs2/file.c | 4 ++-- fs/iomap/direct-io.c | 19 ++++++++++++++++--- fs/xfs/xfs_file.c | 6 +++--- fs/zonefs/super.c | 4 ++-- include/linux/iomap.h | 4 ++-- 8 files changed, 32 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f37211d3bb69..9d41b28c67ba 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1957,7 +1957,7 @@ relock: } dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - 0); + 0, 0); btrfs_inode_unlock(inode, ilock_flags); @@ -3658,7 +3658,8 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) return 0; btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); - ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0); + ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + 0, 0); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return ret; } diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 9db829715652..16a41d0db55a 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -287,7 +287,7 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (!err) return iomap_dio_rw(iocb, to, &erofs_iomap_ops, - NULL, 0); + NULL, 0, 0); if (err < 0) return err; } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index ac0e11bbb445..b25c1f8f7c4f 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -74,7 +74,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) return generic_file_read_iter(iocb, to); } - ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0); + ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, 0); inode_unlock_shared(inode); file_accessed(iocb->ki_filp); @@ -566,7 +566,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ilock_shared) iomap_ops = &ext4_iomap_overwrite_ops; ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, - (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0); + (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0, + 0); if (ret == -ENOTBLK) ret = 0; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index d9126e3e6dd6..f772ee0fcae3 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -822,7 +822,7 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to, if (ret) goto out_uninit; - ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0); + ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0, 0); gfs2_glock_dq(gh); out_uninit: gfs2_holder_uninit(gh); @@ -856,7 +856,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, if (offset + len > i_size_read(&ip->i_inode)) goto out; - ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0); + ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0, 0); if (ret == -ENOTBLK) ret = 0; out: diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index a434fb7887b2..468dcbba45bc 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -31,6 +31,7 @@ struct iomap_dio { atomic_t ref; unsigned flags; int error; + size_t done_before; bool wait_for_completion; union { @@ -124,6 +125,9 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC)) ret = generic_write_sync(iocb, ret); + if (ret > 0) + ret += dio->done_before; + kfree(dio); return ret; @@ -450,13 +454,21 @@ static loff_t iomap_dio_iter(const struct iomap_iter *iter, * may be pure data writes. In that case, we still need to do a full data sync * completion.
* + * When page faults are disabled and @dio_flags includes IOMAP_DIO_PARTIAL, + * __iomap_dio_rw can return a partial result if it encounters a non-resident + * page in @iter after preparing a transfer. In that case, the non-resident + * pages can be faulted in and the request resumed with @done_before set to the + * number of bytes previously transferred. The request will then complete with + * the correct total number of bytes transferred; this is essential for + * completing partial requests asynchronously. + * * Returns -ENOTBLK in case of a page invalidation failure for * writes. The caller needs to fall back to buffered I/O in this case. */ struct iomap_dio * __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, - unsigned int dio_flags) + unsigned int dio_flags, size_t done_before) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = file_inode(iocb->ki_filp); @@ -486,6 +498,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->dops = dops; dio->error = 0; dio->flags = 0; + dio->done_before = done_before; dio->submit.iter = iter; dio->submit.waiter = current; @@ -652,11 +665,11 @@ EXPORT_SYMBOL_GPL(__iomap_dio_rw); ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, - unsigned int dio_flags) + unsigned int dio_flags, size_t done_before) { struct iomap_dio *dio; - dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags); + dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, done_before); if (IS_ERR_OR_NULL(dio)) return PTR_ERR_OR_ZERO(dio); return iomap_dio_complete(dio); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 7aa943edfc02..240eb932c014 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -259,7 +259,7 @@ xfs_file_dio_read( ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); if (ret) return ret; - ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0); + ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, 0); xfs_iunlock(ip, XFS_IOLOCK_SHARED); return ret; @@ -569,7 +569,7 @@ xfs_file_dio_write_aligned( } trace_xfs_file_direct_write(iocb, from); ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, - &xfs_dio_write_ops, 0); + &xfs_dio_write_ops, 0, 0); out_unlock: if (iolock) xfs_iunlock(ip, iolock); @@ -647,7 +647,7 @@ retry_exclusive: trace_xfs_file_direct_write(iocb, from); ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, - &xfs_dio_write_ops, flags); + &xfs_dio_write_ops, flags, 0); /* * Retry unaligned I/O with exclusive blocking semantics if the DIO diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index ddc346a9df9b..6122c38ab44d 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -852,7 +852,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) ret = zonefs_file_dio_append(iocb, from); else ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops, - &zonefs_write_dio_ops, 0); + &zonefs_write_dio_ops, 0, 0); if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && (ret > 0 || ret == -EIOCBQUEUED)) { if (ret > 0) @@ -987,7 +987,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } file_accessed(iocb->ki_filp); ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops, - &zonefs_read_dio_ops, 0); + &zonefs_read_dio_ops, 0, 0); } else { ret = generic_file_read_iter(iocb, to); if (ret == -EIO) diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 2a213b0d1e1f..829f2325ecba ---
a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -339,10 +339,10 @@ struct iomap_dio_ops { ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, - unsigned int dio_flags); + unsigned int dio_flags, size_t done_before); struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, - unsigned int dio_flags); + unsigned int dio_flags, size_t done_before); ssize_t iomap_dio_complete(struct iomap_dio *dio); int iomap_dio_iopoll(struct kiocb *kiocb, bool spin); -- cgit v1.2.3-59-g8ed1b From 55b8fe703bc51200d4698596c90813453b35ae63 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 17 Aug 2021 22:52:08 +0200 Subject: gup: Introduce FOLL_NOFAULT flag to disable page faults Introduce a new FOLL_NOFAULT flag that causes get_user_pages to return -EFAULT when it would otherwise trigger a page fault. This is roughly similar to FOLL_FAST_ONLY but available on all architectures, and less fragile. Signed-off-by: Andreas Gruenbacher --- include/linux/mm.h | 3 ++- mm/gup.c | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 73a52aba448f..2f0e6b9f8f3b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2851,7 +2851,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ #define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO * and return without waiting upon it */ -#define FOLL_POPULATE 0x40 /* fault in page */ +#define FOLL_POPULATE 0x40 /* fault in pages (with FOLL_MLOCK) */ +#define FOLL_NOFAULT 0x80 /* do not fault in pages */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ diff --git a/mm/gup.c b/mm/gup.c index 795f15c410cc..e1c7e4bde11f 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -918,6 +918,8 @@ static int faultin_page(struct vm_area_struct *vma, /* mlock all present pages, but do not fault in new pages */ if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK) return -ENOENT; + if (*flags & FOLL_NOFAULT) + return -EFAULT; if (*flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; if (*flags & FOLL_REMOTE) @@ -2843,7 +2845,7 @@ static int internal_get_user_pages_fast(unsigned long start, if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | FOLL_FORCE | FOLL_PIN | FOLL_GET | - FOLL_FAST_ONLY))) + FOLL_FAST_ONLY | FOLL_NOFAULT))) return -EINVAL; if (gup_flags & FOLL_PIN) -- cgit v1.2.3-59-g8ed1b
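In isolation, the flag behaves as follows (a sketch; addr standing for a valid user address is an assumption for illustration):

	struct page *page;
	int n;

	/* Without FOLL_NOFAULT, a non-resident page would be faulted in
	 * here; with it, gup gives up instead and the call returns a
	 * short count or -EFAULT. */
	n = get_user_pages_fast(addr, 1, FOLL_WRITE | FOLL_NOFAULT, &page);
	if (n < 1)
		return -EFAULT;		/* fault the page in, then retry */
	put_page(page);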
From 3337ab08d08b1a375f88471d9c8b1cac968cb054 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 12 Jul 2021 12:06:14 +0200 Subject: iov_iter: Introduce nofault flag to disable page faults Introduce a new nofault flag to indicate to iov_iter_get_pages not to fault in user pages. This is implemented by passing the FOLL_NOFAULT flag to get_user_pages, which causes get_user_pages to fail when it would otherwise fault in a page. We'll use the ->nofault flag to prevent iomap_dio_rw from faulting in pages when page faults are not allowed. Signed-off-by: Andreas Gruenbacher --- include/linux/uio.h | 1 + lib/iov_iter.c | 20 +++++++++++++++----- 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/uio.h b/include/linux/uio.h index 25d1c24fd829..6350354f97e9 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -35,6 +35,7 @@ struct iov_iter_state { struct iov_iter { u8 iter_type; + bool nofault; bool data_source; size_t iov_offset; size_t count; diff --git a/lib/iov_iter.c b/lib/iov_iter.c index ac9a87e727a3..66a740e6e153 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -513,6 +513,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction, WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter) { .iter_type = ITER_IOVEC, + .nofault = false, .data_source = direction, .iov = iov, .nr_segs = nr_segs, @@ -1527,13 +1528,17 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, return 0; if (likely(iter_is_iovec(i))) { + unsigned int gup_flags = 0; unsigned long addr; + if (iov_iter_rw(i) != WRITE) + gup_flags |= FOLL_WRITE; + if (i->nofault) + gup_flags |= FOLL_NOFAULT; + addr = first_iovec_segment(i, &len, start, maxsize, maxpages); n = DIV_ROUND_UP(len, PAGE_SIZE); - res = get_user_pages_fast(addr, n, - iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, - pages); + res = get_user_pages_fast(addr, n, gup_flags, pages); if (unlikely(res <= 0)) return res; return (res == n ? len : res * PAGE_SIZE) - *start; @@ -1649,15 +1654,20 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, return 0; if (likely(iter_is_iovec(i))) { + unsigned int gup_flags = 0; unsigned long addr; + if (iov_iter_rw(i) != WRITE) + gup_flags |= FOLL_WRITE; + if (i->nofault) + gup_flags |= FOLL_NOFAULT; + addr = first_iovec_segment(i, &len, start, maxsize, ~0U); n = DIV_ROUND_UP(len, PAGE_SIZE); p = get_pages_array(n); if (!p) return -ENOMEM; - res = get_user_pages_fast(addr, n, - iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, p); + res = get_user_pages_fast(addr, n, gup_flags, p); if (unlikely(res <= 0)) { kvfree(p); *pages = NULL; -- cgit v1.2.3-59-g8ed1b
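Taken together, the series enables the retry pattern it was written for. A sketch of a direct read path in the style gfs2 is expected to adopt (the function name, my_iomap_ops, retry limiting, and the omitted inode/glock locking are placeholders, not code from this series; asynchronous completion is also ignored here):

static ssize_t my_file_direct_read(struct kiocb *iocb, struct iov_iter *to)
{
	size_t written = 0;
	ssize_t ret;

	/* Keep iomap_dio_rw from faulting in user pages itself. */
	to->nofault = true;
retry:
	ret = iomap_dio_rw(iocb, to, &my_iomap_ops, NULL,
			   IOMAP_DIO_PARTIAL, written);
	if (ret > 0)
		written = ret;	/* done_before makes this cumulative */
	if (ret == -EFAULT || (ret > 0 && iov_iter_count(to))) {
		size_t left = iov_iter_count(to);

		/* Fault the remaining pages in non-destructively (this is
		 * a read, so the buffer must not be zeroed) and resume. */
		if (fault_in_iov_iter_writeable(to, left) < left)
			goto retry;
	}
	return written ? written : ret;
}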