Diffstat (limited to 'fs/xfs/xfs_file.c')
-rw-r--r--   fs/xfs/xfs_file.c   756
1 file changed, 445 insertions(+), 311 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index b8a4a3f29b36..e462d39c840e 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -25,45 +25,46 @@
#include "xfs_iomap.h"
#include "xfs_reflink.h"
+#include <linux/dax.h>
#include <linux/falloc.h>
#include <linux/backing-dev.h>
#include <linux/mman.h>
#include <linux/fadvise.h>
+#include <linux/mount.h>
static const struct vm_operations_struct xfs_file_vm_ops;
-int
-xfs_update_prealloc_flags(
+/*
+ * Decide if the given file range is aligned to the size of the fundamental
+ * allocation unit for the file.
+ */
+static bool
+xfs_is_falloc_aligned(
struct xfs_inode *ip,
- enum xfs_prealloc_flags flags)
+ loff_t pos,
+ long long int len)
{
- struct xfs_trans *tp;
- int error;
-
- error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
- 0, 0, 0, &tp);
- if (error)
- return error;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
- if (!(flags & XFS_PREALLOC_INVISIBLE)) {
- VFS_I(ip)->i_mode &= ~S_ISUID;
- if (VFS_I(ip)->i_mode & S_IXGRP)
- VFS_I(ip)->i_mode &= ~S_ISGID;
- xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ struct xfs_mount *mp = ip->i_mount;
+ uint64_t mask;
+
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ if (!is_power_of_2(mp->m_sb.sb_rextsize)) {
+ u64 rextbytes;
+ u32 mod;
+
+ rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
+ div_u64_rem(pos, rextbytes, &mod);
+ if (mod)
+ return false;
+ div_u64_rem(len, rextbytes, &mod);
+ return mod == 0;
+ }
+ mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1;
+ } else {
+ mask = mp->m_sb.sb_blocksize - 1;
}
- if (flags & XFS_PREALLOC_SET)
- ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
- if (flags & XFS_PREALLOC_CLEAR)
- ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
-
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- if (flags & XFS_PREALLOC_SYNC)
- xfs_trans_set_sync(tp);
- return xfs_trans_commit(tp);
+ return !((pos | len) & mask);
}
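
For reference, a standalone user-space sketch of the two alignment tests above, assuming made-up geometry (a 16 KiB power-of-two unit and a 12288-byte non-power-of-two realtime extent); the helper names are illustrative and not part of this patch:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Power-of-two unit: a single mask test covers both pos and len, because
 * OR-ing them first exposes any low bit set in either value. */
static bool aligned_pow2(uint64_t pos, uint64_t len, uint64_t unit)
{
	uint64_t mask = unit - 1;

	return !((pos | len) & mask);
}

/* Arbitrary unit (e.g. a non-power-of-two rt extent size in bytes); the
 * kernel uses div_u64_rem() here, plain '%' is fine in user space. */
static bool aligned_any(uint64_t pos, uint64_t len, uint64_t unit)
{
	return pos % unit == 0 && len % unit == 0;
}

int main(void)
{
	printf("%d\n", aligned_pow2(16 << 20, 32 << 10, 16 << 10)); /* 1 */
	printf("%d\n", aligned_any(24576, 12288, 12288));           /* 1 */
	printf("%d\n", aligned_any(20480, 12288, 12288));           /* 0 */
	return 0;
}
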
/*
@@ -80,19 +81,57 @@ xfs_dir_fsync(
int datasync)
{
struct xfs_inode *ip = XFS_I(file->f_mapping->host);
- struct xfs_mount *mp = ip->i_mount;
- xfs_lsn_t lsn = 0;
trace_xfs_dir_fsync(ip);
+ return xfs_log_force_inode(ip);
+}
+
+static xfs_csn_t
+xfs_fsync_seq(
+ struct xfs_inode *ip,
+ bool datasync)
+{
+ if (!xfs_ipincount(ip))
+ return 0;
+ if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+ return 0;
+ return ip->i_itemp->ili_commit_seq;
+}
+
+/*
+ * All metadata updates are logged, which means that we just have to flush the
+ * log up to the latest LSN that touched the inode.
+ *
+ * If we have concurrent fsync/fdatasync() calls, we need them to all block on
+ * the log force before we clear the ili_fsync_fields field. This ensures that
+ * we don't get a racing sync operation that does not wait for the metadata to
+ * hit the journal before returning. If we race with clearing ili_fsync_fields,
+ * then all that will happen is the log force will do nothing as the lsn will
+ * already be on disk. We can't race with setting ili_fsync_fields because that
+ * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
+ * shared until after the ili_fsync_fields is cleared.
+ */
+static int
+xfs_fsync_flush_log(
+ struct xfs_inode *ip,
+ bool datasync,
+ int *log_flushed)
+{
+ int error = 0;
+ xfs_csn_t seq;
xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (xfs_ipincount(ip))
- lsn = ip->i_itemp->ili_last_lsn;
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ seq = xfs_fsync_seq(ip, datasync);
+ if (seq) {
+ error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
+ log_flushed);
- if (!lsn)
- return 0;
- return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+ spin_lock(&ip->i_itemp->ili_lock);
+ ip->i_itemp->ili_fsync_fields = 0;
+ spin_unlock(&ip->i_itemp->ili_lock);
+ }
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ return error;
}
STATIC int
@@ -102,12 +141,10 @@ xfs_file_fsync(
loff_t end,
int datasync)
{
- struct inode *inode = file->f_mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_inode *ip = XFS_I(file->f_mapping->host);
struct xfs_mount *mp = ip->i_mount;
- int error = 0;
+ int error, err2;
int log_flushed = 0;
- xfs_lsn_t lsn = 0;
trace_xfs_file_fsync(ip);
@@ -115,7 +152,7 @@ xfs_file_fsync(
if (error)
return error;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
xfs_iflags_clear(ip, XFS_ITRUNCATED);
@@ -127,36 +164,22 @@ xfs_file_fsync(
* inode size in case of an extending write.
*/
if (XFS_IS_REALTIME_INODE(ip))
- xfs_blkdev_issue_flush(mp->m_rtdev_targp);
+ error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
else if (mp->m_logdev_targp != mp->m_ddev_targp)
- xfs_blkdev_issue_flush(mp->m_ddev_targp);
+ error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
/*
- * All metadata updates are logged, which means that we just have to
- * flush the log up to the latest LSN that touched the inode. If we have
- * concurrent fsync/fdatasync() calls, we need them to all block on the
- * log force before we clear the ili_fsync_fields field. This ensures
- * that we don't get a racing sync operation that does not wait for the
- * metadata to hit the journal before returning. If we race with
- * clearing the ili_fsync_fields, then all that will happen is the log
- * force will do nothing as the lsn will already be on disk. We can't
- * race with setting ili_fsync_fields because that is done under
- * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
- * until after the ili_fsync_fields is cleared.
+ * Any inode that has dirty modifications in the log is pinned. The
+ * racy check here for a pinned inode will not catch modifications
+ * that happen concurrently to the fsync call, but fsync semantics
+ * only require to sync previously completed I/O.
*/
- xfs_ilock(ip, XFS_ILOCK_SHARED);
if (xfs_ipincount(ip)) {
- if (!datasync ||
- (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
- lsn = ip->i_itemp->ili_last_lsn;
+ err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
+ if (err2 && !error)
+ error = err2;
}
- if (lsn) {
- error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
- ip->i_itemp->ili_fsync_fields = 0;
- }
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
/*
* If we only have a single device, and the log force above was
* a no-op we might have to flush the data device cache here.
@@ -165,36 +188,51 @@ xfs_file_fsync(
* commit.
*/
if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
- mp->m_logdev_targp == mp->m_ddev_targp)
- xfs_blkdev_issue_flush(mp->m_ddev_targp);
+ mp->m_logdev_targp == mp->m_ddev_targp) {
+ err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
+ if (err2 && !error)
+ error = err2;
+ }
return error;
}
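
Pulling the two flush decisions above together, a minimal sketch of the ordering only; flush() and the booleans are stand-ins for blkdev_issue_flush() and the mount/log state, not real APIs:

#include <stdbool.h>
#include <stdio.h>

static void flush(const char *dev)
{
	printf("flush %s cache\n", dev);
}

/*
 * Order of operations in the fsync path: data device first, then the log
 * force, then a second data flush only if the log force could not have
 * issued one (single-device case with nothing pinned in the log).
 */
static void fsync_flush_order(bool realtime_inode, bool separate_log_device,
			      bool inode_pinned, bool log_force_flushed)
{
	bool log_flushed = false;

	/* 1. Push data to stable storage before logging a size update. */
	if (realtime_inode)
		flush("rtdev");
	else if (separate_log_device)
		flush("datadev");

	/* 2. Only a pinned inode has logged changes worth forcing. */
	if (inode_pinned)
		log_flushed = log_force_flushed;

	/* 3. Single device and the log force was a no-op: flush here. */
	if (!log_flushed && !realtime_inode && !separate_log_device)
		flush("datadev");
}

int main(void)
{
	/* e.g. single device, clean inode: only the final flush runs. */
	fsync_flush_order(false, false, false, false);
	return 0;
}
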
+static int
+xfs_ilock_iocb(
+ struct kiocb *iocb,
+ unsigned int lock_mode)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!xfs_ilock_nowait(ip, lock_mode))
+ return -EAGAIN;
+ } else {
+ xfs_ilock(ip, lock_mode);
+ }
+
+ return 0;
+}
+
STATIC ssize_t
-xfs_file_dio_aio_read(
+xfs_file_dio_read(
struct kiocb *iocb,
struct iov_iter *to)
{
struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
- size_t count = iov_iter_count(to);
ssize_t ret;
- trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
+ trace_xfs_file_direct_read(iocb, to);
- if (!count)
+ if (!iov_iter_count(to))
return 0; /* skip atime */
file_accessed(iocb->ki_filp);
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
- return -EAGAIN;
- } else {
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
- }
- ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
- is_sync_kiocb(iocb));
+ ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ if (ret)
+ return ret;
+ ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
@@ -206,21 +244,16 @@ xfs_file_dax_read(
struct iov_iter *to)
{
struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
- size_t count = iov_iter_count(to);
ssize_t ret = 0;
- trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
+ trace_xfs_file_dax_read(iocb, to);
- if (!count)
+ if (!iov_iter_count(to))
return 0; /* skip atime */
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
- return -EAGAIN;
- } else {
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
- }
-
+ ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ if (ret)
+ return ret;
ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -229,21 +262,18 @@ xfs_file_dax_read(
}
STATIC ssize_t
-xfs_file_buffered_aio_read(
+xfs_file_buffered_read(
struct kiocb *iocb,
struct iov_iter *to)
{
struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
ssize_t ret;
- trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
+ trace_xfs_file_buffered_read(iocb, to);
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
- return -EAGAIN;
- } else {
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
- }
+ ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ if (ret)
+ return ret;
ret = generic_file_read_iter(iocb, to);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -261,15 +291,15 @@ xfs_file_read_iter(
XFS_STATS_INC(mp, xs_read_calls);
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
if (IS_DAX(inode))
ret = xfs_file_dax_read(iocb, to);
else if (iocb->ki_flags & IOCB_DIRECT)
- ret = xfs_file_dio_aio_read(iocb, to);
+ ret = xfs_file_dio_read(iocb, to);
else
- ret = xfs_file_buffered_aio_read(iocb, to);
+ ret = xfs_file_buffered_read(iocb, to);
if (ret > 0)
XFS_STATS_ADD(mp, xs_read_bytes, ret);
@@ -284,10 +314,10 @@ xfs_file_read_iter(
* if called for a direct write beyond i_size.
*/
STATIC ssize_t
-xfs_file_aio_write_checks(
+xfs_file_write_checks(
struct kiocb *iocb,
struct iov_iter *from,
- int *iolock)
+ unsigned int *iolock)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
@@ -302,7 +332,14 @@ restart:
if (error <= 0)
return error;
- error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ error = break_layout(inode, false);
+ if (error == -EWOULDBLOCK)
+ error = -EAGAIN;
+ } else {
+ error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
+ }
+
if (error)
return error;
@@ -313,28 +350,45 @@ restart:
if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
xfs_iunlock(ip, *iolock);
*iolock = XFS_IOLOCK_EXCL;
- xfs_ilock(ip, *iolock);
+ error = xfs_ilock_iocb(iocb, *iolock);
+ if (error) {
+ *iolock = 0;
+ return error;
+ }
goto restart;
}
+
/*
* If the offset is beyond the size of the file, we need to zero any
* blocks that fall between the existing EOF and the start of this
- * write. If zeroing is needed and we are currently holding the
- * iolock shared, we need to update it to exclusive which implies
- * having to redo all checks before.
+ * write. If zeroing is needed and we are currently holding the iolock
+ * shared, we need to update it to exclusive which implies having to
+ * redo all checks before.
+ *
+ * We need to serialise against EOF updates that occur in IO completions
+ * here. We want to make sure that nobody is changing the size while we
+ * do this check until we have placed an IO barrier (i.e. hold the
+ * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
+ * spinlock effectively forms a memory barrier once we have the
+ * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
+ * hence be able to correctly determine if we need to run zeroing.
*
- * We need to serialise against EOF updates that occur in IO
- * completions here. We want to make sure that nobody is changing the
- * size while we do this check until we have placed an IO barrier (i.e.
- * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
- * The spinlock effectively forms a memory barrier once we have the
- * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
- * and hence be able to correctly determine if we need to run zeroing.
+ * We can do an unlocked check here safely as IO completion can only
+ * extend EOF. Truncate is locked out at this point, so the EOF can
+ * not move backwards, only forwards. Hence we only need to take the
+ * slow path and spin locks when we are at or beyond the current EOF.
*/
+ if (iocb->ki_pos <= i_size_read(inode))
+ goto out;
+
spin_lock(&ip->i_flags_lock);
isize = i_size_read(inode);
if (iocb->ki_pos > isize) {
spin_unlock(&ip->i_flags_lock);
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+
if (!drained_dio) {
if (*iolock == XFS_IOLOCK_SHARED) {
xfs_iunlock(ip, *iolock);
@@ -354,22 +408,16 @@ restart:
drained_dio = true;
goto restart;
}
-
+
trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
- error = iomap_zero_range(inode, isize, iocb->ki_pos - isize,
- NULL, &xfs_buffered_write_iomap_ops);
+ error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
if (error)
return error;
} else
spin_unlock(&ip->i_flags_lock);
- /*
- * Updating the timestamps will grab the ilock again from
- * xfs_fs_dirty_inode, so we have to call it after dropping the
- * lock above. Eventually we should look into a way to avoid
- * the pointless lock roundtrip.
- */
- return file_modified(file);
+out:
+ return kiocb_modified(iocb);
}
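
The unlocked pre-check above relies on EOF only ever growing while the iolock is held; a small sketch of the same pattern, with C11 atomics and a pthread spinlock standing in for i_size and i_flags_lock (illustrative names, not the kernel's):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct sketch_inode {
	_Atomic uint64_t	isize;		/* only ever grows here */
	pthread_spinlock_t	flags_lock;
};

static bool write_needs_zeroing(struct sketch_inode *ip, uint64_t pos)
{
	bool beyond_eof;

	/* Fast path: at or below the current EOF can never need zeroing,
	 * because concurrent IO completion can only move EOF forwards. */
	if (pos <= atomic_load(&ip->isize))
		return false;

	/* Slow path: re-check under the lock to get a stable value. */
	pthread_spin_lock(&ip->flags_lock);
	beyond_eof = pos > atomic_load(&ip->isize);
	pthread_spin_unlock(&ip->flags_lock);
	return beyond_eof;
}

int main(void)
{
	struct sketch_inode ip = { .isize = 4096 };

	pthread_spin_init(&ip.flags_lock, PTHREAD_PROCESS_PRIVATE);
	printf("%d %d\n", write_needs_zeroing(&ip, 1024),
			  write_needs_zeroing(&ip, 8192));	/* 0 1 */
	pthread_spin_destroy(&ip.flags_lock);
	return 0;
}
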
static int
@@ -386,7 +434,7 @@ xfs_dio_write_end_io(
trace_xfs_end_io_direct_write(ip, offset, size);
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ if (xfs_is_shutdown(ip->i_mount))
return -EIO;
if (error)
@@ -434,7 +482,17 @@ xfs_dio_write_end_io(
* other IO completions here to update the EOF. Failing to serialise
* here can result in EOF moving backwards and Bad Things Happen when
* that occurs.
+ *
+ * As IO completion only ever extends EOF, we can do an unlocked check
+ * here to avoid taking the spinlock. If we land within the current EOF,
+ * then we do not need to do an extending update at all, and we don't
+ * need to take the lock to check this. If we race with an update moving
+ * EOF, then we'll either still be beyond EOF and need to take the lock,
+ * or we'll be within EOF and we don't need to take it at all.
*/
+ if (offset + size <= i_size_read(inode))
+ goto out;
+
spin_lock(&ip->i_flags_lock);
if (offset + size > i_size_read(inode)) {
i_size_write(inode, offset + size);
@@ -454,122 +512,149 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
};
/*
- * xfs_file_dio_aio_write - handle direct IO writes
- *
- * Lock the inode appropriately to prepare for and issue a direct IO write.
- * By separating it from the buffered write path we remove all the tricky to
- * follow locking changes and looping.
- *
- * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
- * until we're sure the bytes at the new EOF have been zeroed and/or the cached
- * pages are flushed out.
- *
- * In most cases the direct IO writes will be done holding IOLOCK_SHARED
- * allowing them to be done in parallel with reads and other direct IO writes.
- * However, if the IO is not aligned to filesystem blocks, the direct IO layer
- * needs to do sub-block zeroing and that requires serialisation against other
- * direct IOs to the same block. In this case we need to serialise the
- * submission of the unaligned IOs so that we don't get racing block zeroing in
- * the dio layer. To avoid the problem with aio, we also need to wait for
- * outstanding IOs to complete so that unwritten extent conversion is completed
- * before we try to map the overlapping block. This is currently implemented by
- * hitting it with a big hammer (i.e. inode_dio_wait()).
- *
- * Returns with locks held indicated by @iolock and errors indicated by
- * negative return values.
+ * Handle block aligned direct I/O writes
*/
-STATIC ssize_t
-xfs_file_dio_aio_write(
+static noinline ssize_t
+xfs_file_dio_write_aligned(
+ struct xfs_inode *ip,
struct kiocb *iocb,
struct iov_iter *from)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- ssize_t ret = 0;
- int unaligned_io = 0;
- int iolock;
- size_t count = iov_iter_count(from);
- struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ unsigned int iolock = XFS_IOLOCK_SHARED;
+ ssize_t ret;
- /* DIO must be aligned to device logical sector size */
- if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
- return -EINVAL;
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ return ret;
+ ret = xfs_file_write_checks(iocb, from, &iolock);
+ if (ret)
+ goto out_unlock;
/*
- * Don't take the exclusive iolock here unless the I/O is unaligned to
- * the file system block size. We don't need to consider the EOF
- * extension case here because xfs_file_aio_write_checks() will relock
- * the inode as necessary for EOF zeroing cases and fill out the new
- * inode size as appropriate.
+ * We don't need to hold the IOLOCK exclusively across the IO, so demote
+ * the iolock back to shared if we had to take the exclusive lock in
+ * xfs_file_write_checks() for other reasons.
*/
- if ((iocb->ki_pos & mp->m_blockmask) ||
- ((iocb->ki_pos + count) & mp->m_blockmask)) {
- unaligned_io = 1;
-
- /*
- * We can't properly handle unaligned direct I/O to reflink
- * files yet, as we can't unshare a partial block.
- */
- if (xfs_is_cow_inode(ip)) {
- trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
- return -EREMCHG;
- }
- iolock = XFS_IOLOCK_EXCL;
- } else {
+ if (iolock == XFS_IOLOCK_EXCL) {
+ xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
iolock = XFS_IOLOCK_SHARED;
}
+ trace_xfs_file_direct_write(iocb, from);
+ ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
+ &xfs_dio_write_ops, 0, NULL, 0);
+out_unlock:
+ if (iolock)
+ xfs_iunlock(ip, iolock);
+ return ret;
+}
- if (iocb->ki_flags & IOCB_NOWAIT) {
- /* unaligned dio always waits, bail */
- if (unaligned_io)
- return -EAGAIN;
- if (!xfs_ilock_nowait(ip, iolock))
+/*
+ * Handle block unaligned direct I/O writes
+ *
+ * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
+ * them to be done in parallel with reads and other direct I/O writes. However,
+ * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
+ * to do sub-block zeroing and that requires serialisation against other direct
+ * I/O to the same block. In this case we need to serialise the submission of
+ * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
+ * In the case where sub-block zeroing is not required, we can do concurrent
+ * sub-block dios to the same block successfully.
+ *
+ * Optimistically submit the I/O using the shared lock first, but use the
+ * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
+ * if block allocation or partial block zeroing would be required. In that case
+ * we try again with the exclusive lock.
+ */
+static noinline ssize_t
+xfs_file_dio_write_unaligned(
+ struct xfs_inode *ip,
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ size_t isize = i_size_read(VFS_I(ip));
+ size_t count = iov_iter_count(from);
+ unsigned int iolock = XFS_IOLOCK_SHARED;
+ unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY;
+ ssize_t ret;
+
+ /*
+ * Extending writes need exclusivity because of the sub-block zeroing
+ * that the DIO code always does for partial tail blocks beyond EOF, so
+ * don't even bother trying the fast path in this case.
+ */
+ if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
+ if (iocb->ki_flags & IOCB_NOWAIT)
return -EAGAIN;
- } else {
- xfs_ilock(ip, iolock);
+retry_exclusive:
+ iolock = XFS_IOLOCK_EXCL;
+ flags = IOMAP_DIO_FORCE_WAIT;
}
- ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+ ret = xfs_ilock_iocb(iocb, iolock);
if (ret)
- goto out;
- count = iov_iter_count(from);
+ return ret;
/*
- * If we are doing unaligned IO, we can't allow any other overlapping IO
- * in-flight at the same time or we risk data corruption. Wait for all
- * other IO to drain before we submit. If the IO is aligned, demote the
- * iolock if we had to take the exclusive lock in
- * xfs_file_aio_write_checks() for other reasons.
+ * We can't properly handle unaligned direct I/O to reflink files yet,
+ * as we can't unshare a partial block.
*/
- if (unaligned_io) {
- inode_dio_wait(inode);
- } else if (iolock == XFS_IOLOCK_EXCL) {
- xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
- iolock = XFS_IOLOCK_SHARED;
+ if (xfs_is_cow_inode(ip)) {
+ trace_xfs_reflink_bounce_dio_write(iocb, from);
+ ret = -ENOTBLK;
+ goto out_unlock;
}
- trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
+ ret = xfs_file_write_checks(iocb, from, &iolock);
+ if (ret)
+ goto out_unlock;
+
/*
- * If unaligned, this is the only IO in-flight. Wait on it before we
- * release the iolock to prevent subsequent overlapping IO.
+ * If we are doing exclusive unaligned I/O, this must be the only I/O
+ * in-flight. Otherwise we risk data corruption due to unwritten extent
+ * conversions from the AIO end_io handler. Wait for all other I/O to
+ * drain first.
*/
+ if (flags & IOMAP_DIO_FORCE_WAIT)
+ inode_dio_wait(VFS_I(ip));
+
+ trace_xfs_file_direct_write(iocb, from);
ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
- &xfs_dio_write_ops,
- is_sync_kiocb(iocb) || unaligned_io);
-out:
- xfs_iunlock(ip, iolock);
+ &xfs_dio_write_ops, flags, NULL, 0);
/*
- * No fallback to buffered IO on errors for XFS, direct IO will either
- * complete fully or fail.
+ * Retry unaligned I/O with exclusive blocking semantics if the DIO
+ * layer rejected it for mapping or locking reasons. If we are doing
+ * nonblocking user I/O, propagate the error.
*/
- ASSERT(ret < 0 || ret == count);
+ if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
+ ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
+ xfs_iunlock(ip, iolock);
+ goto retry_exclusive;
+ }
+
+out_unlock:
+ if (iolock)
+ xfs_iunlock(ip, iolock);
return ret;
}
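
The retry scheme above reduces to the following standalone sketch: attempt the IO under a shared lock in overwrite-only mode, and fall back to an exclusive, waiting attempt when the lower layer signals it would need allocation or zeroing. submit_io() stands in for iomap_dio_rw() and is not a real API:

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Pretend lower layer: refuses overwrite-only IO that would allocate. */
static int submit_io(bool overwrite_only, bool needs_allocation)
{
	if (overwrite_only && needs_allocation)
		return -EAGAIN;
	return 0;
}

static int dio_write_unaligned(pthread_rwlock_t *iolock, bool nowait,
			       bool needs_allocation)
{
	bool exclusive = false;
	int ret;

retry:
	if (exclusive)
		pthread_rwlock_wrlock(iolock);	/* serialise, and wait */
	else
		pthread_rwlock_rdlock(iolock);	/* optimistic fast path */

	ret = submit_io(!exclusive, needs_allocation);
	pthread_rwlock_unlock(iolock);

	/* Nonblocking callers get -EAGAIN back; everyone else retries. */
	if (ret == -EAGAIN && !exclusive && !nowait) {
		exclusive = true;
		goto retry;
	}
	return ret;
}

int main(void)
{
	pthread_rwlock_t iolock = PTHREAD_RWLOCK_INITIALIZER;

	printf("%d\n", dio_write_unaligned(&iolock, false, true));	/* 0 */
	printf("%d\n", dio_write_unaligned(&iolock, true, true));	/* -EAGAIN */
	return 0;
}
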
+static ssize_t
+xfs_file_dio_write(
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ size_t count = iov_iter_count(from);
+
+ /* direct I/O must be aligned to device logical sector size */
+ if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
+ return -EINVAL;
+ if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
+ return xfs_file_dio_write_unaligned(ip, iocb, from);
+ return xfs_file_dio_write_aligned(ip, iocb, from);
+}
+
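
As a worked example of the two mask tests in xfs_file_dio_write(), with assumed geometry of 512-byte logical sectors and 4096-byte filesystem blocks (not taken from any real device):

#include <stdint.h>
#include <stdio.h>

/* Classify a direct write the way xfs_file_dio_write() does. */
static const char *classify(uint64_t pos, uint64_t count)
{
	const uint64_t sectormask = 512 - 1;
	const uint64_t blockmask  = 4096 - 1;

	if ((pos | count) & sectormask)
		return "-EINVAL (not sector aligned)";
	if ((pos | count) & blockmask)
		return "unaligned path (sub-block, may serialise)";
	return "aligned path (shared iolock)";
}

int main(void)
{
	printf("%s\n", classify(8192, 4096));	/* aligned */
	printf("%s\n", classify(4096, 1024));	/* unaligned: 1024 < block */
	printf("%s\n", classify(100, 512));	/* EINVAL: pos not sector aligned */
	return 0;
}
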
static noinline ssize_t
xfs_file_dax_write(
struct kiocb *iocb,
@@ -577,33 +662,28 @@ xfs_file_dax_write(
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
- int iolock = XFS_IOLOCK_EXCL;
+ unsigned int iolock = XFS_IOLOCK_EXCL;
ssize_t ret, error = 0;
- size_t count;
loff_t pos;
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!xfs_ilock_nowait(ip, iolock))
- return -EAGAIN;
- } else {
- xfs_ilock(ip, iolock);
- }
-
- ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ return ret;
+ ret = xfs_file_write_checks(iocb, from, &iolock);
if (ret)
goto out;
pos = iocb->ki_pos;
- count = iov_iter_count(from);
- trace_xfs_file_dax_write(ip, count, pos);
- ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
+ trace_xfs_file_dax_write(iocb, from);
+ ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
i_size_write(inode, iocb->ki_pos);
error = xfs_setfilesize(ip, pos, ret);
}
out:
- xfs_iunlock(ip, iolock);
+ if (iolock)
+ xfs_iunlock(ip, iolock);
if (error)
return error;
@@ -617,33 +697,30 @@ out:
}
STATIC ssize_t
-xfs_file_buffered_aio_write(
+xfs_file_buffered_write(
struct kiocb *iocb,
struct iov_iter *from)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
- int enospc = 0;
- int iolock;
-
- if (iocb->ki_flags & IOCB_NOWAIT)
- return -EOPNOTSUPP;
+ bool cleared_space = false;
+ unsigned int iolock;
write_retry:
iolock = XFS_IOLOCK_EXCL;
- xfs_ilock(ip, iolock);
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ return ret;
- ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+ ret = xfs_file_write_checks(iocb, from, &iolock);
if (ret)
goto out;
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(inode);
- trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
+ trace_xfs_file_buffered_write(iocb, from);
ret = iomap_file_buffered_write(iocb, from,
&xfs_buffered_write_iomap_ops);
if (likely(ret >= 0))
@@ -656,27 +733,23 @@ write_retry:
* metadata space. This reduces the chances that the eofblocks scan
* waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
* also behaves as a filter to prevent too many eofblocks scans from
- * running at the same time.
+ * running at the same time. Use a synchronous scan to increase the
+ * effectiveness of the scan.
*/
- if (ret == -EDQUOT && !enospc) {
+ if (ret == -EDQUOT && !cleared_space) {
xfs_iunlock(ip, iolock);
- enospc = xfs_inode_free_quota_eofblocks(ip);
- if (enospc)
- goto write_retry;
- enospc = xfs_inode_free_quota_cowblocks(ip);
- if (enospc)
- goto write_retry;
- iolock = 0;
- } else if (ret == -ENOSPC && !enospc) {
- struct xfs_eofblocks eofb = {0};
-
- enospc = 1;
+ xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
+ cleared_space = true;
+ goto write_retry;
+ } else if (ret == -ENOSPC && !cleared_space) {
+ struct xfs_icwalk icw = {0};
+
+ cleared_space = true;
xfs_flush_inodes(ip->i_mount);
xfs_iunlock(ip, iolock);
- eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
- xfs_icache_free_eofblocks(ip->i_mount, &eofb);
- xfs_icache_free_cowblocks(ip->i_mount, &eofb);
+ icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
+ xfs_blockgc_free_space(ip->i_mount, &icw);
goto write_retry;
}
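
The quota/space handling above is a retry-once loop; in the sketch below, do_write() and reclaim_space() are made-up stand-ins for the buffered write and the blockgc scans, shown only to illustrate the control flow:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Fails for lack of space on the first attempt, succeeds on the second. */
static int do_write(void)
{
	static int calls;

	return calls++ ? 0 : -ENOSPC;
}

static void reclaim_space(bool flush_dirty_data)
{
	/* Stand-in for xfs_blockgc_free_quota()/xfs_blockgc_free_space();
	 * the ENOSPC case also flushes dirty inodes to convert delalloc. */
	printf("reclaim (flush=%d)\n", flush_dirty_data);
}

static int buffered_write(void)
{
	bool cleared_space = false;
	int ret;

retry:
	ret = do_write();
	if ((ret == -EDQUOT || ret == -ENOSPC) && !cleared_space) {
		cleared_space = true;
		reclaim_space(ret == -ENOSPC);
		goto retry;	/* one retry only; a second failure is final */
	}
	return ret;
}

int main(void)
{
	printf("ret=%d\n", buffered_write());	/* ret=0 */
	return 0;
}
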
@@ -698,9 +771,7 @@ xfs_file_write_iter(
struct kiocb *iocb,
struct iov_iter *from)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
size_t ocount = iov_iter_count(from);
@@ -710,7 +781,7 @@ xfs_file_write_iter(
if (ocount == 0)
return 0;
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ if (xfs_is_shutdown(ip->i_mount))
return -EIO;
if (IS_DAX(inode))
@@ -723,12 +794,12 @@ xfs_file_write_iter(
* CoW. In all other directio scenarios we do not
* allow an operation to fall back to buffered mode.
*/
- ret = xfs_file_dio_aio_write(iocb, from);
- if (ret != -EREMCHG)
+ ret = xfs_file_dio_write(iocb, from);
+ if (ret != -ENOTBLK)
return ret;
}
- return xfs_file_buffered_aio_write(iocb, from);
+ return xfs_file_buffered_write(iocb, from);
}
static void
@@ -742,7 +813,7 @@ xfs_wait_dax_page(
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
}
-static int
+int
xfs_break_dax_layouts(
struct inode *inode,
bool *retry)
@@ -779,7 +850,7 @@ xfs_break_layouts(
error = xfs_break_dax_layouts(inode, &retry);
if (error || retry)
break;
- /* fall through */
+ fallthrough;
case BREAK_WRITE:
error = xfs_break_leased_layouts(inode, iolock, &retry);
break;
@@ -792,6 +863,21 @@ xfs_break_layouts(
return error;
}
+/* Does this file, inode, or mount want synchronous writes? */
+static inline bool xfs_file_sync_writes(struct file *filp)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(filp));
+
+ if (xfs_has_wsync(ip->i_mount))
+ return true;
+ if (filp->f_flags & (__O_SYNC | O_DSYNC))
+ return true;
+ if (IS_SYNC(file_inode(filp)))
+ return true;
+
+ return false;
+}
+
#define XFS_FALLOC_FL_SUPPORTED \
(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
@@ -807,7 +893,6 @@ xfs_file_fallocate(
struct inode *inode = file_inode(file);
struct xfs_inode *ip = XFS_I(inode);
long error;
- enum xfs_prealloc_flags flags = 0;
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
loff_t new_size = 0;
bool do_file_insert = false;
@@ -852,14 +937,16 @@ xfs_file_fallocate(
goto out_unlock;
}
+ error = file_modified(file);
+ if (error)
+ goto out_unlock;
+
if (mode & FALLOC_FL_PUNCH_HOLE) {
error = xfs_free_file_space(ip, offset, len);
if (error)
goto out_unlock;
} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
- unsigned int blksize_mask = i_blocksize(inode) - 1;
-
- if (offset & blksize_mask || len & blksize_mask) {
+ if (!xfs_is_falloc_aligned(ip, offset, len)) {
error = -EINVAL;
goto out_unlock;
}
@@ -879,10 +966,9 @@ xfs_file_fallocate(
if (error)
goto out_unlock;
} else if (mode & FALLOC_FL_INSERT_RANGE) {
- unsigned int blksize_mask = i_blocksize(inode) - 1;
loff_t isize = i_size_read(inode);
- if (offset & blksize_mask || len & blksize_mask) {
+ if (!xfs_is_falloc_aligned(ip, offset, len)) {
error = -EINVAL;
goto out_unlock;
}
@@ -904,8 +990,6 @@ xfs_file_fallocate(
}
do_file_insert = true;
} else {
- flags |= XFS_PREALLOC_SET;
-
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
offset + len > i_size_read(inode)) {
new_size = offset + len;
@@ -951,27 +1035,20 @@ xfs_file_fallocate(
}
if (!xfs_is_always_cow_inode(ip)) {
- error = xfs_alloc_file_space(ip, offset, len,
- XFS_BMAPI_PREALLOC);
+ error = xfs_alloc_file_space(ip, offset, len);
if (error)
goto out_unlock;
}
}
- if (file->f_flags & O_DSYNC)
- flags |= XFS_PREALLOC_SYNC;
-
- error = xfs_update_prealloc_flags(ip, flags);
- if (error)
- goto out_unlock;
-
/* Change file size if needed */
if (new_size) {
struct iattr iattr;
iattr.ia_valid = ATTR_SIZE;
iattr.ia_size = new_size;
- error = xfs_vn_setattr_size(file_dentry(file), &iattr);
+ error = xfs_vn_setattr_size(file_mnt_user_ns(file),
+ file_dentry(file), &iattr);
if (error)
goto out_unlock;
}
@@ -982,8 +1059,14 @@ xfs_file_fallocate(
* leave shifted extents past EOF and hence losing access to
* the data that is contained within them.
*/
- if (do_file_insert)
+ if (do_file_insert) {
error = xfs_insert_file_space(ip, offset, len);
+ if (error)
+ goto out_unlock;
+ }
+
+ if (xfs_file_sync_writes(file))
+ error = xfs_log_force_inode(ip);
out_unlock:
xfs_iunlock(ip, iolock);
@@ -1036,16 +1119,16 @@ xfs_file_remap_range(
if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
- if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ if (!xfs_has_reflink(mp))
return -EOPNOTSUPP;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
/* Prepare and then clone file data. */
ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
&len, remap_flags);
- if (ret < 0 || len == 0)
+ if (ret || len == 0)
return ret;
trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
@@ -1062,16 +1145,20 @@ xfs_file_remap_range(
*/
cowextsize = 0;
if (pos_in == 0 && len == i_size_read(inode_in) &&
- (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+ (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
pos_out == 0 && len >= i_size_read(inode_out) &&
- !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
- cowextsize = src->i_d.di_cowextsize;
+ !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
+ cowextsize = src->i_cowextsize;
ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
remap_flags);
+ if (ret)
+ goto out_unlock;
+ if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
+ xfs_log_force_inode(dest);
out_unlock:
- xfs_reflink_remap_unlock(file_in, file_out);
+ xfs_iunlock2_io_mmap(src, dest);
if (ret)
trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
return remapped > 0 ? remapped : ret;
@@ -1082,12 +1169,10 @@ xfs_file_open(
struct inode *inode,
struct file *file)
{
- if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
- return -EFBIG;
- if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
+ if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
- file->f_mode |= FMODE_NOWAIT;
- return 0;
+ file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC;
+ return generic_file_open(inode, file);
}
STATIC int
@@ -1096,7 +1181,7 @@ xfs_dir_open(
struct file *file)
{
struct xfs_inode *ip = XFS_I(inode);
- int mode;
+ unsigned int mode;
int error;
error = xfs_file_open(inode, file);
@@ -1108,7 +1193,7 @@ xfs_dir_open(
* certain to have the next operation be a read there.
*/
mode = xfs_ilock_data_map_shared(ip);
- if (ip->i_d.di_nextents > 0)
+ if (ip->i_df.if_nextents > 0)
error = xfs_dir3_data_readahead(ip, 0, 0);
xfs_iunlock(ip, mode);
return error;
@@ -1143,7 +1228,7 @@ xfs_file_readdir(
* point we can change the ->readdir prototype to include the
* buffer size. For now we use the current glibc buffer size.
*/
- bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_d.di_size);
+ bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
return xfs_readdir(NULL, ip, ctx, bufsize);
}
@@ -1156,7 +1241,7 @@ xfs_file_llseek(
{
struct inode *inode = file->f_mapping->host;
- if (XFS_FORCED_SHUTDOWN(XFS_I(inode)->i_mount))
+ if (xfs_is_shutdown(XFS_I(inode)->i_mount))
return -EIO;
switch (whence) {
@@ -1175,13 +1260,39 @@ xfs_file_llseek(
return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}
+#ifdef CONFIG_FS_DAX
+static inline vm_fault_t
+xfs_dax_fault(
+ struct vm_fault *vmf,
+ enum page_entry_size pe_size,
+ bool write_fault,
+ pfn_t *pfn)
+{
+ return dax_iomap_fault(vmf, pe_size, pfn, NULL,
+ (write_fault && !vmf->cow_page) ?
+ &xfs_dax_write_iomap_ops :
+ &xfs_read_iomap_ops);
+}
+#else
+static inline vm_fault_t
+xfs_dax_fault(
+ struct vm_fault *vmf,
+ enum page_entry_size pe_size,
+ bool write_fault,
+ pfn_t *pfn)
+{
+ ASSERT(0);
+ return VM_FAULT_SIGBUS;
+}
+#endif
+
/*
* Locking for serialisation of IO during page faults. This results in a lock
* ordering of:
*
- * mmap_sem (MM)
+ * mmap_lock (MM)
* sb_start_pagefault(vfs, freeze)
- * i_mmaplock (XFS - truncate serialisation)
+ * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
* page_lock (MM)
* i_lock (XFS - extent map serialisation)
*/
@@ -1202,30 +1313,38 @@ __xfs_filemap_fault(
file_update_time(vmf->vma->vm_file);
}
- xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (IS_DAX(inode)) {
pfn_t pfn;
- ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
- (write_fault && !vmf->cow_page) ?
- &xfs_direct_write_iomap_ops :
- &xfs_read_iomap_ops);
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn);
if (ret & VM_FAULT_NEEDDSYNC)
ret = dax_finish_sync_fault(vmf, pe_size, pfn);
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
} else {
- if (write_fault)
+ if (write_fault) {
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
ret = iomap_page_mkwrite(vmf,
&xfs_buffered_write_iomap_ops);
- else
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ } else {
ret = filemap_fault(vmf);
+ }
}
- xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (write_fault)
sb_end_pagefault(inode->i_sb);
return ret;
}
+static inline bool
+xfs_is_write_fault(
+ struct vm_fault *vmf)
+{
+ return (vmf->flags & FAULT_FLAG_WRITE) &&
+ (vmf->vma->vm_flags & VM_SHARED);
+}
+
static vm_fault_t
xfs_filemap_fault(
struct vm_fault *vmf)
@@ -1233,7 +1352,7 @@ xfs_filemap_fault(
/* DAX can shortcut the normal fault path on write faults! */
return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
IS_DAX(file_inode(vmf->vma->vm_file)) &&
- (vmf->flags & FAULT_FLAG_WRITE));
+ xfs_is_write_fault(vmf));
}
static vm_fault_t
@@ -1246,7 +1365,7 @@ xfs_filemap_huge_fault(
/* DAX can shortcut the normal fault path on write faults! */
return __xfs_filemap_fault(vmf, pe_size,
- (vmf->flags & FAULT_FLAG_WRITE));
+ xfs_is_write_fault(vmf));
}
static vm_fault_t
@@ -1269,10 +1388,25 @@ xfs_filemap_pfn_mkwrite(
return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
}
+static vm_fault_t
+xfs_filemap_map_pages(
+ struct vm_fault *vmf,
+ pgoff_t start_pgoff,
+ pgoff_t end_pgoff)
+{
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ vm_fault_t ret;
+
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ ret = filemap_map_pages(vmf, start_pgoff, end_pgoff);
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ return ret;
+}
+
static const struct vm_operations_struct xfs_file_vm_ops = {
.fault = xfs_filemap_fault,
.huge_fault = xfs_filemap_huge_fault,
- .map_pages = filemap_map_pages,
+ .map_pages = xfs_filemap_map_pages,
.page_mkwrite = xfs_filemap_page_mkwrite,
.pfn_mkwrite = xfs_filemap_pfn_mkwrite,
};
@@ -1305,7 +1439,7 @@ const struct file_operations xfs_file_operations = {
.write_iter = xfs_file_write_iter,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
.unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = xfs_file_compat_ioctl,