aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/fs/xfs/xfs_inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/xfs_inode.c')
-rw-r--r--fs/xfs/xfs_inode.c702
1 files changed, 321 insertions, 381 deletions
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 9aea7d68d8ab..c06129cffba9 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -44,7 +44,6 @@ kmem_zone_t *xfs_inode_zone;
*/
#define XFS_ITRUNC_MAX_EXTENTS 2
-STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *);
STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);
@@ -452,7 +451,7 @@ xfs_lock_inodes(
/*
* Currently supports between 2 and 5 inodes with exclusive locking. We
* support an arbitrary depth of locking here, but absolute limits on
- * inodes depend on the the type of locking and the limits placed by
+ * inodes depend on the type of locking and the limits placed by
* lockdep annotations in xfs_lock_inumorder. These are all checked by
* the asserts.
*/
@@ -1740,10 +1739,31 @@ xfs_inactive_ifree(
return error;
}
+ /*
+ * We do not hold the inode locked across the entire rolling transaction
+ * here. We only need to hold it for the first transaction that
+ * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the
+ * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode
+ * here breaks the relationship between cluster buffer invalidation and
+ * stale inode invalidation on cluster buffer item journal commit
+ * completion, and can result in leaving dirty stale inodes hanging
+ * around in memory.
+ *
+ * We have no need for serialising this inode operation against other
+ * operations - we freed the inode and hence reallocation is required
+ * and that will serialise on reallocating the space the deferops need
+ * to free. Hence we can unlock the inode on the first commit of
+ * the transaction rather than roll it right through the deferops. This
+ * avoids relogging the XFS_ISTALE inode.
+ *
+ * We check that xfs_ifree() hasn't grown an internal transaction roll
+ * by asserting that the inode is still locked when it returns.
+ */
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
error = xfs_ifree(tp, ip);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
if (error) {
/*
* If we fail to free the inode, shut down. The cancel
@@ -1756,7 +1776,6 @@ xfs_inactive_ifree(
xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
}
xfs_trans_cancel(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -1774,7 +1793,6 @@ xfs_inactive_ifree(
xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
__func__, error);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
return 0;
}
@@ -2147,7 +2165,6 @@ xfs_iunlink_update_dinode(
xfs_dinode_calc_crc(mp, dip);
xfs_trans_inode_buf(tp, ibp);
xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
- xfs_inobp_check(mp, ibp);
}
/* Set an in-core inode's unlinked pointer and return the old value. */
@@ -2248,7 +2265,6 @@ xfs_iunlink(
}
if (next_agino != NULLAGINO) {
- struct xfs_perag *pag;
xfs_agino_t old_agino;
/*
@@ -2265,9 +2281,7 @@ xfs_iunlink(
* agino has been unlinked, add a backref from the next inode
* back to agino.
*/
- pag = xfs_perag_get(mp, agno);
- error = xfs_iunlink_add_backref(pag, agino, next_agino);
- xfs_perag_put(pag);
+ error = xfs_iunlink_add_backref(agibp->b_pag, agino, next_agino);
if (error)
return error;
}
@@ -2403,7 +2417,6 @@ xfs_iunlink_remove(
struct xfs_buf *agibp;
struct xfs_buf *last_ibp;
struct xfs_dinode *last_dip = NULL;
- struct xfs_perag *pag = NULL;
xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
xfs_agino_t next_agino;
@@ -2447,32 +2460,22 @@ xfs_iunlink_remove(
* this inode's backref to point from the next inode.
*/
if (next_agino != NULLAGINO) {
- pag = xfs_perag_get(mp, agno);
- error = xfs_iunlink_change_backref(pag, next_agino,
+ error = xfs_iunlink_change_backref(agibp->b_pag, next_agino,
NULLAGINO);
if (error)
- goto out;
+ return error;
}
- if (head_agino == agino) {
- /* Point the head of the list to the next unlinked inode. */
- error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
- next_agino);
- if (error)
- goto out;
- } else {
+ if (head_agino != agino) {
struct xfs_imap imap;
xfs_agino_t prev_agino;
- if (!pag)
- pag = xfs_perag_get(mp, agno);
-
/* We need to search the list for the inode being freed. */
error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
&prev_agino, &imap, &last_dip, &last_ibp,
- pag);
+ agibp->b_pag);
if (error)
- goto out;
+ return error;
/* Point the previous inode on the list to the next inode. */
xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
@@ -2486,29 +2489,29 @@ xfs_iunlink_remove(
* change_backref takes care of deleting the backref if
* next_agino is NULLAGINO.
*/
- error = xfs_iunlink_change_backref(pag, agino, next_agino);
- if (error)
- goto out;
+ return xfs_iunlink_change_backref(agibp->b_pag, agino,
+ next_agino);
}
-out:
- if (pag)
- xfs_perag_put(pag);
- return error;
+ /* Point the head of the list to the next unlinked inode. */
+ return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
+ next_agino);
}
/*
- * Look up the inode number specified and mark it stale if it is found. If it is
- * dirty, return the inode so it can be attached to the cluster buffer so it can
- * be processed appropriately when the cluster free transaction completes.
+ * Look up the inode number specified and if it is not already marked XFS_ISTALE
+ * mark it stale. We should only find clean inodes in this lookup that aren't
+ * already stale.
*/
-static struct xfs_inode *
-xfs_ifree_get_one_inode(
- struct xfs_perag *pag,
+static void
+xfs_ifree_mark_inode_stale(
+ struct xfs_buf *bp,
struct xfs_inode *free_ip,
xfs_ino_t inum)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_perag *pag = bp->b_pag;
+ struct xfs_inode_log_item *iip;
struct xfs_inode *ip;
retry:
@@ -2516,8 +2519,10 @@ retry:
ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
/* Inode not in memory, nothing to do */
- if (!ip)
- goto out_rcu_unlock;
+ if (!ip) {
+ rcu_read_unlock();
+ return;
+ }
/*
* because this is an RCU protected lookup, we could find a recently
@@ -2528,9 +2533,9 @@ retry:
spin_lock(&ip->i_flags_lock);
if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) {
spin_unlock(&ip->i_flags_lock);
- goto out_rcu_unlock;
+ rcu_read_unlock();
+ return;
}
- spin_unlock(&ip->i_flags_lock);
/*
* Don't try to lock/unlock the current inode, but we _cannot_ skip the
@@ -2540,43 +2545,50 @@ retry:
*/
if (ip != free_ip) {
if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+ spin_unlock(&ip->i_flags_lock);
rcu_read_unlock();
delay(1);
goto retry;
}
-
- /*
- * Check the inode number again in case we're racing with
- * freeing in xfs_reclaim_inode(). See the comments in that
- * function for more information as to why the initial check is
- * not sufficient.
- */
- if (ip->i_ino != inum) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- goto out_rcu_unlock;
- }
}
+ ip->i_flags |= XFS_ISTALE;
+ spin_unlock(&ip->i_flags_lock);
rcu_read_unlock();
- xfs_iflock(ip);
- xfs_iflags_set(ip, XFS_ISTALE);
+ /*
+ * If we can't get the flush lock, the inode is already attached. All
+ * we needed to do here is mark the inode stale so buffer IO completion
+ * will remove it from the AIL.
+ */
+ iip = ip->i_itemp;
+ if (!xfs_iflock_nowait(ip)) {
+ ASSERT(!list_empty(&iip->ili_item.li_bio_list));
+ ASSERT(iip->ili_last_fields);
+ goto out_iunlock;
+ }
/*
- * We don't need to attach clean inodes or those only with unlogged
- * changes (which we throw away, anyway).
+ * Inodes not attached to the buffer can be released immediately.
+ * Everything else has to go through xfs_iflush_abort() on journal
+ * commit as the flock synchronises removal of the inode from the
+ * cluster buffer against inode reclaim.
*/
- if (!ip->i_itemp || xfs_inode_clean(ip)) {
- ASSERT(ip != free_ip);
+ if (!iip || list_empty(&iip->ili_item.li_bio_list)) {
xfs_ifunlock(ip);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- goto out_no_inode;
+ goto out_iunlock;
}
- return ip;
-out_rcu_unlock:
- rcu_read_unlock();
-out_no_inode:
- return NULL;
+ /* we have a dirty inode in memory that has not yet been flushed. */
+ spin_lock(&iip->ili_lock);
+ iip->ili_last_fields = iip->ili_fields;
+ iip->ili_fields = 0;
+ iip->ili_fsync_fields = 0;
+ spin_unlock(&iip->ili_lock);
+ ASSERT(iip->ili_last_fields);
+
+out_iunlock:
+ if (ip != free_ip)
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
/*
@@ -2586,26 +2598,20 @@ out_no_inode:
*/
STATIC int
xfs_ifree_cluster(
- xfs_inode_t *free_ip,
- xfs_trans_t *tp,
+ struct xfs_inode *free_ip,
+ struct xfs_trans *tp,
struct xfs_icluster *xic)
{
- xfs_mount_t *mp = free_ip->i_mount;
+ struct xfs_mount *mp = free_ip->i_mount;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ struct xfs_buf *bp;
+ xfs_daddr_t blkno;
+ xfs_ino_t inum = xic->first_ino;
int nbufs;
int i, j;
int ioffset;
- xfs_daddr_t blkno;
- xfs_buf_t *bp;
- xfs_inode_t *ip;
- struct xfs_inode_log_item *iip;
- struct xfs_log_item *lip;
- struct xfs_perag *pag;
- struct xfs_ino_geometry *igeo = M_IGEO(mp);
- xfs_ino_t inum;
int error;
- inum = xic->first_ino;
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
@@ -2634,10 +2640,8 @@ xfs_ifree_cluster(
error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
mp->m_bsize * igeo->blocks_per_cluster,
XBF_UNMAPPED, &bp);
- if (error) {
- xfs_perag_put(pag);
+ if (error)
return error;
- }
/*
* This buffer may not have been correctly initialised as we
@@ -2651,60 +2655,16 @@ xfs_ifree_cluster(
bp->b_ops = &xfs_inode_buf_ops;
/*
- * Walk the inodes already attached to the buffer and mark them
- * stale. These will all have the flush locks held, so an
- * in-memory inode walk can't lock them. By marking them all
- * stale first, we will not attempt to lock them in the loop
- * below as the XFS_ISTALE flag will be set.
+ * Now we need to set all the cached clean inodes as XFS_ISTALE,
+ * too. This requires lookups, and will skip inodes that we've
+ * already marked XFS_ISTALE.
*/
- list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
- if (lip->li_type == XFS_LI_INODE) {
- iip = (struct xfs_inode_log_item *)lip;
- ASSERT(iip->ili_logged == 1);
- lip->li_cb = xfs_istale_done;
- xfs_trans_ail_copy_lsn(mp->m_ail,
- &iip->ili_flush_lsn,
- &iip->ili_item.li_lsn);
- xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
- }
- }
-
-
- /*
- * For each inode in memory attempt to add it to the inode
- * buffer and set it up for being staled on buffer IO
- * completion. This is safe as we've locked out tail pushing
- * and flushing by locking the buffer.
- *
- * We have already marked every inode that was part of a
- * transaction stale above, which means there is no point in
- * even trying to lock them.
- */
- for (i = 0; i < igeo->inodes_per_cluster; i++) {
- ip = xfs_ifree_get_one_inode(pag, free_ip, inum + i);
- if (!ip)
- continue;
-
- iip = ip->i_itemp;
- iip->ili_last_fields = iip->ili_fields;
- iip->ili_fields = 0;
- iip->ili_fsync_fields = 0;
- iip->ili_logged = 1;
- xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
- &iip->ili_item.li_lsn);
-
- xfs_buf_attach_iodone(bp, xfs_istale_done,
- &iip->ili_item);
-
- if (ip != free_ip)
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
+ for (i = 0; i < igeo->inodes_per_cluster; i++)
+ xfs_ifree_mark_inode_stale(bp, free_ip, inum + i);
xfs_trans_stale_inode_buf(tp, bp);
xfs_trans_binval(tp, bp);
}
-
- xfs_perag_put(pag);
return 0;
}
@@ -2725,6 +2685,7 @@ xfs_ifree(
{
int error;
struct xfs_icluster xic = { 0 };
+ struct xfs_inode_log_item *iip = ip->i_itemp;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(VFS_I(ip)->i_nlink == 0);
@@ -2762,7 +2723,9 @@ xfs_ifree(
ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
/* Don't attempt to replay owner changes for a deleted inode */
- ip->i_itemp->ili_fields &= ~(XFS_ILOG_AOWNER|XFS_ILOG_DOWNER);
+ spin_lock(&iip->ili_lock);
+ iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER);
+ spin_unlock(&iip->ili_lock);
/*
* Bump the generation count so no one will be confused
@@ -3142,7 +3105,7 @@ out_trans_abort:
/*
* xfs_rename_alloc_whiteout()
*
- * Return a referenced, unlinked, unlocked inode that that can be used as a
+ * Return a referenced, unlinked, unlocked inode that can be used as a
* whiteout in a rename transaction. We use a tmpfile inode here so that if we
* crash between allocating the inode and linking it into the rename transaction
* recovery will free the inode and we won't leak it.
@@ -3469,232 +3432,9 @@ out_release_wip:
return error;
}
-STATIC int
-xfs_iflush_cluster(
- struct xfs_inode *ip,
- struct xfs_buf *bp)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_perag *pag;
- unsigned long first_index, mask;
- int cilist_size;
- struct xfs_inode **cilist;
- struct xfs_inode *cip;
- struct xfs_ino_geometry *igeo = M_IGEO(mp);
- int error = 0;
- int nr_found;
- int clcount = 0;
- int i;
-
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-
- cilist_size = igeo->inodes_per_cluster * sizeof(struct xfs_inode *);
- cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS);
- if (!cilist)
- goto out_put;
-
- mask = ~(igeo->inodes_per_cluster - 1);
- first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
- rcu_read_lock();
- /* really need a gang lookup range call here */
- nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
- first_index, igeo->inodes_per_cluster);
- if (nr_found == 0)
- goto out_free;
-
- for (i = 0; i < nr_found; i++) {
- cip = cilist[i];
- if (cip == ip)
- continue;
-
- /*
- * because this is an RCU protected lookup, we could find a
- * recently freed or even reallocated inode during the lookup.
- * We need to check under the i_flags_lock for a valid inode
- * here. Skip it if it is not valid or the wrong inode.
- */
- spin_lock(&cip->i_flags_lock);
- if (!cip->i_ino ||
- __xfs_iflags_test(cip, XFS_ISTALE)) {
- spin_unlock(&cip->i_flags_lock);
- continue;
- }
-
- /*
- * Once we fall off the end of the cluster, no point checking
- * any more inodes in the list because they will also all be
- * outside the cluster.
- */
- if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
- spin_unlock(&cip->i_flags_lock);
- break;
- }
- spin_unlock(&cip->i_flags_lock);
-
- /*
- * Do an un-protected check to see if the inode is dirty and
- * is a candidate for flushing. These checks will be repeated
- * later after the appropriate locks are acquired.
- */
- if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0)
- continue;
-
- /*
- * Try to get locks. If any are unavailable or it is pinned,
- * then this inode cannot be flushed and is skipped.
- */
-
- if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED))
- continue;
- if (!xfs_iflock_nowait(cip)) {
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
- continue;
- }
- if (xfs_ipincount(cip)) {
- xfs_ifunlock(cip);
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
- continue;
- }
-
-
- /*
- * Check the inode number again, just to be certain we are not
- * racing with freeing in xfs_reclaim_inode(). See the comments
- * in that function for more information as to why the initial
- * check is not sufficient.
- */
- if (!cip->i_ino) {
- xfs_ifunlock(cip);
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
- continue;
- }
-
- /*
- * arriving here means that this inode can be flushed. First
- * re-check that it's dirty before flushing.
- */
- if (!xfs_inode_clean(cip)) {
- error = xfs_iflush_int(cip, bp);
- if (error) {
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
- goto out_free;
- }
- clcount++;
- } else {
- xfs_ifunlock(cip);
- }
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
- }
-
- if (clcount) {
- XFS_STATS_INC(mp, xs_icluster_flushcnt);
- XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
- }
-
-out_free:
- rcu_read_unlock();
- kmem_free(cilist);
-out_put:
- xfs_perag_put(pag);
- return error;
-}
-
-/*
- * Flush dirty inode metadata into the backing buffer.
- *
- * The caller must have the inode lock and the inode flush lock held. The
- * inode lock will still be held upon return to the caller, and the inode
- * flush lock will be released after the inode has reached the disk.
- *
- * The caller must write out the buffer returned in *bpp and release it.
- */
-int
+static int
xfs_iflush(
struct xfs_inode *ip,
- struct xfs_buf **bpp)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_buf *bp = NULL;
- struct xfs_dinode *dip;
- int error;
-
- XFS_STATS_INC(mp, xs_iflush_count);
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
- ASSERT(xfs_isiflocked(ip));
- ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
- ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
-
- *bpp = NULL;
-
- xfs_iunpin_wait(ip);
-
- /*
- * For stale inodes we cannot rely on the backing buffer remaining
- * stale in cache for the remaining life of the stale inode and so
- * xfs_imap_to_bp() below may give us a buffer that no longer contains
- * inodes below. We have to check this after ensuring the inode is
- * unpinned so that it is safe to reclaim the stale inode after the
- * flush call.
- */
- if (xfs_iflags_test(ip, XFS_ISTALE)) {
- xfs_ifunlock(ip);
- return 0;
- }
-
- /*
- * Get the buffer containing the on-disk inode. We are doing a try-lock
- * operation here, so we may get an EAGAIN error. In that case, return
- * leaving the inode dirty.
- *
- * If we get any other error, we effectively have a corruption situation
- * and we cannot flush the inode. Abort the flush and shut down.
- */
- error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK);
- if (error == -EAGAIN) {
- xfs_ifunlock(ip);
- return error;
- }
- if (error)
- goto abort;
-
- /*
- * If the buffer is pinned then push on the log now so we won't
- * get stuck waiting in the write for too long.
- */
- if (xfs_buf_ispinned(bp))
- xfs_log_force(mp, 0);
-
- /*
- * Flush the provided inode then attempt to gather others from the
- * cluster into the write.
- *
- * Note: Once we attempt to flush an inode, we must run buffer
- * completion callbacks on any failure. If this fails, simulate an I/O
- * failure on the buffer and shut down.
- */
- error = xfs_iflush_int(ip, bp);
- if (!error)
- error = xfs_iflush_cluster(ip, bp);
- if (error) {
- bp->b_flags |= XBF_ASYNC;
- xfs_buf_ioend_fail(bp);
- goto shutdown;
- }
-
- *bpp = bp;
- return 0;
-
-abort:
- xfs_iflush_abort(ip);
-shutdown:
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- return error;
-}
-
-STATIC int
-xfs_iflush_int(
- struct xfs_inode *ip,
struct xfs_buf *bp)
{
struct xfs_inode_log_item *iip = ip->i_itemp;
@@ -3706,7 +3446,7 @@ xfs_iflush_int(
ASSERT(xfs_isiflocked(ip));
ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
- ASSERT(iip != NULL && iip->ili_fields != 0);
+ ASSERT(iip->ili_item.li_buf == bp);
dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
@@ -3801,7 +3541,6 @@ xfs_iflush_int(
xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
if (XFS_IFORK_Q(ip))
xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
- xfs_inobp_check(mp, bp);
/*
* We've recorded everything logged in the inode, so we'd like to clear
@@ -3818,40 +3557,148 @@ xfs_iflush_int(
* know that the information those bits represent is permanently on
* disk. As long as the flush completes before the inode is logged
* again, then both ili_fields and ili_last_fields will be cleared.
- *
- * We can play with the ili_fields bits here, because the inode lock
- * must be held exclusively in order to set bits there and the flush
- * lock protects the ili_last_fields bits. Set ili_logged so the flush
- * done routine can tell whether or not to look in the AIL. Also, store
- * the current LSN of the inode so that we can tell whether the item has
- * moved in the AIL from xfs_iflush_done(). In order to read the lsn we
- * need the AIL lock, because it is a 64 bit value that cannot be read
- * atomically.
*/
error = 0;
flush_out:
+ spin_lock(&iip->ili_lock);
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
iip->ili_fsync_fields = 0;
- iip->ili_logged = 1;
+ spin_unlock(&iip->ili_lock);
+ /*
+ * Store the current LSN of the inode so that we can tell whether the
+ * item has moved in the AIL from xfs_iflush_done().
+ */
xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
&iip->ili_item.li_lsn);
+ /* generate the checksum. */
+ xfs_dinode_calc_crc(mp, dip);
+ return error;
+}
+
+/*
+ * Non-blocking flush of dirty inode metadata into the backing buffer.
+ *
+ * The caller must have a reference to the inode and hold the cluster buffer
+ * locked. The function will walk across all the inodes on the cluster buffer it
+ * can find and lock without blocking, and flush them to the cluster buffer.
+ *
+ * On successful flushing of at least one inode, the caller must write out the
+ * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and
+ * the caller needs to release the buffer. On failure, the filesystem will be
+ * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED
+ * will be returned.
+ */
+int
+xfs_iflush_cluster(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_log_item *lip, *n;
+ struct xfs_inode *ip;
+ struct xfs_inode_log_item *iip;
+ int clcount = 0;
+ int error = 0;
+
/*
- * Attach the inode item callback to the buffer whether the flush
- * succeeded or not. If not, the caller will shut down and fail I/O
- * completion on the buffer to remove the inode from the AIL and release
- * the flush lock.
+ * We must use the safe variant here as on shutdown xfs_iflush_abort()
+ * can remove itself from the list.
*/
- xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
+ list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
+ iip = (struct xfs_inode_log_item *)lip;
+ ip = iip->ili_inode;
- /* generate the checksum. */
- xfs_dinode_calc_crc(mp, dip);
+ /*
+ * Quick and dirty check to avoid locks if possible.
+ */
+ if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK))
+ continue;
+ if (xfs_ipincount(ip))
+ continue;
+
+ /*
+ * The inode is still attached to the buffer, which means it is
+ * dirty but reclaim might try to grab it. Check carefully for
+ * that, and grab the ilock while still holding the i_flags_lock
+ * to guarantee reclaim will not be able to reclaim this inode
+ * once we drop the i_flags_lock.
+ */
+ spin_lock(&ip->i_flags_lock);
+ ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
+ if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) {
+ spin_unlock(&ip->i_flags_lock);
+ continue;
+ }
+
+ /*
+ * ILOCK will pin the inode against reclaim and prevent
+ * concurrent transactions modifying the inode while we are
+ * flushing the inode.
+ */
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+ spin_unlock(&ip->i_flags_lock);
+ continue;
+ }
+ spin_unlock(&ip->i_flags_lock);
+
+ /*
+ * Skip inodes that are already flush locked as they have
+ * already been written to the buffer.
+ */
+ if (!xfs_iflock_nowait(ip)) {
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ continue;
+ }
+
+ /*
+ * Abort flushing this inode if we are shut down because the
+ * inode may not currently be in the AIL. This can occur when
+ * log I/O failure unpins the inode without inserting into the
+ * AIL, leaving a dirty/unpinned inode attached to the buffer
+ * that otherwise looks like it should be flushed.
+ */
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ xfs_iunpin_wait(ip);
+ /* xfs_iflush_abort() drops the flush lock */
+ xfs_iflush_abort(ip);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ error = -EIO;
+ continue;
+ }
+
+ /* don't block waiting on a log force to unpin dirty inodes */
+ if (xfs_ipincount(ip)) {
+ xfs_ifunlock(ip);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ continue;
+ }
+
+ if (!xfs_inode_clean(ip))
+ error = xfs_iflush(ip, bp);
+ else
+ xfs_ifunlock(ip);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ if (error)
+ break;
+ clcount++;
+ }
+
+ if (error) {
+ bp->b_flags |= XBF_ASYNC;
+ xfs_buf_ioend_fail(bp);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return error;
+ }
+
+ if (!clcount)
+ return -EAGAIN;
+
+ XFS_STATS_INC(mp, xs_icluster_flushcnt);
+ XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
+ return 0;
- ASSERT(!list_empty(&bp->b_li_list));
- ASSERT(bp->b_iodone != NULL);
- return error;
}
/* Release an inode. */
@@ -3881,3 +3728,96 @@ xfs_log_force_inode(
return 0;
return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL);
}
+
+/*
+ * Grab the exclusive iolock for a data copy from src to dest, making sure to
+ * abide vfs locking order (lowest pointer value goes first) and breaking the
+ * layout leases before proceeding. The loop is needed because we cannot call
+ * the blocking break_layout() with the iolocks held, and therefore have to
+ * back out both locks.
+ */
+static int
+xfs_iolock_two_inodes_and_break_layout(
+ struct inode *src,
+ struct inode *dest)
+{
+ int error;
+
+ if (src > dest)
+ swap(src, dest);
+
+retry:
+ /* Wait to break both inodes' layouts before we start locking. */
+ error = break_layout(src, true);
+ if (error)
+ return error;
+ if (src != dest) {
+ error = break_layout(dest, true);
+ if (error)
+ return error;
+ }
+
+ /* Lock one inode and make sure nobody got in and leased it. */
+ inode_lock(src);
+ error = break_layout(src, false);
+ if (error) {
+ inode_unlock(src);
+ if (error == -EWOULDBLOCK)
+ goto retry;
+ return error;
+ }
+
+ if (src == dest)
+ return 0;
+
+ /* Lock the other inode and make sure nobody got in and leased it. */
+ inode_lock_nested(dest, I_MUTEX_NONDIR2);
+ error = break_layout(dest, false);
+ if (error) {
+ inode_unlock(src);
+ inode_unlock(dest);
+ if (error == -EWOULDBLOCK)
+ goto retry;
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Lock two inodes so that userspace cannot initiate I/O via file syscalls or
+ * mmap activity.
+ */
+int
+xfs_ilock2_io_mmap(
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ int ret;
+
+ ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
+ if (ret)
+ return ret;
+ if (ip1 == ip2)
+ xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
+ else
+ xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL,
+ ip2, XFS_MMAPLOCK_EXCL);
+ return 0;
+}
+
+/* Unlock both inodes to allow IO and mmap activity. */
+void
+xfs_iunlock2_io_mmap(
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ bool same_inode = (ip1 == ip2);
+
+ xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
+ if (!same_inode)
+ xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
+ inode_unlock(VFS_I(ip2));
+ if (!same_inode)
+ inode_unlock(VFS_I(ip1));
+}