about summary refs log tree commit diff stats
path: root/fs/xfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs')
-rw-r--r-- fs/xfs/libxfs/xfs_attr.c 264
-rw-r--r-- fs/xfs/libxfs/xfs_attr.h (renamed from fs/xfs/xfs_attr.h) 2
-rw-r--r-- fs/xfs/libxfs/xfs_attr_remote.c 10
-rw-r--r-- fs/xfs/libxfs/xfs_bmap.c 94
-rw-r--r-- fs/xfs/libxfs/xfs_bmap.h 1
-rw-r--r-- fs/xfs/libxfs/xfs_format.h 10
-rw-r--r-- fs/xfs/libxfs/xfs_inode_buf.c 30
-rw-r--r-- fs/xfs/libxfs/xfs_sb.c 5
-rw-r--r-- fs/xfs/scrub/alloc.c 1
-rw-r--r-- fs/xfs/scrub/inode.c 4
-rw-r--r-- fs/xfs/scrub/repair.c 128
-rw-r--r-- fs/xfs/scrub/scrub.c 13
-rw-r--r-- fs/xfs/xfs_aops.c 4
-rw-r--r-- fs/xfs/xfs_aops.h 14
-rw-r--r-- fs/xfs/xfs_bmap_util.c 81
-rw-r--r-- fs/xfs/xfs_buf.c 109
-rw-r--r-- fs/xfs/xfs_buf.h 2
-rw-r--r-- fs/xfs/xfs_buf_item.c 119
-rw-r--r-- fs/xfs/xfs_buf_item.h 1
-rw-r--r-- fs/xfs/xfs_fsops.c 50
-rw-r--r-- fs/xfs/xfs_inode.c 10
-rw-r--r-- fs/xfs/xfs_ioctl.c 8
-rw-r--r-- fs/xfs/xfs_iomap.c 53
-rw-r--r-- fs/xfs/xfs_iops.c 12
-rw-r--r-- fs/xfs/xfs_log_recover.c 10
-rw-r--r-- fs/xfs/xfs_reflink.c 362
-rw-r--r-- fs/xfs/xfs_reflink.h 4
-rw-r--r-- fs/xfs/xfs_stats.c 52
-rw-r--r-- fs/xfs/xfs_stats.h 28
-rw-r--r-- fs/xfs/xfs_super.c 38
-rw-r--r-- fs/xfs/xfs_trace.h 1
-rw-r--r-- fs/xfs/xfs_trans.c 10
-rw-r--r-- fs/xfs/xfs_trans.h 1
-rw-r--r-- fs/xfs/xfs_trans_ail.c 28
-rw-r--r-- fs/xfs/xfs_trans_buf.c 141
35 files changed, 1023 insertions, 677 deletions
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 1e671d4eb6fa..844ed87b1900 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -191,6 +191,128 @@ xfs_attr_calc_size(
return nblks;
}
+STATIC int
+xfs_attr_try_sf_addname(
+ struct xfs_inode *dp,
+ struct xfs_da_args *args)
+{
+ /*
+ * Try to add the attribute described by @args to the shortform
+ * attribute list of @dp.
+ *
+ * Returns -ENOSPC, without committing args->trans, when the shortform
+ * add reports that the attribute does not fit. For every other
+ * outcome args->trans is committed and then set to NULL, and the
+ * return value is the shortform error if there was one, otherwise the
+ * result of the commit.
+ */
+ struct xfs_mount *mp = dp->i_mount;
+ int error, error2;
+
+ error = xfs_attr_shortform_addname(args);
+ if (error == -ENOSPC)
+ return error;
+
+ /*
+ * Commit the shortform mods, and we're done.
+ * NOTE: this is also the error path (EEXIST, etc).
+ *
+ * The change time is only updated when the add succeeded and the
+ * caller did not pass ATTR_KERNOTIME.
+ */
+ if (!error && (args->flags & ATTR_KERNOTIME) == 0)
+ xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
+
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
+ xfs_trans_set_sync(args->trans);
+
+ error2 = xfs_trans_commit(args->trans);
+ args->trans = NULL; /* lets the caller see that the commit already happened */
+ return error ? error : error2;
+}
+
+/*
+ * Set the attribute specified in @args.
+ *
+ * When the attr fork is in local format, or in extents format with no
+ * extents, a shortform add is attempted first. Any result other than
+ * -ENOSPC is returned straight away; in that case
+ * xfs_attr_try_sf_addname() has already committed the transaction and set
+ * args->trans to NULL.
+ *
+ * On -ENOSPC the shortform list is converted to a leaf block. The leaf
+ * buffer comes back through @leaf_bp, is held across xfs_defer_finish() and
+ * the transaction roll, is joined to the rolled transaction, and *@leaf_bp
+ * is then reset to NULL. If any of those steps fails the error is returned
+ * and *@leaf_bp is not cleared here.
+ *
+ * Finally the attribute is added with xfs_attr_leaf_addname() when
+ * xfs_bmap_one_block() is true for the attr fork, and with
+ * xfs_attr_node_addname() otherwise; that result is the return value.
+ */
+int
+xfs_attr_set_args(
+ struct xfs_da_args *args,
+ struct xfs_buf **leaf_bp)
+{
+ struct xfs_inode *dp = args->dp;
+ int error;
+
+ /*
+ * If the attribute list is non-existent or a shortform list,
+ * upgrade it to a single-leaf-block attribute list.
+ */
+ if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL ||
+ (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+ dp->i_d.di_anextents == 0)) {
+
+ /*
+ * Build initial attribute list (if required).
+ */
+ if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS)
+ xfs_attr_shortform_create(args);
+
+ /*
+ * Try to add the attr to the attribute list in the inode.
+ * Only -ENOSPC falls through to the leaf conversion below;
+ * success and all other errors return from here.
+ */
+ error = xfs_attr_try_sf_addname(dp, args);
+ if (error != -ENOSPC)
+ return error;
+
+ /*
+ * It won't fit in the shortform, transform to a leaf block.
+ * GROT: another possible req'mt for a double-split btree op.
+ */
+ error = xfs_attr_shortform_to_leaf(args, leaf_bp);
+ if (error)
+ return error;
+
+ /*
+ * Prevent the leaf buffer from being unlocked so that a
+ * concurrent AIL push cannot grab the half-baked leaf
+ * buffer and run into problems with the write verifier.
+ */
+ xfs_trans_bhold(args->trans, *leaf_bp);
+
+ error = xfs_defer_finish(&args->trans);
+ if (error)
+ return error;
+
+ /*
+ * Commit the leaf transformation. We'll need another
+ * (linked) transaction to add the new attribute to the
+ * leaf. The held leaf buffer is joined to that new
+ * transaction once the roll has succeeded.
+ */
+ error = xfs_trans_roll_inode(&args->trans, dp);
+ if (error)
+ return error;
+ xfs_trans_bjoin(args->trans, *leaf_bp);
+ *leaf_bp = NULL;
+ }
+
+ if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
+ error = xfs_attr_leaf_addname(args);
+ else
+ error = xfs_attr_node_addname(args);
+ return error;
+}
+
+/*
+ * Remove the attribute specified in @args.
+ *
+ * Returns -ENOATTR when xfs_inode_hasattr() is false for the inode.
+ * Otherwise the removal is dispatched on the attr fork layout: the
+ * shortform routine for XFS_DINODE_FMT_LOCAL, the leaf routine when
+ * xfs_bmap_one_block() is true for the attr fork, and the node routine in
+ * every other case. The result of the chosen routine is returned as is;
+ * this function does not itself commit or cancel the transaction in @args.
+ */
+int
+xfs_attr_remove_args(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp = args->dp;
+ int error;
+
+ if (!xfs_inode_hasattr(dp)) {
+ error = -ENOATTR;
+ } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+ ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
+ error = xfs_attr_shortform_remove(args);
+ } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+ error = xfs_attr_leaf_removename(args);
+ } else {
+ error = xfs_attr_node_removename(args);
+ }
+
+ return error;
+}
+
int
xfs_attr_set(
struct xfs_inode *dp,
@@ -204,7 +326,7 @@ xfs_attr_set(
struct xfs_da_args args;
struct xfs_trans_res tres;
int rsvd = (flags & ATTR_ROOT) != 0;
- int error, err2, local;
+ int error, local;
XFS_STATS_INC(mp, xs_attr_set);
@@ -255,93 +377,17 @@ xfs_attr_set(
error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
XFS_QMOPT_RES_REGBLKS);
- if (error) {
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
- xfs_trans_cancel(args.trans);
- return error;
- }
+ if (error)
+ goto out_trans_cancel;
xfs_trans_ijoin(args.trans, dp, 0);
-
- /*
- * If the attribute list is non-existent or a shortform list,
- * upgrade it to a single-leaf-block attribute list.
- */
- if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL ||
- (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
- dp->i_d.di_anextents == 0)) {
-
- /*
- * Build initial attribute list (if required).
- */
- if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS)
- xfs_attr_shortform_create(&args);
-
- /*
- * Try to add the attr to the attribute list in
- * the inode.
- */
- error = xfs_attr_shortform_addname(&args);
- if (error != -ENOSPC) {
- /*
- * Commit the shortform mods, and we're done.
- * NOTE: this is also the error path (EEXIST, etc).
- */
- ASSERT(args.trans != NULL);
-
- /*
- * If this is a synchronous mount, make sure that
- * the transaction goes to disk before returning
- * to the user.
- */
- if (mp->m_flags & XFS_MOUNT_WSYNC)
- xfs_trans_set_sync(args.trans);
-
- if (!error && (flags & ATTR_KERNOTIME) == 0) {
- xfs_trans_ichgtime(args.trans, dp,
- XFS_ICHGTIME_CHG);
- }
- err2 = xfs_trans_commit(args.trans);
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
- return error ? error : err2;
- }
-
- /*
- * It won't fit in the shortform, transform to a leaf block.
- * GROT: another possible req'mt for a double-split btree op.
- */
- error = xfs_attr_shortform_to_leaf(&args, &leaf_bp);
- if (error)
- goto out;
- /*
- * Prevent the leaf buffer from being unlocked so that a
- * concurrent AIL push cannot grab the half-baked leaf
- * buffer and run into problems with the write verifier.
- */
- xfs_trans_bhold(args.trans, leaf_bp);
- error = xfs_defer_finish(&args.trans);
- if (error)
- goto out;
-
- /*
- * Commit the leaf transformation. We'll need another (linked)
- * transaction to add the new attribute to the leaf, which
- * means that we have to hold & join the leaf buffer here too.
- */
- error = xfs_trans_roll_inode(&args.trans, dp);
- if (error)
- goto out;
- xfs_trans_bjoin(args.trans, leaf_bp);
- leaf_bp = NULL;
- }
-
- if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
- error = xfs_attr_leaf_addname(&args);
- else
- error = xfs_attr_node_addname(&args);
+ error = xfs_attr_set_args(&args, &leaf_bp);
if (error)
- goto out;
+ goto out_release_leaf;
+ if (!args.trans) {
+ /* shortform attribute has already been committed */
+ goto out_unlock;
+ }
/*
* If this is a synchronous mount, make sure that the
@@ -358,17 +404,17 @@ xfs_attr_set(
*/
xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
error = xfs_trans_commit(args.trans);
+out_unlock:
xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
return error;
-out:
+out_release_leaf:
if (leaf_bp)
xfs_trans_brelse(args.trans, leaf_bp);
+out_trans_cancel:
if (args.trans)
xfs_trans_cancel(args.trans);
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return error;
+ goto out_unlock;
}
/*
@@ -423,17 +469,7 @@ xfs_attr_remove(
*/
xfs_trans_ijoin(args.trans, dp, 0);
- if (!xfs_inode_hasattr(dp)) {
- error = -ENOATTR;
- } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
- ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
- error = xfs_attr_shortform_remove(&args);
- } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
- error = xfs_attr_leaf_removename(&args);
- } else {
- error = xfs_attr_node_removename(&args);
- }
-
+ error = xfs_attr_remove_args(&args);
if (error)
goto out;
@@ -587,7 +623,7 @@ xfs_attr_leaf_addname(
*/
error = xfs_attr3_leaf_to_node(args);
if (error)
- goto out_defer_cancel;
+ return error;
error = xfs_defer_finish(&args->trans);
if (error)
return error;
@@ -675,7 +711,7 @@ xfs_attr_leaf_addname(
error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
if (error)
- goto out_defer_cancel;
+ return error;
error = xfs_defer_finish(&args->trans);
if (error)
return error;
@@ -693,9 +729,6 @@ xfs_attr_leaf_addname(
error = xfs_attr3_leaf_clearflag(args);
}
return error;
-out_defer_cancel:
- xfs_defer_cancel(args->trans);
- return error;
}
/*
@@ -738,15 +771,12 @@ xfs_attr_leaf_removename(
error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
if (error)
- goto out_defer_cancel;
+ return error;
error = xfs_defer_finish(&args->trans);
if (error)
return error;
}
return 0;
-out_defer_cancel:
- xfs_defer_cancel(args->trans);
- return error;
}
/*
@@ -864,7 +894,7 @@ restart:
state = NULL;
error = xfs_attr3_leaf_to_node(args);
if (error)
- goto out_defer_cancel;
+ goto out;
error = xfs_defer_finish(&args->trans);
if (error)
goto out;
@@ -888,7 +918,7 @@ restart:
*/
error = xfs_da3_split(state);
if (error)
- goto out_defer_cancel;
+ goto out;
error = xfs_defer_finish(&args->trans);
if (error)
goto out;
@@ -984,7 +1014,7 @@ restart:
if (retval && (state->path.active > 1)) {
error = xfs_da3_join(state);
if (error)
- goto out_defer_cancel;
+ goto out;
error = xfs_defer_finish(&args->trans);
if (error)
goto out;
@@ -1013,9 +1043,6 @@ out:
if (error)
return error;
return retval;
-out_defer_cancel:
- xfs_defer_cancel(args->trans);
- goto out;
}
/*
@@ -1107,7 +1134,7 @@ xfs_attr_node_removename(
if (retval && (state->path.active > 1)) {
error = xfs_da3_join(state);
if (error)
- goto out_defer_cancel;
+ goto out;
error = xfs_defer_finish(&args->trans);
if (error)
goto out;
@@ -1138,7 +1165,7 @@ xfs_attr_node_removename(
error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
if (error)
- goto out_defer_cancel;
+ goto out;
error = xfs_defer_finish(&args->trans);
if (error)
goto out;
@@ -1150,9 +1177,6 @@ xfs_attr_node_removename(
out:
xfs_da_state_free(state);
return error;
-out_defer_cancel:
- xfs_defer_cancel(args->trans);
- goto out;
}
/*
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index 033ff8c478e2..bdf52a333f3f 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -140,7 +140,9 @@ int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
unsigned char *value, int *valuelenp, int flags);
int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
unsigned char *value, int valuelen, int flags);
+int xfs_attr_set_args(struct xfs_da_args *args, struct xfs_buf **leaf_bp);
int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
+int xfs_attr_remove_args(struct xfs_da_args *args);
int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
int flags, struct attrlist_cursor_kern *cursor);
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index af094063e402..d89363c6b523 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -485,7 +485,7 @@ xfs_attr_rmtval_set(
blkcnt, XFS_BMAPI_ATTRFORK, args->total, &map,
&nmap);
if (error)
- goto out_defer_cancel;
+ return error;
error = xfs_defer_finish(&args->trans);
if (error)
return error;
@@ -553,9 +553,6 @@ xfs_attr_rmtval_set(
}
ASSERT(valuelen == 0);
return 0;
-out_defer_cancel:
- xfs_defer_cancel(args->trans);
- return error;
}
/*
@@ -625,7 +622,7 @@ xfs_attr_rmtval_remove(
error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
XFS_BMAPI_ATTRFORK, 1, &done);
if (error)
- goto out_defer_cancel;
+ return error;
error = xfs_defer_finish(&args->trans);
if (error)
return error;
@@ -638,7 +635,4 @@ xfs_attr_rmtval_remove(
return error;
}
return 0;
-out_defer_cancel:
- xfs_defer_cancel(args->trans);
- return error;
}
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 2760314fdf7f..74d7228e755b 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -673,7 +673,8 @@ xfs_bmap_extents_to_btree(
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
/*
- * Make space in the inode incore.
+ * Make space in the inode incore. This needs to be undone if we fail
+ * to expand the root.
*/
xfs_iroot_realloc(ip, 1, whichfork);
ifp->if_flags |= XFS_IFBROOT;
@@ -711,16 +712,15 @@ xfs_bmap_extents_to_btree(
args.minlen = args.maxlen = args.prod = 1;
args.wasdel = wasdel;
*logflagsp = 0;
- if ((error = xfs_alloc_vextent(&args))) {
- ASSERT(ifp->if_broot == NULL);
- goto err1;
- }
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ goto out_root_realloc;
if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) {
- ASSERT(ifp->if_broot == NULL);
error = -ENOSPC;
- goto err1;
+ goto out_root_realloc;
}
+
/*
* Allocation can't fail, the space was reserved.
*/
@@ -732,9 +732,10 @@ xfs_bmap_extents_to_btree(
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
if (!abp) {
- error = -ENOSPC;
- goto err2;
+ error = -EFSCORRUPTED;
+ goto out_unreserve_dquot;
}
+
/*
* Fill in the child block.
*/
@@ -775,11 +776,12 @@ xfs_bmap_extents_to_btree(
*logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork);
return 0;
-err2:
+out_unreserve_dquot:
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
-err1:
+out_root_realloc:
xfs_iroot_realloc(ip, -1, whichfork);
XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+ ASSERT(ifp->if_broot == NULL);
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
return error;
@@ -1017,6 +1019,34 @@ xfs_bmap_add_attrfork_local(
return -EFSCORRUPTED;
}
+/*
+ * Set an inode's attr fork offset (di_forkoff) based on the data fork format.
+ *
+ * XFS_DINODE_FMT_DEV: the offset is sizeof(xfs_dev_t) rounded up to a
+ * multiple of 8, expressed in 8-byte units.
+ *
+ * XFS_DINODE_FMT_LOCAL, _EXTENTS and _BTREE: the offset is whatever
+ * xfs_attr_shortform_bytesfit() returns for @size. If that is zero, the
+ * offset falls back to xfs_default_attroffset() in 8-byte units.
+ * Otherwise, on an XFS_MOUNT_ATTR2 mount with a non-NULL @version,
+ * *@version is set to 2.
+ *
+ * @version may be NULL, in which case it is never written.
+ *
+ * Returns 0 on success, or -EINVAL (after ASSERT(0)) for any other format.
+ */
+int
+xfs_bmap_set_attrforkoff(
+ struct xfs_inode *ip,
+ int size,
+ int *version)
+{
+ switch (ip->i_d.di_format) {
+ case XFS_DINODE_FMT_DEV:
+ ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
+ if (!ip->i_d.di_forkoff)
+ ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
+ else if ((ip->i_mount->m_flags & XFS_MOUNT_ATTR2) && version)
+ *version = 2;
+ break;
+ default:
+ ASSERT(0);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
/*
* Convert inode from non-attributed to attributed.
* Must not be in a transaction, ip must not be locked.
@@ -1068,26 +1098,9 @@ xfs_bmap_add_attrfork(
xfs_trans_ijoin(tp, ip, 0);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
- switch (ip->i_d.di_format) {
- case XFS_DINODE_FMT_DEV:
- ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
- break;
- case XFS_DINODE_FMT_LOCAL:
- case XFS_DINODE_FMT_EXTENTS:
- case XFS_DINODE_FMT_BTREE:
- ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
- if (!ip->i_d.di_forkoff)
- ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
- else if (mp->m_flags & XFS_MOUNT_ATTR2)
- version = 2;
- break;
- default:
- ASSERT(0);
- error = -EINVAL;
+ error = xfs_bmap_set_attrforkoff(ip, size, &version);
+ if (error)
goto trans_cancel;
- }
-
ASSERT(ip->i_afp == NULL);
ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
ip->i_afp->if_flags = XFS_IFEXTENTS;
@@ -4079,8 +4092,7 @@ xfs_bmapi_allocate(
* extents to real extents when we're about to write the data.
*/
if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) &&
- (bma->flags & XFS_BMAPI_PREALLOC) &&
- xfs_sb_version_hasextflgbit(&mp->m_sb))
+ (bma->flags & XFS_BMAPI_PREALLOC))
bma->got.br_state = XFS_EXT_UNWRITTEN;
if (bma->wasdel)
@@ -5243,8 +5255,7 @@ __xfs_bunmapi(
* unmapping part of it. But we can't really
* get rid of part of a realtime extent.
*/
- if (del.br_state == XFS_EXT_UNWRITTEN ||
- !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+ if (del.br_state == XFS_EXT_UNWRITTEN) {
/*
* This piece is unwritten, or we're not
* using unwritten extents. Skip over it.
@@ -5294,10 +5305,9 @@ __xfs_bunmapi(
del.br_blockcount -= mod;
del.br_startoff += mod;
del.br_startblock += mod;
- } else if ((del.br_startoff == start &&
- (del.br_state == XFS_EXT_UNWRITTEN ||
- tp->t_blk_res == 0)) ||
- !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+ } else if (del.br_startoff == start &&
+ (del.br_state == XFS_EXT_UNWRITTEN ||
+ tp->t_blk_res == 0)) {
/*
* Can't make it unwritten. There isn't
* a full extent here so just skip it.
@@ -6112,11 +6122,7 @@ xfs_bmap_validate_extent(
XFS_FSB_TO_AGNO(mp, endfsb))
return __this_address;
}
- if (irec->br_state != XFS_EXT_NORM) {
- if (whichfork != XFS_DATA_FORK)
- return __this_address;
- if (!xfs_sb_version_hasextflgbit(&mp->m_sb))
- return __this_address;
- }
+ if (irec->br_state != XFS_EXT_NORM && whichfork != XFS_DATA_FORK)
+ return __this_address;
return NULL;
}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index b6e9b639e731..488dc8860fd7 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -183,6 +183,7 @@ void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
xfs_filblks_t len);
void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *);
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
+int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version);
void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno,
xfs_filblks_t len, struct xfs_owner_info *oinfo,
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 059bc44c27e8..9995d5ae380b 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -287,6 +287,8 @@ static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp)
{
if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
return false;
+ if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT))
+ return false;
/* check for unknown features in the fs */
if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
@@ -357,12 +359,6 @@ static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp)
(sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
}
-static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp)
-{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
- (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
-}
-
static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp)
{
return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
@@ -1016,6 +1012,8 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */
#define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */
#define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */
+/* Do not use bit 15, di_flags is legacy and unchanging now */
+
#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT)
#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT)
#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT)
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 30d1d60f1d46..09d9c8cfa4a0 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -415,6 +415,31 @@ xfs_dinode_verify_fork(
return NULL;
}
+static xfs_failaddr_t
+xfs_dinode_verify_forkoff(
+ struct xfs_dinode *dip,
+ struct xfs_mount *mp)
+{
+ if (!XFS_DFORK_Q(dip))
+ return NULL;
+ /*
+ * Validate di_forkoff against the data fork format; nothing is
+ * checked when XFS_DFORK_Q() is false (presumably meaning the inode
+ * has no attr fork -- confirm against the macro definition).
+ *
+ * DEV inodes must use exactly sizeof(xfs_dev_t) rounded up to a
+ * multiple of 8, in 8-byte units. LOCAL, EXTENTS and BTREE inodes
+ * must have an offset strictly below
+ * XFS_LITINO(mp, dip->di_version) >> 3. Any other format is
+ * rejected.
+ *
+ * Returns NULL when the offset is acceptable, __this_address at the
+ * first failed check.
+ */
+ switch (dip->di_format) {
+ case XFS_DINODE_FMT_DEV:
+ if (dip->di_forkoff != (roundup(sizeof(xfs_dev_t), 8) >> 3))
+ return __this_address;
+ break;
+ case XFS_DINODE_FMT_LOCAL: /* fall through ... */
+ case XFS_DINODE_FMT_EXTENTS: /* fall through ... */
+ case XFS_DINODE_FMT_BTREE:
+ if (dip->di_forkoff >= (XFS_LITINO(mp, dip->di_version) >> 3))
+ return __this_address;
+ break;
+ default:
+ return __this_address;
+ }
+ return NULL;
+}
+
xfs_failaddr_t
xfs_dinode_verify(
struct xfs_mount *mp,
@@ -470,6 +495,11 @@ xfs_dinode_verify(
if (mode && (flags & XFS_DIFLAG_REALTIME) && !mp->m_rtdev_targp)
return __this_address;
+ /* check for illegal values of forkoff */
+ fa = xfs_dinode_verify_forkoff(dip, mp);
+ if (fa)
+ return fa;
+
/* Do we have appropriate data fork formats for the mode? */
switch (mode & S_IFMT) {
case S_IFIFO:
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 081f46e30556..b5a82acd7dfe 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -1115,7 +1115,8 @@ xfs_fs_geometry(
geo->version = XFS_FSOP_GEOM_VERSION;
geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK |
- XFS_FSOP_GEOM_FLAGS_DIRV2;
+ XFS_FSOP_GEOM_FLAGS_DIRV2 |
+ XFS_FSOP_GEOM_FLAGS_EXTFLG;
if (xfs_sb_version_hasattr(sbp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR;
if (xfs_sb_version_hasquota(sbp))
@@ -1124,8 +1125,6 @@ xfs_fs_geometry(
geo->flags |= XFS_FSOP_GEOM_FLAGS_IALIGN;
if (xfs_sb_version_hasdalign(sbp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_DALIGN;
- if (xfs_sb_version_hasextflgbit(sbp))
- geo->flags |= XFS_FSOP_GEOM_FLAGS_EXTFLG;
if (xfs_sb_version_hassector(sbp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR;
if (xfs_sb_version_hasasciici(sbp))
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index 036b5c7021eb..376bcb585ae6 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -17,7 +17,6 @@
#include "xfs_sb.h"
#include "xfs_alloc.h"
#include "xfs_rmap.h"
-#include "xfs_alloc.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 5b3b177c0fc9..e386c9b0b4ab 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -126,6 +126,7 @@ xchk_inode_flags(
{
struct xfs_mount *mp = sc->mp;
+ /* di_flags are all taken, last bit cannot be used */
if (flags & ~XFS_DIFLAG_ANY)
goto bad;
@@ -172,8 +173,9 @@ xchk_inode_flags2(
{
struct xfs_mount *mp = sc->mp;
+ /* Unknown di_flags2 could be from a future kernel */
if (flags2 & ~XFS_DIFLAG2_ANY)
- goto bad;
+ xchk_ino_set_warning(sc, ino);
/* reflink flag requires reflink feature */
if ((flags2 & XFS_DIFLAG2_REFLINK) &&
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 9f08dd9bf1d5..4fc0a5ea7673 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -29,6 +29,8 @@
#include "xfs_ag_resv.h"
#include "xfs_trans_space.h"
#include "xfs_quota.h"
+#include "xfs_attr.h"
+#include "xfs_reflink.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -692,13 +694,14 @@ xrep_findroot_block(
struct xrep_find_ag_btree *fab,
uint64_t owner,
xfs_agblock_t agbno,
- bool *found_it)
+ bool *done_with_block)
{
struct xfs_mount *mp = ri->sc->mp;
struct xfs_buf *bp;
struct xfs_btree_block *btblock;
xfs_daddr_t daddr;
- int error;
+ int block_level;
+ int error = 0;
daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.agno, agbno);
@@ -717,36 +720,111 @@ xrep_findroot_block(
return error;
}
+ /*
+ * Read the buffer into memory so that we can see if it's a match for
+ * our btree type. We have no clue if it is beforehand, and we want to
+ * avoid xfs_trans_read_buf's behavior of dumping the DONE state (which
+ * will cause needless disk reads in subsequent calls to this function)
+ * and logging metadata verifier failures.
+ *
+ * Therefore, pass in NULL buffer ops. If the buffer was already in
+ * memory from some other caller it will already have b_ops assigned.
+ * If it was in memory from a previous unsuccessful findroot_block
+ * call, the buffer won't have b_ops but it should be clean and ready
+ * for us to try to verify if the read call succeeds. The same applies
+ * if the buffer wasn't in memory at all.
+ *
+ * Note: If we never match a btree type with this buffer, it will be
+ * left in memory with NULL b_ops. This shouldn't be a problem unless
+ * the buffer gets written.
+ */
error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
mp->m_bsize, 0, &bp, NULL);
if (error)
return error;
- /*
- * Does this look like a block matching our fs and higher than any
- * other block we've found so far? If so, reattach buffer verifiers
- * so the AIL won't complain if the buffer is also dirty.
- */
+ /* Ensure the block magic matches the btree type we're looking for. */
btblock = XFS_BUF_TO_BLOCK(bp);
if (be32_to_cpu(btblock->bb_magic) != fab->magic)
goto out;
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
- !uuid_equal(&btblock->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
- goto out;
- bp->b_ops = fab->buf_ops;
- /* Ignore this block if it's lower in the tree than we've seen. */
- if (fab->root != NULLAGBLOCK &&
- xfs_btree_get_level(btblock) < fab->height)
- goto out;
+ /*
+ * If the buffer already has ops applied and they're not the ones for
+ * this btree type, we know this block doesn't match the btree and we
+ * can bail out.
+ *
+ * If the buffer ops match ours, someone else has already validated
+ * the block for us, so we can move on to checking if this is a root
+ * block candidate.
+ *
+ * If the buffer does not have ops, nobody has successfully validated
+ * the contents and the buffer cannot be dirty. If the magic, uuid,
+ * and structure match this btree type then we'll move on to checking
+ * if it's a root block candidate. If there is no match, bail out.
+ */
+ if (bp->b_ops) {
+ if (bp->b_ops != fab->buf_ops)
+ goto out;
+ } else {
+ ASSERT(!xfs_trans_buf_is_dirty(bp));
+ if (!uuid_equal(&btblock->bb_u.s.bb_uuid,
+ &mp->m_sb.sb_meta_uuid))
+ goto out;
+ fab->buf_ops->verify_read(bp);
+ if (bp->b_error) {
+ bp->b_error = 0;
+ goto out;
+ }
- /* Make sure we pass the verifiers. */
- bp->b_ops->verify_read(bp);
- if (bp->b_error)
+ /*
+ * Some read verifiers will (re)set b_ops, so we must be
+ * careful not to blow away any such assignment.
+ */
+ if (!bp->b_ops)
+ bp->b_ops = fab->buf_ops;
+ }
+
+ /*
+ * This block passes the magic/uuid and verifier tests for this btree
+ * type. We don't need the caller to try the other tree types.
+ */
+ *done_with_block = true;
+
+ /*
+ * Compare this btree block's level to the height of the current
+ * candidate root block.
+ *
+ * If the level matches the root we found previously, throw away both
+ * blocks because there can't be two candidate roots.
+ *
+ * If level is lower in the tree than the root we found previously,
+ * ignore this block.
+ */
+ block_level = xfs_btree_get_level(btblock);
+ if (block_level + 1 == fab->height) {
+ fab->root = NULLAGBLOCK;
goto out;
- fab->root = agbno;
- fab->height = xfs_btree_get_level(btblock) + 1;
- *found_it = true;
+ } else if (block_level < fab->height) {
+ goto out;
+ }
+
+ /*
+ * This is the highest block in the tree that we've found so far.
+ * Update the btree height to reflect what we've learned from this
+ * block.
+ */
+ fab->height = block_level + 1;
+
+ /*
+ * If this block doesn't have sibling pointers, then it's the new root
+ * block candidate. Otherwise, the root will be found farther up the
+ * tree.
+ */
+ if (btblock->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) &&
+ btblock->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
+ fab->root = agbno;
+ else
+ fab->root = NULLAGBLOCK;
trace_xrep_findroot_block(mp, ri->sc->sa.agno, agbno,
be32_to_cpu(btblock->bb_magic), fab->height - 1);
@@ -768,7 +846,7 @@ xrep_findroot_rmap(
struct xrep_findroot *ri = priv;
struct xrep_find_ag_btree *fab;
xfs_agblock_t b;
- bool found_it;
+ bool done;
int error = 0;
/* Ignore anything that isn't AG metadata. */
@@ -777,16 +855,16 @@ xrep_findroot_rmap(
/* Otherwise scan each block + btree type. */
for (b = 0; b < rec->rm_blockcount; b++) {
- found_it = false;
+ done = false;
for (fab = ri->btree_info; fab->buf_ops; fab++) {
if (rec->rm_owner != fab->rmap_owner)
continue;
error = xrep_findroot_block(ri, fab,
rec->rm_owner, rec->rm_startblock + b,
- &found_it);
+ &done);
if (error)
return error;
- if (found_it)
+ if (done)
break;
}
}
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 4bfae1e61d30..1b2344d00525 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -412,19 +412,6 @@ xchk_validate_inputs(
goto out;
}
- error = -EOPNOTSUPP;
- /*
- * We won't scrub any filesystem that doesn't have the ability
- * to record unwritten extents. The option was made default in
- * 2003, removed from mkfs in 2007, and cannot be disabled in
- * v5, so if we find a filesystem without this flag it's either
- * really old or totally unsupported. Avoid it either way.
- * We also don't support v1-v3 filesystems, which aren't
- * mountable.
- */
- if (!xfs_sb_version_hasextflgbit(&mp->m_sb))
- goto out;
-
/*
* We only want to repair read-write v5+ filesystems. Defer the check
* for ops->repair until after our scrub confirms that we need to
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 49f5f5896a43..338b9d9984e0 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -917,7 +917,7 @@ xfs_vm_writepage(
struct writeback_control *wbc)
{
struct xfs_writepage_ctx wpc = {
- .io_type = XFS_IO_INVALID,
+ .io_type = XFS_IO_HOLE,
};
int ret;
@@ -933,7 +933,7 @@ xfs_vm_writepages(
struct writeback_control *wbc)
{
struct xfs_writepage_ctx wpc = {
- .io_type = XFS_IO_INVALID,
+ .io_type = XFS_IO_HOLE,
};
int ret;
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 9af867951a10..494b4338446e 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -12,21 +12,19 @@ extern struct bio_set xfs_ioend_bioset;
* Types of I/O for bmap clustering and I/O completion tracking.
*/
enum {
- XFS_IO_INVALID, /* initial state */
+ XFS_IO_HOLE, /* covers region without any block allocation */
XFS_IO_DELALLOC, /* covers delalloc region */
XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */
XFS_IO_OVERWRITE, /* covers already allocated extent */
XFS_IO_COW, /* covers copy-on-write extent */
- XFS_IO_HOLE, /* covers region without any block allocation */
};
#define XFS_IO_TYPES \
- { XFS_IO_INVALID, "invalid" }, \
- { XFS_IO_DELALLOC, "delalloc" }, \
- { XFS_IO_UNWRITTEN, "unwritten" }, \
- { XFS_IO_OVERWRITE, "overwrite" }, \
- { XFS_IO_COW, "CoW" }, \
- { XFS_IO_HOLE, "hole" }
+ { XFS_IO_HOLE, "hole" }, \
+ { XFS_IO_DELALLOC, "delalloc" }, \
+ { XFS_IO_UNWRITTEN, "unwritten" }, \
+ { XFS_IO_OVERWRITE, "overwrite" }, \
+ { XFS_IO_COW, "CoW" }
/*
* Structure for buffered I/O completions.
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index addbd74ecd8e..5d263dfdb3bc 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -406,10 +406,10 @@ xfs_getbmap_report_one(
struct xfs_bmbt_irec *got)
{
struct kgetbmap *p = out + bmv->bmv_entries;
- bool shared = false, trimmed = false;
+ bool shared = false;
int error;
- error = xfs_reflink_trim_around_shared(ip, got, &shared, &trimmed);
+ error = xfs_reflink_trim_around_shared(ip, got, &shared);
if (error)
return error;
@@ -702,13 +702,9 @@ xfs_bmap_punch_delalloc_range(
struct xfs_iext_cursor icur;
int error = 0;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
- if (error)
- goto out_unlock;
- }
+ ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
goto out_unlock;
@@ -1047,44 +1043,6 @@ out_trans_cancel:
}
static int
-xfs_adjust_extent_unmap_boundaries(
- struct xfs_inode *ip,
- xfs_fileoff_t *startoffset_fsb,
- xfs_fileoff_t *endoffset_fsb)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_bmbt_irec imap;
- int nimap, error;
- xfs_extlen_t mod = 0;
-
- nimap = 1;
- error = xfs_bmapi_read(ip, *startoffset_fsb, 1, &imap, &nimap, 0);
- if (error)
- return error;
-
- if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
- ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
- div_u64_rem(imap.br_startblock, mp->m_sb.sb_rextsize, &mod);
- if (mod)
- *startoffset_fsb += mp->m_sb.sb_rextsize - mod;
- }
-
- nimap = 1;
- error = xfs_bmapi_read(ip, *endoffset_fsb - 1, 1, &imap, &nimap, 0);
- if (error)
- return error;
-
- if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
- ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
- mod++;
- if (mod && mod != mp->m_sb.sb_rextsize)
- *endoffset_fsb -= mod;
- }
-
- return 0;
-}
-
-static int
xfs_flush_unmap_range(
struct xfs_inode *ip,
xfs_off_t offset,
@@ -1137,19 +1095,8 @@ xfs_free_file_space(
endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
/*
- * Need to zero the stuff we're not freeing, on disk. If it's a RT file
- * and we can't use unwritten extents then we actually need to ensure
- * to zero the whole extent, otherwise we just need to take of block
- * boundaries, and xfs_bunmapi will handle the rest.
+ * Need to zero the stuff we're not freeing, on disk.
*/
- if (XFS_IS_REALTIME_INODE(ip) &&
- !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
- error = xfs_adjust_extent_unmap_boundaries(ip, &startoffset_fsb,
- &endoffset_fsb);
- if (error)
- return error;
- }
-
if (endoffset_fsb > startoffset_fsb) {
while (!done) {
error = xfs_unmap_extent(ip, startoffset_fsb,
@@ -1584,7 +1531,7 @@ xfs_swap_extent_rmap(
tirec.br_blockcount, &irec,
&nimaps, 0);
if (error)
- goto out_defer;
+ goto out;
ASSERT(nimaps == 1);
ASSERT(tirec.br_startoff == irec.br_startoff);
trace_xfs_swap_extent_rmap_remap_piece(ip, &irec);
@@ -1599,22 +1546,22 @@ xfs_swap_extent_rmap(
/* Remove the mapping from the donor file. */
error = xfs_bmap_unmap_extent(tp, tip, &uirec);
if (error)
- goto out_defer;
+ goto out;
/* Remove the mapping from the source file. */
error = xfs_bmap_unmap_extent(tp, ip, &irec);
if (error)
- goto out_defer;
+ goto out;
/* Map the donor file's blocks into the source file. */
error = xfs_bmap_map_extent(tp, ip, &uirec);
if (error)
- goto out_defer;
+ goto out;
/* Map the source file's blocks into the donor file. */
error = xfs_bmap_map_extent(tp, tip, &irec);
if (error)
- goto out_defer;
+ goto out;
error = xfs_defer_finish(tpp);
tp = *tpp;
@@ -1636,8 +1583,6 @@ xfs_swap_extent_rmap(
tip->i_d.di_flags2 = tip_flags2;
return 0;
-out_defer:
- xfs_defer_cancel(tp);
out:
trace_xfs_swap_extent_rmap_error(ip, error, _RET_IP_);
tip->i_d.di_flags2 = tip_flags2;
@@ -1830,6 +1775,12 @@ xfs_swap_extents(
if (error)
goto out_unlock;
+ if (xfs_inode_has_cow_data(tip)) {
+ error = xfs_reflink_cancel_cow_range(tip, 0, NULLFILEOFF, true);
+ if (error)
+ goto out_unlock; /* unwind via out_unlock like the checks above */
+ }
+
/*
* Extent "swapping" with rmap requires a permanent reservation and
* a block reservation because it's really just a remap operation
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index e839907e8492..b21ea2ba768d 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -37,6 +37,32 @@ static kmem_zone_t *xfs_buf_zone;
#define xb_to_gfp(flags) \
((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)
+/*
+ * Locking orders
+ *
+ * xfs_buf_ioacct_inc:
+ * xfs_buf_ioacct_dec:
+ * b_sema (caller holds)
+ * b_lock
+ *
+ * xfs_buf_stale:
+ * b_sema (caller holds)
+ * b_lock
+ * lru_lock
+ *
+ * xfs_buf_rele:
+ * b_lock
+ * pag_buf_lock
+ * lru_lock
+ *
+ * xfs_buftarg_wait_rele
+ * lru_lock
+ * b_lock (trylock due to inversion)
+ *
+ * xfs_buftarg_isolate
+ * lru_lock
+ * b_lock (trylock due to inversion)
+ */
static inline int
xfs_buf_is_vmapped(
@@ -749,6 +775,30 @@ _xfs_buf_read(
return xfs_buf_submit(bp);
}
+/*
+ * If the caller passed in an ops structure and the buffer doesn't have ops
+ * assigned, set the ops and use them to verify the contents. If the contents
+ * cannot be verified, we'll clear XBF_DONE. We assume the buffer has no
+ * recorded errors and is already in XBF_DONE state.
+ */
+int
+xfs_buf_ensure_ops(
+ struct xfs_buf *bp,
+ const struct xfs_buf_ops *ops)
+{
+ ASSERT(bp->b_flags & XBF_DONE);
+ ASSERT(bp->b_error == 0);
+
+ if (!ops || bp->b_ops)
+ return 0;
+
+ bp->b_ops = ops;
+ bp->b_ops->verify_read(bp);
+ if (bp->b_error)
+ bp->b_flags &= ~XBF_DONE;
+ return bp->b_error;
+}
+
xfs_buf_t *
xfs_buf_read_map(
struct xfs_buftarg *target,
@@ -762,26 +812,32 @@ xfs_buf_read_map(
flags |= XBF_READ;
bp = xfs_buf_get_map(target, map, nmaps, flags);
- if (bp) {
- trace_xfs_buf_read(bp, flags, _RET_IP_);
+ if (!bp)
+ return NULL;
- if (!(bp->b_flags & XBF_DONE)) {
- XFS_STATS_INC(target->bt_mount, xb_get_read);
- bp->b_ops = ops;
- _xfs_buf_read(bp, flags);
- } else if (flags & XBF_ASYNC) {
- /*
- * Read ahead call which is already satisfied,
- * drop the buffer
- */
- xfs_buf_relse(bp);
- return NULL;
- } else {
- /* We do not want read in the flags */
- bp->b_flags &= ~XBF_READ;
- }
+ trace_xfs_buf_read(bp, flags, _RET_IP_);
+
+ if (!(bp->b_flags & XBF_DONE)) {
+ XFS_STATS_INC(target->bt_mount, xb_get_read);
+ bp->b_ops = ops;
+ _xfs_buf_read(bp, flags);
+ return bp;
}
+ xfs_buf_ensure_ops(bp, ops);
+
+ if (flags & XBF_ASYNC) {
+ /*
+ * Read ahead call which is already satisfied,
+ * drop the buffer
+ */
+ xfs_buf_relse(bp);
+ return NULL;
+ }
+
+ /* We do not want read in the flags */
+ bp->b_flags &= ~XBF_READ;
+ ASSERT(bp->b_ops != NULL || ops == NULL);
return bp;
}
@@ -1006,8 +1062,18 @@ xfs_buf_rele(
ASSERT(atomic_read(&bp->b_hold) > 0);
- release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
+ /*
+ * We grab the b_lock here first to serialise racing xfs_buf_rele()
+ * calls. The pag_buf_lock being taken on the last reference only
+ * serialises against racing lookups in xfs_buf_find(). IOWs, the second
+ * to last reference we drop here is not serialised against the last
+ * reference until we take bp->b_lock. Hence if we don't grab b_lock
+ * first, the last "release" reference can win the race to the lock and
+ * free the buffer before the second-to-last reference is processed,
+ * leading to a use-after-free scenario.
+ */
spin_lock(&bp->b_lock);
+ release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
if (!release) {
/*
* Drop the in-flight state if the buffer is already on the LRU
@@ -1989,6 +2055,13 @@ xfs_buf_delwri_submit_buffers(
* is only safely useable for callers that can track I/O completion by higher
* level means, e.g. AIL pushing as the @buffer_list is consumed in this
* function.
+ *
+ * Note: this function will skip buffers it would block on, and in doing so
+ * leaves them on @buffer_list so they can be retried on a later pass. As such,
+ * it is up to the caller to ensure that the buffer list is fully submitted or
+ * cancelled appropriately when they are finished with the list. Failure to
+ * cancel or resubmit the list until it is empty will result in leaked buffers
+ * at unmount time.
*/
int
xfs_buf_delwri_submit_nowait(
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 4e3171acd0f8..b9f5511ea998 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -385,4 +385,6 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int);
#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
+int xfs_buf_ensure_ops(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
+
#endif /* __XFS_BUF_H__ */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 1c9d1398980b..12d8455bfbb2 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -532,6 +532,49 @@ xfs_buf_item_push(
}
/*
+ * Drop the buffer log item refcount and take appropriate action. This helper
+ * determines whether the bli must be freed or not, since a decrement to zero
+ * does not necessarily mean the bli is unused.
+ *
+ * Return true if the bli is freed, false otherwise.
+ */
+bool
+xfs_buf_item_put(
+ struct xfs_buf_log_item *bip)
+{
+ struct xfs_log_item *lip = &bip->bli_item;
+ bool aborted;
+ bool dirty;
+
+ /* drop the bli ref and return if it wasn't the last one */
+ if (!atomic_dec_and_test(&bip->bli_refcount))
+ return false;
+
+ /*
+ * We dropped the last ref and must free the item if clean or aborted.
+ * If the bli is dirty and non-aborted, the buffer was clean in the
+ * transaction but still awaiting writeback from previous changes. In
+ * that case, the bli is freed on buffer writeback completion.
+ */
+ aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) ||
+ XFS_FORCED_SHUTDOWN(lip->li_mountp);
+ dirty = bip->bli_flags & XFS_BLI_DIRTY;
+ if (dirty && !aborted)
+ return false;
+
+ /*
+ * The bli is aborted or clean. An aborted item may be in the AIL
+ * regardless of dirty state. For example, consider an aborted
+ * transaction that invalidated a dirty bli and cleared the dirty
+ * state.
+ */
+ if (aborted)
+ xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR);
+ xfs_buf_item_relse(bip->bli_buf);
+ return true;
+}
+
+/*
* Release the buffer associated with the buf log item. If there is no dirty
* logged data associated with the buffer recorded in the buf log item, then
* free the buf log item and remove the reference to it in the buffer.
@@ -556,76 +599,42 @@ xfs_buf_item_unlock(
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
struct xfs_buf *bp = bip->bli_buf;
- bool aborted;
- bool hold = !!(bip->bli_flags & XFS_BLI_HOLD);
- bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY);
+ bool released;
+ bool hold = bip->bli_flags & XFS_BLI_HOLD;
+ bool stale = bip->bli_flags & XFS_BLI_STALE;
#if defined(DEBUG) || defined(XFS_WARN)
- bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED);
+ bool ordered = bip->bli_flags & XFS_BLI_ORDERED;
+ bool dirty = bip->bli_flags & XFS_BLI_DIRTY;
#endif
- aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags);
-
- /* Clear the buffer's association with this transaction. */
- bp->b_transp = NULL;
-
- /*
- * The per-transaction state has been copied above so clear it from the
- * bli.
- */
- bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
-
- /*
- * If the buf item is marked stale, then don't do anything. We'll
- * unlock the buffer and free the buf item when the buffer is unpinned
- * for the last time.
- */
- if (bip->bli_flags & XFS_BLI_STALE) {
- trace_xfs_buf_item_unlock_stale(bip);
- ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
- if (!aborted) {
- atomic_dec(&bip->bli_refcount);
- return;
- }
- }
-
trace_xfs_buf_item_unlock(bip);
/*
- * If the buf item isn't tracking any data, free it, otherwise drop the
- * reference we hold to it. If we are aborting the transaction, this may
- * be the only reference to the buf item, so we free it anyway
- * regardless of whether it is dirty or not. A dirty abort implies a
- * shutdown, anyway.
- *
* The bli dirty state should match whether the blf has logged segments
* except for ordered buffers, where only the bli should be dirty.
*/
ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) ||
(ordered && dirty && !xfs_buf_item_dirty_format(bip)));
+ ASSERT(!stale || (bip->__bli_format.blf_flags & XFS_BLF_CANCEL));
/*
- * Clean buffers, by definition, cannot be in the AIL. However, aborted
- * buffers may be in the AIL regardless of dirty state. An aborted
- * transaction that invalidates a buffer already in the AIL may have
- * marked it stale and cleared the dirty state, for example.
- *
- * Therefore if we are aborting a buffer and we've just taken the last
- * reference away, we have to check if it is in the AIL before freeing
- * it. We need to free it in this case, because an aborted transaction
- * has already shut the filesystem down and this is the last chance we
- * will have to do so.
+ * Clear the buffer's association with this transaction and
+ * per-transaction state from the bli, which has been copied above.
*/
- if (atomic_dec_and_test(&bip->bli_refcount)) {
- if (aborted) {
- ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
- xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR);
- xfs_buf_item_relse(bp);
- } else if (!dirty)
- xfs_buf_item_relse(bp);
- }
+ bp->b_transp = NULL;
+ bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
- if (!hold)
- xfs_buf_relse(bp);
+ /*
+ * Unref the item and unlock the buffer unless held or stale. Stale
+ * buffers remain locked until final unpin unless the bli is freed by
+ * the unref call. The latter implies shutdown because buffer
+ * invalidation dirties the bli and transaction.
+ */
+ released = xfs_buf_item_put(bip);
+ if (hold || (stale && !released))
+ return;
+ ASSERT(!stale || test_bit(XFS_LI_ABORTED, &lip->li_flags));
+ xfs_buf_relse(bp);
}
/*
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 3f7d7b72e7e6..90f65f891fab 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -51,6 +51,7 @@ struct xfs_buf_log_item {
int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
void xfs_buf_item_relse(struct xfs_buf *);
+bool xfs_buf_item_put(struct xfs_buf_log_item *);
void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint);
bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *);
void xfs_buf_attach_iodone(struct xfs_buf *,
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 7c00b8bedfe3..093c2b8d7e20 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -470,20 +470,13 @@ xfs_fs_goingdown(
*/
void
xfs_do_force_shutdown(
- xfs_mount_t *mp,
+ struct xfs_mount *mp,
int flags,
char *fname,
int lnnum)
{
- int logerror;
-
- logerror = flags & SHUTDOWN_LOG_IO_ERROR;
+ bool logerror = flags & SHUTDOWN_LOG_IO_ERROR;
- if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
- xfs_notice(mp,
- "%s(0x%x) called from line %d of file %s. Return address = "PTR_FMT,
- __func__, flags, lnnum, fname, __return_address);
- }
/*
* No need to duplicate efforts.
*/
@@ -499,27 +492,34 @@ xfs_do_force_shutdown(
if (xfs_log_force_umount(mp, logerror))
return;
+ if (flags & SHUTDOWN_FORCE_UMOUNT) {
+ xfs_alert(mp,
+"User initiated shutdown received. Shutting down filesystem");
+ return;
+ }
+
+ xfs_notice(mp,
+"%s(0x%x) called from line %d of file %s. Return address = "PTR_FMT,
+ __func__, flags, lnnum, fname, __return_address);
+
if (flags & SHUTDOWN_CORRUPT_INCORE) {
xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
- "Corruption of in-memory data detected. Shutting down filesystem");
+"Corruption of in-memory data detected. Shutting down filesystem");
if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
xfs_stack_trace();
- } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
- if (logerror) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
- "Log I/O Error Detected. Shutting down filesystem");
- } else if (flags & SHUTDOWN_DEVICE_REQ) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
- "All device paths lost. Shutting down filesystem");
- } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
- "I/O Error Detected. Shutting down filesystem");
- }
- }
- if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
- xfs_alert(mp,
- "Please umount the filesystem and rectify the problem(s)");
+ } else if (logerror) {
+ xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
+ "Log I/O Error Detected. Shutting down filesystem");
+ } else if (flags & SHUTDOWN_DEVICE_REQ) {
+ xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
+ "All device paths lost. Shutting down filesystem");
+ } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
+ xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
+ "I/O Error Detected. Shutting down filesystem");
}
+
+ xfs_alert(mp,
+ "Please unmount the filesystem and rectify the problem(s)");
}
/*
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index d957a46dc1cb..05db9540e459 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1563,7 +1563,7 @@ xfs_itruncate_extents_flags(
error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags,
XFS_ITRUNC_MAX_EXTENTS, &done);
if (error)
- goto out_bmap_cancel;
+ goto out;
/*
* Duplicate the transaction that has the permanent
@@ -1599,14 +1599,6 @@ xfs_itruncate_extents_flags(
out:
*tpp = tp;
return error;
-out_bmap_cancel:
- /*
- * If the bunmapi call encounters an error, return to the caller where
- * the transaction can be properly aborted. We just need to make sure
- * we're not holding any resources that we were not when we came in.
- */
- xfs_defer_cancel(tp);
- goto out;
}
int
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 0ef5ece5634c..6e2c08f30f60 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -604,14 +604,6 @@ xfs_ioc_space(
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
int error;
- /*
- * Only allow the sys admin to reserve space unless
- * unwritten extents are enabled.
- */
- if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) &&
- !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
if (inode->i_flags & (S_IMMUTABLE|S_APPEND))
return -EPERM;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 6320aca39f39..27c93b5f029d 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -62,6 +62,21 @@ xfs_bmbt_to_iomap(
iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
}
+static void
+xfs_hole_to_iomap(
+ struct xfs_inode *ip,
+ struct iomap *iomap,
+ xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t end_fsb)
+{
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->type = IOMAP_HOLE;
+ iomap->offset = XFS_FSB_TO_B(ip->i_mount, offset_fsb);
+ iomap->length = XFS_FSB_TO_B(ip->i_mount, end_fsb - offset_fsb);
+ iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
+ iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
+}
+
xfs_extlen_t
xfs_eof_alignment(
struct xfs_inode *ip,
@@ -502,6 +517,7 @@ xfs_file_iomap_begin_delay(
struct inode *inode,
loff_t offset,
loff_t count,
+ unsigned flags,
struct iomap *iomap)
{
struct xfs_inode *ip = XFS_I(inode);
@@ -538,15 +554,23 @@ xfs_file_iomap_begin_delay(
goto out_unlock;
}
+ end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
+
eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
- if (!eof && got.br_startoff <= offset_fsb) {
- if (xfs_is_reflink_inode(ip)) {
- bool shared;
+ if (eof)
+ got.br_startoff = end_fsb; /* fake hole until the end */
- end_fsb = min(XFS_B_TO_FSB(mp, offset + count),
- maxbytes_fsb);
+ if (got.br_startoff <= offset_fsb) {
+ /*
+ * For reflink files we may need a delalloc reservation when
+ * overwriting shared extents. This includes zeroing of
+ * existing extents that contain data.
+ */
+ if (xfs_is_reflink_inode(ip) &&
+ ((flags & IOMAP_WRITE) ||
+ got.br_state != XFS_EXT_UNWRITTEN)) {
xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
- error = xfs_reflink_reserve_cow(ip, &got, &shared);
+ error = xfs_reflink_reserve_cow(ip, &got);
if (error)
goto out_unlock;
}
@@ -555,6 +579,11 @@ xfs_file_iomap_begin_delay(
goto done;
}
+ if (flags & IOMAP_ZERO) {
+ xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff);
+ goto out_unlock;
+ }
+
error = xfs_qm_dqattach_locked(ip, false);
if (error)
goto out_unlock;
@@ -1003,16 +1032,17 @@ xfs_file_iomap_begin(
struct xfs_bmbt_irec imap;
xfs_fileoff_t offset_fsb, end_fsb;
int nimaps = 1, error = 0;
- bool shared = false, trimmed = false;
+ bool shared = false;
unsigned lockmode;
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) &&
+ if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && !(flags & IOMAP_DIRECT) &&
!IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
/* Reserve delalloc blocks for regular writeback. */
- return xfs_file_iomap_begin_delay(inode, offset, length, iomap);
+ return xfs_file_iomap_begin_delay(inode, offset, length, flags,
+ iomap);
}
/*
@@ -1038,8 +1068,7 @@ xfs_file_iomap_begin(
if (flags & IOMAP_REPORT) {
/* Trim the mapping to the nearest shared extent boundary. */
- error = xfs_reflink_trim_around_shared(ip, &imap, &shared,
- &trimmed);
+ error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
if (error)
goto out_unlock;
}
@@ -1065,7 +1094,7 @@ xfs_file_iomap_begin(
if (error)
goto out_unlock;
} else {
- error = xfs_reflink_reserve_cow(ip, &imap, &shared);
+ error = xfs_reflink_reserve_cow(ip, &imap);
if (error)
goto out_unlock;
}
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index c3e74f9128e8..f48ffd7a8d3e 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -471,8 +471,18 @@ xfs_vn_get_link_inline(
struct inode *inode,
struct delayed_call *done)
{
+ char *link;
+
ASSERT(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE);
- return XFS_I(inode)->i_df.if_u1.if_data;
+
+ /*
+ * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED if
+ * if_data is junk.
+ */
+ link = XFS_I(inode)->i_df.if_u1.if_data;
+ if (!link)
+ return ERR_PTR(-EFSCORRUPTED);
+ return link;
}
STATIC int
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index a21dc61ec09e..1fc9e9042e0e 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1570,16 +1570,6 @@ xlog_find_zeroed(
if (last_cycle != 0) { /* log completely written to */
xlog_put_bp(bp);
return 0;
- } else if (first_cycle != 1) {
- /*
- * If the cycle of the last block is zero, the cycle of
- * the first block must be 1. If it's not, maybe we're
- * not looking at a log... Bail out.
- */
- xfs_warn(log->l_mp,
- "Log inconsistent or not a log (last==0, first!=1)");
- error = -EINVAL;
- goto bp_err;
}
/* we have a partially zeroed log */
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 38f405415b88..8eaeec9d58ed 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -182,8 +182,7 @@ int
xfs_reflink_trim_around_shared(
struct xfs_inode *ip,
struct xfs_bmbt_irec *irec,
- bool *shared,
- bool *trimmed)
+ bool *shared)
{
xfs_agnumber_t agno;
xfs_agblock_t agbno;
@@ -209,7 +208,7 @@ xfs_reflink_trim_around_shared(
if (error)
return error;
- *shared = *trimmed = false;
+ *shared = false;
if (fbno == NULLAGBLOCK) {
/* No shared blocks at all. */
return 0;
@@ -222,8 +221,6 @@ xfs_reflink_trim_around_shared(
*/
irec->br_blockcount = flen;
*shared = true;
- if (flen != aglen)
- *trimmed = true;
return 0;
} else {
/*
@@ -233,7 +230,6 @@ xfs_reflink_trim_around_shared(
* start of the shared region.
*/
irec->br_blockcount = fbno - agbno;
- *trimmed = true;
return 0;
}
}
@@ -241,7 +237,7 @@ xfs_reflink_trim_around_shared(
/*
* Trim the passed in imap to the next shared/unshared extent boundary, and
* if imap->br_startoff points to a shared extent reserve space for it in the
- * COW fork. In this case *shared is set to true, else to false.
+ * COW fork.
*
* Note that imap will always contain the block numbers for the existing blocks
* in the data fork, as the upper layers need them for read-modify-write
@@ -250,14 +246,14 @@ xfs_reflink_trim_around_shared(
int
xfs_reflink_reserve_cow(
struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap,
- bool *shared)
+ struct xfs_bmbt_irec *imap)
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
struct xfs_bmbt_irec got;
int error = 0;
- bool eof = false, trimmed;
+ bool eof = false;
struct xfs_iext_cursor icur;
+ bool shared;
/*
* Search the COW fork extent list first. This serves two purposes:
@@ -273,18 +269,16 @@ xfs_reflink_reserve_cow(
if (!eof && got.br_startoff <= imap->br_startoff) {
trace_xfs_reflink_cow_found(ip, imap);
xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
-
- *shared = true;
return 0;
}
/* Trim the mapping to the nearest shared extent boundary. */
- error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
+ error = xfs_reflink_trim_around_shared(ip, imap, &shared);
if (error)
return error;
/* Not shared? Just report the (potentially capped) extent. */
- if (!*shared)
+ if (!shared)
return 0;
/*
@@ -352,6 +346,50 @@ xfs_reflink_convert_cow(
return error;
}
+/*
+ * Find the extent that maps the given range in the COW fork. Even if the extent
+ * is not shared we might have a preallocation for it in the COW fork. If so we
+ * use it rather than trigger a new allocation.
+ */
+static int
+xfs_find_trim_cow_extent(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ bool *shared,
+ bool *found)
+{
+ xfs_fileoff_t offset_fsb = imap->br_startoff;
+ xfs_filblks_t count_fsb = imap->br_blockcount;
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got;
+
+ *found = false;
+
+ /*
+ * If we don't find an overlapping extent, trim the range we need to
+ * allocate to fit the hole we found.
+ */
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
+ got.br_startoff = offset_fsb + count_fsb;
+ if (got.br_startoff > offset_fsb) {
+ xfs_trim_extent(imap, imap->br_startoff,
+ got.br_startoff - imap->br_startoff);
+ return xfs_reflink_trim_around_shared(ip, imap, shared);
+ }
+
+ *shared = true;
+ if (isnullstartblock(got.br_startblock)) {
+ xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
+ return 0;
+ }
+
+ /* real extent found - no need to allocate */
+ xfs_trim_extent(&got, offset_fsb, count_fsb);
+ *imap = got;
+ *found = true;
+ return 0;
+}
+
/* Allocate all CoW reservations covering a range of blocks in a file. */
int
xfs_reflink_allocate_cow(
@@ -363,78 +401,64 @@ xfs_reflink_allocate_cow(
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = imap->br_startoff;
xfs_filblks_t count_fsb = imap->br_blockcount;
- struct xfs_bmbt_irec got;
- struct xfs_trans *tp = NULL;
+ struct xfs_trans *tp;
int nimaps, error = 0;
- bool trimmed;
+ bool found;
xfs_filblks_t resaligned;
xfs_extlen_t resblks = 0;
- struct xfs_iext_cursor icur;
-retry:
- ASSERT(xfs_is_reflink_inode(ip));
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(xfs_is_reflink_inode(ip));
- /*
- * Even if the extent is not shared we might have a preallocation for
- * it in the COW fork. If so use it.
- */
- if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) &&
- got.br_startoff <= offset_fsb) {
- *shared = true;
-
- /* If we have a real allocation in the COW fork we're done. */
- if (!isnullstartblock(got.br_startblock)) {
- xfs_trim_extent(&got, offset_fsb, count_fsb);
- *imap = got;
- goto convert;
- }
+ error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
+ if (error || !*shared)
+ return error;
+ if (found)
+ goto convert;
- xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
- } else {
- error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
- if (error || !*shared)
- goto out;
- }
+ resaligned = xfs_aligned_fsb_count(imap->br_startoff,
+ imap->br_blockcount, xfs_get_cowextsz_hint(ip));
+ resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
- if (!tp) {
- resaligned = xfs_aligned_fsb_count(imap->br_startoff,
- imap->br_blockcount, xfs_get_cowextsz_hint(ip));
- resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
+ xfs_iunlock(ip, *lockmode);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
+ *lockmode = XFS_ILOCK_EXCL;
+ xfs_ilock(ip, *lockmode);
- xfs_iunlock(ip, *lockmode);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
- *lockmode = XFS_ILOCK_EXCL;
- xfs_ilock(ip, *lockmode);
+ if (error)
+ return error;
- if (error)
- return error;
+ error = xfs_qm_dqattach_locked(ip, false);
+ if (error)
+ goto out_trans_cancel;
- error = xfs_qm_dqattach_locked(ip, false);
- if (error)
- goto out;
- goto retry;
+ /*
+ * Check for an overlapping extent again now that we dropped the ilock.
+ */
+ error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
+ if (error || !*shared)
+ goto out_trans_cancel;
+ if (found) {
+ xfs_trans_cancel(tp);
+ goto convert;
}
error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
XFS_QMOPT_RES_REGBLKS);
if (error)
- goto out;
+ goto out_trans_cancel;
xfs_trans_ijoin(tp, ip, 0);
- nimaps = 1;
-
/* Allocate the entire reservation as unwritten blocks. */
+ nimaps = 1;
error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC,
resblks, imap, &nimaps);
if (error)
- goto out_trans_cancel;
+ goto out_unreserve;
xfs_inode_set_cowblocks_tag(ip);
-
- /* Finish up. */
error = xfs_trans_commit(tp);
if (error)
return error;
@@ -447,12 +471,12 @@ retry:
return -ENOSPC;
convert:
return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
-out_trans_cancel:
+
+out_unreserve:
xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
XFS_QMOPT_RES_REGBLKS);
-out:
- if (tp)
- xfs_trans_cancel(tp);
+out_trans_cancel:
+ xfs_trans_cancel(tp);
return error;
}
@@ -666,14 +690,12 @@ xfs_reflink_end_cow(
if (!del.br_blockcount)
goto prev_extent;
- ASSERT(!isnullstartblock(got.br_startblock));
-
/*
- * Don't remap unwritten extents; these are
- * speculatively preallocated CoW extents that have been
- * allocated but have not yet been involved in a write.
+ * Only remap real extents that contain data. With AIO,
+ * speculative preallocations can leak into the range we
+ * are called upon, and we need to skip them.
*/
- if (got.br_state == XFS_EXT_UNWRITTEN)
+ if (!xfs_bmap_is_real_extent(&got))
goto prev_extent;
/* Unmap the old blocks in the data fork. */
@@ -1195,35 +1217,92 @@ retry:
return 0;
}
+/* Unlock both inodes after they've been prepped for a range clone. */
+STATIC void
+xfs_reflink_remap_unlock(
+ struct file *file_in,
+ struct file *file_out)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct xfs_inode *src = XFS_I(inode_in);
+ struct inode *inode_out = file_inode(file_out);
+ struct xfs_inode *dest = XFS_I(inode_out);
+ bool same_inode = (inode_in == inode_out);
+
+ xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
+ if (!same_inode)
+ xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
+ inode_unlock(inode_out);
+ if (!same_inode)
+ inode_unlock_shared(inode_in);
+}
+
/*
- * Link a range of blocks from one file to another.
+ * If we're reflinking to a point past the destination file's EOF, we must
+ * zero any speculative post-EOF preallocations that sit between the old EOF
+ * and the destination file offset.
*/
-int
-xfs_reflink_remap_range(
+static int
+xfs_reflink_zero_posteof(
+ struct xfs_inode *ip,
+ loff_t pos)
+{
+ loff_t isize = i_size_read(VFS_I(ip));
+
+ if (pos <= isize)
+ return 0;
+
+ trace_xfs_zero_eof(ip, isize, pos - isize);
+ return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL,
+ &xfs_iomap_ops);
+}
+
+/*
+ * Prepare two files for range cloning. Upon a successful return both inodes
+ * will have the iolock and mmaplock held, the page cache of the out file will
+ * be truncated, and any leases on the out file will have been broken. This
+ * function borrows heavily from xfs_file_aio_write_checks.
+ *
+ * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't
+ * checked that the bytes beyond EOF physically match. Hence we cannot use the
+ * EOF block in the source dedupe range because it's not a complete block match,
+ * hence can introduce a corruption into the file that has its block replaced.
+ *
+ * In similar fashion, the VFS file cloning also allows partial EOF blocks to be
+ * "block aligned" for the purposes of cloning entire files. However, if the
+ * source file range includes the EOF block and it lands within the existing EOF
+ * of the destination file, then we can expose stale data from beyond the source
+ * file EOF in the destination file.
+ *
+ * XFS doesn't support partial block sharing, so in both cases we have to check
+ * these cases ourselves. For dedupe, we can simply round the length to dedupe
+ * down to the previous whole block and ignore the partial EOF block. While this
+ * means we can't dedupe the last block of a file, this is an acceptable
+ * tradeoff for simplicity of implementation.
+ *
+ * For cloning, we want to share the partial EOF block if it is also the new EOF
+ * block of the destination file. If the partial EOF block lies inside the
+ * existing destination EOF, then we have to abort the clone to avoid exposing
+ * stale data in the destination file. Hence we reject these clone attempts with
+ * -EINVAL in this case.
+ */
+STATIC int
+xfs_reflink_remap_prep(
struct file *file_in,
loff_t pos_in,
struct file *file_out,
loff_t pos_out,
- u64 len,
+ u64 *len,
bool is_dedupe)
{
struct inode *inode_in = file_inode(file_in);
struct xfs_inode *src = XFS_I(inode_in);
struct inode *inode_out = file_inode(file_out);
struct xfs_inode *dest = XFS_I(inode_out);
- struct xfs_mount *mp = src->i_mount;
bool same_inode = (inode_in == inode_out);
- xfs_fileoff_t sfsbno, dfsbno;
- xfs_filblks_t fsblen;
- xfs_extlen_t cowextsize;
+ u64 blkmask = i_blocksize(inode_in) - 1;
ssize_t ret;
- if (!xfs_sb_version_hasreflink(&mp->m_sb))
- return -EOPNOTSUPP;
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
-
/* Lock both files against IO */
ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out);
if (ret)
@@ -1245,33 +1324,115 @@ xfs_reflink_remap_range(
goto out_unlock;
ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
- &len, is_dedupe);
+ len, is_dedupe);
if (ret <= 0)
goto out_unlock;
+ /*
+ * If the dedupe data matches, chop off the partial EOF block
+ * from the source file so we don't try to dedupe the partial
+ * EOF block.
+ */
+ if (is_dedupe) {
+ *len &= ~blkmask;
+ } else if (*len & blkmask) {
+ /*
+ * The user is attempting to share a partial EOF block,
+ * if it's inside the destination EOF then reject it.
+ */
+ if (pos_out + *len < i_size_read(inode_out)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ }
+
/* Attach dquots to dest inode before changing block map */
ret = xfs_qm_dqattach(dest);
if (ret)
goto out_unlock;
- trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
-
/*
- * Clear out post-eof preallocations because we don't have page cache
- * backing the delayed allocations and they'll never get freed on
- * their own.
+ * Zero existing post-eof speculative preallocations in the destination
+ * file.
*/
- if (xfs_can_free_eofblocks(dest, true)) {
- ret = xfs_free_eofblocks(dest);
- if (ret)
- goto out_unlock;
- }
+ ret = xfs_reflink_zero_posteof(dest, pos_out);
+ if (ret)
+ goto out_unlock;
/* Set flags and remap blocks. */
ret = xfs_reflink_set_inode_flag(src, dest);
if (ret)
goto out_unlock;
+ /* Zap any page cache for the destination file's range. */
+ truncate_inode_pages_range(&inode_out->i_data, pos_out,
+ PAGE_ALIGN(pos_out + *len) - 1);
+
+ /* If we're altering the file contents... */
+ if (!is_dedupe) {
+ /*
+ * ...update the timestamps (which will grab the ilock again
+ * from xfs_fs_dirty_inode, so we have to call it before we
+ * take the ilock).
+ */
+ if (!(file_out->f_mode & FMODE_NOCMTIME)) {
+ ret = file_update_time(file_out);
+ if (ret)
+ goto out_unlock;
+ }
+
+ /*
+ * ...clear the security bits if the process is not being run
+ * by root. This keeps people from modifying setuid and setgid
+ * binaries.
+ */
+ ret = file_remove_privs(file_out);
+ if (ret)
+ goto out_unlock;
+ }
+
+ return 1;
+out_unlock:
+ xfs_reflink_remap_unlock(file_in, file_out);
+ return ret;
+}
+
+/*
+ * Link a range of blocks from one file to another.
+ */
+int
+xfs_reflink_remap_range(
+ struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ u64 len,
+ bool is_dedupe)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct xfs_inode *src = XFS_I(inode_in);
+ struct inode *inode_out = file_inode(file_out);
+ struct xfs_inode *dest = XFS_I(inode_out);
+ struct xfs_mount *mp = src->i_mount;
+ xfs_fileoff_t sfsbno, dfsbno;
+ xfs_filblks_t fsblen;
+ xfs_extlen_t cowextsize;
+ ssize_t ret;
+
+ if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ return -EOPNOTSUPP;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ /* Prepare and then clone file data. */
+ ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
+ &len, is_dedupe);
+ if (ret <= 0)
+ return ret;
+
+ trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
+
dfsbno = XFS_B_TO_FSBT(mp, pos_out);
sfsbno = XFS_B_TO_FSBT(mp, pos_in);
fsblen = XFS_B_TO_FSB(mp, len);
@@ -1280,10 +1441,6 @@ xfs_reflink_remap_range(
if (ret)
goto out_unlock;
- /* Zap any page cache for the destination file's range. */
- truncate_inode_pages_range(&inode_out->i_data, pos_out,
- PAGE_ALIGN(pos_out + len) - 1);
-
/*
* Carry the cowextsize hint from src to dest if we're sharing the
* entire source file to the entire destination file, the source file
@@ -1300,12 +1457,7 @@ xfs_reflink_remap_range(
is_dedupe);
out_unlock:
- xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
- if (!same_inode)
- xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
- inode_unlock(inode_out);
- if (!same_inode)
- inode_unlock_shared(inode_in);
+ xfs_reflink_remap_unlock(file_in, file_out);
if (ret)
trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
return ret;
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index c585ad9552b2..7f47202b5639 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -10,10 +10,10 @@ extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen,
xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal);
extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
- struct xfs_bmbt_irec *irec, bool *shared, bool *trimmed);
+ struct xfs_bmbt_irec *irec, bool *shared);
extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap, bool *shared);
+ struct xfs_bmbt_irec *imap);
extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 4e4423153071..cc509743facd 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -29,30 +29,30 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
char *desc;
int endpoint;
} xstats[] = {
- { "extent_alloc", XFSSTAT_END_EXTENT_ALLOC },
- { "abt", XFSSTAT_END_ALLOC_BTREE },
- { "blk_map", XFSSTAT_END_BLOCK_MAPPING },
- { "bmbt", XFSSTAT_END_BLOCK_MAP_BTREE },
- { "dir", XFSSTAT_END_DIRECTORY_OPS },
- { "trans", XFSSTAT_END_TRANSACTIONS },
- { "ig", XFSSTAT_END_INODE_OPS },
- { "log", XFSSTAT_END_LOG_OPS },
- { "push_ail", XFSSTAT_END_TAIL_PUSHING },
- { "xstrat", XFSSTAT_END_WRITE_CONVERT },
- { "rw", XFSSTAT_END_READ_WRITE_OPS },
- { "attr", XFSSTAT_END_ATTRIBUTE_OPS },
- { "icluster", XFSSTAT_END_INODE_CLUSTER },
- { "vnodes", XFSSTAT_END_VNODE_OPS },
- { "buf", XFSSTAT_END_BUF },
- { "abtb2", XFSSTAT_END_ABTB_V2 },
- { "abtc2", XFSSTAT_END_ABTC_V2 },
- { "bmbt2", XFSSTAT_END_BMBT_V2 },
- { "ibt2", XFSSTAT_END_IBT_V2 },
- { "fibt2", XFSSTAT_END_FIBT_V2 },
- { "rmapbt", XFSSTAT_END_RMAP_V2 },
- { "refcntbt", XFSSTAT_END_REFCOUNT },
+ { "extent_alloc", xfsstats_offset(xs_abt_lookup) },
+ { "abt", xfsstats_offset(xs_blk_mapr) },
+ { "blk_map", xfsstats_offset(xs_bmbt_lookup) },
+ { "bmbt", xfsstats_offset(xs_dir_lookup) },
+ { "dir", xfsstats_offset(xs_trans_sync) },
+ { "trans", xfsstats_offset(xs_ig_attempts) },
+ { "ig", xfsstats_offset(xs_log_writes) },
+ { "log", xfsstats_offset(xs_try_logspace)},
+ { "push_ail", xfsstats_offset(xs_xstrat_quick)},
+ { "xstrat", xfsstats_offset(xs_write_calls) },
+ { "rw", xfsstats_offset(xs_attr_get) },
+ { "attr", xfsstats_offset(xs_iflush_count)},
+ { "icluster", xfsstats_offset(vn_active) },
+ { "vnodes", xfsstats_offset(xb_get) },
+ { "buf", xfsstats_offset(xs_abtb_2) },
+ { "abtb2", xfsstats_offset(xs_abtc_2) },
+ { "abtc2", xfsstats_offset(xs_bmbt_2) },
+ { "bmbt2", xfsstats_offset(xs_ibt_2) },
+ { "ibt2", xfsstats_offset(xs_fibt_2) },
+ { "fibt2", xfsstats_offset(xs_rmap_2) },
+ { "rmapbt", xfsstats_offset(xs_refcbt_2) },
+ { "refcntbt", xfsstats_offset(xs_qm_dqreclaims)},
/* we print both series of quota information together */
- { "qm", XFSSTAT_END_QM },
+ { "qm", xfsstats_offset(xs_xstrat_bytes)},
};
/* Loop over all stats groups */
@@ -104,6 +104,10 @@ void xfs_stats_clearall(struct xfsstats __percpu *stats)
#ifdef CONFIG_PROC_FS
/* legacy quota interfaces */
#ifdef CONFIG_XFS_QUOTA
+
+#define XFSSTAT_START_XQMSTAT xfsstats_offset(xs_qm_dqreclaims)
+#define XFSSTAT_END_XQMSTAT xfsstats_offset(xs_qm_dquot)
+
static int xqm_proc_show(struct seq_file *m, void *v)
{
/* maximum; incore; ratio free to inuse; freelist */
@@ -119,7 +123,7 @@ static int xqmstat_proc_show(struct seq_file *m, void *v)
int j;
seq_printf(m, "qm");
- for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++)
+ for (j = XFSSTAT_START_XQMSTAT; j < XFSSTAT_END_XQMSTAT; j++)
seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j));
seq_putc(m, '\n');
return 0;
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 130db070e4d8..34d704f703d2 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -41,17 +41,14 @@ enum {
* XFS global statistics
*/
struct __xfsstats {
-# define XFSSTAT_END_EXTENT_ALLOC 4
uint32_t xs_allocx;
uint32_t xs_allocb;
uint32_t xs_freex;
uint32_t xs_freeb;
-# define XFSSTAT_END_ALLOC_BTREE (XFSSTAT_END_EXTENT_ALLOC+4)
uint32_t xs_abt_lookup;
uint32_t xs_abt_compare;
uint32_t xs_abt_insrec;
uint32_t xs_abt_delrec;
-# define XFSSTAT_END_BLOCK_MAPPING (XFSSTAT_END_ALLOC_BTREE+7)
uint32_t xs_blk_mapr;
uint32_t xs_blk_mapw;
uint32_t xs_blk_unmap;
@@ -59,21 +56,17 @@ struct __xfsstats {
uint32_t xs_del_exlist;
uint32_t xs_look_exlist;
uint32_t xs_cmp_exlist;
-# define XFSSTAT_END_BLOCK_MAP_BTREE (XFSSTAT_END_BLOCK_MAPPING+4)
uint32_t xs_bmbt_lookup;
uint32_t xs_bmbt_compare;
uint32_t xs_bmbt_insrec;
uint32_t xs_bmbt_delrec;
-# define XFSSTAT_END_DIRECTORY_OPS (XFSSTAT_END_BLOCK_MAP_BTREE+4)
uint32_t xs_dir_lookup;
uint32_t xs_dir_create;
uint32_t xs_dir_remove;
uint32_t xs_dir_getdents;
-# define XFSSTAT_END_TRANSACTIONS (XFSSTAT_END_DIRECTORY_OPS+3)
uint32_t xs_trans_sync;
uint32_t xs_trans_async;
uint32_t xs_trans_empty;
-# define XFSSTAT_END_INODE_OPS (XFSSTAT_END_TRANSACTIONS+7)
uint32_t xs_ig_attempts;
uint32_t xs_ig_found;
uint32_t xs_ig_frecycle;
@@ -81,13 +74,11 @@ struct __xfsstats {
uint32_t xs_ig_dup;
uint32_t xs_ig_reclaims;
uint32_t xs_ig_attrchg;
-# define XFSSTAT_END_LOG_OPS (XFSSTAT_END_INODE_OPS+5)
uint32_t xs_log_writes;
uint32_t xs_log_blocks;
uint32_t xs_log_noiclogs;
uint32_t xs_log_force;
uint32_t xs_log_force_sleep;
-# define XFSSTAT_END_TAIL_PUSHING (XFSSTAT_END_LOG_OPS+10)
uint32_t xs_try_logspace;
uint32_t xs_sleep_logspace;
uint32_t xs_push_ail;
@@ -98,22 +89,17 @@ struct __xfsstats {
uint32_t xs_push_ail_flushing;
uint32_t xs_push_ail_restarts;
uint32_t xs_push_ail_flush;
-# define XFSSTAT_END_WRITE_CONVERT (XFSSTAT_END_TAIL_PUSHING+2)
uint32_t xs_xstrat_quick;
uint32_t xs_xstrat_split;
-# define XFSSTAT_END_READ_WRITE_OPS (XFSSTAT_END_WRITE_CONVERT+2)
uint32_t xs_write_calls;
uint32_t xs_read_calls;
-# define XFSSTAT_END_ATTRIBUTE_OPS (XFSSTAT_END_READ_WRITE_OPS+4)
uint32_t xs_attr_get;
uint32_t xs_attr_set;
uint32_t xs_attr_remove;
uint32_t xs_attr_list;
-# define XFSSTAT_END_INODE_CLUSTER (XFSSTAT_END_ATTRIBUTE_OPS+3)
uint32_t xs_iflush_count;
uint32_t xs_icluster_flushcnt;
uint32_t xs_icluster_flushinode;
-# define XFSSTAT_END_VNODE_OPS (XFSSTAT_END_INODE_CLUSTER+8)
uint32_t vn_active; /* # vnodes not on free lists */
uint32_t vn_alloc; /* # times vn_alloc called */
uint32_t vn_get; /* # times vn_get called */
@@ -122,7 +108,6 @@ struct __xfsstats {
uint32_t vn_reclaim; /* # times vn_reclaim called */
uint32_t vn_remove; /* # times vn_remove called */
uint32_t vn_free; /* # times vn_free called */
-#define XFSSTAT_END_BUF (XFSSTAT_END_VNODE_OPS+9)
uint32_t xb_get;
uint32_t xb_create;
uint32_t xb_get_locked;
@@ -133,28 +118,19 @@ struct __xfsstats {
uint32_t xb_page_found;
uint32_t xb_get_read;
/* Version 2 btree counters */
-#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF + __XBTS_MAX)
uint32_t xs_abtb_2[__XBTS_MAX];
-#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2 + __XBTS_MAX)
uint32_t xs_abtc_2[__XBTS_MAX];
-#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2 + __XBTS_MAX)
uint32_t xs_bmbt_2[__XBTS_MAX];
-#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2 + __XBTS_MAX)
uint32_t xs_ibt_2[__XBTS_MAX];
-#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2 + __XBTS_MAX)
uint32_t xs_fibt_2[__XBTS_MAX];
-#define XFSSTAT_END_RMAP_V2 (XFSSTAT_END_FIBT_V2 + __XBTS_MAX)
uint32_t xs_rmap_2[__XBTS_MAX];
-#define XFSSTAT_END_REFCOUNT (XFSSTAT_END_RMAP_V2 + __XBTS_MAX)
uint32_t xs_refcbt_2[__XBTS_MAX];
-#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_REFCOUNT + 6)
uint32_t xs_qm_dqreclaims;
uint32_t xs_qm_dqreclaim_misses;
uint32_t xs_qm_dquot_dups;
uint32_t xs_qm_dqcachemisses;
uint32_t xs_qm_dqcachehits;
uint32_t xs_qm_dqwants;
-#define XFSSTAT_END_QM (XFSSTAT_END_XQMSTAT+2)
uint32_t xs_qm_dquot;
uint32_t xs_qm_dquot_unused;
/* Extra precision counters */
@@ -163,10 +139,12 @@ struct __xfsstats {
uint64_t xs_read_bytes;
};
+#define xfsstats_offset(f) (offsetof(struct __xfsstats, f)/sizeof(uint32_t))
+
struct xfsstats {
union {
struct __xfsstats s;
- uint32_t a[XFSSTAT_END_XQMSTAT];
+ uint32_t a[xfsstats_offset(xs_qm_dquot)];
};
};
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 207ee302b1bb..d3e6cd063688 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -43,6 +43,7 @@
#include <linux/dax.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <linux/magic.h>
#include <linux/mount.h>
#include <linux/mempool.h>
#include <linux/writeback.h>
@@ -933,6 +934,32 @@ xfs_fs_alloc_inode(
return NULL;
}
+#ifdef DEBUG
+static void
+xfs_check_delalloc(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_bmbt_irec got;
+ struct xfs_iext_cursor icur;
+
+ if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got))
+ return;
+ do {
+ if (isnullstartblock(got.br_startblock)) {
+ xfs_warn(ip->i_mount,
+ "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]",
+ ip->i_ino,
+ whichfork == XFS_DATA_FORK ? "data" : "cow",
+ got.br_startoff, got.br_blockcount);
+ }
+ } while (xfs_iext_next_extent(ifp, &icur, &got));
+}
+#else
+#define xfs_check_delalloc(ip, whichfork) do { } while (0)
+#endif
+
/*
* Now that the generic code is guaranteed not to be accessing
* the linux inode, we can inactivate and reclaim the inode.
@@ -951,7 +978,12 @@ xfs_fs_destroy_inode(
xfs_inactive(ip);
- ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
+ if (!XFS_FORCED_SHUTDOWN(ip->i_mount) && ip->i_delayed_blks) {
+ xfs_check_delalloc(ip, XFS_DATA_FORK);
+ xfs_check_delalloc(ip, XFS_COW_FORK);
+ ASSERT(0);
+ }
+
XFS_STATS_INC(ip->i_mount, vn_reclaim);
/*
@@ -1097,7 +1129,7 @@ xfs_fs_statfs(
xfs_extlen_t lsize;
int64_t ffree;
- statp->f_type = XFS_SB_MAGIC;
+ statp->f_type = XFS_SUPER_MAGIC;
statp->f_namelen = MAXNAMELEN - 1;
id = huge_encode_dev(mp->m_ddev_targp->bt_dev);
@@ -1650,7 +1682,7 @@ xfs_fs_fill_super(
* we must configure the block size in the superblock before we run the
* full mount process as the mount process can lookup and cache inodes.
*/
- sb->s_magic = XFS_SB_MAGIC;
+ sb->s_magic = XFS_SUPER_MAGIC;
sb->s_blocksize = mp->m_sb.sb_blocksize;
sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index ad315e83bc02..3043e5ed6495 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -473,7 +473,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock);
-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push);
DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index bedc5a5133a5..912b42f5fe4a 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -259,6 +259,14 @@ xfs_trans_alloc(
struct xfs_trans *tp;
int error;
+ /*
+ * Allocate the handle before we do our freeze accounting and setting up
+ * GFP_NOFS allocation context so that we avoid lockdep false positives
+ * by doing GFP_KERNEL allocations inside sb_start_intwrite().
+ */
+ tp = kmem_zone_zalloc(xfs_trans_zone,
+ (flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP);
+
if (!(flags & XFS_TRANS_NO_WRITECOUNT))
sb_start_intwrite(mp->m_super);
@@ -270,8 +278,6 @@ xfs_trans_alloc(
mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
atomic_inc(&mp->m_active_trans);
- tp = kmem_zone_zalloc(xfs_trans_zone,
- (flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP);
tp->t_magic = XFS_TRANS_HEADER_MAGIC;
tp->t_flags = flags;
tp->t_mountp = mp;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c3d278e96ad1..a0c5dbda18aa 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -220,6 +220,7 @@ void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint,
uint);
void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *);
+bool xfs_trans_buf_is_dirty(struct xfs_buf *bp);
void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
void xfs_extent_free_init_defer_op(void);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 55326f971cb3..d3a4e89bf4a0 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -531,17 +531,33 @@ xfsaild(
set_current_state(TASK_INTERRUPTIBLE);
/*
- * Check kthread_should_stop() after we set the task state
- * to guarantee that we either see the stop bit and exit or
- * the task state is reset to runnable such that it's not
- * scheduled out indefinitely and detects the stop bit at
- * next iteration.
- *
+ * Check kthread_should_stop() after we set the task state to
+ * guarantee that we either see the stop bit and exit or the
+ * task state is reset to runnable such that it's not scheduled
+ * out indefinitely and detects the stop bit at next iteration.
* A memory barrier is included in above task state set to
* serialize again kthread_stop().
*/
if (kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
+
+ /*
+ * The caller forces out the AIL before stopping the
+ * thread in the common case, which means the delwri
+ * queue is drained. In the shutdown case, the queue may
+ * still hold relogged buffers that haven't been
+ * submitted because they were pinned since added to the
+ * queue.
+ *
+ * Log I/O error processing stales the underlying buffer
+ * and clears the delwri state, expecting the buf to be
+ * removed on the next submission attempt. That won't
+ * happen if we're shutting down, so this is the last
+ * opportunity to release such buffers from the queue.
+ */
+ ASSERT(list_empty(&ailp->ail_buf_list) ||
+ XFS_FORCED_SHUTDOWN(ailp->ail_mount));
+ xfs_buf_delwri_cancel(&ailp->ail_buf_list);
break;
}
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 15919f67a88f..629f1479c9d2 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -264,11 +264,39 @@ xfs_trans_read_buf_map(
return -EIO;
}
+ /*
+ * Check if the caller is trying to read a buffer that is
+ * already attached to the transaction yet has no buffer ops
+ * assigned. Ops are usually attached when the buffer is
+ * attached to the transaction, or by the read caller in
+ * special circumstances. That didn't happen, which is not
+ * how this is supposed to go.
+ *
+ * If the buffer passes verification we'll let this go, but if
+ * not we have to shut down. Let the transaction cleanup code
+ * release this buffer when it kills the transaction.
+ */
+ ASSERT(bp->b_ops != NULL);
+ error = xfs_buf_ensure_ops(bp, ops);
+ if (error) {
+ xfs_buf_ioerror_alert(bp, __func__);
+
+ if (tp->t_flags & XFS_TRANS_DIRTY)
+ xfs_force_shutdown(tp->t_mountp,
+ SHUTDOWN_META_IO_ERROR);
+
+ /* bad CRC means corrupted metadata */
+ if (error == -EFSBADCRC)
+ error = -EFSCORRUPTED;
+ return error;
+ }
+
bip = bp->b_log_item;
bip->bli_recur++;
ASSERT(atomic_read(&bip->bli_refcount) > 0);
trace_xfs_trans_read_buf_recur(bip);
+ ASSERT(bp->b_ops != NULL || ops == NULL);
*bpp = bp;
return 0;
}
@@ -316,55 +344,58 @@ xfs_trans_read_buf_map(
_xfs_trans_bjoin(tp, bp, 1);
trace_xfs_trans_read_buf(bp->b_log_item);
}
+ ASSERT(bp->b_ops != NULL || ops == NULL);
*bpp = bp;
return 0;
}
+/* Has this buffer been dirtied by anyone? */
+bool
+xfs_trans_buf_is_dirty(
+ struct xfs_buf *bp)
+{
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
+ if (!bip)
+ return false;
+ ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
+ return test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags);
+}
+
/*
- * Release the buffer bp which was previously acquired with one of the
- * xfs_trans_... buffer allocation routines if the buffer has not
- * been modified within this transaction. If the buffer is modified
- * within this transaction, do decrement the recursion count but do
- * not release the buffer even if the count goes to 0. If the buffer is not
- * modified within the transaction, decrement the recursion count and
- * release the buffer if the recursion count goes to 0.
+ * Release a buffer previously joined to the transaction. If the buffer is
+ * modified within this transaction, decrement the recursion count but do not
+ * release the buffer even if the count goes to 0. If the buffer is not modified
+ * within the transaction, decrement the recursion count and release the buffer
+ * if the recursion count goes to 0.
*
- * If the buffer is to be released and it was not modified before
- * this transaction began, then free the buf_log_item associated with it.
+ * If the buffer is to be released and it was not already dirty before this
+ * transaction began, then also free the buf_log_item associated with it.
*
- * If the transaction pointer is NULL, make this just a normal
- * brelse() call.
+ * If the transaction pointer is NULL, this is a normal xfs_buf_relse() call.
*/
void
xfs_trans_brelse(
- xfs_trans_t *tp,
- xfs_buf_t *bp)
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
{
- struct xfs_buf_log_item *bip;
- int freed;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
- /*
- * Default to a normal brelse() call if the tp is NULL.
- */
- if (tp == NULL) {
- ASSERT(bp->b_transp == NULL);
+ ASSERT(bp->b_transp == tp);
+
+ if (!tp) {
xfs_buf_relse(bp);
return;
}
- ASSERT(bp->b_transp == tp);
- bip = bp->b_log_item;
+ trace_xfs_trans_brelse(bip);
ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
- ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
- ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL));
ASSERT(atomic_read(&bip->bli_refcount) > 0);
- trace_xfs_trans_brelse(bip);
-
/*
- * If the release is just for a recursive lock,
- * then decrement the count and return.
+ * If the release is for a recursive lookup, then decrement the count
+ * and return.
*/
if (bip->bli_recur > 0) {
bip->bli_recur--;
@@ -372,64 +403,24 @@ xfs_trans_brelse(
}
/*
- * If the buffer is dirty within this transaction, we can't
+ * If the buffer is invalidated or dirty in this transaction, we can't
* release it until we commit.
*/
if (test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags))
return;
-
- /*
- * If the buffer has been invalidated, then we can't release
- * it until the transaction commits to disk unless it is re-dirtied
- * as part of this transaction. This prevents us from pulling
- * the item from the AIL before we should.
- */
if (bip->bli_flags & XFS_BLI_STALE)
return;
- ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
-
/*
- * Free up the log item descriptor tracking the released item.
+ * Unlink the log item from the transaction and clear the hold flag, if
+ * set. We wouldn't want the next user of the buffer to get confused.
*/
+ ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
xfs_trans_del_item(&bip->bli_item);
+ bip->bli_flags &= ~XFS_BLI_HOLD;
- /*
- * Clear the hold flag in the buf log item if it is set.
- * We wouldn't want the next user of the buffer to
- * get confused.
- */
- if (bip->bli_flags & XFS_BLI_HOLD) {
- bip->bli_flags &= ~XFS_BLI_HOLD;
- }
-
- /*
- * Drop our reference to the buf log item.
- */
- freed = atomic_dec_and_test(&bip->bli_refcount);
-
- /*
- * If the buf item is not tracking data in the log, then we must free it
- * before releasing the buffer back to the free pool.
- *
- * If the fs has shutdown and we dropped the last reference, it may fall
- * on us to release a (possibly dirty) bli if it never made it to the
- * AIL (e.g., the aborted unpin already happened and didn't release it
- * due to our reference). Since we're already shutdown and need
- * ail_lock, just force remove from the AIL and release the bli here.
- */
- if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) {
- xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR);
- xfs_buf_item_relse(bp);
- } else if (!(bip->bli_flags & XFS_BLI_DIRTY)) {
-/***
- ASSERT(bp->b_pincount == 0);
-***/
- ASSERT(atomic_read(&bip->bli_refcount) == 0);
- ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
- ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF));
- xfs_buf_item_relse(bp);
- }
+ /* drop the reference to the bli */
+ xfs_buf_item_put(bip);
bp->b_transp = NULL;
xfs_buf_relse(bp);