aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs/libxfs/xfs_attr.c
diff options
context:
space:
mode:
authorDave Chinner <dchinner@redhat.com>2022-05-12 15:12:56 +1000
committerDave Chinner <david@fromorbit.com>2022-05-12 15:12:56 +1000
commitfdaf1bb3cafcfee9ef05c4eaf6ee1193fd90cbd2 (patch)
tree7a3fb4d3164766d7cd1c51531cb3bc8fdcbf1383 /fs/xfs/libxfs/xfs_attr.c
parentxfs: use XFS_DA_OP flags in deferred attr ops (diff)
downloadlinux-dev-fdaf1bb3cafcfee9ef05c4eaf6ee1193fd90cbd2.tar.xz
linux-dev-fdaf1bb3cafcfee9ef05c4eaf6ee1193fd90cbd2.zip
xfs: ATTR_REPLACE algorithm with LARP enabled needs rework
We can't use the same algorithm for replacing an existing attribute when logging attributes. The existing algorithm is essentially: 1. create new attr w/ INCOMPLETE 2. atomically flip INCOMPLETE flags between old + new attribute 3. remove old attr which is marked w/ INCOMPLETE This algorithm guarantees that we see either the old or new attribute, and if we fail after the atomic flag flip, we don't have to recover the removal of the old attr because we never see INCOMPLETE attributes in lookups. For logged attributes, however, this does not work. The logged attribute intents do not track the work that has been done as the transaction rolls, and hence the only recovery mechanism we have is "run the replace operation from scratch". This is further exacerbated by the attempt to avoid needing the INCOMPLETE flag to create an atomic swap. This means we can create a second active attribute of the same name before we remove the original. If we fail at any point after the create but before the removal has completed, we end up with duplicate attributes in the attr btree and recovery only tries to replace one of them. There are several other failure modes where we can leave partially allocated remote attributes that expose stale data, partially free remote attributes that enable UAF based stale data exposure, etc. TO fix this, we need a different algorithm for replace operations when LARP is enabled. Luckily, it's not that complex if we take the right first step. That is, the first thing we log is the attri intent with the new name/value pair and mark the old attr as INCOMPLETE in the same transaction. From there, we then remove the old attr and keep relogging the new name/value in the intent, such that we always know that we have to create the new attr in recovery. Once the old attr is removed, we then run a normal ATTR_CREATE operation relogging the intent as we go. If the new attr is local, then it gets created in a single atomic transaction that also logs the final intent done. If the new attr is remote, the we set INCOMPLETE on the new attr while we allocate and set the remote value, and then we clear the INCOMPLETE flag at in the last transaction taht logs the final intent done. If we fail at any point in this algorithm, log recovery will always see the same state on disk: the new name/value in the intent, and either an INCOMPLETE attr or no attr in the attr btree. If we find an INCOMPLETE attr, we run the full replace starting with removing the INCOMPLETE attr. If we don't find it, then we simply create the new attr. Notably, recovery of a failed create that has an INCOMPLETE flag set is now the same - we start with the lookup of the INCOMPLETE attr, and if that exists then we do the full replace recovery process, otherwise we just create the new attr. Hence changing the way we do the replace operation when LARP is enabled allows us to use the same log recovery algorithm for both the ATTR_CREATE and ATTR_REPLACE operations. This is also the same algorithm we use for runtime ATTR_REPLACE operations (except for the step setting up the initial conditions). The result is that: - ATTR_CREATE uses the same algorithm regardless of whether LARP is enabled or not - ATTR_REPLACE with larp=0 is identical to the old algorithm - ATTR_REPLACE with larp=1 runs an unmodified attr removal algorithm from the larp=0 code and then runs the unmodified ATTR_CREATE code. - log recovery when larp=1 runs the same ATTR_REPLACE algorithm as it uses at runtime. Because the state machine is now quite clean, changing the algorithm is really just a case of changing the initial state and how the states link together for the ATTR_REPLACE case. Hence it's not a huge amount of code for what is a fairly substantial rework of the attr logging and recovery algorithm.... Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Allison Henderson <allison.henderson@oracle.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Dave Chinner <david@fromorbit.com>
Diffstat (limited to 'fs/xfs/libxfs/xfs_attr.c')
-rw-r--r--fs/xfs/libxfs/xfs_attr.c97
1 files changed, 58 insertions, 39 deletions
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 5072c156833b..14ae0826bc15 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -68,9 +68,12 @@ int
xfs_inode_hasattr(
struct xfs_inode *ip)
{
- if (!XFS_IFORK_Q(ip) ||
- (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
- ip->i_afp->if_nextents == 0))
+ if (!XFS_IFORK_Q(ip))
+ return 0;
+ if (!ip->i_afp)
+ return 0;
+ if (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_afp->if_nextents == 0)
return 0;
return 1;
}
@@ -408,23 +411,30 @@ out:
}
/*
- * When we bump the state to REPLACE, we may actually need to skip over the
- * state. When LARP mode is enabled, we don't need to run the atomic flags flip,
- * so we skip straight over the REPLACE state and go on to REMOVE_OLD.
+ * Handle the state change on completion of a multi-state attr operation.
+ *
+ * If the XFS_DA_OP_REPLACE flag is set, this means the operation was the first
+ * modification in a attr replace operation and we still have to do the second
+ * state, indicated by @replace_state.
+ *
+ * We consume the XFS_DA_OP_REPLACE flag so that when we are called again on
+ * completion of the second half of the attr replace operation we correctly
+ * signal that it is done.
*/
-static void
-xfs_attr_dela_state_set_replace(
+static enum xfs_delattr_state
+xfs_attr_complete_op(
struct xfs_attr_item *attr,
- enum xfs_delattr_state replace)
+ enum xfs_delattr_state replace_state)
{
struct xfs_da_args *args = attr->xattri_da_args;
+ bool do_replace = args->op_flags & XFS_DA_OP_REPLACE;
- ASSERT(replace == XFS_DAS_LEAF_REPLACE ||
- replace == XFS_DAS_NODE_REPLACE);
-
- attr->xattri_dela_state = replace;
- if (xfs_has_larp(args->dp->i_mount))
- attr->xattri_dela_state++;
+ args->op_flags &= ~XFS_DA_OP_REPLACE;
+ if (do_replace) {
+ args->attr_filter &= ~XFS_ATTR_INCOMPLETE;
+ return replace_state;
+ }
+ return XFS_DAS_DONE;
}
static int
@@ -466,10 +476,9 @@ xfs_attr_leaf_addname(
*/
if (args->rmtblkno)
attr->xattri_dela_state = XFS_DAS_LEAF_SET_RMT;
- else if (args->op_flags & XFS_DA_OP_REPLACE)
- xfs_attr_dela_state_set_replace(attr, XFS_DAS_LEAF_REPLACE);
else
- attr->xattri_dela_state = XFS_DAS_DONE;
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ XFS_DAS_LEAF_REPLACE);
out:
trace_xfs_attr_leaf_addname_return(attr->xattri_dela_state, args->dp);
return error;
@@ -511,10 +520,9 @@ xfs_attr_node_addname(
if (args->rmtblkno)
attr->xattri_dela_state = XFS_DAS_NODE_SET_RMT;
- else if (args->op_flags & XFS_DA_OP_REPLACE)
- xfs_attr_dela_state_set_replace(attr, XFS_DAS_NODE_REPLACE);
else
- attr->xattri_dela_state = XFS_DAS_DONE;
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ XFS_DAS_NODE_REPLACE);
out:
trace_xfs_attr_node_addname_return(attr->xattri_dela_state, args->dp);
return error;
@@ -546,18 +554,15 @@ xfs_attr_rmtval_alloc(
if (error)
return error;
- /* If this is not a rename, clear the incomplete flag and we're done. */
- if (!(args->op_flags & XFS_DA_OP_REPLACE)) {
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ ++attr->xattri_dela_state);
+ /*
+ * If we are not doing a rename, we've finished the operation but still
+ * have to clear the incomplete flag protecting the new attr from
+ * exposing partially initialised state if we crash during creation.
+ */
+ if (attr->xattri_dela_state == XFS_DAS_DONE)
error = xfs_attr3_leaf_clearflag(args);
- attr->xattri_dela_state = XFS_DAS_DONE;
- } else {
- /*
- * We are running a REPLACE operation, so we need to bump the
- * state to the step in that operation.
- */
- attr->xattri_dela_state++;
- xfs_attr_dela_state_set_replace(attr, attr->xattri_dela_state);
- }
out:
trace_xfs_attr_rmtval_alloc(attr->xattri_dela_state, args->dp);
return error;
@@ -714,13 +719,24 @@ next_state:
return xfs_attr_node_addname(attr);
case XFS_DAS_SF_REMOVE:
- attr->xattri_dela_state = XFS_DAS_DONE;
- return xfs_attr_sf_removename(args);
+ error = xfs_attr_sf_removename(args);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ break;
case XFS_DAS_LEAF_REMOVE:
- attr->xattri_dela_state = XFS_DAS_DONE;
- return xfs_attr_leaf_removename(args);
+ error = xfs_attr_leaf_removename(args);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ break;
case XFS_DAS_NODE_REMOVE:
error = xfs_attr_node_removename_setup(attr);
+ if (error == -ENOATTR &&
+ (args->op_flags & XFS_DA_OP_RECOVERY)) {
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ error = 0;
+ break;
+ }
if (error)
return error;
attr->xattri_dela_state = XFS_DAS_NODE_REMOVE_RMT;
@@ -806,14 +822,16 @@ next_state:
case XFS_DAS_LEAF_REMOVE_ATTR:
error = xfs_attr_leaf_remove_attr(attr);
- attr->xattri_dela_state = XFS_DAS_DONE;
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
break;
case XFS_DAS_NODE_REMOVE_ATTR:
error = xfs_attr_node_remove_attr(attr);
if (!error)
error = xfs_attr_leaf_shrink(args);
- attr->xattri_dela_state = XFS_DAS_DONE;
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
break;
default:
ASSERT(0);
@@ -1315,9 +1333,10 @@ xfs_attr_leaf_removename(
dp = args->dp;
error = xfs_attr_leaf_hasname(args, &bp);
-
if (error == -ENOATTR) {
xfs_trans_brelse(args->trans, bp);
+ if (args->op_flags & XFS_DA_OP_RECOVERY)
+ return 0;
return error;
} else if (error != -EEXIST)
return error;