diff options
-rw-r--r-- | fs/xfs/Makefile | 2 | ||||
-rw-r--r-- | fs/xfs/libxfs/xfs_fs.h | 4 | ||||
-rw-r--r-- | fs/xfs/libxfs/xfs_health.h | 4 | ||||
-rw-r--r-- | fs/xfs/scrub/common.c | 3 | ||||
-rw-r--r-- | fs/xfs/scrub/common.h | 1 | ||||
-rw-r--r-- | fs/xfs/scrub/health.c | 1 | ||||
-rw-r--r-- | fs/xfs/scrub/nlinks.c | 930 | ||||
-rw-r--r-- | fs/xfs/scrub/nlinks.h | 102 | ||||
-rw-r--r-- | fs/xfs/scrub/nlinks_repair.c | 223 | ||||
-rw-r--r-- | fs/xfs/scrub/repair.h | 2 | ||||
-rw-r--r-- | fs/xfs/scrub/scrub.c | 9 | ||||
-rw-r--r-- | fs/xfs/scrub/scrub.h | 5 | ||||
-rw-r--r-- | fs/xfs/scrub/stats.c | 1 | ||||
-rw-r--r-- | fs/xfs/scrub/trace.c | 2 | ||||
-rw-r--r-- | fs/xfs/scrub/trace.h | 183 | ||||
-rw-r--r-- | fs/xfs/xfs_health.c | 1 | ||||
-rw-r--r-- | fs/xfs/xfs_inode.c | 117 | ||||
-rw-r--r-- | fs/xfs/xfs_inode.h | 31 | ||||
-rw-r--r-- | fs/xfs/xfs_mount.h | 3 | ||||
-rw-r--r-- | fs/xfs/xfs_super.c | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_symlink.c | 1 |
21 files changed, 1623 insertions, 4 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 68891e6ee08e..253744092915 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -160,6 +160,7 @@ xfs-y += $(addprefix scrub/, \ ialloc.o \ inode.o \ iscan.o \ + nlinks.o \ parent.o \ readdir.o \ refcount.o \ @@ -193,6 +194,7 @@ xfs-y += $(addprefix scrub/, \ ialloc_repair.o \ inode_repair.o \ newbt.o \ + nlinks_repair.o \ reap.o \ refcount_repair.o \ repair.o \ diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 07acbed9235c..515cd27d3b3a 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -196,6 +196,7 @@ struct xfs_fsop_geom { #define XFS_FSOP_GEOM_SICK_RT_BITMAP (1 << 4) /* realtime bitmap */ #define XFS_FSOP_GEOM_SICK_RT_SUMMARY (1 << 5) /* realtime summary */ #define XFS_FSOP_GEOM_SICK_QUOTACHECK (1 << 6) /* quota counts */ +#define XFS_FSOP_GEOM_SICK_NLINKS (1 << 7) /* inode link counts */ /* Output for XFS_FS_COUNTS */ typedef struct xfs_fsop_counts { @@ -711,9 +712,10 @@ struct xfs_scrub_metadata { #define XFS_SCRUB_TYPE_PQUOTA 23 /* project quotas */ #define XFS_SCRUB_TYPE_FSCOUNTERS 24 /* fs summary counters */ #define XFS_SCRUB_TYPE_QUOTACHECK 25 /* quota counters */ +#define XFS_SCRUB_TYPE_NLINKS 26 /* inode link counts */ /* Number of scrub subcommands. */ -#define XFS_SCRUB_TYPE_NR 26 +#define XFS_SCRUB_TYPE_NR 27 /* i: Repair this metadata. */ #define XFS_SCRUB_IFLAG_REPAIR (1u << 0) diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index 5626e53b3f0f..2bfe2dc404a1 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -42,6 +42,7 @@ struct xfs_fsop_geom; #define XFS_SICK_FS_GQUOTA (1 << 2) /* group quota */ #define XFS_SICK_FS_PQUOTA (1 << 3) /* project quota */ #define XFS_SICK_FS_QUOTACHECK (1 << 4) /* quota counts */ +#define XFS_SICK_FS_NLINKS (1 << 5) /* inode link counts */ /* Observable health issues for realtime volume metadata. 
*/ #define XFS_SICK_RT_BITMAP (1 << 0) /* realtime bitmap */ @@ -79,7 +80,8 @@ struct xfs_fsop_geom; XFS_SICK_FS_UQUOTA | \ XFS_SICK_FS_GQUOTA | \ XFS_SICK_FS_PQUOTA | \ - XFS_SICK_FS_QUOTACHECK) + XFS_SICK_FS_QUOTACHECK | \ + XFS_SICK_FS_NLINKS) #define XFS_SICK_RT_PRIMARY (XFS_SICK_RT_BITMAP | \ XFS_SICK_RT_SUMMARY) diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index c5a6c47d3df2..699092195f41 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -1302,6 +1302,9 @@ xchk_fsgates_enable( if (scrub_fsgates & XCHK_FSGATES_QUOTA) xfs_dqtrx_hook_enable(); + if (scrub_fsgates & XCHK_FSGATES_DIRENTS) + xfs_dir_hook_enable(); + sc->flags |= scrub_fsgates; } diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index eb51037cd0d2..529a510dc76f 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -129,6 +129,7 @@ xchk_setup_quotacheck(struct xfs_scrub *sc) } #endif int xchk_setup_fscounters(struct xfs_scrub *sc); +int xchk_setup_nlinks(struct xfs_scrub *sc); void xchk_ag_free(struct xfs_scrub *sc, struct xchk_ag *sa); int xchk_ag_init(struct xfs_scrub *sc, xfs_agnumber_t agno, diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index 3c9eac070796..34519fbc2d40 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -106,6 +106,7 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_PQUOTA] = { XHG_FS, XFS_SICK_FS_PQUOTA }, [XFS_SCRUB_TYPE_FSCOUNTERS] = { XHG_FS, XFS_SICK_FS_COUNTERS }, [XFS_SCRUB_TYPE_QUOTACHECK] = { XHG_FS, XFS_SICK_FS_QUOTACHECK }, + [XFS_SCRUB_TYPE_NLINKS] = { XHG_FS, XFS_SICK_FS_NLINKS }, }; /* Return the health status mask for this scrub type. */ diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c new file mode 100644 index 000000000000..8a7d9557897c --- /dev/null +++ b/fs/xfs/scrub/nlinks.c @@ -0,0 +1,930 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_iwalk.h" +#include "xfs_ialloc.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_ag.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/iscan.h" +#include "scrub/nlinks.h" +#include "scrub/trace.h" +#include "scrub/readdir.h" + +/* + * Live Inode Link Count Checking + * ============================== + * + * Inode link counts are "summary" metadata, in the sense that they are + * computed as the number of directory entries referencing each file on the + * filesystem. Therefore, we compute the correct link counts by creating a + * shadow link count structure and walking every inode. + */ + +/* + * Set us up to scrub inode link counts. Enables the XCHK_FSGATES_DIRENTS + * gate, allocates a zeroed struct xchk_nlink_ctrs into sc->buf, and then + * returns the result of xchk_setup_fs(). Returns -ENOMEM if the allocation + * fails. + */ +int +xchk_setup_nlinks( + struct xfs_scrub *sc) +{ + xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS); + + sc->buf = kzalloc(sizeof(struct xchk_nlink_ctrs), XCHK_GFP_FLAGS); + if (!sc->buf) + return -ENOMEM; + + return xchk_setup_fs(sc); +} + +/* + * Part 1: Collecting file link counts. For each file, we create a shadow link + * counting structure, then walk the entire directory tree, incrementing parent + * and child link counts for each directory entry seen. + * + * To avoid false corruption reports in part 2, any failure in this part must + * set the INCOMPLETE flag even when a negative errno is returned. This care + * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, + * ECANCELED) that are absorbed into a scrub state flag update by + * xchk_*_process_error. Scrub and repair share the same incore data + * structures, so the INCOMPLETE flag is critical to prevent a repair based on + * insufficient information.
+ * + * Because we are scanning a live filesystem, it's possible that another thread + * will try to update the link counts for an inode that we've already scanned. + * This will cause our counts to be incorrect. Therefore, we hook all + * directory entry updates because that is when link count updates occur. By + * shadowing transaction updates in this manner, live nlink check can ensure by + * locking the inode and the shadow structure that its own copies are not out + * of date. Because the hook code runs in a different process context from the + * scrub code and the scrub state flags are not accessed atomically, failures + * in the hook code must abort the iscan and the scrubber must notice the + * aborted scan and set the incomplete flag. + * + * Note that we use jump labels and srcu notifier hooks to minimize the + * overhead when live nlinks is /not/ running. Locking order for nlink + * observations is inode ILOCK -> iscan_lock/xchk_nlink_ctrs lock. + */ + +/* + * Add a delta to an nlink counter, clamping the value to U32_MAX. Because + * XFS_MAXLINK < U32_MAX, the checking code will produce the correct results + * even if we lose some precision. + * + * The sum is computed as a uint64_t, so a negative delta that would take the + * counter below zero wraps to a huge value and is likewise stored as U32_MAX. + */ +static inline void +careful_add( + xfs_nlink_t *nlinkp, + int delta) +{ + uint64_t new_value = (uint64_t)(*nlinkp) + delta; + + BUILD_BUG_ON(XFS_MAXLINK > U32_MAX); + *nlinkp = min_t(uint64_t, new_value, U32_MAX); +} + +/* Update incore link count information. Caller must hold the nlinks lock.
*/ +STATIC int +xchk_nlinks_update_incore( + struct xchk_nlink_ctrs *xnc, + xfs_ino_t ino, + int parents_delta, + int backrefs_delta, + int children_delta) +{ + struct xchk_nlink nl; + int error; + + /* No shadow array (e.g. it was torn down), so there is nothing to update. */ + if (!xnc->nlinks) + return 0; + + error = xfarray_load_sparse(xnc->nlinks, ino, &nl); + if (error) + return error; + + trace_xchk_nlinks_update_incore(xnc->sc->mp, ino, &nl, parents_delta, + backrefs_delta, children_delta); + + careful_add(&nl.parents, parents_delta); + careful_add(&nl.backrefs, backrefs_delta); + careful_add(&nl.children, children_delta); + + nl.flags |= XCHK_NLINK_WRITTEN; + error = xfarray_store(xnc->nlinks, ino, &nl); + if (error == -EFBIG) { + /* + * EFBIG means we tried to store data at too high a byte offset + * in the sparse array. IOWs, we cannot complete the check and + * must notify userspace that the check was incomplete. + */ + error = -ECANCELED; + } + return error; +} + +/* + * Apply a link count change from the regular filesystem into our shadow link + * count structure based on a directory update in progress. + * + * This is a notifier callback and always returns NOTIFY_DONE; on failure it + * aborts the collection iscan rather than reporting an error to the caller. + */ +STATIC int +xchk_nlinks_live_update( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_dir_update_params *p = data; + struct xchk_nlink_ctrs *xnc; + int error; + + xnc = container_of(nb, struct xchk_nlink_ctrs, dhook.dirent_hook.nb); + + trace_xchk_nlinks_live_update(xnc->sc->mp, p->dp, action, p->ip->i_ino, + p->delta, p->name->name, p->name->len); + + /* + * If we've already scanned @dp, update the number of parents that link + * to @ip. If @ip is a subdirectory, update the number of child links + * going out of @dp.
+ */ + if (xchk_iscan_want_live_update(&xnc->collect_iscan, p->dp->i_ino)) { + mutex_lock(&xnc->lock); + error = xchk_nlinks_update_incore(xnc, p->ip->i_ino, p->delta, + 0, 0); + if (!error && S_ISDIR(VFS_IC(p->ip)->i_mode)) + error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0, + 0, p->delta); + mutex_unlock(&xnc->lock); + if (error) + goto out_abort; + } + + /* + * If @ip is a subdirectory and we've already scanned it, update the + * number of backrefs pointing to @dp. + */ + if (S_ISDIR(VFS_IC(p->ip)->i_mode) && + xchk_iscan_want_live_update(&xnc->collect_iscan, p->ip->i_ino)) { + mutex_lock(&xnc->lock); + error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0, + p->delta, 0); + mutex_unlock(&xnc->lock); + if (error) + goto out_abort; + } + + return NOTIFY_DONE; + +out_abort: + xchk_iscan_abort(&xnc->collect_iscan); + return NOTIFY_DONE; +} + +/* + * Bump the observed link count for the inode referenced by this entry. + * + * Every failure path marks the scrub incomplete. All of them except the + * "scan was already aborted" case also abort the collection iscan. + */ +STATIC int +xchk_nlinks_collect_dirent( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + xfs_ino_t ino, + void *priv) +{ + struct xchk_nlink_ctrs *xnc = priv; + bool dot = false, dotdot = false; + int error; + + /* Does this name make sense? */ + if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) { + error = -ECANCELED; + goto out_abort; + } + + if (name->len == 1 && name->name[0] == '.') + dot = true; + else if (name->len == 2 && name->name[0] == '.' && + name->name[1] == '.') + dotdot = true; + + /* Don't accept a '.' entry that points somewhere else. */ + if (dot && ino != dp->i_ino) { + error = -ECANCELED; + goto out_abort; + } + + /* Don't accept an invalid inode number. */ + if (!xfs_verify_dir_ino(sc->mp, ino)) { + error = -ECANCELED; + goto out_abort; + } + + /* Update the shadow link counts if we haven't already failed.
*/ + + if (xchk_iscan_aborted(&xnc->collect_iscan)) { + error = -ECANCELED; + goto out_incomplete; + } + + trace_xchk_nlinks_collect_dirent(sc->mp, dp, ino, name); + + mutex_lock(&xnc->lock); + + /* + * If this is a dotdot entry, it is a back link from dp to ino. How + * we handle this depends on whether or not dp is the root directory. + * + * The root directory is its own parent, so we pretend the dotdot entry + * establishes the "parent" of the root directory. Increment the + * number of parents of the root directory. + * + * Otherwise, increment the number of backrefs pointing back to ino. + */ + if (dotdot) { + if (dp == sc->mp->m_rootip) + error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0); + else + error = xchk_nlinks_update_incore(xnc, ino, 0, 1, 0); + if (error) + goto out_unlock; + } + + /* + * If this dirent is a forward link from dp to ino, increment the + * number of parents linking into ino. + */ + if (!dot && !dotdot) { + error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0); + if (error) + goto out_unlock; + } + + /* + * If this dirent is a forward link to a subdirectory, increment the + * number of child links of dp. + */ + if (!dot && !dotdot && name->type == XFS_DIR3_FT_DIR) { + error = xchk_nlinks_update_incore(xnc, dp->i_ino, 0, 0, 1); + if (error) + goto out_unlock; + } + + mutex_unlock(&xnc->lock); + return 0; + +out_unlock: + mutex_unlock(&xnc->lock); +out_abort: + xchk_iscan_abort(&xnc->collect_iscan); +out_incomplete: + xchk_set_incomplete(sc); + return error; +} + +/* + * Walk a directory to bump the observed link counts of the children. + * + * A directory with a zero link count is skipped and is not marked visited. + * -ECANCELED from the directory walk is absorbed and 0 is returned; any other + * failure marks the scrub incomplete and aborts the collection iscan. + */ +STATIC int +xchk_nlinks_collect_dir( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode *dp) +{ + struct xfs_scrub *sc = xnc->sc; + unsigned int lock_mode; + int error = 0; + + /* Prevent anyone from changing this directory while we walk it.
*/ + xfs_ilock(dp, XFS_IOLOCK_SHARED); + lock_mode = xfs_ilock_data_map_shared(dp); + + /* + * The dotdot entry of an unlinked directory still points to the last + * parent, but the parent no longer links to this directory. Skip the + * directory to avoid overcounting. + */ + if (VFS_I(dp)->i_nlink == 0) + goto out_unlock; + + /* + * We cannot count file links if the directory looks as though it has + * been zapped by the inode record repair code. + */ + if (xchk_dir_looks_zapped(dp)) { + error = -EBUSY; + goto out_abort; + } + + error = xchk_dir_walk(sc, dp, xchk_nlinks_collect_dirent, xnc); + if (error == -ECANCELED) { + error = 0; + goto out_unlock; + } + if (error) + goto out_abort; + + xchk_iscan_mark_visited(&xnc->collect_iscan, dp); + goto out_unlock; + +out_abort: + xchk_set_incomplete(sc); + xchk_iscan_abort(&xnc->collect_iscan); +out_unlock: + xfs_iunlock(dp, lock_mode); + xfs_iunlock(dp, XFS_IOLOCK_SHARED); + return error; +} + +/* + * If this looks like a valid pointer, count it. Inode numbers that fail + * xfs_verify_ino() are skipped and 0 is returned. + */ +static inline int +xchk_nlinks_collect_metafile( + struct xchk_nlink_ctrs *xnc, + xfs_ino_t ino) +{ + if (!xfs_verify_ino(xnc->sc->mp, ino)) + return 0; + + trace_xchk_nlinks_collect_metafile(xnc->sc->mp, ino); + return xchk_nlinks_update_incore(xnc, ino, 1, 0, 0); +} + +/* Bump the link counts of metadata files rooted in the superblock.
*/ +STATIC int +xchk_nlinks_collect_metafiles( + struct xchk_nlink_ctrs *xnc) +{ + struct xfs_mount *mp = xnc->sc->mp; + int error = -ECANCELED; + + + if (xchk_iscan_aborted(&xnc->collect_iscan)) + goto out_incomplete; + + mutex_lock(&xnc->lock); + error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rbmino); + if (error) + goto out_abort; + + error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rsumino); + if (error) + goto out_abort; + + error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_uquotino); + if (error) + goto out_abort; + + error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_gquotino); + if (error) + goto out_abort; + + error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_pquotino); + if (error) + goto out_abort; + mutex_unlock(&xnc->lock); + + return 0; + +out_abort: + mutex_unlock(&xnc->lock); + xchk_iscan_abort(&xnc->collect_iscan); +out_incomplete: + xchk_set_incomplete(xnc->sc); + return error; +} + +/* + * Advance the collection scan cursor for this non-directory file. The inode + * is marked visited while holding its IOLOCK shared. Always returns 0. + */ +static inline int +xchk_nlinks_collect_file( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode *ip) +{ + xfs_ilock(ip, XFS_IOLOCK_SHARED); + xchk_iscan_mark_visited(&xnc->collect_iscan, ip); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return 0; +} + +/* + * Walk all directories and count inode links. + * + * Any failure of the inode walk marks the scrub incomplete; -EBUSY is then + * reported to the caller as -ECANCELED. + */ +STATIC int +xchk_nlinks_collect( + struct xchk_nlink_ctrs *xnc) +{ + struct xfs_scrub *sc = xnc->sc; + struct xfs_inode *ip; + int error; + + /* Count the rt and quota files that are rooted in the superblock. */ + error = xchk_nlinks_collect_metafiles(xnc); + if (error) + return error; + + /* + * Set up for a potentially lengthy filesystem scan by reducing our + * transaction resource usage for the duration. Specifically: + * + * Cancel the transaction to release the log grant space while we scan + * the filesystem. + * + * Create a new empty transaction to eliminate the possibility of the + * inode scan deadlocking on cyclical metadata.
+ * + * We pass the empty transaction to the file scanning function to avoid + * repeatedly cycling empty transactions. This can be done even though + * we take the IOLOCK to quiesce the file because empty transactions + * do not take sb_internal. + */ + xchk_trans_cancel(sc); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + while ((error = xchk_iscan_iter(&xnc->collect_iscan, &ip)) == 1) { + if (S_ISDIR(VFS_I(ip)->i_mode)) + error = xchk_nlinks_collect_dir(xnc, ip); + else + error = xchk_nlinks_collect_file(xnc, ip); + xchk_irele(sc, ip); + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&xnc->collect_iscan); + if (error) { + xchk_set_incomplete(sc); + /* + * If we couldn't grab an inode that was busy with a state + * change, change the error code so that we exit to userspace + * as quickly as possible. + */ + if (error == -EBUSY) + return -ECANCELED; + return error; + } + + /* + * Switch out for a real transaction in preparation for building a new + * tree. + */ + xchk_trans_cancel(sc); + return xchk_setup_fs(sc); +} + +/* + * Part 2: Comparing file link counters. Walk each inode and compare the link + * counts against our shadow information; and then walk each shadow link count + * structure (that wasn't covered in the first part), comparing it against the + * file. + */ + +/* + * Read the observed link count for comparison with the actual inode. As a + * side effect, the stored record is rewritten with the COMPARE_SCANNED and + * WRITTEN flags set. + */ +STATIC int +xchk_nlinks_comparison_read( + struct xchk_nlink_ctrs *xnc, + xfs_ino_t ino, + struct xchk_nlink *obs) +{ + struct xchk_nlink nl; + int error; + + error = xfarray_load_sparse(xnc->nlinks, ino, &nl); + if (error) + return error; + + nl.flags |= (XCHK_NLINK_COMPARE_SCANNED | XCHK_NLINK_WRITTEN); + + error = xfarray_store(xnc->nlinks, ino, &nl); + if (error == -EFBIG) { + /* + * EFBIG means we tried to store data at too high a byte offset + * in the sparse array.
IOWs, we cannot complete the check and + * must notify userspace that the check was incomplete. This + * shouldn't really happen outside of the collection phase. + */ + xchk_set_incomplete(xnc->sc); + return -ECANCELED; + } + if (error) + return error; + + /* Copy the counters, but do not expose the internal state. */ + obs->parents = nl.parents; + obs->backrefs = nl.backrefs; + obs->children = nl.children; + obs->flags = 0; + return 0; +} + +/* + * Check our link count against an inode. Takes the inode's ILOCK shared and + * then xnc->lock. Returns -ECANCELED if the collection scan was aborted or + * if the scrub has been flagged corrupt by the time the checks finish. + */ +STATIC int +xchk_nlinks_compare_inode( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode *ip) +{ + struct xchk_nlink obs; + struct xfs_scrub *sc = xnc->sc; + uint64_t total_links; + unsigned int actual_nlink; + int error; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + mutex_lock(&xnc->lock); + + if (xchk_iscan_aborted(&xnc->collect_iscan)) { + xchk_set_incomplete(xnc->sc); + error = -ECANCELED; + goto out_scanlock; + } + + error = xchk_nlinks_comparison_read(xnc, ip->i_ino, &obs); + if (error) + goto out_scanlock; + + /* + * If we don't have ftype to get an accurate count of the subdirectory + * entries in this directory, take advantage of the fact that on a + * consistent ftype=0 filesystem, the number of subdirectory + * backreferences (dotdot entries) pointing towards this directory + * should be equal to the number of subdirectory entries in the + * directory. + */ + if (!xfs_has_ftype(sc->mp) && S_ISDIR(VFS_I(ip)->i_mode)) + obs.children = obs.backrefs; + + total_links = xchk_nlink_total(ip, &obs); + actual_nlink = VFS_I(ip)->i_nlink; + + trace_xchk_nlinks_compare_inode(sc->mp, ip, &obs); + + /* + * If we found so many parents that we'd overflow i_nlink, we must flag + * this as a corruption. The VFS won't let users increase the link + * count, but it will let them decrease it. + */ + if (total_links > XFS_MAXLINK) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + + /* Link counts should match.
*/ + if (total_links != actual_nlink) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + + if (S_ISDIR(VFS_I(ip)->i_mode) && actual_nlink > 0) { + /* + * The collection phase ignores directories with zero link + * count, so we ignore them here too. + * + * The number of subdirectory backreferences (dotdot entries) + * pointing towards this directory should be equal to the + * number of subdirectory entries in the directory. + */ + if (obs.children != obs.backrefs) + xchk_ino_xref_set_corrupt(sc, ip->i_ino); + } else { + /* + * Non-directories and unlinked directories should not have + * back references. + */ + if (obs.backrefs != 0) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + + /* + * Non-directories and unlinked directories should not have + * children. + */ + if (obs.children != 0) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + } + + if (ip == sc->mp->m_rootip) { + /* + * For the root of a directory tree, both the '.' and '..' + * entries should point to the root directory. The dotdot + * entry is counted as a parent of the root /and/ a backref of + * the root directory. + */ + if (obs.parents != 1) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + } else if (actual_nlink > 0) { + /* + * Linked files that are not the root directory should have at + * least one parent. + */ + if (obs.parents == 0) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + } + +out_corrupt: + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + error = -ECANCELED; +out_scanlock: + mutex_unlock(&xnc->lock); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return error; +} + +/* + * Check our link count against an inode that wasn't checked previously. This + * is intended to catch directories with dangling links, though we could be + * racing with inode allocation in other threads.
+ */ +STATIC int +xchk_nlinks_compare_inum( + struct xchk_nlink_ctrs *xnc, + xfs_ino_t ino) +{ + struct xchk_nlink obs; + struct xfs_mount *mp = xnc->sc->mp; + struct xfs_trans *tp = xnc->sc->tp; + struct xfs_buf *agi_bp; + struct xfs_inode *ip; + int error; + + /* + * The first iget failed, so try again with the variant that returns + * either an incore inode or the AGI buffer. If the function returns + * EINVAL/ENOENT, it should have passed us the AGI buffer so that we + * can guarantee that the inode won't be allocated while we check for + * a zero link count in the observed link count data. + */ + error = xchk_iget_agi(xnc->sc, ino, &agi_bp, &ip); + if (!error) { + /* Actually got an inode, so use the inode compare. */ + error = xchk_nlinks_compare_inode(xnc, ip); + xchk_irele(xnc->sc, ip); + return error; + } + if (error == -ENOENT || error == -EINVAL) { + /* No inode was found. Check for zero link count below. */ + error = 0; + } + /* Any other errno is a real failure; release the AGI buffer (if set) and bail. */ + if (error) + goto out_agi; + + /* Ensure that we have protected against inode allocation/freeing. */ + if (agi_bp == NULL) { + ASSERT(agi_bp != NULL); + xchk_set_incomplete(xnc->sc); + return -ECANCELED; + } + + if (xchk_iscan_aborted(&xnc->collect_iscan)) { + xchk_set_incomplete(xnc->sc); + error = -ECANCELED; + goto out_agi; + } + + mutex_lock(&xnc->lock); + error = xchk_nlinks_comparison_read(xnc, ino, &obs); + if (error) + goto out_scanlock; + + trace_xchk_nlinks_check_zero(mp, ino, &obs); + + /* + * If we can't grab the inode, the link count had better be zero. We + * still hold the AGI to prevent inode allocation/freeing. + */ + if (xchk_nlink_total(NULL, &obs) != 0) { + xchk_ino_set_corrupt(xnc->sc, ino); + error = -ECANCELED; + } + +out_scanlock: + mutex_unlock(&xnc->lock); +out_agi: + if (agi_bp) + xfs_trans_brelse(tp, agi_bp); + return error; +} + +/* + * Try to visit every inode in the filesystem to compare the link count.
Move + * on if we can't grab an inode, since we'll revisit unchecked nlink records in + * the second part. + */ +static int +xchk_nlinks_compare_iter( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode **ipp) +{ + int error; + + /* Keep advancing the scan cursor for as long as the iterator returns -EBUSY. */ + do { + error = xchk_iscan_iter(&xnc->compare_iscan, ipp); + } while (error == -EBUSY); + + return error; +} + +/* + * Compare the link counts we observed against the live information. Returns + * 0 without scanning if the scrub is already flagged corrupt. + */ +STATIC int +xchk_nlinks_compare( + struct xchk_nlink_ctrs *xnc) +{ + struct xchk_nlink nl; + struct xfs_scrub *sc = xnc->sc; + struct xfs_inode *ip; + xfarray_idx_t cur = XFARRAY_CURSOR_INIT; + int error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* + * Create a new empty transaction so that we can advance the iscan + * cursor without deadlocking if the inobt has a cycle and push on the + * inactivation workqueue. + */ + xchk_trans_cancel(sc); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + /* + * Use the inobt to walk all allocated inodes to compare the link + * counts. Inodes skipped by _compare_iter will be tried again in the + * next phase of the scan. + */ + xchk_iscan_start(sc, 0, 0, &xnc->compare_iscan); + while ((error = xchk_nlinks_compare_iter(xnc, &ip)) == 1) { + error = xchk_nlinks_compare_inode(xnc, ip); + xchk_iscan_mark_visited(&xnc->compare_iscan, ip); + xchk_irele(sc, ip); + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&xnc->compare_iscan); + xchk_iscan_teardown(&xnc->compare_iscan); + if (error) + return error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* + * Walk all the non-null nlink observations that weren't checked in the + * previous step.
+ */ + mutex_lock(&xnc->lock); + while ((error = xfarray_iter(xnc->nlinks, &cur, &nl)) == 1) { + xfs_ino_t ino = cur - 1; + + if (nl.flags & XCHK_NLINK_COMPARE_SCANNED) + continue; + + /* xchk_nlinks_compare_inum takes xnc->lock itself, so drop it here. */ + mutex_unlock(&xnc->lock); + + error = xchk_nlinks_compare_inum(xnc, ino); + if (error) + return error; + + if (xchk_should_terminate(xnc->sc, &error)) + return error; + + mutex_lock(&xnc->lock); + } + mutex_unlock(&xnc->lock); + + return error; +} + +/* Tear down everything associated with a nlinks check. */ +static void +xchk_nlinks_teardown_scan( + void *priv) +{ + struct xchk_nlink_ctrs *xnc = priv; + + /* Discourage any hook functions that might be running. */ + xchk_iscan_abort(&xnc->collect_iscan); + + xfs_dir_hook_del(xnc->sc->mp, &xnc->dhook); + + xfarray_destroy(xnc->nlinks); + xnc->nlinks = NULL; + + xchk_iscan_teardown(&xnc->collect_iscan); + mutex_destroy(&xnc->lock); + xnc->sc = NULL; +} + +/* + * Scan all inodes in the entire filesystem to generate link count data. If + * the scan is successful, the counts will be left alive for a repair. If any + * error occurs, we'll tear everything down. + */ +STATIC int +xchk_nlinks_setup_scan( + struct xfs_scrub *sc, + struct xchk_nlink_ctrs *xnc) +{ + struct xfs_mount *mp = sc->mp; + char *descr; + unsigned long long max_inos; + xfs_agnumber_t last_agno = mp->m_sb.sb_agcount - 1; + xfs_agino_t first_agino, last_agino; + int error; + + ASSERT(xnc->sc == NULL); + xnc->sc = sc; + + mutex_init(&xnc->lock); + + /* Retry iget every tenth of a second for up to 30 seconds. */ + xchk_iscan_start(sc, 30000, 100, &xnc->collect_iscan); + + /* + * Set up enough space to store an nlink record for the highest + * possible inode number in this system.
+ */ + xfs_agino_range(mp, last_agno, &first_agino, &last_agino); + max_inos = XFS_AGINO_TO_INO(mp, last_agno, last_agino) + 1; + descr = xchk_xfile_descr(sc, "file link counts"); + error = xfarray_create(descr, min(XFS_MAXINUMBER + 1, max_inos), + sizeof(struct xchk_nlink), &xnc->nlinks); + kfree(descr); + if (error) + goto out_teardown; + + /* + * Hook into the directory entry code so that we can capture updates to + * file link counts. The hook only triggers for inodes that were + * already scanned, and the scanner thread takes each inode's ILOCK, + * which means that any in-progress inode updates will finish before we + * can scan the inode. + */ + ASSERT(sc->flags & XCHK_FSGATES_DIRENTS); + xfs_dir_hook_setup(&xnc->dhook, xchk_nlinks_live_update); + error = xfs_dir_hook_add(mp, &xnc->dhook); + if (error) + goto out_teardown; + + /* Use deferred cleanup to pass the inode link count data to repair. */ + sc->buf_cleanup = xchk_nlinks_teardown_scan; + return 0; + +out_teardown: + xchk_nlinks_teardown_scan(xnc); + return error; +} + +/* + * Scrub the link count of all inodes on the filesystem. Returns 0 without + * running the comparison if collection left the scrub marked INCOMPLETE. + */ +int +xchk_nlinks( + struct xfs_scrub *sc) +{ + struct xchk_nlink_ctrs *xnc = sc->buf; + int error = 0; + + /* Set ourselves up to check link counts on the live filesystem. */ + error = xchk_nlinks_setup_scan(sc, xnc); + if (error) + return error; + + /* Walk all inodes, picking up link count information. */ + error = xchk_nlinks_collect(xnc); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + + /* Fail fast if we're not playing with a full dataset. */ + if (xchk_iscan_aborted(&xnc->collect_iscan)) + xchk_set_incomplete(sc); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) + return 0; + + /* Compare link counts. */ + error = xchk_nlinks_compare(xnc); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + + /* Check one last time for an incomplete dataset.
*/ + if (xchk_iscan_aborted(&xnc->collect_iscan)) + xchk_set_incomplete(sc); + + return 0; +} diff --git a/fs/xfs/scrub/nlinks.h b/fs/xfs/scrub/nlinks.h new file mode 100644 index 000000000000..a950f3daf204 --- /dev/null +++ b/fs/xfs/scrub/nlinks.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_NLINKS_H__ +#define __XFS_SCRUB_NLINKS_H__ + +/* Live link count control structure. */ +struct xchk_nlink_ctrs { + struct xfs_scrub *sc; + + /* Shadow link count data and its mutex. */ + struct xfarray *nlinks; + struct mutex lock; + + /* + * The collection step uses a separate iscan context from the compare + * step because the collection iscan coordinates live updates to the + * observation data while this scanner is running. The compare iscan + * is secondary and can be reinitialized as needed. + */ + struct xchk_iscan collect_iscan; + struct xchk_iscan compare_iscan; + + /* + * Hook into directory updates so that we can receive live updates + * from other writer threads. + */ + struct xfs_dir_hook dhook; +}; + +/* + * In-core link counts for a given inode in the filesystem. + * + * For an empty rootdir, the directory entries and the field to which they are + * accounted are as follows: + * + * Root directory: + * + * . points to self (root.child) + * .. points to self (root.parent) + * f1 points to a child file (f1.parent) + * d1 points to a child dir (d1.parent, root.child) + * + * Subdirectory d1: + * + * . points to self (d1.child) + * .. points to root dir (root.backref) + * f2 points to child file (f2.parent) + * f3 points to root.f1 (f1.parent) + * + * root.nlink == 3 (root.dot, root.dotdot, root.d1) + * d1.nlink == 2 (root.d1, d1.dot) + * f1.nlink == 2 (root.f1, d1.f3) + * f2.nlink == 1 (d1.f2) + */ +struct xchk_nlink { + /* Count of forward links from parent directories to this file.
*/ + xfs_nlink_t parents; + + /* + * Count of back links to this parent directory from child + * subdirectories. + */ + xfs_nlink_t backrefs; + + /* + * Count of forward links from this directory to child subdirectories. + * The dot entry is not recorded here by the collection code; + * xchk_nlink_total() adds it for linked directories. Should be zero + * for non-directories. + */ + xfs_nlink_t children; + + /* Record state flags */ + unsigned int flags; +}; + +/* + * This incore link count has been written at least once. We never want to + * store an xchk_nlink that looks uninitialized. + */ +#define XCHK_NLINK_WRITTEN (1U << 0) + +/* Already checked this link count record. */ +#define XCHK_NLINK_COMPARE_SCANNED (1U << 1) + +/* Already made a repair with this link count record. */ +#define XREP_NLINK_DIRTY (1U << 2) + +/* + * Compute total link count, using large enough variables to detect overflow. + * The result is parents + children, plus one for the dot entry when @ip is a + * directory with a nonzero link count. @ip may be NULL, in which case no dot + * entry is counted. + */ +static inline uint64_t +xchk_nlink_total(struct xfs_inode *ip, const struct xchk_nlink *live) +{ + uint64_t ret = live->parents; + + /* Add one link count for the dot entry of any linked directory. */ + if (ip && S_ISDIR(VFS_I(ip)->i_mode) && VFS_I(ip)->i_nlink) + ret++; + return ret + live->children; +} + +#endif /* __XFS_SCRUB_NLINKS_H__ */ diff --git a/fs/xfs/scrub/nlinks_repair.c b/fs/xfs/scrub/nlinks_repair.c new file mode 100644 index 000000000000..b87618322f55 --- /dev/null +++ b/fs/xfs/scrub/nlinks_repair.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J.
Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_util.h"
+#include "xfs_iwalk.h"
+#include "xfs_ialloc.h"
+#include "xfs_sb.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/nlinks.h"
+#include "scrub/trace.h"
+
+/*
+ * Live Inode Link Count Repair
+ * ============================
+ *
+ * Use the live inode link count information that we collected to replace the
+ * nlink values of the incore inodes.  A scrub->repair cycle should have left
+ * the live data and hooks active, so this is safe so long as we make sure the
+ * inode is locked.
+ */
+
+/*
+ * Correct the link count of the given inode.  Because we have to grab locks
+ * and resources in a certain order, it's possible that this will be a no-op.
+ *
+ * The inode to repair is sc->ip.  This function takes the IOLOCK and ILOCK
+ * of that inode through xchk_ilock() and drops every lock it took before
+ * returning, on success and on every error path.
+ *
+ * Returns 0 if the link count was updated or deliberately left alone,
+ * -ECANCELED if the collection scan was aborted, or a negative errno from
+ * transaction allocation, the shadow link count lookup, or the commit.
+ */
+STATIC int
+xrep_nlinks_repair_inode(
+	struct xchk_nlink_ctrs	*xnc)
+{
+	struct xchk_nlink	obs;
+	struct xfs_scrub	*sc = xnc->sc;
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_inode	*ip = sc->ip;
+	uint64_t		total_links;
+	uint64_t		actual_nlink;
+	bool			dirty = false;
+	int			error;
+
+	xchk_ilock(sc, XFS_IOLOCK_EXCL);
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &sc->tp);
+	if (error) {
+		/*
+		 * The caller releases the inode as soon as we return, so we
+		 * must not leave the IOLOCK held if we failed to get a
+		 * transaction.
+		 */
+		xchk_iunlock(sc, XFS_IOLOCK_EXCL);
+		return error;
+	}
+
+	xchk_ilock(sc, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(sc->tp, ip, 0);
+
+	mutex_lock(&xnc->lock);
+
+	if (xchk_iscan_aborted(&xnc->collect_iscan)) {
+		error = -ECANCELED;
+		goto out_scanlock;
+	}
+
+	error = xfarray_load_sparse(xnc->nlinks, ip->i_ino, &obs);
+	if (error)
+		goto out_scanlock;
+
+	/*
+	 * We're done accessing the shared scan data, so we can drop the lock.
+	 * We still hold @ip's ILOCK, so its link count cannot change.
+	 */
+	mutex_unlock(&xnc->lock);
+
+	total_links = xchk_nlink_total(ip, &obs);
+	actual_nlink = VFS_I(ip)->i_nlink;
+
+	/*
+	 * Non-directories cannot have directories pointing up to them.
+	 *
+	 * We previously set error to zero, but set it again because one static
+	 * checker author fears that programmers will fail to maintain this
+	 * invariant and built their tool to flag this as a security risk.  A
+	 * different tool author made their bot complain about the redundant
+	 * store.  This is a never-ending and stupid battle; both tools missed
+	 * *actual bugs* elsewhere; and I no longer care.
+	 */
+	if (!S_ISDIR(VFS_I(ip)->i_mode) && obs.children != 0) {
+		trace_xrep_nlinks_unfixable_inode(mp, ip, &obs);
+		error = 0;
+		goto out_trans;
+	}
+
+	/*
+	 * We did not find any links to this inode.  If the inode agrees, we
+	 * have nothing further to do.  If not, the inode has a nonzero link
+	 * count and we don't have anywhere to graft the child onto.  Dropping
+	 * a live inode's link count to zero can cause unexpected shutdowns in
+	 * inactivation, so leave it alone.
+	 */
+	if (total_links == 0) {
+		if (actual_nlink != 0)
+			trace_xrep_nlinks_unfixable_inode(mp, ip, &obs);
+		goto out_trans;
+	}
+
+	/* Commit the new link count if it changed. */
+	if (total_links != actual_nlink) {
+		/* A count above XFS_MAXLINK cannot be stored; leave it be. */
+		if (total_links > XFS_MAXLINK) {
+			trace_xrep_nlinks_unfixable_inode(mp, ip, &obs);
+			goto out_trans;
+		}
+
+		trace_xrep_nlinks_update_inode(mp, ip, &obs);
+
+		set_nlink(VFS_I(ip), total_links);
+		dirty = true;
+	}
+
+	/* Nothing changed, so cancel the transaction instead of committing. */
+	if (!dirty) {
+		error = 0;
+		goto out_trans;
+	}
+
+	xfs_trans_log_inode(sc->tp, ip, XFS_ILOG_CORE);
+
+	error = xrep_trans_commit(sc);
+	xchk_iunlock(sc, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	return error;
+
+out_scanlock:
+	mutex_unlock(&xnc->lock);
+out_trans:
+	xchk_trans_cancel(sc);
+	xchk_iunlock(sc, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	return error;
+}
+
+/*
+ * Try to visit every inode in the filesystem for repairs.
Move on if we can't + * grab an inode, since we're still making forward progress. + */ +static int +xrep_nlinks_iter( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode **ipp) +{ + int error; + + do { + error = xchk_iscan_iter(&xnc->compare_iscan, ipp); + } while (error == -EBUSY); + + return error; +} + +/* Commit the new inode link counters. */ +int +xrep_nlinks( + struct xfs_scrub *sc) +{ + struct xchk_nlink_ctrs *xnc = sc->buf; + int error; + + /* + * We need ftype for an accurate count of the number of child + * subdirectory links. Child subdirectories with a back link (dotdot + * entry) but no forward link are unfixable, so we cannot repair the + * link count of the parent directory based on the back link count + * alone. Filesystems without ftype support are rare (old V4) so we + * just skip out here. + */ + if (!xfs_has_ftype(sc->mp)) + return -EOPNOTSUPP; + + /* + * Use the inobt to walk all allocated inodes to compare and fix the + * link counts. Retry iget every tenth of a second for up to 30 + * seconds -- even if repair misses a few inodes, we still try to fix + * as many of them as we can. + */ + xchk_iscan_start(sc, 30000, 100, &xnc->compare_iscan); + ASSERT(sc->ip == NULL); + + while ((error = xrep_nlinks_iter(xnc, &sc->ip)) == 1) { + /* + * Cancel the scrub transaction so that we can create repair + * transactions with the correct reservations. + */ + xchk_trans_cancel(sc); + + error = xrep_nlinks_repair_inode(xnc); + xchk_iscan_mark_visited(&xnc->compare_iscan, sc->ip); + xchk_irele(sc, sc->ip); + sc->ip = NULL; + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + + /* + * Create a new empty transaction so that we can advance the + * iscan cursor without deadlocking if the inobt has a cycle. + * We can only push the inactivation workqueues with an empty + * transaction.
+ */ + error = xchk_trans_alloc_empty(sc); + if (error) + break; + } + xchk_iscan_iter_finish(&xnc->compare_iscan); + xchk_iscan_teardown(&xnc->compare_iscan); + + return error; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index fdfa06699921..8edac0150e96 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -116,6 +116,7 @@ int xrep_inode(struct xfs_scrub *sc); int xrep_bmap_data(struct xfs_scrub *sc); int xrep_bmap_attr(struct xfs_scrub *sc); int xrep_bmap_cow(struct xfs_scrub *sc); +int xrep_nlinks(struct xfs_scrub *sc); #ifdef CONFIG_XFS_RT int xrep_rtbitmap(struct xfs_scrub *sc); @@ -196,6 +197,7 @@ xrep_setup_nothing( #define xrep_rtbitmap xrep_notsupported #define xrep_quota xrep_notsupported #define xrep_quotacheck xrep_notsupported +#define xrep_nlinks xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 9112c0985c62..c0b99184bb3e 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -160,6 +160,9 @@ xchk_fsgates_disable( if (sc->flags & XCHK_FSGATES_QUOTA) xfs_dqtrx_hook_disable(); + if (sc->flags & XCHK_FSGATES_DIRENTS) + xfs_dir_hook_disable(); + sc->flags &= ~XCHK_FSGATES_ALL; } @@ -369,6 +372,12 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .scrub = xchk_quotacheck, .repair = xrep_quotacheck, }, + [XFS_SCRUB_TYPE_NLINKS] = { /* inode link counts */ + .type = ST_FS, + .setup = xchk_setup_nlinks, + .scrub = xchk_nlinks, + .repair = xrep_nlinks, + }, }; static int diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 5cd4550155f2..f99a3c21d02e 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -122,6 +122,7 @@ struct xfs_scrub { #define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */ #define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */ #define XCHK_FSGATES_QUOTA (1U << 4) /* quota live update enabled */ +#define XCHK_FSGATES_DIRENTS (1U << 5) /* directory live update enabled */ #define
XREP_RESET_PERAG_RESV (1U << 30) /* must reset AG space reservation */ #define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */ @@ -132,7 +133,8 @@ struct xfs_scrub { * must be enabled during scrub setup and can only be torn down afterwards. */ #define XCHK_FSGATES_ALL (XCHK_FSGATES_DRAIN | \ - XCHK_FSGATES_QUOTA) + XCHK_FSGATES_QUOTA | \ + XCHK_FSGATES_DIRENTS) /* Metadata scrubbers */ int xchk_tester(struct xfs_scrub *sc); @@ -183,6 +185,7 @@ xchk_quotacheck(struct xfs_scrub *sc) } #endif int xchk_fscounters(struct xfs_scrub *sc); +int xchk_nlinks(struct xfs_scrub *sc); /* cross-referencing helpers */ void xchk_xref_is_used_space(struct xfs_scrub *sc, xfs_agblock_t agbno, diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c index d716a432227b..b4ef1ebe28ab 100644 --- a/fs/xfs/scrub/stats.c +++ b/fs/xfs/scrub/stats.c @@ -78,6 +78,7 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_PQUOTA] = "prjquota", [XFS_SCRUB_TYPE_FSCOUNTERS] = "fscounters", [XFS_SCRUB_TYPE_QUOTACHECK] = "quotacheck", + [XFS_SCRUB_TYPE_NLINKS] = "nlinks", }; /* Format the scrub stats into a text buffer, similar to pcp style. */ diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 5ed75cc33b92..2d5a330afe10 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -17,11 +17,13 @@ #include "xfs_quota.h" #include "xfs_quota_defs.h" #include "xfs_da_format.h" +#include "xfs_dir2.h" #include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" #include "scrub/quota.h" #include "scrub/iscan.h" +#include "scrub/nlinks.h" /* Figure out which block the btree cursor was pointing to. 
*/ static inline xfs_fsblock_t diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index fedcebf90a42..c9b6b0e0bf11 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -23,6 +23,7 @@ struct xfarray; struct xfarray_sortinfo; struct xchk_dqiter; struct xchk_iscan; +struct xchk_nlink; /* * ftrace's __print_symbolic requires that all enum values be wrapped in the @@ -67,6 +68,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_GQUOTA); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_QUOTACHECK); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_NLINKS); #define XFS_SCRUB_TYPE_STRINGS \ { XFS_SCRUB_TYPE_PROBE, "probe" }, \ @@ -94,7 +96,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_QUOTACHECK); { XFS_SCRUB_TYPE_GQUOTA, "grpquota" }, \ { XFS_SCRUB_TYPE_PQUOTA, "prjquota" }, \ { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" }, \ - { XFS_SCRUB_TYPE_QUOTACHECK, "quotacheck" } + { XFS_SCRUB_TYPE_QUOTACHECK, "quotacheck" }, \ + { XFS_SCRUB_TYPE_NLINKS, "nlinks" } #define XFS_SCRUB_FLAG_STRINGS \ { XFS_SCRUB_IFLAG_REPAIR, "repair" }, \ @@ -113,6 +116,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_QUOTACHECK); { XCHK_FSGATES_DRAIN, "fsgates_drain" }, \ { XCHK_NEED_DRAIN, "need_drain" }, \ { XCHK_FSGATES_QUOTA, "fsgates_quota" }, \ + { XCHK_FSGATES_DIRENTS, "fsgates_dirents" }, \ { XREP_RESET_PERAG_RESV, "reset_perag_resv" }, \ { XREP_ALREADY_FIXED, "already_fixed" } @@ -1318,6 +1322,180 @@ TRACE_EVENT(xchk_iscan_iget_retry_wait, __entry->retry_delay) ); +TRACE_EVENT(xchk_nlinks_collect_dirent, + TP_PROTO(struct xfs_mount *mp, struct xfs_inode *dp, + xfs_ino_t ino, const struct xfs_name *name), + TP_ARGS(mp, dp, ino, name), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir) + __field(xfs_ino_t, ino) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->dir = dp->i_ino; + __entry->ino = ino; + __entry->namelen = name->len; + 
memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d dir 0x%llx -> ino 0x%llx name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir, + __entry->ino, + __entry->namelen, + __get_str(name)) +); + +TRACE_EVENT(xchk_nlinks_collect_metafile, + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino), + TP_ARGS(mp, ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ino; + ), + TP_printk("dev %d:%d ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino) +); + +TRACE_EVENT(xchk_nlinks_live_update, + TP_PROTO(struct xfs_mount *mp, const struct xfs_inode *dp, + int action, xfs_ino_t ino, int delta, + const char *name, unsigned int namelen), + TP_ARGS(mp, dp, action, ino, delta, name, namelen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir) + __field(int, action) + __field(xfs_ino_t, ino) + __field(int, delta) + __field(unsigned int, namelen) + __dynamic_array(char, name, namelen) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->dir = dp ? 
dp->i_ino : NULLFSINO; + __entry->action = action; + __entry->ino = ino; + __entry->delta = delta; + __entry->namelen = namelen; + memcpy(__get_str(name), name, namelen); + ), + TP_printk("dev %d:%d dir 0x%llx ino 0x%llx nlink_delta %d name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir, + __entry->ino, + __entry->delta, + __entry->namelen, + __get_str(name)) +); + +TRACE_EVENT(xchk_nlinks_check_zero, + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, + const struct xchk_nlink *live), + TP_ARGS(mp, ino, live), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_nlink_t, parents) + __field(xfs_nlink_t, backrefs) + __field(xfs_nlink_t, children) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ino; + __entry->parents = live->parents; + __entry->backrefs = live->backrefs; + __entry->children = live->children; + ), + TP_printk("dev %d:%d ino 0x%llx parents %u backrefs %u children %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parents, + __entry->backrefs, + __entry->children) +); + +TRACE_EVENT(xchk_nlinks_update_incore, + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, + const struct xchk_nlink *live, int parents_delta, + int backrefs_delta, int children_delta), + TP_ARGS(mp, ino, live, parents_delta, backrefs_delta, children_delta), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_nlink_t, parents) + __field(xfs_nlink_t, backrefs) + __field(xfs_nlink_t, children) + __field(int, parents_delta) + __field(int, backrefs_delta) + __field(int, children_delta) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ino; + __entry->parents = live->parents; + __entry->backrefs = live->backrefs; + __entry->children = live->children; + __entry->parents_delta = parents_delta; + __entry->backrefs_delta = backrefs_delta; + __entry->children_delta = children_delta; + ), + TP_printk("dev %d:%d ino 0x%llx parents %d:%u backrefs 
%d:%u children %d:%u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parents_delta, + __entry->parents, + __entry->backrefs_delta, + __entry->backrefs, + __entry->children_delta, + __entry->children) +); + +DECLARE_EVENT_CLASS(xchk_nlinks_diff_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_inode *ip, + const struct xchk_nlink *live), + TP_ARGS(mp, ip, live), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(uint8_t, ftype) + __field(xfs_nlink_t, nlink) + __field(xfs_nlink_t, parents) + __field(xfs_nlink_t, backrefs) + __field(xfs_nlink_t, children) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->ftype = xfs_mode_to_ftype(VFS_I(ip)->i_mode); + __entry->nlink = VFS_I(ip)->i_nlink; + __entry->parents = live->parents; + __entry->backrefs = live->backrefs; + __entry->children = live->children; + ), + TP_printk("dev %d:%d ino 0x%llx ftype %s nlink %u parents %u backrefs %u children %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR), + __entry->nlink, + __entry->parents, + __entry->backrefs, + __entry->children) +); +#define DEFINE_SCRUB_NLINKS_DIFF_EVENT(name) \ +DEFINE_EVENT(xchk_nlinks_diff_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_inode *ip, \ + const struct xchk_nlink *live), \ + TP_ARGS(mp, ip, live)) +DEFINE_SCRUB_NLINKS_DIFF_EVENT(xchk_nlinks_compare_inode); + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) @@ -2007,6 +2185,9 @@ DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item_fill_bmap_hole); DEFINE_XREP_DQUOT_EVENT(xrep_quotacheck_dquot); #endif /* CONFIG_XFS_QUOTA */ +DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_update_inode); +DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_unfixable_inode); + #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ #endif /* _TRACE_XFS_SCRUB_TRACE_H */ diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index ef07af9f753d..111c27a6b107 
100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -281,6 +281,7 @@ static const struct ioctl_sick_map fs_map[] = { { XFS_SICK_FS_GQUOTA, XFS_FSOP_GEOM_SICK_GQUOTA }, { XFS_SICK_FS_PQUOTA, XFS_FSOP_GEOM_SICK_PQUOTA }, { XFS_SICK_FS_QUOTACHECK, XFS_FSOP_GEOM_SICK_QUOTACHECK }, + { XFS_SICK_FS_NLINKS, XFS_FSOP_GEOM_SICK_NLINKS }, { 0, 0 }, }; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d6635d219527..e8845287debd 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -925,6 +925,81 @@ xfs_bumplink( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } +#ifdef CONFIG_XFS_LIVE_HOOKS +/* + * Use a static key here to reduce the overhead of directory live update hooks. + * If the compiler supports jump labels, the static branch will be replaced by + * a nop sled when there are no hook users. Online fsck is currently the only + * caller, so this is a reasonable tradeoff. + * + * Note: Patching the kernel code requires taking the cpu hotplug lock. Other + * parts of the kernel allocate memory with that lock held, which means that + * XFS callers cannot hold any locks that might be used by memory reclaim or + * writeback when calling the static_branch_{inc,dec} functions. + */ +DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dir_hooks_switch); + +void +xfs_dir_hook_disable(void) +{ + xfs_hooks_switch_off(&xfs_dir_hooks_switch); +} + +void +xfs_dir_hook_enable(void) +{ + xfs_hooks_switch_on(&xfs_dir_hooks_switch); +} + +/* Call hooks for a directory update relating to a child dirent update. */ +inline void +xfs_dir_update_hook( + struct xfs_inode *dp, + struct xfs_inode *ip, + int delta, + const struct xfs_name *name) +{ + if (xfs_hooks_switched_on(&xfs_dir_hooks_switch)) { + struct xfs_dir_update_params p = { + .dp = dp, + .ip = ip, + .delta = delta, + .name = name, + }; + struct xfs_mount *mp = ip->i_mount; + + xfs_hooks_call(&mp->m_dir_update_hooks, 0, &p); + } +} + +/* Call the specified function during a directory update. 
*/ +int +xfs_dir_hook_add( + struct xfs_mount *mp, + struct xfs_dir_hook *hook) +{ + return xfs_hooks_add(&mp->m_dir_update_hooks, &hook->dirent_hook); +} + +/* Stop calling the specified function during a directory update. */ +void +xfs_dir_hook_del( + struct xfs_mount *mp, + struct xfs_dir_hook *hook) +{ + xfs_hooks_del(&mp->m_dir_update_hooks, &hook->dirent_hook); +} + +/* Configure directory update hook functions. */ +void +xfs_dir_hook_setup( + struct xfs_dir_hook *hook, + notifier_fn_t mod_fn) +{ + xfs_hook_setup(&hook->dirent_hook, mod_fn); +} +#endif /* CONFIG_XFS_LIVE_HOOKS */ + int xfs_create( struct mnt_idmap *idmap, @@ -1036,6 +1111,12 @@ xfs_create( } /* + * Create ip with a reference from dp, and add '.' and '..' references + * if it's a directory. + */ + xfs_dir_update_hook(dp, ip, 1, name); + + /* * If this is a synchronous mount, make sure that the * create transaction goes to disk before returning to * the user. @@ -1249,6 +1330,7 @@ xfs_link( xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); xfs_bumplink(tp, sip); + xfs_dir_update_hook(tdp, sip, 1, target_name); /* * If this is a synchronous mount, make sure that the @@ -2563,6 +2645,12 @@ xfs_remove( } /* + * Drop the link from dp to ip, and if ip was a directory, remove the + * '.' and '..' references since we freed the directory. + */ + xfs_dir_update_hook(dp, ip, -1, name); + + /* * If this is a synchronous mount, make sure that the * remove transaction goes to disk before returning to * the user. @@ -2752,6 +2840,20 @@ xfs_cross_rename( } xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); + + /* + * Inform our hook clients that we've finished an exchange operation as + * follows: removed the source and target files from their directories; + * added the target to the source directory; and added the source to + * the target directory. 
All inodes are locked, so it's ok to model a + * rename this way so long as we say we deleted entries before we add + * new ones. + */ + xfs_dir_update_hook(dp1, ip1, -1, name1); + xfs_dir_update_hook(dp2, ip2, -1, name2); + xfs_dir_update_hook(dp1, ip2, 1, name1); + xfs_dir_update_hook(dp2, ip1, 1, name2); + return xfs_finish_rename(tp); out_trans_abort: @@ -3135,6 +3237,21 @@ retry: if (new_parent) xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); + /* + * Inform our hook clients that we've finished a rename operation as + * follows: removed the source and target files from their directories; + * that we've added the source to the target directory; and finally + * that we've added the whiteout, if there was one. All inodes are + * locked, so it's ok to model a rename this way so long as we say we + * deleted entries before we add new ones. + */ + if (target_ip) + xfs_dir_update_hook(target_dp, target_ip, -1, target_name); + xfs_dir_update_hook(src_dp, src_ip, -1, src_name); + xfs_dir_update_hook(target_dp, src_ip, 1, target_name); + if (wip) + xfs_dir_update_hook(src_dp, wip, 1, src_name); + error = xfs_finish_rename(tp); if (wip) xfs_irele(wip); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 7bbdc7009e7d..ab46ffb3ac19 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -171,6 +171,12 @@ static inline struct inode *VFS_I(struct xfs_inode *ip) return &ip->i_vnode; } +/* convert from const xfs inode to const vfs inode */ +static inline const struct inode *VFS_IC(const struct xfs_inode *ip) +{ + return &ip->i_vnode; +} + /* * For regular files we only update the on-disk filesize when actually * writing data back to disk. 
Until then only the copy in the VFS inode @@ -626,4 +632,29 @@ bool xfs_ifork_zapped(const struct xfs_inode *ip, int whichfork); void xfs_inode_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, xfs_filblks_t *dblocks, xfs_filblks_t *rblocks); +struct xfs_dir_update_params { + const struct xfs_inode *dp; + const struct xfs_inode *ip; + const struct xfs_name *name; + int delta; +}; + +#ifdef CONFIG_XFS_LIVE_HOOKS +void xfs_dir_update_hook(struct xfs_inode *dp, struct xfs_inode *ip, + int delta, const struct xfs_name *name); + +struct xfs_dir_hook { + struct xfs_hook dirent_hook; +}; + +void xfs_dir_hook_disable(void); +void xfs_dir_hook_enable(void); + +int xfs_dir_hook_add(struct xfs_mount *mp, struct xfs_dir_hook *hook); +void xfs_dir_hook_del(struct xfs_mount *mp, struct xfs_dir_hook *hook); +void xfs_dir_hook_setup(struct xfs_dir_hook *hook, notifier_fn_t mod_fn); +#else +# define xfs_dir_update_hook(dp, ip, delta, name) ((void)0) +#endif /* CONFIG_XFS_LIVE_HOOKS */ + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 503fe3c7edbf..e86dfe67894f 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -252,6 +252,9 @@ typedef struct xfs_mount { /* cpus that have inodes queued for inactivation */ struct cpumask m_inodegc_cpumask; + + /* Hook to feed dirent updates to an active online repair. 
*/ + struct xfs_hooks m_dir_update_hooks; } xfs_mount_t; #define M_IGEO(mp) (&(mp)->m_ino_geo) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index b31652fa7004..74e87ed5eee1 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -2011,6 +2011,8 @@ static int xfs_init_fs_context( mp->m_logbsize = -1; mp->m_allocsize_log = 16; /* 64k */ + xfs_hooks_init(&mp->m_dir_update_hooks); + fc->s_fs_info = mp; fc->ops = &xfs_context_ops; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index c2dc8c501bdc..e73692fbe179 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -322,6 +322,7 @@ xfs_symlink( goto out_trans_cancel; xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + xfs_dir_update_hook(dp, ip, 1, link_name); /* * If this is a synchronous mount, make sure that the |