// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2022 Fujitsu.  All Rights Reserved.
 */

#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_bit.h"
#include "xfs_btree.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_rtalloc.h"
#include "xfs_trans.h"
#include "xfs_ag.h"

#include <linux/mm.h>
#include <linux/dax.h>
#include <linux/fs.h>

struct xfs_failure_info {
	xfs_agblock_t		startblock;
	xfs_extlen_t		blockcount;
	int			mf_flags;
	bool			want_shutdown;
};

/* Page offset in the file mapping of the first page hit by the failure. */
static pgoff_t
xfs_failure_pgoff(
	struct xfs_mount		*mp,
	const struct xfs_rmap_irec	*rec,
	const struct xfs_failure_info	*notify)
{
	loff_t				pos = XFS_FSB_TO_B(mp, rec->rm_offset);

	if (notify->startblock > rec->rm_startblock)
		pos += XFS_FSB_TO_B(mp,
				notify->startblock - rec->rm_startblock);
	return pos >> PAGE_SHIFT;
}

/* Number of pages in the intersection of the failure range and this record. */
static unsigned long
xfs_failure_pgcnt(
	struct xfs_mount		*mp,
	const struct xfs_rmap_irec	*rec,
	const struct xfs_failure_info	*notify)
{
	xfs_agblock_t			end_rec;
	xfs_agblock_t			end_notify;
	xfs_agblock_t			start_cross;
	xfs_agblock_t			end_cross;

	start_cross = max(rec->rm_startblock, notify->startblock);

	end_rec = rec->rm_startblock + rec->rm_blockcount;
	end_notify = notify->startblock + notify->blockcount;
	end_cross = min(end_rec, end_notify);

	return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT;
}

static int
xfs_dax_failure_fn(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*data)
{
	struct xfs_mount		*mp = cur->bc_mp;
	struct xfs_inode		*ip;
	struct xfs_failure_info		*notify = data;
	struct address_space		*mapping;
	pgoff_t				pgoff;
	unsigned long			pgcnt;
	int				error = 0;

	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
	    (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
		/* Continue the query because this isn't a failure. */
		if (notify->mf_flags & MF_MEM_PRE_REMOVE)
			return 0;
		notify->want_shutdown = true;
		return 0;
	}

	/* Get files that are incore; filter out others that are not in use. */
	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
			0, &ip);
	/* Continue the rmap query if the inode isn't incore. */
	if (error == -ENODATA)
		return 0;
	if (error) {
		notify->want_shutdown = true;
		return 0;
	}

	mapping = VFS_I(ip)->i_mapping;
	pgoff = xfs_failure_pgoff(mp, rec, notify);
	pgcnt = xfs_failure_pgcnt(mp, rec, notify);

	/* Continue the rmap query if the inode isn't a dax file. */
	if (dax_mapping(mapping))
		error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
				notify->mf_flags);

	/* Invalidate the cache in dax pages. */
	if (notify->mf_flags & MF_MEM_PRE_REMOVE)
		invalidate_inode_pages2_range(mapping, pgoff,
				pgoff + pgcnt - 1);

	xfs_irele(ip);
	return error;
}

static int
xfs_dax_notify_failure_freeze(
	struct xfs_mount	*mp)
{
	struct super_block	*sb = mp->m_super;
	int			error;

	error = freeze_super(sb, FREEZE_HOLDER_KERNEL);
	if (error)
		xfs_emerg(mp, "already frozen by kernel, err=%d", error);

	return error;
}
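/*
 * Undo the freezes taken for a pre-remove notification: drop the kernel
 * freeze if xfs_dax_notify_failure_freeze() took it, and drop any
 * userspace-held freeze too, since the backing device is going away
 * regardless.
 */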
static void
xfs_dax_notify_failure_thaw(
	struct xfs_mount	*mp,
	bool			kernel_frozen)
{
	struct super_block	*sb = mp->m_super;
	int			error;

	if (kernel_frozen) {
		error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
		if (error)
			xfs_emerg(mp,
				"still frozen after notify failure, err=%d",
				error);
	}

	/*
	 * Also thaw the userspace freeze anyway, because the device is
	 * about to be removed immediately.
	 */
	thaw_super(sb, FREEZE_HOLDER_USERSPACE);
}

static int
xfs_dax_notify_ddev_failure(
	struct xfs_mount	*mp,
	xfs_daddr_t		daddr,
	xfs_daddr_t		bblen,
	int			mf_flags)
{
	struct xfs_failure_info	notify = { .mf_flags = mf_flags };
	struct xfs_trans	*tp = NULL;
	struct xfs_btree_cur	*cur = NULL;
	struct xfs_buf		*agf_bp = NULL;
	int			error = 0;
	bool			kernel_frozen = false;
	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, daddr);
	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(mp, fsbno);
	xfs_fsblock_t		end_fsbno = XFS_DADDR_TO_FSB(mp,
							daddr + bblen - 1);
	xfs_agnumber_t		end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno);

	if (mf_flags & MF_MEM_PRE_REMOVE) {
		xfs_info(mp, "Device is about to be removed!");
		/*
		 * Freeze the fs to prevent new mappings from being created.
		 * - Keep going if others already hold the kernel freeze.
		 * - Keep going on other errors too, because this device is
		 *   starting to fail.
		 * - If the kernel freeze is taken successfully here, thaw it
		 *   here as well at the end.
		 */
		kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0;
	}

	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		goto out;

	for (; agno <= end_agno; agno++) {
		struct xfs_rmap_irec	ri_low = { };
		struct xfs_rmap_irec	ri_high;
		struct xfs_agf		*agf;
		struct xfs_perag	*pag;
		xfs_agblock_t		range_agend;

		pag = xfs_perag_get(mp, agno);
		error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
		if (error) {
			xfs_perag_put(pag);
			break;
		}

		cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);

		/*
		 * Set the rmap range from ri_low to ri_high, which represents
		 * a [start, end] where we are looking for the files or
		 * metadata.
		 */
		memset(&ri_high, 0xFF, sizeof(ri_high));
		ri_low.rm_startblock = XFS_FSB_TO_AGBNO(mp, fsbno);
		if (agno == end_agno)
			ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp,
					end_fsbno);

		agf = agf_bp->b_addr;
		range_agend = min(be32_to_cpu(agf->agf_length) - 1,
				ri_high.rm_startblock);
		notify.startblock = ri_low.rm_startblock;
		notify.blockcount = range_agend + 1 - ri_low.rm_startblock;

		error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
				xfs_dax_failure_fn, &notify);
		xfs_btree_del_cursor(cur, error);
		xfs_trans_brelse(tp, agf_bp);
		xfs_perag_put(pag);
		if (error)
			break;

		fsbno = XFS_AGB_TO_FSB(mp, agno + 1, 0);
	}

	xfs_trans_cancel(tp);

	/*
	 * In the pre-remove case, shut down the fs with a force umount, which
	 * won't fail, so errors can be ignored.  Otherwise, shut down the
	 * filesystem with the CORRUPT flag if an error occurred or
	 * notify.want_shutdown was set during the rmap query.
	 */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
	else if (error || notify.want_shutdown) {
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
		if (!error)
			error = -EFSCORRUPTED;
	}

out:
	/* Thaw the fs if it has been frozen before. */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		xfs_dax_notify_failure_thaw(mp, kernel_frozen);

	return error;
}
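/*
 * dax_holder_operations ->notify_failure() handler.  Work out which backing
 * device the failed byte range sits on: a realtime device is unsupported, a
 * failing external log device shuts down the fs (unless the device is merely
 * being removed), and a data device range is clamped to the filesystem area
 * and handed to xfs_dax_notify_ddev_failure() for the rmap walk.
 */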
static int
xfs_dax_notify_failure(
	struct dax_device	*dax_dev,
	u64			offset,
	u64			len,
	int			mf_flags)
{
	struct xfs_mount	*mp = dax_holder(dax_dev);
	u64			ddev_start;
	u64			ddev_end;

	if (!(mp->m_super->s_flags & SB_BORN)) {
		xfs_warn(mp, "filesystem is not ready for notify_failure()!");
		return -EIO;
	}

	if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) {
		xfs_debug(mp,
			"notify_failure() not supported on realtime device!");
		return -EOPNOTSUPP;
	}

	if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
	    mp->m_logdev_targp != mp->m_ddev_targp) {
		/*
		 * In the pre-remove case the failure notification is
		 * attempting to trigger a force unmount.  The expectation is
		 * that the device is still present, but its removal is in
		 * progress and can not be cancelled; proceed with accessing
		 * the log device.
		 */
		if (mf_flags & MF_MEM_PRE_REMOVE)
			return 0;
		xfs_err(mp, "ondisk log corrupt, shutting down fs!");
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
		return -EFSCORRUPTED;
	}

	if (!xfs_has_rmapbt(mp)) {
		xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
		return -EOPNOTSUPP;
	}

	ddev_start = mp->m_ddev_targp->bt_dax_part_off;
	ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;

	/* Notify failure on the whole device. */
	if (offset == 0 && len == U64_MAX) {
		offset = ddev_start;
		len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev);
	}

	/* Ignore a range that lies entirely outside the filesystem area. */
	if (offset + len - 1 < ddev_start)
		return -ENXIO;
	if (offset > ddev_end)
		return -ENXIO;

	/* Clamp the range to the filesystem when it crosses a boundary. */
	if (offset > ddev_start)
		offset -= ddev_start;
	else {
		len -= ddev_start - offset;
		offset = 0;
	}
	if (offset + len - 1 > ddev_end)
		len = ddev_end - offset + 1;

	return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
			mf_flags);
}

const struct dax_holder_operations xfs_dax_holder_operations = {
	.notify_failure		= xfs_dax_notify_failure,
};
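/*
 * These ops are registered as the dax_device holder when XFS opens its
 * DAX-capable block devices (via fs_dax_get_by_bdev(); see the buftarg setup
 * in xfs_buf.c), which is what routes pmem failure notifications to
 * xfs_dax_notify_failure() above.
 */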