diff options
Diffstat (limited to 'fs/xfs/xfs_fsops.c')
-rw-r--r-- | fs/xfs/xfs_fsops.c | 372 |
1 files changed, 201 insertions, 171 deletions
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 3e61d0cc23f8..13851c0d640b 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -17,105 +17,149 @@ #include "xfs_fsops.h" #include "xfs_trans_space.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #include "xfs_ag.h" #include "xfs_ag_resv.h" +#include "xfs_trace.h" + +/* + * Write new AG headers to disk. Non-transactional, but need to be + * written and completed prior to the growfs transaction being logged. + * To do this, we use a delayed write buffer list and wait for + * submission and IO completion of the list as a whole. This allows the + * IO subsystem to merge all the AG headers in a single AG into a single + * IO and hide most of the latency of the IO from us. + * + * This also means that if we get an error whilst building the buffer + * list to write, we can cancel the entire list without having written + * anything. + */ +static int +xfs_resizefs_init_new_ags( + struct xfs_trans *tp, + struct aghdr_init_data *id, + xfs_agnumber_t oagcount, + xfs_agnumber_t nagcount, + xfs_rfsblock_t delta, + struct xfs_perag *last_pag, + bool *lastag_extended) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_rfsblock_t nb = mp->m_sb.sb_dblocks + delta; + int error; + + *lastag_extended = false; + + INIT_LIST_HEAD(&id->buffer_list); + for (id->agno = nagcount - 1; + id->agno >= oagcount; + id->agno--, delta -= id->agsize) { + + if (id->agno == nagcount - 1) + id->agsize = nb - (id->agno * + (xfs_rfsblock_t)mp->m_sb.sb_agblocks); + else + id->agsize = mp->m_sb.sb_agblocks; + + error = xfs_ag_init_headers(mp, id); + if (error) { + xfs_buf_delwri_cancel(&id->buffer_list); + return error; + } + } + + error = xfs_buf_delwri_submit(&id->buffer_list); + if (error) + return error; + + if (delta) { + *lastag_extended = true; + error = xfs_ag_extend_space(last_pag, tp, delta); + } + return error; +} /* * growfs operations */ static int xfs_growfs_data_private( - xfs_mount_t *mp, /* mount point for filesystem */ - xfs_growfs_data_t *in) /* growfs data input struct */ + struct xfs_mount *mp, /* mount point for filesystem */ + struct xfs_growfs_data *in) /* growfs data input struct */ { - xfs_buf_t *bp; + struct xfs_buf *bp; int error; xfs_agnumber_t nagcount; xfs_agnumber_t nagimax = 0; - xfs_rfsblock_t nb, nb_mod; - xfs_rfsblock_t new; + xfs_rfsblock_t nb, nb_div, nb_mod; + int64_t delta; + bool lastag_extended; xfs_agnumber_t oagcount; - xfs_trans_t *tp; + struct xfs_trans *tp; struct aghdr_init_data id = {}; + struct xfs_perag *last_pag; nb = in->newblocks; - if (nb < mp->m_sb.sb_dblocks) - return -EINVAL; - if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) + error = xfs_sb_validate_fsb_count(&mp->m_sb, nb); + if (error) return error; - error = xfs_buf_read_uncached(mp->m_ddev_targp, + + if (nb > mp->m_sb.sb_dblocks) { + error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); - if (error) - return error; - xfs_buf_relse(bp); + if (error) + return error; + xfs_buf_relse(bp); + } - new = nb; /* use new as a temporary here */ - nb_mod = do_div(new, mp->m_sb.sb_agblocks); - nagcount = new + (nb_mod != 0); + nb_div = nb; + nb_mod = do_div(nb_div, mp->m_sb.sb_agblocks); + nagcount = nb_div + (nb_mod != 0); if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) { nagcount--; nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks; - if (nb < mp->m_sb.sb_dblocks) - return -EINVAL; } - new = nb - mp->m_sb.sb_dblocks; - oagcount = mp->m_sb.sb_agcount; + delta = nb - mp->m_sb.sb_dblocks; + /* + * Reject filesystems with a single AG because they are not + * supported, and reject a shrink operation that would cause a + * filesystem to become unsupported. + */ + if (delta < 0 && nagcount < 2) + return -EINVAL; + oagcount = mp->m_sb.sb_agcount; /* allocate the new per-ag structures */ if (nagcount > oagcount) { - error = xfs_initialize_perag(mp, nagcount, &nagimax); + error = xfs_initialize_perag(mp, nagcount, nb, &nagimax); if (error) return error; + } else if (nagcount < oagcount) { + /* TODO: shrinking the entire AGs hasn't yet completed */ + return -EINVAL; } error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, - XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); + (delta > 0 ? XFS_GROWFS_SPACE_RES(mp) : -delta), 0, + XFS_TRANS_RESERVE, &tp); if (error) return error; - /* - * Write new AG headers to disk. Non-transactional, but need to be - * written and completed prior to the growfs transaction being logged. - * To do this, we use a delayed write buffer list and wait for - * submission and IO completion of the list as a whole. This allows the - * IO subsystem to merge all the AG headers in a single AG into a single - * IO and hide most of the latency of the IO from us. - * - * This also means that if we get an error whilst building the buffer - * list to write, we can cancel the entire list without having written - * anything. - */ - INIT_LIST_HEAD(&id.buffer_list); - for (id.agno = nagcount - 1; - id.agno >= oagcount; - id.agno--, new -= id.agsize) { - - if (id.agno == nagcount - 1) - id.agsize = nb - - (id.agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks); - else - id.agsize = mp->m_sb.sb_agblocks; + last_pag = xfs_perag_get(mp, oagcount - 1); + if (delta > 0) { + error = xfs_resizefs_init_new_ags(tp, &id, oagcount, nagcount, + delta, last_pag, &lastag_extended); + } else { + xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SHRINK, + "EXPERIMENTAL online shrink feature in use. Use at your own risk!"); - error = xfs_ag_init_headers(mp, &id); - if (error) { - xfs_buf_delwri_cancel(&id.buffer_list); - goto out_trans_cancel; - } + error = xfs_ag_shrink_space(last_pag, &tp, -delta); } - error = xfs_buf_delwri_submit(&id.buffer_list); + xfs_perag_put(last_pag); if (error) goto out_trans_cancel; - xfs_trans_agblocks_delta(tp, id.nfree); - - /* If there are new blocks in the old last AG, extend it. */ - if (new) { - error = xfs_ag_extend_space(mp, tp, &id, new); - if (error) - goto out_trans_cancel; - } - /* * Update changed superblock fields transactionally. These are not * seen by the rest of the world until the transaction commit applies @@ -123,11 +167,19 @@ xfs_growfs_data_private( */ if (nagcount > oagcount) xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount); - if (nb > mp->m_sb.sb_dblocks) - xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS, - nb - mp->m_sb.sb_dblocks); + if (delta) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS, delta); if (id.nfree) xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, id.nfree); + + /* + * Sync sb counters now to reflect the updated values. This is + * particularly important for shrink because the write verifier + * will fail if sb_fdblocks is ever larger than sb_dblocks. + */ + if (xfs_has_lazysbcount(mp)) + xfs_log_sb(tp); + xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); if (error) @@ -139,28 +191,29 @@ xfs_growfs_data_private( xfs_set_low_space_thresholds(mp); mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); - /* - * If we expanded the last AG, free the per-AG reservation - * so we can reinitialize it with the new size. - */ - if (new) { - struct xfs_perag *pag; - - pag = xfs_perag_get(mp, id.agno); - error = xfs_ag_resv_free(pag); - xfs_perag_put(pag); - if (error) - return error; + if (delta > 0) { + /* + * If we expanded the last AG, free the per-AG reservation + * so we can reinitialize it with the new size. + */ + if (lastag_extended) { + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, id.agno); + error = xfs_ag_resv_free(pag); + xfs_perag_put(pag); + if (error) + return error; + } + /* + * Reserve AG metadata blocks. ENOSPC here does not mean there + * was a growfs failure, just that there still isn't space for + * new user data after the grow has been run. + */ + error = xfs_fs_reserve_ag_blocks(mp); + if (error == -ENOSPC) + error = 0; } - - /* - * Reserve AG metadata blocks. ENOSPC here does not mean there was a - * growfs failure, just that there still isn't space for new user data - * after the grow has been run. - */ - error = xfs_fs_reserve_ag_blocks(mp); - if (error == -ENOSPC) - error = 0; return error; out_trans_cancel: @@ -170,8 +223,8 @@ out_trans_cancel: static int xfs_growfs_log_private( - xfs_mount_t *mp, /* mount point for filesystem */ - xfs_growfs_log_t *in) /* growfs log input struct */ + struct xfs_mount *mp, /* mount point for filesystem */ + struct xfs_growfs_log *in) /* growfs log input struct */ { xfs_extlen_t nb; @@ -268,7 +321,7 @@ out_error: int xfs_growfs_log( xfs_mount_t *mp, - xfs_growfs_log_t *in) + struct xfs_growfs_log *in) { int error; @@ -293,11 +346,8 @@ xfs_fs_counts( cnt->allocino = percpu_counter_read_positive(&mp->m_icount); cnt->freeino = percpu_counter_read_positive(&mp->m_ifree); cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) - - mp->m_alloc_set_aside; - - spin_lock(&mp->m_sb_lock); - cnt->freertx = mp->m_sb.sb_frextents; - spin_unlock(&mp->m_sb_lock); + xfs_fdblocks_unavailable(mp); + cnt->freertx = percpu_counter_read_positive(&mp->m_frextents); } /* @@ -376,46 +426,36 @@ xfs_reserve_blocks( * If the request is larger than the current reservation, reserve the * blocks before we update the reserve counters. Sample m_fdblocks and * perform a partial reservation if the request exceeds free space. + * + * The code below estimates how many blocks it can request from + * fdblocks to stash in the reserve pool. This is a classic TOCTOU + * race since fdblocks updates are not always coordinated via + * m_sb_lock. Set the reserve size even if there's not enough free + * space to fill it because mod_fdblocks will refill an undersized + * reserve when it can. */ - error = -ENOSPC; - do { - free = percpu_counter_sum(&mp->m_fdblocks) - - mp->m_alloc_set_aside; - if (free <= 0) - break; - - delta = request - mp->m_resblks; - lcounter = free - delta; - if (lcounter < 0) - /* We can't satisfy the request, just get what we can */ - fdblks_delta = free; - else - fdblks_delta = delta; - + free = percpu_counter_sum(&mp->m_fdblocks) - + xfs_fdblocks_unavailable(mp); + delta = request - mp->m_resblks; + mp->m_resblks = request; + if (delta > 0 && free > 0) { /* * We'll either succeed in getting space from the free block - * count or we'll get an ENOSPC. If we get a ENOSPC, it means - * things changed while we were calculating fdblks_delta and so - * we should try again to see if there is anything left to - * reserve. + * count or we'll get an ENOSPC. Don't set the reserved flag + * here - we don't want to reserve the extra reserve blocks + * from the reserve. * - * Don't set the reserved flag here - we don't want to reserve - * the extra reserve blocks from the reserve..... + * The desired reserve size can change after we drop the lock. + * Use mod_fdblocks to put the space into the reserve or into + * fdblocks as appropriate. */ + fdblks_delta = min(free, delta); spin_unlock(&mp->m_sb_lock); error = xfs_mod_fdblocks(mp, -fdblks_delta, 0); + if (!error) + xfs_mod_fdblocks(mp, fdblks_delta, 0); spin_lock(&mp->m_sb_lock); - } while (error == -ENOSPC); - - /* - * Update the reserve counters if blocks have been successfully - * allocated. - */ - if (!error && fdblks_delta) { - mp->m_resblks += fdblks_delta; - mp->m_resblks_avail += fdblks_delta; } - out: if (outval) { outval->resblks = mp->m_resblks; @@ -433,13 +473,10 @@ xfs_fs_goingdown( { switch (inflags) { case XFS_FSOP_GOING_FLAGS_DEFAULT: { - struct super_block *sb = freeze_bdev(mp->m_super->s_bdev); - - if (sb && !IS_ERR(sb)) { + if (!freeze_bdev(mp->m_super->s_bdev)) { xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); - thaw_bdev(sb->s_bdev, sb); + thaw_bdev(mp->m_super->s_bdev); } - break; } case XFS_FSOP_GOING_FLAGS_LOGFLUSH: @@ -461,59 +498,56 @@ xfs_fs_goingdown( * consistent. We don't do an unmount here; just shutdown the shop, make sure * that absolutely nothing persistent happens to this filesystem after this * point. + * + * The shutdown state change is atomic, resulting in the first and only the + * first shutdown call processing the shutdown. This means we only shutdown the + * log once as it requires, and we don't spam the logs when multiple concurrent + * shutdowns race to set the shutdown flags. */ void xfs_do_force_shutdown( struct xfs_mount *mp, - int flags, + uint32_t flags, char *fname, int lnnum) { - bool logerror = flags & SHUTDOWN_LOG_IO_ERROR; + int tag; + const char *why; - /* - * No need to duplicate efforts. - */ - if (XFS_FORCED_SHUTDOWN(mp) && !logerror) - return; - /* - * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't - * queue up anybody new on the log reservations, and wakes up - * everybody who's sleeping on log reservations to tell them - * the bad news. - */ - if (xfs_log_force_umount(mp, logerror)) - return; - - if (flags & SHUTDOWN_FORCE_UMOUNT) { - xfs_alert(mp, -"User initiated shutdown received. Shutting down filesystem"); + if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate)) { + xlog_shutdown_wait(mp->m_log); return; } - - xfs_notice(mp, -"%s(0x%x) called from line %d of file %s. Return address = "PTR_FMT, - __func__, flags, lnnum, fname, __return_address); - - if (flags & SHUTDOWN_CORRUPT_INCORE) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT, -"Corruption of in-memory data detected. Shutting down filesystem"); - if (XFS_ERRLEVEL_HIGH <= xfs_error_level) - xfs_stack_trace(); - } else if (logerror) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, - "Log I/O Error Detected. Shutting down filesystem"); - } else if (flags & SHUTDOWN_DEVICE_REQ) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, - "All device paths lost. Shutting down filesystem"); - } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, - "I/O Error Detected. Shutting down filesystem"); + if (mp->m_sb_bp) + mp->m_sb_bp->b_flags |= XBF_DONE; + + if (flags & SHUTDOWN_FORCE_UMOUNT) + xfs_alert(mp, "User initiated shutdown received."); + + if (xlog_force_shutdown(mp->m_log, flags)) { + tag = XFS_PTAG_SHUTDOWN_LOGERROR; + why = "Log I/O Error"; + } else if (flags & SHUTDOWN_CORRUPT_INCORE) { + tag = XFS_PTAG_SHUTDOWN_CORRUPT; + why = "Corruption of in-memory data"; + } else if (flags & SHUTDOWN_CORRUPT_ONDISK) { + tag = XFS_PTAG_SHUTDOWN_CORRUPT; + why = "Corruption of on-disk metadata"; + } else { + tag = XFS_PTAG_SHUTDOWN_IOERROR; + why = "Metadata I/O Error"; } + trace_xfs_force_shutdown(mp, tag, flags, fname, lnnum); + + xfs_alert_tag(mp, tag, +"%s (0x%x) detected at %pS (%s:%d). Shutting down filesystem.", + why, flags, __return_address, fname, lnnum); xfs_alert(mp, "Please unmount the filesystem and rectify the problem(s)"); + if (xfs_error_level >= XFS_ERRLEVEL_HIGH) + xfs_stack_trace(); } /* @@ -529,10 +563,8 @@ xfs_fs_reserve_ag_blocks( int err2; mp->m_finobt_nores = false; - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - pag = xfs_perag_get(mp, agno); + for_each_perag(mp, agno, pag) { err2 = xfs_ag_resv_init(pag, NULL); - xfs_perag_put(pag); if (err2 && !error) error = err2; } @@ -558,10 +590,8 @@ xfs_fs_unreserve_ag_blocks( int error = 0; int err2; - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - pag = xfs_perag_get(mp, agno); + for_each_perag(mp, agno, pag) { err2 = xfs_ag_resv_free(pag); - xfs_perag_put(pag); if (err2 && !error) error = err2; } |