about | summary | refs | log | tree | commit | diff | stats
path: root/fs/xfs/xfs_fsops.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/xfs_fsops.c')
-rw-r--r-- fs/xfs/xfs_fsops.c 372
1 files changed, 201 insertions, 171 deletions
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 3e61d0cc23f8..13851c0d640b 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -17,105 +17,149 @@
#include "xfs_fsops.h"
#include "xfs_trans_space.h"
#include "xfs_log.h"
+#include "xfs_log_priv.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
+#include "xfs_trace.h"
+
+/*
+ * Write new AG headers to disk. Non-transactional, but need to be
+ * written and completed prior to the growfs transaction being logged.
+ * To do this, we use a delayed write buffer list and wait for
+ * submission and IO completion of the list as a whole. This allows the
+ * IO subsystem to merge all the AG headers in a single AG into a single
+ * IO and hide most of the latency of the IO from us.
+ *
+ * This also means that if we get an error whilst building the buffer
+ * list to write, we can cancel the entire list without having written
+ * anything.
+ */
+static int
+xfs_resizefs_init_new_ags(
+ struct xfs_trans *tp,
+ struct aghdr_init_data *id,
+ xfs_agnumber_t oagcount,
+ xfs_agnumber_t nagcount,
+ xfs_rfsblock_t delta,
+ struct xfs_perag *last_pag,
+ bool *lastag_extended)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_rfsblock_t nb = mp->m_sb.sb_dblocks + delta;
+ int error;
+
+ *lastag_extended = false;
+
+ INIT_LIST_HEAD(&id->buffer_list);
+ for (id->agno = nagcount - 1;
+ id->agno >= oagcount;
+ id->agno--, delta -= id->agsize) {
+
+ if (id->agno == nagcount - 1)
+ id->agsize = nb - (id->agno *
+ (xfs_rfsblock_t)mp->m_sb.sb_agblocks);
+ else
+ id->agsize = mp->m_sb.sb_agblocks;
+
+ error = xfs_ag_init_headers(mp, id);
+ if (error) {
+ xfs_buf_delwri_cancel(&id->buffer_list);
+ return error;
+ }
+ }
+
+ error = xfs_buf_delwri_submit(&id->buffer_list);
+ if (error)
+ return error;
+
+ if (delta) {
+ *lastag_extended = true;
+ error = xfs_ag_extend_space(last_pag, tp, delta);
+ }
+ return error;
+}
/*
* growfs operations
*/
static int
xfs_growfs_data_private(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_growfs_data_t *in) /* growfs data input struct */
+ struct xfs_mount *mp, /* mount point for filesystem */
+ struct xfs_growfs_data *in) /* growfs data input struct */
{
- xfs_buf_t *bp;
+ struct xfs_buf *bp;
int error;
xfs_agnumber_t nagcount;
xfs_agnumber_t nagimax = 0;
- xfs_rfsblock_t nb, nb_mod;
- xfs_rfsblock_t new;
+ xfs_rfsblock_t nb, nb_div, nb_mod;
+ int64_t delta;
+ bool lastag_extended;
xfs_agnumber_t oagcount;
- xfs_trans_t *tp;
+ struct xfs_trans *tp;
struct aghdr_init_data id = {};
+ struct xfs_perag *last_pag;
nb = in->newblocks;
- if (nb < mp->m_sb.sb_dblocks)
- return -EINVAL;
- if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
+ error = xfs_sb_validate_fsb_count(&mp->m_sb, nb);
+ if (error)
return error;
- error = xfs_buf_read_uncached(mp->m_ddev_targp,
+
+ if (nb > mp->m_sb.sb_dblocks) {
+ error = xfs_buf_read_uncached(mp->m_ddev_targp,
XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
- if (error)
- return error;
- xfs_buf_relse(bp);
+ if (error)
+ return error;
+ xfs_buf_relse(bp);
+ }
- new = nb; /* use new as a temporary here */
- nb_mod = do_div(new, mp->m_sb.sb_agblocks);
- nagcount = new + (nb_mod != 0);
+ nb_div = nb;
+ nb_mod = do_div(nb_div, mp->m_sb.sb_agblocks);
+ nagcount = nb_div + (nb_mod != 0);
if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
nagcount--;
nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
- if (nb < mp->m_sb.sb_dblocks)
- return -EINVAL;
}
- new = nb - mp->m_sb.sb_dblocks;
- oagcount = mp->m_sb.sb_agcount;
+ delta = nb - mp->m_sb.sb_dblocks;
+ /*
+ * Reject filesystems with a single AG because they are not
+ * supported, and reject a shrink operation that would cause a
+ * filesystem to become unsupported.
+ */
+ if (delta < 0 && nagcount < 2)
+ return -EINVAL;
+ oagcount = mp->m_sb.sb_agcount;
/* allocate the new per-ag structures */
if (nagcount > oagcount) {
- error = xfs_initialize_perag(mp, nagcount, &nagimax);
+ error = xfs_initialize_perag(mp, nagcount, nb, &nagimax);
if (error)
return error;
+ } else if (nagcount < oagcount) {
+ /* TODO: shrinking the entire AGs hasn't yet completed */
+ return -EINVAL;
}
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
- XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
+ (delta > 0 ? XFS_GROWFS_SPACE_RES(mp) : -delta), 0,
+ XFS_TRANS_RESERVE, &tp);
if (error)
return error;
- /*
- * Write new AG headers to disk. Non-transactional, but need to be
- * written and completed prior to the growfs transaction being logged.
- * To do this, we use a delayed write buffer list and wait for
- * submission and IO completion of the list as a whole. This allows the
- * IO subsystem to merge all the AG headers in a single AG into a single
- * IO and hide most of the latency of the IO from us.
- *
- * This also means that if we get an error whilst building the buffer
- * list to write, we can cancel the entire list without having written
- * anything.
- */
- INIT_LIST_HEAD(&id.buffer_list);
- for (id.agno = nagcount - 1;
- id.agno >= oagcount;
- id.agno--, new -= id.agsize) {
-
- if (id.agno == nagcount - 1)
- id.agsize = nb -
- (id.agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks);
- else
- id.agsize = mp->m_sb.sb_agblocks;
+ last_pag = xfs_perag_get(mp, oagcount - 1);
+ if (delta > 0) {
+ error = xfs_resizefs_init_new_ags(tp, &id, oagcount, nagcount,
+ delta, last_pag, &lastag_extended);
+ } else {
+ xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SHRINK,
+ "EXPERIMENTAL online shrink feature in use. Use at your own risk!");
- error = xfs_ag_init_headers(mp, &id);
- if (error) {
- xfs_buf_delwri_cancel(&id.buffer_list);
- goto out_trans_cancel;
- }
+ error = xfs_ag_shrink_space(last_pag, &tp, -delta);
}
- error = xfs_buf_delwri_submit(&id.buffer_list);
+ xfs_perag_put(last_pag);
if (error)
goto out_trans_cancel;
- xfs_trans_agblocks_delta(tp, id.nfree);
-
- /* If there are new blocks in the old last AG, extend it. */
- if (new) {
- error = xfs_ag_extend_space(mp, tp, &id, new);
- if (error)
- goto out_trans_cancel;
- }
-
/*
* Update changed superblock fields transactionally. These are not
* seen by the rest of the world until the transaction commit applies
@@ -123,11 +167,19 @@ xfs_growfs_data_private(
*/
if (nagcount > oagcount)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
- if (nb > mp->m_sb.sb_dblocks)
- xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS,
- nb - mp->m_sb.sb_dblocks);
+ if (delta)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS, delta);
if (id.nfree)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, id.nfree);
+
+ /*
+ * Sync sb counters now to reflect the updated values. This is
+ * particularly important for shrink because the write verifier
+ * will fail if sb_fdblocks is ever larger than sb_dblocks.
+ */
+ if (xfs_has_lazysbcount(mp))
+ xfs_log_sb(tp);
+
xfs_trans_set_sync(tp);
error = xfs_trans_commit(tp);
if (error)
@@ -139,28 +191,29 @@ xfs_growfs_data_private(
xfs_set_low_space_thresholds(mp);
mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
- /*
- * If we expanded the last AG, free the per-AG reservation
- * so we can reinitialize it with the new size.
- */
- if (new) {
- struct xfs_perag *pag;
-
- pag = xfs_perag_get(mp, id.agno);
- error = xfs_ag_resv_free(pag);
- xfs_perag_put(pag);
- if (error)
- return error;
+ if (delta > 0) {
+ /*
+ * If we expanded the last AG, free the per-AG reservation
+ * so we can reinitialize it with the new size.
+ */
+ if (lastag_extended) {
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, id.agno);
+ error = xfs_ag_resv_free(pag);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+ }
+ /*
+ * Reserve AG metadata blocks. ENOSPC here does not mean there
+ * was a growfs failure, just that there still isn't space for
+ * new user data after the grow has been run.
+ */
+ error = xfs_fs_reserve_ag_blocks(mp);
+ if (error == -ENOSPC)
+ error = 0;
}
-
- /*
- * Reserve AG metadata blocks. ENOSPC here does not mean there was a
- * growfs failure, just that there still isn't space for new user data
- * after the grow has been run.
- */
- error = xfs_fs_reserve_ag_blocks(mp);
- if (error == -ENOSPC)
- error = 0;
return error;
out_trans_cancel:
@@ -170,8 +223,8 @@ out_trans_cancel:
static int
xfs_growfs_log_private(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_growfs_log_t *in) /* growfs log input struct */
+ struct xfs_mount *mp, /* mount point for filesystem */
+ struct xfs_growfs_log *in) /* growfs log input struct */
{
xfs_extlen_t nb;
@@ -268,7 +321,7 @@ out_error:
int
xfs_growfs_log(
xfs_mount_t *mp,
- xfs_growfs_log_t *in)
+ struct xfs_growfs_log *in)
{
int error;
@@ -293,11 +346,8 @@ xfs_fs_counts(
cnt->allocino = percpu_counter_read_positive(&mp->m_icount);
cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
- mp->m_alloc_set_aside;
-
- spin_lock(&mp->m_sb_lock);
- cnt->freertx = mp->m_sb.sb_frextents;
- spin_unlock(&mp->m_sb_lock);
+ xfs_fdblocks_unavailable(mp);
+ cnt->freertx = percpu_counter_read_positive(&mp->m_frextents);
}
/*
@@ -376,46 +426,36 @@ xfs_reserve_blocks(
* If the request is larger than the current reservation, reserve the
* blocks before we update the reserve counters. Sample m_fdblocks and
* perform a partial reservation if the request exceeds free space.
+ *
+ * The code below estimates how many blocks it can request from
+ * fdblocks to stash in the reserve pool. This is a classic TOCTOU
+ * race since fdblocks updates are not always coordinated via
+ * m_sb_lock. Set the reserve size even if there's not enough free
+ * space to fill it because mod_fdblocks will refill an undersized
+ * reserve when it can.
*/
- error = -ENOSPC;
- do {
- free = percpu_counter_sum(&mp->m_fdblocks) -
- mp->m_alloc_set_aside;
- if (free <= 0)
- break;
-
- delta = request - mp->m_resblks;
- lcounter = free - delta;
- if (lcounter < 0)
- /* We can't satisfy the request, just get what we can */
- fdblks_delta = free;
- else
- fdblks_delta = delta;
-
+ free = percpu_counter_sum(&mp->m_fdblocks) -
+ xfs_fdblocks_unavailable(mp);
+ delta = request - mp->m_resblks;
+ mp->m_resblks = request;
+ if (delta > 0 && free > 0) {
/*
* We'll either succeed in getting space from the free block
- * count or we'll get an ENOSPC. If we get a ENOSPC, it means
- * things changed while we were calculating fdblks_delta and so
- * we should try again to see if there is anything left to
- * reserve.
+ * count or we'll get an ENOSPC. Don't set the reserved flag
+ * here - we don't want to reserve the extra reserve blocks
+ * from the reserve.
*
- * Don't set the reserved flag here - we don't want to reserve
- * the extra reserve blocks from the reserve.....
+ * The desired reserve size can change after we drop the lock.
+ * Use mod_fdblocks to put the space into the reserve or into
+ * fdblocks as appropriate.
*/
+ fdblks_delta = min(free, delta);
spin_unlock(&mp->m_sb_lock);
error = xfs_mod_fdblocks(mp, -fdblks_delta, 0);
+ if (!error)
+ xfs_mod_fdblocks(mp, fdblks_delta, 0);
spin_lock(&mp->m_sb_lock);
- } while (error == -ENOSPC);
-
- /*
- * Update the reserve counters if blocks have been successfully
- * allocated.
- */
- if (!error && fdblks_delta) {
- mp->m_resblks += fdblks_delta;
- mp->m_resblks_avail += fdblks_delta;
}
-
out:
if (outval) {
outval->resblks = mp->m_resblks;
@@ -433,13 +473,10 @@ xfs_fs_goingdown(
{
switch (inflags) {
case XFS_FSOP_GOING_FLAGS_DEFAULT: {
- struct super_block *sb = freeze_bdev(mp->m_super->s_bdev);
-
- if (sb && !IS_ERR(sb)) {
+ if (!freeze_bdev(mp->m_super->s_bdev)) {
xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
- thaw_bdev(sb->s_bdev, sb);
+ thaw_bdev(mp->m_super->s_bdev);
}
-
break;
}
case XFS_FSOP_GOING_FLAGS_LOGFLUSH:
@@ -461,59 +498,56 @@ xfs_fs_goingdown(
* consistent. We don't do an unmount here; just shutdown the shop, make sure
* that absolutely nothing persistent happens to this filesystem after this
* point.
+ *
+ * The shutdown state change is atomic, resulting in the first and only the
+ * first shutdown call processing the shutdown. This means we only shutdown the
+ * log once as it requires, and we don't spam the logs when multiple concurrent
+ * shutdowns race to set the shutdown flags.
*/
void
xfs_do_force_shutdown(
struct xfs_mount *mp,
- int flags,
+ uint32_t flags,
char *fname,
int lnnum)
{
- bool logerror = flags & SHUTDOWN_LOG_IO_ERROR;
+ int tag;
+ const char *why;
- /*
- * No need to duplicate efforts.
- */
- if (XFS_FORCED_SHUTDOWN(mp) && !logerror)
- return;
- /*
- * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't
- * queue up anybody new on the log reservations, and wakes up
- * everybody who's sleeping on log reservations to tell them
- * the bad news.
- */
- if (xfs_log_force_umount(mp, logerror))
- return;
-
- if (flags & SHUTDOWN_FORCE_UMOUNT) {
- xfs_alert(mp,
-"User initiated shutdown received. Shutting down filesystem");
+ if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate)) {
+ xlog_shutdown_wait(mp->m_log);
return;
}
-
- xfs_notice(mp,
-"%s(0x%x) called from line %d of file %s. Return address = "PTR_FMT,
- __func__, flags, lnnum, fname, __return_address);
-
- if (flags & SHUTDOWN_CORRUPT_INCORE) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
-"Corruption of in-memory data detected. Shutting down filesystem");
- if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
- xfs_stack_trace();
- } else if (logerror) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
- "Log I/O Error Detected. Shutting down filesystem");
- } else if (flags & SHUTDOWN_DEVICE_REQ) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
- "All device paths lost. Shutting down filesystem");
- } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
- "I/O Error Detected. Shutting down filesystem");
+ if (mp->m_sb_bp)
+ mp->m_sb_bp->b_flags |= XBF_DONE;
+
+ if (flags & SHUTDOWN_FORCE_UMOUNT)
+ xfs_alert(mp, "User initiated shutdown received.");
+
+ if (xlog_force_shutdown(mp->m_log, flags)) {
+ tag = XFS_PTAG_SHUTDOWN_LOGERROR;
+ why = "Log I/O Error";
+ } else if (flags & SHUTDOWN_CORRUPT_INCORE) {
+ tag = XFS_PTAG_SHUTDOWN_CORRUPT;
+ why = "Corruption of in-memory data";
+ } else if (flags & SHUTDOWN_CORRUPT_ONDISK) {
+ tag = XFS_PTAG_SHUTDOWN_CORRUPT;
+ why = "Corruption of on-disk metadata";
+ } else {
+ tag = XFS_PTAG_SHUTDOWN_IOERROR;
+ why = "Metadata I/O Error";
}
+ trace_xfs_force_shutdown(mp, tag, flags, fname, lnnum);
+
+ xfs_alert_tag(mp, tag,
+"%s (0x%x) detected at %pS (%s:%d). Shutting down filesystem.",
+ why, flags, __return_address, fname, lnnum);
xfs_alert(mp,
"Please unmount the filesystem and rectify the problem(s)");
+ if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
+ xfs_stack_trace();
}
/*
@@ -529,10 +563,8 @@ xfs_fs_reserve_ag_blocks(
int err2;
mp->m_finobt_nores = false;
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- pag = xfs_perag_get(mp, agno);
+ for_each_perag(mp, agno, pag) {
err2 = xfs_ag_resv_init(pag, NULL);
- xfs_perag_put(pag);
if (err2 && !error)
error = err2;
}
@@ -558,10 +590,8 @@ xfs_fs_unreserve_ag_blocks(
int error = 0;
int err2;
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- pag = xfs_perag_get(mp, agno);
+ for_each_perag(mp, agno, pag) {
err2 = xfs_ag_resv_free(pag);
- xfs_perag_put(pag);
if (err2 && !error)
error = err2;
}