diff options
Diffstat (limited to 'fs/xfs')
167 files changed, 9644 insertions, 6117 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 04611a1068b4..03135a1c31b6 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -102,9 +102,11 @@ xfs-y += xfs_log.o \ xfs_buf_item_recover.o \ xfs_dquot_item_recover.o \ xfs_extfree_item.o \ + xfs_attr_item.o \ xfs_icreate_item.o \ xfs_inode_item.o \ xfs_inode_item_recover.o \ + xfs_iunlink_item.o \ xfs_refcount_item.o \ xfs_rmap_item.o \ xfs_log_recover.o \ @@ -128,6 +130,11 @@ xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o xfs-$(CONFIG_EXPORTFS_BLOCK_OPS) += xfs_pnfs.o +# notify failure +ifeq ($(CONFIG_MEMORY_FAILURE),y) +xfs-$(CONFIG_FS_DAX) += xfs_notify_failure.o +endif + # online scrub/repair ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y) diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 6f49bf39183c..c557a030acfe 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -4,7 +4,6 @@ * All Rights Reserved. */ #include "xfs.h" -#include <linux/backing-dev.h> #include "xfs_message.h" #include "xfs_trace.h" @@ -26,6 +25,6 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) "%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)", current->comm, current->pid, (unsigned int)size, __func__, lflags); - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(lflags); } while (1); } diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 1e4ee042d52f..bb0c700afe3c 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -120,18 +120,18 @@ xfs_initialize_perag_data( for (index = 0; index < agcount; index++) { /* - * read the agf, then the agi. This gets us - * all the information we need and populates the - * per-ag structures for us. + * Read the AGF and AGI buffers to populate the per-ag + * structures for us. */ - error = xfs_alloc_pagf_init(mp, NULL, index, 0); - if (error) + pag = xfs_perag_get(mp, index); + error = xfs_alloc_read_agf(pag, NULL, 0, NULL); + if (!error) + error = xfs_ialloc_read_agi(pag, NULL, NULL); + if (error) { + xfs_perag_put(pag); return error; + } - error = xfs_ialloc_pagi_init(mp, NULL, index); - if (error) - return error; - pag = xfs_perag_get(mp, index); ifree += pag->pagi_freecount; ialloc += pag->pagi_count; bfree += pag->pagf_freeblks; @@ -173,7 +173,6 @@ __xfs_free_perag( struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); ASSERT(!delayed_work_pending(&pag->pag_blockgc_work)); - ASSERT(atomic_read(&pag->pag_ref) == 0); kmem_free(pag); } @@ -192,20 +191,79 @@ xfs_free_perag( pag = radix_tree_delete(&mp->m_perag_tree, agno); spin_unlock(&mp->m_perag_lock); ASSERT(pag); - ASSERT(atomic_read(&pag->pag_ref) == 0); + XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0); cancel_delayed_work_sync(&pag->pag_blockgc_work); - xfs_iunlink_destroy(pag); xfs_buf_hash_destroy(pag); call_rcu(&pag->rcu_head, __xfs_free_perag); } } +/* Find the size of the AG, in blocks. */ +static xfs_agblock_t +__xfs_ag_block_count( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agnumber_t agcount, + xfs_rfsblock_t dblocks) +{ + ASSERT(agno < agcount); + + if (agno < agcount - 1) + return mp->m_sb.sb_agblocks; + return dblocks - (agno * mp->m_sb.sb_agblocks); +} + +xfs_agblock_t +xfs_ag_block_count( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + return __xfs_ag_block_count(mp, agno, mp->m_sb.sb_agcount, + mp->m_sb.sb_dblocks); +} + +/* Calculate the first and last possible inode number in an AG. */ +static void +__xfs_agino_range( + struct xfs_mount *mp, + xfs_agblock_t eoag, + xfs_agino_t *first, + xfs_agino_t *last) +{ + xfs_agblock_t bno; + + /* + * Calculate the first inode, which will be in the first + * cluster-aligned block after the AGFL. + */ + bno = round_up(XFS_AGFL_BLOCK(mp) + 1, M_IGEO(mp)->cluster_align); + *first = XFS_AGB_TO_AGINO(mp, bno); + + /* + * Calculate the last inode, which will be at the end of the + * last (aligned) cluster that can be allocated in the AG. + */ + bno = round_down(eoag, M_IGEO(mp)->cluster_align); + *last = XFS_AGB_TO_AGINO(mp, bno) - 1; +} + +void +xfs_agino_range( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agino_t *first, + xfs_agino_t *last) +{ + return __xfs_agino_range(mp, xfs_ag_block_count(mp, agno), first, last); +} + int xfs_initialize_perag( struct xfs_mount *mp, xfs_agnumber_t agcount, + xfs_rfsblock_t dblocks, xfs_agnumber_t *maxagi) { struct xfs_perag *pag; @@ -264,13 +322,18 @@ xfs_initialize_perag( if (error) goto out_remove_pag; - error = xfs_iunlink_init(pag); - if (error) - goto out_hash_destroy; - /* first new pag is fully initialized */ if (first_initialised == NULLAGNUMBER) first_initialised = index; + + /* + * Pre-calculated geometry + */ + pag->block_count = __xfs_ag_block_count(mp, index, agcount, + dblocks); + pag->min_block = XFS_AGFL_BLOCK(mp); + __xfs_agino_range(mp, pag->block_count, &pag->agino_min, + &pag->agino_max); } index = xfs_set_inode_alloc(mp, agcount); @@ -281,8 +344,6 @@ xfs_initialize_perag( mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp); return 0; -out_hash_destroy: - xfs_buf_hash_destroy(pag); out_remove_pag: radix_tree_delete(&mp->m_perag_tree, index); out_free_pag: @@ -294,7 +355,6 @@ out_unwind_new_pags: if (!pag) break; xfs_buf_hash_destroy(pag); - xfs_iunlink_destroy(pag); kmem_free(pag); } return error; @@ -322,12 +382,6 @@ xfs_get_aghdr_buf( return 0; } -static inline bool is_log_ag(struct xfs_mount *mp, struct aghdr_init_data *id) -{ - return mp->m_sb.sb_logstart > 0 && - id->agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart); -} - /* * Generic btree root block init function */ @@ -353,7 +407,7 @@ xfs_freesp_init_recs( arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); - if (is_log_ag(mp, id)) { + if (xfs_ag_contains_log(mp, id->agno)) { struct xfs_alloc_rec *nrec; xfs_agblock_t start = XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart); @@ -480,7 +534,7 @@ xfs_rmaproot_init( } /* account for the log space */ - if (is_log_ag(mp, id)) { + if (xfs_ag_contains_log(mp, id->agno)) { rrec = XFS_RMAP_REC_ADDR(block, be16_to_cpu(block->bb_numrecs) + 1); rrec->rm_startblock = cpu_to_be32( @@ -551,7 +605,7 @@ xfs_agfblock_init( agf->agf_refcount_blocks = cpu_to_be32(1); } - if (is_log_ag(mp, id)) { + if (xfs_ag_contains_log(mp, id->agno)) { int64_t logblocks = mp->m_sb.sb_logblocks; be32_add_cpu(&agf->agf_freeblks, -logblocks); @@ -762,11 +816,11 @@ xfs_ag_init_headers( int xfs_ag_shrink_space( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_trans **tpp, - xfs_agnumber_t agno, xfs_extlen_t delta) { + struct xfs_mount *mp = pag->pag_mount; struct xfs_alloc_arg args = { .tp = *tpp, .mp = mp, @@ -783,14 +837,14 @@ xfs_ag_shrink_space( xfs_agblock_t aglen; int error, err2; - ASSERT(agno == mp->m_sb.sb_agcount - 1); - error = xfs_ialloc_read_agi(mp, *tpp, agno, &agibp); + ASSERT(pag->pag_agno == mp->m_sb.sb_agcount - 1); + error = xfs_ialloc_read_agi(pag, *tpp, &agibp); if (error) return error; agi = agibp->b_addr; - error = xfs_alloc_read_agf(mp, *tpp, agno, 0, &agfbp); + error = xfs_alloc_read_agf(pag, *tpp, 0, &agfbp); if (error) return error; @@ -802,13 +856,14 @@ xfs_ag_shrink_space( if (delta >= aglen) return -EINVAL; - args.fsbno = XFS_AGB_TO_FSB(mp, agno, aglen - delta); + args.fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, aglen - delta); /* * Make sure that the last inode cluster cannot overlap with the new * end of the AG, even if it's sparse. */ - error = xfs_ialloc_check_shrink(*tpp, agno, agibp, aglen - delta); + error = xfs_ialloc_check_shrink(*tpp, pag->pag_agno, agibp, + aglen - delta); if (error) return error; @@ -816,7 +871,7 @@ xfs_ag_shrink_space( * Disable perag reservations so it doesn't cause the allocation request * to fail. We'll reestablish reservation before we return. */ - error = xfs_ag_resv_free(agibp->b_pag); + error = xfs_ag_resv_free(pag); if (error) return error; @@ -845,7 +900,7 @@ xfs_ag_shrink_space( be32_add_cpu(&agi->agi_length, -delta); be32_add_cpu(&agf->agf_length, -delta); - err2 = xfs_ag_resv_init(agibp->b_pag, *tpp); + err2 = xfs_ag_resv_init(pag, *tpp); if (err2) { be32_add_cpu(&agi->agi_length, delta); be32_add_cpu(&agf->agf_length, delta); @@ -869,8 +924,9 @@ xfs_ag_shrink_space( xfs_ialloc_log_agi(*tpp, agibp, XFS_AGI_LENGTH); xfs_alloc_log_agf(*tpp, agfbp, XFS_AGF_LENGTH); return 0; + resv_init_out: - err2 = xfs_ag_resv_init(agibp->b_pag, *tpp); + err2 = xfs_ag_resv_init(pag, *tpp); if (!err2) return error; resv_err: @@ -884,9 +940,8 @@ resv_err: */ int xfs_ag_extend_space( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_trans *tp, - struct aghdr_init_data *id, xfs_extlen_t len) { struct xfs_buf *bp; @@ -894,23 +949,20 @@ xfs_ag_extend_space( struct xfs_agf *agf; int error; - /* - * Change the agi length. - */ - error = xfs_ialloc_read_agi(mp, tp, id->agno, &bp); + ASSERT(pag->pag_agno == pag->pag_mount->m_sb.sb_agcount - 1); + + error = xfs_ialloc_read_agi(pag, tp, &bp); if (error) return error; agi = bp->b_addr; be32_add_cpu(&agi->agi_length, len); - ASSERT(id->agno == mp->m_sb.sb_agcount - 1 || - be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks); xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH); /* * Change agf length. */ - error = xfs_alloc_read_agf(mp, tp, id->agno, 0, &bp); + error = xfs_alloc_read_agf(pag, tp, 0, &bp); if (error) return error; @@ -925,49 +977,49 @@ xfs_ag_extend_space( * XFS_RMAP_OINFO_SKIP_UPDATE is used here to tell the rmap btree that * this doesn't actually exist in the rmap btree. */ - error = xfs_rmap_free(tp, bp, bp->b_pag, - be32_to_cpu(agf->agf_length) - len, + error = xfs_rmap_free(tp, bp, pag, be32_to_cpu(agf->agf_length) - len, len, &XFS_RMAP_OINFO_SKIP_UPDATE); if (error) return error; - return xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, id->agno, + error = xfs_free_extent(tp, XFS_AGB_TO_FSB(pag->pag_mount, pag->pag_agno, be32_to_cpu(agf->agf_length) - len), len, &XFS_RMAP_OINFO_SKIP_UPDATE, XFS_AG_RESV_NONE); + if (error) + return error; + + /* Update perag geometry */ + pag->block_count = be32_to_cpu(agf->agf_length); + __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min, + &pag->agino_max); + return 0; } /* Retrieve AG geometry. */ int xfs_ag_get_geometry( - struct xfs_mount *mp, - xfs_agnumber_t agno, + struct xfs_perag *pag, struct xfs_ag_geometry *ageo) { struct xfs_buf *agi_bp; struct xfs_buf *agf_bp; struct xfs_agi *agi; struct xfs_agf *agf; - struct xfs_perag *pag; unsigned int freeblks; int error; - if (agno >= mp->m_sb.sb_agcount) - return -EINVAL; - /* Lock the AG headers. */ - error = xfs_ialloc_read_agi(mp, NULL, agno, &agi_bp); + error = xfs_ialloc_read_agi(pag, NULL, &agi_bp); if (error) return error; - error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp); + error = xfs_alloc_read_agf(pag, NULL, 0, &agf_bp); if (error) goto out_agi; - pag = agi_bp->b_pag; - /* Fill out form. */ memset(ageo, 0, sizeof(*ageo)); - ageo->ag_number = agno; + ageo->ag_number = pag->pag_agno; agi = agi_bp->b_addr; ageo->ag_icount = be32_to_cpu(agi->agi_count); diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index e411d51c2589..191b22b9a35b 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -67,6 +67,12 @@ struct xfs_perag { /* for rcu-safe freeing */ struct rcu_head rcu_head; + /* Precalculated geometry info */ + xfs_agblock_t block_count; + xfs_agblock_t min_block; + xfs_agino_t agino_min; + xfs_agino_t agino_max; + #ifdef __KERNEL__ /* -- kernel only structures below this line -- */ @@ -97,17 +103,11 @@ struct xfs_perag { /* background prealloc block trimming */ struct delayed_work pag_blockgc_work; - /* - * Unlinked inode information. This incore information reflects - * data stored in the AGI, so callers must hold the AGI buffer lock - * or have some other means to control concurrency. - */ - struct rhashtable pagi_unlinked_hash; #endif /* __KERNEL__ */ }; int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount, - xfs_agnumber_t *maxagi); + xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi); int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno); void xfs_free_perag(struct xfs_mount *mp); @@ -117,6 +117,71 @@ struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno, void xfs_perag_put(struct xfs_perag *pag); /* + * Per-ag geometry infomation and validation + */ +xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno); +void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t *first, xfs_agino_t *last); + +static inline bool +xfs_verify_agbno(struct xfs_perag *pag, xfs_agblock_t agbno) +{ + if (agbno >= pag->block_count) + return false; + if (agbno <= pag->min_block) + return false; + return true; +} + +static inline bool +xfs_verify_agbext( + struct xfs_perag *pag, + xfs_agblock_t agbno, + xfs_agblock_t len) +{ + if (agbno + len <= agbno) + return false; + + if (!xfs_verify_agbno(pag, agbno)) + return false; + + return xfs_verify_agbno(pag, agbno + len - 1); +} + +/* + * Verify that an AG inode number pointer neither points outside the AG + * nor points at static metadata. + */ +static inline bool +xfs_verify_agino(struct xfs_perag *pag, xfs_agino_t agino) +{ + if (agino < pag->agino_min) + return false; + if (agino > pag->agino_max) + return false; + return true; +} + +/* + * Verify that an AG inode number pointer neither points outside the AG + * nor points at static metadata, or is NULLAGINO. + */ +static inline bool +xfs_verify_agino_or_null(struct xfs_perag *pag, xfs_agino_t agino) +{ + if (agino == NULLAGINO) + return true; + return xfs_verify_agino(pag, agino); +} + +static inline bool +xfs_ag_contains_log(struct xfs_mount *mp, xfs_agnumber_t agno) +{ + return mp->m_sb.sb_logstart > 0 && + agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart); +} + +/* * Perag iteration APIs */ static inline struct xfs_perag * @@ -168,11 +233,10 @@ struct aghdr_init_data { }; int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id); -int xfs_ag_shrink_space(struct xfs_mount *mp, struct xfs_trans **tpp, - xfs_agnumber_t agno, xfs_extlen_t delta); -int xfs_ag_extend_space(struct xfs_mount *mp, struct xfs_trans *tp, - struct aghdr_init_data *id, xfs_extlen_t len); -int xfs_ag_get_geometry(struct xfs_mount *mp, xfs_agnumber_t agno, - struct xfs_ag_geometry *ageo); +int xfs_ag_shrink_space(struct xfs_perag *pag, struct xfs_trans **tpp, + xfs_extlen_t delta); +int xfs_ag_extend_space(struct xfs_perag *pag, struct xfs_trans *tp, + xfs_extlen_t len); +int xfs_ag_get_geometry(struct xfs_perag *pag, struct xfs_ag_geometry *ageo); #endif /* __LIBXFS_AG_H */ diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index fe94058d4e9e..5af123d13a63 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -322,7 +322,7 @@ out: * address. */ if (has_resv) { - error2 = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, 0); + error2 = xfs_alloc_read_agf(pag, tp, 0, NULL); if (error2) return error2; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 353e53b892e6..de79f5d07f65 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -82,6 +82,24 @@ xfs_prealloc_blocks( } /* + * The number of blocks per AG that we withhold from xfs_mod_fdblocks to + * guarantee that we can refill the AGFL prior to allocating space in a nearly + * full AG. Although the space described by the free space btrees, the + * blocks used by the freesp btrees themselves, and the blocks owned by the + * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk + * free space in the AG drop so low that the free space btrees cannot refill an + * empty AGFL up to the minimum level. Rather than grind through empty AGs + * until the fs goes down, we subtract this many AG blocks from the incore + * fdblocks to ensure user allocation does not overcommit the space the + * filesystem needs for the AGFLs. The rmap btree uses a per-AG reservation to + * withhold space from xfs_mod_fdblocks, so we do not account for that here. + */ +#define XFS_ALLOCBT_AGFL_RESERVE 4 + +/* + * Compute the number of blocks that we set aside to guarantee the ability to + * refill the AGFL and handle a full bmap btree split. + * * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of * AGF buffer (PV 947395), we place constraints on the relationship among * actual allocations for data blocks, freelist blocks, and potential file data @@ -93,14 +111,14 @@ xfs_prealloc_blocks( * extents need to be actually allocated. To get around this, we explicitly set * aside a few blocks which will not be reserved in delayed allocation. * - * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a - * potential split of the file's bmap btree. + * For each AG, we need to reserve enough blocks to replenish a totally empty + * AGFL and 4 more to handle a potential split of the file's bmap btree. */ unsigned int xfs_alloc_set_aside( struct xfs_mount *mp) { - return mp->m_sb.sb_agcount * (XFS_ALLOC_AGFL_RESERVE + 4); + return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4); } /* @@ -124,12 +142,12 @@ xfs_alloc_ag_max_usable( unsigned int blocks; blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */ - blocks += XFS_ALLOC_AGFL_RESERVE; + blocks += XFS_ALLOCBT_AGFL_RESERVE; blocks += 3; /* AGF, AGI btree root blocks */ if (xfs_has_finobt(mp)) blocks++; /* finobt root block */ if (xfs_has_rmapbt(mp)) - blocks++; /* rmap root block */ + blocks++; /* rmap root block */ if (xfs_has_reflink(mp)) blocks++; /* refcount root block */ @@ -230,7 +248,7 @@ xfs_alloc_get_rec( int *stat) /* output: success/failure */ { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno; + struct xfs_perag *pag = cur->bc_ag.pag; union xfs_btree_rec *rec; int error; @@ -245,11 +263,7 @@ xfs_alloc_get_rec( goto out_bad_rec; /* check for valid extent range, including overflow */ - if (!xfs_verify_agbno(mp, agno, *bno)) - goto out_bad_rec; - if (*bno > *bno + *len) - goto out_bad_rec; - if (!xfs_verify_agbno(mp, agno, *bno + *len - 1)) + if (!xfs_verify_agbext(pag, *bno, *len)) goto out_bad_rec; return 0; @@ -257,7 +271,8 @@ xfs_alloc_get_rec( out_bad_rec: xfs_warn(mp, "%s Freespace BTree record corruption in AG %d detected!", - cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", agno); + cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", + pag->pag_agno); xfs_warn(mp, "start block 0x%x block count 0x%x", *bno, *len); return -EFSCORRUPTED; @@ -685,20 +700,19 @@ const struct xfs_buf_ops xfs_agfl_buf_ops = { /* * Read in the allocation group free block array. */ -int /* error */ +int xfs_alloc_read_agfl( - xfs_mount_t *mp, /* mount point structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - struct xfs_buf **bpp) /* buffer for the ag free block array */ + struct xfs_perag *pag, + struct xfs_trans *tp, + struct xfs_buf **bpp) { - struct xfs_buf *bp; /* return value */ - int error; + struct xfs_mount *mp = pag->pag_mount; + struct xfs_buf *bp; + int error; - ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), + XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); if (error) return error; @@ -1057,7 +1071,8 @@ xfs_alloc_ag_vextent_small( be32_to_cpu(agf->agf_flcount) <= args->minleft) goto out; - error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0); + error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp, + &fbno, 0); if (error) goto error; if (fbno == NULLAGBLOCK) @@ -1501,7 +1516,7 @@ xfs_alloc_ag_vextent_lastblock( #ifdef DEBUG /* Randomly don't execute the first algorithm. */ - if (prandom_u32() & 1) + if (prandom_u32_max(2)) return 0; #endif @@ -2493,7 +2508,7 @@ __xfs_free_extent_later( ASSERT(bno != NULLFSBLOCK); ASSERT(len > 0); - ASSERT(len <= MAXEXTLEN); + ASSERT(len <= XFS_MAX_BMBT_EXTLEN); ASSERT(!isnullstartblock(bno)); agno = XFS_FSB_TO_AGNO(mp, bno); agbno = XFS_FSB_TO_AGBNO(mp, bno); @@ -2591,7 +2606,7 @@ xfs_alloc_fix_freelist( ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); if (!pag->pagf_init) { - error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); + error = xfs_alloc_read_agf(pag, tp, flags, &agbp); if (error) { /* Couldn't lock the AGF so skip this AG. */ if (error == -EAGAIN) @@ -2621,7 +2636,7 @@ xfs_alloc_fix_freelist( * Can fail if we're not blocking on locks, and it's held. */ if (!agbp) { - error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); + error = xfs_alloc_read_agf(pag, tp, flags, &agbp); if (error) { /* Couldn't lock the AGF so skip this AG. */ if (error == -EAGAIN) @@ -2679,7 +2694,7 @@ xfs_alloc_fix_freelist( else targs.oinfo = XFS_RMAP_OINFO_AG; while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) { - error = xfs_alloc_get_freelist(tp, agbp, &bno, 0); + error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0); if (error) goto out_agbp_relse; @@ -2694,7 +2709,7 @@ xfs_alloc_fix_freelist( targs.alignment = targs.minlen = targs.prod = 1; targs.type = XFS_ALLOCTYPE_THIS_AG; targs.pag = pag; - error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp); + error = xfs_alloc_read_agfl(pag, tp, &agflbp); if (error) goto out_agbp_relse; @@ -2723,7 +2738,7 @@ xfs_alloc_fix_freelist( * Put each allocated block on the list. */ for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) { - error = xfs_alloc_put_freelist(tp, agbp, + error = xfs_alloc_put_freelist(pag, tp, agbp, agflbp, bno, 0); if (error) goto out_agflbp_relse; @@ -2749,6 +2764,7 @@ out_no_agbp: */ int xfs_alloc_get_freelist( + struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agblock_t *bnop, @@ -2759,9 +2775,8 @@ xfs_alloc_get_freelist( xfs_agblock_t bno; __be32 *agfl_bno; int error; - int logflags; + uint32_t logflags; struct xfs_mount *mp = tp->t_mountp; - struct xfs_perag *pag; /* * Freelist is empty, give up. @@ -2773,8 +2788,7 @@ xfs_alloc_get_freelist( /* * Read the array of free blocks. */ - error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno), - &agflbp); + error = xfs_alloc_read_agfl(pag, tp, &agflbp); if (error) return error; @@ -2789,7 +2803,6 @@ xfs_alloc_get_freelist( if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp)) agf->agf_flfirst = 0; - pag = agbp->b_pag; ASSERT(!pag->pagf_agflreset); be32_add_cpu(&agf->agf_flcount, -1); pag->pagf_flcount--; @@ -2812,9 +2825,9 @@ xfs_alloc_get_freelist( */ void xfs_alloc_log_agf( - xfs_trans_t *tp, /* transaction pointer */ - struct xfs_buf *bp, /* buffer for a.g. freelist header */ - int fields) /* mask of fields to be logged (XFS_AGF_...) */ + struct xfs_trans *tp, + struct xfs_buf *bp, + uint32_t fields) { int first; /* first byte offset */ int last; /* last byte offset */ @@ -2850,29 +2863,11 @@ xfs_alloc_log_agf( } /* - * Interface for inode allocation to force the pag data to be initialized. - */ -int /* error */ -xfs_alloc_pagf_init( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - int flags) /* XFS_ALLOC_FLAGS_... */ -{ - struct xfs_buf *bp; - int error; - - error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp); - if (!error) - xfs_trans_brelse(tp, bp); - return error; -} - -/* * Put the block on the freelist for the allocation group. */ int xfs_alloc_put_freelist( + struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, struct xfs_buf *agflbp, @@ -2881,21 +2876,22 @@ xfs_alloc_put_freelist( { struct xfs_mount *mp = tp->t_mountp; struct xfs_agf *agf = agbp->b_addr; - struct xfs_perag *pag; __be32 *blockp; int error; - int logflags; + uint32_t logflags; __be32 *agfl_bno; int startoff; - if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp, - be32_to_cpu(agf->agf_seqno), &agflbp))) - return error; + if (!agflbp) { + error = xfs_alloc_read_agfl(pag, tp, &agflbp); + if (error) + return error; + } + be32_add_cpu(&agf->agf_fllast, 1); if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp)) agf->agf_fllast = 0; - pag = agbp->b_pag; ASSERT(!pag->pagf_agflreset); be32_add_cpu(&agf->agf_flcount, 1); pag->pagf_flcount++; @@ -3052,61 +3048,57 @@ const struct xfs_buf_ops xfs_agf_buf_ops = { /* * Read in the allocation group header (free/alloc section). */ -int /* error */ +int xfs_read_agf( - struct xfs_mount *mp, /* mount point structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - int flags, /* XFS_BUF_ */ - struct xfs_buf **bpp) /* buffer for the ag freelist header */ + struct xfs_perag *pag, + struct xfs_trans *tp, + int flags, + struct xfs_buf **agfbpp) { - int error; + struct xfs_mount *mp = pag->pag_mount; + int error; - trace_xfs_read_agf(mp, agno); + trace_xfs_read_agf(pag->pag_mount, pag->pag_agno); - ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops); + XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops); if (error) return error; - ASSERT(!(*bpp)->b_error); - xfs_buf_set_ref(*bpp, XFS_AGF_REF); + xfs_buf_set_ref(*agfbpp, XFS_AGF_REF); return 0; } /* - * Read in the allocation group header (free/alloc section). + * Read in the allocation group header (free/alloc section) and initialise the + * perag structure if necessary. If the caller provides @agfbpp, then return the + * locked buffer to the caller, otherwise free it. */ -int /* error */ +int xfs_alloc_read_agf( - struct xfs_mount *mp, /* mount point structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - int flags, /* XFS_ALLOC_FLAG_... */ - struct xfs_buf **bpp) /* buffer for the ag freelist header */ + struct xfs_perag *pag, + struct xfs_trans *tp, + int flags, + struct xfs_buf **agfbpp) { - struct xfs_agf *agf; /* ag freelist header */ - struct xfs_perag *pag; /* per allocation group data */ + struct xfs_buf *agfbp; + struct xfs_agf *agf; int error; int allocbt_blks; - trace_xfs_alloc_read_agf(mp, agno); + trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno); /* We don't support trylock when freeing. */ ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) != (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)); - ASSERT(agno != NULLAGNUMBER); - error = xfs_read_agf(mp, tp, agno, + error = xfs_read_agf(pag, tp, (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, - bpp); + &agfbp); if (error) return error; - ASSERT(!(*bpp)->b_error); - agf = (*bpp)->b_addr; - pag = (*bpp)->b_pag; + agf = agfbp->b_addr; if (!pag->pagf_init) { pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); @@ -3120,7 +3112,7 @@ xfs_alloc_read_agf( be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); pag->pagf_init = 1; - pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf); + pag->pagf_agflreset = xfs_agfl_needs_reset(pag->pag_mount, agf); /* * Update the in-core allocbt counter. Filter out the rmapbt @@ -3130,13 +3122,14 @@ xfs_alloc_read_agf( * counter only tracks non-root blocks. */ allocbt_blks = pag->pagf_btreeblks; - if (xfs_has_rmapbt(mp)) + if (xfs_has_rmapbt(pag->pag_mount)) allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1; if (allocbt_blks > 0) - atomic64_add(allocbt_blks, &mp->m_allocbt_blks); + atomic64_add(allocbt_blks, + &pag->pag_mount->m_allocbt_blks); } #ifdef DEBUG - else if (!xfs_is_shutdown(mp)) { + else if (!xfs_is_shutdown(pag->pag_mount)) { ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); @@ -3147,6 +3140,10 @@ xfs_alloc_read_agf( be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); } #endif + if (agfbpp) + *agfbpp = agfbp; + else + xfs_trans_brelse(tp, agfbp); return 0; } diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 1c14a0b1abea..2c3f762dfb58 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -88,7 +88,6 @@ typedef struct xfs_alloc_arg { #define XFS_ALLOC_NOBUSY (1 << 2)/* Busy extents not allowed */ /* freespace limit calculations */ -#define XFS_ALLOC_AGFL_RESERVE 4 unsigned int xfs_alloc_set_aside(struct xfs_mount *mp); unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp); @@ -96,6 +95,11 @@ xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_perag *pag, xfs_extlen_t need, xfs_extlen_t reserved); unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp, struct xfs_perag *pag); +int xfs_alloc_get_freelist(struct xfs_perag *pag, struct xfs_trans *tp, + struct xfs_buf *agfbp, xfs_agblock_t *bnop, int btreeblk); +int xfs_alloc_put_freelist(struct xfs_perag *pag, struct xfs_trans *tp, + struct xfs_buf *agfbp, struct xfs_buf *agflbp, + xfs_agblock_t bno, int btreeblk); /* * Compute and fill in value of m_alloc_maxlevels. @@ -105,56 +109,13 @@ xfs_alloc_compute_maxlevels( struct xfs_mount *mp); /* file system mount structure */ /* - * Get a block from the freelist. - * Returns with the buffer for the block gotten. - */ -int /* error */ -xfs_alloc_get_freelist( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_buf *agbp, /* buffer containing the agf structure */ - xfs_agblock_t *bnop, /* block address retrieved from freelist */ - int btreeblk); /* destination is a AGF btree */ - -/* * Log the given fields from the agf structure. */ void xfs_alloc_log_agf( struct xfs_trans *tp, /* transaction pointer */ struct xfs_buf *bp, /* buffer for a.g. freelist header */ - int fields);/* mask of fields to be logged (XFS_AGF_...) */ - -/* - * Interface for inode allocation to force the pag data to be initialized. - */ -int /* error */ -xfs_alloc_pagf_init( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - int flags); /* XFS_ALLOC_FLAGS_... */ - -/* - * Put the block on the freelist for the allocation group. - */ -int /* error */ -xfs_alloc_put_freelist( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_buf *agbp, /* buffer for a.g. freelist header */ - struct xfs_buf *agflbp,/* buffer for a.g. free block array */ - xfs_agblock_t bno, /* block being freed */ - int btreeblk); /* owner was a AGF btree */ - -/* - * Read in the allocation group header (free/alloc section). - */ -int /* error */ -xfs_alloc_read_agf( - struct xfs_mount *mp, /* mount point structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - int flags, /* XFS_ALLOC_FLAG_... */ - struct xfs_buf **bpp); /* buffer for the ag freelist header */ + uint32_t fields);/* mask of fields to be logged (XFS_AGF_...) */ /* * Allocate an extent (variable-size). @@ -207,10 +168,12 @@ xfs_alloc_get_rec( xfs_extlen_t *len, /* output: length of extent */ int *stat); /* output: success/failure */ -int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); -int xfs_alloc_read_agfl(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, struct xfs_buf **bpp); +int xfs_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags, + struct xfs_buf **agfbpp); +int xfs_alloc_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags, + struct xfs_buf **agfbpp); +int xfs_alloc_read_agfl(struct xfs_perag *pag, struct xfs_trans *tp, + struct xfs_buf **bpp); int xfs_free_agfl_block(struct xfs_trans *, xfs_agnumber_t, xfs_agblock_t, struct xfs_buf *, struct xfs_owner_info *); int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags); diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 8c9f73cc0bee..549a3cba0234 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -60,8 +60,8 @@ xfs_allocbt_alloc_block( xfs_agblock_t bno; /* Allocate the new block from the freelist. If we can't, give up. */ - error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp, - &bno, 1); + error = xfs_alloc_get_freelist(cur->bc_ag.pag, cur->bc_tp, + cur->bc_ag.agbp, &bno, 1); if (error) return error; @@ -71,7 +71,7 @@ xfs_allocbt_alloc_block( } atomic64_inc(&cur->bc_mp->m_allocbt_blks); - xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agbp->b_pag, bno, 1, false); + xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.pag, bno, 1, false); new->s = cpu_to_be32(bno); @@ -89,7 +89,8 @@ xfs_allocbt_free_block( int error; bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp)); - error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); + error = xfs_alloc_put_freelist(cur->bc_ag.pag, cur->bc_tp, agbp, NULL, + bno, 1); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 23523b802539..e28d93d232de 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -24,6 +24,10 @@ #include "xfs_quota.h" #include "xfs_trans_space.h" #include "xfs_trace.h" +#include "xfs_attr_item.h" +#include "xfs_xattr.h" + +struct kmem_cache *xfs_attr_intent_cache; /* * xfs_attr.c @@ -46,33 +50,27 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args); STATIC int xfs_attr_leaf_get(xfs_da_args_t *args); STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args); STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp); -STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args, struct xfs_buf *bp); +STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args); /* * Internal routines when attribute list is more than one block. */ STATIC int xfs_attr_node_get(xfs_da_args_t *args); STATIC void xfs_attr_restore_rmt_blk(struct xfs_da_args *args); -STATIC int xfs_attr_node_addname(struct xfs_delattr_context *dac); -STATIC int xfs_attr_node_addname_find_attr(struct xfs_delattr_context *dac); -STATIC int xfs_attr_node_addname_clear_incomplete( - struct xfs_delattr_context *dac); -STATIC int xfs_attr_node_hasname(xfs_da_args_t *args, - struct xfs_da_state **state); -STATIC int xfs_attr_fillstate(xfs_da_state_t *state); -STATIC int xfs_attr_refillstate(xfs_da_state_t *state); -STATIC int xfs_attr_set_iter(struct xfs_delattr_context *dac, - struct xfs_buf **leaf_bp); -STATIC int xfs_attr_node_removename(struct xfs_da_args *args, - struct xfs_da_state *state); +static int xfs_attr_node_try_addname(struct xfs_attr_intent *attr); +STATIC int xfs_attr_node_addname_find_attr(struct xfs_attr_intent *attr); +STATIC int xfs_attr_node_remove_attr(struct xfs_attr_intent *attr); +STATIC int xfs_attr_node_lookup(struct xfs_da_args *args, + struct xfs_da_state *state); int xfs_inode_hasattr( struct xfs_inode *ip) { - if (!XFS_IFORK_Q(ip) || - (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && - ip->i_afp->if_nextents == 0)) + if (!xfs_inode_has_attr_fork(ip)) + return 0; + if (ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS && + ip->i_af.if_nextents == 0) return 0; return 1; } @@ -85,7 +83,7 @@ bool xfs_attr_is_leaf( struct xfs_inode *ip) { - struct xfs_ifork *ifp = ip->i_afp; + struct xfs_ifork *ifp = &ip->i_af; struct xfs_iext_cursor icur; struct xfs_bmbt_irec imap; @@ -97,6 +95,123 @@ xfs_attr_is_leaf( return imap.br_startoff == 0 && imap.br_blockcount == 1; } +/* + * XXX (dchinner): name path state saving and refilling is an optimisation to + * avoid needing to look up name entries after rolling transactions removing + * remote xattr blocks between the name entry lookup and name entry removal. + * This optimisation got sidelined when combining the set and remove state + * machines, but the code has been left in place because it is worthwhile to + * restore the optimisation once the combined state machine paths have settled. + * + * This comment is a public service announcement to remind Future Dave that he + * still needs to restore this code to working order. + */ +#if 0 +/* + * Fill in the disk block numbers in the state structure for the buffers + * that are attached to the state structure. + * This is done so that we can quickly reattach ourselves to those buffers + * after some set of transaction commits have released these buffers. + */ +static int +xfs_attr_fillstate(xfs_da_state_t *state) +{ + xfs_da_state_path_t *path; + xfs_da_state_blk_t *blk; + int level; + + trace_xfs_attr_fillstate(state->args); + + /* + * Roll down the "path" in the state structure, storing the on-disk + * block number for those buffers in the "path". + */ + path = &state->path; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->bp) { + blk->disk_blkno = xfs_buf_daddr(blk->bp); + blk->bp = NULL; + } else { + blk->disk_blkno = 0; + } + } + + /* + * Roll down the "altpath" in the state structure, storing the on-disk + * block number for those buffers in the "altpath". + */ + path = &state->altpath; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->bp) { + blk->disk_blkno = xfs_buf_daddr(blk->bp); + blk->bp = NULL; + } else { + blk->disk_blkno = 0; + } + } + + return 0; +} + +/* + * Reattach the buffers to the state structure based on the disk block + * numbers stored in the state structure. + * This is done after some set of transaction commits have released those + * buffers from our grip. + */ +static int +xfs_attr_refillstate(xfs_da_state_t *state) +{ + xfs_da_state_path_t *path; + xfs_da_state_blk_t *blk; + int level, error; + + trace_xfs_attr_refillstate(state->args); + + /* + * Roll down the "path" in the state structure, storing the on-disk + * block number for those buffers in the "path". + */ + path = &state->path; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->disk_blkno) { + error = xfs_da3_node_read_mapped(state->args->trans, + state->args->dp, blk->disk_blkno, + &blk->bp, XFS_ATTR_FORK); + if (error) + return error; + } else { + blk->bp = NULL; + } + } + + /* + * Roll down the "altpath" in the state structure, storing the on-disk + * block number for those buffers in the "altpath". + */ + path = &state->altpath; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->disk_blkno) { + error = xfs_da3_node_read_mapped(state->args->trans, + state->args->dp, blk->disk_blkno, + &blk->bp, XFS_ATTR_FORK); + if (error) + return error; + } else { + blk->bp = NULL; + } + } + + return 0; +} +#else +static int xfs_attr_fillstate(xfs_da_state_t *state) { return 0; } +#endif + /*======================================================================== * Overall external interface routines. *========================================================================*/ @@ -114,7 +229,7 @@ xfs_attr_get_ilocked( if (!xfs_inode_hasattr(args->dp)) return -ENOATTR; - if (args->dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) + if (args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) return xfs_attr_shortform_getvalue(args); if (xfs_attr_is_leaf(args->dp)) return xfs_attr_leaf_get(args); @@ -166,7 +281,7 @@ xfs_attr_get( /* * Calculate how many blocks we need for the new attribute, */ -STATIC int +int xfs_attr_calc_size( struct xfs_da_args *args, int *local) @@ -199,6 +314,33 @@ xfs_attr_calc_size( return nblks; } +/* Initialize transaction reservation for attr operations */ +void +xfs_init_attr_trans( + struct xfs_da_args *args, + struct xfs_trans_res *tres, + unsigned int *total) +{ + struct xfs_mount *mp = args->dp->i_mount; + + if (args->value) { + tres->tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * + args->total; + tres->tr_logcount = XFS_ATTRSET_LOG_COUNT; + tres->tr_logflags = XFS_TRANS_PERM_LOG_RES; + *total = args->total; + } else { + *tres = M_RES(mp)->tr_attrrm; + *total = XFS_ATTRRM_SPACE_RES(mp); + } +} + +/* + * Add an attr to a shortform fork. If there is no space, + * xfs_attr_shortform_addname() will convert to leaf format and return -ENOSPC. + * to use. + */ STATIC int xfs_attr_try_sf_addname( struct xfs_inode *dp, @@ -210,7 +352,7 @@ xfs_attr_try_sf_addname( /* * Build initial attribute list (if required). */ - if (dp->i_afp->if_format == XFS_DINODE_FMT_EXTENTS) + if (dp->i_af.if_format == XFS_DINODE_FMT_EXTENTS) xfs_attr_shortform_create(args); error = xfs_attr_shortform_addname(args); @@ -230,411 +372,477 @@ xfs_attr_try_sf_addname( return error; } -/* - * Check to see if the attr should be upgraded from non-existent or shortform to - * single-leaf-block attribute list. - */ -static inline bool -xfs_attr_is_shortform( - struct xfs_inode *ip) +static int +xfs_attr_sf_addname( + struct xfs_attr_intent *attr) { - return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL || - (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && - ip->i_afp->if_nextents == 0); + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_inode *dp = args->dp; + int error = 0; + + error = xfs_attr_try_sf_addname(dp, args); + if (error != -ENOSPC) { + ASSERT(!error || error == -EEXIST); + attr->xattri_dela_state = XFS_DAS_DONE; + goto out; + } + + /* + * It won't fit in the shortform, transform to a leaf block. GROT: + * another possible req'mt for a double-split btree op. + */ + error = xfs_attr_shortform_to_leaf(args); + if (error) + return error; + + attr->xattri_dela_state = XFS_DAS_LEAF_ADD; +out: + trace_xfs_attr_sf_addname_return(attr->xattri_dela_state, args->dp); + return error; } /* - * Checks to see if a delayed attribute transaction should be rolled. If so, - * transaction is finished or rolled as needed. + * Handle the state change on completion of a multi-state attr operation. + * + * If the XFS_DA_OP_REPLACE flag is set, this means the operation was the first + * modification in a attr replace operation and we still have to do the second + * state, indicated by @replace_state. + * + * We consume the XFS_DA_OP_REPLACE flag so that when we are called again on + * completion of the second half of the attr replace operation we correctly + * signal that it is done. */ -STATIC int -xfs_attr_trans_roll( - struct xfs_delattr_context *dac) +static enum xfs_delattr_state +xfs_attr_complete_op( + struct xfs_attr_intent *attr, + enum xfs_delattr_state replace_state) { - struct xfs_da_args *args = dac->da_args; - int error; + struct xfs_da_args *args = attr->xattri_da_args; + bool do_replace = args->op_flags & XFS_DA_OP_REPLACE; + + args->op_flags &= ~XFS_DA_OP_REPLACE; + if (do_replace) { + args->attr_filter &= ~XFS_ATTR_INCOMPLETE; + return replace_state; + } + return XFS_DAS_DONE; +} + +static int +xfs_attr_leaf_addname( + struct xfs_attr_intent *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + int error; + + ASSERT(xfs_attr_is_leaf(args->dp)); + + /* + * Use the leaf buffer we may already hold locked as a result of + * a sf-to-leaf conversion. + */ + error = xfs_attr_leaf_try_add(args); + + if (error == -ENOSPC) { + error = xfs_attr3_leaf_to_node(args); + if (error) + return error; - if (dac->flags & XFS_DAC_DEFER_FINISH) { /* - * The caller wants us to finish all the deferred ops so that we - * avoid pinning the log tail with a large number of deferred - * ops. + * We're not in leaf format anymore, so roll the transaction and + * retry the add to the newly allocated node block. */ - dac->flags &= ~XFS_DAC_DEFER_FINISH; - error = xfs_defer_finish(&args->trans); - } else - error = xfs_trans_roll_inode(&args->trans, args->dp); + attr->xattri_dela_state = XFS_DAS_NODE_ADD; + goto out; + } + if (error) + return error; + /* + * We need to commit and roll if we need to allocate remote xattr blocks + * or perform more xattr manipulations. Otherwise there is nothing more + * to do and we can return success. + */ + if (args->rmtblkno) + attr->xattri_dela_state = XFS_DAS_LEAF_SET_RMT; + else + attr->xattri_dela_state = xfs_attr_complete_op(attr, + XFS_DAS_LEAF_REPLACE); +out: + trace_xfs_attr_leaf_addname_return(attr->xattri_dela_state, args->dp); return error; } /* - * Set the attribute specified in @args. + * Add an entry to a node format attr tree. + * + * Note that we might still have a leaf here - xfs_attr_is_leaf() cannot tell + * the difference between leaf + remote attr blocks and a node format tree, + * so we may still end up having to convert from leaf to node format here. */ -int -xfs_attr_set_args( - struct xfs_da_args *args) +static int +xfs_attr_node_addname( + struct xfs_attr_intent *attr) { - struct xfs_buf *leaf_bp = NULL; - int error = 0; - struct xfs_delattr_context dac = { - .da_args = args, - }; + struct xfs_da_args *args = attr->xattri_da_args; + int error; - do { - error = xfs_attr_set_iter(&dac, &leaf_bp); - if (error != -EAGAIN) - break; + error = xfs_attr_node_addname_find_attr(attr); + if (error) + return error; - error = xfs_attr_trans_roll(&dac); - if (error) { - if (leaf_bp) - xfs_trans_brelse(args->trans, leaf_bp); + error = xfs_attr_node_try_addname(attr); + if (error == -ENOSPC) { + error = xfs_attr3_leaf_to_node(args); + if (error) return error; - } - } while (true); + /* + * No state change, we really are in node form now + * but we need the transaction rolled to continue. + */ + goto out; + } + if (error) + return error; + if (args->rmtblkno) + attr->xattri_dela_state = XFS_DAS_NODE_SET_RMT; + else + attr->xattri_dela_state = xfs_attr_complete_op(attr, + XFS_DAS_NODE_REPLACE); +out: + trace_xfs_attr_node_addname_return(attr->xattri_dela_state, args->dp); return error; } -STATIC int -xfs_attr_sf_addname( - struct xfs_delattr_context *dac, - struct xfs_buf **leaf_bp) +static int +xfs_attr_rmtval_alloc( + struct xfs_attr_intent *attr) { - struct xfs_da_args *args = dac->da_args; - struct xfs_inode *dp = args->dp; + struct xfs_da_args *args = attr->xattri_da_args; int error = 0; /* - * Try to add the attr to the attribute list in the inode. + * If there was an out-of-line value, allocate the blocks we + * identified for its storage and copy the value. This is done + * after we create the attribute so that we don't overflow the + * maximum size of a transaction and/or hit a deadlock. */ - error = xfs_attr_try_sf_addname(dp, args); + if (attr->xattri_blkcnt > 0) { + error = xfs_attr_rmtval_set_blk(attr); + if (error) + return error; + /* Roll the transaction only if there is more to allocate. */ + if (attr->xattri_blkcnt > 0) + goto out; + } - /* Should only be 0, -EEXIST or -ENOSPC */ - if (error != -ENOSPC) + error = xfs_attr_rmtval_set_value(args); + if (error) return error; + attr->xattri_dela_state = xfs_attr_complete_op(attr, + ++attr->xattri_dela_state); /* - * It won't fit in the shortform, transform to a leaf block. GROT: - * another possible req'mt for a double-split btree op. + * If we are not doing a rename, we've finished the operation but still + * have to clear the incomplete flag protecting the new attr from + * exposing partially initialised state if we crash during creation. */ - error = xfs_attr_shortform_to_leaf(args, leaf_bp); - if (error) - return error; + if (attr->xattri_dela_state == XFS_DAS_DONE) + error = xfs_attr3_leaf_clearflag(args); +out: + trace_xfs_attr_rmtval_alloc(attr->xattri_dela_state, args->dp); + return error; +} + +/* + * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers + * for later deletion of the entry. + */ +static int +xfs_attr_leaf_mark_incomplete( + struct xfs_da_args *args, + struct xfs_da_state *state) +{ + int error; /* - * Prevent the leaf buffer from being unlocked so that a concurrent AIL - * push cannot grab the half-baked leaf buffer and run into problems - * with the write verifier. + * Fill in disk block numbers in the state structure + * so that we can get the buffers back after we commit + * several transactions in the following calls. */ - xfs_trans_bhold(args->trans, *leaf_bp); + error = xfs_attr_fillstate(state); + if (error) + return error; /* - * We're still in XFS_DAS_UNINIT state here. We've converted - * the attr fork to leaf format and will restart with the leaf - * add. + * Mark the attribute as INCOMPLETE */ - trace_xfs_attr_sf_addname_return(XFS_DAS_UNINIT, args->dp); - dac->flags |= XFS_DAC_DEFER_FINISH; - return -EAGAIN; + return xfs_attr3_leaf_setflag(args); +} + +/* Ensure the da state of an xattr deferred work item is ready to go. */ +static inline void +xfs_attr_item_init_da_state( + struct xfs_attr_intent *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + + if (!attr->xattri_da_state) + attr->xattri_da_state = xfs_da_state_alloc(args); + else + xfs_da_state_reset(attr->xattri_da_state, args); } /* - * Set the attribute specified in @args. - * This routine is meant to function as a delayed operation, and may return - * -EAGAIN when the transaction needs to be rolled. Calling functions will need - * to handle this, and recall the function until a successful error code is - * returned. + * Initial setup for xfs_attr_node_removename. Make sure the attr is there and + * the blocks are valid. Attr keys with remote blocks will be marked + * incomplete. */ -int -xfs_attr_set_iter( - struct xfs_delattr_context *dac, - struct xfs_buf **leaf_bp) +static +int xfs_attr_node_removename_setup( + struct xfs_attr_intent *attr) { - struct xfs_da_args *args = dac->da_args; - struct xfs_inode *dp = args->dp; - struct xfs_buf *bp = NULL; - int forkoff, error = 0; - - /* State machine switch */ - switch (dac->dela_state) { - case XFS_DAS_UNINIT: - /* - * If the fork is shortform, attempt to add the attr. If there - * is no space, this converts to leaf format and returns - * -EAGAIN with the leaf buffer held across the roll. The caller - * will deal with a transaction roll error, but otherwise - * release the hold once we return with a clean transaction. - */ - if (xfs_attr_is_shortform(dp)) - return xfs_attr_sf_addname(dac, leaf_bp); - if (*leaf_bp != NULL) { - xfs_trans_bhold_release(args->trans, *leaf_bp); - *leaf_bp = NULL; - } + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_da_state *state; + int error; - if (xfs_attr_is_leaf(dp)) { - error = xfs_attr_leaf_try_add(args, *leaf_bp); - if (error == -ENOSPC) { - error = xfs_attr3_leaf_to_node(args); - if (error) - return error; - - /* - * Finish any deferred work items and roll the - * transaction once more. The goal here is to - * call node_addname with the inode and - * transaction in the same state (inode locked - * and joined, transaction clean) no matter how - * we got to this step. - * - * At this point, we are still in - * XFS_DAS_UNINIT, but when we come back, we'll - * be a node, so we'll fall down into the node - * handling code below - */ - dac->flags |= XFS_DAC_DEFER_FINISH; - trace_xfs_attr_set_iter_return( - dac->dela_state, args->dp); - return -EAGAIN; - } else if (error) { - return error; - } + xfs_attr_item_init_da_state(attr); + error = xfs_attr_node_lookup(args, attr->xattri_da_state); + if (error != -EEXIST) + goto out; + error = 0; - dac->dela_state = XFS_DAS_FOUND_LBLK; - } else { - error = xfs_attr_node_addname_find_attr(dac); - if (error) - return error; + state = attr->xattri_da_state; + ASSERT(state->path.blk[state->path.active - 1].bp != NULL); + ASSERT(state->path.blk[state->path.active - 1].magic == + XFS_ATTR_LEAF_MAGIC); - error = xfs_attr_node_addname(dac); - if (error) - return error; + error = xfs_attr_leaf_mark_incomplete(args, state); + if (error) + goto out; + if (args->rmtblkno > 0) + error = xfs_attr_rmtval_invalidate(args); +out: + if (error) { + xfs_da_state_free(attr->xattri_da_state); + attr->xattri_da_state = NULL; + } - dac->dela_state = XFS_DAS_FOUND_NBLK; - } - trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); - return -EAGAIN; - case XFS_DAS_FOUND_LBLK: - /* - * If there was an out-of-line value, allocate the blocks we - * identified for its storage and copy the value. This is done - * after we create the attribute so that we don't overflow the - * maximum size of a transaction and/or hit a deadlock. - */ + return error; +} - /* Open coded xfs_attr_rmtval_set without trans handling */ - if ((dac->flags & XFS_DAC_LEAF_ADDNAME_INIT) == 0) { - dac->flags |= XFS_DAC_LEAF_ADDNAME_INIT; - if (args->rmtblkno > 0) { - error = xfs_attr_rmtval_find_space(dac); - if (error) - return error; - } - } +/* + * Remove the original attr we have just replaced. This is dependent on the + * original lookup and insert placing the old attr in args->blkno/args->index + * and the new attr in args->blkno2/args->index2. + */ +static int +xfs_attr_leaf_remove_attr( + struct xfs_attr_intent *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_inode *dp = args->dp; + struct xfs_buf *bp = NULL; + int forkoff; + int error; - /* - * Repeat allocating remote blocks for the attr value until - * blkcnt drops to zero. - */ - if (dac->blkcnt > 0) { - error = xfs_attr_rmtval_set_blk(dac); - if (error) - return error; - trace_xfs_attr_set_iter_return(dac->dela_state, - args->dp); - return -EAGAIN; - } + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, + &bp); + if (error) + return error; - error = xfs_attr_rmtval_set_value(args); - if (error) - return error; + xfs_attr3_leaf_remove(bp, args); - /* - * If this is not a rename, clear the incomplete flag and we're - * done. - */ - if (!(args->op_flags & XFS_DA_OP_RENAME)) { - if (args->rmtblkno > 0) - error = xfs_attr3_leaf_clearflag(args); - return error; - } + forkoff = xfs_attr_shortform_allfit(bp, dp); + if (forkoff) + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ - /* - * If this is an atomic rename operation, we must "flip" the - * incomplete flags on the "new" and "old" attribute/value pairs - * so that one disappears and one appears atomically. Then we - * must remove the "old" attribute/value pair. - * - * In a separate transaction, set the incomplete flag on the - * "old" attr and clear the incomplete flag on the "new" attr. - */ - error = xfs_attr3_leaf_flipflags(args); - if (error) - return error; - /* - * Commit the flag value change and start the next trans in - * series. - */ - dac->dela_state = XFS_DAS_FLIP_LFLAG; - trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); - return -EAGAIN; - case XFS_DAS_FLIP_LFLAG: - /* - * Dismantle the "old" attribute/value pair by removing a - * "remote" value (if it exists). - */ - xfs_attr_restore_rmt_blk(args); - error = xfs_attr_rmtval_invalidate(args); - if (error) - return error; + return error; +} - fallthrough; - case XFS_DAS_RM_LBLK: - /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */ - dac->dela_state = XFS_DAS_RM_LBLK; - if (args->rmtblkno) { - error = xfs_attr_rmtval_remove(dac); - if (error == -EAGAIN) - trace_xfs_attr_set_iter_return( - dac->dela_state, args->dp); - if (error) - return error; +/* + * Shrink an attribute from leaf to shortform. Used by the node format remove + * path when the node format collapses to a single block and so we have to check + * if it can be collapsed further. + */ +static int +xfs_attr_leaf_shrink( + struct xfs_da_args *args) +{ + struct xfs_inode *dp = args->dp; + struct xfs_buf *bp; + int forkoff; + int error; - dac->dela_state = XFS_DAS_RD_LEAF; - trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); - return -EAGAIN; - } + if (!xfs_attr_is_leaf(dp)) + return 0; - fallthrough; - case XFS_DAS_RD_LEAF: - /* - * This is the last step for leaf format. Read the block with - * the old attr, remove the old attr, check for shortform - * conversion and return. - */ - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, - &bp); - if (error) - return error; + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); + if (error) + return error; - xfs_attr3_leaf_remove(bp, args); + forkoff = xfs_attr_shortform_allfit(bp, dp); + if (forkoff) { + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ + } else { + xfs_trans_brelse(args->trans, bp); + } - forkoff = xfs_attr_shortform_allfit(bp, dp); - if (forkoff) - error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); - /* bp is gone due to xfs_da_shrink_inode */ + return error; +} - return error; +/* + * Run the attribute operation specified in @attr. + * + * This routine is meant to function as a delayed operation and will set the + * state to XFS_DAS_DONE when the operation is complete. Calling functions will + * need to handle this, and recall the function until either an error or + * XFS_DAS_DONE is detected. + */ +int +xfs_attr_set_iter( + struct xfs_attr_intent *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + int error = 0; - case XFS_DAS_FOUND_NBLK: - /* - * Find space for remote blocks and fall into the allocation - * state. - */ - if (args->rmtblkno > 0) { - error = xfs_attr_rmtval_find_space(dac); - if (error) - return error; + /* State machine switch */ +next_state: + switch (attr->xattri_dela_state) { + case XFS_DAS_UNINIT: + ASSERT(0); + return -EFSCORRUPTED; + case XFS_DAS_SF_ADD: + return xfs_attr_sf_addname(attr); + case XFS_DAS_LEAF_ADD: + return xfs_attr_leaf_addname(attr); + case XFS_DAS_NODE_ADD: + return xfs_attr_node_addname(attr); + + case XFS_DAS_SF_REMOVE: + error = xfs_attr_sf_removename(args); + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); + break; + case XFS_DAS_LEAF_REMOVE: + error = xfs_attr_leaf_removename(args); + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); + break; + case XFS_DAS_NODE_REMOVE: + error = xfs_attr_node_removename_setup(attr); + if (error == -ENOATTR && + (args->op_flags & XFS_DA_OP_RECOVERY)) { + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); + error = 0; + break; } + if (error) + return error; + attr->xattri_dela_state = XFS_DAS_NODE_REMOVE_RMT; + if (args->rmtblkno == 0) + attr->xattri_dela_state++; + break; + case XFS_DAS_LEAF_SET_RMT: + case XFS_DAS_NODE_SET_RMT: + error = xfs_attr_rmtval_find_space(attr); + if (error) + return error; + attr->xattri_dela_state++; fallthrough; - case XFS_DAS_ALLOC_NODE: - /* - * If there was an out-of-line value, allocate the blocks we - * identified for its storage and copy the value. This is done - * after we create the attribute so that we don't overflow the - * maximum size of a transaction and/or hit a deadlock. - */ - dac->dela_state = XFS_DAS_ALLOC_NODE; - if (args->rmtblkno > 0) { - if (dac->blkcnt > 0) { - error = xfs_attr_rmtval_set_blk(dac); - if (error) - return error; - trace_xfs_attr_set_iter_return( - dac->dela_state, args->dp); - return -EAGAIN; - } - - error = xfs_attr_rmtval_set_value(args); - if (error) - return error; - } - /* - * If this was not a rename, clear the incomplete flag and we're - * done. - */ - if (!(args->op_flags & XFS_DA_OP_RENAME)) { - if (args->rmtblkno > 0) - error = xfs_attr3_leaf_clearflag(args); - goto out; - } + case XFS_DAS_LEAF_ALLOC_RMT: + case XFS_DAS_NODE_ALLOC_RMT: + error = xfs_attr_rmtval_alloc(attr); + if (error) + return error; + if (attr->xattri_dela_state == XFS_DAS_DONE) + break; + goto next_state; + case XFS_DAS_LEAF_REPLACE: + case XFS_DAS_NODE_REPLACE: /* - * If this is an atomic rename operation, we must "flip" the - * incomplete flags on the "new" and "old" attribute/value pairs - * so that one disappears and one appears atomically. Then we - * must remove the "old" attribute/value pair. - * - * In a separate transaction, set the incomplete flag on the - * "old" attr and clear the incomplete flag on the "new" attr. + * We must "flip" the incomplete flags on the "new" and "old" + * attribute/value pairs so that one disappears and one appears + * atomically. */ error = xfs_attr3_leaf_flipflags(args); if (error) - goto out; + return error; /* - * Commit the flag value change and start the next trans in - * series + * We must commit the flag value change now to make it atomic + * and then we can start the next trans in series at REMOVE_OLD. */ - dac->dela_state = XFS_DAS_FLIP_NFLAG; - trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); - return -EAGAIN; + attr->xattri_dela_state++; + break; - case XFS_DAS_FLIP_NFLAG: + case XFS_DAS_LEAF_REMOVE_OLD: + case XFS_DAS_NODE_REMOVE_OLD: /* - * Dismantle the "old" attribute/value pair by removing a - * "remote" value (if it exists). + * If we have a remote attr, start the process of removing it + * by invalidating any cached buffers. + * + * If we don't have a remote attr, we skip the remote block + * removal state altogether with a second state increment. */ xfs_attr_restore_rmt_blk(args); - - error = xfs_attr_rmtval_invalidate(args); - if (error) - return error; - - fallthrough; - case XFS_DAS_RM_NBLK: - /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */ - dac->dela_state = XFS_DAS_RM_NBLK; if (args->rmtblkno) { - error = xfs_attr_rmtval_remove(dac); - if (error == -EAGAIN) - trace_xfs_attr_set_iter_return( - dac->dela_state, args->dp); - + error = xfs_attr_rmtval_invalidate(args); if (error) return error; + } else { + attr->xattri_dela_state++; + } - dac->dela_state = XFS_DAS_CLR_FLAG; - trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); - return -EAGAIN; + attr->xattri_dela_state++; + goto next_state; + + case XFS_DAS_LEAF_REMOVE_RMT: + case XFS_DAS_NODE_REMOVE_RMT: + error = xfs_attr_rmtval_remove(attr); + if (error == -EAGAIN) { + error = 0; + break; } + if (error) + return error; - fallthrough; - case XFS_DAS_CLR_FLAG: /* - * The last state for node format. Look up the old attr and - * remove it. + * We've finished removing the remote attr blocks, so commit the + * transaction and move on to removing the attr name from the + * leaf/node block. Removing the attr might require a full + * transaction reservation for btree block freeing, so we + * can't do that in the same transaction where we removed the + * remote attr blocks. */ - error = xfs_attr_node_addname_clear_incomplete(dac); + attr->xattri_dela_state++; + break; + + case XFS_DAS_LEAF_REMOVE_ATTR: + error = xfs_attr_leaf_remove_attr(attr); + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); + break; + + case XFS_DAS_NODE_REMOVE_ATTR: + error = xfs_attr_node_remove_attr(attr); + if (!error) + error = xfs_attr_leaf_shrink(args); + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); break; default: ASSERT(0); break; } -out: + + trace_xfs_attr_set_iter_return(attr->xattri_dela_state, args->dp); return error; } @@ -648,12 +856,13 @@ xfs_attr_lookup( { struct xfs_inode *dp = args->dp; struct xfs_buf *bp = NULL; + struct xfs_da_state *state; int error; if (!xfs_inode_hasattr(dp)) return -ENOATTR; - if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) + if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) return xfs_attr_sf_findname(args, NULL, NULL); if (xfs_attr_is_leaf(dp)) { @@ -665,33 +874,85 @@ xfs_attr_lookup( return error; } - return xfs_attr_node_hasname(args, NULL); + state = xfs_da_state_alloc(args); + error = xfs_attr_node_lookup(args, state); + xfs_da_state_free(state); + return error; } -/* - * Remove the attribute specified in @args. - */ -int -xfs_attr_remove_args( +static int +xfs_attr_intent_init( + struct xfs_da_args *args, + unsigned int op_flags, /* op flag (set or remove) */ + struct xfs_attr_intent **attr) /* new xfs_attr_intent */ +{ + + struct xfs_attr_intent *new; + + new = kmem_cache_zalloc(xfs_attr_intent_cache, GFP_NOFS | __GFP_NOFAIL); + new->xattri_op_flags = op_flags; + new->xattri_da_args = args; + + *attr = new; + return 0; +} + +/* Sets an attribute for an inode as a deferred operation */ +static int +xfs_attr_defer_add( struct xfs_da_args *args) { - int error; - struct xfs_delattr_context dac = { - .da_args = args, - }; + struct xfs_attr_intent *new; + int error = 0; - do { - error = xfs_attr_remove_iter(&dac); - if (error != -EAGAIN) - break; + error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_SET, &new); + if (error) + return error; - error = xfs_attr_trans_roll(&dac); - if (error) - return error; + new->xattri_dela_state = xfs_attr_init_add_state(args); + xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); + trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp); - } while (true); + return 0; +} - return error; +/* Sets an attribute for an inode as a deferred operation */ +static int +xfs_attr_defer_replace( + struct xfs_da_args *args) +{ + struct xfs_attr_intent *new; + int error = 0; + + error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REPLACE, &new); + if (error) + return error; + + new->xattri_dela_state = xfs_attr_init_replace_state(args); + xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); + trace_xfs_attr_defer_replace(new->xattri_dela_state, args->dp); + + return 0; +} + +/* Removes an attribute for an inode as a deferred operation */ +static int +xfs_attr_defer_remove( + struct xfs_da_args *args) +{ + + struct xfs_attr_intent *new; + int error; + + error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REMOVE, &new); + if (error) + return error; + + new->xattri_dela_state = xfs_attr_init_remove_state(args); + xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); + trace_xfs_attr_defer_remove(new->xattri_dela_state, args->dp); + + return 0; } /* @@ -724,21 +985,21 @@ xfs_attr_set( /* * We have no control over the attribute names that userspace passes us * to remove, so we have to allow the name lookup prior to attribute - * removal to fail as well. + * removal to fail as well. Preserve the logged flag, since we need + * to pass that through to the logging code. */ - args->op_flags = XFS_DA_OP_OKNOENT; + args->op_flags = XFS_DA_OP_OKNOENT | + (args->op_flags & XFS_DA_OP_LOGGED); if (args->value) { XFS_STATS_INC(mp, xs_attr_set); - - args->op_flags |= XFS_DA_OP_ADDNAME; args->total = xfs_attr_calc_size(args, &local); /* * If the inode doesn't have an attribute fork, add one. * (inode must not be locked when we call this routine) */ - if (XFS_IFORK_Q(dp) == 0) { + if (xfs_inode_has_attr_fork(dp) == 0) { int sf_size = sizeof(struct xfs_attr_sf_hdr) + xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); @@ -748,20 +1009,10 @@ xfs_attr_set( return error; } - tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + - M_RES(mp)->tr_attrsetrt.tr_logres * - args->total; - tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - total = args->total; - if (!local) rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen); } else { XFS_STATS_INC(mp, xs_attr_remove); - - tres = M_RES(mp)->tr_attrrm; - total = XFS_ATTRRM_SPACE_RES(mp); rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX); } @@ -769,6 +1020,7 @@ xfs_attr_set( * Root fork attributes can use reserved data blocks for this * operation if necessary */ + xfs_init_attr_trans(args, &tres, &total); error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans); if (error) return error; @@ -776,33 +1028,43 @@ xfs_attr_set( if (args->value || xfs_inode_hasattr(dp)) { error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK, XFS_IEXT_ATTR_MANIP_CNT(rmt_blks)); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(args->trans, dp, + XFS_IEXT_ATTR_MANIP_CNT(rmt_blks)); if (error) goto out_trans_cancel; } error = xfs_attr_lookup(args); - if (args->value) { - if (error == -EEXIST && (args->attr_flags & XATTR_CREATE)) - goto out_trans_cancel; - if (error == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) - goto out_trans_cancel; - if (error != -ENOATTR && error != -EEXIST) + switch (error) { + case -EEXIST: + /* if no value, we are performing a remove operation */ + if (!args->value) { + error = xfs_attr_defer_remove(args); + break; + } + /* Pure create fails if the attr already exists */ + if (args->attr_flags & XATTR_CREATE) goto out_trans_cancel; - error = xfs_attr_set_args(args); - if (error) - goto out_trans_cancel; - /* shortform attribute has already been committed */ - if (!args->trans) - goto out_unlock; - } else { - if (error != -EEXIST) + error = xfs_attr_defer_replace(args); + break; + case -ENOATTR: + /* Can't remove what isn't there. */ + if (!args->value) goto out_trans_cancel; - error = xfs_attr_remove_args(args); - if (error) + /* Pure replace fails if no existing attr to replace. */ + if (args->attr_flags & XATTR_REPLACE) goto out_trans_cancel; + + error = xfs_attr_defer_add(args); + break; + default: + goto out_trans_cancel; } + if (error) + goto out_trans_cancel; /* * If this is a synchronous mount, make sure that the @@ -837,7 +1099,7 @@ static inline int xfs_attr_sf_totsize(struct xfs_inode *dp) { struct xfs_attr_shortform *sf; - sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data; return be16_to_cpu(sf->hdr.totsize); } @@ -845,28 +1107,41 @@ static inline int xfs_attr_sf_totsize(struct xfs_inode *dp) * Add a name to the shortform attribute list structure * This is the external routine. */ -STATIC int -xfs_attr_shortform_addname(xfs_da_args_t *args) +static int +xfs_attr_shortform_addname( + struct xfs_da_args *args) { - int newsize, forkoff, retval; + int newsize, forkoff; + int error; trace_xfs_attr_sf_addname(args); - retval = xfs_attr_shortform_lookup(args); - if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) - return retval; - if (retval == -EEXIST) { - if (args->attr_flags & XATTR_CREATE) - return retval; - retval = xfs_attr_sf_removename(args); - if (retval) - return retval; + error = xfs_attr_shortform_lookup(args); + switch (error) { + case -ENOATTR: + if (args->op_flags & XFS_DA_OP_REPLACE) + return error; + break; + case -EEXIST: + if (!(args->op_flags & XFS_DA_OP_REPLACE)) + return error; + + error = xfs_attr_sf_removename(args); + if (error) + return error; + /* - * Since we have removed the old attr, clear ATTR_REPLACE so - * that the leaf format add routine won't trip over the attr - * not being around. + * Since we have removed the old attr, clear XFS_DA_OP_REPLACE + * so that the new attr doesn't fit in shortform format, the + * leaf format add routine won't trip over the attr not being + * around. */ - args->attr_flags &= ~XATTR_REPLACE; + args->op_flags &= ~XFS_DA_OP_REPLACE; + break; + case 0: + break; + default: + return error; } if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX || @@ -889,8 +1164,8 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) * External routines when attribute list is one block *========================================================================*/ -/* Store info about a remote block */ -STATIC void +/* Save the current remote block info and clear the current pointers. */ +static void xfs_attr_save_rmt_blk( struct xfs_da_args *args) { @@ -899,10 +1174,13 @@ xfs_attr_save_rmt_blk( args->rmtblkno2 = args->rmtblkno; args->rmtblkcnt2 = args->rmtblkcnt; args->rmtvaluelen2 = args->rmtvaluelen; + args->rmtblkno = 0; + args->rmtblkcnt = 0; + args->rmtvaluelen = 0; } /* Set stored info about a remote block */ -STATIC void +static void xfs_attr_restore_rmt_blk( struct xfs_da_args *args) { @@ -925,48 +1203,47 @@ xfs_attr_restore_rmt_blk( */ STATIC int xfs_attr_leaf_try_add( - struct xfs_da_args *args, - struct xfs_buf *bp) + struct xfs_da_args *args) { - int retval; + struct xfs_buf *bp; + int error; + + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); + if (error) + return error; /* - * Look up the given attribute in the leaf block. Figure out if - * the given flags produce an error or call for an atomic rename. + * Look up the xattr name to set the insertion point for the new xattr. */ - retval = xfs_attr_leaf_hasname(args, &bp); - if (retval != -ENOATTR && retval != -EEXIST) - return retval; - if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) - goto out_brelse; - if (retval == -EEXIST) { - if (args->attr_flags & XATTR_CREATE) + error = xfs_attr3_leaf_lookup_int(bp, args); + switch (error) { + case -ENOATTR: + if (args->op_flags & XFS_DA_OP_REPLACE) + goto out_brelse; + break; + case -EEXIST: + if (!(args->op_flags & XFS_DA_OP_REPLACE)) goto out_brelse; trace_xfs_attr_leaf_replace(args); - - /* save the attribute state for later removal*/ - args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ - xfs_attr_save_rmt_blk(args); - /* - * clear the remote attr state now that it is saved so that the - * values reflect the state of the attribute we are about to + * Save the existing remote attr state so that the current + * values reflect the state of the new attribute we are about to * add, not the attribute we just found and will remove later. */ - args->rmtblkno = 0; - args->rmtblkcnt = 0; - args->rmtvaluelen = 0; + xfs_attr_save_rmt_blk(args); + break; + case 0: + break; + default: + goto out_brelse; } - /* - * Add the attribute to the leaf block - */ return xfs_attr3_leaf_add(bp, args); out_brelse: xfs_trans_brelse(args->trans, bp); - return retval; + return error; } /* @@ -1012,9 +1289,10 @@ xfs_attr_leaf_removename( dp = args->dp; error = xfs_attr_leaf_hasname(args, &bp); - if (error == -ENOATTR) { xfs_trans_brelse(args->trans, bp); + if (args->op_flags & XFS_DA_OP_RECOVERY) + return 0; return error; } else if (error != -EEXIST) return error; @@ -1062,32 +1340,20 @@ xfs_attr_leaf_get(xfs_da_args_t *args) return error; } -/* - * Return EEXIST if attr is found, or ENOATTR if not - * statep: If not null is set to point at the found state. Caller will - * be responsible for freeing the state in this case. - */ +/* Return EEXIST if attr is found, or ENOATTR if not. */ STATIC int -xfs_attr_node_hasname( +xfs_attr_node_lookup( struct xfs_da_args *args, - struct xfs_da_state **statep) + struct xfs_da_state *state) { - struct xfs_da_state *state; int retval, error; - state = xfs_da_state_alloc(args); - if (statep != NULL) - *statep = state; - /* * Search to see if name exists, and get back a pointer to it. */ error = xfs_da3_node_lookup_int(state, &retval); if (error) - retval = error; - - if (!statep) - xfs_da_state_free(state); + return error; return retval; } @@ -1098,46 +1364,48 @@ xfs_attr_node_hasname( STATIC int xfs_attr_node_addname_find_attr( - struct xfs_delattr_context *dac) + struct xfs_attr_intent *attr) { - struct xfs_da_args *args = dac->da_args; - int retval; + struct xfs_da_args *args = attr->xattri_da_args; + int error; /* * Search to see if name already exists, and get back a pointer * to where it should go. */ - retval = xfs_attr_node_hasname(args, &dac->da_state); - if (retval != -ENOATTR && retval != -EEXIST) - goto error; - - if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) - goto error; - if (retval == -EEXIST) { - if (args->attr_flags & XATTR_CREATE) + xfs_attr_item_init_da_state(attr); + error = xfs_attr_node_lookup(args, attr->xattri_da_state); + switch (error) { + case -ENOATTR: + if (args->op_flags & XFS_DA_OP_REPLACE) + goto error; + break; + case -EEXIST: + if (!(args->op_flags & XFS_DA_OP_REPLACE)) goto error; - trace_xfs_attr_node_replace(args); - - /* save the attribute state for later removal*/ - args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ - xfs_attr_save_rmt_blk(args); + trace_xfs_attr_node_replace(args); /* - * clear the remote attr state now that it is saved so that the - * values reflect the state of the attribute we are about to + * Save the existing remote attr state so that the current + * values reflect the state of the new attribute we are about to * add, not the attribute we just found and will remove later. */ - args->rmtblkno = 0; - args->rmtblkcnt = 0; - args->rmtvaluelen = 0; + xfs_attr_save_rmt_blk(args); + break; + case 0: + break; + default: + goto error; } return 0; error: - if (dac->da_state) - xfs_da_state_free(dac->da_state); - return retval; + if (attr->xattri_da_state) { + xfs_da_state_free(attr->xattri_da_state); + attr->xattri_da_state = NULL; + } + return error; } /* @@ -1146,25 +1414,16 @@ error: * This will involve walking down the Btree, and may involve splitting * leaf nodes and even splitting intermediate nodes up to and including * the root node (a special case of an intermediate node). - * - * "Remote" attribute values confuse the issue and atomic rename operations - * add a whole extra layer of confusion on top of that. - * - * This routine is meant to function as a delayed operation, and may return - * -EAGAIN when the transaction needs to be rolled. Calling functions will need - * to handle this, and recall the function until a successful error code is - *returned. */ -STATIC int -xfs_attr_node_addname( - struct xfs_delattr_context *dac) +static int +xfs_attr_node_try_addname( + struct xfs_attr_intent *attr) { - struct xfs_da_args *args = dac->da_args; - struct xfs_da_state *state = dac->da_state; + struct xfs_da_state *state = attr->xattri_da_state; struct xfs_da_state_blk *blk; int error; - trace_xfs_attr_node_addname(args); + trace_xfs_attr_node_addname(state->args); blk = &state->path.blk[state->path.active-1]; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); @@ -1175,25 +1434,9 @@ xfs_attr_node_addname( /* * Its really a single leaf node, but it had * out-of-line values so it looked like it *might* - * have been a b-tree. + * have been a b-tree. Let the caller deal with this. */ - xfs_da_state_free(state); - state = NULL; - error = xfs_attr3_leaf_to_node(args); - if (error) - goto out; - - /* - * Now that we have converted the leaf to a node, we can - * roll the transaction, and try xfs_attr3_leaf_add - * again on re-entry. No need to set dela_state to do - * this. dela_state is still unset by this function at - * this point. - */ - dac->flags |= XFS_DAC_DEFER_FINISH; - trace_xfs_attr_node_addname_return( - dac->dela_state, args->dp); - return -EAGAIN; + goto out; } /* @@ -1205,7 +1448,6 @@ xfs_attr_node_addname( error = xfs_da3_split(state); if (error) goto out; - dac->flags |= XFS_DAC_DEFER_FINISH; } else { /* * Addition succeeded, update Btree hashvals. @@ -1214,148 +1456,12 @@ xfs_attr_node_addname( } out: - if (state) - xfs_da_state_free(state); - return error; -} - - -STATIC int -xfs_attr_node_addname_clear_incomplete( - struct xfs_delattr_context *dac) -{ - struct xfs_da_args *args = dac->da_args; - struct xfs_da_state *state = NULL; - int retval = 0; - int error = 0; - - /* - * Re-find the "old" attribute entry after any split ops. The INCOMPLETE - * flag means that we will find the "old" attr, not the "new" one. - */ - args->attr_filter |= XFS_ATTR_INCOMPLETE; - state = xfs_da_state_alloc(args); - state->inleaf = 0; - error = xfs_da3_node_lookup_int(state, &retval); - if (error) - goto out; - - error = xfs_attr_node_removename(args, state); - - /* - * Check to see if the tree needs to be collapsed. - */ - if (retval && (state->path.active > 1)) { - error = xfs_da3_join(state); - if (error) - goto out; - } - retval = error = 0; - -out: - if (state) - xfs_da_state_free(state); - if (error) - return error; - return retval; -} - -/* - * Shrink an attribute from leaf to shortform - */ -STATIC int -xfs_attr_node_shrink( - struct xfs_da_args *args, - struct xfs_da_state *state) -{ - struct xfs_inode *dp = args->dp; - int error, forkoff; - struct xfs_buf *bp; - - /* - * Have to get rid of the copy of this dabuf in the state. - */ - ASSERT(state->path.active == 1); - ASSERT(state->path.blk[0].bp); - state->path.blk[0].bp = NULL; - - error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); - if (error) - return error; - - forkoff = xfs_attr_shortform_allfit(bp, dp); - if (forkoff) { - error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); - /* bp is gone due to xfs_da_shrink_inode */ - } else - xfs_trans_brelse(args->trans, bp); - - return error; -} - -/* - * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers - * for later deletion of the entry. - */ -STATIC int -xfs_attr_leaf_mark_incomplete( - struct xfs_da_args *args, - struct xfs_da_state *state) -{ - int error; - - /* - * Fill in disk block numbers in the state structure - * so that we can get the buffers back after we commit - * several transactions in the following calls. - */ - error = xfs_attr_fillstate(state); - if (error) - return error; - - /* - * Mark the attribute as INCOMPLETE - */ - return xfs_attr3_leaf_setflag(args); -} - -/* - * Initial setup for xfs_attr_node_removename. Make sure the attr is there and - * the blocks are valid. Attr keys with remote blocks will be marked - * incomplete. - */ -STATIC -int xfs_attr_node_removename_setup( - struct xfs_delattr_context *dac) -{ - struct xfs_da_args *args = dac->da_args; - struct xfs_da_state **state = &dac->da_state; - int error; - - error = xfs_attr_node_hasname(args, state); - if (error != -EEXIST) - goto out; - error = 0; - - ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL); - ASSERT((*state)->path.blk[(*state)->path.active - 1].magic == - XFS_ATTR_LEAF_MAGIC); - - if (args->rmtblkno > 0) { - error = xfs_attr_leaf_mark_incomplete(args, *state); - if (error) - goto out; - - error = xfs_attr_rmtval_invalidate(args); - } -out: - if (error) - xfs_da_state_free(*state); - + xfs_da_state_free(state); + attr->xattri_da_state = NULL; return error; } -STATIC int +static int xfs_attr_node_removename( struct xfs_da_args *args, struct xfs_da_state *state) @@ -1374,246 +1480,42 @@ xfs_attr_node_removename( return retval; } -/* - * Remove the attribute specified in @args. - * - * This will involve walking down the Btree, and may involve joining - * leaf nodes and even joining intermediate nodes up to and including - * the root node (a special case of an intermediate node). - * - * This routine is meant to function as either an in-line or delayed operation, - * and may return -EAGAIN when the transaction needs to be rolled. Calling - * functions will need to handle this, and call the function until a - * successful error code is returned. - */ -int -xfs_attr_remove_iter( - struct xfs_delattr_context *dac) -{ - struct xfs_da_args *args = dac->da_args; - struct xfs_da_state *state = dac->da_state; - int retval, error = 0; - struct xfs_inode *dp = args->dp; - - trace_xfs_attr_node_removename(args); - - switch (dac->dela_state) { - case XFS_DAS_UNINIT: - if (!xfs_inode_hasattr(dp)) - return -ENOATTR; - - /* - * Shortform or leaf formats don't require transaction rolls and - * thus state transitions. Call the right helper and return. - */ - if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) - return xfs_attr_sf_removename(args); - - if (xfs_attr_is_leaf(dp)) - return xfs_attr_leaf_removename(args); - - /* - * Node format may require transaction rolls. Set up the - * state context and fall into the state machine. - */ - if (!dac->da_state) { - error = xfs_attr_node_removename_setup(dac); - if (error) - return error; - state = dac->da_state; - } - - fallthrough; - case XFS_DAS_RMTBLK: - dac->dela_state = XFS_DAS_RMTBLK; - - /* - * If there is an out-of-line value, de-allocate the blocks. - * This is done before we remove the attribute so that we don't - * overflow the maximum size of a transaction and/or hit a - * deadlock. - */ - if (args->rmtblkno > 0) { - /* - * May return -EAGAIN. Roll and repeat until all remote - * blocks are removed. - */ - error = xfs_attr_rmtval_remove(dac); - if (error == -EAGAIN) { - trace_xfs_attr_remove_iter_return( - dac->dela_state, args->dp); - return error; - } else if (error) { - goto out; - } - - /* - * Refill the state structure with buffers (the prior - * calls released our buffers) and close out this - * transaction before proceeding. - */ - ASSERT(args->rmtblkno == 0); - error = xfs_attr_refillstate(state); - if (error) - goto out; - dac->dela_state = XFS_DAS_RM_NAME; - dac->flags |= XFS_DAC_DEFER_FINISH; - trace_xfs_attr_remove_iter_return(dac->dela_state, args->dp); - return -EAGAIN; - } - - fallthrough; - case XFS_DAS_RM_NAME: - /* - * If we came here fresh from a transaction roll, reattach all - * the buffers to the current transaction. - */ - if (dac->dela_state == XFS_DAS_RM_NAME) { - error = xfs_attr_refillstate(state); - if (error) - goto out; - } - - retval = xfs_attr_node_removename(args, state); - - /* - * Check to see if the tree needs to be collapsed. If so, roll - * the transacton and fall into the shrink state. - */ - if (retval && (state->path.active > 1)) { - error = xfs_da3_join(state); - if (error) - goto out; - - dac->flags |= XFS_DAC_DEFER_FINISH; - dac->dela_state = XFS_DAS_RM_SHRINK; - trace_xfs_attr_remove_iter_return( - dac->dela_state, args->dp); - return -EAGAIN; - } - - fallthrough; - case XFS_DAS_RM_SHRINK: - /* - * If the result is small enough, push it all into the inode. - * This is our final state so it's safe to return a dirty - * transaction. - */ - if (xfs_attr_is_leaf(dp)) - error = xfs_attr_node_shrink(args, state); - ASSERT(error != -EAGAIN); - break; - default: - ASSERT(0); - error = -EINVAL; - goto out; - } -out: - if (state) - xfs_da_state_free(state); - return error; -} - -/* - * Fill in the disk block numbers in the state structure for the buffers - * that are attached to the state structure. - * This is done so that we can quickly reattach ourselves to those buffers - * after some set of transaction commits have released these buffers. - */ -STATIC int -xfs_attr_fillstate(xfs_da_state_t *state) +static int +xfs_attr_node_remove_attr( + struct xfs_attr_intent *attr) { - xfs_da_state_path_t *path; - xfs_da_state_blk_t *blk; - int level; - - trace_xfs_attr_fillstate(state->args); - - /* - * Roll down the "path" in the state structure, storing the on-disk - * block number for those buffers in the "path". - */ - path = &state->path; - ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); - for (blk = path->blk, level = 0; level < path->active; blk++, level++) { - if (blk->bp) { - blk->disk_blkno = xfs_buf_daddr(blk->bp); - blk->bp = NULL; - } else { - blk->disk_blkno = 0; - } - } + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_da_state *state = xfs_da_state_alloc(args); + int retval = 0; + int error = 0; /* - * Roll down the "altpath" in the state structure, storing the on-disk - * block number for those buffers in the "altpath". + * The attr we are removing has already been marked incomplete, so + * we need to set the filter appropriately to re-find the "old" + * attribute entry after any split ops. */ - path = &state->altpath; - ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); - for (blk = path->blk, level = 0; level < path->active; blk++, level++) { - if (blk->bp) { - blk->disk_blkno = xfs_buf_daddr(blk->bp); - blk->bp = NULL; - } else { - blk->disk_blkno = 0; - } - } - - return 0; -} - -/* - * Reattach the buffers to the state structure based on the disk block - * numbers stored in the state structure. - * This is done after some set of transaction commits have released those - * buffers from our grip. - */ -STATIC int -xfs_attr_refillstate(xfs_da_state_t *state) -{ - xfs_da_state_path_t *path; - xfs_da_state_blk_t *blk; - int level, error; - - trace_xfs_attr_refillstate(state->args); + args->attr_filter |= XFS_ATTR_INCOMPLETE; + error = xfs_da3_node_lookup_int(state, &retval); + if (error) + goto out; - /* - * Roll down the "path" in the state structure, storing the on-disk - * block number for those buffers in the "path". - */ - path = &state->path; - ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); - for (blk = path->blk, level = 0; level < path->active; blk++, level++) { - if (blk->disk_blkno) { - error = xfs_da3_node_read_mapped(state->args->trans, - state->args->dp, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK); - if (error) - return error; - } else { - blk->bp = NULL; - } - } + error = xfs_attr_node_removename(args, state); /* - * Roll down the "altpath" in the state structure, storing the on-disk - * block number for those buffers in the "altpath". + * Check to see if the tree needs to be collapsed. */ - path = &state->altpath; - ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); - for (blk = path->blk, level = 0; level < path->active; blk++, level++) { - if (blk->disk_blkno) { - error = xfs_da3_node_read_mapped(state->args->trans, - state->args->dp, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK); - if (error) - return error; - } else { - blk->bp = NULL; - } + if (retval && (state->path.active > 1)) { + error = xfs_da3_join(state); + if (error) + goto out; } + retval = error = 0; - return 0; +out: + xfs_da_state_free(state); + if (error) + return error; + return retval; } /* @@ -1639,7 +1541,8 @@ xfs_attr_node_get( /* * Search to see if name exists, and get back a pointer to it. */ - error = xfs_attr_node_hasname(args, &state); + state = xfs_da_state_alloc(args); + error = xfs_attr_node_lookup(args, state); if (error != -EEXIST) goto out_release; @@ -1653,13 +1556,12 @@ xfs_attr_node_get( * If not in a transaction, we have to release all the buffers. */ out_release: - for (i = 0; state != NULL && i < state->path.active; i++) { + for (i = 0; i < state->path.active; i++) { xfs_trans_brelse(args->trans, state->path.blk[i].bp); state->path.blk[i].bp = NULL; } - if (state) - xfs_da_state_free(state); + xfs_da_state_free(state); return error; } @@ -1679,3 +1581,20 @@ xfs_attr_namecheck( /* There shouldn't be any nulls here */ return !memchr(name, 0, length); } + +int __init +xfs_attr_intent_init_cache(void) +{ + xfs_attr_intent_cache = kmem_cache_create("xfs_attr_intent", + sizeof(struct xfs_attr_intent), + 0, 0, NULL); + + return xfs_attr_intent_cache != NULL ? 0 : -ENOMEM; +} + +void +xfs_attr_intent_destroy_cache(void) +{ + kmem_cache_destroy(xfs_attr_intent_cache); + xfs_attr_intent_cache = NULL; +} diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 5e71f719bdd5..81be9b3e4004 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -425,7 +425,7 @@ struct xfs_attr_list_context { */ /* - * Enum values for xfs_delattr_context.da_state + * Enum values for xfs_attr_intent.xattri_da_state * * These values are used by delayed attribute operations to keep track of where * they were before they returned -EAGAIN. A return code of -EAGAIN signals the @@ -434,46 +434,102 @@ struct xfs_attr_list_context { * to where it was and resume executing where it left off. */ enum xfs_delattr_state { - XFS_DAS_UNINIT = 0, /* No state has been set yet */ - XFS_DAS_RMTBLK, /* Removing remote blks */ - XFS_DAS_RM_NAME, /* Remove attr name */ - XFS_DAS_RM_SHRINK, /* We are shrinking the tree */ - XFS_DAS_FOUND_LBLK, /* We found leaf blk for attr */ - XFS_DAS_FOUND_NBLK, /* We found node blk for attr */ - XFS_DAS_FLIP_LFLAG, /* Flipped leaf INCOMPLETE attr flag */ - XFS_DAS_RM_LBLK, /* A rename is removing leaf blocks */ - XFS_DAS_RD_LEAF, /* Read in the new leaf */ - XFS_DAS_ALLOC_NODE, /* We are allocating node blocks */ - XFS_DAS_FLIP_NFLAG, /* Flipped node INCOMPLETE attr flag */ - XFS_DAS_RM_NBLK, /* A rename is removing node blocks */ - XFS_DAS_CLR_FLAG, /* Clear incomplete flag */ + XFS_DAS_UNINIT = 0, /* No state has been set yet */ + + /* + * Initial sequence states. The replace setup code relies on the + * ADD and REMOVE states for a specific format to be sequential so + * that we can transform the initial operation to be performed + * according to the xfs_has_larp() state easily. + */ + XFS_DAS_SF_ADD, /* Initial sf add state */ + XFS_DAS_SF_REMOVE, /* Initial sf replace/remove state */ + + XFS_DAS_LEAF_ADD, /* Initial leaf add state */ + XFS_DAS_LEAF_REMOVE, /* Initial leaf replace/remove state */ + + XFS_DAS_NODE_ADD, /* Initial node add state */ + XFS_DAS_NODE_REMOVE, /* Initial node replace/remove state */ + + /* Leaf state set/replace/remove sequence */ + XFS_DAS_LEAF_SET_RMT, /* set a remote xattr from a leaf */ + XFS_DAS_LEAF_ALLOC_RMT, /* We are allocating remote blocks */ + XFS_DAS_LEAF_REPLACE, /* Perform replace ops on a leaf */ + XFS_DAS_LEAF_REMOVE_OLD, /* Start removing old attr from leaf */ + XFS_DAS_LEAF_REMOVE_RMT, /* A rename is removing remote blocks */ + XFS_DAS_LEAF_REMOVE_ATTR, /* Remove the old attr from a leaf */ + + /* Node state sequence, must match leaf state above */ + XFS_DAS_NODE_SET_RMT, /* set a remote xattr from a node */ + XFS_DAS_NODE_ALLOC_RMT, /* We are allocating remote blocks */ + XFS_DAS_NODE_REPLACE, /* Perform replace ops on a node */ + XFS_DAS_NODE_REMOVE_OLD, /* Start removing old attr from node */ + XFS_DAS_NODE_REMOVE_RMT, /* A rename is removing remote blocks */ + XFS_DAS_NODE_REMOVE_ATTR, /* Remove the old attr from a node */ + + XFS_DAS_DONE, /* finished operation */ }; -/* - * Defines for xfs_delattr_context.flags - */ -#define XFS_DAC_DEFER_FINISH 0x01 /* finish the transaction */ -#define XFS_DAC_LEAF_ADDNAME_INIT 0x02 /* xfs_attr_leaf_addname init*/ +#define XFS_DAS_STRINGS \ + { XFS_DAS_UNINIT, "XFS_DAS_UNINIT" }, \ + { XFS_DAS_SF_ADD, "XFS_DAS_SF_ADD" }, \ + { XFS_DAS_SF_REMOVE, "XFS_DAS_SF_REMOVE" }, \ + { XFS_DAS_LEAF_ADD, "XFS_DAS_LEAF_ADD" }, \ + { XFS_DAS_LEAF_REMOVE, "XFS_DAS_LEAF_REMOVE" }, \ + { XFS_DAS_NODE_ADD, "XFS_DAS_NODE_ADD" }, \ + { XFS_DAS_NODE_REMOVE, "XFS_DAS_NODE_REMOVE" }, \ + { XFS_DAS_LEAF_SET_RMT, "XFS_DAS_LEAF_SET_RMT" }, \ + { XFS_DAS_LEAF_ALLOC_RMT, "XFS_DAS_LEAF_ALLOC_RMT" }, \ + { XFS_DAS_LEAF_REPLACE, "XFS_DAS_LEAF_REPLACE" }, \ + { XFS_DAS_LEAF_REMOVE_OLD, "XFS_DAS_LEAF_REMOVE_OLD" }, \ + { XFS_DAS_LEAF_REMOVE_RMT, "XFS_DAS_LEAF_REMOVE_RMT" }, \ + { XFS_DAS_LEAF_REMOVE_ATTR, "XFS_DAS_LEAF_REMOVE_ATTR" }, \ + { XFS_DAS_NODE_SET_RMT, "XFS_DAS_NODE_SET_RMT" }, \ + { XFS_DAS_NODE_ALLOC_RMT, "XFS_DAS_NODE_ALLOC_RMT" }, \ + { XFS_DAS_NODE_REPLACE, "XFS_DAS_NODE_REPLACE" }, \ + { XFS_DAS_NODE_REMOVE_OLD, "XFS_DAS_NODE_REMOVE_OLD" }, \ + { XFS_DAS_NODE_REMOVE_RMT, "XFS_DAS_NODE_REMOVE_RMT" }, \ + { XFS_DAS_NODE_REMOVE_ATTR, "XFS_DAS_NODE_REMOVE_ATTR" }, \ + { XFS_DAS_DONE, "XFS_DAS_DONE" } + +struct xfs_attri_log_nameval; /* * Context used for keeping track of delayed attribute operations */ -struct xfs_delattr_context { - struct xfs_da_args *da_args; - - /* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */ - struct xfs_bmbt_irec map; - xfs_dablk_t lblkno; - int blkcnt; +struct xfs_attr_intent { + /* + * used to log this item to an intent containing a list of attrs to + * commit later + */ + struct list_head xattri_list; /* Used in xfs_attr_node_removename to roll through removing blocks */ - struct xfs_da_state *da_state; + struct xfs_da_state *xattri_da_state; + + struct xfs_da_args *xattri_da_args; + + /* + * Shared buffer containing the attr name and value so that the logging + * code can share large memory buffers between log items. + */ + struct xfs_attri_log_nameval *xattri_nameval; /* Used to keep track of current state of delayed operation */ - unsigned int flags; - enum xfs_delattr_state dela_state; + enum xfs_delattr_state xattri_dela_state; + + /* + * Attr operation being performed - XFS_ATTRI_OP_FLAGS_* + */ + unsigned int xattri_op_flags; + + /* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */ + xfs_dablk_t xattri_lblkno; + int xattri_blkcnt; + struct xfs_bmbt_irec xattri_map; }; + /*======================================================================== * Function prototypes for the kernel. *========================================================================*/ @@ -489,11 +545,77 @@ bool xfs_attr_is_leaf(struct xfs_inode *ip); int xfs_attr_get_ilocked(struct xfs_da_args *args); int xfs_attr_get(struct xfs_da_args *args); int xfs_attr_set(struct xfs_da_args *args); -int xfs_attr_set_args(struct xfs_da_args *args); -int xfs_attr_remove_args(struct xfs_da_args *args); -int xfs_attr_remove_iter(struct xfs_delattr_context *dac); +int xfs_attr_set_iter(struct xfs_attr_intent *attr); +int xfs_attr_remove_iter(struct xfs_attr_intent *attr); bool xfs_attr_namecheck(const void *name, size_t length); -void xfs_delattr_context_init(struct xfs_delattr_context *dac, - struct xfs_da_args *args); +int xfs_attr_calc_size(struct xfs_da_args *args, int *local); +void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres, + unsigned int *total); + +/* + * Check to see if the attr should be upgraded from non-existent or shortform to + * single-leaf-block attribute list. + */ +static inline bool +xfs_attr_is_shortform( + struct xfs_inode *ip) +{ + return ip->i_af.if_format == XFS_DINODE_FMT_LOCAL || + (ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS && + ip->i_af.if_nextents == 0); +} + +static inline enum xfs_delattr_state +xfs_attr_init_add_state(struct xfs_da_args *args) +{ + /* + * When called from the completion of a attr remove to determine the + * next state, the attribute fork may be null. This can occur only occur + * on a pure remove, but we grab the next state before we check if a + * replace operation is being performed. If we are called from any other + * context, i_af is guaranteed to exist. Hence if the attr fork is + * null, we were called from a pure remove operation and so we are done. + */ + if (!xfs_inode_has_attr_fork(args->dp)) + return XFS_DAS_DONE; + + args->op_flags |= XFS_DA_OP_ADDNAME; + if (xfs_attr_is_shortform(args->dp)) + return XFS_DAS_SF_ADD; + if (xfs_attr_is_leaf(args->dp)) + return XFS_DAS_LEAF_ADD; + return XFS_DAS_NODE_ADD; +} + +static inline enum xfs_delattr_state +xfs_attr_init_remove_state(struct xfs_da_args *args) +{ + args->op_flags |= XFS_DA_OP_REMOVE; + if (xfs_attr_is_shortform(args->dp)) + return XFS_DAS_SF_REMOVE; + if (xfs_attr_is_leaf(args->dp)) + return XFS_DAS_LEAF_REMOVE; + return XFS_DAS_NODE_REMOVE; +} + +/* + * If we are logging the attributes, then we have to start with removal of the + * old attribute so that there is always consistent state that we can recover + * from if the system goes down part way through. We always log the new attr + * value, so even when we remove the attr first we still have the information in + * the log to finish the replace operation atomically. + */ +static inline enum xfs_delattr_state +xfs_attr_init_replace_state(struct xfs_da_args *args) +{ + args->op_flags |= XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE; + if (args->op_flags & XFS_DA_OP_LOGGED) + return xfs_attr_init_remove_state(args); + return xfs_attr_init_add_state(args); +} + +extern struct kmem_cache *xfs_attr_intent_cache; +int __init xfs_attr_intent_init_cache(void); +void xfs_attr_intent_destroy_cache(void); #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 014daa8c542d..beee51ad75ce 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -28,6 +28,7 @@ #include "xfs_dir2.h" #include "xfs_log.h" #include "xfs_ag.h" +#include "xfs_errortag.h" /* @@ -288,6 +289,23 @@ xfs_attr3_leaf_verify_entry( return NULL; } +/* + * Validate an attribute leaf block. + * + * Empty leaf blocks can occur under the following circumstances: + * + * 1. setxattr adds a new extended attribute to a file; + * 2. The file has zero existing attributes; + * 3. The attribute is too large to fit in the attribute fork; + * 4. The attribute is small enough to fit in a leaf block; + * 5. A log flush occurs after committing the transaction that creates + * the (empty) leaf block; and + * 6. The filesystem goes down after the log flush but before the new + * attribute can be committed to the leaf block. + * + * Hence we need to ensure that we don't fail the validation purely + * because the leaf is empty. + */ static xfs_failaddr_t xfs_attr3_leaf_verify( struct xfs_buf *bp) @@ -445,6 +463,14 @@ xfs_attr3_leaf_read( * Namespace helper routines *========================================================================*/ +/* + * If we are in log recovery, then we want the lookup to ignore the INCOMPLETE + * flag on disk - if there's an incomplete attr then recovery needs to tear it + * down. If there's no incomplete attr, then recovery needs to tear that attr + * down to replace it with the attr that has been logged. In this case, the + * INCOMPLETE flag will not be set in attr->attr_filter, but rather + * XFS_DA_OP_RECOVERY will be set in args->op_flags. + */ static bool xfs_attr_match( struct xfs_da_args *args, @@ -452,14 +478,18 @@ xfs_attr_match( unsigned char *name, int flags) { + if (args->namelen != namelen) return false; if (memcmp(args->name, name, namelen) != 0) return false; - /* - * If we are looking for incomplete entries, show only those, else only - * show complete entries. - */ + + /* Recovery ignores the INCOMPLETE flag. */ + if ((args->op_flags & XFS_DA_OP_RECOVERY) && + args->attr_filter == (flags & XFS_ATTR_NSP_ONDISK_MASK)) + return true; + + /* All remaining matches need to be filtered by INCOMPLETE state. */ if (args->attr_filter != (flags & (XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE))) return false; @@ -560,7 +590,7 @@ xfs_attr_shortform_bytesfit( * to real extents, or the delalloc conversion will take care of the * literal area rebalancing. */ - if (bytes <= XFS_IFORK_ASIZE(dp)) + if (bytes <= xfs_inode_attr_fork_size(dp)) return dp->i_forkoff; /* @@ -652,7 +682,7 @@ xfs_attr_shortform_create( struct xfs_da_args *args) { struct xfs_inode *dp = args->dp; - struct xfs_ifork *ifp = dp->i_afp; + struct xfs_ifork *ifp = &dp->i_af; struct xfs_attr_sf_hdr *hdr; trace_xfs_attr_sf_create(args); @@ -689,7 +719,7 @@ xfs_attr_sf_findname( int end; int i; - sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data; sfe = &sf->list[0]; end = sf->hdr.count; for (i = 0; i < end; sfe = xfs_attr_sf_nextentry(sfe), @@ -734,7 +764,7 @@ xfs_attr_shortform_add( mp = dp->i_mount; dp->i_forkoff = forkoff; - ifp = dp->i_afp; + ifp = &dp->i_af; ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST) @@ -767,11 +797,9 @@ xfs_attr_fork_remove( struct xfs_inode *ip, struct xfs_trans *tp) { - ASSERT(ip->i_afp->if_nextents == 0); + ASSERT(ip->i_af.if_nextents == 0); - xfs_idestroy_fork(ip->i_afp); - kmem_cache_free(xfs_ifork_cache, ip->i_afp); - ip->i_afp = NULL; + xfs_ifork_zap_attr(ip); ip->i_forkoff = 0; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -795,9 +823,17 @@ xfs_attr_sf_removename( dp = args->dp; mp = dp->i_mount; - sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data; error = xfs_attr_sf_findname(args, &sfe, &base); + + /* + * If we are recovering an operation, finding nothing to + * remove is not an error - it just means there was nothing + * to clean up. + */ + if (error == -ENOATTR && (args->op_flags & XFS_DA_OP_RECOVERY)) + return 0; if (error != -EEXIST) return error; size = xfs_attr_sf_entsize(sfe); @@ -818,7 +854,7 @@ xfs_attr_sf_removename( totsize -= size; if (totsize == sizeof(xfs_attr_sf_hdr_t) && xfs_has_attr2(mp) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && - !(args->op_flags & XFS_DA_OP_ADDNAME)) { + !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) { xfs_attr_fork_remove(dp, args->trans); } else { xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); @@ -851,7 +887,7 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) trace_xfs_attr_sf_lookup(args); - ifp = args->dp->i_afp; + ifp = &args->dp->i_af; ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; sfe = &sf->list[0]; @@ -879,8 +915,8 @@ xfs_attr_shortform_getvalue( struct xfs_attr_sf_entry *sfe; int i; - ASSERT(args->dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL); - sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data; + ASSERT(args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL); + sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data; sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; sfe = xfs_attr_sf_nextentry(sfe), i++) { @@ -892,14 +928,10 @@ xfs_attr_shortform_getvalue( return -ENOATTR; } -/* - * Convert from using the shortform to the leaf. On success, return the - * buffer so that we can keep it locked until we're totally done with it. - */ +/* Convert from using the shortform to the leaf format. */ int xfs_attr_shortform_to_leaf( - struct xfs_da_args *args, - struct xfs_buf **leaf_bp) + struct xfs_da_args *args) { struct xfs_inode *dp; struct xfs_attr_shortform *sf; @@ -914,7 +946,7 @@ xfs_attr_shortform_to_leaf( trace_xfs_attr_sf_to_leaf(args); dp = args->dp; - ifp = dp->i_afp; + ifp = &dp->i_af; sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; size = be16_to_cpu(sf->hdr.totsize); tmpbuffer = kmem_alloc(size, 0); @@ -961,7 +993,6 @@ xfs_attr_shortform_to_leaf( sfe = xfs_attr_sf_nextentry(sfe); } error = 0; - *leaf_bp = bp; out: kmem_free(tmpbuffer); return error; @@ -1022,8 +1053,8 @@ xfs_attr_shortform_verify( int i; int64_t size; - ASSERT(ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL); - ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK); + ASSERT(ip->i_af.if_format == XFS_DINODE_FMT_LOCAL); + ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK); sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data; size = ifp->if_bytes; @@ -1127,9 +1158,17 @@ xfs_attr3_leaf_to_shortform( goto out; if (forkoff == -1) { - ASSERT(xfs_has_attr2(dp->i_mount)); - ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE); - xfs_attr_fork_remove(dp, args->trans); + /* + * Don't remove the attr fork if this operation is the first + * part of a attr replace operations. We're going to add a new + * attr immediately, so we need to keep the attr fork around in + * this case. + */ + if (!(args->op_flags & XFS_DA_OP_REPLACE)) { + ASSERT(xfs_has_attr2(dp->i_mount)); + ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE); + xfs_attr_fork_remove(dp, args->trans); + } goto out; } @@ -1189,6 +1228,11 @@ xfs_attr3_leaf_to_node( trace_xfs_attr_leaf_to_node(args); + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) { + error = -EIO; + goto out; + } + error = xfs_da_grow_inode(args, &blkno); if (error) goto out; @@ -1486,8 +1530,9 @@ xfs_attr3_leaf_add_work( entry->flags = args->attr_filter; if (tmp) entry->flags |= XFS_ATTR_LOCAL; - if (args->op_flags & XFS_DA_OP_RENAME) { - entry->flags |= XFS_ATTR_INCOMPLETE; + if (args->op_flags & XFS_DA_OP_REPLACE) { + if (!(args->op_flags & XFS_DA_OP_LOGGED)) + entry->flags |= XFS_ATTR_INCOMPLETE; if ((args->blkno2 == args->blkno) && (args->index2 <= args->index)) { args->index2++; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index efa757f1e912..368f4d9fa1d5 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -49,8 +49,7 @@ void xfs_attr_shortform_create(struct xfs_da_args *args); void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff); int xfs_attr_shortform_lookup(struct xfs_da_args *args); int xfs_attr_shortform_getvalue(struct xfs_da_args *args); -int xfs_attr_shortform_to_leaf(struct xfs_da_args *args, - struct xfs_buf **leaf_bp); +int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); int xfs_attr_sf_removename(struct xfs_da_args *args); int xfs_attr_sf_findname(struct xfs_da_args *args, struct xfs_attr_sf_entry **sfep, diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 83b95be9ded8..d440393b40eb 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -543,6 +543,7 @@ xfs_attr_rmtval_stale( { struct xfs_mount *mp = ip->i_mount; struct xfs_buf *bp; + int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -550,14 +551,18 @@ xfs_attr_rmtval_stale( XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK)) return -EFSCORRUPTED; - bp = xfs_buf_incore(mp->m_ddev_targp, + error = xfs_buf_incore(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map->br_startblock), - XFS_FSB_TO_BB(mp, map->br_blockcount), incore_flags); - if (bp) { - xfs_buf_stale(bp); - xfs_buf_relse(bp); + XFS_FSB_TO_BB(mp, map->br_blockcount), + incore_flags, &bp); + if (error) { + if (error == -ENOENT) + return 0; + return error; } + xfs_buf_stale(bp); + xfs_buf_relse(bp); return 0; } @@ -568,14 +573,14 @@ xfs_attr_rmtval_stale( */ int xfs_attr_rmtval_find_space( - struct xfs_delattr_context *dac) + struct xfs_attr_intent *attr) { - struct xfs_da_args *args = dac->da_args; - struct xfs_bmbt_irec *map = &dac->map; + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_bmbt_irec *map = &attr->xattri_map; int error; - dac->lblkno = 0; - dac->blkcnt = 0; + attr->xattri_lblkno = 0; + attr->xattri_blkcnt = 0; args->rmtblkcnt = 0; args->rmtblkno = 0; memset(map, 0, sizeof(struct xfs_bmbt_irec)); @@ -584,8 +589,8 @@ xfs_attr_rmtval_find_space( if (error) return error; - dac->blkcnt = args->rmtblkcnt; - dac->lblkno = args->rmtblkno; + attr->xattri_blkcnt = args->rmtblkcnt; + attr->xattri_lblkno = args->rmtblkno; return 0; } @@ -598,17 +603,18 @@ xfs_attr_rmtval_find_space( */ int xfs_attr_rmtval_set_blk( - struct xfs_delattr_context *dac) + struct xfs_attr_intent *attr) { - struct xfs_da_args *args = dac->da_args; + struct xfs_da_args *args = attr->xattri_da_args; struct xfs_inode *dp = args->dp; - struct xfs_bmbt_irec *map = &dac->map; + struct xfs_bmbt_irec *map = &attr->xattri_map; int nmap; int error; nmap = 1; - error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)dac->lblkno, - dac->blkcnt, XFS_BMAPI_ATTRFORK, args->total, + error = xfs_bmapi_write(args->trans, dp, + (xfs_fileoff_t)attr->xattri_lblkno, + attr->xattri_blkcnt, XFS_BMAPI_ATTRFORK, args->total, map, &nmap); if (error) return error; @@ -618,8 +624,8 @@ xfs_attr_rmtval_set_blk( (map->br_startblock != HOLESTARTBLOCK)); /* roll attribute extent map forwards */ - dac->lblkno += map->br_blockcount; - dac->blkcnt -= map->br_blockcount; + attr->xattri_lblkno += map->br_blockcount; + attr->xattri_blkcnt -= map->br_blockcount; return 0; } @@ -673,9 +679,9 @@ xfs_attr_rmtval_invalidate( */ int xfs_attr_rmtval_remove( - struct xfs_delattr_context *dac) + struct xfs_attr_intent *attr) { - struct xfs_da_args *args = dac->da_args; + struct xfs_da_args *args = attr->xattri_da_args; int error, done; /* @@ -695,8 +701,8 @@ xfs_attr_rmtval_remove( * the parent */ if (!done) { - dac->flags |= XFS_DAC_DEFER_FINISH; - trace_xfs_attr_rmtval_remove_return(dac->dela_state, args->dp); + trace_xfs_attr_rmtval_remove_return(attr->xattri_dela_state, + args->dp); return -EAGAIN; } diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index d72eff30ca18..d097ec6c4dc3 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -12,9 +12,9 @@ int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, xfs_buf_flags_t incore_flags); int xfs_attr_rmtval_invalidate(struct xfs_da_args *args); -int xfs_attr_rmtval_remove(struct xfs_delattr_context *dac); +int xfs_attr_rmtval_remove(struct xfs_attr_intent *attr); int xfs_attr_rmt_find_hole(struct xfs_da_args *args); int xfs_attr_rmtval_set_value(struct xfs_da_args *args); -int xfs_attr_rmtval_set_blk(struct xfs_delattr_context *dac); -int xfs_attr_rmtval_find_space(struct xfs_delattr_context *dac); +int xfs_attr_rmtval_set_blk(struct xfs_attr_intent *attr); +int xfs_attr_rmtval_find_space(struct xfs_attr_intent *attr); #endif /* __XFS_ATTR_REMOTE_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 4dccd4d90622..49d0d4ea63fc 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -52,19 +52,17 @@ xfs_bmap_compute_maxlevels( xfs_mount_t *mp, /* file system mount structure */ int whichfork) /* data or attr fork */ { + uint64_t maxblocks; /* max blocks at this level */ + xfs_extnum_t maxleafents; /* max leaf entries possible */ int level; /* btree level */ - uint maxblocks; /* max blocks at this level */ - uint maxleafents; /* max leaf entries possible */ int maxrootrecs; /* max records in root block */ int minleafrecs; /* min records in leaf block */ int minnoderecs; /* min records in node block */ int sz; /* root block size */ /* - * The maximum number of extents in a file, hence the maximum number of - * leaf entries, is controlled by the size of the on-disk extent count, - * either a signed 32-bit number for the data fork, or a signed 16-bit - * number for the attr fork. + * The maximum number of extents in a fork, hence the maximum number of + * leaf entries, is controlled by the size of the on-disk extent count. * * Note that we can no longer assume that if we are in ATTR1 that the * fork offset of all the inodes will be @@ -74,22 +72,22 @@ xfs_bmap_compute_maxlevels( * ATTR2 we have to assume the worst case scenario of a minimum size * available. */ - if (whichfork == XFS_DATA_FORK) { - maxleafents = MAXEXTNUM; + maxleafents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp), + whichfork); + if (whichfork == XFS_DATA_FORK) sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS); - } else { - maxleafents = MAXAEXTNUM; + else sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); - } + maxrootrecs = xfs_bmdr_maxrecs(sz, 0); minleafrecs = mp->m_bmap_dmnr[0]; minnoderecs = mp->m_bmap_dmnr[1]; - maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; + maxblocks = howmany_64(maxleafents, minleafrecs); for (level = 1; maxblocks > 1; level++) { if (maxblocks <= maxrootrecs) maxblocks = 1; else - maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; + maxblocks = howmany_64(maxblocks, minnoderecs); } mp->m_bm_maxlevels[whichfork] = level; ASSERT(mp->m_bm_maxlevels[whichfork] <= xfs_bmbt_maxlevels_ondisk()); @@ -130,7 +128,7 @@ xfs_bmbt_lookup_first( */ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); return whichfork != XFS_COW_FORK && ifp->if_format == XFS_DINODE_FMT_EXTENTS && @@ -142,7 +140,7 @@ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) */ static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); return whichfork != XFS_COW_FORK && ifp->if_format == XFS_DINODE_FMT_BTREE && @@ -296,7 +294,7 @@ xfs_check_block( else thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); if (*thispa == *pp) { - xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld", + xfs_warn(mp, "%s: thispa(%d) == pp(%d) %lld", __func__, j, i, (unsigned long long)be64_to_cpu(*thispa)); xfs_err(mp, "%s: ptrs are equal in node\n", @@ -321,7 +319,7 @@ xfs_bmap_check_leaf_extents( int whichfork) /* data or attr fork */ { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_block *block; /* current btree block */ xfs_fsblock_t bno; /* block # of "block" */ struct xfs_buf *bp; /* buffer for "block" */ @@ -468,7 +466,7 @@ error0: if (bp_release) xfs_trans_brelse(NULL, bp); error_norelse: - xfs_warn(mp, "%s: BAD after btree leaves for %d extents", + xfs_warn(mp, "%s: BAD after btree leaves for %llu extents", __func__, i); xfs_err(mp, "%s: CORRUPTED BTREE OR SOMETHING", __func__); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); @@ -485,7 +483,7 @@ STATIC void xfs_bmap_validate_ret( xfs_fileoff_t bno, xfs_filblks_t len, - int flags, + uint32_t flags, xfs_bmbt_irec_t *mval, int nmap, int ret_nmap) @@ -540,7 +538,7 @@ xfs_bmap_btree_to_extents( int *logflagsp, /* inode logging flags */ int whichfork) /* data or attr fork */ { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_mount *mp = ip->i_mount; struct xfs_btree_block *rblock = ifp->if_broot; struct xfs_btree_block *cblock;/* child btree block */ @@ -618,7 +616,7 @@ xfs_bmap_extents_to_btree( mp = ip->i_mount; ASSERT(whichfork != XFS_COW_FORK); - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(ifp->if_format == XFS_DINODE_FMT_EXTENTS); /* @@ -747,7 +745,7 @@ xfs_bmap_local_to_extents_empty( struct xfs_inode *ip, int whichfork) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(whichfork != XFS_COW_FORK); ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); @@ -787,7 +785,7 @@ xfs_bmap_local_to_extents( * So sending the data fork of a regular inode is invalid. */ ASSERT(!(S_ISREG(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK)); - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); if (!ifp->if_bytes) { @@ -882,7 +880,7 @@ xfs_bmap_add_attrfork_btree( mp = ip->i_mount; - if (XFS_BMAP_BMDR_SPACE(block) <= XFS_IFORK_DSIZE(ip)) + if (XFS_BMAP_BMDR_SPACE(block) <= xfs_inode_data_fork_size(ip)) *flags |= XFS_ILOG_DBROOT; else { cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK); @@ -922,7 +920,7 @@ xfs_bmap_add_attrfork_extents( int error; /* error return value */ if (ip->i_df.if_nextents * sizeof(struct xfs_bmbt_rec) <= - XFS_IFORK_DSIZE(ip)) + xfs_inode_data_fork_size(ip)) return 0; cur = NULL; error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, flags, @@ -953,7 +951,7 @@ xfs_bmap_add_attrfork_local( { struct xfs_da_args dargs; /* args for dir/attr code */ - if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) + if (ip->i_df.if_bytes <= xfs_inode_data_fork_size(ip)) return 0; if (S_ISDIR(VFS_I(ip)->i_mode)) { @@ -1025,7 +1023,7 @@ xfs_bmap_add_attrfork( int logflags; /* logging flags */ int error; /* error return value */ - ASSERT(XFS_IFORK_Q(ip) == 0); + ASSERT(xfs_inode_has_attr_fork(ip) == 0); mp = ip->i_mount; ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); @@ -1036,16 +1034,15 @@ xfs_bmap_add_attrfork( rsvd, &tp); if (error) return error; - if (XFS_IFORK_Q(ip)) + if (xfs_inode_has_attr_fork(ip)) goto trans_cancel; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_bmap_set_attrforkoff(ip, size, &version); if (error) goto trans_cancel; - ASSERT(ip->i_afp == NULL); - ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0); + xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0); logflags = 0; switch (ip->i_df.if_format) { case XFS_DINODE_FMT_LOCAL: @@ -1118,7 +1115,7 @@ xfs_iread_bmbt_block( xfs_extnum_t num_recs; xfs_extnum_t j; int whichfork = cur->bc_ino.whichfork; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); block = xfs_btree_get_block(cur, level, &bp); @@ -1166,7 +1163,7 @@ xfs_iread_extents( int whichfork) { struct xfs_iread_state ir; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_mount *mp = ip->i_mount; struct xfs_btree_cur *cur; int error; @@ -1210,7 +1207,7 @@ xfs_bmap_first_unused( xfs_fileoff_t *first_unused, /* unused block */ int whichfork) /* data or attr fork */ { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec got; struct xfs_iext_cursor icur; xfs_fileoff_t lastaddr = 0; @@ -1257,7 +1254,7 @@ xfs_bmap_last_before( xfs_fileoff_t *last_block, /* last block */ int whichfork) /* data or attr fork */ { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec got; struct xfs_iext_cursor icur; int error; @@ -1291,7 +1288,7 @@ xfs_bmap_last_extent( struct xfs_bmbt_irec *rec, int *is_empty) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_iext_cursor icur; int error; @@ -1357,7 +1354,7 @@ xfs_bmap_last_offset( xfs_fileoff_t *last_block, int whichfork) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec rec; int is_empty; int error; @@ -1391,7 +1388,7 @@ xfs_bmap_add_extent_delay_real( int whichfork) { struct xfs_mount *mp = bma->ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork); struct xfs_bmbt_irec *new = &bma->got; int error; /* error return value */ int i; /* temp state */ @@ -1399,7 +1396,7 @@ xfs_bmap_add_extent_delay_real( xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); xfs_filblks_t da_new; /* new count del alloc blocks used */ xfs_filblks_t da_old; /* old count del alloc blocks used */ xfs_filblks_t temp=0; /* value for da_new calculations */ @@ -1452,7 +1449,7 @@ xfs_bmap_add_extent_delay_real( LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && LEFT.br_state == new->br_state && - LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN) + LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) state |= BMAP_LEFT_CONTIG; /* @@ -1470,13 +1467,13 @@ xfs_bmap_add_extent_delay_real( new_endoff == RIGHT.br_startoff && new->br_startblock + new->br_blockcount == RIGHT.br_startblock && new->br_state == RIGHT.br_state && - new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN && ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) != (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING) || LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount - <= MAXEXTLEN)) + <= XFS_MAX_BMBT_EXTLEN)) state |= BMAP_RIGHT_CONTIG; error = 0; @@ -1950,14 +1947,14 @@ xfs_bmap_add_extent_unwritten_real( xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); struct xfs_mount *mp = ip->i_mount; struct xfs_bmbt_irec old; *logflagsp = 0; cur = *curp; - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(!isnullstartblock(new->br_startblock)); @@ -2000,7 +1997,7 @@ xfs_bmap_add_extent_unwritten_real( LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && LEFT.br_state == new->br_state && - LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN) + LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) state |= BMAP_LEFT_CONTIG; /* @@ -2018,13 +2015,13 @@ xfs_bmap_add_extent_unwritten_real( new_endoff == RIGHT.br_startoff && new->br_startblock + new->br_blockcount == RIGHT.br_startblock && new->br_state == RIGHT.br_state && - new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN && ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) != (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING) || LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount - <= MAXEXTLEN)) + <= XFS_MAX_BMBT_EXTLEN)) state |= BMAP_RIGHT_CONTIG; /* @@ -2479,10 +2476,10 @@ xfs_bmap_add_extent_hole_delay( xfs_filblks_t newlen=0; /* new indirect size */ xfs_filblks_t oldlen=0; /* old indirect size */ xfs_bmbt_irec_t right; /* right neighbor extent entry */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); xfs_filblks_t temp; /* temp for indirect calculations */ - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(isnullstartblock(new->br_startblock)); /* @@ -2510,15 +2507,15 @@ xfs_bmap_add_extent_hole_delay( */ if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && left.br_startoff + left.br_blockcount == new->br_startoff && - left.br_blockcount + new->br_blockcount <= MAXEXTLEN) + left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) state |= BMAP_LEFT_CONTIG; if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && new->br_startoff + new->br_blockcount == right.br_startoff && - new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && (!(state & BMAP_LEFT_CONTIG) || (left.br_blockcount + new->br_blockcount + - right.br_blockcount <= MAXEXTLEN))) + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))) state |= BMAP_RIGHT_CONTIG; /* @@ -2616,9 +2613,9 @@ xfs_bmap_add_extent_hole_real( struct xfs_btree_cur **curp, struct xfs_bmbt_irec *new, int *logflagsp, - int flags) + uint32_t flags) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_mount *mp = ip->i_mount; struct xfs_btree_cur *cur = *curp; int error; /* error return value */ @@ -2626,7 +2623,7 @@ xfs_bmap_add_extent_hole_real( xfs_bmbt_irec_t left; /* left neighbor extent entry */ xfs_bmbt_irec_t right; /* right neighbor extent entry */ int rval=0; /* return value (logging flags) */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); struct xfs_bmbt_irec old; ASSERT(!isnullstartblock(new->br_startblock)); @@ -2661,17 +2658,17 @@ xfs_bmap_add_extent_hole_real( left.br_startoff + left.br_blockcount == new->br_startoff && left.br_startblock + left.br_blockcount == new->br_startblock && left.br_state == new->br_state && - left.br_blockcount + new->br_blockcount <= MAXEXTLEN) + left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) state |= BMAP_LEFT_CONTIG; if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && new->br_startoff + new->br_blockcount == right.br_startoff && new->br_startblock + new->br_blockcount == right.br_startblock && new->br_state == right.br_state && - new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && (!(state & BMAP_LEFT_CONTIG) || left.br_blockcount + new->br_blockcount + - right.br_blockcount <= MAXEXTLEN)) + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)) state |= BMAP_RIGHT_CONTIG; error = 0; @@ -2906,15 +2903,15 @@ xfs_bmap_extsize_align( /* * For large extent hint sizes, the aligned extent might be larger than - * MAXEXTLEN. In that case, reduce the size by an extsz so that it pulls - * the length back under MAXEXTLEN. The outer allocation loops handle - * short allocation just fine, so it is safe to do this. We only want to - * do it when we are forced to, though, because it means more allocation - * operations are required. + * XFS_BMBT_MAX_EXTLEN. In that case, reduce the size by an extsz so + * that it pulls the length back under XFS_BMBT_MAX_EXTLEN. The outer + * allocation loops handle short allocation just fine, so it is safe to + * do this. We only want to do it when we are forced to, though, because + * it means more allocation operations are required. */ - while (align_alen > MAXEXTLEN) + while (align_alen > XFS_MAX_BMBT_EXTLEN) align_alen -= extsz; - ASSERT(align_alen <= MAXEXTLEN); + ASSERT(align_alen <= XFS_MAX_BMBT_EXTLEN); /* * If the previous block overlaps with this proposed allocation @@ -3004,9 +3001,9 @@ xfs_bmap_extsize_align( return -EINVAL; } else { ASSERT(orig_off >= align_off); - /* see MAXEXTLEN handling above */ + /* see XFS_BMBT_MAX_EXTLEN handling above */ ASSERT(orig_end <= align_off + align_alen || - align_alen + extsz > MAXEXTLEN); + align_alen + extsz > XFS_MAX_BMBT_EXTLEN); } #ifdef DEBUG @@ -3187,7 +3184,8 @@ xfs_bmap_longest_free_extent( pag = xfs_perag_get(mp, ag); if (!pag->pagf_init) { - error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK); + error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_TRYLOCK, + NULL); if (error) { /* Couldn't lock the AGF, so skip this AG. */ if (error == -EAGAIN) { @@ -3766,7 +3764,7 @@ xfs_bmapi_trim_map( xfs_fileoff_t obno, xfs_fileoff_t end, int n, - int flags) + uint32_t flags) { if ((flags & XFS_BMAPI_ENTIRE) || got->br_startoff + got->br_blockcount <= obno) { @@ -3811,7 +3809,7 @@ xfs_bmapi_update_map( xfs_fileoff_t obno, xfs_fileoff_t end, int *n, - int flags) + uint32_t flags) { xfs_bmbt_irec_t *mval = *map; @@ -3864,11 +3862,11 @@ xfs_bmapi_read( xfs_filblks_t len, struct xfs_bmbt_irec *mval, int *nmap, - int flags) + uint32_t flags) { struct xfs_mount *mp = ip->i_mount; int whichfork = xfs_bmapi_whichfork(flags); - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec got; xfs_fileoff_t obno; xfs_fileoff_t end; @@ -3961,7 +3959,7 @@ xfs_bmapi_reserve_delalloc( int eof) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); xfs_extlen_t alen; xfs_extlen_t indlen; int error; @@ -3971,7 +3969,7 @@ xfs_bmapi_reserve_delalloc( * Cap the alloc length. Keep track of prealloc so we know whether to * tag the inode before we return. */ - alen = XFS_FILBLKS_MIN(len + prealloc, MAXEXTLEN); + alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN); if (!eof) alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); if (prealloc && alen >= len) @@ -4088,7 +4086,7 @@ xfs_bmapi_allocate( { struct xfs_mount *mp = bma->ip->i_mount; int whichfork = xfs_bmapi_whichfork(bma->flags); - struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork); int tmp_logflags = 0; int error; @@ -4104,7 +4102,7 @@ xfs_bmapi_allocate( if (!xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev)) bma->prev.br_startoff = NULLFILEOFF; } else { - bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN); + bma->length = XFS_FILBLKS_MIN(bma->length, XFS_MAX_BMBT_EXTLEN); if (!bma->eof) bma->length = XFS_FILBLKS_MIN(bma->length, bma->got.br_startoff - bma->offset); @@ -4184,10 +4182,10 @@ xfs_bmapi_convert_unwritten( struct xfs_bmalloca *bma, struct xfs_bmbt_irec *mval, xfs_filblks_t len, - int flags) + uint32_t flags) { int whichfork = xfs_bmapi_whichfork(flags); - struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork); int tmp_logflags = 0; int error; @@ -4264,7 +4262,7 @@ xfs_bmapi_minleft( struct xfs_inode *ip, int fork) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, fork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, fork); if (tp && tp->t_firstblock != NULLFSBLOCK) return 0; @@ -4285,7 +4283,7 @@ xfs_bmapi_finish( int whichfork, int error) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork); if ((bma->logflags & xfs_ilog_fext(whichfork)) && ifp->if_format != XFS_DINODE_FMT_EXTENTS) @@ -4312,7 +4310,7 @@ xfs_bmapi_write( struct xfs_inode *ip, /* incore inode */ xfs_fileoff_t bno, /* starting file offs. mapped */ xfs_filblks_t len, /* length to map in file */ - int flags, /* XFS_BMAPI_... */ + uint32_t flags, /* XFS_BMAPI_... */ xfs_extlen_t total, /* total blocks needed */ struct xfs_bmbt_irec *mval, /* output: map values */ int *nmap) /* i/o: mval size/count */ @@ -4324,7 +4322,7 @@ xfs_bmapi_write( }; struct xfs_mount *mp = ip->i_mount; int whichfork = xfs_bmapi_whichfork(flags); - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); xfs_fileoff_t end; /* end of mapped file region */ bool eof = false; /* after the end of extents */ int error; /* error return */ @@ -4424,8 +4422,8 @@ xfs_bmapi_write( * xfs_extlen_t and therefore 32 bits. Hence we have to * check for 32-bit overflows and handle them here. */ - if (len > (xfs_filblks_t)MAXEXTLEN) - bma.length = MAXEXTLEN; + if (len > (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN) + bma.length = XFS_MAX_BMBT_EXTLEN; else bma.length = len; @@ -4505,7 +4503,7 @@ xfs_bmapi_convert_delalloc( struct iomap *iomap, unsigned int *seq) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); struct xfs_bmalloca bma = { NULL }; @@ -4526,14 +4524,16 @@ xfs_bmapi_convert_delalloc( return error; xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); error = xfs_iext_count_may_overflow(ip, whichfork, XFS_IEXT_ADD_NOSPLIT_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto out_trans_cancel; - xfs_trans_ijoin(tp, ip, 0); - if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) || bma.got.br_startoff > offset_fsb) { /* @@ -4551,7 +4551,7 @@ xfs_bmapi_convert_delalloc( * the extent. Just return the real extent at this offset. */ if (!isnullstartblock(bma.got.br_startblock)) { - xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags); + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); *seq = READ_ONCE(ifp->if_seq); goto out_trans_cancel; } @@ -4560,7 +4560,8 @@ xfs_bmapi_convert_delalloc( bma.ip = ip; bma.wasdel = true; bma.offset = bma.got.br_startoff; - bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN); + bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, + XFS_MAX_BMBT_EXTLEN); bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); /* @@ -4598,7 +4599,7 @@ xfs_bmapi_convert_delalloc( XFS_STATS_INC(mp, xs_xstrat_quick); ASSERT(!isnullstartblock(bma.got.br_startblock)); - xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags); + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); *seq = READ_ONCE(ifp->if_seq); if (whichfork == XFS_COW_FORK) @@ -4629,7 +4630,7 @@ xfs_bmapi_remap( xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, - int flags) + uint32_t flags) { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; @@ -4639,9 +4640,9 @@ xfs_bmapi_remap( int whichfork = xfs_bmapi_whichfork(flags); int logflags = 0, error; - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(len > 0); - ASSERT(len <= (xfs_filblks_t)MAXEXTLEN); + ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC | XFS_BMAPI_NORMAP))); @@ -4796,12 +4797,12 @@ xfs_bmap_del_extent_delay( struct xfs_bmbt_irec *del) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec new; int64_t da_old, da_new, da_diff = 0; xfs_fileoff_t del_endoff, got_endoff; xfs_filblks_t got_indlen, new_indlen, stolen; - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); int error = 0; bool isrt; @@ -4923,10 +4924,10 @@ xfs_bmap_del_extent_cow( struct xfs_bmbt_irec *del) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); struct xfs_bmbt_irec new; xfs_fileoff_t del_endoff, got_endoff; - int state = BMAP_COWFORK; + uint32_t state = BMAP_COWFORK; XFS_STATS_INC(mp, xs_del_exlist); @@ -4999,7 +5000,7 @@ xfs_bmap_del_extent_real( xfs_bmbt_irec_t *del, /* data to remove from extents */ int *logflagsp, /* inode logging flags */ int whichfork, /* data or attr fork */ - int bflags) /* bmapi flags */ + uint32_t bflags) /* bmapi flags */ { xfs_fsblock_t del_endblock=0; /* first block past del */ xfs_fileoff_t del_endoff; /* first offset past del */ @@ -5015,13 +5016,13 @@ xfs_bmap_del_extent_real( xfs_bmbt_irec_t new; /* new record to be inserted */ /* REFERENCED */ uint qfield; /* quota field to update */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); struct xfs_bmbt_irec old; mp = ip->i_mount; XFS_STATS_INC(mp, xs_del_exlist); - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(del->br_blockcount > 0); xfs_iext_get_extent(ifp, icur, &got); ASSERT(got.br_startoff <= del->br_startoff); @@ -5148,26 +5149,6 @@ xfs_bmap_del_extent_real( * Deleting the middle of the extent. */ - /* - * For directories, -ENOSPC is returned since a directory entry - * remove operation must not fail due to low extent count - * availability. -ENOSPC will be handled by higher layers of XFS - * by letting the corresponding empty Data/Free blocks to linger - * until a future remove operation. Dabtree blocks would be - * swapped with the last block in the leaf space and then the - * new last block will be unmapped. - * - * The above logic also applies to the source directory entry of - * a rename operation. - */ - error = xfs_iext_count_may_overflow(ip, whichfork, 1); - if (error) { - ASSERT(S_ISDIR(VFS_I(ip)->i_mode) && - whichfork == XFS_DATA_FORK); - error = -ENOSPC; - goto done; - } - old = got; got.br_blockcount = del->br_startoff - got.br_startoff; @@ -5281,7 +5262,7 @@ __xfs_bunmapi( struct xfs_inode *ip, /* incore inode */ xfs_fileoff_t start, /* first file offset deleted */ xfs_filblks_t *rlen, /* i/o: amount remaining */ - int flags, /* misc flags */ + uint32_t flags, /* misc flags */ xfs_extnum_t nexts) /* number of extents max */ { struct xfs_btree_cur *cur; /* bmap btree cursor */ @@ -5299,7 +5280,6 @@ __xfs_bunmapi( int whichfork; /* data or attribute fork */ xfs_fsblock_t sum; xfs_filblks_t len = *rlen; /* length to unmap in file */ - xfs_fileoff_t max_len; xfs_fileoff_t end; struct xfs_iext_cursor icur; bool done = false; @@ -5308,7 +5288,7 @@ __xfs_bunmapi( whichfork = xfs_bmapi_whichfork(flags); ASSERT(whichfork != XFS_COW_FORK); - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp))) return -EFSCORRUPTED; if (xfs_is_shutdown(mp)) @@ -5318,16 +5298,6 @@ __xfs_bunmapi( ASSERT(len > 0); ASSERT(nexts >= 0); - /* - * Guesstimate how many blocks we can unmap without running the risk of - * blowing out the transaction with a mix of EFIs and reflink - * adjustments. - */ - if (tp && xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) - max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res)); - else - max_len = len; - error = xfs_iread_extents(tp, ip, whichfork); if (error) return error; @@ -5366,7 +5336,7 @@ __xfs_bunmapi( extno = 0; while (end != (xfs_fileoff_t)-1 && end >= start && - (nexts == 0 || extno < nexts) && max_len > 0) { + (nexts == 0 || extno < nexts)) { /* * Is the found extent after a hole in which end lives? * Just back up to the previous extent, if so. @@ -5400,14 +5370,6 @@ __xfs_bunmapi( if (del.br_startoff + del.br_blockcount > end + 1) del.br_blockcount = end + 1 - del.br_startoff; - /* How much can we safely unmap? */ - if (max_len < del.br_blockcount) { - del.br_startoff += del.br_blockcount - max_len; - if (!wasdel) - del.br_startblock += del.br_blockcount - max_len; - del.br_blockcount = max_len; - } - if (!isrt) goto delete; @@ -5543,7 +5505,6 @@ delete: if (error) goto error0; - max_len -= del.br_blockcount; end = del.br_startoff - 1; nodelete: /* @@ -5609,7 +5570,7 @@ xfs_bunmapi( struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, - int flags, + uint32_t flags, xfs_extnum_t nexts, int *done) { @@ -5641,7 +5602,7 @@ xfs_bmse_can_merge( if ((left->br_startoff + left->br_blockcount != startoff) || (left->br_startblock + left->br_blockcount != got->br_startblock) || (left->br_state != got->br_state) || - (left->br_blockcount + got->br_blockcount > MAXEXTLEN)) + (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN)) return false; return true; @@ -5668,7 +5629,7 @@ xfs_bmse_merge( struct xfs_btree_cur *cur, int *logflags) /* output */ { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec new; xfs_filblks_t blockcount; int error, i; @@ -5789,7 +5750,7 @@ xfs_bmap_collapse_extents( { int whichfork = XFS_DATA_FORK; struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur = NULL; struct xfs_bmbt_irec got, prev; struct xfs_iext_cursor icur; @@ -5904,7 +5865,7 @@ xfs_bmap_insert_extents( { int whichfork = XFS_DATA_FORK; struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur = NULL; struct xfs_bmbt_irec got, next; struct xfs_iext_cursor icur; @@ -6004,7 +5965,7 @@ xfs_bmap_split_extent( xfs_fileoff_t split_fsb) { int whichfork = XFS_DATA_FORK; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur = NULL; struct xfs_bmbt_irec got; struct xfs_bmbt_irec new; /* split extent */ diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 03d9aaf87413..16db95b11589 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -39,7 +39,7 @@ struct xfs_bmalloca { bool aeof; /* allocated space at eof */ bool conv; /* overwriting unwritten extents */ int datatype;/* data type being allocated */ - int flags; + uint32_t flags; }; #define XFS_BMAP_MAX_NMAP 4 @@ -47,17 +47,17 @@ struct xfs_bmalloca { /* * Flags for xfs_bmapi_* */ -#define XFS_BMAPI_ENTIRE 0x001 /* return entire extent, not trimmed */ -#define XFS_BMAPI_METADATA 0x002 /* mapping metadata not user data */ -#define XFS_BMAPI_ATTRFORK 0x004 /* use attribute fork not data */ -#define XFS_BMAPI_PREALLOC 0x008 /* preallocation op: unwritten space */ -#define XFS_BMAPI_CONTIG 0x020 /* must allocate only one extent */ +#define XFS_BMAPI_ENTIRE (1u << 0) /* return entire extent untrimmed */ +#define XFS_BMAPI_METADATA (1u << 1) /* mapping metadata not user data */ +#define XFS_BMAPI_ATTRFORK (1u << 2) /* use attribute fork not data */ +#define XFS_BMAPI_PREALLOC (1u << 3) /* preallocating unwritten space */ +#define XFS_BMAPI_CONTIG (1u << 4) /* must allocate only one extent */ /* * unwritten extent conversion - this needs write cache flushing and no additional * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts * from written to unwritten, otherwise convert from unwritten to written. */ -#define XFS_BMAPI_CONVERT 0x040 +#define XFS_BMAPI_CONVERT (1u << 5) /* * allocate zeroed extents - this requires all newly allocated user data extents @@ -65,7 +65,7 @@ struct xfs_bmalloca { * Use in conjunction with XFS_BMAPI_CONVERT to convert unwritten extents found * during the allocation range to zeroed written extents. */ -#define XFS_BMAPI_ZERO 0x080 +#define XFS_BMAPI_ZERO (1u << 6) /* * Map the inode offset to the block given in ap->firstblock. Primarily @@ -75,16 +75,16 @@ struct xfs_bmalloca { * For bunmapi, this flag unmaps the range without adjusting quota, reducing * refcount, or freeing the blocks. */ -#define XFS_BMAPI_REMAP 0x100 +#define XFS_BMAPI_REMAP (1u << 7) /* Map something in the CoW fork. */ -#define XFS_BMAPI_COWFORK 0x200 +#define XFS_BMAPI_COWFORK (1u << 8) /* Skip online discard of freed extents */ -#define XFS_BMAPI_NODISCARD 0x1000 +#define XFS_BMAPI_NODISCARD (1u << 9) /* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */ -#define XFS_BMAPI_NORMAP 0x2000 +#define XFS_BMAPI_NORMAP (1u << 10) #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ @@ -106,7 +106,7 @@ static inline int xfs_bmapi_aflag(int w) (w == XFS_COW_FORK ? XFS_BMAPI_COWFORK : 0)); } -static inline int xfs_bmapi_whichfork(int bmapi_flags) +static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags) { if (bmapi_flags & XFS_BMAPI_COWFORK) return XFS_COW_FORK; @@ -124,16 +124,16 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags) /* * Flags for xfs_bmap_add_extent*. */ -#define BMAP_LEFT_CONTIG (1 << 0) -#define BMAP_RIGHT_CONTIG (1 << 1) -#define BMAP_LEFT_FILLING (1 << 2) -#define BMAP_RIGHT_FILLING (1 << 3) -#define BMAP_LEFT_DELAY (1 << 4) -#define BMAP_RIGHT_DELAY (1 << 5) -#define BMAP_LEFT_VALID (1 << 6) -#define BMAP_RIGHT_VALID (1 << 7) -#define BMAP_ATTRFORK (1 << 8) -#define BMAP_COWFORK (1 << 9) +#define BMAP_LEFT_CONTIG (1u << 0) +#define BMAP_RIGHT_CONTIG (1u << 1) +#define BMAP_LEFT_FILLING (1u << 2) +#define BMAP_RIGHT_FILLING (1u << 3) +#define BMAP_LEFT_DELAY (1u << 4) +#define BMAP_RIGHT_DELAY (1u << 5) +#define BMAP_LEFT_VALID (1u << 6) +#define BMAP_RIGHT_VALID (1u << 7) +#define BMAP_ATTRFORK (1u << 8) +#define BMAP_COWFORK (1u << 9) #define XFS_BMAP_EXT_FLAGS \ { BMAP_LEFT_CONTIG, "LC" }, \ @@ -183,15 +183,15 @@ int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused, int whichfork); int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, struct xfs_bmbt_irec *mval, - int *nmap, int flags); + int *nmap, uint32_t flags); int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t bno, xfs_filblks_t len, int flags, + xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap); int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t bno, xfs_filblks_t *rlen, int flags, + xfs_fileoff_t bno, xfs_filblks_t *rlen, uint32_t flags, xfs_extnum_t nexts); int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t bno, xfs_filblks_t len, int flags, + xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extnum_t nexts, int *done); int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, @@ -243,7 +243,7 @@ void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_bmbt_irec *imap); -static inline int xfs_bmap_fork_to_state(int whichfork) +static inline uint32_t xfs_bmap_fork_to_state(int whichfork) { switch (whichfork) { case XFS_ATTR_FORK: @@ -260,7 +260,7 @@ xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork, int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, - int flags); + uint32_t flags); extern struct kmem_cache *xfs_bmap_intent_cache; diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 453309fc85f2..cfa052d40105 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -304,7 +304,7 @@ xfs_bmbt_get_minrecs( if (level == cur->bc_nlevels - 1) { struct xfs_ifork *ifp; - ifp = XFS_IFORK_PTR(cur->bc_ino.ip, + ifp = xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork); return xfs_bmbt_maxrecs(cur->bc_mp, @@ -322,7 +322,7 @@ xfs_bmbt_get_maxrecs( if (level == cur->bc_nlevels - 1) { struct xfs_ifork *ifp; - ifp = XFS_IFORK_PTR(cur->bc_ino.ip, + ifp = xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork); return xfs_bmbt_maxrecs(cur->bc_mp, @@ -550,7 +550,7 @@ xfs_bmbt_init_cursor( struct xfs_inode *ip, /* inode owning the btree */ int whichfork) /* data or attr fork */ { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur; ASSERT(whichfork != XFS_COW_FORK); @@ -564,7 +564,7 @@ xfs_bmbt_init_cursor( if (xfs_has_crc(mp)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_ino.forksize = XFS_IFORK_SIZE(ip, whichfork); + cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork); cur->bc_ino.ip = ip; cur->bc_ino.allocated = 0; cur->bc_ino.flags = 0; @@ -597,7 +597,11 @@ xfs_bmbt_maxrecs( return xfs_bmbt_block_maxrecs(blocklen, leaf); } -/* Compute the max possible height for block mapping btrees. */ +/* + * Calculate the maximum possible height of the btree that the on-disk format + * supports. This is used for sizing structures large enough to support every + * possible configuration of a filesystem that might get mounted. + */ unsigned int xfs_bmbt_maxlevels_ondisk(void) { @@ -611,7 +615,8 @@ xfs_bmbt_maxlevels_ondisk(void) minrecs[1] = xfs_bmbt_block_maxrecs(blocklen, false) / 2; /* One extra level for the inode root. */ - return xfs_btree_compute_maxlevels(minrecs, MAXEXTNUM) + 1; + return xfs_btree_compute_maxlevels(minrecs, + XFS_MAX_EXTCNT_DATA_FORK_LARGE) + 1; } /* @@ -659,7 +664,7 @@ xfs_bmbt_change_owner( ASSERT(tp || buffer_list); ASSERT(!(tp && buffer_list)); - ASSERT(XFS_IFORK_PTR(ip, whichfork)->if_format == XFS_DINODE_FMT_BTREE); + ASSERT(xfs_ifork_ptr(ip, whichfork)->if_format == XFS_DINODE_FMT_BTREE); cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); cur->bc_ino.flags |= XFS_BTCUR_BMBT_INVALID_OWNER; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index f18a875f51c6..4c16c8c31fcb 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -52,6 +52,70 @@ xfs_btree_magic( } /* + * These sibling pointer checks are optimised for null sibling pointers. This + * happens a lot, and we don't need to byte swap at runtime if the sibling + * pointer is NULL. + * + * These are explicitly marked at inline because the cost of calling them as + * functions instead of inlining them is about 36 bytes extra code per call site + * on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of these + * two sibling check functions reduces the compiled code size by over 300 + * bytes. + */ +static inline xfs_failaddr_t +xfs_btree_check_lblock_siblings( + struct xfs_mount *mp, + struct xfs_btree_cur *cur, + int level, + xfs_fsblock_t fsb, + __be64 dsibling) +{ + xfs_fsblock_t sibling; + + if (dsibling == cpu_to_be64(NULLFSBLOCK)) + return NULL; + + sibling = be64_to_cpu(dsibling); + if (sibling == fsb) + return __this_address; + if (level >= 0) { + if (!xfs_btree_check_lptr(cur, sibling, level + 1)) + return __this_address; + } else { + if (!xfs_verify_fsbno(mp, sibling)) + return __this_address; + } + + return NULL; +} + +static inline xfs_failaddr_t +xfs_btree_check_sblock_siblings( + struct xfs_perag *pag, + struct xfs_btree_cur *cur, + int level, + xfs_agblock_t agbno, + __be32 dsibling) +{ + xfs_agblock_t sibling; + + if (dsibling == cpu_to_be32(NULLAGBLOCK)) + return NULL; + + sibling = be32_to_cpu(dsibling); + if (sibling == agbno) + return __this_address; + if (level >= 0) { + if (!xfs_btree_check_sptr(cur, sibling, level + 1)) + return __this_address; + } else { + if (!xfs_verify_agbno(pag, sibling)) + return __this_address; + } + return NULL; +} + +/* * Check a long btree block header. Return the address of the failing check, * or NULL if everything is ok. */ @@ -65,6 +129,8 @@ __xfs_btree_check_lblock( struct xfs_mount *mp = cur->bc_mp; xfs_btnum_t btnum = cur->bc_btnum; int crc = xfs_has_crc(mp); + xfs_failaddr_t fa; + xfs_fsblock_t fsb = NULLFSBLOCK; if (crc) { if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) @@ -83,16 +149,16 @@ __xfs_btree_check_lblock( if (be16_to_cpu(block->bb_numrecs) > cur->bc_ops->get_maxrecs(cur, level)) return __this_address; - if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib), - level + 1)) - return __this_address; - if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib), - level + 1)) - return __this_address; - return NULL; + if (bp) + fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); + + fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, + block->bb_u.l.bb_leftsib); + if (!fa) + fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, + block->bb_u.l.bb_rightsib); + return fa; } /* Check a long btree block header. */ @@ -128,8 +194,11 @@ __xfs_btree_check_sblock( struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; + struct xfs_perag *pag = cur->bc_ag.pag; xfs_btnum_t btnum = cur->bc_btnum; int crc = xfs_has_crc(mp); + xfs_failaddr_t fa; + xfs_agblock_t agbno = NULLAGBLOCK; if (crc) { if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) @@ -146,16 +215,16 @@ __xfs_btree_check_sblock( if (be16_to_cpu(block->bb_numrecs) > cur->bc_ops->get_maxrecs(cur, level)) return __this_address; - if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib), - level + 1)) - return __this_address; - if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib), - level + 1)) - return __this_address; - return NULL; + if (bp) + agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); + + fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno, + block->bb_u.s.bb_leftsib); + if (!fa) + fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno, + block->bb_u.s.bb_rightsib); + return fa; } /* Check a short btree block header. */ @@ -216,7 +285,7 @@ xfs_btree_check_sptr( { if (level <= 0) return false; - return xfs_verify_agbno(cur->bc_mp, cur->bc_ag.pag->pag_agno, agbno); + return xfs_verify_agbno(cur->bc_ag.pag, agbno); } /* @@ -373,8 +442,14 @@ xfs_btree_del_cursor( break; } + /* + * If we are doing a BMBT update, the number of unaccounted blocks + * allocated during this cursor life time should be zero. If it's not + * zero, then we should be shut down or on our way to shutdown due to + * cancelling a dirty transaction on error. + */ ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 || - xfs_is_shutdown(cur->bc_mp)); + xfs_is_shutdown(cur->bc_mp) || error != 0); if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) kmem_free(cur->bc_ops); if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag) @@ -647,7 +722,7 @@ xfs_btree_ifork_ptr( if (cur->bc_flags & XFS_BTREE_STAGING) return cur->bc_ino.ifake->if_fork; - return XFS_IFORK_PTR(cur->bc_ino.ip, cur->bc_ino.whichfork); + return xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork); } /* @@ -751,20 +826,20 @@ xfs_btree_lastrec( */ void xfs_btree_offsets( - int64_t fields, /* bitmask of fields */ + uint32_t fields, /* bitmask of fields */ const short *offsets, /* table of field offsets */ int nbits, /* number of bits to inspect */ int *first, /* output: first byte offset */ int *last) /* output: last byte offset */ { int i; /* current bit number */ - int64_t imask; /* mask for current bit number */ + uint32_t imask; /* mask for current bit number */ ASSERT(fields != 0); /* * Find the lowest bit, so the first byte offset. */ - for (i = 0, imask = 1LL; ; i++, imask <<= 1) { + for (i = 0, imask = 1u; ; i++, imask <<= 1) { if (imask & fields) { *first = offsets[i]; break; @@ -773,7 +848,7 @@ xfs_btree_offsets( /* * Find the highest bit, so the last byte offset. */ - for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) { + for (i = nbits - 1, imask = 1u << i; ; i--, imask >>= 1) { if (imask & fields) { *last = offsets[i + 1] - 1; break; @@ -1456,7 +1531,7 @@ void xfs_btree_log_block( struct xfs_btree_cur *cur, /* btree cursor */ struct xfs_buf *bp, /* buffer containing btree block */ - int fields) /* mask of fields: XFS_BB_... */ + uint32_t fields) /* mask of fields: XFS_BB_... */ { int first; /* first byte offset logged */ int last; /* last byte offset logged */ @@ -2818,7 +2893,7 @@ xfs_btree_split_worker( * in any way. */ if (args->kswapd) - new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; + new_pflags |= PF_MEMALLOC | PF_KSWAPD; current_set_flags_nested(&pflags, new_pflags); xfs_trans_set_context(args->cur->bc_tp); @@ -3194,7 +3269,7 @@ xfs_btree_insrec( struct xfs_btree_block *block; /* btree block */ struct xfs_buf *bp; /* buffer for block */ union xfs_btree_ptr nptr; /* new block ptr */ - struct xfs_btree_cur *ncur; /* new btree cursor */ + struct xfs_btree_cur *ncur = NULL; /* new btree cursor */ union xfs_btree_key nkey; /* new block key */ union xfs_btree_key *lkey; int optr; /* old key/record index */ @@ -3274,7 +3349,7 @@ xfs_btree_insrec( #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); if (error) - return error; + goto error0; #endif /* @@ -3294,7 +3369,7 @@ xfs_btree_insrec( for (i = numrecs - ptr; i >= 0; i--) { error = xfs_btree_debug_check_ptr(cur, pp, i, level); if (error) - return error; + goto error0; } xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); @@ -3379,6 +3454,8 @@ xfs_btree_insrec( return 0; error0: + if (ncur) + xfs_btree_del_cursor(ncur, error); return error; } @@ -3479,7 +3556,7 @@ xfs_btree_kill_iroot( { int whichfork = cur->bc_ino.whichfork; struct xfs_inode *ip = cur->bc_ino.ip; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_block *block; struct xfs_btree_block *cblock; union xfs_btree_key *kp; @@ -4271,6 +4348,21 @@ xfs_btree_visit_block( if (xfs_btree_ptr_is_null(cur, &rptr)) return -ENOENT; + /* + * We only visit blocks once in this walk, so we have to avoid the + * internal xfs_btree_lookup_get_block() optimisation where it will + * return the same block without checking if the right sibling points + * back to us and creates a cyclic reference in the btree. + */ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp, + xfs_buf_daddr(bp))) + return -EFSCORRUPTED; + } else { + if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp, + xfs_buf_daddr(bp))) + return -EFSCORRUPTED; + } return xfs_btree_lookup_get_block(cur, level, &rptr, &block); } @@ -4445,20 +4537,21 @@ xfs_btree_lblock_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_fsblock_t fsb; + xfs_failaddr_t fa; /* numrecs verification */ if (be16_to_cpu(block->bb_numrecs) > max_recs) return __this_address; /* sibling pointer verification */ - if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))) - return __this_address; - if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))) - return __this_address; - - return NULL; + fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); + fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, + block->bb_u.l.bb_leftsib); + if (!fa) + fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, + block->bb_u.l.bb_rightsib); + return fa; } /** @@ -4499,22 +4592,21 @@ xfs_btree_sblock_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - xfs_agblock_t agno; + xfs_agblock_t agbno; + xfs_failaddr_t fa; /* numrecs verification */ if (be16_to_cpu(block->bb_numrecs) > max_recs) return __this_address; /* sibling pointer verification */ - agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); - if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib))) - return __this_address; - if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib))) - return __this_address; - - return NULL; + agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); + fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno, + block->bb_u.s.bb_leftsib); + if (!fa) + fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno, + block->bb_u.s.bb_rightsib); + return fa; } /* diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 22d9f411fde6..eef27858a013 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -68,19 +68,19 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum); /* * For logging record fields. */ -#define XFS_BB_MAGIC (1 << 0) -#define XFS_BB_LEVEL (1 << 1) -#define XFS_BB_NUMRECS (1 << 2) -#define XFS_BB_LEFTSIB (1 << 3) -#define XFS_BB_RIGHTSIB (1 << 4) -#define XFS_BB_BLKNO (1 << 5) -#define XFS_BB_LSN (1 << 6) -#define XFS_BB_UUID (1 << 7) -#define XFS_BB_OWNER (1 << 8) +#define XFS_BB_MAGIC (1u << 0) +#define XFS_BB_LEVEL (1u << 1) +#define XFS_BB_NUMRECS (1u << 2) +#define XFS_BB_LEFTSIB (1u << 3) +#define XFS_BB_RIGHTSIB (1u << 4) +#define XFS_BB_BLKNO (1u << 5) +#define XFS_BB_LSN (1u << 6) +#define XFS_BB_UUID (1u << 7) +#define XFS_BB_OWNER (1u << 8) #define XFS_BB_NUM_BITS 5 -#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) +#define XFS_BB_ALL_BITS ((1u << XFS_BB_NUM_BITS) - 1) #define XFS_BB_NUM_BITS_CRC 9 -#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1) +#define XFS_BB_ALL_BITS_CRC ((1u << XFS_BB_NUM_BITS_CRC) - 1) /* * Generic stats interface @@ -345,7 +345,7 @@ xfs_btree_dup_cursor( */ void xfs_btree_offsets( - int64_t fields, /* bitmask of fields */ + uint32_t fields, /* bitmask of fields */ const short *offsets,/* table of field offsets */ int nbits, /* number of bits to inspect */ int *first, /* output: first byte offset */ @@ -435,7 +435,7 @@ bool xfs_btree_sblock_verify_crc(struct xfs_buf *); /* * Internal btree helpers also used by xfs_bmap.c. */ -void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int); +void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, uint32_t); void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int); /* diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 9dc1ecb9713d..e576560b46e9 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -22,6 +22,7 @@ #include "xfs_trace.h" #include "xfs_buf_item.h" #include "xfs_log.h" +#include "xfs_errortag.h" /* * xfs_da_btree.c @@ -116,6 +117,17 @@ xfs_da_state_free(xfs_da_state_t *state) kmem_cache_free(xfs_da_state_cache, state); } +void +xfs_da_state_reset( + struct xfs_da_state *state, + struct xfs_da_args *args) +{ + xfs_da_state_kill_altpath(state); + memset(state, 0, sizeof(struct xfs_da_state)); + state->args = args; + state->mp = state->args->dp->i_mount; +} + static inline int xfs_dabuf_nfsb(struct xfs_mount *mp, int whichfork) { if (whichfork == XFS_DATA_FORK) @@ -482,6 +494,9 @@ xfs_da3_split( trace_xfs_da_split(state->args); + if (XFS_TEST_ERROR(false, state->mp, XFS_ERRTAG_DA_LEAF_SPLIT)) + return -EIO; + /* * Walk back up the tree splitting/inserting/adjusting as necessary. * If we need to insert and there isn't room, split the node, then @@ -2177,8 +2192,8 @@ xfs_da_grow_inode_int( */ mapp = kmem_alloc(sizeof(*mapp) * count, 0); for (b = *bno, mapi = 0; b < *bno + count; ) { - nmap = min(XFS_BMAP_MAX_NMAP, count); c = (int)(*bno + count - b); + nmap = min(XFS_BMAP_MAX_NMAP, c); error = xfs_bmapi_write(tp, dp, b, c, xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, args->total, &mapp[mapi], &nmap); diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 0faf7d9ac241..ffa3df5b2893 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -30,6 +30,7 @@ struct xfs_da_geometry { unsigned int free_hdr_size; /* dir2 free header size */ unsigned int free_max_bests; /* # of bests entries in dir2 free */ xfs_dablk_t freeblk; /* blockno of free data v2 */ + xfs_extnum_t max_extents; /* Max. extents in corresponding fork */ xfs_dir2_data_aoff_t data_first_offset; size_t data_entry_offset; @@ -76,27 +77,33 @@ typedef struct xfs_da_args { xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ int rmtblkcnt2; /* remote attr value block count */ int rmtvaluelen2; /* remote attr value length in bytes */ - int op_flags; /* operation flags */ + uint32_t op_flags; /* operation flags */ enum xfs_dacmp cmpresult; /* name compare result for lookups */ } xfs_da_args_t; /* * Operation flags: */ -#define XFS_DA_OP_JUSTCHECK 0x0001 /* check for ok with no space */ -#define XFS_DA_OP_RENAME 0x0002 /* this is an atomic rename op */ -#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */ -#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ -#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ -#define XFS_DA_OP_NOTIME 0x0020 /* don't update inode timestamps */ +#define XFS_DA_OP_JUSTCHECK (1u << 0) /* check for ok with no space */ +#define XFS_DA_OP_REPLACE (1u << 1) /* this is an atomic replace op */ +#define XFS_DA_OP_ADDNAME (1u << 2) /* this is an add operation */ +#define XFS_DA_OP_OKNOENT (1u << 3) /* lookup op, ENOENT ok, else die */ +#define XFS_DA_OP_CILOOKUP (1u << 4) /* lookup returns CI name if found */ +#define XFS_DA_OP_NOTIME (1u << 5) /* don't update inode timestamps */ +#define XFS_DA_OP_REMOVE (1u << 6) /* this is a remove operation */ +#define XFS_DA_OP_RECOVERY (1u << 7) /* Log recovery operation */ +#define XFS_DA_OP_LOGGED (1u << 8) /* Use intent items to track op */ #define XFS_DA_OP_FLAGS \ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ - { XFS_DA_OP_RENAME, "RENAME" }, \ + { XFS_DA_OP_REPLACE, "REPLACE" }, \ { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ - { XFS_DA_OP_NOTIME, "NOTIME" } + { XFS_DA_OP_NOTIME, "NOTIME" }, \ + { XFS_DA_OP_REMOVE, "REMOVE" }, \ + { XFS_DA_OP_RECOVERY, "RECOVERY" }, \ + { XFS_DA_OP_LOGGED, "LOGGED" } /* * Storage for holding state during Btree searches and split/join ops. @@ -197,7 +204,7 @@ int xfs_da3_node_read_mapped(struct xfs_trans *tp, struct xfs_inode *dp, * Utility routines. */ -#define XFS_DABUF_MAP_HOLE_OK (1 << 0) +#define XFS_DABUF_MAP_HOLE_OK (1u << 0) int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno); int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno, @@ -220,6 +227,7 @@ enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, struct xfs_da_state *xfs_da_state_alloc(struct xfs_da_args *args); void xfs_da_state_free(xfs_da_state_t *state); +void xfs_da_state_reset(struct xfs_da_state *state, struct xfs_da_args *args); void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp, struct xfs_da3_icnode_hdr *to, struct xfs_da_intnode *from); diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 5a49caa5c9df..25e2841084e1 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -277,6 +277,7 @@ xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr) * Directory address space divided into sections, * spaces separated by 32GB. */ +#define XFS_DIR2_MAX_SPACES 3 #define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG)) #define XFS_DIR2_DATA_SPACE 0 #define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE) @@ -688,10 +689,10 @@ struct xfs_attr3_leafblock { #define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ #define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */ #define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */ -#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT) -#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT) -#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT) -#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT) +#define XFS_ATTR_LOCAL (1u << XFS_ATTR_LOCAL_BIT) +#define XFS_ATTR_ROOT (1u << XFS_ATTR_ROOT_BIT) +#define XFS_ATTR_SECURE (1u << XFS_ATTR_SECURE_BIT) +#define XFS_ATTR_INCOMPLETE (1u << XFS_ATTR_INCOMPLETE_BIT) #define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE) /* diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 0805ade2d300..5a321b783398 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -22,6 +22,10 @@ #include "xfs_refcount.h" #include "xfs_bmap.h" #include "xfs_alloc.h" +#include "xfs_buf.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" static struct kmem_cache *xfs_defer_pending_cache; @@ -184,36 +188,61 @@ static const struct xfs_defer_op_type *defer_op_types[] = { [XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type, [XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type, [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type, + [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type, }; -static void +/* + * Ensure there's a log intent item associated with this deferred work item if + * the operation must be restarted on crash. Returns 1 if there's a log item; + * 0 if there isn't; or a negative errno. + */ +static int xfs_defer_create_intent( struct xfs_trans *tp, struct xfs_defer_pending *dfp, bool sort) { const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + struct xfs_log_item *lip; - if (!dfp->dfp_intent) - dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work, - dfp->dfp_count, sort); + if (dfp->dfp_intent) + return 1; + + lip = ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, sort); + if (!lip) + return 0; + if (IS_ERR(lip)) + return PTR_ERR(lip); + + dfp->dfp_intent = lip; + return 1; } /* * For each pending item in the intake list, log its intent item and the * associated extents, then add the entire intake list to the end of * the pending list. + * + * Returns 1 if at least one log item was associated with the deferred work; + * 0 if there are no log items; or a negative errno. */ -STATIC void +static int xfs_defer_create_intents( struct xfs_trans *tp) { struct xfs_defer_pending *dfp; + int ret = 0; list_for_each_entry(dfp, &tp->t_dfops, dfp_list) { + int ret2; + trace_xfs_defer_create_intent(tp->t_mountp, dfp); - xfs_defer_create_intent(tp, dfp, true); + ret2 = xfs_defer_create_intent(tp, dfp, true); + if (ret2 < 0) + return ret2; + ret |= ret2; } + return ret; } /* Abort all the intents that were committed. */ @@ -449,6 +478,8 @@ xfs_defer_finish_one( dfp->dfp_count--; error = ops->finish_item(tp, dfp->dfp_done, li, &state); if (error == -EAGAIN) { + int ret; + /* * Caller wants a fresh transaction; put the work item * back on the list and log a new log intent item to @@ -459,7 +490,9 @@ xfs_defer_finish_one( dfp->dfp_count++; dfp->dfp_done = NULL; dfp->dfp_intent = NULL; - xfs_defer_create_intent(tp, dfp, false); + ret = xfs_defer_create_intent(tp, dfp, false); + if (ret < 0) + error = ret; } if (error) @@ -487,7 +520,7 @@ int xfs_defer_finish_noroll( struct xfs_trans **tp) { - struct xfs_defer_pending *dfp; + struct xfs_defer_pending *dfp = NULL; int error = 0; LIST_HEAD(dop_pending); @@ -506,17 +539,24 @@ xfs_defer_finish_noroll( * of time that any one intent item can stick around in memory, * pinning the log tail. */ - xfs_defer_create_intents(*tp); - list_splice_init(&(*tp)->t_dfops, &dop_pending); + int has_intents = xfs_defer_create_intents(*tp); - error = xfs_defer_trans_roll(tp); - if (error) - goto out_shutdown; + list_splice_init(&(*tp)->t_dfops, &dop_pending); - /* Possibly relog intent items to keep the log moving. */ - error = xfs_defer_relog(tp, &dop_pending); - if (error) + if (has_intents < 0) { + error = has_intents; goto out_shutdown; + } + if (has_intents || dfp) { + error = xfs_defer_trans_roll(tp); + if (error) + goto out_shutdown; + + /* Relog intent items to keep the log moving. */ + error = xfs_defer_relog(tp, &dop_pending); + if (error) + goto out_shutdown; + } dfp = list_first_entry(&dop_pending, struct xfs_defer_pending, dfp_list); @@ -665,13 +705,15 @@ xfs_defer_ops_capture( if (list_empty(&tp->t_dfops)) return NULL; + error = xfs_defer_create_intents(tp); + if (error < 0) + return ERR_PTR(error); + /* Create an object to capture the defer ops. */ dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS); INIT_LIST_HEAD(&dfc->dfc_list); INIT_LIST_HEAD(&dfc->dfc_dfops); - xfs_defer_create_intents(tp); - /* Move the dfops chain and transaction state to the capture struct. */ list_splice_init(&tp->t_dfops, &dfc->dfc_dfops); dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE; @@ -748,6 +790,10 @@ xfs_defer_ops_capture_and_commit( /* If we don't capture anything, commit transaction and exit. */ dfc = xfs_defer_ops_capture(tp); + if (IS_ERR(dfc)) { + xfs_trans_cancel(tp); + return PTR_ERR(dfc); + } if (!dfc) return xfs_trans_commit(tp); @@ -774,17 +820,25 @@ xfs_defer_ops_continue( struct xfs_trans *tp, struct xfs_defer_resources *dres) { + unsigned int i; + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY)); - /* Lock and join the captured inode to the new transaction. */ + /* Lock the captured resources to the new transaction. */ if (dfc->dfc_held.dr_inos == 2) xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL, dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL); else if (dfc->dfc_held.dr_inos == 1) xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL); + + for (i = 0; i < dfc->dfc_held.dr_bufs; i++) + xfs_buf_lock(dfc->dfc_held.dr_bp[i]); + + /* Join the captured resources to the new transaction. */ xfs_defer_restore_resources(tp, &dfc->dfc_held); memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources)); + dres->dr_bufs = 0; /* Move captured dfops chain and state to the transaction. */ list_splice_init(&dfc->dfc_dfops, &tp->t_dfops); @@ -854,7 +908,9 @@ xfs_defer_init_item_caches(void) error = xfs_extfree_intent_init_cache(); if (error) goto err; - + error = xfs_attr_intent_init_cache(); + if (error) + goto err; return 0; err: xfs_defer_destroy_item_caches(); @@ -865,6 +921,7 @@ err: void xfs_defer_destroy_item_caches(void) { + xfs_attr_intent_destroy_cache(); xfs_extfree_intent_destroy_cache(); xfs_bmap_intent_destroy_cache(); xfs_refcount_intent_destroy_cache(); diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 7bb8a31ad65b..114a3a4930a3 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -19,6 +19,7 @@ enum xfs_defer_ops_type { XFS_DEFER_OPS_TYPE_RMAP, XFS_DEFER_OPS_TYPE_FREE, XFS_DEFER_OPS_TYPE_AGFL_FREE, + XFS_DEFER_OPS_TYPE_ATTR, XFS_DEFER_OPS_TYPE_MAX, }; @@ -63,6 +64,8 @@ extern const struct xfs_defer_op_type xfs_refcount_update_defer_type; extern const struct xfs_defer_op_type xfs_rmap_update_defer_type; extern const struct xfs_defer_op_type xfs_extent_free_defer_type; extern const struct xfs_defer_op_type xfs_agfl_free_defer_type; +extern const struct xfs_defer_op_type xfs_attr_defer_type; + /* * Deferred operation item relogging limits. diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 50546eadaae2..92bac3373f1f 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -19,7 +19,11 @@ #include "xfs_error.h" #include "xfs_trace.h" -struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR }; +const struct xfs_name xfs_name_dotdot = { + .name = (const unsigned char *)"..", + .len = 2, + .type = XFS_DIR3_FT_DIR, +}; /* * Convert inode mode to directory entry filetype @@ -54,10 +58,10 @@ xfs_mode_to_ftype( */ xfs_dahash_t xfs_ascii_ci_hashname( - struct xfs_name *name) + const struct xfs_name *name) { - xfs_dahash_t hash; - int i; + xfs_dahash_t hash; + int i; for (i = 0, hash = 0; i < name->len; i++) hash = tolower(name->name[i]) ^ rol32(hash, 7); @@ -146,6 +150,8 @@ xfs_da_mount( dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET); dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) / (uint)sizeof(xfs_da_node_entry_t); + dageo->max_extents = (XFS_DIR2_MAX_SPACES * XFS_DIR2_SPACE_SIZE) >> + mp->m_sb.sb_blocklog; dageo->magicpct = (dageo->blksize * 37) / 100; /* set up attribute geometry - single fsb only */ @@ -157,6 +163,12 @@ xfs_da_mount( dageo->node_hdr_size = mp->m_dir_geo->node_hdr_size; dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) / (uint)sizeof(xfs_da_node_entry_t); + + if (xfs_has_large_extent_counts(mp)) + dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_LARGE; + else + dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_SMALL; + dageo->magicpct = (dageo->blksize * 37) / 100; return 0; } @@ -181,7 +193,7 @@ xfs_dir_isempty( ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); if (dp->i_disk_size == 0) /* might happen during shutdown. */ return 1; - if (dp->i_disk_size > XFS_IFORK_DSIZE(dp)) + if (dp->i_disk_size > xfs_inode_data_fork_size(dp)) return 0; sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; return !sfp->count; @@ -243,13 +255,13 @@ int xfs_dir_createname( struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, + const struct xfs_name *name, xfs_ino_t inum, /* new entry inode number */ xfs_extlen_t total) /* bmap's total block count */ { struct xfs_da_args *args; int rval; - int v; /* type-checking value */ + bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); @@ -337,16 +349,16 @@ xfs_dir_cilookup_result( int xfs_dir_lookup( - xfs_trans_t *tp, - xfs_inode_t *dp, - struct xfs_name *name, - xfs_ino_t *inum, /* out: inode number */ - struct xfs_name *ci_name) /* out: actual name if CI match */ + struct xfs_trans *tp, + struct xfs_inode *dp, + const struct xfs_name *name, + xfs_ino_t *inum, /* out: inode number */ + struct xfs_name *ci_name) /* out: actual name if CI match */ { - struct xfs_da_args *args; - int rval; - int v; /* type-checking value */ - int lock_mode; + struct xfs_da_args *args; + int rval; + bool v; + int lock_mode; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_lookup); @@ -423,7 +435,7 @@ xfs_dir_removename( { struct xfs_da_args *args; int rval; - int v; /* type-checking value */ + bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_remove); @@ -475,13 +487,13 @@ int xfs_dir_replace( struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, /* name of entry to replace */ + const struct xfs_name *name, /* name of entry to replace */ xfs_ino_t inum, /* new inode number */ xfs_extlen_t total) /* bmap's total block count */ { struct xfs_da_args *args; int rval; - int v; /* type-checking value */ + bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); @@ -598,19 +610,23 @@ xfs_dir2_grow_inode( int xfs_dir2_isblock( struct xfs_da_args *args, - int *vp) /* out: 1 is block, 0 is not block */ + bool *isblock) { - xfs_fileoff_t last; /* last file offset */ - int rval; + struct xfs_mount *mp = args->dp->i_mount; + xfs_fileoff_t eof; + int error; - if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) - return rval; - rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize; - if (XFS_IS_CORRUPT(args->dp->i_mount, - rval != 0 && - args->dp->i_disk_size != args->geo->blksize)) + error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK); + if (error) + return error; + + *isblock = false; + if (XFS_FSB_TO_B(mp, eof) != args->geo->blksize) + return 0; + + *isblock = true; + if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize)) return -EFSCORRUPTED; - *vp = rval; return 0; } @@ -620,14 +636,20 @@ xfs_dir2_isblock( int xfs_dir2_isleaf( struct xfs_da_args *args, - int *vp) /* out: 1 is block, 0 is not block */ + bool *isleaf) { - xfs_fileoff_t last; /* last file offset */ - int rval; + xfs_fileoff_t eof; + int error; - if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) - return rval; - *vp = last == args->geo->leafblk + args->geo->fsbcount; + error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK); + if (error) + return error; + + *isleaf = false; + if (eof != args->geo->leafblk + args->geo->fsbcount) + return 0; + + *isleaf = true; return 0; } @@ -728,7 +750,7 @@ xfs_dir2_namecheck( xfs_dahash_t xfs_dir2_hashname( struct xfs_mount *mp, - struct xfs_name *name) + const struct xfs_name *name) { if (unlikely(xfs_has_asciici(mp))) return xfs_ascii_ci_hashname(name); diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index d03e6098ded9..dd39f17dd9a9 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -21,7 +21,7 @@ struct xfs_dir2_data_unused; struct xfs_dir3_icfree_hdr; struct xfs_dir3_icleaf_hdr; -extern struct xfs_name xfs_name_dotdot; +extern const struct xfs_name xfs_name_dotdot; /* * Convert inode mode to directory entry filetype @@ -39,16 +39,16 @@ extern int xfs_dir_isempty(struct xfs_inode *dp); extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_inode *pdp); extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, xfs_ino_t inum, + const struct xfs_name *name, xfs_ino_t inum, xfs_extlen_t tot); extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, xfs_ino_t *inum, + const struct xfs_name *name, xfs_ino_t *inum, struct xfs_name *ci_name); extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name, xfs_ino_t ino, xfs_extlen_t tot); extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, xfs_ino_t inum, + const struct xfs_name *name, xfs_ino_t inum, xfs_extlen_t tot); extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name); @@ -61,8 +61,8 @@ extern int xfs_dir2_sf_to_block(struct xfs_da_args *args); /* * Interface routines used by userspace utilities */ -extern int xfs_dir2_isblock(struct xfs_da_args *args, int *r); -extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r); +extern int xfs_dir2_isblock(struct xfs_da_args *args, bool *isblock); +extern int xfs_dir2_isleaf(struct xfs_da_args *args, bool *isleaf); extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, struct xfs_buf *bp); diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index df0869bba275..00f960a703b2 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -842,7 +842,7 @@ xfs_dir2_block_removename( * See if the size as a shortform is good enough. */ size = xfs_dir2_block_sfsize(dp, hdr, &sfh); - if (size > XFS_IFORK_DSIZE(dp)) + if (size > xfs_inode_data_fork_size(dp)) return 0; /* @@ -1055,7 +1055,7 @@ xfs_dir2_leaf_to_block( * Now see if the resulting block can be shrunken to shortform. */ size = xfs_dir2_block_sfsize(dp, hdr, &sfh); - if (size > XFS_IFORK_DSIZE(dp)) + if (size > xfs_inode_data_fork_size(dp)) return 0; return xfs_dir2_block_to_sf(args, dbp, size, &sfh); @@ -1071,7 +1071,7 @@ xfs_dir2_sf_to_block( struct xfs_trans *tp = args->trans; struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK); struct xfs_da_geometry *geo = args->geo; xfs_dir2_db_t blkno; /* dir-relative block # (0) */ xfs_dir2_data_hdr_t *hdr; /* block header */ diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index d9b66306a9a7..cb9e950a911d 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -146,6 +146,8 @@ xfs_dir3_leaf_check_int( xfs_dir2_leaf_tail_t *ltp; int stale; int i; + bool isleaf1 = (hdr->magic == XFS_DIR2_LEAF1_MAGIC || + hdr->magic == XFS_DIR3_LEAF1_MAGIC); ltp = xfs_dir2_leaf_tail_p(geo, leaf); @@ -158,8 +160,7 @@ xfs_dir3_leaf_check_int( return __this_address; /* Leaves and bests don't overlap in leaf format. */ - if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC || - hdr->magic == XFS_DIR3_LEAF1_MAGIC) && + if (isleaf1 && (char *)&hdr->ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp)) return __this_address; @@ -175,6 +176,10 @@ xfs_dir3_leaf_check_int( } if (hdr->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; + if (isleaf1 && xfs_dir2_dataptr_to_db(geo, + be32_to_cpu(hdr->ents[i].address)) >= + be32_to_cpu(ltp->bestcount)) + return __this_address; } if (hdr->stale != stale) return __this_address; diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 711709a2aa53..7404a9ff1a92 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -40,7 +40,7 @@ struct xfs_dir3_icfree_hdr { }; /* xfs_dir2.c */ -xfs_dahash_t xfs_ascii_ci_hashname(struct xfs_name *name); +xfs_dahash_t xfs_ascii_ci_hashname(const struct xfs_name *name); enum xfs_dacmp xfs_ascii_ci_compname(struct xfs_da_args *args, const unsigned char *name, int len); extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, @@ -201,7 +201,8 @@ xfs_dir2_data_entsize( return round_up(len, XFS_DIR2_DATA_ALIGN); } -xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp, struct xfs_name *name); +xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp, + const struct xfs_name *name); enum xfs_dacmp xfs_dir2_compname(struct xfs_da_args *args, const unsigned char *name, int len); diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 5a97a87eaa20..8cd37e6e9d38 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -237,7 +237,7 @@ xfs_dir2_block_sfsize( (i8count ? /* inumber */ count * XFS_INO64_SIZE : count * XFS_INO32_SIZE); - if (size > XFS_IFORK_DSIZE(dp)) + if (size > xfs_inode_data_fork_size(dp)) return size; /* size value is a failure */ } /* @@ -406,7 +406,7 @@ xfs_dir2_sf_addname( * Won't fit as shortform any more (due to size), * or the pick routine says it won't (due to offset values). */ - if (new_isize > XFS_IFORK_DSIZE(dp) || + if (new_isize > xfs_inode_data_fork_size(dp) || (pick = xfs_dir2_sf_addname_pick(args, objchange, &sfep, &offset)) == 0) { /* @@ -710,7 +710,7 @@ xfs_dir2_sf_verify( struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); struct xfs_dir2_sf_hdr *sfp; struct xfs_dir2_sf_entry *sfep; struct xfs_dir2_sf_entry *next_sfep; @@ -865,7 +865,6 @@ xfs_dir2_sf_lookup( struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; int i; /* entry index */ - int error; xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ enum xfs_dacmp cmp; /* comparison result */ @@ -929,8 +928,7 @@ xfs_dir2_sf_lookup( if (!ci_sfep) return -ENOENT; /* otherwise process the CI match as required by the caller */ - error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen); - return error; + return xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen); } /* @@ -1031,7 +1029,7 @@ xfs_dir2_sf_replace_needblock( newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF; return inum > XFS_DIR2_MAX_SHORT_INUM && - sfp->i8count == 0 && newsize > XFS_IFORK_DSIZE(dp); + sfp->i8count == 0 && newsize > xfs_inode_data_fork_size(dp); } /* diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index a23a52e643ad..5362908164b0 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -59,7 +59,10 @@ #define XFS_ERRTAG_REDUCE_MAX_IEXTENTS 36 #define XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT 37 #define XFS_ERRTAG_AG_RESV_FAIL 38 -#define XFS_ERRTAG_MAX 39 +#define XFS_ERRTAG_LARP 39 +#define XFS_ERRTAG_DA_LEAF_SPLIT 40 +#define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41 +#define XFS_ERRTAG_MAX 42 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -103,5 +106,8 @@ #define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1 #define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1 #define XFS_RANDOM_AG_RESV_FAIL 1 +#define XFS_RANDOM_LARP 1 +#define XFS_RANDOM_DA_LEAF_SPLIT 1 +#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index d665c04e69dd..371dc07233e0 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -372,12 +372,14 @@ xfs_sb_has_ro_compat_feature( #define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */ #define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */ #define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4) /* needs xfs_repair */ +#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */ #define XFS_SB_FEAT_INCOMPAT_ALL \ (XFS_SB_FEAT_INCOMPAT_FTYPE| \ XFS_SB_FEAT_INCOMPAT_SPINODES| \ XFS_SB_FEAT_INCOMPAT_META_UUID| \ XFS_SB_FEAT_INCOMPAT_BIGTIME| \ - XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR) + XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR| \ + XFS_SB_FEAT_INCOMPAT_NREXT64) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool @@ -388,7 +390,9 @@ xfs_sb_has_incompat_feature( return (sbp->sb_features_incompat & feature) != 0; } -#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0 +#define XFS_SB_FEAT_INCOMPAT_LOG_XATTRS (1 << 0) /* Delayed Attributes */ +#define XFS_SB_FEAT_INCOMPAT_LOG_ALL \ + (XFS_SB_FEAT_INCOMPAT_LOG_XATTRS) #define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL static inline bool xfs_sb_has_incompat_log_feature( @@ -413,6 +417,11 @@ xfs_sb_add_incompat_log_features( sbp->sb_features_log_incompat |= features; } +static inline bool xfs_sb_version_haslogxattrs(struct xfs_sb *sbp) +{ + return xfs_sb_is_v5(sbp) && (sbp->sb_features_log_incompat & + XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); +} static inline bool xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) @@ -525,26 +534,26 @@ typedef struct xfs_agf { #define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc) -#define XFS_AGF_MAGICNUM 0x00000001 -#define XFS_AGF_VERSIONNUM 0x00000002 -#define XFS_AGF_SEQNO 0x00000004 -#define XFS_AGF_LENGTH 0x00000008 -#define XFS_AGF_ROOTS 0x00000010 -#define XFS_AGF_LEVELS 0x00000020 -#define XFS_AGF_FLFIRST 0x00000040 -#define XFS_AGF_FLLAST 0x00000080 -#define XFS_AGF_FLCOUNT 0x00000100 -#define XFS_AGF_FREEBLKS 0x00000200 -#define XFS_AGF_LONGEST 0x00000400 -#define XFS_AGF_BTREEBLKS 0x00000800 -#define XFS_AGF_UUID 0x00001000 -#define XFS_AGF_RMAP_BLOCKS 0x00002000 -#define XFS_AGF_REFCOUNT_BLOCKS 0x00004000 -#define XFS_AGF_REFCOUNT_ROOT 0x00008000 -#define XFS_AGF_REFCOUNT_LEVEL 0x00010000 -#define XFS_AGF_SPARE64 0x00020000 +#define XFS_AGF_MAGICNUM (1u << 0) +#define XFS_AGF_VERSIONNUM (1u << 1) +#define XFS_AGF_SEQNO (1u << 2) +#define XFS_AGF_LENGTH (1u << 3) +#define XFS_AGF_ROOTS (1u << 4) +#define XFS_AGF_LEVELS (1u << 5) +#define XFS_AGF_FLFIRST (1u << 6) +#define XFS_AGF_FLLAST (1u << 7) +#define XFS_AGF_FLCOUNT (1u << 8) +#define XFS_AGF_FREEBLKS (1u << 9) +#define XFS_AGF_LONGEST (1u << 10) +#define XFS_AGF_BTREEBLKS (1u << 11) +#define XFS_AGF_UUID (1u << 12) +#define XFS_AGF_RMAP_BLOCKS (1u << 13) +#define XFS_AGF_REFCOUNT_BLOCKS (1u << 14) +#define XFS_AGF_REFCOUNT_ROOT (1u << 15) +#define XFS_AGF_REFCOUNT_LEVEL (1u << 16) +#define XFS_AGF_SPARE64 (1u << 17) #define XFS_AGF_NUM_BITS 18 -#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) +#define XFS_AGF_ALL_BITS ((1u << XFS_AGF_NUM_BITS) - 1) #define XFS_AGF_FLAGS \ { XFS_AGF_MAGICNUM, "MAGICNUM" }, \ @@ -619,22 +628,22 @@ typedef struct xfs_agi { #define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc) -#define XFS_AGI_MAGICNUM (1 << 0) -#define XFS_AGI_VERSIONNUM (1 << 1) -#define XFS_AGI_SEQNO (1 << 2) -#define XFS_AGI_LENGTH (1 << 3) -#define XFS_AGI_COUNT (1 << 4) -#define XFS_AGI_ROOT (1 << 5) -#define XFS_AGI_LEVEL (1 << 6) -#define XFS_AGI_FREECOUNT (1 << 7) -#define XFS_AGI_NEWINO (1 << 8) -#define XFS_AGI_DIRINO (1 << 9) -#define XFS_AGI_UNLINKED (1 << 10) +#define XFS_AGI_MAGICNUM (1u << 0) +#define XFS_AGI_VERSIONNUM (1u << 1) +#define XFS_AGI_SEQNO (1u << 2) +#define XFS_AGI_LENGTH (1u << 3) +#define XFS_AGI_COUNT (1u << 4) +#define XFS_AGI_ROOT (1u << 5) +#define XFS_AGI_LEVEL (1u << 6) +#define XFS_AGI_FREECOUNT (1u << 7) +#define XFS_AGI_NEWINO (1u << 8) +#define XFS_AGI_DIRINO (1u << 9) +#define XFS_AGI_UNLINKED (1u << 10) #define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */ -#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1) -#define XFS_AGI_FREE_ROOT (1 << 11) -#define XFS_AGI_FREE_LEVEL (1 << 12) -#define XFS_AGI_IBLOCKS (1 << 13) /* both inobt/finobt block counters */ +#define XFS_AGI_ALL_BITS_R1 ((1u << XFS_AGI_NUM_BITS_R1) - 1) +#define XFS_AGI_FREE_ROOT (1u << 11) +#define XFS_AGI_FREE_LEVEL (1u << 12) +#define XFS_AGI_IBLOCKS (1u << 13) /* both inobt/finobt block counters */ #define XFS_AGI_NUM_BITS_R2 14 /* disk block (xfs_daddr_t) in the AG */ @@ -695,7 +704,7 @@ struct xfs_agfl { * When the bigtime feature is enabled, ondisk inode timestamps become an * unsigned 64-bit nanoseconds counter. This means that the bigtime inode * timestamp epoch is the start of the classic timestamp range, which is - * Dec 31 20:45:52 UTC 1901. Because the epochs are not the same, callers + * Dec 13 20:45:52 UTC 1901. Because the epochs are not the same, callers * /must/ use the bigtime conversion functions when encoding and decoding raw * timestamps. */ @@ -791,16 +800,41 @@ struct xfs_dinode { __be32 di_nlink; /* number of links to file */ __be16 di_projid_lo; /* lower part of owner's project id */ __be16 di_projid_hi; /* higher part owner's project id */ - __u8 di_pad[6]; /* unused, zeroed space */ - __be16 di_flushiter; /* incremented on flush */ + union { + /* Number of data fork extents if NREXT64 is set */ + __be64 di_big_nextents; + + /* Padding for V3 inodes without NREXT64 set. */ + __be64 di_v3_pad; + + /* Padding and inode flush counter for V2 inodes. */ + struct { + __u8 di_v2_pad[6]; + __be16 di_flushiter; + }; + }; xfs_timestamp_t di_atime; /* time last accessed */ xfs_timestamp_t di_mtime; /* time last modified */ xfs_timestamp_t di_ctime; /* time created/inode modified */ __be64 di_size; /* number of bytes in file */ __be64 di_nblocks; /* # of direct & btree blocks used */ __be32 di_extsize; /* basic/minimum extent size for file */ - __be32 di_nextents; /* number of extents in data fork */ - __be16 di_anextents; /* number of extents in attribute fork*/ + union { + /* + * For V2 inodes and V3 inodes without NREXT64 set, this + * is the number of data and attr fork extents. + */ + struct { + __be32 di_nextents; + __be16 di_anextents; + } __packed; + + /* Number of attr fork extents if NREXT64 is set. */ + struct { + __be32 di_big_anextents; + __be16 di_nrext64_pad; + } __packed; + } __packed; __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */ __s8 di_aformat; /* format of attr fork's data */ __be32 di_dmevmask; /* DMIG event mask */ @@ -870,6 +904,56 @@ enum xfs_dinode_fmt { { XFS_DINODE_FMT_UUID, "uuid" } /* + * Max values for extnum and aextnum. + * + * The original on-disk extent counts were held in signed fields, resulting in + * maximum extent counts of 2^31 and 2^15 for the data and attr forks + * respectively. Similarly the maximum extent length is limited to 2^21 blocks + * by the 21-bit wide blockcount field of a BMBT extent record. + * + * The newly introduced data fork extent counter can hold a 64-bit value, + * however the maximum number of extents in a file is also limited to 2^54 + * extents by the 54-bit wide startoff field of a BMBT extent record. + * + * It is further limited by the maximum supported file size of 2^63 + * *bytes*. This leads to a maximum extent count for maximally sized filesystem + * blocks (64kB) of: + * + * 2^63 bytes / 2^16 bytes per block = 2^47 blocks + * + * Rounding up 47 to the nearest multiple of bits-per-byte results in 48. Hence + * 2^48 was chosen as the maximum data fork extent count. + * + * The maximum file size that can be represented by the data fork extent counter + * in the worst case occurs when all extents are 1 block in length and each + * block is 1KB in size. + * + * With XFS_MAX_EXTCNT_DATA_FORK_SMALL representing maximum extent count and + * with 1KB sized blocks, a file can reach upto, + * 1KB * (2^31) = 2TB + * + * This is much larger than the theoretical maximum size of a directory + * i.e. XFS_DIR2_SPACE_SIZE * XFS_DIR2_MAX_SPACES = ~96GB. + * + * Hence, a directory inode can never overflow its data fork extent counter. + */ +#define XFS_MAX_EXTCNT_DATA_FORK_LARGE ((xfs_extnum_t)((1ULL << 48) - 1)) +#define XFS_MAX_EXTCNT_ATTR_FORK_LARGE ((xfs_extnum_t)((1ULL << 32) - 1)) +#define XFS_MAX_EXTCNT_DATA_FORK_SMALL ((xfs_extnum_t)((1ULL << 31) - 1)) +#define XFS_MAX_EXTCNT_ATTR_FORK_SMALL ((xfs_extnum_t)((1ULL << 15) - 1)) + +/* + * When we upgrade an inode to the large extent counts, the maximum value by + * which the extent count can increase is bound by the change in size of the + * on-disk field. No upgrade operation should ever be adding more than a few + * tens of extents, so if we get a really large value it is a sign of a code bug + * or corruption. + */ +#define XFS_MAX_EXTCNT_UPGRADE_NR \ + min(XFS_MAX_EXTCNT_ATTR_FORK_LARGE - XFS_MAX_EXTCNT_ATTR_FORK_SMALL, \ + XFS_MAX_EXTCNT_DATA_FORK_LARGE - XFS_MAX_EXTCNT_DATA_FORK_SMALL) + +/* * Inode minimum and maximum sizes. */ #define XFS_DINODE_MIN_LOG 8 @@ -918,10 +1002,6 @@ enum xfs_dinode_fmt { ((w) == XFS_DATA_FORK ? \ (dip)->di_format : \ (dip)->di_aformat) -#define XFS_DFORK_NEXTENTS(dip,w) \ - ((w) == XFS_DATA_FORK ? \ - be32_to_cpu((dip)->di_nextents) : \ - be16_to_cpu((dip)->di_anextents)) /* * For block and character special files the 32bit dev_t is stored at the @@ -988,15 +1068,17 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */ #define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ #define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */ +#define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */ #define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) #define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT) #define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) #define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT) +#define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT) #define XFS_DIFLAG2_ANY \ (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \ - XFS_DIFLAG2_BIGTIME) + XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64) static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) { @@ -1004,6 +1086,13 @@ static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_BIGTIME)); } +static inline bool xfs_dinode_has_large_extent_counts( + const struct xfs_dinode *dip) +{ + return dip->di_version >= 3 && + (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_NREXT64)); +} + /* * Inode number format: * low inopblog bits - offset in block @@ -1085,10 +1174,10 @@ static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) #define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ #define XFS_DQUOT_VERSION (uint8_t)0x01 /* latest version number */ -#define XFS_DQTYPE_USER 0x01 /* user dquot record */ -#define XFS_DQTYPE_PROJ 0x02 /* project dquot record */ -#define XFS_DQTYPE_GROUP 0x04 /* group dquot record */ -#define XFS_DQTYPE_BIGTIME 0x80 /* large expiry timestamps */ +#define XFS_DQTYPE_USER (1u << 0) /* user dquot record */ +#define XFS_DQTYPE_PROJ (1u << 1) /* project dquot record */ +#define XFS_DQTYPE_GROUP (1u << 2) /* group dquot record */ +#define XFS_DQTYPE_BIGTIME (1u << 7) /* large expiry timestamps */ /* bitmask to determine if this is a user/group/project dquot */ #define XFS_DQTYPE_REC_MASK (XFS_DQTYPE_USER | \ @@ -1475,20 +1564,6 @@ struct xfs_rmap_rec { #define RMAPBT_UNUSED_OFFSET_BITLEN 7 #define RMAPBT_OFFSET_BITLEN 54 -#define XFS_RMAP_ATTR_FORK (1 << 0) -#define XFS_RMAP_BMBT_BLOCK (1 << 1) -#define XFS_RMAP_UNWRITTEN (1 << 2) -#define XFS_RMAP_KEY_FLAGS (XFS_RMAP_ATTR_FORK | \ - XFS_RMAP_BMBT_BLOCK) -#define XFS_RMAP_REC_FLAGS (XFS_RMAP_UNWRITTEN) -struct xfs_rmap_irec { - xfs_agblock_t rm_startblock; /* extent start block */ - xfs_extlen_t rm_blockcount; /* extent length */ - uint64_t rm_owner; /* extent owner */ - uint64_t rm_offset; /* offset within the owner */ - unsigned int rm_flags; /* state flags */ -}; - /* * Key structure * @@ -1537,7 +1612,7 @@ unsigned int xfs_refc_block(struct xfs_mount *mp); * on the startblock. This speeds up mount time deletion of stale * staging extents because they're all at the right side of the tree. */ -#define XFS_REFC_COW_START ((xfs_agblock_t)(1U << 31)) +#define XFS_REFC_COWFLAG (1U << 31) #define REFCNTBT_COWFLAG_BITLEN 1 #define REFCNTBT_AGBLOCK_BITLEN 31 @@ -1551,12 +1626,6 @@ struct xfs_refcount_key { __be32 rc_startblock; /* starting block number */ }; -struct xfs_refcount_irec { - xfs_agblock_t rc_startblock; /* starting block number */ - xfs_extlen_t rc_blockcount; /* count of free blocks */ - xfs_nlink_t rc_refcount; /* number of inodes linked here */ -}; - #define MAXREFCOUNT ((xfs_nlink_t)~0U) #define MAXREFCEXTLEN ((xfs_extlen_t)~0U) @@ -1596,6 +1665,8 @@ typedef struct xfs_bmdr_block { #define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1) #define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1) +#define XFS_MAX_BMBT_EXTLEN ((xfs_extlen_t)(BMBT_BLOCKCOUNT_MASK)) + /* * bmbt records have a file offset (block) field that is 54 bits wide, so this * is the largest xfs_fileoff_t that we ever expect to see. diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index c43877c8a279..1cfd5bc6520a 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -93,21 +93,6 @@ struct getbmapx { #define XFS_FMR_OWN_DEFECTIVE FMR_OWNER('X', 8) /* bad blocks */ /* - * Structure for XFS_IOC_FSSETDM. - * For use by backup and restore programs to set the XFS on-disk inode - * fields di_dmevmask and di_dmstate. These must be set to exactly and - * only values previously obtained via xfs_bulkstat! (Specifically the - * struct xfs_bstat fields bs_dmevmask and bs_dmstate.) - */ -#ifndef HAVE_FSDMIDATA -struct fsdmidata { - __u32 fsd_dmevmask; /* corresponds to di_dmevmask */ - __u16 fsd_padding; - __u16 fsd_dmstate; /* corresponds to di_dmstate */ -}; -#endif - -/* * File segment locking set data type for 64 bit access. * Also used for all the RESV/FREE interfaces. */ @@ -251,6 +236,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */ #define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */ #define XFS_FSOP_GEOM_FLAGS_INOBTCNT (1 << 22) /* inobt btree counter */ +#define XFS_FSOP_GEOM_FLAGS_NREXT64 (1 << 23) /* large extent counters */ /* * Minimum and maximum sizes need for growth checks. @@ -392,7 +378,7 @@ struct xfs_bulkstat { uint32_t bs_extsize_blks; /* extent size hint, blocks */ uint32_t bs_nlink; /* number of links */ - uint32_t bs_extents; /* number of extents */ + uint32_t bs_extents; /* 32-bit data fork extent counter */ uint32_t bs_aextents; /* attribute number of extents */ uint16_t bs_version; /* structure version */ uint16_t bs_forkoff; /* inode fork offset in bytes */ @@ -401,8 +387,9 @@ struct xfs_bulkstat { uint16_t bs_checked; /* checked inode metadata */ uint16_t bs_mode; /* type and mode */ uint16_t bs_pad2; /* zeroed */ + uint64_t bs_extents64; /* 64-bit data fork extent counter */ - uint64_t bs_pad[7]; /* zeroed */ + uint64_t bs_pad[6]; /* zeroed */ }; #define XFS_BULKSTAT_VERSION_V1 (1) @@ -474,17 +461,28 @@ struct xfs_bulk_ireq { * Only return results from the specified @agno. If @ino is zero, start * with the first inode of @agno. */ -#define XFS_BULK_IREQ_AGNO (1 << 0) +#define XFS_BULK_IREQ_AGNO (1U << 0) /* * Return bulkstat information for a single inode, where @ino value is a * special value, not a literal inode number. See the XFS_BULK_IREQ_SPECIAL_* * values below. Not compatible with XFS_BULK_IREQ_AGNO. */ -#define XFS_BULK_IREQ_SPECIAL (1 << 1) +#define XFS_BULK_IREQ_SPECIAL (1U << 1) -#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \ - XFS_BULK_IREQ_SPECIAL) +/* + * Return data fork extent count via xfs_bulkstat->bs_extents64 field and assign + * 0 to xfs_bulkstat->bs_extents when the flag is set. Otherwise, use + * xfs_bulkstat->bs_extents for returning data fork extent count and set + * xfs_bulkstat->bs_extents64 to 0. In the second case, return -EOVERFLOW and + * assign 0 to xfs_bulkstat->bs_extents if data fork extent count is larger than + * XFS_MAX_EXTCNT_DATA_FORK_OLD. + */ +#define XFS_BULK_IREQ_NREXT64 (1U << 2) + +#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \ + XFS_BULK_IREQ_SPECIAL | \ + XFS_BULK_IREQ_NREXT64) /* Operate on the root directory inode. */ #define XFS_BULK_IREQ_SPECIAL_ROOT (1) @@ -562,16 +560,10 @@ typedef struct xfs_fsop_handlereq { /* * Compound structures for passing args through Handle Request interfaces - * xfs_fssetdm_by_handle, xfs_attrlist_by_handle, xfs_attrmulti_by_handle - * - ioctls: XFS_IOC_FSSETDM_BY_HANDLE, XFS_IOC_ATTRLIST_BY_HANDLE, and - * XFS_IOC_ATTRMULTI_BY_HANDLE + * xfs_attrlist_by_handle, xfs_attrmulti_by_handle + * - ioctls: XFS_IOC_ATTRLIST_BY_HANDLE, and XFS_IOC_ATTRMULTI_BY_HANDLE */ -typedef struct xfs_fsop_setdm_handlereq { - struct xfs_fsop_handlereq hreq; /* handle information */ - struct fsdmidata __user *data; /* DMAPI data */ -} xfs_fsop_setdm_handlereq_t; - /* * Flags passed in xfs_attr_multiop.am_flags for the attr ioctl interface. * @@ -720,34 +712,34 @@ struct xfs_scrub_metadata { #define XFS_SCRUB_TYPE_NR 25 /* i: Repair this metadata. */ -#define XFS_SCRUB_IFLAG_REPAIR (1 << 0) +#define XFS_SCRUB_IFLAG_REPAIR (1u << 0) /* o: Metadata object needs repair. */ -#define XFS_SCRUB_OFLAG_CORRUPT (1 << 1) +#define XFS_SCRUB_OFLAG_CORRUPT (1u << 1) /* * o: Metadata object could be optimized. It's not corrupt, but * we could improve on it somehow. */ -#define XFS_SCRUB_OFLAG_PREEN (1 << 2) +#define XFS_SCRUB_OFLAG_PREEN (1u << 2) /* o: Cross-referencing failed. */ -#define XFS_SCRUB_OFLAG_XFAIL (1 << 3) +#define XFS_SCRUB_OFLAG_XFAIL (1u << 3) /* o: Metadata object disagrees with cross-referenced metadata. */ -#define XFS_SCRUB_OFLAG_XCORRUPT (1 << 4) +#define XFS_SCRUB_OFLAG_XCORRUPT (1u << 4) /* o: Scan was not complete. */ -#define XFS_SCRUB_OFLAG_INCOMPLETE (1 << 5) +#define XFS_SCRUB_OFLAG_INCOMPLETE (1u << 5) /* o: Metadata object looked funny but isn't corrupt. */ -#define XFS_SCRUB_OFLAG_WARNING (1 << 6) +#define XFS_SCRUB_OFLAG_WARNING (1u << 6) /* * o: IFLAG_REPAIR was set but metadata object did not need fixing or * optimization and has therefore not been altered. */ -#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1 << 7) +#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1u << 7) #define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR) #define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \ @@ -781,15 +773,15 @@ struct xfs_scrub_metadata { * For 'documentation' purposed more than anything else, * the "cmd #" field reflects the IRIX fcntl number. */ -#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64) -#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64) +/* XFS_IOC_ALLOCSP ------- deprecated 10 */ +/* XFS_IOC_FREESP -------- deprecated 11 */ #define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr) #define XFS_IOC_FSGETXATTR FS_IOC_FSGETXATTR #define XFS_IOC_FSSETXATTR FS_IOC_FSSETXATTR -#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) -#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) +/* XFS_IOC_ALLOCSP64 ----- deprecated 36 */ +/* XFS_IOC_FREESP64 ------ deprecated 37 */ #define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap) -#define XFS_IOC_FSSETDM _IOW ('X', 39, struct fsdmidata) +/* XFS_IOC_FSSETDM ------- deprecated 39 */ #define XFS_IOC_RESVSP _IOW ('X', 40, struct xfs_flock64) #define XFS_IOC_UNRESVSP _IOW ('X', 41, struct xfs_flock64) #define XFS_IOC_RESVSP64 _IOW ('X', 42, struct xfs_flock64) @@ -831,7 +823,7 @@ struct xfs_scrub_metadata { #define XFS_IOC_FREEZE _IOWR('X', 119, int) /* aka FIFREEZE */ #define XFS_IOC_THAW _IOWR('X', 120, int) /* aka FITHAW */ -#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) +/* XFS_IOC_FSSETDM_BY_HANDLE -- deprecated 121 */ #define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) #define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq) #define XFS_IOC_FSGEOMETRY_V4 _IOR ('X', 124, struct xfs_fsop_geom_v4) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index b418fe0c0679..94db50eb706a 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -105,7 +105,6 @@ xfs_inobt_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno; union xfs_btree_rec *rec; int error; uint64_t realfree; @@ -116,7 +115,7 @@ xfs_inobt_get_rec( xfs_inobt_btrec_to_irec(mp, rec, irec); - if (!xfs_verify_agino(mp, agno, irec->ir_startino)) + if (!xfs_verify_agino(cur->bc_ag.pag, irec->ir_startino)) goto out_bad_rec; if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT || irec->ir_count > XFS_INODES_PER_CHUNK) @@ -137,7 +136,8 @@ xfs_inobt_get_rec( out_bad_rec: xfs_warn(mp, "%s Inode BTree record corruption in AG %d detected!", - cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free", agno); + cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free", + cur->bc_ag.pag->pag_agno); xfs_warn(mp, "start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x", irec->ir_startino, irec->ir_count, irec->ir_freecount, @@ -636,7 +636,7 @@ xfs_ialloc_ag_alloc( /* randomly do sparse inode allocations */ if (xfs_has_sparseinodes(tp->t_mountp) && igeo->ialloc_min_blks < igeo->ialloc_blks) - do_sparse = prandom_u32() & 1; + do_sparse = prandom_u32_max(2); #endif /* @@ -805,7 +805,7 @@ sparse_alloc: * number from being easily guessable. */ error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag->pag_agno, - args.agbno, args.len, prandom_u32()); + args.agbno, args.len, get_random_u32()); if (error) return error; @@ -1610,7 +1610,7 @@ xfs_dialloc_good_ag( return false; if (!pag->pagi_init) { - error = xfs_ialloc_pagi_init(mp, tp, pag->pag_agno); + error = xfs_ialloc_read_agi(pag, tp, NULL); if (error) return false; } @@ -1621,7 +1621,7 @@ xfs_dialloc_good_ag( return false; if (!pag->pagf_init) { - error = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, flags); + error = xfs_alloc_read_agf(pag, tp, flags, NULL); if (error) return false; } @@ -1679,7 +1679,7 @@ xfs_dialloc_try_ag( * Then read in the AGI buffer and recheck with the AGI buffer * lock held. */ - error = xfs_ialloc_read_agi(pag->pag_mount, *tpp, pag->pag_agno, &agbp); + error = xfs_ialloc_read_agi(pag, *tpp, &agbp); if (error) return error; @@ -2169,7 +2169,7 @@ xfs_difree( /* * Get the allocation group header. */ - error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, &agbp); + error = xfs_ialloc_read_agi(pag, tp, &agbp); if (error) { xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", __func__, error); @@ -2215,7 +2215,7 @@ xfs_imap_lookup( int error; int i; - error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, &agbp); + error = xfs_ialloc_read_agi(pag, tp, &agbp); if (error) { xfs_alert(mp, "%s: xfs_ialloc_read_agi() returned error %d, agno %d", @@ -2414,9 +2414,9 @@ out_drop: */ void xfs_ialloc_log_agi( - xfs_trans_t *tp, /* transaction pointer */ - struct xfs_buf *bp, /* allocation group header buffer */ - int fields) /* bitmask of fields to log */ + struct xfs_trans *tp, + struct xfs_buf *bp, + uint32_t fields) { int first; /* first byte number */ int last; /* last byte number */ @@ -2571,47 +2571,48 @@ const struct xfs_buf_ops xfs_agi_buf_ops = { */ int xfs_read_agi( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - struct xfs_buf **bpp) /* allocation group hdr buf */ + struct xfs_perag *pag, + struct xfs_trans *tp, + struct xfs_buf **agibpp) { + struct xfs_mount *mp = pag->pag_mount; int error; - trace_xfs_read_agi(mp, agno); + trace_xfs_read_agi(pag->pag_mount, pag->pag_agno); - ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops); + XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, agibpp, &xfs_agi_buf_ops); if (error) return error; if (tp) - xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_AGI_BUF); + xfs_trans_buf_set_type(tp, *agibpp, XFS_BLFT_AGI_BUF); - xfs_buf_set_ref(*bpp, XFS_AGI_REF); + xfs_buf_set_ref(*agibpp, XFS_AGI_REF); return 0; } +/* + * Read in the agi and initialise the per-ag data. If the caller supplies a + * @agibpp, return the locked AGI buffer to them, otherwise release it. + */ int xfs_ialloc_read_agi( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - struct xfs_buf **bpp) /* allocation group hdr buf */ + struct xfs_perag *pag, + struct xfs_trans *tp, + struct xfs_buf **agibpp) { - struct xfs_agi *agi; /* allocation group header */ - struct xfs_perag *pag; /* per allocation group data */ + struct xfs_buf *agibp; + struct xfs_agi *agi; int error; - trace_xfs_ialloc_read_agi(mp, agno); + trace_xfs_ialloc_read_agi(pag->pag_mount, pag->pag_agno); - error = xfs_read_agi(mp, tp, agno, bpp); + error = xfs_read_agi(pag, tp, &agibp); if (error) return error; - agi = (*bpp)->b_addr; - pag = (*bpp)->b_pag; + agi = agibp->b_addr; if (!pag->pagi_init) { pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); pag->pagi_count = be32_to_cpu(agi->agi_count); @@ -2623,27 +2624,11 @@ xfs_ialloc_read_agi( * we are in the middle of a forced shutdown. */ ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || - xfs_is_shutdown(mp)); - return 0; -} - -/* - * Read in the agi to initialise the per-ag data in the mount structure - */ -int -xfs_ialloc_pagi_init( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_agnumber_t agno) /* allocation group number */ -{ - struct xfs_buf *bp = NULL; - int error; - - error = xfs_ialloc_read_agi(mp, tp, agno, &bp); - if (error) - return error; - if (bp) - xfs_trans_brelse(tp, bp); + xfs_is_shutdown(pag->pag_mount)); + if (agibpp) + *agibpp = agibp; + else + xfs_trans_brelse(tp, agibp); return 0; } @@ -2772,6 +2757,8 @@ xfs_ialloc_setup_geometry( igeo->new_diflags2 = 0; if (xfs_has_bigtime(mp)) igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME; + if (xfs_has_large_extent_counts(mp)) + igeo->new_diflags2 |= XFS_DIFLAG2_NREXT64; /* Compute inode btree geometry. */ igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog; @@ -2910,8 +2897,7 @@ xfs_ialloc_calc_rootino( * allocation group, or very odd geometries created by old mkfs * versions on very small filesystems. */ - if (mp->m_sb.sb_logstart && - XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == 0) + if (xfs_ag_contains_log(mp, 0)) first_bno += mp->m_sb.sb_logblocks; /* diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 8b5c2b709022..9bbbca6ac4ed 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -60,27 +60,12 @@ void xfs_ialloc_log_agi( struct xfs_trans *tp, /* transaction pointer */ struct xfs_buf *bp, /* allocation group header buffer */ - int fields); /* bitmask of fields to log */ + uint32_t fields); /* bitmask of fields to log */ -/* - * Read in the allocation group header (inode allocation section) - */ -int /* error */ -xfs_ialloc_read_agi( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - struct xfs_buf **bpp); /* allocation group hdr buf */ - -/* - * Read in the allocation group header to initialise the per-ag data - * in the mount structure - */ -int -xfs_ialloc_pagi_init( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno); /* allocation group number */ +int xfs_read_agi(struct xfs_perag *pag, struct xfs_trans *tp, + struct xfs_buf **agibpp); +int xfs_ialloc_read_agi(struct xfs_perag *pag, struct xfs_trans *tp, + struct xfs_buf **agibpp); /* * Lookup a record by ino in the btree given by cur. @@ -102,8 +87,6 @@ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_agblock_t length, unsigned int gen); -int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, struct xfs_buf **bpp); union xfs_btree_rec; void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index b2ad2fdc40f5..8c83e265770c 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -683,10 +683,10 @@ xfs_inobt_rec_check_count( static xfs_extlen_t xfs_inobt_max_size( - struct xfs_mount *mp, - xfs_agnumber_t agno) + struct xfs_perag *pag) { - xfs_agblock_t agblocks = xfs_ag_block_count(mp, agno); + struct xfs_mount *mp = pag->pag_mount; + xfs_agblock_t agblocks = pag->block_count; /* Bail out if we're uninitialized, which can happen in mkfs. */ if (M_IGEO(mp)->inobt_mxr[0] == 0) @@ -697,8 +697,7 @@ xfs_inobt_max_size( * never be available for the kinds of things that would require btree * expansion. We therefore can pretend the space isn't there. */ - if (mp->m_sb.sb_logstart && - XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno) + if (xfs_ag_contains_log(mp, pag->pag_agno)) agblocks -= mp->m_sb.sb_logblocks; return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr, @@ -722,7 +721,7 @@ xfs_inobt_cur( ASSERT(*agi_bpp == NULL); ASSERT(*curpp == NULL); - error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, agi_bpp); + error = xfs_ialloc_read_agi(pag, tp, agi_bpp); if (error) return error; @@ -757,16 +756,15 @@ xfs_inobt_count_blocks( /* Read finobt block count from AGI header. */ static int xfs_finobt_read_blocks( - struct xfs_mount *mp, - struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_trans *tp, xfs_extlen_t *tree_blocks) { struct xfs_buf *agbp; struct xfs_agi *agi; int error; - error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, &agbp); + error = xfs_ialloc_read_agi(pag, tp, &agbp); if (error) return error; @@ -794,14 +792,14 @@ xfs_finobt_calc_reserves( return 0; if (xfs_has_inobtcounts(mp)) - error = xfs_finobt_read_blocks(mp, tp, pag, &tree_len); + error = xfs_finobt_read_blocks(pag, tp, &tree_len); else error = xfs_inobt_count_blocks(mp, tp, pag, XFS_BTNUM_FINO, &tree_len); if (error) return error; - *ask += xfs_inobt_max_size(mp, pag->pag_agno); + *ask += xfs_inobt_max_size(pag); *used += tree_len; return 0; } diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index cae9708c8587..758aacd8166b 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -10,6 +10,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_ag.h" #include "xfs_inode.h" #include "xfs_errortag.h" #include "xfs_error.h" @@ -41,14 +42,12 @@ xfs_inode_buf_verify( bool readahead) { struct xfs_mount *mp = bp->b_mount; - xfs_agnumber_t agno; int i; int ni; /* * Validate the magic number and version of every inode in the buffer */ - agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; for (i = 0; i < ni; i++) { struct xfs_dinode *dip; @@ -59,7 +58,7 @@ xfs_inode_buf_verify( unlinked_ino = be32_to_cpu(dip->di_next_unlinked); di_ok = xfs_verify_magic16(bp, dip->di_magic) && xfs_dinode_good_version(mp, dip->di_version) && - xfs_verify_agino_or_null(mp, agno, unlinked_ino); + xfs_verify_agino_or_null(bp->b_pag, unlinked_ino); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP))) { if (readahead) { @@ -178,7 +177,6 @@ xfs_inode_from_disk( xfs_failaddr_t fa; ASSERT(ip->i_cowfp == NULL); - ASSERT(ip->i_afp == NULL); fa = xfs_dinode_verify(ip->i_mount, ip->i_ino, from); if (fa) { @@ -230,7 +228,8 @@ xfs_inode_from_disk( ip->i_nblocks = be64_to_cpu(from->di_nblocks); ip->i_extsize = be32_to_cpu(from->di_extsize); ip->i_forkoff = from->di_forkoff; - ip->i_diflags = be16_to_cpu(from->di_flags); + ip->i_diflags = be16_to_cpu(from->di_flags); + ip->i_next_unlinked = be32_to_cpu(from->di_next_unlinked); if (from->di_dmevmask || from->di_dmstate) xfs_iflags_set(ip, XFS_IPRESERVE_DM_FIELDS); @@ -279,6 +278,25 @@ xfs_inode_to_disk_ts( return ts; } +static inline void +xfs_inode_to_disk_iext_counters( + struct xfs_inode *ip, + struct xfs_dinode *to) +{ + if (xfs_inode_has_large_extent_counts(ip)) { + to->di_big_nextents = cpu_to_be64(xfs_ifork_nextents(&ip->i_df)); + to->di_big_anextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_af)); + /* + * We might be upgrading the inode to use larger extent counters + * than was previously used. Hence zero the unused field. + */ + to->di_nrext64_pad = cpu_to_be16(0); + } else { + to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df)); + to->di_anextents = cpu_to_be16(xfs_ifork_nextents(&ip->i_af)); + } +} + void xfs_inode_to_disk( struct xfs_inode *ip, @@ -296,7 +314,6 @@ xfs_inode_to_disk( to->di_projid_lo = cpu_to_be16(ip->i_projid & 0xffff); to->di_projid_hi = cpu_to_be16(ip->i_projid >> 16); - memset(to->di_pad, 0, sizeof(to->di_pad)); to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime); to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime); to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime); @@ -307,10 +324,8 @@ xfs_inode_to_disk( to->di_size = cpu_to_be64(ip->i_disk_size); to->di_nblocks = cpu_to_be64(ip->i_nblocks); to->di_extsize = cpu_to_be32(ip->i_extsize); - to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df)); - to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp)); to->di_forkoff = ip->i_forkoff; - to->di_aformat = xfs_ifork_format(ip->i_afp); + to->di_aformat = xfs_ifork_format(&ip->i_af); to->di_flags = cpu_to_be16(ip->i_diflags); if (xfs_has_v3inodes(ip->i_mount)) { @@ -323,11 +338,14 @@ xfs_inode_to_disk( to->di_lsn = cpu_to_be64(lsn); memset(to->di_pad2, 0, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); - to->di_flushiter = 0; + to->di_v3_pad = 0; } else { to->di_version = 2; to->di_flushiter = cpu_to_be16(ip->i_flushiter); + memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad)); } + + xfs_inode_to_disk_iext_counters(ip, to); } static xfs_failaddr_t @@ -336,20 +354,40 @@ xfs_dinode_verify_fork( struct xfs_mount *mp, int whichfork) { - uint32_t di_nextents = XFS_DFORK_NEXTENTS(dip, whichfork); + xfs_extnum_t di_nextents; + xfs_extnum_t max_extents; + mode_t mode = be16_to_cpu(dip->di_mode); + uint32_t fork_size = XFS_DFORK_SIZE(dip, mp, whichfork); + uint32_t fork_format = XFS_DFORK_FORMAT(dip, whichfork); + + di_nextents = xfs_dfork_nextents(dip, whichfork); + + /* + * For fork types that can contain local data, check that the fork + * format matches the size of local data contained within the fork. + * + * For all types, check that when the size says the should be in extent + * or btree format, the inode isn't claiming it is in local format. + */ + if (whichfork == XFS_DATA_FORK) { + if (S_ISDIR(mode) || S_ISLNK(mode)) { + if (be64_to_cpu(dip->di_size) <= fork_size && + fork_format != XFS_DINODE_FMT_LOCAL) + return __this_address; + } - switch (XFS_DFORK_FORMAT(dip, whichfork)) { + if (be64_to_cpu(dip->di_size) > fork_size && + fork_format == XFS_DINODE_FMT_LOCAL) + return __this_address; + } + + switch (fork_format) { case XFS_DINODE_FMT_LOCAL: /* - * no local regular files yet + * No local regular files yet. */ - if (whichfork == XFS_DATA_FORK) { - if (S_ISREG(be16_to_cpu(dip->di_mode))) - return __this_address; - if (be64_to_cpu(dip->di_size) > - XFS_DFORK_SIZE(dip, mp, whichfork)) - return __this_address; - } + if (S_ISREG(mode) && whichfork == XFS_DATA_FORK) + return __this_address; if (di_nextents) return __this_address; break; @@ -358,12 +396,11 @@ xfs_dinode_verify_fork( return __this_address; break; case XFS_DINODE_FMT_BTREE: - if (whichfork == XFS_ATTR_FORK) { - if (di_nextents > MAXAEXTNUM) - return __this_address; - } else if (di_nextents > MAXEXTNUM) { + max_extents = xfs_iext_max_nextents( + xfs_dinode_has_large_extent_counts(dip), + whichfork); + if (di_nextents > max_extents) return __this_address; - } break; default: return __this_address; @@ -396,6 +433,24 @@ xfs_dinode_verify_forkoff( return NULL; } +static xfs_failaddr_t +xfs_dinode_verify_nrext64( + struct xfs_mount *mp, + struct xfs_dinode *dip) +{ + if (xfs_dinode_has_large_extent_counts(dip)) { + if (!xfs_has_large_extent_counts(mp)) + return __this_address; + if (dip->di_nrext64_pad != 0) + return __this_address; + } else if (dip->di_version >= 3) { + if (dip->di_v3_pad != 0) + return __this_address; + } + + return NULL; +} + xfs_failaddr_t xfs_dinode_verify( struct xfs_mount *mp, @@ -407,6 +462,9 @@ xfs_dinode_verify( uint16_t flags; uint64_t flags2; uint64_t di_size; + xfs_extnum_t nextents; + xfs_extnum_t naextents; + xfs_filblks_t nblocks; if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) return __this_address; @@ -437,10 +495,19 @@ xfs_dinode_verify( if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) return __this_address; + fa = xfs_dinode_verify_nrext64(mp, dip); + if (fa) + return fa; + + nextents = xfs_dfork_data_extents(dip); + naextents = xfs_dfork_attr_extents(dip); + nblocks = be64_to_cpu(dip->di_nblocks); + /* Fork checks carried over from xfs_iformat_fork */ - if (mode && - be32_to_cpu(dip->di_nextents) + be16_to_cpu(dip->di_anextents) > - be64_to_cpu(dip->di_nblocks)) + if (mode && nextents + naextents > nblocks) + return __this_address; + + if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents) return __this_address; if (mode && XFS_DFORK_BOFF(dip) > mp->m_sb.sb_inodesize) @@ -497,7 +564,7 @@ xfs_dinode_verify( default: return __this_address; } - if (dip->di_anextents) + if (naextents) return __this_address; } @@ -639,7 +706,7 @@ xfs_inode_validate_extsize( if (extsize_bytes % blocksize_bytes) return __this_address; - if (extsize > MAXEXTLEN) + if (extsize > XFS_MAX_BMBT_EXTLEN) return __this_address; if (!rt_flag && extsize > mp->m_sb.sb_agblocks / 2) @@ -696,7 +763,7 @@ xfs_inode_validate_cowextsize( if (cowextsize_bytes % mp->m_sb.sb_blocksize) return __this_address; - if (cowextsize > MAXEXTLEN) + if (cowextsize > XFS_MAX_BMBT_EXTLEN) return __this_address; if (cowextsize > mp->m_sb.sb_agblocks / 2) diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 9149f4f796fc..6b21760184d9 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -35,8 +35,8 @@ xfs_init_local_fork( const void *data, int64_t size) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); - int mem_size = size, real_size = 0; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + int mem_size = size; bool zero_terminate; /* @@ -50,8 +50,7 @@ xfs_init_local_fork( mem_size++; if (size) { - real_size = roundup(mem_size, 4); - ifp->if_u1.if_data = kmem_alloc(real_size, KM_NOFS); + ifp->if_u1.if_data = kmem_alloc(mem_size, KM_NOFS); memcpy(ifp->if_u1.if_data, data, size); if (zero_terminate) ifp->if_u1.if_data[size] = '\0'; @@ -79,7 +78,7 @@ xfs_iformat_local( */ if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { xfs_warn(ip->i_mount, - "corrupt inode %Lu (bad size %d for local fork, size = %zd).", + "corrupt inode %llu (bad size %d for local fork, size = %zd).", (unsigned long long) ip->i_ino, size, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); xfs_inode_verifier_error(ip, -EFSCORRUPTED, @@ -103,9 +102,9 @@ xfs_iformat_extents( int whichfork) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); int state = xfs_bmap_fork_to_state(whichfork); - int nex = XFS_DFORK_NEXTENTS(dip, whichfork); + xfs_extnum_t nex = xfs_dfork_nextents(dip, whichfork); int size = nex * sizeof(xfs_bmbt_rec_t); struct xfs_iext_cursor icur; struct xfs_bmbt_rec *dp; @@ -117,8 +116,8 @@ xfs_iformat_extents( * we just bail out rather than crash in kmem_alloc() or memcpy() below. */ if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) { - xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).", - (unsigned long long) ip->i_ino, nex); + xfs_warn(ip->i_mount, "corrupt inode %llu ((a)extents = %llu).", + ip->i_ino, nex); xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iformat_extents(1)", dip, sizeof(*dip), __this_address); @@ -174,7 +173,7 @@ xfs_iformat_btree( int size; int level; - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); size = XFS_BMAP_BROOT_SPACE(mp, dfp); nrecs = be16_to_cpu(dfp->bb_numrecs); @@ -193,7 +192,7 @@ xfs_iformat_btree( XFS_DFORK_SIZE(dip, mp, whichfork) || ifp->if_nextents > ip->i_nblocks) || level == 0 || level > XFS_BM_MAXLEVELS(mp, whichfork)) { - xfs_warn(mp, "corrupt inode %Lu (btree).", + xfs_warn(mp, "corrupt inode %llu (btree).", (unsigned long long) ip->i_ino); xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iformat_btree", dfp, size, @@ -230,7 +229,7 @@ xfs_iformat_data_fork( * depend on it. */ ip->i_df.if_format = dip->di_format; - ip->i_df.if_nextents = be32_to_cpu(dip->di_nextents); + ip->i_df.if_nextents = xfs_dfork_data_extents(dip); switch (inode->i_mode & S_IFMT) { case S_IFIFO: @@ -277,17 +276,23 @@ xfs_dfork_attr_shortform_size( return be16_to_cpu(atp->hdr.totsize); } -struct xfs_ifork * -xfs_ifork_alloc( +void +xfs_ifork_init_attr( + struct xfs_inode *ip, enum xfs_dinode_fmt format, xfs_extnum_t nextents) { - struct xfs_ifork *ifp; + ip->i_af.if_format = format; + ip->i_af.if_nextents = nextents; +} - ifp = kmem_cache_zalloc(xfs_ifork_cache, GFP_NOFS | __GFP_NOFAIL); - ifp->if_format = format; - ifp->if_nextents = nextents; - return ifp; +void +xfs_ifork_zap_attr( + struct xfs_inode *ip) +{ + xfs_idestroy_fork(&ip->i_af); + memset(&ip->i_af, 0, sizeof(struct xfs_ifork)); + ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS; } int @@ -295,16 +300,16 @@ xfs_iformat_attr_fork( struct xfs_inode *ip, struct xfs_dinode *dip) { + xfs_extnum_t naextents = xfs_dfork_attr_extents(dip); int error = 0; /* * Initialize the extent count early, as the per-format routines may * depend on it. */ - ip->i_afp = xfs_ifork_alloc(dip->di_aformat, - be16_to_cpu(dip->di_anextents)); + xfs_ifork_init_attr(ip, dip->di_aformat, naextents); - switch (ip->i_afp->if_format) { + switch (ip->i_af.if_format) { case XFS_DINODE_FMT_LOCAL: error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, xfs_dfork_attr_shortform_size(dip)); @@ -324,10 +329,8 @@ xfs_iformat_attr_fork( break; } - if (error) { - kmem_cache_free(xfs_ifork_cache, ip->i_afp); - ip->i_afp = NULL; - } + if (error) + xfs_ifork_zap_attr(ip); return error; } @@ -371,7 +374,7 @@ xfs_iroot_realloc( return; } - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); if (rec_diff > 0) { /* * If there wasn't any memory allocated before, just @@ -401,7 +404,7 @@ xfs_iroot_realloc( (int)new_size); ifp->if_broot_bytes = (int)new_size; ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= - XFS_IFORK_SIZE(ip, whichfork)); + xfs_inode_fork_size(ip, whichfork)); memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t)); return; } @@ -455,7 +458,7 @@ xfs_iroot_realloc( ifp->if_broot_bytes = (int)new_size; if (ifp->if_broot) ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= - XFS_IFORK_SIZE(ip, whichfork)); + xfs_inode_fork_size(ip, whichfork)); return; } @@ -481,11 +484,11 @@ xfs_idata_realloc( int64_t byte_diff, int whichfork) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); int64_t new_size = ifp->if_bytes + byte_diff; ASSERT(new_size >= 0); - ASSERT(new_size <= XFS_IFORK_SIZE(ip, whichfork)); + ASSERT(new_size <= xfs_inode_fork_size(ip, whichfork)); if (byte_diff == 0) return; @@ -497,12 +500,7 @@ xfs_idata_realloc( return; } - /* - * For inline data, the underlying buffer must be a multiple of 4 bytes - * in size so that it can be logged and stay on word boundaries. - * We enforce that here. - */ - ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, roundup(new_size, 4), + ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, new_size, GFP_NOFS | __GFP_NOFAIL); ifp->if_bytes = new_size; } @@ -545,7 +543,7 @@ xfs_iextents_copy( int whichfork) { int state = xfs_bmap_fork_to_state(whichfork); - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_iext_cursor icur; struct xfs_bmbt_irec rec; int64_t copied = 0; @@ -597,7 +595,7 @@ xfs_iflush_fork( if (!iip) return; - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); /* * This can happen if we gave up in iformat in an error path, * for the attribute fork. @@ -613,7 +611,7 @@ xfs_iflush_fork( if ((iip->ili_fields & dataflag[whichfork]) && (ifp->if_bytes > 0)) { ASSERT(ifp->if_u1.if_data != NULL); - ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); + ASSERT(ifp->if_bytes <= xfs_inode_fork_size(ip, whichfork)); memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); } break; @@ -632,7 +630,7 @@ xfs_iflush_fork( (ifp->if_broot_bytes > 0)) { ASSERT(ifp->if_broot != NULL); ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= - XFS_IFORK_SIZE(ip, whichfork)); + xfs_inode_fork_size(ip, whichfork)); xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, (xfs_bmdr_block_t *)cp, XFS_DFORK_SIZE(dip, mp, whichfork)); @@ -662,7 +660,7 @@ xfs_iext_state_to_fork( if (state & BMAP_COWFORK) return ip->i_cowfp; else if (state & BMAP_ATTRFORK) - return ip->i_afp; + return &ip->i_af; return &ip->i_df; } @@ -713,18 +711,17 @@ int xfs_ifork_verify_local_attr( struct xfs_inode *ip) { - struct xfs_ifork *ifp = ip->i_afp; + struct xfs_ifork *ifp = &ip->i_af; xfs_failaddr_t fa; - if (!ifp) + if (!xfs_inode_has_attr_fork(ip)) fa = __this_address; else fa = xfs_attr_shortform_verify(ip); if (fa) { xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork", - ifp ? ifp->if_u1.if_data : NULL, - ifp ? ifp->if_bytes : 0, fa); + ifp->if_u1.if_data, ifp->if_bytes, fa); return -EFSCORRUPTED; } @@ -737,14 +734,15 @@ xfs_iext_count_may_overflow( int whichfork, int nr_to_add) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); uint64_t max_exts; uint64_t nr_exts; if (whichfork == XFS_COW_FORK) return 0; - max_exts = (whichfork == XFS_ATTR_FORK) ? MAXAEXTNUM : MAXEXTNUM; + max_exts = xfs_iext_max_nextents(xfs_inode_has_large_extent_counts(ip), + whichfork); if (XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS)) max_exts = 10; @@ -755,3 +753,27 @@ xfs_iext_count_may_overflow( return 0; } + +/* + * Upgrade this inode's extent counter fields to be able to handle a potential + * increase in the extent count by nr_to_add. Normally this is the same + * quantity that caused xfs_iext_count_may_overflow() to return -EFBIG. + */ +int +xfs_iext_count_upgrade( + struct xfs_trans *tp, + struct xfs_inode *ip, + uint nr_to_add) +{ + ASSERT(nr_to_add <= XFS_MAX_EXTCNT_UPGRADE_NR); + + if (!xfs_has_large_extent_counts(ip->i_mount) || + xfs_inode_has_large_extent_counts(ip) || + XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS)) + return -EFBIG; + + ip->i_diflags2 |= XFS_DIFLAG2_NREXT64; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + return 0; +} diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 3d64a3acb0ed..d3943d6ad0b9 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -21,9 +21,9 @@ struct xfs_ifork { void *if_root; /* extent tree root */ char *if_data; /* inline file data */ } if_u1; + xfs_extnum_t if_nextents; /* # of extents in this fork */ short if_broot_bytes; /* bytes allocated for root */ int8_t if_format; /* format of this fork */ - xfs_extnum_t if_nextents; /* # of extents in this fork */ }; /* @@ -40,19 +40,6 @@ struct xfs_ifork { #define XFS_IEXT_PUNCH_HOLE_CNT (1) /* - * Directory entry addition can cause the following, - * 1. Data block can be added/removed. - * A new extent can cause extent count to increase by 1. - * 2. Free disk block can be added/removed. - * Same behaviour as described above for Data block. - * 3. Dabtree blocks. - * XFS_DA_NODE_MAXDEPTH blocks can be added. Each of these can be new - * extents. Hence extent count can increase by XFS_DA_NODE_MAXDEPTH. - */ -#define XFS_IEXT_DIR_MANIP_CNT(mp) \ - ((XFS_DA_NODE_MAXDEPTH + 1 + 1) * (mp)->m_dir_geo->fsbcount) - -/* * Adding/removing an xattr can cause XFS_DA_NODE_MAXDEPTH extents to * be added. One extra extent for dabtree in case a local attr is * large enough to cause a double split. It can also cause extent @@ -90,28 +77,8 @@ struct xfs_ifork { /* * Fork handling. */ - -#define XFS_IFORK_Q(ip) ((ip)->i_forkoff != 0) -#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_forkoff << 3)) - -#define XFS_IFORK_PTR(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - &(ip)->i_df : \ - ((w) == XFS_ATTR_FORK ? \ - (ip)->i_afp : \ - (ip)->i_cowfp)) -#define XFS_IFORK_DSIZE(ip) \ - (XFS_IFORK_Q(ip) ? XFS_IFORK_BOFF(ip) : XFS_LITINO((ip)->i_mount)) -#define XFS_IFORK_ASIZE(ip) \ - (XFS_IFORK_Q(ip) ? XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : 0) -#define XFS_IFORK_SIZE(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - XFS_IFORK_DSIZE(ip) : \ - ((w) == XFS_ATTR_FORK ? \ - XFS_IFORK_ASIZE(ip) : \ - 0)) #define XFS_IFORK_MAXEXT(ip, w) \ - (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) + (xfs_inode_fork_size(ip, w) / sizeof(xfs_bmbt_rec_t)) static inline bool xfs_ifork_has_extents(struct xfs_ifork *ifp) { @@ -133,8 +100,68 @@ static inline int8_t xfs_ifork_format(struct xfs_ifork *ifp) return ifp->if_format; } -struct xfs_ifork *xfs_ifork_alloc(enum xfs_dinode_fmt format, - xfs_extnum_t nextents); +static inline xfs_extnum_t xfs_iext_max_nextents(bool has_large_extent_counts, + int whichfork) +{ + switch (whichfork) { + case XFS_DATA_FORK: + case XFS_COW_FORK: + if (has_large_extent_counts) + return XFS_MAX_EXTCNT_DATA_FORK_LARGE; + return XFS_MAX_EXTCNT_DATA_FORK_SMALL; + + case XFS_ATTR_FORK: + if (has_large_extent_counts) + return XFS_MAX_EXTCNT_ATTR_FORK_LARGE; + return XFS_MAX_EXTCNT_ATTR_FORK_SMALL; + + default: + ASSERT(0); + return 0; + } +} + +static inline xfs_extnum_t +xfs_dfork_data_extents( + struct xfs_dinode *dip) +{ + if (xfs_dinode_has_large_extent_counts(dip)) + return be64_to_cpu(dip->di_big_nextents); + + return be32_to_cpu(dip->di_nextents); +} + +static inline xfs_extnum_t +xfs_dfork_attr_extents( + struct xfs_dinode *dip) +{ + if (xfs_dinode_has_large_extent_counts(dip)) + return be32_to_cpu(dip->di_big_anextents); + + return be16_to_cpu(dip->di_anextents); +} + +static inline xfs_extnum_t +xfs_dfork_nextents( + struct xfs_dinode *dip, + int whichfork) +{ + switch (whichfork) { + case XFS_DATA_FORK: + return xfs_dfork_data_extents(dip); + case XFS_ATTR_FORK: + return xfs_dfork_attr_extents(dip); + default: + ASSERT(0); + break; + } + + return 0; +} + +void xfs_ifork_zap_attr(struct xfs_inode *ip); +void xfs_ifork_init_attr(struct xfs_inode *ip, enum xfs_dinode_fmt format, + xfs_extnum_t nextents); struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); int xfs_iformat_data_fork(struct xfs_inode *, struct xfs_dinode *); @@ -229,6 +256,8 @@ int xfs_ifork_verify_local_data(struct xfs_inode *ip); int xfs_ifork_verify_local_attr(struct xfs_inode *ip); int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork, int nr_to_add); +int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip, + uint nr_to_add); /* returns true if the fork has extents but they are not read in yet. */ static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index b322db523d65..f13e0809dc63 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -69,7 +69,6 @@ static inline uint xlog_get_cycle(char *ptr) /* Log Clients */ #define XFS_TRANSACTION 0x69 -#define XFS_VOLUME 0x2 #define XFS_LOG 0xaa #define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */ @@ -114,7 +113,12 @@ struct xfs_unmount_log_format { #define XLOG_REG_TYPE_CUD_FORMAT 24 #define XLOG_REG_TYPE_BUI_FORMAT 25 #define XLOG_REG_TYPE_BUD_FORMAT 26 -#define XLOG_REG_TYPE_MAX 26 +#define XLOG_REG_TYPE_ATTRI_FORMAT 27 +#define XLOG_REG_TYPE_ATTRD_FORMAT 28 +#define XLOG_REG_TYPE_ATTR_NAME 29 +#define XLOG_REG_TYPE_ATTR_VALUE 30 +#define XLOG_REG_TYPE_MAX 30 + /* * Flags to log operation header @@ -237,6 +241,8 @@ typedef struct xfs_trans_header { #define XFS_LI_CUD 0x1243 #define XFS_LI_BUI 0x1244 /* bmbt update intent */ #define XFS_LI_BUD 0x1245 +#define XFS_LI_ATTRI 0x1246 /* attr set/remove intent*/ +#define XFS_LI_ATTRD 0x1247 /* attr set/remove done */ #define XFS_LI_TYPE_DESC \ { XFS_LI_EFI, "XFS_LI_EFI" }, \ @@ -252,7 +258,9 @@ typedef struct xfs_trans_header { { XFS_LI_CUI, "XFS_LI_CUI" }, \ { XFS_LI_CUD, "XFS_LI_CUD" }, \ { XFS_LI_BUI, "XFS_LI_BUI" }, \ - { XFS_LI_BUD, "XFS_LI_BUD" } + { XFS_LI_BUD, "XFS_LI_BUD" }, \ + { XFS_LI_ATTRI, "XFS_LI_ATTRI" }, \ + { XFS_LI_ATTRD, "XFS_LI_ATTRD" } /* * Inode Log Item Format definitions. @@ -388,16 +396,41 @@ struct xfs_log_dinode { uint32_t di_nlink; /* number of links to file */ uint16_t di_projid_lo; /* lower part of owner's project id */ uint16_t di_projid_hi; /* higher part of owner's project id */ - uint8_t di_pad[6]; /* unused, zeroed space */ - uint16_t di_flushiter; /* incremented on flush */ + union { + /* Number of data fork extents if NREXT64 is set */ + uint64_t di_big_nextents; + + /* Padding for V3 inodes without NREXT64 set. */ + uint64_t di_v3_pad; + + /* Padding and inode flush counter for V2 inodes. */ + struct { + uint8_t di_v2_pad[6]; /* V2 inode zeroed space */ + uint16_t di_flushiter; /* V2 inode incremented on flush */ + }; + }; xfs_log_timestamp_t di_atime; /* time last accessed */ xfs_log_timestamp_t di_mtime; /* time last modified */ xfs_log_timestamp_t di_ctime; /* time created/inode modified */ xfs_fsize_t di_size; /* number of bytes in file */ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ - xfs_extnum_t di_nextents; /* number of extents in data fork */ - xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ + union { + /* + * For V2 inodes and V3 inodes without NREXT64 set, this + * is the number of data and attr fork extents. + */ + struct { + uint32_t di_nextents; + uint16_t di_anextents; + } __packed; + + /* Number of attr fork extents if NREXT64 is set. */ + struct { + uint32_t di_big_anextents; + uint16_t di_nrext64_pad; + } __packed; + } __packed; uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ int8_t di_aformat; /* format of attr fork's data */ uint32_t di_dmevmask; /* DMIG event mask */ @@ -580,25 +613,49 @@ typedef struct xfs_efi_log_format { uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_t efi_extents[1]; /* array of extents to free */ + xfs_extent_t efi_extents[]; /* array of extents to free */ } xfs_efi_log_format_t; +static inline size_t +xfs_efi_log_format_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efi_log_format) + + nr * sizeof(struct xfs_extent); +} + typedef struct xfs_efi_log_format_32 { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_32_t efi_extents[1]; /* array of extents to free */ + xfs_extent_32_t efi_extents[]; /* array of extents to free */ } __attribute__((packed)) xfs_efi_log_format_32_t; +static inline size_t +xfs_efi_log_format32_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efi_log_format_32) + + nr * sizeof(struct xfs_extent_32); +} + typedef struct xfs_efi_log_format_64 { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_64_t efi_extents[1]; /* array of extents to free */ + xfs_extent_64_t efi_extents[]; /* array of extents to free */ } xfs_efi_log_format_64_t; +static inline size_t +xfs_efi_log_format64_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efi_log_format_64) + + nr * sizeof(struct xfs_extent_64); +} + /* * This is the structure used to lay out an efd log item in the * log. The efd_extents array is a variable size array whose @@ -609,25 +666,49 @@ typedef struct xfs_efd_log_format { uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_t efd_extents[1]; /* array of extents freed */ + xfs_extent_t efd_extents[]; /* array of extents freed */ } xfs_efd_log_format_t; +static inline size_t +xfs_efd_log_format_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efd_log_format) + + nr * sizeof(struct xfs_extent); +} + typedef struct xfs_efd_log_format_32 { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_32_t efd_extents[1]; /* array of extents freed */ + xfs_extent_32_t efd_extents[]; /* array of extents freed */ } __attribute__((packed)) xfs_efd_log_format_32_t; +static inline size_t +xfs_efd_log_format32_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efd_log_format_32) + + nr * sizeof(struct xfs_extent_32); +} + typedef struct xfs_efd_log_format_64 { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_64_t efd_extents[1]; /* array of extents freed */ + xfs_extent_64_t efd_extents[]; /* array of extents freed */ } xfs_efd_log_format_64_t; +static inline size_t +xfs_efd_log_format64_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efd_log_format_64) + + nr * sizeof(struct xfs_extent_64); +} + /* * RUI/RUD (reverse mapping) log format definitions */ @@ -869,4 +950,44 @@ struct xfs_icreate_log { __be32 icl_gen; /* inode generation number to use */ }; +/* + * Flags for deferred attribute operations. + * Upper bits are flags, lower byte is type code + */ +#define XFS_ATTRI_OP_FLAGS_SET 1 /* Set the attribute */ +#define XFS_ATTRI_OP_FLAGS_REMOVE 2 /* Remove the attribute */ +#define XFS_ATTRI_OP_FLAGS_REPLACE 3 /* Replace the attribute */ +#define XFS_ATTRI_OP_FLAGS_TYPE_MASK 0xFF /* Flags type mask */ + +/* + * alfi_attr_filter captures the state of xfs_da_args.attr_filter, so it should + * never have any other bits set. + */ +#define XFS_ATTRI_FILTER_MASK (XFS_ATTR_ROOT | \ + XFS_ATTR_SECURE | \ + XFS_ATTR_INCOMPLETE) + +/* + * This is the structure used to lay out an attr log item in the + * log. + */ +struct xfs_attri_log_format { + uint16_t alfi_type; /* attri log item type */ + uint16_t alfi_size; /* size of this item */ + uint32_t __pad; /* pad to 64 bit aligned */ + uint64_t alfi_id; /* attri identifier */ + uint64_t alfi_ino; /* the inode for this attr operation */ + uint32_t alfi_op_flags; /* marks the op as a set or remove */ + uint32_t alfi_name_len; /* attr name length */ + uint32_t alfi_value_len; /* attr value length */ + uint32_t alfi_attr_filter;/* attr filter flags */ +}; + +struct xfs_attrd_log_format { + uint16_t alfd_type; /* attrd log item type */ + uint16_t alfd_size; /* size of this item */ + uint32_t __pad; /* pad to 64 bit aligned */ + uint64_t alfd_alf_id; /* id of corresponding attri */ +}; + #endif /* __XFS_LOG_FORMAT_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index ff69a0000817..2420865f3007 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -72,6 +72,8 @@ extern const struct xlog_recover_item_ops xlog_rui_item_ops; extern const struct xlog_recover_item_ops xlog_rud_item_ops; extern const struct xlog_recover_item_ops xlog_cui_item_ops; extern const struct xlog_recover_item_ops xlog_cud_item_ops; +extern const struct xlog_recover_item_ops xlog_attri_item_ops; +extern const struct xlog_recover_item_ops xlog_attrd_item_ops; /* * Macros, structures, prototypes for internal log manager use. @@ -108,12 +110,6 @@ struct xlog_recover { #define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr) -/* - * This is the number of entries in the l_buf_cancel_table used during - * recovery. - */ -#define XLOG_BC_TABLE_SIZE 64 - #define XLOG_RECOVER_CRCPASS 0 #define XLOG_RECOVER_PASS1 1 #define XLOG_RECOVER_PASS2 2 @@ -126,5 +122,13 @@ int xlog_recover_iget(struct xfs_mount *mp, xfs_ino_t ino, struct xfs_inode **ipp); void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type, uint64_t intent_id); +int xlog_alloc_buf_cancel_table(struct xlog *log); +void xlog_free_buf_cancel_table(struct xlog *log); + +#ifdef DEBUG +void xlog_check_buf_cancel_table(struct xlog *log); +#else +#define xlog_check_buf_cancel_table(log) do { } while (0) +#endif #endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c index 67798ff5e14e..9975b93a7412 100644 --- a/fs/xfs/libxfs/xfs_log_rlimit.c +++ b/fs/xfs/libxfs/xfs_log_rlimit.c @@ -14,6 +14,7 @@ #include "xfs_trans_space.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" +#include "xfs_trace.h" /* * Calculate the maximum length in bytes that would be required for a local @@ -37,6 +38,65 @@ xfs_log_calc_max_attrsetm_res( } /* + * Compute an alternate set of log reservation sizes for use exclusively with + * minimum log size calculations. + */ +static void +xfs_log_calc_trans_resv_for_minlogblocks( + struct xfs_mount *mp, + struct xfs_trans_resv *resv) +{ + unsigned int rmap_maxlevels = mp->m_rmap_maxlevels; + + /* + * In the early days of rmap+reflink, we always set the rmap maxlevels + * to 9 even if the AG was small enough that it would never grow to + * that height. Transaction reservation sizes influence the minimum + * log size calculation, which influences the size of the log that mkfs + * creates. Use the old value here to ensure that newly formatted + * small filesystems will mount on older kernels. + */ + if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp)) + mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS; + + xfs_trans_resv_calc(mp, resv); + + if (xfs_has_reflink(mp)) { + /* + * In the early days of reflink, typical log operation counts + * were greatly overestimated. + */ + resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; + resv->tr_itruncate.tr_logcount = + XFS_ITRUNCATE_LOG_COUNT_REFLINK; + resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; + } else if (xfs_has_rmapbt(mp)) { + /* + * In the early days of non-reflink rmap, the impact of rmapbt + * updates on log counts were not taken into account at all. + */ + resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; + resv->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; + resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; + } + + /* + * In the early days of reflink, we did not use deferred refcount + * update log items, so log reservations must be recomputed using the + * old calculations. + */ + resv->tr_write.tr_logres = + xfs_calc_write_reservation_minlogsize(mp); + resv->tr_itruncate.tr_logres = + xfs_calc_itruncate_reservation_minlogsize(mp); + resv->tr_qm_dqalloc.tr_logres = + xfs_calc_qm_dqalloc_reservation_minlogsize(mp); + + /* Put everything back the way it was. This goes at the end. */ + mp->m_rmap_maxlevels = rmap_maxlevels; +} + +/* * Iterate over the log space reservation table to figure out and return * the maximum one in terms of the pre-calculated values which were done * at mount time. @@ -46,19 +106,25 @@ xfs_log_get_max_trans_res( struct xfs_mount *mp, struct xfs_trans_res *max_resp) { + struct xfs_trans_resv resv = {}; struct xfs_trans_res *resp; struct xfs_trans_res *end_resp; + unsigned int i; int log_space = 0; int attr_space; attr_space = xfs_log_calc_max_attrsetm_res(mp); - resp = (struct xfs_trans_res *)M_RES(mp); - end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1); - for (; resp < end_resp; resp++) { + xfs_log_calc_trans_resv_for_minlogblocks(mp, &resv); + + resp = (struct xfs_trans_res *)&resv; + end_resp = (struct xfs_trans_res *)(&resv + 1); + for (i = 0; resp < end_resp; i++, resp++) { int tmp = resp->tr_logcount > 1 ? resp->tr_logres * resp->tr_logcount : resp->tr_logres; + + trace_xfs_trans_resv_calc_minlogsize(mp, i, resp); if (log_space < tmp) { log_space = tmp; *max_resp = *resp; /* struct copy */ @@ -66,9 +132,10 @@ xfs_log_get_max_trans_res( } if (attr_space > log_space) { - *max_resp = M_RES(mp)->tr_attrsetm; /* struct copy */ + *max_resp = resv.tr_attrsetm; /* struct copy */ max_resp->tr_logres = attr_space; } + trace_xfs_log_get_max_trans_res(mp, max_resp); } /* diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index a02c5062f9b2..cb035da3f990 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -16,7 +16,6 @@ * and quota-limits. This is a waste in the common case, but hey ... */ typedef uint64_t xfs_qcnt_t; -typedef uint16_t xfs_qwarncnt_t; typedef uint8_t xfs_dqtype_t; @@ -29,8 +28,8 @@ typedef uint8_t xfs_dqtype_t; /* * flags for q_flags field in the dquot. */ -#define XFS_DQFLAG_DIRTY (1 << 0) /* dquot is dirty */ -#define XFS_DQFLAG_FREEING (1 << 1) /* dquot is being torn down */ +#define XFS_DQFLAG_DIRTY (1u << 0) /* dquot is dirty */ +#define XFS_DQFLAG_FREEING (1u << 1) /* dquot is being torn down */ #define XFS_DQFLAG_STRINGS \ { XFS_DQFLAG_DIRTY, "DIRTY" }, \ @@ -73,29 +72,45 @@ typedef uint8_t xfs_dqtype_t; * to a single function. None of these XFS_QMOPT_* flags are meant to have * persistent values (ie. their values can and will change between versions) */ -#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ -#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ -#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ -#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ -#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ +#define XFS_QMOPT_UQUOTA (1u << 0) /* user dquot requested */ +#define XFS_QMOPT_GQUOTA (1u << 1) /* group dquot requested */ +#define XFS_QMOPT_PQUOTA (1u << 2) /* project dquot requested */ +#define XFS_QMOPT_FORCE_RES (1u << 3) /* ignore quota limits */ +#define XFS_QMOPT_SBVERSION (1u << 4) /* change superblock version num */ /* * flags to xfs_trans_mod_dquot to indicate which field needs to be * modified. */ -#define XFS_QMOPT_RES_REGBLKS 0x0010000 -#define XFS_QMOPT_RES_RTBLKS 0x0020000 -#define XFS_QMOPT_BCOUNT 0x0040000 -#define XFS_QMOPT_ICOUNT 0x0080000 -#define XFS_QMOPT_RTBCOUNT 0x0100000 -#define XFS_QMOPT_DELBCOUNT 0x0200000 -#define XFS_QMOPT_DELRTBCOUNT 0x0400000 -#define XFS_QMOPT_RES_INOS 0x0800000 +#define XFS_QMOPT_RES_REGBLKS (1u << 7) +#define XFS_QMOPT_RES_RTBLKS (1u << 8) +#define XFS_QMOPT_BCOUNT (1u << 9) +#define XFS_QMOPT_ICOUNT (1u << 10) +#define XFS_QMOPT_RTBCOUNT (1u << 11) +#define XFS_QMOPT_DELBCOUNT (1u << 12) +#define XFS_QMOPT_DELRTBCOUNT (1u << 13) +#define XFS_QMOPT_RES_INOS (1u << 14) /* * flags for dqalloc. */ -#define XFS_QMOPT_INHERIT 0x1000000 +#define XFS_QMOPT_INHERIT (1u << 31) + +#define XFS_QMOPT_FLAGS \ + { XFS_QMOPT_UQUOTA, "UQUOTA" }, \ + { XFS_QMOPT_PQUOTA, "PQUOTA" }, \ + { XFS_QMOPT_FORCE_RES, "FORCE_RES" }, \ + { XFS_QMOPT_SBVERSION, "SBVERSION" }, \ + { XFS_QMOPT_GQUOTA, "GQUOTA" }, \ + { XFS_QMOPT_INHERIT, "INHERIT" }, \ + { XFS_QMOPT_RES_REGBLKS, "RES_REGBLKS" }, \ + { XFS_QMOPT_RES_RTBLKS, "RES_RTBLKS" }, \ + { XFS_QMOPT_BCOUNT, "BCOUNT" }, \ + { XFS_QMOPT_ICOUNT, "ICOUNT" }, \ + { XFS_QMOPT_RTBCOUNT, "RTBCOUNT" }, \ + { XFS_QMOPT_DELBCOUNT, "DELBCOUNT" }, \ + { XFS_QMOPT_DELRTBCOUNT, "DELRTBCOUNT" }, \ + { XFS_QMOPT_RES_INOS, "RES_INOS" } /* * flags to xfs_trans_mod_dquot. @@ -114,6 +129,7 @@ typedef uint8_t xfs_dqtype_t; (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA) #define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) + extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp, struct xfs_disk_dquot *ddq, xfs_dqid_t id); extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 327ba25e9e17..3f34bafe18dd 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -46,13 +46,16 @@ STATIC int __xfs_refcount_cow_free(struct xfs_btree_cur *rcur, int xfs_refcount_lookup_le( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, + xfs_refcount_encode_startblock(bno, domain), XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; + cur->bc_rec.rc.rc_domain = domain; return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); } @@ -63,13 +66,16 @@ xfs_refcount_lookup_le( int xfs_refcount_lookup_ge( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, + xfs_refcount_encode_startblock(bno, domain), XFS_LOOKUP_GE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; + cur->bc_rec.rc.rc_domain = domain; return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); } @@ -80,13 +86,16 @@ xfs_refcount_lookup_ge( int xfs_refcount_lookup_eq( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, + xfs_refcount_encode_startblock(bno, domain), XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; + cur->bc_rec.rc.rc_domain = domain; return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); } @@ -96,7 +105,17 @@ xfs_refcount_btrec_to_irec( const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec) { - irec->rc_startblock = be32_to_cpu(rec->refc.rc_startblock); + uint32_t start; + + start = be32_to_cpu(rec->refc.rc_startblock); + if (start & XFS_REFC_COWFLAG) { + start &= ~XFS_REFC_COWFLAG; + irec->rc_domain = XFS_REFC_DOMAIN_COW; + } else { + irec->rc_domain = XFS_REFC_DOMAIN_SHARED; + } + + irec->rc_startblock = start; irec->rc_blockcount = be32_to_cpu(rec->refc.rc_blockcount); irec->rc_refcount = be32_to_cpu(rec->refc.rc_refcount); } @@ -111,48 +130,35 @@ xfs_refcount_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno; + struct xfs_perag *pag = cur->bc_ag.pag; union xfs_btree_rec *rec; int error; - xfs_agblock_t realstart; error = xfs_btree_get_rec(cur, &rec, stat); if (error || !*stat) return error; xfs_refcount_btrec_to_irec(rec, irec); - - agno = cur->bc_ag.pag->pag_agno; if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) goto out_bad_rec; - /* handle special COW-staging state */ - realstart = irec->rc_startblock; - if (realstart & XFS_REFC_COW_START) { - if (irec->rc_refcount != 1) - goto out_bad_rec; - realstart &= ~XFS_REFC_COW_START; - } else if (irec->rc_refcount < 2) { + if (!xfs_refcount_check_domain(irec)) goto out_bad_rec; - } /* check for valid extent range, including overflow */ - if (!xfs_verify_agbno(mp, agno, realstart)) - goto out_bad_rec; - if (realstart > realstart + irec->rc_blockcount) - goto out_bad_rec; - if (!xfs_verify_agbno(mp, agno, realstart + irec->rc_blockcount - 1)) + if (!xfs_verify_agbext(pag, irec->rc_startblock, irec->rc_blockcount)) goto out_bad_rec; if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT) goto out_bad_rec; - trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); + trace_xfs_refcount_get(cur->bc_mp, pag->pag_agno, irec); return 0; out_bad_rec: xfs_warn(mp, - "Refcount BTree record corruption in AG %d detected!", agno); + "Refcount BTree record corruption in AG %d detected!", + pag->pag_agno); xfs_warn(mp, "Start block 0x%x, block count 0x%x, references 0x%x", irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount); @@ -170,12 +176,17 @@ xfs_refcount_update( struct xfs_refcount_irec *irec) { union xfs_btree_rec rec; + uint32_t start; int error; trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); - rec.refc.rc_startblock = cpu_to_be32(irec->rc_startblock); + + start = xfs_refcount_encode_startblock(irec->rc_startblock, + irec->rc_domain); + rec.refc.rc_startblock = cpu_to_be32(start); rec.refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount); rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount); + error = xfs_btree_update(cur, &rec); if (error) trace_xfs_refcount_update_error(cur->bc_mp, @@ -197,9 +208,12 @@ xfs_refcount_insert( int error; trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); + cur->bc_rec.rc.rc_startblock = irec->rc_startblock; cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount; cur->bc_rec.rc.rc_refcount = irec->rc_refcount; + cur->bc_rec.rc.rc_domain = irec->rc_domain; + error = xfs_btree_insert(cur, i); if (error) goto out_error; @@ -245,7 +259,8 @@ xfs_refcount_delete( } if (error) goto out_error; - error = xfs_refcount_lookup_ge(cur, irec.rc_startblock, &found_rec); + error = xfs_refcount_lookup_ge(cur, irec.rc_domain, irec.rc_startblock, + &found_rec); out_error: if (error) trace_xfs_refcount_delete_error(cur->bc_mp, @@ -344,6 +359,7 @@ xfs_refc_next( STATIC int xfs_refcount_split_extent( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t agbno, bool *shape_changed) { @@ -352,7 +368,7 @@ xfs_refcount_split_extent( int error; *shape_changed = false; - error = xfs_refcount_lookup_le(cur, agbno, &found_rec); + error = xfs_refcount_lookup_le(cur, domain, agbno, &found_rec); if (error) goto out_error; if (!found_rec) @@ -365,6 +381,8 @@ xfs_refcount_split_extent( error = -EFSCORRUPTED; goto out_error; } + if (rcext.rc_domain != domain) + return 0; if (rcext.rc_startblock == agbno || xfs_refc_next(&rcext) <= agbno) return 0; @@ -416,6 +434,9 @@ xfs_refcount_merge_center_extents( trace_xfs_refcount_merge_center_extents(cur->bc_mp, cur->bc_ag.pag->pag_agno, left, center, right); + ASSERT(left->rc_domain == center->rc_domain); + ASSERT(right->rc_domain == center->rc_domain); + /* * Make sure the center and right extents are not in the btree. * If the center extent was synthesized, the first delete call @@ -424,8 +445,8 @@ xfs_refcount_merge_center_extents( * call removes the center and the second one removes the right * extent. */ - error = xfs_refcount_lookup_ge(cur, center->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_ge(cur, center->rc_domain, + center->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -452,8 +473,8 @@ xfs_refcount_merge_center_extents( } /* Enlarge the left extent. */ - error = xfs_refcount_lookup_le(cur, left->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_le(cur, left->rc_domain, + left->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -492,10 +513,12 @@ xfs_refcount_merge_left_extent( trace_xfs_refcount_merge_left_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, left, cleft); + ASSERT(left->rc_domain == cleft->rc_domain); + /* If the extent at agbno (cleft) wasn't synthesized, remove it. */ if (cleft->rc_refcount > 1) { - error = xfs_refcount_lookup_le(cur, cleft->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_le(cur, cleft->rc_domain, + cleft->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -513,8 +536,8 @@ xfs_refcount_merge_left_extent( } /* Enlarge the left extent. */ - error = xfs_refcount_lookup_le(cur, left->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_le(cur, left->rc_domain, + left->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -553,13 +576,15 @@ xfs_refcount_merge_right_extent( trace_xfs_refcount_merge_right_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, cright, right); + ASSERT(right->rc_domain == cright->rc_domain); + /* * If the extent ending at agbno+aglen (cright) wasn't synthesized, * remove it. */ if (cright->rc_refcount > 1) { - error = xfs_refcount_lookup_le(cur, cright->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_le(cur, cright->rc_domain, + cright->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -577,8 +602,8 @@ xfs_refcount_merge_right_extent( } /* Enlarge the right extent. */ - error = xfs_refcount_lookup_le(cur, right->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_le(cur, right->rc_domain, + right->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -601,8 +626,6 @@ out_error: return error; } -#define XFS_FIND_RCEXT_SHARED 1 -#define XFS_FIND_RCEXT_COW 2 /* * Find the left extent and the one after it (cleft). This function assumes * that we've already split any extent crossing agbno. @@ -612,16 +635,16 @@ xfs_refcount_find_left_extents( struct xfs_btree_cur *cur, struct xfs_refcount_irec *left, struct xfs_refcount_irec *cleft, + enum xfs_refc_domain domain, xfs_agblock_t agbno, - xfs_extlen_t aglen, - int flags) + xfs_extlen_t aglen) { struct xfs_refcount_irec tmp; int error; int found_rec; left->rc_startblock = cleft->rc_startblock = NULLAGBLOCK; - error = xfs_refcount_lookup_le(cur, agbno - 1, &found_rec); + error = xfs_refcount_lookup_le(cur, domain, agbno - 1, &found_rec); if (error) goto out_error; if (!found_rec) @@ -635,11 +658,9 @@ xfs_refcount_find_left_extents( goto out_error; } - if (xfs_refc_next(&tmp) != agbno) - return 0; - if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2) + if (tmp.rc_domain != domain) return 0; - if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1) + if (xfs_refc_next(&tmp) != agbno) return 0; /* We have a left extent; retrieve (or invent) the next right one */ *left = tmp; @@ -656,6 +677,9 @@ xfs_refcount_find_left_extents( goto out_error; } + if (tmp.rc_domain != domain) + goto not_found; + /* if tmp starts at the end of our range, just use that */ if (tmp.rc_startblock == agbno) *cleft = tmp; @@ -672,8 +696,10 @@ xfs_refcount_find_left_extents( cleft->rc_blockcount = min(aglen, tmp.rc_startblock - agbno); cleft->rc_refcount = 1; + cleft->rc_domain = domain; } } else { +not_found: /* * No extents, so pretend that there's one covering the whole * range. @@ -681,6 +707,7 @@ xfs_refcount_find_left_extents( cleft->rc_startblock = agbno; cleft->rc_blockcount = aglen; cleft->rc_refcount = 1; + cleft->rc_domain = domain; } trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, left, cleft, agbno); @@ -701,16 +728,16 @@ xfs_refcount_find_right_extents( struct xfs_btree_cur *cur, struct xfs_refcount_irec *right, struct xfs_refcount_irec *cright, + enum xfs_refc_domain domain, xfs_agblock_t agbno, - xfs_extlen_t aglen, - int flags) + xfs_extlen_t aglen) { struct xfs_refcount_irec tmp; int error; int found_rec; right->rc_startblock = cright->rc_startblock = NULLAGBLOCK; - error = xfs_refcount_lookup_ge(cur, agbno + aglen, &found_rec); + error = xfs_refcount_lookup_ge(cur, domain, agbno + aglen, &found_rec); if (error) goto out_error; if (!found_rec) @@ -724,11 +751,9 @@ xfs_refcount_find_right_extents( goto out_error; } - if (tmp.rc_startblock != agbno + aglen) - return 0; - if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2) + if (tmp.rc_domain != domain) return 0; - if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1) + if (tmp.rc_startblock != agbno + aglen) return 0; /* We have a right extent; retrieve (or invent) the next left one */ *right = tmp; @@ -745,6 +770,9 @@ xfs_refcount_find_right_extents( goto out_error; } + if (tmp.rc_domain != domain) + goto not_found; + /* if tmp ends at the end of our range, just use that */ if (xfs_refc_next(&tmp) == agbno + aglen) *cright = tmp; @@ -761,8 +789,10 @@ xfs_refcount_find_right_extents( cright->rc_blockcount = right->rc_startblock - cright->rc_startblock; cright->rc_refcount = 1; + cright->rc_domain = domain; } } else { +not_found: /* * No extents, so pretend that there's one covering the whole * range. @@ -770,6 +800,7 @@ xfs_refcount_find_right_extents( cright->rc_startblock = agbno; cright->rc_blockcount = aglen; cright->rc_refcount = 1; + cright->rc_domain = domain; } trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, cright, right, agbno + aglen); @@ -795,10 +826,10 @@ xfs_refc_valid( STATIC int xfs_refcount_merge_extents( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t *agbno, xfs_extlen_t *aglen, enum xfs_refc_adjust_op adjust, - int flags, bool *shape_changed) { struct xfs_refcount_irec left = {0}, cleft = {0}; @@ -813,12 +844,12 @@ xfs_refcount_merge_extents( * just below (agbno + aglen) [cright], and just above (agbno + aglen) * [right]. */ - error = xfs_refcount_find_left_extents(cur, &left, &cleft, *agbno, - *aglen, flags); + error = xfs_refcount_find_left_extents(cur, &left, &cleft, domain, + *agbno, *aglen); if (error) return error; - error = xfs_refcount_find_right_extents(cur, &right, &cright, *agbno, - *aglen, flags); + error = xfs_refcount_find_right_extents(cur, &right, &cright, domain, + *agbno, *aglen); if (error) return error; @@ -871,7 +902,7 @@ xfs_refcount_merge_extents( aglen); } - return error; + return 0; } /* @@ -886,8 +917,13 @@ xfs_refcount_still_have_space( { unsigned long overhead; - overhead = cur->bc_ag.refc.shape_changes * - xfs_allocfree_log_count(cur->bc_mp, 1); + /* + * Worst case estimate: full splits of the free space and rmap btrees + * to handle each of the shape changes to the refcount btree. + */ + overhead = xfs_allocfree_block_count(cur->bc_mp, + cur->bc_ag.refc.shape_changes); + overhead += cur->bc_mp->m_refc_maxlevels; overhead *= cur->bc_mp->m_sb.sb_blocksize; /* @@ -929,7 +965,8 @@ xfs_refcount_adjust_extents( if (*aglen == 0) return 0; - error = xfs_refcount_lookup_ge(cur, *agbno, &found_rec); + error = xfs_refcount_lookup_ge(cur, XFS_REFC_DOMAIN_SHARED, *agbno, + &found_rec); if (error) goto out_error; @@ -937,10 +974,11 @@ xfs_refcount_adjust_extents( error = xfs_refcount_get_rec(cur, &ext, &found_rec); if (error) goto out_error; - if (!found_rec) { + if (!found_rec || ext.rc_domain != XFS_REFC_DOMAIN_SHARED) { ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks; ext.rc_blockcount = 0; ext.rc_refcount = 0; + ext.rc_domain = XFS_REFC_DOMAIN_SHARED; } /* @@ -953,6 +991,8 @@ xfs_refcount_adjust_extents( tmp.rc_blockcount = min(*aglen, ext.rc_startblock - *agbno); tmp.rc_refcount = 1 + adj; + tmp.rc_domain = XFS_REFC_DOMAIN_SHARED; + trace_xfs_refcount_modify_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, &tmp); @@ -960,6 +1000,7 @@ xfs_refcount_adjust_extents( * Either cover the hole (increment) or * delete the range (decrement). */ + cur->bc_ag.refc.nr_ops++; if (tmp.rc_refcount) { error = xfs_refcount_insert(cur, &tmp, &found_tmp); @@ -970,7 +1011,6 @@ xfs_refcount_adjust_extents( error = -EFSCORRUPTED; goto out_error; } - cur->bc_ag.refc.nr_ops++; } else { fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, @@ -982,15 +1022,30 @@ xfs_refcount_adjust_extents( (*agbno) += tmp.rc_blockcount; (*aglen) -= tmp.rc_blockcount; - error = xfs_refcount_lookup_ge(cur, *agbno, + /* Stop if there's nothing left to modify */ + if (*aglen == 0 || !xfs_refcount_still_have_space(cur)) + break; + + /* Move the cursor to the start of ext. */ + error = xfs_refcount_lookup_ge(cur, + XFS_REFC_DOMAIN_SHARED, *agbno, &found_rec); if (error) goto out_error; } - /* Stop if there's nothing left to modify */ - if (*aglen == 0 || !xfs_refcount_still_have_space(cur)) - break; + /* + * A previous step trimmed agbno/aglen such that the end of the + * range would not be in the middle of the record. If this is + * no longer the case, something is seriously wrong with the + * btree. Make sure we never feed the synthesized record into + * the processing loop below. + */ + if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount == 0) || + XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount > *aglen)) { + error = -EFSCORRUPTED; + goto out_error; + } /* * Adjust the reference count and either update the tree @@ -1001,11 +1056,11 @@ xfs_refcount_adjust_extents( ext.rc_refcount += adj; trace_xfs_refcount_modify_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, &ext); + cur->bc_ag.refc.nr_ops++; if (ext.rc_refcount > 1) { error = xfs_refcount_update(cur, &ext); if (error) goto out_error; - cur->bc_ag.refc.nr_ops++; } else if (ext.rc_refcount == 1) { error = xfs_refcount_delete(cur, &found_rec); if (error) @@ -1014,7 +1069,6 @@ xfs_refcount_adjust_extents( error = -EFSCORRUPTED; goto out_error; } - cur->bc_ag.refc.nr_ops++; goto advloop; } else { fsbno = XFS_AGB_TO_FSB(cur->bc_mp, @@ -1067,13 +1121,15 @@ xfs_refcount_adjust( /* * Ensure that no rcextents cross the boundary of the adjustment range. */ - error = xfs_refcount_split_extent(cur, agbno, &shape_changed); + error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED, + agbno, &shape_changed); if (error) goto out_error; if (shape_changed) shape_changes++; - error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed); + error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED, + agbno + aglen, &shape_changed); if (error) goto out_error; if (shape_changed) @@ -1082,8 +1138,8 @@ xfs_refcount_adjust( /* * Try to merge with the left or right extents of the range. */ - error = xfs_refcount_merge_extents(cur, new_agbno, new_aglen, adj, - XFS_FIND_RCEXT_SHARED, &shape_changed); + error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_SHARED, + new_agbno, new_aglen, adj, &shape_changed); if (error) goto out_error; if (shape_changed) @@ -1122,6 +1178,32 @@ xfs_refcount_finish_one_cleanup( } /* + * Set up a continuation a deferred refcount operation by updating the intent. + * Checks to make sure we're not going to run off the end of the AG. + */ +static inline int +xfs_refcount_continue_op( + struct xfs_btree_cur *cur, + xfs_fsblock_t startblock, + xfs_agblock_t new_agbno, + xfs_extlen_t new_len, + xfs_fsblock_t *new_fsbno) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_perag *pag = cur->bc_ag.pag; + + if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, new_len))) + return -EFSCORRUPTED; + + *new_fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); + + ASSERT(xfs_verify_fsbext(mp, *new_fsbno, new_len)); + ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, *new_fsbno)); + + return 0; +} + +/* * Process one of the deferred refcount operations. We pass back the * btree cursor to maintain our lock on the btree between calls. * This saves time and eliminates a buffer deadlock between the @@ -1173,8 +1255,8 @@ xfs_refcount_finish_one( *pcur = NULL; } if (rcur == NULL) { - error = xfs_alloc_read_agf(tp->t_mountp, tp, pag->pag_agno, - XFS_ALLOC_FLAG_FREEING, &agbp); + error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_FREEING, + &agbp); if (error) goto out_drop; @@ -1188,12 +1270,20 @@ xfs_refcount_finish_one( case XFS_REFCOUNT_INCREASE: error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, new_len, XFS_REFCOUNT_ADJUST_INCREASE); - *new_fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); + if (error) + goto out_drop; + if (*new_len > 0) + error = xfs_refcount_continue_op(rcur, startblock, + new_agbno, *new_len, new_fsb); break; case XFS_REFCOUNT_DECREASE: error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, new_len, XFS_REFCOUNT_ADJUST_DECREASE); - *new_fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); + if (error) + goto out_drop; + if (*new_len > 0) + error = xfs_refcount_continue_op(rcur, startblock, + new_agbno, *new_len, new_fsb); break; case XFS_REFCOUNT_ALLOC_COW: *new_fsb = startblock + blockcount; @@ -1304,7 +1394,8 @@ xfs_refcount_find_shared( *flen = 0; /* Try to find a refcount extent that crosses the start */ - error = xfs_refcount_lookup_le(cur, agbno, &have); + error = xfs_refcount_lookup_le(cur, XFS_REFC_DOMAIN_SHARED, agbno, + &have); if (error) goto out_error; if (!have) { @@ -1322,6 +1413,8 @@ xfs_refcount_find_shared( error = -EFSCORRUPTED; goto out_error; } + if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED) + goto done; /* If the extent ends before the start, look at the next one */ if (tmp.rc_startblock + tmp.rc_blockcount <= agbno) { @@ -1337,6 +1430,8 @@ xfs_refcount_find_shared( error = -EFSCORRUPTED; goto out_error; } + if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED) + goto done; } /* If the extent starts after the range we want, bail out */ @@ -1368,7 +1463,8 @@ xfs_refcount_find_shared( error = -EFSCORRUPTED; goto out_error; } - if (tmp.rc_startblock >= agbno + aglen || + if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED || + tmp.rc_startblock >= agbno + aglen || tmp.rc_startblock != *fbno + *flen) break; *flen = min(*flen + tmp.rc_blockcount, agbno + aglen - *fbno); @@ -1452,17 +1548,23 @@ xfs_refcount_adjust_cow_extents( return 0; /* Find any overlapping refcount records */ - error = xfs_refcount_lookup_ge(cur, agbno, &found_rec); + error = xfs_refcount_lookup_ge(cur, XFS_REFC_DOMAIN_COW, agbno, + &found_rec); if (error) goto out_error; error = xfs_refcount_get_rec(cur, &ext, &found_rec); if (error) goto out_error; + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec && + ext.rc_domain != XFS_REFC_DOMAIN_COW)) { + error = -EFSCORRUPTED; + goto out_error; + } if (!found_rec) { - ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks + - XFS_REFC_COW_START; + ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks; ext.rc_blockcount = 0; ext.rc_refcount = 0; + ext.rc_domain = XFS_REFC_DOMAIN_COW; } switch (adj) { @@ -1477,6 +1579,8 @@ xfs_refcount_adjust_cow_extents( tmp.rc_startblock = agbno; tmp.rc_blockcount = aglen; tmp.rc_refcount = 1; + tmp.rc_domain = XFS_REFC_DOMAIN_COW; + trace_xfs_refcount_modify_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, &tmp); @@ -1539,24 +1643,24 @@ xfs_refcount_adjust_cow( bool shape_changed; int error; - agbno += XFS_REFC_COW_START; - /* * Ensure that no rcextents cross the boundary of the adjustment range. */ - error = xfs_refcount_split_extent(cur, agbno, &shape_changed); + error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_COW, + agbno, &shape_changed); if (error) goto out_error; - error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed); + error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_COW, + agbno + aglen, &shape_changed); if (error) goto out_error; /* * Try to merge with the left or right extents of the range. */ - error = xfs_refcount_merge_extents(cur, &agbno, &aglen, adj, - XFS_FIND_RCEXT_COW, &shape_changed); + error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_COW, &agbno, + &aglen, adj, &shape_changed); if (error) goto out_error; @@ -1663,10 +1767,18 @@ xfs_refcount_recover_extent( be32_to_cpu(rec->refc.rc_refcount) != 1)) return -EFSCORRUPTED; - rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), 0); + rr = kmalloc(sizeof(struct xfs_refcount_recovery), + GFP_KERNEL | __GFP_NOFAIL); + INIT_LIST_HEAD(&rr->rr_list); xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); - list_add_tail(&rr->rr_list, debris); + if (XFS_IS_CORRUPT(cur->bc_mp, + rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) { + kfree(rr); + return -EFSCORRUPTED; + } + + list_add_tail(&rr->rr_list, debris); return 0; } @@ -1684,10 +1796,11 @@ xfs_refcount_recover_cow_leftovers( union xfs_btree_irec low; union xfs_btree_irec high; xfs_fsblock_t fsb; - xfs_agblock_t agbno; int error; - if (mp->m_sb.sb_agblocks >= XFS_REFC_COW_START) + /* reflink filesystems mustn't have AGs larger than 2^31-1 blocks */ + BUILD_BUG_ON(XFS_MAX_CRC_AG_BLOCKS >= XFS_REFC_COWFLAG); + if (mp->m_sb.sb_agblocks > XFS_MAX_CRC_AG_BLOCKS) return -EOPNOTSUPP; INIT_LIST_HEAD(&debris); @@ -1706,7 +1819,7 @@ xfs_refcount_recover_cow_leftovers( if (error) return error; - error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp); + error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (error) goto out_trans; cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); @@ -1714,7 +1827,7 @@ xfs_refcount_recover_cow_leftovers( /* Find all the leftover CoW staging extents. */ memset(&low, 0, sizeof(low)); memset(&high, 0, sizeof(high)); - low.rc.rc_startblock = XFS_REFC_COW_START; + low.rc.rc_domain = high.rc.rc_domain = XFS_REFC_DOMAIN_COW; high.rc.rc_startblock = -1U; error = xfs_btree_query_range(cur, &low, &high, xfs_refcount_recover_extent, &debris); @@ -1735,8 +1848,8 @@ xfs_refcount_recover_cow_leftovers( &rr->rr_rrec); /* Free the orphan record */ - agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START; - fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno); + fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, + rr->rr_rrec.rc_startblock); xfs_refcount_free_cow_extent(tp, fsb, rr->rr_rrec.rc_blockcount); @@ -1748,7 +1861,7 @@ xfs_refcount_recover_cow_leftovers( goto out_free; list_del(&rr->rr_list); - kmem_free(rr); + kfree(rr); } return error; @@ -1758,7 +1871,7 @@ out_free: /* Free the leftover list */ list_for_each_entry_safe(rr, n, &debris, rr_list) { list_del(&rr->rr_list); - kmem_free(rr); + kfree(rr); } return error; } @@ -1767,6 +1880,7 @@ out_free: int xfs_refcount_has_record( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t bno, xfs_extlen_t len, bool *exists) @@ -1778,6 +1892,7 @@ xfs_refcount_has_record( low.rc.rc_startblock = bno; memset(&high, 0xFF, sizeof(high)); high.rc.rc_startblock = bno + len - 1; + low.rc.rc_domain = high.rc.rc_domain = domain; return xfs_btree_has_record(cur, &low, &high, exists); } diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 9eb01edbd89d..452f30556f5a 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -14,14 +14,33 @@ struct xfs_bmbt_irec; struct xfs_refcount_irec; extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur, - xfs_agblock_t bno, int *stat); + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat); extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur, - xfs_agblock_t bno, int *stat); + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat); extern int xfs_refcount_lookup_eq(struct xfs_btree_cur *cur, - xfs_agblock_t bno, int *stat); + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat); extern int xfs_refcount_get_rec(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *stat); +static inline uint32_t +xfs_refcount_encode_startblock( + xfs_agblock_t startblock, + enum xfs_refc_domain domain) +{ + uint32_t start; + + /* + * low level btree operations need to handle the generic btree range + * query functions (which set rc_domain == -1U), so we check that the + * domain is /not/ shared. + */ + start = startblock & ~XFS_REFC_COWFLAG; + if (domain != XFS_REFC_DOMAIN_SHARED) + start |= XFS_REFC_COWFLAG; + + return start; +} + enum xfs_refcount_intent_type { XFS_REFCOUNT_INCREASE = 1, XFS_REFCOUNT_DECREASE, @@ -36,6 +55,18 @@ struct xfs_refcount_intent { xfs_fsblock_t ri_startblock; }; +/* Check that the refcount is appropriate for the record domain. */ +static inline bool +xfs_refcount_check_domain( + const struct xfs_refcount_irec *irec) +{ + if (irec->rc_domain == XFS_REFC_DOMAIN_COW && irec->rc_refcount != 1) + return false; + if (irec->rc_domain == XFS_REFC_DOMAIN_SHARED && irec->rc_refcount < 2) + return false; + return true; +} + void xfs_refcount_increase_extent(struct xfs_trans *tp, struct xfs_bmbt_irec *irec); void xfs_refcount_decrease_extent(struct xfs_trans *tp, @@ -67,16 +98,20 @@ extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, * log (plus any key updates) so we'll conservatively assume 32 bytes * per record. We must also leave space for btree splits on both ends * of the range and space for the CUD and a new CUI. + * + * Each EFI that we attach to the transaction is assumed to consume ~32 bytes. + * This is a low estimate for an EFI tracking a single extent (16 bytes for the + * EFI header, 16 for the extent, and 12 for the xlog op header), but the + * estimate is acceptable if there's more than one extent being freed. + * In the worst case of freeing every other block during a refcount decrease + * operation, we amortize the space used for one EFI log item across 16 + * extents. */ #define XFS_REFCOUNT_ITEM_OVERHEAD 32 -static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res) -{ - return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD; -} - extern int xfs_refcount_has_record(struct xfs_btree_cur *cur, - xfs_agblock_t bno, xfs_extlen_t len, bool *exists); + enum xfs_refc_domain domain, xfs_agblock_t bno, + xfs_extlen_t len, bool *exists); union xfs_btree_rec; extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index d14c1720b0fb..e1f789866683 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -13,6 +13,7 @@ #include "xfs_btree.h" #include "xfs_btree_staging.h" #include "xfs_refcount_btree.h" +#include "xfs_refcount.h" #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_trace.h" @@ -160,7 +161,12 @@ xfs_refcountbt_init_rec_from_cur( struct xfs_btree_cur *cur, union xfs_btree_rec *rec) { - rec->refc.rc_startblock = cpu_to_be32(cur->bc_rec.rc.rc_startblock); + const struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + uint32_t start; + + start = xfs_refcount_encode_startblock(irec->rc_startblock, + irec->rc_domain); + rec->refc.rc_startblock = cpu_to_be32(start); rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount); rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount); } @@ -182,10 +188,13 @@ xfs_refcountbt_key_diff( struct xfs_btree_cur *cur, const union xfs_btree_key *key) { - struct xfs_refcount_irec *rec = &cur->bc_rec.rc; const struct xfs_refcount_key *kp = &key->refc; + const struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + uint32_t start; - return (int64_t)be32_to_cpu(kp->rc_startblock) - rec->rc_startblock; + start = xfs_refcount_encode_startblock(irec->rc_startblock, + irec->rc_domain); + return (int64_t)be32_to_cpu(kp->rc_startblock) - start; } STATIC int64_t @@ -493,7 +502,7 @@ xfs_refcountbt_calc_reserves( if (!xfs_has_reflink(mp)) return 0; - error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp); + error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (error) return error; @@ -507,8 +516,7 @@ xfs_refcountbt_calc_reserves( * never be available for the kinds of things that would require btree * expansion. We therefore can pretend the space isn't there. */ - if (mp->m_sb.sb_logstart && - XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == pag->pag_agno) + if (xfs_ag_contains_log(mp, pag->pag_agno)) agblocks -= mp->m_sb.sb_logblocks; *ask += xfs_refcountbt_max_size(mp, agblocks); diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index cd322174dbff..b56aca1e7c66 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -34,18 +34,32 @@ int xfs_rmap_lookup_le( struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, uint64_t owner, uint64_t offset, unsigned int flags, + struct xfs_rmap_irec *irec, int *stat) { + int get_stat = 0; + int error; + cur->bc_rec.r.rm_startblock = bno; - cur->bc_rec.r.rm_blockcount = len; + cur->bc_rec.r.rm_blockcount = 0; cur->bc_rec.r.rm_owner = owner; cur->bc_rec.r.rm_offset = offset; cur->bc_rec.r.rm_flags = flags; - return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); + + error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); + if (error || !(*stat) || !irec) + return error; + + error = xfs_rmap_get_rec(cur, irec, &get_stat); + if (error) + return error; + if (!get_stat) + return -EFSCORRUPTED; + + return 0; } /* @@ -201,7 +215,7 @@ xfs_rmap_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno; + struct xfs_perag *pag = cur->bc_ag.pag; union xfs_btree_rec *rec; int error; @@ -221,13 +235,8 @@ xfs_rmap_get_rec( goto out_bad_rec; } else { /* check for valid extent range, including overflow */ - if (!xfs_verify_agbno(mp, agno, irec->rm_startblock)) - goto out_bad_rec; - if (irec->rm_startblock > - irec->rm_startblock + irec->rm_blockcount) - goto out_bad_rec; - if (!xfs_verify_agbno(mp, agno, - irec->rm_startblock + irec->rm_blockcount - 1)) + if (!xfs_verify_agbext(pag, irec->rm_startblock, + irec->rm_blockcount)) goto out_bad_rec; } @@ -240,7 +249,7 @@ xfs_rmap_get_rec( out_bad_rec: xfs_warn(mp, "Reverse Mapping BTree record corruption in AG %d detected!", - agno); + pag->pag_agno); xfs_warn(mp, "Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x", irec->rm_owner, irec->rm_flags, irec->rm_startblock, @@ -251,7 +260,6 @@ out_bad_rec: struct xfs_find_left_neighbor_info { struct xfs_rmap_irec high; struct xfs_rmap_irec *irec; - int *stat; }; /* For each rmap given, figure out if it matches the key we want. */ @@ -276,7 +284,6 @@ xfs_rmap_find_left_neighbor_helper( return 0; *info->irec = *rec; - *info->stat = 1; return -ECANCELED; } @@ -285,7 +292,7 @@ xfs_rmap_find_left_neighbor_helper( * return a match with the same owner and adjacent physical and logical * block ranges. */ -int +STATIC int xfs_rmap_find_left_neighbor( struct xfs_btree_cur *cur, xfs_agblock_t bno, @@ -296,6 +303,7 @@ xfs_rmap_find_left_neighbor( int *stat) { struct xfs_find_left_neighbor_info info; + int found = 0; int error; *stat = 0; @@ -313,21 +321,44 @@ xfs_rmap_find_left_neighbor( info.high.rm_flags = flags; info.high.rm_blockcount = 0; info.irec = irec; - info.stat = stat; trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags); - error = xfs_rmap_query_range(cur, &info.high, &info.high, - xfs_rmap_find_left_neighbor_helper, &info); - if (error == -ECANCELED) - error = 0; - if (*stat) - trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, irec->rm_startblock, - irec->rm_blockcount, irec->rm_owner, - irec->rm_offset, irec->rm_flags); - return error; + /* + * Historically, we always used the range query to walk every reverse + * mapping that could possibly overlap the key that the caller asked + * for, and filter out the ones that don't. That is very slow when + * there are a lot of records. + * + * However, there are two scenarios where the classic btree search can + * produce correct results -- if the index contains a record that is an + * exact match for the lookup key; and if there are no other records + * between the record we want and the key we supplied. + * + * As an optimization, try a non-overlapped lookup first. This makes + * extent conversion and remap operations run a bit faster if the + * physical extents aren't being shared. If we don't find what we + * want, we fall back to the overlapped query. + */ + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec, + &found); + if (error) + return error; + if (found) + error = xfs_rmap_find_left_neighbor_helper(cur, irec, &info); + if (!error) + error = xfs_rmap_query_range(cur, &info.high, &info.high, + xfs_rmap_find_left_neighbor_helper, &info); + if (error != -ECANCELED) + return error; + + *stat = 1; + trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, + cur->bc_ag.pag->pag_agno, irec->rm_startblock, + irec->rm_blockcount, irec->rm_owner, irec->rm_offset, + irec->rm_flags); + return 0; } /* For each rmap given, figure out if it matches the key we want. */ @@ -353,7 +384,6 @@ xfs_rmap_lookup_le_range_helper( return 0; *info->irec = *rec; - *info->stat = 1; return -ECANCELED; } @@ -374,6 +404,7 @@ xfs_rmap_lookup_le_range( int *stat) { struct xfs_find_left_neighbor_info info; + int found = 0; int error; info.high.rm_startblock = bno; @@ -386,20 +417,44 @@ xfs_rmap_lookup_le_range( info.high.rm_blockcount = 0; *stat = 0; info.irec = irec; - info.stat = stat; - trace_xfs_rmap_lookup_le_range(cur->bc_mp, - cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags); - error = xfs_rmap_query_range(cur, &info.high, &info.high, - xfs_rmap_lookup_le_range_helper, &info); - if (error == -ECANCELED) - error = 0; - if (*stat) - trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, irec->rm_startblock, - irec->rm_blockcount, irec->rm_owner, - irec->rm_offset, irec->rm_flags); - return error; + trace_xfs_rmap_lookup_le_range(cur->bc_mp, cur->bc_ag.pag->pag_agno, + bno, 0, owner, offset, flags); + + /* + * Historically, we always used the range query to walk every reverse + * mapping that could possibly overlap the key that the caller asked + * for, and filter out the ones that don't. That is very slow when + * there are a lot of records. + * + * However, there are two scenarios where the classic btree search can + * produce correct results -- if the index contains a record that is an + * exact match for the lookup key; and if there are no other records + * between the record we want and the key we supplied. + * + * As an optimization, try a non-overlapped lookup first. This makes + * scrub run much faster on most filesystems because bmbt records are + * usually an exact match for rmap records. If we don't find what we + * want, we fall back to the overlapped query. + */ + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec, + &found); + if (error) + return error; + if (found) + error = xfs_rmap_lookup_le_range_helper(cur, irec, &info); + if (!error) + error = xfs_rmap_query_range(cur, &info.high, &info.high, + xfs_rmap_lookup_le_range_helper, &info); + if (error != -ECANCELED) + return error; + + *stat = 1; + trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, + cur->bc_ag.pag->pag_agno, irec->rm_startblock, + irec->rm_blockcount, irec->rm_owner, irec->rm_offset, + irec->rm_flags); + return 0; } /* @@ -510,7 +565,7 @@ xfs_rmap_unmap( * for the AG headers at rm_startblock == 0 created by mkfs/growfs that * will not ever be removed from the tree. */ - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, &i); + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, <rec, &i); if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { @@ -518,13 +573,6 @@ xfs_rmap_unmap( goto out_error; } - error = xfs_rmap_get_rec(cur, <rec, &i); - if (error) - goto out_error; - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto out_error; - } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, cur->bc_ag.pag->pag_agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, @@ -786,18 +834,11 @@ xfs_rmap_map( * record for our insertion point. This will also give us the record for * start block contiguity tests. */ - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, <rec, &have_lt); if (error) goto out_error; if (have_lt) { - error = xfs_rmap_get_rec(cur, <rec, &have_lt); - if (error) - goto out_error; - if (XFS_IS_CORRUPT(mp, have_lt != 1)) { - error = -EFSCORRUPTED; - goto out_error; - } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, cur->bc_ag.pag->pag_agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, @@ -1022,7 +1063,7 @@ xfs_rmap_convert( * record for our insertion point. This will also give us the record for * start block contiguity tests. */ - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i); + error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, &PREV, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { @@ -1030,13 +1071,6 @@ xfs_rmap_convert( goto done; } - error = xfs_rmap_get_rec(cur, &PREV, &i); - if (error) - goto done; - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto done; - } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, cur->bc_ag.pag->pag_agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, @@ -1140,7 +1174,7 @@ xfs_rmap_convert( _RET_IP_); /* reset the cursor back to PREV */ - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i); + error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, NULL, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { @@ -2677,7 +2711,7 @@ xfs_rmap_record_exists( ASSERT(XFS_RMAP_NON_INODE_OWNER(owner) || (flags & XFS_RMAP_BMBT_BLOCK)); - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &irec, &has_record); if (error) return error; @@ -2686,14 +2720,6 @@ xfs_rmap_record_exists( return 0; } - error = xfs_rmap_get_rec(cur, &irec, &has_record); - if (error) - return error; - if (!has_record) { - *has_rmap = false; - return 0; - } - *has_rmap = (irec.rm_owner == owner && irec.rm_startblock <= bno && irec.rm_startblock + irec.rm_blockcount >= bno + len); return 0; diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index b718ebeda372..54741a591a17 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -122,8 +122,8 @@ int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp, const struct xfs_owner_info *oinfo); int xfs_rmap_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, uint64_t owner, uint64_t offset, - unsigned int flags, int *stat); + uint64_t owner, uint64_t offset, unsigned int flags, + struct xfs_rmap_irec *irec, int *stat); int xfs_rmap_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner, uint64_t offset, unsigned int flags, int *stat); @@ -184,9 +184,6 @@ int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type, xfs_fsblock_t startblock, xfs_filblks_t blockcount, xfs_exntst_t state, struct xfs_btree_cur **pcur); -int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno, - uint64_t owner, uint64_t offset, unsigned int flags, - struct xfs_rmap_irec *irec, int *stat); int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, uint64_t owner, uint64_t offset, unsigned int flags, struct xfs_rmap_irec *irec, int *stat); diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 69e104d0277f..7f83f62e51e0 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -90,7 +90,7 @@ xfs_rmapbt_alloc_block( xfs_agblock_t bno; /* Allocate the new block from the freelist. If we can't, give up. */ - error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp, + error = xfs_alloc_get_freelist(pag, cur->bc_tp, cur->bc_ag.agbp, &bno, 1); if (error) return error; @@ -129,7 +129,7 @@ xfs_rmapbt_free_block( bno, 1); be32_add_cpu(&agf->agf_rmap_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); - error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); + error = xfs_alloc_put_freelist(pag, cur->bc_tp, agbp, NULL, bno, 1); if (error) return error; @@ -652,7 +652,7 @@ xfs_rmapbt_calc_reserves( if (!xfs_has_rmapbt(mp)) return 0; - error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp); + error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (error) return error; @@ -666,8 +666,7 @@ xfs_rmapbt_calc_reserves( * never be available for the kinds of things that would require btree * expansion. We therefore can pretend the space isn't there. */ - if (mp->m_sb.sb_logstart && - XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == pag->pag_agno) + if (xfs_ag_contains_log(mp, pag->pag_agno)) agblocks -= mp->m_sb.sb_logblocks; /* Reserve 1% of the AG or enough for 1 block per record. */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 5740ba664867..fa180ab66b73 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1008,6 +1008,7 @@ xfs_rtfree_extent( /* Find all the free records within a given range. */ int xfs_rtalloc_query_range( + struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_rtalloc_rec *low_rec, const struct xfs_rtalloc_rec *high_rec, @@ -1015,7 +1016,6 @@ xfs_rtalloc_query_range( void *priv) { struct xfs_rtalloc_rec rec; - struct xfs_mount *mp = tp->t_mountp; xfs_rtblock_t rtstart; xfs_rtblock_t rtend; xfs_rtblock_t high_key; @@ -1048,7 +1048,7 @@ xfs_rtalloc_query_range( rec.ar_startext = rtstart; rec.ar_extcount = rtend - rtstart + 1; - error = fn(tp, &rec, priv); + error = fn(mp, tp, &rec, priv); if (error) break; } @@ -1062,6 +1062,7 @@ xfs_rtalloc_query_range( /* Find all the free records. */ int xfs_rtalloc_query_all( + struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtalloc_query_range_fn fn, void *priv) @@ -1069,10 +1070,10 @@ xfs_rtalloc_query_all( struct xfs_rtalloc_rec keys[2]; keys[0].ar_startext = 0; - keys[1].ar_startext = tp->t_mountp->m_sb.sb_rextents - 1; + keys[1].ar_startext = mp->m_sb.sb_rextents - 1; keys[0].ar_extcount = keys[1].ar_extcount = 0; - return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv); + return xfs_rtalloc_query_range(mp, tp, &keys[0], &keys[1], fn, priv); } /* Is the given extent all free? */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index f4e84aa1d50a..a20cade590e9 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -31,15 +31,66 @@ */ /* + * Check that all the V4 feature bits that the V5 filesystem format requires are + * correctly set. + */ +static bool +xfs_sb_validate_v5_features( + struct xfs_sb *sbp) +{ + /* We must not have any unknown V4 feature bits set */ + if (sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) + return false; + + /* + * The CRC bit is considered an invalid V4 flag, so we have to add it + * manually to the OKBITS mask. + */ + if (sbp->sb_features2 & ~(XFS_SB_VERSION2_OKBITS | + XFS_SB_VERSION2_CRCBIT)) + return false; + + /* Now check all the required V4 feature flags are set. */ + +#define V5_VERS_FLAGS (XFS_SB_VERSION_NLINKBIT | \ + XFS_SB_VERSION_ALIGNBIT | \ + XFS_SB_VERSION_LOGV2BIT | \ + XFS_SB_VERSION_EXTFLGBIT | \ + XFS_SB_VERSION_DIRV2BIT | \ + XFS_SB_VERSION_MOREBITSBIT) + +#define V5_FEAT_FLAGS (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ + XFS_SB_VERSION2_ATTR2BIT | \ + XFS_SB_VERSION2_PROJID32BIT | \ + XFS_SB_VERSION2_CRCBIT) + + if ((sbp->sb_versionnum & V5_VERS_FLAGS) != V5_VERS_FLAGS) + return false; + if ((sbp->sb_features2 & V5_FEAT_FLAGS) != V5_FEAT_FLAGS) + return false; + return true; +} + +/* * We support all XFS versions newer than a v4 superblock with V2 directories. */ bool xfs_sb_good_version( struct xfs_sb *sbp) { - /* all v5 filesystems are supported */ + /* + * All v5 filesystems are supported, but we must check that all the + * required v4 feature flags are enabled correctly as the code checks + * those flags and not for v5 support. + */ if (xfs_sb_is_v5(sbp)) - return true; + return xfs_sb_validate_v5_features(sbp); + + /* We must not have any unknown v4 feature bits set */ + if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || + ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && + (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) + return false; /* versions prior to v4 are not supported */ if (XFS_SB_VERSION_NUM(sbp) < XFS_SB_VERSION_4) @@ -51,12 +102,6 @@ xfs_sb_good_version( if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT)) return false; - /* And must not have any unknown v4 feature bits set */ - if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || - ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && - (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) - return false; - /* It's a supported v4 filesystem */ return true; } @@ -70,6 +115,8 @@ xfs_sb_version_to_features( /* optional V4 features */ if (sbp->sb_rblocks > 0) features |= XFS_FEAT_REALTIME; + if (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT) + features |= XFS_FEAT_NLINK; if (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT) features |= XFS_FEAT_ATTR; if (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT) @@ -124,6 +171,9 @@ xfs_sb_version_to_features( features |= XFS_FEAT_BIGTIME; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR) features |= XFS_FEAT_NEEDSREPAIR; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NREXT64) + features |= XFS_FEAT_NREXT64; + return features; } @@ -262,12 +312,15 @@ xfs_validate_sb_common( bool has_dalign; if (!xfs_verify_magic(bp, dsb->sb_magicnum)) { - xfs_warn(mp, "bad magic number"); + xfs_warn(mp, +"Superblock has bad magic number 0x%x. Not an XFS filesystem?", + be32_to_cpu(dsb->sb_magicnum)); return -EWRONGFS; } if (!xfs_sb_good_version(sbp)) { - xfs_warn(mp, "bad version"); + xfs_warn(mp, +"Superblock has unknown features enabled or corrupted feature masks."); return -EWRONGFS; } @@ -911,6 +964,11 @@ xfs_log_sb( * reservations that have been taken out percpu counters. If we have an * unclean shutdown, this will be corrected by log recovery rebuilding * the counters from the AGF block counts. + * + * Do not update sb_frextents here because it is not part of the lazy + * sb counters, despite having a percpu counter. It is always kept + * consistent with the ondisk rtbitmap by xfs_trans_apply_sb_deltas() + * and hence we don't need have to update it here. */ if (xfs_has_lazysbcount(mp)) { mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); @@ -1135,6 +1193,8 @@ xfs_fs_geometry( } else { geo->logsectsize = BBSIZE; } + if (xfs_has_large_extent_counts(mp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64; geo->rtsectsize = sbp->sb_blocksize; geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp); diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 25c4cab58851..c4381388c0c1 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -54,13 +54,23 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, /* * Values for t_flags. */ -#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */ -#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */ -#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */ -#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ -#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ -#define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */ -#define XFS_TRANS_RES_FDBLKS 0x80 /* reserve newly freed blocks */ +/* Transaction needs to be logged */ +#define XFS_TRANS_DIRTY (1u << 0) +/* Superblock is dirty and needs to be logged */ +#define XFS_TRANS_SB_DIRTY (1u << 1) +/* Transaction took a permanent log reservation */ +#define XFS_TRANS_PERM_LOG_RES (1u << 2) +/* Synchronous transaction commit needed */ +#define XFS_TRANS_SYNC (1u << 3) +/* Transaction can use reserve block pool */ +#define XFS_TRANS_RESERVE (1u << 4) +/* Transaction should avoid VFS level superblock write accounting */ +#define XFS_TRANS_NO_WRITECOUNT (1u << 5) +/* Transaction has freed blocks returned to it's reservation */ +#define XFS_TRANS_RES_FDBLKS (1u << 6) +/* Transaction contains an intent done log item */ +#define XFS_TRANS_HAS_INTENT_DONE (1u << 7) + /* * LOWMODE is used by the allocator to activate the lowspace algorithm - when * free space is running low the extent allocator may choose to allocate an diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index f0b38f4aba80..bdc777b9ec4a 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -204,7 +204,7 @@ xfs_failaddr_t xfs_symlink_shortform_verify( struct xfs_inode *ip) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); char *sfp = (char *)ifp->if_u1.if_data; int size = ifp->if_bytes; char *endp = sfp + size; @@ -213,7 +213,7 @@ xfs_symlink_shortform_verify( /* * Zero length symlinks should never occur in memory as they are - * never alllowed to exist on disk. + * never allowed to exist on disk. */ if (!size) return __this_address; diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 6f83d9b306ee..5b2f27cbdb80 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -56,15 +56,14 @@ xfs_calc_buf_res( * Per-extent log reservation for the btree changes involved in freeing or * allocating an extent. In classic XFS there were two trees that will be * modified (bnobt + cntbt). With rmap enabled, there are three trees - * (rmapbt). With reflink, there are four trees (refcountbt). The number of - * blocks reserved is based on the formula: + * (rmapbt). The number of blocks reserved is based on the formula: * * num trees * ((2 blocks/level * max depth) - 1) * * Keep in mind that max depth is calculated separately for each type of tree. */ uint -xfs_allocfree_log_count( +xfs_allocfree_block_count( struct xfs_mount *mp, uint num_ops) { @@ -73,13 +72,24 @@ xfs_allocfree_log_count( blocks = num_ops * 2 * (2 * mp->m_alloc_maxlevels - 1); if (xfs_has_rmapbt(mp)) blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1); - if (xfs_has_reflink(mp)) - blocks += num_ops * (2 * mp->m_refc_maxlevels - 1); return blocks; } /* + * Per-extent log reservation for refcount btree changes. These are never done + * in the same transaction as an allocation or a free, so we compute them + * separately. + */ +static unsigned int +xfs_refcountbt_block_count( + struct xfs_mount *mp, + unsigned int num_ops) +{ + return num_ops * (2 * mp->m_refc_maxlevels - 1); +} + +/* * Logging inodes is really tricksy. They are logged in memory format, * which means that what we write into the log doesn't directly translate into * the amount of space they use on disk. @@ -136,7 +146,7 @@ xfs_calc_inobt_res( { return xfs_calc_buf_res(M_IGEO(mp)->inobt_maxlevels, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -183,7 +193,7 @@ xfs_calc_inode_chunk_res( { uint res, size = 0; - res = xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + res = xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); if (alloc) { /* icreate tx uses ordered buffers */ @@ -199,18 +209,18 @@ xfs_calc_inode_chunk_res( /* * Per-extent log reservation for the btree changes involved in freeing or * allocating a realtime extent. We have to be able to log as many rtbitmap - * blocks as needed to mark inuse MAXEXTLEN blocks' worth of realtime extents, - * as well as the realtime summary block. + * blocks as needed to mark inuse XFS_BMBT_MAX_EXTLEN blocks' worth of realtime + * extents, as well as the realtime summary block. */ static unsigned int -xfs_rtalloc_log_count( +xfs_rtalloc_block_count( struct xfs_mount *mp, unsigned int num_ops) { unsigned int blksz = XFS_FSB_TO_B(mp, 1); unsigned int rtbmp_bytes; - rtbmp_bytes = (MAXEXTLEN / mp->m_sb.sb_rextsize) / NBBY; + rtbmp_bytes = (XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize) / NBBY; return (howmany(rtbmp_bytes, blksz) + 1) * num_ops; } @@ -233,6 +243,28 @@ xfs_rtalloc_log_count( * register overflow from temporaries in the calculations. */ +/* + * Compute the log reservation required to handle the refcount update + * transaction. Refcount updates are always done via deferred log items. + * + * This is calculated as: + * Data device refcount updates (t1): + * the agfs of the ags containing the blocks: nr_ops * sector size + * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size + */ +static unsigned int +xfs_calc_refcountbt_reservation( + struct xfs_mount *mp, + unsigned int nr_ops) +{ + unsigned int blksz = XFS_FSB_TO_B(mp, 1); + + if (!xfs_has_reflink(mp)) + return 0; + + return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz); +} /* * In a write transaction we can allocate a maximum of 2 @@ -247,7 +279,7 @@ xfs_rtalloc_log_count( * the inode's bmap btree: max depth * block size * the agfs of the ags from which the extents are allocated: 2 * sector * the superblock free block counter: sector size - * the realtime bitmap: ((MAXEXTLEN / rtextsize) / NBBY) bytes + * the realtime bitmap: ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes * the realtime summary: 1 block * the allocation btrees: 2 trees * (2 * max depth - 1) * block size * And the bmap_finish transaction can free bmap blocks in a join (t3): @@ -255,34 +287,65 @@ xfs_rtalloc_log_count( * the agfls of the ags containing the blocks: 2 * sector size * the super block free block counter: sector size * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + * And any refcount updates that happen in a separate transaction (t4). */ STATIC uint xfs_calc_write_reservation( - struct xfs_mount *mp) + struct xfs_mount *mp, + bool for_minlogsize) { - unsigned int t1, t2, t3; + unsigned int t1, t2, t3, t4; unsigned int blksz = XFS_FSB_TO_B(mp, 1); t1 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz); if (xfs_has_realtime(mp)) { t2 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 1), blksz) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), blksz); + xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 1), blksz) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), blksz); } else { t2 = 0; } t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz); - return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + /* + * In the early days of reflink, we included enough reservation to log + * two refcountbt splits for each transaction. The codebase runs + * refcountbt updates in separate transactions now, so to compute the + * minimum log size, add the refcountbtree splits back to t1 and t3 and + * do not account them separately as t4. Reflink did not support + * realtime when the reservations were established, so no adjustment to + * t2 is needed. + */ + if (for_minlogsize) { + unsigned int adj = 0; + + if (xfs_has_reflink(mp)) + adj = xfs_calc_buf_res( + xfs_refcountbt_block_count(mp, 2), + blksz); + t1 += adj; + t3 += adj; + return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + } + + t4 = xfs_calc_refcountbt_reservation(mp, 1); + return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3)); +} + +unsigned int +xfs_calc_write_reservation_minlogsize( + struct xfs_mount *mp) +{ + return xfs_calc_write_reservation(mp, true); } /* @@ -299,38 +362,67 @@ xfs_calc_write_reservation( * the agf for each of the ags: 2 * sector size * the agfl for each of the ags: 2 * sector size * the super block to reflect the freed blocks: sector size - * the realtime bitmap: 2 exts * ((MAXEXTLEN / rtextsize) / NBBY) bytes + * the realtime bitmap: + * 2 exts * ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes * the realtime summary: 2 exts * 1 block * worst case split in allocation btrees per extent assuming 2 extents: * 2 exts * 2 trees * (2 * max depth - 1) * block size + * And any refcount updates that happen in a separate transaction (t4). */ STATIC uint xfs_calc_itruncate_reservation( - struct xfs_mount *mp) + struct xfs_mount *mp, + bool for_minlogsize) { - unsigned int t1, t2, t3; + unsigned int t1, t2, t3, t4; unsigned int blksz = XFS_FSB_TO_B(mp, 1); t1 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz); t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), blksz); + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), blksz); if (xfs_has_realtime(mp)) { t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 2), blksz) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 2), blksz) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz); } else { t3 = 0; } - return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + /* + * In the early days of reflink, we included enough reservation to log + * four refcountbt splits in the same transaction as bnobt/cntbt + * updates. The codebase runs refcountbt updates in separate + * transactions now, so to compute the minimum log size, add the + * refcount btree splits back here and do not compute them separately + * as t4. Reflink did not support realtime when the reservations were + * established, so do not adjust t3. + */ + if (for_minlogsize) { + if (xfs_has_reflink(mp)) + t2 += xfs_calc_buf_res( + xfs_refcountbt_block_count(mp, 4), + blksz); + + return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + } + + t4 = xfs_calc_refcountbt_reservation(mp, 2); + return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3)); +} + +unsigned int +xfs_calc_itruncate_reservation_minlogsize( + struct xfs_mount *mp) +{ + return xfs_calc_itruncate_reservation(mp, true); } /* * In renaming a files we can modify: - * the four inodes involved: 4 * inode size + * the five inodes involved: 5 * inode size * the two directory btrees: 2 * (max depth + v2) * dir block size * the two directory bmap btrees: 2 * max depth * block size * And the bmap_finish transaction can free dir and bmap blocks (two sets @@ -345,11 +437,11 @@ xfs_calc_rename_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - max((xfs_calc_inode_res(mp, 4) + + max((xfs_calc_inode_res(mp, 5) + xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 3), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3), XFS_FSB_TO_B(mp, 1)))); } @@ -389,7 +481,7 @@ xfs_calc_link_reservation( xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)))); } @@ -423,11 +515,11 @@ xfs_calc_remove_reservation( { return XFS_DQUOT_LOGRES(mp) + xfs_calc_iunlink_add_reservation(mp) + - max((xfs_calc_inode_res(mp, 1) + + max((xfs_calc_inode_res(mp, 2) + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), XFS_FSB_TO_B(mp, 1)))); } @@ -572,7 +664,7 @@ xfs_calc_growdata_reservation( struct xfs_mount *mp) { return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -594,7 +686,7 @@ xfs_calc_growrtalloc_reservation( xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), XFS_FSB_TO_B(mp, 1)) + xfs_calc_inode_res(mp, 1) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -670,7 +762,7 @@ xfs_calc_addafork_reservation( xfs_calc_buf_res(1, mp->m_dir_geo->blksize) + xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -693,7 +785,7 @@ xfs_calc_attrinval_reservation( xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), XFS_FSB_TO_B(mp, 1)))); } @@ -760,7 +852,7 @@ xfs_calc_attrrm_reservation( XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)), (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), XFS_FSB_TO_B(mp, 1)))); } @@ -791,13 +883,21 @@ xfs_calc_qm_setqlim_reservation(void) */ STATIC uint xfs_calc_qm_dqalloc_reservation( - struct xfs_mount *mp) + struct xfs_mount *mp, + bool for_minlogsize) { - return xfs_calc_write_reservation(mp) + + return xfs_calc_write_reservation(mp, for_minlogsize) + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1); } +unsigned int +xfs_calc_qm_dqalloc_reservation_minlogsize( + struct xfs_mount *mp) +{ + return xfs_calc_qm_dqalloc_reservation(mp, true); +} + /* * Syncing the incore super block changes to disk. * the super block to reflect the changes: sector size @@ -814,36 +914,18 @@ xfs_trans_resv_calc( struct xfs_mount *mp, struct xfs_trans_resv *resp) { - unsigned int rmap_maxlevels = mp->m_rmap_maxlevels; - - /* - * In the early days of rmap+reflink, we always set the rmap maxlevels - * to 9 even if the AG was small enough that it would never grow to - * that height. Transaction reservation sizes influence the minimum - * log size calculation, which influences the size of the log that mkfs - * creates. Use the old value here to ensure that newly formatted - * small filesystems will mount on older kernels. - */ - if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp)) - mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS; + int logcount_adj = 0; /* * The following transactions are logged in physical format and * require a permanent reservation on space. */ - resp->tr_write.tr_logres = xfs_calc_write_reservation(mp); - if (xfs_has_reflink(mp)) - resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; - else - resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; + resp->tr_write.tr_logres = xfs_calc_write_reservation(mp, false); + resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES; - resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp); - if (xfs_has_reflink(mp)) - resp->tr_itruncate.tr_logcount = - XFS_ITRUNCATE_LOG_COUNT_REFLINK; - else - resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; + resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp, false); + resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp); @@ -899,11 +981,9 @@ xfs_trans_resv_calc( resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT; resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; - resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp); - if (xfs_has_reflink(mp)) - resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; - else - resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; + resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp, + false); + resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; /* @@ -930,6 +1010,19 @@ xfs_trans_resv_calc( resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp); resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp); - /* Put everything back the way it was. This goes at the end. */ - mp->m_rmap_maxlevels = rmap_maxlevels; + /* + * Add one logcount for BUI items that appear with rmap or reflink, + * one logcount for refcount intent items, and one logcount for rmap + * intent items. + */ + if (xfs_has_reflink(mp) || xfs_has_rmapbt(mp)) + logcount_adj++; + if (xfs_has_reflink(mp)) + logcount_adj++; + if (xfs_has_rmapbt(mp)) + logcount_adj++; + + resp->tr_itruncate.tr_logcount += logcount_adj; + resp->tr_write.tr_logcount += logcount_adj; + resp->tr_qm_dqalloc.tr_logcount += logcount_adj; } diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index fc4e9b369a3a..0554b9d775d2 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -73,7 +73,6 @@ struct xfs_trans_resv { #define XFS_DEFAULT_LOG_COUNT 1 #define XFS_DEFAULT_PERM_LOG_COUNT 2 #define XFS_ITRUNCATE_LOG_COUNT 2 -#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8 #define XFS_INACTIVE_LOG_COUNT 2 #define XFS_CREATE_LOG_COUNT 2 #define XFS_CREATE_TMPFILE_LOG_COUNT 2 @@ -83,13 +82,24 @@ struct xfs_trans_resv { #define XFS_LINK_LOG_COUNT 2 #define XFS_RENAME_LOG_COUNT 2 #define XFS_WRITE_LOG_COUNT 2 -#define XFS_WRITE_LOG_COUNT_REFLINK 8 #define XFS_ADDAFORK_LOG_COUNT 2 #define XFS_ATTRINVAL_LOG_COUNT 1 #define XFS_ATTRSET_LOG_COUNT 3 #define XFS_ATTRRM_LOG_COUNT 3 +/* + * Original log operation counts were overestimated in the early days of + * reflink. These are retained here purely for minimum log size calculations + * and must not be used for runtime reservations. + */ +#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8 +#define XFS_WRITE_LOG_COUNT_REFLINK 8 + void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp); -uint xfs_allocfree_log_count(struct xfs_mount *mp, uint num_ops); +uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops); + +unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp); +unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp); +unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp); #endif /* __XFS_TRANS_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c index e810d23f2d97..5c2765934732 100644 --- a/fs/xfs/libxfs/xfs_types.c +++ b/fs/xfs/libxfs/xfs_types.c @@ -13,25 +13,13 @@ #include "xfs_mount.h" #include "xfs_ag.h" -/* Find the size of the AG, in blocks. */ -inline xfs_agblock_t -xfs_ag_block_count( - struct xfs_mount *mp, - xfs_agnumber_t agno) -{ - ASSERT(agno < mp->m_sb.sb_agcount); - - if (agno < mp->m_sb.sb_agcount - 1) - return mp->m_sb.sb_agblocks; - return mp->m_sb.sb_dblocks - (agno * mp->m_sb.sb_agblocks); -} /* * Verify that an AG block number pointer neither points outside the AG * nor points at static metadata. */ -inline bool -xfs_verify_agbno( +static inline bool +xfs_verify_agno_agbno( struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno) @@ -59,7 +47,7 @@ xfs_verify_fsbno( if (agno >= mp->m_sb.sb_agcount) return false; - return xfs_verify_agbno(mp, agno, XFS_FSB_TO_AGBNO(mp, fsbno)); + return xfs_verify_agno_agbno(mp, agno, XFS_FSB_TO_AGBNO(mp, fsbno)); } /* @@ -85,40 +73,12 @@ xfs_verify_fsbext( XFS_FSB_TO_AGNO(mp, fsbno + len - 1); } -/* Calculate the first and last possible inode number in an AG. */ -inline void -xfs_agino_range( - struct xfs_mount *mp, - xfs_agnumber_t agno, - xfs_agino_t *first, - xfs_agino_t *last) -{ - xfs_agblock_t bno; - xfs_agblock_t eoag; - - eoag = xfs_ag_block_count(mp, agno); - - /* - * Calculate the first inode, which will be in the first - * cluster-aligned block after the AGFL. - */ - bno = round_up(XFS_AGFL_BLOCK(mp) + 1, M_IGEO(mp)->cluster_align); - *first = XFS_AGB_TO_AGINO(mp, bno); - - /* - * Calculate the last inode, which will be at the end of the - * last (aligned) cluster that can be allocated in the AG. - */ - bno = round_down(eoag, M_IGEO(mp)->cluster_align); - *last = XFS_AGB_TO_AGINO(mp, bno) - 1; -} - /* * Verify that an AG inode number pointer neither points outside the AG * nor points at static metadata. */ -inline bool -xfs_verify_agino( +static inline bool +xfs_verify_agno_agino( struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino) @@ -131,19 +91,6 @@ xfs_verify_agino( } /* - * Verify that an AG inode number pointer neither points outside the AG - * nor points at static metadata, or is NULLAGINO. - */ -bool -xfs_verify_agino_or_null( - struct xfs_mount *mp, - xfs_agnumber_t agno, - xfs_agino_t agino) -{ - return agino == NULLAGINO || xfs_verify_agino(mp, agno, agino); -} - -/* * Verify that an FS inode number pointer neither points outside the * filesystem nor points at static AG metadata. */ @@ -159,7 +106,7 @@ xfs_verify_ino( return false; if (XFS_AGINO_TO_INO(mp, agno, agino) != ino) return false; - return xfs_verify_agino(mp, agno, agino); + return xfs_verify_agno_agino(mp, agno, agino); } /* Is this an internal inode number? */ @@ -229,12 +176,8 @@ xfs_icount_range( /* root, rtbitmap, rtsum all live in the first chunk */ *min = XFS_INODES_PER_CHUNK; - for_each_perag(mp, agno, pag) { - xfs_agino_t first, last; - - xfs_agino_range(mp, agno, &first, &last); - nr_inos += last - first + 1; - } + for_each_perag(mp, agno, pag) + nr_inos += pag->agino_max - pag->agino_min + 1; *max = nr_inos; } diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index b6da06b40989..5ebdda7e1078 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -12,8 +12,8 @@ typedef uint32_t xfs_agblock_t; /* blockno in alloc. group */ typedef uint32_t xfs_agino_t; /* inode # within allocation grp */ typedef uint32_t xfs_extlen_t; /* extent length in blocks */ typedef uint32_t xfs_agnumber_t; /* allocation group number */ -typedef int32_t xfs_extnum_t; /* # of extents in a file */ -typedef int16_t xfs_aextnum_t; /* # extents in an attribute fork */ +typedef uint64_t xfs_extnum_t; /* # of extents in a file */ +typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */ typedef int64_t xfs_fsize_t; /* bytes in a file */ typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */ @@ -57,13 +57,6 @@ typedef void * xfs_failaddr_t; #define NULLAGINO ((xfs_agino_t)-1) /* - * Max values for extlen, extnum, aextnum. - */ -#define MAXEXTLEN ((xfs_extlen_t)0x001fffff) /* 21 bits */ -#define MAXEXTNUM ((xfs_extnum_t)0x7fffffff) /* signed int */ -#define MAXAEXTNUM ((xfs_aextnum_t)0x7fff) /* signed short */ - -/* * Minimum and maximum blocksize and sectorsize. * The blocksize upper limit is pretty much arbitrary. * The sectorsize upper limit is due to sizeof(sb_sectsize). @@ -173,6 +166,36 @@ typedef struct xfs_bmbt_irec xfs_exntst_t br_state; /* extent state */ } xfs_bmbt_irec_t; +enum xfs_refc_domain { + XFS_REFC_DOMAIN_SHARED = 0, + XFS_REFC_DOMAIN_COW, +}; + +#define XFS_REFC_DOMAIN_STRINGS \ + { XFS_REFC_DOMAIN_SHARED, "shared" }, \ + { XFS_REFC_DOMAIN_COW, "cow" } + +struct xfs_refcount_irec { + xfs_agblock_t rc_startblock; /* starting block number */ + xfs_extlen_t rc_blockcount; /* count of free blocks */ + xfs_nlink_t rc_refcount; /* number of inodes linked here */ + enum xfs_refc_domain rc_domain; /* shared or cow staging extent? */ +}; + +#define XFS_RMAP_ATTR_FORK (1 << 0) +#define XFS_RMAP_BMBT_BLOCK (1 << 1) +#define XFS_RMAP_UNWRITTEN (1 << 2) +#define XFS_RMAP_KEY_FLAGS (XFS_RMAP_ATTR_FORK | \ + XFS_RMAP_BMBT_BLOCK) +#define XFS_RMAP_REC_FLAGS (XFS_RMAP_UNWRITTEN) +struct xfs_rmap_irec { + xfs_agblock_t rm_startblock; /* extent start block */ + xfs_extlen_t rm_blockcount; /* extent length */ + uint64_t rm_owner; /* extent owner */ + uint64_t rm_offset; /* offset within the owner */ + unsigned int rm_flags; /* state flags */ +}; + /* per-AG block reservation types */ enum xfs_ag_resv_type { XFS_AG_RESV_NONE = 0, @@ -186,19 +209,10 @@ enum xfs_ag_resv_type { */ struct xfs_mount; -xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno); -bool xfs_verify_agbno(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t agbno); bool xfs_verify_fsbno(struct xfs_mount *mp, xfs_fsblock_t fsbno); bool xfs_verify_fsbext(struct xfs_mount *mp, xfs_fsblock_t fsbno, xfs_fsblock_t len); -void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t *first, xfs_agino_t *last); -bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t agino); -bool xfs_verify_agino_or_null(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t agino); bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino); diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index bed798792226..b7b838bd4ba4 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -281,7 +281,7 @@ xchk_superblock( features_mask = cpu_to_be32(XFS_SB_VERSION2_ATTR2BIT); if ((sb->sb_features2 & features_mask) != (cpu_to_be32(mp->m_sb.sb_features2) & features_mask)) - xchk_block_set_corrupt(sc, bp); + xchk_block_set_preen(sc, bp); if (!xfs_has_crc(mp)) { /* all v5 fields must be zero */ @@ -290,39 +290,38 @@ xchk_superblock( offsetof(struct xfs_dsb, sb_features_compat))) xchk_block_set_corrupt(sc, bp); } else { - /* Check compat flags; all are set at mkfs time. */ - features_mask = cpu_to_be32(XFS_SB_FEAT_COMPAT_UNKNOWN); - if ((sb->sb_features_compat & features_mask) != - (cpu_to_be32(mp->m_sb.sb_features_compat) & features_mask)) + /* compat features must match */ + if (sb->sb_features_compat != + cpu_to_be32(mp->m_sb.sb_features_compat)) xchk_block_set_corrupt(sc, bp); - /* Check ro compat flags; all are set at mkfs time. */ - features_mask = cpu_to_be32(XFS_SB_FEAT_RO_COMPAT_UNKNOWN | - XFS_SB_FEAT_RO_COMPAT_FINOBT | - XFS_SB_FEAT_RO_COMPAT_RMAPBT | - XFS_SB_FEAT_RO_COMPAT_REFLINK); - if ((sb->sb_features_ro_compat & features_mask) != - (cpu_to_be32(mp->m_sb.sb_features_ro_compat) & - features_mask)) + /* ro compat features must match */ + if (sb->sb_features_ro_compat != + cpu_to_be32(mp->m_sb.sb_features_ro_compat)) xchk_block_set_corrupt(sc, bp); - /* Check incompat flags; all are set at mkfs time. */ - features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_UNKNOWN | - XFS_SB_FEAT_INCOMPAT_FTYPE | - XFS_SB_FEAT_INCOMPAT_SPINODES | - XFS_SB_FEAT_INCOMPAT_META_UUID); - if ((sb->sb_features_incompat & features_mask) != - (cpu_to_be32(mp->m_sb.sb_features_incompat) & - features_mask)) - xchk_block_set_corrupt(sc, bp); + /* + * NEEDSREPAIR is ignored on a secondary super, so we should + * clear it when we find it, though it's not a corruption. + */ + features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR); + if ((cpu_to_be32(mp->m_sb.sb_features_incompat) ^ + sb->sb_features_incompat) & features_mask) + xchk_block_set_preen(sc, bp); - /* Check log incompat flags; all are set at mkfs time. */ - features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN); - if ((sb->sb_features_log_incompat & features_mask) != - (cpu_to_be32(mp->m_sb.sb_features_log_incompat) & - features_mask)) + /* all other incompat features must match */ + if ((cpu_to_be32(mp->m_sb.sb_features_incompat) ^ + sb->sb_features_incompat) & ~features_mask) xchk_block_set_corrupt(sc, bp); + /* + * log incompat features protect newer log record types from + * older log recovery code. Log recovery doesn't check the + * secondary supers, so we can clear these if needed. + */ + if (sb->sb_features_log_incompat) + xchk_block_set_preen(sc, bp); + /* Don't care about sb_crc */ if (sb->sb_spino_align != cpu_to_be32(mp->m_sb.sb_spino_align)) @@ -542,16 +541,16 @@ xchk_agf( /* Check the AG length */ eoag = be32_to_cpu(agf->agf_length); - if (eoag != xfs_ag_block_count(mp, agno)) + if (eoag != pag->block_count) xchk_block_set_corrupt(sc, sc->sa.agf_bp); /* Check the AGF btree roots and levels */ agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]); - if (!xfs_verify_agbno(mp, agno, agbno)) + if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]); - if (!xfs_verify_agbno(mp, agno, agbno)) + if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); @@ -564,7 +563,7 @@ xchk_agf( if (xfs_has_rmapbt(mp)) { agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]); - if (!xfs_verify_agbno(mp, agno, agbno)) + if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); @@ -574,7 +573,7 @@ xchk_agf( if (xfs_has_reflink(mp)) { agbno = be32_to_cpu(agf->agf_refcount_root); - if (!xfs_verify_agbno(mp, agno, agbno)) + if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); level = be32_to_cpu(agf->agf_refcount_level); @@ -640,9 +639,8 @@ xchk_agfl_block( { struct xchk_agfl_info *sai = priv; struct xfs_scrub *sc = sai->sc; - xfs_agnumber_t agno = sc->sa.pag->pag_agno; - if (xfs_verify_agbno(mp, agno, agbno) && + if (xfs_verify_agbno(sc->sa.pag, agbno) && sai->nr_entries < sai->sz_entries) sai->entries[sai->nr_entries++] = agbno; else @@ -872,12 +870,12 @@ xchk_agi( /* Check the AG length */ eoag = be32_to_cpu(agi->agi_length); - if (eoag != xfs_ag_block_count(mp, agno)) + if (eoag != pag->block_count) xchk_block_set_corrupt(sc, sc->sa.agi_bp); /* Check btree roots and levels */ agbno = be32_to_cpu(agi->agi_root); - if (!xfs_verify_agbno(mp, agno, agbno)) + if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); level = be32_to_cpu(agi->agi_level); @@ -886,7 +884,7 @@ xchk_agi( if (xfs_has_finobt(mp)) { agbno = be32_to_cpu(agi->agi_free_root); - if (!xfs_verify_agbno(mp, agno, agbno)) + if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); level = be32_to_cpu(agi->agi_free_level); @@ -903,17 +901,17 @@ xchk_agi( /* Check inode pointers */ agino = be32_to_cpu(agi->agi_newino); - if (!xfs_verify_agino_or_null(mp, agno, agino)) + if (!xfs_verify_agino_or_null(pag, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); agino = be32_to_cpu(agi->agi_dirino); - if (!xfs_verify_agino_or_null(mp, agno, agino)) + if (!xfs_verify_agino_or_null(pag, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); /* Check unlinked inode buckets */ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { agino = be32_to_cpu(agi->agi_unlinked[i]); - if (!xfs_verify_agino_or_null(mp, agno, agino)) + if (!xfs_verify_agino_or_null(pag, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); } diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index d7bfed52f4cd..1b0b4e243f77 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -52,6 +52,18 @@ xrep_superblock( xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); xfs_sb_to_disk(bp->b_addr, &mp->m_sb); + /* + * Don't write out a secondary super with NEEDSREPAIR or log incompat + * features set, since both are ignored when set on a secondary. + */ + if (xfs_has_crc(mp)) { + struct xfs_dsb *sb = bp->b_addr; + + sb->sb_features_incompat &= + ~cpu_to_be32(XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR); + sb->sb_features_log_incompat = 0; + } + /* Write this to disk. */ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF); xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1); @@ -94,7 +106,7 @@ xrep_agf_check_agfl_block( { struct xfs_scrub *sc = priv; - if (!xfs_verify_agbno(mp, sc->sa.pag->pag_agno, agbno)) + if (!xfs_verify_agbno(sc->sa.pag, agbno)) return -EFSCORRUPTED; return 0; } @@ -118,10 +130,7 @@ xrep_check_btree_root( struct xfs_scrub *sc, struct xrep_find_ag_btree *fab) { - struct xfs_mount *mp = sc->mp; - xfs_agnumber_t agno = sc->sm->sm_agno; - - return xfs_verify_agbno(mp, agno, fab->root) && + return xfs_verify_agbno(sc->sa.pag, fab->root) && fab->height <= fab->maxlevels; } @@ -189,8 +198,7 @@ xrep_agf_init_header( agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); agf->agf_seqno = cpu_to_be32(sc->sa.pag->pag_agno); - agf->agf_length = cpu_to_be32(xfs_ag_block_count(mp, - sc->sa.pag->pag_agno)); + agf->agf_length = cpu_to_be32(sc->sa.pag->block_count); agf->agf_flfirst = old_agf->agf_flfirst; agf->agf_fllast = old_agf->agf_fllast; agf->agf_flcount = old_agf->agf_flcount; @@ -393,7 +401,7 @@ xrep_agf( * btrees rooted in the AGF. If the AGFL contents are obviously bad * then we'll bail out. */ - error = xfs_alloc_read_agfl(mp, sc->tp, sc->sa.pag->pag_agno, &agfl_bp); + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); if (error) return error; @@ -654,8 +662,7 @@ xrep_agfl( * nothing wrong with the AGF, but all the AG header repair functions * have this chicken-and-egg problem. */ - error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.pag->pag_agno, 0, - &agf_bp); + error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp); if (error) return error; @@ -730,8 +737,7 @@ xrep_agi_find_btrees( int error; /* Read the AGF. */ - error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.pag->pag_agno, 0, - &agf_bp); + error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp); if (error) return error; @@ -770,8 +776,7 @@ xrep_agi_init_header( agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); agi->agi_seqno = cpu_to_be32(sc->sa.pag->pag_agno); - agi->agi_length = cpu_to_be32(xfs_ag_block_count(mp, - sc->sa.pag->pag_agno)); + agi->agi_length = cpu_to_be32(sc->sa.pag->block_count); agi->agi_newino = cpu_to_be32(NULLAGINO); agi->agi_dirino = cpu_to_be32(NULLAGINO); if (xfs_has_crc(mp)) diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 87518e1292f8..3b38f4e2a537 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -93,17 +93,14 @@ xchk_allocbt_rec( struct xchk_btree *bs, const union xfs_btree_rec *rec) { - struct xfs_mount *mp = bs->cur->bc_mp; - xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; + struct xfs_perag *pag = bs->cur->bc_ag.pag; xfs_agblock_t bno; xfs_extlen_t len; bno = be32_to_cpu(rec->alloc.ar_startblock); len = be32_to_cpu(rec->alloc.ar_blockcount); - if (bno + len <= bno || - !xfs_verify_agbno(mp, agno, bno) || - !xfs_verify_agbno(mp, agno, bno + len - 1)) + if (!xfs_verify_agbext(pag, bno, len)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); xchk_allocbt_xref(bs->sc, bno, len); diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h index 1719e1c4da59..3590e10e3e62 100644 --- a/fs/xfs/scrub/attr.h +++ b/fs/xfs/scrub/attr.h @@ -24,7 +24,7 @@ struct xchk_xattr_buf { * space bitmap follows immediately after; and we have a third buffer * for storing intermediate bitmap results. */ - uint8_t buf[0]; + uint8_t buf[]; }; /* A place to store attribute values. */ diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index a4cbbc346f60..f0b9cb6506fd 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -133,29 +133,13 @@ xchk_bmap_get_rmap( if (info->is_shared) { error = xfs_rmap_lookup_le_range(info->sc->sa.rmap_cur, agbno, owner, offset, rflags, rmap, &has_rmap); - if (!xchk_should_check_xref(info->sc, &error, - &info->sc->sa.rmap_cur)) - return false; - goto out; + } else { + error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno, + owner, offset, rflags, rmap, &has_rmap); } - - /* - * Otherwise, use the (faster) regular lookup. - */ - error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno, 0, owner, - offset, rflags, &has_rmap); - if (!xchk_should_check_xref(info->sc, &error, - &info->sc->sa.rmap_cur)) + if (!xchk_should_check_xref(info->sc, &error, &info->sc->sa.rmap_cur)) return false; - if (!has_rmap) - goto out; - error = xfs_rmap_get_rec(info->sc->sa.rmap_cur, rmap, &has_rmap); - if (!xchk_should_check_xref(info->sc, &error, - &info->sc->sa.rmap_cur)) - return false; - -out: if (!has_rmap) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); @@ -350,7 +334,7 @@ xchk_bmap_iextent( irec->br_startoff); /* Make sure the extent points to a valid place. */ - if (irec->br_blockcount > MAXEXTLEN) + if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN) xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); if (info->is_rt && @@ -393,7 +377,7 @@ xchk_bmapbt_rec( struct xfs_inode *ip = bs->cur->bc_ino.ip; struct xfs_buf *bp = NULL; struct xfs_btree_block *block; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, info->whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, info->whichfork); uint64_t owner; int i; @@ -442,7 +426,7 @@ xchk_bmap_btree( struct xchk_bmap_info *info) { struct xfs_owner_info oinfo; - struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork); struct xfs_mount *mp = sc->mp; struct xfs_inode *ip = sc->ip; struct xfs_btree_cur *cur; @@ -494,7 +478,7 @@ xchk_bmap_check_rmap( return 0; /* Now look up the bmbt record. */ - ifp = XFS_IFORK_PTR(sc->ip, sbcri->whichfork); + ifp = xfs_ifork_ptr(sc->ip, sbcri->whichfork); if (!ifp) { xchk_fblock_set_corrupt(sc, sbcri->whichfork, rec->rm_offset); @@ -556,7 +540,7 @@ xchk_bmap_check_ag_rmaps( struct xfs_buf *agf; int error; - error = xfs_alloc_read_agf(sc->mp, sc->tp, pag->pag_agno, 0, &agf); + error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf); if (error) return error; @@ -579,7 +563,7 @@ xchk_bmap_check_rmaps( struct xfs_scrub *sc, int whichfork) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork); struct xfs_perag *pag; xfs_agnumber_t agno; bool zero_size; @@ -594,7 +578,7 @@ xchk_bmap_check_rmaps( if (XFS_IS_REALTIME_INODE(sc->ip) && whichfork == XFS_DATA_FORK) return 0; - ASSERT(XFS_IFORK_PTR(sc->ip, whichfork) != NULL); + ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL); /* * Only do this for complex maps that are in btree format, or for @@ -640,7 +624,7 @@ xchk_bmap( struct xchk_bmap_info info = { NULL }; struct xfs_mount *mp = sc->mp; struct xfs_inode *ip = sc->ip; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); xfs_fileoff_t endoff; struct xfs_iext_cursor icur; int error = 0; @@ -705,7 +689,7 @@ xchk_bmap( /* Scrub extent records. */ info.lastoff = 0; - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); for_each_xfs_iext(ifp, &icur, &irec) { if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 39dd46f038fe..2f4519590dc1 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -462,7 +462,7 @@ xchk_btree_check_iroot_minrecs( */ if (bs->cur->bc_btnum == XFS_BTNUM_BMAP && bs->cur->bc_ino.whichfork == XFS_DATA_FORK && - XFS_IFORK_Q(bs->sc->ip)) + xfs_inode_has_attr_fork(bs->sc->ip)) return false; return true; diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index bf1f3607d0b6..9bbbf20f401b 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -23,6 +23,8 @@ #include "xfs_rmap_btree.h" #include "xfs_log.h" #include "xfs_trans_priv.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_reflink.h" #include "xfs_ag.h" @@ -414,15 +416,15 @@ xchk_ag_read_headers( if (!sa->pag) return -ENOENT; - error = xfs_ialloc_read_agi(mp, sc->tp, agno, &sa->agi_bp); + error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI)) return error; - error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &sa->agf_bp); + error = xfs_alloc_read_agf(sa->pag, sc->tp, 0, &sa->agf_bp); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF)) return error; - error = xfs_alloc_read_agfl(mp, sc->tp, agno, &sa->agfl_bp); + error = xfs_alloc_read_agfl(sa->pag, sc->tp, &sa->agfl_bp); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL)) return error; diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index b962cfbbd92b..84fe3d33d699 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -482,7 +482,7 @@ xchk_da_btree( int error; /* Skip short format data structures; no btree to scan. */ - if (!xfs_ifork_has_extents(XFS_IFORK_PTR(sc->ip, whichfork))) + if (!xfs_ifork_has_extents(xfs_ifork_ptr(sc->ip, whichfork))) return 0; /* Set up initial da state. */ diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 200a63f58fe7..5c87800ab223 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -99,7 +99,7 @@ out: * we check the inode number to make sure it's sane, then we check that * we can look up this filename. Finally, we check the ftype. */ -STATIC int +STATIC bool xchk_dir_actor( struct dir_context *dir_iter, const char *name, @@ -124,7 +124,7 @@ xchk_dir_actor( xfs_dir2_dataptr_to_db(mp->m_dir_geo, pos)); if (xchk_should_terminate(sdc->sc, &error)) - return error; + return !error; /* Does this inode number make sense? */ if (!xfs_verify_dir_ino(mp, ino)) { @@ -191,8 +191,8 @@ out: * and return zero to xchk_directory. */ if (error == 0 && sdc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return -EFSCORRUPTED; - return error; + return false; + return !error; } /* Scrub a directory btree record. */ @@ -497,6 +497,7 @@ STATIC int xchk_directory_leaf1_bestfree( struct xfs_scrub *sc, struct xfs_da_args *args, + xfs_dir2_db_t last_data_db, xfs_dablk_t lblk) { struct xfs_dir3_icleaf_hdr leafhdr; @@ -534,10 +535,14 @@ xchk_directory_leaf1_bestfree( } /* - * There should be as many bestfree slots as there are dir data - * blocks that can fit under i_size. + * There must be enough bestfree slots to cover all the directory data + * blocks that we scanned. It is possible for there to be a hole + * between the last data block and i_disk_size. This seems like an + * oversight to the scrub author, but as we have been writing out + * directories like this (and xfs_repair doesn't mind them) for years, + * that's what we have to check. */ - if (bestcount != xfs_dir2_byte_to_db(geo, sc->ip->i_disk_size)) { + if (bestcount != last_data_db + 1) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); goto out; } @@ -662,15 +667,16 @@ xchk_directory_blocks( { struct xfs_bmbt_irec got; struct xfs_da_args args; - struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); struct xfs_mount *mp = sc->mp; xfs_fileoff_t leaf_lblk; xfs_fileoff_t free_lblk; xfs_fileoff_t lblk; struct xfs_iext_cursor icur; xfs_dablk_t dabno; + xfs_dir2_db_t last_data_db = 0; bool found; - int is_block = 0; + bool is_block = false; int error; /* Ignore local format directories. */ @@ -712,6 +718,7 @@ xchk_directory_blocks( args.geo->fsbcount); lblk < got.br_startoff + got.br_blockcount; lblk += args.geo->fsbcount) { + last_data_db = xfs_dir2_da_to_db(args.geo, lblk); error = xchk_directory_data_bestfree(sc, lblk, is_block); if (error) @@ -734,7 +741,7 @@ xchk_directory_blocks( xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); goto out; } - error = xchk_directory_leaf1_bestfree(sc, &args, + error = xchk_directory_leaf1_bestfree(sc, &args, last_data_db, leaf_lblk); if (error) goto out; diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 48a6cbdf95d0..6a6f8fe7f87c 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -78,10 +78,10 @@ xchk_fscount_warmup( continue; /* Lock both AG headers. */ - error = xfs_ialloc_read_agi(mp, sc->tp, agno, &agi_bp); + error = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp); if (error) break; - error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp); + error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp); if (error) break; diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index 2e61df3bca83..aa65ec88a0c0 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -8,6 +8,8 @@ #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_btree.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" #include "xfs_ag.h" #include "xfs_health.h" #include "scrub/scrub.h" diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 00848ee542fb..e312be7cd375 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -104,13 +104,12 @@ xchk_iallocbt_chunk( xfs_extlen_t len) { struct xfs_mount *mp = bs->cur->bc_mp; - xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; + struct xfs_perag *pag = bs->cur->bc_ag.pag; xfs_agblock_t bno; bno = XFS_AGINO_TO_AGBNO(mp, agino); - if (bno + len <= bno || - !xfs_verify_agbno(mp, agno, bno) || - !xfs_verify_agbno(mp, agno, bno + len - 1)) + + if (!xfs_verify_agbext(pag, bno, len)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); xchk_iallocbt_chunk_xref(bs->sc, irec, agino, bno, len); @@ -421,10 +420,10 @@ xchk_iallocbt_rec( const union xfs_btree_rec *rec) { struct xfs_mount *mp = bs->cur->bc_mp; + struct xfs_perag *pag = bs->cur->bc_ag.pag; struct xchk_iallocbt *iabt = bs->private; struct xfs_inobt_rec_incore irec; uint64_t holes; - xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; xfs_agino_t agino; xfs_extlen_t len; int holecount; @@ -446,8 +445,8 @@ xchk_iallocbt_rec( agino = irec.ir_startino; /* Record has to be properly aligned within the AG. */ - if (!xfs_verify_agino(mp, agno, agino) || - !xfs_verify_agino(mp, agno, agino + XFS_INODES_PER_CHUNK - 1)) { + if (!xfs_verify_agino(pag, agino) || + !xfs_verify_agino(pag, agino + XFS_INODES_PER_CHUNK - 1)) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); goto out; } diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 2405b09d03d0..51820b40ab1c 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -232,7 +232,9 @@ xchk_dinode( size_t fork_recs; unsigned long long isize; uint64_t flags2; - uint32_t nextents; + xfs_extnum_t nextents; + xfs_extnum_t naextents; + prid_t prid; uint16_t flags; uint16_t mode; @@ -267,6 +269,7 @@ xchk_dinode( * so just mark this inode for preening. */ xchk_ino_set_preen(sc, ino); + prid = 0; break; case 2: case 3: @@ -279,12 +282,17 @@ xchk_dinode( if (dip->di_projid_hi != 0 && !xfs_has_projid32(mp)) xchk_ino_set_corrupt(sc, ino); + + prid = be16_to_cpu(dip->di_projid_lo); break; default: xchk_ino_set_corrupt(sc, ino); return; } + if (xfs_has_projid32(mp)) + prid |= (prid_t)be16_to_cpu(dip->di_projid_hi) << 16; + /* * di_uid/di_gid -- -1 isn't invalid, but there's no way that * userspace could have created that. @@ -293,6 +301,13 @@ xchk_dinode( dip->di_gid == cpu_to_be32(-1U)) xchk_ino_set_warning(sc, ino); + /* + * project id of -1 isn't supposed to be valid, but the kernel didn't + * always validate that. + */ + if (prid == -1U) + xchk_ino_set_warning(sc, ino); + /* di_format */ switch (dip->di_format) { case XFS_DINODE_FMT_DEV: @@ -376,8 +391,10 @@ xchk_dinode( xchk_inode_extsize(sc, dip, ino, mode, flags); + nextents = xfs_dfork_data_extents(dip); + naextents = xfs_dfork_attr_extents(dip); + /* di_nextents */ - nextents = be32_to_cpu(dip->di_nextents); fork_recs = XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); switch (dip->di_format) { case XFS_DINODE_FMT_EXTENTS: @@ -397,7 +414,7 @@ xchk_dinode( /* di_forkoff */ if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize) xchk_ino_set_corrupt(sc, ino); - if (dip->di_anextents != 0 && dip->di_forkoff == 0) + if (naextents != 0 && dip->di_forkoff == 0) xchk_ino_set_corrupt(sc, ino); if (dip->di_forkoff == 0 && dip->di_aformat != XFS_DINODE_FMT_EXTENTS) xchk_ino_set_corrupt(sc, ino); @@ -409,19 +426,18 @@ xchk_dinode( xchk_ino_set_corrupt(sc, ino); /* di_anextents */ - nextents = be16_to_cpu(dip->di_anextents); fork_recs = XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); switch (dip->di_aformat) { case XFS_DINODE_FMT_EXTENTS: - if (nextents > fork_recs) + if (naextents > fork_recs) xchk_ino_set_corrupt(sc, ino); break; case XFS_DINODE_FMT_BTREE: - if (nextents <= fork_recs) + if (naextents <= fork_recs) xchk_ino_set_corrupt(sc, ino); break; default: - if (nextents != 0) + if (naextents != 0) xchk_ino_set_corrupt(sc, ino); } @@ -499,14 +515,14 @@ xchk_inode_xref_bmap( &nextents, &count); if (!xchk_should_check_xref(sc, &error, NULL)) return; - if (nextents < be32_to_cpu(dip->di_nextents)) + if (nextents < xfs_dfork_data_extents(dip)) xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, &nextents, &acount); if (!xchk_should_check_xref(sc, &error, NULL)) return; - if (nextents != be16_to_cpu(dip->di_anextents)) + if (nextents != xfs_dfork_attr_extents(dip)) xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); /* Check nblocks against the inode. */ diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index ab182a5cd0c0..d8dff3fd8053 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -38,7 +38,7 @@ struct xchk_parent_ctx { }; /* Look for a single entry in a directory pointing to an inode. */ -STATIC int +STATIC bool xchk_parent_actor( struct dir_context *dc, const char *name, @@ -62,7 +62,7 @@ xchk_parent_actor( if (xchk_should_terminate(spc->sc, &error)) spc->cancelled = true; - return error; + return !error; } /* Count the number of dentries in the parent dir that point to this inode. */ diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index d6c1b00a4fc8..21b4c9006859 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -48,10 +48,10 @@ xchk_setup_quota( dqtype = xchk_quota_to_dqtype(sc); if (dqtype == 0) return -EINVAL; - sc->flags |= XCHK_HAS_QUOTAOFFLOCK; - mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock); + if (!xfs_this_quota_on(sc->mp, dqtype)) return -ENOENT; + error = xchk_setup_fs(sc); if (error) return error; @@ -185,7 +185,7 @@ xchk_quota_data_fork( /* Check for data fork problems that apply only to quota files. */ max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk; - ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK); + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); for_each_xfs_iext(ifp, &icur, &irec) { if (xchk_should_terminate(sc, &error)) break; diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 2744eecdbaf0..a26ee0f24ef2 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -13,6 +13,8 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" #include "xfs_ag.h" /* @@ -267,15 +269,13 @@ done: STATIC void xchk_refcountbt_xref_rmap( struct xfs_scrub *sc, - xfs_agblock_t bno, - xfs_extlen_t len, - xfs_nlink_t refcount) + const struct xfs_refcount_irec *irec) { struct xchk_refcnt_check refchk = { - .sc = sc, - .bno = bno, - .len = len, - .refcount = refcount, + .sc = sc, + .bno = irec->rc_startblock, + .len = irec->rc_blockcount, + .refcount = irec->rc_refcount, .seen = 0, }; struct xfs_rmap_irec low; @@ -289,9 +289,9 @@ xchk_refcountbt_xref_rmap( /* Cross-reference with the rmapbt to confirm the refcount. */ memset(&low, 0, sizeof(low)); - low.rm_startblock = bno; + low.rm_startblock = irec->rc_startblock; memset(&high, 0xFF, sizeof(high)); - high.rm_startblock = bno + len - 1; + high.rm_startblock = irec->rc_startblock + irec->rc_blockcount - 1; INIT_LIST_HEAD(&refchk.fragments); error = xfs_rmap_query_range(sc->sa.rmap_cur, &low, &high, @@ -300,7 +300,7 @@ xchk_refcountbt_xref_rmap( goto out_free; xchk_refcountbt_process_rmap_fragments(&refchk); - if (refcount != refchk.seen) + if (irec->rc_refcount != refchk.seen) xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); out_free: @@ -313,17 +313,16 @@ out_free: /* Cross-reference with the other btrees. */ STATIC void xchk_refcountbt_xref( - struct xfs_scrub *sc, - xfs_agblock_t agbno, - xfs_extlen_t len, - xfs_nlink_t refcount) + struct xfs_scrub *sc, + const struct xfs_refcount_irec *irec) { if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return; - xchk_xref_is_used_space(sc, agbno, len); - xchk_xref_is_not_inode_chunk(sc, agbno, len); - xchk_refcountbt_xref_rmap(sc, agbno, len, refcount); + xchk_xref_is_used_space(sc, irec->rc_startblock, irec->rc_blockcount); + xchk_xref_is_not_inode_chunk(sc, irec->rc_startblock, + irec->rc_blockcount); + xchk_refcountbt_xref_rmap(sc, irec); } /* Scrub a refcountbt record. */ @@ -332,36 +331,27 @@ xchk_refcountbt_rec( struct xchk_btree *bs, const union xfs_btree_rec *rec) { - struct xfs_mount *mp = bs->cur->bc_mp; + struct xfs_refcount_irec irec; xfs_agblock_t *cow_blocks = bs->private; - xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; - xfs_agblock_t bno; - xfs_extlen_t len; - xfs_nlink_t refcount; - bool has_cowflag; + struct xfs_perag *pag = bs->cur->bc_ag.pag; - bno = be32_to_cpu(rec->refc.rc_startblock); - len = be32_to_cpu(rec->refc.rc_blockcount); - refcount = be32_to_cpu(rec->refc.rc_refcount); + xfs_refcount_btrec_to_irec(rec, &irec); - /* Only CoW records can have refcount == 1. */ - has_cowflag = (bno & XFS_REFC_COW_START); - if ((refcount == 1 && !has_cowflag) || (refcount != 1 && has_cowflag)) + /* Check the domain and refcount are not incompatible. */ + if (!xfs_refcount_check_domain(&irec)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - if (has_cowflag) - (*cow_blocks) += len; + + if (irec.rc_domain == XFS_REFC_DOMAIN_COW) + (*cow_blocks) += irec.rc_blockcount; /* Check the extent. */ - bno &= ~XFS_REFC_COW_START; - if (bno + len <= bno || - !xfs_verify_agbno(mp, agno, bno) || - !xfs_verify_agbno(mp, agno, bno + len - 1)) + if (!xfs_verify_agbext(pag, irec.rc_startblock, irec.rc_blockcount)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - if (refcount == 0) + if (irec.rc_refcount == 0) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - xchk_refcountbt_xref(bs->sc, bno, len, refcount); + xchk_refcountbt_xref(bs->sc, &irec); return 0; } @@ -425,7 +415,6 @@ xchk_xref_is_cow_staging( xfs_extlen_t len) { struct xfs_refcount_irec rc; - bool has_cowflag; int has_refcount; int error; @@ -433,8 +422,8 @@ xchk_xref_is_cow_staging( return; /* Find the CoW staging extent. */ - error = xfs_refcount_lookup_le(sc->sa.refc_cur, - agbno + XFS_REFC_COW_START, &has_refcount); + error = xfs_refcount_lookup_le(sc->sa.refc_cur, XFS_REFC_DOMAIN_COW, + agbno, &has_refcount); if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur)) return; if (!has_refcount) { @@ -450,9 +439,8 @@ xchk_xref_is_cow_staging( return; } - /* CoW flag must be set, refcount must be 1. */ - has_cowflag = (rc.rc_startblock & XFS_REFC_COW_START); - if (!has_cowflag || rc.rc_refcount != 1) + /* CoW lookup returned a shared extent record? */ + if (rc.rc_domain != XFS_REFC_DOMAIN_COW) xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); /* Must be at least as long as what was passed in */ @@ -476,7 +464,8 @@ xchk_xref_is_not_shared( if (!sc->sa.refc_cur || xchk_skip_xref(sc->sm)) return; - error = xfs_refcount_has_record(sc->sa.refc_cur, agbno, len, &shared); + error = xfs_refcount_has_record(sc->sa.refc_cur, XFS_REFC_DOMAIN_SHARED, + agbno, len, &shared); if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur)) return; if (shared) diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 8f3cba14ada3..c18bd039fce9 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -25,6 +25,7 @@ #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_quota.h" +#include "xfs_qm.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -198,7 +199,7 @@ xrep_calc_ag_resblks( icount = pag->pagi_count; } else { /* Try to get the actual counters from disk. */ - error = xfs_ialloc_read_agi(mp, NULL, sm->sm_agno, &bp); + error = xfs_ialloc_read_agi(pag, NULL, &bp); if (!error) { icount = pag->pagi_count; xfs_buf_relse(bp); @@ -206,9 +207,9 @@ xrep_calc_ag_resblks( } /* Now grab the block counters from the AGF. */ - error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp); + error = xfs_alloc_read_agf(pag, NULL, 0, &bp); if (error) { - aglen = xfs_ag_block_count(mp, sm->sm_agno); + aglen = pag->block_count; freelen = aglen; usedlen = aglen; } else { @@ -219,25 +220,22 @@ xrep_calc_ag_resblks( usedlen = aglen - freelen; xfs_buf_relse(bp); } - xfs_perag_put(pag); /* If the icount is impossible, make some worst-case assumptions. */ if (icount == NULLAGINO || - !xfs_verify_agino(mp, sm->sm_agno, icount)) { - xfs_agino_t first, last; - - xfs_agino_range(mp, sm->sm_agno, &first, &last); - icount = last - first + 1; + !xfs_verify_agino(pag, icount)) { + icount = pag->agino_max - pag->agino_min + 1; } /* If the block counts are impossible, make worst-case assumptions. */ if (aglen == NULLAGBLOCK || - aglen != xfs_ag_block_count(mp, sm->sm_agno) || + aglen != pag->block_count || freelen >= aglen) { - aglen = xfs_ag_block_count(mp, sm->sm_agno); + aglen = pag->block_count; freelen = aglen; usedlen = aglen; } + xfs_perag_put(pag); trace_xrep_calc_ag_resblks(mp, sm->sm_agno, icount, aglen, freelen, usedlen); @@ -299,13 +297,13 @@ xrep_alloc_ag_block( switch (resv) { case XFS_AG_RESV_AGFL: case XFS_AG_RESV_RMAPBT: - error = xfs_alloc_get_freelist(sc->tp, sc->sa.agf_bp, &bno, 1); + error = xfs_alloc_get_freelist(sc->sa.pag, sc->tp, + sc->sa.agf_bp, &bno, 1); if (error) return error; if (bno == NULLAGBLOCK) return -ENOSPC; - xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno, - 1, false); + xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno, 1, false); *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, bno); if (resv == XFS_AG_RESV_RMAPBT) xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.pag->pag_agno); @@ -456,16 +454,19 @@ xrep_invalidate_blocks( * assume it's owned by someone else. */ for_each_xbitmap_block(fsbno, bmr, n, bitmap) { + int error; + /* Skip AG headers and post-EOFS blocks */ if (!xfs_verify_fsbno(sc->mp, fsbno)) continue; - bp = xfs_buf_incore(sc->mp->m_ddev_targp, + error = xfs_buf_incore(sc->mp->m_ddev_targp, XFS_FSB_TO_DADDR(sc->mp, fsbno), - XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK); - if (bp) { - xfs_trans_bjoin(sc->tp, bp); - xfs_trans_binval(sc->tp, bp); - } + XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp); + if (error) + continue; + + xfs_trans_bjoin(sc->tp, bp); + xfs_trans_binval(sc->tp, bp); } return 0; @@ -515,8 +516,8 @@ xrep_put_freelist( return error; /* Put the block on the AGFL. */ - error = xfs_alloc_put_freelist(sc->tp, sc->sa.agf_bp, sc->sa.agfl_bp, - agbno, 0); + error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp, + sc->sa.agfl_bp, agbno, 0); if (error) return error; xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1, @@ -535,13 +536,12 @@ xrep_reap_block( { struct xfs_btree_cur *cur; struct xfs_buf *agf_bp = NULL; - xfs_agnumber_t agno; xfs_agblock_t agbno; bool has_other_rmap; int error; - agno = XFS_FSB_TO_AGNO(sc->mp, fsbno); agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); + ASSERT(XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno); /* * If we are repairing per-inode metadata, we need to read in the AGF @@ -549,7 +549,7 @@ xrep_reap_block( * the AGF buffer that the setup functions already grabbed. */ if (sc->ip) { - error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf_bp); + error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp); if (error) return error; } else { @@ -912,11 +912,13 @@ xrep_force_quotacheck( if (!(flag & sc->mp->m_qflags)) return; + mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock); sc->mp->m_qflags &= ~flag; spin_lock(&sc->mp->m_sb_lock); sc->mp->m_sb.sb_qflags &= ~flag; spin_unlock(&sc->mp->m_sb_lock); xfs_log_sb(sc->tp); + mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock); } /* diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index 8dae0345c7df..229826b2e1c0 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -92,7 +92,7 @@ xchk_rmapbt_rec( { struct xfs_mount *mp = bs->cur->bc_mp; struct xfs_rmap_irec irec; - xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; + struct xfs_perag *pag = bs->cur->bc_ag.pag; bool non_inode; bool is_unwritten; bool is_bmbt; @@ -121,8 +121,8 @@ xchk_rmapbt_rec( * Otherwise we must point somewhere past the static metadata * but before the end of the FS. Run the regular check. */ - if (!xfs_verify_agbno(mp, agno, irec.rm_startblock) || - !xfs_verify_agbno(mp, agno, irec.rm_startblock + + if (!xfs_verify_agbno(pag, irec.rm_startblock) || + !xfs_verify_agbno(pag, irec.rm_startblock + irec.rm_blockcount - 1)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); } diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 8fa012057405..0a3bde64c675 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -40,6 +40,7 @@ xchk_setup_rt( /* Scrub a free extent record from the realtime bitmap. */ STATIC int xchk_rtbitmap_rec( + struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv) @@ -48,10 +49,10 @@ xchk_rtbitmap_rec( xfs_rtblock_t startblock; xfs_rtblock_t blockcount; - startblock = rec->ar_startext * tp->t_mountp->m_sb.sb_rextsize; - blockcount = rec->ar_extcount * tp->t_mountp->m_sb.sb_rextsize; + startblock = rec->ar_startext * mp->m_sb.sb_rextsize; + blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize; - if (!xfs_verify_rtext(sc->mp, startblock, blockcount)) + if (!xfs_verify_rtext(mp, startblock, blockcount)) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); return 0; } @@ -114,7 +115,7 @@ xchk_rtbitmap( if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) return error; - error = xfs_rtalloc_query_all(sc->tp, xchk_rtbitmap_rec, sc); + error = xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtbitmap_rec, sc); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) goto out; diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 8d528d35b725..2e8e400f10a9 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -173,10 +173,6 @@ xchk_teardown( mnt_drop_write_file(sc->file); if (sc->flags & XCHK_REAPING_DISABLED) xchk_start_reaping(sc); - if (sc->flags & XCHK_HAS_QUOTAOFFLOCK) { - mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock); - sc->flags &= ~XCHK_HAS_QUOTAOFFLOCK; - } if (sc->buf) { kmem_free(sc->buf); sc->buf = NULL; @@ -344,20 +340,6 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { }, }; -/* This isn't a stable feature, warn once per day. */ -static inline void -xchk_experimental_warning( - struct xfs_mount *mp) -{ - static struct ratelimit_state scrub_warning = RATELIMIT_STATE_INIT( - "xchk_warning", 86400 * HZ, 1); - ratelimit_set_flags(&scrub_warning, RATELIMIT_MSG_ON_RELEASE); - - if (__ratelimit(&scrub_warning)) - xfs_alert(mp, -"EXPERIMENTAL online scrub feature in use. Use at your own risk!"); -} - static int xchk_validate_inputs( struct xfs_mount *mp, @@ -482,7 +464,8 @@ xfs_scrub_metadata( if (error) goto out; - xchk_experimental_warning(mp); + xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SCRUB, + "EXPERIMENTAL online scrub feature in use. Use at your own risk!"); sc = kmem_zalloc(sizeof(struct xfs_scrub), KM_NOFS | KM_MAYFAIL); if (!sc) { diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 80e5026bba44..3de5287e98d8 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -88,7 +88,6 @@ struct xfs_scrub { /* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */ #define XCHK_TRY_HARDER (1 << 0) /* can't get resources, try again */ -#define XCHK_HAS_QUOTAOFFLOCK (1 << 1) /* we hold the quotaoff lock */ #define XCHK_REAPING_DISABLED (1 << 2) /* background block reaping paused */ #define XREP_ALREADY_FIXED (1 << 31) /* checking our repair work */ diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index 599ee277bba2..75311f8daeeb 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -41,7 +41,7 @@ xchk_symlink( if (!S_ISLNK(VFS_I(ip)->i_mode)) return -ENOENT; - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); len = ip->i_disk_size; /* Plausible size? */ @@ -52,8 +52,8 @@ xchk_symlink( /* Inline symlink? */ if (ifp->if_format == XFS_DINODE_FMT_LOCAL) { - if (len > XFS_IFORK_DSIZE(ip) || - len > strnlen(ifp->if_u1.if_data, XFS_IFORK_DSIZE(ip))) + if (len > xfs_inode_data_fork_size(ip) || + len > strnlen(ifp->if_u1.if_data, xfs_inode_data_fork_size(ip))) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); goto out; } diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 5c52ee869272..b744c62052b6 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -10,13 +10,14 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_trace.h" #include "xfs_error.h" #include "xfs_acl.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" #include "xfs_trans.h" +#include "xfs_xattr.h" #include <linux/posix_acl_xattr.h> @@ -202,7 +203,7 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) xfs_acl_to_disk(args.value, acl); } - error = xfs_attr_set(&args); + error = xfs_attr_change(&args); kmem_free(args.value); /* diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index bb6abdcb265d..263404d0bfda 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -16,11 +16,13 @@ extern int xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); void xfs_forget_acl(struct inode *inode, const char *name); #else -static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type, bool rcu) +#define xfs_get_acl NULL +#define xfs_set_acl NULL +static inline int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, + int type) { - return NULL; + return 0; } -# define xfs_set_acl NULL static inline void xfs_forget_acl(struct inode *inode, const char *name) { } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index c8c15c3c3147..5d1a995b15f8 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -136,7 +136,20 @@ done: memalloc_nofs_restore(nofs_flag); } -/* Finish all pending io completions. */ +/* + * Finish all pending IO completions that require transactional modifications. + * + * We try to merge physical and logically contiguous ioends before completion to + * minimise the number of transactions we need to perform during IO completion. + * Both unwritten extent conversion and COW remapping need to iterate and modify + * one physical extent at a time, so we gain nothing by merging physically + * discontiguous extents here. + * + * The ioend chain length that we can be processing here is largely unbound in + * length and we may have to perform significant amounts of work on each ioend + * to complete it. Hence we have to be careful about holding the CPU for too + * long in this loop. + */ void xfs_end_io( struct work_struct *work) @@ -157,6 +170,7 @@ xfs_end_io( list_del_init(&ioend->io_list); iomap_ioend_try_merge(ioend, &tmp); xfs_end_ioend(ioend); + cond_resched(); } } @@ -359,7 +373,7 @@ retry: isnullstartblock(imap.br_startblock)) goto allocate_blocks; - xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0); + xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0); trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap); return 0; allocate_blocks: @@ -437,37 +451,35 @@ xfs_prepare_ioend( * see a ENOSPC in writeback). */ static void -xfs_discard_page( - struct page *page, - loff_t fileoff) +xfs_discard_folio( + struct folio *folio, + loff_t pos) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - unsigned int pageoff = offset_in_page(fileoff); - xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, fileoff); - xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, pageoff); + size_t offset = offset_in_folio(folio, pos); + xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, pos); + xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, offset); int error; if (xfs_is_shutdown(mp)) - goto out_invalidate; + return; xfs_alert_ratelimited(mp, - "page discard on page "PTR_FMT", inode 0x%llx, offset %llu.", - page, ip->i_ino, fileoff); + "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.", + folio, ip->i_ino, pos); error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - i_blocks_per_page(inode, page) - pageoff_fsb); + i_blocks_per_folio(inode, folio) - pageoff_fsb); if (error && !xfs_is_shutdown(mp)) xfs_alert(mp, "page discard unable to remove delalloc mapping."); -out_invalidate: - iomap_invalidatepage(page, pageoff, PAGE_SIZE - pageoff); } static const struct iomap_writeback_ops xfs_writeback_ops = { .map_blocks = xfs_map_blocks, .prepare_ioend = xfs_prepare_ioend, - .discard_page = xfs_discard_page, + .discard_folio = xfs_discard_folio, }; STATIC int @@ -524,11 +536,11 @@ xfs_vm_bmap( } STATIC int -xfs_vm_readpage( +xfs_vm_read_folio( struct file *unused, - struct page *page) + struct folio *folio) { - return iomap_readpage(page, &xfs_read_iomap_ops); + return iomap_read_folio(folio, &xfs_read_iomap_ops); } STATIC void @@ -550,15 +562,15 @@ xfs_iomap_swapfile_activate( } const struct address_space_operations xfs_address_space_operations = { - .readpage = xfs_vm_readpage, + .read_folio = xfs_vm_read_folio, .readahead = xfs_vm_readahead, .writepages = xfs_vm_writepages, - .set_page_dirty = __set_page_dirty_nobuffers, - .releasepage = iomap_releasepage, - .invalidatepage = iomap_invalidatepage, + .dirty_folio = filemap_dirty_folio, + .release_folio = iomap_release_folio, + .invalidate_folio = iomap_invalidate_folio, .bmap = xfs_vm_bmap, .direct_IO = noop_direct_IO, - .migratepage = iomap_migrate_page, + .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .swap_activate = xfs_iomap_swapfile_activate, @@ -567,7 +579,6 @@ const struct address_space_operations xfs_address_space_operations = { const struct address_space_operations xfs_dax_aops = { .writepages = xfs_dax_writepages, .direct_IO = noop_direct_IO, - .set_page_dirty = __set_page_dirty_no_writeback, - .invalidatepage = noop_invalidatepage, + .dirty_folio = noop_dirty_folio, .swap_activate = xfs_iomap_swapfile_activate, }; diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 27265771f247..5db87b34fb6e 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -158,6 +158,7 @@ xfs_attr3_node_inactive( } child_fsb = be32_to_cpu(ichdr.btree[0].before); xfs_trans_brelse(*trans, bp); /* no locks for later trans */ + bp = NULL; /* * If this is the node level just above the leaves, simply loop @@ -211,12 +212,8 @@ xfs_attr3_node_inactive( &child_bp); if (error) return error; - error = bp->b_error; - if (error) { - xfs_trans_brelse(*trans, child_bp); - return error; - } xfs_trans_binval(*trans, child_bp); + child_bp = NULL; /* * If we're not done, re-read the parent to get the next @@ -233,6 +230,7 @@ xfs_attr3_node_inactive( bp->b_addr); child_fsb = be32_to_cpu(phdr.btree[i + 1].before); xfs_trans_brelse(*trans, bp); + bp = NULL; } /* * Atomically commit the whole invalidate stuff. @@ -338,7 +336,7 @@ xfs_attr_inactive( ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); xfs_ilock(dp, lock_mode); - if (!XFS_IFORK_Q(dp)) + if (!xfs_inode_has_attr_fork(dp)) goto out_destroy_fork; xfs_iunlock(dp, lock_mode); @@ -351,7 +349,7 @@ xfs_attr_inactive( lock_mode = XFS_ILOCK_EXCL; xfs_ilock(dp, lock_mode); - if (!XFS_IFORK_Q(dp)) + if (!xfs_inode_has_attr_fork(dp)) goto out_cancel; /* @@ -362,12 +360,11 @@ xfs_attr_inactive( /* * Invalidate and truncate the attribute fork extents. Make sure the - * fork actually has attributes as otherwise the invalidation has no + * fork actually has xattr blocks as otherwise the invalidation has no * blocks to read and returns an error. In this case, just do the fork * removal below. */ - if (xfs_inode_hasattr(dp) && - dp->i_afp->if_format != XFS_DINODE_FMT_LOCAL) { + if (dp->i_af.if_nextents > 0) { error = xfs_attr3_root_inactive(&trans, dp); if (error) goto out_cancel; @@ -388,11 +385,7 @@ out_cancel: xfs_trans_cancel(trans); out_destroy_fork: /* kill the in-core attr fork before we drop the inode lock */ - if (dp->i_afp) { - xfs_idestroy_fork(dp->i_afp); - kmem_cache_free(xfs_ifork_cache, dp->i_afp); - dp->i_afp = NULL; - } + xfs_ifork_zap_attr(dp); if (lock_mode) xfs_iunlock(dp, lock_mode); return error; diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c new file mode 100644 index 000000000000..2788a6f2edcd --- /dev/null +++ b/fs/xfs/xfs_attr_item.c @@ -0,0 +1,881 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022 Oracle. All Rights Reserved. + * Author: Allison Henderson <allison.henderson@oracle.com> + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_shared.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" +#include "xfs_inode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_attr_item.h" +#include "xfs_trace.h" +#include "xfs_trans_space.h" +#include "xfs_errortag.h" +#include "xfs_error.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" + +struct kmem_cache *xfs_attri_cache; +struct kmem_cache *xfs_attrd_cache; + +static const struct xfs_item_ops xfs_attri_item_ops; +static const struct xfs_item_ops xfs_attrd_item_ops; +static struct xfs_attrd_log_item *xfs_trans_get_attrd(struct xfs_trans *tp, + struct xfs_attri_log_item *attrip); + +static inline struct xfs_attri_log_item *ATTRI_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_attri_log_item, attri_item); +} + +/* + * Shared xattr name/value buffers for logged extended attribute operations + * + * When logging updates to extended attributes, we can create quite a few + * attribute log intent items for a single xattr update. To avoid cycling the + * memory allocator and memcpy overhead, the name (and value, for setxattr) + * are kept in a refcounted object that is shared across all related log items + * and the upper-level deferred work state structure. The shared buffer has + * a control structure, followed by the name, and then the value. + */ + +static inline struct xfs_attri_log_nameval * +xfs_attri_log_nameval_get( + struct xfs_attri_log_nameval *nv) +{ + if (!refcount_inc_not_zero(&nv->refcount)) + return NULL; + return nv; +} + +static inline void +xfs_attri_log_nameval_put( + struct xfs_attri_log_nameval *nv) +{ + if (!nv) + return; + if (refcount_dec_and_test(&nv->refcount)) + kvfree(nv); +} + +static inline struct xfs_attri_log_nameval * +xfs_attri_log_nameval_alloc( + const void *name, + unsigned int name_len, + const void *value, + unsigned int value_len) +{ + struct xfs_attri_log_nameval *nv; + + /* + * This could be over 64kB in length, so we have to use kvmalloc() for + * this. But kvmalloc() utterly sucks, so we use our own version. + */ + nv = xlog_kvmalloc(sizeof(struct xfs_attri_log_nameval) + + name_len + value_len); + + nv->name.i_addr = nv + 1; + nv->name.i_len = name_len; + nv->name.i_type = XLOG_REG_TYPE_ATTR_NAME; + memcpy(nv->name.i_addr, name, name_len); + + if (value_len) { + nv->value.i_addr = nv->name.i_addr + name_len; + nv->value.i_len = value_len; + memcpy(nv->value.i_addr, value, value_len); + } else { + nv->value.i_addr = NULL; + nv->value.i_len = 0; + } + nv->value.i_type = XLOG_REG_TYPE_ATTR_VALUE; + + refcount_set(&nv->refcount, 1); + return nv; +} + +STATIC void +xfs_attri_item_free( + struct xfs_attri_log_item *attrip) +{ + kmem_free(attrip->attri_item.li_lv_shadow); + xfs_attri_log_nameval_put(attrip->attri_nameval); + kmem_cache_free(xfs_attri_cache, attrip); +} + +/* + * Freeing the attrip requires that we remove it from the AIL if it has already + * been placed there. However, the ATTRI may not yet have been placed in the + * AIL when called by xfs_attri_release() from ATTRD processing due to the + * ordering of committed vs unpin operations in bulk insert operations. Hence + * the reference count to ensure only the last caller frees the ATTRI. + */ +STATIC void +xfs_attri_release( + struct xfs_attri_log_item *attrip) +{ + ASSERT(atomic_read(&attrip->attri_refcount) > 0); + if (!atomic_dec_and_test(&attrip->attri_refcount)) + return; + + xfs_trans_ail_delete(&attrip->attri_item, 0); + xfs_attri_item_free(attrip); +} + +STATIC void +xfs_attri_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); + struct xfs_attri_log_nameval *nv = attrip->attri_nameval; + + *nvecs += 2; + *nbytes += sizeof(struct xfs_attri_log_format) + + xlog_calc_iovec_len(nv->name.i_len); + + if (!nv->value.i_len) + return; + + *nvecs += 1; + *nbytes += xlog_calc_iovec_len(nv->value.i_len); +} + +/* + * This is called to fill in the log iovecs for the given attri log + * item. We use 1 iovec for the attri_format_item, 1 for the name, and + * another for the value if it is present + */ +STATIC void +xfs_attri_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + struct xfs_attri_log_nameval *nv = attrip->attri_nameval; + + attrip->attri_format.alfi_type = XFS_LI_ATTRI; + attrip->attri_format.alfi_size = 1; + + /* + * This size accounting must be done before copying the attrip into the + * iovec. If we do it after, the wrong size will be recorded to the log + * and we trip across assertion checks for bad region sizes later during + * the log recovery. + */ + + ASSERT(nv->name.i_len > 0); + attrip->attri_format.alfi_size++; + + if (nv->value.i_len > 0) + attrip->attri_format.alfi_size++; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRI_FORMAT, + &attrip->attri_format, + sizeof(struct xfs_attri_log_format)); + xlog_copy_from_iovec(lv, &vecp, &nv->name); + if (nv->value.i_len > 0) + xlog_copy_from_iovec(lv, &vecp, &nv->value); +} + +/* + * The unpin operation is the last place an ATTRI is manipulated in the log. It + * is either inserted in the AIL or aborted in the event of a log I/O error. In + * either case, the ATTRI transaction has been successfully committed to make + * it this far. Therefore, we expect whoever committed the ATTRI to either + * construct and commit the ATTRD or drop the ATTRD's reference in the event of + * error. Simply drop the log's ATTRI reference now that the log is done with + * it. + */ +STATIC void +xfs_attri_item_unpin( + struct xfs_log_item *lip, + int remove) +{ + xfs_attri_release(ATTRI_ITEM(lip)); +} + + +STATIC void +xfs_attri_item_release( + struct xfs_log_item *lip) +{ + xfs_attri_release(ATTRI_ITEM(lip)); +} + +/* + * Allocate and initialize an attri item. Caller may allocate an additional + * trailing buffer for name and value + */ +STATIC struct xfs_attri_log_item * +xfs_attri_init( + struct xfs_mount *mp, + struct xfs_attri_log_nameval *nv) +{ + struct xfs_attri_log_item *attrip; + + attrip = kmem_cache_zalloc(xfs_attri_cache, GFP_NOFS | __GFP_NOFAIL); + + /* + * Grab an extra reference to the name/value buffer for this log item. + * The caller retains its own reference! + */ + attrip->attri_nameval = xfs_attri_log_nameval_get(nv); + ASSERT(attrip->attri_nameval); + + xfs_log_item_init(mp, &attrip->attri_item, XFS_LI_ATTRI, + &xfs_attri_item_ops); + attrip->attri_format.alfi_id = (uintptr_t)(void *)attrip; + atomic_set(&attrip->attri_refcount, 2); + + return attrip; +} + +static inline struct xfs_attrd_log_item *ATTRD_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_attrd_log_item, attrd_item); +} + +STATIC void +xfs_attrd_item_free(struct xfs_attrd_log_item *attrdp) +{ + kmem_free(attrdp->attrd_item.li_lv_shadow); + kmem_cache_free(xfs_attrd_cache, attrdp); +} + +STATIC void +xfs_attrd_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += sizeof(struct xfs_attrd_log_format); +} + +/* + * This is called to fill in the log iovecs for the given attrd log item. We use + * only 1 iovec for the attrd_format, and we point that at the attr_log_format + * structure embedded in the attrd item. + */ +STATIC void +xfs_attrd_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_attrd_log_item *attrdp = ATTRD_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + attrdp->attrd_format.alfd_type = XFS_LI_ATTRD; + attrdp->attrd_format.alfd_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRD_FORMAT, + &attrdp->attrd_format, + sizeof(struct xfs_attrd_log_format)); +} + +/* + * The ATTRD is either committed or aborted if the transaction is canceled. If + * the transaction is canceled, drop our reference to the ATTRI and free the + * ATTRD. + */ +STATIC void +xfs_attrd_item_release( + struct xfs_log_item *lip) +{ + struct xfs_attrd_log_item *attrdp = ATTRD_ITEM(lip); + + xfs_attri_release(attrdp->attrd_attrip); + xfs_attrd_item_free(attrdp); +} + +static struct xfs_log_item * +xfs_attrd_item_intent( + struct xfs_log_item *lip) +{ + return &ATTRD_ITEM(lip)->attrd_attrip->attri_item; +} + +/* + * Performs one step of an attribute update intent and marks the attrd item + * dirty.. An attr operation may be a set or a remove. Note that the + * transaction is marked dirty regardless of whether the operation succeeds or + * fails to support the ATTRI/ATTRD lifecycle rules. + */ +STATIC int +xfs_xattri_finish_update( + struct xfs_attr_intent *attr, + struct xfs_attrd_log_item *attrdp) +{ + struct xfs_da_args *args = attr->xattri_da_args; + int error; + + if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) { + error = -EIO; + goto out; + } + + error = xfs_attr_set_iter(attr); + if (!error && attr->xattri_dela_state != XFS_DAS_DONE) + error = -EAGAIN; +out: + /* + * Mark the transaction dirty, even on error. This ensures the + * transaction is aborted, which: + * + * 1.) releases the ATTRI and frees the ATTRD + * 2.) shuts down the filesystem + */ + args->trans->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; + + /* + * attr intent/done items are null when logged attributes are disabled + */ + if (attrdp) + set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags); + + return error; +} + +/* Log an attr to the intent item. */ +STATIC void +xfs_attr_log_item( + struct xfs_trans *tp, + struct xfs_attri_log_item *attrip, + const struct xfs_attr_intent *attr) +{ + struct xfs_attri_log_format *attrp; + + tp->t_flags |= XFS_TRANS_DIRTY; + set_bit(XFS_LI_DIRTY, &attrip->attri_item.li_flags); + + /* + * At this point the xfs_attr_intent has been constructed, and we've + * created the log intent. Fill in the attri log item and log format + * structure with fields from this xfs_attr_intent + */ + attrp = &attrip->attri_format; + attrp->alfi_ino = attr->xattri_da_args->dp->i_ino; + ASSERT(!(attr->xattri_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK)); + attrp->alfi_op_flags = attr->xattri_op_flags; + attrp->alfi_value_len = attr->xattri_nameval->value.i_len; + attrp->alfi_name_len = attr->xattri_nameval->name.i_len; + ASSERT(!(attr->xattri_da_args->attr_filter & ~XFS_ATTRI_FILTER_MASK)); + attrp->alfi_attr_filter = attr->xattri_da_args->attr_filter; +} + +/* Get an ATTRI. */ +static struct xfs_log_item * +xfs_attr_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_attri_log_item *attrip; + struct xfs_attr_intent *attr; + struct xfs_da_args *args; + + ASSERT(count == 1); + + /* + * Each attr item only performs one attribute operation at a time, so + * this is a list of one + */ + attr = list_first_entry_or_null(items, struct xfs_attr_intent, + xattri_list); + args = attr->xattri_da_args; + + if (!(args->op_flags & XFS_DA_OP_LOGGED)) + return NULL; + + /* + * Create a buffer to store the attribute name and value. This buffer + * will be shared between the higher level deferred xattr work state + * and the lower level xattr log items. + */ + if (!attr->xattri_nameval) { + /* + * Transfer our reference to the name/value buffer to the + * deferred work state structure. + */ + attr->xattri_nameval = xfs_attri_log_nameval_alloc(args->name, + args->namelen, args->value, args->valuelen); + } + + attrip = xfs_attri_init(mp, attr->xattri_nameval); + xfs_trans_add_item(tp, &attrip->attri_item); + xfs_attr_log_item(tp, attrip, attr); + + return &attrip->attri_item; +} + +static inline void +xfs_attr_free_item( + struct xfs_attr_intent *attr) +{ + if (attr->xattri_da_state) + xfs_da_state_free(attr->xattri_da_state); + xfs_attri_log_nameval_put(attr->xattri_nameval); + if (attr->xattri_da_args->op_flags & XFS_DA_OP_RECOVERY) + kmem_free(attr); + else + kmem_cache_free(xfs_attr_intent_cache, attr); +} + +/* Process an attr. */ +STATIC int +xfs_attr_finish_item( + struct xfs_trans *tp, + struct xfs_log_item *done, + struct list_head *item, + struct xfs_btree_cur **state) +{ + struct xfs_attr_intent *attr; + struct xfs_attrd_log_item *done_item = NULL; + int error; + + attr = container_of(item, struct xfs_attr_intent, xattri_list); + if (done) + done_item = ATTRD_ITEM(done); + + /* + * Always reset trans after EAGAIN cycle + * since the transaction is new + */ + attr->xattri_da_args->trans = tp; + + error = xfs_xattri_finish_update(attr, done_item); + if (error != -EAGAIN) + xfs_attr_free_item(attr); + + return error; +} + +/* Abort all pending ATTRs. */ +STATIC void +xfs_attr_abort_intent( + struct xfs_log_item *intent) +{ + xfs_attri_release(ATTRI_ITEM(intent)); +} + +/* Cancel an attr */ +STATIC void +xfs_attr_cancel_item( + struct list_head *item) +{ + struct xfs_attr_intent *attr; + + attr = container_of(item, struct xfs_attr_intent, xattri_list); + xfs_attr_free_item(attr); +} + +STATIC bool +xfs_attri_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return ATTRI_ITEM(lip)->attri_format.alfi_id == intent_id; +} + +/* Is this recovered ATTRI format ok? */ +static inline bool +xfs_attri_validate( + struct xfs_mount *mp, + struct xfs_attri_log_format *attrp) +{ + unsigned int op = attrp->alfi_op_flags & + XFS_ATTRI_OP_FLAGS_TYPE_MASK; + + if (attrp->__pad != 0) + return false; + + if (attrp->alfi_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK) + return false; + + if (attrp->alfi_attr_filter & ~XFS_ATTRI_FILTER_MASK) + return false; + + /* alfi_op_flags should be either a set or remove */ + switch (op) { + case XFS_ATTRI_OP_FLAGS_SET: + case XFS_ATTRI_OP_FLAGS_REPLACE: + case XFS_ATTRI_OP_FLAGS_REMOVE: + break; + default: + return false; + } + + if (attrp->alfi_value_len > XATTR_SIZE_MAX) + return false; + + if ((attrp->alfi_name_len > XATTR_NAME_MAX) || + (attrp->alfi_name_len == 0)) + return false; + + return xfs_verify_ino(mp, attrp->alfi_ino); +} + +/* + * Process an attr intent item that was recovered from the log. We need to + * delete the attr that it describes. + */ +STATIC int +xfs_attri_item_recover( + struct xfs_log_item *lip, + struct list_head *capture_list) +{ + struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); + struct xfs_attr_intent *attr; + struct xfs_mount *mp = lip->li_log->l_mp; + struct xfs_inode *ip; + struct xfs_da_args *args; + struct xfs_trans *tp; + struct xfs_trans_res tres; + struct xfs_attri_log_format *attrp; + struct xfs_attri_log_nameval *nv = attrip->attri_nameval; + int error; + int total; + int local; + struct xfs_attrd_log_item *done_item = NULL; + + /* + * First check the validity of the attr described by the ATTRI. If any + * are bad, then assume that all are bad and just toss the ATTRI. + */ + attrp = &attrip->attri_format; + if (!xfs_attri_validate(mp, attrp) || + !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len)) + return -EFSCORRUPTED; + + error = xlog_recover_iget(mp, attrp->alfi_ino, &ip); + if (error) + return error; + + attr = kmem_zalloc(sizeof(struct xfs_attr_intent) + + sizeof(struct xfs_da_args), KM_NOFS); + args = (struct xfs_da_args *)(attr + 1); + + attr->xattri_da_args = args; + attr->xattri_op_flags = attrp->alfi_op_flags & + XFS_ATTRI_OP_FLAGS_TYPE_MASK; + + /* + * We're reconstructing the deferred work state structure from the + * recovered log item. Grab a reference to the name/value buffer and + * attach it to the new work state. + */ + attr->xattri_nameval = xfs_attri_log_nameval_get(nv); + ASSERT(attr->xattri_nameval); + + args->dp = ip; + args->geo = mp->m_attr_geo; + args->whichfork = XFS_ATTR_FORK; + args->name = nv->name.i_addr; + args->namelen = nv->name.i_len; + args->hashval = xfs_da_hashname(args->name, args->namelen); + args->attr_filter = attrp->alfi_attr_filter & XFS_ATTRI_FILTER_MASK; + args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT | + XFS_DA_OP_LOGGED; + + ASSERT(xfs_sb_version_haslogxattrs(&mp->m_sb)); + + switch (attr->xattri_op_flags) { + case XFS_ATTRI_OP_FLAGS_SET: + case XFS_ATTRI_OP_FLAGS_REPLACE: + args->value = nv->value.i_addr; + args->valuelen = nv->value.i_len; + args->total = xfs_attr_calc_size(args, &local); + if (xfs_inode_hasattr(args->dp)) + attr->xattri_dela_state = xfs_attr_init_replace_state(args); + else + attr->xattri_dela_state = xfs_attr_init_add_state(args); + break; + case XFS_ATTRI_OP_FLAGS_REMOVE: + if (!xfs_inode_hasattr(args->dp)) + goto out; + attr->xattri_dela_state = xfs_attr_init_remove_state(args); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + goto out; + } + + xfs_init_attr_trans(args, &tres, &total); + error = xfs_trans_alloc(mp, &tres, total, 0, XFS_TRANS_RESERVE, &tp); + if (error) + goto out; + + args->trans = tp; + done_item = xfs_trans_get_attrd(tp, attrip); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + error = xfs_xattri_finish_update(attr, done_item); + if (error == -EAGAIN) { + /* + * There's more work to do, so add the intent item to this + * transaction so that we can continue it later. + */ + xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list); + error = xfs_defer_ops_capture_and_commit(tp, capture_list); + if (error) + goto out_unlock; + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_irele(ip); + return 0; + } + if (error) { + xfs_trans_cancel(tp); + goto out_unlock; + } + + error = xfs_defer_ops_capture_and_commit(tp, capture_list); +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_irele(ip); +out: + xfs_attr_free_item(attr); + return error; +} + +/* Re-log an intent item to push the log tail forward. */ +static struct xfs_log_item * +xfs_attri_item_relog( + struct xfs_log_item *intent, + struct xfs_trans *tp) +{ + struct xfs_attrd_log_item *attrdp; + struct xfs_attri_log_item *old_attrip; + struct xfs_attri_log_item *new_attrip; + struct xfs_attri_log_format *new_attrp; + struct xfs_attri_log_format *old_attrp; + + old_attrip = ATTRI_ITEM(intent); + old_attrp = &old_attrip->attri_format; + + tp->t_flags |= XFS_TRANS_DIRTY; + attrdp = xfs_trans_get_attrd(tp, old_attrip); + set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags); + + /* + * Create a new log item that shares the same name/value buffer as the + * old log item. + */ + new_attrip = xfs_attri_init(tp->t_mountp, old_attrip->attri_nameval); + new_attrp = &new_attrip->attri_format; + + new_attrp->alfi_ino = old_attrp->alfi_ino; + new_attrp->alfi_op_flags = old_attrp->alfi_op_flags; + new_attrp->alfi_value_len = old_attrp->alfi_value_len; + new_attrp->alfi_name_len = old_attrp->alfi_name_len; + new_attrp->alfi_attr_filter = old_attrp->alfi_attr_filter; + + xfs_trans_add_item(tp, &new_attrip->attri_item); + set_bit(XFS_LI_DIRTY, &new_attrip->attri_item.li_flags); + + return &new_attrip->attri_item; +} + +STATIC int +xlog_recover_attri_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_attri_log_item *attrip; + struct xfs_attri_log_format *attri_formatp; + struct xfs_attri_log_nameval *nv; + const void *attr_value = NULL; + const void *attr_name; + size_t len; + + attri_formatp = item->ri_buf[0].i_addr; + attr_name = item->ri_buf[1].i_addr; + + /* Validate xfs_attri_log_format before the large memory allocation */ + len = sizeof(struct xfs_attri_log_format); + if (item->ri_buf[0].i_len != len) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + if (!xfs_attri_validate(mp, attri_formatp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + /* Validate the attr name */ + if (item->ri_buf[1].i_len != + xlog_calc_iovec_len(attri_formatp->alfi_name_len)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + if (!xfs_attr_namecheck(attr_name, attri_formatp->alfi_name_len)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[1].i_addr, item->ri_buf[1].i_len); + return -EFSCORRUPTED; + } + + /* Validate the attr value, if present */ + if (attri_formatp->alfi_value_len != 0) { + if (item->ri_buf[2].i_len != xlog_calc_iovec_len(attri_formatp->alfi_value_len)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, + item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + attr_value = item->ri_buf[2].i_addr; + } + + /* + * Memory alloc failure will cause replay to abort. We attach the + * name/value buffer to the recovered incore log item and drop our + * reference. + */ + nv = xfs_attri_log_nameval_alloc(attr_name, + attri_formatp->alfi_name_len, attr_value, + attri_formatp->alfi_value_len); + + attrip = xfs_attri_init(mp, nv); + memcpy(&attrip->attri_format, attri_formatp, len); + + /* + * The ATTRI has two references. One for the ATTRD and one for ATTRI to + * ensure it makes it into the AIL. Insert the ATTRI into the AIL + * directly and drop the ATTRI reference. Note that + * xfs_trans_ail_update() drops the AIL lock. + */ + xfs_trans_ail_insert(log->l_ailp, &attrip->attri_item, lsn); + xfs_attri_release(attrip); + xfs_attri_log_nameval_put(nv); + return 0; +} + +/* + * This routine is called to allocate an "attr free done" log item. + */ +static struct xfs_attrd_log_item * +xfs_trans_get_attrd(struct xfs_trans *tp, + struct xfs_attri_log_item *attrip) +{ + struct xfs_attrd_log_item *attrdp; + + ASSERT(tp != NULL); + + attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL); + + xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD, + &xfs_attrd_item_ops); + attrdp->attrd_attrip = attrip; + attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id; + + xfs_trans_add_item(tp, &attrdp->attrd_item); + return attrdp; +} + +/* Get an ATTRD so we can process all the attrs. */ +static struct xfs_log_item * +xfs_attr_create_done( + struct xfs_trans *tp, + struct xfs_log_item *intent, + unsigned int count) +{ + if (!intent) + return NULL; + + return &xfs_trans_get_attrd(tp, ATTRI_ITEM(intent))->attrd_item; +} + +const struct xfs_defer_op_type xfs_attr_defer_type = { + .max_items = 1, + .create_intent = xfs_attr_create_intent, + .abort_intent = xfs_attr_abort_intent, + .create_done = xfs_attr_create_done, + .finish_item = xfs_attr_finish_item, + .cancel_item = xfs_attr_cancel_item, +}; + +/* + * This routine is called when an ATTRD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding ATTRI if + * it was still in the log. To do this it searches the AIL for the ATTRI with + * an id equal to that in the ATTRD format structure. If we find it we drop + * the ATTRD reference, which removes the ATTRI from the AIL and frees it. + */ +STATIC int +xlog_recover_attrd_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_attrd_log_format *attrd_formatp; + + attrd_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len != sizeof(struct xfs_attrd_log_format)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + xlog_recover_release_intent(log, XFS_LI_ATTRI, + attrd_formatp->alfd_alf_id); + return 0; +} + +static const struct xfs_item_ops xfs_attri_item_ops = { + .flags = XFS_ITEM_INTENT, + .iop_size = xfs_attri_item_size, + .iop_format = xfs_attri_item_format, + .iop_unpin = xfs_attri_item_unpin, + .iop_release = xfs_attri_item_release, + .iop_recover = xfs_attri_item_recover, + .iop_match = xfs_attri_item_match, + .iop_relog = xfs_attri_item_relog, +}; + +const struct xlog_recover_item_ops xlog_attri_item_ops = { + .item_type = XFS_LI_ATTRI, + .commit_pass2 = xlog_recover_attri_commit_pass2, +}; + +static const struct xfs_item_ops xfs_attrd_item_ops = { + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED | + XFS_ITEM_INTENT_DONE, + .iop_size = xfs_attrd_item_size, + .iop_format = xfs_attrd_item_format, + .iop_release = xfs_attrd_item_release, + .iop_intent = xfs_attrd_item_intent, +}; + +const struct xlog_recover_item_ops xlog_attrd_item_ops = { + .item_type = XFS_LI_ATTRD, + .commit_pass2 = xlog_recover_attrd_commit_pass2, +}; diff --git a/fs/xfs/xfs_attr_item.h b/fs/xfs/xfs_attr_item.h new file mode 100644 index 000000000000..3280a7930287 --- /dev/null +++ b/fs/xfs/xfs_attr_item.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * + * Copyright (C) 2022 Oracle. All Rights Reserved. + * Author: Allison Henderson <allison.henderson@oracle.com> + */ +#ifndef __XFS_ATTR_ITEM_H__ +#define __XFS_ATTR_ITEM_H__ + +/* kernel only ATTRI/ATTRD definitions */ + +struct xfs_mount; +struct kmem_zone; + +struct xfs_attri_log_nameval { + struct xfs_log_iovec name; + struct xfs_log_iovec value; + refcount_t refcount; + + /* name and value follow the end of this struct */ +}; + +/* + * This is the "attr intention" log item. It is used to log the fact that some + * extended attribute operations need to be processed. An operation is + * currently either a set or remove. Set or remove operations are described by + * the xfs_attr_intent which may be logged to this intent. + * + * During a normal attr operation, name and value point to the name and value + * fields of the caller's xfs_da_args structure. During a recovery, the name + * and value buffers are copied from the log, and stored in a trailing buffer + * attached to the xfs_attr_intent until they are committed. They are freed + * when the xfs_attr_intent itself is freed when the work is done. + */ +struct xfs_attri_log_item { + struct xfs_log_item attri_item; + atomic_t attri_refcount; + struct xfs_attri_log_nameval *attri_nameval; + struct xfs_attri_log_format attri_format; +}; + +/* + * This is the "attr done" log item. It is used to log the fact that some attrs + * earlier mentioned in an attri item have been freed. + */ +struct xfs_attrd_log_item { + struct xfs_log_item attrd_item; + struct xfs_attri_log_item *attrd_attrip; + struct xfs_attrd_log_format attrd_format; +}; + +extern struct kmem_cache *xfs_attri_cache; +extern struct kmem_cache *xfs_attrd_cache; + +#endif /* __XFS_ATTR_ITEM_H__ */ diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 2d1e5134cebe..99bbbe1a0e44 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -15,6 +15,7 @@ #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_bmap.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_attr_sf.h" #include "xfs_attr_leaf.h" @@ -60,8 +61,7 @@ xfs_attr_shortform_list( int sbsize, nsbuf, count, i; int error = 0; - ASSERT(dp->i_afp != NULL); - sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data; ASSERT(sf != NULL); if (!sf->hdr.count) return 0; @@ -79,7 +79,7 @@ xfs_attr_shortform_list( */ if (context->bufsize == 0 || (XFS_ISRESET_CURSOR(cursor) && - (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) { + (dp->i_af.if_bytes + sf->hdr.count * 16) < context->bufsize)) { for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { if (XFS_IS_CORRUPT(context->dp->i_mount, !xfs_attr_namecheck(sfe->nameval, @@ -120,7 +120,7 @@ xfs_attr_shortform_list( for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { if (unlikely( ((char *)sfe < (char *)sf) || - ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) { + ((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)))) { XFS_CORRUPTION_ERROR("xfs_attr_shortform_list", XFS_ERRLEVEL_LOW, context->dp->i_mount, sfe, @@ -512,7 +512,7 @@ xfs_attr_list_ilocked( */ if (!xfs_inode_hasattr(dp)) return 0; - if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) + if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) return xfs_attr_shortform_list(context); if (xfs_attr_is_leaf(dp)) return xfs_attr_leaf_list(context); diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c index 667e297f59b1..fe21c76f75b8 100644 --- a/fs/xfs/xfs_bio_io.c +++ b/fs/xfs/xfs_bio_io.c @@ -9,48 +9,13 @@ static inline unsigned int bio_max_vecs(unsigned int count) return bio_max_segs(howmany(count, PAGE_SIZE)); } -static void -xfs_flush_bdev_async_endio( - struct bio *bio) -{ - complete(bio->bi_private); -} - -/* - * Submit a request for an async cache flush to run. If the request queue does - * not require flush operations, just skip it altogether. If the caller needs - * to wait for the flush completion at a later point in time, they must supply a - * valid completion. This will be signalled when the flush completes. The - * caller never sees the bio that is issued here. - */ -void -xfs_flush_bdev_async( - struct bio *bio, - struct block_device *bdev, - struct completion *done) -{ - struct request_queue *q = bdev->bd_disk->queue; - - if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { - complete(done); - return; - } - - bio_init(bio, NULL, 0); - bio_set_dev(bio, bdev); - bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; - bio->bi_private = done; - bio->bi_end_io = xfs_flush_bdev_async_endio; - - submit_bio(bio); -} int xfs_rw_bdev( struct block_device *bdev, sector_t sector, unsigned int count, char *data, - unsigned int op) + enum req_op op) { unsigned int is_vmalloc = is_vmalloc_addr(data); @@ -61,10 +26,9 @@ xfs_rw_bdev( if (is_vmalloc && op == REQ_OP_WRITE) flush_kernel_vmap_range(data, count); - bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left)); - bio_set_dev(bio, bdev); + bio = bio_alloc(bdev, bio_max_vecs(left), op | REQ_META | REQ_SYNC, + GFP_KERNEL); bio->bi_iter.bi_sector = sector; - bio->bi_opf = op | REQ_META | REQ_SYNC; do { struct page *page = kmem_to_page(data); @@ -74,10 +38,9 @@ xfs_rw_bdev( while (bio_add_page(bio, page, len, off) != len) { struct bio *prev = bio; - bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left)); - bio_copy_dev(bio, prev); + bio = bio_alloc(prev->bi_bdev, bio_max_vecs(left), + prev->bi_opf, GFP_KERNEL); bio->bi_iter.bi_sector = bio_end_sector(prev); - bio->bi_opf = prev->bi_opf; bio_chain(prev, bio); submit_bio(prev); diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index e1f4d7d5a011..41323da523d1 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -39,6 +39,7 @@ STATIC void xfs_bui_item_free( struct xfs_bui_log_item *buip) { + kmem_free(buip->bui_item.li_lv_shadow); kmem_cache_free(xfs_bui_cache, buip); } @@ -54,10 +55,11 @@ xfs_bui_release( struct xfs_bui_log_item *buip) { ASSERT(atomic_read(&buip->bui_refcount) > 0); - if (atomic_dec_and_test(&buip->bui_refcount)) { - xfs_trans_ail_delete(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR); - xfs_bui_item_free(buip); - } + if (!atomic_dec_and_test(&buip->bui_refcount)) + return; + + xfs_trans_ail_delete(&buip->bui_item, 0); + xfs_bui_item_free(buip); } @@ -198,14 +200,24 @@ xfs_bud_item_release( struct xfs_bud_log_item *budp = BUD_ITEM(lip); xfs_bui_release(budp->bud_buip); + kmem_free(budp->bud_item.li_lv_shadow); kmem_cache_free(xfs_bud_cache, budp); } +static struct xfs_log_item * +xfs_bud_item_intent( + struct xfs_log_item *lip) +{ + return &BUD_ITEM(lip)->bud_buip->bui_item; +} + static const struct xfs_item_ops xfs_bud_item_ops = { - .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED, + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED | + XFS_ITEM_INTENT_DONE, .iop_size = xfs_bud_item_size, .iop_format = xfs_bud_item_format, .iop_release = xfs_bud_item_release, + .iop_intent = xfs_bud_item_intent, }; static struct xfs_bud_log_item * @@ -254,7 +266,7 @@ xfs_trans_log_finish_bmap_update( * 1.) releases the BUI and frees the BUD * 2.) shuts down the filesystem */ - tp->t_flags |= XFS_TRANS_DIRTY; + tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); return error; @@ -463,7 +475,7 @@ xfs_bui_item_recover( struct xfs_bui_log_item *buip = BUI_ITEM(lip); struct xfs_trans *tp; struct xfs_inode *ip = NULL; - struct xfs_mount *mp = lip->li_mountp; + struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_map_extent *bmap; struct xfs_bud_log_item *budp; xfs_filblks_t count; @@ -506,6 +518,8 @@ xfs_bui_item_recover( iext_delta = XFS_IEXT_PUNCH_HOLE_CNT; error = xfs_iext_count_may_overflow(ip, whichfork, iext_delta); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, iext_delta); if (error) goto err_cancel; @@ -584,6 +598,7 @@ xfs_bui_item_relog( } static const struct xfs_item_ops xfs_bui_item_ops = { + .flags = XFS_ITEM_INTENT, .iop_size = xfs_bui_item_size, .iop_format = xfs_bui_item_format, .iop_unpin = xfs_bui_item_unpin, @@ -593,28 +608,18 @@ static const struct xfs_item_ops xfs_bui_item_ops = { .iop_relog = xfs_bui_item_relog, }; -/* - * Copy an BUI format buffer from the given buf, and into the destination - * BUI format structure. The BUI/BUD items were designed not to need any - * special alignment handling. - */ -static int +static inline void xfs_bui_copy_format( - struct xfs_log_iovec *buf, - struct xfs_bui_log_format *dst_bui_fmt) + struct xfs_bui_log_format *dst, + const struct xfs_bui_log_format *src) { - struct xfs_bui_log_format *src_bui_fmt; - uint len; + unsigned int i; - src_bui_fmt = buf->i_addr; - len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents); + memcpy(dst, src, offsetof(struct xfs_bui_log_format, bui_extents)); - if (buf->i_len == len) { - memcpy(dst_bui_fmt, src_bui_fmt, len); - return 0; - } - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); - return -EFSCORRUPTED; + for (i = 0; i < src->bui_nextents; i++) + memcpy(&dst->bui_extents[i], &src->bui_extents[i], + sizeof(struct xfs_map_extent)); } /* @@ -631,23 +636,34 @@ xlog_recover_bui_commit_pass2( struct xlog_recover_item *item, xfs_lsn_t lsn) { - int error; struct xfs_mount *mp = log->l_mp; struct xfs_bui_log_item *buip; struct xfs_bui_log_format *bui_formatp; + size_t len; bui_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len < xfs_bui_log_format_sizeof(0)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); return -EFSCORRUPTED; } - buip = xfs_bui_init(mp); - error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format); - if (error) { - xfs_bui_item_free(buip); - return error; + + len = xfs_bui_log_format_sizeof(bui_formatp->bui_nextents); + if (item->ri_buf[0].i_len != len) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; } + + buip = xfs_bui_init(mp); + xfs_bui_copy_format(&buip->bui_format, bui_formatp); atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents); /* * Insert the intent into the AIL directly and drop one reference so @@ -681,7 +697,8 @@ xlog_recover_bud_commit_pass2( bud_formatp = item->ri_buf[0].i_addr; if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 73a36b7be3bd..04d0c2bff67c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -119,14 +119,14 @@ retry: */ ralen = ap->length / mp->m_sb.sb_rextsize; /* - * If the old value was close enough to MAXEXTLEN that + * If the old value was close enough to XFS_BMBT_MAX_EXTLEN that * we rounded up to it, cut it back so it's valid again. * Note that if it's a really large request (bigger than - * MAXEXTLEN), we don't hear about that number, and can't + * XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't * adjust the starting point to match it. */ - if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN) - ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; + if (ralen * mp->m_sb.sb_rextsize >= XFS_MAX_BMBT_EXTLEN) + ralen = XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize; /* * Lock out modifications to both the RT bitmap and summary inodes @@ -256,7 +256,7 @@ xfs_bmap_count_blocks( xfs_filblks_t *count) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur; xfs_extlen_t btblocks = 0; int error; @@ -439,29 +439,28 @@ xfs_getbmap( whichfork = XFS_COW_FORK; else whichfork = XFS_DATA_FORK; - ifp = XFS_IFORK_PTR(ip, whichfork); xfs_ilock(ip, XFS_IOLOCK_SHARED); switch (whichfork) { case XFS_ATTR_FORK: - if (!XFS_IFORK_Q(ip)) - goto out_unlock_iolock; + lock = xfs_ilock_attr_map_shared(ip); + if (!xfs_inode_has_attr_fork(ip)) + goto out_unlock_ilock; max_len = 1LL << 32; - lock = xfs_ilock_attr_map_shared(ip); break; case XFS_COW_FORK: + lock = XFS_ILOCK_SHARED; + xfs_ilock(ip, lock); + /* No CoW fork? Just return */ - if (!ifp) - goto out_unlock_iolock; + if (!xfs_ifork_ptr(ip, whichfork)) + goto out_unlock_ilock; if (xfs_get_cowextsz_hint(ip)) max_len = mp->m_super->s_maxbytes; else max_len = XFS_ISIZE(ip); - - lock = XFS_ILOCK_SHARED; - xfs_ilock(ip, lock); break; case XFS_DATA_FORK: if (!(iflags & BMV_IF_DELALLOC) && @@ -491,6 +490,8 @@ xfs_getbmap( break; } + ifp = xfs_ifork_ptr(ip, whichfork); + switch (ifp->if_format) { case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE: @@ -686,6 +687,8 @@ xfs_can_free_eofblocks( * forever. */ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); + if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) + end_fsb = roundup_64(end_fsb, mp->m_sb.sb_rextsize); last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); if (last_fsb <= end_fsb) return false; @@ -771,8 +774,7 @@ int xfs_alloc_file_space( struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len, - int alloc_type) + xfs_off_t len) { xfs_mount_t *mp = ip->i_mount; xfs_off_t count; @@ -840,9 +842,11 @@ xfs_alloc_file_space( * count, hence we need to limit the number of blocks we are * trying to reserve to avoid an overflow. We can't allocate * more than @nimaps extents, and an extent is limited on disk - * to MAXEXTLEN (21 bits), so use that to enforce the limit. + * to XFS_BMBT_MAX_EXTLEN (21 bits), so use that to enforce the + * limit. */ - resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps)); + resblks = min_t(xfs_fileoff_t, (e - s), + (XFS_MAX_BMBT_EXTLEN * nimaps)); if (unlikely(rt)) { dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0); rblocks = resblks; @@ -851,9 +855,6 @@ xfs_alloc_file_space( rblocks = 0; } - /* - * Allocate and setup the transaction. - */ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks, rblocks, false, &tp); if (error) @@ -861,18 +862,21 @@ xfs_alloc_file_space( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_ADD_NOSPLIT_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto error; error = xfs_bmapi_write(tp, ip, startoffset_fsb, - allocatesize_fsb, alloc_type, 0, imapp, - &nimaps); + allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp, + &nimaps); if (error) goto error; - /* - * Complete the transaction - */ + ip->i_diflags |= XFS_DIFLAG_PREALLOC; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) @@ -916,6 +920,8 @@ xfs_unmap_extent( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_PUNCH_HOLE_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT); if (error) goto out_trans_cancel; @@ -1001,7 +1007,7 @@ xfs_free_file_space( /* * Now that we've unmap all full blocks we'll have to zero out any - * partial block at the beginning and/or end. iomap_zero_range is smart + * partial block at the beginning and/or end. xfs_zero_range is smart * enough to skip any holes, including those we just created, but we * must take care not to zero beyond EOF and enlarge i_size. */ @@ -1009,15 +1015,14 @@ xfs_free_file_space( return 0; if (offset + len > XFS_ISIZE(ip)) len = XFS_ISIZE(ip) - offset; - error = iomap_zero_range(VFS_I(ip), offset, len, NULL, - &xfs_buffered_write_iomap_ops); + error = xfs_zero_range(ip, offset, len, NULL); if (error) return error; /* * If we zeroed right up to EOF and EOF straddles a page boundary we * must make sure that the post-EOF area is also zeroed because the - * page could be mmap'd and iomap_zero_range doesn't do that for us. + * page could be mmap'd and xfs_zero_range doesn't do that for us. * Writeback of the eof page will do this, albeit clumsily. */ if (offset + len >= XFS_ISIZE(ip) && offset_in_page(offset + len) > 0) { @@ -1198,6 +1203,8 @@ xfs_insert_file_space( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_PUNCH_HOLE_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT); if (error) goto out_trans_cancel; @@ -1314,8 +1321,8 @@ xfs_swap_extents_check_format( * extent format... */ if (tifp->if_format == XFS_DINODE_FMT_BTREE) { - if (XFS_IFORK_Q(ip) && - XFS_BMAP_BMDR_SPACE(tifp->if_broot) > XFS_IFORK_BOFF(ip)) + if (xfs_inode_has_attr_fork(ip) && + XFS_BMAP_BMDR_SPACE(tifp->if_broot) > xfs_inode_fork_boff(ip)) return -EINVAL; if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) return -EINVAL; @@ -1323,8 +1330,8 @@ xfs_swap_extents_check_format( /* Reciprocal target->temp btree format checks */ if (ifp->if_format == XFS_DINODE_FMT_BTREE) { - if (XFS_IFORK_Q(tip) && - XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip)) + if (xfs_inode_has_attr_fork(tip) && + XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > xfs_inode_fork_boff(tip)) return -EINVAL; if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) return -EINVAL; @@ -1426,6 +1433,9 @@ xfs_swap_extent_rmap( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_SWAP_RMAP_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_SWAP_RMAP_CNT); if (error) goto out; } @@ -1434,6 +1444,9 @@ xfs_swap_extent_rmap( error = xfs_iext_count_may_overflow(tip, XFS_DATA_FORK, XFS_IEXT_SWAP_RMAP_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_SWAP_RMAP_CNT); if (error) goto out; } @@ -1494,15 +1507,15 @@ xfs_swap_extent_forks( /* * Count the number of extended attribute blocks */ - if (XFS_IFORK_Q(ip) && ip->i_afp->if_nextents > 0 && - ip->i_afp->if_format != XFS_DINODE_FMT_LOCAL) { + if (xfs_inode_has_attr_fork(ip) && ip->i_af.if_nextents > 0 && + ip->i_af.if_format != XFS_DINODE_FMT_LOCAL) { error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &junk, &aforkblks); if (error) return error; } - if (XFS_IFORK_Q(tip) && tip->i_afp->if_nextents > 0 && - tip->i_afp->if_format != XFS_DINODE_FMT_LOCAL) { + if (xfs_inode_has_attr_fork(tip) && tip->i_af.if_nextents > 0 && + tip->i_af.if_format != XFS_DINODE_FMT_LOCAL) { error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &junk, &taforkblks); if (error) diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 9f993168b55b..24b37d211f1d 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -54,7 +54,7 @@ int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip, /* preallocation and hole punch interface */ int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len, int alloc_type); + xfs_off_t len); int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len); int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset, diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 631c5a61d89b..dde346450952 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -5,6 +5,7 @@ */ #include "xfs.h" #include <linux/backing-dev.h> +#include <linux/dax.h> #include "xfs_shared.h" #include "xfs_format.h" @@ -14,13 +15,14 @@ #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_log_recover.h" +#include "xfs_log_priv.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_ag.h" -static struct kmem_cache *xfs_buf_cache; +struct kmem_cache *xfs_buf_cache; /* * Locking orders @@ -294,6 +296,16 @@ xfs_buf_free_pages( } static void +xfs_buf_free_callback( + struct callback_head *cb) +{ + struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu); + + xfs_buf_free_maps(bp); + kmem_cache_free(xfs_buf_cache, bp); +} + +static void xfs_buf_free( struct xfs_buf *bp) { @@ -306,8 +318,7 @@ xfs_buf_free( else if (bp->b_flags & _XBF_KMEM) kmem_free(bp->b_addr); - xfs_buf_free_maps(bp); - kmem_cache_free(xfs_buf_cache, bp); + call_rcu(&bp->b_rcu, xfs_buf_free_callback); } static int @@ -394,7 +405,7 @@ xfs_buf_alloc_pages( } XFS_STATS_INC(bp->b_mount, xb_page_retries); - congestion_wait(BLK_RW_ASYNC, HZ / 50); + memalloc_retry_wait(gfp_mask); } return 0; } @@ -405,7 +416,7 @@ xfs_buf_alloc_pages( STATIC int _xfs_buf_map_pages( struct xfs_buf *bp, - uint flags) + xfs_buf_flags_t flags) { ASSERT(bp->b_flags & _XBF_PAGES); if (bp->b_page_count == 1) { @@ -502,100 +513,45 @@ xfs_buf_hash_destroy( rhashtable_destroy(&pag->pag_buf_hash); } -/* - * Look up a buffer in the buffer cache and return it referenced and locked - * in @found_bp. - * - * If @new_bp is supplied and we have a lookup miss, insert @new_bp into the - * cache. - * - * If XBF_TRYLOCK is set in @flags, only try to lock the buffer and return - * -EAGAIN if we fail to lock it. - * - * Return values are: - * -EFSCORRUPTED if have been supplied with an invalid address - * -EAGAIN on trylock failure - * -ENOENT if we fail to find a match and @new_bp was NULL - * 0, with @found_bp: - * - @new_bp if we inserted it into the cache - * - the buffer we found and locked. - */ static int -xfs_buf_find( +xfs_buf_map_verify( struct xfs_buftarg *btp, - struct xfs_buf_map *map, - int nmaps, - xfs_buf_flags_t flags, - struct xfs_buf *new_bp, - struct xfs_buf **found_bp) + struct xfs_buf_map *map) { - struct xfs_perag *pag; - struct xfs_buf *bp; - struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; xfs_daddr_t eofs; - int i; - - *found_bp = NULL; - - for (i = 0; i < nmaps; i++) - cmap.bm_len += map[i].bm_len; /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize)); - ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask)); + ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize)); + ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask)); /* * Corrupted block numbers can get through to here, unfortunately, so we * have to check that the buffer falls within the filesystem bounds. */ eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); - if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) { + if (map->bm_bn < 0 || map->bm_bn >= eofs) { xfs_alert(btp->bt_mount, "%s: daddr 0x%llx out of range, EOFS 0x%llx", - __func__, cmap.bm_bn, eofs); + __func__, map->bm_bn, eofs); WARN_ON(1); return -EFSCORRUPTED; } - - pag = xfs_perag_get(btp->bt_mount, - xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn)); - - spin_lock(&pag->pag_buf_lock); - bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap, - xfs_buf_hash_params); - if (bp) { - atomic_inc(&bp->b_hold); - goto found; - } - - /* No match found */ - if (!new_bp) { - XFS_STATS_INC(btp->bt_mount, xb_miss_locked); - spin_unlock(&pag->pag_buf_lock); - xfs_perag_put(pag); - return -ENOENT; - } - - /* the buffer keeps the perag reference until it is freed */ - new_bp->b_pag = pag; - rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head, - xfs_buf_hash_params); - spin_unlock(&pag->pag_buf_lock); - *found_bp = new_bp; return 0; +} -found: - spin_unlock(&pag->pag_buf_lock); - xfs_perag_put(pag); - - if (!xfs_buf_trylock(bp)) { - if (flags & XBF_TRYLOCK) { - xfs_buf_rele(bp); - XFS_STATS_INC(btp->bt_mount, xb_busy_locked); +static int +xfs_buf_find_lock( + struct xfs_buf *bp, + xfs_buf_flags_t flags) +{ + if (flags & XBF_TRYLOCK) { + if (!xfs_buf_trylock(bp)) { + XFS_STATS_INC(bp->b_mount, xb_busy_locked); return -EAGAIN; } + } else { xfs_buf_lock(bp); - XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited); + XFS_STATS_INC(bp->b_mount, xb_get_locked_waited); } /* @@ -608,57 +564,59 @@ found: bp->b_flags &= _XBF_KMEM | _XBF_PAGES; bp->b_ops = NULL; } - - trace_xfs_buf_find(bp, flags, _RET_IP_); - XFS_STATS_INC(btp->bt_mount, xb_get_locked); - *found_bp = bp; return 0; } -struct xfs_buf * -xfs_buf_incore( - struct xfs_buftarg *target, - xfs_daddr_t blkno, - size_t numblks, - xfs_buf_flags_t flags) +static inline int +xfs_buf_lookup( + struct xfs_perag *pag, + struct xfs_buf_map *map, + xfs_buf_flags_t flags, + struct xfs_buf **bpp) { - struct xfs_buf *bp; + struct xfs_buf *bp; int error; - DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - error = xfs_buf_find(target, &map, 1, flags, NULL, &bp); - if (error) - return NULL; - return bp; + rcu_read_lock(); + bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params); + if (!bp || !atomic_inc_not_zero(&bp->b_hold)) { + rcu_read_unlock(); + return -ENOENT; + } + rcu_read_unlock(); + + error = xfs_buf_find_lock(bp, flags); + if (error) { + xfs_buf_rele(bp); + return error; + } + + trace_xfs_buf_find(bp, flags, _RET_IP_); + *bpp = bp; + return 0; } /* - * Assembles a buffer covering the specified range. The code is optimised for - * cache hits, as metadata intensive workloads will see 3 orders of magnitude - * more hits than misses. + * Insert the new_bp into the hash table. This consumes the perag reference + * taken for the lookup regardless of the result of the insert. */ -int -xfs_buf_get_map( - struct xfs_buftarg *target, +static int +xfs_buf_find_insert( + struct xfs_buftarg *btp, + struct xfs_perag *pag, + struct xfs_buf_map *cmap, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp) { - struct xfs_buf *bp; struct xfs_buf *new_bp; + struct xfs_buf *bp; int error; - *bpp = NULL; - error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp); - if (!error) - goto found; - if (error != -ENOENT) - return error; - - error = _xfs_buf_alloc(target, map, nmaps, flags, &new_bp); + error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp); if (error) - return error; + goto out_drop_pag; /* * For buffers that fit entirely within a single page, first attempt to @@ -673,18 +631,94 @@ xfs_buf_get_map( goto out_free_buf; } - error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp); - if (error) + spin_lock(&pag->pag_buf_lock); + bp = rhashtable_lookup_get_insert_fast(&pag->pag_buf_hash, + &new_bp->b_rhash_head, xfs_buf_hash_params); + if (IS_ERR(bp)) { + error = PTR_ERR(bp); + spin_unlock(&pag->pag_buf_lock); + goto out_free_buf; + } + if (bp) { + /* found an existing buffer */ + atomic_inc(&bp->b_hold); + spin_unlock(&pag->pag_buf_lock); + error = xfs_buf_find_lock(bp, flags); + if (error) + xfs_buf_rele(bp); + else + *bpp = bp; goto out_free_buf; + } + + /* The new buffer keeps the perag reference until it is freed. */ + new_bp->b_pag = pag; + spin_unlock(&pag->pag_buf_lock); + *bpp = new_bp; + return 0; + +out_free_buf: + xfs_buf_free(new_bp); +out_drop_pag: + xfs_perag_put(pag); + return error; +} + +/* + * Assembles a buffer covering the specified range. The code is optimised for + * cache hits, as metadata intensive workloads will see 3 orders of magnitude + * more hits than misses. + */ +int +xfs_buf_get_map( + struct xfs_buftarg *btp, + struct xfs_buf_map *map, + int nmaps, + xfs_buf_flags_t flags, + struct xfs_buf **bpp) +{ + struct xfs_perag *pag; + struct xfs_buf *bp = NULL; + struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; + int error; + int i; - if (bp != new_bp) - xfs_buf_free(new_bp); + for (i = 0; i < nmaps; i++) + cmap.bm_len += map[i].bm_len; -found: + error = xfs_buf_map_verify(btp, &cmap); + if (error) + return error; + + pag = xfs_perag_get(btp->bt_mount, + xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn)); + + error = xfs_buf_lookup(pag, &cmap, flags, &bp); + if (error && error != -ENOENT) + goto out_put_perag; + + /* cache hits always outnumber misses by at least 10:1 */ + if (unlikely(!bp)) { + XFS_STATS_INC(btp->bt_mount, xb_miss_locked); + + if (flags & XBF_INCORE) + goto out_put_perag; + + /* xfs_buf_find_insert() consumes the perag reference. */ + error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps, + flags, &bp); + if (error) + return error; + } else { + XFS_STATS_INC(btp->bt_mount, xb_get_locked); + xfs_perag_put(pag); + } + + /* We do not hold a perag reference anymore. */ if (!bp->b_addr) { error = _xfs_buf_map_pages(bp, flags); if (unlikely(error)) { - xfs_warn_ratelimited(target->bt_mount, + xfs_warn_ratelimited(btp->bt_mount, "%s: failed to map %u pages", __func__, bp->b_page_count); xfs_buf_relse(bp); @@ -699,12 +733,13 @@ found: if (!(flags & XBF_READ)) xfs_buf_ioerror(bp, 0); - XFS_STATS_INC(target->bt_mount, xb_get); + XFS_STATS_INC(btp->bt_mount, xb_get); trace_xfs_buf_get(bp, flags, _RET_IP_); *bpp = bp; return 0; -out_free_buf: - xfs_buf_free(new_bp); + +out_put_perag: + xfs_perag_put(pag); return error; } @@ -813,7 +848,15 @@ xfs_buf_read_map( * buffer. */ if (error) { - if (!xfs_is_shutdown(target->bt_mount)) + /* + * Check against log shutdown for error reporting because + * metadata writeback may require a read first and we need to + * report errors in metadata writeback until the log is shut + * down. High level transaction read functions already check + * against mount shutdown, anyway, so we only need to be + * concerned about low level IO interactions here. + */ + if (!xlog_is_shutdown(target->bt_mount->m_log)) xfs_buf_ioerror_alert(bp, fa); bp->b_flags &= ~XBF_DONE; @@ -843,9 +886,6 @@ xfs_buf_readahead_map( { struct xfs_buf *bp; - if (bdi_read_congested(target->bt_bdev->bd_disk->bdi)) - return; - xfs_buf_read_map(target, map, nmaps, XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops, __this_address); @@ -862,7 +902,7 @@ xfs_buf_read_uncached( struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, - int flags, + xfs_buf_flags_t flags, struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { @@ -897,7 +937,7 @@ int xfs_buf_get_uncached( struct xfs_buftarg *target, size_t numblks, - int flags, + xfs_buf_flags_t flags, struct xfs_buf **bpp) { int error; @@ -1177,10 +1217,10 @@ xfs_buf_ioend_handle_error( struct xfs_error_cfg *cfg; /* - * If we've already decided to shutdown the filesystem because of I/O - * errors, there's no point in giving this a retry. + * If we've already shutdown the journal because of I/O errors, there's + * no point in giving this a retry. */ - if (xfs_is_shutdown(mp)) + if (xlog_is_shutdown(mp->m_log)) goto out_stale; xfs_buf_ioerror_alert_ratelimited(bp); @@ -1410,7 +1450,7 @@ xfs_buf_ioapply_map( int map, int *buf_offset, int *count, - int op) + blk_opf_t op) { int page_index; unsigned int total_nr_pages = bp->b_page_count; @@ -1440,12 +1480,10 @@ next_chunk: atomic_inc(&bp->b_io_remaining); nr_pages = bio_max_segs(total_nr_pages); - bio = bio_alloc(GFP_NOIO, nr_pages); - bio_set_dev(bio, bp->b_target->bt_bdev); + bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO); bio->bi_iter.bi_sector = sector; bio->bi_end_io = xfs_buf_bio_end_io; bio->bi_private = bp; - bio->bi_opf = op; for (; size && nr_pages; nr_pages--, page_index++) { int rbytes, nbytes = PAGE_SIZE - offset; @@ -1489,7 +1527,7 @@ _xfs_buf_ioapply( struct xfs_buf *bp) { struct blk_plug plug; - int op; + blk_opf_t op; int offset; int size; int i; @@ -1593,8 +1631,23 @@ __xfs_buf_submit( ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); - /* on shutdown we stale and complete the buffer immediately */ - if (xfs_is_shutdown(bp->b_mount)) { + /* + * On log shutdown we stale and complete the buffer immediately. We can + * be called to read the superblock before the log has been set up, so + * be careful checking the log state. + * + * Checking the mount shutdown state here can result in the log tail + * moving inappropriately on disk as the log may not yet be shut down. + * i.e. failing this buffer on mount shutdown can remove it from the AIL + * and move the tail of the log forwards without having written this + * buffer to disk. This corrupts the log tail state in memory, and + * because the log may not be shut down yet, it can then be propagated + * to disk before the log is shutdown. Hence we check log shutdown + * state here rather than mount state to avoid corrupting the log tail + * on shutdown. + */ + if (bp->b_mount->m_log && + xlog_is_shutdown(bp->b_mount->m_log)) { xfs_buf_ioend_fail(bp); return -EIO; } @@ -1808,10 +1861,10 @@ xfs_buftarg_drain( * If one or more failed buffers were freed, that means dirty metadata * was thrown away. This should only ever happen after I/O completion * handling has elevated I/O error(s) to permanent failures and shuts - * down the fs. + * down the journal. */ if (write_fail) { - ASSERT(xfs_is_shutdown(btp->bt_mount)); + ASSERT(xlog_is_shutdown(btp->bt_mount->m_log)); xfs_alert(btp->bt_mount, "Please run xfs_repair to determine the extent of the problem."); } @@ -1892,6 +1945,7 @@ xfs_free_buftarg( list_lru_destroy(&btp->bt_lru); blkdev_issue_flush(btp->bt_bdev); + fs_put_dax(btp->bt_daxdev, btp->bt_mount); kmem_free(btp); } @@ -1932,20 +1986,24 @@ xfs_setsize_buftarg_early( return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev)); } -xfs_buftarg_t * +struct xfs_buftarg * xfs_alloc_buftarg( struct xfs_mount *mp, - struct block_device *bdev, - struct dax_device *dax_dev) + struct block_device *bdev) { xfs_buftarg_t *btp; + const struct dax_holder_operations *ops = NULL; +#if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE) + ops = &xfs_dax_holder_operations; +#endif btp = kmem_zalloc(sizeof(*btp), KM_NOFS); btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; btp->bt_bdev = bdev; - btp->bt_daxdev = dax_dev; + btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off, + mp, ops); /* * Buffer IO error rate limiting. Limit it to no more than 10 messages @@ -1967,7 +2025,8 @@ xfs_alloc_buftarg( btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; btp->bt_shrinker.seeks = DEFAULT_SEEKS; btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE; - if (register_shrinker(&btp->bt_shrinker)) + if (register_shrinker(&btp->bt_shrinker, "xfs-buf:%s", + mp->m_super->s_id)) goto error_pcpu; return btp; @@ -2094,12 +2153,13 @@ xfs_buf_delwri_submit_buffers( blk_start_plug(&plug); list_for_each_entry_safe(bp, n, buffer_list, b_list) { if (!wait_list) { + if (!xfs_buf_trylock(bp)) + continue; if (xfs_buf_ispinned(bp)) { + xfs_buf_unlock(bp); pinned++; continue; } - if (!xfs_buf_trylock(bp)) - continue; } else { xfs_buf_lock(bp); } @@ -2255,29 +2315,6 @@ xfs_buf_delwri_pushbuf( return error; } -int __init -xfs_buf_init(void) -{ - xfs_buf_cache = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0, - SLAB_HWCACHE_ALIGN | - SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD, - NULL); - if (!xfs_buf_cache) - goto out; - - return 0; - - out: - return -ENOMEM; -} - -void -xfs_buf_terminate(void) -{ - kmem_cache_destroy(xfs_buf_cache); -} - void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) { /* diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 6b0200b8007d..549c60942208 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -15,6 +15,8 @@ #include <linux/uio.h> #include <linux/list_lru.h> +extern struct kmem_cache *xfs_buf_cache; + /* * Base types */ @@ -22,28 +24,30 @@ struct xfs_buf; #define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) -#define XBF_READ (1 << 0) /* buffer intended for reading from device */ -#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ -#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ -#define XBF_NO_IOACCT (1 << 3) /* bypass I/O accounting (non-LRU bufs) */ -#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ -#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ -#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ -#define XBF_WRITE_FAIL (1 << 7) /* async writes have failed on this buffer */ +#define XBF_READ (1u << 0) /* buffer intended for reading from device */ +#define XBF_WRITE (1u << 1) /* buffer intended for writing to device */ +#define XBF_READ_AHEAD (1u << 2) /* asynchronous read-ahead */ +#define XBF_NO_IOACCT (1u << 3) /* bypass I/O accounting (non-LRU bufs) */ +#define XBF_ASYNC (1u << 4) /* initiator will not wait for completion */ +#define XBF_DONE (1u << 5) /* all pages in the buffer uptodate */ +#define XBF_STALE (1u << 6) /* buffer has been staled, do not find it */ +#define XBF_WRITE_FAIL (1u << 7) /* async writes have failed on this buffer */ /* buffer type flags for write callbacks */ -#define _XBF_INODES (1 << 16)/* inode buffer */ -#define _XBF_DQUOTS (1 << 17)/* dquot buffer */ -#define _XBF_LOGRECOVERY (1 << 18)/* log recovery buffer */ +#define _XBF_INODES (1u << 16)/* inode buffer */ +#define _XBF_DQUOTS (1u << 17)/* dquot buffer */ +#define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */ /* flags used only internally */ -#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ -#define _XBF_KMEM (1 << 21)/* backed by heap memory */ -#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ +#define _XBF_PAGES (1u << 20)/* backed by refcounted pages */ +#define _XBF_KMEM (1u << 21)/* backed by heap memory */ +#define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */ /* flags used only as arguments to access routines */ -#define XBF_TRYLOCK (1 << 30)/* lock requested, but do not wait */ -#define XBF_UNMAPPED (1 << 31)/* do not map the buffer */ +#define XBF_INCORE (1u << 29)/* lookup only, return if found in cache */ +#define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */ +#define XBF_UNMAPPED (1u << 31)/* do not map the buffer */ + typedef unsigned int xfs_buf_flags_t; @@ -58,11 +62,12 @@ typedef unsigned int xfs_buf_flags_t; { XBF_WRITE_FAIL, "WRITE_FAIL" }, \ { _XBF_INODES, "INODES" }, \ { _XBF_DQUOTS, "DQUOTS" }, \ - { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \ + { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \ { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ /* The following interface flags should never be set */ \ + { XBF_INCORE, "INCORE" }, \ { XBF_TRYLOCK, "TRYLOCK" }, \ { XBF_UNMAPPED, "UNMAPPED" } @@ -89,6 +94,7 @@ typedef struct xfs_buftarg { dev_t bt_dev; struct block_device *bt_bdev; struct dax_device *bt_daxdev; + u64 bt_dax_part_off; struct xfs_mount *bt_mount; unsigned int bt_meta_sectorsize; size_t bt_meta_sectormask; @@ -192,13 +198,10 @@ struct xfs_buf { int b_last_error; const struct xfs_buf_ops *b_ops; + struct rcu_head b_rcu; }; /* Finding and Reading Buffers */ -struct xfs_buf *xfs_buf_incore(struct xfs_buftarg *target, - xfs_daddr_t blkno, size_t numblks, - xfs_buf_flags_t flags); - int xfs_buf_get_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp); int xfs_buf_read_map(struct xfs_buftarg *target, struct xfs_buf_map *map, @@ -209,6 +212,19 @@ void xfs_buf_readahead_map(struct xfs_buftarg *target, const struct xfs_buf_ops *ops); static inline int +xfs_buf_incore( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags, + struct xfs_buf **bpp) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + + return xfs_buf_get_map(target, &map, 1, XBF_INCORE | flags, bpp); +} + +static inline int xfs_buf_get( struct xfs_buftarg *target, xfs_daddr_t blkno, @@ -246,11 +262,11 @@ xfs_buf_readahead( return xfs_buf_readahead_map(target, &map, 1, ops); } -int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags, - struct xfs_buf **bpp); +int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, + xfs_buf_flags_t flags, struct xfs_buf **bpp); int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, - size_t numblks, int flags, struct xfs_buf **bpp, - const struct xfs_buf_ops *ops); + size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp, + const struct xfs_buf_ops *ops); int _xfs_buf_read(struct xfs_buf *bp, xfs_buf_flags_t flags); void xfs_buf_hold(struct xfs_buf *bp); @@ -293,10 +309,6 @@ extern int xfs_buf_delwri_submit(struct list_head *); extern int xfs_buf_delwri_submit_nowait(struct list_head *); extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *); -/* Buffer Daemon Setup Routines */ -extern int xfs_buf_init(void); -extern void xfs_buf_terminate(void); - static inline xfs_daddr_t xfs_buf_daddr(struct xfs_buf *bp) { return bp->b_maps[0].bm_bn; @@ -338,8 +350,8 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) /* * Handling of buftargs. */ -extern struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *, - struct block_device *, struct dax_device *); +struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp, + struct block_device *bdev); extern void xfs_free_buftarg(struct xfs_buftarg *); extern void xfs_buftarg_wait(struct xfs_buftarg *); extern void xfs_buftarg_drain(struct xfs_buftarg *); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index a7a8e4528881..522d450a94b1 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -21,6 +21,7 @@ #include "xfs_dquot.h" #include "xfs_trace.h" #include "xfs_log.h" +#include "xfs_log_priv.h" struct kmem_cache *xfs_buf_item_cache; @@ -428,7 +429,7 @@ xfs_buf_item_format( * occurs during recovery. */ if (bip->bli_flags & XFS_BLI_INODE_BUF) { - if (xfs_has_v3inodes(lip->li_mountp) || + if (xfs_has_v3inodes(lip->li_log->l_mp) || !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && xfs_log_item_in_current_chkpt(lip))) bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; @@ -616,7 +617,7 @@ xfs_buf_item_put( * that case, the bli is freed on buffer writeback completion. */ aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) || - xfs_is_shutdown(lip->li_mountp); + xlog_is_shutdown(lip->li_log); dirty = bip->bli_flags & XFS_BLI_DIRTY; if (dirty && !aborted) return false; diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index e11e9ef2338f..4d8a6aece995 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -8,15 +8,18 @@ /* kernel only definitions */ +struct xfs_buf; +struct xfs_mount; + /* buf log item flags */ -#define XFS_BLI_HOLD 0x01 -#define XFS_BLI_DIRTY 0x02 -#define XFS_BLI_STALE 0x04 -#define XFS_BLI_LOGGED 0x08 -#define XFS_BLI_INODE_ALLOC_BUF 0x10 -#define XFS_BLI_STALE_INODE 0x20 -#define XFS_BLI_INODE_BUF 0x40 -#define XFS_BLI_ORDERED 0x80 +#define XFS_BLI_HOLD (1u << 0) +#define XFS_BLI_DIRTY (1u << 1) +#define XFS_BLI_STALE (1u << 2) +#define XFS_BLI_LOGGED (1u << 3) +#define XFS_BLI_INODE_ALLOC_BUF (1u << 4) +#define XFS_BLI_STALE_INODE (1u << 5) +#define XFS_BLI_INODE_BUF (1u << 6) +#define XFS_BLI_ORDERED (1u << 7) #define XFS_BLI_FLAGS \ { XFS_BLI_HOLD, "HOLD" }, \ @@ -28,11 +31,6 @@ { XFS_BLI_INODE_BUF, "INODE_BUF" }, \ { XFS_BLI_ORDERED, "ORDERED" } - -struct xfs_buf; -struct xfs_mount; -struct xfs_buf_log_item; - /* * This is the in core log item structure used to track information * needed to log buffers. It tracks how many times the lock has been diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 70ca5751b13e..ffa94102094d 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -24,6 +24,15 @@ #include "xfs_quota.h" /* + * This is the number of entries in the l_buf_cancel_table used during + * recovery. + */ +#define XLOG_BC_TABLE_SIZE 64 + +#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ + ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) + +/* * This structure is used during recovery to record the buf log items which * have been canceled and should not be replayed. */ @@ -816,7 +825,7 @@ xlog_recover_get_buf_lsn( } if (lsn != (xfs_lsn_t)-1) { - if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) + if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid)) goto recover_immediately; return lsn; } @@ -993,3 +1002,60 @@ const struct xlog_recover_item_ops xlog_buf_item_ops = { .commit_pass1 = xlog_recover_buf_commit_pass1, .commit_pass2 = xlog_recover_buf_commit_pass2, }; + +#ifdef DEBUG +void +xlog_check_buf_cancel_table( + struct xlog *log) +{ + int i; + + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + ASSERT(list_empty(&log->l_buf_cancel_table[i])); +} +#endif + +int +xlog_alloc_buf_cancel_table( + struct xlog *log) +{ + void *p; + int i; + + ASSERT(log->l_buf_cancel_table == NULL); + + p = kmalloc_array(XLOG_BC_TABLE_SIZE, sizeof(struct list_head), + GFP_KERNEL); + if (!p) + return -ENOMEM; + + log->l_buf_cancel_table = p; + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); + + return 0; +} + +void +xlog_free_buf_cancel_table( + struct xlog *log) +{ + int i; + + if (!log->l_buf_cancel_table) + return; + + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) { + struct xfs_buf_cancel *bc; + + while ((bc = list_first_entry_or_null( + &log->l_buf_cancel_table[i], + struct xfs_buf_cancel, bc_list))) { + list_del(&bc->bc_list); + kmem_free(bc); + } + } + + kmem_free(log->l_buf_cancel_table); + log->l_buf_cancel_table = NULL; +} diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 8310005af00f..9f3ceb461515 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -138,7 +138,8 @@ xfs_dir2_sf_getdents( STATIC int xfs_dir2_block_getdents( struct xfs_da_args *args, - struct dir_context *ctx) + struct dir_context *ctx, + unsigned int *lock_mode) { struct xfs_inode *dp = args->dp; /* incore directory inode */ struct xfs_buf *bp; /* buffer for block */ @@ -146,7 +147,6 @@ xfs_dir2_block_getdents( int wantoff; /* starting block offset */ xfs_off_t cook; struct xfs_da_geometry *geo = args->geo; - int lock_mode; unsigned int offset, next_offset; unsigned int end; @@ -156,12 +156,13 @@ xfs_dir2_block_getdents( if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk) return 0; - lock_mode = xfs_ilock_data_map_shared(dp); error = xfs_dir3_block_read(args->trans, dp, &bp); - xfs_iunlock(dp, lock_mode); if (error) return error; + xfs_iunlock(dp, *lock_mode); + *lock_mode = 0; + /* * Extract the byte offset we start at from the seek pointer. * We'll skip entries before this. @@ -247,7 +248,7 @@ xfs_dir2_leaf_readbuf( struct xfs_inode *dp = args->dp; struct xfs_buf *bp = NULL; struct xfs_da_geometry *geo = args->geo; - struct xfs_ifork *ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK); struct xfs_bmbt_irec map; struct blk_plug plug; xfs_dir2_off_t new_off; @@ -344,7 +345,8 @@ STATIC int xfs_dir2_leaf_getdents( struct xfs_da_args *args, struct dir_context *ctx, - size_t bufsize) + size_t bufsize, + unsigned int *lock_mode) { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; @@ -356,7 +358,6 @@ xfs_dir2_leaf_getdents( xfs_dir2_off_t curoff; /* current overall offset */ int length; /* temporary length value */ int byteoff; /* offset in current block */ - int lock_mode; unsigned int offset = 0; int error = 0; /* error return value */ @@ -390,13 +391,16 @@ xfs_dir2_leaf_getdents( bp = NULL; } - lock_mode = xfs_ilock_data_map_shared(dp); + if (*lock_mode == 0) + *lock_mode = xfs_ilock_data_map_shared(dp); error = xfs_dir2_leaf_readbuf(args, bufsize, &curoff, &rablk, &bp); - xfs_iunlock(dp, lock_mode); if (error || !bp) break; + xfs_iunlock(dp, *lock_mode); + *lock_mode = 0; + xfs_dir3_data_check(dp, bp); /* * Find our position in the block. @@ -496,7 +500,7 @@ xfs_dir2_leaf_getdents( * * If supplied, the transaction collects locked dir buffers to avoid * nested buffer deadlocks. This function does not dirty the - * transaction. The caller should ensure that the inode is locked + * transaction. The caller must hold the IOLOCK (shared or exclusive) * before calling this function. */ int @@ -507,8 +511,9 @@ xfs_readdir( size_t bufsize) { struct xfs_da_args args = { NULL }; - int rval; - int v; + unsigned int lock_mode; + bool isblock; + int error; trace_xfs_readdir(dp); @@ -516,6 +521,7 @@ xfs_readdir( return -EIO; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); + ASSERT(xfs_isilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); XFS_STATS_INC(dp->i_mount, xs_dir_getdents); args.dp = dp; @@ -523,13 +529,22 @@ xfs_readdir( args.trans = tp; if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_getdents(&args, ctx); - else if ((rval = xfs_dir2_isblock(&args, &v))) - ; - else if (v) - rval = xfs_dir2_block_getdents(&args, ctx); - else - rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize); + return xfs_dir2_sf_getdents(&args, ctx); - return rval; + lock_mode = xfs_ilock_data_map_shared(dp); + error = xfs_dir2_isblock(&args, &isblock); + if (error) + goto out_unlock; + + if (isblock) { + error = xfs_dir2_block_getdents(&args, ctx, &lock_mode); + goto out_unlock; + } + + error = xfs_dir2_leaf_getdents(&args, ctx, bufsize, &lock_mode); + +out_unlock: + if (lock_mode) + xfs_iunlock(dp, lock_mode); + return error; } diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 0191de8ce9ce..bfc829c07f03 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -45,7 +45,7 @@ xfs_trim_extents( */ xfs_log_force(mp, XFS_LOG_SYNC); - error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + error = xfs_alloc_read_agf(pag, NULL, 0, &agbp); if (error) goto out_put_perag; agf = agbp->b_addr; @@ -114,7 +114,7 @@ xfs_trim_extents( } trace_xfs_discard_extent(mp, agno, fbno, flen); - error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0); + error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS); if (error) goto out_del_cursor; *blocks_trimmed += flen; @@ -152,8 +152,8 @@ xfs_ioc_trim( struct xfs_mount *mp, struct fstrim_range __user *urange) { - struct request_queue *q = bdev_get_queue(mp->m_ddev_targp->bt_bdev); - unsigned int granularity = q->limits.discard_granularity; + unsigned int granularity = + bdev_discard_granularity(mp->m_ddev_targp->bt_bdev); struct fstrim_range range; xfs_daddr_t start, end, minlen; xfs_agnumber_t start_agno, end_agno, agno; @@ -162,7 +162,7 @@ xfs_ioc_trim( if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) return -EOPNOTSUPP; /* diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index e48ae227bb11..8fb90da89787 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -136,10 +136,7 @@ xfs_qm_adjust_res_timer( res->timer = xfs_dquot_set_timeout(mp, ktime_get_real_seconds() + qlim->time); } else { - if (res->timer == 0) - res->warnings = 0; - else - res->timer = 0; + res->timer = 0; } } @@ -289,13 +286,12 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) */ STATIC int xfs_dquot_disk_alloc( - struct xfs_trans **tpp, struct xfs_dquot *dqp, struct xfs_buf **bpp) { struct xfs_bmbt_irec map; - struct xfs_trans *tp = *tpp; - struct xfs_mount *mp = tp->t_mountp; + struct xfs_trans *tp; + struct xfs_mount *mp = dqp->q_mount; struct xfs_buf *bp; xfs_dqtype_t qtype = xfs_dquot_type(dqp); struct xfs_inode *quotip = xfs_quota_inode(mp, qtype); @@ -304,29 +300,38 @@ xfs_dquot_disk_alloc( trace_xfs_dqalloc(dqp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc, + XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp); + if (error) + return error; + xfs_ilock(quotip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, quotip, 0); + if (!xfs_this_quota_on(dqp->q_mount, qtype)) { /* * Return if this type of quotas is turned off while we didn't * have an inode lock */ - xfs_iunlock(quotip, XFS_ILOCK_EXCL); - return -ESRCH; + error = -ESRCH; + goto err_cancel; } - xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); - error = xfs_iext_count_may_overflow(quotip, XFS_DATA_FORK, XFS_IEXT_ADD_NOSPLIT_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, quotip, + XFS_IEXT_ADD_NOSPLIT_CNT); if (error) - return error; + goto err_cancel; /* Create the block mapping. */ error = xfs_bmapi_write(tp, quotip, dqp->q_fileoffset, XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, &map, &nmaps); if (error) - return error; + goto err_cancel; + ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); ASSERT(nmaps == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && @@ -341,7 +346,7 @@ xfs_dquot_disk_alloc( error = xfs_trans_get_buf(tp, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0, &bp); if (error) - return error; + goto err_cancel; bp->b_ops = &xfs_dquot_buf_ops; /* @@ -371,16 +376,25 @@ xfs_dquot_disk_alloc( * is responsible for unlocking any buffer passed back, either * manually or by committing the transaction. On error, the buffer is * released and not passed back. + * + * Keep the quota inode ILOCKed until after the transaction commit to + * maintain the atomicity of bmap/rmap updates. */ xfs_trans_bhold(tp, bp); - error = xfs_defer_finish(tpp); + error = xfs_trans_commit(tp); + xfs_iunlock(quotip, XFS_ILOCK_EXCL); if (error) { - xfs_trans_bhold_release(*tpp, bp); - xfs_trans_brelse(*tpp, bp); + xfs_buf_relse(bp); return error; } + *bpp = bp; return 0; + +err_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(quotip, XFS_ILOCK_EXCL); + return error; } /* @@ -535,7 +549,7 @@ xfs_dquot_check_type( * at the same time. The non-user quota file can be switched between * group and project quota uses depending on the mount options, which * means that we can encounter the other type when we try to load quota - * defaults. Quotacheck will soon reset the the entire quota file + * defaults. Quotacheck will soon reset the entire quota file * (including the root dquot) anyway, but don't log scary corruption * reports to dmesg. */ @@ -575,10 +589,6 @@ xfs_dquot_from_disk( dqp->q_ino.count = be64_to_cpu(ddqp->d_icount); dqp->q_rtb.count = be64_to_cpu(ddqp->d_rtbcount); - dqp->q_blk.warnings = be16_to_cpu(ddqp->d_bwarns); - dqp->q_ino.warnings = be16_to_cpu(ddqp->d_iwarns); - dqp->q_rtb.warnings = be16_to_cpu(ddqp->d_rtbwarns); - dqp->q_blk.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_btimer); dqp->q_ino.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_itimer); dqp->q_rtb.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_rtbtimer); @@ -620,52 +630,15 @@ xfs_dquot_to_disk( ddqp->d_icount = cpu_to_be64(dqp->q_ino.count); ddqp->d_rtbcount = cpu_to_be64(dqp->q_rtb.count); - ddqp->d_bwarns = cpu_to_be16(dqp->q_blk.warnings); - ddqp->d_iwarns = cpu_to_be16(dqp->q_ino.warnings); - ddqp->d_rtbwarns = cpu_to_be16(dqp->q_rtb.warnings); + ddqp->d_bwarns = 0; + ddqp->d_iwarns = 0; + ddqp->d_rtbwarns = 0; ddqp->d_btimer = xfs_dquot_to_disk_ts(dqp, dqp->q_blk.timer); ddqp->d_itimer = xfs_dquot_to_disk_ts(dqp, dqp->q_ino.timer); ddqp->d_rtbtimer = xfs_dquot_to_disk_ts(dqp, dqp->q_rtb.timer); } -/* Allocate and initialize the dquot buffer for this in-core dquot. */ -static int -xfs_qm_dqread_alloc( - struct xfs_mount *mp, - struct xfs_dquot *dqp, - struct xfs_buf **bpp) -{ - struct xfs_trans *tp; - int error; - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc, - XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp); - if (error) - goto err; - - error = xfs_dquot_disk_alloc(&tp, dqp, bpp); - if (error) - goto err_cancel; - - error = xfs_trans_commit(tp); - if (error) { - /* - * Buffer was held to the transaction, so we have to unlock it - * manually here because we're not passing it back. - */ - xfs_buf_relse(*bpp); - *bpp = NULL; - goto err; - } - return 0; - -err_cancel: - xfs_trans_cancel(tp); -err: - return error; -} - /* * Read in the ondisk dquot using dqtobp() then copy it to an incore version, * and release the buffer immediately. If @can_alloc is true, fill any @@ -689,7 +662,7 @@ xfs_qm_dqread( /* Try to read the buffer, allocating if necessary. */ error = xfs_dquot_disk_read(mp, dqp, &bp); if (error == -ENOENT && can_alloc) - error = xfs_qm_dqread_alloc(mp, dqp, &bp); + error = xfs_dquot_disk_alloc(dqp, &bp); if (error) goto err; diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 6b5e3cf40c8b..80c8f851a2f3 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -44,14 +44,6 @@ struct xfs_dquot_res { * in seconds since the Unix epoch. */ time64_t timer; - - /* - * For root dquots, this is the maximum number of warnings that will - * be issued for this quota type. Otherwise, this is the number of - * warnings issued against this quota. Note that none of this is - * implemented. - */ - xfs_qwarncnt_t warnings; }; static inline bool diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 81c445e9489b..c6b2aabd6f18 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -57,6 +57,9 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_REDUCE_MAX_IEXTENTS, XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT, XFS_RANDOM_AG_RESV_FAIL, + XFS_RANDOM_LARP, + XFS_RANDOM_DA_LEAF_SPLIT, + XFS_RANDOM_ATTR_LEAF_TO_NODE, }; struct xfs_errortag_attr { @@ -170,6 +173,9 @@ XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR); XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS); XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT); XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL); +XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP); +XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT); +XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -211,26 +217,35 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents), XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent), XFS_ERRORTAG_ATTR_LIST(ag_resv_fail), + XFS_ERRORTAG_ATTR_LIST(larp), + XFS_ERRORTAG_ATTR_LIST(da_leaf_split), + XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node), NULL, }; +ATTRIBUTE_GROUPS(xfs_errortag); static struct kobj_type xfs_errortag_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_errortag_sysfs_ops, - .default_attrs = xfs_errortag_attrs, + .default_groups = xfs_errortag_groups, }; int xfs_errortag_init( struct xfs_mount *mp) { + int ret; + mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX, KM_MAYFAIL); if (!mp->m_errortag) return -ENOMEM; - return xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype, - &mp->m_kobj, "errortag"); + ret = xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype, + &mp->m_kobj, "errortag"); + if (ret) + kmem_free(mp->m_errortag); + return ret; } void @@ -264,7 +279,7 @@ xfs_errortag_test( ASSERT(error_tag < XFS_ERRTAG_MAX); randfactor = mp->m_errortag[error_tag]; - if (!randfactor || prandom_u32() % randfactor) + if (!randfactor || prandom_u32_max(randfactor)) return false; xfs_warn_ratelimited(mp, diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 5735d5ea87ee..5191e9145e55 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -64,16 +64,16 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); * XFS panic tags -- allow a call to xfs_alert_tag() be turned into * a panic by setting xfs_panic_mask in a sysctl. */ -#define XFS_NO_PTAG 0 -#define XFS_PTAG_IFLUSH 0x00000001 -#define XFS_PTAG_LOGRES 0x00000002 -#define XFS_PTAG_AILDELETE 0x00000004 -#define XFS_PTAG_ERROR_REPORT 0x00000008 -#define XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010 -#define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020 -#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 -#define XFS_PTAG_FSBLOCK_ZERO 0x00000080 -#define XFS_PTAG_VERIFIER_ERROR 0x00000100 +#define XFS_NO_PTAG 0u +#define XFS_PTAG_IFLUSH (1u << 0) +#define XFS_PTAG_LOGRES (1u << 1) +#define XFS_PTAG_AILDELETE (1u << 2) +#define XFS_PTAG_ERROR_REPORT (1u << 3) +#define XFS_PTAG_SHUTDOWN_CORRUPT (1u << 4) +#define XFS_PTAG_SHUTDOWN_IOERROR (1u << 5) +#define XFS_PTAG_SHUTDOWN_LOGERROR (1u << 6) +#define XFS_PTAG_FSBLOCK_ZERO (1u << 7) +#define XFS_PTAG_VERIFIER_ERROR (1u << 8) #define XFS_PTAG_STRINGS \ { XFS_NO_PTAG, "none" }, \ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 47ef9c9c5c17..d5130d1fcfae 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -11,6 +11,7 @@ #include "xfs_bit.h" #include "xfs_shared.h" #include "xfs_mount.h" +#include "xfs_ag.h" #include "xfs_defer.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" @@ -58,23 +59,11 @@ xfs_efi_release( struct xfs_efi_log_item *efip) { ASSERT(atomic_read(&efip->efi_refcount) > 0); - if (atomic_dec_and_test(&efip->efi_refcount)) { - xfs_trans_ail_delete(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR); - xfs_efi_item_free(efip); - } -} + if (!atomic_dec_and_test(&efip->efi_refcount)) + return; -/* - * This returns the number of iovecs needed to log the given efi item. - * We only need 1 iovec for an efi item. It just logs the efi_log_format - * structure. - */ -static inline int -xfs_efi_item_sizeof( - struct xfs_efi_log_item *efip) -{ - return sizeof(struct xfs_efi_log_format) + - (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t); + xfs_trans_ail_delete(&efip->efi_item, 0); + xfs_efi_item_free(efip); } STATIC void @@ -83,8 +72,10 @@ xfs_efi_item_size( int *nvecs, int *nbytes) { + struct xfs_efi_log_item *efip = EFI_ITEM(lip); + *nvecs += 1; - *nbytes += xfs_efi_item_sizeof(EFI_ITEM(lip)); + *nbytes += xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents); } /* @@ -110,7 +101,7 @@ xfs_efi_item_format( xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT, &efip->efi_format, - xfs_efi_item_sizeof(efip)); + xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents)); } @@ -153,13 +144,11 @@ xfs_efi_init( { struct xfs_efi_log_item *efip; - uint size; ASSERT(nextents > 0); if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { - size = (uint)(sizeof(struct xfs_efi_log_item) + - ((nextents - 1) * sizeof(xfs_extent_t))); - efip = kmem_zalloc(size, 0); + efip = kzalloc(xfs_efi_log_item_sizeof(nextents), + GFP_KERNEL | __GFP_NOFAIL); } else { efip = kmem_cache_zalloc(xfs_efi_cache, GFP_KERNEL | __GFP_NOFAIL); @@ -186,15 +175,17 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) { xfs_efi_log_format_t *src_efi_fmt = buf->i_addr; uint i; - uint len = sizeof(xfs_efi_log_format_t) + - (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t); - uint len32 = sizeof(xfs_efi_log_format_32_t) + - (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t); - uint len64 = sizeof(xfs_efi_log_format_64_t) + - (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t); + uint len = xfs_efi_log_format_sizeof(src_efi_fmt->efi_nextents); + uint len32 = xfs_efi_log_format32_sizeof(src_efi_fmt->efi_nextents); + uint len64 = xfs_efi_log_format64_sizeof(src_efi_fmt->efi_nextents); if (buf->i_len == len) { - memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len); + memcpy(dst_efi_fmt, src_efi_fmt, + offsetof(struct xfs_efi_log_format, efi_extents)); + for (i = 0; i < src_efi_fmt->efi_nextents; i++) + memcpy(&dst_efi_fmt->efi_extents[i], + &src_efi_fmt->efi_extents[i], + sizeof(struct xfs_extent)); return 0; } else if (buf->i_len == len32) { xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr; @@ -225,7 +216,8 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) } return 0; } - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, NULL, buf->i_addr, + buf->i_len); return -EFSCORRUPTED; } @@ -244,27 +236,16 @@ xfs_efd_item_free(struct xfs_efd_log_item *efdp) kmem_cache_free(xfs_efd_cache, efdp); } -/* - * This returns the number of iovecs needed to log the given efd item. - * We only need 1 iovec for an efd item. It just logs the efd_log_format - * structure. - */ -static inline int -xfs_efd_item_sizeof( - struct xfs_efd_log_item *efdp) -{ - return sizeof(xfs_efd_log_format_t) + - (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t); -} - STATIC void xfs_efd_item_size( struct xfs_log_item *lip, int *nvecs, int *nbytes) { + struct xfs_efd_log_item *efdp = EFD_ITEM(lip); + *nvecs += 1; - *nbytes += xfs_efd_item_sizeof(EFD_ITEM(lip)); + *nbytes += xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents); } /* @@ -289,7 +270,7 @@ xfs_efd_item_format( xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT, &efdp->efd_format, - xfs_efd_item_sizeof(efdp)); + xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents)); } /* @@ -306,11 +287,20 @@ xfs_efd_item_release( xfs_efd_item_free(efdp); } +static struct xfs_log_item * +xfs_efd_item_intent( + struct xfs_log_item *lip) +{ + return &EFD_ITEM(lip)->efd_efip->efi_item; +} + static const struct xfs_item_ops xfs_efd_item_ops = { - .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED, + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED | + XFS_ITEM_INTENT_DONE, .iop_size = xfs_efd_item_size, .iop_format = xfs_efd_item_format, .iop_release = xfs_efd_item_release, + .iop_intent = xfs_efd_item_intent, }; /* @@ -329,9 +319,8 @@ xfs_trans_get_efd( ASSERT(nextents > 0); if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { - efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) + - (nextents - 1) * sizeof(struct xfs_extent), - 0); + efdp = kzalloc(xfs_efd_log_item_sizeof(nextents), + GFP_KERNEL | __GFP_NOFAIL); } else { efdp = kmem_cache_zalloc(xfs_efd_cache, GFP_KERNEL | __GFP_NOFAIL); @@ -380,7 +369,7 @@ xfs_trans_free_extent( * 1.) releases the EFI and frees the EFD * 2.) shuts down the filesystem */ - tp->t_flags |= XFS_TRANS_DIRTY; + tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); next_extent = efdp->efd_next_extent; @@ -541,6 +530,7 @@ xfs_agfl_free_finish_item( xfs_agnumber_t agno; xfs_agblock_t agbno; uint next_extent; + struct xfs_perag *pag; free = container_of(item, struct xfs_extent_free_item, xefi_list); ASSERT(free->xefi_blockcount == 1); @@ -550,9 +540,11 @@ xfs_agfl_free_finish_item( trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount); - error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + pag = xfs_perag_get(mp, agno); + error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (!error) error = xfs_free_agfl_block(tp, agno, agbno, agbp, &oinfo); + xfs_perag_put(pag); /* * Mark the transaction dirty, even on error. This ensures the @@ -604,7 +596,7 @@ xfs_efi_item_recover( struct list_head *capture_list) { struct xfs_efi_log_item *efip = EFI_ITEM(lip); - struct xfs_mount *mp = lip->li_mountp; + struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_efd_log_item *efdp; struct xfs_trans *tp; struct xfs_extent *extp; @@ -688,6 +680,7 @@ xfs_efi_item_relog( } static const struct xfs_item_ops xfs_efi_item_ops = { + .flags = XFS_ITEM_INTENT, .iop_size = xfs_efi_item_size, .iop_format = xfs_efi_item_format, .iop_unpin = xfs_efi_item_unpin, @@ -718,6 +711,12 @@ xlog_recover_efi_commit_pass2( efi_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len < xfs_efi_log_format_sizeof(0)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + efip = xfs_efi_init(mp, efi_formatp->efi_nextents); error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format); if (error) { @@ -754,12 +753,24 @@ xlog_recover_efd_commit_pass2( xfs_lsn_t lsn) { struct xfs_efd_log_format *efd_formatp; + int buflen = item->ri_buf[0].i_len; efd_formatp = item->ri_buf[0].i_addr; - ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + - ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || - (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + - ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t))))); + + if (buflen < sizeof(struct xfs_efd_log_format)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + efd_formatp, buflen); + return -EFSCORRUPTED; + } + + if (item->ri_buf[0].i_len != xfs_efd_log_format32_sizeof( + efd_formatp->efd_nextents) && + item->ri_buf[0].i_len != xfs_efd_log_format64_sizeof( + efd_formatp->efd_nextents)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + efd_formatp, buflen); + return -EFSCORRUPTED; + } xlog_recover_release_intent(log, XFS_LI_EFI, efd_formatp->efd_efi_id); return 0; diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 186d0f2137f1..da6a5afa607c 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -52,6 +52,14 @@ struct xfs_efi_log_item { xfs_efi_log_format_t efi_format; }; +static inline size_t +xfs_efi_log_item_sizeof( + unsigned int nr) +{ + return offsetof(struct xfs_efi_log_item, efi_format) + + xfs_efi_log_format_sizeof(nr); +} + /* * This is the "extent free done" log item. It is used to log * the fact that some extents earlier mentioned in an efi item @@ -64,6 +72,14 @@ struct xfs_efd_log_item { xfs_efd_log_format_t efd_format; }; +static inline size_t +xfs_efd_log_item_sizeof( + unsigned int nr) +{ + return offsetof(struct xfs_efd_log_item, efd_format) + + xfs_efd_log_format_sizeof(nr); +} + /* * Max number of extents in fast allocation path. */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 27594738b0d1..e462d39c840e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -25,6 +25,7 @@ #include "xfs_iomap.h" #include "xfs_reflink.h" +#include <linux/dax.h> #include <linux/falloc.h> #include <linux/backing-dev.h> #include <linux/mman.h> @@ -66,40 +67,6 @@ xfs_is_falloc_aligned( return !((pos | len) & mask); } -int -xfs_update_prealloc_flags( - struct xfs_inode *ip, - enum xfs_prealloc_flags flags) -{ - struct xfs_trans *tp; - int error; - - error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid, - 0, 0, 0, &tp); - if (error) - return error; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - - if (!(flags & XFS_PREALLOC_INVISIBLE)) { - VFS_I(ip)->i_mode &= ~S_ISUID; - if (VFS_I(ip)->i_mode & S_IXGRP) - VFS_I(ip)->i_mode &= ~S_ISGID; - xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - } - - if (flags & XFS_PREALLOC_SET) - ip->i_diflags |= XFS_DIFLAG_PREALLOC; - if (flags & XFS_PREALLOC_CLEAR) - ip->i_diflags &= ~XFS_DIFLAG_PREALLOC; - - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (flags & XFS_PREALLOC_SYNC) - xfs_trans_set_sync(tp); - return xfs_trans_commit(tp); -} - /* * Fsync operations on directories are much simpler than on regular files, * as there is no file data to flush, and thus also no need for explicit @@ -176,7 +143,7 @@ xfs_file_fsync( { struct xfs_inode *ip = XFS_I(file->f_mapping->host); struct xfs_mount *mp = ip->i_mount; - int error = 0; + int error, err2; int log_flushed = 0; trace_xfs_file_fsync(ip); @@ -197,18 +164,21 @@ xfs_file_fsync( * inode size in case of an extending write. */ if (XFS_IS_REALTIME_INODE(ip)) - blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev); + error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev); else if (mp->m_logdev_targp != mp->m_ddev_targp) - blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); + error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); /* * Any inode that has dirty modifications in the log is pinned. The - * racy check here for a pinned inode while not catch modifications + * racy check here for a pinned inode will not catch modifications * that happen concurrently to the fsync call, but fsync semantics * only require to sync previously completed I/O. */ - if (xfs_ipincount(ip)) - error = xfs_fsync_flush_log(ip, datasync, &log_flushed); + if (xfs_ipincount(ip)) { + err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed); + if (err2 && !error) + error = err2; + } /* * If we only have a single device, and the log force about was @@ -218,8 +188,11 @@ xfs_file_fsync( * commit. */ if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) && - mp->m_logdev_targp == mp->m_ddev_targp) - blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); + mp->m_logdev_targp == mp->m_ddev_targp) { + err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); + if (err2 && !error) + error = err2; + } return error; } @@ -259,7 +232,7 @@ xfs_file_dio_read( ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); if (ret) return ret; - ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, 0); + ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0); xfs_iunlock(ip, XFS_IOLOCK_SHARED); return ret; @@ -344,7 +317,7 @@ STATIC ssize_t xfs_file_write_checks( struct kiocb *iocb, struct iov_iter *from, - int *iolock) + unsigned int *iolock) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -437,15 +410,14 @@ restart: } trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); - error = iomap_zero_range(inode, isize, iocb->ki_pos - isize, - NULL, &xfs_buffered_write_iomap_ops); + error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL); if (error) return error; } else spin_unlock(&ip->i_flags_lock); out: - return file_modified(file); + return kiocb_modified(iocb); } static int @@ -548,7 +520,7 @@ xfs_file_dio_write_aligned( struct kiocb *iocb, struct iov_iter *from) { - int iolock = XFS_IOLOCK_SHARED; + unsigned int iolock = XFS_IOLOCK_SHARED; ssize_t ret; ret = xfs_ilock_iocb(iocb, iolock); @@ -569,7 +541,7 @@ xfs_file_dio_write_aligned( } trace_xfs_file_direct_write(iocb, from); ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, - &xfs_dio_write_ops, 0, 0); + &xfs_dio_write_ops, 0, NULL, 0); out_unlock: if (iolock) xfs_iunlock(ip, iolock); @@ -601,7 +573,7 @@ xfs_file_dio_write_unaligned( { size_t isize = i_size_read(VFS_I(ip)); size_t count = iov_iter_count(from); - int iolock = XFS_IOLOCK_SHARED; + unsigned int iolock = XFS_IOLOCK_SHARED; unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY; ssize_t ret; @@ -611,9 +583,9 @@ xfs_file_dio_write_unaligned( * don't even bother trying the fast path in this case. */ if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) { -retry_exclusive: if (iocb->ki_flags & IOCB_NOWAIT) return -EAGAIN; +retry_exclusive: iolock = XFS_IOLOCK_EXCL; flags = IOMAP_DIO_FORCE_WAIT; } @@ -647,7 +619,7 @@ retry_exclusive: trace_xfs_file_direct_write(iocb, from); ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, - &xfs_dio_write_ops, flags, 0); + &xfs_dio_write_ops, flags, NULL, 0); /* * Retry unaligned I/O with exclusive blocking semantics if the DIO @@ -690,7 +662,7 @@ xfs_file_dax_write( { struct inode *inode = iocb->ki_filp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - int iolock = XFS_IOLOCK_EXCL; + unsigned int iolock = XFS_IOLOCK_EXCL; ssize_t ret, error = 0; loff_t pos; @@ -704,7 +676,7 @@ xfs_file_dax_write( pos = iocb->ki_pos; trace_xfs_file_dax_write(iocb, from); - ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops); + ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops); if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { i_size_write(inode, iocb->ki_pos); error = xfs_setfilesize(ip, pos, ret); @@ -729,20 +701,17 @@ xfs_file_buffered_write( struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = iocb->ki_filp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; bool cleared_space = false; - int iolock; - - if (iocb->ki_flags & IOCB_NOWAIT) - return -EOPNOTSUPP; + unsigned int iolock; write_retry: iolock = XFS_IOLOCK_EXCL; - xfs_ilock(ip, iolock); + ret = xfs_ilock_iocb(iocb, iolock); + if (ret) + return ret; ret = xfs_file_write_checks(iocb, from, &iolock); if (ret) @@ -802,9 +771,7 @@ xfs_file_write_iter( struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = iocb->ki_filp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; size_t ocount = iov_iter_count(from); @@ -846,7 +813,7 @@ xfs_wait_dax_page( xfs_ilock(ip, XFS_MMAPLOCK_EXCL); } -static int +int xfs_break_dax_layouts( struct inode *inode, bool *retry) @@ -896,6 +863,21 @@ xfs_break_layouts( return error; } +/* Does this file, inode, or mount want synchronous writes? */ +static inline bool xfs_file_sync_writes(struct file *filp) +{ + struct xfs_inode *ip = XFS_I(file_inode(filp)); + + if (xfs_has_wsync(ip->i_mount)) + return true; + if (filp->f_flags & (__O_SYNC | O_DSYNC)) + return true; + if (IS_SYNC(file_inode(filp))) + return true; + + return false; +} + #define XFS_FALLOC_FL_SUPPORTED \ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \ @@ -911,7 +893,6 @@ xfs_file_fallocate( struct inode *inode = file_inode(file); struct xfs_inode *ip = XFS_I(inode); long error; - enum xfs_prealloc_flags flags = 0; uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; loff_t new_size = 0; bool do_file_insert = false; @@ -956,6 +937,10 @@ xfs_file_fallocate( goto out_unlock; } + error = file_modified(file); + if (error) + goto out_unlock; + if (mode & FALLOC_FL_PUNCH_HOLE) { error = xfs_free_file_space(ip, offset, len); if (error) @@ -1005,8 +990,6 @@ xfs_file_fallocate( } do_file_insert = true; } else { - flags |= XFS_PREALLOC_SET; - if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > i_size_read(inode)) { new_size = offset + len; @@ -1052,20 +1035,12 @@ xfs_file_fallocate( } if (!xfs_is_always_cow_inode(ip)) { - error = xfs_alloc_file_space(ip, offset, len, - XFS_BMAPI_PREALLOC); + error = xfs_alloc_file_space(ip, offset, len); if (error) goto out_unlock; } } - if (file->f_flags & O_DSYNC) - flags |= XFS_PREALLOC_SYNC; - - error = xfs_update_prealloc_flags(ip, flags); - if (error) - goto out_unlock; - /* Change file size if needed */ if (new_size) { struct iattr iattr; @@ -1084,8 +1059,14 @@ xfs_file_fallocate( * leave shifted extents past EOF and hence losing access to * the data that is contained within them. */ - if (do_file_insert) + if (do_file_insert) { error = xfs_insert_file_space(ip, offset, len); + if (error) + goto out_unlock; + } + + if (xfs_file_sync_writes(file)) + error = xfs_log_force_inode(ip); out_unlock: xfs_iunlock(ip, iolock); @@ -1117,21 +1098,6 @@ xfs_file_fadvise( return ret; } -/* Does this file, inode, or mount want synchronous writes? */ -static inline bool xfs_file_sync_writes(struct file *filp) -{ - struct xfs_inode *ip = XFS_I(file_inode(filp)); - - if (xfs_has_wsync(ip->i_mount)) - return true; - if (filp->f_flags & (__O_SYNC | O_DSYNC)) - return true; - if (IS_SYNC(file_inode(filp))) - return true; - - return false; -} - STATIC loff_t xfs_file_remap_range( struct file *file_in, @@ -1203,12 +1169,10 @@ xfs_file_open( struct inode *inode, struct file *file) { - if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) - return -EFBIG; if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; - file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; - return 0; + file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC; + return generic_file_open(inode, file); } STATIC int @@ -1217,7 +1181,7 @@ xfs_dir_open( struct file *file) { struct xfs_inode *ip = XFS_I(inode); - int mode; + unsigned int mode; int error; error = xfs_file_open(inode, file); @@ -1296,6 +1260,32 @@ xfs_file_llseek( return vfs_setpos(file, offset, inode->i_sb->s_maxbytes); } +#ifdef CONFIG_FS_DAX +static inline vm_fault_t +xfs_dax_fault( + struct vm_fault *vmf, + enum page_entry_size pe_size, + bool write_fault, + pfn_t *pfn) +{ + return dax_iomap_fault(vmf, pe_size, pfn, NULL, + (write_fault && !vmf->cow_page) ? + &xfs_dax_write_iomap_ops : + &xfs_read_iomap_ops); +} +#else +static inline vm_fault_t +xfs_dax_fault( + struct vm_fault *vmf, + enum page_entry_size pe_size, + bool write_fault, + pfn_t *pfn) +{ + ASSERT(0); + return VM_FAULT_SIGBUS; +} +#endif + /* * Locking for serialisation of IO during page faults. This results in a lock * ordering of: @@ -1327,10 +1317,7 @@ __xfs_filemap_fault( pfn_t pfn; xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, - (write_fault && !vmf->cow_page) ? - &xfs_direct_write_iomap_ops : - &xfs_read_iomap_ops); + ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn); if (ret & VM_FAULT_NEEDDSYNC) ret = dax_finish_sync_fault(vmf, pe_size, pfn); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 6a3ce0f6dc9e..34b21a29c39b 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -126,13 +126,14 @@ xfs_filestream_pick_ag( pag = xfs_perag_get(mp, ag); if (!pag->pagf_init) { - err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); + err = xfs_alloc_read_agf(pag, NULL, trylock, NULL); if (err) { - xfs_perag_put(pag); - if (err != -EAGAIN) + if (err != -EAGAIN) { + xfs_perag_put(pag); return err; + } /* Couldn't lock the AGF, skip this AG. */ - continue; + goto next_ag; } } @@ -180,7 +181,7 @@ next_ag: if (ag != startag) continue; - /* Allow sleeping in xfs_alloc_pagf_init() on the 2nd pass. */ + /* Allow sleeping in xfs_alloc_read_agf() on the 2nd pass. */ if (trylock != 0) { trylock = 0; continue; diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 48287caad28b..d8337274c74d 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -450,11 +450,11 @@ xfs_getfsmap_logdev( /* Transform a rtbitmap "record" into a fsmap */ STATIC int xfs_getfsmap_rtdev_rtbitmap_helper( + struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv) { - struct xfs_mount *mp = tp->t_mountp; struct xfs_getfsmap_info *info = priv; struct xfs_rmap_irec irec; xfs_daddr_t rec_daddr; @@ -535,7 +535,7 @@ xfs_getfsmap_rtdev_rtbitmap_query( do_div(alow.ar_startext, mp->m_sb.sb_rextsize); if (do_div(ahigh.ar_startext, mp->m_sb.sb_rextsize)) ahigh.ar_startext++; - error = xfs_rtalloc_query_range(tp, &alow, &ahigh, + error = xfs_rtalloc_query_range(mp, tp, &alow, &ahigh, xfs_getfsmap_rtdev_rtbitmap_helper, info); if (error) goto err; @@ -547,7 +547,7 @@ xfs_getfsmap_rtdev_rtbitmap_query( info->last = true; ahigh.ar_startext = min(mp->m_sb.sb_rextents, ahigh.ar_startext); - error = xfs_getfsmap_rtdev_rtbitmap_helper(tp, &ahigh, info); + error = xfs_getfsmap_rtdev_rtbitmap_helper(mp, tp, &ahigh, info); if (error) goto err; err: @@ -642,8 +642,7 @@ __xfs_getfsmap_datadev( info->agf_bp = NULL; } - error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, - &info->agf_bp); + error = xfs_alloc_read_agf(pag, tp, 0, &info->agf_bp); if (error) break; @@ -864,8 +863,8 @@ xfs_getfsmap( !xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[1])) return -EINVAL; - use_rmap = capable(CAP_SYS_ADMIN) && - xfs_has_rmapbt(mp); + use_rmap = xfs_has_rmapbt(mp) && + has_capability_noaudit(current, CAP_SYS_ADMIN); head->fmh_entries = 0; /* Set up our device handlers. */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 33e26690a8c4..13851c0d640b 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -17,6 +17,7 @@ #include "xfs_fsops.h" #include "xfs_trans_space.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_trace.h" @@ -40,6 +41,7 @@ xfs_resizefs_init_new_ags( xfs_agnumber_t oagcount, xfs_agnumber_t nagcount, xfs_rfsblock_t delta, + struct xfs_perag *last_pag, bool *lastag_extended) { struct xfs_mount *mp = tp->t_mountp; @@ -72,7 +74,7 @@ xfs_resizefs_init_new_ags( if (delta) { *lastag_extended = true; - error = xfs_ag_extend_space(mp, tp, id, delta); + error = xfs_ag_extend_space(last_pag, tp, delta); } return error; } @@ -95,6 +97,7 @@ xfs_growfs_data_private( xfs_agnumber_t oagcount; struct xfs_trans *tp; struct aghdr_init_data id = {}; + struct xfs_perag *last_pag; nb = in->newblocks; error = xfs_sb_validate_fsb_count(&mp->m_sb, nb); @@ -127,10 +130,9 @@ xfs_growfs_data_private( return -EINVAL; oagcount = mp->m_sb.sb_agcount; - /* allocate the new per-ag structures */ if (nagcount > oagcount) { - error = xfs_initialize_perag(mp, nagcount, &nagimax); + error = xfs_initialize_perag(mp, nagcount, nb, &nagimax); if (error) return error; } else if (nagcount < oagcount) { @@ -144,20 +146,17 @@ xfs_growfs_data_private( if (error) return error; + last_pag = xfs_perag_get(mp, oagcount - 1); if (delta > 0) { error = xfs_resizefs_init_new_ags(tp, &id, oagcount, nagcount, - delta, &lastag_extended); + delta, last_pag, &lastag_extended); } else { - static struct ratelimit_state shrink_warning = \ - RATELIMIT_STATE_INIT("shrink_warning", 86400 * HZ, 1); - ratelimit_set_flags(&shrink_warning, RATELIMIT_MSG_ON_RELEASE); - - if (__ratelimit(&shrink_warning)) - xfs_alert(mp, + xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SHRINK, "EXPERIMENTAL online shrink feature in use. Use at your own risk!"); - error = xfs_ag_shrink_space(mp, &tp, nagcount - 1, -delta); + error = xfs_ag_shrink_space(last_pag, &tp, -delta); } + xfs_perag_put(last_pag); if (error) goto out_trans_cancel; @@ -347,11 +346,8 @@ xfs_fs_counts( cnt->allocino = percpu_counter_read_positive(&mp->m_icount); cnt->freeino = percpu_counter_read_positive(&mp->m_ifree); cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) - - mp->m_alloc_set_aside; - - spin_lock(&mp->m_sb_lock); - cnt->freertx = mp->m_sb.sb_frextents; - spin_unlock(&mp->m_sb_lock); + xfs_fdblocks_unavailable(mp); + cnt->freertx = percpu_counter_read_positive(&mp->m_frextents); } /* @@ -430,46 +426,36 @@ xfs_reserve_blocks( * If the request is larger than the current reservation, reserve the * blocks before we update the reserve counters. Sample m_fdblocks and * perform a partial reservation if the request exceeds free space. + * + * The code below estimates how many blocks it can request from + * fdblocks to stash in the reserve pool. This is a classic TOCTOU + * race since fdblocks updates are not always coordinated via + * m_sb_lock. Set the reserve size even if there's not enough free + * space to fill it because mod_fdblocks will refill an undersized + * reserve when it can. */ - error = -ENOSPC; - do { - free = percpu_counter_sum(&mp->m_fdblocks) - - mp->m_alloc_set_aside; - if (free <= 0) - break; - - delta = request - mp->m_resblks; - lcounter = free - delta; - if (lcounter < 0) - /* We can't satisfy the request, just get what we can */ - fdblks_delta = free; - else - fdblks_delta = delta; - + free = percpu_counter_sum(&mp->m_fdblocks) - + xfs_fdblocks_unavailable(mp); + delta = request - mp->m_resblks; + mp->m_resblks = request; + if (delta > 0 && free > 0) { /* * We'll either succeed in getting space from the free block - * count or we'll get an ENOSPC. If we get a ENOSPC, it means - * things changed while we were calculating fdblks_delta and so - * we should try again to see if there is anything left to - * reserve. + * count or we'll get an ENOSPC. Don't set the reserved flag + * here - we don't want to reserve the extra reserve blocks + * from the reserve. * - * Don't set the reserved flag here - we don't want to reserve - * the extra reserve blocks from the reserve..... + * The desired reserve size can change after we drop the lock. + * Use mod_fdblocks to put the space into the reserve or into + * fdblocks as appropriate. */ + fdblks_delta = min(free, delta); spin_unlock(&mp->m_sb_lock); error = xfs_mod_fdblocks(mp, -fdblks_delta, 0); + if (!error) + xfs_mod_fdblocks(mp, fdblks_delta, 0); spin_lock(&mp->m_sb_lock); - } while (error == -ENOSPC); - - /* - * Update the reserve counters if blocks have been successfully - * allocated. - */ - if (!error && fdblks_delta) { - mp->m_resblks += fdblks_delta; - mp->m_resblks_avail += fdblks_delta; } - out: if (outval) { outval->resblks = mp->m_resblks; @@ -521,15 +507,18 @@ xfs_fs_goingdown( void xfs_do_force_shutdown( struct xfs_mount *mp, - int flags, + uint32_t flags, char *fname, int lnnum) { int tag; const char *why; - if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate)) + + if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate)) { + xlog_shutdown_wait(mp->m_log); return; + } if (mp->m_sb_bp) mp->m_sb_bp->b_flags |= XBF_DONE; @@ -542,6 +531,9 @@ xfs_do_force_shutdown( } else if (flags & SHUTDOWN_CORRUPT_INCORE) { tag = XFS_PTAG_SHUTDOWN_CORRUPT; why = "Corruption of in-memory data"; + } else if (flags & SHUTDOWN_CORRUPT_ONDISK) { + tag = XFS_PTAG_SHUTDOWN_CORRUPT; + why = "Corruption of on-disk metadata"; } else { tag = XFS_PTAG_SHUTDOWN_IOERROR; why = "Metadata I/O Error"; diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index f62fa652c2fd..4d0a98f920ca 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -41,5 +41,6 @@ struct xfs_globals xfs_globals = { #endif #ifdef DEBUG .pwork_threads = -1, /* automatic thread detection */ + .larp = false, /* log attribute replay */ #endif }; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index da4af2142a2b..eae7427062cf 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -23,6 +23,7 @@ #include "xfs_reflink.h" #include "xfs_ialloc.h" #include "xfs_ag.h" +#include "xfs_log_priv.h" #include <linux/iversion.h> @@ -77,7 +78,7 @@ xfs_inode_alloc( * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL * and return NULL here on ENOMEM. */ - ip = kmem_cache_alloc(xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL); + ip = alloc_inode_sb(mp->m_super, xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL); if (inode_init_always(mp->m_super, VFS_I(ip))) { kmem_cache_free(xfs_inode_cache, ip); @@ -87,6 +88,7 @@ xfs_inode_alloc( /* VFS doesn't initialise i_mode or i_state! */ VFS_I(ip)->i_mode = 0; VFS_I(ip)->i_state = 0; + mapping_set_large_folios(VFS_I(ip)->i_mapping); XFS_STATS_INC(mp, vn_active); ASSERT(atomic_read(&ip->i_pincount) == 0); @@ -96,8 +98,9 @@ xfs_inode_alloc( ip->i_ino = ino; ip->i_mount = mp; memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); - ip->i_afp = NULL; ip->i_cowfp = NULL; + memset(&ip->i_af, 0, sizeof(ip->i_af)); + ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS; memset(&ip->i_df, 0, sizeof(ip->i_df)); ip->i_flags = 0; ip->i_delayed_blks = 0; @@ -109,6 +112,8 @@ xfs_inode_alloc( INIT_WORK(&ip->i_ioend_work, xfs_end_io); INIT_LIST_HEAD(&ip->i_ioend_list); spin_lock_init(&ip->i_ioend_lock); + ip->i_next_unlinked = NULLAGINO; + ip->i_prev_unlinked = NULLAGINO; return ip; } @@ -128,10 +133,8 @@ xfs_inode_free_callback( break; } - if (ip->i_afp) { - xfs_idestroy_fork(ip->i_afp); - kmem_cache_free(xfs_ifork_cache, ip->i_afp); - } + xfs_ifork_zap_attr(ip); + if (ip->i_cowfp) { xfs_idestroy_fork(ip->i_cowfp); kmem_cache_free(xfs_ifork_cache, ip->i_cowfp); @@ -320,6 +323,7 @@ xfs_reinit_inode( inode->i_rdev = dev; inode->i_uid = uid; inode->i_gid = gid; + mapping_set_large_folios(inode->i_mapping); return error; } @@ -437,7 +441,7 @@ xfs_inodegc_queue_all( for_each_online_cpu(cpu) { gc = per_cpu_ptr(mp->m_inodegc, cpu); if (!llist_empty(&gc->list)) - queue_work_on(cpu, mp->m_inodegc_wq, &gc->work); + mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0); } } @@ -592,7 +596,7 @@ xfs_iget_cache_miss( */ if (xfs_has_v3inodes(mp) && (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) { - VFS_I(ip)->i_generation = prandom_u32(); + VFS_I(ip)->i_generation = get_random_u32(); } else { struct xfs_buf *bp; @@ -749,7 +753,8 @@ again: /* * If we have a real type for an on-disk inode, we can setup the inode - * now. If it's a new inode being created, xfs_ialloc will handle it. + * now. If it's a new inode being created, xfs_init_new_inode will + * handle it. */ if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0) xfs_setup_existing_inode(ip); @@ -870,9 +875,16 @@ xfs_reclaim_inode( if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING)) goto out_iunlock; - if (xfs_is_shutdown(ip->i_mount)) { + /* + * Check for log shutdown because aborting the inode can move the log + * tail and corrupt in memory state. This is fine if the log is shut + * down, but if the log is still active and only the mount is shut down + * then the in-memory log tail movement caused by the abort can be + * incorrectly propagated to disk. + */ + if (xlog_is_shutdown(ip->i_mount->m_log)) { xfs_iunpin_wait(ip); - xfs_iflush_abort(ip); + xfs_iflush_shutdown_abort(ip); goto reclaim; } if (xfs_ipincount(ip)) @@ -901,6 +913,7 @@ reclaim: ip->i_checked = 0; spin_unlock(&ip->i_flags_lock); + ASSERT(!ip->i_itemp || ip->i_itemp->ili_item.li_buf == NULL); xfs_iunlock(ip, XFS_ILOCK_EXCL); XFS_STATS_INC(ip->i_mount, xs_ig_reclaims); @@ -1763,7 +1776,7 @@ xfs_check_delalloc( struct xfs_inode *ip, int whichfork) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec got; struct xfs_iext_cursor icur; @@ -1830,8 +1843,8 @@ void xfs_inodegc_worker( struct work_struct *work) { - struct xfs_inodegc *gc = container_of(work, struct xfs_inodegc, - work); + struct xfs_inodegc *gc = container_of(to_delayed_work(work), + struct xfs_inodegc, work); struct llist_node *node = llist_del_all(&gc->list); struct xfs_inode *ip, *n; @@ -1851,28 +1864,30 @@ xfs_inodegc_worker( } /* - * Force all currently queued inode inactivation work to run immediately, and - * wait for the work to finish. Two pass - queue all the work first pass, wait - * for it in a second pass. + * Expedite all pending inodegc work to run immediately. This does not wait for + * completion of the work. */ void -xfs_inodegc_flush( +xfs_inodegc_push( struct xfs_mount *mp) { - struct xfs_inodegc *gc; - int cpu; - if (!xfs_is_inodegc_enabled(mp)) return; - - trace_xfs_inodegc_flush(mp, __return_address); - + trace_xfs_inodegc_push(mp, __return_address); xfs_inodegc_queue_all(mp); +} - for_each_online_cpu(cpu) { - gc = per_cpu_ptr(mp->m_inodegc, cpu); - flush_work(&gc->work); - } +/* + * Force all currently queued inode inactivation work to run immediately and + * wait for the work to finish. + */ +void +xfs_inodegc_flush( + struct xfs_mount *mp) +{ + xfs_inodegc_push(mp); + trace_xfs_inodegc_flush(mp, __return_address); + flush_workqueue(mp->m_inodegc_wq); } /* @@ -1883,18 +1898,12 @@ void xfs_inodegc_stop( struct xfs_mount *mp) { - struct xfs_inodegc *gc; - int cpu; - if (!xfs_clear_inodegc_enabled(mp)) return; xfs_inodegc_queue_all(mp); + drain_workqueue(mp->m_inodegc_wq); - for_each_online_cpu(cpu) { - gc = per_cpu_ptr(mp->m_inodegc, cpu); - cancel_work_sync(&gc->work); - } trace_xfs_inodegc_stop(mp, __return_address); } @@ -1919,13 +1928,16 @@ xfs_inodegc_want_queue_rt_file( struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; - uint64_t freertx; if (!XFS_IS_REALTIME_INODE(ip)) return false; - freertx = READ_ONCE(mp->m_sb.sb_frextents); - return freertx < mp->m_low_rtexts[XFS_LOWSP_5_PCNT]; + if (__percpu_counter_compare(&mp->m_frextents, + mp->m_low_rtexts[XFS_LOWSP_5_PCNT], + XFS_FDBLOCKS_BATCH) < 0) + return true; + + return false; } #else # define xfs_inodegc_want_queue_rt_file(ip) (false) @@ -2014,6 +2026,7 @@ xfs_inodegc_queue( struct xfs_inodegc *gc; int items; unsigned int shrinker_hits; + unsigned long queue_delay = 1; trace_xfs_inode_set_need_inactive(ip); spin_lock(&ip->i_flags_lock); @@ -2025,19 +2038,26 @@ xfs_inodegc_queue( items = READ_ONCE(gc->items); WRITE_ONCE(gc->items, items + 1); shrinker_hits = READ_ONCE(gc->shrinker_hits); - put_cpu_ptr(gc); - if (!xfs_is_inodegc_enabled(mp)) + /* + * We queue the work while holding the current CPU so that the work + * is scheduled to run on this CPU. + */ + if (!xfs_is_inodegc_enabled(mp)) { + put_cpu_ptr(gc); return; - - if (xfs_inodegc_want_queue_work(ip, items)) { - trace_xfs_inodegc_queue(mp, __return_address); - queue_work(mp->m_inodegc_wq, &gc->work); } + if (xfs_inodegc_want_queue_work(ip, items)) + queue_delay = 0; + + trace_xfs_inodegc_queue(mp, __return_address); + mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay); + put_cpu_ptr(gc); + if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) { trace_xfs_inodegc_throttle(mp, __return_address); - flush_work(&gc->work); + flush_delayed_work(&gc->work); } } @@ -2054,7 +2074,7 @@ xfs_inodegc_cpu_dead( unsigned int count = 0; dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu); - cancel_work_sync(&dead_gc->work); + cancel_delayed_work_sync(&dead_gc->work); if (llist_empty(&dead_gc->list)) return; @@ -2073,12 +2093,12 @@ xfs_inodegc_cpu_dead( llist_add_batch(first, last, &gc->list); count += READ_ONCE(gc->items); WRITE_ONCE(gc->items, count); - put_cpu_ptr(gc); if (xfs_is_inodegc_enabled(mp)) { trace_xfs_inodegc_queue(mp, __return_address); - queue_work(mp->m_inodegc_wq, &gc->work); + mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0); } + put_cpu_ptr(gc); } /* @@ -2173,7 +2193,7 @@ xfs_inodegc_shrinker_scan( unsigned int h = READ_ONCE(gc->shrinker_hits); WRITE_ONCE(gc->shrinker_hits, h + 1); - queue_work_on(cpu, mp->m_inodegc_wq, &gc->work); + mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0); no_items = false; } } @@ -2201,5 +2221,5 @@ xfs_inodegc_register_shrinker( shrink->flags = SHRINKER_NONSLAB; shrink->batch = XFS_INODEGC_SHRINKER_BATCH; - return register_shrinker(shrink); + return register_shrinker(shrink, "xfs-inodegc:%s", mp->m_super->s_id); } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 2e4cfddf8b8e..6cd180721659 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -76,6 +76,7 @@ void xfs_blockgc_stop(struct xfs_mount *mp); void xfs_blockgc_start(struct xfs_mount *mp); void xfs_inodegc_worker(struct work_struct *work); +void xfs_inodegc_push(struct xfs_mount *mp); void xfs_inodegc_flush(struct xfs_mount *mp); void xfs_inodegc_stop(struct xfs_mount *mp); void xfs_inodegc_start(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index 508e184e3b8f..b05314d48176 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -63,6 +63,7 @@ STATIC void xfs_icreate_item_release( struct xfs_log_item *lip) { + kmem_free(ICR_ITEM(lip)->ic_item.li_lv_shadow); kmem_cache_free(xfs_icreate_cache, ICR_ITEM(lip)); } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 6771f357ad2c..aa303be11576 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -20,6 +20,7 @@ #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_inode_item.h" +#include "xfs_iunlink_item.h" #include "xfs_ialloc.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" @@ -35,6 +36,7 @@ #include "xfs_bmap_btree.h" #include "xfs_reflink.h" #include "xfs_ag.h" +#include "xfs_log_priv.h" struct kmem_cache *xfs_inode_cache; @@ -124,13 +126,33 @@ xfs_ilock_attr_map_shared( { uint lock_mode = XFS_ILOCK_SHARED; - if (ip->i_afp && xfs_need_iread_extents(ip->i_afp)) + if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af)) lock_mode = XFS_ILOCK_EXCL; xfs_ilock(ip, lock_mode); return lock_mode; } /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_MMAPLOCK_SHARED, + * XFS_MMAPLOCK_EXCL, XFS_ILOCK_SHARED, XFS_ILOCK_EXCL are valid values + * to set in lock_flags. + */ +static inline void +xfs_lock_flags_assert( + uint lock_flags) +{ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); + ASSERT(lock_flags != 0); +} + +/* * In addition to i_rwsem in the VFS inode, the xfs inode contains 2 * multi-reader locks: invalidate_lock and the i_lock. This routine allows * various combinations of the locks to be obtained. @@ -167,18 +189,7 @@ xfs_ilock( { trace_xfs_ilock(ip, lock_flags, _RET_IP_); - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. - */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != - (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); + xfs_lock_flags_assert(lock_flags); if (lock_flags & XFS_IOLOCK_EXCL) { down_write_nested(&VFS_I(ip)->i_rwsem, @@ -221,18 +232,7 @@ xfs_ilock_nowait( { trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_); - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. - */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != - (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); + xfs_lock_flags_assert(lock_flags); if (lock_flags & XFS_IOLOCK_EXCL) { if (!down_write_trylock(&VFS_I(ip)->i_rwsem)) @@ -290,19 +290,7 @@ xfs_iunlock( xfs_inode_t *ip, uint lock_flags) { - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. - */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != - (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); - ASSERT(lock_flags != 0); + xfs_lock_flags_assert(lock_flags); if (lock_flags & XFS_IOLOCK_EXCL) up_write(&VFS_I(ip)->i_rwsem); @@ -378,8 +366,8 @@ xfs_isilocked( } if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) { - return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem, - (lock_flags & XFS_IOLOCK_SHARED)); + return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock, + (lock_flags & XFS_MMAPLOCK_SHARED)); } if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) { @@ -415,10 +403,12 @@ xfs_lockdep_subclass_ok( * parent locking. Care must be taken to ensure we don't overrun the subclass * storage fields in the class mask we build. */ -static inline int -xfs_lock_inumorder(int lock_mode, int subclass) +static inline uint +xfs_lock_inumorder( + uint lock_mode, + uint subclass) { - int class = 0; + uint class = 0; ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP | XFS_ILOCK_RTSUM))); @@ -463,7 +453,10 @@ xfs_lock_inodes( int inodes, uint lock_mode) { - int attempts = 0, i, j, try_lock; + int attempts = 0; + uint i; + int j; + bool try_lock; struct xfs_log_item *lp; /* @@ -488,9 +481,9 @@ xfs_lock_inodes( } else if (lock_mode & XFS_MMAPLOCK_EXCL) ASSERT(!(lock_mode & XFS_ILOCK_EXCL)); - try_lock = 0; - i = 0; again: + try_lock = false; + i = 0; for (; i < inodes; i++) { ASSERT(ips[i]); @@ -505,7 +498,7 @@ again: for (j = (i - 1); j >= 0 && !try_lock; j--) { lp = &ips[j]->i_itemp->ili_item; if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) - try_lock++; + try_lock = true; } } @@ -545,8 +538,6 @@ again: if ((attempts % 5) == 0) { delay(1); /* Don't just spin the CPU */ } - i = 0; - try_lock = 0; goto again; } } @@ -645,7 +636,7 @@ xfs_ip2xflags( flags |= FS_XFLAG_COWEXTSIZE; } - if (XFS_IFORK_Q(ip)) + if (xfs_inode_has_attr_fork(ip)) flags |= FS_XFLAG_HASATTR; return flags; } @@ -658,9 +649,9 @@ xfs_ip2xflags( */ int xfs_lookup( - xfs_inode_t *dp, - struct xfs_name *name, - xfs_inode_t **ipp, + struct xfs_inode *dp, + const struct xfs_name *name, + struct xfs_inode **ipp, struct xfs_name *ci_name) { xfs_ino_t inum; @@ -844,9 +835,8 @@ xfs_init_new_inode( * ID or one of the supplementary group IDs, the S_ISGID bit is cleared * (and only if the irix_sgid_inherit compatibility variable is set). */ - if (irix_sgid_inherit && - (inode->i_mode & S_ISGID) && - !in_group_p(i_gid_into_mnt(mnt_userns, inode))) + if (irix_sgid_inherit && (inode->i_mode & S_ISGID) && + !vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode))) inode->i_mode &= ~S_ISGID; ip->i_disk_size = 0; @@ -903,7 +893,7 @@ xfs_init_new_inode( */ if (init_xattrs && xfs_has_attr(mp)) { ip->i_forkoff = xfs_default_attroffset(ip) >> 3; - ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0); + xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0); } /* @@ -988,8 +978,8 @@ xfs_create( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns), - mapped_fsgid(mnt_userns), prid, + error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns), + mapped_fsgid(mnt_userns, &init_user_ns), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -1023,11 +1013,6 @@ xfs_create( xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; - error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK, - XFS_IEXT_DIR_MANIP_CNT(mp)); - if (error) - goto out_trans_cancel; - /* * A newly created regular or special file just has one directory * entry pointing to them, but a directory also the "." entry @@ -1142,8 +1127,8 @@ xfs_create_tmpfile( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns), - mapped_fsgid(mnt_userns), prid, + error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns), + mapped_fsgid(mnt_userns, &init_user_ns), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -1217,7 +1202,7 @@ xfs_link( { xfs_mount_t *mp = tdp->i_mount; xfs_trans_t *tp; - int error; + int error, nospace_error = 0; int resblks; trace_xfs_link(tdp, target_name); @@ -1236,24 +1221,11 @@ xfs_link( goto std_return; resblks = XFS_LINK_SPACE_RES(mp, target_name->len); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp); - if (error == -ENOSPC) { - resblks = 0; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp); - } + error = xfs_trans_alloc_dir(tdp, &M_RES(mp)->tr_link, sip, &resblks, + &tp, &nospace_error); if (error) goto std_return; - xfs_lock_two_inodes(sip, XFS_ILOCK_EXCL, tdp, XFS_ILOCK_EXCL); - - xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); - - error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK, - XFS_IEXT_DIR_MANIP_CNT(mp)); - if (error) - goto error_return; - /* * If we are using project inheritance, we only allow hard link * creation in our tree when the project IDs are the same; else @@ -1306,6 +1278,8 @@ xfs_link( error_return: xfs_trans_cancel(tp); std_return: + if (error == -ENOSPC && nospace_error) + error = nospace_error; return error; } @@ -1319,8 +1293,8 @@ xfs_itruncate_clear_reflink_flags( if (!xfs_is_reflink_inode(ip)) return; - dfork = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK); + dfork = xfs_ifork_ptr(ip, XFS_DATA_FORK); + cfork = xfs_ifork_ptr(ip, XFS_COW_FORK); if (dfork->if_bytes == 0 && cfork->if_bytes == 0) ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; if (cfork->if_bytes == 0) @@ -1669,7 +1643,7 @@ xfs_inode_needs_inactive( struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *cow_ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_ifork *cow_ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); /* * If the inode is already free, then there can be nothing @@ -1788,13 +1762,12 @@ xfs_inactive( * now. The code calls a routine that recursively deconstructs the * attribute fork. If also blows away the in-core attribute fork. */ - if (XFS_IFORK_Q(ip)) { + if (xfs_inode_has_attr_fork(ip)) { error = xfs_attr_inactive(ip); if (error) goto out; } - ASSERT(!ip->i_afp); ASSERT(ip->i_forkoff == 0); /* @@ -1827,195 +1800,69 @@ out: * because we must walk that list to find the inode that points to the inode * being removed from the unlinked hash bucket list. * - * What if we modelled the unlinked list as a collection of records capturing - * "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd - * have a fast way to look up unlinked list predecessors, which avoids the - * slow list walk. That's exactly what we do here (in-core) with a per-AG - * rhashtable. - * - * Because this is a backref cache, we ignore operational failures since the - * iunlink code can fall back to the slow bucket walk. The only errors that - * should bubble out are for obviously incorrect situations. + * Hence we keep an in-memory double linked list to link each inode on an + * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer + * based lists would require having 64 list heads in the perag, one for each + * list. This is expensive in terms of memory (think millions of AGs) and cache + * misses on lookups. Instead, use the fact that inodes on the unlinked list + * must be referenced at the VFS level to keep them on the list and hence we + * have an existence guarantee for inodes on the unlinked list. * - * All users of the backref cache MUST hold the AGI buffer lock to serialize - * access or have otherwise provided for concurrency control. + * Given we have an existence guarantee, we can use lockless inode cache lookups + * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode + * for the double linked unlinked list, and we don't need any extra locking to + * keep the list safe as all manipulations are done under the AGI buffer lock. + * Keeping the list up to date does not require memory allocation, just finding + * the XFS inode and updating the next/prev unlinked list aginos. */ -/* Capture a "X.next_unlinked = Y" relationship. */ -struct xfs_iunlink { - struct rhash_head iu_rhash_head; - xfs_agino_t iu_agino; /* X */ - xfs_agino_t iu_next_unlinked; /* Y */ -}; - -/* Unlinked list predecessor lookup hashtable construction */ -static int -xfs_iunlink_obj_cmpfn( - struct rhashtable_compare_arg *arg, - const void *obj) -{ - const xfs_agino_t *key = arg->key; - const struct xfs_iunlink *iu = obj; - - if (iu->iu_next_unlinked != *key) - return 1; - return 0; -} - -static const struct rhashtable_params xfs_iunlink_hash_params = { - .min_size = XFS_AGI_UNLINKED_BUCKETS, - .key_len = sizeof(xfs_agino_t), - .key_offset = offsetof(struct xfs_iunlink, - iu_next_unlinked), - .head_offset = offsetof(struct xfs_iunlink, iu_rhash_head), - .automatic_shrinking = true, - .obj_cmpfn = xfs_iunlink_obj_cmpfn, -}; - /* - * Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such - * relation is found. + * Find an inode on the unlinked list. This does not take references to the + * inode as we have existence guarantees by holding the AGI buffer lock and that + * only unlinked, referenced inodes can be on the unlinked inode list. If we + * don't find the inode in cache, then let the caller handle the situation. */ -static xfs_agino_t -xfs_iunlink_lookup_backref( +static struct xfs_inode * +xfs_iunlink_lookup( struct xfs_perag *pag, xfs_agino_t agino) { - struct xfs_iunlink *iu; - - iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, - xfs_iunlink_hash_params); - return iu ? iu->iu_agino : NULLAGINO; -} + struct xfs_inode *ip; -/* - * Take ownership of an iunlink cache entry and insert it into the hash table. - * If successful, the entry will be owned by the cache; if not, it is freed. - * Either way, the caller does not own @iu after this call. - */ -static int -xfs_iunlink_insert_backref( - struct xfs_perag *pag, - struct xfs_iunlink *iu) -{ - int error; + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, agino); - error = rhashtable_insert_fast(&pag->pagi_unlinked_hash, - &iu->iu_rhash_head, xfs_iunlink_hash_params); /* - * Fail loudly if there already was an entry because that's a sign of - * corruption of in-memory data. Also fail loudly if we see an error - * code we didn't anticipate from the rhashtable code. Currently we - * only anticipate ENOMEM. + * Inode not in memory or in RCU freeing limbo should not happen. + * Warn about this and let the caller handle the failure. */ - if (error) { - WARN(error != -ENOMEM, "iunlink cache insert error %d", error); - kmem_free(iu); + if (WARN_ON_ONCE(!ip || !ip->i_ino)) { + rcu_read_unlock(); + return NULL; } - /* - * Absorb any runtime errors that aren't a result of corruption because - * this is a cache and we can always fall back to bucket list scanning. - */ - if (error != 0 && error != -EEXIST) - error = 0; - return error; + ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM)); + rcu_read_unlock(); + return ip; } -/* Remember that @prev_agino.next_unlinked = @this_agino. */ +/* Update the prev pointer of the next agino. */ static int -xfs_iunlink_add_backref( +xfs_iunlink_update_backref( struct xfs_perag *pag, xfs_agino_t prev_agino, - xfs_agino_t this_agino) -{ - struct xfs_iunlink *iu; - - if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK)) - return 0; - - iu = kmem_zalloc(sizeof(*iu), KM_NOFS); - iu->iu_agino = prev_agino; - iu->iu_next_unlinked = this_agino; - - return xfs_iunlink_insert_backref(pag, iu); -} - -/* - * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked. - * If @next_unlinked is NULLAGINO, we drop the backref and exit. If there - * wasn't any such entry then we don't bother. - */ -static int -xfs_iunlink_change_backref( - struct xfs_perag *pag, - xfs_agino_t agino, - xfs_agino_t next_unlinked) + xfs_agino_t next_agino) { - struct xfs_iunlink *iu; - int error; - - /* Look up the old entry; if there wasn't one then exit. */ - iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, - xfs_iunlink_hash_params); - if (!iu) - return 0; - - /* - * Remove the entry. This shouldn't ever return an error, but if we - * couldn't remove the old entry we don't want to add it again to the - * hash table, and if the entry disappeared on us then someone's - * violated the locking rules and we need to fail loudly. Either way - * we cannot remove the inode because internal state is or would have - * been corrupt. - */ - error = rhashtable_remove_fast(&pag->pagi_unlinked_hash, - &iu->iu_rhash_head, xfs_iunlink_hash_params); - if (error) - return error; + struct xfs_inode *ip; - /* If there is no new next entry just free our item and return. */ - if (next_unlinked == NULLAGINO) { - kmem_free(iu); + /* No update necessary if we are at the end of the list. */ + if (next_agino == NULLAGINO) return 0; - } - - /* Update the entry and re-add it to the hash table. */ - iu->iu_next_unlinked = next_unlinked; - return xfs_iunlink_insert_backref(pag, iu); -} - -/* Set up the in-core predecessor structures. */ -int -xfs_iunlink_init( - struct xfs_perag *pag) -{ - return rhashtable_init(&pag->pagi_unlinked_hash, - &xfs_iunlink_hash_params); -} - -/* Free the in-core predecessor structures. */ -static void -xfs_iunlink_free_item( - void *ptr, - void *arg) -{ - struct xfs_iunlink *iu = ptr; - bool *freed_anything = arg; - *freed_anything = true; - kmem_free(iu); -} - -void -xfs_iunlink_destroy( - struct xfs_perag *pag) -{ - bool freed_anything = false; - - rhashtable_free_and_destroy(&pag->pagi_unlinked_hash, - xfs_iunlink_free_item, &freed_anything); - - ASSERT(freed_anything == false || xfs_is_shutdown(pag->pag_mount)); + ip = xfs_iunlink_lookup(pag, next_agino); + if (!ip) + return -EFSCORRUPTED; + ip->i_prev_unlinked = prev_agino; + return 0; } /* @@ -2034,7 +1881,7 @@ xfs_iunlink_update_bucket( xfs_agino_t old_value; int offset; - ASSERT(xfs_verify_agino_or_null(tp->t_mountp, pag->pag_agno, new_agino)); + ASSERT(xfs_verify_agino_or_null(pag, new_agino)); old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index, @@ -2057,88 +1904,53 @@ xfs_iunlink_update_bucket( return 0; } -/* Set an on-disk inode's next_unlinked pointer. */ -STATIC void -xfs_iunlink_update_dinode( - struct xfs_trans *tp, - struct xfs_perag *pag, - xfs_agino_t agino, - struct xfs_buf *ibp, - struct xfs_dinode *dip, - struct xfs_imap *imap, - xfs_agino_t next_agino) -{ - struct xfs_mount *mp = tp->t_mountp; - int offset; - - ASSERT(xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino)); - - trace_xfs_iunlink_update_dinode(mp, pag->pag_agno, agino, - be32_to_cpu(dip->di_next_unlinked), next_agino); - - dip->di_next_unlinked = cpu_to_be32(next_agino); - offset = imap->im_boffset + - offsetof(struct xfs_dinode, di_next_unlinked); - - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, dip); - xfs_trans_inode_buf(tp, ibp); - xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); -} - -/* Set an in-core inode's unlinked pointer and return the old value. */ -STATIC int -xfs_iunlink_update_inode( +static int +xfs_iunlink_insert_inode( struct xfs_trans *tp, - struct xfs_inode *ip, struct xfs_perag *pag, - xfs_agino_t next_agino, - xfs_agino_t *old_next_agino) + struct xfs_buf *agibp, + struct xfs_inode *ip) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_dinode *dip; - struct xfs_buf *ibp; - xfs_agino_t old_value; + struct xfs_agi *agi = agibp->b_addr; + xfs_agino_t next_agino; + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; int error; - ASSERT(xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino)); - - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp); - if (error) - return error; - dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset); - - /* Make sure the old pointer isn't garbage. */ - old_value = be32_to_cpu(dip->di_next_unlinked); - if (!xfs_verify_agino_or_null(mp, pag->pag_agno, old_value)) { - xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, - sizeof(*dip), __this_address); - error = -EFSCORRUPTED; - goto out; + /* + * Get the index into the agi hash table for the list this inode will + * go on. Make sure the pointer isn't garbage and that this inode + * isn't already on the list. + */ + next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); + if (next_agino == agino || + !xfs_verify_agino_or_null(pag, next_agino)) { + xfs_buf_mark_corrupt(agibp); + return -EFSCORRUPTED; } /* - * Since we're updating a linked list, we should never find that the - * current pointer is the same as the new value, unless we're - * terminating the list. + * Update the prev pointer in the next inode to point back to this + * inode. */ - *old_next_agino = old_value; - if (old_value == next_agino) { - if (next_agino != NULLAGINO) { - xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, - dip, sizeof(*dip), __this_address); - error = -EFSCORRUPTED; - } - goto out; + error = xfs_iunlink_update_backref(pag, agino, next_agino); + if (error) + return error; + + if (next_agino != NULLAGINO) { + /* + * There is already another inode in the bucket, so point this + * inode to the current head of the list. + */ + error = xfs_iunlink_log_inode(tp, ip, pag, next_agino); + if (error) + return error; + ip->i_next_unlinked = next_agino; } - /* Ok, update the new pointer. */ - xfs_iunlink_update_dinode(tp, pag, XFS_INO_TO_AGINO(mp, ip->i_ino), - ibp, dip, &ip->i_imap, next_agino); - return 0; -out: - xfs_trans_brelse(tp, ibp); - return error; + /* Point the head of the list to point to this inode. */ + return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino); } /* @@ -2155,11 +1967,7 @@ xfs_iunlink( { struct xfs_mount *mp = tp->t_mountp; struct xfs_perag *pag; - struct xfs_agi *agi; struct xfs_buf *agibp; - xfs_agino_t next_agino; - xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); - short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; int error; ASSERT(VFS_I(ip)->i_nlink == 0); @@ -2169,202 +1977,38 @@ xfs_iunlink( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); /* Get the agi buffer first. It ensures lock ordering on the list. */ - error = xfs_read_agi(mp, tp, pag->pag_agno, &agibp); + error = xfs_read_agi(pag, tp, &agibp); if (error) goto out; - agi = agibp->b_addr; - - /* - * Get the index into the agi hash table for the list this inode will - * go on. Make sure the pointer isn't garbage and that this inode - * isn't already on the list. - */ - next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); - if (next_agino == agino || - !xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino)) { - xfs_buf_mark_corrupt(agibp); - error = -EFSCORRUPTED; - goto out; - } - - if (next_agino != NULLAGINO) { - xfs_agino_t old_agino; - - /* - * There is already another inode in the bucket, so point this - * inode to the current head of the list. - */ - error = xfs_iunlink_update_inode(tp, ip, pag, next_agino, - &old_agino); - if (error) - goto out; - ASSERT(old_agino == NULLAGINO); - - /* - * agino has been unlinked, add a backref from the next inode - * back to agino. - */ - error = xfs_iunlink_add_backref(pag, agino, next_agino); - if (error) - goto out; - } - /* Point the head of the list to point to this inode. */ - error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino); + error = xfs_iunlink_insert_inode(tp, pag, agibp, ip); out: xfs_perag_put(pag); return error; } -/* Return the imap, dinode pointer, and buffer for an inode. */ -STATIC int -xfs_iunlink_map_ino( - struct xfs_trans *tp, - xfs_agnumber_t agno, - xfs_agino_t agino, - struct xfs_imap *imap, - struct xfs_dinode **dipp, - struct xfs_buf **bpp) -{ - struct xfs_mount *mp = tp->t_mountp; - int error; - - imap->im_blkno = 0; - error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0); - if (error) { - xfs_warn(mp, "%s: xfs_imap returned error %d.", - __func__, error); - return error; - } - - error = xfs_imap_to_bp(mp, tp, imap, bpp); - if (error) { - xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", - __func__, error); - return error; - } - - *dipp = xfs_buf_offset(*bpp, imap->im_boffset); - return 0; -} - -/* - * Walk the unlinked chain from @head_agino until we find the inode that - * points to @target_agino. Return the inode number, map, dinode pointer, - * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp. - * - * @tp, @pag, @head_agino, and @target_agino are input parameters. - * @agino, @imap, @dipp, and @bpp are all output parameters. - * - * Do not call this function if @target_agino is the head of the list. - */ -STATIC int -xfs_iunlink_map_prev( - struct xfs_trans *tp, - struct xfs_perag *pag, - xfs_agino_t head_agino, - xfs_agino_t target_agino, - xfs_agino_t *agino, - struct xfs_imap *imap, - struct xfs_dinode **dipp, - struct xfs_buf **bpp) -{ - struct xfs_mount *mp = tp->t_mountp; - xfs_agino_t next_agino; - int error; - - ASSERT(head_agino != target_agino); - *bpp = NULL; - - /* See if our backref cache can find it faster. */ - *agino = xfs_iunlink_lookup_backref(pag, target_agino); - if (*agino != NULLAGINO) { - error = xfs_iunlink_map_ino(tp, pag->pag_agno, *agino, imap, - dipp, bpp); - if (error) - return error; - - if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino) - return 0; - - /* - * If we get here the cache contents were corrupt, so drop the - * buffer and fall back to walking the bucket list. - */ - xfs_trans_brelse(tp, *bpp); - *bpp = NULL; - WARN_ON_ONCE(1); - } - - trace_xfs_iunlink_map_prev_fallback(mp, pag->pag_agno); - - /* Otherwise, walk the entire bucket until we find it. */ - next_agino = head_agino; - while (next_agino != target_agino) { - xfs_agino_t unlinked_agino; - - if (*bpp) - xfs_trans_brelse(tp, *bpp); - - *agino = next_agino; - error = xfs_iunlink_map_ino(tp, pag->pag_agno, next_agino, imap, - dipp, bpp); - if (error) - return error; - - unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked); - /* - * Make sure this pointer is valid and isn't an obvious - * infinite loop. - */ - if (!xfs_verify_agino(mp, pag->pag_agno, unlinked_agino) || - next_agino == unlinked_agino) { - XFS_CORRUPTION_ERROR(__func__, - XFS_ERRLEVEL_LOW, mp, - *dipp, sizeof(**dipp)); - error = -EFSCORRUPTED; - return error; - } - next_agino = unlinked_agino; - } - - return 0; -} - -/* - * Pull the on-disk inode from the AGI unlinked list. - */ -STATIC int -xfs_iunlink_remove( +static int +xfs_iunlink_remove_inode( struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_buf *agibp, struct xfs_inode *ip) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_agi *agi; - struct xfs_buf *agibp; - struct xfs_buf *last_ibp; - struct xfs_dinode *last_dip = NULL; + struct xfs_agi *agi = agibp->b_addr; xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); - xfs_agino_t next_agino; xfs_agino_t head_agino; short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; int error; trace_xfs_iunlink_remove(ip); - /* Get the agi buffer first. It ensures lock ordering on the list. */ - error = xfs_read_agi(mp, tp, pag->pag_agno, &agibp); - if (error) - return error; - agi = agibp->b_addr; - /* * Get the index into the agi hash table for the list this inode will * go on. Make sure the head pointer isn't garbage. */ head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); - if (!xfs_verify_agino(mp, pag->pag_agno, head_agino)) { + if (!xfs_verify_agino(pag, head_agino)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi, sizeof(*agi)); return -EFSCORRUPTED; @@ -2375,52 +2019,60 @@ xfs_iunlink_remove( * the old pointer value so that we can update whatever was previous * to us in the list to point to whatever was next in the list. */ - error = xfs_iunlink_update_inode(tp, ip, pag, NULLAGINO, &next_agino); + error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO); if (error) return error; /* - * If there was a backref pointing from the next inode back to this - * one, remove it because we've removed this inode from the list. - * - * Later, if this inode was in the middle of the list we'll update - * this inode's backref to point from the next inode. + * Update the prev pointer in the next inode to point back to previous + * inode in the chain. */ - if (next_agino != NULLAGINO) { - error = xfs_iunlink_change_backref(pag, next_agino, NULLAGINO); - if (error) - return error; - } + error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked, + ip->i_next_unlinked); + if (error) + return error; if (head_agino != agino) { - struct xfs_imap imap; - xfs_agino_t prev_agino; + struct xfs_inode *prev_ip; - /* We need to search the list for the inode being freed. */ - error = xfs_iunlink_map_prev(tp, pag, head_agino, agino, - &prev_agino, &imap, &last_dip, &last_ibp); - if (error) - return error; - - /* Point the previous inode on the list to the next inode. */ - xfs_iunlink_update_dinode(tp, pag, prev_agino, last_ibp, - last_dip, &imap, next_agino); + prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked); + if (!prev_ip) + return -EFSCORRUPTED; - /* - * Now we deal with the backref for this inode. If this inode - * pointed at a real inode, change the backref that pointed to - * us to point to our old next. If this inode was the end of - * the list, delete the backref that pointed to us. Note that - * change_backref takes care of deleting the backref if - * next_agino is NULLAGINO. - */ - return xfs_iunlink_change_backref(agibp->b_pag, agino, - next_agino); + error = xfs_iunlink_log_inode(tp, prev_ip, pag, + ip->i_next_unlinked); + prev_ip->i_next_unlinked = ip->i_next_unlinked; + } else { + /* Point the head of the list to the next unlinked inode. */ + error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, + ip->i_next_unlinked); } - /* Point the head of the list to the next unlinked inode. */ - return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, - next_agino); + ip->i_next_unlinked = NULLAGINO; + ip->i_prev_unlinked = NULLAGINO; + return error; +} + +/* + * Pull the on-disk inode from the AGI unlinked list. + */ +STATIC int +xfs_iunlink_remove( + struct xfs_trans *tp, + struct xfs_perag *pag, + struct xfs_inode *ip) +{ + struct xfs_buf *agibp; + int error; + + trace_xfs_iunlink_remove(ip); + + /* Get the agi buffer first. It ensures lock ordering on the list. */ + error = xfs_read_agi(pag, tp, &agibp); + if (error) + return error; + + return xfs_iunlink_remove_inode(tp, pag, agibp, ip); } /* @@ -2599,14 +2251,13 @@ xfs_ifree_cluster( } /* - * This is called to return an inode to the inode free list. - * The inode should already be truncated to 0 length and have - * no pages associated with it. This routine also assumes that - * the inode is already a part of the transaction. + * This is called to return an inode to the inode free list. The inode should + * already be truncated to 0 length and have no pages associated with it. This + * routine also assumes that the inode is already a part of the transaction. * - * The on-disk copy of the inode will have been added to the list - * of unlinked inodes in the AGI. We need to remove the inode from - * that list atomically with respect to freeing it here. + * The on-disk copy of the inode will have been added to the list of unlinked + * inodes in the AGI. We need to remove the inode from that list atomically with + * respect to freeing it here. */ int xfs_ifree( @@ -2628,13 +2279,16 @@ xfs_ifree( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); /* - * Pull the on-disk inode from the AGI unlinked list. + * Free the inode first so that we guarantee that the AGI lock is going + * to be taken before we remove the inode from the unlinked list. This + * makes the AGI lock -> unlinked list modification order the same as + * used in O_TMPFILE creation. */ - error = xfs_iunlink_remove(tp, pag, ip); + error = xfs_difree(tp, pag, ip->i_ino, &xic); if (error) goto out; - error = xfs_difree(tp, pag, ip->i_ino, &xic); + error = xfs_iunlink_remove(tp, pag, ip); if (error) goto out; @@ -2755,6 +2409,7 @@ xfs_remove( xfs_mount_t *mp = dp->i_mount; xfs_trans_t *tp = NULL; int is_dir = S_ISDIR(VFS_I(ip)->i_mode); + int dontcare; int error = 0; uint resblks; @@ -2772,31 +2427,24 @@ xfs_remove( goto std_return; /* - * We try to get the real space reservation first, - * allowing for directory btree deletion(s) implying - * possible bmap insert(s). If we can't get the space - * reservation then we use 0 instead, and avoid the bmap - * btree insert(s) in the directory code by, if the bmap - * insert tries to happen, instead trimming the LAST - * block from the directory. + * We try to get the real space reservation first, allowing for + * directory btree deletion(s) implying possible bmap insert(s). If we + * can't get the space reservation then we use 0 instead, and avoid the + * bmap btree insert(s) in the directory code by, if the bmap insert + * tries to happen, instead trimming the LAST block from the directory. + * + * Ignore EDQUOT and ENOSPC being returned via nospace_error because + * the directory code can handle a reservationless update and we don't + * want to prevent a user from trying to free space by deleting things. */ resblks = XFS_REMOVE_SPACE_RES(mp); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp); - if (error == -ENOSPC) { - resblks = 0; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0, - &tp); - } + error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks, + &tp, &dontcare); if (error) { ASSERT(error != -ENOSPC); goto std_return; } - xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL); - - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - /* * If we're removing a directory perform some additional validation. */ @@ -3062,10 +2710,12 @@ out_trans_abort: static int xfs_rename_alloc_whiteout( struct user_namespace *mnt_userns, + struct xfs_name *src_name, struct xfs_inode *dp, struct xfs_inode **wip) { struct xfs_inode *tmpfile; + struct qstr name; int error; error = xfs_create_tmpfile(mnt_userns, dp, S_IFCHR | WHITEOUT_MODE, @@ -3073,6 +2723,15 @@ xfs_rename_alloc_whiteout( if (error) return error; + name.name = src_name->name; + name.len = src_name->len; + error = xfs_inode_init_security(VFS_I(tmpfile), VFS_I(dp), &name); + if (error) { + xfs_finish_inode_setup(tmpfile); + xfs_irele(tmpfile); + return error; + } + /* * Prepare the tmpfile inode as if it were created through the VFS. * Complete the inode setup and flag it as linkable. nlink is already @@ -3109,7 +2768,8 @@ xfs_rename( bool new_parent = (src_dp != target_dp); bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode); int spaceres; - int error; + bool retried = false; + int error, nospace_error = 0; trace_xfs_rename(src_dp, target_dp, src_name, target_name); @@ -3122,7 +2782,8 @@ xfs_rename( * appropriately. */ if (flags & RENAME_WHITEOUT) { - error = xfs_rename_alloc_whiteout(mnt_userns, target_dp, &wip); + error = xfs_rename_alloc_whiteout(mnt_userns, src_name, + target_dp, &wip); if (error) return error; @@ -3133,9 +2794,12 @@ xfs_rename( xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip, inodes, &num_inodes); +retry: + nospace_error = 0; spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp); if (error == -ENOSPC) { + nospace_error = error; spaceres = 0; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0, &tp); @@ -3154,7 +2818,7 @@ xfs_rename( * Lock all the participating inodes. Depending upon whether * the target_name exists in the target directory, and * whether the target directory is the same as the source - * directory, we can lock from 2 to 4 inodes. + * directory, we can lock from 2 to 5 inodes. */ xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); @@ -3190,37 +2854,33 @@ xfs_rename( spaceres); /* + * Try to reserve quota to handle an expansion of the target directory. + * We'll allow the rename to continue in reservationless mode if we hit + * a space usage constraint. If we trigger reservationless mode, save + * the errno if there isn't any free space in the target directory. + */ + if (spaceres != 0) { + error = xfs_trans_reserve_quota_nblks(tp, target_dp, spaceres, + 0, false); + if (error == -EDQUOT || error == -ENOSPC) { + if (!retried) { + xfs_trans_cancel(tp); + xfs_blockgc_free_quota(target_dp, 0); + retried = true; + goto retry; + } + + nospace_error = error; + spaceres = 0; + error = 0; + } + if (error) + goto out_trans_cancel; + } + + /* * Check for expected errors before we dirty the transaction * so we can return an error without a transaction abort. - * - * Extent count overflow check: - * - * From the perspective of src_dp, a rename operation is essentially a - * directory entry remove operation. Hence the only place where we check - * for extent count overflow for src_dp is in - * xfs_bmap_del_extent_real(). xfs_bmap_del_extent_real() returns - * -ENOSPC when it detects a possible extent count overflow and in - * response, the higher layers of directory handling code do the - * following: - * 1. Data/Free blocks: XFS lets these blocks linger until a - * future remove operation removes them. - * 2. Dabtree blocks: XFS swaps the blocks with the last block in the - * Leaf space and unmaps the last block. - * - * For target_dp, there are two cases depending on whether the - * destination directory entry exists or not. - * - * When destination directory entry does not exist (i.e. target_ip == - * NULL), extent count overflow check is performed only when transaction - * has a non-zero sized space reservation associated with it. With a - * zero-sized space reservation, XFS allows a rename operation to - * continue only when the directory has sufficient free space in its - * data/leaf/free space blocks to hold the new entry. - * - * When destination directory entry exists (i.e. target_ip != NULL), all - * we need to do is change the inode number associated with the already - * existing entry. Hence there is no need to perform an extent count - * overflow check. */ if (target_ip == NULL) { /* @@ -3231,12 +2891,6 @@ xfs_rename( error = xfs_dir_canenter(tp, target_dp, target_name); if (error) goto out_trans_cancel; - } else { - error = xfs_iext_count_may_overflow(target_dp, - XFS_DATA_FORK, - XFS_IEXT_DIR_MANIP_CNT(mp)); - if (error) - goto out_trans_cancel; } } else { /* @@ -3265,11 +2919,13 @@ xfs_rename( if (inodes[i] == wip || (inodes[i] == target_ip && (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) { - struct xfs_buf *bp; - xfs_agnumber_t agno; + struct xfs_perag *pag; + struct xfs_buf *bp; - agno = XFS_INO_TO_AGNO(mp, inodes[i]->i_ino); - error = xfs_read_agi(mp, tp, agno, &bp); + pag = xfs_perag_get(mp, + XFS_INO_TO_AGNO(mp, inodes[i]->i_ino)); + error = xfs_read_agi(pag, tp, &bp); + xfs_perag_put(pag); if (error) goto out_trans_cancel; } @@ -3404,18 +3060,12 @@ xfs_rename( * inode number of the whiteout inode rather than removing it * altogether. */ - if (wip) { + if (wip) error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino, spaceres); - } else { - /* - * NOTE: We don't need to check for extent count overflow here - * because the dir remove name code will leave the dir block in - * place if the extent count would overflow. - */ + else error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, spaceres); - } if (error) goto out_trans_cancel; @@ -3435,6 +3085,8 @@ out_trans_cancel: out_release_wip: if (wip) xfs_irele(wip); + if (error == -ENOSPC && nospace_error) + error = nospace_error; return error; } @@ -3466,7 +3118,7 @@ xfs_iflush( if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), mp, XFS_ERRTAG_IFLUSH_1)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: Bad inode %Lu magic number 0x%x, ptr "PTR_FMT, + "%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); goto flush_out; } @@ -3476,7 +3128,7 @@ xfs_iflush( ip->i_df.if_format != XFS_DINODE_FMT_BTREE, mp, XFS_ERRTAG_IFLUSH_3)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: Bad regular inode %Lu, ptr "PTR_FMT, + "%s: Bad regular inode %llu, ptr "PTR_FMT, __func__, ip->i_ino, ip); goto flush_out; } @@ -3487,25 +3139,25 @@ xfs_iflush( ip->i_df.if_format != XFS_DINODE_FMT_LOCAL, mp, XFS_ERRTAG_IFLUSH_4)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: Bad directory inode %Lu, ptr "PTR_FMT, + "%s: Bad directory inode %llu, ptr "PTR_FMT, __func__, ip->i_ino, ip); goto flush_out; } } - if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) > + if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) > ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: detected corrupt incore inode %Lu, " - "total extents = %d, nblocks = %Ld, ptr "PTR_FMT, + "%s: detected corrupt incore inode %llu, " + "total extents = %llu nblocks = %lld, ptr "PTR_FMT, __func__, ip->i_ino, - ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp), + ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af), ip->i_nblocks, ip); goto flush_out; } if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize, mp, XFS_ERRTAG_IFLUSH_6)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT, + "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, ip->i_forkoff, ip); goto flush_out; } @@ -3528,7 +3180,8 @@ xfs_iflush( if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL && xfs_ifork_verify_local_data(ip)) goto flush_out; - if (ip->i_afp && ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL && + if (xfs_inode_has_attr_fork(ip) && + ip->i_af.if_format == XFS_DINODE_FMT_LOCAL && xfs_ifork_verify_local_attr(ip)) goto flush_out; @@ -3546,7 +3199,7 @@ xfs_iflush( } xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); - if (XFS_IFORK_Q(ip)) + if (xfs_inode_has_attr_fork(ip)) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); /* @@ -3611,7 +3264,7 @@ xfs_iflush_cluster( /* * We must use the safe variant here as on shutdown xfs_iflush_abort() - * can remove itself from the list. + * will remove itself from the list. */ list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { iip = (struct xfs_inode_log_item *)lip; @@ -3659,7 +3312,7 @@ xfs_iflush_cluster( * AIL, leaving a dirty/unpinned inode attached to the buffer * that otherwise looks like it should be flushed. */ - if (xfs_is_shutdown(mp)) { + if (xlog_is_shutdown(mp->m_log)) { xfs_iunpin_wait(ip); xfs_iflush_abort(ip); xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -3685,9 +3338,19 @@ xfs_iflush_cluster( } if (error) { + /* + * Shutdown first so we kill the log before we release this + * buffer. If it is an INODE_ALLOC buffer and pins the tail + * of the log, failing it before the _log_ is shut down can + * result in the log tail being moved forward in the journal + * on disk because log writes can still be taking place. Hence + * unpinning the tail will allow the ICREATE intent to be + * removed from the log an recovery will fail with uninitialised + * inode cluster buffers. + */ + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); bp->b_flags |= XBF_ASYNC; xfs_buf_ioend_fail(bp); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return error; } @@ -3783,6 +3446,50 @@ retry: return 0; } +static int +xfs_mmaplock_two_inodes_and_break_dax_layout( + struct xfs_inode *ip1, + struct xfs_inode *ip2) +{ + int error; + bool retry; + struct page *page; + + if (ip1->i_ino > ip2->i_ino) + swap(ip1, ip2); + +again: + retry = false; + /* Lock the first inode */ + xfs_ilock(ip1, XFS_MMAPLOCK_EXCL); + error = xfs_break_dax_layouts(VFS_I(ip1), &retry); + if (error || retry) { + xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); + if (error == 0 && retry) + goto again; + return error; + } + + if (ip1 == ip2) + return 0; + + /* Nested lock the second inode */ + xfs_ilock(ip2, xfs_lock_inumorder(XFS_MMAPLOCK_EXCL, 1)); + /* + * We cannot use xfs_break_dax_layouts() directly here because it may + * need to unlock & lock the XFS_MMAPLOCK_EXCL which is not suitable + * for this nested lock case. + */ + page = dax_layout_busy_page(VFS_I(ip2)->i_mapping); + if (page && page_ref_count(page) != 1) { + xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); + xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); + goto again; + } + + return 0; +} + /* * Lock two inodes so that userspace cannot initiate I/O via file syscalls or * mmap activity. @@ -3797,8 +3504,19 @@ xfs_ilock2_io_mmap( ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2)); if (ret) return ret; - filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping, - VFS_I(ip2)->i_mapping); + + if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) { + ret = xfs_mmaplock_two_inodes_and_break_dax_layout(ip1, ip2); + if (ret) { + inode_unlock(VFS_I(ip2)); + if (ip1 != ip2) + inode_unlock(VFS_I(ip1)); + return ret; + } + } else + filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping, + VFS_I(ip2)->i_mapping); + return 0; } @@ -3808,8 +3526,14 @@ xfs_iunlock2_io_mmap( struct xfs_inode *ip1, struct xfs_inode *ip2) { - filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping, - VFS_I(ip2)->i_mapping); + if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) { + xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); + if (ip1 != ip2) + xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); + } else + filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping, + VFS_I(ip2)->i_mapping); + inode_unlock(VFS_I(ip2)); if (ip1 != ip2) inode_unlock(VFS_I(ip1)); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index c447bf04205a..fa780f08dc89 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -33,9 +33,9 @@ typedef struct xfs_inode { struct xfs_imap i_imap; /* location for xfs_imap() */ /* Extent information. */ - struct xfs_ifork *i_afp; /* attribute fork pointer */ struct xfs_ifork *i_cowfp; /* copy on write extents */ struct xfs_ifork i_df; /* data fork */ + struct xfs_ifork i_af; /* attribute fork */ /* Transaction and locking information. */ struct xfs_inode_log_item *i_itemp; /* logging information */ @@ -68,6 +68,10 @@ typedef struct xfs_inode { uint64_t i_diflags2; /* XFS_DIFLAG2_... */ struct timespec64 i_crtime; /* time created */ + /* unlinked list pointers */ + xfs_agino_t i_next_unlinked; + xfs_agino_t i_prev_unlinked; + /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ @@ -77,6 +81,66 @@ typedef struct xfs_inode { struct list_head i_ioend_list; } xfs_inode_t; +static inline bool xfs_inode_has_attr_fork(struct xfs_inode *ip) +{ + return ip->i_forkoff > 0; +} + +static inline struct xfs_ifork * +xfs_ifork_ptr( + struct xfs_inode *ip, + int whichfork) +{ + switch (whichfork) { + case XFS_DATA_FORK: + return &ip->i_df; + case XFS_ATTR_FORK: + if (!xfs_inode_has_attr_fork(ip)) + return NULL; + return &ip->i_af; + case XFS_COW_FORK: + return ip->i_cowfp; + default: + ASSERT(0); + return NULL; + } +} + +static inline unsigned int xfs_inode_fork_boff(struct xfs_inode *ip) +{ + return ip->i_forkoff << 3; +} + +static inline unsigned int xfs_inode_data_fork_size(struct xfs_inode *ip) +{ + if (xfs_inode_has_attr_fork(ip)) + return xfs_inode_fork_boff(ip); + + return XFS_LITINO(ip->i_mount); +} + +static inline unsigned int xfs_inode_attr_fork_size(struct xfs_inode *ip) +{ + if (xfs_inode_has_attr_fork(ip)) + return XFS_LITINO(ip->i_mount) - xfs_inode_fork_boff(ip); + return 0; +} + +static inline unsigned int +xfs_inode_fork_size( + struct xfs_inode *ip, + int whichfork) +{ + switch (whichfork) { + case XFS_DATA_FORK: + return xfs_inode_data_fork_size(ip); + case XFS_ATTR_FORK: + return xfs_inode_attr_fork_size(ip); + default: + return 0; + } +} + /* Convert from vfs inode to xfs inode */ static inline struct xfs_inode *XFS_I(struct inode *inode) { @@ -218,6 +282,11 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) return ip->i_diflags2 & XFS_DIFLAG2_BIGTIME; } +static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) +{ + return ip->i_diflags2 & XFS_DIFLAG2_NREXT64; +} + /* * Return the buftarg used for data allocations on a given inode. */ @@ -278,12 +347,12 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) * 1<<16 - 1<<32-1 -- lockdep annotation (integers) */ -#define XFS_IOLOCK_EXCL (1<<0) -#define XFS_IOLOCK_SHARED (1<<1) -#define XFS_ILOCK_EXCL (1<<2) -#define XFS_ILOCK_SHARED (1<<3) -#define XFS_MMAPLOCK_EXCL (1<<4) -#define XFS_MMAPLOCK_SHARED (1<<5) +#define XFS_IOLOCK_EXCL (1u << 0) +#define XFS_IOLOCK_SHARED (1u << 1) +#define XFS_ILOCK_EXCL (1u << 2) +#define XFS_ILOCK_SHARED (1u << 3) +#define XFS_MMAPLOCK_EXCL (1u << 4) +#define XFS_MMAPLOCK_SHARED (1u << 5) #define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \ | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \ @@ -350,19 +419,19 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) */ #define XFS_IOLOCK_SHIFT 16 #define XFS_IOLOCK_MAX_SUBCLASS 3 -#define XFS_IOLOCK_DEP_MASK 0x000f0000 +#define XFS_IOLOCK_DEP_MASK 0x000f0000u #define XFS_MMAPLOCK_SHIFT 20 #define XFS_MMAPLOCK_NUMORDER 0 #define XFS_MMAPLOCK_MAX_SUBCLASS 3 -#define XFS_MMAPLOCK_DEP_MASK 0x00f00000 +#define XFS_MMAPLOCK_DEP_MASK 0x00f00000u #define XFS_ILOCK_SHIFT 24 -#define XFS_ILOCK_PARENT_VAL 5 +#define XFS_ILOCK_PARENT_VAL 5u #define XFS_ILOCK_MAX_SUBCLASS (XFS_ILOCK_PARENT_VAL - 1) -#define XFS_ILOCK_RTBITMAP_VAL 6 -#define XFS_ILOCK_RTSUM_VAL 7 -#define XFS_ILOCK_DEP_MASK 0xff000000 +#define XFS_ILOCK_RTBITMAP_VAL 6u +#define XFS_ILOCK_RTSUM_VAL 7u +#define XFS_ILOCK_DEP_MASK 0xff000000u #define XFS_ILOCK_PARENT (XFS_ILOCK_PARENT_VAL << XFS_ILOCK_SHIFT) #define XFS_ILOCK_RTBITMAP (XFS_ILOCK_RTBITMAP_VAL << XFS_ILOCK_SHIFT) #define XFS_ILOCK_RTSUM (XFS_ILOCK_RTSUM_VAL << XFS_ILOCK_SHIFT) @@ -402,7 +471,7 @@ enum layout_break_reason { int xfs_release(struct xfs_inode *ip); void xfs_inactive(struct xfs_inode *ip); -int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, +int xfs_lookup(struct xfs_inode *dp, const struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); int xfs_create(struct user_namespace *mnt_userns, struct xfs_inode *dp, struct xfs_name *name, @@ -462,15 +531,7 @@ xfs_itruncate_extents( } /* from xfs_file.c */ -enum xfs_prealloc_flags { - XFS_PREALLOC_SET = (1 << 1), - XFS_PREALLOC_CLEAR = (1 << 2), - XFS_PREALLOC_SYNC = (1 << 3), - XFS_PREALLOC_INVISIBLE = (1 << 4), -}; - -int xfs_update_prealloc_flags(struct xfs_inode *ip, - enum xfs_prealloc_flags flags); +int xfs_break_dax_layouts(struct inode *inode, bool *retry); int xfs_break_layouts(struct inode *inode, uint *iolock, enum layout_break_reason reason); @@ -509,9 +570,6 @@ extern struct kmem_cache *xfs_inode_cache; bool xfs_inode_needs_inactive(struct xfs_inode *ip); -int xfs_iunlink_init(struct xfs_perag *pag); -void xfs_iunlink_destroy(struct xfs_perag *pag); - void xfs_end_io(struct work_struct *work); int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 90d8e591baf8..ca2941ab6cbc 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -17,6 +17,7 @@ #include "xfs_trans_priv.h" #include "xfs_buf_item.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #include "xfs_error.h" #include <linux/iversion.h> @@ -56,7 +57,7 @@ xfs_inode_item_data_fork_size( ip->i_df.if_nextents > 0 && ip->i_df.if_bytes > 0) { /* worst case, doesn't subtract delalloc extents */ - *nbytes += XFS_IFORK_DSIZE(ip); + *nbytes += xfs_inode_data_fork_size(ip); *nvecs += 1; } break; @@ -70,7 +71,7 @@ xfs_inode_item_data_fork_size( case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & XFS_ILOG_DDATA) && ip->i_df.if_bytes > 0) { - *nbytes += roundup(ip->i_df.if_bytes, 4); + *nbytes += xlog_calc_iovec_len(ip->i_df.if_bytes); *nvecs += 1; } break; @@ -91,27 +92,27 @@ xfs_inode_item_attr_fork_size( { struct xfs_inode *ip = iip->ili_inode; - switch (ip->i_afp->if_format) { + switch (ip->i_af.if_format) { case XFS_DINODE_FMT_EXTENTS: if ((iip->ili_fields & XFS_ILOG_AEXT) && - ip->i_afp->if_nextents > 0 && - ip->i_afp->if_bytes > 0) { + ip->i_af.if_nextents > 0 && + ip->i_af.if_bytes > 0) { /* worst case, doesn't subtract unused space */ - *nbytes += XFS_IFORK_ASIZE(ip); + *nbytes += xfs_inode_attr_fork_size(ip); *nvecs += 1; } break; case XFS_DINODE_FMT_BTREE: if ((iip->ili_fields & XFS_ILOG_ABROOT) && - ip->i_afp->if_broot_bytes > 0) { - *nbytes += ip->i_afp->if_broot_bytes; + ip->i_af.if_broot_bytes > 0) { + *nbytes += ip->i_af.if_broot_bytes; *nvecs += 1; } break; case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & XFS_ILOG_ADATA) && - ip->i_afp->if_bytes > 0) { - *nbytes += roundup(ip->i_afp->if_bytes, 4); + ip->i_af.if_bytes > 0) { + *nbytes += xlog_calc_iovec_len(ip->i_af.if_bytes); *nvecs += 1; } break; @@ -142,7 +143,7 @@ xfs_inode_item_size( xfs_log_dinode_size(ip->i_mount); xfs_inode_item_data_fork_size(iip, nvecs, nbytes); - if (XFS_IFORK_Q(ip)) + if (xfs_inode_has_attr_fork(ip)) xfs_inode_item_attr_fork_size(iip, nvecs, nbytes); } @@ -203,17 +204,12 @@ xfs_inode_item_format_data_fork( ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV); if ((iip->ili_fields & XFS_ILOG_DDATA) && ip->i_df.if_bytes > 0) { - /* - * Round i_bytes up to a word boundary. - * The underlying memory is guaranteed - * to be there by xfs_idata_realloc(). - */ - data_bytes = roundup(ip->i_df.if_bytes, 4); ASSERT(ip->i_df.if_u1.if_data != NULL); ASSERT(ip->i_disk_size > 0); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL, - ip->i_df.if_u1.if_data, data_bytes); - ilf->ilf_dsize = (unsigned)data_bytes; + ip->i_df.if_u1.if_data, + ip->i_df.if_bytes); + ilf->ilf_dsize = (unsigned)ip->i_df.if_bytes; ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_DDATA; @@ -241,18 +237,18 @@ xfs_inode_item_format_attr_fork( struct xfs_inode *ip = iip->ili_inode; size_t data_bytes; - switch (ip->i_afp->if_format) { + switch (ip->i_af.if_format) { case XFS_DINODE_FMT_EXTENTS: iip->ili_fields &= ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT); if ((iip->ili_fields & XFS_ILOG_AEXT) && - ip->i_afp->if_nextents > 0 && - ip->i_afp->if_bytes > 0) { + ip->i_af.if_nextents > 0 && + ip->i_af.if_bytes > 0) { struct xfs_bmbt_rec *p; - ASSERT(xfs_iext_count(ip->i_afp) == - ip->i_afp->if_nextents); + ASSERT(xfs_iext_count(&ip->i_af) == + ip->i_af.if_nextents); p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT); data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK); @@ -269,13 +265,13 @@ xfs_inode_item_format_attr_fork( ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT); if ((iip->ili_fields & XFS_ILOG_ABROOT) && - ip->i_afp->if_broot_bytes > 0) { - ASSERT(ip->i_afp->if_broot != NULL); + ip->i_af.if_broot_bytes > 0) { + ASSERT(ip->i_af.if_broot != NULL); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_BROOT, - ip->i_afp->if_broot, - ip->i_afp->if_broot_bytes); - ilf->ilf_asize = ip->i_afp->if_broot_bytes; + ip->i_af.if_broot, + ip->i_af.if_broot_bytes); + ilf->ilf_asize = ip->i_af.if_broot_bytes; ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_ABROOT; @@ -286,18 +282,12 @@ xfs_inode_item_format_attr_fork( ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT); if ((iip->ili_fields & XFS_ILOG_ADATA) && - ip->i_afp->if_bytes > 0) { - /* - * Round i_bytes up to a word boundary. - * The underlying memory is guaranteed - * to be there by xfs_idata_realloc(). - */ - data_bytes = roundup(ip->i_afp->if_bytes, 4); - ASSERT(ip->i_afp->if_u1.if_data != NULL); + ip->i_af.if_bytes > 0) { + ASSERT(ip->i_af.if_u1.if_data != NULL); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL, - ip->i_afp->if_u1.if_data, - data_bytes); - ilf->ilf_asize = (unsigned)data_bytes; + ip->i_af.if_u1.if_data, + ip->i_af.if_bytes); + ilf->ilf_asize = (unsigned)ip->i_af.if_bytes; ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_ADATA; @@ -358,6 +348,21 @@ xfs_copy_dm_fields_to_log_dinode( } } +static inline void +xfs_inode_to_log_dinode_iext_counters( + struct xfs_inode *ip, + struct xfs_log_dinode *to) +{ + if (xfs_inode_has_large_extent_counts(ip)) { + to->di_big_nextents = xfs_ifork_nextents(&ip->i_df); + to->di_big_anextents = xfs_ifork_nextents(&ip->i_af); + to->di_nrext64_pad = 0; + } else { + to->di_nextents = xfs_ifork_nextents(&ip->i_df); + to->di_anextents = xfs_ifork_nextents(&ip->i_af); + } +} + static void xfs_inode_to_log_dinode( struct xfs_inode *ip, @@ -373,7 +378,6 @@ xfs_inode_to_log_dinode( to->di_projid_lo = ip->i_projid & 0xffff; to->di_projid_hi = ip->i_projid >> 16; - memset(to->di_pad, 0, sizeof(to->di_pad)); memset(to->di_pad3, 0, sizeof(to->di_pad3)); to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime); to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime); @@ -385,10 +389,8 @@ xfs_inode_to_log_dinode( to->di_size = ip->i_disk_size; to->di_nblocks = ip->i_nblocks; to->di_extsize = ip->i_extsize; - to->di_nextents = xfs_ifork_nextents(&ip->i_df); - to->di_anextents = xfs_ifork_nextents(ip->i_afp); to->di_forkoff = ip->i_forkoff; - to->di_aformat = xfs_ifork_format(ip->i_afp); + to->di_aformat = xfs_ifork_format(&ip->i_af); to->di_flags = ip->i_diflags; xfs_copy_dm_fields_to_log_dinode(ip, to); @@ -406,11 +408,14 @@ xfs_inode_to_log_dinode( to->di_lsn = lsn; memset(to->di_pad2, 0, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); - to->di_flushiter = 0; + to->di_v3_pad = 0; } else { to->di_version = 2; to->di_flushiter = ip->i_flushiter; + memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad)); } + + xfs_inode_to_log_dinode_iext_counters(ip, to); } /* @@ -475,7 +480,7 @@ xfs_inode_item_format( xfs_inode_item_format_core(ip, lv, &vecp); xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp); - if (XFS_IFORK_Q(ip)) { + if (xfs_inode_has_attr_fork(ip)) { xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp); } else { iip->ili_fields &= @@ -543,10 +548,17 @@ xfs_inode_item_push( uint rval = XFS_ITEM_SUCCESS; int error; - ASSERT(iip->ili_item.li_buf); + if (!bp || (ip->i_flags & XFS_ISTALE)) { + /* + * Inode item/buffer is being aborted due to cluster + * buffer deletion. Trigger a log force to have that operation + * completed and items removed from the AIL before the next push + * attempt. + */ + return XFS_ITEM_PINNED; + } - if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp) || - (ip->i_flags & XFS_ISTALE)) + if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp)) return XFS_ITEM_PINNED; if (xfs_iflags_test(ip, XFS_IFLUSHING)) @@ -720,6 +732,17 @@ xfs_iflush_ail_updates( if (INODE_ITEM(lip)->ili_flush_lsn != lip->li_lsn) continue; + /* + * dgc: Not sure how this happens, but it happens very + * occassionaly via generic/388. xfs_iflush_abort() also + * silently handles this same "under writeback but not in AIL at + * shutdown" condition via xfs_trans_ail_delete(). + */ + if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) { + ASSERT(xlog_is_shutdown(lip->li_log)); + continue; + } + lsn = xfs_ail_delete_one(ailp, lip); if (!tail_lsn && lsn) tail_lsn = lsn; @@ -822,46 +845,143 @@ xfs_buf_inode_io_fail( } /* - * This is the inode flushing abort routine. It is called when - * the filesystem is shutting down to clean up the inode state. It is - * responsible for removing the inode item from the AIL if it has not been - * re-logged and clearing the inode's flush state. + * Clear the inode logging fields so no more flushes are attempted. If we are + * on a buffer list, it is now safe to remove it because the buffer is + * guaranteed to be locked. The caller will drop the reference to the buffer + * the log item held. + */ +static void +xfs_iflush_abort_clean( + struct xfs_inode_log_item *iip) +{ + iip->ili_last_fields = 0; + iip->ili_fields = 0; + iip->ili_fsync_fields = 0; + iip->ili_flush_lsn = 0; + iip->ili_item.li_buf = NULL; + list_del_init(&iip->ili_item.li_bio_list); +} + +/* + * Abort flushing the inode from a context holding the cluster buffer locked. + * + * This is the normal runtime method of aborting writeback of an inode that is + * attached to a cluster buffer. It occurs when the inode and the backing + * cluster buffer have been freed (i.e. inode is XFS_ISTALE), or when cluster + * flushing or buffer IO completion encounters a log shutdown situation. + * + * If we need to abort inode writeback and we don't already hold the buffer + * locked, call xfs_iflush_shutdown_abort() instead as this should only ever be + * necessary in a shutdown situation. */ void xfs_iflush_abort( struct xfs_inode *ip) { struct xfs_inode_log_item *iip = ip->i_itemp; - struct xfs_buf *bp = NULL; + struct xfs_buf *bp; - if (iip) { - /* - * Clear the failed bit before removing the item from the AIL so - * xfs_trans_ail_delete() doesn't try to clear and release the - * buffer attached to the log item before we are done with it. - */ - clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags); - xfs_trans_ail_delete(&iip->ili_item, 0); + if (!iip) { + /* clean inode, nothing to do */ + xfs_iflags_clear(ip, XFS_IFLUSHING); + return; + } + + /* + * Remove the inode item from the AIL before we clear its internal + * state. Whilst the inode is in the AIL, it should have a valid buffer + * pointer for push operations to access - it is only safe to remove the + * inode from the buffer once it has been removed from the AIL. + * + * We also clear the failed bit before removing the item from the AIL + * as xfs_trans_ail_delete()->xfs_clear_li_failed() will release buffer + * references the inode item owns and needs to hold until we've fully + * aborted the inode log item and detached it from the buffer. + */ + clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags); + xfs_trans_ail_delete(&iip->ili_item, 0); + + /* + * Grab the inode buffer so can we release the reference the inode log + * item holds on it. + */ + spin_lock(&iip->ili_lock); + bp = iip->ili_item.li_buf; + xfs_iflush_abort_clean(iip); + spin_unlock(&iip->ili_lock); + xfs_iflags_clear(ip, XFS_IFLUSHING); + if (bp) + xfs_buf_rele(bp); +} + +/* + * Abort an inode flush in the case of a shutdown filesystem. This can be called + * from anywhere with just an inode reference and does not require holding the + * inode cluster buffer locked. If the inode is attached to a cluster buffer, + * it will grab and lock it safely, then abort the inode flush. + */ +void +xfs_iflush_shutdown_abort( + struct xfs_inode *ip) +{ + struct xfs_inode_log_item *iip = ip->i_itemp; + struct xfs_buf *bp; + + if (!iip) { + /* clean inode, nothing to do */ + xfs_iflags_clear(ip, XFS_IFLUSHING); + return; + } + + spin_lock(&iip->ili_lock); + bp = iip->ili_item.li_buf; + if (!bp) { + spin_unlock(&iip->ili_lock); + xfs_iflush_abort(ip); + return; + } + + /* + * We have to take a reference to the buffer so that it doesn't get + * freed when we drop the ili_lock and then wait to lock the buffer. + * We'll clean up the extra reference after we pick up the ili_lock + * again. + */ + xfs_buf_hold(bp); + spin_unlock(&iip->ili_lock); + xfs_buf_lock(bp); + + spin_lock(&iip->ili_lock); + if (!iip->ili_item.li_buf) { /* - * Clear the inode logging fields so no more flushes are - * attempted. + * Raced with another removal, hold the only reference + * to bp now. Inode should not be in the AIL now, so just clean + * up and return; */ - spin_lock(&iip->ili_lock); - iip->ili_last_fields = 0; - iip->ili_fields = 0; - iip->ili_fsync_fields = 0; - iip->ili_flush_lsn = 0; - bp = iip->ili_item.li_buf; - iip->ili_item.li_buf = NULL; - list_del_init(&iip->ili_item.li_bio_list); + ASSERT(list_empty(&iip->ili_item.li_bio_list)); + ASSERT(!test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags)); + xfs_iflush_abort_clean(iip); spin_unlock(&iip->ili_lock); + xfs_iflags_clear(ip, XFS_IFLUSHING); + xfs_buf_relse(bp); + return; } - xfs_iflags_clear(ip, XFS_IFLUSHING); - if (bp) - xfs_buf_rele(bp); + + /* + * Got two references to bp. The first will get dropped by + * xfs_iflush_abort() when the item is removed from the buffer list, but + * we can't drop our reference until _abort() returns because we have to + * unlock the buffer as well. Hence we abort and then unlock and release + * our reference to the buffer. + */ + ASSERT(iip->ili_item.li_buf == bp); + spin_unlock(&iip->ili_lock); + xfs_iflush_abort(ip); + xfs_buf_relse(bp); } + /* * convert an xfs_inode_log_format struct from the old 32 bit version * (which can have different field alignments) to the native 64 bit version diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 1a302000d604..bbd836a44ff0 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -44,6 +44,7 @@ static inline int xfs_inode_clean(struct xfs_inode *ip) extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); extern void xfs_inode_item_destroy(struct xfs_inode *); extern void xfs_iflush_abort(struct xfs_inode *); +extern void xfs_iflush_shutdown_abort(struct xfs_inode *); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, struct xfs_inode_log_format *); diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 239dd2e3384e..0e5dba2343ea 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -142,6 +142,29 @@ xfs_log_dinode_to_disk_ts( return ts; } +static inline bool xfs_log_dinode_has_large_extent_counts( + const struct xfs_log_dinode *ld) +{ + return ld->di_version >= 3 && + (ld->di_flags2 & XFS_DIFLAG2_NREXT64); +} + +static inline void +xfs_log_dinode_to_disk_iext_counters( + struct xfs_log_dinode *from, + struct xfs_dinode *to) +{ + if (xfs_log_dinode_has_large_extent_counts(from)) { + to->di_big_nextents = cpu_to_be64(from->di_big_nextents); + to->di_big_anextents = cpu_to_be32(from->di_big_anextents); + to->di_nrext64_pad = cpu_to_be16(from->di_nrext64_pad); + } else { + to->di_nextents = cpu_to_be32(from->di_nextents); + to->di_anextents = cpu_to_be16(from->di_anextents); + } + +} + STATIC void xfs_log_dinode_to_disk( struct xfs_log_dinode *from, @@ -158,7 +181,6 @@ xfs_log_dinode_to_disk( to->di_nlink = cpu_to_be32(from->di_nlink); to->di_projid_lo = cpu_to_be16(from->di_projid_lo); to->di_projid_hi = cpu_to_be16(from->di_projid_hi); - memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime); to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime); @@ -167,8 +189,6 @@ xfs_log_dinode_to_disk( to->di_size = cpu_to_be64(from->di_size); to->di_nblocks = cpu_to_be64(from->di_nblocks); to->di_extsize = cpu_to_be32(from->di_extsize); - to->di_nextents = cpu_to_be32(from->di_nextents); - to->di_anextents = cpu_to_be16(from->di_anextents); to->di_forkoff = from->di_forkoff; to->di_aformat = from->di_aformat; to->di_dmevmask = cpu_to_be32(from->di_dmevmask); @@ -184,12 +204,66 @@ xfs_log_dinode_to_disk( to->di_cowextsize = cpu_to_be32(from->di_cowextsize); to->di_ino = cpu_to_be64(from->di_ino); to->di_lsn = cpu_to_be64(lsn); - memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + memset(to->di_pad2, 0, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &from->di_uuid); - to->di_flushiter = 0; + to->di_v3_pad = 0; } else { to->di_flushiter = cpu_to_be16(from->di_flushiter); + memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad)); } + + xfs_log_dinode_to_disk_iext_counters(from, to); +} + +STATIC int +xlog_dinode_verify_extent_counts( + struct xfs_mount *mp, + struct xfs_log_dinode *ldip) +{ + xfs_extnum_t nextents; + xfs_aextnum_t anextents; + + if (xfs_log_dinode_has_large_extent_counts(ldip)) { + if (!xfs_has_large_extent_counts(mp) || + (ldip->di_nrext64_pad != 0)) { + XFS_CORRUPTION_ERROR( + "Bad log dinode large extent count format", + XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); + xfs_alert(mp, + "Bad inode 0x%llx, large extent counts %d, padding 0x%x", + ldip->di_ino, xfs_has_large_extent_counts(mp), + ldip->di_nrext64_pad); + return -EFSCORRUPTED; + } + + nextents = ldip->di_big_nextents; + anextents = ldip->di_big_anextents; + } else { + if (ldip->di_version == 3 && ldip->di_v3_pad != 0) { + XFS_CORRUPTION_ERROR( + "Bad log dinode di_v3_pad", + XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); + xfs_alert(mp, + "Bad inode 0x%llx, di_v3_pad 0x%llx", + ldip->di_ino, ldip->di_v3_pad); + return -EFSCORRUPTED; + } + + nextents = ldip->di_nextents; + anextents = ldip->di_anextents; + } + + if (unlikely(nextents + anextents > ldip->di_nblocks)) { + XFS_CORRUPTION_ERROR("Bad log dinode extent counts", + XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); + xfs_alert(mp, + "Bad inode 0x%llx, large extent counts %d, nextents 0x%llx, anextents 0x%x, nblocks 0x%llx", + ldip->di_ino, xfs_has_large_extent_counts(mp), nextents, + anextents, ldip->di_nblocks); + return -EFSCORRUPTED; + } + + return 0; } STATIC int @@ -247,7 +321,7 @@ xlog_recover_inode_commit_pass2( */ if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) { xfs_alert(mp, - "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld", + "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %lld", __func__, dip, bp, in_f->ilf_ino); error = -EFSCORRUPTED; goto out_release; @@ -255,7 +329,7 @@ xlog_recover_inode_commit_pass2( ldip = item->ri_buf[1].i_addr; if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) { xfs_alert(mp, - "%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld", + "%s: Bad inode log record, rec ptr "PTR_FMT", ino %lld", __func__, item, in_f->ilf_ino); error = -EFSCORRUPTED; goto out_release; @@ -317,13 +391,12 @@ xlog_recover_inode_commit_pass2( if (unlikely(S_ISREG(ldip->di_mode))) { if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && (ldip->di_format != XFS_DINODE_FMT_BTREE)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); + XFS_CORRUPTION_ERROR( + "Bad log dinode data fork format for regular file", + XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); xfs_alert(mp, - "%s: Bad regular inode log record, rec ptr "PTR_FMT", " - "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld", - __func__, item, dip, bp, in_f->ilf_ino); + "Bad inode 0x%llx, data fork format 0x%x", + in_f->ilf_ino, ldip->di_format); error = -EFSCORRUPTED; goto out_release; } @@ -331,49 +404,37 @@ xlog_recover_inode_commit_pass2( if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && (ldip->di_format != XFS_DINODE_FMT_BTREE) && (ldip->di_format != XFS_DINODE_FMT_LOCAL)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); + XFS_CORRUPTION_ERROR( + "Bad log dinode data fork format for directory", + XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); xfs_alert(mp, - "%s: Bad dir inode log record, rec ptr "PTR_FMT", " - "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld", - __func__, item, dip, bp, in_f->ilf_ino); + "Bad inode 0x%llx, data fork format 0x%x", + in_f->ilf_ino, ldip->di_format); error = -EFSCORRUPTED; goto out_release; } } - if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){ - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); - xfs_alert(mp, - "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " - "dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld", - __func__, item, dip, bp, in_f->ilf_ino, - ldip->di_nextents + ldip->di_anextents, - ldip->di_nblocks); - error = -EFSCORRUPTED; + + error = xlog_dinode_verify_extent_counts(mp, ldip); + if (error) goto out_release; - } + if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); + XFS_CORRUPTION_ERROR("Bad log dinode fork offset", + XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); xfs_alert(mp, - "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " - "dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__, - item, dip, bp, in_f->ilf_ino, ldip->di_forkoff); + "Bad inode 0x%llx, di_forkoff 0x%x", + in_f->ilf_ino, ldip->di_forkoff); error = -EFSCORRUPTED; goto out_release; } isize = xfs_log_dinode_size(mp); if (unlikely(item->ri_buf[1].i_len > isize)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); + XFS_CORRUPTION_ERROR("Bad log dinode size", XFS_ERRLEVEL_LOW, + mp, ldip, sizeof(*ldip)); xfs_alert(mp, - "%s: Bad inode log record length %d, rec ptr "PTR_FMT, - __func__, item->ri_buf[1].i_len, item); + "Bad inode 0x%llx log dinode size 0x%x", + in_f->ilf_ino, item->ri_buf[1].i_len); error = -EFSCORRUPTED; goto out_release; } @@ -401,7 +462,7 @@ xlog_recover_inode_commit_pass2( ASSERT(in_f->ilf_size <= 4); ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK)); ASSERT(!(fields & XFS_ILOG_DFORK) || - (len == in_f->ilf_dsize)); + (len == xlog_calc_iovec_len(in_f->ilf_dsize))); switch (fields & XFS_ILOG_DFORK) { case XFS_ILOG_DDATA: @@ -436,7 +497,7 @@ xlog_recover_inode_commit_pass2( } len = item->ri_buf[attr_index].i_len; src = item->ri_buf[attr_index].i_addr; - ASSERT(len == in_f->ilf_asize); + ASSERT(len == xlog_calc_iovec_len(in_f->ilf_asize)); switch (in_f->ilf_fields & XFS_ILOG_AFORK) { case XFS_ILOG_ADATA: diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 174cd8950cb6..1f783e979629 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -15,6 +15,8 @@ #include "xfs_iwalk.h" #include "xfs_itable.h" #include "xfs_error.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" @@ -35,8 +37,7 @@ #include "xfs_health.h" #include "xfs_reflink.h" #include "xfs_ioctl.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" +#include "xfs_xattr.h" #include <linux/mount.h> #include <linux/namei.h> @@ -372,7 +373,7 @@ int xfs_ioc_attr_list( struct xfs_inode *dp, void __user *ubuf, - int bufsize, + size_t bufsize, int flags, struct xfs_attrlist_cursor __user *ucursor) { @@ -524,7 +525,7 @@ xfs_attrmulti_attr_set( args.valuelen = len; } - error = xfs_attr_set(&args); + error = xfs_attr_change(&args); if (!error && (flags & XFS_IOC_ATTR_ROOT)) xfs_forget_acl(inode, name); kfree(args.value); @@ -627,86 +628,6 @@ xfs_attrmulti_by_handle( return error; } -int -xfs_ioc_space( - struct file *filp, - xfs_flock64_t *bf) -{ - struct inode *inode = file_inode(filp); - struct xfs_inode *ip = XFS_I(inode); - struct iattr iattr; - enum xfs_prealloc_flags flags = XFS_PREALLOC_CLEAR; - uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - int error; - - if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) - return -EPERM; - - if (!(filp->f_mode & FMODE_WRITE)) - return -EBADF; - - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - - if (xfs_is_always_cow_inode(ip)) - return -EOPNOTSUPP; - - if (filp->f_flags & O_DSYNC) - flags |= XFS_PREALLOC_SYNC; - if (filp->f_mode & FMODE_NOCMTIME) - flags |= XFS_PREALLOC_INVISIBLE; - - error = mnt_want_write_file(filp); - if (error) - return error; - - xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); - if (error) - goto out_unlock; - inode_dio_wait(inode); - - switch (bf->l_whence) { - case 0: /*SEEK_SET*/ - break; - case 1: /*SEEK_CUR*/ - bf->l_start += filp->f_pos; - break; - case 2: /*SEEK_END*/ - bf->l_start += XFS_ISIZE(ip); - break; - default: - error = -EINVAL; - goto out_unlock; - } - - if (bf->l_start < 0 || bf->l_start > inode->i_sb->s_maxbytes) { - error = -EINVAL; - goto out_unlock; - } - - if (bf->l_start > XFS_ISIZE(ip)) { - error = xfs_alloc_file_space(ip, XFS_ISIZE(ip), - bf->l_start - XFS_ISIZE(ip), 0); - if (error) - goto out_unlock; - } - - iattr.ia_valid = ATTR_SIZE; - iattr.ia_size = bf->l_start; - error = xfs_vn_setattr_size(file_mnt_user_ns(filp), file_dentry(filp), - &iattr); - if (error) - goto out_unlock; - - error = xfs_update_prealloc_flags(ip, flags); - -out_unlock: - xfs_iunlock(ip, iolock); - mnt_drop_write_file(filp); - return error; -} - /* Return 0 on success or positive error */ int xfs_fsbulkstat_one_fmt( @@ -893,6 +814,9 @@ xfs_bulk_ireq_setup( if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount) return -ECANCELED; + if (hdr->flags & XFS_BULK_IREQ_NREXT64) + breq->flags |= XFS_IBULK_NREXT64; + return 0; } @@ -1031,6 +955,7 @@ xfs_ioc_ag_geometry( struct xfs_mount *mp, void __user *arg) { + struct xfs_perag *pag; struct xfs_ag_geometry ageo; int error; @@ -1041,7 +966,12 @@ xfs_ioc_ag_geometry( if (memchr_inv(&ageo.ag_reserved, 0, sizeof(ageo.ag_reserved))) return -EINVAL; - error = xfs_ag_get_geometry(mp, ageo.ag_number, &ageo); + pag = xfs_perag_get(mp, ageo.ag_number); + if (!pag) + return -EINVAL; + + error = xfs_ag_get_geometry(pag, &ageo); + xfs_perag_put(pag); if (error) return error; @@ -1061,7 +991,7 @@ xfs_fill_fsxattr( struct fileattr *fa) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); fileattr_fill_xflags(fa, xfs_ip2xflags(ip)); @@ -1172,7 +1102,8 @@ xfs_flags2diflags2( { uint64_t di_flags2 = (ip->i_diflags2 & (XFS_DIFLAG2_REFLINK | - XFS_DIFLAG2_BIGTIME)); + XFS_DIFLAG2_BIGTIME | + XFS_DIFLAG2_NREXT64)); if (xflags & FS_XFLAG_DAX) di_flags2 |= XFS_DIFLAG2_DAX; @@ -1269,7 +1200,7 @@ xfs_ioctl_setattr_get_trans( goto out_error; error = xfs_trans_alloc_ichange(ip, NULL, NULL, pdqp, - capable(CAP_FOWNER), &tp); + has_capability_noaudit(current, CAP_FOWNER), &tp); if (error) goto out_error; @@ -1544,7 +1475,7 @@ xfs_ioc_getbmap( if (bmx.bmv_count < 2) return -EINVAL; - if (bmx.bmv_count > ULONG_MAX / recsize) + if (bmx.bmv_count >= INT_MAX / recsize) return -ENOMEM; buf = kvcalloc(bmx.bmv_count, sizeof(*buf), GFP_KERNEL); @@ -1935,6 +1866,15 @@ xfs_fs_eofblocks_from_user( } /* + * These long-unused ioctls were removed from the official ioctl API in 5.17, + * but retain these definitions so that we can log warnings about them. + */ +#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64) +#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64) +#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) +#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) + +/* * Note: some of the ioctl's return positive numbers as a * byte count indicating success, such as readlink_by_handle. * So we don't "sign flip" like most other routines. This means @@ -1964,13 +1904,11 @@ xfs_file_ioctl( case XFS_IOC_ALLOCSP: case XFS_IOC_FREESP: case XFS_IOC_ALLOCSP64: - case XFS_IOC_FREESP64: { - xfs_flock64_t bf; - - if (copy_from_user(&bf, arg, sizeof(bf))) - return -EFAULT; - return xfs_ioc_space(filp, &bf); - } + case XFS_IOC_FREESP64: + xfs_warn_once(mp, + "%s should use fallocate; XFS_IOC_{ALLOC,FREE}SP ioctl unsupported", + current->comm); + return -ENOTTY; case XFS_IOC_DIOINFO: { struct xfs_buftarg *target = xfs_inode_buftarg(ip); struct dioattr da; diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index 28453a6d4461..d4abba2c13c1 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -10,12 +10,6 @@ struct xfs_bstat; struct xfs_ibulk; struct xfs_inogrp; - -extern int -xfs_ioc_space( - struct file *filp, - xfs_flock64_t *bf); - int xfs_ioc_swapext( xfs_swapext_t *sxp); @@ -38,8 +32,9 @@ xfs_readlink_by_handle( int xfs_ioc_attrmulti_one(struct file *parfilp, struct inode *inode, uint32_t opcode, void __user *uname, void __user *value, uint32_t *len, uint32_t flags); -int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf, int bufsize, - int flags, struct xfs_attrlist_cursor __user *ucursor); +int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf, + size_t bufsize, int flags, + struct xfs_attrlist_cursor __user *ucursor); extern struct dentry * xfs_handle_to_dentry( diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 8783af203cfc..2f54b701eead 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -17,6 +17,8 @@ #include "xfs_itable.h" #include "xfs_fsops.h" #include "xfs_rtalloc.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_ioctl.h" #include "xfs_ioctl32.h" @@ -28,22 +30,6 @@ #ifdef BROKEN_X86_ALIGNMENT STATIC int -xfs_compat_flock64_copyin( - xfs_flock64_t *bf, - compat_xfs_flock64_t __user *arg32) -{ - if (get_user(bf->l_type, &arg32->l_type) || - get_user(bf->l_whence, &arg32->l_whence) || - get_user(bf->l_start, &arg32->l_start) || - get_user(bf->l_len, &arg32->l_len) || - get_user(bf->l_sysid, &arg32->l_sysid) || - get_user(bf->l_pid, &arg32->l_pid) || - copy_from_user(bf->l_pad, &arg32->l_pad, 4*sizeof(u32))) - return -EFAULT; - return 0; -} - -STATIC int xfs_compat_ioc_fsgeometry_v1( struct xfs_mount *mp, compat_xfs_fsop_geom_v1_t __user *arg32) @@ -233,7 +219,7 @@ xfs_compat_ioc_fsbulkstat( inumbers_fmt_pf inumbers_func = xfs_fsinumbers_fmt_compat; bulkstat_one_fmt_pf bs_one_func = xfs_fsbulkstat_one_fmt_compat; -#ifdef CONFIG_X86_X32 +#ifdef CONFIG_X86_X32_ABI if (in_x32_syscall()) { /* * ... but on x32 the input xfs_fsop_bulkreq has pointers @@ -445,17 +431,6 @@ xfs_file_compat_ioctl( switch (cmd) { #if defined(BROKEN_X86_ALIGNMENT) - case XFS_IOC_ALLOCSP_32: - case XFS_IOC_FREESP_32: - case XFS_IOC_ALLOCSP64_32: - case XFS_IOC_FREESP64_32: { - struct xfs_flock64 bf; - - if (xfs_compat_flock64_copyin(&bf, arg)) - return -EFAULT; - cmd = _NATIVE_IOC(cmd, struct xfs_flock64); - return xfs_ioc_space(filp, &bf); - } case XFS_IOC_FSGEOMETRY_V1_32: return xfs_compat_ioc_fsgeometry_v1(ip->i_mount, arg); case XFS_IOC_FSGROWFSDATA_32: { diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h index 9929482bf358..c14852362fce 100644 --- a/fs/xfs/xfs_ioctl32.h +++ b/fs/xfs/xfs_ioctl32.h @@ -142,28 +142,6 @@ typedef struct compat_xfs_fsop_attrmulti_handlereq { _IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq) #ifdef BROKEN_X86_ALIGNMENT -/* on ia32 l_start is on a 32-bit boundary */ -typedef struct compat_xfs_flock64 { - __s16 l_type; - __s16 l_whence; - __s64 l_start __attribute__((packed)); - /* len == 0 means until end of file */ - __s64 l_len __attribute__((packed)); - __s32 l_sysid; - __u32 l_pid; - __s32 l_pad[4]; /* reserve area */ -} compat_xfs_flock64_t; - -#define XFS_IOC_ALLOCSP_32 _IOW('X', 10, struct compat_xfs_flock64) -#define XFS_IOC_FREESP_32 _IOW('X', 11, struct compat_xfs_flock64) -#define XFS_IOC_ALLOCSP64_32 _IOW('X', 36, struct compat_xfs_flock64) -#define XFS_IOC_FREESP64_32 _IOW('X', 37, struct compat_xfs_flock64) -#define XFS_IOC_RESVSP_32 _IOW('X', 40, struct compat_xfs_flock64) -#define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64) -#define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64) -#define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64) -#define XFS_IOC_ZERO_RANGE_32 _IOW('X', 57, struct compat_xfs_flock64) - typedef struct compat_xfs_fsop_geom_v1 { __u32 blocksize; /* filesystem (data) block size */ __u32 rtextsize; /* realtime extent size */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 093758440ad5..07da03976ec1 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -28,7 +28,6 @@ #include "xfs_dquot.h" #include "xfs_reflink.h" - #define XFS_ALLOC_ALIGN(mp, off) \ (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log) @@ -54,7 +53,8 @@ xfs_bmbt_to_iomap( struct xfs_inode *ip, struct iomap *iomap, struct xfs_bmbt_irec *imap, - u16 flags) + unsigned int mapping_flags, + u16 iomap_flags) { struct xfs_mount *mp = ip->i_mount; struct xfs_buftarg *target = xfs_inode_buftarg(ip); @@ -71,16 +71,22 @@ xfs_bmbt_to_iomap( iomap->type = IOMAP_DELALLOC; } else { iomap->addr = BBTOB(xfs_fsb_to_db(ip, imap->br_startblock)); + if (mapping_flags & IOMAP_DAX) + iomap->addr += target->bt_dax_part_off; + if (imap->br_state == XFS_EXT_UNWRITTEN) iomap->type = IOMAP_UNWRITTEN; else iomap->type = IOMAP_MAPPED; + } iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); - iomap->bdev = target->bt_bdev; - iomap->dax_dev = target->bt_daxdev; - iomap->flags = flags; + if (mapping_flags & IOMAP_DAX) + iomap->dax_dev = target->bt_daxdev; + else + iomap->bdev = target->bt_bdev; + iomap->flags = iomap_flags; if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) @@ -153,7 +159,7 @@ xfs_iomap_eof_align_last_fsb( struct xfs_inode *ip, xfs_fileoff_t end_fsb) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); xfs_extlen_t extsz = xfs_get_extsz_hint(ip); xfs_extlen_t align = xfs_eof_alignment(ip); struct xfs_bmbt_irec irec; @@ -188,6 +194,7 @@ xfs_iomap_write_direct( struct xfs_inode *ip, xfs_fileoff_t offset_fsb, xfs_fileoff_t count_fsb, + unsigned int flags, struct xfs_bmbt_irec *imap) { struct xfs_mount *mp = ip->i_mount; @@ -229,7 +236,7 @@ xfs_iomap_write_direct( * the reserve block pool for bmbt block allocation if there is no space * left but we need to do unwritten extent conversion. */ - if (IS_DAX(VFS_I(ip))) { + if (flags & IOMAP_DAX) { bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO; if (imap->br_state == XFS_EXT_UNWRITTEN) { force = true; @@ -244,6 +251,8 @@ xfs_iomap_write_direct( return error; error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, nr_exts); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, nr_exts); if (error) goto out_trans_cancel; @@ -361,7 +370,7 @@ xfs_iomap_prealloc_size( struct xfs_iext_cursor ncur = *icur; struct xfs_bmbt_irec prev, got; struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); int64_t freesp; xfs_fsblock_t qblocks; @@ -395,7 +404,7 @@ xfs_iomap_prealloc_size( */ plen = prev.br_blockcount; while (xfs_iext_prev_extent(ifp, &ncur, &got)) { - if (plen > MAXEXTLEN / 2 || + if (plen > XFS_MAX_BMBT_EXTLEN / 2 || isnullstartblock(got.br_startblock) || got.br_startoff + got.br_blockcount != prev.br_startoff || got.br_startblock + got.br_blockcount != prev.br_startblock) @@ -407,23 +416,23 @@ xfs_iomap_prealloc_size( /* * If the size of the extents is greater than half the maximum extent * length, then use the current offset as the basis. This ensures that - * for large files the preallocation size always extends to MAXEXTLEN - * rather than falling short due to things like stripe unit/width - * alignment of real extents. + * for large files the preallocation size always extends to + * XFS_BMBT_MAX_EXTLEN rather than falling short due to things like stripe + * unit/width alignment of real extents. */ alloc_blocks = plen * 2; - if (alloc_blocks > MAXEXTLEN) + if (alloc_blocks > XFS_MAX_BMBT_EXTLEN) alloc_blocks = XFS_B_TO_FSB(mp, offset); qblocks = alloc_blocks; /* - * MAXEXTLEN is not a power of two value but we round the prealloc down - * to the nearest power of two value after throttling. To prevent the - * round down from unconditionally reducing the maximum supported - * prealloc size, we round up first, apply appropriate throttling, - * round down and cap the value to MAXEXTLEN. + * XFS_BMBT_MAX_EXTLEN is not a power of two value but we round the prealloc + * down to the nearest power of two value after throttling. To prevent + * the round down from unconditionally reducing the maximum supported + * prealloc size, we round up first, apply appropriate throttling, round + * down and cap the value to XFS_BMBT_MAX_EXTLEN. */ - alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN), + alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(XFS_MAX_BMBT_EXTLEN), alloc_blocks); freesp = percpu_counter_read_positive(&mp->m_fdblocks); @@ -471,14 +480,14 @@ xfs_iomap_prealloc_size( */ if (alloc_blocks) alloc_blocks = rounddown_pow_of_two(alloc_blocks); - if (alloc_blocks > MAXEXTLEN) - alloc_blocks = MAXEXTLEN; + if (alloc_blocks > XFS_MAX_BMBT_EXTLEN) + alloc_blocks = XFS_MAX_BMBT_EXTLEN; /* * If we are still trying to allocate more space than is * available, squash the prealloc hard. This can happen if we * have a large file on a small filesystem and the above - * lowspace thresholds are smaller than MAXEXTLEN. + * lowspace thresholds are smaller than XFS_BMBT_MAX_EXTLEN. */ while (alloc_blocks && alloc_blocks >= freesp) alloc_blocks >>= 4; @@ -548,6 +557,9 @@ xfs_iomap_write_unwritten( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_WRITE_UNWRITTEN_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_WRITE_UNWRITTEN_CNT); if (error) goto error_on_bmapi_transaction; @@ -620,7 +632,7 @@ imap_needs_alloc( imap->br_startblock == DELAYSTARTBLOCK) return true; /* we convert unwritten extents before copying the data for DAX */ - if (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN) + if ((flags & IOMAP_DAX) && imap->br_state == XFS_EXT_UNWRITTEN) return true; return false; } @@ -652,7 +664,7 @@ xfs_ilock_for_iomap( unsigned flags, unsigned *lockmode) { - unsigned mode = XFS_ILOCK_SHARED; + unsigned int mode = *lockmode; bool is_write = flags & (IOMAP_WRITE | IOMAP_ZERO); /* @@ -730,7 +742,7 @@ xfs_direct_write_iomap_begin( int nimaps = 1, error = 0; bool shared = false; u16 iomap_flags = 0; - unsigned lockmode; + unsigned int lockmode = XFS_ILOCK_SHARED; ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO)); @@ -761,7 +773,8 @@ xfs_direct_write_iomap_begin( /* may drop and re-acquire the ilock */ error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared, - &lockmode, flags & IOMAP_DIRECT); + &lockmode, + (flags & IOMAP_DIRECT) || IS_DAX(inode)); if (error) goto out_unlock; if (shared) @@ -800,7 +813,7 @@ xfs_direct_write_iomap_begin( xfs_iunlock(ip, lockmode); trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags); allocate_blocks: error = -EAGAIN; @@ -826,23 +839,24 @@ allocate_blocks: xfs_iunlock(ip, lockmode); error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb, - &imap); + flags, &imap); if (error) return error; trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags | IOMAP_F_NEW); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, + iomap_flags | IOMAP_F_NEW); out_found_cow: xfs_iunlock(ip, lockmode); length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); if (imap.br_startblock != HOLESTARTBLOCK) { - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0); + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); if (error) return error; } - return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED); out_unlock: if (lockmode) @@ -855,6 +869,33 @@ const struct iomap_ops xfs_direct_write_iomap_ops = { }; static int +xfs_dax_write_iomap_end( + struct inode *inode, + loff_t pos, + loff_t length, + ssize_t written, + unsigned flags, + struct iomap *iomap) +{ + struct xfs_inode *ip = XFS_I(inode); + + if (!xfs_is_cow_inode(ip)) + return 0; + + if (!written) { + xfs_reflink_cancel_cow_range(ip, pos, length, true); + return 0; + } + + return xfs_reflink_end_cow(ip, pos, written); +} + +const struct iomap_ops xfs_dax_write_iomap_ops = { + .iomap_begin = xfs_direct_write_iomap_begin, + .iomap_end = xfs_dax_write_iomap_end, +}; + +static int xfs_buffered_write_iomap_begin( struct inode *inode, loff_t offset, @@ -873,6 +914,7 @@ xfs_buffered_write_iomap_begin( bool eof = false, cow_eof = false, shared = false; int allocfork = XFS_DATA_FORK; int error = 0; + unsigned int lockmode = XFS_ILOCK_EXCL; if (xfs_is_shutdown(mp)) return -EIO; @@ -884,7 +926,9 @@ xfs_buffered_write_iomap_begin( ASSERT(!XFS_IS_REALTIME_INODE(ip)); - xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_ilock_for_iomap(ip, flags, &lockmode); + if (error) + return error; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { @@ -1052,23 +1096,24 @@ retry: */ xfs_iunlock(ip, XFS_ILOCK_EXCL); trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, IOMAP_F_NEW); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW); found_imap: xfs_iunlock(ip, XFS_ILOCK_EXCL); - return xfs_bmbt_to_iomap(ip, iomap, &imap, 0); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); found_cow: xfs_iunlock(ip, XFS_ILOCK_EXCL); if (imap.br_startoff <= offset_fsb) { - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0); + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); if (error) return error; - return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, + IOMAP_F_SHARED); } xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); - return xfs_bmbt_to_iomap(ip, iomap, &cmap, 0); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -1158,7 +1203,7 @@ xfs_read_iomap_begin( xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length); int nimaps = 1, error = 0; bool shared = false; - unsigned lockmode; + unsigned int lockmode = XFS_ILOCK_SHARED; ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO))); @@ -1177,7 +1222,8 @@ xfs_read_iomap_begin( if (error) return error; trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, shared ? IOMAP_F_SHARED : 0); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, + shared ? IOMAP_F_SHARED : 0); } const struct iomap_ops xfs_read_iomap_ops = { @@ -1236,7 +1282,8 @@ xfs_seek_iomap_begin( if (data_fsb < cow_fsb + cmap.br_blockcount) end_fsb = min(end_fsb, data_fsb); xfs_trim_extent(&cmap, offset_fsb, end_fsb); - error = xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); + error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, + IOMAP_F_SHARED); /* * This is a COW extent, so we must probe the page cache * because there could be dirty page cache being backed @@ -1258,7 +1305,7 @@ xfs_seek_iomap_begin( imap.br_state = XFS_EXT_NORM; done: xfs_trim_extent(&imap, offset_fsb, end_fsb); - error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); out_unlock: xfs_iunlock(ip, lockmode); return error; @@ -1291,12 +1338,12 @@ xfs_xattr_iomap_begin( lockmode = xfs_ilock_attr_map_shared(ip); /* if there are no attribute fork or extents, return ENOENT */ - if (!XFS_IFORK_Q(ip) || !ip->i_afp->if_nextents) { + if (!xfs_inode_has_attr_fork(ip) || !ip->i_af.if_nextents) { error = -ENOENT; goto out_unlock; } - ASSERT(ip->i_afp->if_format != XFS_DINODE_FMT_LOCAL); + ASSERT(ip->i_af.if_format != XFS_DINODE_FMT_LOCAL); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, XFS_BMAPI_ATTRFORK); out_unlock: @@ -1305,9 +1352,40 @@ out_unlock: if (error) return error; ASSERT(nimaps); - return xfs_bmbt_to_iomap(ip, iomap, &imap, 0); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); } const struct iomap_ops xfs_xattr_iomap_ops = { .iomap_begin = xfs_xattr_iomap_begin, }; + +int +xfs_zero_range( + struct xfs_inode *ip, + loff_t pos, + loff_t len, + bool *did_zero) +{ + struct inode *inode = VFS_I(ip); + + if (IS_DAX(inode)) + return dax_zero_range(inode, pos, len, did_zero, + &xfs_direct_write_iomap_ops); + return iomap_zero_range(inode, pos, len, did_zero, + &xfs_buffered_write_iomap_ops); +} + +int +xfs_truncate_page( + struct xfs_inode *ip, + loff_t pos, + bool *did_zero) +{ + struct inode *inode = VFS_I(ip); + + if (IS_DAX(inode)) + return dax_truncate_page(inode, pos, did_zero, + &xfs_direct_write_iomap_ops); + return iomap_truncate_page(inode, pos, did_zero, + &xfs_buffered_write_iomap_ops); +} diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 7d3703556d0e..c782e8c0479c 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -12,13 +12,19 @@ struct xfs_inode; struct xfs_bmbt_irec; int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, - xfs_fileoff_t count_fsb, struct xfs_bmbt_irec *imap); + xfs_fileoff_t count_fsb, unsigned int flags, + struct xfs_bmbt_irec *imap); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip, xfs_fileoff_t end_fsb); -int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, - struct xfs_bmbt_irec *, u16); +int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap, + struct xfs_bmbt_irec *imap, unsigned int mapping_flags, + u16 iomap_flags); + +int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len, + bool *did_zero); +int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero); static inline xfs_filblks_t xfs_aligned_fsb_count( @@ -45,5 +51,6 @@ extern const struct iomap_ops xfs_direct_write_iomap_ops; extern const struct iomap_ops xfs_read_iomap_ops; extern const struct iomap_ops xfs_seek_iomap_ops; extern const struct iomap_ops xfs_xattr_iomap_ops; +extern const struct iomap_ops xfs_dax_write_iomap_ops; #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index a607d6aca5c4..2e10e1c66ad6 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -13,6 +13,8 @@ #include "xfs_inode.h" #include "xfs_acl.h" #include "xfs_quota.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_trans.h" #include "xfs_trace.h" @@ -22,6 +24,7 @@ #include "xfs_iomap.h" #include "xfs_error.h" #include "xfs_ioctl.h" +#include "xfs_xattr.h" #include <linux/posix_acl.h> #include <linux/security.h> @@ -59,7 +62,7 @@ xfs_initxattrs( .value = xattr->value, .valuelen = xattr->value_len, }; - error = xfs_attr_set(&args); + error = xfs_attr_change(&args); if (error < 0) break; } @@ -72,9 +75,8 @@ xfs_initxattrs( * these attrs can be journalled at inode creation time (along with the * inode, of course, such that log replay can't cause these to be lost). */ - -STATIC int -xfs_init_security( +int +xfs_inode_init_security( struct inode *inode, struct inode *dir, const struct qstr *qstr) @@ -119,7 +121,7 @@ xfs_cleanup_inode( /* Oh, the horror. * If we can't add the ACL or we fail in - * xfs_init_security we must back out. + * xfs_inode_init_security we must back out. * ENOSPC can hit here, among other things. */ xfs_dentry_to_name(&teardown, dentry); @@ -165,7 +167,7 @@ xfs_generic_create( struct dentry *dentry, umode_t mode, dev_t rdev, - bool tmpfile) /* unnamed file */ + struct file *tmpfile) /* unnamed file */ { struct inode *inode; struct xfs_inode *ip = NULL; @@ -205,11 +207,10 @@ xfs_generic_create( inode = VFS_I(ip); - error = xfs_init_security(inode, dir, &dentry->d_name); + error = xfs_inode_init_security(inode, dir, &dentry->d_name); if (unlikely(error)) goto out_cleanup_inode; -#ifdef CONFIG_XFS_POSIX_ACL if (default_acl) { error = __xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); if (error) @@ -220,7 +221,6 @@ xfs_generic_create( if (error) goto out_cleanup_inode; } -#endif xfs_setup_iops(ip); @@ -234,7 +234,7 @@ xfs_generic_create( * d_tmpfile can immediately set it back to zero. */ set_nlink(inode, 1); - d_tmpfile(dentry, inode); + d_tmpfile(tmpfile, inode); } else d_instantiate(dentry, inode); @@ -261,7 +261,7 @@ xfs_vn_mknod( umode_t mode, dev_t rdev) { - return xfs_generic_create(mnt_userns, dir, dentry, mode, rdev, false); + return xfs_generic_create(mnt_userns, dir, dentry, mode, rdev, NULL); } STATIC int @@ -272,7 +272,7 @@ xfs_vn_create( umode_t mode, bool flags) { - return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, false); + return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, NULL); } STATIC int @@ -283,7 +283,7 @@ xfs_vn_mkdir( umode_t mode) { return xfs_generic_create(mnt_userns, dir, dentry, mode | S_IFDIR, 0, - false); + NULL); } STATIC struct dentry * @@ -423,7 +423,7 @@ xfs_vn_symlink( inode = VFS_I(cip); - error = xfs_init_security(inode, dir, &dentry->d_name); + error = xfs_inode_init_security(inode, dir, &dentry->d_name); if (unlikely(error)) goto out_cleanup_inode; @@ -511,27 +511,6 @@ xfs_vn_get_link( return ERR_PTR(error); } -STATIC const char * -xfs_vn_get_link_inline( - struct dentry *dentry, - struct inode *inode, - struct delayed_call *done) -{ - struct xfs_inode *ip = XFS_I(inode); - char *link; - - ASSERT(ip->i_df.if_format == XFS_DINODE_FMT_LOCAL); - - /* - * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED if - * if_data is junk. - */ - link = ip->i_df.if_u1.if_data; - if (XFS_IS_CORRUPT(ip->i_mount, !link)) - return ERR_PTR(-EFSCORRUPTED); - return link; -} - static uint32_t xfs_stat_blksize( struct xfs_inode *ip) @@ -579,6 +558,8 @@ xfs_vn_getattr( struct inode *inode = d_inode(path->dentry); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; + vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); trace_xfs_getattr(ip); @@ -589,8 +570,8 @@ xfs_vn_getattr( stat->dev = inode->i_sb->s_dev; stat->mode = inode->i_mode; stat->nlink = inode->i_nlink; - stat->uid = i_uid_into_mnt(mnt_userns, inode); - stat->gid = i_gid_into_mnt(mnt_userns, inode); + stat->uid = vfsuid_into_kuid(vfsuid); + stat->gid = vfsgid_into_kgid(vfsgid); stat->ino = ip->i_ino; stat->atime = inode->i_atime; stat->mtime = inode->i_mtime; @@ -625,6 +606,16 @@ xfs_vn_getattr( stat->blksize = BLKDEV_IOSIZE; stat->rdev = inode->i_rdev; break; + case S_IFREG: + if (request_mask & STATX_DIOALIGN) { + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + struct block_device *bdev = target->bt_bdev; + + stat->result_mask |= STATX_DIOALIGN; + stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; + stat->dio_offset_align = bdev_logical_block_size(bdev); + } + fallthrough; default: stat->blksize = xfs_stat_blksize(ip); stat->rdev = 0; @@ -634,37 +625,6 @@ xfs_vn_getattr( return 0; } -static void -xfs_setattr_mode( - struct xfs_inode *ip, - struct iattr *iattr) -{ - struct inode *inode = VFS_I(ip); - umode_t mode = iattr->ia_mode; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - inode->i_mode &= S_IFMT; - inode->i_mode |= mode & ~S_IFMT; -} - -void -xfs_setattr_time( - struct xfs_inode *ip, - struct iattr *iattr) -{ - struct inode *inode = VFS_I(ip); - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - if (iattr->ia_valid & ATTR_ATIME) - inode->i_atime = iattr->ia_atime; - if (iattr->ia_valid & ATTR_CTIME) - inode->i_ctime = iattr->ia_ctime; - if (iattr->ia_valid & ATTR_MTIME) - inode->i_mtime = iattr->ia_mtime; -} - static int xfs_vn_change_ok( struct user_namespace *mnt_userns, @@ -699,10 +659,10 @@ xfs_setattr_nonsize( int mask = iattr->ia_valid; xfs_trans_t *tp; int error; - kuid_t uid = GLOBAL_ROOT_UID, iuid = GLOBAL_ROOT_UID; - kgid_t gid = GLOBAL_ROOT_GID, igid = GLOBAL_ROOT_GID; + kuid_t uid = GLOBAL_ROOT_UID; + kgid_t gid = GLOBAL_ROOT_GID; struct xfs_dquot *udqp = NULL, *gdqp = NULL; - struct xfs_dquot *olddquot1 = NULL, *olddquot2 = NULL; + struct xfs_dquot *old_udqp = NULL, *old_gdqp = NULL; ASSERT((mask & ATTR_SIZE) == 0); @@ -718,13 +678,15 @@ xfs_setattr_nonsize( uint qflags = 0; if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) { - uid = iattr->ia_uid; + uid = from_vfsuid(mnt_userns, i_user_ns(inode), + iattr->ia_vfsuid); qflags |= XFS_QMOPT_UQUOTA; } else { uid = inode->i_uid; } if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) { - gid = iattr->ia_gid; + gid = from_vfsgid(mnt_userns, i_user_ns(inode), + iattr->ia_vfsgid); qflags |= XFS_QMOPT_GQUOTA; } else { gid = inode->i_gid; @@ -744,66 +706,30 @@ xfs_setattr_nonsize( } error = xfs_trans_alloc_ichange(ip, udqp, gdqp, NULL, - capable(CAP_FOWNER), &tp); + has_capability_noaudit(current, CAP_FOWNER), &tp); if (error) goto out_dqrele; /* - * Change file ownership. Must be the owner or privileged. + * Register quota modifications in the transaction. Must be the owner + * or privileged. These IDs could have changed since we last looked at + * them. But, we're assured that if the ownership did change while we + * didn't have the inode locked, inode's dquot(s) would have changed + * also. */ - if (mask & (ATTR_UID|ATTR_GID)) { - /* - * These IDs could have changed since we last looked at them. - * But, we're assured that if the ownership did change - * while we didn't have the inode locked, inode's dquot(s) - * would have changed also. - */ - iuid = inode->i_uid; - igid = inode->i_gid; - gid = (mask & ATTR_GID) ? iattr->ia_gid : igid; - uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; - - /* - * CAP_FSETID overrides the following restrictions: - * - * The set-user-ID and set-group-ID bits of a file will be - * cleared upon successful return from chown() - */ - if ((inode->i_mode & (S_ISUID|S_ISGID)) && - !capable(CAP_FSETID)) - inode->i_mode &= ~(S_ISUID|S_ISGID); - - /* - * Change the ownerships and register quota modifications - * in the transaction. - */ - if (!uid_eq(iuid, uid)) { - if (XFS_IS_UQUOTA_ON(mp)) { - ASSERT(mask & ATTR_UID); - ASSERT(udqp); - olddquot1 = xfs_qm_vop_chown(tp, ip, - &ip->i_udquot, udqp); - } - inode->i_uid = uid; - } - if (!gid_eq(igid, gid)) { - if (XFS_IS_GQUOTA_ON(mp)) { - ASSERT(xfs_has_pquotino(mp) || - !XFS_IS_PQUOTA_ON(mp)); - ASSERT(mask & ATTR_GID); - ASSERT(gdqp); - olddquot2 = xfs_qm_vop_chown(tp, ip, - &ip->i_gdquot, gdqp); - } - inode->i_gid = gid; - } + if (XFS_IS_UQUOTA_ON(mp) && + i_uid_needs_update(mnt_userns, iattr, inode)) { + ASSERT(udqp); + old_udqp = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp); + } + if (XFS_IS_GQUOTA_ON(mp) && + i_gid_needs_update(mnt_userns, iattr, inode)) { + ASSERT(xfs_has_pquotino(mp) || !XFS_IS_PQUOTA_ON(mp)); + ASSERT(gdqp); + old_gdqp = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp); } - if (mask & ATTR_MODE) - xfs_setattr_mode(ip, iattr); - if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) - xfs_setattr_time(ip, iattr); - + setattr_copy(mnt_userns, inode, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); @@ -815,8 +741,8 @@ xfs_setattr_nonsize( /* * Release any dquot(s) the inode had kept before chown. */ - xfs_qm_dqrele(olddquot1); - xfs_qm_dqrele(olddquot2); + xfs_qm_dqrele(old_udqp); + xfs_qm_dqrele(old_gdqp); xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); @@ -911,8 +837,8 @@ xfs_setattr_size( */ if (newsize > oldsize) { trace_xfs_zero_eof(ip, oldsize, newsize - oldsize); - error = iomap_zero_range(inode, oldsize, newsize - oldsize, - &did_zeroing, &xfs_buffered_write_iomap_ops); + error = xfs_zero_range(ip, oldsize, newsize - oldsize, + &did_zeroing); } else { /* * iomap won't detect a dirty page over an unwritten block (or a @@ -924,8 +850,7 @@ xfs_setattr_size( newsize); if (error) return error; - error = iomap_truncate_page(inode, newsize, &did_zeroing, - &xfs_buffered_write_iomap_ops); + error = xfs_truncate_page(ip, newsize, &did_zeroing); } if (error) @@ -1028,11 +953,8 @@ xfs_setattr_size( xfs_inode_clear_eofblocks_tag(ip); } - if (iattr->ia_valid & ATTR_MODE) - xfs_setattr_mode(ip, iattr); - if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) - xfs_setattr_time(ip, iattr); - + ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID))); + setattr_copy(mnt_userns, inode, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); @@ -1170,10 +1092,12 @@ STATIC int xfs_vn_tmpfile( struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, + struct file *file, umode_t mode) { - return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, true); + int err = xfs_generic_create(mnt_userns, dir, file->f_path.dentry, mode, 0, file); + + return finish_open_simple(file, err); } static const struct inode_operations xfs_inode_operations = { @@ -1250,14 +1174,6 @@ static const struct inode_operations xfs_symlink_inode_operations = { .update_time = xfs_vn_update_time, }; -static const struct inode_operations xfs_inline_symlink_inode_operations = { - .get_link = xfs_vn_get_link_inline, - .getattr = xfs_vn_getattr, - .setattr = xfs_vn_setattr, - .listxattr = xfs_vn_listxattr, - .update_time = xfs_vn_update_time, -}; - /* Figure out if this file actually supports DAX. */ static bool xfs_inode_supports_dax( @@ -1332,9 +1248,9 @@ xfs_diflags_to_iflags( * Initialize the Linux inode. * * When reading existing inodes from disk this is called directly from xfs_iget, - * when creating a new inode it is called from xfs_ialloc after setting up the - * inode. These callers have different criteria for clearing XFS_INEW, so leave - * it up to the caller to deal with unlocking the inode appropriately. + * when creating a new inode it is called from xfs_init_new_inode after setting + * up the inode. These callers have different criteria for clearing XFS_INEW, so + * leave it up to the caller to deal with unlocking the inode appropriately. */ void xfs_setup_inode( @@ -1379,7 +1295,7 @@ xfs_setup_inode( * If there is no attribute fork no ACL can exist on this inode, * and it can't have any file capabilities attached to it either. */ - if (!XFS_IFORK_Q(ip)) { + if (!xfs_inode_has_attr_fork(ip)) { inode_has_no_xattr(inode); cache_no_acl(inode); } @@ -1408,10 +1324,7 @@ xfs_setup_iops( inode->i_fop = &xfs_dir_file_operations; break; case S_IFLNK: - if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) - inode->i_op = &xfs_inline_symlink_inode_operations; - else - inode->i_op = &xfs_symlink_inode_operations; + inode->i_op = &xfs_symlink_inode_operations; break; default: inode->i_op = &xfs_inode_operations; diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index 278949056048..e570dcb5df8d 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -13,8 +13,10 @@ extern const struct file_operations xfs_dir_file_operations; extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); -extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr); int xfs_vn_setattr_size(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *vap); +int xfs_inode_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr); + #endif /* __XFS_IOPS_H__ */ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index c08c79d9e311..a1c2bcf65d37 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -64,7 +64,10 @@ xfs_bulkstat_one_int( struct xfs_inode *ip; /* incore inode pointer */ struct inode *inode; struct xfs_bulkstat *buf = bc->buf; + xfs_extnum_t nextents; int error = -EINVAL; + vfsuid_t vfsuid; + vfsgid_t vfsgid; if (xfs_internal_inum(mp, ino)) goto out_advance; @@ -80,14 +83,16 @@ xfs_bulkstat_one_int( ASSERT(ip != NULL); ASSERT(ip->i_imap.im_blkno != 0); inode = VFS_I(ip); + vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsgid = i_gid_into_vfsgid(mnt_userns, inode); /* xfs_iget returns the following without needing * further change. */ buf->bs_projectid = ip->i_projid; buf->bs_ino = ino; - buf->bs_uid = from_kuid(sb_userns, i_uid_into_mnt(mnt_userns, inode)); - buf->bs_gid = from_kgid(sb_userns, i_gid_into_mnt(mnt_userns, inode)); + buf->bs_uid = from_kuid(sb_userns, vfsuid_into_kuid(vfsuid)); + buf->bs_gid = from_kgid(sb_userns, vfsgid_into_kgid(vfsgid)); buf->bs_size = ip->i_disk_size; buf->bs_nlink = inode->i_nlink; @@ -102,10 +107,16 @@ xfs_bulkstat_one_int( buf->bs_xflags = xfs_ip2xflags(ip); buf->bs_extsize_blks = ip->i_extsize; - buf->bs_extents = xfs_ifork_nextents(&ip->i_df); + + nextents = xfs_ifork_nextents(&ip->i_df); + if (!(bc->breq->flags & XFS_IBULK_NREXT64)) + buf->bs_extents = min(nextents, XFS_MAX_EXTCNT_DATA_FORK_SMALL); + else + buf->bs_extents64 = nextents; + xfs_bulkstat_health(ip, buf); - buf->bs_aextents = xfs_ifork_nextents(ip->i_afp); - buf->bs_forkoff = XFS_IFORK_BOFF(ip); + buf->bs_aextents = xfs_ifork_nextents(&ip->i_af); + buf->bs_forkoff = xfs_inode_fork_boff(ip); buf->bs_version = XFS_BULKSTAT_VERSION_V5; if (xfs_has_v3inodes(mp)) { @@ -256,6 +267,7 @@ xfs_bulkstat( .breq = breq, }; struct xfs_trans *tp; + unsigned int iwalk_flags = 0; int error; if (breq->mnt_userns != &init_user_ns) { @@ -279,7 +291,10 @@ xfs_bulkstat( if (error) goto out; - error = xfs_iwalk(breq->mp, tp, breq->startino, breq->flags, + if (breq->flags & XFS_IBULK_SAME_AG) + iwalk_flags |= XFS_IWALK_SAME_AG; + + error = xfs_iwalk(breq->mp, tp, breq->startino, iwalk_flags, xfs_bulkstat_iwalk, breq->icount, &bc); xfs_trans_cancel(tp); out: diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index 7078d10c9b12..e2d0eba43f35 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -17,7 +17,10 @@ struct xfs_ibulk { }; /* Only iterate within the same AG as startino */ -#define XFS_IBULK_SAME_AG (XFS_IWALK_SAME_AG) +#define XFS_IBULK_SAME_AG (1U << 0) + +/* Fill out the bs_extents64 field if set. */ +#define XFS_IBULK_NREXT64 (1U << 1) /* * Advance the user buffer pointer by one record of the given size. If the diff --git a/fs/xfs/xfs_iunlink_item.c b/fs/xfs/xfs_iunlink_item.c new file mode 100644 index 000000000000..43005ce8bd48 --- /dev/null +++ b/fs/xfs/xfs_iunlink_item.c @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020-2022, Red Hat, Inc. + * All Rights Reserved. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_ag.h" +#include "xfs_iunlink_item.h" +#include "xfs_trace.h" +#include "xfs_error.h" + +struct kmem_cache *xfs_iunlink_cache; + +static inline struct xfs_iunlink_item *IUL_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_iunlink_item, item); +} + +static void +xfs_iunlink_item_release( + struct xfs_log_item *lip) +{ + struct xfs_iunlink_item *iup = IUL_ITEM(lip); + + xfs_perag_put(iup->pag); + kmem_cache_free(xfs_iunlink_cache, IUL_ITEM(lip)); +} + + +static uint64_t +xfs_iunlink_item_sort( + struct xfs_log_item *lip) +{ + return IUL_ITEM(lip)->ip->i_ino; +} + +/* + * Look up the inode cluster buffer and log the on-disk unlinked inode change + * we need to make. + */ +static int +xfs_iunlink_log_dinode( + struct xfs_trans *tp, + struct xfs_iunlink_item *iup) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *ip = iup->ip; + struct xfs_dinode *dip; + struct xfs_buf *ibp; + int offset; + int error; + + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp); + if (error) + return error; + /* + * Don't log the unlinked field on stale buffers as this may be the + * transaction that frees the inode cluster and relogging the buffer + * here will incorrectly remove the stale state. + */ + if (ibp->b_flags & XBF_STALE) + goto out; + + dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset); + + /* Make sure the old pointer isn't garbage. */ + if (be32_to_cpu(dip->di_next_unlinked) != iup->old_agino) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, + sizeof(*dip), __this_address); + error = -EFSCORRUPTED; + goto out; + } + + trace_xfs_iunlink_update_dinode(mp, iup->pag->pag_agno, + XFS_INO_TO_AGINO(mp, ip->i_ino), + be32_to_cpu(dip->di_next_unlinked), iup->next_agino); + + dip->di_next_unlinked = cpu_to_be32(iup->next_agino); + offset = ip->i_imap.im_boffset + + offsetof(struct xfs_dinode, di_next_unlinked); + + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); + xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); + return 0; +out: + xfs_trans_brelse(tp, ibp); + return error; +} + +/* + * On precommit, we grab the inode cluster buffer for the inode number we were + * passed, then update the next unlinked field for that inode in the buffer and + * log the buffer. This ensures that the inode cluster buffer was logged in the + * correct order w.r.t. other inode cluster buffers. We can then remove the + * iunlink item from the transaction and release it as it is has now served it's + * purpose. + */ +static int +xfs_iunlink_item_precommit( + struct xfs_trans *tp, + struct xfs_log_item *lip) +{ + struct xfs_iunlink_item *iup = IUL_ITEM(lip); + int error; + + error = xfs_iunlink_log_dinode(tp, iup); + list_del(&lip->li_trans); + xfs_iunlink_item_release(lip); + return error; +} + +static const struct xfs_item_ops xfs_iunlink_item_ops = { + .iop_release = xfs_iunlink_item_release, + .iop_sort = xfs_iunlink_item_sort, + .iop_precommit = xfs_iunlink_item_precommit, +}; + + +/* + * Initialize the inode log item for a newly allocated (in-core) inode. + * + * Inode extents can only reside within an AG. Hence specify the starting + * block for the inode chunk by offset within an AG as well as the + * length of the allocated extent. + * + * This joins the item to the transaction and marks it dirty so + * that we don't need a separate call to do this, nor does the + * caller need to know anything about the iunlink item. + */ +int +xfs_iunlink_log_inode( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_perag *pag, + xfs_agino_t next_agino) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_iunlink_item *iup; + + ASSERT(xfs_verify_agino_or_null(pag, next_agino)); + ASSERT(xfs_verify_agino_or_null(pag, ip->i_next_unlinked)); + + /* + * Since we're updating a linked list, we should never find that the + * current pointer is the same as the new value, unless we're + * terminating the list. + */ + if (ip->i_next_unlinked == next_agino) { + if (next_agino != NULLAGINO) + return -EFSCORRUPTED; + return 0; + } + + iup = kmem_cache_zalloc(xfs_iunlink_cache, GFP_KERNEL | __GFP_NOFAIL); + xfs_log_item_init(mp, &iup->item, XFS_LI_IUNLINK, + &xfs_iunlink_item_ops); + + iup->ip = ip; + iup->next_agino = next_agino; + iup->old_agino = ip->i_next_unlinked; + + atomic_inc(&pag->pag_ref); + iup->pag = pag; + + xfs_trans_add_item(tp, &iup->item); + tp->t_flags |= XFS_TRANS_DIRTY; + set_bit(XFS_LI_DIRTY, &iup->item.li_flags); + return 0; +} + diff --git a/fs/xfs/xfs_iunlink_item.h b/fs/xfs/xfs_iunlink_item.h new file mode 100644 index 000000000000..c793cdcaccde --- /dev/null +++ b/fs/xfs/xfs_iunlink_item.h @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020-2022, Red Hat, Inc. + * All Rights Reserved. + */ +#ifndef XFS_IUNLINK_ITEM_H +#define XFS_IUNLINK_ITEM_H 1 + +struct xfs_trans; +struct xfs_inode; +struct xfs_perag; + +/* in memory log item structure */ +struct xfs_iunlink_item { + struct xfs_log_item item; + struct xfs_inode *ip; + struct xfs_perag *pag; + xfs_agino_t next_agino; + xfs_agino_t old_agino; +}; + +extern struct kmem_cache *xfs_iunlink_cache; + +int xfs_iunlink_log_inode(struct xfs_trans *tp, struct xfs_inode *ip, + struct xfs_perag *pag, xfs_agino_t next_agino); + +#endif /* XFS_IUNLINK_ITEM_H */ diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h index 37a795f03267..83699089755e 100644 --- a/fs/xfs/xfs_iwalk.h +++ b/fs/xfs/xfs_iwalk.h @@ -26,7 +26,7 @@ int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino, unsigned int inode_records, bool poll, void *data); /* Only iterate inodes within the same AG as @startino. */ -#define XFS_IWALK_SAME_AG (0x1) +#define XFS_IWALK_SAME_AG (1U << 0) #define XFS_IWALK_FLAGS_ALL (XFS_IWALK_SAME_AG) diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index c174262a074e..f9878021e7d0 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -61,6 +61,7 @@ typedef __u32 xfs_nlink_t; #include <linux/ratelimit.h> #include <linux/rhashtable.h> #include <linux/xattr.h> +#include <linux/mnt_idmapping.h> #include <asm/page.h> #include <asm/div64.h> @@ -195,9 +196,7 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y) } int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count, - char *data, unsigned int op); -void xfs_flush_bdev_async(struct bio *bio, struct block_device *bdev, - struct completion *done); + char *data, enum req_op op); #define ASSERT_ALWAYS(expr) \ (likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__)) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 89fec9a18c34..f02a0dd522b3 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -49,7 +49,6 @@ xlog_state_get_iclog_space( int len, struct xlog_in_core **iclog, struct xlog_ticket *ticket, - int *continued_write, int *logoffsetp); STATIC void xlog_grant_push_ail( @@ -58,13 +57,10 @@ xlog_grant_push_ail( STATIC void xlog_sync( struct xlog *log, - struct xlog_in_core *iclog); + struct xlog_in_core *iclog, + struct xlog_ticket *ticket); #if defined(DEBUG) STATIC void -xlog_verify_dest_ptr( - struct xlog *log, - void *ptr); -STATIC void xlog_verify_grant_tail( struct xlog *log); STATIC void @@ -77,7 +73,6 @@ xlog_verify_tail_lsn( struct xlog *log, struct xlog_in_core *iclog); #else -#define xlog_verify_dest_ptr(a,b) #define xlog_verify_grant_tail(a) #define xlog_verify_iclog(a,b,c) #define xlog_verify_tail_lsn(a,b) @@ -90,6 +85,62 @@ xlog_iclogs_empty( static int xfs_log_cover(struct xfs_mount *); +/* + * We need to make sure the buffer pointer returned is naturally aligned for the + * biggest basic data type we put into it. We have already accounted for this + * padding when sizing the buffer. + * + * However, this padding does not get written into the log, and hence we have to + * track the space used by the log vectors separately to prevent log space hangs + * due to inaccurate accounting (i.e. a leak) of the used log space through the + * CIL context ticket. + * + * We also add space for the xlog_op_header that describes this region in the + * log. This prepends the data region we return to the caller to copy their data + * into, so do all the static initialisation of the ophdr now. Because the ophdr + * is not 8 byte aligned, we have to be careful to ensure that we align the + * start of the buffer such that the region we return to the call is 8 byte + * aligned and packed against the tail of the ophdr. + */ +void * +xlog_prepare_iovec( + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp, + uint type) +{ + struct xfs_log_iovec *vec = *vecp; + struct xlog_op_header *oph; + uint32_t len; + void *buf; + + if (vec) { + ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs); + vec++; + } else { + vec = &lv->lv_iovecp[0]; + } + + len = lv->lv_buf_len + sizeof(struct xlog_op_header); + if (!IS_ALIGNED(len, sizeof(uint64_t))) { + lv->lv_buf_len = round_up(len, sizeof(uint64_t)) - + sizeof(struct xlog_op_header); + } + + vec->i_type = type; + vec->i_addr = lv->lv_buf + lv->lv_buf_len; + + oph = vec->i_addr; + oph->oh_clientid = XFS_TRANSACTION; + oph->oh_res2 = 0; + oph->oh_flags = 0; + + buf = vec->i_addr + sizeof(struct xlog_op_header); + ASSERT(IS_ALIGNED((unsigned long)buf, sizeof(uint64_t))); + + *vecp = vec; + return buf; +} + static void xlog_grant_sub_space( struct xlog *log, @@ -175,12 +226,12 @@ xlog_ticket_reservation( if (head == &log->l_write_head) { ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); return tic->t_unit_res; - } else { - if (tic->t_flags & XLOG_TIC_PERM_RESERV) - return tic->t_unit_res * tic->t_cnt; - else - return tic->t_unit_res; } + + if (tic->t_flags & XLOG_TIC_PERM_RESERV) + return tic->t_unit_res * tic->t_cnt; + + return tic->t_unit_res; } STATIC bool @@ -322,30 +373,6 @@ xlog_grant_head_check( return error; } -static void -xlog_tic_reset_res(xlog_ticket_t *tic) -{ - tic->t_res_num = 0; - tic->t_res_arr_sum = 0; - tic->t_res_num_ophdrs = 0; -} - -static void -xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type) -{ - if (tic->t_res_num == XLOG_TIC_LEN_MAX) { - /* add to overflow and start again */ - tic->t_res_o_flow += tic->t_res_arr_sum; - tic->t_res_num = 0; - tic->t_res_arr_sum = 0; - } - - tic->t_res_arr[tic->t_res_num].r_len = len; - tic->t_res_arr[tic->t_res_num].r_type = type; - tic->t_res_arr_sum += len; - tic->t_res_num++; -} - bool xfs_log_writable( struct xfs_mount *mp) @@ -395,8 +422,6 @@ xfs_log_regrant( xlog_grant_push_ail(log, tic->t_unit_res); tic->t_curr_res = tic->t_unit_res; - xlog_tic_reset_res(tic); - if (tic->t_cnt > 0) return 0; @@ -434,10 +459,9 @@ out_error: int xfs_log_reserve( struct xfs_mount *mp, - int unit_bytes, - int cnt, + int unit_bytes, + int cnt, struct xlog_ticket **ticp, - uint8_t client, bool permanent) { struct xlog *log = mp->m_log; @@ -445,15 +469,13 @@ xfs_log_reserve( int need_bytes; int error = 0; - ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); - if (xlog_is_shutdown(log)) return -EIO; XFS_STATS_INC(mp, xs_try_logspace); ASSERT(*ticp == NULL); - tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent); + tic = xlog_ticket_alloc(log, unit_bytes, cnt, permanent); *ticp = tic; xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt @@ -487,7 +509,10 @@ out_error: * Run all the pending iclog callbacks and wake log force waiters and iclog * space waiters so they can process the newly set shutdown state. We really * don't care what order we process callbacks here because the log is shut down - * and so state cannot change on disk anymore. + * and so state cannot change on disk anymore. However, we cannot wake waiters + * until the callbacks have been processed because we may be in unmount and + * we must ensure that all AIL operations the callbacks perform have completed + * before we tear down the AIL. * * We avoid processing actively referenced iclogs so that we don't run callbacks * while the iclog owner might still be preparing the iclog for IO submssion. @@ -501,7 +526,6 @@ xlog_state_shutdown_callbacks( struct xlog_in_core *iclog; LIST_HEAD(cb_list); - spin_lock(&log->l_icloglock); iclog = log->l_iclog; do { if (atomic_read(&iclog->ic_refcnt)) { @@ -509,26 +533,22 @@ xlog_state_shutdown_callbacks( continue; } list_splice_init(&iclog->ic_callbacks, &cb_list); + spin_unlock(&log->l_icloglock); + + xlog_cil_process_committed(&cb_list); + + spin_lock(&log->l_icloglock); wake_up_all(&iclog->ic_write_wait); wake_up_all(&iclog->ic_force_wait); } while ((iclog = iclog->ic_next) != log->l_iclog); wake_up_all(&log->l_flush_wait); - spin_unlock(&log->l_icloglock); - - xlog_cil_process_committed(&cb_list); } /* * Flush iclog to disk if this is the last reference to the given iclog and the * it is in the WANT_SYNC state. * - * If the caller passes in a non-zero @old_tail_lsn and the current log tail - * does not match, there may be metadata on disk that must be persisted before - * this iclog is written. To satisfy that requirement, set the - * XLOG_ICL_NEED_FLUSH flag as a condition for writing this iclog with the new - * log tail value. - * * If XLOG_ICL_NEED_FUA is already set on the iclog, we need to ensure that the * log tail is updated correctly. NEED_FUA indicates that the iclog will be * written to stable storage, and implies that a commit record is contained @@ -545,12 +565,11 @@ xlog_state_shutdown_callbacks( * always capture the tail lsn on the iclog on the first NEED_FUA release * regardless of the number of active reference counts on this iclog. */ - int xlog_state_release_iclog( struct xlog *log, struct xlog_in_core *iclog, - xfs_lsn_t old_tail_lsn) + struct xlog_ticket *ticket) { xfs_lsn_t tail_lsn; bool last_ref; @@ -561,18 +580,14 @@ xlog_state_release_iclog( /* * Grabbing the current log tail needs to be atomic w.r.t. the writing * of the tail LSN into the iclog so we guarantee that the log tail does - * not move between deciding if a cache flush is required and writing - * the LSN into the iclog below. + * not move between the first time we know that the iclog needs to be + * made stable and when we eventually submit it. */ - if (old_tail_lsn || iclog->ic_state == XLOG_STATE_WANT_SYNC) { + if ((iclog->ic_state == XLOG_STATE_WANT_SYNC || + (iclog->ic_flags & XLOG_ICL_NEED_FUA)) && + !iclog->ic_header.h_tail_lsn) { tail_lsn = xlog_assign_tail_lsn(log->l_mp); - - if (old_tail_lsn && tail_lsn != old_tail_lsn) - iclog->ic_flags |= XLOG_ICL_NEED_FLUSH; - - if ((iclog->ic_flags & XLOG_ICL_NEED_FUA) && - !iclog->ic_header.h_tail_lsn) - iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); + iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); } last_ref = atomic_dec_and_test(&iclog->ic_refcnt); @@ -583,11 +598,8 @@ xlog_state_release_iclog( * pending iclog callbacks that were waiting on the release of * this iclog. */ - if (last_ref) { - spin_unlock(&log->l_icloglock); + if (last_ref) xlog_state_shutdown_callbacks(log); - spin_lock(&log->l_icloglock); - } return -EIO; } @@ -600,13 +612,11 @@ xlog_state_release_iclog( } iclog->ic_state = XLOG_STATE_SYNCING; - if (!iclog->ic_header.h_tail_lsn) - iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); xlog_verify_tail_lsn(log, iclog); trace_xlog_iclog_syncing(iclog, _RET_IP_); spin_unlock(&log->l_icloglock); - xlog_sync(log, iclog); + xlog_sync(log, iclog, ticket); spin_lock(&log->l_icloglock); return 0; } @@ -812,10 +822,9 @@ xfs_log_mount_finish( * mount failure occurs. */ mp->m_super->s_flags |= SB_ACTIVE; + xfs_log_work_queue(mp); if (xlog_recovery_needed(log)) error = xlog_recover_finish(log); - if (!error) - xfs_log_work_queue(mp); mp->m_super->s_flags &= ~SB_ACTIVE; evict_inodes(mp->m_super); @@ -874,7 +883,7 @@ xlog_force_iclog( iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA; if (iclog->ic_state == XLOG_STATE_ACTIVE) xlog_state_switch_iclogs(iclog->ic_log, iclog, 0); - return xlog_state_release_iclog(iclog->ic_log, iclog, 0); + return xlog_state_release_iclog(iclog->ic_log, iclog, NULL); } /* @@ -915,23 +924,39 @@ xlog_write_unmount_record( struct xlog *log, struct xlog_ticket *ticket) { - struct xfs_unmount_log_format ulf = { - .magic = XLOG_UNMOUNT_TYPE, + struct { + struct xlog_op_header ophdr; + struct xfs_unmount_log_format ulf; + } unmount_rec = { + .ophdr = { + .oh_clientid = XFS_LOG, + .oh_tid = cpu_to_be32(ticket->t_tid), + .oh_flags = XLOG_UNMOUNT_TRANS, + }, + .ulf = { + .magic = XLOG_UNMOUNT_TYPE, + }, }; struct xfs_log_iovec reg = { - .i_addr = &ulf, - .i_len = sizeof(ulf), + .i_addr = &unmount_rec, + .i_len = sizeof(unmount_rec), .i_type = XLOG_REG_TYPE_UNMOUNT, }; struct xfs_log_vec vec = { .lv_niovecs = 1, .lv_iovecp = ®, }; + LIST_HEAD(lv_chain); + list_add(&vec.lv_list, &lv_chain); + + BUILD_BUG_ON((sizeof(struct xlog_op_header) + + sizeof(struct xfs_unmount_log_format)) != + sizeof(unmount_rec)); /* account for space used by record data */ - ticket->t_curr_res -= sizeof(ulf); + ticket->t_curr_res -= sizeof(unmount_rec); - return xlog_write(log, NULL, &vec, ticket, XLOG_UNMOUNT_TRANS); + return xlog_write(log, NULL, &lv_chain, ticket, reg.i_len); } /* @@ -947,7 +972,7 @@ xlog_unmount_write( struct xlog_ticket *tic = NULL; int error; - error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0); + error = xfs_log_reserve(mp, 600, 1, &tic, 0); if (error) goto out_err; @@ -1102,7 +1127,7 @@ xfs_log_item_init( int type, const struct xfs_item_ops *ops) { - item->li_mountp = mp; + item->li_log = mp->m_log; item->li_ailp = mp->m_ail; item->li_type = type; item->li_ops = ops; @@ -1374,7 +1399,7 @@ xlog_ioend_work( */ if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { xfs_alert(log->l_mp, "log I/O error %d", error); - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } xlog_state_done_syncing(iclog); @@ -1598,9 +1623,6 @@ xlog_alloc_log( GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!iclog->ic_data) goto out_free_iclog; -#ifdef DEBUG - log->l_iclog_bak[i] = &iclog->ic_header; -#endif head = &iclog->ic_header; memset(head, 0, sizeof(xlog_rec_header_t)); head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); @@ -1616,7 +1638,7 @@ xlog_alloc_log( iclog->ic_log = log; atomic_set(&iclog->ic_refcnt, 0); INIT_LIST_HEAD(&iclog->ic_callbacks); - iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; + iclog->ic_datap = (void *)iclog->ic_data + log->l_iclog_hsize; init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); @@ -1883,19 +1905,19 @@ xlog_write_iclog( return; } - bio_init(&iclog->ic_bio, iclog->ic_bvec, howmany(count, PAGE_SIZE)); - bio_set_dev(&iclog->ic_bio, log->l_targ->bt_bdev); - iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno; - iclog->ic_bio.bi_end_io = xlog_bio_end_io; - iclog->ic_bio.bi_private = iclog; - /* * We use REQ_SYNC | REQ_IDLE here to tell the block layer the are more * IOs coming immediately after this one. This prevents the block layer * writeback throttle from throttling log writes behind background * metadata writeback and causing priority inversions. */ - iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE; + bio_init(&iclog->ic_bio, log->l_targ->bt_bdev, iclog->ic_bvec, + howmany(count, PAGE_SIZE), + REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE); + iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno; + iclog->ic_bio.bi_end_io = xlog_bio_end_io; + iclog->ic_bio.bi_private = iclog; + if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) { iclog->ic_bio.bi_opf |= REQ_PREFLUSH; /* @@ -1903,9 +1925,17 @@ xlog_write_iclog( * device cache first to ensure all metadata writeback covered * by the LSN in this iclog is on stable storage. This is slow, * but it *must* complete before we issue the external log IO. + * + * If the flush fails, we cannot conclude that past metadata + * writeback from the log succeeded. Repeating the flush is + * not possible, hence we must shut down with log IO error to + * avoid shutdown re-entering this path and erroring out again. */ - if (log->l_targ != log->l_mp->m_ddev_targp) - blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev); + if (log->l_targ != log->l_mp->m_ddev_targp && + blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev)) { + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); + return; + } } if (iclog->ic_flags & XLOG_ICL_NEED_FUA) iclog->ic_bio.bi_opf |= REQ_FUA; @@ -1913,7 +1943,7 @@ xlog_write_iclog( iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) { - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); return; } if (is_vmalloc_addr(iclog->ic_data)) @@ -1982,7 +2012,7 @@ xlog_calc_iclog_size( } /* - * Flush out the in-core log (iclog) to the on-disk log in an asynchronous + * Flush out the in-core log (iclog) to the on-disk log in an asynchronous * fashion. Previously, we should have moved the current iclog * ptr in the log to point to the next available iclog. This allows further * write to continue while this code syncs out an iclog ready to go. @@ -2007,7 +2037,8 @@ xlog_calc_iclog_size( STATIC void xlog_sync( struct xlog *log, - struct xlog_in_core *iclog) + struct xlog_in_core *iclog, + struct xlog_ticket *ticket) { unsigned int count; /* byte count of bwrite */ unsigned int roundoff; /* roundoff to BB or stripe */ @@ -2019,12 +2050,20 @@ xlog_sync( count = xlog_calc_iclog_size(log, iclog, &roundoff); - /* move grant heads by roundoff in sync */ - xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff); - xlog_grant_add_space(log, &log->l_write_head.grant, roundoff); + /* + * If we have a ticket, account for the roundoff via the ticket + * reservation to avoid touching the hot grant heads needlessly. + * Otherwise, we have to move grant heads directly. + */ + if (ticket) { + ticket->t_curr_res -= roundoff; + } else { + xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff); + xlog_grant_add_space(log, &log->l_write_head.grant, roundoff); + } /* put cycle number in every block */ - xlog_pack_data(log, iclog, roundoff); + xlog_pack_data(log, iclog, roundoff); /* real byte length */ size = iclog->ic_offset; @@ -2074,8 +2113,6 @@ xlog_dealloc_log( xlog_in_core_t *iclog, *next_iclog; int i; - xlog_cil_destroy(log); - /* * Cycle all the iclogbuf locks to make sure all log IO completion * is done before we tear down these buffers. @@ -2087,6 +2124,13 @@ xlog_dealloc_log( iclog = iclog->ic_next; } + /* + * Destroy the CIL after waiting for iclog IO completion because an + * iclog EIO error will try to shut down the log, which accesses the + * CIL to wake up the waiters. + */ + xlog_cil_destroy(log); + iclog = log->l_iclog; for (i = 0; i < log->l_iclog_bufs; i++) { next_iclog = iclog->ic_next; @@ -2125,63 +2169,11 @@ xlog_print_tic_res( struct xfs_mount *mp, struct xlog_ticket *ticket) { - uint i; - uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); - - /* match with XLOG_REG_TYPE_* in xfs_log.h */ -#define REG_TYPE_STR(type, str) [XLOG_REG_TYPE_##type] = str - static char *res_type_str[] = { - REG_TYPE_STR(BFORMAT, "bformat"), - REG_TYPE_STR(BCHUNK, "bchunk"), - REG_TYPE_STR(EFI_FORMAT, "efi_format"), - REG_TYPE_STR(EFD_FORMAT, "efd_format"), - REG_TYPE_STR(IFORMAT, "iformat"), - REG_TYPE_STR(ICORE, "icore"), - REG_TYPE_STR(IEXT, "iext"), - REG_TYPE_STR(IBROOT, "ibroot"), - REG_TYPE_STR(ILOCAL, "ilocal"), - REG_TYPE_STR(IATTR_EXT, "iattr_ext"), - REG_TYPE_STR(IATTR_BROOT, "iattr_broot"), - REG_TYPE_STR(IATTR_LOCAL, "iattr_local"), - REG_TYPE_STR(QFORMAT, "qformat"), - REG_TYPE_STR(DQUOT, "dquot"), - REG_TYPE_STR(QUOTAOFF, "quotaoff"), - REG_TYPE_STR(LRHEADER, "LR header"), - REG_TYPE_STR(UNMOUNT, "unmount"), - REG_TYPE_STR(COMMIT, "commit"), - REG_TYPE_STR(TRANSHDR, "trans header"), - REG_TYPE_STR(ICREATE, "inode create"), - REG_TYPE_STR(RUI_FORMAT, "rui_format"), - REG_TYPE_STR(RUD_FORMAT, "rud_format"), - REG_TYPE_STR(CUI_FORMAT, "cui_format"), - REG_TYPE_STR(CUD_FORMAT, "cud_format"), - REG_TYPE_STR(BUI_FORMAT, "bui_format"), - REG_TYPE_STR(BUD_FORMAT, "bud_format"), - }; - BUILD_BUG_ON(ARRAY_SIZE(res_type_str) != XLOG_REG_TYPE_MAX + 1); -#undef REG_TYPE_STR - xfs_warn(mp, "ticket reservation summary:"); - xfs_warn(mp, " unit res = %d bytes", - ticket->t_unit_res); - xfs_warn(mp, " current res = %d bytes", - ticket->t_curr_res); - xfs_warn(mp, " total reg = %u bytes (o/flow = %u bytes)", - ticket->t_res_arr_sum, ticket->t_res_o_flow); - xfs_warn(mp, " ophdrs = %u (ophdr space = %u bytes)", - ticket->t_res_num_ophdrs, ophdr_spc); - xfs_warn(mp, " ophdr + reg = %u bytes", - ticket->t_res_arr_sum + ticket->t_res_o_flow + ophdr_spc); - xfs_warn(mp, " num regions = %u", - ticket->t_res_num); - - for (i = 0; i < ticket->t_res_num; i++) { - uint r_type = ticket->t_res_arr[i].r_type; - xfs_warn(mp, "region[%u]: %s - %u bytes", i, - ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? - "bad-rtype" : res_type_str[r_type]), - ticket->t_res_arr[i].r_len); - } + xfs_warn(mp, " unit res = %d bytes", ticket->t_unit_res); + xfs_warn(mp, " current res = %d bytes", ticket->t_curr_res); + xfs_warn(mp, " original count = %d", ticket->t_ocnt); + xfs_warn(mp, " remaining count = %d", ticket->t_cnt); } /* @@ -2234,187 +2226,226 @@ xlog_print_trans( } } +static inline void +xlog_write_iovec( + struct xlog_in_core *iclog, + uint32_t *log_offset, + void *data, + uint32_t write_len, + int *bytes_left, + uint32_t *record_cnt, + uint32_t *data_cnt) +{ + ASSERT(*log_offset < iclog->ic_log->l_iclog_size); + ASSERT(*log_offset % sizeof(int32_t) == 0); + ASSERT(write_len % sizeof(int32_t) == 0); + + memcpy(iclog->ic_datap + *log_offset, data, write_len); + *log_offset += write_len; + *bytes_left -= write_len; + (*record_cnt)++; + *data_cnt += write_len; +} + /* - * Calculate the potential space needed by the log vector. We may need a start - * record, and each region gets its own struct xlog_op_header and may need to be - * double word aligned. + * Write log vectors into a single iclog which is guaranteed by the caller + * to have enough space to write the entire log vector into. */ -static int -xlog_write_calc_vec_length( +static void +xlog_write_full( + struct xfs_log_vec *lv, struct xlog_ticket *ticket, - struct xfs_log_vec *log_vector, - uint optype) + struct xlog_in_core *iclog, + uint32_t *log_offset, + uint32_t *len, + uint32_t *record_cnt, + uint32_t *data_cnt) { - struct xfs_log_vec *lv; - int headers = 0; - int len = 0; - int i; + int index; - if (optype & XLOG_START_TRANS) - headers++; + ASSERT(*log_offset + *len <= iclog->ic_size || + iclog->ic_state == XLOG_STATE_WANT_SYNC); - for (lv = log_vector; lv; lv = lv->lv_next) { - /* we don't write ordered log vectors */ - if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) - continue; - - headers += lv->lv_niovecs; - - for (i = 0; i < lv->lv_niovecs; i++) { - struct xfs_log_iovec *vecp = &lv->lv_iovecp[i]; + /* + * Ordered log vectors have no regions to write so this + * loop will naturally skip them. + */ + for (index = 0; index < lv->lv_niovecs; index++) { + struct xfs_log_iovec *reg = &lv->lv_iovecp[index]; + struct xlog_op_header *ophdr = reg->i_addr; - len += vecp->i_len; - xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type); - } + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + xlog_write_iovec(iclog, log_offset, reg->i_addr, + reg->i_len, len, record_cnt, data_cnt); } - - ticket->t_res_num_ophdrs += headers; - len += headers * sizeof(struct xlog_op_header); - - return len; -} - -static void -xlog_write_start_rec( - struct xlog_op_header *ophdr, - struct xlog_ticket *ticket) -{ - ophdr->oh_tid = cpu_to_be32(ticket->t_tid); - ophdr->oh_clientid = ticket->t_clientid; - ophdr->oh_len = 0; - ophdr->oh_flags = XLOG_START_TRANS; - ophdr->oh_res2 = 0; } -static xlog_op_header_t * -xlog_write_setup_ophdr( - struct xlog *log, - struct xlog_op_header *ophdr, +static int +xlog_write_get_more_iclog_space( struct xlog_ticket *ticket, - uint flags) + struct xlog_in_core **iclogp, + uint32_t *log_offset, + uint32_t len, + uint32_t *record_cnt, + uint32_t *data_cnt) { - ophdr->oh_tid = cpu_to_be32(ticket->t_tid); - ophdr->oh_clientid = ticket->t_clientid; - ophdr->oh_res2 = 0; - - /* are we copying a commit or unmount record? */ - ophdr->oh_flags = flags; + struct xlog_in_core *iclog = *iclogp; + struct xlog *log = iclog->ic_log; + int error; - /* - * We've seen logs corrupted with bad transaction client ids. This - * makes sure that XFS doesn't generate them on. Turn this into an EIO - * and shut down the filesystem. - */ - switch (ophdr->oh_clientid) { - case XFS_TRANSACTION: - case XFS_VOLUME: - case XFS_LOG: - break; - default: - xfs_warn(log->l_mp, - "Bad XFS transaction clientid 0x%x in ticket "PTR_FMT, - ophdr->oh_clientid, ticket); - return NULL; - } + spin_lock(&log->l_icloglock); + ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC); + xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); + error = xlog_state_release_iclog(log, iclog, ticket); + spin_unlock(&log->l_icloglock); + if (error) + return error; - return ophdr; + error = xlog_state_get_iclog_space(log, len, &iclog, ticket, + log_offset); + if (error) + return error; + *record_cnt = 0; + *data_cnt = 0; + *iclogp = iclog; + return 0; } /* - * Set up the parameters of the region copy into the log. This has - * to handle region write split across multiple log buffers - this - * state is kept external to this function so that this code can - * be written in an obvious, self documenting manner. + * Write log vectors into a single iclog which is smaller than the current chain + * length. We write until we cannot fit a full record into the remaining space + * and then stop. We return the log vector that is to be written that cannot + * wholly fit in the iclog. */ static int -xlog_write_setup_copy( +xlog_write_partial( + struct xfs_log_vec *lv, struct xlog_ticket *ticket, - struct xlog_op_header *ophdr, - int space_available, - int space_required, - int *copy_off, - int *copy_len, - int *last_was_partial_copy, - int *bytes_consumed) -{ - int still_to_copy; - - still_to_copy = space_required - *bytes_consumed; - *copy_off = *bytes_consumed; - - if (still_to_copy <= space_available) { - /* write of region completes here */ - *copy_len = still_to_copy; - ophdr->oh_len = cpu_to_be32(*copy_len); - if (*last_was_partial_copy) - ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS); - *last_was_partial_copy = 0; - *bytes_consumed = 0; - return 0; - } + struct xlog_in_core **iclogp, + uint32_t *log_offset, + uint32_t *len, + uint32_t *record_cnt, + uint32_t *data_cnt) +{ + struct xlog_in_core *iclog = *iclogp; + struct xlog_op_header *ophdr; + int index = 0; + uint32_t rlen; + int error; + + /* walk the logvec, copying until we run out of space in the iclog */ + for (index = 0; index < lv->lv_niovecs; index++) { + struct xfs_log_iovec *reg = &lv->lv_iovecp[index]; + uint32_t reg_offset = 0; + + /* + * The first region of a continuation must have a non-zero + * length otherwise log recovery will just skip over it and + * start recovering from the next opheader it finds. Because we + * mark the next opheader as a continuation, recovery will then + * incorrectly add the continuation to the previous region and + * that breaks stuff. + * + * Hence if there isn't space for region data after the + * opheader, then we need to start afresh with a new iclog. + */ + if (iclog->ic_size - *log_offset <= + sizeof(struct xlog_op_header)) { + error = xlog_write_get_more_iclog_space(ticket, + &iclog, log_offset, *len, record_cnt, + data_cnt); + if (error) + return error; + } - /* partial write of region, needs extra log op header reservation */ - *copy_len = space_available; - ophdr->oh_len = cpu_to_be32(*copy_len); - ophdr->oh_flags |= XLOG_CONTINUE_TRANS; - if (*last_was_partial_copy) - ophdr->oh_flags |= XLOG_WAS_CONT_TRANS; - *bytes_consumed += *copy_len; - (*last_was_partial_copy)++; + ophdr = reg->i_addr; + rlen = min_t(uint32_t, reg->i_len, iclog->ic_size - *log_offset); - /* account for new log op header */ - ticket->t_curr_res -= sizeof(struct xlog_op_header); - ticket->t_res_num_ophdrs++; + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + ophdr->oh_len = cpu_to_be32(rlen - sizeof(struct xlog_op_header)); + if (rlen != reg->i_len) + ophdr->oh_flags |= XLOG_CONTINUE_TRANS; - return sizeof(struct xlog_op_header); -} + xlog_write_iovec(iclog, log_offset, reg->i_addr, + rlen, len, record_cnt, data_cnt); -static int -xlog_write_copy_finish( - struct xlog *log, - struct xlog_in_core *iclog, - uint flags, - int *record_cnt, - int *data_cnt, - int *partial_copy, - int *partial_copy_len, - int log_offset) -{ - int error; + /* If we wrote the whole region, move to the next. */ + if (rlen == reg->i_len) + continue; - if (*partial_copy) { /* - * This iclog has already been marked WANT_SYNC by - * xlog_state_get_iclog_space. + * We now have a partially written iovec, but it can span + * multiple iclogs so we loop here. First we release the iclog + * we currently have, then we get a new iclog and add a new + * opheader. Then we continue copying from where we were until + * we either complete the iovec or fill the iclog. If we + * complete the iovec, then we increment the index and go right + * back to the top of the outer loop. if we fill the iclog, we + * run the inner loop again. + * + * This is complicated by the tail of a region using all the + * space in an iclog and hence requiring us to release the iclog + * and get a new one before returning to the outer loop. We must + * always guarantee that we exit this inner loop with at least + * space for log transaction opheaders left in the current + * iclog, hence we cannot just terminate the loop at the end + * of the of the continuation. So we loop while there is no + * space left in the current iclog, and check for the end of the + * continuation after getting a new iclog. */ - spin_lock(&log->l_icloglock); - xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); - *record_cnt = 0; - *data_cnt = 0; - goto release_iclog; - } + do { + /* + * Ensure we include the continuation opheader in the + * space we need in the new iclog by adding that size + * to the length we require. This continuation opheader + * needs to be accounted to the ticket as the space it + * consumes hasn't been accounted to the lv we are + * writing. + */ + error = xlog_write_get_more_iclog_space(ticket, + &iclog, log_offset, + *len + sizeof(struct xlog_op_header), + record_cnt, data_cnt); + if (error) + return error; - *partial_copy = 0; - *partial_copy_len = 0; + ophdr = iclog->ic_datap + *log_offset; + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + ophdr->oh_clientid = XFS_TRANSACTION; + ophdr->oh_res2 = 0; + ophdr->oh_flags = XLOG_WAS_CONT_TRANS; - if (iclog->ic_size - log_offset > sizeof(xlog_op_header_t)) - return 0; + ticket->t_curr_res -= sizeof(struct xlog_op_header); + *log_offset += sizeof(struct xlog_op_header); + *data_cnt += sizeof(struct xlog_op_header); - /* no more space in this iclog - push it. */ - spin_lock(&log->l_icloglock); - xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); - *record_cnt = 0; - *data_cnt = 0; + /* + * If rlen fits in the iclog, then end the region + * continuation. Otherwise we're going around again. + */ + reg_offset += rlen; + rlen = reg->i_len - reg_offset; + if (rlen <= iclog->ic_size - *log_offset) + ophdr->oh_flags |= XLOG_END_TRANS; + else + ophdr->oh_flags |= XLOG_CONTINUE_TRANS; - if (iclog->ic_state == XLOG_STATE_ACTIVE) - xlog_state_switch_iclogs(log, iclog, 0); - else - ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || - xlog_is_shutdown(log)); -release_iclog: - error = xlog_state_release_iclog(log, iclog, 0); - spin_unlock(&log->l_icloglock); - return error; + rlen = min_t(uint32_t, rlen, iclog->ic_size - *log_offset); + ophdr->oh_len = cpu_to_be32(rlen); + + xlog_write_iovec(iclog, log_offset, + reg->i_addr + reg_offset, + rlen, len, record_cnt, data_cnt); + + } while (ophdr->oh_flags & XLOG_CONTINUE_TRANS); + } + + /* + * No more iovecs remain in this logvec so return the next log vec to + * the caller so it can go back to fast path copying. + */ + *iclogp = iclog; + return 0; } /* @@ -2461,175 +2492,73 @@ int xlog_write( struct xlog *log, struct xfs_cil_ctx *ctx, - struct xfs_log_vec *log_vector, + struct list_head *lv_chain, struct xlog_ticket *ticket, - uint optype) + uint32_t len) + { struct xlog_in_core *iclog = NULL; - struct xfs_log_vec *lv = log_vector; - struct xfs_log_iovec *vecp = lv->lv_iovecp; - int index = 0; - int len; - int partial_copy = 0; - int partial_copy_len = 0; - int contwr = 0; - int record_cnt = 0; - int data_cnt = 0; + struct xfs_log_vec *lv; + uint32_t record_cnt = 0; + uint32_t data_cnt = 0; int error = 0; + int log_offset; - /* - * If this is a commit or unmount transaction, we don't need a start - * record to be written. We do, however, have to account for the - * commit or unmount header that gets written. Hence we always have - * to account for an extra xlog_op_header here. - */ - ticket->t_curr_res -= sizeof(struct xlog_op_header); if (ticket->t_curr_res < 0) { xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, "ctx ticket reservation ran out. Need to up reservation"); xlog_print_tic_res(log->l_mp, ticket); - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } - len = xlog_write_calc_vec_length(ticket, log_vector, optype); - while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { - void *ptr; - int log_offset; - - error = xlog_state_get_iclog_space(log, len, &iclog, ticket, - &contwr, &log_offset); - if (error) - return error; + error = xlog_state_get_iclog_space(log, len, &iclog, ticket, + &log_offset); + if (error) + return error; - ASSERT(log_offset <= iclog->ic_size - 1); - ptr = iclog->ic_datap + log_offset; + ASSERT(log_offset <= iclog->ic_size - 1); - /* - * If we have a context pointer, pass it the first iclog we are - * writing to so it can record state needed for iclog write - * ordering. - */ - if (ctx) { - xlog_cil_set_ctx_write_state(ctx, iclog); - ctx = NULL; - } + /* + * If we have a context pointer, pass it the first iclog we are + * writing to so it can record state needed for iclog write + * ordering. + */ + if (ctx) + xlog_cil_set_ctx_write_state(ctx, iclog); + list_for_each_entry(lv, lv_chain, lv_list) { /* - * This loop writes out as many regions as can fit in the amount - * of space which was allocated by xlog_state_get_iclog_space(). + * If the entire log vec does not fit in the iclog, punt it to + * the partial copy loop which can handle this case. */ - while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { - struct xfs_log_iovec *reg; - struct xlog_op_header *ophdr; - int copy_len; - int copy_off; - bool ordered = false; - bool wrote_start_rec = false; - - /* ordered log vectors have no regions to write */ - if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) { - ASSERT(lv->lv_niovecs == 0); - ordered = true; - goto next_lv; - } - - reg = &vecp[index]; - ASSERT(reg->i_len % sizeof(int32_t) == 0); - ASSERT((unsigned long)ptr % sizeof(int32_t) == 0); - - /* - * Before we start formatting log vectors, we need to - * write a start record. Only do this for the first - * iclog we write to. - */ - if (optype & XLOG_START_TRANS) { - xlog_write_start_rec(ptr, ticket); - xlog_write_adv_cnt(&ptr, &len, &log_offset, - sizeof(struct xlog_op_header)); - optype &= ~XLOG_START_TRANS; - wrote_start_rec = true; - } - - ophdr = xlog_write_setup_ophdr(log, ptr, ticket, optype); - if (!ophdr) - return -EIO; - - xlog_write_adv_cnt(&ptr, &len, &log_offset, - sizeof(struct xlog_op_header)); - - len += xlog_write_setup_copy(ticket, ophdr, - iclog->ic_size-log_offset, - reg->i_len, - ©_off, ©_len, - &partial_copy, - &partial_copy_len); - xlog_verify_dest_ptr(log, ptr); - - /* - * Copy region. - * - * Unmount records just log an opheader, so can have - * empty payloads with no data region to copy. Hence we - * only copy the payload if the vector says it has data - * to copy. - */ - ASSERT(copy_len >= 0); - if (copy_len > 0) { - memcpy(ptr, reg->i_addr + copy_off, copy_len); - xlog_write_adv_cnt(&ptr, &len, &log_offset, - copy_len); - } - copy_len += sizeof(struct xlog_op_header); - record_cnt++; - if (wrote_start_rec) { - copy_len += sizeof(struct xlog_op_header); - record_cnt++; - } - data_cnt += contwr ? copy_len : 0; - - error = xlog_write_copy_finish(log, iclog, optype, - &record_cnt, &data_cnt, - &partial_copy, - &partial_copy_len, - log_offset); - if (error) + if (lv->lv_niovecs && + lv->lv_bytes > iclog->ic_size - log_offset) { + error = xlog_write_partial(lv, ticket, &iclog, + &log_offset, &len, &record_cnt, + &data_cnt); + if (error) { + /* + * We have no iclog to release, so just return + * the error immediately. + */ return error; - - /* - * if we had a partial copy, we need to get more iclog - * space but we don't want to increment the region - * index because there is still more is this region to - * write. - * - * If we completed writing this region, and we flushed - * the iclog (indicated by resetting of the record - * count), then we also need to get more log space. If - * this was the last record, though, we are done and - * can just return. - */ - if (partial_copy) - break; - - if (++index == lv->lv_niovecs) { -next_lv: - lv = lv->lv_next; - index = 0; - if (lv) - vecp = lv->lv_iovecp; - } - if (record_cnt == 0 && !ordered) { - if (!lv) - return 0; - break; } + } else { + xlog_write_full(lv, ticket, iclog, &log_offset, + &len, &record_cnt, &data_cnt); } } - ASSERT(len == 0); + /* + * We've already been guaranteed that the last writes will fit inside + * the current iclog, and hence it will already have the space used by + * those writes accounted to it. Hence we do not need to update the + * iclog with the number of bytes written here. + */ spin_lock(&log->l_icloglock); - xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); - error = xlog_state_release_iclog(log, iclog, 0); + xlog_state_finish_copy(log, iclog, record_cnt, 0); + error = xlog_state_release_iclog(log, iclog, ticket); spin_unlock(&log->l_icloglock); return error; @@ -2985,7 +2914,6 @@ xlog_state_get_iclog_space( int len, struct xlog_in_core **iclogp, struct xlog_ticket *ticket, - int *continued_write, int *logoffsetp) { int log_offset; @@ -3022,9 +2950,6 @@ restart: */ if (log_offset == 0) { ticket->t_curr_res -= log->l_iclog_hsize; - xlog_tic_add_region(ticket, - log->l_iclog_hsize, - XLOG_REG_TYPE_LRHEADER); head->h_cycle = cpu_to_be32(log->l_curr_cycle); head->h_lsn = cpu_to_be64( xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block)); @@ -3053,7 +2978,7 @@ restart: * reference to the iclog. */ if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) - error = xlog_state_release_iclog(log, iclog, 0); + error = xlog_state_release_iclog(log, iclog, ticket); spin_unlock(&log->l_icloglock); if (error) return error; @@ -3066,13 +2991,10 @@ restart: * iclogs (to mark it taken), this particular iclog will release/sync * to disk in xlog_write(). */ - if (len <= iclog->ic_size - iclog->ic_offset) { - *continued_write = 0; + if (len <= iclog->ic_size - iclog->ic_offset) iclog->ic_offset += len; - } else { - *continued_write = 1; + else xlog_state_switch_iclogs(log, iclog, iclog->ic_size); - } *iclogp = iclog; ASSERT(iclog->ic_offset <= iclog->ic_size); @@ -3104,7 +3026,6 @@ xfs_log_ticket_regrant( xlog_grant_sub_space(log, &log->l_write_head.grant, ticket->t_curr_res); ticket->t_curr_res = ticket->t_unit_res; - xlog_tic_reset_res(ticket); trace_xfs_log_ticket_regrant_sub(log, ticket); @@ -3115,7 +3036,6 @@ xfs_log_ticket_regrant( trace_xfs_log_ticket_regrant_exit(log, ticket); ticket->t_curr_res = ticket->t_unit_res; - xlog_tic_reset_res(ticket); } xfs_log_ticket_put(ticket); @@ -3506,7 +3426,8 @@ xfs_log_ticket_get( static int xlog_calc_unit_res( struct xlog *log, - int unit_bytes) + int unit_bytes, + int *niclogs) { int iclog_space; uint num_headers; @@ -3586,6 +3507,8 @@ xlog_calc_unit_res( /* roundoff padding for transaction data and one for commit record */ unit_bytes += 2 * log->l_iclog_roundoff; + if (niclogs) + *niclogs = num_headers; return unit_bytes; } @@ -3594,7 +3517,7 @@ xfs_log_calc_unit_res( struct xfs_mount *mp, int unit_bytes) { - return xlog_calc_unit_res(mp->m_log, unit_bytes); + return xlog_calc_unit_res(mp->m_log, unit_bytes, NULL); } /* @@ -3605,7 +3528,6 @@ xlog_ticket_alloc( struct xlog *log, int unit_bytes, int cnt, - char client, bool permanent) { struct xlog_ticket *tic; @@ -3613,7 +3535,7 @@ xlog_ticket_alloc( tic = kmem_cache_zalloc(xfs_log_ticket_cache, GFP_NOFS | __GFP_NOFAIL); - unit_res = xlog_calc_unit_res(log, unit_bytes); + unit_res = xlog_calc_unit_res(log, unit_bytes, &tic->t_iclog_hdrs); atomic_set(&tic->t_ref, 1); tic->t_task = current; @@ -3622,41 +3544,15 @@ xlog_ticket_alloc( tic->t_curr_res = unit_res; tic->t_cnt = cnt; tic->t_ocnt = cnt; - tic->t_tid = prandom_u32(); - tic->t_clientid = client; + tic->t_tid = get_random_u32(); if (permanent) tic->t_flags |= XLOG_TIC_PERM_RESERV; - xlog_tic_reset_res(tic); - return tic; } #if defined(DEBUG) /* - * Make sure that the destination ptr is within the valid data region of - * one of the iclogs. This uses backup pointers stored in a different - * part of the log in case we trash the log structure. - */ -STATIC void -xlog_verify_dest_ptr( - struct xlog *log, - void *ptr) -{ - int i; - int good_ptr = 0; - - for (i = 0; i < log->l_iclog_bufs; i++) { - if (ptr >= log->l_iclog_bak[i] && - ptr <= log->l_iclog_bak[i] + log->l_iclog_size) - good_ptr++; - } - - if (!good_ptr) - xfs_emerg(log->l_mp, "%s: invalid ptr", __func__); -} - -/* * Check to make sure the grant write head didn't just over lap the tail. If * the cycles are the same, we can't be overlapping. Otherwise, make sure that * the cycles differ by exactly one and check the byte count. @@ -3783,7 +3679,7 @@ xlog_verify_iclog( if (field_offset & 0x1ff) { clientid = ophead->oh_clientid; } else { - idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap); + idx = BTOBBT((void *)&ophead->oh_clientid - iclog->ic_datap); if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); @@ -3794,11 +3690,12 @@ xlog_verify_iclog( iclog->ic_header.h_cycle_data[idx]); } } - if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) + if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) { xfs_warn(log->l_mp, - "%s: invalid clientid %d op "PTR_FMT" offset 0x%lx", - __func__, clientid, ophead, + "%s: op %d invalid clientid %d op "PTR_FMT" offset 0x%lx", + __func__, i, clientid, ophead, (unsigned long)field_offset); + } /* check length */ p = &ophead->oh_len; @@ -3806,8 +3703,7 @@ xlog_verify_iclog( if (field_offset & 0x1ff) { op_len = be32_to_cpu(ophead->oh_len); } else { - idx = BTOBBT((uintptr_t)&ophead->oh_len - - (uintptr_t)iclog->ic_datap); + idx = BTOBBT((void *)&ophead->oh_len - iclog->ic_datap); if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); @@ -3822,9 +3718,10 @@ xlog_verify_iclog( #endif /* - * Perform a forced shutdown on the log. This should be called once and once - * only by the high level filesystem shutdown code to shut the log subsystem - * down cleanly. + * Perform a forced shutdown on the log. + * + * This can be called from low level log code to trigger a shutdown, or from the + * high level mount shutdown code when the mount shuts down. * * Our main objectives here are to make sure that: * a. if the shutdown was not due to a log IO error, flush the logs to @@ -3833,6 +3730,8 @@ xlog_verify_iclog( * parties to find out. Nothing new gets queued after this is done. * c. Tasks sleeping on log reservations, pinned objects and * other resources get woken up. + * d. The mount is also marked as shut down so that log triggered shutdowns + * still behave the same as if they called xfs_forced_shutdown(). * * Return true if the shutdown cause was a log IO error and we actually shut the * log down. @@ -3840,29 +3739,29 @@ xlog_verify_iclog( bool xlog_force_shutdown( struct xlog *log, - int shutdown_flags) + uint32_t shutdown_flags) { bool log_error = (shutdown_flags & SHUTDOWN_LOG_IO_ERROR); - /* - * If this happens during log recovery then we aren't using the runtime - * log mechanisms yet so there's nothing to shut down. - */ - if (!log || xlog_in_recovery(log)) + if (!log) return false; - ASSERT(!xlog_is_shutdown(log)); - /* * Flush all the completed transactions to disk before marking the log * being shut down. We need to do this first as shutting down the log * before the force will prevent the log force from flushing the iclogs * to disk. * - * Re-entry due to a log IO error shutdown during the log force is - * prevented by the atomicity of higher level shutdown code. + * When we are in recovery, there are no transactions to flush, and + * we don't want to touch the log because we don't want to perturb the + * current head/tail for future recovery attempts. Hence we need to + * avoid a log force in this case. + * + * If we are shutting down due to a log IO error, then we must avoid + * trying to write the log as that may just result in more IO errors and + * an endless shutdown/force loop. */ - if (!log_error) + if (!log_error && !xlog_in_recovery(log)) xfs_log_force(log->l_mp, XFS_LOG_SYNC); /* @@ -3879,12 +3778,25 @@ xlog_force_shutdown( spin_lock(&log->l_icloglock); if (test_and_set_bit(XLOG_IO_ERROR, &log->l_opstate)) { spin_unlock(&log->l_icloglock); - ASSERT(0); return false; } spin_unlock(&log->l_icloglock); /* + * If this log shutdown also sets the mount shutdown state, issue a + * shutdown warning message. + */ + if (!test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &log->l_mp->m_opstate)) { + xfs_alert_tag(log->l_mp, XFS_PTAG_SHUTDOWN_LOGERROR, +"Filesystem has been shut down due to log error (0x%x).", + shutdown_flags); + xfs_alert(log->l_mp, +"Please unmount the filesystem and rectify the problem(s)."); + if (xfs_error_level >= XFS_ERRLEVEL_HIGH) + xfs_stack_trace(); + } + + /* * We don't want anybody waiting for log reservations after this. That * means we have to wake up everybody queued up on reserveq as well as * writeq. In addition, we make sure in xlog_{re}grant_log_space that @@ -3904,8 +3816,12 @@ xlog_force_shutdown( wake_up_all(&log->l_cilp->xc_start_wait); wake_up_all(&log->l_cilp->xc_commit_wait); spin_unlock(&log->l_cilp->xc_push_lock); + + spin_lock(&log->l_icloglock); xlog_state_shutdown_callbacks(log); + spin_unlock(&log->l_icloglock); + wake_up_var(&log->l_opstate); return log_error; } diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index dc1b77b92fc1..2728886c2963 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -9,7 +9,8 @@ struct xfs_cil_ctx; struct xfs_log_vec { - struct xfs_log_vec *lv_next; /* next lv in build list */ + struct list_head lv_list; /* CIL lv chain ptrs */ + uint32_t lv_order_id; /* chain ordering info */ int lv_niovecs; /* number of iovecs in lv */ struct xfs_log_iovec *lv_iovecp; /* iovec array */ struct xfs_log_item *lv_item; /* owner */ @@ -21,46 +22,59 @@ struct xfs_log_vec { #define XFS_LOG_VEC_ORDERED (-1) -static inline void * -xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, - uint type) +/* + * Calculate the log iovec length for a given user buffer length. Intended to be + * used by ->iop_size implementations when sizing buffers of arbitrary + * alignments. + */ +static inline int +xlog_calc_iovec_len(int len) { - struct xfs_log_iovec *vec = *vecp; + return roundup(len, sizeof(uint32_t)); +} + +void *xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, + uint type); - if (vec) { - ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs); - vec++; - } else { - vec = &lv->lv_iovecp[0]; +static inline void +xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, + int data_len) +{ + struct xlog_op_header *oph = vec->i_addr; + int len; + + /* + * Always round up the length to the correct alignment so callers don't + * need to know anything about this log vec layout requirement. This + * means we have to zero the area the data to be written does not cover. + * This is complicated by fact the payload region is offset into the + * logvec region by the opheader that tracks the payload. + */ + len = xlog_calc_iovec_len(data_len); + if (len - data_len != 0) { + char *buf = vec->i_addr + sizeof(struct xlog_op_header); + + memset(buf + data_len, 0, len - data_len); } - vec->i_type = type; - vec->i_addr = lv->lv_buf + lv->lv_buf_len; + /* + * The opheader tracks aligned payload length, whilst the logvec tracks + * the overall region length. + */ + oph->oh_len = cpu_to_be32(len); - ASSERT(IS_ALIGNED((unsigned long)vec->i_addr, sizeof(uint64_t))); + len += sizeof(struct xlog_op_header); + lv->lv_buf_len += len; + lv->lv_bytes += len; + vec->i_len = len; - *vecp = vec; - return vec->i_addr; + /* Catch buffer overruns */ + ASSERT((void *)lv->lv_buf + lv->lv_bytes <= (void *)lv + lv->lv_size); } /* - * We need to make sure the next buffer is naturally aligned for the biggest - * basic data type we put into it. We already accounted for this padding when - * sizing the buffer. - * - * However, this padding does not get written into the log, and hence we have to - * track the space used by the log vectors separately to prevent log space hangs - * due to inaccurate accounting (i.e. a leak) of the used log space through the - * CIL context ticket. + * Copy the amount of data requested by the caller into a new log iovec. */ -static inline void -xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len) -{ - lv->lv_buf_len += round_up(len, sizeof(uint64_t)); - lv->lv_bytes += len; - vec->i_len = len; -} - static inline void * xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, uint type, void *data, int len) @@ -73,6 +87,13 @@ xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, return buf; } +static inline void * +xlog_copy_from_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, + const struct xfs_log_iovec *src) +{ + return xlog_copy_iovec(lv, vecp, src->i_type, src->i_addr, src->i_len); +} + /* * By comparing each component, we don't have to worry about extra * endian issues in treating two 32 bit numbers as one 64 bit number @@ -117,15 +138,11 @@ int xfs_log_mount_finish(struct xfs_mount *mp); void xfs_log_mount_cancel(struct xfs_mount *); xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); -void xfs_log_space_wake(struct xfs_mount *mp); -int xfs_log_reserve(struct xfs_mount *mp, - int length, - int count, - struct xlog_ticket **ticket, - uint8_t clientid, - bool permanent); -int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); -void xfs_log_unmount(struct xfs_mount *mp); +void xfs_log_space_wake(struct xfs_mount *mp); +int xfs_log_reserve(struct xfs_mount *mp, int length, int count, + struct xlog_ticket **ticket, bool permanent); +int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); +void xfs_log_unmount(struct xfs_mount *mp); bool xfs_log_writable(struct xfs_mount *mp); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); @@ -140,9 +157,10 @@ void xfs_log_clean(struct xfs_mount *mp); bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes); -bool xlog_force_shutdown(struct xlog *log, int shutdown_flags); +bool xlog_force_shutdown(struct xlog *log, uint32_t shutdown_flags); void xlog_use_incompat_feat(struct xlog *log); void xlog_drop_incompat_feat(struct xlog *log); +int xfs_attr_use_log_assist(struct xfs_mount *mp); #endif /* __XFS_LOG_H__ */ diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 6c93c8ada6f3..eccbfb99e894 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -37,16 +37,59 @@ xlog_cil_ticket_alloc( { struct xlog_ticket *tic; - tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0); + tic = xlog_ticket_alloc(log, 0, 1, 0); /* * set the current reservation to zero so we know to steal the basic * transaction overhead reservation from the first transaction commit. */ tic->t_curr_res = 0; + tic->t_iclog_hdrs = 0; return tic; } +static inline void +xlog_cil_set_iclog_hdr_count(struct xfs_cil *cil) +{ + struct xlog *log = cil->xc_log; + + atomic_set(&cil->xc_iclog_hdrs, + (XLOG_CIL_BLOCKING_SPACE_LIMIT(log) / + (log->l_iclog_size - log->l_iclog_hsize))); +} + +/* + * Check if the current log item was first committed in this sequence. + * We can't rely on just the log item being in the CIL, we have to check + * the recorded commit sequence number. + * + * Note: for this to be used in a non-racy manner, it has to be called with + * CIL flushing locked out. As a result, it should only be used during the + * transaction commit process when deciding what to format into the item. + */ +static bool +xlog_item_in_current_chkpt( + struct xfs_cil *cil, + struct xfs_log_item *lip) +{ + if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) + return false; + + /* + * li_seq is written on the first commit of a log item to record the + * first checkpoint it is written to. Hence if it is different to the + * current sequence, we're in a new checkpoint. + */ + return lip->li_seq == READ_ONCE(cil->xc_current_sequence); +} + +bool +xfs_log_item_in_current_chkpt( + struct xfs_log_item *lip) +{ + return xlog_item_in_current_chkpt(lip->li_log->l_cilp, lip); +} + /* * Unavoidable forward declaration - xlog_cil_push_work() calls * xlog_cil_ctx_alloc() itself. @@ -61,15 +104,88 @@ xlog_cil_ctx_alloc(void) ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS); INIT_LIST_HEAD(&ctx->committing); INIT_LIST_HEAD(&ctx->busy_extents); + INIT_LIST_HEAD(&ctx->log_items); + INIT_LIST_HEAD(&ctx->lv_chain); INIT_WORK(&ctx->push_work, xlog_cil_push_work); return ctx; } +/* + * Aggregate the CIL per cpu structures into global counts, lists, etc and + * clear the percpu state ready for the next context to use. This is called + * from the push code with the context lock held exclusively, hence nothing else + * will be accessing or modifying the per-cpu counters. + */ +static void +xlog_cil_push_pcp_aggregate( + struct xfs_cil *cil, + struct xfs_cil_ctx *ctx) +{ + struct xlog_cil_pcp *cilpcp; + int cpu; + + for_each_online_cpu(cpu) { + cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); + + ctx->ticket->t_curr_res += cilpcp->space_reserved; + cilpcp->space_reserved = 0; + + if (!list_empty(&cilpcp->busy_extents)) { + list_splice_init(&cilpcp->busy_extents, + &ctx->busy_extents); + } + if (!list_empty(&cilpcp->log_items)) + list_splice_init(&cilpcp->log_items, &ctx->log_items); + + /* + * We're in the middle of switching cil contexts. Reset the + * counter we use to detect when the current context is nearing + * full. + */ + cilpcp->space_used = 0; + } +} + +/* + * Aggregate the CIL per-cpu space used counters into the global atomic value. + * This is called when the per-cpu counter aggregation will first pass the soft + * limit threshold so we can switch to atomic counter aggregation for accurate + * detection of hard limit traversal. + */ +static void +xlog_cil_insert_pcp_aggregate( + struct xfs_cil *cil, + struct xfs_cil_ctx *ctx) +{ + struct xlog_cil_pcp *cilpcp; + int cpu; + int count = 0; + + /* Trigger atomic updates then aggregate only for the first caller */ + if (!test_and_clear_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags)) + return; + + for_each_online_cpu(cpu) { + int old, prev; + + cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); + do { + old = cilpcp->space_used; + prev = cmpxchg(&cilpcp->space_used, old, 0); + } while (old != prev); + count += old; + } + atomic_add(count, &ctx->space_used); +} + static void xlog_cil_ctx_switch( struct xfs_cil *cil, struct xfs_cil_ctx *ctx) { + xlog_cil_set_iclog_hdr_count(cil); + set_bit(XLOG_CIL_EMPTY, &cil->xc_flags); + set_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags); ctx->sequence = ++cil->xc_current_sequence; ctx->cil = cil; cil->xc_ctx = ctx; @@ -91,6 +207,7 @@ xlog_cil_init_post_recovery( { log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log); log->l_cilp->xc_ctx->sequence = 1; + xlog_cil_set_iclog_hdr_count(log->l_cilp); } static inline int @@ -181,13 +298,20 @@ xlog_cil_alloc_shadow_bufs( } /* - * We 64-bit align the length of each iovec so that the start - * of the next one is naturally aligned. We'll need to - * account for that slack space here. Then round nbytes up - * to 64-bit alignment so that the initial buffer alignment is - * easy to calculate and verify. + * We 64-bit align the length of each iovec so that the start of + * the next one is naturally aligned. We'll need to account for + * that slack space here. + * + * We also add the xlog_op_header to each region when + * formatting, but that's not accounted to the size of the item + * at this point. Hence we'll need an addition number of bytes + * for each vector to hold an opheader. + * + * Then round nbytes up to 64-bit alignment so that the initial + * buffer alignment is easy to calculate and verify. */ - nbytes += niovecs * sizeof(uint64_t); + nbytes += niovecs * + (sizeof(uint64_t) + sizeof(struct xlog_op_header)); nbytes = round_up(nbytes, sizeof(uint64_t)); /* @@ -203,27 +327,19 @@ xlog_cil_alloc_shadow_bufs( */ if (!lip->li_lv_shadow || buf_size > lip->li_lv_shadow->lv_size) { - /* * We free and allocate here as a realloc would copy - * unnecessary data. We don't use kmem_zalloc() for the + * unnecessary data. We don't use kvzalloc() for the * same reason - we don't need to zero the data area in * the buffer, only the log vector header and the iovec * storage. */ kmem_free(lip->li_lv_shadow); + lv = xlog_kvmalloc(buf_size); - /* - * We are in transaction context, which means this - * allocation will pick up GFP_NOFS from the - * memalloc_nofs_save/restore context the transaction - * holds. This means we can use GFP_KERNEL here so the - * generic kvmalloc() code will run vmalloc on - * contiguous page allocation failure as we require. - */ - lv = kvmalloc(buf_size, GFP_KERNEL); memset(lv, 0, xlog_cil_iovec_space(niovecs)); + INIT_LIST_HEAD(&lv->lv_list); lv->lv_item = lip; lv->lv_size = buf_size; if (ordered) @@ -239,7 +355,6 @@ xlog_cil_alloc_shadow_bufs( else lv->lv_buf_len = 0; lv->lv_bytes = 0; - lv->lv_next = NULL; } /* Ensure the lv is set up according to ->iop_size */ @@ -253,22 +368,18 @@ xlog_cil_alloc_shadow_bufs( /* * Prepare the log item for insertion into the CIL. Calculate the difference in - * log space and vectors it will consume, and if it is a new item pin it as - * well. + * log space it will consume, and if it is a new item pin it as well. */ STATIC void xfs_cil_prepare_item( struct xlog *log, struct xfs_log_vec *lv, struct xfs_log_vec *old_lv, - int *diff_len, - int *diff_iovecs) + int *diff_len) { /* Account for the new LV being passed in */ - if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) { + if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) *diff_len += lv->lv_bytes; - *diff_iovecs += lv->lv_niovecs; - } /* * If there is no old LV, this is the first time we've seen the item in @@ -285,7 +396,6 @@ xfs_cil_prepare_item( ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED); *diff_len -= old_lv->lv_bytes; - *diff_iovecs -= old_lv->lv_niovecs; lv->lv_item->li_lv_shadow = old_lv; } @@ -334,12 +444,10 @@ static void xlog_cil_insert_format_items( struct xlog *log, struct xfs_trans *tp, - int *diff_len, - int *diff_iovecs) + int *diff_len) { struct xfs_log_item *lip; - /* Bail out if we didn't find a log item. */ if (list_empty(&tp->t_items)) { ASSERT(0); @@ -373,7 +481,6 @@ xlog_cil_insert_format_items( if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) { /* same or smaller, optimise common overwrite case */ lv = lip->li_lv; - lv->lv_next = NULL; if (ordered) goto insert; @@ -382,7 +489,6 @@ xlog_cil_insert_format_items( * set the item up as though it is a new insertion so * that the space reservation accounting is correct. */ - *diff_iovecs -= lv->lv_niovecs; *diff_len -= lv->lv_bytes; /* Ensure the lv is set up according to ->iop_size */ @@ -407,11 +513,28 @@ xlog_cil_insert_format_items( ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t))); lip->li_ops->iop_format(lip, lv); insert: - xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs); + xfs_cil_prepare_item(log, lv, old_lv, diff_len); } } /* + * The use of lockless waitqueue_active() requires that the caller has + * serialised itself against the wakeup call in xlog_cil_push_work(). That + * can be done by either holding the push lock or the context lock. + */ +static inline bool +xlog_cil_over_hard_limit( + struct xlog *log, + int32_t space_used) +{ + if (waitqueue_active(&log->l_cilp->xc_push_wait)) + return true; + if (space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) + return true; + return false; +} + +/* * Insert the log items into the CIL and calculate the difference in space * consumed by the item. Add the space to the checkpoint ticket and calculate * if the change requires additional log metadata. If it does, take that space @@ -421,15 +544,17 @@ insert: static void xlog_cil_insert_items( struct xlog *log, - struct xfs_trans *tp) + struct xfs_trans *tp, + uint32_t released_space) { struct xfs_cil *cil = log->l_cilp; struct xfs_cil_ctx *ctx = cil->xc_ctx; struct xfs_log_item *lip; int len = 0; - int diff_iovecs = 0; - int iclog_space; int iovhdr_res = 0, split_res = 0, ctx_res = 0; + int space_used; + int order; + struct xlog_cil_pcp *cilpcp; ASSERT(tp); @@ -437,51 +562,114 @@ xlog_cil_insert_items( * We can do this safely because the context can't checkpoint until we * are done so it doesn't matter exactly how we update the CIL. */ - xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs); + xlog_cil_insert_format_items(log, tp, &len); + + /* + * Subtract the space released by intent cancelation from the space we + * consumed so that we remove it from the CIL space and add it back to + * the current transaction reservation context. + */ + len -= released_space; + + /* + * Grab the per-cpu pointer for the CIL before we start any accounting. + * That ensures that we are running with pre-emption disabled and so we + * can't be scheduled away between split sample/update operations that + * are done without outside locking to serialise them. + */ + cilpcp = get_cpu_ptr(cil->xc_pcp); + + /* + * We need to take the CIL checkpoint unit reservation on the first + * commit into the CIL. Test the XLOG_CIL_EMPTY bit first so we don't + * unnecessarily do an atomic op in the fast path here. We can clear the + * XLOG_CIL_EMPTY bit as we are under the xc_ctx_lock here and that + * needs to be held exclusively to reset the XLOG_CIL_EMPTY bit. + */ + if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags) && + test_and_clear_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) + ctx_res = ctx->ticket->t_unit_res; - spin_lock(&cil->xc_cil_lock); + /* + * Check if we need to steal iclog headers. atomic_read() is not a + * locked atomic operation, so we can check the value before we do any + * real atomic ops in the fast path. If we've already taken the CIL unit + * reservation from this commit, we've already got one iclog header + * space reserved so we have to account for that otherwise we risk + * overrunning the reservation on this ticket. + * + * If the CIL is already at the hard limit, we might need more header + * space that originally reserved. So steal more header space from every + * commit that occurs once we are over the hard limit to ensure the CIL + * push won't run out of reservation space. + * + * This can steal more than we need, but that's OK. + * + * The cil->xc_ctx_lock provides the serialisation necessary for safely + * calling xlog_cil_over_hard_limit() in this context. + */ + space_used = atomic_read(&ctx->space_used) + cilpcp->space_used + len; + if (atomic_read(&cil->xc_iclog_hdrs) > 0 || + xlog_cil_over_hard_limit(log, space_used)) { + split_res = log->l_iclog_hsize + + sizeof(struct xlog_op_header); + if (ctx_res) + ctx_res += split_res * (tp->t_ticket->t_iclog_hdrs - 1); + else + ctx_res = split_res * tp->t_ticket->t_iclog_hdrs; + atomic_sub(tp->t_ticket->t_iclog_hdrs, &cil->xc_iclog_hdrs); + } + cilpcp->space_reserved += ctx_res; - /* account for space used by new iovec headers */ - iovhdr_res = diff_iovecs * sizeof(xlog_op_header_t); - len += iovhdr_res; - ctx->nvecs += diff_iovecs; + /* + * Accurately account when over the soft limit, otherwise fold the + * percpu count into the global count if over the per-cpu threshold. + */ + if (!test_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags)) { + atomic_add(len, &ctx->space_used); + } else if (cilpcp->space_used + len > + (XLOG_CIL_SPACE_LIMIT(log) / num_online_cpus())) { + space_used = atomic_add_return(cilpcp->space_used + len, + &ctx->space_used); + cilpcp->space_used = 0; + /* + * If we just transitioned over the soft limit, we need to + * transition to the global atomic counter. + */ + if (space_used >= XLOG_CIL_SPACE_LIMIT(log)) + xlog_cil_insert_pcp_aggregate(cil, ctx); + } else { + cilpcp->space_used += len; + } /* attach the transaction to the CIL if it has any busy extents */ if (!list_empty(&tp->t_busy)) - list_splice_init(&tp->t_busy, &ctx->busy_extents); + list_splice_init(&tp->t_busy, &cilpcp->busy_extents); /* - * Now transfer enough transaction reservation to the context ticket - * for the checkpoint. The context ticket is special - the unit - * reservation has to grow as well as the current reservation as we - * steal from tickets so we can correctly determine the space used - * during the transaction commit. + * Now update the order of everything modified in the transaction + * and insert items into the CIL if they aren't already there. + * We do this here so we only need to take the CIL lock once during + * the transaction commit. */ - if (ctx->ticket->t_curr_res == 0) { - ctx_res = ctx->ticket->t_unit_res; - ctx->ticket->t_curr_res = ctx_res; - tp->t_ticket->t_curr_res -= ctx_res; - } + order = atomic_inc_return(&ctx->order_id); + list_for_each_entry(lip, &tp->t_items, li_trans) { + /* Skip items which aren't dirty in this transaction. */ + if (!test_bit(XFS_LI_DIRTY, &lip->li_flags)) + continue; - /* do we need space for more log record headers? */ - iclog_space = log->l_iclog_size - log->l_iclog_hsize; - if (len > 0 && (ctx->space_used / iclog_space != - (ctx->space_used + len) / iclog_space)) { - split_res = (len + iclog_space - 1) / iclog_space; - /* need to take into account split region headers, too */ - split_res *= log->l_iclog_hsize + sizeof(struct xlog_op_header); - ctx->ticket->t_unit_res += split_res; - ctx->ticket->t_curr_res += split_res; - tp->t_ticket->t_curr_res -= split_res; - ASSERT(tp->t_ticket->t_curr_res >= len); + lip->li_order_id = order; + if (!list_empty(&lip->li_cil)) + continue; + list_add_tail(&lip->li_cil, &cilpcp->log_items); } - tp->t_ticket->t_curr_res -= len; - ctx->space_used += len; + put_cpu_ptr(cilpcp); /* * If we've overrun the reservation, dump the tx details before we move * the log items. Shutdown is imminent... */ + tp->t_ticket->t_curr_res -= ctx_res + len; if (WARN_ON(tp->t_ticket->t_curr_res < 0)) { xfs_warn(log->l_mp, "Transaction log reservation overrun:"); xfs_warn(log->l_mp, @@ -491,44 +679,20 @@ xlog_cil_insert_items( split_res); xfs_warn(log->l_mp, " ctx ticket: %d bytes", ctx_res); xlog_print_trans(tp); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } - - /* - * Now (re-)position everything modified at the tail of the CIL. - * We do this here so we only need to take the CIL lock once during - * the transaction commit. - */ - list_for_each_entry(lip, &tp->t_items, li_trans) { - - /* Skip items which aren't dirty in this transaction. */ - if (!test_bit(XFS_LI_DIRTY, &lip->li_flags)) - continue; - - /* - * Only move the item if it isn't already at the tail. This is - * to prevent a transient list_empty() state when reinserting - * an item that is already the only item in the CIL. - */ - if (!list_is_last(&lip->li_cil, &cil->xc_cil)) - list_move_tail(&lip->li_cil, &cil->xc_cil); - } - - spin_unlock(&cil->xc_cil_lock); - - if (tp->t_ticket->t_curr_res < 0) - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); } static void xlog_cil_free_logvec( - struct xfs_log_vec *log_vector) + struct list_head *lv_chain) { struct xfs_log_vec *lv; - for (lv = log_vector; lv; ) { - struct xfs_log_vec *next = lv->lv_next; + while (!list_empty(lv_chain)) { + lv = list_first_entry(lv_chain, struct xfs_log_vec, lv_list); + list_del_init(&lv->lv_list); kmem_free(lv); - lv = next; } } @@ -581,7 +745,7 @@ xlog_discard_busy_extents( error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), XFS_FSB_TO_BB(mp, busyp->length), - GFP_NOFS, 0, &bio); + GFP_NOFS, &bio); if (error && error != -EOPNOTSUPP) { xfs_info(mp, "discard failed for extent [0x%llx,%u], error %d", @@ -628,7 +792,7 @@ xlog_cil_committed( spin_unlock(&ctx->cil->xc_push_lock); } - xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, + xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, &ctx->lv_chain, ctx->start_lsn, abort); xfs_extent_busy_sort(&ctx->busy_extents); @@ -639,7 +803,7 @@ xlog_cil_committed( list_del(&ctx->committing); spin_unlock(&ctx->cil->xc_push_lock); - xlog_cil_free_logvec(ctx->lv_chain); + xlog_cil_free_logvec(&ctx->lv_chain); if (!list_empty(&ctx->busy_extents)) xlog_discard_busy_extents(mp, ctx); @@ -681,11 +845,21 @@ xlog_cil_set_ctx_write_state( * The LSN we need to pass to the log items on transaction * commit is the LSN reported by the first log vector write, not * the commit lsn. If we use the commit record lsn then we can - * move the tail beyond the grant write head. + * move the grant write head beyond the tail LSN and overwrite + * it. */ ctx->start_lsn = lsn; wake_up_all(&cil->xc_start_wait); spin_unlock(&cil->xc_push_lock); + + /* + * Make sure the metadata we are about to overwrite in the log + * has been flushed to stable storage before this iclog is + * issued. + */ + spin_lock(&cil->xc_log->l_icloglock); + iclog->ic_flags |= XLOG_ICL_NEED_FLUSH; + spin_unlock(&cil->xc_log->l_icloglock); return; } @@ -788,7 +962,7 @@ restart: static int xlog_cil_write_chain( struct xfs_cil_ctx *ctx, - struct xfs_log_vec *chain) + uint32_t chain_len) { struct xlog *log = ctx->cil->xc_log; int error; @@ -796,7 +970,7 @@ xlog_cil_write_chain( error = xlog_cil_order_write(ctx->cil, ctx->sequence, _START_RECORD); if (error) return error; - return xlog_write(log, ctx, chain, ctx->ticket, XLOG_START_TRANS); + return xlog_write(log, ctx, &ctx->lv_chain, ctx->ticket, chain_len); } /* @@ -810,9 +984,14 @@ xlog_cil_write_commit_record( struct xfs_cil_ctx *ctx) { struct xlog *log = ctx->cil->xc_log; + struct xlog_op_header ophdr = { + .oh_clientid = XFS_TRANSACTION, + .oh_tid = cpu_to_be32(ctx->ticket->t_tid), + .oh_flags = XLOG_COMMIT_TRANS, + }; struct xfs_log_iovec reg = { - .i_addr = NULL, - .i_len = 0, + .i_addr = &ophdr, + .i_len = sizeof(struct xlog_op_header), .i_type = XLOG_REG_TYPE_COMMIT, }; struct xfs_log_vec vec = { @@ -820,6 +999,8 @@ xlog_cil_write_commit_record( .lv_iovecp = ®, }; int error; + LIST_HEAD(lv_chain); + list_add(&vec.lv_list, &lv_chain); if (xlog_is_shutdown(log)) return -EIO; @@ -828,12 +1009,155 @@ xlog_cil_write_commit_record( if (error) return error; - error = xlog_write(log, ctx, &vec, ctx->ticket, XLOG_COMMIT_TRANS); + /* account for space used by record data */ + ctx->ticket->t_curr_res -= reg.i_len; + error = xlog_write(log, ctx, &lv_chain, ctx->ticket, reg.i_len); if (error) - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); return error; } +struct xlog_cil_trans_hdr { + struct xlog_op_header oph[2]; + struct xfs_trans_header thdr; + struct xfs_log_iovec lhdr[2]; +}; + +/* + * Build a checkpoint transaction header to begin the journal transaction. We + * need to account for the space used by the transaction header here as it is + * not accounted for in xlog_write(). + * + * This is the only place we write a transaction header, so we also build the + * log opheaders that indicate the start of a log transaction and wrap the + * transaction header. We keep the start record in it's own log vector rather + * than compacting them into a single region as this ends up making the logic + * in xlog_write() for handling empty opheaders for start, commit and unmount + * records much simpler. + */ +static void +xlog_cil_build_trans_hdr( + struct xfs_cil_ctx *ctx, + struct xlog_cil_trans_hdr *hdr, + struct xfs_log_vec *lvhdr, + int num_iovecs) +{ + struct xlog_ticket *tic = ctx->ticket; + __be32 tid = cpu_to_be32(tic->t_tid); + + memset(hdr, 0, sizeof(*hdr)); + + /* Log start record */ + hdr->oph[0].oh_tid = tid; + hdr->oph[0].oh_clientid = XFS_TRANSACTION; + hdr->oph[0].oh_flags = XLOG_START_TRANS; + + /* log iovec region pointer */ + hdr->lhdr[0].i_addr = &hdr->oph[0]; + hdr->lhdr[0].i_len = sizeof(struct xlog_op_header); + hdr->lhdr[0].i_type = XLOG_REG_TYPE_LRHEADER; + + /* log opheader */ + hdr->oph[1].oh_tid = tid; + hdr->oph[1].oh_clientid = XFS_TRANSACTION; + hdr->oph[1].oh_len = cpu_to_be32(sizeof(struct xfs_trans_header)); + + /* transaction header in host byte order format */ + hdr->thdr.th_magic = XFS_TRANS_HEADER_MAGIC; + hdr->thdr.th_type = XFS_TRANS_CHECKPOINT; + hdr->thdr.th_tid = tic->t_tid; + hdr->thdr.th_num_items = num_iovecs; + + /* log iovec region pointer */ + hdr->lhdr[1].i_addr = &hdr->oph[1]; + hdr->lhdr[1].i_len = sizeof(struct xlog_op_header) + + sizeof(struct xfs_trans_header); + hdr->lhdr[1].i_type = XLOG_REG_TYPE_TRANSHDR; + + lvhdr->lv_niovecs = 2; + lvhdr->lv_iovecp = &hdr->lhdr[0]; + lvhdr->lv_bytes = hdr->lhdr[0].i_len + hdr->lhdr[1].i_len; + + tic->t_curr_res -= lvhdr->lv_bytes; +} + +/* + * CIL item reordering compare function. We want to order in ascending ID order, + * but we want to leave items with the same ID in the order they were added to + * the list. This is important for operations like reflink where we log 4 order + * dependent intents in a single transaction when we overwrite an existing + * shared extent with a new shared extent. i.e. BUI(unmap), CUI(drop), + * CUI (inc), BUI(remap)... + */ +static int +xlog_cil_order_cmp( + void *priv, + const struct list_head *a, + const struct list_head *b) +{ + struct xfs_log_vec *l1 = container_of(a, struct xfs_log_vec, lv_list); + struct xfs_log_vec *l2 = container_of(b, struct xfs_log_vec, lv_list); + + return l1->lv_order_id > l2->lv_order_id; +} + +/* + * Pull all the log vectors off the items in the CIL, and remove the items from + * the CIL. We don't need the CIL lock here because it's only needed on the + * transaction commit side which is currently locked out by the flush lock. + * + * If a log item is marked with a whiteout, we do not need to write it to the + * journal and so we just move them to the whiteout list for the caller to + * dispose of appropriately. + */ +static void +xlog_cil_build_lv_chain( + struct xfs_cil_ctx *ctx, + struct list_head *whiteouts, + uint32_t *num_iovecs, + uint32_t *num_bytes) +{ + while (!list_empty(&ctx->log_items)) { + struct xfs_log_item *item; + struct xfs_log_vec *lv; + + item = list_first_entry(&ctx->log_items, + struct xfs_log_item, li_cil); + + if (test_bit(XFS_LI_WHITEOUT, &item->li_flags)) { + list_move(&item->li_cil, whiteouts); + trace_xfs_cil_whiteout_skip(item); + continue; + } + + lv = item->li_lv; + lv->lv_order_id = item->li_order_id; + + /* we don't write ordered log vectors */ + if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) + *num_bytes += lv->lv_bytes; + *num_iovecs += lv->lv_niovecs; + list_add_tail(&lv->lv_list, &ctx->lv_chain); + + list_del_init(&item->li_cil); + item->li_order_id = 0; + item->li_lv = NULL; + } +} + +static void +xlog_cil_cleanup_whiteouts( + struct list_head *whiteouts) +{ + while (!list_empty(whiteouts)) { + struct xfs_log_item *item = list_first_entry(whiteouts, + struct xfs_log_item, li_cil); + list_del_init(&item->li_cil); + trace_xfs_cil_whiteout_unpin(item); + item->li_ops->iop_unpin(item, 1); + } +} + /* * Push the Committed Item List to the log. * @@ -856,19 +1180,16 @@ xlog_cil_push_work( container_of(work, struct xfs_cil_ctx, push_work); struct xfs_cil *cil = ctx->cil; struct xlog *log = cil->xc_log; - struct xfs_log_vec *lv; struct xfs_cil_ctx *new_ctx; - struct xlog_ticket *tic; - int num_iovecs; + int num_iovecs = 0; + int num_bytes = 0; int error = 0; - struct xfs_trans_header thdr; - struct xfs_log_iovec lhdr; - struct xfs_log_vec lvhdr = { NULL }; - xfs_lsn_t preflush_tail_lsn; + struct xlog_cil_trans_hdr thdr; + struct xfs_log_vec lvhdr = {}; xfs_csn_t push_seq; - struct bio bio; - DECLARE_COMPLETION_ONSTACK(bdev_flush); bool push_commit_stable; + LIST_HEAD (whiteouts); + struct xlog_ticket *ticket; new_ctx = xlog_cil_ctx_alloc(); new_ctx->ticket = xlog_cil_ticket_alloc(log); @@ -892,12 +1213,14 @@ xlog_cil_push_work( if (waitqueue_active(&cil->xc_push_wait)) wake_up_all(&cil->xc_push_wait); + xlog_cil_push_pcp_aggregate(cil, ctx); + /* * Check if we've anything to push. If there is nothing, then we don't * move on to a new sequence number and so we have to be able to push * this sequence again later. */ - if (list_empty(&cil->xc_cil)) { + if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) { cil->xc_push_seq = 0; spin_unlock(&cil->xc_push_lock); goto out_skip; @@ -937,45 +1260,7 @@ xlog_cil_push_work( list_add(&ctx->committing, &cil->xc_committing); spin_unlock(&cil->xc_push_lock); - /* - * The CIL is stable at this point - nothing new will be added to it - * because we hold the flush lock exclusively. Hence we can now issue - * a cache flush to ensure all the completed metadata in the journal we - * are about to overwrite is on stable storage. - * - * Because we are issuing this cache flush before we've written the - * tail lsn to the iclog, we can have metadata IO completions move the - * tail forwards between the completion of this flush and the iclog - * being written. In this case, we need to re-issue the cache flush - * before the iclog write. To detect whether the log tail moves, sample - * the tail LSN *before* we issue the flush. - */ - preflush_tail_lsn = atomic64_read(&log->l_tail_lsn); - xfs_flush_bdev_async(&bio, log->l_mp->m_ddev_targp->bt_bdev, - &bdev_flush); - - /* - * Pull all the log vectors off the items in the CIL, and remove the - * items from the CIL. We don't need the CIL lock here because it's only - * needed on the transaction commit side which is currently locked out - * by the flush lock. - */ - lv = NULL; - num_iovecs = 0; - while (!list_empty(&cil->xc_cil)) { - struct xfs_log_item *item; - - item = list_first_entry(&cil->xc_cil, - struct xfs_log_item, li_cil); - list_del_init(&item->li_cil); - if (!ctx->lv_chain) - ctx->lv_chain = item->li_lv; - else - lv->lv_next = item->li_lv; - lv = item->li_lv; - item->li_lv = NULL; - num_iovecs += lv->lv_niovecs; - } + xlog_cil_build_lv_chain(ctx, &whiteouts, &num_iovecs, &num_bytes); /* * Switch the contexts so we can drop the context lock and move out @@ -1008,35 +1293,30 @@ xlog_cil_push_work( up_write(&cil->xc_ctx_lock); /* + * Sort the log vector chain before we add the transaction headers. + * This ensures we always have the transaction headers at the start + * of the chain. + */ + list_sort(NULL, &ctx->lv_chain, xlog_cil_order_cmp); + + /* * Build a checkpoint transaction header and write it to the log to * begin the transaction. We need to account for the space used by the * transaction header here as it is not accounted for in xlog_write(). - * - * The LSN we need to pass to the log items on transaction commit is - * the LSN reported by the first log vector write. If we use the commit - * record lsn then we can move the tail beyond the grant write head. + * Add the lvhdr to the head of the lv chain we pass to xlog_write() so + * it gets written into the iclog first. */ - tic = ctx->ticket; - thdr.th_magic = XFS_TRANS_HEADER_MAGIC; - thdr.th_type = XFS_TRANS_CHECKPOINT; - thdr.th_tid = tic->t_tid; - thdr.th_num_items = num_iovecs; - lhdr.i_addr = &thdr; - lhdr.i_len = sizeof(xfs_trans_header_t); - lhdr.i_type = XLOG_REG_TYPE_TRANSHDR; - tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t); - - lvhdr.lv_niovecs = 1; - lvhdr.lv_iovecp = &lhdr; - lvhdr.lv_next = ctx->lv_chain; + xlog_cil_build_trans_hdr(ctx, &thdr, &lvhdr, num_iovecs); + num_bytes += lvhdr.lv_bytes; + list_add(&lvhdr.lv_list, &ctx->lv_chain); /* - * Before we format and submit the first iclog, we have to ensure that - * the metadata writeback ordering cache flush is complete. + * Take the lvhdr back off the lv_chain immediately after calling + * xlog_cil_write_chain() as it should not be passed to log IO + * completion. */ - wait_for_completion(&bdev_flush); - - error = xlog_cil_write_chain(ctx, &lvhdr); + error = xlog_cil_write_chain(ctx, num_bytes); + list_del(&lvhdr.lv_list); if (error) goto out_abort_free_ticket; @@ -1044,7 +1324,14 @@ xlog_cil_push_work( if (error) goto out_abort_free_ticket; - xfs_log_ticket_ungrant(log, tic); + /* + * Grab the ticket from the ctx so we can ungrant it after releasing the + * commit_iclog. The ctx may be freed by the time we return from + * releasing the commit_iclog (i.e. checkpoint has been completed and + * callback run) so we can't reference the ctx after the call to + * xlog_state_release_iclog(). + */ + ticket = ctx->ticket; /* * If the checkpoint spans multiple iclogs, wait for all previous iclogs @@ -1094,11 +1381,14 @@ xlog_cil_push_work( if (push_commit_stable && ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE) xlog_state_switch_iclogs(log, ctx->commit_iclog, 0); - xlog_state_release_iclog(log, ctx->commit_iclog, preflush_tail_lsn); + ticket = ctx->ticket; + xlog_state_release_iclog(log, ctx->commit_iclog, ticket); /* Not safe to reference ctx now! */ spin_unlock(&log->l_icloglock); + xlog_cil_cleanup_whiteouts(&whiteouts); + xfs_log_ticket_ungrant(log, ticket); return; out_skip: @@ -1108,16 +1398,19 @@ out_skip: return; out_abort_free_ticket: - xfs_log_ticket_ungrant(log, tic); ASSERT(xlog_is_shutdown(log)); + xlog_cil_cleanup_whiteouts(&whiteouts); if (!ctx->commit_iclog) { + xfs_log_ticket_ungrant(log, ctx->ticket); xlog_cil_committed(ctx); return; } spin_lock(&log->l_icloglock); - xlog_state_release_iclog(log, ctx->commit_iclog, 0); + ticket = ctx->ticket; + xlog_state_release_iclog(log, ctx->commit_iclog, ticket); /* Not safe to reference ctx now! */ spin_unlock(&log->l_icloglock); + xfs_log_ticket_ungrant(log, ticket); } /* @@ -1132,18 +1425,27 @@ xlog_cil_push_background( struct xlog *log) __releases(cil->xc_ctx_lock) { struct xfs_cil *cil = log->l_cilp; + int space_used = atomic_read(&cil->xc_ctx->space_used); /* * The cil won't be empty because we are called while holding the - * context lock so whatever we added to the CIL will still be there + * context lock so whatever we added to the CIL will still be there. */ - ASSERT(!list_empty(&cil->xc_cil)); + ASSERT(!test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)); /* - * Don't do a background push if we haven't used up all the - * space available yet. + * We are done if: + * - we haven't used up all the space available yet; or + * - we've already queued up a push; and + * - we're not over the hard limit; and + * - nothing has been over the hard limit. + * + * If so, we don't need to take the push lock as there's nothing to do. */ - if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) { + if (space_used < XLOG_CIL_SPACE_LIMIT(log) || + (cil->xc_push_seq == cil->xc_current_sequence && + space_used < XLOG_CIL_BLOCKING_SPACE_LIMIT(log) && + !waitqueue_active(&cil->xc_push_wait))) { up_read(&cil->xc_ctx_lock); return; } @@ -1170,12 +1472,11 @@ xlog_cil_push_background( * dipping back down under the hard limit. * * The ctx->xc_push_lock provides the serialisation necessary for safely - * using the lockless waitqueue_active() check in this context. + * calling xlog_cil_over_hard_limit() in this context. */ - if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log) || - waitqueue_active(&cil->xc_push_wait)) { + if (xlog_cil_over_hard_limit(log, space_used)) { trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket); - ASSERT(cil->xc_ctx->space_used < log->l_logsize); + ASSERT(space_used < log->l_logsize); xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock); return; } @@ -1219,18 +1520,28 @@ xlog_cil_push_now( if (!async) flush_workqueue(cil->xc_push_wq); + spin_lock(&cil->xc_push_lock); + + /* + * If this is an async flush request, we always need to set the + * xc_push_commit_stable flag even if something else has already queued + * a push. The flush caller is asking for the CIL to be on stable + * storage when the next push completes, so regardless of who has queued + * the push, the flush requires stable semantics from it. + */ + cil->xc_push_commit_stable = async; + /* * If the CIL is empty or we've already pushed the sequence then - * there's no work we need to do. + * there's no more work that we need to do. */ - spin_lock(&cil->xc_push_lock); - if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) { + if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags) || + push_seq <= cil->xc_push_seq) { spin_unlock(&cil->xc_push_lock); return; } cil->xc_push_seq = push_seq; - cil->xc_push_commit_stable = async; queue_work(cil->xc_push_wq, &cil->xc_ctx->push_work); spin_unlock(&cil->xc_push_lock); } @@ -1243,13 +1554,50 @@ xlog_cil_empty( bool empty = false; spin_lock(&cil->xc_push_lock); - if (list_empty(&cil->xc_cil)) + if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) empty = true; spin_unlock(&cil->xc_push_lock); return empty; } /* + * If there are intent done items in this transaction and the related intent was + * committed in the current (same) CIL checkpoint, we don't need to write either + * the intent or intent done item to the journal as the change will be + * journalled atomically within this checkpoint. As we cannot remove items from + * the CIL here, mark the related intent with a whiteout so that the CIL push + * can remove it rather than writing it to the journal. Then remove the intent + * done item from the current transaction and release it so it doesn't get put + * into the CIL at all. + */ +static uint32_t +xlog_cil_process_intents( + struct xfs_cil *cil, + struct xfs_trans *tp) +{ + struct xfs_log_item *lip, *ilip, *next; + uint32_t len = 0; + + list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) { + if (!(lip->li_ops->flags & XFS_ITEM_INTENT_DONE)) + continue; + + ilip = lip->li_ops->iop_intent(lip); + if (!ilip || !xlog_item_in_current_chkpt(cil, ilip)) + continue; + set_bit(XFS_LI_WHITEOUT, &ilip->li_flags); + trace_xfs_cil_whiteout_mark(ilip); + len += ilip->li_lv->lv_bytes; + kmem_free(ilip->li_lv); + ilip->li_lv = NULL; + + xfs_trans_del_item(lip); + lip->li_ops->iop_release(lip); + } + return len; +} + +/* * Commit a transaction with the given vector to the Committed Item List. * * To do this, we need to format the item, pin it in memory if required and @@ -1271,6 +1619,7 @@ xlog_cil_commit( { struct xfs_cil *cil = log->l_cilp; struct xfs_log_item *lip, *next; + uint32_t released_space = 0; /* * Do all necessary memory allocation before we lock the CIL. @@ -1282,7 +1631,10 @@ xlog_cil_commit( /* lock out background commit */ down_read(&cil->xc_ctx_lock); - xlog_cil_insert_items(log, tp); + if (tp->t_flags & XFS_TRANS_HAS_INTENT_DONE) + released_space = xlog_cil_process_intents(cil, tp); + + xlog_cil_insert_items(log, tp, released_space); if (regrant && !xlog_is_shutdown(log)) xfs_log_ticket_regrant(log, tp->t_ticket); @@ -1328,6 +1680,13 @@ xlog_cil_flush( trace_xfs_log_force(log->l_mp, seq, _RET_IP_); xlog_cil_push_now(log, seq, true); + + /* + * If the CIL is empty, make sure that any previous checkpoint that may + * still be in an active iclog is pushed to stable storage. + */ + if (test_bit(XLOG_CIL_EMPTY, &log->l_cilp->xc_flags)) + xfs_log_force(log->l_mp, 0); } /* @@ -1411,7 +1770,7 @@ restart: * we would have found the context on the committing list. */ if (sequence == cil->xc_current_sequence && - !list_empty(&cil->xc_cil)) { + !test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) { spin_unlock(&cil->xc_push_lock); goto restart; } @@ -1432,29 +1791,35 @@ out_shutdown: } /* - * Check if the current log item was first committed in this sequence. - * We can't rely on just the log item being in the CIL, we have to check - * the recorded commit sequence number. + * Move dead percpu state to the relevant CIL context structures. * - * Note: for this to be used in a non-racy manner, it has to be called with - * CIL flushing locked out. As a result, it should only be used during the - * transaction commit process when deciding what to format into the item. + * We have to lock the CIL context here to ensure that nothing is modifying + * the percpu state, either addition or removal. Both of these are done under + * the CIL context lock, so grabbing that exclusively here will ensure we can + * safely drain the cilpcp for the CPU that is dying. */ -bool -xfs_log_item_in_current_chkpt( - struct xfs_log_item *lip) +void +xlog_cil_pcp_dead( + struct xlog *log, + unsigned int cpu) { - struct xfs_cil_ctx *ctx = lip->li_mountp->m_log->l_cilp->xc_ctx; - - if (list_empty(&lip->li_cil)) - return false; + struct xfs_cil *cil = log->l_cilp; + struct xlog_cil_pcp *cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); + struct xfs_cil_ctx *ctx; - /* - * li_seq is written on the first commit of a log item to record the - * first checkpoint it is written to. Hence if it is different to the - * current sequence, we're in a new checkpoint. - */ - return lip->li_seq == ctx->sequence; + down_write(&cil->xc_ctx_lock); + ctx = cil->xc_ctx; + if (ctx->ticket) + ctx->ticket->t_curr_res += cilpcp->space_reserved; + cilpcp->space_reserved = 0; + + if (!list_empty(&cilpcp->log_items)) + list_splice_init(&cilpcp->log_items, &ctx->log_items); + if (!list_empty(&cilpcp->busy_extents)) + list_splice_init(&cilpcp->busy_extents, &ctx->busy_extents); + atomic_add(cilpcp->space_used, &ctx->space_used); + cilpcp->space_used = 0; + up_write(&cil->xc_ctx_lock); } /* @@ -1462,10 +1827,12 @@ xfs_log_item_in_current_chkpt( */ int xlog_cil_init( - struct xlog *log) + struct xlog *log) { - struct xfs_cil *cil; - struct xfs_cil_ctx *ctx; + struct xfs_cil *cil; + struct xfs_cil_ctx *ctx; + struct xlog_cil_pcp *cilpcp; + int cpu; cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL); if (!cil) @@ -1480,22 +1847,31 @@ xlog_cil_init( if (!cil->xc_push_wq) goto out_destroy_cil; - INIT_LIST_HEAD(&cil->xc_cil); + cil->xc_log = log; + cil->xc_pcp = alloc_percpu(struct xlog_cil_pcp); + if (!cil->xc_pcp) + goto out_destroy_wq; + + for_each_possible_cpu(cpu) { + cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); + INIT_LIST_HEAD(&cilpcp->busy_extents); + INIT_LIST_HEAD(&cilpcp->log_items); + } + INIT_LIST_HEAD(&cil->xc_committing); - spin_lock_init(&cil->xc_cil_lock); spin_lock_init(&cil->xc_push_lock); init_waitqueue_head(&cil->xc_push_wait); init_rwsem(&cil->xc_ctx_lock); init_waitqueue_head(&cil->xc_start_wait); init_waitqueue_head(&cil->xc_commit_wait); - cil->xc_log = log; log->l_cilp = cil; ctx = xlog_cil_ctx_alloc(); xlog_cil_ctx_switch(cil, ctx); - return 0; +out_destroy_wq: + destroy_workqueue(cil->xc_push_wq); out_destroy_cil: kmem_free(cil); return -ENOMEM; @@ -1505,14 +1881,17 @@ void xlog_cil_destroy( struct xlog *log) { - if (log->l_cilp->xc_ctx) { - if (log->l_cilp->xc_ctx->ticket) - xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket); - kmem_free(log->l_cilp->xc_ctx); + struct xfs_cil *cil = log->l_cilp; + + if (cil->xc_ctx) { + if (cil->xc_ctx->ticket) + xfs_log_ticket_put(cil->xc_ctx->ticket); + kmem_free(cil->xc_ctx); } - ASSERT(list_empty(&log->l_cilp->xc_cil)); - destroy_workqueue(log->l_cilp->xc_push_wq); - kmem_free(log->l_cilp); + ASSERT(test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)); + free_percpu(cil->xc_pcp); + destroy_workqueue(cil->xc_push_wq); + kmem_free(cil); } diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 23103d68423c..1bd2963e8fbd 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -51,8 +51,8 @@ enum xlog_iclog_state { /* * In core log flags */ -#define XLOG_ICL_NEED_FLUSH (1 << 0) /* iclog needs REQ_PREFLUSH */ -#define XLOG_ICL_NEED_FUA (1 << 1) /* iclog needs REQ_FUA */ +#define XLOG_ICL_NEED_FLUSH (1u << 0) /* iclog needs REQ_PREFLUSH */ +#define XLOG_ICL_NEED_FUA (1u << 1) /* iclog needs REQ_FUA */ #define XLOG_ICL_STRINGS \ { XLOG_ICL_NEED_FLUSH, "XLOG_ICL_NEED_FLUSH" }, \ @@ -62,7 +62,7 @@ enum xlog_iclog_state { /* * Log ticket flags */ -#define XLOG_TIC_PERM_RESERV 0x1 /* permanent reservation */ +#define XLOG_TIC_PERM_RESERV (1u << 0) /* permanent reservation */ #define XLOG_TIC_FLAGS \ { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } @@ -142,37 +142,17 @@ enum xlog_iclog_state { #define XLOG_COVER_OPS 5 -/* Ticket reservation region accounting */ -#define XLOG_TIC_LEN_MAX 15 - -/* - * Reservation region - * As would be stored in xfs_log_iovec but without the i_addr which - * we don't care about. - */ -typedef struct xlog_res { - uint r_len; /* region length :4 */ - uint r_type; /* region's transaction type :4 */ -} xlog_res_t; - typedef struct xlog_ticket { - struct list_head t_queue; /* reserve/write queue */ - struct task_struct *t_task; /* task that owns this ticket */ - xlog_tid_t t_tid; /* transaction identifier : 4 */ - atomic_t t_ref; /* ticket reference count : 4 */ - int t_curr_res; /* current reservation in bytes : 4 */ - int t_unit_res; /* unit reservation in bytes : 4 */ - char t_ocnt; /* original count : 1 */ - char t_cnt; /* current count : 1 */ - char t_clientid; /* who does this belong to; : 1 */ - char t_flags; /* properties of reservation : 1 */ - - /* reservation array fields */ - uint t_res_num; /* num in array : 4 */ - uint t_res_num_ophdrs; /* num op hdrs : 4 */ - uint t_res_arr_sum; /* array sum : 4 */ - uint t_res_o_flow; /* sum overflow : 4 */ - xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */ + struct list_head t_queue; /* reserve/write queue */ + struct task_struct *t_task; /* task that owns this ticket */ + xlog_tid_t t_tid; /* transaction identifier */ + atomic_t t_ref; /* ticket reference count */ + int t_curr_res; /* current reservation */ + int t_unit_res; /* unit reservation */ + char t_ocnt; /* original unit count */ + char t_cnt; /* current unit count */ + uint8_t t_flags; /* properties of reservation */ + int t_iclog_hdrs; /* iclog hdrs in t_curr_res */ } xlog_ticket_t; /* @@ -211,7 +191,7 @@ typedef struct xlog_in_core { u32 ic_offset; enum xlog_iclog_state ic_state; unsigned int ic_flags; - char *ic_datap; /* pointer to iclog data */ + void *ic_datap; /* pointer to iclog data */ struct list_head ic_callbacks; /* reference counts need their own cacheline */ @@ -242,14 +222,25 @@ struct xfs_cil_ctx { xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ struct xlog_in_core *commit_iclog; struct xlog_ticket *ticket; /* chkpt ticket */ - int nvecs; /* number of regions */ - int space_used; /* aggregate size of regions */ + atomic_t space_used; /* aggregate size of regions */ struct list_head busy_extents; /* busy extents in chkpt */ - struct xfs_log_vec *lv_chain; /* logvecs being pushed */ + struct list_head log_items; /* log items in chkpt */ + struct list_head lv_chain; /* logvecs being pushed */ struct list_head iclog_entry; struct list_head committing; /* ctx committing list */ struct work_struct discard_endio_work; struct work_struct push_work; + atomic_t order_id; +}; + +/* + * Per-cpu CIL tracking items + */ +struct xlog_cil_pcp { + int32_t space_used; + uint32_t space_reserved; + struct list_head busy_extents; + struct list_head log_items; }; /* @@ -270,8 +261,8 @@ struct xfs_cil_ctx { */ struct xfs_cil { struct xlog *xc_log; - struct list_head xc_cil; - spinlock_t xc_cil_lock; + unsigned long xc_flags; + atomic_t xc_iclog_hdrs; struct workqueue_struct *xc_push_wq; struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp; @@ -285,8 +276,17 @@ struct xfs_cil { wait_queue_head_t xc_start_wait; xfs_csn_t xc_current_sequence; wait_queue_head_t xc_push_wait; /* background push throttle */ + + void __percpu *xc_pcp; /* percpu CIL structures */ +#ifdef CONFIG_HOTPLUG_CPU + struct list_head xc_pcp_list; +#endif } ____cacheline_aligned_in_smp; +/* xc_flags bit values */ +#define XLOG_CIL_EMPTY 1 +#define XLOG_CIL_PCP_SPACE 2 + /* * The amount of log space we allow the CIL to aggregate is difficult to size. * Whatever we choose, we have to make sure we can get a reservation for the @@ -441,10 +441,6 @@ struct xlog { struct xfs_kobj l_kobj; - /* The following field are used for debugging; need to hold icloglock */ -#ifdef DEBUG - void *l_iclog_bak[XLOG_MAX_ICLOGS]; -#endif /* log recovery lsn tracking (for buffer submission */ xfs_lsn_t l_recovery_lsn; @@ -454,9 +450,6 @@ struct xlog { struct rw_semaphore l_incompat_users; }; -#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ - ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) - /* * Bits for operational state */ @@ -484,6 +477,17 @@ xlog_is_shutdown(struct xlog *log) return test_bit(XLOG_IO_ERROR, &log->l_opstate); } +/* + * Wait until the xlog_force_shutdown() has marked the log as shut down + * so xlog_is_shutdown() will always return true. + */ +static inline void +xlog_shutdown_wait( + struct xlog *log) +{ + wait_var_event(&log->l_opstate, xlog_is_shutdown(log)); +} + /* common routines */ extern int xlog_recover( @@ -498,34 +502,21 @@ extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, char *dp, int size); extern struct kmem_cache *xfs_log_ticket_cache; -struct xlog_ticket * -xlog_ticket_alloc( - struct xlog *log, - int unit_bytes, - int count, - char client, - bool permanent); - -static inline void -xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) -{ - *ptr += bytes; - *len -= bytes; - *off += bytes; -} +struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes, + int count, bool permanent); void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); void xlog_print_trans(struct xfs_trans *); int xlog_write(struct xlog *log, struct xfs_cil_ctx *ctx, - struct xfs_log_vec *log_vector, struct xlog_ticket *tic, - uint optype); + struct list_head *lv_chain, struct xlog_ticket *tic, + uint32_t len); void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket); void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket); void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog, int eventual_size); int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog, - xfs_lsn_t log_tail_lsn); + struct xlog_ticket *ticket); /* * When we crack an atomic LSN, we sample it first so that the value will not @@ -680,4 +671,43 @@ xlog_valid_lsn( return valid; } +/* + * Log vector and shadow buffers can be large, so we need to use kvmalloc() here + * to ensure success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts + * to fall back to vmalloc, so we can't actually do anything useful with gfp + * flags to control the kmalloc() behaviour within kvmalloc(). Hence kmalloc() + * will do direct reclaim and compaction in the slow path, both of which are + * horrendously expensive. We just want kmalloc to fail fast and fall back to + * vmalloc if it can't get somethign straight away from the free lists or + * buddy allocator. Hence we have to open code kvmalloc outselves here. + * + * This assumes that the caller uses memalloc_nofs_save task context here, so + * despite the use of GFP_KERNEL here, we are going to be doing GFP_NOFS + * allocations. This is actually the only way to make vmalloc() do GFP_NOFS + * allocations, so lets just all pretend this is a GFP_KERNEL context + * operation.... + */ +static inline void * +xlog_kvmalloc( + size_t buf_size) +{ + gfp_t flags = GFP_KERNEL; + void *p; + + flags &= ~__GFP_DIRECT_RECLAIM; + flags |= __GFP_NOWARN | __GFP_NORETRY; + do { + p = kmalloc(buf_size, flags); + if (!p) + p = vmalloc(buf_size); + } while (!p); + + return p; +} + +/* + * CIL CPU dead notifier + */ +void xlog_cil_pcp_dead(struct xlog *log, unsigned int cpu); + #endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 53366cc0bc9e..322eb2ee6c55 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -27,7 +27,7 @@ #include "xfs_buf_item.h" #include "xfs_ag.h" #include "xfs_quota.h" - +#include "xfs_reflink.h" #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) @@ -39,13 +39,6 @@ STATIC int xlog_clear_stale_blocks( struct xlog *, xfs_lsn_t); -#if defined(DEBUG) -STATIC void -xlog_recover_check_summary( - struct xlog *); -#else -#define xlog_recover_check_summary(log) -#endif STATIC int xlog_do_recovery_pass( struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *); @@ -129,7 +122,7 @@ xlog_do_io( xfs_daddr_t blk_no, unsigned int nbblks, char *data, - unsigned int op) + enum req_op op) { int error; @@ -1800,6 +1793,8 @@ static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = { &xlog_cud_item_ops, &xlog_bui_item_ops, &xlog_bud_item_ops, + &xlog_attri_item_ops, + &xlog_attrd_item_ops, }; static const struct xlog_recover_item_ops * @@ -2485,7 +2480,7 @@ xlog_finish_defer_ops( error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres, dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp); if (error) { - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(mp->m_log, SHUTDOWN_LOG_IO_ERROR); return error; } @@ -2519,21 +2514,22 @@ xlog_abort_defer_ops( xfs_defer_ops_capture_free(mp, dfc); } } + /* * When this is called, all of the log intent items which did not have - * corresponding log done items should be in the AIL. What we do now - * is update the data structures associated with each one. + * corresponding log done items should be in the AIL. What we do now is update + * the data structures associated with each one. * - * Since we process the log intent items in normal transactions, they - * will be removed at some point after the commit. This prevents us - * from just walking down the list processing each one. We'll use a - * flag in the intent item to skip those that we've already processed - * and use the AIL iteration mechanism's generation count to try to - * speed this up at least a bit. + * Since we process the log intent items in normal transactions, they will be + * removed at some point after the commit. This prevents us from just walking + * down the list processing each one. We'll use a flag in the intent item to + * skip those that we've already processed and use the AIL iteration mechanism's + * generation count to try to speed this up at least a bit. * - * When we start, we know that the intents are the only things in the - * AIL. As we process them, however, other items are added to the - * AIL. + * When we start, we know that the intents are the only things in the AIL. As we + * process them, however, other items are added to the AIL. Hence we know we + * have started recovery on all the pending intents when we find an non-intent + * item in the AIL. */ STATIC int xlog_recover_process_intents( @@ -2556,17 +2552,10 @@ xlog_recover_process_intents( for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL; lip = xfs_trans_ail_cursor_next(ailp, &cur)) { - /* - * We're done when we see something other than an intent. - * There should be no intents left in the AIL now. - */ - if (!xlog_item_is_intent(lip)) { -#ifdef DEBUG - for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) - ASSERT(!xlog_item_is_intent(lip)); -#endif + const struct xfs_item_ops *ops; + + if (!xlog_item_is_intent(lip)) break; - } /* * We should never see a redo item with a LSN higher than @@ -2580,13 +2569,17 @@ xlog_recover_process_intents( * deferred ops, you /must/ attach them to the capture list in * the recover routine or else those subsequent intents will be * replayed in the wrong order! + * + * The recovery function can free the log item, so we must not + * access lip after it returns. */ spin_unlock(&ailp->ail_lock); - error = lip->li_ops->iop_recover(lip, &capture_list); + ops = lip->li_ops; + error = ops->iop_recover(lip, &capture_list); spin_lock(&ailp->ail_lock); if (error) { trace_xlog_intent_recovery_failed(log->l_mp, error, - lip->li_ops->iop_recover); + ops->iop_recover); break; } } @@ -2607,8 +2600,9 @@ err: } /* - * A cancel occurs when the mount has failed and we're bailing out. - * Release all pending log intent items so they don't pin the AIL. + * A cancel occurs when the mount has failed and we're bailing out. Release all + * pending log intent items that we haven't started recovery on so they don't + * pin the AIL. */ STATIC void xlog_recover_cancel_intents( @@ -2622,17 +2616,8 @@ xlog_recover_cancel_intents( spin_lock(&ailp->ail_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); while (lip != NULL) { - /* - * We're done when we see something other than an intent. - * There should be no intents left in the AIL now. - */ - if (!xlog_item_is_intent(lip)) { -#ifdef DEBUG - for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) - ASSERT(!xlog_item_is_intent(lip)); -#endif + if (!xlog_item_is_intent(lip)) break; - } spin_unlock(&ailp->ail_lock); lip->li_ops->iop_release(lip); @@ -2650,21 +2635,21 @@ xlog_recover_cancel_intents( */ STATIC void xlog_recover_clear_agi_bucket( - xfs_mount_t *mp, - xfs_agnumber_t agno, - int bucket) + struct xfs_perag *pag, + int bucket) { - xfs_trans_t *tp; - xfs_agi_t *agi; - struct xfs_buf *agibp; - int offset; - int error; + struct xfs_mount *mp = pag->pag_mount; + struct xfs_trans *tp; + struct xfs_agi *agi; + struct xfs_buf *agibp; + int offset; + int error; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp); if (error) goto out_error; - error = xfs_read_agi(mp, tp, agno, &agibp); + error = xfs_read_agi(pag, tp, &agibp); if (error) goto out_abort; @@ -2683,60 +2668,62 @@ xlog_recover_clear_agi_bucket( out_abort: xfs_trans_cancel(tp); out_error: - xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno); + xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, + pag->pag_agno); return; } -STATIC xfs_agino_t -xlog_recover_process_one_iunlink( - struct xfs_mount *mp, - xfs_agnumber_t agno, - xfs_agino_t agino, - int bucket) +static int +xlog_recover_iunlink_bucket( + struct xfs_perag *pag, + struct xfs_agi *agi, + int bucket) { - struct xfs_buf *ibp; - struct xfs_dinode *dip; - struct xfs_inode *ip; - xfs_ino_t ino; - int error; - - ino = XFS_AGINO_TO_INO(mp, agno, agino); - error = xfs_iget(mp, NULL, ino, 0, 0, &ip); - if (error) - goto fail; + struct xfs_mount *mp = pag->pag_mount; + struct xfs_inode *prev_ip = NULL; + struct xfs_inode *ip; + xfs_agino_t prev_agino, agino; + int error = 0; - /* - * Get the on disk inode to find the next inode in the bucket. - */ - error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &ibp); - if (error) - goto fail_iput; - dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset); + agino = be32_to_cpu(agi->agi_unlinked[bucket]); + while (agino != NULLAGINO) { + error = xfs_iget(mp, NULL, + XFS_AGINO_TO_INO(mp, pag->pag_agno, agino), + 0, 0, &ip); + if (error) + break; - xfs_iflags_clear(ip, XFS_IRECOVERY); - ASSERT(VFS_I(ip)->i_nlink == 0); - ASSERT(VFS_I(ip)->i_mode != 0); + ASSERT(VFS_I(ip)->i_nlink == 0); + ASSERT(VFS_I(ip)->i_mode != 0); + xfs_iflags_clear(ip, XFS_IRECOVERY); + agino = ip->i_next_unlinked; - /* setup for the next pass */ - agino = be32_to_cpu(dip->di_next_unlinked); - xfs_buf_relse(ibp); + if (prev_ip) { + ip->i_prev_unlinked = prev_agino; + xfs_irele(prev_ip); - xfs_irele(ip); - return agino; + /* + * Ensure the inode is removed from the unlinked list + * before we continue so that it won't race with + * building the in-memory list here. This could be + * serialised with the agibp lock, but that just + * serialises via lockstepping and it's much simpler + * just to flush the inodegc queue and wait for it to + * complete. + */ + xfs_inodegc_flush(mp); + } - fail_iput: - xfs_irele(ip); - fail: - /* - * We can't read in the inode this bucket points to, or this inode - * is messed up. Just ditch this bucket of inodes. We will lose - * some inodes and space, but at least we won't hang. - * - * Call xlog_recover_clear_agi_bucket() to perform a transaction to - * clear the inode pointer in the bucket. - */ - xlog_recover_clear_agi_bucket(mp, agno, bucket); - return NULLAGINO; + prev_agino = agino; + prev_ip = ip; + } + + if (prev_ip) { + ip->i_prev_unlinked = prev_agino; + xfs_irele(prev_ip); + } + xfs_inodegc_flush(mp); + return error; } /* @@ -2762,59 +2749,70 @@ xlog_recover_process_one_iunlink( * scheduled on this CPU to ensure other scheduled work can run without undue * latency. */ -STATIC void -xlog_recover_process_iunlinks( - struct xlog *log) +static void +xlog_recover_iunlink_ag( + struct xfs_perag *pag) { - struct xfs_mount *mp = log->l_mp; - struct xfs_perag *pag; - xfs_agnumber_t agno; struct xfs_agi *agi; struct xfs_buf *agibp; - xfs_agino_t agino; int bucket; int error; - for_each_perag(mp, agno, pag) { - error = xfs_read_agi(mp, NULL, pag->pag_agno, &agibp); + error = xfs_read_agi(pag, NULL, &agibp); + if (error) { + /* + * AGI is b0rked. Don't process it. + * + * We should probably mark the filesystem as corrupt after we've + * recovered all the ag's we can.... + */ + return; + } + + /* + * Unlock the buffer so that it can be acquired in the normal course of + * the transaction to truncate and free each inode. Because we are not + * racing with anyone else here for the AGI buffer, we don't even need + * to hold it locked to read the initial unlinked bucket entries out of + * the buffer. We keep buffer reference though, so that it stays pinned + * in memory while we need the buffer. + */ + agi = agibp->b_addr; + xfs_buf_unlock(agibp); + + for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { + error = xlog_recover_iunlink_bucket(pag, agi, bucket); if (error) { /* - * AGI is b0rked. Don't process it. - * - * We should probably mark the filesystem as corrupt - * after we've recovered all the ag's we can.... + * Bucket is unrecoverable, so only a repair scan can + * free the remaining unlinked inodes. Just empty the + * bucket and remaining inodes on it unreferenced and + * unfreeable. */ - continue; - } - /* - * Unlock the buffer so that it can be acquired in the normal - * course of the transaction to truncate and free each inode. - * Because we are not racing with anyone else here for the AGI - * buffer, we don't even need to hold it locked to read the - * initial unlinked bucket entries out of the buffer. We keep - * buffer reference though, so that it stays pinned in memory - * while we need the buffer. - */ - agi = agibp->b_addr; - xfs_buf_unlock(agibp); - - for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { - agino = be32_to_cpu(agi->agi_unlinked[bucket]); - while (agino != NULLAGINO) { - agino = xlog_recover_process_one_iunlink(mp, - pag->pag_agno, agino, bucket); - cond_resched(); - } + xfs_inodegc_flush(pag->pag_mount); + xlog_recover_clear_agi_bucket(pag, bucket); } - xfs_buf_rele(agibp); } + xfs_buf_rele(agibp); +} + +static void +xlog_recover_process_iunlinks( + struct xlog *log) +{ + struct xfs_perag *pag; + xfs_agnumber_t agno; + + for_each_perag(log->l_mp, agno, pag) + xlog_recover_iunlink_ag(pag); + /* * Flush the pending unlinked inodes to ensure that the inactivations * are fully completed on disk and the incore inodes can be reclaimed * before we signal that recovery is complete. */ - xfs_inodegc_flush(mp); + xfs_inodegc_flush(log->l_mp); } STATIC void @@ -3244,7 +3242,7 @@ xlog_do_log_recovery( xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { - int error, i; + int error; ASSERT(head_blk != tail_blk); @@ -3252,37 +3250,25 @@ xlog_do_log_recovery( * First do a pass to find all of the cancelled buf log items. * Store them in the buf_cancel_table for use in the second pass. */ - log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * - sizeof(struct list_head), - 0); - for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) - INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); + error = xlog_alloc_buf_cancel_table(log); + if (error) + return error; error = xlog_do_recovery_pass(log, head_blk, tail_blk, XLOG_RECOVER_PASS1, NULL); - if (error != 0) { - kmem_free(log->l_buf_cancel_table); - log->l_buf_cancel_table = NULL; - return error; - } + if (error != 0) + goto out_cancel; + /* * Then do a second pass to actually recover the items in the log. * When it is complete free the table of buf cancel items. */ error = xlog_do_recovery_pass(log, head_blk, tail_blk, XLOG_RECOVER_PASS2, NULL); -#ifdef DEBUG - if (!error) { - int i; - - for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) - ASSERT(list_empty(&log->l_buf_cancel_table[i])); - } -#endif /* DEBUG */ - - kmem_free(log->l_buf_cancel_table); - log->l_buf_cancel_table = NULL; - + if (!error) + xlog_check_buf_cancel_table(log); +out_cancel: + xlog_free_buf_cancel_table(log); return error; } @@ -3346,15 +3332,14 @@ xlog_do_recover( /* re-initialise in-core superblock and geometry structures */ mp->m_features |= xfs_sb_version_to_features(sbp); xfs_reinit_percpu_counters(mp); - error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); + error = xfs_initialize_perag(mp, sbp->sb_agcount, sbp->sb_dblocks, + &mp->m_maxagi); if (error) { xfs_warn(mp, "Failed post-recovery per-ag init: %d", error); return error; } mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); - xlog_recover_check_summary(log); - /* Normal transactions can now occur */ clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); return 0; @@ -3470,7 +3455,7 @@ xlog_recover_finish( */ xlog_recover_cancel_intents(log); xfs_alert(log->l_mp, "Failed to recover intents"); - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); return error; } @@ -3497,7 +3482,28 @@ xlog_recover_finish( } xlog_recover_process_iunlinks(log); - xlog_recover_check_summary(log); + + /* + * Recover any CoW staging blocks that are still referenced by the + * ondisk refcount metadata. During mount there cannot be any live + * staging extents as we have not permitted any user modifications. + * Therefore, it is safe to free them all right now, even on a + * read-only mount. + */ + error = xfs_reflink_recover_cow(log->l_mp); + if (error) { + xfs_alert(log->l_mp, + "Failed to recover leftover CoW staging extents, err %d.", + error); + /* + * If we get an error here, make sure the log is shut down + * but return zero so that any log items committed since the + * end of intents processing can be pushed through the CIL + * and AIL. + */ + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); + } + return 0; } @@ -3509,54 +3515,3 @@ xlog_recover_cancel( xlog_recover_cancel_intents(log); } -#if defined(DEBUG) -/* - * Read all of the agf and agi counters and check that they - * are consistent with the superblock counters. - */ -STATIC void -xlog_recover_check_summary( - struct xlog *log) -{ - struct xfs_mount *mp = log->l_mp; - struct xfs_perag *pag; - struct xfs_buf *agfbp; - struct xfs_buf *agibp; - xfs_agnumber_t agno; - uint64_t freeblks; - uint64_t itotal; - uint64_t ifree; - int error; - - mp = log->l_mp; - - freeblks = 0LL; - itotal = 0LL; - ifree = 0LL; - for_each_perag(mp, agno, pag) { - error = xfs_read_agf(mp, NULL, pag->pag_agno, 0, &agfbp); - if (error) { - xfs_alert(mp, "%s agf read failed agno %d error %d", - __func__, pag->pag_agno, error); - } else { - struct xfs_agf *agfp = agfbp->b_addr; - - freeblks += be32_to_cpu(agfp->agf_freeblks) + - be32_to_cpu(agfp->agf_flcount); - xfs_buf_relse(agfbp); - } - - error = xfs_read_agi(mp, NULL, pag->pag_agno, &agibp); - if (error) { - xfs_alert(mp, "%s agi read failed agno %d error %d", - __func__, pag->pag_agno, error); - } else { - struct xfs_agi *agi = agibp->b_addr; - - itotal += be32_to_cpu(agi->agi_count); - ifree += be32_to_cpu(agi->agi_freecount); - xfs_buf_relse(agibp); - } - } -} -#endif /* DEBUG */ diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index bc66d95c8d4c..8f495cc23903 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -27,42 +27,34 @@ __xfs_printk( printk("%sXFS: %pV\n", level, vaf); } -#define define_xfs_printk_level(func, kern_level) \ -void func(const struct xfs_mount *mp, const char *fmt, ...) \ -{ \ - struct va_format vaf; \ - va_list args; \ - int level; \ - \ - va_start(args, fmt); \ - \ - vaf.fmt = fmt; \ - vaf.va = &args; \ - \ - __xfs_printk(kern_level, mp, &vaf); \ - va_end(args); \ - \ - if (!kstrtoint(kern_level, 0, &level) && \ - level <= LOGLEVEL_ERR && \ - xfs_error_level >= XFS_ERRLEVEL_HIGH) \ - xfs_stack_trace(); \ -} \ - -define_xfs_printk_level(xfs_emerg, KERN_EMERG); -define_xfs_printk_level(xfs_alert, KERN_ALERT); -define_xfs_printk_level(xfs_crit, KERN_CRIT); -define_xfs_printk_level(xfs_err, KERN_ERR); -define_xfs_printk_level(xfs_warn, KERN_WARNING); -define_xfs_printk_level(xfs_notice, KERN_NOTICE); -define_xfs_printk_level(xfs_info, KERN_INFO); -#ifdef DEBUG -define_xfs_printk_level(xfs_debug, KERN_DEBUG); -#endif +void +xfs_printk_level( + const char *kern_level, + const struct xfs_mount *mp, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + int level; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + __xfs_printk(kern_level, mp, &vaf); + + va_end(args); + + if (!kstrtoint(kern_level, 0, &level) && + level <= LOGLEVEL_ERR && + xfs_error_level >= XFS_ERRLEVEL_HIGH) + xfs_stack_trace(); +} void -xfs_alert_tag( +_xfs_alert_tag( const struct xfs_mount *mp, - int panic_tag, + uint32_t panic_tag, const char *fmt, ...) { struct va_format vaf; diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index bb9860ec9a93..cc323775a12c 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -6,33 +6,46 @@ struct xfs_mount; -extern __printf(2, 3) -void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...); -extern __printf(2, 3) -void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...); extern __printf(3, 4) -void xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...); -extern __printf(2, 3) -void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...); -extern __printf(2, 3) -void xfs_err(const struct xfs_mount *mp, const char *fmt, ...); -extern __printf(2, 3) -void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...); -extern __printf(2, 3) -void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...); -extern __printf(2, 3) -void xfs_info(const struct xfs_mount *mp, const char *fmt, ...); +void xfs_printk_level(const char *kern_level, const struct xfs_mount *mp, + const char *fmt, ...); +#define xfs_printk_index_wrap(kern_level, mp, fmt, ...) \ +({ \ + printk_index_subsys_emit("%sXFS%s: ", kern_level, fmt); \ + xfs_printk_level(kern_level, mp, fmt, ##__VA_ARGS__); \ +}) +#define xfs_emerg(mp, fmt, ...) \ + xfs_printk_index_wrap(KERN_EMERG, mp, fmt, ##__VA_ARGS__) +#define xfs_alert(mp, fmt, ...) \ + xfs_printk_index_wrap(KERN_ALERT, mp, fmt, ##__VA_ARGS__) +#define xfs_crit(mp, fmt, ...) \ + xfs_printk_index_wrap(KERN_CRIT, mp, fmt, ##__VA_ARGS__) +#define xfs_err(mp, fmt, ...) \ + xfs_printk_index_wrap(KERN_ERR, mp, fmt, ##__VA_ARGS__) +#define xfs_warn(mp, fmt, ...) \ + xfs_printk_index_wrap(KERN_WARNING, mp, fmt, ##__VA_ARGS__) +#define xfs_notice(mp, fmt, ...) \ + xfs_printk_index_wrap(KERN_NOTICE, mp, fmt, ##__VA_ARGS__) +#define xfs_info(mp, fmt, ...) \ + xfs_printk_index_wrap(KERN_INFO, mp, fmt, ##__VA_ARGS__) #ifdef DEBUG -extern __printf(2, 3) -void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...); +#define xfs_debug(mp, fmt, ...) \ + xfs_printk_index_wrap(KERN_DEBUG, mp, fmt, ##__VA_ARGS__) #else -static inline __printf(2, 3) -void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) -{ -} +#define xfs_debug(mp, fmt, ...) do {} while (0) #endif +#define xfs_alert_tag(mp, tag, fmt, ...) \ +({ \ + printk_index_subsys_emit("%sXFS%s: ", KERN_ALERT, fmt); \ + _xfs_alert_tag(mp, tag, fmt, ##__VA_ARGS__); \ +}) + +extern __printf(3, 4) +void _xfs_alert_tag(const struct xfs_mount *mp, uint32_t tag, + const char *fmt, ...); + #define xfs_printk_ratelimited(func, dev, fmt, ...) \ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ @@ -62,6 +75,12 @@ do { \ #define xfs_debug_ratelimited(dev, fmt, ...) \ xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__) +#define xfs_warn_mount(mp, warntag, fmt, ...) \ +do { \ + if (xfs_should_warn((mp), (warntag))) \ + xfs_warn((mp), (fmt), ##__VA_ARGS__); \ +} while (0) + #define xfs_warn_once(dev, fmt, ...) \ xfs_printk_once(xfs_warn, dev, fmt, ##__VA_ARGS__) #define xfs_notice_once(dev, fmt, ...) \ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 359109b6f0d3..e8bb3c2e847e 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -21,6 +21,7 @@ #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_fsops.h" @@ -299,26 +300,28 @@ xfs_validate_new_dalign( "alignment check failed: sunit/swidth vs. blocksize(%d)", mp->m_sb.sb_blocksize); return -EINVAL; - } else { - /* - * Convert the stripe unit and width to FSBs. - */ - mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); - if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) { - xfs_warn(mp, - "alignment check failed: sunit/swidth vs. agsize(%d)", - mp->m_sb.sb_agblocks); - return -EINVAL; - } else if (mp->m_dalign) { - mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); - } else { - xfs_warn(mp, - "alignment check failed: sunit(%d) less than bsize(%d)", - mp->m_dalign, mp->m_sb.sb_blocksize); - return -EINVAL; - } } + /* + * Convert the stripe unit and width to FSBs. + */ + mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); + if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) { + xfs_warn(mp, + "alignment check failed: sunit/swidth vs. agsize(%d)", + mp->m_sb.sb_agblocks); + return -EINVAL; + } + + if (!mp->m_dalign) { + xfs_warn(mp, + "alignment check failed: sunit(%d) less than bsize(%d)", + mp->m_dalign, mp->m_sb.sb_blocksize); + return -EINVAL; + } + + mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); + if (!xfs_has_dalign(mp)) { xfs_warn(mp, "cannot change alignment: superblock does not support data alignment"); @@ -467,6 +470,8 @@ STATIC int xfs_check_summary_counts( struct xfs_mount *mp) { + int error = 0; + /* * The AG0 superblock verifier rejects in-progress filesystems, * so we should never see the flag set this far into mounting. @@ -505,11 +510,32 @@ xfs_check_summary_counts( * superblock to be correct and we don't need to do anything here. * Otherwise, recalculate the summary counters. */ - if ((!xfs_has_lazysbcount(mp) || xfs_is_clean(mp)) && - !xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) - return 0; + if ((xfs_has_lazysbcount(mp) && !xfs_is_clean(mp)) || + xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) { + error = xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount); + if (error) + return error; + } + + /* + * Older kernels misused sb_frextents to reflect both incore + * reservations made by running transactions and the actual count of + * free rt extents in the ondisk metadata. Transactions committed + * during runtime can therefore contain a superblock update that + * undercounts the number of free rt extents tracked in the rt bitmap. + * A clean unmount record will have the correct frextents value since + * there can be no other transactions running at that point. + * + * If we're mounting the rt volume after recovering the log, recompute + * frextents from the rtbitmap file to fix the inconsistency. + */ + if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) { + error = xfs_rtalloc_reinit_frextents(mp); + if (error) + return error; + } - return xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount); + return 0; } /* @@ -754,7 +780,8 @@ xfs_mountfs( /* * Allocate and initialize the per-ag data. */ - error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); + error = xfs_initialize_perag(mp, sbp->sb_agcount, mp->m_sb.sb_dblocks, + &mp->m_maxagi); if (error) { xfs_warn(mp, "Failed per-ag init: %d", error); goto out_free_dir; @@ -783,11 +810,6 @@ xfs_mountfs( goto out_inodegc_shrinker; } - /* Make sure the summary counts are ok. */ - error = xfs_check_summary_counts(mp); - if (error) - goto out_log_dealloc; - /* Enable background inode inactivation workers. */ xfs_inodegc_start(mp); xfs_blockgc_start(mp); @@ -843,6 +865,11 @@ xfs_mountfs( goto out_rele_rip; } + /* Make sure the summary counts are ok. */ + error = xfs_check_summary_counts(mp); + if (error) + goto out_rtunmount; + /* * If this is a read-only mount defer the superblock updates until * the next remount into writeable mode. Otherwise we would never @@ -936,15 +963,6 @@ xfs_mountfs( xfs_warn(mp, "Unable to allocate reserve blocks. Continuing without reserve pool."); - /* Recover any CoW blocks that never got remapped. */ - error = xfs_reflink_recover_cow(mp); - if (error) { - xfs_err(mp, - "Error %d recovering leftover CoW allocations.", error); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - goto out_quota; - } - /* Reserve AG blocks for future btree expansion. */ error = xfs_fs_reserve_ag_blocks(mp); if (error && error != -ENOSPC) @@ -955,7 +973,6 @@ xfs_mountfs( out_agresv: xfs_fs_unreserve_ag_blocks(mp); - out_quota: xfs_qm_unmount_quotas(mp); out_rtunmount: xfs_rtunmount_inodes(mp); @@ -1096,24 +1113,33 @@ xfs_fs_writable( return true; } +/* Adjust m_fdblocks or m_frextents. */ int -xfs_mod_fdblocks( +xfs_mod_freecounter( struct xfs_mount *mp, + struct percpu_counter *counter, int64_t delta, bool rsvd) { int64_t lcounter; long long res_used; + uint64_t set_aside = 0; s32 batch; - uint64_t set_aside; + bool has_resv_pool; + + ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents); + has_resv_pool = (counter == &mp->m_fdblocks); + if (rsvd) + ASSERT(has_resv_pool); if (delta > 0) { /* * If the reserve pool is depleted, put blocks back into it * first. Most of the time the pool is full. */ - if (likely(mp->m_resblks == mp->m_resblks_avail)) { - percpu_counter_add(&mp->m_fdblocks, delta); + if (likely(!has_resv_pool || + mp->m_resblks == mp->m_resblks_avail)) { + percpu_counter_add(counter, delta); return 0; } @@ -1125,7 +1151,7 @@ xfs_mod_fdblocks( } else { delta -= res_used; mp->m_resblks_avail = mp->m_resblks; - percpu_counter_add(&mp->m_fdblocks, delta); + percpu_counter_add(counter, delta); } spin_unlock(&mp->m_sb_lock); return 0; @@ -1139,7 +1165,7 @@ xfs_mod_fdblocks( * then make everything serialise as we are real close to * ENOSPC. */ - if (__percpu_counter_compare(&mp->m_fdblocks, 2 * XFS_FDBLOCKS_BATCH, + if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH, XFS_FDBLOCKS_BATCH) < 0) batch = 1; else @@ -1156,9 +1182,10 @@ xfs_mod_fdblocks( * problems (i.e. transaction abort, pagecache discards, etc.) than * slightly premature -ENOSPC. */ - set_aside = mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); - percpu_counter_add_batch(&mp->m_fdblocks, delta, batch); - if (__percpu_counter_compare(&mp->m_fdblocks, set_aside, + if (has_resv_pool) + set_aside = xfs_fdblocks_unavailable(mp); + percpu_counter_add_batch(counter, delta, batch); + if (__percpu_counter_compare(counter, set_aside, XFS_FDBLOCKS_BATCH) >= 0) { /* we had space! */ return 0; @@ -1169,8 +1196,8 @@ xfs_mod_fdblocks( * that took us to ENOSPC. */ spin_lock(&mp->m_sb_lock); - percpu_counter_add(&mp->m_fdblocks, -delta); - if (!rsvd) + percpu_counter_add(counter, -delta); + if (!has_resv_pool || !rsvd) goto fdblocks_enospc; lcounter = (long long)mp->m_resblks_avail + delta; @@ -1187,24 +1214,6 @@ fdblocks_enospc: return -ENOSPC; } -int -xfs_mod_frextents( - struct xfs_mount *mp, - int64_t delta) -{ - int64_t lcounter; - int ret = 0; - - spin_lock(&mp->m_sb_lock); - lcounter = mp->m_sb.sb_frextents + delta; - if (lcounter < 0) - ret = -ENOSPC; - else - mp->m_sb.sb_frextents = lcounter; - spin_unlock(&mp->m_sb_lock); - return ret; -} - /* * Used to free the superblock along various error paths. */ @@ -1350,7 +1359,6 @@ xfs_clear_incompat_log_features( if (xfs_sb_has_incompat_log_feature(&mp->m_sb, XFS_SB_FEAT_INCOMPAT_LOG_ALL)) { - xfs_info(mp, "Clearing log incompat feature flags."); xfs_sb_remove_incompat_log_features(&mp->m_sb); ret = true; } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 00720a02e761..8aca2cc173ac 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -61,7 +61,7 @@ struct xfs_error_cfg { */ struct xfs_inodegc { struct llist_head list; - struct work_struct work; + struct delayed_work work; /* approximate count of inodes in the list */ unsigned int items; @@ -183,6 +183,8 @@ typedef struct xfs_mount { struct percpu_counter m_icount; /* allocated inodes counter */ struct percpu_counter m_ifree; /* free inodes counter */ struct percpu_counter m_fdblocks; /* free block counter */ + struct percpu_counter m_frextents; /* free rt extent counter */ + /* * Count of data device blocks reserved for delayed allocations, * including indlen blocks. Does not include allocated CoW staging @@ -276,6 +278,7 @@ typedef struct xfs_mount { #define XFS_FEAT_INOBTCNT (1ULL << 23) /* inobt block counts */ #define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */ #define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */ +#define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */ /* Mount features */ #define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ @@ -338,6 +341,7 @@ __XFS_HAS_FEAT(realtime, REALTIME) __XFS_HAS_FEAT(inobtcounts, INOBTCNT) __XFS_HAS_FEAT(bigtime, BIGTIME) __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR) +__XFS_HAS_FEAT(large_extent_counts, NREXT64) /* * Mount features @@ -387,6 +391,13 @@ __XFS_HAS_FEAT(nouuid, NOUUID) */ #define XFS_OPSTATE_BLOCKGC_ENABLED 6 +/* Kernel has logged a warning about online fsck being used on this fs. */ +#define XFS_OPSTATE_WARNED_SCRUB 7 +/* Kernel has logged a warning about shrink being used on this fs. */ +#define XFS_OPSTATE_WARNED_SHRINK 8 +/* Kernel has logged a warning about logged xattr updates being used. */ +#define XFS_OPSTATE_WARNED_LARP 9 + #define __XFS_IS_OPSTATE(name, NAME) \ static inline bool xfs_is_ ## name (struct xfs_mount *mp) \ { \ @@ -409,6 +420,12 @@ __XFS_IS_OPSTATE(readonly, READONLY) __XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED) __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED) +static inline bool +xfs_should_warn(struct xfs_mount *mp, long nr) +{ + return !test_and_set_bit(nr, &mp->m_opstate); +} + #define XFS_OPSTATE_STRINGS \ { (1UL << XFS_OPSTATE_UNMOUNTING), "unmounting" }, \ { (1UL << XFS_OPSTATE_CLEAN), "clean" }, \ @@ -416,7 +433,10 @@ __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED) { (1UL << XFS_OPSTATE_INODE32), "inode32" }, \ { (1UL << XFS_OPSTATE_READONLY), "read_only" }, \ { (1UL << XFS_OPSTATE_INODEGC_ENABLED), "inodegc" }, \ - { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" } + { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \ + { (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \ + { (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \ + { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" } /* * Max and min values for mount-option defined I/O @@ -425,16 +445,16 @@ __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED) #define XFS_MAX_IO_LOG 30 /* 1G */ #define XFS_MIN_IO_LOG PAGE_SHIFT -#define xfs_is_shutdown(mp) xfs_is_shutdown(mp) -void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, +void xfs_do_force_shutdown(struct xfs_mount *mp, uint32_t flags, char *fname, int lnnum); #define xfs_force_shutdown(m,f) \ xfs_do_force_shutdown(m, f, __FILE__, __LINE__) -#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */ -#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */ -#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */ -#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */ +#define SHUTDOWN_META_IO_ERROR (1u << 0) /* write attempt to metadata failed */ +#define SHUTDOWN_LOG_IO_ERROR (1u << 1) /* write attempt to the log failed */ +#define SHUTDOWN_FORCE_UMOUNT (1u << 2) /* shutdown from a forced unmount */ +#define SHUTDOWN_CORRUPT_INCORE (1u << 3) /* corrupt in-memory structures */ +#define SHUTDOWN_CORRUPT_ONDISK (1u << 4) /* corrupt metadata on device */ #define XFS_SHUTDOWN_STRINGS \ { SHUTDOWN_META_IO_ERROR, "metadata_io" }, \ @@ -479,9 +499,35 @@ extern void xfs_unmountfs(xfs_mount_t *); */ #define XFS_FDBLOCKS_BATCH 1024 -extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, - bool reserved); -extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); +/* + * Estimate the amount of free space that is not available to userspace and is + * not explicitly reserved from the incore fdblocks. This includes: + * + * - The minimum number of blocks needed to support splitting a bmap btree + * - The blocks currently in use by the freespace btrees because they record + * the actual blocks that will fill per-AG metadata space reservations + */ +static inline uint64_t +xfs_fdblocks_unavailable( + struct xfs_mount *mp) +{ + return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); +} + +int xfs_mod_freecounter(struct xfs_mount *mp, struct percpu_counter *counter, + int64_t delta, bool rsvd); + +static inline int +xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved) +{ + return xfs_mod_freecounter(mp, &mp->m_fdblocks, delta, reserved); +} + +static inline int +xfs_mod_frextents(struct xfs_mount *mp, int64_t delta) +{ + return xfs_mod_freecounter(mp, &mp->m_frextents, delta, false); +} extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c new file mode 100644 index 000000000000..c4078d0ec108 --- /dev/null +++ b/fs/xfs/xfs_notify_failure.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022 Fujitsu. All Rights Reserved. + */ + +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_alloc.h" +#include "xfs_bit.h" +#include "xfs_btree.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_rtalloc.h" +#include "xfs_trans.h" +#include "xfs_ag.h" + +#include <linux/mm.h> +#include <linux/dax.h> + +struct xfs_failure_info { + xfs_agblock_t startblock; + xfs_extlen_t blockcount; + int mf_flags; + bool want_shutdown; +}; + +static pgoff_t +xfs_failure_pgoff( + struct xfs_mount *mp, + const struct xfs_rmap_irec *rec, + const struct xfs_failure_info *notify) +{ + loff_t pos = XFS_FSB_TO_B(mp, rec->rm_offset); + + if (notify->startblock > rec->rm_startblock) + pos += XFS_FSB_TO_B(mp, + notify->startblock - rec->rm_startblock); + return pos >> PAGE_SHIFT; +} + +static unsigned long +xfs_failure_pgcnt( + struct xfs_mount *mp, + const struct xfs_rmap_irec *rec, + const struct xfs_failure_info *notify) +{ + xfs_agblock_t end_rec; + xfs_agblock_t end_notify; + xfs_agblock_t start_cross; + xfs_agblock_t end_cross; + + start_cross = max(rec->rm_startblock, notify->startblock); + + end_rec = rec->rm_startblock + rec->rm_blockcount; + end_notify = notify->startblock + notify->blockcount; + end_cross = min(end_rec, end_notify); + + return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT; +} + +static int +xfs_dax_failure_fn( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *data) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_inode *ip; + struct xfs_failure_info *notify = data; + int error = 0; + + if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || + (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) { + notify->want_shutdown = true; + return 0; + } + + /* Get files that incore, filter out others that are not in use. */ + error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE, + 0, &ip); + /* Continue the rmap query if the inode isn't incore */ + if (error == -ENODATA) + return 0; + if (error) { + notify->want_shutdown = true; + return 0; + } + + error = mf_dax_kill_procs(VFS_I(ip)->i_mapping, + xfs_failure_pgoff(mp, rec, notify), + xfs_failure_pgcnt(mp, rec, notify), + notify->mf_flags); + xfs_irele(ip); + return error; +} + +static int +xfs_dax_notify_ddev_failure( + struct xfs_mount *mp, + xfs_daddr_t daddr, + xfs_daddr_t bblen, + int mf_flags) +{ + struct xfs_failure_info notify = { .mf_flags = mf_flags }; + struct xfs_trans *tp = NULL; + struct xfs_btree_cur *cur = NULL; + struct xfs_buf *agf_bp = NULL; + int error = 0; + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr); + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno); + xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, daddr + bblen); + xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno); + + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + return error; + + for (; agno <= end_agno; agno++) { + struct xfs_rmap_irec ri_low = { }; + struct xfs_rmap_irec ri_high; + struct xfs_agf *agf; + xfs_agblock_t agend; + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, agno); + error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp); + if (error) { + xfs_perag_put(pag); + break; + } + + cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag); + + /* + * Set the rmap range from ri_low to ri_high, which represents + * a [start, end] where we looking for the files or metadata. + */ + memset(&ri_high, 0xFF, sizeof(ri_high)); + ri_low.rm_startblock = XFS_FSB_TO_AGBNO(mp, fsbno); + if (agno == end_agno) + ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno); + + agf = agf_bp->b_addr; + agend = min(be32_to_cpu(agf->agf_length), + ri_high.rm_startblock); + notify.startblock = ri_low.rm_startblock; + notify.blockcount = agend - ri_low.rm_startblock; + + error = xfs_rmap_query_range(cur, &ri_low, &ri_high, + xfs_dax_failure_fn, ¬ify); + xfs_btree_del_cursor(cur, error); + xfs_trans_brelse(tp, agf_bp); + xfs_perag_put(pag); + if (error) + break; + + fsbno = XFS_AGB_TO_FSB(mp, agno + 1, 0); + } + + xfs_trans_cancel(tp); + if (error || notify.want_shutdown) { + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); + if (!error) + error = -EFSCORRUPTED; + } + return error; +} + +static int +xfs_dax_notify_failure( + struct dax_device *dax_dev, + u64 offset, + u64 len, + int mf_flags) +{ + struct xfs_mount *mp = dax_holder(dax_dev); + u64 ddev_start; + u64 ddev_end; + + if (!(mp->m_super->s_flags & SB_BORN)) { + xfs_warn(mp, "filesystem is not ready for notify_failure()!"); + return -EIO; + } + + if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) { + xfs_debug(mp, + "notify_failure() not supported on realtime device!"); + return -EOPNOTSUPP; + } + + if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev && + mp->m_logdev_targp != mp->m_ddev_targp) { + xfs_err(mp, "ondisk log corrupt, shutting down fs!"); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); + return -EFSCORRUPTED; + } + + if (!xfs_has_rmapbt(mp)) { + xfs_debug(mp, "notify_failure() needs rmapbt enabled!"); + return -EOPNOTSUPP; + } + + ddev_start = mp->m_ddev_targp->bt_dax_part_off; + ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1; + + /* Ignore the range out of filesystem area */ + if (offset + len < ddev_start) + return -ENXIO; + if (offset > ddev_end) + return -ENXIO; + + /* Calculate the real range when it touches the boundary */ + if (offset > ddev_start) + offset -= ddev_start; + else { + len -= ddev_start - offset; + offset = 0; + } + if (offset + len > ddev_end) + len -= ddev_end - offset; + + return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len), + mf_flags); +} + +const struct dax_holder_operations xfs_dax_holder_operations = { + .notify_failure = xfs_dax_notify_failure, +}; diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 25991923c1a8..9737b5a9f405 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -118,10 +118,10 @@ xfs_check_ondisk_structs(void) /* log structures */ XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88); XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24); - XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 28); - XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 32); - XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_32, 28); - XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_64, 32); + XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_32, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_64, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_extent_32, 12); XFS_CHECK_STRUCT_SIZE(struct xfs_extent_64, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode, 176); @@ -132,6 +132,23 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20); XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_attri_log_format, 40); + XFS_CHECK_STRUCT_SIZE(struct xfs_attrd_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_bui_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_bud_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_cui_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_cud_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_rui_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_rud_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_map_extent, 32); + XFS_CHECK_STRUCT_SIZE(struct xfs_phys_extent, 16); + + XFS_CHECK_OFFSET(struct xfs_bui_log_format, bui_extents, 16); + XFS_CHECK_OFFSET(struct xfs_cui_log_format, cui_extents, 16); + XFS_CHECK_OFFSET(struct xfs_rui_log_format, rui_extents, 16); + XFS_CHECK_OFFSET(struct xfs_efi_log_format, efi_extents, 16); + XFS_CHECK_OFFSET(struct xfs_efi_log_format_32, efi_extents, 16); + XFS_CHECK_OFFSET(struct xfs_efi_log_format_64, efi_extents, 16); /* * The v5 superblock format extended several v4 header structures with diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 5e1d29d8b2e7..37a24f0f7cd4 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -71,6 +71,40 @@ xfs_fs_get_uuid( } /* + * We cannot use file based VFS helpers such as file_modified() to update + * inode state as we modify the data/metadata in the inode here. Hence we have + * to open code the timestamp updates and SUID/SGID stripping. We also need + * to set the inode prealloc flag to ensure that the extents we allocate are not + * removed if the inode is reclaimed from memory before xfs_fs_block_commit() + * is from the client to indicate that data has been written and the file size + * can be extended. + */ +static int +xfs_fs_map_update_inode( + struct xfs_inode *ip) +{ + struct xfs_trans *tp; + int error; + + error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid, + 0, 0, 0, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + VFS_I(ip)->i_mode &= ~S_ISUID; + if (VFS_I(ip)->i_mode & S_IXGRP) + VFS_I(ip)->i_mode &= ~S_ISGID; + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + ip->i_diflags |= XFS_DIFLAG_PREALLOC; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return xfs_trans_commit(tp); +} + +/* * Get a layout for the pNFS client. */ int @@ -155,7 +189,7 @@ xfs_fs_map_blocks( xfs_iunlock(ip, lock_flags); error = xfs_iomap_write_direct(ip, offset_fsb, - end_fsb - offset_fsb, &imap); + end_fsb - offset_fsb, 0, &imap); if (error) goto out_unlock; @@ -164,16 +198,18 @@ xfs_fs_map_blocks( * that the blocks allocated and handed out to the client are * guaranteed to be present even after a server crash. */ - error = xfs_update_prealloc_flags(ip, - XFS_PREALLOC_SET | XFS_PREALLOC_SYNC); + error = xfs_fs_map_update_inode(ip); + if (!error) + error = xfs_log_force_inode(ip); if (error) goto out_unlock; + } else { xfs_iunlock(ip, lock_flags); } xfs_iunlock(ip, XFS_IOLOCK_EXCL); - error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0); *device_generation = mp->m_generation; return error; out_unlock: @@ -255,7 +291,7 @@ xfs_fs_commit_blocks( length = end - start; if (!length) continue; - + /* * Make sure reads through the pagecache see the new data. */ @@ -283,7 +319,8 @@ xfs_fs_commit_blocks( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_setattr_time(ip, iattr); + ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID))); + setattr_copy(&init_user_ns, inode, iattr); if (update_isize) { i_size_write(inode, iattr->ia_size); ip->i_disk_size = iattr->ia_size; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 32ac8d9c8940..18bb4ec4d7c9 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -25,6 +25,7 @@ #include "xfs_error.h" #include "xfs_ag.h" #include "xfs_ialloc.h" +#include "xfs_log_priv.h" /* * The global quota manager. There is only one of these for the entire @@ -121,8 +122,7 @@ xfs_qm_dqpurge( struct xfs_dquot *dqp, void *data) { - struct xfs_mount *mp = dqp->q_mount; - struct xfs_quotainfo *qi = mp->m_quotainfo; + struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; int error = -EAGAIN; xfs_dqlock(dqp); @@ -157,7 +157,7 @@ xfs_qm_dqpurge( } ASSERT(atomic_read(&dqp->q_pincount) == 0); - ASSERT(xfs_is_shutdown(mp) || + ASSERT(xlog_is_shutdown(dqp->q_logitem.qli_item.li_log) || !test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags)); xfs_dqfunlock(dqp); @@ -172,7 +172,7 @@ xfs_qm_dqpurge( */ ASSERT(!list_empty(&dqp->q_lru)); list_lru_del(&qi->qi_lru, &dqp->q_lru); - XFS_STATS_DEC(mp, xs_qm_dquot_unused); + XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused); xfs_qm_dqdestroy(dqp); return 0; @@ -582,9 +582,6 @@ xfs_qm_init_timelimits( defq->blk.time = XFS_QM_BTIMELIMIT; defq->ino.time = XFS_QM_ITIMELIMIT; defq->rtb.time = XFS_QM_RTBTIMELIMIT; - defq->blk.warn = XFS_QM_BWARNLIMIT; - defq->ino.warn = XFS_QM_IWARNLIMIT; - defq->rtb.warn = XFS_QM_RTBWARNLIMIT; /* * We try to get the limits from the superuser's limits fields. @@ -608,12 +605,6 @@ xfs_qm_init_timelimits( defq->ino.time = dqp->q_ino.timer; if (dqp->q_rtb.timer) defq->rtb.time = dqp->q_rtb.timer; - if (dqp->q_blk.warnings) - defq->blk.warn = dqp->q_blk.warnings; - if (dqp->q_ino.warnings) - defq->ino.warn = dqp->q_ino.warnings; - if (dqp->q_rtb.warnings) - defq->rtb.warn = dqp->q_rtb.warnings; xfs_qm_dqdestroy(dqp); } @@ -686,7 +677,8 @@ xfs_qm_init_quotainfo( qinf->qi_shrinker.seeks = DEFAULT_SEEKS; qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; - error = register_shrinker(&qinf->qi_shrinker); + error = register_shrinker(&qinf->qi_shrinker, "xfs-qm:%s", + mp->m_super->s_id); if (error) goto out_free_inos; @@ -1163,7 +1155,7 @@ xfs_qm_dqusage_adjust( ASSERT(ip->i_delayed_blks == 0); if (XFS_IS_REALTIME_INODE(ip)) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); if (error) @@ -1238,10 +1230,14 @@ xfs_qm_flush_one( */ if (!xfs_dqflock_nowait(dqp)) { /* buf is pinned in-core by delwri list */ - bp = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, 0); - if (!bp) { - error = -EINVAL; + error = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0, &bp); + if (error) + goto out_unlock; + + if (!(bp->b_flags & _XBF_DELWRI_Q)) { + error = -EAGAIN; + xfs_buf_relse(bp); goto out_unlock; } xfs_buf_unlock(bp); @@ -1317,8 +1313,15 @@ xfs_qm_quotacheck( error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true, NULL); - if (error) + if (error) { + /* + * The inode walk may have partially populated the dquot + * caches. We must purge them before disabling quota and + * tearing down the quotainfo, or else the dquots will leak. + */ + xfs_qm_dqpurge_all(mp); goto error_return; + } /* * We've made all the changes that we need to make incore. Flush them diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 5bb12717ea28..9683f0457d19 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -34,7 +34,6 @@ struct xfs_quota_limits { xfs_qcnt_t hard; /* default hard limit */ xfs_qcnt_t soft; /* default soft limit */ time64_t time; /* limit for timers */ - xfs_qwarncnt_t warn; /* limit for warnings */ }; /* Defaults for each quota type: time limits, warn limits, usage limits */ @@ -134,10 +133,6 @@ struct xfs_dquot_acct { #define XFS_QM_RTBTIMELIMIT (7 * 24*60*60) /* 1 week */ #define XFS_QM_ITIMELIMIT (7 * 24*60*60) /* 1 week */ -#define XFS_QM_BWARNLIMIT 5 -#define XFS_QM_IWARNLIMIT 5 -#define XFS_QM_RTBWARNLIMIT 5 - extern void xfs_qm_destroy_quotainfo(struct xfs_mount *); /* quota ops */ diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 47fe60e1a887..392cb39cc10c 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -217,8 +217,7 @@ xfs_qm_scall_quotaon( return 0; } -#define XFS_QC_MASK \ - (QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK) +#define XFS_QC_MASK (QC_LIMIT_MASK | QC_TIMER_MASK) /* * Adjust limits of this quota, and the defaults if passed in. Returns true @@ -251,17 +250,6 @@ xfs_setqlim_limits( } static inline void -xfs_setqlim_warns( - struct xfs_dquot_res *res, - struct xfs_quota_limits *qlim, - int warns) -{ - res->warnings = warns; - if (qlim) - qlim->warn = warns; -} - -static inline void xfs_setqlim_timer( struct xfs_mount *mp, struct xfs_dquot_res *res, @@ -303,13 +291,6 @@ xfs_qm_scall_setqlim( return 0; /* - * We don't want to race with a quotaoff so take the quotaoff lock. - * We don't hold an inode lock, so there's nothing else to stop - * a quotaoff from happening. - */ - mutex_lock(&q->qi_quotaofflock); - - /* * Get the dquot (locked) before we start, as we need to do a * transaction to allocate it if it doesn't exist. Once we have the * dquot, unlock it so we can start the next transaction safely. We hold @@ -319,7 +300,7 @@ xfs_qm_scall_setqlim( error = xfs_qm_dqget(mp, id, type, true, &dqp); if (error) { ASSERT(error != -ENOENT); - goto out_unlock; + return error; } defq = xfs_get_defquota(q, xfs_dquot_type(dqp)); @@ -361,8 +342,6 @@ xfs_qm_scall_setqlim( if (xfs_setqlim_limits(mp, res, qlim, hard, soft, "blk")) xfs_dquot_set_prealloc_limits(dqp); - if (newlim->d_fieldmask & QC_SPC_WARNS) - xfs_setqlim_warns(res, qlim, newlim->d_spc_warns); if (newlim->d_fieldmask & QC_SPC_TIMER) xfs_setqlim_timer(mp, res, qlim, newlim->d_spc_timer); @@ -377,8 +356,6 @@ xfs_qm_scall_setqlim( qlim = id == 0 ? &defq->rtb : NULL; xfs_setqlim_limits(mp, res, qlim, hard, soft, "rtb"); - if (newlim->d_fieldmask & QC_RT_SPC_WARNS) - xfs_setqlim_warns(res, qlim, newlim->d_rt_spc_warns); if (newlim->d_fieldmask & QC_RT_SPC_TIMER) xfs_setqlim_timer(mp, res, qlim, newlim->d_rt_spc_timer); @@ -393,8 +370,6 @@ xfs_qm_scall_setqlim( qlim = id == 0 ? &defq->ino : NULL; xfs_setqlim_limits(mp, res, qlim, hard, soft, "ino"); - if (newlim->d_fieldmask & QC_INO_WARNS) - xfs_setqlim_warns(res, qlim, newlim->d_ino_warns); if (newlim->d_fieldmask & QC_INO_TIMER) xfs_setqlim_timer(mp, res, qlim, newlim->d_ino_timer); @@ -415,8 +390,6 @@ xfs_qm_scall_setqlim( out_rele: xfs_qm_dqrele(dqp); -out_unlock: - mutex_unlock(&q->qi_quotaofflock); return error; } @@ -437,13 +410,13 @@ xfs_qm_scall_getquota_fill_qc( dst->d_ino_count = dqp->q_ino.reserved; dst->d_spc_timer = dqp->q_blk.timer; dst->d_ino_timer = dqp->q_ino.timer; - dst->d_ino_warns = dqp->q_ino.warnings; - dst->d_spc_warns = dqp->q_blk.warnings; + dst->d_ino_warns = 0; + dst->d_spc_warns = 0; dst->d_rt_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.hardlimit); dst->d_rt_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.softlimit); dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_rtb.reserved); dst->d_rt_spc_timer = dqp->q_rtb.timer; - dst->d_rt_spc_warns = dqp->q_rtb.warnings; + dst->d_rt_spc_warns = 0; /* * Internally, we don't reset all the timers when quota enforcement @@ -481,9 +454,12 @@ xfs_qm_scall_getquota( struct xfs_dquot *dqp; int error; - /* Flush inodegc work at the start of a quota reporting scan. */ + /* + * Expedite pending inodegc work at the start of a quota reporting + * scan but don't block waiting for it to complete. + */ if (id == 0) - xfs_inodegc_flush(mp); + xfs_inodegc_push(mp); /* * Try to get the dquot. We don't want it allocated on disk, so don't @@ -525,7 +501,7 @@ xfs_qm_scall_getquota_next( /* Flush inodegc work at the start of a quota reporting scan. */ if (*id == 0) - xfs_inodegc_flush(mp); + xfs_inodegc_push(mp); error = xfs_qm_dqget_next(mp, *id, type, &dqp); if (error) diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 07989bd67728..9c162e69976b 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -40,9 +40,9 @@ xfs_qm_fill_state( tstate->spc_timelimit = (u32)defq->blk.time; tstate->ino_timelimit = (u32)defq->ino.time; tstate->rt_spc_timelimit = (u32)defq->rtb.time; - tstate->spc_warnlimit = defq->blk.warn; - tstate->ino_warnlimit = defq->ino.warn; - tstate->rt_spc_warnlimit = defq->rtb.warn; + tstate->spc_warnlimit = 0; + tstate->ino_warnlimit = 0; + tstate->rt_spc_warnlimit = 0; if (tempqip) xfs_irele(ip); } @@ -98,7 +98,7 @@ xfs_quota_type(int type) } } -#define XFS_QC_SETINFO_MASK (QC_TIMER_MASK | QC_WARNS_MASK) +#define XFS_QC_SETINFO_MASK (QC_TIMER_MASK) /* * Adjust quota timers & warnings diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index d3da67772d57..858e3e9eb4a8 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -35,6 +35,7 @@ STATIC void xfs_cui_item_free( struct xfs_cui_log_item *cuip) { + kmem_free(cuip->cui_item.li_lv_shadow); if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS) kmem_free(cuip); else @@ -53,10 +54,11 @@ xfs_cui_release( struct xfs_cui_log_item *cuip) { ASSERT(atomic_read(&cuip->cui_refcount) > 0); - if (atomic_dec_and_test(&cuip->cui_refcount)) { - xfs_trans_ail_delete(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR); - xfs_cui_item_free(cuip); - } + if (!atomic_dec_and_test(&cuip->cui_refcount)) + return; + + xfs_trans_ail_delete(&cuip->cui_item, 0); + xfs_cui_item_free(cuip); } @@ -204,14 +206,24 @@ xfs_cud_item_release( struct xfs_cud_log_item *cudp = CUD_ITEM(lip); xfs_cui_release(cudp->cud_cuip); + kmem_free(cudp->cud_item.li_lv_shadow); kmem_cache_free(xfs_cud_cache, cudp); } +static struct xfs_log_item * +xfs_cud_item_intent( + struct xfs_log_item *lip) +{ + return &CUD_ITEM(lip)->cud_cuip->cui_item; +} + static const struct xfs_item_ops xfs_cud_item_ops = { - .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED, + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED | + XFS_ITEM_INTENT_DONE, .iop_size = xfs_cud_item_size, .iop_format = xfs_cud_item_format, .iop_release = xfs_cud_item_release, + .iop_intent = xfs_cud_item_intent, }; static struct xfs_cud_log_item * @@ -259,7 +271,7 @@ xfs_trans_log_finish_refcount_update( * 1.) releases the CUI and frees the CUD * 2.) shuts down the filesystem */ - tp->t_flags |= XFS_TRANS_DIRTY; + tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); return error; @@ -457,7 +469,7 @@ xfs_cui_item_recover( struct xfs_cud_log_item *cudp; struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; - struct xfs_mount *mp = lip->li_mountp; + struct xfs_mount *mp = lip->li_log->l_mp; xfs_fsblock_t new_fsb; xfs_extlen_t new_len; unsigned int refc_type; @@ -511,7 +523,9 @@ xfs_cui_item_recover( type = refc_type; break; default: - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &cuip->cui_format, + sizeof(cuip->cui_format)); error = -EFSCORRUPTED; goto abort_error; } @@ -524,7 +538,8 @@ xfs_cui_item_recover( &new_fsb, &new_len, &rcur); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - refc, sizeof(*refc)); + &cuip->cui_format, + sizeof(cuip->cui_format)); if (error) goto abort_error; @@ -600,6 +615,7 @@ xfs_cui_item_relog( } static const struct xfs_item_ops xfs_cui_item_ops = { + .flags = XFS_ITEM_INTENT, .iop_size = xfs_cui_item_size, .iop_format = xfs_cui_item_format, .iop_unpin = xfs_cui_item_unpin, @@ -609,28 +625,18 @@ static const struct xfs_item_ops xfs_cui_item_ops = { .iop_relog = xfs_cui_item_relog, }; -/* - * Copy an CUI format buffer from the given buf, and into the destination - * CUI format structure. The CUI/CUD items were designed not to need any - * special alignment handling. - */ -static int +static inline void xfs_cui_copy_format( - struct xfs_log_iovec *buf, - struct xfs_cui_log_format *dst_cui_fmt) + struct xfs_cui_log_format *dst, + const struct xfs_cui_log_format *src) { - struct xfs_cui_log_format *src_cui_fmt; - uint len; + unsigned int i; - src_cui_fmt = buf->i_addr; - len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents); + memcpy(dst, src, offsetof(struct xfs_cui_log_format, cui_extents)); - if (buf->i_len == len) { - memcpy(dst_cui_fmt, src_cui_fmt, len); - return 0; - } - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); - return -EFSCORRUPTED; + for (i = 0; i < src->cui_nextents; i++) + memcpy(&dst->cui_extents[i], &src->cui_extents[i], + sizeof(struct xfs_phys_extent)); } /* @@ -647,19 +653,28 @@ xlog_recover_cui_commit_pass2( struct xlog_recover_item *item, xfs_lsn_t lsn) { - int error; struct xfs_mount *mp = log->l_mp; struct xfs_cui_log_item *cuip; struct xfs_cui_log_format *cui_formatp; + size_t len; cui_formatp = item->ri_buf[0].i_addr; - cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); - error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format); - if (error) { - xfs_cui_item_free(cuip); - return error; + if (item->ri_buf[0].i_len < xfs_cui_log_format_sizeof(0)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + len = xfs_cui_log_format_sizeof(cui_formatp->cui_nextents); + if (item->ri_buf[0].i_len != len) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; } + + cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); + xfs_cui_copy_format(&cuip->cui_format, cui_formatp); atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); /* * Insert the intent into the AIL directly and drop one reference so @@ -693,7 +708,8 @@ xlog_recover_cud_commit_pass2( cud_formatp = item->ri_buf[0].i_addr; if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index cb0edb1d68ef..93bdd25680bc 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -125,11 +125,10 @@ * shared blocks. If there are no shared extents, fbno and flen will * be set to NULLAGBLOCK and 0, respectively. */ -int +static int xfs_reflink_find_shared( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_trans *tp, - xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, @@ -140,11 +139,11 @@ xfs_reflink_find_shared( struct xfs_btree_cur *cur; int error; - error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (error) return error; - cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agbp->b_pag); + cur = xfs_refcountbt_init_cursor(pag->pag_mount, tp, agbp, pag); error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen, find_end_of_shared); @@ -171,7 +170,8 @@ xfs_reflink_trim_around_shared( struct xfs_bmbt_irec *irec, bool *shared) { - xfs_agnumber_t agno; + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; xfs_agblock_t agbno; xfs_extlen_t aglen; xfs_agblock_t fbno; @@ -186,12 +186,13 @@ xfs_reflink_trim_around_shared( trace_xfs_reflink_trim_around_shared(ip, irec); - agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock); - agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock); + pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock)); + agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock); aglen = irec->br_blockcount; - error = xfs_reflink_find_shared(ip->i_mount, NULL, agno, agbno, - aglen, &fbno, &flen, true); + error = xfs_reflink_find_shared(pag, NULL, agbno, aglen, &fbno, &flen, + true); + xfs_perag_put(pag); if (error) return error; @@ -199,7 +200,9 @@ xfs_reflink_trim_around_shared( if (fbno == NULLAGBLOCK) { /* No shared blocks at all. */ return 0; - } else if (fbno == agbno) { + } + + if (fbno == agbno) { /* * The start of this extent is shared. Truncate the * mapping at the end of the shared region so that a @@ -209,16 +212,16 @@ xfs_reflink_trim_around_shared( irec->br_blockcount = flen; *shared = true; return 0; - } else { - /* - * There's a shared extent midway through this extent. - * Truncate the mapping at the start of the shared - * extent so that a subsequent iteration starts at the - * start of the shared region. - */ - irec->br_blockcount = fbno - agbno; - return 0; } + + /* + * There's a shared extent midway through this extent. + * Truncate the mapping at the start of the shared + * extent so that a subsequent iteration starts at the + * start of the shared region. + */ + irec->br_blockcount = fbno - agbno; + return 0; } int @@ -340,9 +343,41 @@ xfs_find_trim_cow_extent( return 0; } -/* Allocate all CoW reservations covering a range of blocks in a file. */ -int -xfs_reflink_allocate_cow( +static int +xfs_reflink_convert_unwritten( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + struct xfs_bmbt_irec *cmap, + bool convert_now) +{ + xfs_fileoff_t offset_fsb = imap->br_startoff; + xfs_filblks_t count_fsb = imap->br_blockcount; + int error; + + /* + * cmap might larger than imap due to cowextsize hint. + */ + xfs_trim_extent(cmap, offset_fsb, count_fsb); + + /* + * COW fork extents are supposed to remain unwritten until we're ready + * to initiate a disk write. For direct I/O we are going to write the + * data and need the conversion, but for buffered writes we're done. + */ + if (!convert_now || cmap->br_state == XFS_EXT_NORM) + return 0; + + trace_xfs_reflink_convert_cow(ip, cmap); + + error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); + if (!error) + cmap->br_state = XFS_EXT_NORM; + + return error; +} + +static int +xfs_reflink_fill_cow_hole( struct xfs_inode *ip, struct xfs_bmbt_irec *imap, struct xfs_bmbt_irec *cmap, @@ -351,25 +386,12 @@ xfs_reflink_allocate_cow( bool convert_now) { struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t offset_fsb = imap->br_startoff; - xfs_filblks_t count_fsb = imap->br_blockcount; struct xfs_trans *tp; - int nimaps, error = 0; - bool found; xfs_filblks_t resaligned; - xfs_extlen_t resblks = 0; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - if (!ip->i_cowfp) { - ASSERT(!xfs_is_reflink_inode(ip)); - xfs_ifork_init_cow(ip); - } - - error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); - if (error || !*shared) - return error; - if (found) - goto convert; + xfs_extlen_t resblks; + int nimaps; + int error; + bool found; resaligned = xfs_aligned_fsb_count(imap->br_startoff, imap->br_blockcount, xfs_get_cowextsz_hint(ip)); @@ -385,17 +407,17 @@ xfs_reflink_allocate_cow( *lockmode = XFS_ILOCK_EXCL; - /* - * Check for an overlapping extent again now that we dropped the ilock. - */ error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); if (error || !*shared) goto out_trans_cancel; + if (found) { xfs_trans_cancel(tp); goto convert; } + ASSERT(cmap->br_startoff > imap->br_startoff); + /* Allocate the entire reservation as unwritten blocks. */ nimaps = 1; error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount, @@ -415,23 +437,135 @@ xfs_reflink_allocate_cow( */ if (nimaps == 0) return -ENOSPC; + convert: - xfs_trim_extent(cmap, offset_fsb, count_fsb); - /* - * COW fork extents are supposed to remain unwritten until we're ready - * to initiate a disk write. For direct I/O we are going to write the - * data and need the conversion, but for buffered writes we're done. - */ - if (!convert_now || cmap->br_state == XFS_EXT_NORM) - return 0; - trace_xfs_reflink_convert_cow(ip, cmap); - return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); + return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now); out_trans_cancel: xfs_trans_cancel(tp); return error; } +static int +xfs_reflink_fill_delalloc( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + struct xfs_bmbt_irec *cmap, + bool *shared, + uint *lockmode, + bool convert_now) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int nimaps; + int error; + bool found; + + do { + xfs_iunlock(ip, *lockmode); + *lockmode = 0; + + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 0, 0, + false, &tp); + if (error) + return error; + + *lockmode = XFS_ILOCK_EXCL; + + error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, + &found); + if (error || !*shared) + goto out_trans_cancel; + + if (found) { + xfs_trans_cancel(tp); + break; + } + + ASSERT(isnullstartblock(cmap->br_startblock) || + cmap->br_startblock == DELAYSTARTBLOCK); + + /* + * Replace delalloc reservation with an unwritten extent. + */ + nimaps = 1; + error = xfs_bmapi_write(tp, ip, cmap->br_startoff, + cmap->br_blockcount, + XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, + cmap, &nimaps); + if (error) + goto out_trans_cancel; + + xfs_inode_set_cowblocks_tag(ip); + error = xfs_trans_commit(tp); + if (error) + return error; + + /* + * Allocation succeeded but the requested range was not even + * partially satisfied? Bail out! + */ + if (nimaps == 0) + return -ENOSPC; + } while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff); + + return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now); + +out_trans_cancel: + xfs_trans_cancel(tp); + return error; +} + +/* Allocate all CoW reservations covering a range of blocks in a file. */ +int +xfs_reflink_allocate_cow( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + struct xfs_bmbt_irec *cmap, + bool *shared, + uint *lockmode, + bool convert_now) +{ + int error; + bool found; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + if (!ip->i_cowfp) { + ASSERT(!xfs_is_reflink_inode(ip)); + xfs_ifork_init_cow(ip); + } + + error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); + if (error || !*shared) + return error; + + /* CoW fork has a real extent */ + if (found) + return xfs_reflink_convert_unwritten(ip, imap, cmap, + convert_now); + + /* + * CoW fork does not have an extent and data extent is shared. + * Allocate a real extent in the CoW fork. + */ + if (cmap->br_startoff > imap->br_startoff) + return xfs_reflink_fill_cow_hole(ip, imap, cmap, shared, + lockmode, convert_now); + + /* + * CoW fork has a delalloc reservation. Replace it with a real extent. + * There may or may not be a data fork mapping. + */ + if (isnullstartblock(cmap->br_startblock) || + cmap->br_startblock == DELAYSTARTBLOCK) + return xfs_reflink_fill_delalloc(ip, imap, cmap, shared, + lockmode, convert_now); + + /* Shouldn't get here. */ + ASSERT(0); + return -EFSCORRUPTED; +} + /* * Cancel CoW reservations for some block range of an inode. * @@ -449,7 +583,7 @@ xfs_reflink_cancel_cow_blocks( xfs_fileoff_t end_fsb, bool cancel_real) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); struct xfs_bmbt_irec got, del; struct xfs_iext_cursor icur; int error = 0; @@ -583,21 +717,21 @@ out: STATIC int xfs_reflink_end_cow_extent( struct xfs_inode *ip, - xfs_fileoff_t offset_fsb, - xfs_fileoff_t *end_fsb) + xfs_fileoff_t *offset_fsb, + xfs_fileoff_t end_fsb) { - struct xfs_bmbt_irec got, del; struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got, del, data; struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - xfs_filblks_t rlen; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); unsigned int resblks; + int nmaps; int error; /* No COW extents? That's easy! */ if (ifp->if_bytes == 0) { - *end_fsb = offset_fsb; + *offset_fsb = end_fsb; return 0; } @@ -617,6 +751,9 @@ xfs_reflink_end_cow_extent( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_REFLINK_END_COW_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_REFLINK_END_COW_CNT); if (error) goto out_cancel; @@ -625,42 +762,66 @@ xfs_reflink_end_cow_extent( * left by the time I/O completes for the loser of the race. In that * case we are done. */ - if (!xfs_iext_lookup_extent_before(ip, ifp, end_fsb, &icur, &got) || - got.br_startoff + got.br_blockcount <= offset_fsb) { - *end_fsb = offset_fsb; + if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) || + got.br_startoff >= end_fsb) { + *offset_fsb = end_fsb; goto out_cancel; } /* - * Structure copy @got into @del, then trim @del to the range that we - * were asked to remap. We preserve @got for the eventual CoW fork - * deletion; from now on @del represents the mapping that we're - * actually remapping. - */ - del = got; - xfs_trim_extent(&del, offset_fsb, *end_fsb - offset_fsb); - - ASSERT(del.br_blockcount > 0); - - /* * Only remap real extents that contain data. With AIO, speculative * preallocations can leak into the range we are called upon, and we - * need to skip them. + * need to skip them. Preserve @got for the eventual CoW fork + * deletion; from now on @del represents the mapping that we're + * actually remapping. */ - if (!xfs_bmap_is_written_extent(&got)) { - *end_fsb = del.br_startoff; - goto out_cancel; + while (!xfs_bmap_is_written_extent(&got)) { + if (!xfs_iext_next_extent(ifp, &icur, &got) || + got.br_startoff >= end_fsb) { + *offset_fsb = end_fsb; + goto out_cancel; + } } + del = got; - /* Unmap the old blocks in the data fork. */ - rlen = del.br_blockcount; - error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1); + /* Grab the corresponding mapping in the data fork. */ + nmaps = 1; + error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data, + &nmaps, 0); if (error) goto out_cancel; - /* Trim the extent to whatever got unmapped. */ - xfs_trim_extent(&del, del.br_startoff + rlen, del.br_blockcount - rlen); - trace_xfs_reflink_cow_remap(ip, &del); + /* We can only remap the smaller of the two extent sizes. */ + data.br_blockcount = min(data.br_blockcount, del.br_blockcount); + del.br_blockcount = data.br_blockcount; + + trace_xfs_reflink_cow_remap_from(ip, &del); + trace_xfs_reflink_cow_remap_to(ip, &data); + + if (xfs_bmap_is_real_extent(&data)) { + /* + * If the extent we're remapping is backed by storage (written + * or not), unmap the extent and drop its refcount. + */ + xfs_bmap_unmap_extent(tp, ip, &data); + xfs_refcount_decrease_extent(tp, &data); + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, + -data.br_blockcount); + } else if (data.br_startblock == DELAYSTARTBLOCK) { + int done; + + /* + * If the extent we're remapping is a delalloc reservation, + * we can use the regular bunmapi function to release the + * incore state. Dropping the delalloc reservation takes care + * of the quota reservation for us. + */ + error = xfs_bunmapi(NULL, ip, data.br_startoff, + data.br_blockcount, 0, 1, &done); + if (error) + goto out_cancel; + ASSERT(done); + } /* Free the CoW orphan record. */ xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount); @@ -681,7 +842,7 @@ xfs_reflink_end_cow_extent( return error; /* Update the caller about how much progress we made. */ - *end_fsb = del.br_startoff; + *offset_fsb = del.br_startoff + del.br_blockcount; return 0; out_cancel: @@ -709,7 +870,7 @@ xfs_reflink_end_cow( end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); /* - * Walk backwards until we're out of the I/O range. The loop function + * Walk forwards until we've remapped the I/O range. The loop function * repeatedly cycles the ILOCK to allocate one transaction per remapped * extent. * @@ -741,7 +902,7 @@ xfs_reflink_end_cow( * blocks will be remapped. */ while (end_fsb > offset_fsb && !error) - error = xfs_reflink_end_cow_extent(ip, offset_fsb, &end_fsb); + error = xfs_reflink_end_cow_extent(ip, &offset_fsb, end_fsb); if (error) trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_); @@ -749,7 +910,10 @@ xfs_reflink_end_cow( } /* - * Free leftover CoW reservations that didn't get cleaned out. + * Free all CoW staging blocks that are still referenced by the ondisk refcount + * metadata. The ondisk metadata does not track which inode created the + * staging extent, so callers must ensure that there are no cached inodes with + * live CoW staging extents. */ int xfs_reflink_recover_cow( @@ -1115,6 +1279,8 @@ xfs_reflink_remap_extent( ++iext_delta; error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, iext_delta); if (error) goto out_cancel; @@ -1127,7 +1293,7 @@ xfs_reflink_remap_extent( xfs_refcount_decrease_extent(tp, &smap); qdelta -= smap.br_blockcount; } else if (smap.br_startblock == DELAYSTARTBLOCK) { - xfs_filblks_t len = smap.br_blockcount; + int done; /* * If the extent we're unmapping is a delalloc reservation, @@ -1135,10 +1301,11 @@ xfs_reflink_remap_extent( * incore state. Dropping the delalloc reservation takes care * of the quota reservation for us. */ - error = __xfs_bunmapi(NULL, ip, smap.br_startoff, &len, 0, 1); + error = xfs_bunmapi(NULL, ip, smap.br_startoff, + smap.br_blockcount, 0, 1, &done); if (error) goto out_cancel; - ASSERT(len == 0); + ASSERT(done); } /* @@ -1269,8 +1436,7 @@ xfs_reflink_zero_posteof( return 0; trace_xfs_zero_eof(ip, isize, pos - isize); - return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL, - &xfs_buffered_write_iomap_ops); + return xfs_zero_range(ip, isize, pos - isize, NULL); } /* @@ -1328,12 +1494,16 @@ xfs_reflink_remap_prep( if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest)) goto out_unlock; - /* Don't share DAX file data for now. */ - if (IS_DAX(inode_in) || IS_DAX(inode_out)) + /* Don't share DAX file data with non-DAX file. */ + if (IS_DAX(inode_in) != IS_DAX(inode_out)) goto out_unlock; - ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, - len, remap_flags); + if (!IS_DAX(inode_in)) + ret = generic_remap_file_range_prep(file_in, pos_in, file_out, + pos_out, len, remap_flags); + else + ret = dax_remap_file_range_prep(file_in, pos_in, file_out, + pos_out, len, remap_flags, &xfs_read_iomap_ops); if (ret || *len == 0) goto out_unlock; @@ -1385,16 +1555,11 @@ xfs_reflink_inode_has_shared_extents( struct xfs_bmbt_irec got; struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; - xfs_agnumber_t agno; - xfs_agblock_t agbno; - xfs_extlen_t aglen; - xfs_agblock_t rbno; - xfs_extlen_t rlen; struct xfs_iext_cursor icur; bool found; int error; - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); if (error) return error; @@ -1402,17 +1567,25 @@ xfs_reflink_inode_has_shared_extents( *has_shared = false; found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got); while (found) { + struct xfs_perag *pag; + xfs_agblock_t agbno; + xfs_extlen_t aglen; + xfs_agblock_t rbno; + xfs_extlen_t rlen; + if (isnullstartblock(got.br_startblock) || got.br_state != XFS_EXT_NORM) goto next; - agno = XFS_FSB_TO_AGNO(mp, got.br_startblock); + + pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, got.br_startblock)); agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock); aglen = got.br_blockcount; - - error = xfs_reflink_find_shared(mp, tp, agno, agbno, aglen, + error = xfs_reflink_find_shared(pag, tp, agbno, aglen, &rbno, &rlen, false); + xfs_perag_put(pag); if (error) return error; + /* Is there still a shared block here? */ if (rbno != NULLAGBLOCK) { *has_shared = true; diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index bea65f2fe657..65c5dfe17ecf 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -16,9 +16,6 @@ static inline bool xfs_is_cow_inode(struct xfs_inode *ip) return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip); } -extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen, - xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal); extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, struct xfs_bmbt_irec *irec, bool *shared); int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index c3966b4c58ef..534504ede1a3 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -35,6 +35,7 @@ STATIC void xfs_rui_item_free( struct xfs_rui_log_item *ruip) { + kmem_free(ruip->rui_item.li_lv_shadow); if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS) kmem_free(ruip); else @@ -53,10 +54,11 @@ xfs_rui_release( struct xfs_rui_log_item *ruip) { ASSERT(atomic_read(&ruip->rui_refcount) > 0); - if (atomic_dec_and_test(&ruip->rui_refcount)) { - xfs_trans_ail_delete(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR); - xfs_rui_item_free(ruip); - } + if (!atomic_dec_and_test(&ruip->rui_refcount)) + return; + + xfs_trans_ail_delete(&ruip->rui_item, 0); + xfs_rui_item_free(ruip); } STATIC void @@ -153,31 +155,6 @@ xfs_rui_init( return ruip; } -/* - * Copy an RUI format buffer from the given buf, and into the destination - * RUI format structure. The RUI/RUD items were designed not to need any - * special alignment handling. - */ -STATIC int -xfs_rui_copy_format( - struct xfs_log_iovec *buf, - struct xfs_rui_log_format *dst_rui_fmt) -{ - struct xfs_rui_log_format *src_rui_fmt; - uint len; - - src_rui_fmt = buf->i_addr; - len = xfs_rui_log_format_sizeof(src_rui_fmt->rui_nextents); - - if (buf->i_len != len) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); - return -EFSCORRUPTED; - } - - memcpy(dst_rui_fmt, src_rui_fmt, len); - return 0; -} - static inline struct xfs_rud_log_item *RUD_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_rud_log_item, rud_item); @@ -227,14 +204,24 @@ xfs_rud_item_release( struct xfs_rud_log_item *rudp = RUD_ITEM(lip); xfs_rui_release(rudp->rud_ruip); + kmem_free(rudp->rud_item.li_lv_shadow); kmem_cache_free(xfs_rud_cache, rudp); } +static struct xfs_log_item * +xfs_rud_item_intent( + struct xfs_log_item *lip) +{ + return &RUD_ITEM(lip)->rud_ruip->rui_item; +} + static const struct xfs_item_ops xfs_rud_item_ops = { - .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED, + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED | + XFS_ITEM_INTENT_DONE, .iop_size = xfs_rud_item_size, .iop_format = xfs_rud_item_format, .iop_release = xfs_rud_item_release, + .iop_intent = xfs_rud_item_intent, }; static struct xfs_rud_log_item * @@ -327,7 +314,7 @@ xfs_trans_log_finish_rmap_update( * 1.) releases the RUI and frees the RUD * 2.) shuts down the filesystem */ - tp->t_flags |= XFS_TRANS_DIRTY; + tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); return error; @@ -510,7 +497,7 @@ xfs_rui_item_recover( struct xfs_rud_log_item *rudp; struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; - struct xfs_mount *mp = lip->li_mountp; + struct xfs_mount *mp = lip->li_log->l_mp; enum xfs_rmap_intent_type type; xfs_exntst_t state; int i; @@ -570,7 +557,9 @@ xfs_rui_item_recover( type = XFS_RMAP_FREE; break; default: - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &ruip->rui_format, + sizeof(ruip->rui_format)); error = -EFSCORRUPTED; goto abort_error; } @@ -630,6 +619,7 @@ xfs_rui_item_relog( } static const struct xfs_item_ops xfs_rui_item_ops = { + .flags = XFS_ITEM_INTENT, .iop_size = xfs_rui_item_size, .iop_format = xfs_rui_item_format, .iop_unpin = xfs_rui_item_unpin, @@ -639,6 +629,20 @@ static const struct xfs_item_ops xfs_rui_item_ops = { .iop_relog = xfs_rui_item_relog, }; +static inline void +xfs_rui_copy_format( + struct xfs_rui_log_format *dst, + const struct xfs_rui_log_format *src) +{ + unsigned int i; + + memcpy(dst, src, offsetof(struct xfs_rui_log_format, rui_extents)); + + for (i = 0; i < src->rui_nextents; i++) + memcpy(&dst->rui_extents[i], &src->rui_extents[i], + sizeof(struct xfs_map_extent)); +} + /* * This routine is called to create an in-core extent rmap update * item from the rui format structure which was logged on disk. @@ -653,19 +657,28 @@ xlog_recover_rui_commit_pass2( struct xlog_recover_item *item, xfs_lsn_t lsn) { - int error; struct xfs_mount *mp = log->l_mp; struct xfs_rui_log_item *ruip; struct xfs_rui_log_format *rui_formatp; + size_t len; rui_formatp = item->ri_buf[0].i_addr; - ruip = xfs_rui_init(mp, rui_formatp->rui_nextents); - error = xfs_rui_copy_format(&item->ri_buf[0], &ruip->rui_format); - if (error) { - xfs_rui_item_free(ruip); - return error; + if (item->ri_buf[0].i_len < xfs_rui_log_format_sizeof(0)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + len = xfs_rui_log_format_sizeof(rui_formatp->rui_nextents); + if (item->ri_buf[0].i_len != len) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; } + + ruip = xfs_rui_init(mp, rui_formatp->rui_nextents); + xfs_rui_copy_format(&ruip->rui_format, rui_formatp); atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); /* * Insert the intent into the AIL directly and drop one reference so @@ -698,7 +711,11 @@ xlog_recover_rud_commit_pass2( struct xfs_rud_log_format *rud_formatp; rud_formatp = item->ri_buf[0].i_addr; - ASSERT(item->ri_buf[0].i_len == sizeof(struct xfs_rud_log_format)); + if (item->ri_buf[0].i_len != sizeof(struct xfs_rud_log_format)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + rud_formatp, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } xlog_recover_release_intent(log, XFS_LI_RUI, rud_formatp->rud_rui_id); return 0; diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index b8c79ee791af..292d5e54a92c 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -806,6 +806,9 @@ xfs_growfs_rt_alloc( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_ADD_NOSPLIT_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto out_trans_cancel; @@ -1284,6 +1287,44 @@ xfs_rtmount_init( return 0; } +static int +xfs_rtalloc_count_frextent( + struct xfs_mount *mp, + struct xfs_trans *tp, + const struct xfs_rtalloc_rec *rec, + void *priv) +{ + uint64_t *valp = priv; + + *valp += rec->ar_extcount; + return 0; +} + +/* + * Reinitialize the number of free realtime extents from the realtime bitmap. + * Callers must ensure that there is no other activity in the filesystem. + */ +int +xfs_rtalloc_reinit_frextents( + struct xfs_mount *mp) +{ + uint64_t val = 0; + int error; + + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent, + &val); + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL); + if (error) + return error; + + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_frextents = val; + spin_unlock(&mp->m_sb_lock); + percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents); + return 0; +} + /* * Get the bitmap and summary inodes and the summary cache into the mount * structure at mount time. diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 91b00289509b..62c7ad79cbb6 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -22,6 +22,7 @@ struct xfs_rtalloc_rec { }; typedef int (*xfs_rtalloc_query_range_fn)( + struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv); @@ -123,27 +124,29 @@ int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log, int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t start, xfs_extlen_t len, struct xfs_buf **rbpp, xfs_fsblock_t *rsb); -int xfs_rtalloc_query_range(struct xfs_trans *tp, +int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_rtalloc_rec *low_rec, const struct xfs_rtalloc_rec *high_rec, xfs_rtalloc_query_range_fn fn, void *priv); -int xfs_rtalloc_query_all(struct xfs_trans *tp, +int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtalloc_query_range_fn fn, void *priv); bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t start, xfs_extlen_t len, bool *is_free); +int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp); #else # define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS) # define xfs_rtfree_extent(t,b,l) (ENOSYS) # define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) # define xfs_growfs_rt(mp,in) (ENOSYS) # define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS) -# define xfs_rtalloc_query_all(t,f,p) (ENOSYS) +# define xfs_rtalloc_query_all(m,t,f,p) (ENOSYS) # define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS) # define xfs_verify_rtbno(m, r) (false) # define xfs_rtalloc_extent_is_free(m,t,s,l,i) (ENOSYS) +# define xfs_rtalloc_reinit_frextents(m) (0) static inline int /* error */ xfs_rtmount_init( xfs_mount_t *mp) /* file system mount structure */ diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 20e0534a772c..90a77cd3ebad 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -74,7 +74,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) defer_relog += per_cpu_ptr(stats, i)->s.defer_relog; } - len += scnprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", + len += scnprintf(buf + len, PATH_MAX-len, "xpc %llu %llu %llu\n", xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); len += scnprintf(buf + len, PATH_MAX-len, "defer_relog %llu\n", defer_relog); @@ -125,7 +125,7 @@ static int xqmstat_proc_show(struct seq_file *m, void *v) { int j; - seq_printf(m, "qm"); + seq_puts(m, "qm"); for (j = XFSSTAT_START_XQMSTAT; j < XFSSTAT_END_XQMSTAT; j++) seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j)); seq_putc(m, '\n'); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 778b57b1f020..ee4b429a2f2c 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -38,6 +38,9 @@ #include "xfs_pwork.h" #include "xfs_ag.h" #include "xfs_defer.h" +#include "xfs_attr_item.h" +#include "xfs_xattr.h" +#include "xfs_iunlink_item.h" #include <linux/magic.h> #include <linux/fs_context.h> @@ -331,13 +334,36 @@ xfs_set_inode_alloc( return xfs_is_inode32(mp) ? maxagi : agcount; } -static bool -xfs_buftarg_is_dax( - struct super_block *sb, - struct xfs_buftarg *bt) +static int +xfs_setup_dax_always( + struct xfs_mount *mp) { - return dax_supported(bt->bt_daxdev, bt->bt_bdev, sb->s_blocksize, 0, - bdev_nr_sectors(bt->bt_bdev)); + if (!mp->m_ddev_targp->bt_daxdev && + (!mp->m_rtdev_targp || !mp->m_rtdev_targp->bt_daxdev)) { + xfs_alert(mp, + "DAX unsupported by block device. Turning off DAX."); + goto disable_dax; + } + + if (mp->m_super->s_blocksize != PAGE_SIZE) { + xfs_alert(mp, + "DAX not supported for blocksize. Turning off DAX."); + goto disable_dax; + } + + if (xfs_has_reflink(mp) && + bdev_is_partition(mp->m_ddev_targp->bt_bdev)) { + xfs_alert(mp, + "DAX and reflink cannot work with multi-partitions!"); + return -EINVAL; + } + + xfs_warn(mp, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); + return 0; + +disable_dax: + xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER); + return 0; } STATIC int @@ -370,26 +396,19 @@ STATIC void xfs_close_devices( struct xfs_mount *mp) { - struct dax_device *dax_ddev = mp->m_ddev_targp->bt_daxdev; - if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { struct block_device *logdev = mp->m_logdev_targp->bt_bdev; - struct dax_device *dax_logdev = mp->m_logdev_targp->bt_daxdev; xfs_free_buftarg(mp->m_logdev_targp); xfs_blkdev_put(logdev); - fs_put_dax(dax_logdev); } if (mp->m_rtdev_targp) { struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; - struct dax_device *dax_rtdev = mp->m_rtdev_targp->bt_daxdev; xfs_free_buftarg(mp->m_rtdev_targp); xfs_blkdev_put(rtdev); - fs_put_dax(dax_rtdev); } xfs_free_buftarg(mp->m_ddev_targp); - fs_put_dax(dax_ddev); } /* @@ -407,8 +426,6 @@ xfs_open_devices( struct xfs_mount *mp) { struct block_device *ddev = mp->m_super->s_bdev; - struct dax_device *dax_ddev = fs_dax_get_by_bdev(ddev); - struct dax_device *dax_logdev = NULL, *dax_rtdev = NULL; struct block_device *logdev = NULL, *rtdev = NULL; int error; @@ -418,8 +435,7 @@ xfs_open_devices( if (mp->m_logname) { error = xfs_blkdev_get(mp, mp->m_logname, &logdev); if (error) - goto out; - dax_logdev = fs_dax_get_by_bdev(logdev); + return error; } if (mp->m_rtname) { @@ -433,25 +449,24 @@ xfs_open_devices( error = -EINVAL; goto out_close_rtdev; } - dax_rtdev = fs_dax_get_by_bdev(rtdev); } /* * Setup xfs_mount buffer target pointers */ error = -ENOMEM; - mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, dax_ddev); + mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev); if (!mp->m_ddev_targp) goto out_close_rtdev; if (rtdev) { - mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, dax_rtdev); + mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev); if (!mp->m_rtdev_targp) goto out_free_ddev_targ; } if (logdev && logdev != ddev) { - mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, dax_logdev); + mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev); if (!mp->m_logdev_targp) goto out_free_rtdev_targ; } else { @@ -467,14 +482,9 @@ xfs_open_devices( xfs_free_buftarg(mp->m_ddev_targp); out_close_rtdev: xfs_blkdev_put(rtdev); - fs_put_dax(dax_rtdev); out_close_logdev: - if (logdev && logdev != ddev) { + if (logdev && logdev != ddev) xfs_blkdev_put(logdev); - fs_put_dax(dax_logdev); - } - out: - fs_put_dax(dax_ddev); return error; } @@ -643,7 +653,7 @@ xfs_fs_destroy_inode( static void xfs_fs_dirty_inode( struct inode *inode, - int flag) + int flags) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -651,7 +661,13 @@ xfs_fs_dirty_inode( if (!(inode->i_sb->s_flags & SB_LAZYTIME)) return; - if (flag != I_DIRTY_SYNC || !(inode->i_state & I_DIRTY_TIME)) + + /* + * Only do the timestamp update if the inode is dirty (I_DIRTY_SYNC) + * and has dirty timestamp (I_DIRTY_TIME). I_DIRTY_TIME can be passed + * in flags possibly together with I_DIRTY_SYNC. + */ + if ((flags & ~I_DIRTY_TIME) != I_DIRTY_SYNC || !(flags & I_DIRTY_TIME)) return; if (xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp)) @@ -730,6 +746,7 @@ xfs_fs_sync_fs( int wait) { struct xfs_mount *mp = XFS_M(sb); + int error; trace_xfs_fs_sync_fs(mp, __return_address); @@ -739,7 +756,10 @@ xfs_fs_sync_fs( if (!wait) return 0; - xfs_log_force(mp, XFS_LOG_SYNC); + error = xfs_log_force(mp, XFS_LOG_SYNC); + if (error) + return error; + if (laptop_mode) { /* * The disk must be active because we're syncing. @@ -786,8 +806,11 @@ xfs_fs_statfs( xfs_extlen_t lsize; int64_t ffree; - /* Wait for whatever inactivations are in progress. */ - xfs_inodegc_flush(mp); + /* + * Expedite background inodegc but don't wait. We do not want to block + * here waiting hours for a billion extent file to be truncated. + */ + xfs_inodegc_push(mp); statp->f_type = XFS_SUPER_MAGIC; statp->f_namelen = MAXNAMELEN - 1; @@ -806,7 +829,8 @@ xfs_fs_statfs( spin_unlock(&mp->m_sb_lock); /* make sure statp->f_bfree does not underflow */ - statp->f_bfree = max_t(int64_t, fdblocks - mp->m_alloc_set_aside, 0); + statp->f_bfree = max_t(int64_t, 0, + fdblocks - xfs_fdblocks_unavailable(mp)); statp->f_bavail = statp->f_bfree; fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree); @@ -833,9 +857,11 @@ xfs_fs_statfs( if (XFS_IS_REALTIME_MOUNT(mp) && (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) { + s64 freertx; + statp->f_blocks = sbp->sb_rblocks; - statp->f_bavail = statp->f_bfree = - sbp->sb_frextents * sbp->sb_rextsize; + freertx = percpu_counter_sum_positive(&mp->m_frextents); + statp->f_bavail = statp->f_bfree = freertx * sbp->sb_rextsize; } return 0; @@ -1005,8 +1031,14 @@ xfs_init_percpu_counters( if (error) goto free_fdblocks; + error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL); + if (error) + goto free_delalloc; + return 0; +free_delalloc: + percpu_counter_destroy(&mp->m_delalloc_blks); free_fdblocks: percpu_counter_destroy(&mp->m_fdblocks); free_ifree: @@ -1023,6 +1055,7 @@ xfs_reinit_percpu_counters( percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount); percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree); percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks); + percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents); } static void @@ -1035,6 +1068,7 @@ xfs_destroy_percpu_counters( ASSERT(xfs_is_shutdown(mp) || percpu_counter_sum(&mp->m_delalloc_blks) == 0); percpu_counter_destroy(&mp->m_delalloc_blks); + percpu_counter_destroy(&mp->m_frextents); } static int @@ -1052,7 +1086,7 @@ xfs_inodegc_init_percpu( gc = per_cpu_ptr(mp->m_inodegc, cpu); init_llist_head(&gc->list); gc->items = 0; - INIT_WORK(&gc->work, xfs_inodegc_worker); + INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker); } return 0; } @@ -1593,36 +1627,15 @@ xfs_fs_fill_super( sb->s_flags |= SB_I_VERSION; if (xfs_has_dax_always(mp)) { - bool rtdev_is_dax = false, datadev_is_dax; - - xfs_warn(mp, - "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - - datadev_is_dax = xfs_buftarg_is_dax(sb, mp->m_ddev_targp); - if (mp->m_rtdev_targp) - rtdev_is_dax = xfs_buftarg_is_dax(sb, - mp->m_rtdev_targp); - if (!rtdev_is_dax && !datadev_is_dax) { - xfs_alert(mp, - "DAX unsupported by block device. Turning off DAX."); - xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER); - } - if (xfs_has_reflink(mp)) { - xfs_alert(mp, - "DAX and reflink cannot be used together!"); - error = -EINVAL; + error = xfs_setup_dax_always(mp); + if (error) goto out_filestream_unmount; - } } - if (xfs_has_discard(mp)) { - struct request_queue *q = bdev_get_queue(sb->s_bdev); - - if (!blk_queue_discard(q)) { - xfs_warn(mp, "mounting with \"discard\" option, but " - "the device does not support discard"); - mp->m_features &= ~XFS_FEAT_DISCARD; - } + if (xfs_has_discard(mp) && !bdev_max_discard_sectors(sb->s_bdev)) { + xfs_warn(mp, + "mounting with \"discard\" option, but the device does not support discard"); + mp->m_features &= ~XFS_FEAT_DISCARD; } if (xfs_has_reflink(mp)) { @@ -1646,6 +1659,10 @@ xfs_fs_fill_super( goto out_filestream_unmount; } + if (xfs_has_large_extent_counts(mp)) + xfs_warn(mp, + "EXPERIMENTAL Large extent counts feature in use. Use at your own risk!"); + error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; @@ -1739,15 +1756,6 @@ xfs_remount_rw( */ xfs_restore_resvblks(mp); xfs_log_work_queue(mp); - - /* Recover any CoW blocks that never got remapped. */ - error = xfs_reflink_recover_cow(mp); - if (error) { - xfs_err(mp, - "Error %d recovering leftover CoW allocations.", error); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - return error; - } xfs_blockgc_start(mp); /* Create the per-AG metadata reservation pool .*/ @@ -1770,6 +1778,11 @@ xfs_remount_ro( }; int error; + /* Flush all the dirty data to disk. */ + error = sync_filesystem(mp->m_super); + if (error) + return error; + /* * Cancel background eofb scanning so it cannot race with the final * log force+buftarg wait and deadlock the remount. @@ -1848,8 +1861,6 @@ xfs_fs_reconfigure( if (error) return error; - sync_filesystem(mp->m_super); - /* inode32 -> inode64 */ if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) { mp->m_features &= ~XFS_FEAT_SMALL_INUMS; @@ -1964,11 +1975,19 @@ xfs_init_caches(void) { int error; + xfs_buf_cache = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0, + SLAB_HWCACHE_ALIGN | + SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD, + NULL); + if (!xfs_buf_cache) + goto out; + xfs_log_ticket_cache = kmem_cache_create("xfs_log_ticket", sizeof(struct xlog_ticket), 0, 0, NULL); if (!xfs_log_ticket_cache) - goto out; + goto out_destroy_buf_cache; error = xfs_btree_init_cur_caches(); if (error) @@ -2009,18 +2028,14 @@ xfs_init_caches(void) goto out_destroy_trans_cache; xfs_efd_cache = kmem_cache_create("xfs_efd_item", - (sizeof(struct xfs_efd_log_item) + - (XFS_EFD_MAX_FAST_EXTENTS - 1) * - sizeof(struct xfs_extent)), - 0, 0, NULL); + xfs_efd_log_item_sizeof(XFS_EFD_MAX_FAST_EXTENTS), + 0, 0, NULL); if (!xfs_efd_cache) goto out_destroy_buf_item_cache; xfs_efi_cache = kmem_cache_create("xfs_efi_item", - (sizeof(struct xfs_efi_log_item) + - (XFS_EFI_MAX_FAST_EXTENTS - 1) * - sizeof(struct xfs_extent)), - 0, 0, NULL); + xfs_efi_log_item_sizeof(XFS_EFI_MAX_FAST_EXTENTS), + 0, 0, NULL); if (!xfs_efi_cache) goto out_destroy_efd_cache; @@ -2082,8 +2097,32 @@ xfs_init_caches(void) if (!xfs_bui_cache) goto out_destroy_bud_cache; + xfs_attrd_cache = kmem_cache_create("xfs_attrd_item", + sizeof(struct xfs_attrd_log_item), + 0, 0, NULL); + if (!xfs_attrd_cache) + goto out_destroy_bui_cache; + + xfs_attri_cache = kmem_cache_create("xfs_attri_item", + sizeof(struct xfs_attri_log_item), + 0, 0, NULL); + if (!xfs_attri_cache) + goto out_destroy_attrd_cache; + + xfs_iunlink_cache = kmem_cache_create("xfs_iul_item", + sizeof(struct xfs_iunlink_item), + 0, 0, NULL); + if (!xfs_iunlink_cache) + goto out_destroy_attri_cache; + return 0; + out_destroy_attri_cache: + kmem_cache_destroy(xfs_attri_cache); + out_destroy_attrd_cache: + kmem_cache_destroy(xfs_attrd_cache); + out_destroy_bui_cache: + kmem_cache_destroy(xfs_bui_cache); out_destroy_bud_cache: kmem_cache_destroy(xfs_bud_cache); out_destroy_cui_cache: @@ -2118,6 +2157,8 @@ xfs_init_caches(void) xfs_btree_destroy_cur_caches(); out_destroy_log_ticket_cache: kmem_cache_destroy(xfs_log_ticket_cache); + out_destroy_buf_cache: + kmem_cache_destroy(xfs_buf_cache); out: return -ENOMEM; } @@ -2130,6 +2171,9 @@ xfs_destroy_caches(void) * destroy caches. */ rcu_barrier(); + kmem_cache_destroy(xfs_iunlink_cache); + kmem_cache_destroy(xfs_attri_cache); + kmem_cache_destroy(xfs_attrd_cache); kmem_cache_destroy(xfs_bui_cache); kmem_cache_destroy(xfs_bud_cache); kmem_cache_destroy(xfs_cui_cache); @@ -2148,6 +2192,7 @@ xfs_destroy_caches(void) xfs_defer_destroy_item_caches(); xfs_btree_destroy_cur_caches(); kmem_cache_destroy(xfs_log_ticket_cache); + kmem_cache_destroy(xfs_buf_cache); } STATIC int __init @@ -2193,6 +2238,7 @@ xfs_cpu_dead( list_for_each_entry_safe(mp, n, &xfs_mount_list, m_mount_list) { spin_unlock(&xfs_mount_list_lock); xfs_inodegc_cpu_dead(mp, cpu); + xlog_cil_pcp_dead(mp->m_log, cpu); spin_lock(&xfs_mount_list_lock); } spin_unlock(&xfs_mount_list_lock); @@ -2252,13 +2298,9 @@ init_xfs_fs(void) if (error) goto out_destroy_wq; - error = xfs_buf_init(); - if (error) - goto out_mru_cache_uninit; - error = xfs_init_procfs(); if (error) - goto out_buf_terminate; + goto out_mru_cache_uninit; error = xfs_sysctl_register(); if (error) @@ -2315,8 +2357,6 @@ init_xfs_fs(void) xfs_sysctl_unregister(); out_cleanup_procfs: xfs_cleanup_procfs(); - out_buf_terminate: - xfs_buf_terminate(); out_mru_cache_uninit: xfs_mru_cache_uninit(); out_destroy_wq: @@ -2342,7 +2382,6 @@ exit_xfs_fs(void) kset_unregister(xfs_kset); xfs_sysctl_unregister(); xfs_cleanup_procfs(); - xfs_buf_terminate(); xfs_mru_cache_uninit(); xfs_destroy_workqueues(); xfs_destroy_caches(); diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 167d23f92ffe..364e2c2648a8 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -91,8 +91,8 @@ extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, xfs_agnumber_t agcount); extern const struct export_operations xfs_export_operations; -extern const struct xattr_handler *xfs_xattr_handlers[]; extern const struct quotactl_ops xfs_quotactl_operations; +extern const struct dax_holder_operations xfs_dax_holder_operations; extern void xfs_reinit_percpu_counters(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index fc2c6a404647..8389f3ef88ef 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -22,6 +22,7 @@ #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_ialloc.h" +#include "xfs_error.h" /* ----- Kernel only functions below ----- */ int @@ -96,17 +97,15 @@ xfs_readlink_bmap_ilocked( int xfs_readlink( - struct xfs_inode *ip, - char *link) + struct xfs_inode *ip, + char *link) { - struct xfs_mount *mp = ip->i_mount; - xfs_fsize_t pathlen; - int error = 0; + struct xfs_mount *mp = ip->i_mount; + xfs_fsize_t pathlen; + int error = -EFSCORRUPTED; trace_xfs_readlink(ip); - ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_LOCAL); - if (xfs_is_shutdown(mp)) return -EIO; @@ -121,12 +120,22 @@ xfs_readlink( __func__, (unsigned long long) ip->i_ino, (long long) pathlen); ASSERT(0); - error = -EFSCORRUPTED; goto out; } - - error = xfs_readlink_bmap_ilocked(ip, link); + if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { + /* + * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED + * if if_data is junk. + */ + if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_u1.if_data)) + goto out; + + memcpy(link, ip->i_df.if_u1.if_data, pathlen + 1); + error = 0; + } else { + error = xfs_readlink_bmap_ilocked(ip, link); + } out: xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -184,8 +193,8 @@ xfs_symlink( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns), - mapped_fsgid(mnt_userns), prid, + error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns), + mapped_fsgid(mnt_userns, &init_user_ns), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -217,11 +226,6 @@ xfs_symlink( goto out_trans_cancel; } - error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK, - XFS_IEXT_DIR_MANIP_CNT(mp)); - if (error) - goto out_trans_cancel; - /* * Allocate an inode for the symlink. */ @@ -252,7 +256,7 @@ xfs_symlink( /* * If the symlink will fit into the inode, write it inline. */ - if (pathlen <= XFS_IFORK_DSIZE(ip)) { + if (pathlen <= xfs_inode_data_fork_size(ip)) { xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen); ip->i_disk_size = pathlen; diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index 7692e76ead33..f78ad6b10ea5 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -83,6 +83,7 @@ extern xfs_param_t xfs_params; struct xfs_globals { #ifdef DEBUG int pwork_threads; /* parallel workqueue threads */ + bool larp; /* log attribute replay */ #endif int log_recovery_delay; /* log recovery delay (secs) */ int mount_delay; /* mount setup delay (secs) */ diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 8608f804388f..f7faf6e70d7f 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -67,11 +67,12 @@ static const struct sysfs_ops xfs_sysfs_ops = { static struct attribute *xfs_mp_attrs[] = { NULL, }; +ATTRIBUTE_GROUPS(xfs_mp); struct kobj_type xfs_mp_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, - .default_attrs = xfs_mp_attrs, + .default_groups = xfs_mp_groups, }; #ifdef DEBUG @@ -227,6 +228,29 @@ pwork_threads_show( return sysfs_emit(buf, "%d\n", xfs_globals.pwork_threads); } XFS_SYSFS_ATTR_RW(pwork_threads); + +static ssize_t +larp_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + ssize_t ret; + + ret = kstrtobool(buf, &xfs_globals.larp); + if (ret < 0) + return ret; + return count; +} + +STATIC ssize_t +larp_show( + struct kobject *kobject, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.larp); +} +XFS_SYSFS_ATTR_RW(larp); #endif /* DEBUG */ static struct attribute *xfs_dbg_attrs[] = { @@ -236,14 +260,16 @@ static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(always_cow), #ifdef DEBUG ATTR_LIST(pwork_threads), + ATTR_LIST(larp), #endif NULL, }; +ATTRIBUTE_GROUPS(xfs_dbg); struct kobj_type xfs_dbg_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, - .default_attrs = xfs_dbg_attrs, + .default_groups = xfs_dbg_groups, }; #endif /* DEBUG */ @@ -296,11 +322,12 @@ static struct attribute *xfs_stats_attrs[] = { ATTR_LIST(stats_clear), NULL, }; +ATTRIBUTE_GROUPS(xfs_stats); struct kobj_type xfs_stats_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, - .default_attrs = xfs_stats_attrs, + .default_groups = xfs_stats_groups, }; /* xlog */ @@ -381,11 +408,12 @@ static struct attribute *xfs_log_attrs[] = { ATTR_LIST(write_grant_head), NULL, }; +ATTRIBUTE_GROUPS(xfs_log); struct kobj_type xfs_log_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, - .default_attrs = xfs_log_attrs, + .default_groups = xfs_log_groups, }; /* @@ -534,12 +562,12 @@ static struct attribute *xfs_error_attrs[] = { ATTR_LIST(retry_timeout_seconds), NULL, }; - +ATTRIBUTE_GROUPS(xfs_error); static struct kobj_type xfs_error_cfg_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, - .default_attrs = xfs_error_attrs, + .default_groups = xfs_error_groups, }; static struct kobj_type xfs_error_ktype = { diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h index 43585850f154..513095e353a5 100644 --- a/fs/xfs/xfs_sysfs.h +++ b/fs/xfs/xfs_sysfs.h @@ -33,10 +33,15 @@ xfs_sysfs_init( const char *name) { struct kobject *parent; + int err; parent = parent_kobj ? &parent_kobj->kobject : NULL; init_completion(&kobj->complete); - return kobject_init_and_add(&kobj->kobject, ktype, parent, "%s", name); + err = kobject_init_and_add(&kobj->kobject, ktype, parent, "%s", name); + if (err) + kobject_put(&kobj->kobject); + + return err; } static inline void diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 4a8076ef8cb4..372d871bccc5 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -240,6 +240,7 @@ DEFINE_EVENT(xfs_fs_class, name, \ TP_PROTO(struct xfs_mount *mp, void *caller_ip), \ TP_ARGS(mp, caller_ip)) DEFINE_FS_EVENT(xfs_inodegc_flush); +DEFINE_FS_EVENT(xfs_inodegc_push); DEFINE_FS_EVENT(xfs_inodegc_start); DEFINE_FS_EVENT(xfs_inodegc_stop); DEFINE_FS_EVENT(xfs_inodegc_queue); @@ -418,6 +419,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class, __field(unsigned, lockval) __field(unsigned, flags) __field(unsigned long, caller_ip) + __field(const void *, buf_ops) ), TP_fast_assign( __entry->dev = bp->b_target->bt_dev; @@ -428,9 +430,10 @@ DECLARE_EVENT_CLASS(xfs_buf_class, __entry->lockval = bp->b_sema.count; __entry->flags = bp->b_flags; __entry->caller_ip = caller_ip; + __entry->buf_ops = bp->b_ops; ), TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d " - "lock %d flags %s caller %pS", + "lock %d flags %s bufops %pS caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, __entry->nblks, @@ -438,6 +441,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class, __entry->pincount, __entry->lockval, __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), + __entry->buf_ops, (void *)__entry->caller_ip) ) @@ -795,6 +799,9 @@ TRACE_DEFINE_ENUM(PE_SIZE_PTE); TRACE_DEFINE_ENUM(PE_SIZE_PMD); TRACE_DEFINE_ENUM(PE_SIZE_PUD); +TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED); +TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW); + TRACE_EVENT(xfs_filemap_fault, TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size, bool write_fault), @@ -933,7 +940,7 @@ DEFINE_IREF_EVENT(xfs_inode_unpin); DEFINE_IREF_EVENT(xfs_inode_unpin_nowait); DECLARE_EVENT_CLASS(xfs_namespace_class, - TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), + TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name), TP_ARGS(dp, name), TP_STRUCT__entry( __field(dev_t, dev) @@ -956,7 +963,7 @@ DECLARE_EVENT_CLASS(xfs_namespace_class, #define DEFINE_NAMESPACE_EVENT(name) \ DEFINE_EVENT(xfs_namespace_class, name, \ - TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), \ + TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name), \ TP_ARGS(dp, name)) DEFINE_NAMESPACE_EVENT(xfs_remove); DEFINE_NAMESPACE_EVENT(xfs_link); @@ -1096,22 +1103,6 @@ DEFINE_DQUOT_EVENT(xfs_dqflush_done); DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_before); DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_after); -#define XFS_QMOPT_FLAGS \ - { XFS_QMOPT_UQUOTA, "UQUOTA" }, \ - { XFS_QMOPT_PQUOTA, "PQUOTA" }, \ - { XFS_QMOPT_FORCE_RES, "FORCE_RES" }, \ - { XFS_QMOPT_SBVERSION, "SBVERSION" }, \ - { XFS_QMOPT_GQUOTA, "GQUOTA" }, \ - { XFS_QMOPT_INHERIT, "INHERIT" }, \ - { XFS_QMOPT_RES_REGBLKS, "RES_REGBLKS" }, \ - { XFS_QMOPT_RES_RTBLKS, "RES_RTBLKS" }, \ - { XFS_QMOPT_BCOUNT, "BCOUNT" }, \ - { XFS_QMOPT_ICOUNT, "ICOUNT" }, \ - { XFS_QMOPT_RTBCOUNT, "RTBCOUNT" }, \ - { XFS_QMOPT_DELBCOUNT, "DELBCOUNT" }, \ - { XFS_QMOPT_DELRTBCOUNT, "DELRTBCOUNT" }, \ - { XFS_QMOPT_RES_INOS, "RES_INOS" } - TRACE_EVENT(xfs_trans_mod_dquot, TP_PROTO(struct xfs_trans *tp, struct xfs_dquot *dqp, unsigned int field, int64_t delta), @@ -1182,7 +1173,7 @@ DECLARE_EVENT_CLASS(xfs_dqtrx_class, __entry->ino_res_used = qtrx->qt_ino_res_used; __entry->icount_delta = qtrx->qt_icount_delta; ), - TP_printk("dev %d:%d dquot id 0x%x type %s flags %s" + TP_printk("dev %d:%d dquot id 0x%x type %s flags %s " "blk_res %llu bcount_delta %lld delbcnt_delta %lld " "rtblk_res %llu rtblk_res_used %llu rtbcount_delta %lld delrtb_delta %lld " "ino_res %llu ino_res_used %llu icount_delta %lld", @@ -1308,7 +1299,7 @@ DECLARE_EVENT_CLASS(xfs_log_item_class, __field(xfs_lsn_t, lsn) ), TP_fast_assign( - __entry->dev = lip->li_mountp->m_super->s_dev; + __entry->dev = lip->li_log->l_mp->m_super->s_dev; __entry->lip = lip; __entry->type = lip->li_type; __entry->flags = lip->li_flags; @@ -1348,6 +1339,9 @@ DEFINE_LOG_ITEM_EVENT(xfs_ail_push); DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned); DEFINE_LOG_ITEM_EVENT(xfs_ail_locked); DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing); +DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_mark); +DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_skip); +DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_unpin); DECLARE_EVENT_CLASS(xfs_ail_class, TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn), @@ -1361,7 +1355,7 @@ DECLARE_EVENT_CLASS(xfs_ail_class, __field(xfs_lsn_t, new_lsn) ), TP_fast_assign( - __entry->dev = lip->li_mountp->m_super->s_dev; + __entry->dev = lip->li_log->l_mp->m_super->s_dev; __entry->lip = lip; __entry->type = lip->li_type; __entry->flags = lip->li_flags; @@ -1611,7 +1605,7 @@ TRACE_EVENT(xfs_bunmap, __entry->caller_ip = caller_ip; __entry->flags = flags; ), - TP_printk("dev %d:%d ino 0x%llx disize 0x%llx fileoff 0x%llx fsbcount 0x%llx" + TP_printk("dev %d:%d ino 0x%llx disize 0x%llx fileoff 0x%llx fsbcount 0x%llx " "flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, @@ -1924,7 +1918,7 @@ DECLARE_EVENT_CLASS(xfs_da_class, __field(int, namelen) __field(xfs_dahash_t, hashval) __field(xfs_ino_t, inumber) - __field(int, op_flags) + __field(uint32_t, op_flags) ), TP_fast_assign( __entry->dev = VFS_I(args->dp)->i_sb->s_dev; @@ -1990,7 +1984,7 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __field(xfs_dahash_t, hashval) __field(unsigned int, attr_filter) __field(unsigned int, attr_flags) - __field(int, op_flags) + __field(uint32_t, op_flags) ), TP_fast_assign( __entry->dev = VFS_I(args->dp)->i_sb->s_dev; @@ -2097,7 +2091,7 @@ DECLARE_EVENT_CLASS(xfs_dir2_space_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) - __field(int, op_flags) + __field(uint32_t, op_flags) __field(int, idx) ), TP_fast_assign( @@ -2128,7 +2122,7 @@ TRACE_EVENT(xfs_dir2_leafn_moveents, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) - __field(int, op_flags) + __field(uint32_t, op_flags) __field(int, src_idx) __field(int, dst_idx) __field(int, count) @@ -2169,7 +2163,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, __field(int, which) __field(xfs_ino_t, ino) __field(int, format) - __field(int, nex) + __field(xfs_extnum_t, nex) __field(int, broot_size) __field(int, fork_off) ), @@ -2180,9 +2174,9 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, __entry->format = ip->i_df.if_format; __entry->nex = ip->i_df.if_nextents; __entry->broot_size = ip->i_df.if_broot_bytes; - __entry->fork_off = XFS_IFORK_BOFF(ip); + __entry->fork_off = xfs_inode_fork_boff(ip); ), - TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, " + TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %llu, " "broot size %d, forkoff 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, @@ -2934,6 +2928,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, domain) __field(xfs_agblock_t, startblock) __field(xfs_extlen_t, blockcount) __field(xfs_nlink_t, refcount) @@ -2941,13 +2936,15 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class, TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; + __entry->domain = irec->rc_domain; __entry->startblock = irec->rc_startblock; __entry->blockcount = irec->rc_blockcount; __entry->refcount = irec->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), __entry->startblock, __entry->blockcount, __entry->refcount) @@ -2967,6 +2964,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, domain) __field(xfs_agblock_t, startblock) __field(xfs_extlen_t, blockcount) __field(xfs_nlink_t, refcount) @@ -2975,14 +2973,16 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class, TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; + __entry->domain = irec->rc_domain; __entry->startblock = irec->rc_startblock; __entry->blockcount = irec->rc_blockcount; __entry->refcount = irec->rc_refcount; __entry->agbno = agbno; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), __entry->startblock, __entry->blockcount, __entry->refcount, @@ -3003,9 +3003,11 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, i1_domain) __field(xfs_agblock_t, i1_startblock) __field(xfs_extlen_t, i1_blockcount) __field(xfs_nlink_t, i1_refcount) + __field(enum xfs_refc_domain, i2_domain) __field(xfs_agblock_t, i2_startblock) __field(xfs_extlen_t, i2_blockcount) __field(xfs_nlink_t, i2_refcount) @@ -3013,20 +3015,24 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; + __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; __entry->i1_blockcount = i1->rc_blockcount; __entry->i1_refcount = i1->rc_refcount; + __entry->i2_domain = i2->rc_domain; __entry->i2_startblock = i2->rc_startblock; __entry->i2_blockcount = i2->rc_blockcount; __entry->i2_refcount = i2->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u -- " - "agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " + "dom %s agbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i1_startblock, __entry->i1_blockcount, __entry->i1_refcount, + __print_symbolic(__entry->i2_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i2_startblock, __entry->i2_blockcount, __entry->i2_refcount) @@ -3047,9 +3053,11 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, i1_domain) __field(xfs_agblock_t, i1_startblock) __field(xfs_extlen_t, i1_blockcount) __field(xfs_nlink_t, i1_refcount) + __field(enum xfs_refc_domain, i2_domain) __field(xfs_agblock_t, i2_startblock) __field(xfs_extlen_t, i2_blockcount) __field(xfs_nlink_t, i2_refcount) @@ -3058,21 +3066,25 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; + __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; __entry->i1_blockcount = i1->rc_blockcount; __entry->i1_refcount = i1->rc_refcount; + __entry->i2_domain = i2->rc_domain; __entry->i2_startblock = i2->rc_startblock; __entry->i2_blockcount = i2->rc_blockcount; __entry->i2_refcount = i2->rc_refcount; __entry->agbno = agbno; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u -- " - "agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " + "dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i1_startblock, __entry->i1_blockcount, __entry->i1_refcount, + __print_symbolic(__entry->i2_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i2_startblock, __entry->i2_blockcount, __entry->i2_refcount, @@ -3095,12 +3107,15 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, i1_domain) __field(xfs_agblock_t, i1_startblock) __field(xfs_extlen_t, i1_blockcount) __field(xfs_nlink_t, i1_refcount) + __field(enum xfs_refc_domain, i2_domain) __field(xfs_agblock_t, i2_startblock) __field(xfs_extlen_t, i2_blockcount) __field(xfs_nlink_t, i2_refcount) + __field(enum xfs_refc_domain, i3_domain) __field(xfs_agblock_t, i3_startblock) __field(xfs_extlen_t, i3_blockcount) __field(xfs_nlink_t, i3_refcount) @@ -3108,27 +3123,33 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; + __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; __entry->i1_blockcount = i1->rc_blockcount; __entry->i1_refcount = i1->rc_refcount; + __entry->i2_domain = i2->rc_domain; __entry->i2_startblock = i2->rc_startblock; __entry->i2_blockcount = i2->rc_blockcount; __entry->i2_refcount = i2->rc_refcount; + __entry->i3_domain = i3->rc_domain; __entry->i3_startblock = i3->rc_startblock; __entry->i3_blockcount = i3->rc_blockcount; __entry->i3_refcount = i3->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u -- " - "agbno 0x%x fsbcount 0x%x refcount %u -- " - "agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " + "dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " + "dom %s agbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i1_startblock, __entry->i1_blockcount, __entry->i1_refcount, + __print_symbolic(__entry->i2_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i2_startblock, __entry->i2_blockcount, __entry->i2_refcount, + __print_symbolic(__entry->i3_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i3_startblock, __entry->i3_blockcount, __entry->i3_refcount) @@ -3418,7 +3439,8 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow); -DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to); DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error); @@ -3513,7 +3535,7 @@ DEFINE_GETFSMAP_EVENT(xfs_getfsmap_low_key); DEFINE_GETFSMAP_EVENT(xfs_getfsmap_high_key); DEFINE_GETFSMAP_EVENT(xfs_getfsmap_mapping); -TRACE_EVENT(xfs_trans_resv_calc, +DECLARE_EVENT_CLASS(xfs_trans_resv_class, TP_PROTO(struct xfs_mount *mp, unsigned int type, struct xfs_trans_res *res), TP_ARGS(mp, type, res), @@ -3537,6 +3559,33 @@ TRACE_EVENT(xfs_trans_resv_calc, __entry->logres, __entry->logcount, __entry->logflags) +) + +#define DEFINE_TRANS_RESV_EVENT(name) \ +DEFINE_EVENT(xfs_trans_resv_class, name, \ + TP_PROTO(struct xfs_mount *mp, unsigned int type, \ + struct xfs_trans_res *res), \ + TP_ARGS(mp, type, res)) +DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc); +DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc_minlogsize); + +TRACE_EVENT(xfs_log_get_max_trans_res, + TP_PROTO(struct xfs_mount *mp, const struct xfs_trans_res *res), + TP_ARGS(mp, res), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(uint, logres) + __field(int, logcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->logres = res->tr_logres; + __entry->logcount = res->tr_logcount; + ), + TP_printk("dev %d:%d logres %u logcount %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->logres, + __entry->logcount) ); DECLARE_EVENT_CLASS(xfs_trans_class, @@ -3653,7 +3702,6 @@ DEFINE_EVENT(xfs_ag_inode_class, name, \ TP_ARGS(ip)) DEFINE_AGINODE_EVENT(xfs_iunlink); DEFINE_AGINODE_EVENT(xfs_iunlink_remove); -DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback); DECLARE_EVENT_CLASS(xfs_fs_corrupt_class, TP_PROTO(struct xfs_mount *mp, unsigned int flags), @@ -4111,6 +4159,27 @@ DEFINE_ICLOG_EVENT(xlog_iclog_want_sync); DEFINE_ICLOG_EVENT(xlog_iclog_wait_on); DEFINE_ICLOG_EVENT(xlog_iclog_write); +TRACE_DEFINE_ENUM(XFS_DAS_UNINIT); +TRACE_DEFINE_ENUM(XFS_DAS_SF_ADD); +TRACE_DEFINE_ENUM(XFS_DAS_SF_REMOVE); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ADD); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_ADD); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_SET_RMT); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ALLOC_RMT); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REPLACE); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_OLD); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_RMT); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_ATTR); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_SET_RMT); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_ALLOC_RMT); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_REPLACE); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_OLD); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_RMT); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_ATTR); +TRACE_DEFINE_ENUM(XFS_DAS_DONE); + DECLARE_EVENT_CLASS(xfs_das_state_class, TP_PROTO(int das, struct xfs_inode *ip), TP_ARGS(das, ip), @@ -4122,8 +4191,9 @@ DECLARE_EVENT_CLASS(xfs_das_state_class, __entry->das = das; __entry->ino = ip->i_ino; ), - TP_printk("state change %d ino 0x%llx", - __entry->das, __entry->ino) + TP_printk("state change %s ino 0x%llx", + __print_symbolic(__entry->das, XFS_DAS_STRINGS), + __entry->ino) ) #define DEFINE_DAS_STATE_EVENT(name) \ @@ -4132,9 +4202,15 @@ DEFINE_EVENT(xfs_das_state_class, name, \ TP_ARGS(das, ip)) DEFINE_DAS_STATE_EVENT(xfs_attr_sf_addname_return); DEFINE_DAS_STATE_EVENT(xfs_attr_set_iter_return); +DEFINE_DAS_STATE_EVENT(xfs_attr_leaf_addname_return); DEFINE_DAS_STATE_EVENT(xfs_attr_node_addname_return); DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return); +DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_alloc); DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return); +DEFINE_DAS_STATE_EVENT(xfs_attr_defer_add); +DEFINE_DAS_STATE_EVENT(xfs_attr_defer_replace); +DEFINE_DAS_STATE_EVENT(xfs_attr_defer_remove); + TRACE_EVENT(xfs_force_shutdown, TP_PROTO(struct xfs_mount *mp, int ptag, int flags, const char *fname, diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 234a9d9c2f43..7bd16fbff534 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -32,7 +32,6 @@ static void xfs_trans_trace_reservations( struct xfs_mount *mp) { - struct xfs_trans_res resv; struct xfs_trans_res *res; struct xfs_trans_res *end_res; int i; @@ -41,8 +40,6 @@ xfs_trans_trace_reservations( end_res = (struct xfs_trans_res *)(M_RES(mp) + 1); for (i = 0; res < end_res; i++, res++) trace_xfs_trans_resv_calc(mp, i, res); - xfs_log_get_max_trans_res(mp, &resv); - trace_xfs_trans_resv_calc(mp, -1, &resv); } #else # define xfs_trans_trace_reservations(mp) @@ -194,11 +191,9 @@ xfs_trans_reserve( ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES); error = xfs_log_regrant(mp, tp->t_ticket); } else { - error = xfs_log_reserve(mp, - resp->tr_logres, + error = xfs_log_reserve(mp, resp->tr_logres, resp->tr_logcount, - &tp->t_ticket, XFS_TRANSACTION, - permanent); + &tp->t_ticket, permanent); } if (error) @@ -498,10 +493,31 @@ xfs_trans_apply_sb_deltas( be64_add_cpu(&sbp->sb_fdblocks, tp->t_res_fdblocks_delta); } - if (tp->t_frextents_delta) - be64_add_cpu(&sbp->sb_frextents, tp->t_frextents_delta); - if (tp->t_res_frextents_delta) - be64_add_cpu(&sbp->sb_frextents, tp->t_res_frextents_delta); + /* + * Updating frextents requires careful handling because it does not + * behave like the lazysb counters because we cannot rely on log + * recovery in older kenels to recompute the value from the rtbitmap. + * This means that the ondisk frextents must be consistent with the + * rtbitmap. + * + * Therefore, log the frextents change to the ondisk superblock and + * update the incore superblock so that future calls to xfs_log_sb + * write the correct value ondisk. + * + * Don't touch m_frextents because it includes incore reservations, + * and those are handled by the unreserve function. + */ + if (tp->t_frextents_delta || tp->t_res_frextents_delta) { + struct xfs_mount *mp = tp->t_mountp; + int64_t rtxdelta; + + rtxdelta = tp->t_frextents_delta + tp->t_res_frextents_delta; + + spin_lock(&mp->m_sb_lock); + be64_add_cpu(&sbp->sb_frextents, rtxdelta); + mp->m_sb.sb_frextents += rtxdelta; + spin_unlock(&mp->m_sb_lock); + } if (tp->t_dblocks_delta) { be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta); @@ -614,7 +630,12 @@ xfs_trans_unreserve_and_mod_sb( if (ifreedelta) percpu_counter_add(&mp->m_ifree, ifreedelta); - if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY)) + if (rtxdelta) { + error = xfs_mod_frextents(mp, rtxdelta); + ASSERT(!error); + } + + if (!(tp->t_flags & XFS_TRANS_SB_DIRTY)) return; /* apply remaining deltas */ @@ -622,7 +643,12 @@ xfs_trans_unreserve_and_mod_sb( mp->m_sb.sb_fdblocks += tp->t_fdblocks_delta + tp->t_res_fdblocks_delta; mp->m_sb.sb_icount += idelta; mp->m_sb.sb_ifree += ifreedelta; - mp->m_sb.sb_frextents += rtxdelta; + /* + * Do not touch sb_frextents here because we are dealing with incore + * reservation. sb_frextents is not part of the lazy sb counters so it + * must be consistent with the ondisk rtbitmap and must never include + * incore reservations. + */ mp->m_sb.sb_dblocks += tp->t_dblocks_delta; mp->m_sb.sb_agcount += tp->t_agcount_delta; mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta; @@ -648,7 +674,7 @@ xfs_trans_add_item( struct xfs_trans *tp, struct xfs_log_item *lip) { - ASSERT(lip->li_mountp == tp->t_mountp); + ASSERT(lip->li_log == tp->t_mountp->m_log); ASSERT(lip->li_ailp == tp->t_mountp->m_ail); ASSERT(list_empty(&lip->li_trans)); ASSERT(!test_bit(XFS_LI_DIRTY, &lip->li_flags)); @@ -734,7 +760,7 @@ xfs_log_item_batch_insert( void xfs_trans_committed_bulk( struct xfs_ail *ailp, - struct xfs_log_vec *log_vector, + struct list_head *lv_chain, xfs_lsn_t commit_lsn, bool aborted) { @@ -749,7 +775,7 @@ xfs_trans_committed_bulk( spin_unlock(&ailp->ail_lock); /* unpin all the log items */ - for (lv = log_vector; lv; lv = lv->lv_next ) { + list_for_each_entry(lv, lv_chain, lv_list) { struct xfs_log_item *lip = lv->lv_item; xfs_lsn_t item_lsn; @@ -775,7 +801,7 @@ xfs_trans_committed_bulk( * object into the AIL as we are in a shutdown situation. */ if (aborted) { - ASSERT(xfs_is_shutdown(ailp->ail_mount)); + ASSERT(xlog_is_shutdown(ailp->ail_log)); if (lip->li_ops->iop_unpin) lip->li_ops->iop_unpin(lip, 1); continue; @@ -819,6 +845,90 @@ xfs_trans_committed_bulk( } /* + * Sort transaction items prior to running precommit operations. This will + * attempt to order the items such that they will always be locked in the same + * order. Items that have no sort function are moved to the end of the list + * and so are locked last. + * + * This may need refinement as different types of objects add sort functions. + * + * Function is more complex than it needs to be because we are comparing 64 bit + * values and the function only returns 32 bit values. + */ +static int +xfs_trans_precommit_sort( + void *unused_arg, + const struct list_head *a, + const struct list_head *b) +{ + struct xfs_log_item *lia = container_of(a, + struct xfs_log_item, li_trans); + struct xfs_log_item *lib = container_of(b, + struct xfs_log_item, li_trans); + int64_t diff; + + /* + * If both items are non-sortable, leave them alone. If only one is + * sortable, move the non-sortable item towards the end of the list. + */ + if (!lia->li_ops->iop_sort && !lib->li_ops->iop_sort) + return 0; + if (!lia->li_ops->iop_sort) + return 1; + if (!lib->li_ops->iop_sort) + return -1; + + diff = lia->li_ops->iop_sort(lia) - lib->li_ops->iop_sort(lib); + if (diff < 0) + return -1; + if (diff > 0) + return 1; + return 0; +} + +/* + * Run transaction precommit functions. + * + * If there is an error in any of the callouts, then stop immediately and + * trigger a shutdown to abort the transaction. There is no recovery possible + * from errors at this point as the transaction is dirty.... + */ +static int +xfs_trans_run_precommits( + struct xfs_trans *tp) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_log_item *lip, *n; + int error = 0; + + /* + * Sort the item list to avoid ABBA deadlocks with other transactions + * running precommit operations that lock multiple shared items such as + * inode cluster buffers. + */ + list_sort(NULL, &tp->t_items, xfs_trans_precommit_sort); + + /* + * Precommit operations can remove the log item from the transaction + * if the log item exists purely to delay modifications until they + * can be ordered against other operations. Hence we have to use + * list_for_each_entry_safe() here. + */ + list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) { + if (!test_bit(XFS_LI_DIRTY, &lip->li_flags)) + continue; + if (lip->li_ops->iop_precommit) { + error = lip->li_ops->iop_precommit(tp, lip); + if (error) + break; + } + } + if (error) + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; +} + +/* * Commit the given transaction to the log. * * XFS disk error handling mechanism is not based on a typical @@ -836,12 +946,20 @@ __xfs_trans_commit( bool regrant) { struct xfs_mount *mp = tp->t_mountp; + struct xlog *log = mp->m_log; xfs_csn_t commit_seq = 0; int error = 0; int sync = tp->t_flags & XFS_TRANS_SYNC; trace_xfs_trans_commit(tp, _RET_IP_); + error = xfs_trans_run_precommits(tp); + if (error) { + if (tp->t_flags & XFS_TRANS_PERM_LOG_RES) + xfs_defer_cancel(tp); + goto out_unreserve; + } + /* * Finish deferred items on final commit. Only permanent transactions * should ever have deferred ops. @@ -864,7 +982,13 @@ __xfs_trans_commit( if (!(tp->t_flags & XFS_TRANS_DIRTY)) goto out_unreserve; - if (xfs_is_shutdown(mp)) { + /* + * We must check against log shutdown here because we cannot abort log + * items and leave them dirty, inconsistent and unpinned in memory while + * the log is active. This leaves them open to being written back to + * disk, and that will lead to on-disk corruption. + */ + if (xlog_is_shutdown(log)) { error = -EIO; goto out_unreserve; } @@ -878,7 +1002,7 @@ __xfs_trans_commit( xfs_trans_apply_sb_deltas(tp); xfs_trans_apply_dquot_deltas(tp); - xlog_cil_commit(mp->m_log, tp, &commit_seq, regrant); + xlog_cil_commit(log, tp, &commit_seq, regrant); xfs_trans_free(tp); @@ -905,10 +1029,10 @@ out_unreserve: */ xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - if (regrant && !xlog_is_shutdown(mp->m_log)) - xfs_log_ticket_regrant(mp->m_log, tp->t_ticket); + if (regrant && !xlog_is_shutdown(log)) + xfs_log_ticket_regrant(log, tp->t_ticket); else - xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); + xfs_log_ticket_ungrant(log, tp->t_ticket); tp->t_ticket = NULL; } xfs_trans_free_items(tp, !!error); @@ -926,36 +1050,56 @@ xfs_trans_commit( } /* - * Unlock all of the transaction's items and free the transaction. - * The transaction must not have modified any of its items, because - * there is no way to restore them to their previous state. + * Unlock all of the transaction's items and free the transaction. If the + * transaction is dirty, we must shut down the filesystem because there is no + * way to restore them to their previous state. + * + * If the transaction has made a log reservation, make sure to release it as + * well. * - * If the transaction has made a log reservation, make sure to release - * it as well. + * This is a high level function (equivalent to xfs_trans_commit()) and so can + * be called after the transaction has effectively been aborted due to the mount + * being shut down. However, if the mount has not been shut down and the + * transaction is dirty we will shut the mount down and, in doing so, that + * guarantees that the log is shut down, too. Hence we don't need to be as + * careful with shutdown state and dirty items here as we need to be in + * xfs_trans_commit(). */ void xfs_trans_cancel( struct xfs_trans *tp) { struct xfs_mount *mp = tp->t_mountp; + struct xlog *log = mp->m_log; bool dirty = (tp->t_flags & XFS_TRANS_DIRTY); trace_xfs_trans_cancel(tp, _RET_IP_); - if (tp->t_flags & XFS_TRANS_PERM_LOG_RES) + /* + * It's never valid to cancel a transaction with deferred ops attached, + * because the transaction is effectively dirty. Complain about this + * loudly before freeing the in-memory defer items. + */ + if (!list_empty(&tp->t_dfops)) { + ASSERT(xfs_is_shutdown(mp) || list_empty(&tp->t_dfops)); + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + dirty = true; xfs_defer_cancel(tp); + } /* - * See if the caller is relying on us to shut down the - * filesystem. This happens in paths where we detect - * corruption and decide to give up. + * See if the caller is relying on us to shut down the filesystem. We + * only want an error report if there isn't already a shutdown in + * progress, so we only need to check against the mount shutdown state + * here. */ if (dirty && !xfs_is_shutdown(mp)) { XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } #ifdef DEBUG - if (!dirty && !xfs_is_shutdown(mp)) { + /* Log items need to be consistent until the log is shut down. */ + if (!dirty && !xlog_is_shutdown(log)) { struct xfs_log_item *lip; list_for_each_entry(lip, &tp->t_items, li_trans) @@ -966,7 +1110,7 @@ xfs_trans_cancel( xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); + xfs_log_ticket_ungrant(log, tp->t_ticket); tp->t_ticket = NULL; } @@ -1201,3 +1345,89 @@ out_cancel: xfs_trans_cancel(tp); return error; } + +/* + * Allocate an transaction, lock and join the directory and child inodes to it, + * and reserve quota for a directory update. If there isn't sufficient space, + * @dblocks will be set to zero for a reservationless directory update and + * @nospace_error will be set to a negative errno describing the space + * constraint we hit. + * + * The caller must ensure that the on-disk dquots attached to this inode have + * already been allocated and initialized. The ILOCKs will be dropped when the + * transaction is committed or cancelled. + */ +int +xfs_trans_alloc_dir( + struct xfs_inode *dp, + struct xfs_trans_res *resv, + struct xfs_inode *ip, + unsigned int *dblocks, + struct xfs_trans **tpp, + int *nospace_error) +{ + struct xfs_trans *tp; + struct xfs_mount *mp = ip->i_mount; + unsigned int resblks; + bool retried = false; + int error; + +retry: + *nospace_error = 0; + resblks = *dblocks; + error = xfs_trans_alloc(mp, resv, resblks, 0, 0, &tp); + if (error == -ENOSPC) { + *nospace_error = error; + resblks = 0; + error = xfs_trans_alloc(mp, resv, resblks, 0, 0, &tp); + } + if (error) + return error; + + xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL); + + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + error = xfs_qm_dqattach_locked(dp, false); + if (error) { + /* Caller should have allocated the dquots! */ + ASSERT(error != -ENOENT); + goto out_cancel; + } + + error = xfs_qm_dqattach_locked(ip, false); + if (error) { + /* Caller should have allocated the dquots! */ + ASSERT(error != -ENOENT); + goto out_cancel; + } + + if (resblks == 0) + goto done; + + error = xfs_trans_reserve_quota_nblks(tp, dp, resblks, 0, false); + if (error == -EDQUOT || error == -ENOSPC) { + if (!retried) { + xfs_trans_cancel(tp); + xfs_blockgc_free_quota(dp, 0); + retried = true; + goto retry; + } + + *nospace_error = error; + resblks = 0; + error = 0; + } + if (error) + goto out_cancel; + +done: + *tpp = tp; + *dblocks = resblks; + return 0; + +out_cancel: + xfs_trans_cancel(tp); + return error; +} diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index a487b264a9eb..55819785941c 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -8,6 +8,7 @@ /* kernel only transaction subsystem defines */ +struct xlog; struct xfs_buf; struct xfs_buftarg; struct xfs_efd_log_item; @@ -31,7 +32,7 @@ struct xfs_log_item { struct list_head li_ail; /* AIL pointers */ struct list_head li_trans; /* transaction list */ xfs_lsn_t li_lsn; /* last on-disk lsn */ - struct xfs_mount *li_mountp; /* ptr to fs mount */ + struct xlog *li_log; struct xfs_ail *li_ailp; /* ptr to AIL */ uint li_type; /* item type */ unsigned long li_flags; /* misc flags */ @@ -44,6 +45,7 @@ struct xfs_log_item { struct xfs_log_vec *li_lv; /* active log vector */ struct xfs_log_vec *li_lv_shadow; /* standby vector */ xfs_csn_t li_seq; /* CIL commit seq */ + uint32_t li_order_id; /* CIL commit order */ }; /* @@ -54,13 +56,15 @@ struct xfs_log_item { #define XFS_LI_IN_AIL 0 #define XFS_LI_ABORTED 1 #define XFS_LI_FAILED 2 -#define XFS_LI_DIRTY 3 /* log item dirty in transaction */ +#define XFS_LI_DIRTY 3 +#define XFS_LI_WHITEOUT 4 #define XFS_LI_FLAGS \ - { (1 << XFS_LI_IN_AIL), "IN_AIL" }, \ - { (1 << XFS_LI_ABORTED), "ABORTED" }, \ - { (1 << XFS_LI_FAILED), "FAILED" }, \ - { (1 << XFS_LI_DIRTY), "DIRTY" } + { (1u << XFS_LI_IN_AIL), "IN_AIL" }, \ + { (1u << XFS_LI_ABORTED), "ABORTED" }, \ + { (1u << XFS_LI_FAILED), "FAILED" }, \ + { (1u << XFS_LI_DIRTY), "DIRTY" }, \ + { (1u << XFS_LI_WHITEOUT), "WHITEOUT" } struct xfs_item_ops { unsigned flags; @@ -68,39 +72,43 @@ struct xfs_item_ops { void (*iop_format)(struct xfs_log_item *, struct xfs_log_vec *); void (*iop_pin)(struct xfs_log_item *); void (*iop_unpin)(struct xfs_log_item *, int remove); - uint (*iop_push)(struct xfs_log_item *, struct list_head *); + uint64_t (*iop_sort)(struct xfs_log_item *lip); + int (*iop_precommit)(struct xfs_trans *tp, struct xfs_log_item *lip); void (*iop_committing)(struct xfs_log_item *lip, xfs_csn_t seq); - void (*iop_release)(struct xfs_log_item *); xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); + uint (*iop_push)(struct xfs_log_item *, struct list_head *); + void (*iop_release)(struct xfs_log_item *); int (*iop_recover)(struct xfs_log_item *lip, struct list_head *capture_list); bool (*iop_match)(struct xfs_log_item *item, uint64_t id); struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent, struct xfs_trans *tp); + struct xfs_log_item *(*iop_intent)(struct xfs_log_item *intent_done); }; -/* Is this log item a deferred action intent? */ +/* + * Log item ops flags + */ +/* + * Release the log item when the journal commits instead of inserting into the + * AIL for writeback tracking and/or log tail pinning. + */ +#define XFS_ITEM_RELEASE_WHEN_COMMITTED (1 << 0) +#define XFS_ITEM_INTENT (1 << 1) +#define XFS_ITEM_INTENT_DONE (1 << 2) + static inline bool xlog_item_is_intent(struct xfs_log_item *lip) { - return lip->li_ops->iop_recover != NULL && - lip->li_ops->iop_match != NULL; + return lip->li_ops->flags & XFS_ITEM_INTENT; } -/* Is this a log intent-done item? */ static inline bool xlog_item_is_intent_done(struct xfs_log_item *lip) { - return lip->li_ops->iop_unpin == NULL && - lip->li_ops->iop_push == NULL; + return lip->li_ops->flags & XFS_ITEM_INTENT_DONE; } -/* - * Release the log item as soon as committed. This is for items just logging - * intents that never need to be written back in place. - */ -#define XFS_ITEM_RELEASE_WHEN_COMMITTED (1 << 0) - void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, int type, const struct xfs_item_ops *ops); @@ -174,7 +182,7 @@ xfs_trans_get_buf( struct xfs_buftarg *target, xfs_daddr_t blkno, int numblks, - uint flags, + xfs_buf_flags_t flags, struct xfs_buf **bpp) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); @@ -259,6 +267,9 @@ int xfs_trans_alloc_icreate(struct xfs_mount *mp, struct xfs_trans_res *resv, int xfs_trans_alloc_ichange(struct xfs_inode *ip, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, bool force, struct xfs_trans **tpp); +int xfs_trans_alloc_dir(struct xfs_inode *dp, struct xfs_trans_res *resv, + struct xfs_inode *ip, unsigned int *dblocks, + struct xfs_trans **tpp, int *nospace_error); static inline void xfs_trans_set_context( diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 2a8c8dc54c95..f51df7d94ef7 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -398,7 +398,7 @@ xfsaild_push_item( * If log item pinning is enabled, skip the push and track the item as * pinned. This can help induce head-behind-tail conditions. */ - if (XFS_TEST_ERROR(false, ailp->ail_mount, XFS_ERRTAG_LOG_ITEM_PIN)) + if (XFS_TEST_ERROR(false, ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN)) return XFS_ITEM_PINNED; /* @@ -418,7 +418,7 @@ static long xfsaild_push( struct xfs_ail *ailp) { - xfs_mount_t *mp = ailp->ail_mount; + struct xfs_mount *mp = ailp->ail_log->l_mp; struct xfs_ail_cursor cur; struct xfs_log_item *lip; xfs_lsn_t lsn; @@ -443,15 +443,27 @@ xfsaild_push( ailp->ail_log_flush = 0; XFS_STATS_INC(mp, xs_push_ail_flush); - xlog_cil_flush(mp->m_log); + xlog_cil_flush(ailp->ail_log); } spin_lock(&ailp->ail_lock); - /* barrier matches the ail_target update in xfs_ail_push() */ - smp_rmb(); - target = ailp->ail_target; - ailp->ail_target_prev = target; + /* + * If we have a sync push waiter, we always have to push till the AIL is + * empty. Update the target to point to the end of the AIL so that + * capture updates that occur after the sync push waiter has gone to + * sleep. + */ + if (waitqueue_active(&ailp->ail_empty)) { + lip = xfs_ail_max(ailp); + if (lip) + target = lip->li_lsn; + } else { + /* barrier matches the ail_target update in xfs_ail_push() */ + smp_rmb(); + target = ailp->ail_target; + ailp->ail_target_prev = target; + } /* we're done if the AIL is empty or our push has reached the end */ lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->ail_last_pushed_lsn); @@ -590,9 +602,9 @@ xfsaild( while (1) { if (tout && tout <= 20) - set_current_state(TASK_KILLABLE); + set_current_state(TASK_KILLABLE|TASK_FREEZABLE); else - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); /* * Check kthread_should_stop() after we set the task state to @@ -620,7 +632,7 @@ xfsaild( * opportunity to release such buffers from the queue. */ ASSERT(list_empty(&ailp->ail_buf_list) || - xfs_is_shutdown(ailp->ail_mount)); + xlog_is_shutdown(ailp->ail_log)); xfs_buf_delwri_cancel(&ailp->ail_buf_list); break; } @@ -641,14 +653,14 @@ xfsaild( ailp->ail_target == ailp->ail_target_prev && list_empty(&ailp->ail_buf_list)) { spin_unlock(&ailp->ail_lock); - freezable_schedule(); + schedule(); tout = 0; continue; } spin_unlock(&ailp->ail_lock); if (tout) - freezable_schedule_timeout(msecs_to_jiffies(tout)); + schedule_timeout(msecs_to_jiffies(tout)); __set_current_state(TASK_RUNNING); @@ -683,7 +695,7 @@ xfs_ail_push( struct xfs_log_item *lip; lip = xfs_ail_min(ailp); - if (!lip || xfs_is_shutdown(ailp->ail_mount) || + if (!lip || xlog_is_shutdown(ailp->ail_log) || XFS_LSN_CMP(threshold_lsn, ailp->ail_target) <= 0) return; @@ -718,13 +730,11 @@ void xfs_ail_push_all_sync( struct xfs_ail *ailp) { - struct xfs_log_item *lip; DEFINE_WAIT(wait); spin_lock(&ailp->ail_lock); - while ((lip = xfs_ail_max(ailp)) != NULL) { + while (xfs_ail_max(ailp) != NULL) { prepare_to_wait(&ailp->ail_empty, &wait, TASK_UNINTERRUPTIBLE); - ailp->ail_target = lip->li_lsn; wake_up_process(ailp->ail_task); spin_unlock(&ailp->ail_lock); schedule(); @@ -740,7 +750,7 @@ xfs_ail_update_finish( struct xfs_ail *ailp, xfs_lsn_t old_lsn) __releases(ailp->ail_lock) { - struct xfs_mount *mp = ailp->ail_mount; + struct xlog *log = ailp->ail_log; /* if the tail lsn hasn't changed, don't do updates or wakeups. */ if (!old_lsn || old_lsn == __xfs_ail_min_lsn(ailp)) { @@ -748,13 +758,13 @@ xfs_ail_update_finish( return; } - if (!xfs_is_shutdown(mp)) - xlog_assign_tail_lsn_locked(mp); + if (!xlog_is_shutdown(log)) + xlog_assign_tail_lsn_locked(log->l_mp); if (list_empty(&ailp->ail_head)) wake_up_all(&ailp->ail_empty); spin_unlock(&ailp->ail_lock); - xfs_log_space_wake(mp); + xfs_log_space_wake(log->l_mp); } /* @@ -862,17 +872,17 @@ xfs_trans_ail_delete( int shutdown_type) { struct xfs_ail *ailp = lip->li_ailp; - struct xfs_mount *mp = ailp->ail_mount; + struct xlog *log = ailp->ail_log; xfs_lsn_t tail_lsn; spin_lock(&ailp->ail_lock); if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) { spin_unlock(&ailp->ail_lock); - if (shutdown_type && !xfs_is_shutdown(mp)) { - xfs_alert_tag(mp, XFS_PTAG_AILDELETE, + if (shutdown_type && !xlog_is_shutdown(log)) { + xfs_alert_tag(log->l_mp, XFS_PTAG_AILDELETE, "%s: attempting to delete a log item that is not in the AIL", __func__); - xfs_force_shutdown(mp, shutdown_type); + xlog_force_shutdown(log, shutdown_type); } return; } @@ -893,7 +903,7 @@ xfs_trans_ail_init( if (!ailp) return -ENOMEM; - ailp->ail_mount = mp; + ailp->ail_log = mp->m_log; INIT_LIST_HEAD(&ailp->ail_head); INIT_LIST_HEAD(&ailp->ail_cursors); spin_lock_init(&ailp->ail_lock); @@ -901,7 +911,7 @@ xfs_trans_ail_init( init_waitqueue_head(&ailp->ail_empty); ailp->ail_task = kthread_run(xfsaild, ailp, "xfsaild/%s", - ailp->ail_mount->m_super->s_id); + mp->m_super->s_id); if (IS_ERR(ailp->ail_task)) goto out_free_ailp; diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 9ba7e6b9bed3..aa00cf67ad72 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -597,13 +597,11 @@ xfs_dqresv_check( if (softlimit && total_count > softlimit) { time64_t now = ktime_get_real_seconds(); - if ((res->timer != 0 && now > res->timer) || - (res->warnings != 0 && res->warnings >= qlim->warn)) { + if (res->timer != 0 && now > res->timer) { *fatal = true; return QUOTA_NL_ISOFTLONGWARN; } - res->warnings++; return QUOTA_NL_ISOFTWARN; } diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 3004aeac9110..d5400150358e 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -6,6 +6,7 @@ #ifndef __XFS_TRANS_PRIV_H__ #define __XFS_TRANS_PRIV_H__ +struct xlog; struct xfs_log_item; struct xfs_mount; struct xfs_trans; @@ -18,7 +19,8 @@ void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); void xfs_trans_del_item(struct xfs_log_item *); void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); -void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv, +void xfs_trans_committed_bulk(struct xfs_ail *ailp, + struct list_head *lv_chain, xfs_lsn_t commit_lsn, bool aborted); /* * AIL traversal cursor. @@ -50,7 +52,7 @@ struct xfs_ail_cursor { * Eventually we need to drive the locking in here as well. */ struct xfs_ail { - struct xfs_mount *ail_mount; + struct xlog *ail_log; struct task_struct *ail_task; struct list_head ail_head; xfs_lsn_t ail_target; diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 0d050f8829ef..c325a28b89a8 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -12,12 +12,104 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_acl.h" -#include "xfs_da_btree.h" +#include "xfs_log.h" +#include "xfs_xattr.h" #include <linux/posix_acl_xattr.h> +/* + * Get permission to use log-assisted atomic exchange of file extents. + * + * Callers must not be running any transactions or hold any inode locks, and + * they must release the permission by calling xlog_drop_incompat_feat + * when they're done. + */ +static inline int +xfs_attr_grab_log_assist( + struct xfs_mount *mp) +{ + int error = 0; + + /* + * Protect ourselves from an idle log clearing the logged xattrs log + * incompat feature bit. + */ + xlog_use_incompat_feat(mp->m_log); + + /* + * If log-assisted xattrs are already enabled, the caller can use the + * log assisted swap functions with the log-incompat reference we got. + */ + if (xfs_sb_version_haslogxattrs(&mp->m_sb)) + return 0; + + /* Enable log-assisted xattrs. */ + error = xfs_add_incompat_log_feature(mp, + XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); + if (error) + goto drop_incompat; + + xfs_warn_mount(mp, XFS_OPSTATE_WARNED_LARP, + "EXPERIMENTAL logged extended attributes feature in use. Use at your own risk!"); + + return 0; +drop_incompat: + xlog_drop_incompat_feat(mp->m_log); + return error; +} + +static inline void +xfs_attr_rele_log_assist( + struct xfs_mount *mp) +{ + xlog_drop_incompat_feat(mp->m_log); +} + +static inline bool +xfs_attr_want_log_assist( + struct xfs_mount *mp) +{ +#ifdef DEBUG + /* Logged xattrs require a V5 super for log_incompat */ + return xfs_has_crc(mp) && xfs_globals.larp; +#else + return false; +#endif +} + +/* + * Set or remove an xattr, having grabbed the appropriate logging resources + * prior to calling libxfs. + */ +int +xfs_attr_change( + struct xfs_da_args *args) +{ + struct xfs_mount *mp = args->dp->i_mount; + bool use_logging = false; + int error; + + ASSERT(!(args->op_flags & XFS_DA_OP_LOGGED)); + + if (xfs_attr_want_log_assist(mp)) { + error = xfs_attr_grab_log_assist(mp); + if (error) + return error; + + args->op_flags |= XFS_DA_OP_LOGGED; + use_logging = true; + } + + error = xfs_attr_set(args); + + if (use_logging) + xfs_attr_rele_log_assist(mp); + return error; +} + static int xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, @@ -56,7 +148,7 @@ xfs_xattr_set(const struct xattr_handler *handler, }; int error; - error = xfs_attr_set(&args); + error = xfs_attr_change(&args); if (!error && (handler->flags & XFS_ATTR_ROOT)) xfs_forget_acl(inode, name); return error; diff --git a/fs/xfs/xfs_xattr.h b/fs/xfs/xfs_xattr.h new file mode 100644 index 000000000000..2b09133b1b9b --- /dev/null +++ b/fs/xfs/xfs_xattr.h @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#ifndef __XFS_XATTR_H__ +#define __XFS_XATTR_H__ + +int xfs_attr_change(struct xfs_da_args *args); + +extern const struct xattr_handler *xfs_xattr_handlers[]; + +#endif /* __XFS_XATTR_H__ */ |