diff options
Diffstat (limited to 'fs/xfs/libxfs')
74 files changed, 11257 insertions, 6307 deletions
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 08d6beb54f8c..bb0c700afe3c 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -22,6 +22,343 @@ #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_health.h" +#include "xfs_error.h" +#include "xfs_bmap.h" +#include "xfs_defer.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_trace.h" +#include "xfs_inode.h" +#include "xfs_icache.h" + + +/* + * Passive reference counting access wrappers to the perag structures. If the + * per-ag structure is to be freed, the freeing code is responsible for cleaning + * up objects with passive references before freeing the structure. This is + * things like cached buffers. + */ +struct xfs_perag * +xfs_perag_get( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int ref = 0; + + rcu_read_lock(); + pag = radix_tree_lookup(&mp->m_perag_tree, agno); + if (pag) { + ASSERT(atomic_read(&pag->pag_ref) >= 0); + ref = atomic_inc_return(&pag->pag_ref); + } + rcu_read_unlock(); + trace_xfs_perag_get(mp, agno, ref, _RET_IP_); + return pag; +} + +/* + * search from @first to find the next perag with the given tag set. + */ +struct xfs_perag * +xfs_perag_get_tag( + struct xfs_mount *mp, + xfs_agnumber_t first, + unsigned int tag) +{ + struct xfs_perag *pag; + int found; + int ref; + + rcu_read_lock(); + found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, + (void **)&pag, first, 1, tag); + if (found <= 0) { + rcu_read_unlock(); + return NULL; + } + ref = atomic_inc_return(&pag->pag_ref); + rcu_read_unlock(); + trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_); + return pag; +} + +void +xfs_perag_put( + struct xfs_perag *pag) +{ + int ref; + + ASSERT(atomic_read(&pag->pag_ref) > 0); + ref = atomic_dec_return(&pag->pag_ref); + trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); +} + +/* + * xfs_initialize_perag_data + * + * Read in each per-ag structure so we can count up the number of + * allocated inodes, free inodes and used filesystem blocks as this + * information is no longer persistent in the superblock. Once we have + * this information, write it into the in-core superblock structure. + */ +int +xfs_initialize_perag_data( + struct xfs_mount *mp, + xfs_agnumber_t agcount) +{ + xfs_agnumber_t index; + struct xfs_perag *pag; + struct xfs_sb *sbp = &mp->m_sb; + uint64_t ifree = 0; + uint64_t ialloc = 0; + uint64_t bfree = 0; + uint64_t bfreelst = 0; + uint64_t btree = 0; + uint64_t fdblocks; + int error = 0; + + for (index = 0; index < agcount; index++) { + /* + * Read the AGF and AGI buffers to populate the per-ag + * structures for us. + */ + pag = xfs_perag_get(mp, index); + error = xfs_alloc_read_agf(pag, NULL, 0, NULL); + if (!error) + error = xfs_ialloc_read_agi(pag, NULL, NULL); + if (error) { + xfs_perag_put(pag); + return error; + } + + ifree += pag->pagi_freecount; + ialloc += pag->pagi_count; + bfree += pag->pagf_freeblks; + bfreelst += pag->pagf_flcount; + btree += pag->pagf_btreeblks; + xfs_perag_put(pag); + } + fdblocks = bfree + bfreelst + btree; + + /* + * If the new summary counts are obviously incorrect, fail the + * mount operation because that implies the AGFs are also corrupt. + * Clear FS_COUNTERS so that we don't unmount with a dirty log, which + * will prevent xfs_repair from fixing anything. + */ + if (fdblocks > sbp->sb_dblocks || ifree > ialloc) { + xfs_alert(mp, "AGF corruption. Please run xfs_repair."); + error = -EFSCORRUPTED; + goto out; + } + + /* Overwrite incore superblock counters with just-read data */ + spin_lock(&mp->m_sb_lock); + sbp->sb_ifree = ifree; + sbp->sb_icount = ialloc; + sbp->sb_fdblocks = fdblocks; + spin_unlock(&mp->m_sb_lock); + + xfs_reinit_percpu_counters(mp); +out: + xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS); + return error; +} + +STATIC void +__xfs_free_perag( + struct rcu_head *head) +{ + struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); + + ASSERT(!delayed_work_pending(&pag->pag_blockgc_work)); + kmem_free(pag); +} + +/* + * Free up the per-ag resources associated with the mount structure. + */ +void +xfs_free_perag( + struct xfs_mount *mp) +{ + struct xfs_perag *pag; + xfs_agnumber_t agno; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + spin_lock(&mp->m_perag_lock); + pag = radix_tree_delete(&mp->m_perag_tree, agno); + spin_unlock(&mp->m_perag_lock); + ASSERT(pag); + XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0); + + cancel_delayed_work_sync(&pag->pag_blockgc_work); + xfs_buf_hash_destroy(pag); + + call_rcu(&pag->rcu_head, __xfs_free_perag); + } +} + +/* Find the size of the AG, in blocks. */ +static xfs_agblock_t +__xfs_ag_block_count( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agnumber_t agcount, + xfs_rfsblock_t dblocks) +{ + ASSERT(agno < agcount); + + if (agno < agcount - 1) + return mp->m_sb.sb_agblocks; + return dblocks - (agno * mp->m_sb.sb_agblocks); +} + +xfs_agblock_t +xfs_ag_block_count( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + return __xfs_ag_block_count(mp, agno, mp->m_sb.sb_agcount, + mp->m_sb.sb_dblocks); +} + +/* Calculate the first and last possible inode number in an AG. */ +static void +__xfs_agino_range( + struct xfs_mount *mp, + xfs_agblock_t eoag, + xfs_agino_t *first, + xfs_agino_t *last) +{ + xfs_agblock_t bno; + + /* + * Calculate the first inode, which will be in the first + * cluster-aligned block after the AGFL. + */ + bno = round_up(XFS_AGFL_BLOCK(mp) + 1, M_IGEO(mp)->cluster_align); + *first = XFS_AGB_TO_AGINO(mp, bno); + + /* + * Calculate the last inode, which will be at the end of the + * last (aligned) cluster that can be allocated in the AG. + */ + bno = round_down(eoag, M_IGEO(mp)->cluster_align); + *last = XFS_AGB_TO_AGINO(mp, bno) - 1; +} + +void +xfs_agino_range( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agino_t *first, + xfs_agino_t *last) +{ + return __xfs_agino_range(mp, xfs_ag_block_count(mp, agno), first, last); +} + +int +xfs_initialize_perag( + struct xfs_mount *mp, + xfs_agnumber_t agcount, + xfs_rfsblock_t dblocks, + xfs_agnumber_t *maxagi) +{ + struct xfs_perag *pag; + xfs_agnumber_t index; + xfs_agnumber_t first_initialised = NULLAGNUMBER; + int error; + + /* + * Walk the current per-ag tree so we don't try to initialise AGs + * that already exist (growfs case). Allocate and insert all the + * AGs we don't find ready for initialisation. + */ + for (index = 0; index < agcount; index++) { + pag = xfs_perag_get(mp, index); + if (pag) { + xfs_perag_put(pag); + continue; + } + + pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); + if (!pag) { + error = -ENOMEM; + goto out_unwind_new_pags; + } + pag->pag_agno = index; + pag->pag_mount = mp; + + error = radix_tree_preload(GFP_NOFS); + if (error) + goto out_free_pag; + + spin_lock(&mp->m_perag_lock); + if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { + WARN_ON_ONCE(1); + spin_unlock(&mp->m_perag_lock); + radix_tree_preload_end(); + error = -EEXIST; + goto out_free_pag; + } + spin_unlock(&mp->m_perag_lock); + radix_tree_preload_end(); + +#ifdef __KERNEL__ + /* Place kernel structure only init below this point. */ + spin_lock_init(&pag->pag_ici_lock); + spin_lock_init(&pag->pagb_lock); + spin_lock_init(&pag->pag_state_lock); + INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); + INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); + init_waitqueue_head(&pag->pagb_wait); + pag->pagb_count = 0; + pag->pagb_tree = RB_ROOT; +#endif /* __KERNEL__ */ + + error = xfs_buf_hash_init(pag); + if (error) + goto out_remove_pag; + + /* first new pag is fully initialized */ + if (first_initialised == NULLAGNUMBER) + first_initialised = index; + + /* + * Pre-calculated geometry + */ + pag->block_count = __xfs_ag_block_count(mp, index, agcount, + dblocks); + pag->min_block = XFS_AGFL_BLOCK(mp); + __xfs_agino_range(mp, pag->block_count, &pag->agino_min, + &pag->agino_max); + } + + index = xfs_set_inode_alloc(mp, agcount); + + if (maxagi) + *maxagi = index; + + mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp); + return 0; + +out_remove_pag: + radix_tree_delete(&mp->m_perag_tree, index); +out_free_pag: + kmem_free(pag); +out_unwind_new_pags: + /* unwind any prior newly initialized pags */ + for (index = first_initialised; index < agcount; index++) { + pag = radix_tree_delete(&mp->m_perag_tree, index); + if (!pag) + break; + xfs_buf_hash_destroy(pag); + kmem_free(pag); + } + return error; +} static int xfs_get_aghdr_buf( @@ -38,8 +375,6 @@ xfs_get_aghdr_buf( if (error) return error; - xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - bp->b_bn = blkno; bp->b_maps[0].bm_bn = blkno; bp->b_ops = ops; @@ -47,12 +382,6 @@ xfs_get_aghdr_buf( return 0; } -static inline bool is_log_ag(struct xfs_mount *mp, struct aghdr_init_data *id) -{ - return mp->m_sb.sb_logstart > 0 && - id->agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart); -} - /* * Generic btree root block init function */ @@ -78,7 +407,7 @@ xfs_freesp_init_recs( arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); - if (is_log_ag(mp, id)) { + if (xfs_ag_contains_log(mp, id->agno)) { struct xfs_alloc_rec *nrec; xfs_agblock_t start = XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart); @@ -195,7 +524,7 @@ xfs_rmaproot_init( rrec->rm_offset = 0; /* account for refc btree root */ - if (xfs_sb_version_hasreflink(&mp->m_sb)) { + if (xfs_has_reflink(mp)) { rrec = XFS_RMAP_REC_ADDR(block, 5); rrec->rm_startblock = cpu_to_be32(xfs_refc_block(mp)); rrec->rm_blockcount = cpu_to_be32(1); @@ -205,7 +534,7 @@ xfs_rmaproot_init( } /* account for the log space */ - if (is_log_ag(mp, id)) { + if (xfs_ag_contains_log(mp, id->agno)) { rrec = XFS_RMAP_REC_ADDR(block, be16_to_cpu(block->bb_numrecs) + 1); rrec->rm_startblock = cpu_to_be32( @@ -231,7 +560,7 @@ xfs_sbblock_init( struct xfs_buf *bp, struct aghdr_init_data *id) { - struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + struct xfs_dsb *dsb = bp->b_addr; xfs_sb_to_disk(dsb, &mp->m_sb); dsb->sb_inprogress = 1; @@ -243,7 +572,7 @@ xfs_agfblock_init( struct xfs_buf *bp, struct aghdr_init_data *id) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); + struct xfs_agf *agf = bp->b_addr; xfs_extlen_t tmpsize; agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); @@ -254,7 +583,7 @@ xfs_agfblock_init( agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp)); agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1); agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1); - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + if (xfs_has_rmapbt(mp)) { agf->agf_roots[XFS_BTNUM_RMAPi] = cpu_to_be32(XFS_RMAP_BLOCK(mp)); agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1); @@ -267,16 +596,16 @@ xfs_agfblock_init( tmpsize = id->agsize - mp->m_ag_prealloc_blocks; agf->agf_freeblks = cpu_to_be32(tmpsize); agf->agf_longest = cpu_to_be32(tmpsize); - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid); - if (xfs_sb_version_hasreflink(&mp->m_sb)) { + if (xfs_has_reflink(mp)) { agf->agf_refcount_root = cpu_to_be32( xfs_refc_block(mp)); agf->agf_refcount_level = cpu_to_be32(1); agf->agf_refcount_blocks = cpu_to_be32(1); } - if (is_log_ag(mp, id)) { + if (xfs_ag_contains_log(mp, id->agno)) { int64_t logblocks = mp->m_sb.sb_logblocks; be32_add_cpu(&agf->agf_freeblks, -logblocks); @@ -295,13 +624,13 @@ xfs_agflblock_init( __be32 *agfl_bno; int bucket; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC); agfl->agfl_seqno = cpu_to_be32(id->agno); uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid); } - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp); + agfl_bno = xfs_buf_to_agfl_bno(bp); for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++) agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); } @@ -312,7 +641,7 @@ xfs_agiblock_init( struct xfs_buf *bp, struct aghdr_init_data *id) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); + struct xfs_agi *agi = bp->b_addr; int bucket; agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); @@ -325,14 +654,19 @@ xfs_agiblock_init( agi->agi_freecount = 0; agi->agi_newino = cpu_to_be32(NULLAGINO); agi->agi_dirino = cpu_to_be32(NULLAGINO); - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid); - if (xfs_sb_version_hasfinobt(&mp->m_sb)) { + if (xfs_has_finobt(mp)) { agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp)); agi->agi_free_level = cpu_to_be32(1); } for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); + if (xfs_has_inobtcounts(mp)) { + agi->agi_iblocks = cpu_to_be32(1); + if (xfs_has_finobt(mp)) + agi->agi_fblocks = cpu_to_be32(1); + } } typedef void (*aghdr_init_work_f)(struct xfs_mount *mp, struct xfs_buf *bp, @@ -440,14 +774,14 @@ xfs_ag_init_headers( .ops = &xfs_finobt_buf_ops, .work = &xfs_btroot_init, .type = XFS_BTNUM_FINO, - .need_init = xfs_sb_version_hasfinobt(&mp->m_sb) + .need_init = xfs_has_finobt(mp) }, { /* RMAP root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_RMAP_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_rmapbt_buf_ops, .work = &xfs_rmaproot_init, - .need_init = xfs_sb_version_hasrmapbt(&mp->m_sb) + .need_init = xfs_has_rmapbt(mp) }, { /* REFC root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, xfs_refc_block(mp)), @@ -455,7 +789,7 @@ xfs_ag_init_headers( .ops = &xfs_refcountbt_buf_ops, .work = &xfs_btroot_init, .type = XFS_BTNUM_REFC, - .need_init = xfs_sb_version_hasreflink(&mp->m_sb) + .need_init = xfs_has_reflink(mp) }, { /* NULL terminating block */ .daddr = XFS_BUF_DADDR_NULL, @@ -480,14 +814,134 @@ xfs_ag_init_headers( return error; } +int +xfs_ag_shrink_space( + struct xfs_perag *pag, + struct xfs_trans **tpp, + xfs_extlen_t delta) +{ + struct xfs_mount *mp = pag->pag_mount; + struct xfs_alloc_arg args = { + .tp = *tpp, + .mp = mp, + .type = XFS_ALLOCTYPE_THIS_BNO, + .minlen = delta, + .maxlen = delta, + .oinfo = XFS_RMAP_OINFO_SKIP_UPDATE, + .resv = XFS_AG_RESV_NONE, + .prod = 1 + }; + struct xfs_buf *agibp, *agfbp; + struct xfs_agi *agi; + struct xfs_agf *agf; + xfs_agblock_t aglen; + int error, err2; + + ASSERT(pag->pag_agno == mp->m_sb.sb_agcount - 1); + error = xfs_ialloc_read_agi(pag, *tpp, &agibp); + if (error) + return error; + + agi = agibp->b_addr; + + error = xfs_alloc_read_agf(pag, *tpp, 0, &agfbp); + if (error) + return error; + + agf = agfbp->b_addr; + aglen = be32_to_cpu(agi->agi_length); + /* some extra paranoid checks before we shrink the ag */ + if (XFS_IS_CORRUPT(mp, agf->agf_length != agi->agi_length)) + return -EFSCORRUPTED; + if (delta >= aglen) + return -EINVAL; + + args.fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, aglen - delta); + + /* + * Make sure that the last inode cluster cannot overlap with the new + * end of the AG, even if it's sparse. + */ + error = xfs_ialloc_check_shrink(*tpp, pag->pag_agno, agibp, + aglen - delta); + if (error) + return error; + + /* + * Disable perag reservations so it doesn't cause the allocation request + * to fail. We'll reestablish reservation before we return. + */ + error = xfs_ag_resv_free(pag); + if (error) + return error; + + /* internal log shouldn't also show up in the free space btrees */ + error = xfs_alloc_vextent(&args); + if (!error && args.agbno == NULLAGBLOCK) + error = -ENOSPC; + + if (error) { + /* + * if extent allocation fails, need to roll the transaction to + * ensure that the AGFL fixup has been committed anyway. + */ + xfs_trans_bhold(*tpp, agfbp); + err2 = xfs_trans_roll(tpp); + if (err2) + return err2; + xfs_trans_bjoin(*tpp, agfbp); + goto resv_init_out; + } + + /* + * if successfully deleted from freespace btrees, need to confirm + * per-AG reservation works as expected. + */ + be32_add_cpu(&agi->agi_length, -delta); + be32_add_cpu(&agf->agf_length, -delta); + + err2 = xfs_ag_resv_init(pag, *tpp); + if (err2) { + be32_add_cpu(&agi->agi_length, delta); + be32_add_cpu(&agf->agf_length, delta); + if (err2 != -ENOSPC) + goto resv_err; + + __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, true); + + /* + * Roll the transaction before trying to re-init the per-ag + * reservation. The new transaction is clean so it will cancel + * without any side effects. + */ + error = xfs_defer_finish(tpp); + if (error) + return error; + + error = -ENOSPC; + goto resv_init_out; + } + xfs_ialloc_log_agi(*tpp, agibp, XFS_AGI_LENGTH); + xfs_alloc_log_agf(*tpp, agfbp, XFS_AGF_LENGTH); + return 0; + +resv_init_out: + err2 = xfs_ag_resv_init(pag, *tpp); + if (!err2) + return error; +resv_err: + xfs_warn(mp, "Error %d reserving per-AG metadata reserve pool.", err2); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return err2; +} + /* * Extent the AG indicated by the @id by the length passed in */ int xfs_ag_extend_space( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_trans *tp, - struct aghdr_init_data *id, xfs_extlen_t len) { struct xfs_buf *bp; @@ -495,27 +949,24 @@ xfs_ag_extend_space( struct xfs_agf *agf; int error; - /* - * Change the agi length. - */ - error = xfs_ialloc_read_agi(mp, tp, id->agno, &bp); + ASSERT(pag->pag_agno == pag->pag_mount->m_sb.sb_agcount - 1); + + error = xfs_ialloc_read_agi(pag, tp, &bp); if (error) return error; - agi = XFS_BUF_TO_AGI(bp); + agi = bp->b_addr; be32_add_cpu(&agi->agi_length, len); - ASSERT(id->agno == mp->m_sb.sb_agcount - 1 || - be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks); xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH); /* * Change agf length. */ - error = xfs_alloc_read_agf(mp, tp, id->agno, 0, &bp); + error = xfs_alloc_read_agf(pag, tp, 0, &bp); if (error) return error; - agf = XFS_BUF_TO_AGF(bp); + agf = bp->b_addr; be32_add_cpu(&agf->agf_length, len); ASSERT(agf->agf_length == agi->agi_length); xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH); @@ -526,54 +977,55 @@ xfs_ag_extend_space( * XFS_RMAP_OINFO_SKIP_UPDATE is used here to tell the rmap btree that * this doesn't actually exist in the rmap btree. */ - error = xfs_rmap_free(tp, bp, id->agno, - be32_to_cpu(agf->agf_length) - len, + error = xfs_rmap_free(tp, bp, pag, be32_to_cpu(agf->agf_length) - len, len, &XFS_RMAP_OINFO_SKIP_UPDATE); if (error) return error; - return xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, id->agno, + error = xfs_free_extent(tp, XFS_AGB_TO_FSB(pag->pag_mount, pag->pag_agno, be32_to_cpu(agf->agf_length) - len), len, &XFS_RMAP_OINFO_SKIP_UPDATE, XFS_AG_RESV_NONE); + if (error) + return error; + + /* Update perag geometry */ + pag->block_count = be32_to_cpu(agf->agf_length); + __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min, + &pag->agino_max); + return 0; } /* Retrieve AG geometry. */ int xfs_ag_get_geometry( - struct xfs_mount *mp, - xfs_agnumber_t agno, + struct xfs_perag *pag, struct xfs_ag_geometry *ageo) { struct xfs_buf *agi_bp; struct xfs_buf *agf_bp; struct xfs_agi *agi; struct xfs_agf *agf; - struct xfs_perag *pag; unsigned int freeblks; int error; - if (agno >= mp->m_sb.sb_agcount) - return -EINVAL; - /* Lock the AG headers. */ - error = xfs_ialloc_read_agi(mp, NULL, agno, &agi_bp); + error = xfs_ialloc_read_agi(pag, NULL, &agi_bp); if (error) return error; - error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp); + error = xfs_alloc_read_agf(pag, NULL, 0, &agf_bp); if (error) goto out_agi; - pag = xfs_perag_get(mp, agno); /* Fill out form. */ memset(ageo, 0, sizeof(*ageo)); - ageo->ag_number = agno; + ageo->ag_number = pag->pag_agno; - agi = XFS_BUF_TO_AGI(agi_bp); + agi = agi_bp->b_addr; ageo->ag_icount = be32_to_cpu(agi->agi_count); ageo->ag_ifree = be32_to_cpu(agi->agi_freecount); - agf = XFS_BUF_TO_AGF(agf_bp); + agf = agf_bp->b_addr; ageo->ag_length = be32_to_cpu(agf->agf_length); freeblks = pag->pagf_freeblks + pag->pagf_flcount + @@ -583,7 +1035,6 @@ xfs_ag_get_geometry( xfs_ag_geom_health(pag, ageo); /* Release resources. */ - xfs_perag_put(pag); xfs_buf_relse(agf_bp); out_agi: xfs_buf_relse(agi_bp); diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 5166322807e7..191b22b9a35b 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -9,6 +9,215 @@ struct xfs_mount; struct xfs_trans; +struct xfs_perag; + +/* + * Per-ag infrastructure + */ + +/* per-AG block reservation data structures*/ +struct xfs_ag_resv { + /* number of blocks originally reserved here */ + xfs_extlen_t ar_orig_reserved; + /* number of blocks reserved here */ + xfs_extlen_t ar_reserved; + /* number of blocks originally asked for */ + xfs_extlen_t ar_asked; +}; + +/* + * Per-ag incore structure, copies of information in agf and agi, to improve the + * performance of allocation group selection. + */ +struct xfs_perag { + struct xfs_mount *pag_mount; /* owner filesystem */ + xfs_agnumber_t pag_agno; /* AG this structure belongs to */ + atomic_t pag_ref; /* perag reference count */ + char pagf_init; /* this agf's entry is initialized */ + char pagi_init; /* this agi's entry is initialized */ + char pagf_metadata; /* the agf is preferred to be metadata */ + char pagi_inodeok; /* The agi is ok for inodes */ + uint8_t pagf_levels[XFS_BTNUM_AGF]; + /* # of levels in bno & cnt btree */ + bool pagf_agflreset; /* agfl requires reset before use */ + uint32_t pagf_flcount; /* count of blocks in freelist */ + xfs_extlen_t pagf_freeblks; /* total free blocks */ + xfs_extlen_t pagf_longest; /* longest free space */ + uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ + xfs_agino_t pagi_freecount; /* number of free inodes */ + xfs_agino_t pagi_count; /* number of allocated inodes */ + + /* + * Inode allocation search lookup optimisation. + * If the pagino matches, the search for new inodes + * doesn't need to search the near ones again straight away + */ + xfs_agino_t pagl_pagino; + xfs_agino_t pagl_leftrec; + xfs_agino_t pagl_rightrec; + + int pagb_count; /* pagb slots in use */ + uint8_t pagf_refcount_level; /* recount btree height */ + + /* Blocks reserved for all kinds of metadata. */ + struct xfs_ag_resv pag_meta_resv; + /* Blocks reserved for the reverse mapping btree. */ + struct xfs_ag_resv pag_rmapbt_resv; + + /* for rcu-safe freeing */ + struct rcu_head rcu_head; + + /* Precalculated geometry info */ + xfs_agblock_t block_count; + xfs_agblock_t min_block; + xfs_agino_t agino_min; + xfs_agino_t agino_max; + +#ifdef __KERNEL__ + /* -- kernel only structures below this line -- */ + + /* + * Bitsets of per-ag metadata that have been checked and/or are sick. + * Callers should hold pag_state_lock before accessing this field. + */ + uint16_t pag_checked; + uint16_t pag_sick; + spinlock_t pag_state_lock; + + spinlock_t pagb_lock; /* lock for pagb_tree */ + struct rb_root pagb_tree; /* ordered tree of busy extents */ + unsigned int pagb_gen; /* generation count for pagb_tree */ + wait_queue_head_t pagb_wait; /* woken when pagb_gen changes */ + + atomic_t pagf_fstrms; /* # of filestreams active in this AG */ + + spinlock_t pag_ici_lock; /* incore inode cache lock */ + struct radix_tree_root pag_ici_root; /* incore inode cache root */ + int pag_ici_reclaimable; /* reclaimable inodes */ + unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ + + /* buffer cache index */ + spinlock_t pag_buf_lock; /* lock for pag_buf_hash */ + struct rhashtable pag_buf_hash; + + /* background prealloc block trimming */ + struct delayed_work pag_blockgc_work; + +#endif /* __KERNEL__ */ +}; + +int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount, + xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi); +int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno); +void xfs_free_perag(struct xfs_mount *mp); + +struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); +struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno, + unsigned int tag); +void xfs_perag_put(struct xfs_perag *pag); + +/* + * Per-ag geometry infomation and validation + */ +xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno); +void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t *first, xfs_agino_t *last); + +static inline bool +xfs_verify_agbno(struct xfs_perag *pag, xfs_agblock_t agbno) +{ + if (agbno >= pag->block_count) + return false; + if (agbno <= pag->min_block) + return false; + return true; +} + +static inline bool +xfs_verify_agbext( + struct xfs_perag *pag, + xfs_agblock_t agbno, + xfs_agblock_t len) +{ + if (agbno + len <= agbno) + return false; + + if (!xfs_verify_agbno(pag, agbno)) + return false; + + return xfs_verify_agbno(pag, agbno + len - 1); +} + +/* + * Verify that an AG inode number pointer neither points outside the AG + * nor points at static metadata. + */ +static inline bool +xfs_verify_agino(struct xfs_perag *pag, xfs_agino_t agino) +{ + if (agino < pag->agino_min) + return false; + if (agino > pag->agino_max) + return false; + return true; +} + +/* + * Verify that an AG inode number pointer neither points outside the AG + * nor points at static metadata, or is NULLAGINO. + */ +static inline bool +xfs_verify_agino_or_null(struct xfs_perag *pag, xfs_agino_t agino) +{ + if (agino == NULLAGINO) + return true; + return xfs_verify_agino(pag, agino); +} + +static inline bool +xfs_ag_contains_log(struct xfs_mount *mp, xfs_agnumber_t agno) +{ + return mp->m_sb.sb_logstart > 0 && + agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart); +} + +/* + * Perag iteration APIs + */ +static inline struct xfs_perag * +xfs_perag_next( + struct xfs_perag *pag, + xfs_agnumber_t *agno, + xfs_agnumber_t end_agno) +{ + struct xfs_mount *mp = pag->pag_mount; + + *agno = pag->pag_agno + 1; + xfs_perag_put(pag); + if (*agno > end_agno) + return NULL; + return xfs_perag_get(mp, *agno); +} + +#define for_each_perag_range(mp, agno, end_agno, pag) \ + for ((pag) = xfs_perag_get((mp), (agno)); \ + (pag) != NULL; \ + (pag) = xfs_perag_next((pag), &(agno), (end_agno))) + +#define for_each_perag_from(mp, agno, pag) \ + for_each_perag_range((mp), (agno), (mp)->m_sb.sb_agcount - 1, (pag)) + + +#define for_each_perag(mp, agno, pag) \ + (agno) = 0; \ + for_each_perag_from((mp), (agno), (pag)) + +#define for_each_perag_tag(mp, agno, pag, tag) \ + for ((agno) = 0, (pag) = xfs_perag_get_tag((mp), 0, (tag)); \ + (pag) != NULL; \ + (agno) = (pag)->pag_agno + 1, \ + xfs_perag_put(pag), \ + (pag) = xfs_perag_get_tag((mp), (agno), (tag))) struct aghdr_init_data { /* per ag data */ @@ -24,9 +233,10 @@ struct aghdr_init_data { }; int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id); -int xfs_ag_extend_space(struct xfs_mount *mp, struct xfs_trans *tp, - struct aghdr_init_data *id, xfs_extlen_t len); -int xfs_ag_get_geometry(struct xfs_mount *mp, xfs_agnumber_t agno, - struct xfs_ag_geometry *ageo); +int xfs_ag_shrink_space(struct xfs_perag *pag, struct xfs_trans **tpp, + xfs_extlen_t delta); +int xfs_ag_extend_space(struct xfs_perag *pag, struct xfs_trans *tp, + xfs_extlen_t len); +int xfs_ag_get_geometry(struct xfs_perag *pag, struct xfs_ag_geometry *ageo); #endif /* __LIBXFS_AG_H */ diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index fdfe6dc0d307..5af123d13a63 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -19,7 +19,7 @@ #include "xfs_btree.h" #include "xfs_refcount_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_sb.h" +#include "xfs_ag.h" #include "xfs_ag_resv.h" /* @@ -91,7 +91,8 @@ xfs_ag_resv_critical( trace_xfs_ag_resv_critical(pag, type, avail); /* Critically low if less than 10% or max btree height remains. */ - return XFS_TEST_ERROR(avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS, + return XFS_TEST_ERROR(avail < orig / 10 || + avail < pag->pag_mount->m_agbtree_maxlevels, pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL); } @@ -211,7 +212,11 @@ __xfs_ag_resv_init( ASSERT(0); return -EINVAL; } - error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true); + + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL)) + error = -ENOSPC; + else + error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true); if (error) { trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno, error, _RET_IP_); @@ -246,20 +251,20 @@ xfs_ag_resv_init( struct xfs_trans *tp) { struct xfs_mount *mp = pag->pag_mount; - xfs_agnumber_t agno = pag->pag_agno; xfs_extlen_t ask; xfs_extlen_t used; - int error = 0; + int error = 0, error2; + bool has_resv = false; /* Create the metadata reservation. */ if (pag->pag_meta_resv.ar_asked == 0) { ask = used = 0; - error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask, &used); + error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask, &used); if (error) goto out; - error = xfs_finobt_calc_reserves(mp, tp, agno, &ask, &used); + error = xfs_finobt_calc_reserves(mp, tp, pag, &ask, &used); if (error) goto out; @@ -277,7 +282,7 @@ xfs_ag_resv_init( mp->m_finobt_nores = true; - error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask, + error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask, &used); if (error) goto out; @@ -287,32 +292,55 @@ xfs_ag_resv_init( if (error) goto out; } + if (ask) + has_resv = true; } /* Create the RMAPBT metadata reservation */ if (pag->pag_rmapbt_resv.ar_asked == 0) { ask = used = 0; - error = xfs_rmapbt_calc_reserves(mp, tp, agno, &ask, &used); + error = xfs_rmapbt_calc_reserves(mp, tp, pag, &ask, &used); if (error) goto out; error = __xfs_ag_resv_init(pag, XFS_AG_RESV_RMAPBT, ask, used); if (error) goto out; + if (ask) + has_resv = true; } -#ifdef DEBUG - /* need to read in the AGF for the ASSERT below to work */ - error = xfs_alloc_pagf_init(pag->pag_mount, tp, pag->pag_agno, 0); - if (error) - return error; - - ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + - xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved <= - pag->pagf_freeblks + pag->pagf_flcount); -#endif out: + /* + * Initialize the pagf if we have at least one active reservation on the + * AG. This may have occurred already via reservation calculation, but + * fall back to an explicit init to ensure the in-core allocbt usage + * counters are initialized as soon as possible. This is important + * because filesystems with large perag reservations are susceptible to + * free space reservation problems that the allocbt counter is used to + * address. + */ + if (has_resv) { + error2 = xfs_alloc_read_agf(pag, tp, 0, NULL); + if (error2) + return error2; + + /* + * If there isn't enough space in the AG to satisfy the + * reservation, let the caller know that there wasn't enough + * space. Callers are responsible for deciding what to do + * next, since (in theory) we can stumble along with + * insufficient reservation if data blocks are being freed to + * replenish the AG's free space. + */ + if (!error && + xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + + xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved > + pag->pagf_freeblks + pag->pagf_flcount) + error = -ENOSPC; + } + return error; } @@ -338,7 +366,7 @@ xfs_ag_resv_alloc_extent( break; default: ASSERT(0); - /* fall through */ + fallthrough; case XFS_AG_RESV_NONE: field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS : XFS_TRANS_SB_FDBLOCKS; @@ -380,7 +408,7 @@ xfs_ag_resv_free_extent( break; default: ASSERT(0); - /* fall through */ + fallthrough; case XFS_AG_RESV_NONE: xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len); return; diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h index c0352edc8e41..b74b210008ea 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.h +++ b/fs/xfs/libxfs/xfs_ag_resv.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0+ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Copyright (C) 2016 Oracle. All Rights Reserved. * Author: Darrick J. Wong <darrick.wong@oracle.com> @@ -18,6 +18,21 @@ void xfs_ag_resv_alloc_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type, void xfs_ag_resv_free_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type, struct xfs_trans *tp, xfs_extlen_t len); +static inline struct xfs_ag_resv * +xfs_perag_resv( + struct xfs_perag *pag, + enum xfs_ag_resv_type type) +{ + switch (type) { + case XFS_AG_RESV_METADATA: + return &pag->pag_meta_resv; + case XFS_AG_RESV_RMAPBT: + return &pag->pag_rmapbt_resv; + default: + return NULL; + } +} + /* * RMAPBT reservation accounting wrappers. Since rmapbt blocks are sourced from * the AGFL, they are allocated one at a time and the reservation updates don't @@ -37,16 +52,4 @@ xfs_ag_resv_rmapbt_alloc( xfs_perag_put(pag); } -static inline void -xfs_ag_resv_rmapbt_free( - struct xfs_mount *mp, - xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - - pag = xfs_perag_get(mp, agno); - xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1); - xfs_perag_put(pag); -} - #endif /* __XFS_AG_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index d8053bc96c4d..de79f5d07f65 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -10,7 +10,6 @@ #include "xfs_shared.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_btree.h" @@ -24,10 +23,11 @@ #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_log.h" +#include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_bmap.h" -extern kmem_zone_t *xfs_bmap_free_item_zone; +struct kmem_cache *xfs_extfree_item_cache; struct workqueue_struct *xfs_alloc_wq; @@ -51,7 +51,7 @@ xfs_agfl_size( { unsigned int size = mp->m_sb.sb_sectsize; - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) size -= sizeof(struct xfs_agfl); return size / sizeof(xfs_agblock_t); @@ -61,9 +61,9 @@ unsigned int xfs_refc_block( struct xfs_mount *mp) { - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (xfs_has_rmapbt(mp)) return XFS_RMAP_BLOCK(mp) + 1; - if (xfs_sb_version_hasfinobt(&mp->m_sb)) + if (xfs_has_finobt(mp)) return XFS_FIBT_BLOCK(mp) + 1; return XFS_IBT_BLOCK(mp) + 1; } @@ -72,16 +72,34 @@ xfs_extlen_t xfs_prealloc_blocks( struct xfs_mount *mp) { - if (xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_has_reflink(mp)) return xfs_refc_block(mp) + 1; - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (xfs_has_rmapbt(mp)) return XFS_RMAP_BLOCK(mp) + 1; - if (xfs_sb_version_hasfinobt(&mp->m_sb)) + if (xfs_has_finobt(mp)) return XFS_FIBT_BLOCK(mp) + 1; return XFS_IBT_BLOCK(mp) + 1; } /* + * The number of blocks per AG that we withhold from xfs_mod_fdblocks to + * guarantee that we can refill the AGFL prior to allocating space in a nearly + * full AG. Although the space described by the free space btrees, the + * blocks used by the freesp btrees themselves, and the blocks owned by the + * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk + * free space in the AG drop so low that the free space btrees cannot refill an + * empty AGFL up to the minimum level. Rather than grind through empty AGs + * until the fs goes down, we subtract this many AG blocks from the incore + * fdblocks to ensure user allocation does not overcommit the space the + * filesystem needs for the AGFLs. The rmap btree uses a per-AG reservation to + * withhold space from xfs_mod_fdblocks, so we do not account for that here. + */ +#define XFS_ALLOCBT_AGFL_RESERVE 4 + +/* + * Compute the number of blocks that we set aside to guarantee the ability to + * refill the AGFL and handle a full bmap btree split. + * * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of * AGF buffer (PV 947395), we place constraints on the relationship among * actual allocations for data blocks, freelist blocks, and potential file data @@ -93,14 +111,14 @@ xfs_prealloc_blocks( * extents need to be actually allocated. To get around this, we explicitly set * aside a few blocks which will not be reserved in delayed allocation. * - * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a - * potential split of the file's bmap btree. + * For each AG, we need to reserve enough blocks to replenish a totally empty + * AGFL and 4 more to handle a potential split of the file's bmap btree. */ unsigned int xfs_alloc_set_aside( struct xfs_mount *mp) { - return mp->m_sb.sb_agcount * (XFS_ALLOC_AGFL_RESERVE + 4); + return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4); } /* @@ -124,13 +142,13 @@ xfs_alloc_ag_max_usable( unsigned int blocks; blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */ - blocks += XFS_ALLOC_AGFL_RESERVE; + blocks += XFS_ALLOCBT_AGFL_RESERVE; blocks += 3; /* AGF, AGI btree root blocks */ - if (xfs_sb_version_hasfinobt(&mp->m_sb)) + if (xfs_has_finobt(mp)) blocks++; /* finobt root block */ - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) - blocks++; /* rmap root block */ - if (xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_has_rmapbt(mp)) + blocks++; /* rmap root block */ + if (xfs_has_reflink(mp)) blocks++; /* refcount root block */ return mp->m_sb.sb_agblocks - blocks; @@ -151,7 +169,7 @@ xfs_alloc_lookup_eq( cur->bc_rec.a.ar_startblock = bno; cur->bc_rec.a.ar_blockcount = len; error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); - cur->bc_private.a.priv.abt.active = (*stat == 1); + cur->bc_ag.abt.active = (*stat == 1); return error; } @@ -171,7 +189,7 @@ xfs_alloc_lookup_ge( cur->bc_rec.a.ar_startblock = bno; cur->bc_rec.a.ar_blockcount = len; error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); - cur->bc_private.a.priv.abt.active = (*stat == 1); + cur->bc_ag.abt.active = (*stat == 1); return error; } @@ -190,7 +208,7 @@ xfs_alloc_lookup_le( cur->bc_rec.a.ar_startblock = bno; cur->bc_rec.a.ar_blockcount = len; error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); - cur->bc_private.a.priv.abt.active = (*stat == 1); + cur->bc_ag.abt.active = (*stat == 1); return error; } @@ -198,7 +216,7 @@ static inline bool xfs_alloc_cur_active( struct xfs_btree_cur *cur) { - return cur && cur->bc_private.a.priv.abt.active; + return cur && cur->bc_ag.abt.active; } /* @@ -230,7 +248,7 @@ xfs_alloc_get_rec( int *stat) /* output: success/failure */ { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_private.a.agno; + struct xfs_perag *pag = cur->bc_ag.pag; union xfs_btree_rec *rec; int error; @@ -245,11 +263,7 @@ xfs_alloc_get_rec( goto out_bad_rec; /* check for valid extent range, including overflow */ - if (!xfs_verify_agbno(mp, agno, *bno)) - goto out_bad_rec; - if (*bno > *bno + *len) - goto out_bad_rec; - if (!xfs_verify_agbno(mp, agno, *bno + *len - 1)) + if (!xfs_verify_agbext(pag, *bno, *len)) goto out_bad_rec; return 0; @@ -257,7 +271,8 @@ xfs_alloc_get_rec( out_bad_rec: xfs_warn(mp, "%s Freespace BTree record corruption in AG %d detected!", - cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", agno); + cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", + pag->pag_agno); xfs_warn(mp, "start block 0x%x block count 0x%x", *bno, *len); return -EFSCORRUPTED; @@ -426,8 +441,8 @@ xfs_alloc_fix_len( */ STATIC int /* error code */ xfs_alloc_fixup_trees( - xfs_btree_cur_t *cnt_cur, /* cursor for by-size btree */ - xfs_btree_cur_t *bno_cur, /* cursor for by-block btree */ + struct xfs_btree_cur *cnt_cur, /* cursor for by-size btree */ + struct xfs_btree_cur *bno_cur, /* cursor for by-block btree */ xfs_agblock_t fbno, /* starting block of free extent */ xfs_extlen_t flen, /* length of free extent */ xfs_agblock_t rbno, /* starting block of returned extent */ @@ -488,8 +503,8 @@ xfs_alloc_fixup_trees( struct xfs_btree_block *bnoblock; struct xfs_btree_block *cntblock; - bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]); - cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]); + bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp); + cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp); if (XFS_IS_CORRUPT(mp, bnoblock->bb_numrecs != @@ -589,6 +604,7 @@ xfs_agfl_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); + __be32 *agfl_bno = xfs_buf_to_agfl_bno(bp); int i; /* @@ -597,7 +613,7 @@ xfs_agfl_verify( * AGFL is what the AGF says is active. We can't get to the AGF, so we * can't verify just those entries are valid. */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return NULL; if (!xfs_verify_magic(bp, agfl->agfl_magicnum)) @@ -614,8 +630,8 @@ xfs_agfl_verify( return __this_address; for (i = 0; i < xfs_agfl_size(mp); i++) { - if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK && - be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks) + if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK && + be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks) return __this_address; } @@ -637,7 +653,7 @@ xfs_agfl_read_verify( * AGFL is what the AGF says is active. We can't get to the AGF, so we * can't verify just those entries are valid. */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF)) @@ -658,7 +674,7 @@ xfs_agfl_write_verify( xfs_failaddr_t fa; /* no verification of non-crc AGFLs */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; fa = xfs_agfl_verify(bp); @@ -684,20 +700,19 @@ const struct xfs_buf_ops xfs_agfl_buf_ops = { /* * Read in the allocation group free block array. */ -int /* error */ +int xfs_alloc_read_agfl( - xfs_mount_t *mp, /* mount point structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_buf_t **bpp) /* buffer for the ag free block array */ + struct xfs_perag *pag, + struct xfs_trans *tp, + struct xfs_buf **bpp) { - xfs_buf_t *bp; /* return value */ - int error; + struct xfs_mount *mp = pag->pag_mount; + struct xfs_buf *bp; + int error; - ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), + XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); if (error) return error; @@ -709,19 +724,17 @@ xfs_alloc_read_agfl( STATIC int xfs_alloc_update_counters( struct xfs_trans *tp, - struct xfs_perag *pag, struct xfs_buf *agbp, long len) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_agf *agf = agbp->b_addr; - pag->pagf_freeblks += len; + agbp->b_pag->pagf_freeblks += len; be32_add_cpu(&agf->agf_freeblks, len); - xfs_trans_agblocks_delta(tp, len); if (unlikely(be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))) { - xfs_buf_corruption_error(agbp); + xfs_buf_mark_corrupt(agbp); return -EFSCORRUPTED; } @@ -777,7 +790,7 @@ xfs_alloc_cur_setup( */ if (!acur->cnt) acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp, - args->agbp, args->agno, XFS_BTNUM_CNT); + args->agbp, args->pag, XFS_BTNUM_CNT); error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i); if (error) return error; @@ -787,10 +800,10 @@ xfs_alloc_cur_setup( */ if (!acur->bnolt) acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp, - args->agbp, args->agno, XFS_BTNUM_BNO); + args->agbp, args->pag, XFS_BTNUM_BNO); if (!acur->bnogt) acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp, - args->agbp, args->agno, XFS_BTNUM_BNO); + args->agbp, args->pag, XFS_BTNUM_BNO); return i == 1 ? 0 : -ENOSPC; } @@ -907,7 +920,7 @@ xfs_alloc_cur_check( deactivate = true; out: if (deactivate) - cur->bc_private.a.priv.abt.active = false; + cur->bc_ag.abt.active = false; trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff, *new); return 0; @@ -922,13 +935,13 @@ xfs_alloc_cur_finish( struct xfs_alloc_arg *args, struct xfs_alloc_cur *acur) { + struct xfs_agf __maybe_unused *agf = args->agbp->b_addr; int error; ASSERT(acur->cnt && acur->bnolt); ASSERT(acur->bno >= acur->rec_bno); ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len); - ASSERT(acur->rec_bno + acur->rec_len <= - be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length)); error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno, acur->rec_len, acur->bno, acur->len, 0); @@ -1026,6 +1039,7 @@ xfs_alloc_ag_vextent_small( xfs_extlen_t *flenp, /* result length */ int *stat) /* status: 0-freelist, 1-normal/none */ { + struct xfs_agf *agf = args->agbp->b_addr; int error = 0; xfs_agblock_t fbno = NULLAGBLOCK; xfs_extlen_t flen = 0; @@ -1054,17 +1068,17 @@ xfs_alloc_ag_vextent_small( if (args->minlen != 1 || args->alignment != 1 || args->resv == XFS_AG_RESV_AGFL || - (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) <= - args->minleft)) + be32_to_cpu(agf->agf_flcount) <= args->minleft) goto out; - error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0); + error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp, + &fbno, 0); if (error) goto error; if (fbno == NULLAGBLOCK) goto out; - xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1, + xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1, (args->datatype & XFS_ALLOC_NOBUSY)); if (args->datatype & XFS_ALLOC_USERDATA) { @@ -1079,9 +1093,7 @@ xfs_alloc_ag_vextent_small( } *fbnop = args->agbno = fbno; *flenp = args->len = 1; - if (XFS_IS_CORRUPT(args->mp, - fbno >= be32_to_cpu( - XFS_BUF_TO_AGF(args->agbp)->agf_length))) { + if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) { error = -EFSCORRUPTED; goto error; } @@ -1092,7 +1104,7 @@ xfs_alloc_ag_vextent_small( * If we're feeding an AGFL block to something that doesn't live in the * free space, we need to clear out the OWN_AG rmap. */ - error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1, + error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1, &XFS_RMAP_OINFO_AG); if (error) goto error; @@ -1169,20 +1181,19 @@ xfs_alloc_ag_vextent( /* if not file data, insert new block into the reverse map btree */ if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) { - error = xfs_rmap_alloc(args->tp, args->agbp, args->agno, + error = xfs_rmap_alloc(args->tp, args->agbp, args->pag, args->agbno, args->len, &args->oinfo); if (error) return error; } if (!args->wasfromfl) { - error = xfs_alloc_update_counters(args->tp, args->pag, - args->agbp, + error = xfs_alloc_update_counters(args->tp, args->agbp, -((long)(args->len))); if (error) return error; - ASSERT(!xfs_extent_busy_search(args->mp, args->agno, + ASSERT(!xfs_extent_busy_search(args->mp, args->pag, args->agbno, args->len)); } @@ -1203,8 +1214,9 @@ STATIC int /* error */ xfs_alloc_ag_vextent_exact( xfs_alloc_arg_t *args) /* allocation argument structure */ { - xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */ - xfs_btree_cur_t *cnt_cur;/* by count btree cursor */ + struct xfs_agf __maybe_unused *agf = args->agbp->b_addr; + struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */ + struct xfs_btree_cur *cnt_cur;/* by count btree cursor */ int error; xfs_agblock_t fbno; /* start block of found extent */ xfs_extlen_t flen; /* length of found extent */ @@ -1220,7 +1232,7 @@ xfs_alloc_ag_vextent_exact( * Allocate/initialize a cursor for the by-number freespace btree. */ bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_BNO); + args->pag, XFS_BTNUM_BNO); /* * Lookup bno and minlen in the btree (minlen is irrelevant, really). @@ -1280,9 +1292,8 @@ xfs_alloc_ag_vextent_exact( * Allocate/initialize a cursor for the by-size btree. */ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_CNT); - ASSERT(args->agbno + args->len <= - be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + args->pag, XFS_BTNUM_CNT); + ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length)); error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, args->len, XFSA_FIXUP_BNO_OK); if (error) { @@ -1353,7 +1364,7 @@ xfs_alloc_walk_iter( if (error) return error; if (i == 0) - cur->bc_private.a.priv.abt.active = false; + cur->bc_ag.abt.active = false; if (count > 0) count--; @@ -1468,7 +1479,7 @@ xfs_alloc_ag_vextent_locality( if (error) return error; if (i) { - acur->cnt->bc_private.a.priv.abt.active = true; + acur->cnt->bc_ag.abt.active = true; fbcur = acur->cnt; fbinc = false; } @@ -1505,7 +1516,7 @@ xfs_alloc_ag_vextent_lastblock( #ifdef DEBUG /* Randomly don't execute the first algorithm. */ - if (prandom_u32() & 1) + if (prandom_u32_max(2)) return 0; #endif @@ -1515,8 +1526,8 @@ xfs_alloc_ag_vextent_lastblock( * maxlen, go to the start of this block, and skip all those smaller * than minlen. */ - if (len || args->alignment > 1) { - acur->cnt->bc_ptrs[0] = 1; + if (*len || args->alignment > 1) { + acur->cnt->bc_levels[0].ptr = 1; do { error = xfs_alloc_get_rec(acur->cnt, bno, len, &i); if (error) @@ -1661,8 +1672,9 @@ STATIC int /* error */ xfs_alloc_ag_vextent_size( xfs_alloc_arg_t *args) /* allocation argument structure */ { - xfs_btree_cur_t *bno_cur; /* cursor for bno btree */ - xfs_btree_cur_t *cnt_cur; /* cursor for cnt btree */ + struct xfs_agf *agf = args->agbp->b_addr; + struct xfs_btree_cur *bno_cur; /* cursor for bno btree */ + struct xfs_btree_cur *cnt_cur; /* cursor for cnt btree */ int error; /* error result */ xfs_agblock_t fbno; /* start of found freespace */ xfs_extlen_t flen; /* length of found freespace */ @@ -1677,9 +1689,8 @@ restart: * Allocate and initialize a cursor for the by-size btree. */ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_CNT); + args->pag, XFS_BTNUM_CNT); bno_cur = NULL; - busy = false; /* * Look for an entry >= maxlen+alignment-1 blocks. @@ -1840,7 +1851,7 @@ restart: * Allocate and initialize a cursor for the by-block tree. */ bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_BNO); + args->pag, XFS_BTNUM_BNO); if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, rbno, rlen, XFSA_FIXUP_CNT_OK))) goto error0; @@ -1851,8 +1862,7 @@ restart: args->agbno = rbno; if (XFS_IS_CORRUPT(args->mp, args->agbno + args->len > - be32_to_cpu( - XFS_BUF_TO_AGF(args->agbp)->agf_length))) { + be32_to_cpu(agf->agf_length))) { error = -EFSCORRUPTED; goto error0; } @@ -1888,7 +1898,6 @@ xfs_free_ag_extent( enum xfs_ag_resv_type type) { struct xfs_mount *mp; - struct xfs_perag *pag; struct xfs_btree_cur *bno_cur; struct xfs_btree_cur *cnt_cur; xfs_agblock_t gtbno; /* start of right neighbor */ @@ -1901,12 +1910,13 @@ xfs_free_ag_extent( int haveright; /* have a right neighbor */ int i; int error; + struct xfs_perag *pag = agbp->b_pag; bno_cur = cnt_cur = NULL; mp = tp->t_mountp; if (!xfs_rmap_should_skip_owner_update(oinfo)) { - error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo); + error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo); if (error) goto error0; } @@ -1914,7 +1924,7 @@ xfs_free_ag_extent( /* * Allocate and initialize a cursor for the by-block btree. */ - bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO); + bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO); /* * Look for a neighboring block on the left (lower block numbers) * that is contiguous with this space. @@ -1984,7 +1994,7 @@ xfs_free_ag_extent( /* * Now allocate and initialize a cursor for the by-size tree. */ - cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT); + cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT); /* * Have both left and right contiguous neighbors. * Merge all three into a single free block. @@ -2168,10 +2178,8 @@ xfs_free_ag_extent( /* * Update the freespace totals in the ag and superblock. */ - pag = xfs_perag_get(mp, agno); - error = xfs_alloc_update_counters(tp, pag, agbp, len); - xfs_ag_resv_free_extent(pag, type, tp, len); - xfs_perag_put(pag); + error = xfs_alloc_update_counters(tp, agbp, len); + xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len); if (error) goto error0; @@ -2197,14 +2205,15 @@ xfs_free_ag_extent( */ /* - * Compute and fill in value of m_ag_maxlevels. + * Compute and fill in value of m_alloc_maxlevels. */ void xfs_alloc_compute_maxlevels( xfs_mount_t *mp) /* file system mount structure */ { - mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr, + mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr, (mp->m_sb.sb_agblocks + 1) / 2); + ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk()); } /* @@ -2262,16 +2271,16 @@ xfs_alloc_min_freelist( const uint8_t *levels = pag ? pag->pagf_levels : fake_levels; unsigned int min_free; - ASSERT(mp->m_ag_maxlevels > 0); + ASSERT(mp->m_alloc_maxlevels > 0); /* space needed by-bno freespace btree */ min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1, - mp->m_ag_maxlevels); + mp->m_alloc_maxlevels); /* space needed by-size freespace btree */ min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1, - mp->m_ag_maxlevels); + mp->m_alloc_maxlevels); /* space needed reverse mapping used space btree */ - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (xfs_has_rmapbt(mp)) min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1, mp->m_rmap_maxlevels); @@ -2380,7 +2389,7 @@ xfs_agfl_needs_reset( int active; /* no agfl header on v4 supers */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return false; /* @@ -2424,7 +2433,7 @@ xfs_agfl_reset( struct xfs_perag *pag) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_agf *agf = agbp->b_addr; ASSERT(pag->pagf_agflreset); trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_); @@ -2446,7 +2455,7 @@ xfs_agfl_reset( /* * Defer an AGFL block free. This is effectively equivalent to - * xfs_bmap_add_free() with some special handling particular to AGFL blocks. + * xfs_free_extent_later() with some special handling particular to AGFL blocks. * * Deferring AGFL frees helps prevent log reservation overruns due to too many * allocation operations in a transaction. AGFL frees are prone to this problem @@ -2465,13 +2474,14 @@ xfs_defer_agfl_block( struct xfs_mount *mp = tp->t_mountp; struct xfs_extent_free_item *new; /* new element */ - ASSERT(xfs_bmap_free_item_zone != NULL); + ASSERT(xfs_extfree_item_cache != NULL); ASSERT(oinfo != NULL); - new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0); + new = kmem_cache_zalloc(xfs_extfree_item_cache, + GFP_KERNEL | __GFP_NOFAIL); new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); new->xefi_blockcount = 1; - new->xefi_oinfo = *oinfo; + new->xefi_owner = oinfo->oi_owner; trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); @@ -2479,6 +2489,101 @@ xfs_defer_agfl_block( } /* + * Add the extent to the list of extents to be free at transaction end. + * The list is maintained sorted (by block number). + */ +void +__xfs_free_extent_later( + struct xfs_trans *tp, + xfs_fsblock_t bno, + xfs_filblks_t len, + const struct xfs_owner_info *oinfo, + bool skip_discard) +{ + struct xfs_extent_free_item *new; /* new element */ +#ifdef DEBUG + struct xfs_mount *mp = tp->t_mountp; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + + ASSERT(bno != NULLFSBLOCK); + ASSERT(len > 0); + ASSERT(len <= XFS_MAX_BMBT_EXTLEN); + ASSERT(!isnullstartblock(bno)); + agno = XFS_FSB_TO_AGNO(mp, bno); + agbno = XFS_FSB_TO_AGBNO(mp, bno); + ASSERT(agno < mp->m_sb.sb_agcount); + ASSERT(agbno < mp->m_sb.sb_agblocks); + ASSERT(len < mp->m_sb.sb_agblocks); + ASSERT(agbno + len <= mp->m_sb.sb_agblocks); +#endif + ASSERT(xfs_extfree_item_cache != NULL); + + new = kmem_cache_zalloc(xfs_extfree_item_cache, + GFP_KERNEL | __GFP_NOFAIL); + new->xefi_startblock = bno; + new->xefi_blockcount = (xfs_extlen_t)len; + if (skip_discard) + new->xefi_flags |= XFS_EFI_SKIP_DISCARD; + if (oinfo) { + ASSERT(oinfo->oi_offset == 0); + + if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK) + new->xefi_flags |= XFS_EFI_ATTR_FORK; + if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK) + new->xefi_flags |= XFS_EFI_BMBT_BLOCK; + new->xefi_owner = oinfo->oi_owner; + } else { + new->xefi_owner = XFS_RMAP_OWN_NULL; + } + trace_xfs_bmap_free_defer(tp->t_mountp, + XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0, + XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); + xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list); +} + +#ifdef DEBUG +/* + * Check if an AGF has a free extent record whose length is equal to + * args->minlen. + */ +STATIC int +xfs_exact_minlen_extent_available( + struct xfs_alloc_arg *args, + struct xfs_buf *agbp, + int *stat) +{ + struct xfs_btree_cur *cnt_cur; + xfs_agblock_t fbno; + xfs_extlen_t flen; + int error = 0; + + cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp, + args->pag, XFS_BTNUM_CNT); + error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat); + if (error) + goto out; + + if (*stat == 0) { + error = -EFSCORRUPTED; + goto out; + } + + error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat); + if (error) + goto out; + + if (*stat == 1 && flen != args->minlen) + *stat = 0; + +out: + xfs_btree_del_cursor(cnt_cur, error); + + return error; +} +#endif + +/* * Decide whether to use this allocation group for this allocation. * If so, fix up the btree freelist's size. */ @@ -2501,7 +2606,7 @@ xfs_alloc_fix_freelist( ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); if (!pag->pagf_init) { - error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); + error = xfs_alloc_read_agf(pag, tp, flags, &agbp); if (error) { /* Couldn't lock the AGF so skip this AG. */ if (error == -EAGAIN) @@ -2531,7 +2636,7 @@ xfs_alloc_fix_freelist( * Can fail if we're not blocking on locks, and it's held. */ if (!agbp) { - error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); + error = xfs_alloc_read_agf(pag, tp, flags, &agbp); if (error) { /* Couldn't lock the AGF so skip this AG. */ if (error == -EAGAIN) @@ -2549,6 +2654,15 @@ xfs_alloc_fix_freelist( if (!xfs_alloc_space_available(args, need, flags)) goto out_agbp_relse; +#ifdef DEBUG + if (args->alloc_minlen_only) { + int stat; + + error = xfs_exact_minlen_extent_available(args, agbp, &stat); + if (error || !stat) + goto out_agbp_relse; + } +#endif /* * Make the freelist shorter if it's too long. * @@ -2580,7 +2694,7 @@ xfs_alloc_fix_freelist( else targs.oinfo = XFS_RMAP_OINFO_AG; while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) { - error = xfs_alloc_get_freelist(tp, agbp, &bno, 0); + error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0); if (error) goto out_agbp_relse; @@ -2595,7 +2709,7 @@ xfs_alloc_fix_freelist( targs.alignment = targs.minlen = targs.prod = 1; targs.type = XFS_ALLOCTYPE_THIS_AG; targs.pag = pag; - error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp); + error = xfs_alloc_read_agfl(pag, tp, &agflbp); if (error) goto out_agbp_relse; @@ -2624,7 +2738,7 @@ xfs_alloc_fix_freelist( * Put each allocated block on the list. */ for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) { - error = xfs_alloc_put_freelist(tp, agbp, + error = xfs_alloc_put_freelist(pag, tp, agbp, agflbp, bno, 0); if (error) goto out_agflbp_relse; @@ -2648,26 +2762,25 @@ out_no_agbp: * Get a block from the freelist. * Returns with the buffer for the block gotten. */ -int /* error */ +int xfs_alloc_get_freelist( - xfs_trans_t *tp, /* transaction pointer */ - xfs_buf_t *agbp, /* buffer containing the agf structure */ - xfs_agblock_t *bnop, /* block address retrieved from freelist */ - int btreeblk) /* destination is a AGF btree */ + struct xfs_perag *pag, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agblock_t *bnop, + int btreeblk) { - xfs_agf_t *agf; /* a.g. freespace structure */ - xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */ - xfs_agblock_t bno; /* block number returned */ - __be32 *agfl_bno; - int error; - int logflags; - xfs_mount_t *mp = tp->t_mountp; - xfs_perag_t *pag; /* per allocation group data */ + struct xfs_agf *agf = agbp->b_addr; + struct xfs_buf *agflbp; + xfs_agblock_t bno; + __be32 *agfl_bno; + int error; + uint32_t logflags; + struct xfs_mount *mp = tp->t_mountp; /* * Freelist is empty, give up. */ - agf = XFS_BUF_TO_AGF(agbp); if (!agf->agf_flcount) { *bnop = NULLAGBLOCK; return 0; @@ -2675,8 +2788,7 @@ xfs_alloc_get_freelist( /* * Read the array of free blocks. */ - error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno), - &agflbp); + error = xfs_alloc_read_agfl(pag, tp, &agflbp); if (error) return error; @@ -2684,17 +2796,15 @@ xfs_alloc_get_freelist( /* * Get the block number and update the data structures. */ - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + agfl_bno = xfs_buf_to_agfl_bno(agflbp); bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]); be32_add_cpu(&agf->agf_flfirst, 1); xfs_trans_brelse(tp, agflbp); if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp)) agf->agf_flfirst = 0; - pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); ASSERT(!pag->pagf_agflreset); be32_add_cpu(&agf->agf_flcount, -1); - xfs_trans_agflist_delta(tp, -1); pag->pagf_flcount--; logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT; @@ -2703,7 +2813,6 @@ xfs_alloc_get_freelist( pag->pagf_btreeblks++; logflags |= XFS_AGF_BTREEBLKS; } - xfs_perag_put(pag); xfs_alloc_log_agf(tp, agbp, logflags); *bnop = bno; @@ -2716,9 +2825,9 @@ xfs_alloc_get_freelist( */ void xfs_alloc_log_agf( - xfs_trans_t *tp, /* transaction pointer */ - xfs_buf_t *bp, /* buffer for a.g. freelist header */ - int fields) /* mask of fields to be logged (XFS_AGF_...) */ + struct xfs_trans *tp, + struct xfs_buf *bp, + uint32_t fields) { int first; /* first byte offset */ int last; /* last byte offset */ @@ -2745,7 +2854,7 @@ xfs_alloc_log_agf( sizeof(xfs_agf_t) }; - trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_); + trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF); @@ -2754,58 +2863,37 @@ xfs_alloc_log_agf( } /* - * Interface for inode allocation to force the pag data to be initialized. - */ -int /* error */ -xfs_alloc_pagf_init( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - int flags) /* XFS_ALLOC_FLAGS_... */ -{ - xfs_buf_t *bp; - int error; - - error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp); - if (!error) - xfs_trans_brelse(tp, bp); - return error; -} - -/* * Put the block on the freelist for the allocation group. */ -int /* error */ +int xfs_alloc_put_freelist( - xfs_trans_t *tp, /* transaction pointer */ - xfs_buf_t *agbp, /* buffer for a.g. freelist header */ - xfs_buf_t *agflbp,/* buffer for a.g. free block array */ - xfs_agblock_t bno, /* block being freed */ - int btreeblk) /* block came from a AGF btree */ + struct xfs_perag *pag, + struct xfs_trans *tp, + struct xfs_buf *agbp, + struct xfs_buf *agflbp, + xfs_agblock_t bno, + int btreeblk) { - xfs_agf_t *agf; /* a.g. freespace structure */ - __be32 *blockp;/* pointer to array entry */ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agf *agf = agbp->b_addr; + __be32 *blockp; int error; - int logflags; - xfs_mount_t *mp; /* mount structure */ - xfs_perag_t *pag; /* per allocation group data */ + uint32_t logflags; __be32 *agfl_bno; int startoff; - agf = XFS_BUF_TO_AGF(agbp); - mp = tp->t_mountp; + if (!agflbp) { + error = xfs_alloc_read_agfl(pag, tp, &agflbp); + if (error) + return error; + } - if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp, - be32_to_cpu(agf->agf_seqno), &agflbp))) - return error; be32_add_cpu(&agf->agf_fllast, 1); if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp)) agf->agf_fllast = 0; - pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); ASSERT(!pag->pagf_agflreset); be32_add_cpu(&agf->agf_flcount, 1); - xfs_trans_agflist_delta(tp, 1); pag->pagf_flcount++; logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT; @@ -2814,13 +2902,12 @@ xfs_alloc_put_freelist( pag->pagf_btreeblks--; logflags |= XFS_AGF_BTREEBLKS; } - xfs_perag_put(pag); xfs_alloc_log_agf(tp, agbp, logflags); ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)); - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + agfl_bno = xfs_buf_to_agfl_bno(agflbp); blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)]; *blockp = cpu_to_be32(bno); startoff = (char *)blockp - (char *)agflbp->b_addr; @@ -2838,13 +2925,12 @@ xfs_agf_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; - struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); + struct xfs_agf *agf = bp->b_addr; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (!xfs_log_check_lsn(mp, - be64_to_cpu(XFS_BUF_TO_AGF(bp)->agf_lsn))) + if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn))) return __this_address; } @@ -2858,15 +2944,29 @@ xfs_agf_verify( be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp))) return __this_address; + if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks) + return __this_address; + + if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) || + be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length)) + return __this_address; + if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 || - be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS || - be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS) + be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > + mp->m_alloc_maxlevels || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > + mp->m_alloc_maxlevels) return __this_address; - if (xfs_sb_version_hasrmapbt(&mp->m_sb) && + if (xfs_has_rmapbt(mp) && (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || - be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS)) + be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > + mp->m_rmap_maxlevels)) + return __this_address; + + if (xfs_has_rmapbt(mp) && + be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length)) return __this_address; /* @@ -2878,13 +2978,18 @@ xfs_agf_verify( if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno) return __this_address; - if (xfs_sb_version_haslazysbcount(&mp->m_sb) && + if (xfs_has_lazysbcount(mp) && be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length)) return __this_address; - if (xfs_sb_version_hasreflink(&mp->m_sb) && + if (xfs_has_reflink(mp) && + be32_to_cpu(agf->agf_refcount_blocks) > + be32_to_cpu(agf->agf_length)) + return __this_address; + + if (xfs_has_reflink(mp) && (be32_to_cpu(agf->agf_refcount_level) < 1 || - be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS)) + be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels)) return __this_address; return NULL; @@ -2898,7 +3003,7 @@ xfs_agf_read_verify( struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; - if (xfs_sb_version_hascrc(&mp->m_sb) && + if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { @@ -2914,6 +3019,7 @@ xfs_agf_write_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; + struct xfs_agf *agf = bp->b_addr; xfs_failaddr_t fa; fa = xfs_agf_verify(bp); @@ -2922,11 +3028,11 @@ xfs_agf_write_verify( return; } - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (bip) - XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); + agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF); } @@ -2942,60 +3048,57 @@ const struct xfs_buf_ops xfs_agf_buf_ops = { /* * Read in the allocation group header (free/alloc section). */ -int /* error */ +int xfs_read_agf( - struct xfs_mount *mp, /* mount point structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - int flags, /* XFS_BUF_ */ - struct xfs_buf **bpp) /* buffer for the ag freelist header */ + struct xfs_perag *pag, + struct xfs_trans *tp, + int flags, + struct xfs_buf **agfbpp) { - int error; + struct xfs_mount *mp = pag->pag_mount; + int error; - trace_xfs_read_agf(mp, agno); + trace_xfs_read_agf(pag->pag_mount, pag->pag_agno); - ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops); + XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops); if (error) return error; - ASSERT(!(*bpp)->b_error); - xfs_buf_set_ref(*bpp, XFS_AGF_REF); + xfs_buf_set_ref(*agfbpp, XFS_AGF_REF); return 0; } /* - * Read in the allocation group header (free/alloc section). + * Read in the allocation group header (free/alloc section) and initialise the + * perag structure if necessary. If the caller provides @agfbpp, then return the + * locked buffer to the caller, otherwise free it. */ -int /* error */ +int xfs_alloc_read_agf( - struct xfs_mount *mp, /* mount point structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - int flags, /* XFS_ALLOC_FLAG_... */ - struct xfs_buf **bpp) /* buffer for the ag freelist header */ + struct xfs_perag *pag, + struct xfs_trans *tp, + int flags, + struct xfs_buf **agfbpp) { - struct xfs_agf *agf; /* ag freelist header */ - struct xfs_perag *pag; /* per allocation group data */ + struct xfs_buf *agfbp; + struct xfs_agf *agf; int error; + int allocbt_blks; - trace_xfs_alloc_read_agf(mp, agno); + trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno); /* We don't support trylock when freeing. */ ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) != (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)); - ASSERT(agno != NULLAGNUMBER); - error = xfs_read_agf(mp, tp, agno, + error = xfs_read_agf(pag, tp, (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, - bpp); + &agfbp); if (error) return error; - ASSERT(!(*bpp)->b_error); - agf = XFS_BUF_TO_AGF(*bpp); - pag = xfs_perag_get(mp, agno); + agf = agfbp->b_addr; if (!pag->pagf_init) { pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); @@ -3009,10 +3112,24 @@ xfs_alloc_read_agf( be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); pag->pagf_init = 1; - pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf); + pag->pagf_agflreset = xfs_agfl_needs_reset(pag->pag_mount, agf); + + /* + * Update the in-core allocbt counter. Filter out the rmapbt + * subset of the btreeblks counter because the rmapbt is managed + * by perag reservation. Subtract one for the rmapbt root block + * because the rmap counter includes it while the btreeblks + * counter only tracks non-root blocks. + */ + allocbt_blks = pag->pagf_btreeblks; + if (xfs_has_rmapbt(pag->pag_mount)) + allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1; + if (allocbt_blks > 0) + atomic64_add(allocbt_blks, + &pag->pag_mount->m_allocbt_blks); } #ifdef DEBUG - else if (!XFS_FORCED_SHUTDOWN(mp)) { + else if (!xfs_is_shutdown(pag->pag_mount)) { ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); @@ -3023,7 +3140,10 @@ xfs_alloc_read_agf( be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); } #endif - xfs_perag_put(pag); + if (agfbpp) + *agfbpp = agfbp; + else + xfs_trans_brelse(tp, agfbp); return 0; } @@ -3100,7 +3220,7 @@ xfs_alloc_vextent( * the first a.g. fails. */ if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) && - (mp->m_flags & XFS_MOUNT_32BITINODES)) { + xfs_is_inode32(mp)) { args->fsbno = XFS_AGB_TO_FSB(mp, ((mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount), 0); @@ -3108,7 +3228,7 @@ xfs_alloc_vextent( } args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); args->type = XFS_ALLOCTYPE_NEAR_BNO; - /* FALLTHROUGH */ + fallthrough; case XFS_ALLOCTYPE_FIRST_AG: /* * Rotate through the allocation groups looking for a winner. @@ -3226,7 +3346,7 @@ error0: int xfs_free_extent_fix_freelist( struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, struct xfs_buf **agbp) { struct xfs_alloc_arg args; @@ -3235,7 +3355,8 @@ xfs_free_extent_fix_freelist( memset(&args, 0, sizeof(struct xfs_alloc_arg)); args.tp = tp; args.mp = tp->t_mountp; - args.agno = agno; + args.agno = pag->pag_agno; + args.pag = pag; /* * validate that the block number is legal - the enables us to detect @@ -3244,17 +3365,12 @@ xfs_free_extent_fix_freelist( if (args.agno >= args.mp->m_sb.sb_agcount) return -EFSCORRUPTED; - args.pag = xfs_perag_get(args.mp, args.agno); - ASSERT(args.pag); - error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); if (error) - goto out; + return error; *agbp = args.agbp; -out: - xfs_perag_put(args.pag); - return error; + return 0; } /* @@ -3275,8 +3391,10 @@ __xfs_free_extent( struct xfs_buf *agbp; xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno); xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno); + struct xfs_agf *agf; int error; unsigned int busy_flags = 0; + struct xfs_perag *pag; ASSERT(len != 0); ASSERT(type != XFS_AG_RESV_AGFL); @@ -3285,34 +3403,37 @@ __xfs_free_extent( XFS_ERRTAG_FREE_EXTENT)) return -EIO; - error = xfs_free_extent_fix_freelist(tp, agno, &agbp); + pag = xfs_perag_get(mp, agno); + error = xfs_free_extent_fix_freelist(tp, pag, &agbp); if (error) - return error; + goto err; + agf = agbp->b_addr; if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) { error = -EFSCORRUPTED; - goto err; + goto err_release; } /* validate the extent size is legal now we have the agf locked */ - if (XFS_IS_CORRUPT(mp, - agbno + len > - be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length))) { + if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) { error = -EFSCORRUPTED; - goto err; + goto err_release; } error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type); if (error) - goto err; + goto err_release; if (skip_discard) busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD; - xfs_extent_busy_insert(tp, agno, agbno, len, busy_flags); + xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags); + xfs_perag_put(pag); return 0; -err: +err_release: xfs_trans_brelse(tp, agbp); +err: + xfs_perag_put(pag); return error; } @@ -3325,7 +3446,7 @@ struct xfs_alloc_query_range_info { STATIC int xfs_alloc_query_range_helper( struct xfs_btree_cur *cur, - union xfs_btree_rec *rec, + const union xfs_btree_rec *rec, void *priv) { struct xfs_alloc_query_range_info *query = priv; @@ -3340,8 +3461,8 @@ xfs_alloc_query_range_helper( int xfs_alloc_query_range( struct xfs_btree_cur *cur, - struct xfs_alloc_rec_incore *low_rec, - struct xfs_alloc_rec_incore *high_rec, + const struct xfs_alloc_rec_incore *low_rec, + const struct xfs_alloc_rec_incore *high_rec, xfs_alloc_query_range_fn fn, void *priv) { @@ -3408,7 +3529,7 @@ xfs_agfl_walk( unsigned int i; int error; - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + agfl_bno = xfs_buf_to_agfl_bno(agflbp); i = be32_to_cpu(agf->agf_flfirst); /* Nothing to walk in an empty AGFL. */ @@ -3428,3 +3549,20 @@ xfs_agfl_walk( return 0; } + +int __init +xfs_extfree_intent_init_cache(void) +{ + xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent", + sizeof(struct xfs_extent_free_item), + 0, 0, NULL); + + return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM; +} + +void +xfs_extfree_intent_destroy_cache(void) +{ + kmem_cache_destroy(xfs_extfree_item_cache); + xfs_extfree_item_cache = NULL; +} diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 7380fbe4a3ff..2c3f762dfb58 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * All Rights Reserved. @@ -75,6 +75,9 @@ typedef struct xfs_alloc_arg { char wasfromfl; /* set if allocation is from freelist */ struct xfs_owner_info oinfo; /* owner of blocks being allocated */ enum xfs_ag_resv_type resv; /* block reservation to use */ +#ifdef DEBUG + bool alloc_minlen_only; /* allocate exact minlen extent */ +#endif } xfs_alloc_arg_t; /* @@ -85,7 +88,6 @@ typedef struct xfs_alloc_arg { #define XFS_ALLOC_NOBUSY (1 << 2)/* Busy extents not allowed */ /* freespace limit calculations */ -#define XFS_ALLOC_AGFL_RESERVE 4 unsigned int xfs_alloc_set_aside(struct xfs_mount *mp); unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp); @@ -93,65 +95,27 @@ xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_perag *pag, xfs_extlen_t need, xfs_extlen_t reserved); unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp, struct xfs_perag *pag); +int xfs_alloc_get_freelist(struct xfs_perag *pag, struct xfs_trans *tp, + struct xfs_buf *agfbp, xfs_agblock_t *bnop, int btreeblk); +int xfs_alloc_put_freelist(struct xfs_perag *pag, struct xfs_trans *tp, + struct xfs_buf *agfbp, struct xfs_buf *agflbp, + xfs_agblock_t bno, int btreeblk); /* - * Compute and fill in value of m_ag_maxlevels. + * Compute and fill in value of m_alloc_maxlevels. */ void xfs_alloc_compute_maxlevels( struct xfs_mount *mp); /* file system mount structure */ /* - * Get a block from the freelist. - * Returns with the buffer for the block gotten. - */ -int /* error */ -xfs_alloc_get_freelist( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_buf *agbp, /* buffer containing the agf structure */ - xfs_agblock_t *bnop, /* block address retrieved from freelist */ - int btreeblk); /* destination is a AGF btree */ - -/* * Log the given fields from the agf structure. */ void xfs_alloc_log_agf( struct xfs_trans *tp, /* transaction pointer */ struct xfs_buf *bp, /* buffer for a.g. freelist header */ - int fields);/* mask of fields to be logged (XFS_AGF_...) */ - -/* - * Interface for inode allocation to force the pag data to be initialized. - */ -int /* error */ -xfs_alloc_pagf_init( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - int flags); /* XFS_ALLOC_FLAGS_... */ - -/* - * Put the block on the freelist for the allocation group. - */ -int /* error */ -xfs_alloc_put_freelist( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_buf *agbp, /* buffer for a.g. freelist header */ - struct xfs_buf *agflbp,/* buffer for a.g. free block array */ - xfs_agblock_t bno, /* block being freed */ - int btreeblk); /* owner was a AGF btree */ - -/* - * Read in the allocation group header (free/alloc section). - */ -int /* error */ -xfs_alloc_read_agf( - struct xfs_mount *mp, /* mount point structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - int flags, /* XFS_ALLOC_FLAG_... */ - struct xfs_buf **bpp); /* buffer for the ag freelist header */ + uint32_t fields);/* mask of fields to be logged (XFS_AGF_...) */ /* * Allocate an extent (variable-size). @@ -204,26 +168,28 @@ xfs_alloc_get_rec( xfs_extlen_t *len, /* output: length of extent */ int *stat); /* output: success/failure */ -int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); -int xfs_alloc_read_agfl(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, struct xfs_buf **bpp); +int xfs_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags, + struct xfs_buf **agfbpp); +int xfs_alloc_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags, + struct xfs_buf **agfbpp); +int xfs_alloc_read_agfl(struct xfs_perag *pag, struct xfs_trans *tp, + struct xfs_buf **bpp); int xfs_free_agfl_block(struct xfs_trans *, xfs_agnumber_t, xfs_agblock_t, struct xfs_buf *, struct xfs_owner_info *); int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags); -int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno, +int xfs_free_extent_fix_freelist(struct xfs_trans *tp, struct xfs_perag *pag, struct xfs_buf **agbp); xfs_extlen_t xfs_prealloc_blocks(struct xfs_mount *mp); typedef int (*xfs_alloc_query_range_fn)( - struct xfs_btree_cur *cur, - struct xfs_alloc_rec_incore *rec, - void *priv); + struct xfs_btree_cur *cur, + const struct xfs_alloc_rec_incore *rec, + void *priv); int xfs_alloc_query_range(struct xfs_btree_cur *cur, - struct xfs_alloc_rec_incore *low_rec, - struct xfs_alloc_rec_incore *high_rec, + const struct xfs_alloc_rec_incore *low_rec, + const struct xfs_alloc_rec_incore *high_rec, xfs_alloc_query_range_fn fn, void *priv); int xfs_alloc_query_all(struct xfs_btree_cur *cur, xfs_alloc_query_range_fn fn, void *priv); @@ -236,4 +202,49 @@ typedef int (*xfs_agfl_walk_fn)(struct xfs_mount *mp, xfs_agblock_t bno, int xfs_agfl_walk(struct xfs_mount *mp, struct xfs_agf *agf, struct xfs_buf *agflbp, xfs_agfl_walk_fn walk_fn, void *priv); +static inline __be32 * +xfs_buf_to_agfl_bno( + struct xfs_buf *bp) +{ + if (xfs_has_crc(bp->b_mount)) + return bp->b_addr + sizeof(struct xfs_agfl); + return bp->b_addr; +} + +void __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, + xfs_filblks_t len, const struct xfs_owner_info *oinfo, + bool skip_discard); + +/* + * List of extents to be free "later". + * The list is kept sorted on xbf_startblock. + */ +struct xfs_extent_free_item { + struct list_head xefi_list; + uint64_t xefi_owner; + xfs_fsblock_t xefi_startblock;/* starting fs block number */ + xfs_extlen_t xefi_blockcount;/* number of blocks in extent */ + unsigned int xefi_flags; +}; + +#define XFS_EFI_SKIP_DISCARD (1U << 0) /* don't issue discard */ +#define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */ +#define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */ + +static inline void +xfs_free_extent_later( + struct xfs_trans *tp, + xfs_fsblock_t bno, + xfs_filblks_t len, + const struct xfs_owner_info *oinfo) +{ + __xfs_free_extent_later(tp, bno, len, oinfo, false); +} + + +extern struct kmem_cache *xfs_extfree_item_cache; + +int __init xfs_extfree_intent_init_cache(void); +void xfs_extfree_intent_destroy_cache(void); + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 279694d73e4e..549a3cba0234 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -9,61 +9,59 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "xfs_alloc_btree.h" #include "xfs_alloc.h" #include "xfs_extent_busy.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_trans.h" +#include "xfs_ag.h" +static struct kmem_cache *xfs_allocbt_cur_cache; STATIC struct xfs_btree_cur * xfs_allocbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agbp, cur->bc_private.a.agno, - cur->bc_btnum); + cur->bc_ag.agbp, cur->bc_ag.pag, cur->bc_btnum); } STATIC void xfs_allocbt_set_root( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr, - int inc) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int inc) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); - xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; int btnum = cur->bc_btnum; - struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); ASSERT(ptr->s != 0); agf->agf_roots[btnum] = ptr->s; be32_add_cpu(&agf->agf_levels[btnum], inc); - pag->pagf_levels[btnum] += inc; - xfs_perag_put(pag); + cur->bc_ag.pag->pagf_levels[btnum] += inc; xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); } STATIC int xfs_allocbt_alloc_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *start, - union xfs_btree_ptr *new, - int *stat) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) { int error; xfs_agblock_t bno; /* Allocate the new block from the freelist. If we can't, give up. */ - error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, - &bno, 1); + error = xfs_alloc_get_freelist(cur->bc_ag.pag, cur->bc_tp, + cur->bc_ag.agbp, &bno, 1); if (error) return error; @@ -72,9 +70,9 @@ xfs_allocbt_alloc_block( return 0; } - xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false); + atomic64_inc(&cur->bc_mp->m_allocbt_blks); + xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.pag, bno, 1, false); - xfs_trans_agbtree_delta(cur->bc_tp, 1); new->s = cpu_to_be32(bno); *stat = 1; @@ -86,19 +84,19 @@ xfs_allocbt_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; xfs_agblock_t bno; int error; - bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); - error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); + bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp)); + error = xfs_alloc_put_freelist(cur->bc_ag.pag, cur->bc_tp, agbp, NULL, + bno, 1); if (error) return error; - xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, + atomic64_dec(&cur->bc_mp->m_allocbt_blks); + xfs_extent_busy_insert(cur->bc_tp, agbp->b_pag, bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); - xfs_trans_agbtree_delta(cur->bc_tp, -1); return 0; } @@ -107,14 +105,13 @@ xfs_allocbt_free_block( */ STATIC void xfs_allocbt_update_lastrec( - struct xfs_btree_cur *cur, - struct xfs_btree_block *block, - union xfs_btree_rec *rec, - int ptr, - int reason) + struct xfs_btree_cur *cur, + const struct xfs_btree_block *block, + const union xfs_btree_rec *rec, + int ptr, + int reason) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); - xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); + struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; struct xfs_perag *pag; __be32 len; int numrecs; @@ -159,10 +156,9 @@ xfs_allocbt_update_lastrec( } agf->agf_longest = len; - pag = xfs_perag_get(cur->bc_mp, seqno); + pag = cur->bc_ag.agbp->b_pag; pag->pagf_longest = be32_to_cpu(len); - xfs_perag_put(pag); - xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST); + xfs_alloc_log_agf(cur->bc_tp, cur->bc_ag.agbp, XFS_AGF_LONGEST); } STATIC int @@ -183,8 +179,8 @@ xfs_allocbt_get_maxrecs( STATIC void xfs_allocbt_init_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { key->alloc.ar_startblock = rec->alloc.ar_startblock; key->alloc.ar_blockcount = rec->alloc.ar_blockcount; @@ -192,10 +188,10 @@ xfs_allocbt_init_key_from_rec( STATIC void xfs_bnobt_init_high_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { - __u32 x; + __u32 x; x = be32_to_cpu(rec->alloc.ar_startblock); x += be32_to_cpu(rec->alloc.ar_blockcount) - 1; @@ -205,8 +201,8 @@ xfs_bnobt_init_high_key_from_rec( STATIC void xfs_cntbt_init_high_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { key->alloc.ar_blockcount = rec->alloc.ar_blockcount; key->alloc.ar_startblock = 0; @@ -226,32 +222,32 @@ xfs_allocbt_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_roots[cur->bc_btnum]; } STATIC int64_t xfs_bnobt_key_diff( - struct xfs_btree_cur *cur, - union xfs_btree_key *key) + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) { - xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a; - xfs_alloc_key_t *kp = &key->alloc; + struct xfs_alloc_rec_incore *rec = &cur->bc_rec.a; + const struct xfs_alloc_rec *kp = &key->alloc; return (int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; } STATIC int64_t xfs_cntbt_key_diff( - struct xfs_btree_cur *cur, - union xfs_btree_key *key) + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) { - xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a; - xfs_alloc_key_t *kp = &key->alloc; - int64_t diff; + struct xfs_alloc_rec_incore *rec = &cur->bc_rec.a; + const struct xfs_alloc_rec *kp = &key->alloc; + int64_t diff; diff = (int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount; if (diff) @@ -262,9 +258,9 @@ xfs_cntbt_key_diff( STATIC int64_t xfs_bnobt_diff_two_keys( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { return (int64_t)be32_to_cpu(k1->alloc.ar_startblock) - be32_to_cpu(k2->alloc.ar_startblock); @@ -272,11 +268,11 @@ xfs_bnobt_diff_two_keys( STATIC int64_t xfs_cntbt_diff_two_keys( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { - int64_t diff; + int64_t diff; diff = be32_to_cpu(k1->alloc.ar_blockcount) - be32_to_cpu(k2->alloc.ar_blockcount); @@ -301,7 +297,7 @@ xfs_allocbt_verify( if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { fa = xfs_btree_sblock_v5hdr_verify(bp); if (fa) return fa; @@ -322,7 +318,7 @@ xfs_allocbt_verify( if (pag && pag->pagf_init) { if (level >= pag->pagf_levels[btnum]) return __this_address; - } else if (level >= mp->m_ag_maxlevels) + } else if (level >= mp->m_alloc_maxlevels) return __this_address; return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]); @@ -382,9 +378,9 @@ const struct xfs_buf_ops xfs_cntbt_buf_ops = { STATIC int xfs_bnobt_keys_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { return be32_to_cpu(k1->alloc.ar_startblock) < be32_to_cpu(k2->alloc.ar_startblock); @@ -392,9 +388,9 @@ xfs_bnobt_keys_inorder( STATIC int xfs_bnobt_recs_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_rec *r1, - union xfs_btree_rec *r2) + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) { return be32_to_cpu(r1->alloc.ar_startblock) + be32_to_cpu(r1->alloc.ar_blockcount) <= @@ -403,9 +399,9 @@ xfs_bnobt_recs_inorder( STATIC int xfs_cntbt_keys_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { return be32_to_cpu(k1->alloc.ar_blockcount) < be32_to_cpu(k2->alloc.ar_blockcount) || @@ -416,9 +412,9 @@ xfs_cntbt_keys_inorder( STATIC int xfs_cntbt_recs_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_rec *r1, - union xfs_btree_rec *r2) + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) { return be32_to_cpu(r1->alloc.ar_blockcount) < be32_to_cpu(r2->alloc.ar_blockcount) || @@ -471,6 +467,41 @@ static const struct xfs_btree_ops xfs_cntbt_ops = { .recs_inorder = xfs_cntbt_recs_inorder, }; +/* Allocate most of a new allocation btree cursor. */ +STATIC struct xfs_btree_cur * +xfs_allocbt_init_common( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_perag *pag, + xfs_btnum_t btnum) +{ + struct xfs_btree_cur *cur; + + ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT); + + cur = xfs_btree_alloc_cursor(mp, tp, btnum, mp->m_alloc_maxlevels, + xfs_allocbt_cur_cache); + cur->bc_ag.abt.active = false; + + if (btnum == XFS_BTNUM_CNT) { + cur->bc_ops = &xfs_cntbt_ops; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2); + cur->bc_flags = XFS_BTREE_LASTREC_UPDATE; + } else { + cur->bc_ops = &xfs_bnobt_ops; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); + } + + /* take a reference for the cursor */ + atomic_inc(&pag->pag_ref); + cur->bc_ag.pag = pag; + + if (xfs_has_crc(mp)) + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; + + return cur; +} + /* * Allocate a new allocation btree cursor. */ @@ -479,43 +510,77 @@ xfs_allocbt_init_cursor( struct xfs_mount *mp, /* file system mount point */ struct xfs_trans *tp, /* transaction pointer */ struct xfs_buf *agbp, /* buffer for agf structure */ - xfs_agnumber_t agno, /* allocation group number */ + struct xfs_perag *pag, xfs_btnum_t btnum) /* btree identifier */ { - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_agf *agf = agbp->b_addr; struct xfs_btree_cur *cur; - ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT); - - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); - - cur->bc_tp = tp; - cur->bc_mp = mp; - cur->bc_btnum = btnum; - cur->bc_blocklog = mp->m_sb.sb_blocklog; - - if (btnum == XFS_BTNUM_CNT) { - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2); - cur->bc_ops = &xfs_cntbt_ops; + cur = xfs_allocbt_init_common(mp, tp, pag, btnum); + if (btnum == XFS_BTNUM_CNT) cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); - cur->bc_flags = XFS_BTREE_LASTREC_UPDATE; - } else { - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); - cur->bc_ops = &xfs_bnobt_ops; + else cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); - } - cur->bc_private.a.agbp = agbp; - cur->bc_private.a.agno = agno; - cur->bc_private.a.priv.abt.active = false; + cur->bc_ag.agbp = agbp; - if (xfs_sb_version_hascrc(&mp->m_sb)) - cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; + return cur; +} + +/* Create a free space btree cursor with a fake root for staging. */ +struct xfs_btree_cur * +xfs_allocbt_stage_cursor( + struct xfs_mount *mp, + struct xbtree_afakeroot *afake, + struct xfs_perag *pag, + xfs_btnum_t btnum) +{ + struct xfs_btree_cur *cur; + cur = xfs_allocbt_init_common(mp, NULL, pag, btnum); + xfs_btree_stage_afakeroot(cur, afake); return cur; } /* + * Install a new free space btree root. Caller is responsible for invalidating + * and freeing the old btree blocks. + */ +void +xfs_allocbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + struct xfs_buf *agbp) +{ + struct xfs_agf *agf = agbp->b_addr; + struct xbtree_afakeroot *afake = cur->bc_ag.afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root); + agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels); + xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); + + if (cur->bc_btnum == XFS_BTNUM_BNO) { + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_bnobt_ops); + } else { + cur->bc_flags |= XFS_BTREE_LASTREC_UPDATE; + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_cntbt_ops); + } +} + +/* Calculate number of records in an alloc btree block. */ +static inline unsigned int +xfs_allocbt_block_maxrecs( + unsigned int blocklen, + bool leaf) +{ + if (leaf) + return blocklen / sizeof(xfs_alloc_rec_t); + return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t)); +} + +/* * Calculate number of records in an alloc btree block. */ int @@ -525,10 +590,26 @@ xfs_allocbt_maxrecs( int leaf) { blocklen -= XFS_ALLOC_BLOCK_LEN(mp); + return xfs_allocbt_block_maxrecs(blocklen, leaf); +} - if (leaf) - return blocklen / sizeof(xfs_alloc_rec_t); - return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t)); +/* Free space btrees are at their largest when every other block is free. */ +#define XFS_MAX_FREESP_RECORDS ((XFS_MAX_AG_BLOCKS + 1) / 2) + +/* Compute the max possible height for free space btrees. */ +unsigned int +xfs_allocbt_maxlevels_ondisk(void) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = min(XFS_MIN_BLOCKSIZE - XFS_BTREE_SBLOCK_LEN, + XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN); + + minrecs[0] = xfs_allocbt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = xfs_allocbt_block_maxrecs(blocklen, false) / 2; + + return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_FREESP_RECORDS); } /* Calculate the freespace btree size for some records. */ @@ -539,3 +620,22 @@ xfs_allocbt_calc_size( { return xfs_btree_calc_size(mp->m_alloc_mnr, len); } + +int __init +xfs_allocbt_init_cur_cache(void) +{ + xfs_allocbt_cur_cache = kmem_cache_create("xfs_bnobt_cur", + xfs_btree_cur_sizeof(xfs_allocbt_maxlevels_ondisk()), + 0, 0, NULL); + + if (!xfs_allocbt_cur_cache) + return -ENOMEM; + return 0; +} + +void +xfs_allocbt_destroy_cur_cache(void) +{ + kmem_cache_destroy(xfs_allocbt_cur_cache); + xfs_allocbt_cur_cache = NULL; +} diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h index c9305ebb69f6..45df893ef6bb 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.h +++ b/fs/xfs/libxfs/xfs_alloc_btree.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2005 Silicon Graphics, Inc. * All Rights Reserved. @@ -13,12 +13,14 @@ struct xfs_buf; struct xfs_btree_cur; struct xfs_mount; +struct xfs_perag; +struct xbtree_afakeroot; /* * Btree block header size depends on a superblock flag. */ #define XFS_ALLOC_BLOCK_LEN(mp) \ - (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + (xfs_has_crc(((mp))) ? \ XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN) /* @@ -45,11 +47,22 @@ struct xfs_mount; (maxrecs) * sizeof(xfs_alloc_key_t) + \ ((index) - 1) * sizeof(xfs_alloc_ptr_t))) -extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *, - struct xfs_trans *, struct xfs_buf *, - xfs_agnumber_t, xfs_btnum_t); +extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *mp, + struct xfs_trans *tp, struct xfs_buf *bp, + struct xfs_perag *pag, xfs_btnum_t btnum); +struct xfs_btree_cur *xfs_allocbt_stage_cursor(struct xfs_mount *mp, + struct xbtree_afakeroot *afake, struct xfs_perag *pag, + xfs_btnum_t btnum); extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp, unsigned long long len); +void xfs_allocbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp, struct xfs_buf *agbp); + +unsigned int xfs_allocbt_maxlevels_ondisk(void); + +int __init xfs_allocbt_init_cur_cache(void); +void xfs_allocbt_destroy_cur_cache(void); + #endif /* __XFS_ALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index e6149720ce02..e28d93d232de 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -24,6 +24,10 @@ #include "xfs_quota.h" #include "xfs_trans_space.h" #include "xfs_trace.h" +#include "xfs_attr_item.h" +#include "xfs_xattr.h" + +struct kmem_cache *xfs_attr_intent_cache; /* * xfs_attr.c @@ -44,55 +48,169 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args); * Internal routines when attribute list is one block. */ STATIC int xfs_attr_leaf_get(xfs_da_args_t *args); -STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args); STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args); +STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp); +STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args); /* * Internal routines when attribute list is more than one block. */ STATIC int xfs_attr_node_get(xfs_da_args_t *args); -STATIC int xfs_attr_node_addname(xfs_da_args_t *args); -STATIC int xfs_attr_node_removename(xfs_da_args_t *args); -STATIC int xfs_attr_fillstate(xfs_da_state_t *state); -STATIC int xfs_attr_refillstate(xfs_da_state_t *state); +STATIC void xfs_attr_restore_rmt_blk(struct xfs_da_args *args); +static int xfs_attr_node_try_addname(struct xfs_attr_intent *attr); +STATIC int xfs_attr_node_addname_find_attr(struct xfs_attr_intent *attr); +STATIC int xfs_attr_node_remove_attr(struct xfs_attr_intent *attr); +STATIC int xfs_attr_node_lookup(struct xfs_da_args *args, + struct xfs_da_state *state); +int +xfs_inode_hasattr( + struct xfs_inode *ip) +{ + if (!xfs_inode_has_attr_fork(ip)) + return 0; + if (ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS && + ip->i_af.if_nextents == 0) + return 0; + return 1; +} -STATIC int -xfs_attr_args_init( - struct xfs_da_args *args, - struct xfs_inode *dp, - const unsigned char *name, - size_t namelen, - int flags) +/* + * Returns true if the there is exactly only block in the attr fork, in which + * case the attribute fork consists of a single leaf block entry. + */ +bool +xfs_attr_is_leaf( + struct xfs_inode *ip) { + struct xfs_ifork *ifp = &ip->i_af; + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec imap; - if (!name) - return -EINVAL; + if (ifp->if_nextents != 1 || ifp->if_format != XFS_DINODE_FMT_EXTENTS) + return false; - memset(args, 0, sizeof(*args)); - args->geo = dp->i_mount->m_attr_geo; - args->whichfork = XFS_ATTR_FORK; - args->dp = dp; - args->flags = flags; - args->name = name; - args->namelen = namelen; - if (args->namelen >= MAXNAMELEN) - return -EFAULT; /* match IRIX behaviour */ + xfs_iext_first(ifp, &icur); + xfs_iext_get_extent(ifp, &icur, &imap); + return imap.br_startoff == 0 && imap.br_blockcount == 1; +} + +/* + * XXX (dchinner): name path state saving and refilling is an optimisation to + * avoid needing to look up name entries after rolling transactions removing + * remote xattr blocks between the name entry lookup and name entry removal. + * This optimisation got sidelined when combining the set and remove state + * machines, but the code has been left in place because it is worthwhile to + * restore the optimisation once the combined state machine paths have settled. + * + * This comment is a public service announcement to remind Future Dave that he + * still needs to restore this code to working order. + */ +#if 0 +/* + * Fill in the disk block numbers in the state structure for the buffers + * that are attached to the state structure. + * This is done so that we can quickly reattach ourselves to those buffers + * after some set of transaction commits have released these buffers. + */ +static int +xfs_attr_fillstate(xfs_da_state_t *state) +{ + xfs_da_state_path_t *path; + xfs_da_state_blk_t *blk; + int level; + + trace_xfs_attr_fillstate(state->args); + + /* + * Roll down the "path" in the state structure, storing the on-disk + * block number for those buffers in the "path". + */ + path = &state->path; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->bp) { + blk->disk_blkno = xfs_buf_daddr(blk->bp); + blk->bp = NULL; + } else { + blk->disk_blkno = 0; + } + } + + /* + * Roll down the "altpath" in the state structure, storing the on-disk + * block number for those buffers in the "altpath". + */ + path = &state->altpath; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->bp) { + blk->disk_blkno = xfs_buf_daddr(blk->bp); + blk->bp = NULL; + } else { + blk->disk_blkno = 0; + } + } - args->hashval = xfs_da_hashname(args->name, args->namelen); return 0; } -int -xfs_inode_hasattr( - struct xfs_inode *ip) +/* + * Reattach the buffers to the state structure based on the disk block + * numbers stored in the state structure. + * This is done after some set of transaction commits have released those + * buffers from our grip. + */ +static int +xfs_attr_refillstate(xfs_da_state_t *state) { - if (!XFS_IFORK_Q(ip) || - (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_anextents == 0)) - return 0; - return 1; + xfs_da_state_path_t *path; + xfs_da_state_blk_t *blk; + int level, error; + + trace_xfs_attr_refillstate(state->args); + + /* + * Roll down the "path" in the state structure, storing the on-disk + * block number for those buffers in the "path". + */ + path = &state->path; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->disk_blkno) { + error = xfs_da3_node_read_mapped(state->args->trans, + state->args->dp, blk->disk_blkno, + &blk->bp, XFS_ATTR_FORK); + if (error) + return error; + } else { + blk->bp = NULL; + } + } + + /* + * Roll down the "altpath" in the state structure, storing the on-disk + * block number for those buffers in the "altpath". + */ + path = &state->altpath; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->disk_blkno) { + error = xfs_da3_node_read_mapped(state->args->trans, + state->args->dp, blk->disk_blkno, + &blk->bp, XFS_ATTR_FORK); + if (error) + return error; + } else { + blk->bp = NULL; + } + } + + return 0; } +#else +static int xfs_attr_fillstate(xfs_da_state_t *state) { return 0; } +#endif /*======================================================================== * Overall external interface routines. @@ -104,91 +222,66 @@ xfs_inode_hasattr( */ int xfs_attr_get_ilocked( - struct xfs_inode *ip, struct xfs_da_args *args) { - ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT(xfs_isilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - if (!xfs_inode_hasattr(ip)) + if (!xfs_inode_hasattr(args->dp)) return -ENOATTR; - else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) + + if (args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) return xfs_attr_shortform_getvalue(args); - else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) + if (xfs_attr_is_leaf(args->dp)) return xfs_attr_leaf_get(args); - else - return xfs_attr_node_get(args); + return xfs_attr_node_get(args); } /* * Retrieve an extended attribute by name, and its value if requested. * - * If ATTR_KERNOVAL is set in @flags, then the caller does not want the value, - * just an indication whether the attribute exists and the size of the value if - * it exists. The size is returned in @valuelenp, + * If args->valuelen is zero, then the caller does not want the value, just an + * indication whether the attribute exists and the size of the value if it + * exists. The size is returned in args.valuelen. * - * If the attribute is found, but exceeds the size limit set by the caller in - * @valuelenp, return -ERANGE with the size of the attribute that was found in - * @valuelenp. + * If args->value is NULL but args->valuelen is non-zero, allocate the buffer + * for the value after existence of the attribute has been determined. The + * caller always has to free args->value if it is set, no matter if this + * function was successful or not. * - * If ATTR_ALLOC is set in @flags, allocate the buffer for the value after - * existence of the attribute has been determined. On success, return that - * buffer to the caller and leave them to free it. On failure, free any - * allocated buffer and ensure the buffer pointer returned to the caller is - * null. + * If the attribute is found, but exceeds the size limit set by the caller in + * args->valuelen, return -ERANGE with the size of the attribute that was found + * in args->valuelen. */ int xfs_attr_get( - struct xfs_inode *ip, - const unsigned char *name, - size_t namelen, - unsigned char **value, - int *valuelenp, - int flags) + struct xfs_da_args *args) { - struct xfs_da_args args; uint lock_mode; int error; - ASSERT((flags & (ATTR_ALLOC | ATTR_KERNOVAL)) || *value); + XFS_STATS_INC(args->dp->i_mount, xs_attr_get); - XFS_STATS_INC(ip->i_mount, xs_attr_get); - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + if (xfs_is_shutdown(args->dp->i_mount)) return -EIO; - error = xfs_attr_args_init(&args, ip, name, namelen, flags); - if (error) - return error; + args->geo = args->dp->i_mount->m_attr_geo; + args->whichfork = XFS_ATTR_FORK; + args->hashval = xfs_da_hashname(args->name, args->namelen); /* Entirely possible to look up a name which doesn't exist */ - args.op_flags = XFS_DA_OP_OKNOENT; - if (flags & ATTR_ALLOC) - args.op_flags |= XFS_DA_OP_ALLOCVAL; - else - args.value = *value; - args.valuelen = *valuelenp; + args->op_flags = XFS_DA_OP_OKNOENT; - lock_mode = xfs_ilock_attr_map_shared(ip); - error = xfs_attr_get_ilocked(ip, &args); - xfs_iunlock(ip, lock_mode); - *valuelenp = args.valuelen; + lock_mode = xfs_ilock_attr_map_shared(args->dp); + error = xfs_attr_get_ilocked(args); + xfs_iunlock(args->dp, lock_mode); - /* on error, we have to clean up allocated value buffers */ - if (error) { - if (flags & ATTR_ALLOC) { - kmem_free(args.value); - *value = NULL; - } - return error; - } - *value = args.value; - return 0; + return error; } /* * Calculate how many blocks we need for the new attribute, */ -STATIC int +int xfs_attr_calc_size( struct xfs_da_args *args, int *local) @@ -221,14 +314,46 @@ xfs_attr_calc_size( return nblks; } +/* Initialize transaction reservation for attr operations */ +void +xfs_init_attr_trans( + struct xfs_da_args *args, + struct xfs_trans_res *tres, + unsigned int *total) +{ + struct xfs_mount *mp = args->dp->i_mount; + + if (args->value) { + tres->tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * + args->total; + tres->tr_logcount = XFS_ATTRSET_LOG_COUNT; + tres->tr_logflags = XFS_TRANS_PERM_LOG_RES; + *total = args->total; + } else { + *tres = M_RES(mp)->tr_attrrm; + *total = XFS_ATTRRM_SPACE_RES(mp); + } +} + +/* + * Add an attr to a shortform fork. If there is no space, + * xfs_attr_shortform_addname() will convert to leaf format and return -ENOSPC. + * to use. + */ STATIC int xfs_attr_try_sf_addname( struct xfs_inode *dp, struct xfs_da_args *args) { - struct xfs_mount *mp = dp->i_mount; - int error, error2; + int error; + + /* + * Build initial attribute list (if required). + */ + if (dp->i_af.if_format == XFS_DINODE_FMT_EXTENTS) + xfs_attr_shortform_create(args); error = xfs_attr_shortform_addname(args); if (error == -ENOSPC) @@ -238,326 +363,793 @@ xfs_attr_try_sf_addname( * Commit the shortform mods, and we're done. * NOTE: this is also the error path (EEXIST, etc). */ - if (!error && (args->flags & ATTR_KERNOTIME) == 0) + if (!error && !(args->op_flags & XFS_DA_OP_NOTIME)) xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG); - if (mp->m_flags & XFS_MOUNT_WSYNC) + if (xfs_has_wsync(dp->i_mount)) xfs_trans_set_sync(args->trans); - error2 = xfs_trans_commit(args->trans); - args->trans = NULL; - return error ? error : error2; + return error; +} + +static int +xfs_attr_sf_addname( + struct xfs_attr_intent *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_inode *dp = args->dp; + int error = 0; + + error = xfs_attr_try_sf_addname(dp, args); + if (error != -ENOSPC) { + ASSERT(!error || error == -EEXIST); + attr->xattri_dela_state = XFS_DAS_DONE; + goto out; + } + + /* + * It won't fit in the shortform, transform to a leaf block. GROT: + * another possible req'mt for a double-split btree op. + */ + error = xfs_attr_shortform_to_leaf(args); + if (error) + return error; + + attr->xattri_dela_state = XFS_DAS_LEAF_ADD; +out: + trace_xfs_attr_sf_addname_return(attr->xattri_dela_state, args->dp); + return error; } /* - * Set the attribute specified in @args. + * Handle the state change on completion of a multi-state attr operation. + * + * If the XFS_DA_OP_REPLACE flag is set, this means the operation was the first + * modification in a attr replace operation and we still have to do the second + * state, indicated by @replace_state. + * + * We consume the XFS_DA_OP_REPLACE flag so that when we are called again on + * completion of the second half of the attr replace operation we correctly + * signal that it is done. */ -int -xfs_attr_set_args( - struct xfs_da_args *args) +static enum xfs_delattr_state +xfs_attr_complete_op( + struct xfs_attr_intent *attr, + enum xfs_delattr_state replace_state) { - struct xfs_inode *dp = args->dp; - struct xfs_buf *leaf_bp = NULL; + struct xfs_da_args *args = attr->xattri_da_args; + bool do_replace = args->op_flags & XFS_DA_OP_REPLACE; + + args->op_flags &= ~XFS_DA_OP_REPLACE; + if (do_replace) { + args->attr_filter &= ~XFS_ATTR_INCOMPLETE; + return replace_state; + } + return XFS_DAS_DONE; +} + +static int +xfs_attr_leaf_addname( + struct xfs_attr_intent *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; int error; + ASSERT(xfs_attr_is_leaf(args->dp)); + /* - * If the attribute list is non-existent or a shortform list, - * upgrade it to a single-leaf-block attribute list. + * Use the leaf buffer we may already hold locked as a result of + * a sf-to-leaf conversion. */ - if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL || - (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - dp->i_d.di_anextents == 0)) { + error = xfs_attr_leaf_try_add(args); + + if (error == -ENOSPC) { + error = xfs_attr3_leaf_to_node(args); + if (error) + return error; /* - * Build initial attribute list (if required). + * We're not in leaf format anymore, so roll the transaction and + * retry the add to the newly allocated node block. */ - if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) - xfs_attr_shortform_create(args); + attr->xattri_dela_state = XFS_DAS_NODE_ADD; + goto out; + } + if (error) + return error; + + /* + * We need to commit and roll if we need to allocate remote xattr blocks + * or perform more xattr manipulations. Otherwise there is nothing more + * to do and we can return success. + */ + if (args->rmtblkno) + attr->xattri_dela_state = XFS_DAS_LEAF_SET_RMT; + else + attr->xattri_dela_state = xfs_attr_complete_op(attr, + XFS_DAS_LEAF_REPLACE); +out: + trace_xfs_attr_leaf_addname_return(attr->xattri_dela_state, args->dp); + return error; +} +/* + * Add an entry to a node format attr tree. + * + * Note that we might still have a leaf here - xfs_attr_is_leaf() cannot tell + * the difference between leaf + remote attr blocks and a node format tree, + * so we may still end up having to convert from leaf to node format here. + */ +static int +xfs_attr_node_addname( + struct xfs_attr_intent *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + int error; + + error = xfs_attr_node_addname_find_attr(attr); + if (error) + return error; + + error = xfs_attr_node_try_addname(attr); + if (error == -ENOSPC) { + error = xfs_attr3_leaf_to_node(args); + if (error) + return error; /* - * Try to add the attr to the attribute list in the inode. + * No state change, we really are in node form now + * but we need the transaction rolled to continue. */ - error = xfs_attr_try_sf_addname(dp, args); - if (error != -ENOSPC) + goto out; + } + if (error) + return error; + + if (args->rmtblkno) + attr->xattri_dela_state = XFS_DAS_NODE_SET_RMT; + else + attr->xattri_dela_state = xfs_attr_complete_op(attr, + XFS_DAS_NODE_REPLACE); +out: + trace_xfs_attr_node_addname_return(attr->xattri_dela_state, args->dp); + return error; +} + +static int +xfs_attr_rmtval_alloc( + struct xfs_attr_intent *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + int error = 0; + + /* + * If there was an out-of-line value, allocate the blocks we + * identified for its storage and copy the value. This is done + * after we create the attribute so that we don't overflow the + * maximum size of a transaction and/or hit a deadlock. + */ + if (attr->xattri_blkcnt > 0) { + error = xfs_attr_rmtval_set_blk(attr); + if (error) return error; + /* Roll the transaction only if there is more to allocate. */ + if (attr->xattri_blkcnt > 0) + goto out; + } + error = xfs_attr_rmtval_set_value(args); + if (error) + return error; + + attr->xattri_dela_state = xfs_attr_complete_op(attr, + ++attr->xattri_dela_state); + /* + * If we are not doing a rename, we've finished the operation but still + * have to clear the incomplete flag protecting the new attr from + * exposing partially initialised state if we crash during creation. + */ + if (attr->xattri_dela_state == XFS_DAS_DONE) + error = xfs_attr3_leaf_clearflag(args); +out: + trace_xfs_attr_rmtval_alloc(attr->xattri_dela_state, args->dp); + return error; +} + +/* + * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers + * for later deletion of the entry. + */ +static int +xfs_attr_leaf_mark_incomplete( + struct xfs_da_args *args, + struct xfs_da_state *state) +{ + int error; + + /* + * Fill in disk block numbers in the state structure + * so that we can get the buffers back after we commit + * several transactions in the following calls. + */ + error = xfs_attr_fillstate(state); + if (error) + return error; + + /* + * Mark the attribute as INCOMPLETE + */ + return xfs_attr3_leaf_setflag(args); +} + +/* Ensure the da state of an xattr deferred work item is ready to go. */ +static inline void +xfs_attr_item_init_da_state( + struct xfs_attr_intent *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + + if (!attr->xattri_da_state) + attr->xattri_da_state = xfs_da_state_alloc(args); + else + xfs_da_state_reset(attr->xattri_da_state, args); +} + +/* + * Initial setup for xfs_attr_node_removename. Make sure the attr is there and + * the blocks are valid. Attr keys with remote blocks will be marked + * incomplete. + */ +static +int xfs_attr_node_removename_setup( + struct xfs_attr_intent *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_da_state *state; + int error; + + xfs_attr_item_init_da_state(attr); + error = xfs_attr_node_lookup(args, attr->xattri_da_state); + if (error != -EEXIST) + goto out; + error = 0; + + state = attr->xattri_da_state; + ASSERT(state->path.blk[state->path.active - 1].bp != NULL); + ASSERT(state->path.blk[state->path.active - 1].magic == + XFS_ATTR_LEAF_MAGIC); + + error = xfs_attr_leaf_mark_incomplete(args, state); + if (error) + goto out; + if (args->rmtblkno > 0) + error = xfs_attr_rmtval_invalidate(args); +out: + if (error) { + xfs_da_state_free(attr->xattri_da_state); + attr->xattri_da_state = NULL; + } + + return error; +} + +/* + * Remove the original attr we have just replaced. This is dependent on the + * original lookup and insert placing the old attr in args->blkno/args->index + * and the new attr in args->blkno2/args->index2. + */ +static int +xfs_attr_leaf_remove_attr( + struct xfs_attr_intent *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_inode *dp = args->dp; + struct xfs_buf *bp = NULL; + int forkoff; + int error; + + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, + &bp); + if (error) + return error; + + xfs_attr3_leaf_remove(bp, args); + + forkoff = xfs_attr_shortform_allfit(bp, dp); + if (forkoff) + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ + + return error; +} + +/* + * Shrink an attribute from leaf to shortform. Used by the node format remove + * path when the node format collapses to a single block and so we have to check + * if it can be collapsed further. + */ +static int +xfs_attr_leaf_shrink( + struct xfs_da_args *args) +{ + struct xfs_inode *dp = args->dp; + struct xfs_buf *bp; + int forkoff; + int error; + + if (!xfs_attr_is_leaf(dp)) + return 0; + + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); + if (error) + return error; + + forkoff = xfs_attr_shortform_allfit(bp, dp); + if (forkoff) { + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ + } else { + xfs_trans_brelse(args->trans, bp); + } + + return error; +} + +/* + * Run the attribute operation specified in @attr. + * + * This routine is meant to function as a delayed operation and will set the + * state to XFS_DAS_DONE when the operation is complete. Calling functions will + * need to handle this, and recall the function until either an error or + * XFS_DAS_DONE is detected. + */ +int +xfs_attr_set_iter( + struct xfs_attr_intent *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + int error = 0; + + /* State machine switch */ +next_state: + switch (attr->xattri_dela_state) { + case XFS_DAS_UNINIT: + ASSERT(0); + return -EFSCORRUPTED; + case XFS_DAS_SF_ADD: + return xfs_attr_sf_addname(attr); + case XFS_DAS_LEAF_ADD: + return xfs_attr_leaf_addname(attr); + case XFS_DAS_NODE_ADD: + return xfs_attr_node_addname(attr); + + case XFS_DAS_SF_REMOVE: + error = xfs_attr_sf_removename(args); + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); + break; + case XFS_DAS_LEAF_REMOVE: + error = xfs_attr_leaf_removename(args); + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); + break; + case XFS_DAS_NODE_REMOVE: + error = xfs_attr_node_removename_setup(attr); + if (error == -ENOATTR && + (args->op_flags & XFS_DA_OP_RECOVERY)) { + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); + error = 0; + break; + } + if (error) + return error; + attr->xattri_dela_state = XFS_DAS_NODE_REMOVE_RMT; + if (args->rmtblkno == 0) + attr->xattri_dela_state++; + break; + + case XFS_DAS_LEAF_SET_RMT: + case XFS_DAS_NODE_SET_RMT: + error = xfs_attr_rmtval_find_space(attr); + if (error) + return error; + attr->xattri_dela_state++; + fallthrough; + + case XFS_DAS_LEAF_ALLOC_RMT: + case XFS_DAS_NODE_ALLOC_RMT: + error = xfs_attr_rmtval_alloc(attr); + if (error) + return error; + if (attr->xattri_dela_state == XFS_DAS_DONE) + break; + goto next_state; + + case XFS_DAS_LEAF_REPLACE: + case XFS_DAS_NODE_REPLACE: /* - * It won't fit in the shortform, transform to a leaf block. - * GROT: another possible req'mt for a double-split btree op. + * We must "flip" the incomplete flags on the "new" and "old" + * attribute/value pairs so that one disappears and one appears + * atomically. */ - error = xfs_attr_shortform_to_leaf(args, &leaf_bp); + error = xfs_attr3_leaf_flipflags(args); if (error) return error; + /* + * We must commit the flag value change now to make it atomic + * and then we can start the next trans in series at REMOVE_OLD. + */ + attr->xattri_dela_state++; + break; + case XFS_DAS_LEAF_REMOVE_OLD: + case XFS_DAS_NODE_REMOVE_OLD: /* - * Prevent the leaf buffer from being unlocked so that a - * concurrent AIL push cannot grab the half-baked leaf - * buffer and run into problems with the write verifier. - * Once we're done rolling the transaction we can release - * the hold and add the attr to the leaf. + * If we have a remote attr, start the process of removing it + * by invalidating any cached buffers. + * + * If we don't have a remote attr, we skip the remote block + * removal state altogether with a second state increment. */ - xfs_trans_bhold(args->trans, leaf_bp); - error = xfs_defer_finish(&args->trans); - xfs_trans_bhold_release(args->trans, leaf_bp); - if (error) { - xfs_trans_brelse(args->trans, leaf_bp); - return error; + xfs_attr_restore_rmt_blk(args); + if (args->rmtblkno) { + error = xfs_attr_rmtval_invalidate(args); + if (error) + return error; + } else { + attr->xattri_dela_state++; + } + + attr->xattri_dela_state++; + goto next_state; + + case XFS_DAS_LEAF_REMOVE_RMT: + case XFS_DAS_NODE_REMOVE_RMT: + error = xfs_attr_rmtval_remove(attr); + if (error == -EAGAIN) { + error = 0; + break; } + if (error) + return error; + + /* + * We've finished removing the remote attr blocks, so commit the + * transaction and move on to removing the attr name from the + * leaf/node block. Removing the attr might require a full + * transaction reservation for btree block freeing, so we + * can't do that in the same transaction where we removed the + * remote attr blocks. + */ + attr->xattri_dela_state++; + break; + + case XFS_DAS_LEAF_REMOVE_ATTR: + error = xfs_attr_leaf_remove_attr(attr); + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); + break; + + case XFS_DAS_NODE_REMOVE_ATTR: + error = xfs_attr_node_remove_attr(attr); + if (!error) + error = xfs_attr_leaf_shrink(args); + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); + break; + default: + ASSERT(0); + break; } - if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) - error = xfs_attr_leaf_addname(args); - else - error = xfs_attr_node_addname(args); + trace_xfs_attr_set_iter_return(attr->xattri_dela_state, args->dp); return error; } + /* - * Remove the attribute specified in @args. + * Return EEXIST if attr is found, or ENOATTR if not */ -int -xfs_attr_remove_args( - struct xfs_da_args *args) +static int +xfs_attr_lookup( + struct xfs_da_args *args) { struct xfs_inode *dp = args->dp; + struct xfs_buf *bp = NULL; + struct xfs_da_state *state; int error; - if (!xfs_inode_hasattr(dp)) { - error = -ENOATTR; - } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { - ASSERT(dp->i_afp->if_flags & XFS_IFINLINE); - error = xfs_attr_shortform_remove(args); - } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { - error = xfs_attr_leaf_removename(args); - } else { - error = xfs_attr_node_removename(args); + if (!xfs_inode_hasattr(dp)) + return -ENOATTR; + + if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) + return xfs_attr_sf_findname(args, NULL, NULL); + + if (xfs_attr_is_leaf(dp)) { + error = xfs_attr_leaf_hasname(args, &bp); + + if (bp) + xfs_trans_brelse(args->trans, bp); + + return error; } + state = xfs_da_state_alloc(args); + error = xfs_attr_node_lookup(args, state); + xfs_da_state_free(state); return error; } -int -xfs_attr_set( - struct xfs_inode *dp, - const unsigned char *name, - size_t namelen, - unsigned char *value, - int valuelen, - int flags) +static int +xfs_attr_intent_init( + struct xfs_da_args *args, + unsigned int op_flags, /* op flag (set or remove) */ + struct xfs_attr_intent **attr) /* new xfs_attr_intent */ { - struct xfs_mount *mp = dp->i_mount; - struct xfs_da_args args; - struct xfs_trans_res tres; - int rsvd = (flags & ATTR_ROOT) != 0; - int error, local; - XFS_STATS_INC(mp, xs_attr_set); + struct xfs_attr_intent *new; - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return -EIO; + new = kmem_cache_zalloc(xfs_attr_intent_cache, GFP_NOFS | __GFP_NOFAIL); + new->xattri_op_flags = op_flags; + new->xattri_da_args = args; - error = xfs_attr_args_init(&args, dp, name, namelen, flags); - if (error) - return error; + *attr = new; + return 0; +} - args.value = value; - args.valuelen = valuelen; - args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; - args.total = xfs_attr_calc_size(&args, &local); +/* Sets an attribute for an inode as a deferred operation */ +static int +xfs_attr_defer_add( + struct xfs_da_args *args) +{ + struct xfs_attr_intent *new; + int error = 0; - error = xfs_qm_dqattach(dp); + error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_SET, &new); if (error) return error; - /* - * If the inode doesn't have an attribute fork, add one. - * (inode must not be locked when we call this routine) - */ - if (XFS_IFORK_Q(dp) == 0) { - int sf_size = sizeof(xfs_attr_sf_hdr_t) + - XFS_ATTR_SF_ENTSIZE_BYNAME(args.namelen, valuelen); + new->xattri_dela_state = xfs_attr_init_add_state(args); + xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); + trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp); - error = xfs_bmap_add_attrfork(dp, sf_size, rsvd); - if (error) - return error; - } + return 0; +} - tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + - M_RES(mp)->tr_attrsetrt.tr_logres * args.total; - tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; +/* Sets an attribute for an inode as a deferred operation */ +static int +xfs_attr_defer_replace( + struct xfs_da_args *args) +{ + struct xfs_attr_intent *new; + int error = 0; - /* - * Root fork attributes can use reserved data blocks for this - * operation if necessary - */ - error = xfs_trans_alloc(mp, &tres, args.total, 0, - rsvd ? XFS_TRANS_RESERVE : 0, &args.trans); + error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REPLACE, &new); if (error) return error; - xfs_ilock(dp, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0, - rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : - XFS_QMOPT_RES_REGBLKS); - if (error) - goto out_trans_cancel; + new->xattri_dela_state = xfs_attr_init_replace_state(args); + xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); + trace_xfs_attr_defer_replace(new->xattri_dela_state, args->dp); - xfs_trans_ijoin(args.trans, dp, 0); - error = xfs_attr_set_args(&args); - if (error) - goto out_trans_cancel; - if (!args.trans) { - /* shortform attribute has already been committed */ - goto out_unlock; - } + return 0; +} - /* - * If this is a synchronous mount, make sure that the - * transaction goes to disk before returning to the user. - */ - if (mp->m_flags & XFS_MOUNT_WSYNC) - xfs_trans_set_sync(args.trans); +/* Removes an attribute for an inode as a deferred operation */ +static int +xfs_attr_defer_remove( + struct xfs_da_args *args) +{ + + struct xfs_attr_intent *new; + int error; - if ((flags & ATTR_KERNOTIME) == 0) - xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); + error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REMOVE, &new); + if (error) + return error; - /* - * Commit the last in the sequence of transactions. - */ - xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); - error = xfs_trans_commit(args.trans); -out_unlock: - xfs_iunlock(dp, XFS_ILOCK_EXCL); - return error; + new->xattri_dela_state = xfs_attr_init_remove_state(args); + xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); + trace_xfs_attr_defer_remove(new->xattri_dela_state, args->dp); -out_trans_cancel: - if (args.trans) - xfs_trans_cancel(args.trans); - goto out_unlock; + return 0; } /* - * Generic handler routine to remove a name from an attribute list. - * Transitions attribute list from Btree to shortform as necessary. + * Note: If args->value is NULL the attribute will be removed, just like the + * Linux ->setattr API. */ int -xfs_attr_remove( - struct xfs_inode *dp, - const unsigned char *name, - size_t namelen, - int flags) +xfs_attr_set( + struct xfs_da_args *args) { + struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; - struct xfs_da_args args; - int error; - - XFS_STATS_INC(mp, xs_attr_remove); + struct xfs_trans_res tres; + bool rsvd = (args->attr_filter & XFS_ATTR_ROOT); + int error, local; + int rmt_blks = 0; + unsigned int total; - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + if (xfs_is_shutdown(dp->i_mount)) return -EIO; - error = xfs_attr_args_init(&args, dp, name, namelen, flags); + error = xfs_qm_dqattach(dp); if (error) return error; + args->geo = mp->m_attr_geo; + args->whichfork = XFS_ATTR_FORK; + args->hashval = xfs_da_hashname(args->name, args->namelen); + /* - * we have no control over the attribute names that userspace passes us + * We have no control over the attribute names that userspace passes us * to remove, so we have to allow the name lookup prior to attribute - * removal to fail. + * removal to fail as well. Preserve the logged flag, since we need + * to pass that through to the logging code. */ - args.op_flags = XFS_DA_OP_OKNOENT; + args->op_flags = XFS_DA_OP_OKNOENT | + (args->op_flags & XFS_DA_OP_LOGGED); - error = xfs_qm_dqattach(dp); - if (error) - return error; + if (args->value) { + XFS_STATS_INC(mp, xs_attr_set); + args->total = xfs_attr_calc_size(args, &local); + + /* + * If the inode doesn't have an attribute fork, add one. + * (inode must not be locked when we call this routine) + */ + if (xfs_inode_has_attr_fork(dp) == 0) { + int sf_size = sizeof(struct xfs_attr_sf_hdr) + + xfs_attr_sf_entsize_byname(args->namelen, + args->valuelen); + + error = xfs_bmap_add_attrfork(dp, sf_size, rsvd); + if (error) + return error; + } + + if (!local) + rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen); + } else { + XFS_STATS_INC(mp, xs_attr_remove); + rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX); + } /* * Root fork attributes can use reserved data blocks for this * operation if necessary */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrrm, - XFS_ATTRRM_SPACE_RES(mp), 0, - (flags & ATTR_ROOT) ? XFS_TRANS_RESERVE : 0, - &args.trans); + xfs_init_attr_trans(args, &tres, &total); + error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans); if (error) return error; - xfs_ilock(dp, XFS_ILOCK_EXCL); - /* - * No need to make quota reservations here. We expect to release some - * blocks not allocate in the common case. - */ - xfs_trans_ijoin(args.trans, dp, 0); + if (args->value || xfs_inode_hasattr(dp)) { + error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK, + XFS_IEXT_ATTR_MANIP_CNT(rmt_blks)); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(args->trans, dp, + XFS_IEXT_ATTR_MANIP_CNT(rmt_blks)); + if (error) + goto out_trans_cancel; + } - error = xfs_attr_remove_args(&args); + error = xfs_attr_lookup(args); + switch (error) { + case -EEXIST: + /* if no value, we are performing a remove operation */ + if (!args->value) { + error = xfs_attr_defer_remove(args); + break; + } + /* Pure create fails if the attr already exists */ + if (args->attr_flags & XATTR_CREATE) + goto out_trans_cancel; + + error = xfs_attr_defer_replace(args); + break; + case -ENOATTR: + /* Can't remove what isn't there. */ + if (!args->value) + goto out_trans_cancel; + + /* Pure replace fails if no existing attr to replace. */ + if (args->attr_flags & XATTR_REPLACE) + goto out_trans_cancel; + + error = xfs_attr_defer_add(args); + break; + default: + goto out_trans_cancel; + } if (error) - goto out; + goto out_trans_cancel; /* * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. */ - if (mp->m_flags & XFS_MOUNT_WSYNC) - xfs_trans_set_sync(args.trans); + if (xfs_has_wsync(mp)) + xfs_trans_set_sync(args->trans); - if ((flags & ATTR_KERNOTIME) == 0) - xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); + if (!(args->op_flags & XFS_DA_OP_NOTIME)) + xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG); /* * Commit the last in the sequence of transactions. */ - xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); - error = xfs_trans_commit(args.trans); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); + error = xfs_trans_commit(args->trans); +out_unlock: xfs_iunlock(dp, XFS_ILOCK_EXCL); - return error; -out: - if (args.trans) - xfs_trans_cancel(args.trans); - xfs_iunlock(dp, XFS_ILOCK_EXCL); - return error; +out_trans_cancel: + if (args->trans) + xfs_trans_cancel(args->trans); + goto out_unlock; } /*======================================================================== * External routines when attribute list is inside the inode *========================================================================*/ +static inline int xfs_attr_sf_totsize(struct xfs_inode *dp) +{ + struct xfs_attr_shortform *sf; + + sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data; + return be16_to_cpu(sf->hdr.totsize); +} + /* * Add a name to the shortform attribute list structure * This is the external routine. */ -STATIC int -xfs_attr_shortform_addname(xfs_da_args_t *args) +static int +xfs_attr_shortform_addname( + struct xfs_da_args *args) { - int newsize, forkoff, retval; + int newsize, forkoff; + int error; trace_xfs_attr_sf_addname(args); - retval = xfs_attr_shortform_lookup(args); - if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) { - return retval; - } else if (retval == -EEXIST) { - if (args->flags & ATTR_CREATE) - return retval; - retval = xfs_attr_shortform_remove(args); - if (retval) - return retval; + error = xfs_attr_shortform_lookup(args); + switch (error) { + case -ENOATTR: + if (args->op_flags & XFS_DA_OP_REPLACE) + return error; + break; + case -EEXIST: + if (!(args->op_flags & XFS_DA_OP_REPLACE)) + return error; + + error = xfs_attr_sf_removename(args); + if (error) + return error; + /* - * Since we have removed the old attr, clear ATTR_REPLACE so - * that the leaf format add routine won't trip over the attr - * not being around. + * Since we have removed the old attr, clear XFS_DA_OP_REPLACE + * so that the new attr doesn't fit in shortform format, the + * leaf format add routine won't trip over the attr not being + * around. */ - args->flags &= ~ATTR_REPLACE; + args->op_flags &= ~XFS_DA_OP_REPLACE; + break; + case 0: + break; + default: + return error; } if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX || args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX) return -ENOSPC; - newsize = XFS_ATTR_SF_TOTSIZE(args->dp); - newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen); + newsize = xfs_attr_sf_totsize(args->dp); + newsize += xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize); if (!forkoff) @@ -572,183 +1164,106 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) * External routines when attribute list is one block *========================================================================*/ +/* Save the current remote block info and clear the current pointers. */ +static void +xfs_attr_save_rmt_blk( + struct xfs_da_args *args) +{ + args->blkno2 = args->blkno; + args->index2 = args->index; + args->rmtblkno2 = args->rmtblkno; + args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; + args->rmtblkno = 0; + args->rmtblkcnt = 0; + args->rmtvaluelen = 0; +} + +/* Set stored info about a remote block */ +static void +xfs_attr_restore_rmt_blk( + struct xfs_da_args *args) +{ + args->blkno = args->blkno2; + args->index = args->index2; + args->rmtblkno = args->rmtblkno2; + args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; +} + /* - * Add a name to the leaf attribute list structure + * Tries to add an attribute to an inode in leaf form * - * This leaf block cannot have a "remote" value, we only call this routine - * if bmap_one_block() says there is only one block (ie: no remote blks). + * This function is meant to execute as part of a delayed operation and leaves + * the transaction handling to the caller. On success the attribute is added + * and the inode and transaction are left dirty. If there is not enough space, + * the attr data is converted to node format and -ENOSPC is returned. Caller is + * responsible for handling the dirty inode and transaction or adding the attr + * in node format. */ STATIC int -xfs_attr_leaf_addname( +xfs_attr_leaf_try_add( struct xfs_da_args *args) { - struct xfs_inode *dp; struct xfs_buf *bp; - int retval, error, forkoff; - - trace_xfs_attr_leaf_addname(args); + int error; - /* - * Read the (only) block in the attribute list in. - */ - dp = args->dp; - args->blkno = 0; - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); if (error) return error; /* - * Look up the given attribute in the leaf block. Figure out if - * the given flags produce an error or call for an atomic rename. + * Look up the xattr name to set the insertion point for the new xattr. */ - retval = xfs_attr3_leaf_lookup_int(bp, args); - if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) { - xfs_trans_brelse(args->trans, bp); - return retval; - } else if (retval == -EEXIST) { - if (args->flags & ATTR_CREATE) { /* pure create op */ - xfs_trans_brelse(args->trans, bp); - return retval; - } + error = xfs_attr3_leaf_lookup_int(bp, args); + switch (error) { + case -ENOATTR: + if (args->op_flags & XFS_DA_OP_REPLACE) + goto out_brelse; + break; + case -EEXIST: + if (!(args->op_flags & XFS_DA_OP_REPLACE)) + goto out_brelse; trace_xfs_attr_leaf_replace(args); - - /* save the attribute state for later removal*/ - args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ - args->blkno2 = args->blkno; /* set 2nd entry info*/ - args->index2 = args->index; - args->rmtblkno2 = args->rmtblkno; - args->rmtblkcnt2 = args->rmtblkcnt; - args->rmtvaluelen2 = args->rmtvaluelen; - /* - * clear the remote attr state now that it is saved so that the - * values reflect the state of the attribute we are about to + * Save the existing remote attr state so that the current + * values reflect the state of the new attribute we are about to * add, not the attribute we just found and will remove later. */ - args->rmtblkno = 0; - args->rmtblkcnt = 0; - args->rmtvaluelen = 0; + xfs_attr_save_rmt_blk(args); + break; + case 0: + break; + default: + goto out_brelse; } - /* - * Add the attribute to the leaf block, transitioning to a Btree - * if required. - */ - retval = xfs_attr3_leaf_add(bp, args); - if (retval == -ENOSPC) { - /* - * Promote the attribute list to the Btree format, then - * Commit that transaction so that the node_addname() call - * can manage its own transactions. - */ - error = xfs_attr3_leaf_to_node(args); - if (error) - return error; - error = xfs_defer_finish(&args->trans); - if (error) - return error; + return xfs_attr3_leaf_add(bp, args); - /* - * Commit the current trans (including the inode) and start - * a new one. - */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - return error; +out_brelse: + xfs_trans_brelse(args->trans, bp); + return error; +} - /* - * Fob the whole rest of the problem off on the Btree code. - */ - error = xfs_attr_node_addname(args); - return error; - } +/* + * Return EEXIST if attr is found, or ENOATTR if not + */ +STATIC int +xfs_attr_leaf_hasname( + struct xfs_da_args *args, + struct xfs_buf **bp) +{ + int error = 0; - /* - * Commit the transaction that added the attr name so that - * later routines can manage their own transactions. - */ - error = xfs_trans_roll_inode(&args->trans, dp); + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, bp); if (error) return error; - /* - * If there was an out-of-line value, allocate the blocks we - * identified for its storage and copy the value. This is done - * after we create the attribute so that we don't overflow the - * maximum size of a transaction and/or hit a deadlock. - */ - if (args->rmtblkno > 0) { - error = xfs_attr_rmtval_set(args); - if (error) - return error; - } + error = xfs_attr3_leaf_lookup_int(*bp, args); + if (error != -ENOATTR && error != -EEXIST) + xfs_trans_brelse(args->trans, *bp); - /* - * If this is an atomic rename operation, we must "flip" the - * incomplete flags on the "new" and "old" attribute/value pairs - * so that one disappears and one appears atomically. Then we - * must remove the "old" attribute/value pair. - */ - if (args->op_flags & XFS_DA_OP_RENAME) { - /* - * In a separate transaction, set the incomplete flag on the - * "old" attr and clear the incomplete flag on the "new" attr. - */ - error = xfs_attr3_leaf_flipflags(args); - if (error) - return error; - - /* - * Dismantle the "old" attribute/value pair by removing - * a "remote" value (if it exists). - */ - args->index = args->index2; - args->blkno = args->blkno2; - args->rmtblkno = args->rmtblkno2; - args->rmtblkcnt = args->rmtblkcnt2; - args->rmtvaluelen = args->rmtvaluelen2; - if (args->rmtblkno) { - error = xfs_attr_rmtval_remove(args); - if (error) - return error; - } - - /* - * Read in the block containing the "old" attr, then - * remove the "old" attr from that block (neat, huh!) - */ - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, - &bp); - if (error) - return error; - - xfs_attr3_leaf_remove(bp, args); - - /* - * If the result is small enough, shrink it all into the inode. - */ - if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { - error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); - /* bp is gone due to xfs_da_shrink_inode */ - if (error) - return error; - error = xfs_defer_finish(&args->trans); - if (error) - return error; - } - - /* - * Commit the remove and start the next trans in series. - */ - error = xfs_trans_roll_inode(&args->trans, dp); - - } else if (args->rmtblkno > 0) { - /* - * Added a "remote" value, just clear the incomplete flag. - */ - error = xfs_attr3_leaf_clearflag(args); - } return error; } @@ -772,31 +1287,26 @@ xfs_attr_leaf_removename( * Remove the attribute. */ dp = args->dp; - args->blkno = 0; - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp); - if (error) - return error; - error = xfs_attr3_leaf_lookup_int(bp, args); + error = xfs_attr_leaf_hasname(args, &bp); if (error == -ENOATTR) { xfs_trans_brelse(args->trans, bp); + if (args->op_flags & XFS_DA_OP_RECOVERY) + return 0; + return error; + } else if (error != -EEXIST) return error; - } xfs_attr3_leaf_remove(bp, args); /* * If the result is small enough, shrink it all into the inode. */ - if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { - error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + forkoff = xfs_attr_shortform_allfit(bp, dp); + if (forkoff) + return xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (error) - return error; - error = xfs_defer_finish(&args->trans); - if (error) - return error; - } + return 0; } @@ -816,118 +1326,117 @@ xfs_attr_leaf_get(xfs_da_args_t *args) trace_xfs_attr_leaf_get(args); - args->blkno = 0; - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp); - if (error) - return error; + error = xfs_attr_leaf_hasname(args, &bp); - error = xfs_attr3_leaf_lookup_int(bp, args); - if (error != -EEXIST) { + if (error == -ENOATTR) { xfs_trans_brelse(args->trans, bp); return error; - } + } else if (error != -EEXIST) + return error; + + error = xfs_attr3_leaf_getvalue(bp, args); xfs_trans_brelse(args->trans, bp); return error; } -/*======================================================================== - * External routines when attribute list size > geo->blksize - *========================================================================*/ - -/* - * Add a name to a Btree-format attribute list. - * - * This will involve walking down the Btree, and may involve splitting - * leaf nodes and even splitting intermediate nodes up to and including - * the root node (a special case of an intermediate node). - * - * "Remote" attribute values confuse the issue and atomic rename operations - * add a whole extra layer of confusion on top of that. - */ +/* Return EEXIST if attr is found, or ENOATTR if not. */ STATIC int -xfs_attr_node_addname( - struct xfs_da_args *args) +xfs_attr_node_lookup( + struct xfs_da_args *args, + struct xfs_da_state *state) { - struct xfs_da_state *state; - struct xfs_da_state_blk *blk; - struct xfs_inode *dp; - struct xfs_mount *mp; int retval, error; - trace_xfs_attr_node_addname(args); - /* - * Fill in bucket of arguments/results/context to carry around. + * Search to see if name exists, and get back a pointer to it. */ - dp = args->dp; - mp = dp->i_mount; -restart: - state = xfs_da_state_alloc(); - state->args = args; - state->mp = mp; + error = xfs_da3_node_lookup_int(state, &retval); + if (error) + return error; + + return retval; +} + +/*======================================================================== + * External routines when attribute list size > geo->blksize + *========================================================================*/ + +STATIC int +xfs_attr_node_addname_find_attr( + struct xfs_attr_intent *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + int error; /* * Search to see if name already exists, and get back a pointer * to where it should go. */ - error = xfs_da3_node_lookup_int(state, &retval); - if (error) - goto out; - blk = &state->path.blk[ state->path.active-1 ]; - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) { - goto out; - } else if (retval == -EEXIST) { - if (args->flags & ATTR_CREATE) - goto out; - - trace_xfs_attr_node_replace(args); + xfs_attr_item_init_da_state(attr); + error = xfs_attr_node_lookup(args, attr->xattri_da_state); + switch (error) { + case -ENOATTR: + if (args->op_flags & XFS_DA_OP_REPLACE) + goto error; + break; + case -EEXIST: + if (!(args->op_flags & XFS_DA_OP_REPLACE)) + goto error; - /* save the attribute state for later removal*/ - args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ - args->blkno2 = args->blkno; /* set 2nd entry info*/ - args->index2 = args->index; - args->rmtblkno2 = args->rmtblkno; - args->rmtblkcnt2 = args->rmtblkcnt; - args->rmtvaluelen2 = args->rmtvaluelen; + trace_xfs_attr_node_replace(args); /* - * clear the remote attr state now that it is saved so that the - * values reflect the state of the attribute we are about to + * Save the existing remote attr state so that the current + * values reflect the state of the new attribute we are about to * add, not the attribute we just found and will remove later. */ - args->rmtblkno = 0; - args->rmtblkcnt = 0; - args->rmtvaluelen = 0; + xfs_attr_save_rmt_blk(args); + break; + case 0: + break; + default: + goto error; + } + + return 0; +error: + if (attr->xattri_da_state) { + xfs_da_state_free(attr->xattri_da_state); + attr->xattri_da_state = NULL; } + return error; +} + +/* + * Add a name to a Btree-format attribute list. + * + * This will involve walking down the Btree, and may involve splitting + * leaf nodes and even splitting intermediate nodes up to and including + * the root node (a special case of an intermediate node). + */ +static int +xfs_attr_node_try_addname( + struct xfs_attr_intent *attr) +{ + struct xfs_da_state *state = attr->xattri_da_state; + struct xfs_da_state_blk *blk; + int error; + + trace_xfs_attr_node_addname(state->args); + + blk = &state->path.blk[state->path.active-1]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - retval = xfs_attr3_leaf_add(blk->bp, state->args); - if (retval == -ENOSPC) { + error = xfs_attr3_leaf_add(blk->bp, state->args); + if (error == -ENOSPC) { if (state->path.active == 1) { /* * Its really a single leaf node, but it had * out-of-line values so it looked like it *might* - * have been a b-tree. + * have been a b-tree. Let the caller deal with this. */ - xfs_da_state_free(state); - state = NULL; - error = xfs_attr3_leaf_to_node(args); - if (error) - goto out; - error = xfs_defer_finish(&args->trans); - if (error) - goto out; - - /* - * Commit the node conversion and start the next - * trans in the chain. - */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - goto out; - - goto restart; + goto out; } /* @@ -939,9 +1448,6 @@ restart: error = xfs_da3_split(state); if (error) goto out; - error = xfs_defer_finish(&args->trans); - if (error) - goto out; } else { /* * Addition succeeded, update Btree hashvals. @@ -949,204 +1455,51 @@ restart: xfs_da3_fixhashpath(state, &state->path); } - /* - * Kill the state structure, we're done with it and need to - * allow the buffers to come back later. - */ +out: xfs_da_state_free(state); - state = NULL; - - /* - * Commit the leaf addition or btree split and start the next - * trans in the chain. - */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - goto out; + attr->xattri_da_state = NULL; + return error; +} - /* - * If there was an out-of-line value, allocate the blocks we - * identified for its storage and copy the value. This is done - * after we create the attribute so that we don't overflow the - * maximum size of a transaction and/or hit a deadlock. - */ - if (args->rmtblkno > 0) { - error = xfs_attr_rmtval_set(args); - if (error) - return error; - } +static int +xfs_attr_node_removename( + struct xfs_da_args *args, + struct xfs_da_state *state) +{ + struct xfs_da_state_blk *blk; + int retval; /* - * If this is an atomic rename operation, we must "flip" the - * incomplete flags on the "new" and "old" attribute/value pairs - * so that one disappears and one appears atomically. Then we - * must remove the "old" attribute/value pair. + * Remove the name and update the hashvals in the tree. */ - if (args->op_flags & XFS_DA_OP_RENAME) { - /* - * In a separate transaction, set the incomplete flag on the - * "old" attr and clear the incomplete flag on the "new" attr. - */ - error = xfs_attr3_leaf_flipflags(args); - if (error) - goto out; - - /* - * Dismantle the "old" attribute/value pair by removing - * a "remote" value (if it exists). - */ - args->index = args->index2; - args->blkno = args->blkno2; - args->rmtblkno = args->rmtblkno2; - args->rmtblkcnt = args->rmtblkcnt2; - args->rmtvaluelen = args->rmtvaluelen2; - if (args->rmtblkno) { - error = xfs_attr_rmtval_remove(args); - if (error) - return error; - } - - /* - * Re-find the "old" attribute entry after any split ops. - * The INCOMPLETE flag means that we will find the "old" - * attr, not the "new" one. - */ - args->op_flags |= XFS_DA_OP_INCOMPLETE; - state = xfs_da_state_alloc(); - state->args = args; - state->mp = mp; - state->inleaf = 0; - error = xfs_da3_node_lookup_int(state, &retval); - if (error) - goto out; - - /* - * Remove the name and update the hashvals in the tree. - */ - blk = &state->path.blk[ state->path.active-1 ]; - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - error = xfs_attr3_leaf_remove(blk->bp, args); - xfs_da3_fixhashpath(state, &state->path); - - /* - * Check to see if the tree needs to be collapsed. - */ - if (retval && (state->path.active > 1)) { - error = xfs_da3_join(state); - if (error) - goto out; - error = xfs_defer_finish(&args->trans); - if (error) - goto out; - } - - /* - * Commit and start the next trans in the chain. - */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - goto out; - - } else if (args->rmtblkno > 0) { - /* - * Added a "remote" value, just clear the incomplete flag. - */ - error = xfs_attr3_leaf_clearflag(args); - if (error) - goto out; - } - retval = error = 0; + blk = &state->path.blk[state->path.active-1]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + retval = xfs_attr3_leaf_remove(blk->bp, args); + xfs_da3_fixhashpath(state, &state->path); -out: - if (state) - xfs_da_state_free(state); - if (error) - return error; return retval; } -/* - * Remove a name from a B-tree attribute list. - * - * This will involve walking down the Btree, and may involve joining - * leaf nodes and even joining intermediate nodes up to and including - * the root node (a special case of an intermediate node). - */ -STATIC int -xfs_attr_node_removename( - struct xfs_da_args *args) +static int +xfs_attr_node_remove_attr( + struct xfs_attr_intent *attr) { - struct xfs_da_state *state; - struct xfs_da_state_blk *blk; - struct xfs_inode *dp; - struct xfs_buf *bp; - int retval, error, forkoff; - - trace_xfs_attr_node_removename(args); + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_da_state *state = xfs_da_state_alloc(args); + int retval = 0; + int error = 0; /* - * Tie a string around our finger to remind us where we are. - */ - dp = args->dp; - state = xfs_da_state_alloc(); - state->args = args; - state->mp = dp->i_mount; - - /* - * Search to see if name exists, and get back a pointer to it. + * The attr we are removing has already been marked incomplete, so + * we need to set the filter appropriately to re-find the "old" + * attribute entry after any split ops. */ + args->attr_filter |= XFS_ATTR_INCOMPLETE; error = xfs_da3_node_lookup_int(state, &retval); - if (error || (retval != -EEXIST)) { - if (error == 0) - error = retval; + if (error) goto out; - } - /* - * If there is an out-of-line value, de-allocate the blocks. - * This is done before we remove the attribute so that we don't - * overflow the maximum size of a transaction and/or hit a deadlock. - */ - blk = &state->path.blk[ state->path.active-1 ]; - ASSERT(blk->bp != NULL); - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - if (args->rmtblkno > 0) { - /* - * Fill in disk block numbers in the state structure - * so that we can get the buffers back after we commit - * several transactions in the following calls. - */ - error = xfs_attr_fillstate(state); - if (error) - goto out; - - /* - * Mark the attribute as INCOMPLETE, then bunmapi() the - * remote value. - */ - error = xfs_attr3_leaf_setflag(args); - if (error) - goto out; - error = xfs_attr_rmtval_remove(args); - if (error) - goto out; - - /* - * Refill the state structure with buffers, the prior calls - * released our buffers. - */ - error = xfs_attr_refillstate(state); - if (error) - goto out; - } - - /* - * Remove the name and update the hashvals in the tree. - */ - blk = &state->path.blk[ state->path.active-1 ]; - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - retval = xfs_attr3_leaf_remove(blk->bp, args); - xfs_da3_fixhashpath(state, &state->path); + error = xfs_attr_node_removename(args, state); /* * Check to see if the tree needs to be collapsed. @@ -1155,150 +1508,14 @@ xfs_attr_node_removename( error = xfs_da3_join(state); if (error) goto out; - error = xfs_defer_finish(&args->trans); - if (error) - goto out; - /* - * Commit the Btree join operation and start a new trans. - */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - goto out; - } - - /* - * If the result is small enough, push it all into the inode. - */ - if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { - /* - * Have to get rid of the copy of this dabuf in the state. - */ - ASSERT(state->path.active == 1); - ASSERT(state->path.blk[0].bp); - state->path.blk[0].bp = NULL; - - error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); - if (error) - goto out; - - if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { - error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); - /* bp is gone due to xfs_da_shrink_inode */ - if (error) - goto out; - error = xfs_defer_finish(&args->trans); - if (error) - goto out; - } else - xfs_trans_brelse(args->trans, bp); } - error = 0; + retval = error = 0; out: xfs_da_state_free(state); - return error; -} - -/* - * Fill in the disk block numbers in the state structure for the buffers - * that are attached to the state structure. - * This is done so that we can quickly reattach ourselves to those buffers - * after some set of transaction commits have released these buffers. - */ -STATIC int -xfs_attr_fillstate(xfs_da_state_t *state) -{ - xfs_da_state_path_t *path; - xfs_da_state_blk_t *blk; - int level; - - trace_xfs_attr_fillstate(state->args); - - /* - * Roll down the "path" in the state structure, storing the on-disk - * block number for those buffers in the "path". - */ - path = &state->path; - ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); - for (blk = path->blk, level = 0; level < path->active; blk++, level++) { - if (blk->bp) { - blk->disk_blkno = XFS_BUF_ADDR(blk->bp); - blk->bp = NULL; - } else { - blk->disk_blkno = 0; - } - } - - /* - * Roll down the "altpath" in the state structure, storing the on-disk - * block number for those buffers in the "altpath". - */ - path = &state->altpath; - ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); - for (blk = path->blk, level = 0; level < path->active; blk++, level++) { - if (blk->bp) { - blk->disk_blkno = XFS_BUF_ADDR(blk->bp); - blk->bp = NULL; - } else { - blk->disk_blkno = 0; - } - } - - return 0; -} - -/* - * Reattach the buffers to the state structure based on the disk block - * numbers stored in the state structure. - * This is done after some set of transaction commits have released those - * buffers from our grip. - */ -STATIC int -xfs_attr_refillstate(xfs_da_state_t *state) -{ - xfs_da_state_path_t *path; - xfs_da_state_blk_t *blk; - int level, error; - - trace_xfs_attr_refillstate(state->args); - - /* - * Roll down the "path" in the state structure, storing the on-disk - * block number for those buffers in the "path". - */ - path = &state->path; - ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); - for (blk = path->blk, level = 0; level < path->active; blk++, level++) { - if (blk->disk_blkno) { - error = xfs_da3_node_read_mapped(state->args->trans, - state->args->dp, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK); - if (error) - return error; - } else { - blk->bp = NULL; - } - } - - /* - * Roll down the "altpath" in the state structure, storing the on-disk - * block number for those buffers in the "altpath". - */ - path = &state->altpath; - ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); - for (blk = path->blk, level = 0; level < path->active; blk++, level++) { - if (blk->disk_blkno) { - error = xfs_da3_node_read_mapped(state->args->trans, - state->args->dp, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK); - if (error) - return error; - } else { - blk->bp = NULL; - } - } - - return 0; + if (error) + return error; + return retval; } /* @@ -1311,35 +1528,29 @@ xfs_attr_refillstate(xfs_da_state_t *state) * Returns 0 on successful retrieval, otherwise an error. */ STATIC int -xfs_attr_node_get(xfs_da_args_t *args) +xfs_attr_node_get( + struct xfs_da_args *args) { - xfs_da_state_t *state; - xfs_da_state_blk_t *blk; - int error, retval; - int i; + struct xfs_da_state *state; + struct xfs_da_state_blk *blk; + int i; + int error; trace_xfs_attr_node_get(args); - state = xfs_da_state_alloc(); - state->args = args; - state->mp = args->dp->i_mount; - /* * Search to see if name exists, and get back a pointer to it. */ - error = xfs_da3_node_lookup_int(state, &retval); - if (error) { - retval = error; - goto out_release; - } - if (retval != -EEXIST) + state = xfs_da_state_alloc(args); + error = xfs_attr_node_lookup(args, state); + if (error != -EEXIST) goto out_release; /* * Get the value, local or "remote" */ blk = &state->path.blk[state->path.active - 1]; - retval = xfs_attr3_leaf_getvalue(blk->bp, args); + error = xfs_attr3_leaf_getvalue(blk->bp, args); /* * If not in a transaction, we have to release all the buffers. @@ -1351,7 +1562,7 @@ out_release: } xfs_da_state_free(state); - return retval; + return error; } /* Returns true if the attribute entry name is valid. */ @@ -1370,3 +1581,20 @@ xfs_attr_namecheck( /* There shouldn't be any nulls here */ return !memchr(name, 0, length); } + +int __init +xfs_attr_intent_init_cache(void) +{ + xfs_attr_intent_cache = kmem_cache_create("xfs_attr_intent", + sizeof(struct xfs_attr_intent), + 0, 0, NULL); + + return xfs_attr_intent_cache != NULL ? 0 : -ENOMEM; +} + +void +xfs_attr_intent_destroy_cache(void) +{ + kmem_cache_destroy(xfs_attr_intent_cache); + xfs_attr_intent_cache = NULL; +} diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 4243b2272642..81be9b3e4004 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc. * All Rights Reserved. @@ -21,39 +21,6 @@ struct xfs_attr_list_context; * as possible so as to fit into the literal area of the inode. */ -/*======================================================================== - * External interfaces - *========================================================================*/ - - -#define ATTR_DONTFOLLOW 0x0001 /* -- ignored, from IRIX -- */ -#define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */ -#define ATTR_TRUST 0x0004 /* -- unused, from IRIX -- */ -#define ATTR_SECURE 0x0008 /* use attrs in security namespace */ -#define ATTR_CREATE 0x0010 /* pure create: fail if attr already exists */ -#define ATTR_REPLACE 0x0020 /* pure set: fail if attr does not exist */ - -#define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */ -#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ - -#define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */ -#define ATTR_ALLOC 0x8000 /* [kernel] allocate xattr buffer on demand */ - -#define ATTR_KERNEL_FLAGS \ - (ATTR_KERNOTIME | ATTR_KERNOVAL | ATTR_INCOMPLETE | ATTR_ALLOC) - -#define XFS_ATTR_FLAGS \ - { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \ - { ATTR_ROOT, "ROOT" }, \ - { ATTR_TRUST, "TRUST" }, \ - { ATTR_SECURE, "SECURE" }, \ - { ATTR_CREATE, "CREATE" }, \ - { ATTR_REPLACE, "REPLACE" }, \ - { ATTR_KERNOTIME, "KERNOTIME" }, \ - { ATTR_KERNOVAL, "KERNOVAL" }, \ - { ATTR_INCOMPLETE, "INCOMPLETE" }, \ - { ATTR_ALLOC, "ALLOC" } - /* * The maximum size (into the kernel or returned from the kernel) of an * attribute value or the buffer used for an attr_list() call. Larger @@ -62,45 +29,16 @@ struct xfs_attr_list_context; #define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */ /* - * Define how lists of attribute names are returned to the user from - * the attr_list() call. A large, 32bit aligned, buffer is passed in - * along with its size. We put an array of offsets at the top that each - * reference an attrlist_ent_t and pack the attrlist_ent_t's at the bottom. - */ -typedef struct attrlist { - __s32 al_count; /* number of entries in attrlist */ - __s32 al_more; /* T/F: more attrs (do call again) */ - __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */ -} attrlist_t; - -/* - * Show the interesting info about one attribute. This is what the - * al_offset[i] entry points to. - */ -typedef struct attrlist_ent { /* data from attr_list() */ - __u32 a_valuelen; /* number bytes in value of attr */ - char a_name[1]; /* attr name (NULL terminated) */ -} attrlist_ent_t; - -/* - * Given a pointer to the (char*) buffer containing the attr_list() result, - * and an index, return a pointer to the indicated attribute in the buffer. - */ -#define ATTR_ENTRY(buffer, index) \ - ((attrlist_ent_t *) \ - &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ]) - -/* * Kernel-internal version of the attrlist cursor. */ -typedef struct attrlist_cursor_kern { +struct xfs_attrlist_cursor_kern { __u32 hashval; /* hash value of next entry to add */ __u32 blkno; /* block containing entry (suggestion) */ __u32 offset; /* offset in list of equal-hashvals */ __u16 pad1; /* padding to match user-level */ __u8 pad2; /* padding to match user-level */ __u8 initted; /* T/F: cursor has been initialized */ -} attrlist_cursor_kern_t; +}; /*======================================================================== @@ -112,27 +50,484 @@ typedef struct attrlist_cursor_kern { typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int, unsigned char *, int, int); -typedef struct xfs_attr_list_context { - struct xfs_trans *tp; - struct xfs_inode *dp; /* inode */ - struct attrlist_cursor_kern *cursor; /* position in list */ - char *alist; /* output buffer */ +struct xfs_attr_list_context { + struct xfs_trans *tp; + struct xfs_inode *dp; /* inode */ + struct xfs_attrlist_cursor_kern cursor; /* position in list */ + void *buffer; /* output buffer */ /* * Abort attribute list iteration if non-zero. Can be used to pass * error values to the xfs_attr_list caller. */ - int seen_enough; + int seen_enough; + bool allow_incomplete; - ssize_t count; /* num used entries */ - int dupcnt; /* count dup hashvals seen */ - int bufsize; /* total buffer size */ - int firstu; /* first used byte in buffer */ - int flags; /* from VOP call */ - int resynch; /* T/F: resynch with cursor */ - put_listent_func_t put_listent; /* list output fmt function */ - int index; /* index into output buffer */ -} xfs_attr_list_context_t; + ssize_t count; /* num used entries */ + int dupcnt; /* count dup hashvals seen */ + int bufsize; /* total buffer size */ + int firstu; /* first used byte in buffer */ + unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE} */ + int resynch; /* T/F: resynch with cursor */ + put_listent_func_t put_listent; /* list output fmt function */ + int index; /* index into output buffer */ +}; + + +/* + * ======================================================================== + * Structure used to pass context around among the delayed routines. + * ======================================================================== + */ + +/* + * Below is a state machine diagram for attr remove operations. The XFS_DAS_* + * states indicate places where the function would return -EAGAIN, and then + * immediately resume from after being called by the calling function. States + * marked as a "subroutine state" indicate that they belong to a subroutine, and + * so the calling function needs to pass them back to that subroutine to allow + * it to finish where it left off. But they otherwise do not have a role in the + * calling function other than just passing through. + * + * xfs_attr_remove_iter() + * │ + * v + * have attr to remove? ──n──> done + * │ + * y + * │ + * v + * are we short form? ──y──> xfs_attr_shortform_remove ──> done + * │ + * n + * │ + * V + * are we leaf form? ──y──> xfs_attr_leaf_removename ──> done + * │ + * n + * │ + * V + * ┌── need to setup state? + * │ │ + * n y + * │ │ + * │ v + * │ find attr and get state + * │ attr has remote blks? ──n─┐ + * │ │ v + * │ │ find and invalidate + * │ y the remote blocks. + * │ │ mark attr incomplete + * │ ├────────────────┘ + * └──────────┤ + * │ + * v + * Have remote blks to remove? ───y─────┐ + * │ ^ remove the blks + * │ │ │ + * │ │ v + * │ XFS_DAS_RMTBLK <─n── done? + * │ re-enter with │ + * │ one less blk to y + * │ remove │ + * │ V + * │ refill the state + * n │ + * │ v + * │ XFS_DAS_RM_NAME + * │ │ + * ├─────────────────────────┘ + * │ + * v + * remove leaf and + * update hash with + * xfs_attr_node_remove_cleanup + * │ + * v + * need to + * shrink tree? ─n─┐ + * │ │ + * y │ + * │ │ + * v │ + * join leaf │ + * │ │ + * v │ + * XFS_DAS_RM_SHRINK │ + * │ │ + * v │ + * do the shrink │ + * │ │ + * v │ + * free state <──┘ + * │ + * v + * done + * + * + * Below is a state machine diagram for attr set operations. + * + * It seems the challenge with understanding this system comes from trying to + * absorb the state machine all at once, when really one should only be looking + * at it with in the context of a single function. Once a state sensitive + * function is called, the idea is that it "takes ownership" of the + * state machine. It isn't concerned with the states that may have belonged to + * it's calling parent. Only the states relevant to itself or any other + * subroutines there in. Once a calling function hands off the state machine to + * a subroutine, it needs to respect the simple rule that it doesn't "own" the + * state machine anymore, and it's the responsibility of that calling function + * to propagate the -EAGAIN back up the call stack. Upon reentry, it is + * committed to re-calling that subroutine until it returns something other than + * -EAGAIN. Once that subroutine signals completion (by returning anything other + * than -EAGAIN), the calling function can resume using the state machine. + * + * xfs_attr_set_iter() + * │ + * v + * ┌─y─ has an attr fork? + * │ | + * │ n + * │ | + * │ V + * │ add a fork + * │ │ + * └──────────┤ + * │ + * V + * ┌─── is shortform? + * │ │ + * │ y + * │ │ + * │ V + * │ xfs_attr_set_fmt + * │ | + * │ V + * │ xfs_attr_try_sf_addname + * │ │ + * │ V + * │ had enough ──y──> done + * │ space? + * n │ + * │ n + * │ │ + * │ V + * │ transform to leaf + * │ │ + * │ V + * │ hold the leaf buffer + * │ │ + * │ V + * │ return -EAGAIN + * │ Re-enter in + * │ leaf form + * │ + * └─> release leaf buffer + * if needed + * │ + * V + * ┌───n── fork has + * │ only 1 blk? + * │ │ + * │ y + * │ │ + * │ v + * │ xfs_attr_leaf_try_add() + * │ │ + * │ v + * │ had enough ──────────────y─────────────┐ + * │ space? │ + * │ │ │ + * │ n │ + * │ │ │ + * │ v │ + * │ return -EAGAIN │ + * │ re-enter in │ + * │ node form │ + * │ │ │ + * └──────────┤ │ + * │ │ + * V │ + * xfs_attr_node_addname_find_attr │ + * determines if this │ + * is create or rename │ + * find space to store attr │ + * │ │ + * v │ + * xfs_attr_node_addname │ + * │ │ + * v │ + * fits in a node leaf? ────n─────┐ │ + * │ ^ v │ + * │ │ single leaf node? │ + * │ │ │ │ │ + * y │ y n │ + * │ │ │ │ │ + * v │ v v │ + * update │ grow the leaf split if │ + * hashvals └── return -EAGAIN needed │ + * │ retry leaf add │ │ + * │ on reentry │ │ + * ├────────────────────────────┘ │ + * │ │ + * v │ + * need to alloc │ + * ┌─y── or flip flag? │ + * │ │ │ + * │ n │ + * │ │ │ + * │ v │ + * │ done │ + * │ │ + * │ │ + * │ XFS_DAS_FOUND_LBLK <────────────────┘ + * │ │ + * │ V + * │ xfs_attr_leaf_addname() + * │ │ + * │ v + * │ ┌──first time through? + * │ │ │ + * │ │ y + * │ │ │ + * │ n v + * │ │ if we have rmt blks + * │ │ find space for them + * │ │ │ + * │ └──────────┤ + * │ │ + * │ v + * │ still have + * │ ┌─n─ blks to alloc? <──┐ + * │ │ │ │ + * │ │ y │ + * │ │ │ │ + * │ │ v │ + * │ │ alloc one blk │ + * │ │ return -EAGAIN ──┘ + * │ │ re-enter with one + * │ │ less blk to alloc + * │ │ + * │ │ + * │ └───> set the rmt + * │ value + * │ │ + * │ v + * │ was this + * │ a rename? ──n─┐ + * │ │ │ + * │ y │ + * │ │ │ + * │ v │ + * │ flip incomplete │ + * │ flag │ + * │ │ │ + * │ v │ + * │ XFS_DAS_FLIP_LFLAG │ + * │ │ │ + * │ v │ + * │ need to remove │ + * │ old bks? ──n──┤ + * │ │ │ + * │ y │ + * │ │ │ + * │ V │ + * │ remove │ + * │ ┌───> old blks │ + * │ │ │ │ + * │ XFS_DAS_RM_LBLK │ │ + * │ ^ │ │ + * │ │ v │ + * │ └──y── more to │ + * │ remove? │ + * │ │ │ + * │ n │ + * │ │ │ + * │ v │ + * │ XFS_DAS_RD_LEAF │ + * │ │ │ + * │ v │ + * │ remove leaf │ + * │ │ │ + * │ v │ + * │ shrink to sf │ + * │ if needed │ + * │ │ │ + * │ v │ + * │ done <──────┘ + * │ + * └──────> XFS_DAS_FOUND_NBLK + * │ + * v + * ┌─────n── need to + * │ alloc blks? + * │ │ + * │ y + * │ │ + * │ v + * │ find space + * │ │ + * │ v + * │ ┌─>XFS_DAS_ALLOC_NODE + * │ │ │ + * │ │ v + * │ │ alloc blk + * │ │ │ + * │ │ v + * │ └──y── need to alloc + * │ more blocks? + * │ │ + * │ n + * │ │ + * │ v + * │ set the rmt value + * │ │ + * │ v + * │ was this + * └────────> a rename? ──n─┐ + * │ │ + * y │ + * │ │ + * v │ + * flip incomplete │ + * flag │ + * │ │ + * v │ + * XFS_DAS_FLIP_NFLAG │ + * │ │ + * v │ + * need to │ + * remove blks? ─n──┤ + * │ │ + * y │ + * │ │ + * v │ + * remove │ + * ┌────────> old blks │ + * │ │ │ + * XFS_DAS_RM_NBLK │ │ + * ^ │ │ + * │ v │ + * └──────y── more to │ + * remove │ + * │ │ + * n │ + * │ │ + * v │ + * XFS_DAS_CLR_FLAG │ + * │ │ + * v │ + * clear flags │ + * │ │ + * ├──────────┘ + * │ + * v + * done + */ + +/* + * Enum values for xfs_attr_intent.xattri_da_state + * + * These values are used by delayed attribute operations to keep track of where + * they were before they returned -EAGAIN. A return code of -EAGAIN signals the + * calling function to roll the transaction, and then call the subroutine to + * finish the operation. The enum is then used by the subroutine to jump back + * to where it was and resume executing where it left off. + */ +enum xfs_delattr_state { + XFS_DAS_UNINIT = 0, /* No state has been set yet */ + + /* + * Initial sequence states. The replace setup code relies on the + * ADD and REMOVE states for a specific format to be sequential so + * that we can transform the initial operation to be performed + * according to the xfs_has_larp() state easily. + */ + XFS_DAS_SF_ADD, /* Initial sf add state */ + XFS_DAS_SF_REMOVE, /* Initial sf replace/remove state */ + + XFS_DAS_LEAF_ADD, /* Initial leaf add state */ + XFS_DAS_LEAF_REMOVE, /* Initial leaf replace/remove state */ + + XFS_DAS_NODE_ADD, /* Initial node add state */ + XFS_DAS_NODE_REMOVE, /* Initial node replace/remove state */ + + /* Leaf state set/replace/remove sequence */ + XFS_DAS_LEAF_SET_RMT, /* set a remote xattr from a leaf */ + XFS_DAS_LEAF_ALLOC_RMT, /* We are allocating remote blocks */ + XFS_DAS_LEAF_REPLACE, /* Perform replace ops on a leaf */ + XFS_DAS_LEAF_REMOVE_OLD, /* Start removing old attr from leaf */ + XFS_DAS_LEAF_REMOVE_RMT, /* A rename is removing remote blocks */ + XFS_DAS_LEAF_REMOVE_ATTR, /* Remove the old attr from a leaf */ + + /* Node state sequence, must match leaf state above */ + XFS_DAS_NODE_SET_RMT, /* set a remote xattr from a node */ + XFS_DAS_NODE_ALLOC_RMT, /* We are allocating remote blocks */ + XFS_DAS_NODE_REPLACE, /* Perform replace ops on a node */ + XFS_DAS_NODE_REMOVE_OLD, /* Start removing old attr from node */ + XFS_DAS_NODE_REMOVE_RMT, /* A rename is removing remote blocks */ + XFS_DAS_NODE_REMOVE_ATTR, /* Remove the old attr from a node */ + + XFS_DAS_DONE, /* finished operation */ +}; + +#define XFS_DAS_STRINGS \ + { XFS_DAS_UNINIT, "XFS_DAS_UNINIT" }, \ + { XFS_DAS_SF_ADD, "XFS_DAS_SF_ADD" }, \ + { XFS_DAS_SF_REMOVE, "XFS_DAS_SF_REMOVE" }, \ + { XFS_DAS_LEAF_ADD, "XFS_DAS_LEAF_ADD" }, \ + { XFS_DAS_LEAF_REMOVE, "XFS_DAS_LEAF_REMOVE" }, \ + { XFS_DAS_NODE_ADD, "XFS_DAS_NODE_ADD" }, \ + { XFS_DAS_NODE_REMOVE, "XFS_DAS_NODE_REMOVE" }, \ + { XFS_DAS_LEAF_SET_RMT, "XFS_DAS_LEAF_SET_RMT" }, \ + { XFS_DAS_LEAF_ALLOC_RMT, "XFS_DAS_LEAF_ALLOC_RMT" }, \ + { XFS_DAS_LEAF_REPLACE, "XFS_DAS_LEAF_REPLACE" }, \ + { XFS_DAS_LEAF_REMOVE_OLD, "XFS_DAS_LEAF_REMOVE_OLD" }, \ + { XFS_DAS_LEAF_REMOVE_RMT, "XFS_DAS_LEAF_REMOVE_RMT" }, \ + { XFS_DAS_LEAF_REMOVE_ATTR, "XFS_DAS_LEAF_REMOVE_ATTR" }, \ + { XFS_DAS_NODE_SET_RMT, "XFS_DAS_NODE_SET_RMT" }, \ + { XFS_DAS_NODE_ALLOC_RMT, "XFS_DAS_NODE_ALLOC_RMT" }, \ + { XFS_DAS_NODE_REPLACE, "XFS_DAS_NODE_REPLACE" }, \ + { XFS_DAS_NODE_REMOVE_OLD, "XFS_DAS_NODE_REMOVE_OLD" }, \ + { XFS_DAS_NODE_REMOVE_RMT, "XFS_DAS_NODE_REMOVE_RMT" }, \ + { XFS_DAS_NODE_REMOVE_ATTR, "XFS_DAS_NODE_REMOVE_ATTR" }, \ + { XFS_DAS_DONE, "XFS_DAS_DONE" } + +struct xfs_attri_log_nameval; + +/* + * Context used for keeping track of delayed attribute operations + */ +struct xfs_attr_intent { + /* + * used to log this item to an intent containing a list of attrs to + * commit later + */ + struct list_head xattri_list; + + /* Used in xfs_attr_node_removename to roll through removing blocks */ + struct xfs_da_state *xattri_da_state; + + struct xfs_da_args *xattri_da_args; + + /* + * Shared buffer containing the attr name and value so that the logging + * code can share large memory buffers between log items. + */ + struct xfs_attri_log_nameval *xattri_nameval; + + /* Used to keep track of current state of delayed operation */ + enum xfs_delattr_state xattri_dela_state; + + /* + * Attr operation being performed - XFS_ATTRI_OP_FLAGS_* + */ + unsigned int xattri_op_flags; + + /* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */ + xfs_dablk_t xattri_lblkno; + int xattri_blkcnt; + struct xfs_bmbt_irec xattri_map; +}; /*======================================================================== @@ -143,21 +538,84 @@ typedef struct xfs_attr_list_context { * Overall external interface routines. */ int xfs_attr_inactive(struct xfs_inode *dp); -int xfs_attr_list_int_ilocked(struct xfs_attr_list_context *); -int xfs_attr_list_int(struct xfs_attr_list_context *); +int xfs_attr_list_ilocked(struct xfs_attr_list_context *); +int xfs_attr_list(struct xfs_attr_list_context *); int xfs_inode_hasattr(struct xfs_inode *ip); -int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args); -int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, - size_t namelen, unsigned char **value, int *valuelenp, - int flags); -int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, - size_t namelen, unsigned char *value, int valuelen, int flags); -int xfs_attr_set_args(struct xfs_da_args *args); -int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, - size_t namelen, int flags); -int xfs_attr_remove_args(struct xfs_da_args *args); -int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, - int flags, struct attrlist_cursor_kern *cursor); +bool xfs_attr_is_leaf(struct xfs_inode *ip); +int xfs_attr_get_ilocked(struct xfs_da_args *args); +int xfs_attr_get(struct xfs_da_args *args); +int xfs_attr_set(struct xfs_da_args *args); +int xfs_attr_set_iter(struct xfs_attr_intent *attr); +int xfs_attr_remove_iter(struct xfs_attr_intent *attr); bool xfs_attr_namecheck(const void *name, size_t length); +int xfs_attr_calc_size(struct xfs_da_args *args, int *local); +void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres, + unsigned int *total); + +/* + * Check to see if the attr should be upgraded from non-existent or shortform to + * single-leaf-block attribute list. + */ +static inline bool +xfs_attr_is_shortform( + struct xfs_inode *ip) +{ + return ip->i_af.if_format == XFS_DINODE_FMT_LOCAL || + (ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS && + ip->i_af.if_nextents == 0); +} + +static inline enum xfs_delattr_state +xfs_attr_init_add_state(struct xfs_da_args *args) +{ + /* + * When called from the completion of a attr remove to determine the + * next state, the attribute fork may be null. This can occur only occur + * on a pure remove, but we grab the next state before we check if a + * replace operation is being performed. If we are called from any other + * context, i_af is guaranteed to exist. Hence if the attr fork is + * null, we were called from a pure remove operation and so we are done. + */ + if (!xfs_inode_has_attr_fork(args->dp)) + return XFS_DAS_DONE; + + args->op_flags |= XFS_DA_OP_ADDNAME; + if (xfs_attr_is_shortform(args->dp)) + return XFS_DAS_SF_ADD; + if (xfs_attr_is_leaf(args->dp)) + return XFS_DAS_LEAF_ADD; + return XFS_DAS_NODE_ADD; +} + +static inline enum xfs_delattr_state +xfs_attr_init_remove_state(struct xfs_da_args *args) +{ + args->op_flags |= XFS_DA_OP_REMOVE; + if (xfs_attr_is_shortform(args->dp)) + return XFS_DAS_SF_REMOVE; + if (xfs_attr_is_leaf(args->dp)) + return XFS_DAS_LEAF_REMOVE; + return XFS_DAS_NODE_REMOVE; +} + +/* + * If we are logging the attributes, then we have to start with removal of the + * old attribute so that there is always consistent state that we can recover + * from if the system goes down part way through. We always log the new attr + * value, so even when we remove the attr first we still have the information in + * the log to finish the replace operation atomically. + */ +static inline enum xfs_delattr_state +xfs_attr_init_replace_state(struct xfs_da_args *args) +{ + args->op_flags |= XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE; + if (args->op_flags & XFS_DA_OP_LOGGED) + return xfs_attr_init_remove_state(args); + return xfs_attr_init_add_state(args); +} + +extern struct kmem_cache *xfs_attr_intent_cache; +int __init xfs_attr_intent_init_cache(void); +void xfs_attr_intent_destroy_cache(void); #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index fed537a4353d..beee51ad75ce 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -19,14 +19,16 @@ #include "xfs_bmap_btree.h" #include "xfs_bmap.h" #include "xfs_attr_sf.h" -#include "xfs_attr_remote.h" #include "xfs_attr.h" +#include "xfs_attr_remote.h" #include "xfs_attr_leaf.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_buf_item.h" #include "xfs_dir2.h" #include "xfs_log.h" +#include "xfs_ag.h" +#include "xfs_errortag.h" /* @@ -287,6 +289,23 @@ xfs_attr3_leaf_verify_entry( return NULL; } +/* + * Validate an attribute leaf block. + * + * Empty leaf blocks can occur under the following circumstances: + * + * 1. setxattr adds a new extended attribute to a file; + * 2. The file has zero existing attributes; + * 3. The attribute is too large to fit in the attribute fork; + * 4. The attribute is small enough to fit in a leaf block; + * 5. A log flush occurs after committing the transaction that creates + * the (empty) leaf block; and + * 6. The filesystem goes down after the log flush but before the new + * attribute can be committed to the leaf block. + * + * Hence we need to ensure that we don't fail the validation purely + * because the leaf is empty. + */ static xfs_failaddr_t xfs_attr3_leaf_verify( struct xfs_buf *bp) @@ -309,14 +328,6 @@ xfs_attr3_leaf_verify( return fa; /* - * In recovery there is a transient state where count == 0 is valid - * because we may have transitioned an empty shortform attr to a leaf - * if the attr didn't fit in shortform. - */ - if (!xfs_log_in_recovery(mp) && ichdr.count == 0) - return __this_address; - - /* * firstused is the block offset of the first name info structure. * Make sure it doesn't go off the block or crash into the header. */ @@ -331,6 +342,13 @@ xfs_attr3_leaf_verify( (char *)bp->b_addr + ichdr.firstused) return __this_address; + /* + * NOTE: This verifier historically failed empty leaf buffers because + * we expect the fork to be in another format. Empty attr fork format + * conversions are possible during xattr set, however, and format + * conversion is not atomic with the xattr set that triggers it. We + * cannot assume leaf blocks are non-empty until that is addressed. + */ buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize; for (i = 0, ent = entries; i < ichdr.count; ent++, i++) { fa = xfs_attr3_leaf_verify_entry(mp, buf_end, leaf, &ichdr, @@ -384,7 +402,7 @@ xfs_attr3_leaf_write_verify( return; } - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (bip) @@ -406,7 +424,7 @@ xfs_attr3_leaf_read_verify( struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; - if (xfs_sb_version_hascrc(&mp->m_sb) && + if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { @@ -446,13 +464,36 @@ xfs_attr3_leaf_read( *========================================================================*/ /* - * If namespace bits don't match return 0. - * If all match then return 1. + * If we are in log recovery, then we want the lookup to ignore the INCOMPLETE + * flag on disk - if there's an incomplete attr then recovery needs to tear it + * down. If there's no incomplete attr, then recovery needs to tear that attr + * down to replace it with the attr that has been logged. In this case, the + * INCOMPLETE flag will not be set in attr->attr_filter, but rather + * XFS_DA_OP_RECOVERY will be set in args->op_flags. */ -STATIC int -xfs_attr_namesp_match(int arg_flags, int ondisk_flags) +static bool +xfs_attr_match( + struct xfs_da_args *args, + uint8_t namelen, + unsigned char *name, + int flags) { - return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags); + + if (args->namelen != namelen) + return false; + if (memcmp(args->name, name, namelen) != 0) + return false; + + /* Recovery ignores the INCOMPLETE flag. */ + if ((args->op_flags & XFS_DA_OP_RECOVERY) && + args->attr_filter == (flags & XFS_ATTR_NSP_ONDISK_MASK)) + return true; + + /* All remaining matches need to be filtered by INCOMPLETE state. */ + if (args->attr_filter != + (flags & (XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE))) + return false; + return true; } static int @@ -464,7 +505,7 @@ xfs_attr_copy_value( /* * No copy if all we have to do is get the length */ - if (args->flags & ATTR_KERNOVAL) { + if (!args->valuelen) { args->valuelen = valuelen; return 0; } @@ -477,8 +518,8 @@ xfs_attr_copy_value( return -ERANGE; } - if (args->op_flags & XFS_DA_OP_ALLOCVAL) { - args->value = kmem_alloc_large(valuelen, 0); + if (!args->value) { + args->value = kvmalloc(valuelen, GFP_KERNEL | __GFP_NOLOCKDEP); if (!args->value) return -ENOMEM; } @@ -505,13 +546,13 @@ xfs_attr_copy_value( *========================================================================*/ /* - * Query whether the requested number of additional bytes of extended + * Query whether the total requested number of attr fork bytes of extended * attribute space will be able to fit inline. * - * Returns zero if not, else the di_forkoff fork offset to be used in the + * Returns zero if not, else the i_forkoff fork offset to be used in the * literal area for attribute data once the new bytes have been added. * - * di_forkoff must be 8 byte aligned, hence is stored as a >>3 value; + * i_forkoff must be 8 byte aligned, hence is stored as a >>3 value; * special case for dev/uuid inodes, they have fixed size data forks. */ int @@ -525,10 +566,16 @@ xfs_attr_shortform_bytesfit( int maxforkoff; int offset; + /* + * Check if the new size could fit at all first: + */ + if (bytes > XFS_LITINO(mp)) + return 0; + /* rounded down */ - offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3; + offset = (XFS_LITINO(mp) - bytes) >> 3; - if (dp->i_d.di_format == XFS_DINODE_FMT_DEV) { + if (dp->i_df.if_format == XFS_DINODE_FMT_DEV) { minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; return (offset >= minforkoff) ? minforkoff : 0; } @@ -543,29 +590,29 @@ xfs_attr_shortform_bytesfit( * to real extents, or the delalloc conversion will take care of the * literal area rebalancing. */ - if (bytes <= XFS_IFORK_ASIZE(dp)) - return dp->i_d.di_forkoff; + if (bytes <= xfs_inode_attr_fork_size(dp)) + return dp->i_forkoff; /* * For attr2 we can try to move the forkoff if there is space in the * literal area, but for the old format we are done if there is no * space in the fixed attribute fork. */ - if (!(mp->m_flags & XFS_MOUNT_ATTR2)) + if (!xfs_has_attr2(mp)) return 0; dsize = dp->i_df.if_bytes; - switch (dp->i_d.di_format) { + switch (dp->i_df.if_format) { case XFS_DINODE_FMT_EXTENTS: /* - * If there is no attr fork and the data fork is extents, + * If there is no attr fork and the data fork is extents, * determine if creating the default attr fork will result * in the extents form migrating to btree. If so, the * minimum offset only needs to be the space required for * the btree root. */ - if (!dp->i_d.di_forkoff && dp->i_df.if_bytes > + if (!dp->i_forkoff && dp->i_df.if_bytes > xfs_default_attroffset(dp)) dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS); break; @@ -576,10 +623,10 @@ xfs_attr_shortform_bytesfit( * minforkoff to where the btree root can finish so we have * plenty of room for attrs */ - if (dp->i_d.di_forkoff) { - if (offset < dp->i_d.di_forkoff) + if (dp->i_forkoff) { + if (offset < dp->i_forkoff) return 0; - return dp->i_d.di_forkoff; + return dp->i_forkoff; } dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot); break; @@ -593,8 +640,7 @@ xfs_attr_shortform_bytesfit( minforkoff = roundup(minforkoff, 8) >> 3; /* attr fork btree root can have at least this many key/ptr pairs */ - maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) - - XFS_BMDR_SPACE_CALC(MINABTPTRS); + maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); maxforkoff = maxforkoff >> 3; /* rounded down */ if (offset >= maxforkoff) @@ -605,99 +651,134 @@ xfs_attr_shortform_bytesfit( } /* - * Switch on the ATTR2 superblock bit (implies also FEATURES2) + * Switch on the ATTR2 superblock bit (implies also FEATURES2) unless: + * - noattr2 mount option is set, + * - on-disk version bit says it is already set, or + * - the attr2 mount option is not set to enable automatic upgrade from attr1. */ STATIC void -xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp) +xfs_sbversion_add_attr2( + struct xfs_mount *mp, + struct xfs_trans *tp) { - if ((mp->m_flags & XFS_MOUNT_ATTR2) && - !(xfs_sb_version_hasattr2(&mp->m_sb))) { - spin_lock(&mp->m_sb_lock); - if (!xfs_sb_version_hasattr2(&mp->m_sb)) { - xfs_sb_version_addattr2(&mp->m_sb); - spin_unlock(&mp->m_sb_lock); - xfs_log_sb(tp); - } else - spin_unlock(&mp->m_sb_lock); - } + if (xfs_has_noattr2(mp)) + return; + if (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT) + return; + if (!xfs_has_attr2(mp)) + return; + + spin_lock(&mp->m_sb_lock); + xfs_add_attr2(mp); + spin_unlock(&mp->m_sb_lock); + xfs_log_sb(tp); } /* * Create the initial contents of a shortform attribute list. */ void -xfs_attr_shortform_create(xfs_da_args_t *args) +xfs_attr_shortform_create( + struct xfs_da_args *args) { - xfs_attr_sf_hdr_t *hdr; - xfs_inode_t *dp; - struct xfs_ifork *ifp; + struct xfs_inode *dp = args->dp; + struct xfs_ifork *ifp = &dp->i_af; + struct xfs_attr_sf_hdr *hdr; trace_xfs_attr_sf_create(args); - dp = args->dp; - ASSERT(dp != NULL); - ifp = dp->i_afp; - ASSERT(ifp != NULL); ASSERT(ifp->if_bytes == 0); - if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) { - ifp->if_flags &= ~XFS_IFEXTENTS; /* just in case */ - dp->i_d.di_aformat = XFS_DINODE_FMT_LOCAL; - ifp->if_flags |= XFS_IFINLINE; - } else { - ASSERT(ifp->if_flags & XFS_IFINLINE); - } + if (ifp->if_format == XFS_DINODE_FMT_EXTENTS) + ifp->if_format = XFS_DINODE_FMT_LOCAL; xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK); - hdr = (xfs_attr_sf_hdr_t *)ifp->if_u1.if_data; - hdr->count = 0; + hdr = (struct xfs_attr_sf_hdr *)ifp->if_u1.if_data; + memset(hdr, 0, sizeof(*hdr)); hdr->totsize = cpu_to_be16(sizeof(*hdr)); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); } /* + * Return -EEXIST if attr is found, or -ENOATTR if not + * args: args containing attribute name and namelen + * sfep: If not null, pointer will be set to the last attr entry found on + -EEXIST. On -ENOATTR pointer is left at the last entry in the list + * basep: If not null, pointer is set to the byte offset of the entry in the + * list on -EEXIST. On -ENOATTR, pointer is left at the byte offset of + * the last entry in the list + */ +int +xfs_attr_sf_findname( + struct xfs_da_args *args, + struct xfs_attr_sf_entry **sfep, + unsigned int *basep) +{ + struct xfs_attr_shortform *sf; + struct xfs_attr_sf_entry *sfe; + unsigned int base = sizeof(struct xfs_attr_sf_hdr); + int size = 0; + int end; + int i; + + sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data; + sfe = &sf->list[0]; + end = sf->hdr.count; + for (i = 0; i < end; sfe = xfs_attr_sf_nextentry(sfe), + base += size, i++) { + size = xfs_attr_sf_entsize(sfe); + if (!xfs_attr_match(args, sfe->namelen, sfe->nameval, + sfe->flags)) + continue; + break; + } + + if (sfep != NULL) + *sfep = sfe; + + if (basep != NULL) + *basep = base; + + if (i == end) + return -ENOATTR; + return -EEXIST; +} + +/* * Add a name/value pair to the shortform attribute list. * Overflow from the inode has already been checked for. */ void -xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) +xfs_attr_shortform_add( + struct xfs_da_args *args, + int forkoff) { - xfs_attr_shortform_t *sf; - xfs_attr_sf_entry_t *sfe; - int i, offset, size; - xfs_mount_t *mp; - xfs_inode_t *dp; - struct xfs_ifork *ifp; + struct xfs_attr_shortform *sf; + struct xfs_attr_sf_entry *sfe; + int offset, size; + struct xfs_mount *mp; + struct xfs_inode *dp; + struct xfs_ifork *ifp; trace_xfs_attr_sf_add(args); dp = args->dp; mp = dp->i_mount; - dp->i_d.di_forkoff = forkoff; + dp->i_forkoff = forkoff; - ifp = dp->i_afp; - ASSERT(ifp->if_flags & XFS_IFINLINE); - sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; - sfe = &sf->list[0]; - for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { -#ifdef DEBUG - if (sfe->namelen != args->namelen) - continue; - if (memcmp(args->name, sfe->nameval, args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, sfe->flags)) - continue; + ifp = &dp->i_af; + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); + sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; + if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST) ASSERT(0); -#endif - } offset = (char *)sfe - (char *)sf; - size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen); + size = xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); xfs_idata_realloc(dp, size, XFS_ATTR_FORK); - sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; - sfe = (xfs_attr_sf_entry_t *)((char *)sf + offset); + sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; + sfe = (struct xfs_attr_sf_entry *)((char *)sf + offset); sfe->namelen = args->namelen; sfe->valuelen = args->valuelen; - sfe->flags = XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); + sfe->flags = args->attr_filter; memcpy(sfe->nameval, args->name, args->namelen); memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen); sf->hdr.count++; @@ -716,13 +797,10 @@ xfs_attr_fork_remove( struct xfs_inode *ip, struct xfs_trans *tp) { - xfs_idestroy_fork(ip, XFS_ATTR_FORK); - ip->i_d.di_forkoff = 0; - ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; - - ASSERT(ip->i_d.di_anextents == 0); - ASSERT(ip->i_afp == NULL); + ASSERT(ip->i_af.if_nextents == 0); + xfs_ifork_zap_attr(ip); + ip->i_forkoff = 0; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -730,35 +808,35 @@ xfs_attr_fork_remove( * Remove an attribute from the shortform attribute list structure. */ int -xfs_attr_shortform_remove(xfs_da_args_t *args) +xfs_attr_sf_removename( + struct xfs_da_args *args) { - xfs_attr_shortform_t *sf; - xfs_attr_sf_entry_t *sfe; - int base, size=0, end, totsize, i; - xfs_mount_t *mp; - xfs_inode_t *dp; + struct xfs_attr_shortform *sf; + struct xfs_attr_sf_entry *sfe; + int size = 0, end, totsize; + unsigned int base; + struct xfs_mount *mp; + struct xfs_inode *dp; + int error; trace_xfs_attr_sf_remove(args); dp = args->dp; mp = dp->i_mount; - base = sizeof(xfs_attr_sf_hdr_t); - sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; - sfe = &sf->list[0]; - end = sf->hdr.count; - for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), - base += size, i++) { - size = XFS_ATTR_SF_ENTSIZE(sfe); - if (sfe->namelen != args->namelen) - continue; - if (memcmp(sfe->nameval, args->name, args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, sfe->flags)) - continue; - break; - } - if (i == end) - return -ENOATTR; + sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data; + + error = xfs_attr_sf_findname(args, &sfe, &base); + + /* + * If we are recovering an operation, finding nothing to + * remove is not an error - it just means there was nothing + * to clean up. + */ + if (error == -ENOATTR && (args->op_flags & XFS_DA_OP_RECOVERY)) + return 0; + if (error != -EEXIST) + return error; + size = xfs_attr_sf_entsize(sfe); /* * Fix up the attribute fork data, covering the hole @@ -774,19 +852,18 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) * Fix up the start offset of the attribute fork */ totsize -= size; - if (totsize == sizeof(xfs_attr_sf_hdr_t) && - (mp->m_flags & XFS_MOUNT_ATTR2) && - (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && - !(args->op_flags & XFS_DA_OP_ADDNAME)) { + if (totsize == sizeof(xfs_attr_sf_hdr_t) && xfs_has_attr2(mp) && + (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && + !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) { xfs_attr_fork_remove(dp, args->trans); } else { xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); - dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); - ASSERT(dp->i_d.di_forkoff); + dp->i_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); + ASSERT(dp->i_forkoff); ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || (args->op_flags & XFS_DA_OP_ADDNAME) || - !(mp->m_flags & XFS_MOUNT_ATTR2) || - dp->i_d.di_format == XFS_DINODE_FMT_BTREE); + !xfs_has_attr2(mp) || + dp->i_df.if_format == XFS_DINODE_FMT_BTREE); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); } @@ -803,26 +880,22 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) int xfs_attr_shortform_lookup(xfs_da_args_t *args) { - xfs_attr_shortform_t *sf; - xfs_attr_sf_entry_t *sfe; + struct xfs_attr_shortform *sf; + struct xfs_attr_sf_entry *sfe; int i; struct xfs_ifork *ifp; trace_xfs_attr_sf_lookup(args); - ifp = args->dp->i_afp; - ASSERT(ifp->if_flags & XFS_IFINLINE); - sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + ifp = &args->dp->i_af; + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); + sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { - if (sfe->namelen != args->namelen) - continue; - if (memcmp(args->name, sfe->nameval, args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, sfe->flags)) - continue; - return -EEXIST; + sfe = xfs_attr_sf_nextentry(sfe), i++) { + if (xfs_attr_match(args, sfe->namelen, sfe->nameval, + sfe->flags)) + return -EEXIST; } return -ENOATTR; } @@ -830,9 +903,9 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) /* * Retrieve the attribute value and length. * - * If ATTR_KERNOVAL is specified, only the length needs to be returned. - * Unlike a lookup, we only return an error if the attribute does not - * exist or we can't retrieve the value. + * If args->valuelen is zero, only the length needs to be returned. Unlike a + * lookup, we only return an error if the attribute does not exist or we can't + * retrieve the value. */ int xfs_attr_shortform_getvalue( @@ -842,31 +915,23 @@ xfs_attr_shortform_getvalue( struct xfs_attr_sf_entry *sfe; int i; - ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE); - sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data; + ASSERT(args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL); + sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data; sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { - if (sfe->namelen != args->namelen) - continue; - if (memcmp(args->name, sfe->nameval, args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, sfe->flags)) - continue; - return xfs_attr_copy_value(args, &sfe->nameval[args->namelen], - sfe->valuelen); + sfe = xfs_attr_sf_nextentry(sfe), i++) { + if (xfs_attr_match(args, sfe->namelen, sfe->nameval, + sfe->flags)) + return xfs_attr_copy_value(args, + &sfe->nameval[args->namelen], sfe->valuelen); } return -ENOATTR; } -/* - * Convert from using the shortform to the leaf. On success, return the - * buffer so that we can keep it locked until we're totally done with it. - */ +/* Convert from using the shortform to the leaf format. */ int xfs_attr_shortform_to_leaf( - struct xfs_da_args *args, - struct xfs_buf **leaf_bp) + struct xfs_da_args *args) { struct xfs_inode *dp; struct xfs_attr_shortform *sf; @@ -881,13 +946,13 @@ xfs_attr_shortform_to_leaf( trace_xfs_attr_sf_to_leaf(args); dp = args->dp; - ifp = dp->i_afp; - sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + ifp = &dp->i_af; + sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; size = be16_to_cpu(sf->hdr.totsize); tmpbuffer = kmem_alloc(size, 0); ASSERT(tmpbuffer != NULL); memcpy(tmpbuffer, ifp->if_u1.if_data, size); - sf = (xfs_attr_shortform_t *)tmpbuffer; + sf = (struct xfs_attr_shortform *)tmpbuffer; xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK); @@ -918,17 +983,16 @@ xfs_attr_shortform_to_leaf( nargs.valuelen = sfe->valuelen; nargs.hashval = xfs_da_hashname(sfe->nameval, sfe->namelen); - nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags); + nargs.attr_filter = sfe->flags & XFS_ATTR_NSP_ONDISK_MASK; error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */ ASSERT(error == -ENOATTR); error = xfs_attr3_leaf_add(bp, &nargs); ASSERT(error != -ENOSPC); if (error) goto out; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + sfe = xfs_attr_sf_nextentry(sfe); } error = 0; - *leaf_bp = bp; out: kmem_free(tmpbuffer); return error; @@ -966,12 +1030,11 @@ xfs_attr_shortform_allfit( return 0; if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX) return 0; - bytes += sizeof(struct xfs_attr_sf_entry) - 1 - + name_loc->namelen - + be16_to_cpu(name_loc->valuelen); + bytes += xfs_attr_sf_entsize_byname(name_loc->namelen, + be16_to_cpu(name_loc->valuelen)); } - if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) && - (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && + if (xfs_has_attr2(dp->i_mount) && + (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && (bytes == sizeof(struct xfs_attr_sf_hdr))) return -1; return xfs_attr_shortform_bytesfit(dp, bytes); @@ -990,8 +1053,8 @@ xfs_attr_shortform_verify( int i; int64_t size; - ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL); - ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK); + ASSERT(ip->i_af.if_format == XFS_DINODE_FMT_LOCAL); + ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK); sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data; size = ifp->if_bytes; @@ -1010,6 +1073,8 @@ xfs_attr_shortform_verify( * struct xfs_attr_sf_entry has a variable length. * Check the fixed-offset parts of the structure are * within the data buffer. + * xfs_attr_sf_entry is defined with a 1-byte variable + * array at the end, so we must subtract that off. */ if (((char *)sfep + sizeof(*sfep)) >= endp) return __this_address; @@ -1023,7 +1088,7 @@ xfs_attr_shortform_verify( * within the data buffer. The next entry starts after the * name component, so nextentry is an acceptable test. */ - next_sfep = XFS_ATTR_SF_NEXTENTRY(sfep); + next_sfep = xfs_attr_sf_nextentry(sfep); if ((char *)next_sfep > endp) return __this_address; @@ -1093,9 +1158,17 @@ xfs_attr3_leaf_to_shortform( goto out; if (forkoff == -1) { - ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2); - ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE); - xfs_attr_fork_remove(dp, args->trans); + /* + * Don't remove the attr fork if this operation is the first + * part of a attr replace operations. We're going to add a new + * attr immediately, so we need to keep the attr fork around in + * this case. + */ + if (!(args->op_flags & XFS_DA_OP_REPLACE)) { + ASSERT(xfs_has_attr2(dp->i_mount)); + ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE); + xfs_attr_fork_remove(dp, args->trans); + } goto out; } @@ -1124,7 +1197,7 @@ xfs_attr3_leaf_to_shortform( nargs.value = &name_loc->nameval[nargs.namelen]; nargs.valuelen = be16_to_cpu(name_loc->valuelen); nargs.hashval = be32_to_cpu(entry->hashval); - nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags); + nargs.attr_filter = entry->flags & XFS_ATTR_NSP_ONDISK_MASK; xfs_attr_shortform_add(&nargs, forkoff); } error = 0; @@ -1155,6 +1228,11 @@ xfs_attr3_leaf_to_node( trace_xfs_attr_leaf_to_node(args); + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) { + error = -EIO; + goto out; + } + error = xfs_da_grow_inode(args, &blkno); if (error) goto out; @@ -1170,9 +1248,9 @@ xfs_attr3_leaf_to_node( xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF); bp2->b_ops = bp1->b_ops; memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize); - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_da3_blkinfo *hdr3 = bp2->b_addr; - hdr3->blkno = cpu_to_be64(bp2->b_bn); + hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp2)); } xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1); @@ -1235,12 +1313,12 @@ xfs_attr3_leaf_create( memset(&ichdr, 0, sizeof(ichdr)); ichdr.firstused = args->geo->blksize; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_da3_blkinfo *hdr3 = bp->b_addr; ichdr.magic = XFS_ATTR3_LEAF_MAGIC; - hdr3->blkno = cpu_to_be64(bp->b_bn); + hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp)); hdr3->owner = cpu_to_be64(dp->i_ino); uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); @@ -1449,10 +1527,12 @@ xfs_attr3_leaf_add_work( entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base + ichdr->freemap[mapindex].size); entry->hashval = cpu_to_be32(args->hashval); - entry->flags = tmp ? XFS_ATTR_LOCAL : 0; - entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); - if (args->op_flags & XFS_DA_OP_RENAME) { - entry->flags |= XFS_ATTR_INCOMPLETE; + entry->flags = args->attr_filter; + if (tmp) + entry->flags |= XFS_ATTR_LOCAL; + if (args->op_flags & XFS_DA_OP_REPLACE) { + if (!(args->op_flags & XFS_DA_OP_LOGGED)) + entry->flags |= XFS_ATTR_INCOMPLETE; if ((args->blkno2 == args->blkno) && (args->index2 <= args->index)) { args->index2++; @@ -2346,7 +2426,7 @@ xfs_attr3_leaf_lookup_int( xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); if (ichdr.count >= args->geo->blksize / 8) { - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; } @@ -2365,11 +2445,11 @@ xfs_attr3_leaf_lookup_int( break; } if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) { - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; } if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) { - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; } @@ -2399,33 +2479,17 @@ xfs_attr3_leaf_lookup_int( /* * GROT: Add code to remove incomplete entries. */ - /* - * If we are looking for INCOMPLETE entries, show only those. - * If we are looking for complete entries, show only those. - */ - if (!!(args->op_flags & XFS_DA_OP_INCOMPLETE) != - !!(entry->flags & XFS_ATTR_INCOMPLETE)) { - continue; - } if (entry->flags & XFS_ATTR_LOCAL) { name_loc = xfs_attr3_leaf_name_local(leaf, probe); - if (name_loc->namelen != args->namelen) - continue; - if (memcmp(args->name, name_loc->nameval, - args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, entry->flags)) + if (!xfs_attr_match(args, name_loc->namelen, + name_loc->nameval, entry->flags)) continue; args->index = probe; return -EEXIST; } else { name_rmt = xfs_attr3_leaf_name_remote(leaf, probe); - if (name_rmt->namelen != args->namelen) - continue; - if (memcmp(args->name, name_rmt->name, - args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, entry->flags)) + if (!xfs_attr_match(args, name_rmt->namelen, + name_rmt->name, entry->flags)) continue; args->index = probe; args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); @@ -2444,9 +2508,9 @@ xfs_attr3_leaf_lookup_int( * Get the value associated with an attribute name from a leaf attribute * list structure. * - * If ATTR_KERNOVAL is specified, only the length needs to be returned. - * Unlike a lookup, we only return an error if the attribute does not - * exist or we can't retrieve the value. + * If args->valuelen is zero, only the length needs to be returned. Unlike a + * lookup, we only return an error if the attribute does not exist or we can't + * retrieve the value. */ int xfs_attr3_leaf_getvalue( @@ -2771,10 +2835,7 @@ xfs_attr3_leaf_clearflag( XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); } - /* - * Commit the flag value change and start the next trans in series. - */ - return xfs_trans_roll_inode(&args->trans, args->dp); + return 0; } /* @@ -2822,10 +2883,7 @@ xfs_attr3_leaf_setflag( XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); } - /* - * Commit the flag value change and start the next trans in series. - */ - return xfs_trans_roll_inode(&args->trans, args->dp); + return 0; } /* @@ -2940,10 +2998,5 @@ xfs_attr3_leaf_flipflags( XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt))); } - /* - * Commit the flag value change and start the next trans in series. - */ - error = xfs_trans_roll_inode(&args->trans, args->dp); - - return error; + return 0; } diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 73615b1dd1a8..368f4d9fa1d5 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. @@ -8,7 +8,6 @@ #define __XFS_ATTR_LEAF_H__ struct attrlist; -struct attrlist_cursor_kern; struct xfs_attr_list_context; struct xfs_da_args; struct xfs_da_state; @@ -50,9 +49,11 @@ void xfs_attr_shortform_create(struct xfs_da_args *args); void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff); int xfs_attr_shortform_lookup(struct xfs_da_args *args); int xfs_attr_shortform_getvalue(struct xfs_da_args *args); -int xfs_attr_shortform_to_leaf(struct xfs_da_args *args, - struct xfs_buf **leaf_bp); -int xfs_attr_shortform_remove(struct xfs_da_args *args); +int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); +int xfs_attr_sf_removename(struct xfs_da_args *args); +int xfs_attr_sf_findname(struct xfs_da_args *args, + struct xfs_attr_sf_entry **sfep, + unsigned int *basep); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes); xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_inode *ip); diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 8b7f74b3bea2..d440393b40eb 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -51,7 +51,7 @@ xfs_attr3_rmt_blocks( struct xfs_mount *mp, int attrlen) { - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize); return (attrlen + buflen - 1) / buflen; } @@ -96,8 +96,6 @@ xfs_attr3_rmt_verify( { struct xfs_attr3_rmt_hdr *rmt = ptr; - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return __this_address; if (!xfs_verify_magic(bp, rmt->rm_magic)) return __this_address; if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid)) @@ -128,11 +126,11 @@ __xfs_attr3_rmt_read_verify( int blksize = mp->m_attr_geo->blksize; /* no verification of non-crc buffers */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return 0; ptr = bp->b_addr; - bno = bp->b_bn; + bno = xfs_buf_daddr(bp); len = BBTOB(bp->b_length); ASSERT(len >= blksize); @@ -193,11 +191,11 @@ xfs_attr3_rmt_write_verify( xfs_daddr_t bno; /* no verification of non-crc buffers */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; ptr = bp->b_addr; - bno = bp->b_bn; + bno = xfs_buf_daddr(bp); len = BBTOB(bp->b_length); ASSERT(len >= blksize); @@ -248,7 +246,7 @@ xfs_attr3_rmt_hdr_set( { struct xfs_attr3_rmt_hdr *rmt = ptr; - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return 0; rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC); @@ -286,7 +284,7 @@ xfs_attr_rmtval_copyout( uint8_t **dst) { char *src = bp->b_addr; - xfs_daddr_t bno = bp->b_bn; + xfs_daddr_t bno = xfs_buf_daddr(bp); int len = BBTOB(bp->b_length); int blksize = mp->m_attr_geo->blksize; @@ -298,7 +296,7 @@ xfs_attr_rmtval_copyout( byte_cnt = min(*valuelen, byte_cnt); - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { if (xfs_attr3_rmt_hdr_ok(src, ino, *offset, byte_cnt, bno)) { xfs_alert(mp, @@ -334,7 +332,7 @@ xfs_attr_rmtval_copyin( uint8_t **src) { char *dst = bp->b_addr; - xfs_daddr_t bno = bp->b_bn; + xfs_daddr_t bno = xfs_buf_daddr(bp); int len = BBTOB(bp->b_length); int blksize = mp->m_attr_geo->blksize; @@ -397,7 +395,7 @@ xfs_attr_rmtval_get( trace_xfs_attr_rmtval_get(args); - ASSERT(!(args->flags & ATTR_KERNOVAL)); + ASSERT(args->valuelen != 0); ASSERT(args->rmtvaluelen == args->valuelen); valuelen = args->rmtvaluelen; @@ -440,32 +438,23 @@ xfs_attr_rmtval_get( } /* - * Write the value associated with an attribute into the out-of-line buffer - * that we have defined for it. + * Find a "hole" in the attribute address space large enough for us to drop the + * new attributes value into */ int -xfs_attr_rmtval_set( +xfs_attr_rmt_find_hole( struct xfs_da_args *args) { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; - struct xfs_bmbt_irec map; - xfs_dablk_t lblkno; - xfs_fileoff_t lfileoff = 0; - uint8_t *src = args->value; - int blkcnt; - int valuelen; - int nmap; int error; - int offset = 0; - - trace_xfs_attr_rmtval_set(args); + int blkcnt; + xfs_fileoff_t lfileoff = 0; /* - * Find a "hole" in the attribute address space large enough for - * us to drop the new attribute's value into. Because CRC enable - * attributes have headers, we can't just do a straight byte to FSB - * conversion and have to take the header space into account. + * Because CRC enable attributes have headers, we can't just do a + * straight byte to FSB conversion and have to take the header space + * into account. */ blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen); error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, @@ -473,48 +462,26 @@ xfs_attr_rmtval_set( if (error) return error; - args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff; + args->rmtblkno = (xfs_dablk_t)lfileoff; args->rmtblkcnt = blkcnt; - /* - * Roll through the "value", allocating blocks on disk as required. - */ - while (blkcnt > 0) { - /* - * Allocate a single extent, up to the size of the value. - * - * Note that we have to consider this a data allocation as we - * write the remote attribute without logging the contents. - * Hence we must ensure that we aren't using blocks that are on - * the busy list so that we don't overwrite blocks which have - * recently been freed but their transactions are not yet - * committed to disk. If we overwrite the contents of a busy - * extent and then crash then the block may not contain the - * correct metadata after log recovery occurs. - */ - nmap = 1; - error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, - blkcnt, XFS_BMAPI_ATTRFORK, args->total, &map, - &nmap); - if (error) - return error; - error = xfs_defer_finish(&args->trans); - if (error) - return error; - - ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - lblkno += map.br_blockcount; - blkcnt -= map.br_blockcount; + return 0; +} - /* - * Start the next trans in the chain. - */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - return error; - } +int +xfs_attr_rmtval_set_value( + struct xfs_da_args *args) +{ + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_bmbt_irec map; + xfs_dablk_t lblkno; + uint8_t *src = args->value; + int blkcnt; + int valuelen; + int nmap; + int error; + int offset = 0; /* * Roll through the "value", copying the attribute value to the @@ -576,6 +543,7 @@ xfs_attr_rmtval_stale( { struct xfs_mount *mp = ip->i_mount; struct xfs_buf *bp; + int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -583,14 +551,82 @@ xfs_attr_rmtval_stale( XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK)) return -EFSCORRUPTED; - bp = xfs_buf_incore(mp->m_ddev_targp, + error = xfs_buf_incore(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map->br_startblock), - XFS_FSB_TO_BB(mp, map->br_blockcount), incore_flags); - if (bp) { - xfs_buf_stale(bp); - xfs_buf_relse(bp); + XFS_FSB_TO_BB(mp, map->br_blockcount), + incore_flags, &bp); + if (error) { + if (error == -ENOENT) + return 0; + return error; } + xfs_buf_stale(bp); + xfs_buf_relse(bp); + return 0; +} + +/* + * Find a hole for the attr and store it in the delayed attr context. This + * initializes the context to roll through allocating an attr extent for a + * delayed attr operation + */ +int +xfs_attr_rmtval_find_space( + struct xfs_attr_intent *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_bmbt_irec *map = &attr->xattri_map; + int error; + + attr->xattri_lblkno = 0; + attr->xattri_blkcnt = 0; + args->rmtblkcnt = 0; + args->rmtblkno = 0; + memset(map, 0, sizeof(struct xfs_bmbt_irec)); + + error = xfs_attr_rmt_find_hole(args); + if (error) + return error; + + attr->xattri_blkcnt = args->rmtblkcnt; + attr->xattri_lblkno = args->rmtblkno; + + return 0; +} + +/* + * Write one block of the value associated with an attribute into the + * out-of-line buffer that we have defined for it. This is similar to a subset + * of xfs_attr_rmtval_set, but records the current block to the delayed attr + * context, and leaves transaction handling to the caller. + */ +int +xfs_attr_rmtval_set_blk( + struct xfs_attr_intent *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_inode *dp = args->dp; + struct xfs_bmbt_irec *map = &attr->xattri_map; + int nmap; + int error; + + nmap = 1; + error = xfs_bmapi_write(args->trans, dp, + (xfs_fileoff_t)attr->xattri_lblkno, + attr->xattri_blkcnt, XFS_BMAPI_ATTRFORK, args->total, + map, &nmap); + if (error) + return error; + + ASSERT(nmap == 1); + ASSERT((map->br_startblock != DELAYSTARTBLOCK) && + (map->br_startblock != HOLESTARTBLOCK)); + + /* roll attribute extent map forwards */ + attr->xattri_lblkno += map->br_blockcount; + attr->xattri_blkcnt -= map->br_blockcount; + return 0; } @@ -599,15 +635,12 @@ xfs_attr_rmtval_stale( * out-of-line buffer that it is stored on. */ int -xfs_attr_rmtval_remove( +xfs_attr_rmtval_invalidate( struct xfs_da_args *args) { xfs_dablk_t lblkno; int blkcnt; int error; - int done; - - trace_xfs_attr_rmtval_remove(args); /* * Roll through the "value", invalidating the attribute value's blocks. @@ -635,28 +668,45 @@ xfs_attr_rmtval_remove( lblkno += map.br_blockcount; blkcnt -= map.br_blockcount; } + return 0; +} + +/* + * Remove the value associated with an attribute by deleting the out-of-line + * buffer that it is stored on. Returns -EAGAIN for the caller to refresh the + * transaction and re-call the function. Callers should keep calling this + * routine until it returns something other than -EAGAIN. + */ +int +xfs_attr_rmtval_remove( + struct xfs_attr_intent *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + int error, done; /* - * Keep de-allocating extents until the remote-value region is gone. + * Unmap value blocks for this attr. */ - lblkno = args->rmtblkno; - blkcnt = args->rmtblkcnt; - done = 0; - while (!done) { - error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, - XFS_BMAPI_ATTRFORK, 1, &done); - if (error) - return error; - error = xfs_defer_finish(&args->trans); - if (error) - return error; + error = xfs_bunmapi(args->trans, args->dp, args->rmtblkno, + args->rmtblkcnt, XFS_BMAPI_ATTRFORK, 1, &done); + if (error) + return error; - /* - * Close out trans and start the next one in the chain. - */ - error = xfs_trans_roll_inode(&args->trans, args->dp); - if (error) - return error; + /* + * We don't need an explicit state here to pick up where we left off. We + * can figure it out using the !done return code. The actual value of + * attr->xattri_dela_state may be some value reminiscent of the calling + * function, but it's value is irrelevant with in the context of this + * function. Once we are done here, the next state is set as needed by + * the parent + */ + if (!done) { + trace_xfs_attr_rmtval_remove_return(attr->xattri_dela_state, + args->dp); + return -EAGAIN; } + + args->rmtblkno = 0; + args->rmtblkcnt = 0; return 0; } diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index 6fb4572845ce..d097ec6c4dc3 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. @@ -9,9 +9,12 @@ int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); int xfs_attr_rmtval_get(struct xfs_da_args *args); -int xfs_attr_rmtval_set(struct xfs_da_args *args); -int xfs_attr_rmtval_remove(struct xfs_da_args *args); int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, xfs_buf_flags_t incore_flags); - +int xfs_attr_rmtval_invalidate(struct xfs_da_args *args); +int xfs_attr_rmtval_remove(struct xfs_attr_intent *attr); +int xfs_attr_rmt_find_hole(struct xfs_da_args *args); +int xfs_attr_rmtval_set_value(struct xfs_da_args *args); +int xfs_attr_rmtval_set_blk(struct xfs_attr_intent *attr); +int xfs_attr_rmtval_find_space(struct xfs_attr_intent *attr); #endif /* __XFS_ATTR_REMOTE_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h index aafa4fe70624..37578b369d9b 100644 --- a/fs/xfs/libxfs/xfs_attr_sf.h +++ b/fs/xfs/libxfs/xfs_attr_sf.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. * All Rights Reserved. @@ -13,7 +13,6 @@ * to fit into the literal area of the inode. */ typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t; -typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t; /* * We generate this then sort it, attr_list() must return things in hash-order. @@ -27,16 +26,26 @@ typedef struct xfs_attr_sf_sort { unsigned char *name; /* name value, pointer into buffer */ } xfs_attr_sf_sort_t; -#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) /* space name/value uses */ \ - (((int)sizeof(xfs_attr_sf_entry_t)-1 + (nlen)+(vlen))) #define XFS_ATTR_SF_ENTSIZE_MAX /* max space for name&value */ \ ((1 << (NBBY*(int)sizeof(uint8_t))) - 1) -#define XFS_ATTR_SF_ENTSIZE(sfep) /* space an entry uses */ \ - ((int)sizeof(xfs_attr_sf_entry_t)-1 + (sfep)->namelen+(sfep)->valuelen) -#define XFS_ATTR_SF_NEXTENTRY(sfep) /* next entry in struct */ \ - ((xfs_attr_sf_entry_t *)((char *)(sfep) + XFS_ATTR_SF_ENTSIZE(sfep))) -#define XFS_ATTR_SF_TOTSIZE(dp) /* total space in use */ \ - (be16_to_cpu(((xfs_attr_shortform_t *) \ - ((dp)->i_afp->if_u1.if_data))->hdr.totsize)) + +/* space name/value uses */ +static inline int xfs_attr_sf_entsize_byname(uint8_t nlen, uint8_t vlen) +{ + return sizeof(struct xfs_attr_sf_entry) + nlen + vlen; +} + +/* space an entry uses */ +static inline int xfs_attr_sf_entsize(struct xfs_attr_sf_entry *sfep) +{ + return struct_size(sfep, nameval, sfep->namelen + sfep->valuelen); +} + +/* next entry in struct */ +static inline struct xfs_attr_sf_entry * +xfs_attr_sf_nextentry(struct xfs_attr_sf_entry *sfep) +{ + return (void *)sfep + xfs_attr_sf_entsize(sfep); +} #endif /* __XFS_ATTR_SF_H__ */ diff --git a/fs/xfs/libxfs/xfs_bit.h b/fs/xfs/libxfs/xfs_bit.h index 99017b8df292..a04f266ae644 100644 --- a/fs/xfs/libxfs/xfs_bit.h +++ b/fs/xfs/libxfs/xfs_bit.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 9a6d7a84689a..49d0d4ea63fc 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -31,13 +31,13 @@ #include "xfs_attr_leaf.h" #include "xfs_filestream.h" #include "xfs_rmap.h" +#include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_refcount.h" #include "xfs_icache.h" #include "xfs_iomap.h" - -kmem_zone_t *xfs_bmap_free_item_zone; +struct kmem_cache *xfs_bmap_intent_cache; /* * Miscellaneous helper functions @@ -52,46 +52,54 @@ xfs_bmap_compute_maxlevels( xfs_mount_t *mp, /* file system mount structure */ int whichfork) /* data or attr fork */ { + uint64_t maxblocks; /* max blocks at this level */ + xfs_extnum_t maxleafents; /* max leaf entries possible */ int level; /* btree level */ - uint maxblocks; /* max blocks at this level */ - uint maxleafents; /* max leaf entries possible */ int maxrootrecs; /* max records in root block */ int minleafrecs; /* min records in leaf block */ int minnoderecs; /* min records in node block */ int sz; /* root block size */ /* - * The maximum number of extents in a file, hence the maximum - * number of leaf entries, is controlled by the type of di_nextents - * (a signed 32-bit number, xfs_extnum_t), or by di_anextents - * (a signed 16-bit number, xfs_aextnum_t). + * The maximum number of extents in a fork, hence the maximum number of + * leaf entries, is controlled by the size of the on-disk extent count. * - * Note that we can no longer assume that if we are in ATTR1 that - * the fork offset of all the inodes will be - * (xfs_default_attroffset(ip) >> 3) because we could have mounted - * with ATTR2 and then mounted back with ATTR1, keeping the - * di_forkoff's fixed but probably at various positions. Therefore, - * for both ATTR1 and ATTR2 we have to assume the worst case scenario - * of a minimum size available. + * Note that we can no longer assume that if we are in ATTR1 that the + * fork offset of all the inodes will be + * (xfs_default_attroffset(ip) >> 3) because we could have mounted with + * ATTR2 and then mounted back with ATTR1, keeping the i_forkoff's fixed + * but probably at various positions. Therefore, for both ATTR1 and + * ATTR2 we have to assume the worst case scenario of a minimum size + * available. */ - if (whichfork == XFS_DATA_FORK) { - maxleafents = MAXEXTNUM; + maxleafents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp), + whichfork); + if (whichfork == XFS_DATA_FORK) sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS); - } else { - maxleafents = MAXAEXTNUM; + else sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); - } + maxrootrecs = xfs_bmdr_maxrecs(sz, 0); minleafrecs = mp->m_bmap_dmnr[0]; minnoderecs = mp->m_bmap_dmnr[1]; - maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; + maxblocks = howmany_64(maxleafents, minleafrecs); for (level = 1; maxblocks > 1; level++) { if (maxblocks <= maxrootrecs) maxblocks = 1; else - maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; + maxblocks = howmany_64(maxblocks, minnoderecs); } mp->m_bm_maxlevels[whichfork] = level; + ASSERT(mp->m_bm_maxlevels[whichfork] <= xfs_bmbt_maxlevels_ondisk()); +} + +unsigned int +xfs_bmap_compute_attr_offset( + struct xfs_mount *mp) +{ + if (mp->m_sb.sb_inodesize == 256) + return XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); + return XFS_BMDR_SPACE_CALC(6 * MINABTPTRS); } STATIC int /* error */ @@ -120,10 +128,11 @@ xfs_bmbt_lookup_first( */ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) { + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + return whichfork != XFS_COW_FORK && - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) > - XFS_IFORK_MAXEXT(ip, whichfork); + ifp->if_format == XFS_DINODE_FMT_EXTENTS && + ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork); } /* @@ -131,10 +140,11 @@ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) */ static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork) { + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + return whichfork != XFS_COW_FORK && - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && - XFS_IFORK_NEXTENTS(ip, whichfork) <= - XFS_IFORK_MAXEXT(ip, whichfork); + ifp->if_format == XFS_DINODE_FMT_BTREE && + ifp->if_nextents <= XFS_IFORK_MAXEXT(ip, whichfork); } /* @@ -190,24 +200,15 @@ uint xfs_default_attroffset( struct xfs_inode *ip) { - struct xfs_mount *mp = ip->i_mount; - uint offset; - - if (mp->m_sb.sb_inodesize == 256) { - offset = XFS_LITINO(mp, ip->i_d.di_version) - - XFS_BMDR_SPACE_CALC(MINABTPTRS); - } else { - offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS); - } - - ASSERT(offset < XFS_LITINO(mp, ip->i_d.di_version)); - return offset; + if (ip->i_df.if_format == XFS_DINODE_FMT_DEV) + return roundup(sizeof(xfs_dev_t), 8); + return M_IGEO(ip->i_mount)->attr_fork_offset; } /* - * Helper routine to reset inode di_forkoff field when switching - * attribute fork from local to extent format - we reset it where - * possible to make space available for inline data fork extents. + * Helper routine to reset inode i_forkoff field when switching attribute fork + * from local to extent format - we reset it where possible to make space + * available for inline data fork extents. */ STATIC void xfs_bmap_forkoff_reset( @@ -215,12 +216,12 @@ xfs_bmap_forkoff_reset( int whichfork) { if (whichfork == XFS_ATTR_FORK && - ip->i_d.di_format != XFS_DINODE_FMT_DEV && - ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { + ip->i_df.if_format != XFS_DINODE_FMT_DEV && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE) { uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; - if (dfl_forkoff > ip->i_d.di_forkoff) - ip->i_d.di_forkoff = dfl_forkoff; + if (dfl_forkoff > ip->i_forkoff) + ip->i_forkoff = dfl_forkoff; } } @@ -236,11 +237,11 @@ xfs_bmap_get_bp( if (!cur) return NULL; - for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) { - if (!cur->bc_bufs[i]) + for (i = 0; i < cur->bc_maxlevels; i++) { + if (!cur->bc_levels[i].bp) break; - if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno) - return cur->bc_bufs[i]; + if (xfs_buf_daddr(cur->bc_levels[i].bp) == bno) + return cur->bc_levels[i].bp; } /* Chase down all the log items to see if the bp is there */ @@ -248,7 +249,7 @@ xfs_bmap_get_bp( struct xfs_buf_log_item *bip = (struct xfs_buf_log_item *)lip; if (bip->bli_item.li_type == XFS_LI_BUF && - XFS_BUF_ADDR(bip->bli_buf) == bno) + xfs_buf_daddr(bip->bli_buf) == bno) return bip->bli_buf; } @@ -293,7 +294,7 @@ xfs_check_block( else thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); if (*thispa == *pp) { - xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld", + xfs_warn(mp, "%s: thispa(%d) == pp(%d) %lld", __func__, j, i, (unsigned long long)be64_to_cpu(*thispa)); xfs_err(mp, "%s: ptrs are equal in node\n", @@ -313,35 +314,32 @@ xfs_check_block( */ STATIC void xfs_bmap_check_leaf_extents( - xfs_btree_cur_t *cur, /* btree cursor or null */ + struct xfs_btree_cur *cur, /* btree cursor or null */ xfs_inode_t *ip, /* incore inode pointer */ int whichfork) /* data or attr fork */ { + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_block *block; /* current btree block */ xfs_fsblock_t bno; /* block # of "block" */ - xfs_buf_t *bp; /* buffer for "block" */ + struct xfs_buf *bp; /* buffer for "block" */ int error; /* error return value */ xfs_extnum_t i=0, j; /* index into the extents list */ - struct xfs_ifork *ifp; /* fork structure */ int level; /* btree level, for checking */ - xfs_mount_t *mp; /* file system mount structure */ __be64 *pp; /* pointer to block address */ xfs_bmbt_rec_t *ep; /* pointer to current extent */ xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */ xfs_bmbt_rec_t *nextp; /* pointer to next extent */ int bp_release = 0; - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) { + if (ifp->if_format != XFS_DINODE_FMT_BTREE) return; - } /* skip large extent count inodes */ - if (ip->i_d.di_nextents > 10000) + if (ip->i_df.if_nextents > 10000) return; bno = NULLFSBLOCK; - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); block = ifp->if_broot; /* * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. @@ -468,7 +466,7 @@ error0: if (bp_release) xfs_trans_brelse(NULL, bp); error_norelse: - xfs_warn(mp, "%s: BAD after btree leaves for %d extents", + xfs_warn(mp, "%s: BAD after btree leaves for %llu extents", __func__, i); xfs_err(mp, "%s: CORRUPTED BTREE OR SOMETHING", __func__); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); @@ -485,7 +483,7 @@ STATIC void xfs_bmap_validate_ret( xfs_fileoff_t bno, xfs_filblks_t len, - int flags, + uint32_t flags, xfs_bmbt_irec_t *mval, int nmap, int ret_nmap) @@ -522,55 +520,6 @@ xfs_bmap_validate_ret( #endif /* DEBUG */ /* - * bmap free list manipulation functions - */ - -/* - * Add the extent to the list of extents to be free at transaction end. - * The list is maintained sorted (by block number). - */ -void -__xfs_bmap_add_free( - struct xfs_trans *tp, - xfs_fsblock_t bno, - xfs_filblks_t len, - const struct xfs_owner_info *oinfo, - bool skip_discard) -{ - struct xfs_extent_free_item *new; /* new element */ -#ifdef DEBUG - struct xfs_mount *mp = tp->t_mountp; - xfs_agnumber_t agno; - xfs_agblock_t agbno; - - ASSERT(bno != NULLFSBLOCK); - ASSERT(len > 0); - ASSERT(len <= MAXEXTLEN); - ASSERT(!isnullstartblock(bno)); - agno = XFS_FSB_TO_AGNO(mp, bno); - agbno = XFS_FSB_TO_AGBNO(mp, bno); - ASSERT(agno < mp->m_sb.sb_agcount); - ASSERT(agbno < mp->m_sb.sb_agblocks); - ASSERT(len < mp->m_sb.sb_agblocks); - ASSERT(agbno + len <= mp->m_sb.sb_agblocks); -#endif - ASSERT(xfs_bmap_free_item_zone != NULL); - - new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0); - new->xefi_startblock = bno; - new->xefi_blockcount = (xfs_extlen_t)len; - if (oinfo) - new->xefi_oinfo = *oinfo; - else - new->xefi_oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; - new->xefi_skip_discard = skip_discard; - trace_xfs_bmap_free_defer(tp->t_mountp, - XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0, - XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list); -} - -/* * Inode fork format manipulation functions */ @@ -589,12 +538,12 @@ xfs_bmap_btree_to_extents( int *logflagsp, /* inode logging flags */ int whichfork) /* data or attr fork */ { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_mount *mp = ip->i_mount; struct xfs_btree_block *rblock = ifp->if_broot; struct xfs_btree_block *cblock;/* child btree block */ xfs_fsblock_t cbno; /* child block number */ - xfs_buf_t *cbp; /* child block's buffer */ + struct xfs_buf *cbp; /* child block's buffer */ int error; /* error return value */ __be64 *pp; /* ptr to block address */ struct xfs_owner_info oinfo; @@ -605,8 +554,7 @@ xfs_bmap_btree_to_extents( ASSERT(cur); ASSERT(whichfork != XFS_COW_FORK); - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); + ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE); ASSERT(be16_to_cpu(rblock->bb_level) == 1); ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1); @@ -625,16 +573,15 @@ xfs_bmap_btree_to_extents( if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) return error; xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); - xfs_bmap_add_free(cur->bc_tp, cbno, 1, &oinfo); - ip->i_d.di_nblocks--; + xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo); + ip->i_nblocks--; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); xfs_trans_binval(tp, cbp); - if (cur->bc_bufs[0] == cbp) - cur->bc_bufs[0] = NULL; + if (cur->bc_levels[0].bp == cbp) + cur->bc_levels[0].bp = NULL; xfs_iroot_realloc(ip, -1, whichfork); ASSERT(ifp->if_broot == NULL); - ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + ifp->if_format = XFS_DINODE_FMT_EXTENTS; *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork); return 0; } @@ -669,15 +616,14 @@ xfs_bmap_extents_to_btree( mp = ip->i_mount; ASSERT(whichfork != XFS_COW_FORK); - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); + ifp = xfs_ifork_ptr(ip, whichfork); + ASSERT(ifp->if_format == XFS_DINODE_FMT_EXTENTS); /* * Make space in the inode incore. This needs to be undone if we fail * to expand the root. */ xfs_iroot_realloc(ip, 1, whichfork); - ifp->if_flags |= XFS_IFBROOT; /* * Fill in the root. @@ -690,11 +636,11 @@ xfs_bmap_extents_to_btree( * Need a cursor. Can't allocate until bb_level is filled in. */ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; + cur->bc_ino.flags = wasdel ? XFS_BTCUR_BMBT_WASDEL : 0; /* * Convert to a btree with two levels, one record in root. */ - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); + ifp->if_format = XFS_DINODE_FMT_BTREE; memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = mp; @@ -727,8 +673,8 @@ xfs_bmap_extents_to_btree( ASSERT(tp->t_firstblock == NULLFSBLOCK || args.agno >= XFS_FSB_TO_AGNO(mp, tp->t_firstblock)); tp->t_firstblock = args.fsbno; - cur->bc_private.b.allocated++; - ip->i_d.di_nblocks++; + cur->bc_ino.allocated++; + ip->i_nblocks++; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); error = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, args.fsbno), @@ -741,7 +687,7 @@ xfs_bmap_extents_to_btree( */ abp->b_ops = &xfs_bmbt_buf_ops; ablock = XFS_BUF_TO_BLOCK(abp); - xfs_btree_init_block_int(mp, ablock, abp->b_bn, + xfs_btree_init_block_int(mp, ablock, xfs_buf_daddr(abp), XFS_BTNUM_BMAP, 0, 0, ip->i_ino, XFS_BTREE_LONG_PTRS); @@ -752,7 +698,7 @@ xfs_bmap_extents_to_btree( xfs_bmbt_disk_set_all(arp, &rec); cnt++; } - ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork)); + ASSERT(cnt == ifp->if_nextents); xfs_btree_set_numrecs(ablock, cnt); /* @@ -780,7 +726,7 @@ out_unreserve_dquot: xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); out_root_realloc: xfs_iroot_realloc(ip, -1, whichfork); - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + ifp->if_format = XFS_DINODE_FMT_EXTENTS; ASSERT(ifp->if_broot == NULL); xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); @@ -799,19 +745,17 @@ xfs_bmap_local_to_extents_empty( struct xfs_inode *ip, int whichfork) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(whichfork != XFS_COW_FORK); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); ASSERT(ifp->if_bytes == 0); - ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); + ASSERT(ifp->if_nextents == 0); xfs_bmap_forkoff_reset(ip, whichfork); - ifp->if_flags &= ~XFS_IFINLINE; - ifp->if_flags |= XFS_IFEXTENTS; ifp->if_u1.if_root = NULL; ifp->if_height = 0; - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + ifp->if_format = XFS_DINODE_FMT_EXTENTS; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -832,7 +776,7 @@ xfs_bmap_local_to_extents( int flags; /* logging flags returned */ struct xfs_ifork *ifp; /* inode fork pointer */ xfs_alloc_arg_t args; /* allocation arguments */ - xfs_buf_t *bp; /* buffer for extent block */ + struct xfs_buf *bp; /* buffer for extent block */ struct xfs_bmbt_irec rec; struct xfs_iext_cursor icur; @@ -841,8 +785,8 @@ xfs_bmap_local_to_extents( * So sending the data fork of a regular inode is invalid. */ ASSERT(!(S_ISREG(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK)); - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + ifp = xfs_ifork_ptr(ip, whichfork); + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); if (!ifp->if_bytes) { xfs_bmap_local_to_extents_empty(tp, ip, whichfork); @@ -852,7 +796,6 @@ xfs_bmap_local_to_extents( flags = 0; error = 0; - ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS)) == XFS_IFINLINE); memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = ip->i_mount; @@ -909,8 +852,8 @@ xfs_bmap_local_to_extents( xfs_iext_first(ifp, &icur); xfs_iext_insert(ip, &icur, &rec, 0); - XFS_IFORK_NEXT_SET(ip, whichfork, 1); - ip->i_d.di_nblocks = 1; + ifp->if_nextents = 1; + ip->i_nblocks = 1; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); flags |= xfs_ilog_fext(whichfork); @@ -929,13 +872,15 @@ xfs_bmap_add_attrfork_btree( xfs_inode_t *ip, /* incore inode pointer */ int *flags) /* inode logging flags */ { - xfs_btree_cur_t *cur; /* btree cursor */ + struct xfs_btree_block *block = ip->i_df.if_broot; + struct xfs_btree_cur *cur; /* btree cursor */ int error; /* error return value */ xfs_mount_t *mp; /* file system mount struct */ int stat; /* newroot status */ mp = ip->i_mount; - if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip)) + + if (XFS_BMAP_BMDR_SPACE(block) <= xfs_inode_data_fork_size(ip)) *flags |= XFS_ILOG_DBROOT; else { cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK); @@ -953,7 +898,7 @@ xfs_bmap_add_attrfork_btree( xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); return -ENOSPC; } - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); } return 0; @@ -971,16 +916,17 @@ xfs_bmap_add_attrfork_extents( struct xfs_inode *ip, /* incore inode pointer */ int *flags) /* inode logging flags */ { - xfs_btree_cur_t *cur; /* bmap btree cursor */ + struct xfs_btree_cur *cur; /* bmap btree cursor */ int error; /* error return value */ - if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip)) + if (ip->i_df.if_nextents * sizeof(struct xfs_bmbt_rec) <= + xfs_inode_data_fork_size(ip)) return 0; cur = NULL; error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, flags, XFS_DATA_FORK); if (cur) { - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; xfs_btree_del_cursor(cur, error); } return error; @@ -1005,7 +951,7 @@ xfs_bmap_add_attrfork_local( { struct xfs_da_args dargs; /* args for dir/attr code */ - if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) + if (ip->i_df.if_bytes <= xfs_inode_data_fork_size(ip)) return 0; if (S_ISDIR(VFS_I(ip)->i_mode)) { @@ -1028,24 +974,28 @@ xfs_bmap_add_attrfork_local( return -EFSCORRUPTED; } -/* Set an inode attr fork off based on the format */ -int +/* + * Set an inode attr fork offset based on the format of the data fork. + */ +static int xfs_bmap_set_attrforkoff( struct xfs_inode *ip, int size, int *version) { - switch (ip->i_d.di_format) { + int default_size = xfs_default_attroffset(ip) >> 3; + + switch (ip->i_df.if_format) { case XFS_DINODE_FMT_DEV: - ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; + ip->i_forkoff = default_size; break; case XFS_DINODE_FMT_LOCAL: case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE: - ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); - if (!ip->i_d.di_forkoff) - ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3; - else if ((ip->i_mount->m_flags & XFS_MOUNT_ATTR2) && version) + ip->i_forkoff = xfs_attr_shortform_bytesfit(ip, size); + if (!ip->i_forkoff) + ip->i_forkoff = default_size; + else if (xfs_has_attr2(ip->i_mount) && version) *version = 2; break; default: @@ -1073,48 +1023,28 @@ xfs_bmap_add_attrfork( int logflags; /* logging flags */ int error; /* error return value */ - ASSERT(XFS_IFORK_Q(ip) == 0); + ASSERT(xfs_inode_has_attr_fork(ip) == 0); mp = ip->i_mount; ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); blks = XFS_ADDAFORK_SPACE_RES(mp); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_addafork, blks, 0, - rsvd ? XFS_TRANS_RESERVE : 0, &tp); + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_addafork, blks, 0, + rsvd, &tp); if (error) return error; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? - XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : - XFS_QMOPT_RES_REGBLKS); - if (error) - goto trans_cancel; - if (XFS_IFORK_Q(ip)) + if (xfs_inode_has_attr_fork(ip)) goto trans_cancel; - if (XFS_IS_CORRUPT(mp, ip->i_d.di_anextents != 0)) { - error = -EFSCORRUPTED; - goto trans_cancel; - } - if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { - /* - * For inodes coming from pre-6.2 filesystems. - */ - ASSERT(ip->i_d.di_aformat == 0); - ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; - } - xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_bmap_set_attrforkoff(ip, size, &version); if (error) goto trans_cancel; - ASSERT(ip->i_afp == NULL); - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, 0); - ip->i_afp->if_flags = XFS_IFEXTENTS; + + xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0); logflags = 0; - switch (ip->i_d.di_format) { + switch (ip->i_df.if_format) { case XFS_DINODE_FMT_LOCAL: error = xfs_bmap_add_attrfork_local(tp, ip, &logflags); break; @@ -1132,17 +1062,17 @@ xfs_bmap_add_attrfork( xfs_trans_log_inode(tp, ip, logflags); if (error) goto trans_cancel; - if (!xfs_sb_version_hasattr(&mp->m_sb) || - (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) { + if (!xfs_has_attr(mp) || + (!xfs_has_attr2(mp) && version == 2)) { bool log_sb = false; spin_lock(&mp->m_sb_lock); - if (!xfs_sb_version_hasattr(&mp->m_sb)) { - xfs_sb_version_addattr(&mp->m_sb); + if (!xfs_has_attr(mp)) { + xfs_add_attr(mp); log_sb = true; } - if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) { - xfs_sb_version_addattr2(&mp->m_sb); + if (!xfs_has_attr2(mp) && version == 2) { + xfs_add_attr2(mp); log_sb = true; } spin_unlock(&mp->m_sb_lock); @@ -1178,20 +1108,20 @@ xfs_iread_bmbt_block( { struct xfs_iread_state *ir = priv; struct xfs_mount *mp = cur->bc_mp; - struct xfs_inode *ip = cur->bc_private.b.ip; + struct xfs_inode *ip = cur->bc_ino.ip; struct xfs_btree_block *block; struct xfs_buf *bp; struct xfs_bmbt_rec *frp; xfs_extnum_t num_recs; xfs_extnum_t j; - int whichfork = cur->bc_private.b.whichfork; + int whichfork = cur->bc_ino.whichfork; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); block = xfs_btree_get_block(cur, level, &bp); /* Abort if we find more records than nextents. */ num_recs = xfs_btree_get_numrecs(block); - if (unlikely(ir->loaded + num_recs > - XFS_IFORK_NEXTENTS(ip, whichfork))) { + if (unlikely(ir->loaded + num_recs > ifp->if_nextents)) { xfs_warn(ip->i_mount, "corrupt dinode %llu, (btree extents).", (unsigned long long)ip->i_ino); xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, block, @@ -1217,7 +1147,7 @@ xfs_iread_bmbt_block( xfs_bmap_fork_to_state(whichfork)); trace_xfs_read_extent(ip, &ir->icur, xfs_bmap_fork_to_state(whichfork), _THIS_IP_); - xfs_iext_next(XFS_IFORK_PTR(ip, whichfork), &ir->icur); + xfs_iext_next(ifp, &ir->icur); } return 0; @@ -1233,19 +1163,15 @@ xfs_iread_extents( int whichfork) { struct xfs_iread_state ir; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_mount *mp = ip->i_mount; struct xfs_btree_cur *cur; int error; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + if (!xfs_need_iread_extents(ifp)) + return 0; - if (XFS_IS_CORRUPT(mp, - XFS_IFORK_FORMAT(ip, whichfork) != - XFS_DINODE_FMT_BTREE)) { - error = -EFSCORRUPTED; - goto out; - } + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ir.loaded = 0; xfs_iext_first(ifp, &ir.icur); @@ -1256,14 +1182,11 @@ xfs_iread_extents( if (error) goto out; - if (XFS_IS_CORRUPT(mp, - ir.loaded != XFS_IFORK_NEXTENTS(ip, whichfork))) { + if (XFS_IS_CORRUPT(mp, ir.loaded != ifp->if_nextents)) { error = -EFSCORRUPTED; goto out; } ASSERT(ir.loaded == xfs_iext_count(ifp)); - - ifp->if_flags |= XFS_IFEXTENTS; return 0; out: xfs_iext_destroy(ifp); @@ -1284,26 +1207,23 @@ xfs_bmap_first_unused( xfs_fileoff_t *first_unused, /* unused block */ int whichfork) /* data or attr fork */ { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec got; struct xfs_iext_cursor icur; xfs_fileoff_t lastaddr = 0; xfs_fileoff_t lowest, max; int error; - ASSERT(xfs_ifork_has_extents(ip, whichfork) || - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); - - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + if (ifp->if_format == XFS_DINODE_FMT_LOCAL) { *first_unused = 0; return 0; } - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(tp, ip, whichfork); - if (error) - return error; - } + ASSERT(xfs_ifork_has_extents(ifp)); + + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; lowest = max = *first_unused; for_each_xfs_iext(ifp, &icur, &got) { @@ -1334,12 +1254,12 @@ xfs_bmap_last_before( xfs_fileoff_t *last_block, /* last block */ int whichfork) /* data or attr fork */ { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec got; struct xfs_iext_cursor icur; int error; - switch (XFS_IFORK_FORMAT(ip, whichfork)) { + switch (ifp->if_format) { case XFS_DINODE_FMT_LOCAL: *last_block = 0; return 0; @@ -1351,11 +1271,9 @@ xfs_bmap_last_before( return -EFSCORRUPTED; } - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(tp, ip, whichfork); - if (error) - return error; - } + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; if (!xfs_iext_lookup_extent_before(ip, ifp, last_block, &icur, &got)) *last_block = 0; @@ -1370,15 +1288,13 @@ xfs_bmap_last_extent( struct xfs_bmbt_irec *rec, int *is_empty) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_iext_cursor icur; int error; - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(tp, ip, whichfork); - if (error) - return error; - } + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; xfs_iext_last(ifp, &icur); if (!xfs_iext_get_extent(ifp, &icur, rec)) @@ -1438,16 +1354,17 @@ xfs_bmap_last_offset( xfs_fileoff_t *last_block, int whichfork) { + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec rec; int is_empty; int error; *last_block = 0; - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) + if (ifp->if_format == XFS_DINODE_FMT_LOCAL) return 0; - if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ip, whichfork))) + if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ifp))) return -EFSCORRUPTED; error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); @@ -1459,39 +1376,6 @@ xfs_bmap_last_offset( } /* - * Returns whether the selected fork of the inode has exactly one - * block or not. For the data fork we check this matches di_size, - * implying the file's range is 0..bsize-1. - */ -int /* 1=>1 block, 0=>otherwise */ -xfs_bmap_one_block( - xfs_inode_t *ip, /* incore inode */ - int whichfork) /* data or attr fork */ -{ - struct xfs_ifork *ifp; /* inode fork pointer */ - int rval; /* return value */ - xfs_bmbt_irec_t s; /* internal version of extent */ - struct xfs_iext_cursor icur; - -#ifndef DEBUG - if (whichfork == XFS_DATA_FORK) - return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize; -#endif /* !DEBUG */ - if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) - return 0; - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - return 0; - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - xfs_iext_first(ifp, &icur); - xfs_iext_get_extent(ifp, &icur, &s); - rval = s.br_startoff == 0 && s.br_blockcount == 1; - if (rval && whichfork == XFS_DATA_FORK) - ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize); - return rval; -} - -/* * Extent tree manipulation functions used during allocation. */ @@ -1503,32 +1387,26 @@ xfs_bmap_add_extent_delay_real( struct xfs_bmalloca *bma, int whichfork) { + struct xfs_mount *mp = bma->ip->i_mount; + struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork); struct xfs_bmbt_irec *new = &bma->got; int error; /* error return value */ int i; /* temp state */ - struct xfs_ifork *ifp; /* inode fork pointer */ xfs_fileoff_t new_endoff; /* end offset of new entry */ xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); xfs_filblks_t da_new; /* new count del alloc blocks used */ xfs_filblks_t da_old; /* old count del alloc blocks used */ xfs_filblks_t temp=0; /* value for da_new calculations */ int tmp_rval; /* partial logging flags */ - struct xfs_mount *mp; - xfs_extnum_t *nextents; struct xfs_bmbt_irec old; - mp = bma->ip->i_mount; - ifp = XFS_IFORK_PTR(bma->ip, whichfork); ASSERT(whichfork != XFS_ATTR_FORK); - nextents = (whichfork == XFS_COW_FORK ? &bma->ip->i_cnextents : - &bma->ip->i_d.di_nextents); - ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || - (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + (bma->cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL)); XFS_STATS_INC(mp, xs_add_exlist); @@ -1571,7 +1449,7 @@ xfs_bmap_add_extent_delay_real( LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && LEFT.br_state == new->br_state && - LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN) + LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) state |= BMAP_LEFT_CONTIG; /* @@ -1589,13 +1467,13 @@ xfs_bmap_add_extent_delay_real( new_endoff == RIGHT.br_startoff && new->br_startblock + new->br_blockcount == RIGHT.br_startblock && new->br_state == RIGHT.br_state && - new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN && ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) != (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING) || LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount - <= MAXEXTLEN)) + <= XFS_MAX_BMBT_EXTLEN)) state |= BMAP_RIGHT_CONTIG; error = 0; @@ -1616,7 +1494,7 @@ xfs_bmap_add_extent_delay_real( xfs_iext_remove(bma->ip, &bma->icur, state); xfs_iext_prev(ifp, &bma->icur); xfs_iext_update_extent(bma->ip, state, &bma->icur, &LEFT); - (*nextents)--; + ifp->if_nextents--; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1720,8 +1598,8 @@ xfs_bmap_add_extent_delay_real( PREV.br_startblock = new->br_startblock; PREV.br_state = new->br_state; xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV); + ifp->if_nextents++; - (*nextents)++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -1786,7 +1664,8 @@ xfs_bmap_add_extent_delay_real( * The left neighbor is not contiguous. */ xfs_iext_update_extent(bma->ip, state, &bma->icur, new); - (*nextents)++; + ifp->if_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -1818,7 +1697,7 @@ xfs_bmap_add_extent_delay_real( temp = PREV.br_blockcount - new->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - - (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + (bma->cur ? bma->cur->bc_ino.allocated : 0)); PREV.br_startoff = new_endoff; PREV.br_blockcount = temp; @@ -1872,7 +1751,8 @@ xfs_bmap_add_extent_delay_real( * The right neighbor is not contiguous. */ xfs_iext_update_extent(bma->ip, state, &bma->icur, new); - (*nextents)++; + ifp->if_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -1904,7 +1784,7 @@ xfs_bmap_add_extent_delay_real( temp = PREV.br_blockcount - new->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - - (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + (bma->cur ? bma->cur->bc_ino.allocated : 0)); PREV.br_startblock = nullstartblock(da_new); PREV.br_blockcount = temp; @@ -1957,7 +1837,7 @@ xfs_bmap_add_extent_delay_real( xfs_iext_next(ifp, &bma->icur); xfs_iext_insert(bma->ip, &bma->icur, &RIGHT, state); xfs_iext_insert(bma->ip, &bma->icur, &LEFT, state); - (*nextents)++; + ifp->if_nextents++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -2025,8 +1905,8 @@ xfs_bmap_add_extent_delay_real( xfs_mod_delalloc(mp, (int64_t)da_new - da_old); if (bma->cur) { - da_new += bma->cur->bc_private.b.allocated; - bma->cur->bc_private.b.allocated = 0; + da_new += bma->cur->bc_ino.allocated; + bma->cur->bc_ino.allocated = 0; } /* adjust for changes in reserved delayed indirect blocks */ @@ -2055,11 +1935,11 @@ xfs_bmap_add_extent_unwritten_real( xfs_inode_t *ip, /* incore inode pointer */ int whichfork, struct xfs_iext_cursor *icur, - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + struct xfs_btree_cur **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ int *logflagsp) /* inode logging flags */ { - xfs_btree_cur_t *cur; /* btree cursor */ + struct xfs_btree_cur *cur; /* btree cursor */ int error; /* error return value */ int i; /* temp state */ struct xfs_ifork *ifp; /* inode fork pointer */ @@ -2067,14 +1947,14 @@ xfs_bmap_add_extent_unwritten_real( xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); struct xfs_mount *mp = ip->i_mount; struct xfs_bmbt_irec old; *logflagsp = 0; cur = *curp; - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(!isnullstartblock(new->br_startblock)); @@ -2117,7 +1997,7 @@ xfs_bmap_add_extent_unwritten_real( LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && LEFT.br_state == new->br_state && - LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN) + LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) state |= BMAP_LEFT_CONTIG; /* @@ -2135,13 +2015,13 @@ xfs_bmap_add_extent_unwritten_real( new_endoff == RIGHT.br_startoff && new->br_startblock + new->br_blockcount == RIGHT.br_startblock && new->br_state == RIGHT.br_state && - new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN && ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) != (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING) || LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount - <= MAXEXTLEN)) + <= XFS_MAX_BMBT_EXTLEN)) state |= BMAP_RIGHT_CONTIG; /* @@ -2161,8 +2041,7 @@ xfs_bmap_add_extent_unwritten_real( xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, state, icur, &LEFT); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 2); + ifp->if_nextents -= 2; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2214,8 +2093,7 @@ xfs_bmap_add_extent_unwritten_real( xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, state, icur, &LEFT); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + ifp->if_nextents--; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2257,9 +2135,8 @@ xfs_bmap_add_extent_unwritten_real( xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, state, icur, &PREV); + ifp->if_nextents--; - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2366,8 +2243,8 @@ xfs_bmap_add_extent_unwritten_real( xfs_iext_update_extent(ip, state, icur, &PREV); xfs_iext_insert(ip, icur, new, state); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + ifp->if_nextents++; + if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2442,9 +2319,8 @@ xfs_bmap_add_extent_unwritten_real( xfs_iext_update_extent(ip, state, icur, &PREV); xfs_iext_next(ifp, icur); xfs_iext_insert(ip, icur, new, state); + ifp->if_nextents++; - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2495,9 +2371,8 @@ xfs_bmap_add_extent_unwritten_real( xfs_iext_next(ifp, icur); xfs_iext_insert(ip, icur, &r[1], state); xfs_iext_insert(ip, icur, &r[0], state); + ifp->if_nextents += 2; - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 2); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2573,7 +2448,7 @@ xfs_bmap_add_extent_unwritten_real( /* clear out the allocated field, done with it now in any case. */ if (cur) { - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; *curp = cur; } @@ -2601,10 +2476,10 @@ xfs_bmap_add_extent_hole_delay( xfs_filblks_t newlen=0; /* new indirect size */ xfs_filblks_t oldlen=0; /* old indirect size */ xfs_bmbt_irec_t right; /* right neighbor extent entry */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); xfs_filblks_t temp; /* temp for indirect calculations */ - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(isnullstartblock(new->br_startblock)); /* @@ -2632,15 +2507,15 @@ xfs_bmap_add_extent_hole_delay( */ if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && left.br_startoff + left.br_blockcount == new->br_startoff && - left.br_blockcount + new->br_blockcount <= MAXEXTLEN) + left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) state |= BMAP_LEFT_CONTIG; if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && new->br_startoff + new->br_blockcount == right.br_startoff && - new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && (!(state & BMAP_LEFT_CONTIG) || (left.br_blockcount + new->br_blockcount + - right.br_blockcount <= MAXEXTLEN))) + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))) state |= BMAP_RIGHT_CONTIG; /* @@ -2738,9 +2613,9 @@ xfs_bmap_add_extent_hole_real( struct xfs_btree_cur **curp, struct xfs_bmbt_irec *new, int *logflagsp, - int flags) + uint32_t flags) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_mount *mp = ip->i_mount; struct xfs_btree_cur *cur = *curp; int error; /* error return value */ @@ -2748,11 +2623,11 @@ xfs_bmap_add_extent_hole_real( xfs_bmbt_irec_t left; /* left neighbor extent entry */ xfs_bmbt_irec_t right; /* right neighbor extent entry */ int rval=0; /* return value (logging flags) */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); struct xfs_bmbt_irec old; ASSERT(!isnullstartblock(new->br_startblock)); - ASSERT(!cur || !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + ASSERT(!cur || !(cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL)); XFS_STATS_INC(mp, xs_add_exlist); @@ -2783,17 +2658,17 @@ xfs_bmap_add_extent_hole_real( left.br_startoff + left.br_blockcount == new->br_startoff && left.br_startblock + left.br_blockcount == new->br_startblock && left.br_state == new->br_state && - left.br_blockcount + new->br_blockcount <= MAXEXTLEN) + left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) state |= BMAP_LEFT_CONTIG; if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && new->br_startoff + new->br_blockcount == right.br_startoff && new->br_startblock + new->br_blockcount == right.br_startblock && new->br_state == right.br_state && - new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && (!(state & BMAP_LEFT_CONTIG) || left.br_blockcount + new->br_blockcount + - right.br_blockcount <= MAXEXTLEN)) + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)) state |= BMAP_RIGHT_CONTIG; error = 0; @@ -2812,9 +2687,8 @@ xfs_bmap_add_extent_hole_real( xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, state, icur, &left); + ifp->if_nextents--; - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); if (cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { @@ -2912,8 +2786,8 @@ xfs_bmap_add_extent_hole_real( * Insert a new entry. */ xfs_iext_insert(ip, icur, new, state); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + ifp->if_nextents++; + if (cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { @@ -2955,7 +2829,7 @@ xfs_bmap_add_extent_hole_real( /* clear out the allocated field, done with it now in any case. */ if (cur) - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; xfs_bmap_check_leaf_extents(cur, ip, whichfork); done: @@ -2968,7 +2842,7 @@ done: */ /* - * Adjust the size of the new extent based on di_extsize and rt extsize. + * Adjust the size of the new extent based on i_extsize and rt extsize. */ int xfs_bmap_extsize_align( @@ -3029,15 +2903,15 @@ xfs_bmap_extsize_align( /* * For large extent hint sizes, the aligned extent might be larger than - * MAXEXTLEN. In that case, reduce the size by an extsz so that it pulls - * the length back under MAXEXTLEN. The outer allocation loops handle - * short allocation just fine, so it is safe to do this. We only want to - * do it when we are forced to, though, because it means more allocation - * operations are required. + * XFS_BMBT_MAX_EXTLEN. In that case, reduce the size by an extsz so + * that it pulls the length back under XFS_BMBT_MAX_EXTLEN. The outer + * allocation loops handle short allocation just fine, so it is safe to + * do this. We only want to do it when we are forced to, though, because + * it means more allocation operations are required. */ - while (align_alen > MAXEXTLEN) + while (align_alen > XFS_MAX_BMBT_EXTLEN) align_alen -= extsz; - ASSERT(align_alen <= MAXEXTLEN); + ASSERT(align_alen <= XFS_MAX_BMBT_EXTLEN); /* * If the previous block overlaps with this proposed allocation @@ -3127,9 +3001,9 @@ xfs_bmap_extsize_align( return -EINVAL; } else { ASSERT(orig_off >= align_off); - /* see MAXEXTLEN handling above */ + /* see XFS_BMBT_MAX_EXTLEN handling above */ ASSERT(orig_end <= align_off + align_alen || - align_alen + extsz > MAXEXTLEN); + align_alen + extsz > XFS_MAX_BMBT_EXTLEN); } #ifdef DEBUG @@ -3310,7 +3184,8 @@ xfs_bmap_longest_free_extent( pag = xfs_perag_get(mp, ag); if (!pag->pagf_init) { - error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK); + error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_TRYLOCK, + NULL); if (error) { /* Couldn't lock the AGF, so skip this AG. */ if (error == -EAGAIN) { @@ -3474,7 +3349,7 @@ xfs_bmap_btalloc_accounting( } /* data/attr fork only */ - ap->ip->i_d.di_nblocks += args->len; + ap->ip->i_nblocks += args->len; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) { ap->ip->i_delayed_blks -= args->len; @@ -3485,35 +3360,17 @@ xfs_bmap_btalloc_accounting( args->len); } -STATIC int -xfs_bmap_btalloc( - struct xfs_bmalloca *ap) /* bmap alloc argument struct */ +static int +xfs_bmap_compute_alignments( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args) { - xfs_mount_t *mp; /* mount point structure */ - xfs_alloctype_t atype = 0; /* type for allocation routines */ - xfs_extlen_t align = 0; /* minimum allocation alignment */ - xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ - xfs_agnumber_t ag; - xfs_alloc_arg_t args; - xfs_fileoff_t orig_offset; - xfs_extlen_t orig_length; - xfs_extlen_t blen; - xfs_extlen_t nextminlen = 0; - int nullfb; /* true if ap->firstblock isn't set */ - int isaligned; - int tryagain; - int error; - int stripe_align; - - ASSERT(ap->length); - orig_offset = ap->offset; - orig_length = ap->length; - - mp = ap->ip->i_mount; + struct xfs_mount *mp = args->mp; + xfs_extlen_t align = 0; /* minimum allocation alignment */ + int stripe_align = 0; /* stripe alignment for allocation is determined by mount parameters */ - stripe_align = 0; - if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC)) + if (mp->m_swidth && xfs_has_swalloc(mp)) stripe_align = mp->m_swidth; else if (mp->m_dalign) stripe_align = mp->m_dalign; @@ -3523,13 +3380,172 @@ xfs_bmap_btalloc( else if (ap->datatype & XFS_ALLOC_USERDATA) align = xfs_get_extsz_hint(ap->ip); if (align) { - error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, - align, 0, ap->eof, 0, ap->conv, - &ap->offset, &ap->length); - ASSERT(!error); + if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0, + ap->eof, 0, ap->conv, &ap->offset, + &ap->length)) + ASSERT(0); ASSERT(ap->length); } + /* apply extent size hints if obtained earlier */ + if (align) { + args->prod = align; + div_u64_rem(ap->offset, args->prod, &args->mod); + if (args->mod) + args->mod = args->prod - args->mod; + } else if (mp->m_sb.sb_blocksize >= PAGE_SIZE) { + args->prod = 1; + args->mod = 0; + } else { + args->prod = PAGE_SIZE >> mp->m_sb.sb_blocklog; + div_u64_rem(ap->offset, args->prod, &args->mod); + if (args->mod) + args->mod = args->prod - args->mod; + } + + return stripe_align; +} + +static void +xfs_bmap_process_allocated_extent( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_fileoff_t orig_offset, + xfs_extlen_t orig_length) +{ + int nullfb; + + nullfb = ap->tp->t_firstblock == NULLFSBLOCK; + + /* + * check the allocation happened at the same or higher AG than + * the first block that was allocated. + */ + ASSERT(nullfb || + XFS_FSB_TO_AGNO(args->mp, ap->tp->t_firstblock) <= + XFS_FSB_TO_AGNO(args->mp, args->fsbno)); + + ap->blkno = args->fsbno; + if (nullfb) + ap->tp->t_firstblock = args->fsbno; + ap->length = args->len; + /* + * If the extent size hint is active, we tried to round the + * caller's allocation request offset down to extsz and the + * length up to another extsz boundary. If we found a free + * extent we mapped it in starting at this new offset. If the + * newly mapped space isn't long enough to cover any of the + * range of offsets that was originally requested, move the + * mapping up so that we can fill as much of the caller's + * original request as possible. Free space is apparently + * very fragmented so we're unlikely to be able to satisfy the + * hints anyway. + */ + if (ap->length <= orig_length) + ap->offset = orig_offset; + else if (ap->offset + ap->length < orig_offset + orig_length) + ap->offset = orig_offset + orig_length - ap->length; + xfs_bmap_btalloc_accounting(ap, args); +} + +#ifdef DEBUG +static int +xfs_bmap_exact_minlen_extent_alloc( + struct xfs_bmalloca *ap) +{ + struct xfs_mount *mp = ap->ip->i_mount; + struct xfs_alloc_arg args = { .tp = ap->tp, .mp = mp }; + xfs_fileoff_t orig_offset; + xfs_extlen_t orig_length; + int error; + + ASSERT(ap->length); + + if (ap->minlen != 1) { + ap->blkno = NULLFSBLOCK; + ap->length = 0; + return 0; + } + + orig_offset = ap->offset; + orig_length = ap->length; + + args.alloc_minlen_only = 1; + + xfs_bmap_compute_alignments(ap, &args); + + if (ap->tp->t_firstblock == NULLFSBLOCK) { + /* + * Unlike the longest extent available in an AG, we don't track + * the length of an AG's shortest extent. + * XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT is a debug only knob and + * hence we can afford to start traversing from the 0th AG since + * we need not be concerned about a drop in performance in + * "debug only" code paths. + */ + ap->blkno = XFS_AGB_TO_FSB(mp, 0, 0); + } else { + ap->blkno = ap->tp->t_firstblock; + } + + args.fsbno = ap->blkno; + args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; + args.type = XFS_ALLOCTYPE_FIRST_AG; + args.minlen = args.maxlen = ap->minlen; + args.total = ap->total; + + args.alignment = 1; + args.minalignslop = 0; + + args.minleft = ap->minleft; + args.wasdel = ap->wasdel; + args.resv = XFS_AG_RESV_NONE; + args.datatype = ap->datatype; + + error = xfs_alloc_vextent(&args); + if (error) + return error; + + if (args.fsbno != NULLFSBLOCK) { + xfs_bmap_process_allocated_extent(ap, &args, orig_offset, + orig_length); + } else { + ap->blkno = NULLFSBLOCK; + ap->length = 0; + } + + return 0; +} +#else + +#define xfs_bmap_exact_minlen_extent_alloc(bma) (-EFSCORRUPTED) + +#endif + +STATIC int +xfs_bmap_btalloc( + struct xfs_bmalloca *ap) +{ + struct xfs_mount *mp = ap->ip->i_mount; + struct xfs_alloc_arg args = { .tp = ap->tp, .mp = mp }; + xfs_alloctype_t atype = 0; + xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ + xfs_agnumber_t ag; + xfs_fileoff_t orig_offset; + xfs_extlen_t orig_length; + xfs_extlen_t blen; + xfs_extlen_t nextminlen = 0; + int nullfb; /* true if ap->firstblock isn't set */ + int isaligned; + int tryagain; + int error; + int stripe_align; + + ASSERT(ap->length); + orig_offset = ap->offset; + orig_length = ap->length; + + stripe_align = xfs_bmap_compute_alignments(ap, &args); nullfb = ap->tp->t_firstblock == NULLFSBLOCK; fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, @@ -3560,9 +3576,6 @@ xfs_bmap_btalloc( * Normal allocation, done through xfs_alloc_vextent. */ tryagain = isaligned = 0; - memset(&args, 0, sizeof(args)); - args.tp = ap->tp; - args.mp = mp; args.fsbno = ap->blkno; args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; @@ -3593,21 +3606,7 @@ xfs_bmap_btalloc( args.total = ap->total; args.minlen = ap->minlen; } - /* apply extent size hints if obtained earlier */ - if (align) { - args.prod = align; - div_u64_rem(ap->offset, args.prod, &args.mod); - if (args.mod) - args.mod = args.prod - args.mod; - } else if (mp->m_sb.sb_blocksize >= PAGE_SIZE) { - args.prod = 1; - args.mod = 0; - } else { - args.prod = PAGE_SIZE >> mp->m_sb.sb_blocklog; - div_u64_rem(ap->offset, args.prod, &args.mod); - if (args.mod) - args.mod = args.prod - args.mod; - } + /* * If we are not low on available data blocks, and the underlying * logical volume manager is a stripe, and the file offset is zero then @@ -3709,37 +3708,10 @@ xfs_bmap_btalloc( return error; ap->tp->t_flags |= XFS_TRANS_LOWMODE; } + if (args.fsbno != NULLFSBLOCK) { - /* - * check the allocation happened at the same or higher AG than - * the first block that was allocated. - */ - ASSERT(ap->tp->t_firstblock == NULLFSBLOCK || - XFS_FSB_TO_AGNO(mp, ap->tp->t_firstblock) <= - XFS_FSB_TO_AGNO(mp, args.fsbno)); - - ap->blkno = args.fsbno; - if (ap->tp->t_firstblock == NULLFSBLOCK) - ap->tp->t_firstblock = args.fsbno; - ASSERT(nullfb || fb_agno <= args.agno); - ap->length = args.len; - /* - * If the extent size hint is active, we tried to round the - * caller's allocation request offset down to extsz and the - * length up to another extsz boundary. If we found a free - * extent we mapped it in starting at this new offset. If the - * newly mapped space isn't long enough to cover any of the - * range of offsets that was originally requested, move the - * mapping up so that we can fill as much of the caller's - * original request as possible. Free space is apparently - * very fragmented so we're unlikely to be able to satisfy the - * hints anyway. - */ - if (ap->length <= orig_length) - ap->offset = orig_offset; - else if (ap->offset + ap->length < orig_offset + orig_length) - ap->offset = orig_offset + orig_length - ap->length; - xfs_bmap_btalloc_accounting(ap, &args); + xfs_bmap_process_allocated_extent(ap, &args, orig_offset, + orig_length); } else { ap->blkno = NULLFSBLOCK; ap->length = 0; @@ -3792,7 +3764,7 @@ xfs_bmapi_trim_map( xfs_fileoff_t obno, xfs_fileoff_t end, int n, - int flags) + uint32_t flags) { if ((flags & XFS_BMAPI_ENTIRE) || got->br_startoff + got->br_blockcount <= obno) { @@ -3837,7 +3809,7 @@ xfs_bmapi_update_map( xfs_fileoff_t obno, xfs_fileoff_t end, int *n, - int flags) + uint32_t flags) { xfs_bmbt_irec_t *mval = *map; @@ -3890,10 +3862,11 @@ xfs_bmapi_read( xfs_filblks_t len, struct xfs_bmbt_irec *mval, int *nmap, - int flags) + uint32_t flags) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp; + int whichfork = xfs_bmapi_whichfork(flags); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec got; xfs_fileoff_t obno; xfs_fileoff_t end; @@ -3901,53 +3874,26 @@ xfs_bmapi_read( int error; bool eof = false; int n = 0; - int whichfork = xfs_bmapi_whichfork(flags); ASSERT(*nmap >= 1); - ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE| - XFS_BMAPI_COWFORK))); + ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_ENTIRE))); ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)); - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + if (WARN_ON_ONCE(!ifp)) + return -EFSCORRUPTED; + + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) return -EFSCORRUPTED; - } - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; XFS_STATS_INC(mp, xs_blk_mapr); - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!ifp) { - /* No CoW fork? Return a hole. */ - if (whichfork == XFS_COW_FORK) { - mval->br_startoff = bno; - mval->br_startblock = HOLESTARTBLOCK; - mval->br_blockcount = len; - mval->br_state = XFS_EXT_NORM; - *nmap = 1; - return 0; - } - - /* - * A missing attr ifork implies that the inode says we're in - * extents or btree format but failed to pass the inode fork - * verifier while trying to load it. Treat that as a file - * corruption too. - */ -#ifdef DEBUG - xfs_alert(mp, "%s: inode %llu missing fork %d", - __func__, ip->i_ino, whichfork); -#endif /* DEBUG */ - return -EFSCORRUPTED; - } - - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(NULL, ip, whichfork); - if (error) - return error; - } + error = xfs_iread_extents(NULL, ip, whichfork); + if (error) + return error; if (!xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) eof = true; @@ -4013,7 +3959,7 @@ xfs_bmapi_reserve_delalloc( int eof) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); xfs_extlen_t alen; xfs_extlen_t indlen; int error; @@ -4023,7 +3969,7 @@ xfs_bmapi_reserve_delalloc( * Cap the alloc length. Keep track of prealloc so we know whether to * tag the inode before we return. */ - alen = XFS_FILBLKS_MIN(len + prealloc, MAXEXTLEN); + alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN); if (!eof) alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); if (prealloc && alen >= len) @@ -4047,8 +3993,7 @@ xfs_bmapi_reserve_delalloc( * blocks. This number gets adjusted later. We return if we haven't * allocated blocks already inside this loop. */ - error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0, - XFS_QMOPT_RES_REGBLKS); + error = xfs_quota_reserve_blkres(ip, alen); if (error) return error; @@ -4094,8 +4039,7 @@ out_unreserve_blocks: xfs_mod_fdblocks(mp, alen, false); out_unreserve_quota: if (XFS_IS_QUOTA_ON(mp)) - xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, - XFS_QMOPT_RES_REGBLKS); + xfs_quota_unreserve_blkres(ip, alen); return error; } @@ -4129,6 +4073,10 @@ xfs_bmap_alloc_userdata( return xfs_bmap_rtalloc(bma); } + if (unlikely(XFS_TEST_ERROR(false, mp, + XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) + return xfs_bmap_exact_minlen_extent_alloc(bma); + return xfs_bmap_btalloc(bma); } @@ -4138,7 +4086,7 @@ xfs_bmapi_allocate( { struct xfs_mount *mp = bma->ip->i_mount; int whichfork = xfs_bmapi_whichfork(bma->flags); - struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork); int tmp_logflags = 0; int error; @@ -4154,7 +4102,7 @@ xfs_bmapi_allocate( if (!xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev)) bma->prev.br_startoff = NULLFILEOFF; } else { - bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN); + bma->length = XFS_FILBLKS_MIN(bma->length, XFS_MAX_BMBT_EXTLEN); if (!bma->eof) bma->length = XFS_FILBLKS_MIN(bma->length, bma->got.br_startoff - bma->offset); @@ -4165,10 +4113,15 @@ xfs_bmapi_allocate( else bma->minlen = 1; - if (bma->flags & XFS_BMAPI_METADATA) - error = xfs_bmap_btalloc(bma); - else + if (bma->flags & XFS_BMAPI_METADATA) { + if (unlikely(XFS_TEST_ERROR(false, mp, + XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) + error = xfs_bmap_exact_minlen_extent_alloc(bma); + else + error = xfs_bmap_btalloc(bma); + } else { error = xfs_bmap_alloc_userdata(bma); + } if (error || bma->blkno == NULLFSBLOCK) return error; @@ -4178,7 +4131,7 @@ xfs_bmapi_allocate( return error; } - if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) + if (ifp->if_format == XFS_DINODE_FMT_BTREE && !bma->cur) bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork); /* * Bump the number of extents we've allocated @@ -4187,25 +4140,15 @@ xfs_bmapi_allocate( bma->nallocs++; if (bma->cur) - bma->cur->bc_private.b.flags = - bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; + bma->cur->bc_ino.flags = + bma->wasdel ? XFS_BTCUR_BMBT_WASDEL : 0; bma->got.br_startoff = bma->offset; bma->got.br_startblock = bma->blkno; bma->got.br_blockcount = bma->length; bma->got.br_state = XFS_EXT_NORM; - /* - * In the data fork, a wasdelay extent has been initialized, so - * shouldn't be flagged as unwritten. - * - * For the cow fork, however, we convert delalloc reservations - * (extents allocated for speculative preallocation) to - * allocated unwritten extents, and only convert the unwritten - * extents to real extents when we're about to write the data. - */ - if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) && - (bma->flags & XFS_BMAPI_PREALLOC)) + if (bma->flags & XFS_BMAPI_PREALLOC) bma->got.br_state = XFS_EXT_UNWRITTEN; if (bma->wasdel) @@ -4239,10 +4182,10 @@ xfs_bmapi_convert_unwritten( struct xfs_bmalloca *bma, struct xfs_bmbt_irec *mval, xfs_filblks_t len, - int flags) + uint32_t flags) { int whichfork = xfs_bmapi_whichfork(flags); - struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork); int tmp_logflags = 0; int error; @@ -4261,7 +4204,7 @@ xfs_bmapi_convert_unwritten( * Modify (by adding) the state flag, if writing. */ ASSERT(mval->br_blockcount <= len); - if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) { + if (ifp->if_format == XFS_DINODE_FMT_BTREE && !bma->cur) { bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp, bma->ip, whichfork); } @@ -4319,11 +4262,13 @@ xfs_bmapi_minleft( struct xfs_inode *ip, int fork) { + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, fork); + if (tp && tp->t_firstblock != NULLFSBLOCK) return 0; - if (XFS_IFORK_FORMAT(ip, fork) != XFS_DINODE_FMT_BTREE) + if (ifp->if_format != XFS_DINODE_FMT_BTREE) return 1; - return be16_to_cpu(XFS_IFORK_PTR(ip, fork)->if_broot->bb_level) + 1; + return be16_to_cpu(ifp->if_broot->bb_level) + 1; } /* @@ -4338,11 +4283,13 @@ xfs_bmapi_finish( int whichfork, int error) { + struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork); + if ((bma->logflags & xfs_ilog_fext(whichfork)) && - XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + ifp->if_format != XFS_DINODE_FMT_EXTENTS) bma->logflags &= ~xfs_ilog_fext(whichfork); else if ((bma->logflags & xfs_ilog_fbroot(whichfork)) && - XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_BTREE) + ifp->if_format != XFS_DINODE_FMT_BTREE) bma->logflags &= ~xfs_ilog_fbroot(whichfork); if (bma->logflags) @@ -4363,7 +4310,7 @@ xfs_bmapi_write( struct xfs_inode *ip, /* incore inode */ xfs_fileoff_t bno, /* starting file offs. mapped */ xfs_filblks_t len, /* length to map in file */ - int flags, /* XFS_BMAPI_... */ + uint32_t flags, /* XFS_BMAPI_... */ xfs_extlen_t total, /* total blocks needed */ struct xfs_bmbt_irec *mval, /* output: map values */ int *nmap) /* i/o: mval size/count */ @@ -4374,13 +4321,13 @@ xfs_bmapi_write( .total = total, }; struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp; + int whichfork = xfs_bmapi_whichfork(flags); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); xfs_fileoff_t end; /* end of mapped file region */ bool eof = false; /* after the end of extents */ int error; /* error return */ int n; /* current extent index */ xfs_fileoff_t obno; /* old block number (offset) */ - int whichfork; /* data or attr fork */ #ifdef DEBUG xfs_fileoff_t orig_bno; /* original block number value */ @@ -4395,13 +4342,12 @@ xfs_bmapi_write( orig_mval = mval; orig_nmap = *nmap; #endif - whichfork = xfs_bmapi_whichfork(flags); ASSERT(*nmap >= 1); ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); ASSERT(tp != NULL); ASSERT(len > 0); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); + ASSERT(ifp->if_format != XFS_DINODE_FMT_LOCAL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(!(flags & XFS_BMAPI_REMAP)); @@ -4417,23 +4363,19 @@ xfs_bmapi_write( ASSERT((flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)) != (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)); - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; - ifp = XFS_IFORK_PTR(ip, whichfork); - XFS_STATS_INC(mp, xs_blk_mapw); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(tp, ip, whichfork); - if (error) - goto error0; - } + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + goto error0; if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got)) eof = true; @@ -4480,8 +4422,8 @@ xfs_bmapi_write( * xfs_extlen_t and therefore 32 bits. Hence we have to * check for 32-bit overflows and handle them here. */ - if (len > (xfs_filblks_t)MAXEXTLEN) - bma.length = MAXEXTLEN; + if (len > (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN) + bma.length = XFS_MAX_BMBT_EXTLEN; else bma.length = len; @@ -4536,9 +4478,8 @@ xfs_bmapi_write( if (error) goto error0; - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || - XFS_IFORK_NEXTENTS(ip, whichfork) > - XFS_IFORK_MAXEXT(ip, whichfork)); + ASSERT(ifp->if_format != XFS_DINODE_FMT_BTREE || + ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork)); xfs_bmapi_finish(&bma, whichfork, 0); xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, orig_nmap, *nmap); @@ -4562,7 +4503,7 @@ xfs_bmapi_convert_delalloc( struct iomap *iomap, unsigned int *seq) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); struct xfs_bmalloca bma = { NULL }; @@ -4585,6 +4526,14 @@ xfs_bmapi_convert_delalloc( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); + error = xfs_iext_count_may_overflow(ip, whichfork, + XFS_IEXT_ADD_NOSPLIT_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_ADD_NOSPLIT_CNT); + if (error) + goto out_trans_cancel; + if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) || bma.got.br_startoff > offset_fsb) { /* @@ -4602,7 +4551,7 @@ xfs_bmapi_convert_delalloc( * the extent. Just return the real extent at this offset. */ if (!isnullstartblock(bma.got.br_startblock)) { - xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags); + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); *seq = READ_ONCE(ifp->if_seq); goto out_trans_cancel; } @@ -4611,10 +4560,26 @@ xfs_bmapi_convert_delalloc( bma.ip = ip; bma.wasdel = true; bma.offset = bma.got.br_startoff; - bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN); + bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, + XFS_MAX_BMBT_EXTLEN); bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); + + /* + * When we're converting the delalloc reservations backing dirty pages + * in the page cache, we must be careful about how we create the new + * extents: + * + * New CoW fork extents are created unwritten, turned into real extents + * when we're about to write the data to disk, and mapped into the data + * fork after the write finishes. End of story. + * + * New data fork extents must be mapped in as unwritten and converted + * to real extents after the write succeeds to avoid exposing stale + * disk contents if we crash. + */ + bma.flags = XFS_BMAPI_PREALLOC; if (whichfork == XFS_COW_FORK) - bma.flags = XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; + bma.flags |= XFS_BMAPI_COWFORK; if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev)) bma.prev.br_startoff = NULLFILEOFF; @@ -4634,7 +4599,7 @@ xfs_bmapi_convert_delalloc( XFS_STATS_INC(mp, xs_xstrat_quick); ASSERT(!isnullstartblock(bma.got.br_startblock)); - xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags); + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); *seq = READ_ONCE(ifp->if_seq); if (whichfork == XFS_COW_FORK) @@ -4665,7 +4630,7 @@ xfs_bmapi_remap( xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, - int flags) + uint32_t flags) { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; @@ -4675,28 +4640,26 @@ xfs_bmapi_remap( int whichfork = xfs_bmapi_whichfork(flags); int logflags = 0, error; - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(len > 0); - ASSERT(len <= (xfs_filblks_t)MAXEXTLEN); + ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC | XFS_BMAPI_NORMAP))); ASSERT((flags & (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)) != (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)); - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(tp, ip, whichfork); - if (error) - return error; - } + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; if (xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) { /* make sure we only reflink into a hole. */ @@ -4704,12 +4667,12 @@ xfs_bmapi_remap( ASSERT(got.br_startoff - bno >= len); } - ip->i_d.di_nblocks += len; + ip->i_nblocks += len; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (ifp->if_flags & XFS_IFBROOT) { + if (ifp->if_format == XFS_DINODE_FMT_BTREE) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = 0; + cur->bc_ino.flags = 0; } got.br_startoff = bno; @@ -4728,9 +4691,9 @@ xfs_bmapi_remap( error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork); error0: - if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) + if (ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS) logflags &= ~XFS_ILOG_DEXT; - else if (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) + else if (ip->i_df.if_format != XFS_DINODE_FMT_BTREE) logflags &= ~XFS_ILOG_DBROOT; if (logflags) @@ -4834,12 +4797,12 @@ xfs_bmap_del_extent_delay( struct xfs_bmbt_irec *del) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec new; int64_t da_old, da_new, da_diff = 0; xfs_fileoff_t del_endoff, got_endoff; xfs_filblks_t got_indlen, new_indlen, stolen; - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); int error = 0; bool isrt; @@ -4867,9 +4830,8 @@ xfs_bmap_del_extent_delay( * sb counters as we might have to borrow some blocks for the * indirect block accounting. */ - error = xfs_trans_reserve_quota_nblks(NULL, ip, - -((long)del->br_blockcount), 0, - isrt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + ASSERT(!isrt); + error = xfs_quota_unreserve_blkres(ip, del->br_blockcount); if (error) return error; ip->i_delayed_blks -= del->br_blockcount; @@ -4962,10 +4924,10 @@ xfs_bmap_del_extent_cow( struct xfs_bmbt_irec *del) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); struct xfs_bmbt_irec new; xfs_fileoff_t del_endoff, got_endoff; - int state = BMAP_COWFORK; + uint32_t state = BMAP_COWFORK; XFS_STATS_INC(mp, xs_del_exlist); @@ -5034,11 +4996,11 @@ xfs_bmap_del_extent_real( xfs_inode_t *ip, /* incore inode pointer */ xfs_trans_t *tp, /* current transaction pointer */ struct xfs_iext_cursor *icur, - xfs_btree_cur_t *cur, /* if null, not a btree */ + struct xfs_btree_cur *cur, /* if null, not a btree */ xfs_bmbt_irec_t *del, /* data to remove from extents */ int *logflagsp, /* inode logging flags */ int whichfork, /* data or attr fork */ - int bflags) /* bmapi flags */ + uint32_t bflags) /* bmapi flags */ { xfs_fsblock_t del_endblock=0; /* first block past del */ xfs_fileoff_t del_endoff; /* first offset past del */ @@ -5054,13 +5016,13 @@ xfs_bmap_del_extent_real( xfs_bmbt_irec_t new; /* new record to be inserted */ /* REFERENCED */ uint qfield; /* quota field to update */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); struct xfs_bmbt_irec old; mp = ip->i_mount; XFS_STATS_INC(mp, xs_del_exlist); - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(del->br_blockcount > 0); xfs_iext_get_extent(ifp, icur, &got); ASSERT(got.br_startoff <= del->br_startoff); @@ -5080,28 +5042,32 @@ xfs_bmap_del_extent_real( * conversion to btree format, since the transaction will be dirty then. */ if (tp->t_blk_res == 0 && - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) >= - XFS_IFORK_MAXEXT(ip, whichfork) && + ifp->if_format == XFS_DINODE_FMT_EXTENTS && + ifp->if_nextents >= XFS_IFORK_MAXEXT(ip, whichfork) && del->br_startoff > got.br_startoff && del_endoff < got_endoff) return -ENOSPC; flags = XFS_ILOG_CORE; if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { - xfs_fsblock_t bno; xfs_filblks_t len; xfs_extlen_t mod; - bno = div_u64_rem(del->br_startblock, mp->m_sb.sb_rextsize, - &mod); - ASSERT(mod == 0); len = div_u64_rem(del->br_blockcount, mp->m_sb.sb_rextsize, &mod); ASSERT(mod == 0); - error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); - if (error) - goto done; + if (!(bflags & XFS_BMAPI_REMAP)) { + xfs_fsblock_t bno; + + bno = div_u64_rem(del->br_startblock, + mp->m_sb.sb_rextsize, &mod); + ASSERT(mod == 0); + + error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); + if (error) + goto done; + } + do_fx = 0; nblks = len * mp->m_sb.sb_rextsize; qfield = XFS_TRANS_DQ_RTBCOUNT; @@ -5134,8 +5100,8 @@ xfs_bmap_del_extent_real( */ xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + ifp->if_nextents--; + flags |= XFS_ILOG_CORE; if (!cur) { flags |= xfs_ilog_fext(whichfork); @@ -5182,6 +5148,7 @@ xfs_bmap_del_extent_real( /* * Deleting the middle of the extent. */ + old = got; got.br_blockcount = del->br_startoff - got.br_startoff; @@ -5243,8 +5210,8 @@ xfs_bmap_del_extent_real( } } else flags |= xfs_ilog_fext(whichfork); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + + ifp->if_nextents++; xfs_iext_next(ifp, icur); xfs_iext_insert(ip, icur, &new, state); break; @@ -5260,7 +5227,7 @@ xfs_bmap_del_extent_real( if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { xfs_refcount_decrease_extent(tp, del); } else { - __xfs_bmap_add_free(tp, del->br_startblock, + __xfs_free_extent_later(tp, del->br_startblock, del->br_blockcount, NULL, (bflags & XFS_BMAPI_NODISCARD) || del->br_state == XFS_EXT_UNWRITTEN); @@ -5271,7 +5238,7 @@ xfs_bmap_del_extent_real( * Adjust inode # blocks in the file. */ if (nblks) - ip->i_d.di_nblocks -= nblks; + ip->i_nblocks -= nblks; /* * Adjust quota data. */ @@ -5295,7 +5262,7 @@ __xfs_bunmapi( struct xfs_inode *ip, /* incore inode */ xfs_fileoff_t start, /* first file offset deleted */ xfs_filblks_t *rlen, /* i/o: amount remaining */ - int flags, /* misc flags */ + uint32_t flags, /* misc flags */ xfs_extnum_t nexts) /* number of extents max */ { struct xfs_btree_cur *cur; /* bmap btree cursor */ @@ -5313,8 +5280,6 @@ __xfs_bunmapi( int whichfork; /* data or attribute fork */ xfs_fsblock_t sum; xfs_filblks_t len = *rlen; /* length to unmap in file */ - xfs_fileoff_t max_len; - xfs_agnumber_t prev_agno = NULLAGNUMBER, agno; xfs_fileoff_t end; struct xfs_iext_cursor icur; bool done = false; @@ -5323,29 +5288,20 @@ __xfs_bunmapi( whichfork = xfs_bmapi_whichfork(flags); ASSERT(whichfork != XFS_COW_FORK); - ifp = XFS_IFORK_PTR(ip, whichfork); - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork))) + ifp = xfs_ifork_ptr(ip, whichfork); + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp))) return -EFSCORRUPTED; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(len > 0); ASSERT(nexts >= 0); - /* - * Guesstimate how many blocks we can unmap without running the risk of - * blowing out the transaction with a mix of EFIs and reflink - * adjustments. - */ - if (tp && xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) - max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res)); - else - max_len = len; - - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) + error = xfs_iread_extents(tp, ip, whichfork); + if (error) return error; + if (xfs_iext_count(ifp) == 0) { *rlen = 0; return 0; @@ -5361,10 +5317,10 @@ __xfs_bunmapi( end--; logflags = 0; - if (ifp->if_flags & XFS_IFBROOT) { - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); + if (ifp->if_format == XFS_DINODE_FMT_BTREE) { + ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE); cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = 0; + cur->bc_ino.flags = 0; } else cur = NULL; @@ -5380,7 +5336,7 @@ __xfs_bunmapi( extno = 0; while (end != (xfs_fileoff_t)-1 && end >= start && - (nexts == 0 || extno < nexts) && max_len > 0) { + (nexts == 0 || extno < nexts)) { /* * Is the found extent after a hole in which end lives? * Just back up to the previous extent, if so. @@ -5405,16 +5361,6 @@ __xfs_bunmapi( del = got; wasdel = isnullstartblock(del.br_startblock); - /* - * Make sure we don't touch multiple AGF headers out of order - * in a single transaction, as that could cause AB-BA deadlocks. - */ - if (!wasdel && !isrt) { - agno = XFS_FSB_TO_AGNO(mp, del.br_startblock); - if (prev_agno != NULLAGNUMBER && prev_agno > agno) - break; - prev_agno = agno; - } if (got.br_startoff < start) { del.br_startoff = start; del.br_blockcount -= start - got.br_startoff; @@ -5424,14 +5370,6 @@ __xfs_bunmapi( if (del.br_startoff + del.br_blockcount > end + 1) del.br_blockcount = end + 1 - del.br_startoff; - /* How much can we safely unmap? */ - if (max_len < del.br_blockcount) { - del.br_startoff += del.br_blockcount - max_len; - if (!wasdel) - del.br_startblock += del.br_blockcount - max_len; - del.br_blockcount = max_len; - } - if (!isrt) goto delete; @@ -5567,7 +5505,6 @@ delete: if (error) goto error0; - max_len -= del.br_blockcount; end = del.br_startoff - 1; nodelete: /* @@ -5607,10 +5544,10 @@ error0: * logging the extent records if we've converted to btree format. */ if ((logflags & xfs_ilog_fext(whichfork)) && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + ifp->if_format != XFS_DINODE_FMT_EXTENTS) logflags &= ~xfs_ilog_fext(whichfork); else if ((logflags & xfs_ilog_fbroot(whichfork)) && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) + ifp->if_format != XFS_DINODE_FMT_BTREE) logflags &= ~xfs_ilog_fbroot(whichfork); /* * Log inode even in the error case, if the transaction @@ -5620,7 +5557,7 @@ error0: xfs_trans_log_inode(tp, ip, logflags); if (cur) { if (!error) - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; xfs_btree_del_cursor(cur, error); } return error; @@ -5633,7 +5570,7 @@ xfs_bunmapi( struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, - int flags, + uint32_t flags, xfs_extnum_t nexts, int *done) { @@ -5665,7 +5602,7 @@ xfs_bmse_can_merge( if ((left->br_startoff + left->br_blockcount != startoff) || (left->br_startblock + left->br_blockcount != got->br_startblock) || (left->br_state != got->br_state) || - (left->br_blockcount + got->br_blockcount > MAXEXTLEN)) + (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN)) return false; return true; @@ -5692,6 +5629,7 @@ xfs_bmse_merge( struct xfs_btree_cur *cur, int *logflags) /* output */ { + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec new; xfs_filblks_t blockcount; int error, i; @@ -5710,8 +5648,7 @@ xfs_bmse_merge( * Update the on-disk extent count, the btree if necessary and log the * inode. */ - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + ifp->if_nextents--; *logflags |= XFS_ILOG_CORE; if (!cur) { *logflags |= XFS_ILOG_DEXT; @@ -5749,7 +5686,7 @@ xfs_bmse_merge( done: xfs_iext_remove(ip, icur, 0); - xfs_iext_prev(XFS_IFORK_PTR(ip, whichfork), icur); + xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, xfs_bmap_fork_to_state(whichfork), icur, &new); @@ -5813,7 +5750,7 @@ xfs_bmap_collapse_extents( { int whichfork = XFS_DATA_FORK; struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur = NULL; struct xfs_bmbt_irec got, prev; struct xfs_iext_cursor icur; @@ -5821,25 +5758,23 @@ xfs_bmap_collapse_extents( int error = 0; int logflags = 0; - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL)); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(tp, ip, whichfork); - if (error) - return error; - } + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; - if (ifp->if_flags & XFS_IFBROOT) { + if (ifp->if_format == XFS_DINODE_FMT_BTREE) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = 0; + cur->bc_ino.flags = 0; } if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) { @@ -5906,7 +5841,7 @@ xfs_bmap_can_insert_extents( ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + if (xfs_is_shutdown(ip->i_mount)) return -EIO; xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -5930,7 +5865,7 @@ xfs_bmap_insert_extents( { int whichfork = XFS_DATA_FORK; struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur = NULL; struct xfs_bmbt_irec got, next; struct xfs_iext_cursor icur; @@ -5938,25 +5873,23 @@ xfs_bmap_insert_extents( int error = 0; int logflags = 0; - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL)); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(tp, ip, whichfork); - if (error) - return error; - } + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; - if (ifp->if_flags & XFS_IFBROOT) { + if (ifp->if_format == XFS_DINODE_FMT_BTREE) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = 0; + cur->bc_ino.flags = 0; } if (*next_fsb == NULLFSBLOCK) { @@ -6025,39 +5958,36 @@ del_cursor: * @split_fsb is a block where the extents is split. If split_fsb lies in a * hole or the first block of extents, just return 0. */ -STATIC int -xfs_bmap_split_extent_at( +int +xfs_bmap_split_extent( struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t split_fsb) { int whichfork = XFS_DATA_FORK; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur = NULL; struct xfs_bmbt_irec got; struct xfs_bmbt_irec new; /* split extent */ struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp; xfs_fsblock_t gotblkcnt; /* new block count for got */ struct xfs_iext_cursor icur; int error = 0; int logflags = 0; int i = 0; - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - /* Read in all the extents */ - error = xfs_iread_extents(tp, ip, whichfork); - if (error) - return error; - } + /* Read in all the extents */ + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; /* * If there are not extents, or split_fsb lies in a hole we are done. @@ -6072,9 +6002,9 @@ xfs_bmap_split_extent_at( new.br_blockcount = got.br_blockcount - gotblkcnt; new.br_state = got.br_state; - if (ifp->if_flags & XFS_IFBROOT) { + if (ifp->if_format == XFS_DINODE_FMT_BTREE) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = 0; + cur->bc_ino.flags = 0; error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) goto del_cursor; @@ -6099,8 +6029,7 @@ xfs_bmap_split_extent_at( /* Add new extent */ xfs_iext_next(ifp, &icur); xfs_iext_insert(ip, &icur, &new, 0); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + ifp->if_nextents++; if (cur) { error = xfs_bmbt_lookup_eq(cur, &new, &i); @@ -6133,7 +6062,7 @@ xfs_bmap_split_extent_at( del_cursor: if (cur) { - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; xfs_btree_del_cursor(cur, error); } @@ -6142,34 +6071,6 @@ del_cursor: return error; } -int -xfs_bmap_split_extent( - struct xfs_inode *ip, - xfs_fileoff_t split_fsb) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; - int error; - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, - XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp); - if (error) - return error; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - - error = xfs_bmap_split_extent_at(tp, ip, split_fsb); - if (error) - goto out; - - return xfs_trans_commit(tp); - -out: - xfs_trans_cancel(tp); - return error; -} - /* Deferred mapping is only for real extents in the data fork. */ static bool xfs_bmap_is_update_needed( @@ -6199,7 +6100,7 @@ __xfs_bmap_add( bmap->br_blockcount, bmap->br_state); - bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_NOFS); + bi = kmem_cache_alloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL); INIT_LIST_HEAD(&bi->bi_list); bi->bi_type = type; bi->bi_owner = ip; @@ -6293,26 +6194,37 @@ xfs_bmap_validate_extent( struct xfs_bmbt_irec *irec) { struct xfs_mount *mp = ip->i_mount; - xfs_fsblock_t endfsb; - bool isrt; - isrt = XFS_IS_REALTIME_INODE(ip); - endfsb = irec->br_startblock + irec->br_blockcount - 1; - if (isrt) { - if (!xfs_verify_rtbno(mp, irec->br_startblock)) - return __this_address; - if (!xfs_verify_rtbno(mp, endfsb)) + if (!xfs_verify_fileext(mp, irec->br_startoff, irec->br_blockcount)) + return __this_address; + + if (XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK) { + if (!xfs_verify_rtext(mp, irec->br_startblock, + irec->br_blockcount)) return __this_address; } else { - if (!xfs_verify_fsbno(mp, irec->br_startblock)) - return __this_address; - if (!xfs_verify_fsbno(mp, endfsb)) - return __this_address; - if (XFS_FSB_TO_AGNO(mp, irec->br_startblock) != - XFS_FSB_TO_AGNO(mp, endfsb)) + if (!xfs_verify_fsbext(mp, irec->br_startblock, + irec->br_blockcount)) return __this_address; } if (irec->br_state != XFS_EXT_NORM && whichfork != XFS_DATA_FORK) return __this_address; return NULL; } + +int __init +xfs_bmap_intent_init_cache(void) +{ + xfs_bmap_intent_cache = kmem_cache_create("xfs_bmap_intent", + sizeof(struct xfs_bmap_intent), + 0, 0, NULL); + + return xfs_bmap_intent_cache != NULL ? 0 : -ENOMEM; +} + +void +xfs_bmap_intent_destroy_cache(void) +{ + kmem_cache_destroy(xfs_bmap_intent_cache); + xfs_bmap_intent_cache = NULL; +} diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 14d25e0b7d9c..16db95b11589 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. @@ -13,8 +13,6 @@ struct xfs_inode; struct xfs_mount; struct xfs_trans; -extern kmem_zone_t *xfs_bmap_free_item_zone; - /* * Argument structure for xfs_bmap_alloc. */ @@ -41,20 +39,7 @@ struct xfs_bmalloca { bool aeof; /* allocated space at eof */ bool conv; /* overwriting unwritten extents */ int datatype;/* data type being allocated */ - int flags; -}; - -/* - * List of extents to be free "later". - * The list is kept sorted on xbf_startblock. - */ -struct xfs_extent_free_item -{ - xfs_fsblock_t xefi_startblock;/* starting fs block number */ - xfs_extlen_t xefi_blockcount;/* number of blocks in extent */ - struct list_head xefi_list; - struct xfs_owner_info xefi_oinfo; /* extent owner */ - bool xefi_skip_discard; + uint32_t flags; }; #define XFS_BMAP_MAX_NMAP 4 @@ -62,17 +47,17 @@ struct xfs_extent_free_item /* * Flags for xfs_bmapi_* */ -#define XFS_BMAPI_ENTIRE 0x001 /* return entire extent, not trimmed */ -#define XFS_BMAPI_METADATA 0x002 /* mapping metadata not user data */ -#define XFS_BMAPI_ATTRFORK 0x004 /* use attribute fork not data */ -#define XFS_BMAPI_PREALLOC 0x008 /* preallocation op: unwritten space */ -#define XFS_BMAPI_CONTIG 0x020 /* must allocate only one extent */ +#define XFS_BMAPI_ENTIRE (1u << 0) /* return entire extent untrimmed */ +#define XFS_BMAPI_METADATA (1u << 1) /* mapping metadata not user data */ +#define XFS_BMAPI_ATTRFORK (1u << 2) /* use attribute fork not data */ +#define XFS_BMAPI_PREALLOC (1u << 3) /* preallocating unwritten space */ +#define XFS_BMAPI_CONTIG (1u << 4) /* must allocate only one extent */ /* * unwritten extent conversion - this needs write cache flushing and no additional * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts * from written to unwritten, otherwise convert from unwritten to written. */ -#define XFS_BMAPI_CONVERT 0x040 +#define XFS_BMAPI_CONVERT (1u << 5) /* * allocate zeroed extents - this requires all newly allocated user data extents @@ -80,7 +65,7 @@ struct xfs_extent_free_item * Use in conjunction with XFS_BMAPI_CONVERT to convert unwritten extents found * during the allocation range to zeroed written extents. */ -#define XFS_BMAPI_ZERO 0x080 +#define XFS_BMAPI_ZERO (1u << 6) /* * Map the inode offset to the block given in ap->firstblock. Primarily @@ -90,16 +75,16 @@ struct xfs_extent_free_item * For bunmapi, this flag unmaps the range without adjusting quota, reducing * refcount, or freeing the blocks. */ -#define XFS_BMAPI_REMAP 0x100 +#define XFS_BMAPI_REMAP (1u << 7) /* Map something in the CoW fork. */ -#define XFS_BMAPI_COWFORK 0x200 +#define XFS_BMAPI_COWFORK (1u << 8) /* Skip online discard of freed extents */ -#define XFS_BMAPI_NODISCARD 0x1000 +#define XFS_BMAPI_NODISCARD (1u << 9) /* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */ -#define XFS_BMAPI_NORMAP 0x2000 +#define XFS_BMAPI_NORMAP (1u << 10) #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ @@ -121,7 +106,7 @@ static inline int xfs_bmapi_aflag(int w) (w == XFS_COW_FORK ? XFS_BMAPI_COWFORK : 0)); } -static inline int xfs_bmapi_whichfork(int bmapi_flags) +static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags) { if (bmapi_flags & XFS_BMAPI_COWFORK) return XFS_COW_FORK; @@ -139,16 +124,16 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags) /* * Flags for xfs_bmap_add_extent*. */ -#define BMAP_LEFT_CONTIG (1 << 0) -#define BMAP_RIGHT_CONTIG (1 << 1) -#define BMAP_LEFT_FILLING (1 << 2) -#define BMAP_RIGHT_FILLING (1 << 3) -#define BMAP_LEFT_DELAY (1 << 4) -#define BMAP_RIGHT_DELAY (1 << 5) -#define BMAP_LEFT_VALID (1 << 6) -#define BMAP_RIGHT_VALID (1 << 7) -#define BMAP_ATTRFORK (1 << 8) -#define BMAP_COWFORK (1 << 9) +#define BMAP_LEFT_CONTIG (1u << 0) +#define BMAP_RIGHT_CONTIG (1u << 1) +#define BMAP_LEFT_FILLING (1u << 2) +#define BMAP_RIGHT_FILLING (1u << 3) +#define BMAP_LEFT_DELAY (1u << 4) +#define BMAP_RIGHT_DELAY (1u << 5) +#define BMAP_LEFT_VALID (1u << 6) +#define BMAP_RIGHT_VALID (1u << 7) +#define BMAP_ATTRFORK (1u << 8) +#define BMAP_COWFORK (1u << 9) #define XFS_BMAP_EXT_FLAGS \ { BMAP_LEFT_CONTIG, "LC" }, \ @@ -158,17 +143,22 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags) { BMAP_ATTRFORK, "ATTR" }, \ { BMAP_COWFORK, "COW" } +/* Return true if the extent is an allocated extent, written or not. */ +static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) +{ + return irec->br_startblock != HOLESTARTBLOCK && + irec->br_startblock != DELAYSTARTBLOCK && + !isnullstartblock(irec->br_startblock); +} /* * Return true if the extent is a real, allocated extent, or false if it is a * delayed allocation, and unwritten extent or a hole. */ -static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) +static inline bool xfs_bmap_is_written_extent(struct xfs_bmbt_irec *irec) { - return irec->br_state != XFS_EXT_UNWRITTEN && - irec->br_startblock != HOLESTARTBLOCK && - irec->br_startblock != DELAYSTARTBLOCK && - !isnullstartblock(irec->br_startblock); + return xfs_bmap_is_real_extent(irec) && + irec->br_state != XFS_EXT_UNWRITTEN; } /* @@ -180,13 +170,10 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); +unsigned int xfs_bmap_compute_attr_offset(struct xfs_mount *mp); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); -int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version); void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork); -void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno, - xfs_filblks_t len, const struct xfs_owner_info *oinfo, - bool skip_discard); void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); @@ -194,18 +181,17 @@ int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *last_block, int whichfork); int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused, int whichfork); -int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork); int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, struct xfs_bmbt_irec *mval, - int *nmap, int flags); + int *nmap, uint32_t flags); int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t bno, xfs_filblks_t len, int flags, + xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap); int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t bno, xfs_filblks_t *rlen, int flags, + xfs_fileoff_t bno, xfs_filblks_t *rlen, uint32_t flags, xfs_extnum_t nexts); int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t bno, xfs_filblks_t len, int flags, + xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extnum_t nexts, int *done); int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, @@ -222,7 +208,8 @@ int xfs_bmap_can_insert_extents(struct xfs_inode *ip, xfs_fileoff_t off, int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, bool *done, xfs_fileoff_t stop_fsb); -int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); +int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t split_offset); int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, @@ -234,16 +221,6 @@ int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp, struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp, struct xfs_bmbt_irec *new, int *logflagsp); -static inline void -xfs_bmap_add_free( - struct xfs_trans *tp, - xfs_fsblock_t bno, - xfs_filblks_t len, - const struct xfs_owner_info *oinfo) -{ - __xfs_bmap_add_free(tp, bno, len, oinfo, false); -} - enum xfs_bmap_intent_type { XFS_BMAP_MAP = 1, XFS_BMAP_UNMAP, @@ -252,8 +229,8 @@ enum xfs_bmap_intent_type { struct xfs_bmap_intent { struct list_head bi_list; enum xfs_bmap_intent_type bi_type; - struct xfs_inode *bi_owner; int bi_whichfork; + struct xfs_inode *bi_owner; struct xfs_bmbt_irec bi_bmap; }; @@ -266,7 +243,7 @@ void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_bmbt_irec *imap); -static inline int xfs_bmap_fork_to_state(int whichfork) +static inline uint32_t xfs_bmap_fork_to_state(int whichfork) { switch (whichfork) { case XFS_ATTR_FORK: @@ -283,6 +260,11 @@ xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork, int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, - int flags); + uint32_t flags); + +extern struct kmem_cache *xfs_bmap_intent_cache; + +int __init xfs_bmap_intent_init_cache(void); +void xfs_bmap_intent_destroy_cache(void); #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index ffe608d2a2d9..cfa052d40105 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -22,6 +22,8 @@ #include "xfs_trace.h" #include "xfs_rmap.h" +static struct kmem_cache *xfs_bmbt_cur_cache; + /* * Convert on-disk form of btree root to in-memory form. */ @@ -58,7 +60,7 @@ xfs_bmdr_to_bmbt( void xfs_bmbt_disk_get_all( - struct xfs_bmbt_rec *rec, + const struct xfs_bmbt_rec *rec, struct xfs_bmbt_irec *irec) { uint64_t l0 = get_unaligned_be64(&rec->l0); @@ -78,7 +80,7 @@ xfs_bmbt_disk_get_all( */ xfs_filblks_t xfs_bmbt_disk_get_blockcount( - xfs_bmbt_rec_t *r) + const struct xfs_bmbt_rec *r) { return (xfs_filblks_t)(be64_to_cpu(r->l1) & xfs_mask64lo(21)); } @@ -88,7 +90,7 @@ xfs_bmbt_disk_get_blockcount( */ xfs_fileoff_t xfs_bmbt_disk_get_startoff( - xfs_bmbt_rec_t *r) + const struct xfs_bmbt_rec *r) { return ((xfs_fileoff_t)be64_to_cpu(r->l0) & xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; @@ -136,7 +138,7 @@ xfs_bmbt_to_bmdr( xfs_bmbt_key_t *tkp; __be64 *tpp; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC)); ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)); @@ -166,13 +168,13 @@ xfs_bmbt_dup_cursor( struct xfs_btree_cur *new; new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_private.b.ip, cur->bc_private.b.whichfork); + cur->bc_ino.ip, cur->bc_ino.whichfork); /* * Copy the firstblock, dfops, and flags values, * since init cursor doesn't get them. */ - new->bc_private.b.flags = cur->bc_private.b.flags; + new->bc_ino.flags = cur->bc_ino.flags; return new; } @@ -183,20 +185,20 @@ xfs_bmbt_update_cursor( struct xfs_btree_cur *dst) { ASSERT((dst->bc_tp->t_firstblock != NULLFSBLOCK) || - (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME)); + (dst->bc_ino.ip->i_diflags & XFS_DIFLAG_REALTIME)); - dst->bc_private.b.allocated += src->bc_private.b.allocated; + dst->bc_ino.allocated += src->bc_ino.allocated; dst->bc_tp->t_firstblock = src->bc_tp->t_firstblock; - src->bc_private.b.allocated = 0; + src->bc_ino.allocated = 0; } STATIC int xfs_bmbt_alloc_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *start, - union xfs_btree_ptr *new, - int *stat) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) { xfs_alloc_arg_t args; /* block allocation args */ int error; /* error return value */ @@ -205,8 +207,8 @@ xfs_bmbt_alloc_block( args.tp = cur->bc_tp; args.mp = cur->bc_mp; args.fsbno = cur->bc_tp->t_firstblock; - xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_private.b.ip->i_ino, - cur->bc_private.b.whichfork); + xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_ino.ip->i_ino, + cur->bc_ino.whichfork); if (args.fsbno == NULLFSBLOCK) { args.fsbno = be64_to_cpu(start->l); @@ -230,7 +232,7 @@ xfs_bmbt_alloc_block( } args.minlen = args.maxlen = args.prod = 1; - args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; + args.wasdel = cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL; if (!args.wasdel && args.tp->t_blk_res == 0) { error = -ENOSPC; goto error0; @@ -259,10 +261,10 @@ xfs_bmbt_alloc_block( ASSERT(args.len == 1); cur->bc_tp->t_firstblock = args.fsbno; - cur->bc_private.b.allocated++; - cur->bc_private.b.ip->i_d.di_nblocks++; - xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE); - xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip, + cur->bc_ino.allocated++; + cur->bc_ino.ip->i_nblocks++; + xfs_trans_log_inode(args.tp, cur->bc_ino.ip, XFS_ILOG_CORE); + xfs_trans_mod_dquot_byino(args.tp, cur->bc_ino.ip, XFS_TRANS_DQ_BCOUNT, 1L); new->l = cpu_to_be64(args.fsbno); @@ -280,14 +282,14 @@ xfs_bmbt_free_block( struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; - struct xfs_inode *ip = cur->bc_private.b.ip; + struct xfs_inode *ip = cur->bc_ino.ip; struct xfs_trans *tp = cur->bc_tp; - xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); struct xfs_owner_info oinfo; - xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_private.b.whichfork); - xfs_bmap_add_free(cur->bc_tp, fsbno, 1, &oinfo); - ip->i_d.di_nblocks--; + xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); + xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo); + ip->i_nblocks--; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); @@ -302,8 +304,8 @@ xfs_bmbt_get_minrecs( if (level == cur->bc_nlevels - 1) { struct xfs_ifork *ifp; - ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, - cur->bc_private.b.whichfork); + ifp = xfs_ifork_ptr(cur->bc_ino.ip, + cur->bc_ino.whichfork); return xfs_bmbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, level == 0) / 2; @@ -320,8 +322,8 @@ xfs_bmbt_get_maxrecs( if (level == cur->bc_nlevels - 1) { struct xfs_ifork *ifp; - ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, - cur->bc_private.b.whichfork); + ifp = xfs_ifork_ptr(cur->bc_ino.ip, + cur->bc_ino.whichfork); return xfs_bmbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, level == 0); @@ -347,13 +349,13 @@ xfs_bmbt_get_dmaxrecs( { if (level != cur->bc_nlevels - 1) return cur->bc_mp->m_bmap_dmxr[level != 0]; - return xfs_bmdr_maxrecs(cur->bc_private.b.forksize, level == 0); + return xfs_bmdr_maxrecs(cur->bc_ino.forksize, level == 0); } STATIC void xfs_bmbt_init_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { key->bmbt.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt)); @@ -361,8 +363,8 @@ xfs_bmbt_init_key_from_rec( STATIC void xfs_bmbt_init_high_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { key->bmbt.br_startoff = cpu_to_be64( xfs_bmbt_disk_get_startoff(&rec->bmbt) + @@ -387,8 +389,8 @@ xfs_bmbt_init_ptr_from_cur( STATIC int64_t xfs_bmbt_key_diff( - struct xfs_btree_cur *cur, - union xfs_btree_key *key) + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) { return (int64_t)be64_to_cpu(key->bmbt.br_startoff) - cur->bc_rec.b.br_startoff; @@ -396,12 +398,12 @@ xfs_bmbt_key_diff( STATIC int64_t xfs_bmbt_diff_two_keys( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { - uint64_t a = be64_to_cpu(k1->bmbt.br_startoff); - uint64_t b = be64_to_cpu(k2->bmbt.br_startoff); + uint64_t a = be64_to_cpu(k1->bmbt.br_startoff); + uint64_t b = be64_to_cpu(k2->bmbt.br_startoff); /* * Note: This routine previously casted a and b to int64 and subtracted @@ -428,7 +430,7 @@ xfs_bmbt_verify( if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { /* * XXX: need a better way of verifying the owner here. Right now * just make sure there has been one set. @@ -497,9 +499,9 @@ const struct xfs_buf_ops xfs_bmbt_buf_ops = { STATIC int xfs_bmbt_keys_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { return be64_to_cpu(k1->bmbt.br_startoff) < be64_to_cpu(k2->bmbt.br_startoff); @@ -507,9 +509,9 @@ xfs_bmbt_keys_inorder( STATIC int xfs_bmbt_recs_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_rec *r1, - union xfs_btree_rec *r2) + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) { return xfs_bmbt_disk_get_startoff(&r1->bmbt) + xfs_bmbt_disk_get_blockcount(&r1->bmbt) <= @@ -548,33 +550,40 @@ xfs_bmbt_init_cursor( struct xfs_inode *ip, /* inode owning the btree */ int whichfork) /* data or attr fork */ { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur; ASSERT(whichfork != XFS_COW_FORK); - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); - - cur->bc_tp = tp; - cur->bc_mp = mp; + cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BMAP, + mp->m_bm_maxlevels[whichfork], xfs_bmbt_cur_cache); cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; - cur->bc_btnum = XFS_BTNUM_BMAP; - cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2); cur->bc_ops = &xfs_bmbt_ops; cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE; - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork); - cur->bc_private.b.ip = ip; - cur->bc_private.b.allocated = 0; - cur->bc_private.b.flags = 0; - cur->bc_private.b.whichfork = whichfork; + cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork); + cur->bc_ino.ip = ip; + cur->bc_ino.allocated = 0; + cur->bc_ino.flags = 0; + cur->bc_ino.whichfork = whichfork; return cur; } +/* Calculate number of records in a block mapping btree block. */ +static inline unsigned int +xfs_bmbt_block_maxrecs( + unsigned int blocklen, + bool leaf) +{ + if (leaf) + return blocklen / sizeof(xfs_bmbt_rec_t); + return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)); +} + /* * Calculate number of records in a bmap btree block. */ @@ -585,10 +594,29 @@ xfs_bmbt_maxrecs( int leaf) { blocklen -= XFS_BMBT_BLOCK_LEN(mp); + return xfs_bmbt_block_maxrecs(blocklen, leaf); +} - if (leaf) - return blocklen / sizeof(xfs_bmbt_rec_t); - return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)); +/* + * Calculate the maximum possible height of the btree that the on-disk format + * supports. This is used for sizing structures large enough to support every + * possible configuration of a filesystem that might get mounted. + */ +unsigned int +xfs_bmbt_maxlevels_ondisk(void) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = min(XFS_MIN_BLOCKSIZE - XFS_BTREE_SBLOCK_LEN, + XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN); + + minrecs[0] = xfs_bmbt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = xfs_bmbt_block_maxrecs(blocklen, false) / 2; + + /* One extra level for the inode root. */ + return xfs_btree_compute_maxlevels(minrecs, + XFS_MAX_EXTCNT_DATA_FORK_LARGE) + 1; } /* @@ -636,15 +664,10 @@ xfs_bmbt_change_owner( ASSERT(tp || buffer_list); ASSERT(!(tp && buffer_list)); - if (whichfork == XFS_DATA_FORK) - ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_BTREE); - else - ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE); + ASSERT(xfs_ifork_ptr(ip, whichfork)->if_format == XFS_DINODE_FMT_BTREE); cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); - if (!cur) - return -ENOMEM; - cur->bc_private.b.flags |= XFS_BTCUR_BPRV_INVALID_OWNER; + cur->bc_ino.flags |= XFS_BTCUR_BMBT_INVALID_OWNER; error = xfs_btree_change_owner(cur, new_owner, buffer_list); xfs_btree_del_cursor(cur, error); @@ -659,3 +682,22 @@ xfs_bmbt_calc_size( { return xfs_btree_calc_size(mp->m_bmap_dmnr, len); } + +int __init +xfs_bmbt_init_cur_cache(void) +{ + xfs_bmbt_cur_cache = kmem_cache_create("xfs_bmbt_cur", + xfs_btree_cur_sizeof(xfs_bmbt_maxlevels_ondisk()), + 0, 0, NULL); + + if (!xfs_bmbt_cur_cache) + return -ENOMEM; + return 0; +} + +void +xfs_bmbt_destroy_cur_cache(void) +{ + kmem_cache_destroy(xfs_bmbt_cur_cache); + xfs_bmbt_cur_cache = NULL; +} diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index 29b407d053b4..3e7a40a83835 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2002-2005 Silicon Graphics, Inc. * All Rights Reserved. @@ -16,7 +16,7 @@ struct xfs_trans; * Btree block header size depends on a superblock flag. */ #define XFS_BMBT_BLOCK_LEN(mp) \ - (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + (xfs_has_crc(((mp))) ? \ XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN) #define XFS_BMBT_REC_ADDR(mp, block, index) \ @@ -88,9 +88,10 @@ extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int, struct xfs_btree_block *, int); void xfs_bmbt_disk_set_all(struct xfs_bmbt_rec *r, struct xfs_bmbt_irec *s); -extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r); -extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r); -extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s); +extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(const struct xfs_bmbt_rec *r); +extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(const struct xfs_bmbt_rec *r); +void xfs_bmbt_disk_get_all(const struct xfs_bmbt_rec *r, + struct xfs_bmbt_irec *s); extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int, xfs_bmdr_block_t *, int); @@ -109,4 +110,9 @@ extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, extern unsigned long long xfs_bmbt_calc_size(struct xfs_mount *mp, unsigned long long len); +unsigned int xfs_bmbt_maxlevels_ondisk(void); + +int __init xfs_bmbt_init_cur_cache(void); +void xfs_bmbt_destroy_cur_cache(void); + #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index fd300dc93ca4..4c16c8c31fcb 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -20,11 +20,13 @@ #include "xfs_trace.h" #include "xfs_alloc.h" #include "xfs_log.h" - -/* - * Cursor allocation zone. - */ -kmem_zone_t *xfs_btree_cur_zone; +#include "xfs_btree_staging.h" +#include "xfs_ag.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_rmap_btree.h" +#include "xfs_refcount_btree.h" /* * Btree magic numbers. @@ -50,6 +52,70 @@ xfs_btree_magic( } /* + * These sibling pointer checks are optimised for null sibling pointers. This + * happens a lot, and we don't need to byte swap at runtime if the sibling + * pointer is NULL. + * + * These are explicitly marked at inline because the cost of calling them as + * functions instead of inlining them is about 36 bytes extra code per call site + * on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of these + * two sibling check functions reduces the compiled code size by over 300 + * bytes. + */ +static inline xfs_failaddr_t +xfs_btree_check_lblock_siblings( + struct xfs_mount *mp, + struct xfs_btree_cur *cur, + int level, + xfs_fsblock_t fsb, + __be64 dsibling) +{ + xfs_fsblock_t sibling; + + if (dsibling == cpu_to_be64(NULLFSBLOCK)) + return NULL; + + sibling = be64_to_cpu(dsibling); + if (sibling == fsb) + return __this_address; + if (level >= 0) { + if (!xfs_btree_check_lptr(cur, sibling, level + 1)) + return __this_address; + } else { + if (!xfs_verify_fsbno(mp, sibling)) + return __this_address; + } + + return NULL; +} + +static inline xfs_failaddr_t +xfs_btree_check_sblock_siblings( + struct xfs_perag *pag, + struct xfs_btree_cur *cur, + int level, + xfs_agblock_t agbno, + __be32 dsibling) +{ + xfs_agblock_t sibling; + + if (dsibling == cpu_to_be32(NULLAGBLOCK)) + return NULL; + + sibling = be32_to_cpu(dsibling); + if (sibling == agbno) + return __this_address; + if (level >= 0) { + if (!xfs_btree_check_sptr(cur, sibling, level + 1)) + return __this_address; + } else { + if (!xfs_verify_agbno(pag, sibling)) + return __this_address; + } + return NULL; +} + +/* * Check a long btree block header. Return the address of the failing check, * or NULL if everything is ok. */ @@ -62,13 +128,15 @@ __xfs_btree_check_lblock( { struct xfs_mount *mp = cur->bc_mp; xfs_btnum_t btnum = cur->bc_btnum; - int crc = xfs_sb_version_hascrc(&mp->m_sb); + int crc = xfs_has_crc(mp); + xfs_failaddr_t fa; + xfs_fsblock_t fsb = NULLFSBLOCK; if (crc) { if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (block->bb_u.l.bb_blkno != - cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL)) + cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL)) return __this_address; if (block->bb_u.l.bb_pad != cpu_to_be32(0)) return __this_address; @@ -81,16 +149,16 @@ __xfs_btree_check_lblock( if (be16_to_cpu(block->bb_numrecs) > cur->bc_ops->get_maxrecs(cur, level)) return __this_address; - if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib), - level + 1)) - return __this_address; - if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib), - level + 1)) - return __this_address; - return NULL; + if (bp) + fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); + + fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, + block->bb_u.l.bb_leftsib); + if (!fa) + fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, + block->bb_u.l.bb_rightsib); + return fa; } /* Check a long btree block header. */ @@ -126,14 +194,17 @@ __xfs_btree_check_sblock( struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; + struct xfs_perag *pag = cur->bc_ag.pag; xfs_btnum_t btnum = cur->bc_btnum; - int crc = xfs_sb_version_hascrc(&mp->m_sb); + int crc = xfs_has_crc(mp); + xfs_failaddr_t fa; + xfs_agblock_t agbno = NULLAGBLOCK; if (crc) { if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (block->bb_u.s.bb_blkno != - cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL)) + cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL)) return __this_address; } @@ -144,16 +215,16 @@ __xfs_btree_check_sblock( if (be16_to_cpu(block->bb_numrecs) > cur->bc_ops->get_maxrecs(cur, level)) return __this_address; - if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib), - level + 1)) - return __this_address; - if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib), - level + 1)) - return __this_address; - return NULL; + if (bp) + agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); + + fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno, + block->bb_u.s.bb_leftsib); + if (!fa) + fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno, + block->bb_u.s.bb_rightsib); + return fa; } /* Check a short btree block header. */ @@ -214,7 +285,7 @@ xfs_btree_check_sptr( { if (level <= 0) return false; - return xfs_verify_agbno(cur->bc_mp, cur->bc_private.a.agno, agbno); + return xfs_verify_agbno(cur->bc_ag.pag, agbno); } /* @@ -223,10 +294,10 @@ xfs_btree_check_sptr( */ static int xfs_btree_check_ptr( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr, - int index, - int level) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int index, + int level) { if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]), @@ -234,8 +305,8 @@ xfs_btree_check_ptr( return 0; xfs_err(cur->bc_mp, "Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.", - cur->bc_private.b.ip->i_ino, - cur->bc_private.b.whichfork, cur->bc_btnum, + cur->bc_ino.ip->i_ino, + cur->bc_ino.whichfork, cur->bc_btnum, level, index); } else { if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]), @@ -243,7 +314,7 @@ xfs_btree_check_ptr( return 0; xfs_err(cur->bc_mp, "AG %u: Corrupt btree %d pointer at level %d index %d.", - cur->bc_private.a.agno, cur->bc_btnum, + cur->bc_ag.pag->pag_agno, cur->bc_btnum, level, index); } @@ -271,7 +342,7 @@ xfs_btree_lblock_calc_crc( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_buf_log_item *bip = bp->b_log_item; - if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb)) + if (!xfs_has_crc(bp->b_mount)) return; if (bip) block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); @@ -285,7 +356,7 @@ xfs_btree_lblock_verify_crc( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_mount *mp = bp->b_mount; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn))) return false; return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); @@ -309,7 +380,7 @@ xfs_btree_sblock_calc_crc( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_buf_log_item *bip = bp->b_log_item; - if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb)) + if (!xfs_has_crc(bp->b_mount)) return; if (bip) block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); @@ -323,7 +394,7 @@ xfs_btree_sblock_verify_crc( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_mount *mp = bp->b_mount; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn))) return false; return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); @@ -352,37 +423,38 @@ xfs_btree_free_block( */ void xfs_btree_del_cursor( - xfs_btree_cur_t *cur, /* btree cursor */ - int error) /* del because of error */ + struct xfs_btree_cur *cur, /* btree cursor */ + int error) /* del because of error */ { - int i; /* btree level */ + int i; /* btree level */ /* - * Clear the buffer pointers, and release the buffers. - * If we're doing this in the face of an error, we - * need to make sure to inspect all of the entries - * in the bc_bufs array for buffers to be unlocked. - * This is because some of the btree code works from - * level n down to 0, and if we get an error along - * the way we won't have initialized all the entries - * down to 0. + * Clear the buffer pointers and release the buffers. If we're doing + * this because of an error, inspect all of the entries in the bc_bufs + * array for buffers to be unlocked. This is because some of the btree + * code works from level n down to 0, and if we get an error along the + * way we won't have initialized all the entries down to 0. */ for (i = 0; i < cur->bc_nlevels; i++) { - if (cur->bc_bufs[i]) - xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]); + if (cur->bc_levels[i].bp) + xfs_trans_brelse(cur->bc_tp, cur->bc_levels[i].bp); else if (!error) break; } + /* - * Can't free a bmap cursor without having dealt with the - * allocated indirect blocks' accounting. - */ - ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || - cur->bc_private.b.allocated == 0); - /* - * Free the cursor. + * If we are doing a BMBT update, the number of unaccounted blocks + * allocated during this cursor life time should be zero. If it's not + * zero, then we should be shut down or on our way to shutdown due to + * cancelling a dirty transaction on error. */ - kmem_cache_free(xfs_btree_cur_zone, cur); + ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 || + xfs_is_shutdown(cur->bc_mp) || error != 0); + if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) + kmem_free(cur->bc_ops); + if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag) + xfs_perag_put(cur->bc_ag.pag); + kmem_cache_free(cur->bc_cache, cur); } /* @@ -391,14 +463,14 @@ xfs_btree_del_cursor( */ int /* error */ xfs_btree_dup_cursor( - xfs_btree_cur_t *cur, /* input cursor */ - xfs_btree_cur_t **ncur) /* output cursor */ + struct xfs_btree_cur *cur, /* input cursor */ + struct xfs_btree_cur **ncur) /* output cursor */ { - xfs_buf_t *bp; /* btree block's buffer pointer */ + struct xfs_buf *bp; /* btree block's buffer pointer */ int error; /* error return value */ int i; /* level number of btree block */ xfs_mount_t *mp; /* mount structure for filesystem */ - xfs_btree_cur_t *new; /* new cursor value */ + struct xfs_btree_cur *new; /* new cursor value */ xfs_trans_t *tp; /* transaction pointer, can be NULL */ tp = cur->bc_tp; @@ -418,12 +490,12 @@ xfs_btree_dup_cursor( * For each level current, re-get the buffer and copy the ptr value. */ for (i = 0; i < new->bc_nlevels; i++) { - new->bc_ptrs[i] = cur->bc_ptrs[i]; - new->bc_ra[i] = cur->bc_ra[i]; - bp = cur->bc_bufs[i]; + new->bc_levels[i].ptr = cur->bc_levels[i].ptr; + new->bc_levels[i].ra = cur->bc_levels[i].ra; + bp = cur->bc_levels[i].bp; if (bp) { error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_BUF_ADDR(bp), mp->m_bsize, + xfs_buf_daddr(bp), mp->m_bsize, 0, &bp, cur->bc_ops->buf_ops); if (error) { @@ -432,7 +504,7 @@ xfs_btree_dup_cursor( return error; } } - new->bc_bufs[i] = bp; + new->bc_levels[i].bp = bp; } *ncur = new; return 0; @@ -642,6 +714,17 @@ xfs_btree_ptr_addr( ((char *)block + xfs_btree_ptr_offset(cur, n, level)); } +struct xfs_ifork * +xfs_btree_ifork_ptr( + struct xfs_btree_cur *cur) +{ + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + + if (cur->bc_flags & XFS_BTREE_STAGING) + return cur->bc_ino.ifake->if_fork; + return xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork); +} + /* * Get the root block which is stored in the inode. * @@ -652,9 +735,8 @@ STATIC struct xfs_btree_block * xfs_btree_get_iroot( struct xfs_btree_cur *cur) { - struct xfs_ifork *ifp; + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); - ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork); return (struct xfs_btree_block *)ifp->if_broot; } @@ -674,7 +756,7 @@ xfs_btree_get_block( return xfs_btree_get_iroot(cur); } - *bpp = cur->bc_bufs[level]; + *bpp = cur->bc_levels[level].bp; return XFS_BUF_TO_BLOCK(*bpp); } @@ -684,11 +766,11 @@ xfs_btree_get_block( */ STATIC int /* success=1, failure=0 */ xfs_btree_firstrec( - xfs_btree_cur_t *cur, /* btree cursor */ + struct xfs_btree_cur *cur, /* btree cursor */ int level) /* level to change */ { struct xfs_btree_block *block; /* generic btree block pointer */ - xfs_buf_t *bp; /* buffer containing block */ + struct xfs_buf *bp; /* buffer containing block */ /* * Get the block pointer for this level. @@ -704,7 +786,7 @@ xfs_btree_firstrec( /* * Set the ptr value to 1, that's the first record/key. */ - cur->bc_ptrs[level] = 1; + cur->bc_levels[level].ptr = 1; return 1; } @@ -714,11 +796,11 @@ xfs_btree_firstrec( */ STATIC int /* success=1, failure=0 */ xfs_btree_lastrec( - xfs_btree_cur_t *cur, /* btree cursor */ + struct xfs_btree_cur *cur, /* btree cursor */ int level) /* level to change */ { struct xfs_btree_block *block; /* generic btree block pointer */ - xfs_buf_t *bp; /* buffer containing block */ + struct xfs_buf *bp; /* buffer containing block */ /* * Get the block pointer for this level. @@ -734,7 +816,7 @@ xfs_btree_lastrec( /* * Set the ptr value to numrecs, that's the last record/key. */ - cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs); + cur->bc_levels[level].ptr = be16_to_cpu(block->bb_numrecs); return 1; } @@ -744,20 +826,20 @@ xfs_btree_lastrec( */ void xfs_btree_offsets( - int64_t fields, /* bitmask of fields */ + uint32_t fields, /* bitmask of fields */ const short *offsets, /* table of field offsets */ int nbits, /* number of bits to inspect */ int *first, /* output: first byte offset */ int *last) /* output: last byte offset */ { int i; /* current bit number */ - int64_t imask; /* mask for current bit number */ + uint32_t imask; /* mask for current bit number */ ASSERT(fields != 0); /* * Find the lowest bit, so the first byte offset. */ - for (i = 0, imask = 1LL; ; i++, imask <<= 1) { + for (i = 0, imask = 1u; ; i++, imask <<= 1) { if (imask & fields) { *first = offsets[i]; break; @@ -766,7 +848,7 @@ xfs_btree_offsets( /* * Find the highest bit, so the last byte offset. */ - for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) { + for (i = nbits - 1, imask = 1u << i; ; i--, imask >>= 1) { if (imask & fields) { *last = offsets[i + 1] - 1; break; @@ -881,13 +963,13 @@ xfs_btree_readahead_sblock( if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno, left, 1, cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno, right, 1, cur->bc_ops->buf_ops); rval++; } @@ -915,11 +997,11 @@ xfs_btree_readahead( (lev == cur->bc_nlevels - 1)) return 0; - if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev]) + if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra) return 0; - cur->bc_ra[lev] |= lr; - block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]); + cur->bc_levels[lev].ra |= lr; + block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp); if (cur->bc_flags & XFS_BTREE_LONG_PTRS) return xfs_btree_readahead_lblock(cur, lr, block); @@ -928,9 +1010,9 @@ xfs_btree_readahead( STATIC int xfs_btree_ptr_to_daddr( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr, - xfs_daddr_t *daddr) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + xfs_daddr_t *daddr) { xfs_fsblock_t fsbno; xfs_agblock_t agbno; @@ -945,7 +1027,7 @@ xfs_btree_ptr_to_daddr( *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno); } else { agbno = be32_to_cpu(ptr->s); - *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno, + *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno, agbno); } @@ -978,35 +1060,35 @@ xfs_btree_readahead_ptr( */ STATIC void xfs_btree_setbuf( - xfs_btree_cur_t *cur, /* btree cursor */ + struct xfs_btree_cur *cur, /* btree cursor */ int lev, /* level in btree */ - xfs_buf_t *bp) /* new buffer to set */ + struct xfs_buf *bp) /* new buffer to set */ { struct xfs_btree_block *b; /* btree block */ - if (cur->bc_bufs[lev]) - xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]); - cur->bc_bufs[lev] = bp; - cur->bc_ra[lev] = 0; + if (cur->bc_levels[lev].bp) + xfs_trans_brelse(cur->bc_tp, cur->bc_levels[lev].bp); + cur->bc_levels[lev].bp = bp; + cur->bc_levels[lev].ra = 0; b = XFS_BUF_TO_BLOCK(bp); if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)) - cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; + cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA; if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)) - cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; + cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA; } else { if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK)) - cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; + cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA; if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK)) - cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; + cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA; } } bool xfs_btree_ptr_is_null( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr) { if (cur->bc_flags & XFS_BTREE_LONG_PTRS) return ptr->l == cpu_to_be64(NULLFSBLOCK); @@ -1014,7 +1096,7 @@ xfs_btree_ptr_is_null( return ptr->s == cpu_to_be32(NULLAGBLOCK); } -STATIC void +void xfs_btree_set_ptr_null( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) @@ -1050,12 +1132,12 @@ xfs_btree_get_sibling( } } -STATIC void +void xfs_btree_set_sibling( - struct xfs_btree_cur *cur, - struct xfs_btree_block *block, - union xfs_btree_ptr *ptr, - int lr) + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + const union xfs_btree_ptr *ptr, + int lr) { ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); @@ -1083,7 +1165,7 @@ xfs_btree_init_block_int( __u64 owner, unsigned int flags) { - int crc = xfs_sb_version_hascrc(&mp->m_sb); + int crc = xfs_has_crc(mp); __u32 magic = xfs_btree_magic(crc, btnum); buf->bb_magic = cpu_to_be32(magic); @@ -1124,11 +1206,11 @@ xfs_btree_init_block( __u16 numrecs, __u64 owner) { - xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, + xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), xfs_buf_daddr(bp), btnum, level, numrecs, owner, 0); } -STATIC void +void xfs_btree_init_block_cur( struct xfs_btree_cur *cur, struct xfs_buf *bp, @@ -1144,13 +1226,13 @@ xfs_btree_init_block_cur( * code. */ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - owner = cur->bc_private.b.ip->i_ino; + owner = cur->bc_ino.ip->i_ino; else - owner = cur->bc_private.a.agno; + owner = cur->bc_ag.pag->pag_agno; - xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, - cur->bc_btnum, level, numrecs, - owner, cur->bc_flags); + xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), + xfs_buf_daddr(bp), cur->bc_btnum, level, + numrecs, owner, cur->bc_flags); } /* @@ -1185,10 +1267,10 @@ xfs_btree_buf_to_ptr( { if (cur->bc_flags & XFS_BTREE_LONG_PTRS) ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, - XFS_BUF_ADDR(bp))); + xfs_buf_daddr(bp))); else { ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp, - XFS_BUF_ADDR(bp))); + xfs_buf_daddr(bp))); } } @@ -1220,12 +1302,12 @@ xfs_btree_set_refs( } } -STATIC int +int xfs_btree_get_buf_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr, - struct xfs_btree_block **block, - struct xfs_buf **bpp) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + struct xfs_btree_block **block, + struct xfs_buf **bpp) { struct xfs_mount *mp = cur->bc_mp; xfs_daddr_t d; @@ -1250,11 +1332,11 @@ xfs_btree_get_buf_block( */ STATIC int xfs_btree_read_buf_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr, - int flags, - struct xfs_btree_block **block, - struct xfs_buf **bpp) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int flags, + struct xfs_btree_block **block, + struct xfs_buf **bpp) { struct xfs_mount *mp = cur->bc_mp; xfs_daddr_t d; @@ -1280,12 +1362,12 @@ xfs_btree_read_buf_block( /* * Copy keys from one btree block to another. */ -STATIC void +void xfs_btree_copy_keys( - struct xfs_btree_cur *cur, - union xfs_btree_key *dst_key, - union xfs_btree_key *src_key, - int numkeys) + struct xfs_btree_cur *cur, + union xfs_btree_key *dst_key, + const union xfs_btree_key *src_key, + int numkeys) { ASSERT(numkeys >= 0); memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len); @@ -1308,11 +1390,11 @@ xfs_btree_copy_recs( /* * Copy block pointers from one btree block to another. */ -STATIC void +void xfs_btree_copy_ptrs( struct xfs_btree_cur *cur, union xfs_btree_ptr *dst_ptr, - union xfs_btree_ptr *src_ptr, + const union xfs_btree_ptr *src_ptr, int numptrs) { ASSERT(numptrs >= 0); @@ -1393,8 +1475,8 @@ xfs_btree_log_keys( xfs_btree_key_offset(cur, first), xfs_btree_key_offset(cur, last + 1) - 1); } else { - xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, - xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, + xfs_ilog_fbroot(cur->bc_ino.whichfork)); } } @@ -1436,8 +1518,8 @@ xfs_btree_log_ptrs( xfs_btree_ptr_offset(cur, first, level), xfs_btree_ptr_offset(cur, last + 1, level) - 1); } else { - xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, - xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, + xfs_ilog_fbroot(cur->bc_ino.whichfork)); } } @@ -1449,7 +1531,7 @@ void xfs_btree_log_block( struct xfs_btree_cur *cur, /* btree cursor */ struct xfs_buf *bp, /* buffer containing btree block */ - int fields) /* mask of fields: XFS_BB_... */ + uint32_t fields) /* mask of fields: XFS_BB_... */ { int first; /* first byte offset logged */ int last; /* last byte offset logged */ @@ -1505,8 +1587,8 @@ xfs_btree_log_block( xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, first, last); } else { - xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, - xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, + xfs_ilog_fbroot(cur->bc_ino.whichfork)); } } @@ -1541,7 +1623,7 @@ xfs_btree_increment( #endif /* We're done if we remain in the block after the increment. */ - if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block)) + if (++cur->bc_levels[level].ptr <= xfs_btree_get_numrecs(block)) goto out1; /* Fail if we just went off the right edge of the tree. */ @@ -1564,7 +1646,7 @@ xfs_btree_increment( goto error0; #endif - if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block)) + if (++cur->bc_levels[lev].ptr <= xfs_btree_get_numrecs(block)) break; /* Read-ahead the right block for the next loop. */ @@ -1591,14 +1673,14 @@ xfs_btree_increment( for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { union xfs_btree_ptr *ptrp; - ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block); + ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block); --lev; error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); if (error) goto error0; xfs_btree_setbuf(cur, lev, bp); - cur->bc_ptrs[lev] = 1; + cur->bc_levels[lev].ptr = 1; } out1: *stat = 1; @@ -1623,7 +1705,7 @@ xfs_btree_decrement( int *stat) /* success/failure */ { struct xfs_btree_block *block; - xfs_buf_t *bp; + struct xfs_buf *bp; int error; /* error return value */ int lev; union xfs_btree_ptr ptr; @@ -1634,7 +1716,7 @@ xfs_btree_decrement( xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA); /* We're done if we remain in the block after the decrement. */ - if (--cur->bc_ptrs[level] > 0) + if (--cur->bc_levels[level].ptr > 0) goto out1; /* Get a pointer to the btree block. */ @@ -1658,7 +1740,7 @@ xfs_btree_decrement( * Stop when we don't go off the left edge of a block. */ for (lev = level + 1; lev < cur->bc_nlevels; lev++) { - if (--cur->bc_ptrs[lev] > 0) + if (--cur->bc_levels[lev].ptr > 0) break; /* Read-ahead the left block for the next loop. */ xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA); @@ -1684,13 +1766,13 @@ xfs_btree_decrement( for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { union xfs_btree_ptr *ptrp; - ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block); + ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block); --lev; error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); if (error) goto error0; xfs_btree_setbuf(cur, lev, bp); - cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block); + cur->bc_levels[lev].ptr = xfs_btree_get_numrecs(block); } out1: *stat = 1; @@ -1706,10 +1788,10 @@ error0: int xfs_btree_lookup_get_block( - struct xfs_btree_cur *cur, /* btree cursor */ - int level, /* level in the btree */ - union xfs_btree_ptr *pp, /* ptr to btree block */ - struct xfs_btree_block **blkp) /* return btree block */ + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level in the btree */ + const union xfs_btree_ptr *pp, /* ptr to btree block */ + struct xfs_btree_block **blkp) /* return btree block */ { struct xfs_buf *bp; /* buffer pointer for btree block */ xfs_daddr_t daddr; @@ -1728,11 +1810,11 @@ xfs_btree_lookup_get_block( * * Otherwise throw it away and get a new one. */ - bp = cur->bc_bufs[level]; + bp = cur->bc_levels[level].bp; error = xfs_btree_ptr_to_daddr(cur, pp, &daddr); if (error) return error; - if (bp && XFS_BUF_ADDR(bp) == daddr) { + if (bp && xfs_buf_daddr(bp) == daddr) { *blkp = XFS_BUF_TO_BLOCK(bp); return 0; } @@ -1742,11 +1824,11 @@ xfs_btree_lookup_get_block( return error; /* Check the inode owner since the verifiers don't. */ - if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) && - !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) && + if (xfs_has_crc(cur->bc_mp) && + !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) && (cur->bc_flags & XFS_BTREE_LONG_PTRS) && be64_to_cpu((*blkp)->bb_u.l.bb_owner) != - cur->bc_private.b.ip->i_ino) + cur->bc_ino.ip->i_ino) goto out_bad; /* Did we get the level we were looking for? */ @@ -1762,7 +1844,7 @@ xfs_btree_lookup_get_block( out_bad: *blkp = NULL; - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); xfs_trans_brelse(cur->bc_tp, bp); return -EFSCORRUPTED; } @@ -1857,7 +1939,7 @@ xfs_btree_lookup( return -EFSCORRUPTED; } - cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE; + cur->bc_levels[0].ptr = dir != XFS_LOOKUP_LE; *stat = 0; return 0; } @@ -1909,7 +1991,7 @@ xfs_btree_lookup( if (error) goto error0; - cur->bc_ptrs[level] = keyno; + cur->bc_levels[level].ptr = keyno; } } @@ -1926,7 +2008,7 @@ xfs_btree_lookup( !xfs_btree_ptr_is_null(cur, &ptr)) { int i; - cur->bc_ptrs[0] = keyno; + cur->bc_levels[0].ptr = keyno; error = xfs_btree_increment(cur, 0, &i); if (error) goto error0; @@ -1937,7 +2019,7 @@ xfs_btree_lookup( } } else if (dir == XFS_LOOKUP_LE && diff > 0) keyno--; - cur->bc_ptrs[0] = keyno; + cur->bc_levels[0].ptr = keyno; /* Return if we succeeded or not. */ if (keyno == 0 || keyno > xfs_btree_get_numrecs(block)) @@ -2097,7 +2179,7 @@ __xfs_btree_updkeys( if (error) return error; #endif - ptr = cur->bc_ptrs[level]; + ptr = cur->bc_levels[level].ptr; nlkey = xfs_btree_key_addr(cur, ptr, block); nhkey = xfs_btree_high_key_addr(cur, ptr, block); if (!force_all && @@ -2164,7 +2246,7 @@ xfs_btree_update_keys( if (error) return error; #endif - ptr = cur->bc_ptrs[level]; + ptr = cur->bc_levels[level].ptr; kp = xfs_btree_key_addr(cur, ptr, block); xfs_btree_copy_keys(cur, kp, &key, 1); xfs_btree_log_keys(cur, bp, ptr, ptr); @@ -2198,7 +2280,7 @@ xfs_btree_update( goto error0; #endif /* Get the address of the rec to be updated. */ - ptr = cur->bc_ptrs[0]; + ptr = cur->bc_levels[0].ptr; rp = xfs_btree_rec_addr(cur, ptr, block); /* Fill in the new contents and log them. */ @@ -2273,7 +2355,7 @@ xfs_btree_lshift( * If the cursor entry is the one that would be moved, don't * do it... it's too complicated. */ - if (cur->bc_ptrs[level] <= 1) + if (cur->bc_levels[level].ptr <= 1) goto out0; /* Set up the left neighbor as "left". */ @@ -2407,7 +2489,7 @@ xfs_btree_lshift( goto error0; /* Slide the cursor value left one. */ - cur->bc_ptrs[level]--; + cur->bc_levels[level].ptr--; *stat = 1; return 0; @@ -2469,7 +2551,7 @@ xfs_btree_rshift( * do it... it's too complicated. */ lrecs = xfs_btree_get_numrecs(left); - if (cur->bc_ptrs[level] >= lrecs) + if (cur->bc_levels[level].ptr >= lrecs) goto out0; /* Set up the right neighbor as "right". */ @@ -2657,7 +2739,7 @@ __xfs_btree_split( */ lrecs = xfs_btree_get_numrecs(left); rrecs = lrecs / 2; - if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1) + if ((lrecs & 1) && cur->bc_levels[level].ptr <= rrecs + 1) rrecs++; src_index = (lrecs - rrecs + 1); @@ -2753,9 +2835,9 @@ __xfs_btree_split( * If it's just pointing past the last entry in left, then we'll * insert there, so don't change anything in that case. */ - if (cur->bc_ptrs[level] > lrecs + 1) { + if (cur->bc_levels[level].ptr > lrecs + 1) { xfs_btree_setbuf(cur, level, rbp); - cur->bc_ptrs[level] -= lrecs; + cur->bc_levels[level].ptr -= lrecs; } /* * If there are more levels, we'll need another cursor which refers @@ -2765,7 +2847,7 @@ __xfs_btree_split( error = xfs_btree_dup_cursor(cur, curp); if (error) goto error0; - (*curp)->bc_ptrs[level + 1]++; + (*curp)->bc_levels[level + 1].ptr++; } *ptrp = rptr; *stat = 1; @@ -2778,6 +2860,7 @@ error0: return error; } +#ifdef __KERNEL__ struct xfs_btree_split_args { struct xfs_btree_cur *cur; int level; @@ -2801,7 +2884,7 @@ xfs_btree_split_worker( struct xfs_btree_split_args *args = container_of(work, struct xfs_btree_split_args, work); unsigned long pflags; - unsigned long new_pflags = PF_MEMALLOC_NOFS; + unsigned long new_pflags = 0; /* * we are in a transaction context here, but may also be doing work @@ -2810,15 +2893,23 @@ xfs_btree_split_worker( * in any way. */ if (args->kswapd) - new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; + new_pflags |= PF_MEMALLOC | PF_KSWAPD; current_set_flags_nested(&pflags, new_pflags); + xfs_trans_set_context(args->cur->bc_tp); args->result = __xfs_btree_split(args->cur, args->level, args->ptrp, args->key, args->curp, args->stat); - complete(args->done); + xfs_trans_clear_context(args->cur->bc_tp); current_restore_flags_nested(&pflags, new_pflags); + + /* + * Do not access args after complete() has run here. We don't own args + * and the owner may run and free args before we return here. + */ + complete(args->done); + } /* @@ -2855,6 +2946,9 @@ xfs_btree_split( destroy_work_on_stack(&args.work); return args.result; } +#else +#define xfs_btree_split __xfs_btree_split +#endif /* __KERNEL__ */ /* @@ -2908,16 +3002,18 @@ xfs_btree_new_iroot( */ memcpy(cblock, block, xfs_btree_block_len(cur)); if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { + __be64 bno = cpu_to_be64(xfs_buf_daddr(cbp)); if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn); + cblock->bb_u.l.bb_blkno = bno; else - cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn); + cblock->bb_u.s.bb_blkno = bno; } be16_add_cpu(&block->bb_level, 1); xfs_btree_set_numrecs(block, 1); cur->bc_nlevels++; - cur->bc_ptrs[level + 1] = 1; + ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); + cur->bc_levels[level + 1].ptr = 1; kp = xfs_btree_key_addr(cur, 1, block); ckp = xfs_btree_key_addr(cur, 1, cblock); @@ -2938,9 +3034,9 @@ xfs_btree_new_iroot( xfs_btree_copy_ptrs(cur, pp, &nptr, 1); - xfs_iroot_realloc(cur->bc_private.b.ip, + xfs_iroot_realloc(cur->bc_ino.ip, 1 - xfs_btree_get_numrecs(cblock), - cur->bc_private.b.whichfork); + cur->bc_ino.whichfork); xfs_btree_setbuf(cur, level, cbp); @@ -2953,7 +3049,7 @@ xfs_btree_new_iroot( xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); *logflags |= - XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork); + XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork); *stat = 1; return 0; error0: @@ -3078,8 +3174,9 @@ xfs_btree_new_root( /* Fix up the cursor. */ xfs_btree_setbuf(cur, cur->bc_nlevels, nbp); - cur->bc_ptrs[cur->bc_nlevels] = nptr; + cur->bc_levels[cur->bc_nlevels].ptr = nptr; cur->bc_nlevels++; + ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); *stat = 1; return 0; error0: @@ -3105,11 +3202,11 @@ xfs_btree_make_block_unfull( if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && level == cur->bc_nlevels - 1) { - struct xfs_inode *ip = cur->bc_private.b.ip; + struct xfs_inode *ip = cur->bc_ino.ip; if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { /* A root block that can be made bigger. */ - xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork); + xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork); *stat = 1; } else { /* A root block that needs replacing */ @@ -3136,7 +3233,7 @@ xfs_btree_make_block_unfull( return error; if (*stat) { - *oindex = *index = cur->bc_ptrs[level]; + *oindex = *index = cur->bc_levels[level].ptr; return 0; } @@ -3151,7 +3248,7 @@ xfs_btree_make_block_unfull( return error; - *index = cur->bc_ptrs[level]; + *index = cur->bc_levels[level].ptr; return 0; } @@ -3172,7 +3269,7 @@ xfs_btree_insrec( struct xfs_btree_block *block; /* btree block */ struct xfs_buf *bp; /* buffer for block */ union xfs_btree_ptr nptr; /* new block ptr */ - struct xfs_btree_cur *ncur; /* new btree cursor */ + struct xfs_btree_cur *ncur = NULL; /* new btree cursor */ union xfs_btree_key nkey; /* new block key */ union xfs_btree_key *lkey; int optr; /* old key/record index */ @@ -3198,7 +3295,7 @@ xfs_btree_insrec( } /* If we're off the left edge, return failure. */ - ptr = cur->bc_ptrs[level]; + ptr = cur->bc_levels[level].ptr; if (ptr == 0) { *stat = 0; return 0; @@ -3210,7 +3307,7 @@ xfs_btree_insrec( /* Get pointers to the btree buffer and block. */ block = xfs_btree_get_block(cur, level, &bp); - old_bn = bp ? bp->b_bn : XFS_BUF_DADDR_NULL; + old_bn = bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL; numrecs = xfs_btree_get_numrecs(block); #ifdef DEBUG @@ -3252,7 +3349,7 @@ xfs_btree_insrec( #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); if (error) - return error; + goto error0; #endif /* @@ -3272,7 +3369,7 @@ xfs_btree_insrec( for (i = numrecs - ptr; i >= 0; i--) { error = xfs_btree_debug_check_ptr(cur, pp, i, level); if (error) - return error; + goto error0; } xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); @@ -3326,7 +3423,7 @@ xfs_btree_insrec( * some records into the new tree block), so use the regular key * update mechanism. */ - if (bp && bp->b_bn != old_bn) { + if (bp && xfs_buf_daddr(bp) != old_bn) { xfs_btree_get_keys(cur, block, lkey); } else if (xfs_btree_needs_key_update(cur, optr)) { error = xfs_btree_update_keys(cur, level); @@ -3357,6 +3454,8 @@ xfs_btree_insrec( return 0; error0: + if (ncur) + xfs_btree_del_cursor(ncur, error); return error; } @@ -3455,9 +3554,9 @@ STATIC int xfs_btree_kill_iroot( struct xfs_btree_cur *cur) { - int whichfork = cur->bc_private.b.whichfork; - struct xfs_inode *ip = cur->bc_private.b.ip; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + int whichfork = cur->bc_ino.whichfork; + struct xfs_inode *ip = cur->bc_ino.ip; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_block *block; struct xfs_btree_block *cblock; union xfs_btree_key *kp; @@ -3514,8 +3613,8 @@ xfs_btree_kill_iroot( index = numrecs - cur->bc_ops->get_maxrecs(cur, level); if (index) { - xfs_iroot_realloc(cur->bc_private.b.ip, index, - cur->bc_private.b.whichfork); + xfs_iroot_realloc(cur->bc_ino.ip, index, + cur->bc_ino.whichfork); block = ifp->if_broot; } @@ -3541,10 +3640,10 @@ xfs_btree_kill_iroot( if (error) return error; - cur->bc_bufs[level - 1] = NULL; + cur->bc_levels[level - 1].bp = NULL; be16_add_cpu(&block->bb_level, -1); xfs_trans_log_inode(cur->bc_tp, ip, - XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork)); cur->bc_nlevels--; out0: return 0; @@ -3574,8 +3673,8 @@ xfs_btree_kill_root( if (error) return error; - cur->bc_bufs[level] = NULL; - cur->bc_ra[level] = 0; + cur->bc_levels[level].bp = NULL; + cur->bc_levels[level].ra = 0; cur->bc_nlevels--; return 0; @@ -3634,7 +3733,7 @@ xfs_btree_delrec( tcur = NULL; /* Get the index of the entry being deleted, check for nothing there. */ - ptr = cur->bc_ptrs[level]; + ptr = cur->bc_levels[level].ptr; if (ptr == 0) { *stat = 0; return 0; @@ -3712,8 +3811,8 @@ xfs_btree_delrec( */ if (level == cur->bc_nlevels - 1) { if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { - xfs_iroot_realloc(cur->bc_private.b.ip, -1, - cur->bc_private.b.whichfork); + xfs_iroot_realloc(cur->bc_ino.ip, -1, + cur->bc_ino.whichfork); error = xfs_btree_kill_iroot(cur); if (error) @@ -3944,7 +4043,7 @@ xfs_btree_delrec( xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); tcur = NULL; if (level == 0) - cur->bc_ptrs[0]++; + cur->bc_levels[0].ptr++; *stat = 1; return 0; @@ -4057,7 +4156,7 @@ xfs_btree_delrec( * surviving block, and log it. */ xfs_btree_set_numrecs(left, lrecs + rrecs); - xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB), + xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB); xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); @@ -4081,9 +4180,9 @@ xfs_btree_delrec( * cursor to the left block, and fix up the index. */ if (bp != lbp) { - cur->bc_bufs[level] = lbp; - cur->bc_ptrs[level] += lrecs; - cur->bc_ra[level] = 0; + cur->bc_levels[level].bp = lbp; + cur->bc_levels[level].ptr += lrecs; + cur->bc_levels[level].ra = 0; } /* * If we joined with the right neighbor and there's a level above @@ -4103,16 +4202,16 @@ xfs_btree_delrec( * We can't use decrement because it would change the next level up. */ if (level > 0) - cur->bc_ptrs[level]--; + cur->bc_levels[level].ptr--; /* * We combined blocks, so we have to update the parent keys if the - * btree supports overlapped intervals. However, bc_ptrs[level + 1] - * points to the old block so that the caller knows which record to - * delete. Therefore, the caller must be savvy enough to call updkeys - * for us if we return stat == 2. The other exit points from this - * function don't require deletions further up the tree, so they can - * call updkeys directly. + * btree supports overlapped intervals. However, + * bc_levels[level + 1].ptr points to the old block so that the caller + * knows which record to delete. Therefore, the caller must be savvy + * enough to call updkeys for us if we return stat == 2. The other + * exit points from this function don't require deletions further up + * the tree, so they can call updkeys directly. */ /* Return value means the next level up has something to do. */ @@ -4166,7 +4265,7 @@ xfs_btree_delete( if (i == 0) { for (level = 1; level < cur->bc_nlevels; level++) { - if (cur->bc_ptrs[level] == 0) { + if (cur->bc_levels[level].ptr == 0) { error = xfs_btree_decrement(cur, level, &i); if (error) goto error0; @@ -4197,7 +4296,7 @@ xfs_btree_get_rec( int error; /* error return value */ #endif - ptr = cur->bc_ptrs[0]; + ptr = cur->bc_levels[0].ptr; block = xfs_btree_get_block(cur, 0, &bp); #ifdef DEBUG @@ -4249,6 +4348,21 @@ xfs_btree_visit_block( if (xfs_btree_ptr_is_null(cur, &rptr)) return -ENOENT; + /* + * We only visit blocks once in this walk, so we have to avoid the + * internal xfs_btree_lookup_get_block() optimisation where it will + * return the same block without checking if the right sibling points + * back to us and creates a cyclic reference in the btree. + */ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp, + xfs_buf_daddr(bp))) + return -EFSCORRUPTED; + } else { + if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp, + xfs_buf_daddr(bp))) + return -EFSCORRUPTED; + } return xfs_btree_lookup_get_block(cur, level, &rptr, &block); } @@ -4403,11 +4517,11 @@ xfs_btree_lblock_v5hdr_verify( struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return __this_address; if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (block->bb_u.l.bb_blkno != cpu_to_be64(bp->b_bn)) + if (block->bb_u.l.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) return __this_address; if (owner != XFS_RMAP_OWN_UNKNOWN && be64_to_cpu(block->bb_u.l.bb_owner) != owner) @@ -4423,20 +4537,21 @@ xfs_btree_lblock_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_fsblock_t fsb; + xfs_failaddr_t fa; /* numrecs verification */ if (be16_to_cpu(block->bb_numrecs) > max_recs) return __this_address; /* sibling pointer verification */ - if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))) - return __this_address; - if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))) - return __this_address; - - return NULL; + fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); + fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, + block->bb_u.l.bb_leftsib); + if (!fa) + fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, + block->bb_u.l.bb_rightsib); + return fa; } /** @@ -4453,11 +4568,11 @@ xfs_btree_sblock_v5hdr_verify( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_perag *pag = bp->b_pag; - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return __this_address; if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) return __this_address; if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) return __this_address; @@ -4477,40 +4592,94 @@ xfs_btree_sblock_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - xfs_agblock_t agno; + xfs_agblock_t agbno; + xfs_failaddr_t fa; /* numrecs verification */ if (be16_to_cpu(block->bb_numrecs) > max_recs) return __this_address; /* sibling pointer verification */ - agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp)); - if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib))) - return __this_address; - if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib))) - return __this_address; - - return NULL; + agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); + fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno, + block->bb_u.s.bb_leftsib); + if (!fa) + fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno, + block->bb_u.s.bb_rightsib); + return fa; } /* - * Calculate the number of btree levels needed to store a given number of - * records in a short-format btree. + * For the given limits on leaf and keyptr records per block, calculate the + * height of the tree needed to index the number of leaf records. */ -uint +unsigned int xfs_btree_compute_maxlevels( - uint *limits, - unsigned long len) + const unsigned int *limits, + unsigned long long records) { - uint level; - unsigned long maxblocks; + unsigned long long level_blocks = howmany_64(records, limits[0]); + unsigned int height = 1; + + while (level_blocks > 1) { + level_blocks = howmany_64(level_blocks, limits[1]); + height++; + } - maxblocks = (len + limits[0] - 1) / limits[0]; - for (level = 1; maxblocks > 1; level++) - maxblocks = (maxblocks + limits[1] - 1) / limits[1]; - return level; + return height; +} + +/* + * For the given limits on leaf and keyptr records per block, calculate the + * number of blocks needed to index the given number of leaf records. + */ +unsigned long long +xfs_btree_calc_size( + const unsigned int *limits, + unsigned long long records) +{ + unsigned long long level_blocks = howmany_64(records, limits[0]); + unsigned long long blocks = level_blocks; + + while (level_blocks > 1) { + level_blocks = howmany_64(level_blocks, limits[1]); + blocks += level_blocks; + } + + return blocks; +} + +/* + * Given a number of available blocks for the btree to consume with records and + * pointers, calculate the height of the tree needed to index all the records + * that space can hold based on the number of pointers each interior node + * holds. + * + * We start by assuming a single level tree consumes a single block, then track + * the number of blocks each node level consumes until we no longer have space + * to store the next node level. At this point, we are indexing all the leaf + * blocks in the space, and there's no more free space to split the tree any + * further. That's our maximum btree height. + */ +unsigned int +xfs_btree_space_to_height( + const unsigned int *limits, + unsigned long long leaf_blocks) +{ + unsigned long long node_blocks = limits[1]; + unsigned long long blocks_left = leaf_blocks - 1; + unsigned int height = 1; + + if (leaf_blocks < 1) + return 0; + + while (node_blocks < blocks_left) { + blocks_left -= node_blocks; + node_blocks *= limits[1]; + height++; + } + + return height; } /* @@ -4521,8 +4690,8 @@ xfs_btree_compute_maxlevels( STATIC int xfs_btree_simple_query_range( struct xfs_btree_cur *cur, - union xfs_btree_key *low_key, - union xfs_btree_key *high_key, + const union xfs_btree_key *low_key, + const union xfs_btree_key *high_key, xfs_btree_query_range_fn fn, void *priv) { @@ -4612,8 +4781,8 @@ out: STATIC int xfs_btree_overlapped_query_range( struct xfs_btree_cur *cur, - union xfs_btree_key *low_key, - union xfs_btree_key *high_key, + const union xfs_btree_key *low_key, + const union xfs_btree_key *high_key, xfs_btree_query_range_fn fn, void *priv) { @@ -4645,23 +4814,25 @@ xfs_btree_overlapped_query_range( if (error) goto out; #endif - cur->bc_ptrs[level] = 1; + cur->bc_levels[level].ptr = 1; while (level < cur->bc_nlevels) { block = xfs_btree_get_block(cur, level, &bp); /* End of node, pop back towards the root. */ - if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) { + if (cur->bc_levels[level].ptr > + be16_to_cpu(block->bb_numrecs)) { pop_up: if (level < cur->bc_nlevels - 1) - cur->bc_ptrs[level + 1]++; + cur->bc_levels[level + 1].ptr++; level++; continue; } if (level == 0) { /* Handle a leaf node. */ - recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block); + recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr, + block); cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp); ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey, @@ -4684,14 +4855,15 @@ pop_up: /* Record is larger than high key; pop. */ goto pop_up; } - cur->bc_ptrs[level]++; + cur->bc_levels[level].ptr++; continue; } /* Handle an internal node. */ - lkp = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block); - hkp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block); - pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block); + lkp = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block); + hkp = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, + block); + pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block); ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key); hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp); @@ -4714,13 +4886,13 @@ pop_up: if (error) goto out; #endif - cur->bc_ptrs[level] = 1; + cur->bc_levels[level].ptr = 1; continue; } else if (hdiff < 0) { /* The low key is larger than the upper range; pop. */ goto pop_up; } - cur->bc_ptrs[level]++; + cur->bc_levels[level].ptr++; } out: @@ -4731,13 +4903,14 @@ out: * with a zero-results range query, so release the buffers if we * failed to return any results. */ - if (cur->bc_bufs[0] == NULL) { + if (cur->bc_levels[0].bp == NULL) { for (i = 0; i < cur->bc_nlevels; i++) { - if (cur->bc_bufs[i]) { - xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]); - cur->bc_bufs[i] = NULL; - cur->bc_ptrs[i] = 0; - cur->bc_ra[i] = 0; + if (cur->bc_levels[i].bp) { + xfs_trans_brelse(cur->bc_tp, + cur->bc_levels[i].bp); + cur->bc_levels[i].bp = NULL; + cur->bc_levels[i].ptr = 0; + cur->bc_levels[i].ra = 0; } } } @@ -4754,8 +4927,8 @@ out: int xfs_btree_query_range( struct xfs_btree_cur *cur, - union xfs_btree_irec *low_rec, - union xfs_btree_irec *high_rec, + const union xfs_btree_irec *low_rec, + const union xfs_btree_irec *high_rec, xfs_btree_query_range_fn fn, void *priv) { @@ -4800,29 +4973,6 @@ xfs_btree_query_all( return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv); } -/* - * Calculate the number of blocks needed to store a given number of records - * in a short-format (per-AG metadata) btree. - */ -unsigned long long -xfs_btree_calc_size( - uint *limits, - unsigned long long len) -{ - int level; - int maxrecs; - unsigned long long rval; - - maxrecs = limits[0]; - for (level = 0, rval = 0; len > 1; level++) { - len += maxrecs - 1; - do_div(len, maxrecs); - maxrecs = limits[1]; - rval += len; - } - return rval; -} - static int xfs_btree_count_blocks_helper( struct xfs_btree_cur *cur, @@ -4862,7 +5012,7 @@ xfs_btree_diff_two_ptrs( STATIC int xfs_btree_has_record_helper( struct xfs_btree_cur *cur, - union xfs_btree_rec *rec, + const union xfs_btree_rec *rec, void *priv) { return -ECANCELED; @@ -4871,12 +5021,12 @@ xfs_btree_has_record_helper( /* Is there a record covering a given range of keys? */ int xfs_btree_has_record( - struct xfs_btree_cur *cur, - union xfs_btree_irec *low, - union xfs_btree_irec *high, - bool *exists) + struct xfs_btree_cur *cur, + const union xfs_btree_irec *low, + const union xfs_btree_irec *high, + bool *exists) { - int error; + int error; error = xfs_btree_query_range(cur, low, high, &xfs_btree_has_record_helper, NULL); @@ -4899,7 +5049,7 @@ xfs_btree_has_more_records( block = xfs_btree_get_block(cur, 0, &bp); /* There are still records in this block. */ - if (cur->bc_ptrs[0] < xfs_btree_get_numrecs(block)) + if (cur->bc_levels[0].ptr < xfs_btree_get_numrecs(block)) return true; /* There are more record blocks. */ @@ -4908,3 +5058,42 @@ xfs_btree_has_more_records( else return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK); } + +/* Set up all the btree cursor caches. */ +int __init +xfs_btree_init_cur_caches(void) +{ + int error; + + error = xfs_allocbt_init_cur_cache(); + if (error) + return error; + error = xfs_inobt_init_cur_cache(); + if (error) + goto err; + error = xfs_bmbt_init_cur_cache(); + if (error) + goto err; + error = xfs_rmapbt_init_cur_cache(); + if (error) + goto err; + error = xfs_refcountbt_init_cur_cache(); + if (error) + goto err; + + return 0; +err: + xfs_btree_destroy_cur_caches(); + return error; +} + +/* Destroy all the btree cursor caches, if they've been allocated. */ +void +xfs_btree_destroy_cur_caches(void) +{ + xfs_allocbt_destroy_cur_cache(); + xfs_inobt_destroy_cur_cache(); + xfs_bmbt_destroy_cur_cache(); + xfs_rmapbt_destroy_cur_cache(); + xfs_refcountbt_destroy_cur_cache(); +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 3eff7c321d43..eef27858a013 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. @@ -10,8 +10,8 @@ struct xfs_buf; struct xfs_inode; struct xfs_mount; struct xfs_trans; - -extern kmem_zone_t *xfs_btree_cur_zone; +struct xfs_ifork; +struct xfs_perag; /* * Generic key, ptr and record wrapper structures. @@ -68,19 +68,19 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum); /* * For logging record fields. */ -#define XFS_BB_MAGIC (1 << 0) -#define XFS_BB_LEVEL (1 << 1) -#define XFS_BB_NUMRECS (1 << 2) -#define XFS_BB_LEFTSIB (1 << 3) -#define XFS_BB_RIGHTSIB (1 << 4) -#define XFS_BB_BLKNO (1 << 5) -#define XFS_BB_LSN (1 << 6) -#define XFS_BB_UUID (1 << 7) -#define XFS_BB_OWNER (1 << 8) +#define XFS_BB_MAGIC (1u << 0) +#define XFS_BB_LEVEL (1u << 1) +#define XFS_BB_NUMRECS (1u << 2) +#define XFS_BB_LEFTSIB (1u << 3) +#define XFS_BB_RIGHTSIB (1u << 4) +#define XFS_BB_BLKNO (1u << 5) +#define XFS_BB_LSN (1u << 6) +#define XFS_BB_UUID (1u << 7) +#define XFS_BB_OWNER (1u << 8) #define XFS_BB_NUM_BITS 5 -#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) +#define XFS_BB_ALL_BITS ((1u << XFS_BB_NUM_BITS) - 1) #define XFS_BB_NUM_BITS_CRC 9 -#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1) +#define XFS_BB_ALL_BITS_CRC ((1u << XFS_BB_NUM_BITS_CRC) - 1) /* * Generic stats interface @@ -90,8 +90,6 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum); #define XFS_BTREE_STATS_ADD(cur, stat, val) \ XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val) -#define XFS_BTREE_MAXLEVELS 9 /* max of all btrees */ - struct xfs_btree_ops { /* size of the key and record structures */ size_t key_len; @@ -104,19 +102,19 @@ struct xfs_btree_ops { /* update btree root pointer */ void (*set_root)(struct xfs_btree_cur *cur, - union xfs_btree_ptr *nptr, int level_change); + const union xfs_btree_ptr *nptr, int level_change); /* block allocation / freeing */ int (*alloc_block)(struct xfs_btree_cur *cur, - union xfs_btree_ptr *start_bno, + const union xfs_btree_ptr *start_bno, union xfs_btree_ptr *new_bno, int *stat); int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp); /* update last record information */ void (*update_lastrec)(struct xfs_btree_cur *cur, - struct xfs_btree_block *block, - union xfs_btree_rec *rec, + const struct xfs_btree_block *block, + const union xfs_btree_rec *rec, int ptr, int reason); /* records in block/level */ @@ -128,37 +126,37 @@ struct xfs_btree_ops { /* init values of btree structures */ void (*init_key_from_rec)(union xfs_btree_key *key, - union xfs_btree_rec *rec); + const union xfs_btree_rec *rec); void (*init_rec_from_cur)(struct xfs_btree_cur *cur, union xfs_btree_rec *rec); void (*init_ptr_from_cur)(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr); void (*init_high_key_from_rec)(union xfs_btree_key *key, - union xfs_btree_rec *rec); + const union xfs_btree_rec *rec); /* difference between key value and cursor value */ int64_t (*key_diff)(struct xfs_btree_cur *cur, - union xfs_btree_key *key); + const union xfs_btree_key *key); /* * Difference between key2 and key1 -- positive if key1 > key2, * negative if key1 < key2, and zero if equal. */ int64_t (*diff_two_keys)(struct xfs_btree_cur *cur, - union xfs_btree_key *key1, - union xfs_btree_key *key2); + const union xfs_btree_key *key1, + const union xfs_btree_key *key2); const struct xfs_buf_ops *buf_ops; /* check that k1 is lower than k2 */ int (*keys_inorder)(struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2); + const union xfs_btree_key *k1, + const union xfs_btree_key *k2); /* check that r1 is lower than r2 */ int (*recs_inorder)(struct xfs_btree_cur *cur, - union xfs_btree_rec *r1, - union xfs_btree_rec *r2); + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2); }; /* @@ -177,54 +175,94 @@ union xfs_btree_irec { struct xfs_refcount_irec rc; }; -/* Per-AG btree private information. */ -union xfs_btree_cur_private { - struct { - unsigned long nr_ops; /* # record updates */ - int shape_changes; /* # of extent splits */ - } refc; - struct { - bool active; /* allocation cursor state */ - } abt; +/* Per-AG btree information. */ +struct xfs_btree_cur_ag { + struct xfs_perag *pag; + union { + struct xfs_buf *agbp; + struct xbtree_afakeroot *afake; /* for staging cursor */ + }; + union { + struct { + unsigned int nr_ops; /* # record updates */ + unsigned int shape_changes; /* # of extent splits */ + } refc; + struct { + bool active; /* allocation cursor state */ + } abt; + }; +}; + +/* Btree-in-inode cursor information */ +struct xfs_btree_cur_ino { + struct xfs_inode *ip; + struct xbtree_ifakeroot *ifake; /* for staging cursor */ + int allocated; + short forksize; + char whichfork; + char flags; +/* We are converting a delalloc reservation */ +#define XFS_BTCUR_BMBT_WASDEL (1 << 0) + +/* For extent swap, ignore owner check in verifier */ +#define XFS_BTCUR_BMBT_INVALID_OWNER (1 << 1) +}; + +struct xfs_btree_level { + /* buffer pointer */ + struct xfs_buf *bp; + + /* key/record number */ + uint16_t ptr; + + /* readahead info */ +#define XFS_BTCUR_LEFTRA (1 << 0) /* left sibling has been read-ahead */ +#define XFS_BTCUR_RIGHTRA (1 << 1) /* right sibling has been read-ahead */ + uint16_t ra; }; /* * Btree cursor structure. * This collects all information needed by the btree code in one place. */ -typedef struct xfs_btree_cur +struct xfs_btree_cur { struct xfs_trans *bc_tp; /* transaction we're in, if any */ struct xfs_mount *bc_mp; /* file system mount struct */ const struct xfs_btree_ops *bc_ops; - uint bc_flags; /* btree features - below */ + struct kmem_cache *bc_cache; /* cursor cache */ + unsigned int bc_flags; /* btree features - below */ + xfs_btnum_t bc_btnum; /* identifies which btree type */ union xfs_btree_irec bc_rec; /* current insert/search record value */ - struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */ - int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */ - uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits */ -#define XFS_BTCUR_LEFTRA 1 /* left sibling has been read-ahead */ -#define XFS_BTCUR_RIGHTRA 2 /* right sibling has been read-ahead */ - uint8_t bc_nlevels; /* number of levels in the tree */ - uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ - xfs_btnum_t bc_btnum; /* identifies which btree type */ - int bc_statoff; /* offset of btre stats array */ + uint8_t bc_nlevels; /* number of levels in the tree */ + uint8_t bc_maxlevels; /* maximum levels for this btree type */ + int bc_statoff; /* offset of btree stats array */ + + /* + * Short btree pointers need an agno to be able to turn the pointers + * into physical addresses for IO, so the btree cursor switches between + * bc_ino and bc_ag based on whether XFS_BTREE_LONG_PTRS is set for the + * cursor. + */ union { - struct { /* needed for BNO, CNT, INO */ - struct xfs_buf *agbp; /* agf/agi buffer pointer */ - xfs_agnumber_t agno; /* ag number */ - union xfs_btree_cur_private priv; - } a; - struct { /* needed for BMAP */ - struct xfs_inode *ip; /* pointer to our inode */ - int allocated; /* count of alloced */ - short forksize; /* fork's inode space */ - char whichfork; /* data or attr fork */ - char flags; /* flags */ -#define XFS_BTCUR_BPRV_WASDEL (1<<0) /* was delayed */ -#define XFS_BTCUR_BPRV_INVALID_OWNER (1<<1) /* for ext swap */ - } b; - } bc_private; /* per-btree type data */ -} xfs_btree_cur_t; + struct xfs_btree_cur_ag bc_ag; + struct xfs_btree_cur_ino bc_ino; + }; + + /* Must be at the end of the struct! */ + struct xfs_btree_level bc_levels[]; +}; + +/* + * Compute the size of a btree cursor that can handle a btree of a given + * height. The bc_levels array handles node and leaf blocks, so its size + * is exactly nlevels. + */ +static inline size_t +xfs_btree_cur_sizeof(unsigned int nlevels) +{ + return struct_size((struct xfs_btree_cur *)NULL, bc_levels, nlevels); +} /* cursor flags */ #define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */ @@ -232,7 +270,12 @@ typedef struct xfs_btree_cur #define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */ #define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */ #define XFS_BTREE_OVERLAPPING (1<<4) /* overlapping intervals */ - +/* + * The root of this btree is a fakeroot structure so that we can stage a btree + * rebuild without leaving it accessible via primary metadata. The ops struct + * is dynamically allocated and must be freed when the cursor is deleted. + */ +#define XFS_BTREE_STAGING (1<<5) #define XFS_BTREE_NOERROR 0 #define XFS_BTREE_ERROR 1 @@ -284,7 +327,7 @@ xfs_btree_check_sptr( */ void xfs_btree_del_cursor( - xfs_btree_cur_t *cur, /* btree cursor */ + struct xfs_btree_cur *cur, /* btree cursor */ int error); /* del because of error */ /* @@ -293,8 +336,8 @@ xfs_btree_del_cursor( */ int /* error */ xfs_btree_dup_cursor( - xfs_btree_cur_t *cur, /* input cursor */ - xfs_btree_cur_t **ncur);/* output cursor */ + struct xfs_btree_cur *cur, /* input cursor */ + struct xfs_btree_cur **ncur);/* output cursor */ /* * Compute first and last byte offsets for the fields given. @@ -302,7 +345,7 @@ xfs_btree_dup_cursor( */ void xfs_btree_offsets( - int64_t fields, /* bitmask of fields */ + uint32_t fields, /* bitmask of fields */ const short *offsets,/* table of field offsets */ int nbits, /* number of bits to inspect */ int *first, /* output: first byte offset */ @@ -392,13 +435,13 @@ bool xfs_btree_sblock_verify_crc(struct xfs_buf *); /* * Internal btree helpers also used by xfs_bmap.c. */ -void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int); +void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, uint32_t); void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int); /* * Helpers. */ -static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block) +static inline int xfs_btree_get_numrecs(const struct xfs_btree_block *block) { return be16_to_cpu(block->bb_numrecs); } @@ -409,7 +452,7 @@ static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block, block->bb_numrecs = cpu_to_be16(numrecs); } -static inline int xfs_btree_get_level(struct xfs_btree_block *block) +static inline int xfs_btree_get_level(const struct xfs_btree_block *block) { return be16_to_cpu(block->bb_level); } @@ -435,8 +478,12 @@ xfs_failaddr_t xfs_btree_lblock_v5hdr_verify(struct xfs_buf *bp, xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp, unsigned int max_recs); -uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len); -unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len); +unsigned int xfs_btree_compute_maxlevels(const unsigned int *limits, + unsigned long long records); +unsigned long long xfs_btree_calc_size(const unsigned int *limits, + unsigned long long records); +unsigned int xfs_btree_space_to_height(const unsigned int *limits, + unsigned long long blocks); /* * Return codes for the query range iterator function are 0 to continue @@ -446,10 +493,11 @@ unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len); * code on its own. */ typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur, - union xfs_btree_rec *rec, void *priv); + const union xfs_btree_rec *rec, void *priv); int xfs_btree_query_range(struct xfs_btree_cur *cur, - union xfs_btree_irec *low_rec, union xfs_btree_irec *high_rec, + const union xfs_btree_irec *low_rec, + const union xfs_btree_irec *high_rec, xfs_btree_query_range_fn fn, void *priv); int xfs_btree_query_all(struct xfs_btree_cur *cur, xfs_btree_query_range_fn fn, void *priv); @@ -477,10 +525,11 @@ union xfs_btree_key *xfs_btree_high_key_addr(struct xfs_btree_cur *cur, int n, union xfs_btree_ptr *xfs_btree_ptr_addr(struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block); int xfs_btree_lookup_get_block(struct xfs_btree_cur *cur, int level, - union xfs_btree_ptr *pp, struct xfs_btree_block **blkp); + const union xfs_btree_ptr *pp, struct xfs_btree_block **blkp); struct xfs_btree_block *xfs_btree_get_block(struct xfs_btree_cur *cur, int level, struct xfs_buf **bpp); -bool xfs_btree_ptr_is_null(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr); +bool xfs_btree_ptr_is_null(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr); int64_t xfs_btree_diff_two_ptrs(struct xfs_btree_cur *cur, const union xfs_btree_ptr *a, const union xfs_btree_ptr *b); @@ -491,14 +540,16 @@ void xfs_btree_get_keys(struct xfs_btree_cur *cur, struct xfs_btree_block *block, union xfs_btree_key *key); union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur, union xfs_btree_key *key); -int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low, - union xfs_btree_irec *high, bool *exists); +int xfs_btree_has_record(struct xfs_btree_cur *cur, + const union xfs_btree_irec *low, + const union xfs_btree_irec *high, bool *exists); bool xfs_btree_has_more_records(struct xfs_btree_cur *cur); +struct xfs_ifork *xfs_btree_ifork_ptr(struct xfs_btree_cur *cur); /* Does this cursor point to the last block in the given level? */ static inline bool xfs_btree_islastblock( - xfs_btree_cur_t *cur, + struct xfs_btree_cur *cur, int level) { struct xfs_btree_block *block; @@ -512,4 +563,44 @@ xfs_btree_islastblock( return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK); } +void xfs_btree_set_ptr_null(struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr); +int xfs_btree_get_buf_block(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, struct xfs_btree_block **block, + struct xfs_buf **bpp); +void xfs_btree_set_sibling(struct xfs_btree_cur *cur, + struct xfs_btree_block *block, const union xfs_btree_ptr *ptr, + int lr); +void xfs_btree_init_block_cur(struct xfs_btree_cur *cur, + struct xfs_buf *bp, int level, int numrecs); +void xfs_btree_copy_ptrs(struct xfs_btree_cur *cur, + union xfs_btree_ptr *dst_ptr, + const union xfs_btree_ptr *src_ptr, int numptrs); +void xfs_btree_copy_keys(struct xfs_btree_cur *cur, + union xfs_btree_key *dst_key, + const union xfs_btree_key *src_key, int numkeys); + +static inline struct xfs_btree_cur * +xfs_btree_alloc_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_btnum_t btnum, + uint8_t maxlevels, + struct kmem_cache *cache) +{ + struct xfs_btree_cur *cur; + + cur = kmem_cache_zalloc(cache, GFP_NOFS | __GFP_NOFAIL); + cur->bc_tp = tp; + cur->bc_mp = mp; + cur->bc_btnum = btnum; + cur->bc_maxlevels = maxlevels; + cur->bc_cache = cache; + + return cur; +} + +int __init xfs_btree_init_cur_caches(void); +void xfs_btree_destroy_cur_caches(void); + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c new file mode 100644 index 000000000000..dd75e208b543 --- /dev/null +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -0,0 +1,880 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2020 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_btree.h" +#include "xfs_trace.h" +#include "xfs_btree_staging.h" + +/* + * Staging Cursors and Fake Roots for Btrees + * ========================================= + * + * A staging btree cursor is a special type of btree cursor that callers must + * use to construct a new btree index using the btree bulk loader code. The + * bulk loading code uses the staging btree cursor to abstract the details of + * initializing new btree blocks and filling them with records or key/ptr + * pairs. Regular btree operations (e.g. queries and modifications) are not + * supported with staging cursors, and callers must not invoke them. + * + * Fake root structures contain all the information about a btree that is under + * construction by the bulk loading code. Staging btree cursors point to fake + * root structures instead of the usual AG header or inode structure. + * + * Callers are expected to initialize a fake root structure and pass it into + * the _stage_cursor function for a specific btree type. When bulk loading is + * complete, callers should call the _commit_staged_btree function for that + * specific btree type to commit the new btree into the filesystem. + */ + +/* + * Don't allow staging cursors to be duplicated because they're supposed to be + * kept private to a single thread. + */ +STATIC struct xfs_btree_cur * +xfs_btree_fakeroot_dup_cursor( + struct xfs_btree_cur *cur) +{ + ASSERT(0); + return NULL; +} + +/* + * Don't allow block allocation for a staging cursor, because staging cursors + * do not support regular btree modifications. + * + * Bulk loading uses a separate callback to obtain new blocks from a + * preallocated list, which prevents ENOSPC failures during loading. + */ +STATIC int +xfs_btree_fakeroot_alloc_block( + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start_bno, + union xfs_btree_ptr *new_bno, + int *stat) +{ + ASSERT(0); + return -EFSCORRUPTED; +} + +/* + * Don't allow block freeing for a staging cursor, because staging cursors + * do not support regular btree modifications. + */ +STATIC int +xfs_btree_fakeroot_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + ASSERT(0); + return -EFSCORRUPTED; +} + +/* Initialize a pointer to the root block from the fakeroot. */ +STATIC void +xfs_btree_fakeroot_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + struct xbtree_afakeroot *afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + afake = cur->bc_ag.afake; + ptr->s = cpu_to_be32(afake->af_root); +} + +/* + * Bulk Loading for AG Btrees + * ========================== + * + * For a btree rooted in an AG header, pass a xbtree_afakeroot structure to the + * staging cursor. Callers should initialize this to zero. + * + * The _stage_cursor() function for a specific btree type should call + * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging + * cursor. The corresponding _commit_staged_btree() function should log the + * new root and call xfs_btree_commit_afakeroot() to transform the staging + * cursor into a regular btree cursor. + */ + +/* Update the btree root information for a per-AG fake root. */ +STATIC void +xfs_btree_afakeroot_set_root( + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int inc) +{ + struct xbtree_afakeroot *afake = cur->bc_ag.afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + afake->af_root = be32_to_cpu(ptr->s); + afake->af_levels += inc; +} + +/* + * Initialize a AG-rooted btree cursor with the given AG btree fake root. + * The btree cursor's bc_ops will be overridden as needed to make the staging + * functionality work. + */ +void +xfs_btree_stage_afakeroot( + struct xfs_btree_cur *cur, + struct xbtree_afakeroot *afake) +{ + struct xfs_btree_ops *nops; + + ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING)); + ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)); + ASSERT(cur->bc_tp == NULL); + + nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS); + memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops)); + nops->alloc_block = xfs_btree_fakeroot_alloc_block; + nops->free_block = xfs_btree_fakeroot_free_block; + nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur; + nops->set_root = xfs_btree_afakeroot_set_root; + nops->dup_cursor = xfs_btree_fakeroot_dup_cursor; + + cur->bc_ag.afake = afake; + cur->bc_nlevels = afake->af_levels; + cur->bc_ops = nops; + cur->bc_flags |= XFS_BTREE_STAGING; +} + +/* + * Transform an AG-rooted staging btree cursor back into a regular cursor by + * substituting a real btree root for the fake one and restoring normal btree + * cursor ops. The caller must log the btree root change prior to calling + * this. + */ +void +xfs_btree_commit_afakeroot( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + struct xfs_buf *agbp, + const struct xfs_btree_ops *ops) +{ + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + ASSERT(cur->bc_tp == NULL); + + trace_xfs_btree_commit_afakeroot(cur); + + kmem_free((void *)cur->bc_ops); + cur->bc_ag.agbp = agbp; + cur->bc_ops = ops; + cur->bc_flags &= ~XFS_BTREE_STAGING; + cur->bc_tp = tp; +} + +/* + * Bulk Loading for Inode-Rooted Btrees + * ==================================== + * + * For a btree rooted in an inode fork, pass a xbtree_ifakeroot structure to + * the staging cursor. This structure should be initialized as follows: + * + * - if_fork_size field should be set to the number of bytes available to the + * fork in the inode. + * + * - if_fork should point to a freshly allocated struct xfs_ifork. + * + * - if_format should be set to the appropriate fork type (e.g. + * XFS_DINODE_FMT_BTREE). + * + * All other fields must be zero. + * + * The _stage_cursor() function for a specific btree type should call + * xfs_btree_stage_ifakeroot to set up the in-memory cursor as a staging + * cursor. The corresponding _commit_staged_btree() function should log the + * new root and call xfs_btree_commit_ifakeroot() to transform the staging + * cursor into a regular btree cursor. + */ + +/* + * Initialize an inode-rooted btree cursor with the given inode btree fake + * root. The btree cursor's bc_ops will be overridden as needed to make the + * staging functionality work. If new_ops is not NULL, these new ops will be + * passed out to the caller for further overriding. + */ +void +xfs_btree_stage_ifakeroot( + struct xfs_btree_cur *cur, + struct xbtree_ifakeroot *ifake, + struct xfs_btree_ops **new_ops) +{ + struct xfs_btree_ops *nops; + + ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING)); + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(cur->bc_tp == NULL); + + nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS); + memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops)); + nops->alloc_block = xfs_btree_fakeroot_alloc_block; + nops->free_block = xfs_btree_fakeroot_free_block; + nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur; + nops->dup_cursor = xfs_btree_fakeroot_dup_cursor; + + cur->bc_ino.ifake = ifake; + cur->bc_nlevels = ifake->if_levels; + cur->bc_ops = nops; + cur->bc_flags |= XFS_BTREE_STAGING; + + if (new_ops) + *new_ops = nops; +} + +/* + * Transform an inode-rooted staging btree cursor back into a regular cursor by + * substituting a real btree root for the fake one and restoring normal btree + * cursor ops. The caller must log the btree root change prior to calling + * this. + */ +void +xfs_btree_commit_ifakeroot( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + int whichfork, + const struct xfs_btree_ops *ops) +{ + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + ASSERT(cur->bc_tp == NULL); + + trace_xfs_btree_commit_ifakeroot(cur); + + kmem_free((void *)cur->bc_ops); + cur->bc_ino.ifake = NULL; + cur->bc_ino.whichfork = whichfork; + cur->bc_ops = ops; + cur->bc_flags &= ~XFS_BTREE_STAGING; + cur->bc_tp = tp; +} + +/* + * Bulk Loading of Staged Btrees + * ============================= + * + * This interface is used with a staged btree cursor to create a totally new + * btree with a large number of records (i.e. more than what would fit in a + * single root block). When the creation is complete, the new root can be + * linked atomically into the filesystem by committing the staged cursor. + * + * Creation of a new btree proceeds roughly as follows: + * + * The first step is to initialize an appropriate fake btree root structure and + * then construct a staged btree cursor. Refer to the block comments about + * "Bulk Loading for AG Btrees" and "Bulk Loading for Inode-Rooted Btrees" for + * more information about how to do this. + * + * The second step is to initialize a struct xfs_btree_bload context as + * documented in the structure definition. + * + * The third step is to call xfs_btree_bload_compute_geometry to compute the + * height of and the number of blocks needed to construct the btree. See the + * section "Computing the Geometry of the New Btree" for details about this + * computation. + * + * In step four, the caller must allocate xfs_btree_bload.nr_blocks blocks and + * save them for later use by ->claim_block(). Bulk loading requires all + * blocks to be allocated beforehand to avoid ENOSPC failures midway through a + * rebuild, and to minimize seek distances of the new btree. + * + * Step five is to call xfs_btree_bload() to start constructing the btree. + * + * The final step is to commit the staging btree cursor, which logs the new + * btree root and turns the staging cursor into a regular cursor. The caller + * is responsible for cleaning up the previous btree blocks, if any. + * + * Computing the Geometry of the New Btree + * ======================================= + * + * The number of items placed in each btree block is computed via the following + * algorithm: For leaf levels, the number of items for the level is nr_records + * in the bload structure. For node levels, the number of items for the level + * is the number of blocks in the next lower level of the tree. For each + * level, the desired number of items per block is defined as: + * + * desired = max(minrecs, maxrecs - slack factor) + * + * The number of blocks for the level is defined to be: + * + * blocks = floor(nr_items / desired) + * + * Note this is rounded down so that the npb calculation below will never fall + * below minrecs. The number of items that will actually be loaded into each + * btree block is defined as: + * + * npb = nr_items / blocks + * + * Some of the leftmost blocks in the level will contain one extra record as + * needed to handle uneven division. If the number of records in any block + * would exceed maxrecs for that level, blocks is incremented and npb is + * recalculated. + * + * In other words, we compute the number of blocks needed to satisfy a given + * loading level, then spread the items as evenly as possible. + * + * The height and number of fs blocks required to create the btree are computed + * and returned via btree_height and nr_blocks. + */ + +/* + * Put a btree block that we're loading onto the ordered list and release it. + * The btree blocks will be written to disk when bulk loading is finished. + */ +static void +xfs_btree_bload_drop_buf( + struct list_head *buffers_list, + struct xfs_buf **bpp) +{ + if (*bpp == NULL) + return; + + if (!xfs_buf_delwri_queue(*bpp, buffers_list)) + ASSERT(0); + + xfs_buf_relse(*bpp); + *bpp = NULL; +} + +/* + * Allocate and initialize one btree block for bulk loading. + * + * The new btree block will have its level and numrecs fields set to the values + * of the level and nr_this_block parameters, respectively. + * + * The caller should ensure that ptrp, bpp, and blockp refer to the left + * sibling of the new block, if there is any. On exit, ptrp, bpp, and blockp + * will all point to the new block. + */ +STATIC int +xfs_btree_bload_prep_block( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + struct list_head *buffers_list, + unsigned int level, + unsigned int nr_this_block, + union xfs_btree_ptr *ptrp, /* in/out */ + struct xfs_buf **bpp, /* in/out */ + struct xfs_btree_block **blockp, /* in/out */ + void *priv) +{ + union xfs_btree_ptr new_ptr; + struct xfs_buf *new_bp; + struct xfs_btree_block *new_block; + int ret; + + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + size_t new_size; + + ASSERT(*bpp == NULL); + + /* Allocate a new incore btree root block. */ + new_size = bbl->iroot_size(cur, nr_this_block, priv); + ifp->if_broot = kmem_zalloc(new_size, 0); + ifp->if_broot_bytes = (int)new_size; + + /* Initialize it and send it out. */ + xfs_btree_init_block_int(cur->bc_mp, ifp->if_broot, + XFS_BUF_DADDR_NULL, cur->bc_btnum, level, + nr_this_block, cur->bc_ino.ip->i_ino, + cur->bc_flags); + + *bpp = NULL; + *blockp = ifp->if_broot; + xfs_btree_set_ptr_null(cur, ptrp); + return 0; + } + + /* Claim one of the caller's preallocated blocks. */ + xfs_btree_set_ptr_null(cur, &new_ptr); + ret = bbl->claim_block(cur, &new_ptr, priv); + if (ret) + return ret; + + ASSERT(!xfs_btree_ptr_is_null(cur, &new_ptr)); + + ret = xfs_btree_get_buf_block(cur, &new_ptr, &new_block, &new_bp); + if (ret) + return ret; + + /* + * The previous block (if any) is the left sibling of the new block, + * so set its right sibling pointer to the new block and drop it. + */ + if (*blockp) + xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB); + xfs_btree_bload_drop_buf(buffers_list, bpp); + + /* Initialize the new btree block. */ + xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block); + xfs_btree_set_sibling(cur, new_block, ptrp, XFS_BB_LEFTSIB); + + /* Set the out parameters. */ + *bpp = new_bp; + *blockp = new_block; + xfs_btree_copy_ptrs(cur, ptrp, &new_ptr, 1); + return 0; +} + +/* Load one leaf block. */ +STATIC int +xfs_btree_bload_leaf( + struct xfs_btree_cur *cur, + unsigned int recs_this_block, + xfs_btree_bload_get_record_fn get_record, + struct xfs_btree_block *block, + void *priv) +{ + unsigned int j; + int ret; + + /* Fill the leaf block with records. */ + for (j = 1; j <= recs_this_block; j++) { + union xfs_btree_rec *block_rec; + + ret = get_record(cur, priv); + if (ret) + return ret; + block_rec = xfs_btree_rec_addr(cur, j, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return 0; +} + +/* + * Load one node block with key/ptr pairs. + * + * child_ptr must point to a block within the next level down in the tree. A + * key/ptr entry will be created in the new node block to the block pointed to + * by child_ptr. On exit, child_ptr points to the next block on the child + * level that needs processing. + */ +STATIC int +xfs_btree_bload_node( + struct xfs_btree_cur *cur, + unsigned int recs_this_block, + union xfs_btree_ptr *child_ptr, + struct xfs_btree_block *block) +{ + unsigned int j; + int ret; + + /* Fill the node block with keys and pointers. */ + for (j = 1; j <= recs_this_block; j++) { + union xfs_btree_key child_key; + union xfs_btree_ptr *block_ptr; + union xfs_btree_key *block_key; + struct xfs_btree_block *child_block; + struct xfs_buf *child_bp; + + ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr)); + + ret = xfs_btree_get_buf_block(cur, child_ptr, &child_block, + &child_bp); + if (ret) + return ret; + + block_ptr = xfs_btree_ptr_addr(cur, j, block); + xfs_btree_copy_ptrs(cur, block_ptr, child_ptr, 1); + + block_key = xfs_btree_key_addr(cur, j, block); + xfs_btree_get_keys(cur, child_block, &child_key); + xfs_btree_copy_keys(cur, block_key, &child_key, 1); + + xfs_btree_get_sibling(cur, child_block, child_ptr, + XFS_BB_RIGHTSIB); + xfs_buf_relse(child_bp); + } + + return 0; +} + +/* + * Compute the maximum number of records (or keyptrs) per block that we want to + * install at this level in the btree. Caller is responsible for having set + * @cur->bc_ino.forksize to the desired fork size, if appropriate. + */ +STATIC unsigned int +xfs_btree_bload_max_npb( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + unsigned int level) +{ + unsigned int ret; + + if (level == cur->bc_nlevels - 1 && cur->bc_ops->get_dmaxrecs) + return cur->bc_ops->get_dmaxrecs(cur, level); + + ret = cur->bc_ops->get_maxrecs(cur, level); + if (level == 0) + ret -= bbl->leaf_slack; + else + ret -= bbl->node_slack; + return ret; +} + +/* + * Compute the desired number of records (or keyptrs) per block that we want to + * install at this level in the btree, which must be somewhere between minrecs + * and max_npb. The caller is free to install fewer records per block. + */ +STATIC unsigned int +xfs_btree_bload_desired_npb( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + unsigned int level) +{ + unsigned int npb = xfs_btree_bload_max_npb(cur, bbl, level); + + /* Root blocks are not subject to minrecs rules. */ + if (level == cur->bc_nlevels - 1) + return max(1U, npb); + + return max_t(unsigned int, cur->bc_ops->get_minrecs(cur, level), npb); +} + +/* + * Compute the number of records to be stored in each block at this level and + * the number of blocks for this level. For leaf levels, we must populate an + * empty root block even if there are no records, so we have to have at least + * one block. + */ +STATIC void +xfs_btree_bload_level_geometry( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + unsigned int level, + uint64_t nr_this_level, + unsigned int *avg_per_block, + uint64_t *blocks, + uint64_t *blocks_with_extra) +{ + uint64_t npb; + uint64_t dontcare; + unsigned int desired_npb; + unsigned int maxnr; + + maxnr = cur->bc_ops->get_maxrecs(cur, level); + + /* + * Compute the number of blocks we need to fill each block with the + * desired number of records/keyptrs per block. Because desired_npb + * could be minrecs, we use regular integer division (which rounds + * the block count down) so that in the next step the effective # of + * items per block will never be less than desired_npb. + */ + desired_npb = xfs_btree_bload_desired_npb(cur, bbl, level); + *blocks = div64_u64_rem(nr_this_level, desired_npb, &dontcare); + *blocks = max(1ULL, *blocks); + + /* + * Compute the number of records that we will actually put in each + * block, assuming that we want to spread the records evenly between + * the blocks. Take care that the effective # of items per block (npb) + * won't exceed maxrecs even for the blocks that get an extra record, + * since desired_npb could be maxrecs, and in the previous step we + * rounded the block count down. + */ + npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra); + if (npb > maxnr || (npb == maxnr && *blocks_with_extra > 0)) { + (*blocks)++; + npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra); + } + + *avg_per_block = min_t(uint64_t, npb, nr_this_level); + + trace_xfs_btree_bload_level_geometry(cur, level, nr_this_level, + *avg_per_block, desired_npb, *blocks, + *blocks_with_extra); +} + +/* + * Ensure a slack value is appropriate for the btree. + * + * If the slack value is negative, set slack so that we fill the block to + * halfway between minrecs and maxrecs. Make sure the slack is never so large + * that we can underflow minrecs. + */ +static void +xfs_btree_bload_ensure_slack( + struct xfs_btree_cur *cur, + int *slack, + int level) +{ + int maxr; + int minr; + + maxr = cur->bc_ops->get_maxrecs(cur, level); + minr = cur->bc_ops->get_minrecs(cur, level); + + /* + * If slack is negative, automatically set slack so that we load the + * btree block approximately halfway between minrecs and maxrecs. + * Generally, this will net us 75% loading. + */ + if (*slack < 0) + *slack = maxr - ((maxr + minr) >> 1); + + *slack = min(*slack, maxr - minr); +} + +/* + * Prepare a btree cursor for a bulk load operation by computing the geometry + * fields in bbl. Caller must ensure that the btree cursor is a staging + * cursor. This function can be called multiple times. + */ +int +xfs_btree_bload_compute_geometry( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + uint64_t nr_records) +{ + uint64_t nr_blocks = 0; + uint64_t nr_this_level; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + /* + * Make sure that the slack values make sense for traditional leaf and + * node blocks. Inode-rooted btrees will return different minrecs and + * maxrecs values for the root block (bc_nlevels == level - 1). We're + * checking levels 0 and 1 here, so set bc_nlevels such that the btree + * code doesn't interpret either as the root level. + */ + cur->bc_nlevels = cur->bc_maxlevels - 1; + xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0); + xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1); + + bbl->nr_records = nr_this_level = nr_records; + for (cur->bc_nlevels = 1; cur->bc_nlevels <= cur->bc_maxlevels;) { + uint64_t level_blocks; + uint64_t dontcare64; + unsigned int level = cur->bc_nlevels - 1; + unsigned int avg_per_block; + + xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, + &avg_per_block, &level_blocks, &dontcare64); + + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + /* + * If all the items we want to store at this level + * would fit in the inode root block, then we have our + * btree root and are done. + * + * Note that bmap btrees forbid records in the root. + */ + if (level != 0 && nr_this_level <= avg_per_block) { + nr_blocks++; + break; + } + + /* + * Otherwise, we have to store all the items for this + * level in traditional btree blocks and therefore need + * another level of btree to point to those blocks. + * + * We have to re-compute the geometry for each level of + * an inode-rooted btree because the geometry differs + * between a btree root in an inode fork and a + * traditional btree block. + * + * This distinction is made in the btree code based on + * whether level == bc_nlevels - 1. Based on the + * previous root block size check against the root + * block geometry, we know that we aren't yet ready to + * populate the root. Increment bc_nevels and + * recalculate the geometry for a traditional + * block-based btree level. + */ + cur->bc_nlevels++; + ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); + xfs_btree_bload_level_geometry(cur, bbl, level, + nr_this_level, &avg_per_block, + &level_blocks, &dontcare64); + } else { + /* + * If all the items we want to store at this level + * would fit in a single root block, we're done. + */ + if (nr_this_level <= avg_per_block) { + nr_blocks++; + break; + } + + /* Otherwise, we need another level of btree. */ + cur->bc_nlevels++; + ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); + } + + nr_blocks += level_blocks; + nr_this_level = level_blocks; + } + + if (cur->bc_nlevels > cur->bc_maxlevels) + return -EOVERFLOW; + + bbl->btree_height = cur->bc_nlevels; + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + bbl->nr_blocks = nr_blocks - 1; + else + bbl->nr_blocks = nr_blocks; + return 0; +} + +/* Bulk load a btree given the parameters and geometry established in bbl. */ +int +xfs_btree_bload( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + void *priv) +{ + struct list_head buffers_list; + union xfs_btree_ptr child_ptr; + union xfs_btree_ptr ptr; + struct xfs_buf *bp = NULL; + struct xfs_btree_block *block = NULL; + uint64_t nr_this_level = bbl->nr_records; + uint64_t blocks; + uint64_t i; + uint64_t blocks_with_extra; + uint64_t total_blocks = 0; + unsigned int avg_per_block; + unsigned int level = 0; + int ret; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + INIT_LIST_HEAD(&buffers_list); + cur->bc_nlevels = bbl->btree_height; + xfs_btree_set_ptr_null(cur, &child_ptr); + xfs_btree_set_ptr_null(cur, &ptr); + + xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, + &avg_per_block, &blocks, &blocks_with_extra); + + /* Load each leaf block. */ + for (i = 0; i < blocks; i++) { + unsigned int nr_this_block = avg_per_block; + + /* + * Due to rounding, btree blocks will not be evenly populated + * in most cases. blocks_with_extra tells us how many blocks + * will receive an extra record to distribute the excess across + * the current level as evenly as possible. + */ + if (i < blocks_with_extra) + nr_this_block++; + + ret = xfs_btree_bload_prep_block(cur, bbl, &buffers_list, level, + nr_this_block, &ptr, &bp, &block, priv); + if (ret) + goto out; + + trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr, + nr_this_block); + + ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_record, + block, priv); + if (ret) + goto out; + + /* + * Record the leftmost leaf pointer so we know where to start + * with the first node level. + */ + if (i == 0) + xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1); + } + total_blocks += blocks; + xfs_btree_bload_drop_buf(&buffers_list, &bp); + + /* Populate the internal btree nodes. */ + for (level = 1; level < cur->bc_nlevels; level++) { + union xfs_btree_ptr first_ptr; + + nr_this_level = blocks; + block = NULL; + xfs_btree_set_ptr_null(cur, &ptr); + + xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, + &avg_per_block, &blocks, &blocks_with_extra); + + /* Load each node block. */ + for (i = 0; i < blocks; i++) { + unsigned int nr_this_block = avg_per_block; + + if (i < blocks_with_extra) + nr_this_block++; + + ret = xfs_btree_bload_prep_block(cur, bbl, + &buffers_list, level, nr_this_block, + &ptr, &bp, &block, priv); + if (ret) + goto out; + + trace_xfs_btree_bload_block(cur, level, i, blocks, + &ptr, nr_this_block); + + ret = xfs_btree_bload_node(cur, nr_this_block, + &child_ptr, block); + if (ret) + goto out; + + /* + * Record the leftmost node pointer so that we know + * where to start the next node level above this one. + */ + if (i == 0) + xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1); + } + total_blocks += blocks; + xfs_btree_bload_drop_buf(&buffers_list, &bp); + xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1); + } + + /* Initialize the new root. */ + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); + cur->bc_ino.ifake->if_levels = cur->bc_nlevels; + cur->bc_ino.ifake->if_blocks = total_blocks - 1; + } else { + cur->bc_ag.afake->af_root = be32_to_cpu(ptr.s); + cur->bc_ag.afake->af_levels = cur->bc_nlevels; + cur->bc_ag.afake->af_blocks = total_blocks; + } + + /* + * Write the new blocks to disk. If the ordered list isn't empty after + * that, then something went wrong and we have to fail. This should + * never happen, but we'll check anyway. + */ + ret = xfs_buf_delwri_submit(&buffers_list); + if (ret) + goto out; + if (!list_empty(&buffers_list)) { + ASSERT(list_empty(&buffers_list)); + ret = -EIO; + } + +out: + xfs_buf_delwri_cancel(&buffers_list); + if (bp) + xfs_buf_relse(bp); + return ret; +} diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h new file mode 100644 index 000000000000..f0d2976050ae --- /dev/null +++ b/fs/xfs/libxfs/xfs_btree_staging.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2020 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#ifndef __XFS_BTREE_STAGING_H__ +#define __XFS_BTREE_STAGING_H__ + +/* Fake root for an AG-rooted btree. */ +struct xbtree_afakeroot { + /* AG block number of the new btree root. */ + xfs_agblock_t af_root; + + /* Height of the new btree. */ + unsigned int af_levels; + + /* Number of blocks used by the btree. */ + unsigned int af_blocks; +}; + +/* Cursor interactions with fake roots for AG-rooted btrees. */ +void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur, + struct xbtree_afakeroot *afake); +void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp, + struct xfs_buf *agbp, const struct xfs_btree_ops *ops); + +/* Fake root for an inode-rooted btree. */ +struct xbtree_ifakeroot { + /* Fake inode fork. */ + struct xfs_ifork *if_fork; + + /* Number of blocks used by the btree. */ + int64_t if_blocks; + + /* Height of the new btree. */ + unsigned int if_levels; + + /* Number of bytes available for this fork in the inode. */ + unsigned int if_fork_size; + + /* Fork format. */ + unsigned int if_format; + + /* Number of records. */ + unsigned int if_extents; +}; + +/* Cursor interactions with fake roots for inode-rooted btrees. */ +void xfs_btree_stage_ifakeroot(struct xfs_btree_cur *cur, + struct xbtree_ifakeroot *ifake, + struct xfs_btree_ops **new_ops); +void xfs_btree_commit_ifakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp, + int whichfork, const struct xfs_btree_ops *ops); + +/* Bulk loading of staged btrees. */ +typedef int (*xfs_btree_bload_get_record_fn)(struct xfs_btree_cur *cur, void *priv); +typedef int (*xfs_btree_bload_claim_block_fn)(struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, void *priv); +typedef size_t (*xfs_btree_bload_iroot_size_fn)(struct xfs_btree_cur *cur, + unsigned int nr_this_level, void *priv); + +struct xfs_btree_bload { + /* + * This function will be called nr_records times to load records into + * the btree. The function does this by setting the cursor's bc_rec + * field in in-core format. Records must be returned in sort order. + */ + xfs_btree_bload_get_record_fn get_record; + + /* + * This function will be called nr_blocks times to obtain a pointer + * to a new btree block on disk. Callers must preallocate all space + * for the new btree before calling xfs_btree_bload, and this function + * is what claims that reservation. + */ + xfs_btree_bload_claim_block_fn claim_block; + + /* + * This function should return the size of the in-core btree root + * block. It is only necessary for XFS_BTREE_ROOT_IN_INODE btree + * types. + */ + xfs_btree_bload_iroot_size_fn iroot_size; + + /* + * The caller should set this to the number of records that will be + * stored in the new btree. + */ + uint64_t nr_records; + + /* + * Number of free records to leave in each leaf block. If the caller + * sets this to -1, the slack value will be calculated to be halfway + * between maxrecs and minrecs. This typically leaves the block 75% + * full. Note that slack values are not enforced on inode root blocks. + */ + int leaf_slack; + + /* + * Number of free key/ptrs pairs to leave in each node block. This + * field has the same semantics as leaf_slack. + */ + int node_slack; + + /* + * The xfs_btree_bload_compute_geometry function will set this to the + * number of btree blocks needed to store nr_records records. + */ + uint64_t nr_blocks; + + /* + * The xfs_btree_bload_compute_geometry function will set this to the + * height of the new btree. + */ + unsigned int btree_height; +}; + +int xfs_btree_bload_compute_geometry(struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, uint64_t nr_records); +int xfs_btree_bload(struct xfs_btree_cur *cur, struct xfs_btree_bload *bbl, + void *priv); + +#endif /* __XFS_BTREE_STAGING_H__ */ diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 875e04f82541..e576560b46e9 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -22,6 +22,7 @@ #include "xfs_trace.h" #include "xfs_buf_item.h" #include "xfs_log.h" +#include "xfs_errortag.h" /* * xfs_da_btree.c @@ -72,16 +73,22 @@ STATIC int xfs_da3_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *save_blk); -kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */ +struct kmem_cache *xfs_da_state_cache; /* anchor for dir/attr state */ /* * Allocate a dir-state structure. * We don't put them on the stack since they're large. */ -xfs_da_state_t * -xfs_da_state_alloc(void) +struct xfs_da_state * +xfs_da_state_alloc( + struct xfs_da_args *args) { - return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS); + struct xfs_da_state *state; + + state = kmem_cache_zalloc(xfs_da_state_cache, GFP_NOFS | __GFP_NOFAIL); + state->args = args; + state->mp = args->dp->i_mount; + return state; } /* @@ -107,7 +114,18 @@ xfs_da_state_free(xfs_da_state_t *state) #ifdef DEBUG memset((char *)state, 0, sizeof(*state)); #endif /* DEBUG */ - kmem_cache_free(xfs_da_state_zone, state); + kmem_cache_free(xfs_da_state_cache, state); +} + +void +xfs_da_state_reset( + struct xfs_da_state *state, + struct xfs_da_args *args) +{ + xfs_da_state_kill_altpath(state); + memset(state, 0, sizeof(struct xfs_da_state)); + state->args = args; + state->mp = state->args->dp->i_mount; } static inline int xfs_dabuf_nfsb(struct xfs_mount *mp, int whichfork) @@ -123,7 +141,7 @@ xfs_da3_node_hdr_from_disk( struct xfs_da3_icnode_hdr *to, struct xfs_da_intnode *from) { - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_da3_intnode *from3 = (struct xfs_da3_intnode *)from; to->forw = be32_to_cpu(from3->hdr.info.hdr.forw); @@ -150,7 +168,7 @@ xfs_da3_node_hdr_to_disk( struct xfs_da_intnode *to, struct xfs_da3_icnode_hdr *from) { - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_da3_intnode *to3 = (struct xfs_da3_intnode *)to; ASSERT(from->magic == XFS_DA3_NODE_MAGIC); @@ -185,10 +203,10 @@ xfs_da3_blkinfo_verify( if (!xfs_verify_magic16(bp, hdr->magic)) return __this_address; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp)) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; @@ -247,7 +265,7 @@ xfs_da3_node_write_verify( return; } - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (bip) @@ -276,7 +294,7 @@ xfs_da3_node_read_verify( __this_address); break; } - /* fall through */ + fallthrough; case XFS_DA_NODE_MAGIC: fa = xfs_da3_node_verify(bp); if (fa) @@ -436,12 +454,12 @@ xfs_da3_node_create( xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); node = bp->b_addr; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_da3_node_hdr *hdr3 = bp->b_addr; memset(hdr3, 0, sizeof(struct xfs_da3_node_hdr)); ichdr.magic = XFS_DA3_NODE_MAGIC; - hdr3->info.blkno = cpu_to_be64(bp->b_bn); + hdr3->info.blkno = cpu_to_be64(xfs_buf_daddr(bp)); hdr3->info.owner = cpu_to_be64(args->dp->i_ino); uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid); } else { @@ -476,6 +494,9 @@ xfs_da3_split( trace_xfs_da_split(state->args); + if (XFS_TEST_ERROR(false, state->mp, XFS_ERRTAG_DA_LEAF_SPLIT)) + return -EIO; + /* * Walk back up the tree splitting/inserting/adjusting as necessary. * If we need to insert and there isn't room, split the node, then @@ -590,7 +611,7 @@ xfs_da3_split( node = oldblk->bp->b_addr; if (node->hdr.info.forw) { if (be32_to_cpu(node->hdr.info.forw) != addblk->blkno) { - xfs_buf_corruption_error(oldblk->bp); + xfs_buf_mark_corrupt(oldblk->bp); error = -EFSCORRUPTED; goto out; } @@ -603,7 +624,7 @@ xfs_da3_split( node = oldblk->bp->b_addr; if (node->hdr.info.back) { if (be32_to_cpu(node->hdr.info.back) != addblk->blkno) { - xfs_buf_corruption_error(oldblk->bp); + xfs_buf_mark_corrupt(oldblk->bp); error = -EFSCORRUPTED; goto out; } @@ -705,7 +726,7 @@ xfs_da3_root_split( oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node; - node3->hdr.info.blkno = cpu_to_be64(bp->b_bn); + node3->hdr.info.blkno = cpu_to_be64(xfs_buf_daddr(bp)); } xfs_trans_log_buf(tp, bp, 0, size - 1); @@ -858,7 +879,6 @@ xfs_da3_node_rebalance( { struct xfs_da_intnode *node1; struct xfs_da_intnode *node2; - struct xfs_da_intnode *tmpnode; struct xfs_da_node_entry *btree1; struct xfs_da_node_entry *btree2; struct xfs_da_node_entry *btree_s; @@ -888,9 +908,7 @@ xfs_da3_node_rebalance( ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) || (be32_to_cpu(btree2[nodehdr2.count - 1].hashval) < be32_to_cpu(btree1[nodehdr1.count - 1].hashval)))) { - tmpnode = node1; - node1 = node2; - node2 = tmpnode; + swap(node1, node2); xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr1, node1); xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr2, node2); btree1 = nodehdr1.btree; @@ -1213,7 +1231,7 @@ xfs_da3_root_join( xfs_trans_buf_copy_type(root_blk->bp, bp); if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) { struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr; - da3->blkno = cpu_to_be64(root_blk->bp->b_bn); + da3->blkno = cpu_to_be64(xfs_buf_daddr(root_blk->bp)); } xfs_trans_log_buf(args->trans, root_blk->bp, 0, args->geo->blksize - 1); @@ -1624,7 +1642,7 @@ xfs_da3_node_lookup_int( } if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC) { - xfs_buf_corruption_error(blk->bp); + xfs_buf_mark_corrupt(blk->bp); return -EFSCORRUPTED; } @@ -1639,7 +1657,7 @@ xfs_da3_node_lookup_int( /* Tree taller than we can handle; bail out! */ if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) { - xfs_buf_corruption_error(blk->bp); + xfs_buf_mark_corrupt(blk->bp); return -EFSCORRUPTED; } @@ -1647,7 +1665,7 @@ xfs_da3_node_lookup_int( if (blkno == args->geo->leafblk) expected_level = nodehdr.level - 1; else if (expected_level != nodehdr.level) { - xfs_buf_corruption_error(blk->bp); + xfs_buf_mark_corrupt(blk->bp); return -EFSCORRUPTED; } else expected_level--; @@ -1986,7 +2004,8 @@ xfs_da3_path_shift( ASSERT(path != NULL); ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); level = (path->active-1) - 1; /* skip bottom layer in path */ - for (blk = &path->blk[level]; level >= 0; blk--, level--) { + for (; level >= 0; level--) { + blk = &path->blk[level]; xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, blk->bp->b_addr); @@ -2138,7 +2157,7 @@ xfs_da_grow_inode_int( struct xfs_trans *tp = args->trans; struct xfs_inode *dp = args->dp; int w = args->whichfork; - xfs_rfsblock_t nblks = dp->i_d.di_nblocks; + xfs_rfsblock_t nblks = dp->i_nblocks; struct xfs_bmbt_irec map, *mapp; int nmap, error, got, i, mapi; @@ -2173,8 +2192,8 @@ xfs_da_grow_inode_int( */ mapp = kmem_alloc(sizeof(*mapp) * count, 0); for (b = *bno, mapi = 0; b < *bno + count; ) { - nmap = min(XFS_BMAP_MAX_NMAP, count); c = (int)(*bno + count - b); + nmap = min(XFS_BMAP_MAX_NMAP, c); error = xfs_bmapi_write(tp, dp, b, c, xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, args->total, &mapp[mapi], &nmap); @@ -2204,7 +2223,7 @@ xfs_da_grow_inode_int( } /* account for newly allocated blocks in reserved blocks total */ - args->total -= dp->i_d.di_nblocks - nblks; + args->total -= dp->i_nblocks - nblks; out_free_map: if (mapp != &map) @@ -2520,8 +2539,10 @@ xfs_dabuf_map( */ if (nirecs > 1) { map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_NOFS); - if (!map) + if (!map) { + error = -ENOMEM; goto out_free_irecs; + } *mapp = map; } diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 0f4fbb0889ff..ffa3df5b2893 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. @@ -9,7 +9,6 @@ struct xfs_inode; struct xfs_trans; -struct zone; /* * Directory/attribute geometry information. There will be one of these for each @@ -31,6 +30,7 @@ struct xfs_da_geometry { unsigned int free_hdr_size; /* dir2 free header size */ unsigned int free_max_bests; /* # of bests entries in dir2 free */ xfs_dablk_t freeblk; /* blockno of free data v2 */ + xfs_extnum_t max_extents; /* Max. extents in corresponding fork */ xfs_dir2_data_aoff_t data_first_offset; size_t data_entry_offset; @@ -57,9 +57,10 @@ typedef struct xfs_da_args { const uint8_t *name; /* string (maybe not NULL terminated) */ int namelen; /* length of string (maybe no NULL) */ uint8_t filetype; /* filetype of inode for directories */ - uint8_t *value; /* set of bytes (maybe contain NULLs) */ + void *value; /* set of bytes (maybe contain NULLs) */ int valuelen; /* length of value */ - int flags; /* argument flags (eg: ATTR_NOCREATE) */ + unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE,INCOMPLETE} */ + unsigned int attr_flags; /* XATTR_{CREATE,REPLACE} */ xfs_dahash_t hashval; /* hash value of name */ xfs_ino_t inumber; /* input/output inode number */ struct xfs_inode *dp; /* directory inode to manipulate */ @@ -76,29 +77,33 @@ typedef struct xfs_da_args { xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ int rmtblkcnt2; /* remote attr value block count */ int rmtvaluelen2; /* remote attr value length in bytes */ - int op_flags; /* operation flags */ + uint32_t op_flags; /* operation flags */ enum xfs_dacmp cmpresult; /* name compare result for lookups */ } xfs_da_args_t; /* * Operation flags: */ -#define XFS_DA_OP_JUSTCHECK 0x0001 /* check for ok with no space */ -#define XFS_DA_OP_RENAME 0x0002 /* this is an atomic rename op */ -#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */ -#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ -#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ -#define XFS_DA_OP_ALLOCVAL 0x0020 /* lookup to alloc buffer if found */ -#define XFS_DA_OP_INCOMPLETE 0x0040 /* lookup INCOMPLETE attr keys */ +#define XFS_DA_OP_JUSTCHECK (1u << 0) /* check for ok with no space */ +#define XFS_DA_OP_REPLACE (1u << 1) /* this is an atomic replace op */ +#define XFS_DA_OP_ADDNAME (1u << 2) /* this is an add operation */ +#define XFS_DA_OP_OKNOENT (1u << 3) /* lookup op, ENOENT ok, else die */ +#define XFS_DA_OP_CILOOKUP (1u << 4) /* lookup returns CI name if found */ +#define XFS_DA_OP_NOTIME (1u << 5) /* don't update inode timestamps */ +#define XFS_DA_OP_REMOVE (1u << 6) /* this is a remove operation */ +#define XFS_DA_OP_RECOVERY (1u << 7) /* Log recovery operation */ +#define XFS_DA_OP_LOGGED (1u << 8) /* Use intent items to track op */ #define XFS_DA_OP_FLAGS \ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ - { XFS_DA_OP_RENAME, "RENAME" }, \ + { XFS_DA_OP_REPLACE, "REPLACE" }, \ { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ - { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" }, \ - { XFS_DA_OP_INCOMPLETE, "INCOMPLETE" } + { XFS_DA_OP_NOTIME, "NOTIME" }, \ + { XFS_DA_OP_REMOVE, "REMOVE" }, \ + { XFS_DA_OP_RECOVERY, "RECOVERY" }, \ + { XFS_DA_OP_LOGGED, "LOGGED" } /* * Storage for holding state during Btree searches and split/join ops. @@ -199,7 +204,7 @@ int xfs_da3_node_read_mapped(struct xfs_trans *tp, struct xfs_inode *dp, * Utility routines. */ -#define XFS_DABUF_MAP_HOLE_OK (1 << 0) +#define XFS_DABUF_MAP_HOLE_OK (1u << 0) int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno); int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno, @@ -220,14 +225,15 @@ enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, const unsigned char *name, int len); -xfs_da_state_t *xfs_da_state_alloc(void); +struct xfs_da_state *xfs_da_state_alloc(struct xfs_da_args *args); void xfs_da_state_free(xfs_da_state_t *state); +void xfs_da_state_reset(struct xfs_da_state *state, struct xfs_da_args *args); void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp, struct xfs_da3_icnode_hdr *to, struct xfs_da_intnode *from); void xfs_da3_node_hdr_to_disk(struct xfs_mount *mp, struct xfs_da_intnode *to, struct xfs_da3_icnode_hdr *from); -extern struct kmem_zone *xfs_da_state_zone; +extern struct kmem_cache *xfs_da_state_cache; #endif /* __XFS_DA_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 734837a9b51a..25e2841084e1 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. @@ -15,8 +15,8 @@ */ #define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */ #define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */ -#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */ -#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */ +#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */ +#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */ typedef struct xfs_da_blkinfo { __be32 forw; /* previous block in list */ @@ -35,8 +35,8 @@ typedef struct xfs_da_blkinfo { */ #define XFS_DA3_NODE_MAGIC 0x3ebe /* magic number: non-leaf blocks */ #define XFS_ATTR3_LEAF_MAGIC 0x3bee /* magic number: attribute leaf blks */ -#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v2 dirlf single blks */ -#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v2 dirlf multi blks */ +#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v3 dirlf single blks */ +#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v3 dirlf multi blks */ struct xfs_da3_blkinfo { /* @@ -61,7 +61,7 @@ struct xfs_da3_blkinfo { * Since we have duplicate keys, use a binary search but always follow * all match in the block, not just the first match found. */ -#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */ +#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */ typedef struct xfs_da_node_hdr { struct xfs_da_blkinfo info; /* block type, links, etc. */ @@ -277,6 +277,7 @@ xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr) * Directory address space divided into sections, * spaces separated by 32GB. */ +#define XFS_DIR2_MAX_SPACES 3 #define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG)) #define XFS_DIR2_DATA_SPACE 0 #define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE) @@ -579,7 +580,7 @@ xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp) /* * Entries are packed toward the top as tight as possible. */ -typedef struct xfs_attr_shortform { +struct xfs_attr_shortform { struct xfs_attr_sf_hdr { /* constant-structure header block */ __be16 totsize; /* total bytes in shortform list */ __u8 count; /* count of active entries */ @@ -589,9 +590,9 @@ typedef struct xfs_attr_shortform { uint8_t namelen; /* actual length of name (no NULL) */ uint8_t valuelen; /* actual length of value (no NULL) */ uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ - uint8_t nameval[1]; /* name & value bytes concatenated */ + uint8_t nameval[]; /* name & value bytes concatenated */ } list[1]; /* variable sized array */ -} xfs_attr_shortform_t; +}; typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ __be16 base; /* base of free region */ @@ -688,23 +689,11 @@ struct xfs_attr3_leafblock { #define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ #define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */ #define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */ -#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT) -#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT) -#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT) -#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT) - -/* - * Conversion macros for converting namespace bits from argument flags - * to ondisk flags. - */ -#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE) +#define XFS_ATTR_LOCAL (1u << XFS_ATTR_LOCAL_BIT) +#define XFS_ATTR_ROOT (1u << XFS_ATTR_ROOT_BIT) +#define XFS_ATTR_SECURE (1u << XFS_ATTR_SECURE_BIT) +#define XFS_ATTR_INCOMPLETE (1u << XFS_ATTR_INCOMPLETE_BIT) #define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE) -#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK) -#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK) -#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\ - ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0)) -#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\ - ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0)) /* * Alignment for namelist and valuelist entries (since they are mixed @@ -758,14 +747,14 @@ xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx) */ static inline int xfs_attr_leaf_entsize_remote(int nlen) { - return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \ - XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); + return round_up(sizeof(struct xfs_attr_leaf_name_remote) - 1 + + nlen, XFS_ATTR_LEAF_NAME_ALIGN); } static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen) { - return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) + - XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); + return round_up(sizeof(struct xfs_attr_leaf_name_local) - 1 + + nlen + vlen, XFS_ATTR_LEAF_NAME_ALIGN); } static inline int xfs_attr_leaf_entsize_local_max(int bsize) @@ -801,7 +790,7 @@ struct xfs_attr3_rmt_hdr { #define XFS_ATTR3_RMT_CRC_OFF offsetof(struct xfs_attr3_rmt_hdr, rm_crc) #define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize) \ - ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \ + ((bufsize) - (xfs_has_crc((mp)) ? \ sizeof(struct xfs_attr3_rmt_hdr) : 0)) /* Number of bytes in a directory block. */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 22557527cfdb..5a321b783398 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -16,6 +16,18 @@ #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_log.h" +#include "xfs_rmap.h" +#include "xfs_refcount.h" +#include "xfs_bmap.h" +#include "xfs_alloc.h" +#include "xfs_buf.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" + +static struct kmem_cache *xfs_defer_pending_cache; /* * Deferred Operations in XFS @@ -176,29 +188,61 @@ static const struct xfs_defer_op_type *defer_op_types[] = { [XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type, [XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type, [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type, + [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type, }; /* + * Ensure there's a log intent item associated with this deferred work item if + * the operation must be restarted on crash. Returns 1 if there's a log item; + * 0 if there isn't; or a negative errno. + */ +static int +xfs_defer_create_intent( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp, + bool sort) +{ + const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + struct xfs_log_item *lip; + + if (dfp->dfp_intent) + return 1; + + lip = ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, sort); + if (!lip) + return 0; + if (IS_ERR(lip)) + return PTR_ERR(lip); + + dfp->dfp_intent = lip; + return 1; +} + +/* * For each pending item in the intake list, log its intent item and the * associated extents, then add the entire intake list to the end of * the pending list. + * + * Returns 1 if at least one log item was associated with the deferred work; + * 0 if there are no log items; or a negative errno. */ -STATIC void +static int xfs_defer_create_intents( struct xfs_trans *tp) { - struct list_head *li; struct xfs_defer_pending *dfp; - const struct xfs_defer_op_type *ops; + int ret = 0; list_for_each_entry(dfp, &tp->t_dfops, dfp_list) { - ops = defer_op_types[dfp->dfp_type]; - dfp->dfp_intent = ops->create_intent(tp, dfp->dfp_count); + int ret2; + trace_xfs_defer_create_intent(tp->t_mountp, dfp); - list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items); - list_for_each(li, &dfp->dfp_work) - ops->log_item(tp, dfp->dfp_intent, li); + ret2 = xfs_defer_create_intent(tp, dfp, true); + if (ret2 < 0) + return ret2; + ret |= ret2; } + return ret; } /* Abort all the intents that were committed. */ @@ -223,20 +267,20 @@ xfs_defer_trans_abort( } } -/* Roll a transaction so we can do some deferred op processing. */ -STATIC int -xfs_defer_trans_roll( - struct xfs_trans **tpp) +/* + * Capture resources that the caller said not to release ("held") when the + * transaction commits. Caller is responsible for zero-initializing @dres. + */ +static int +xfs_defer_save_resources( + struct xfs_defer_resources *dres, + struct xfs_trans *tp) { - struct xfs_trans *tp = *tpp; struct xfs_buf_log_item *bli; struct xfs_inode_log_item *ili; struct xfs_log_item *lip; - struct xfs_buf *bplist[XFS_DEFER_OPS_NR_BUFS]; - struct xfs_inode *iplist[XFS_DEFER_OPS_NR_INODES]; - int bpcount = 0, ipcount = 0; - int i; - int error; + + BUILD_BUG_ON(NBBY * sizeof(dres->dr_ordered) < XFS_DEFER_OPS_NR_BUFS); list_for_each_entry(lip, &tp->t_items, li_trans) { switch (lip->li_type) { @@ -244,25 +288,29 @@ xfs_defer_trans_roll( bli = container_of(lip, struct xfs_buf_log_item, bli_item); if (bli->bli_flags & XFS_BLI_HOLD) { - if (bpcount >= XFS_DEFER_OPS_NR_BUFS) { + if (dres->dr_bufs >= XFS_DEFER_OPS_NR_BUFS) { ASSERT(0); return -EFSCORRUPTED; } - xfs_trans_dirty_buf(tp, bli->bli_buf); - bplist[bpcount++] = bli->bli_buf; + if (bli->bli_flags & XFS_BLI_ORDERED) + dres->dr_ordered |= + (1U << dres->dr_bufs); + else + xfs_trans_dirty_buf(tp, bli->bli_buf); + dres->dr_bp[dres->dr_bufs++] = bli->bli_buf; } break; case XFS_LI_INODE: ili = container_of(lip, struct xfs_inode_log_item, ili_item); if (ili->ili_lock_flags == 0) { - if (ipcount >= XFS_DEFER_OPS_NR_INODES) { + if (dres->dr_inos >= XFS_DEFER_OPS_NR_INODES) { ASSERT(0); return -EFSCORRUPTED; } xfs_trans_log_inode(tp, ili->ili_inode, XFS_ILOG_CORE); - iplist[ipcount++] = ili->ili_inode; + dres->dr_ip[dres->dr_inos++] = ili->ili_inode; } break; default: @@ -270,7 +318,43 @@ xfs_defer_trans_roll( } } - trace_xfs_defer_trans_roll(tp, _RET_IP_); + return 0; +} + +/* Attach the held resources to the transaction. */ +static void +xfs_defer_restore_resources( + struct xfs_trans *tp, + struct xfs_defer_resources *dres) +{ + unsigned short i; + + /* Rejoin the joined inodes. */ + for (i = 0; i < dres->dr_inos; i++) + xfs_trans_ijoin(tp, dres->dr_ip[i], 0); + + /* Rejoin the buffers and dirty them so the log moves forward. */ + for (i = 0; i < dres->dr_bufs; i++) { + xfs_trans_bjoin(tp, dres->dr_bp[i]); + if (dres->dr_ordered & (1U << i)) + xfs_trans_ordered_buf(tp, dres->dr_bp[i]); + xfs_trans_bhold(tp, dres->dr_bp[i]); + } +} + +/* Roll a transaction so we can do some deferred op processing. */ +STATIC int +xfs_defer_trans_roll( + struct xfs_trans **tpp) +{ + struct xfs_defer_resources dres = { }; + int error; + + error = xfs_defer_save_resources(&dres, *tpp); + if (error) + return error; + + trace_xfs_defer_trans_roll(*tpp, _RET_IP_); /* * Roll the transaction. Rolling always given a new transaction (even @@ -280,40 +364,15 @@ xfs_defer_trans_roll( * happened. */ error = xfs_trans_roll(tpp); - tp = *tpp; - /* Rejoin the joined inodes. */ - for (i = 0; i < ipcount; i++) - xfs_trans_ijoin(tp, iplist[i], 0); - - /* Rejoin the buffers and dirty them so the log moves forward. */ - for (i = 0; i < bpcount; i++) { - xfs_trans_bjoin(tp, bplist[i]); - xfs_trans_bhold(tp, bplist[i]); - } + xfs_defer_restore_resources(*tpp, &dres); if (error) - trace_xfs_defer_trans_roll_error(tp, error); + trace_xfs_defer_trans_roll_error(*tpp, error); return error; } /* - * Reset an already used dfops after finish. - */ -static void -xfs_defer_reset( - struct xfs_trans *tp) -{ - ASSERT(list_empty(&tp->t_dfops)); - - /* - * Low mode state transfers across transaction rolls to mirror dfops - * lifetime. Clear it now that dfops is reset. - */ - tp->t_flags &= ~XFS_TRANS_LOWMODE; -} - -/* * Free up any items left in the list. */ static void @@ -341,8 +400,112 @@ xfs_defer_cancel_list( ops->cancel_item(pwi); } ASSERT(dfp->dfp_count == 0); - kmem_free(dfp); + kmem_cache_free(xfs_defer_pending_cache, dfp); + } +} + +/* + * Prevent a log intent item from pinning the tail of the log by logging a + * done item to release the intent item; and then log a new intent item. + * The caller should provide a fresh transaction and roll it after we're done. + */ +static int +xfs_defer_relog( + struct xfs_trans **tpp, + struct list_head *dfops) +{ + struct xlog *log = (*tpp)->t_mountp->m_log; + struct xfs_defer_pending *dfp; + xfs_lsn_t threshold_lsn = NULLCOMMITLSN; + + + ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES); + + list_for_each_entry(dfp, dfops, dfp_list) { + /* + * If the log intent item for this deferred op is not a part of + * the current log checkpoint, relog the intent item to keep + * the log tail moving forward. We're ok with this being racy + * because an incorrect decision means we'll be a little slower + * at pushing the tail. + */ + if (dfp->dfp_intent == NULL || + xfs_log_item_in_current_chkpt(dfp->dfp_intent)) + continue; + + /* + * Figure out where we need the tail to be in order to maintain + * the minimum required free space in the log. Only sample + * the log threshold once per call. + */ + if (threshold_lsn == NULLCOMMITLSN) { + threshold_lsn = xlog_grant_push_threshold(log, 0); + if (threshold_lsn == NULLCOMMITLSN) + break; + } + if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0) + continue; + + trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp); + XFS_STATS_INC((*tpp)->t_mountp, defer_relog); + dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp); + } + + if ((*tpp)->t_flags & XFS_TRANS_DIRTY) + return xfs_defer_trans_roll(tpp); + return 0; +} + +/* + * Log an intent-done item for the first pending intent, and finish the work + * items. + */ +static int +xfs_defer_finish_one( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + struct xfs_btree_cur *state = NULL; + struct list_head *li, *n; + int error; + + trace_xfs_defer_pending_finish(tp->t_mountp, dfp); + + dfp->dfp_done = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); + list_for_each_safe(li, n, &dfp->dfp_work) { + list_del(li); + dfp->dfp_count--; + error = ops->finish_item(tp, dfp->dfp_done, li, &state); + if (error == -EAGAIN) { + int ret; + + /* + * Caller wants a fresh transaction; put the work item + * back on the list and log a new log intent item to + * replace the old one. See "Requesting a Fresh + * Transaction while Finishing Deferred Work" above. + */ + list_add(li, &dfp->dfp_work); + dfp->dfp_count++; + dfp->dfp_done = NULL; + dfp->dfp_intent = NULL; + ret = xfs_defer_create_intent(tp, dfp, false); + if (ret < 0) + error = ret; + } + + if (error) + goto out; } + + /* Done with the dfp, free it. */ + list_del(&dfp->dfp_list); + kmem_cache_free(xfs_defer_pending_cache, dfp); +out: + if (ops->finish_cleanup) + ops->finish_cleanup(tp, state, error); + return error; } /* @@ -357,12 +520,8 @@ int xfs_defer_finish_noroll( struct xfs_trans **tp) { - struct xfs_defer_pending *dfp; - struct list_head *li; - struct list_head *n; - void *state; + struct xfs_defer_pending *dfp = NULL; int error = 0; - const struct xfs_defer_op_type *ops; LIST_HEAD(dop_pending); ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); @@ -371,87 +530,51 @@ xfs_defer_finish_noroll( /* Until we run out of pending work to finish... */ while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) { - /* log intents and pull in intake items */ - xfs_defer_create_intents(*tp); - list_splice_tail_init(&(*tp)->t_dfops, &dop_pending); - /* - * Roll the transaction. + * Deferred items that are created in the process of finishing + * other deferred work items should be queued at the head of + * the pending list, which puts them ahead of the deferred work + * that was created by the caller. This keeps the number of + * pending work items to a minimum, which decreases the amount + * of time that any one intent item can stick around in memory, + * pinning the log tail. */ - error = xfs_defer_trans_roll(tp); - if (error) - goto out; + int has_intents = xfs_defer_create_intents(*tp); - /* Log an intent-done item for the first pending item. */ - dfp = list_first_entry(&dop_pending, struct xfs_defer_pending, - dfp_list); - ops = defer_op_types[dfp->dfp_type]; - trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp); - dfp->dfp_done = ops->create_done(*tp, dfp->dfp_intent, - dfp->dfp_count); - - /* Finish the work items. */ - state = NULL; - list_for_each_safe(li, n, &dfp->dfp_work) { - list_del(li); - dfp->dfp_count--; - error = ops->finish_item(*tp, li, dfp->dfp_done, - &state); - if (error == -EAGAIN) { - /* - * Caller wants a fresh transaction; - * put the work item back on the list - * and jump out. - */ - list_add(li, &dfp->dfp_work); - dfp->dfp_count++; - break; - } else if (error) { - /* - * Clean up after ourselves and jump out. - * xfs_defer_cancel will take care of freeing - * all these lists and stuff. - */ - if (ops->finish_cleanup) - ops->finish_cleanup(*tp, state, error); - goto out; - } + list_splice_init(&(*tp)->t_dfops, &dop_pending); + + if (has_intents < 0) { + error = has_intents; + goto out_shutdown; } - if (error == -EAGAIN) { - /* - * Caller wants a fresh transaction, so log a - * new log intent item to replace the old one - * and roll the transaction. See "Requesting - * a Fresh Transaction while Finishing - * Deferred Work" above. - */ - dfp->dfp_intent = ops->create_intent(*tp, - dfp->dfp_count); - dfp->dfp_done = NULL; - list_for_each(li, &dfp->dfp_work) - ops->log_item(*tp, dfp->dfp_intent, li); - } else { - /* Done with the dfp, free it. */ - list_del(&dfp->dfp_list); - kmem_free(dfp); + if (has_intents || dfp) { + error = xfs_defer_trans_roll(tp); + if (error) + goto out_shutdown; + + /* Relog intent items to keep the log moving. */ + error = xfs_defer_relog(tp, &dop_pending); + if (error) + goto out_shutdown; } - if (ops->finish_cleanup) - ops->finish_cleanup(*tp, state, error); - } - -out: - if (error) { - xfs_defer_trans_abort(*tp, &dop_pending); - xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE); - trace_xfs_defer_finish_error(*tp, error); - xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending); - xfs_defer_cancel(*tp); - return error; + dfp = list_first_entry(&dop_pending, struct xfs_defer_pending, + dfp_list); + error = xfs_defer_finish_one(*tp, dfp); + if (error && error != -EAGAIN) + goto out_shutdown; } trace_xfs_defer_finish_done(*tp, _RET_IP_); return 0; + +out_shutdown: + xfs_defer_trans_abort(*tp, &dop_pending); + xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE); + trace_xfs_defer_finish_error(*tp, error); + xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending); + xfs_defer_cancel(*tp); + return error; } int @@ -475,7 +598,10 @@ xfs_defer_finish( return error; } } - xfs_defer_reset(*tp); + + /* Reset LOWMODE now that we've finished all the dfops. */ + ASSERT(list_empty(&(*tp)->t_dfops)); + (*tp)->t_flags &= ~XFS_TRANS_LOWMODE; return 0; } @@ -516,8 +642,8 @@ xfs_defer_add( dfp = NULL; } if (!dfp) { - dfp = kmem_alloc(sizeof(struct xfs_defer_pending), - KM_NOFS); + dfp = kmem_cache_zalloc(xfs_defer_pending_cache, + GFP_NOFS | __GFP_NOFAIL); dfp->dfp_type = type; dfp->dfp_intent = NULL; dfp->dfp_done = NULL; @@ -549,6 +675,256 @@ xfs_defer_move( * that behavior. */ dtp->t_flags |= (stp->t_flags & XFS_TRANS_LOWMODE); + stp->t_flags &= ~XFS_TRANS_LOWMODE; +} + +/* + * Prepare a chain of fresh deferred ops work items to be completed later. Log + * recovery requires the ability to put off until later the actual finishing + * work so that it can process unfinished items recovered from the log in + * correct order. + * + * Create and log intent items for all the work that we're capturing so that we + * can be assured that the items will get replayed if the system goes down + * before log recovery gets a chance to finish the work it put off. The entire + * deferred ops state is transferred to the capture structure and the + * transaction is then ready for the caller to commit it. If there are no + * intent items to capture, this function returns NULL. + * + * If capture_ip is not NULL, the capture structure will obtain an extra + * reference to the inode. + */ +static struct xfs_defer_capture * +xfs_defer_ops_capture( + struct xfs_trans *tp) +{ + struct xfs_defer_capture *dfc; + unsigned short i; + int error; + + if (list_empty(&tp->t_dfops)) + return NULL; + + error = xfs_defer_create_intents(tp); + if (error < 0) + return ERR_PTR(error); + + /* Create an object to capture the defer ops. */ + dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS); + INIT_LIST_HEAD(&dfc->dfc_list); + INIT_LIST_HEAD(&dfc->dfc_dfops); + + /* Move the dfops chain and transaction state to the capture struct. */ + list_splice_init(&tp->t_dfops, &dfc->dfc_dfops); + dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE; + tp->t_flags &= ~XFS_TRANS_LOWMODE; + + /* Capture the remaining block reservations along with the dfops. */ + dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used; + dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used; + + /* Preserve the log reservation size. */ + dfc->dfc_logres = tp->t_log_res; + + error = xfs_defer_save_resources(&dfc->dfc_held, tp); + if (error) { + /* + * Resource capture should never fail, but if it does, we + * still have to shut down the log and release things + * properly. + */ + xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE); + } + + /* + * Grab extra references to the inodes and buffers because callers are + * expected to release their held references after we commit the + * transaction. + */ + for (i = 0; i < dfc->dfc_held.dr_inos; i++) { + ASSERT(xfs_isilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL)); + ihold(VFS_I(dfc->dfc_held.dr_ip[i])); + } + + for (i = 0; i < dfc->dfc_held.dr_bufs; i++) + xfs_buf_hold(dfc->dfc_held.dr_bp[i]); + + return dfc; +} + +/* Release all resources that we used to capture deferred ops. */ +void +xfs_defer_ops_capture_free( + struct xfs_mount *mp, + struct xfs_defer_capture *dfc) +{ + unsigned short i; + + xfs_defer_cancel_list(mp, &dfc->dfc_dfops); + + for (i = 0; i < dfc->dfc_held.dr_bufs; i++) + xfs_buf_relse(dfc->dfc_held.dr_bp[i]); + + for (i = 0; i < dfc->dfc_held.dr_inos; i++) + xfs_irele(dfc->dfc_held.dr_ip[i]); + + kmem_free(dfc); +} + +/* + * Capture any deferred ops and commit the transaction. This is the last step + * needed to finish a log intent item that we recovered from the log. If any + * of the deferred ops operate on an inode, the caller must pass in that inode + * so that the reference can be transferred to the capture structure. The + * caller must hold ILOCK_EXCL on the inode, and must unlock it before calling + * xfs_defer_ops_continue. + */ +int +xfs_defer_ops_capture_and_commit( + struct xfs_trans *tp, + struct list_head *capture_list) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_defer_capture *dfc; + int error; + + /* If we don't capture anything, commit transaction and exit. */ + dfc = xfs_defer_ops_capture(tp); + if (IS_ERR(dfc)) { + xfs_trans_cancel(tp); + return PTR_ERR(dfc); + } + if (!dfc) + return xfs_trans_commit(tp); + + /* Commit the transaction and add the capture structure to the list. */ + error = xfs_trans_commit(tp); + if (error) { + xfs_defer_ops_capture_free(mp, dfc); + return error; + } + + list_add_tail(&dfc->dfc_list, capture_list); + return 0; +} + +/* + * Attach a chain of captured deferred ops to a new transaction and free the + * capture structure. If an inode was captured, it will be passed back to the + * caller with ILOCK_EXCL held and joined to the transaction with lockflags==0. + * The caller now owns the inode reference. + */ +void +xfs_defer_ops_continue( + struct xfs_defer_capture *dfc, + struct xfs_trans *tp, + struct xfs_defer_resources *dres) +{ + unsigned int i; + + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY)); + + /* Lock the captured resources to the new transaction. */ + if (dfc->dfc_held.dr_inos == 2) + xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL, + dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL); + else if (dfc->dfc_held.dr_inos == 1) + xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL); + + for (i = 0; i < dfc->dfc_held.dr_bufs; i++) + xfs_buf_lock(dfc->dfc_held.dr_bp[i]); + + /* Join the captured resources to the new transaction. */ + xfs_defer_restore_resources(tp, &dfc->dfc_held); + memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources)); + dres->dr_bufs = 0; + + /* Move captured dfops chain and state to the transaction. */ + list_splice_init(&dfc->dfc_dfops, &tp->t_dfops); + tp->t_flags |= dfc->dfc_tpflags; + + kmem_free(dfc); +} + +/* Release the resources captured and continued during recovery. */ +void +xfs_defer_resources_rele( + struct xfs_defer_resources *dres) +{ + unsigned short i; - xfs_defer_reset(stp); + for (i = 0; i < dres->dr_inos; i++) { + xfs_iunlock(dres->dr_ip[i], XFS_ILOCK_EXCL); + xfs_irele(dres->dr_ip[i]); + dres->dr_ip[i] = NULL; + } + + for (i = 0; i < dres->dr_bufs; i++) { + xfs_buf_relse(dres->dr_bp[i]); + dres->dr_bp[i] = NULL; + } + + dres->dr_inos = 0; + dres->dr_bufs = 0; + dres->dr_ordered = 0; +} + +static inline int __init +xfs_defer_init_cache(void) +{ + xfs_defer_pending_cache = kmem_cache_create("xfs_defer_pending", + sizeof(struct xfs_defer_pending), + 0, 0, NULL); + + return xfs_defer_pending_cache != NULL ? 0 : -ENOMEM; +} + +static inline void +xfs_defer_destroy_cache(void) +{ + kmem_cache_destroy(xfs_defer_pending_cache); + xfs_defer_pending_cache = NULL; +} + +/* Set up caches for deferred work items. */ +int __init +xfs_defer_init_item_caches(void) +{ + int error; + + error = xfs_defer_init_cache(); + if (error) + return error; + error = xfs_rmap_intent_init_cache(); + if (error) + goto err; + error = xfs_refcount_intent_init_cache(); + if (error) + goto err; + error = xfs_bmap_intent_init_cache(); + if (error) + goto err; + error = xfs_extfree_intent_init_cache(); + if (error) + goto err; + error = xfs_attr_intent_init_cache(); + if (error) + goto err; + return 0; +err: + xfs_defer_destroy_item_caches(); + return error; +} + +/* Destroy all the deferred work item caches, if they've been allocated. */ +void +xfs_defer_destroy_item_caches(void) +{ + xfs_attr_intent_destroy_cache(); + xfs_extfree_intent_destroy_cache(); + xfs_bmap_intent_destroy_cache(); + xfs_refcount_intent_destroy_cache(); + xfs_rmap_intent_destroy_cache(); + xfs_defer_destroy_cache(); } diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 7c28d7608ac6..114a3a4930a3 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0+ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Copyright (C) 2016 Oracle. All Rights Reserved. * Author: Darrick J. Wong <darrick.wong@oracle.com> @@ -6,7 +6,9 @@ #ifndef __XFS_DEFER_H__ #define __XFS_DEFER_H__ +struct xfs_btree_cur; struct xfs_defer_op_type; +struct xfs_defer_capture; /* * Header for deferred operation list. @@ -17,6 +19,7 @@ enum xfs_defer_ops_type { XFS_DEFER_OPS_TYPE_RMAP, XFS_DEFER_OPS_TYPE_FREE, XFS_DEFER_OPS_TYPE_AGFL_FREE, + XFS_DEFER_OPS_TYPE_ATTR, XFS_DEFER_OPS_TYPE_MAX, }; @@ -28,8 +31,8 @@ enum xfs_defer_ops_type { struct xfs_defer_pending { struct list_head dfp_list; /* pending items */ struct list_head dfp_work; /* work items */ - void *dfp_intent; /* log intent item */ - void *dfp_done; /* log done item */ + struct xfs_log_item *dfp_intent; /* log intent item */ + struct xfs_log_item *dfp_done; /* log done item */ unsigned int dfp_count; /* # extent items */ enum xfs_defer_ops_type dfp_type; }; @@ -43,15 +46,16 @@ void xfs_defer_move(struct xfs_trans *dtp, struct xfs_trans *stp); /* Description of a deferred type. */ struct xfs_defer_op_type { - void (*abort_intent)(void *); - void *(*create_done)(struct xfs_trans *, void *, unsigned int); - int (*finish_item)(struct xfs_trans *, struct list_head *, void *, - void **); - void (*finish_cleanup)(struct xfs_trans *, void *, int); - void (*cancel_item)(struct list_head *); - int (*diff_items)(void *, struct list_head *, struct list_head *); - void *(*create_intent)(struct xfs_trans *, uint); - void (*log_item)(struct xfs_trans *, void *, struct list_head *); + struct xfs_log_item *(*create_intent)(struct xfs_trans *tp, + struct list_head *items, unsigned int count, bool sort); + void (*abort_intent)(struct xfs_log_item *intent); + struct xfs_log_item *(*create_done)(struct xfs_trans *tp, + struct xfs_log_item *intent, unsigned int count); + int (*finish_item)(struct xfs_trans *tp, struct xfs_log_item *done, + struct list_head *item, struct xfs_btree_cur **state); + void (*finish_cleanup)(struct xfs_trans *tp, + struct xfs_btree_cur *state, int error); + void (*cancel_item)(struct list_head *item); unsigned int max_items; }; @@ -60,5 +64,68 @@ extern const struct xfs_defer_op_type xfs_refcount_update_defer_type; extern const struct xfs_defer_op_type xfs_rmap_update_defer_type; extern const struct xfs_defer_op_type xfs_extent_free_defer_type; extern const struct xfs_defer_op_type xfs_agfl_free_defer_type; +extern const struct xfs_defer_op_type xfs_attr_defer_type; + + +/* + * Deferred operation item relogging limits. + */ +#define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */ +#define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */ + +/* Resources that must be held across a transaction roll. */ +struct xfs_defer_resources { + /* held buffers */ + struct xfs_buf *dr_bp[XFS_DEFER_OPS_NR_BUFS]; + + /* inodes with no unlock flags */ + struct xfs_inode *dr_ip[XFS_DEFER_OPS_NR_INODES]; + + /* number of held buffers */ + unsigned short dr_bufs; + + /* bitmap of ordered buffers */ + unsigned short dr_ordered; + + /* number of held inodes */ + unsigned short dr_inos; +}; + +/* + * This structure enables a dfops user to detach the chain of deferred + * operations from a transaction so that they can be continued later. + */ +struct xfs_defer_capture { + /* List of other capture structures. */ + struct list_head dfc_list; + + /* Deferred ops state saved from the transaction. */ + struct list_head dfc_dfops; + unsigned int dfc_tpflags; + + /* Block reservations for the data and rt devices. */ + unsigned int dfc_blkres; + unsigned int dfc_rtxres; + + /* Log reservation saved from the transaction. */ + unsigned int dfc_logres; + + struct xfs_defer_resources dfc_held; +}; + +/* + * Functions to capture a chain of deferred operations and continue them later. + * This doesn't normally happen except log recovery. + */ +int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp, + struct list_head *capture_list); +void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp, + struct xfs_defer_resources *dres); +void xfs_defer_ops_capture_free(struct xfs_mount *mp, + struct xfs_defer_capture *d); +void xfs_defer_resources_rele(struct xfs_defer_resources *dres); + +int __init xfs_defer_init_item_caches(void); +void xfs_defer_destroy_item_caches(void); #endif /* __XFS_DEFER_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index dd6fcaaea318..92bac3373f1f 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -19,7 +19,11 @@ #include "xfs_error.h" #include "xfs_trace.h" -struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR }; +const struct xfs_name xfs_name_dotdot = { + .name = (const unsigned char *)"..", + .len = 2, + .type = XFS_DIR3_FT_DIR, +}; /* * Convert inode mode to directory entry filetype @@ -54,10 +58,10 @@ xfs_mode_to_ftype( */ xfs_dahash_t xfs_ascii_ci_hashname( - struct xfs_name *name) + const struct xfs_name *name) { - xfs_dahash_t hash; - int i; + xfs_dahash_t hash; + int i; for (i = 0, hash = 0; i < name->len; i++) hash = tolower(name->name[i]) ^ rol32(hash, 7); @@ -115,7 +119,7 @@ xfs_da_mount( dageo->fsblog = mp->m_sb.sb_blocklog; dageo->blksize = xfs_dir2_dirblock_bytes(&mp->m_sb); dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { dageo->node_hdr_size = sizeof(struct xfs_da3_node_hdr); dageo->leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr); dageo->free_hdr_size = sizeof(struct xfs_dir3_free_hdr); @@ -146,6 +150,8 @@ xfs_da_mount( dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET); dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) / (uint)sizeof(xfs_da_node_entry_t); + dageo->max_extents = (XFS_DIR2_MAX_SPACES * XFS_DIR2_SPACE_SIZE) >> + mp->m_sb.sb_blocklog; dageo->magicpct = (dageo->blksize * 37) / 100; /* set up attribute geometry - single fsb only */ @@ -157,6 +163,12 @@ xfs_da_mount( dageo->node_hdr_size = mp->m_dir_geo->node_hdr_size; dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) / (uint)sizeof(xfs_da_node_entry_t); + + if (xfs_has_large_extent_counts(mp)) + dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_LARGE; + else + dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_SMALL; + dageo->magicpct = (dageo->blksize * 37) / 100; return 0; } @@ -179,9 +191,9 @@ xfs_dir_isempty( xfs_dir2_sf_hdr_t *sfp; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); - if (dp->i_d.di_size == 0) /* might happen during shutdown. */ + if (dp->i_disk_size == 0) /* might happen during shutdown. */ return 1; - if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp)) + if (dp->i_disk_size > xfs_inode_data_fork_size(dp)) return 0; sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; return !sfp->count; @@ -243,13 +255,13 @@ int xfs_dir_createname( struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, + const struct xfs_name *name, xfs_ino_t inum, /* new entry inode number */ xfs_extlen_t total) /* bmap's total block count */ { struct xfs_da_args *args; int rval; - int v; /* type-checking value */ + bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); @@ -278,7 +290,7 @@ xfs_dir_createname( if (!inum) args->op_flags |= XFS_DA_OP_JUSTCHECK; - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { rval = xfs_dir2_sf_addname(args); goto out_free; } @@ -337,16 +349,16 @@ xfs_dir_cilookup_result( int xfs_dir_lookup( - xfs_trans_t *tp, - xfs_inode_t *dp, - struct xfs_name *name, - xfs_ino_t *inum, /* out: inode number */ - struct xfs_name *ci_name) /* out: actual name if CI match */ + struct xfs_trans *tp, + struct xfs_inode *dp, + const struct xfs_name *name, + xfs_ino_t *inum, /* out: inode number */ + struct xfs_name *ci_name) /* out: actual name if CI match */ { - struct xfs_da_args *args; - int rval; - int v; /* type-checking value */ - int lock_mode; + struct xfs_da_args *args; + int rval; + bool v; + int lock_mode; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_lookup); @@ -373,7 +385,7 @@ xfs_dir_lookup( args->op_flags |= XFS_DA_OP_CILOOKUP; lock_mode = xfs_ilock_data_map_shared(dp); - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { rval = xfs_dir2_sf_lookup(args); goto out_check_rval; } @@ -423,7 +435,7 @@ xfs_dir_removename( { struct xfs_da_args *args; int rval; - int v; /* type-checking value */ + bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_remove); @@ -443,7 +455,7 @@ xfs_dir_removename( args->whichfork = XFS_DATA_FORK; args->trans = tp; - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { rval = xfs_dir2_sf_removename(args); goto out_free; } @@ -475,13 +487,13 @@ int xfs_dir_replace( struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, /* name of entry to replace */ + const struct xfs_name *name, /* name of entry to replace */ xfs_ino_t inum, /* new inode number */ xfs_extlen_t total) /* bmap's total block count */ { struct xfs_da_args *args; int rval; - int v; /* type-checking value */ + bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); @@ -504,7 +516,7 @@ xfs_dir_replace( args->whichfork = XFS_DATA_FORK; args->trans = tp; - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { rval = xfs_dir2_sf_replace(args); goto out_free; } @@ -584,8 +596,8 @@ xfs_dir2_grow_inode( xfs_fsize_t size; /* directory file (data) size */ size = XFS_FSB_TO_B(mp, bno + count); - if (size > dp->i_d.di_size) { - dp->i_d.di_size = size; + if (size > dp->i_disk_size) { + dp->i_disk_size = size; xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); } } @@ -598,19 +610,23 @@ xfs_dir2_grow_inode( int xfs_dir2_isblock( struct xfs_da_args *args, - int *vp) /* out: 1 is block, 0 is not block */ + bool *isblock) { - xfs_fileoff_t last; /* last file offset */ - int rval; + struct xfs_mount *mp = args->dp->i_mount; + xfs_fileoff_t eof; + int error; - if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) - return rval; - rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize; - if (XFS_IS_CORRUPT(args->dp->i_mount, - rval != 0 && - args->dp->i_d.di_size != args->geo->blksize)) + error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK); + if (error) + return error; + + *isblock = false; + if (XFS_FSB_TO_B(mp, eof) != args->geo->blksize) + return 0; + + *isblock = true; + if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize)) return -EFSCORRUPTED; - *vp = rval; return 0; } @@ -620,14 +636,20 @@ xfs_dir2_isblock( int xfs_dir2_isleaf( struct xfs_da_args *args, - int *vp) /* out: 1 is block, 0 is not block */ + bool *isleaf) { - xfs_fileoff_t last; /* last file offset */ - int rval; + xfs_fileoff_t eof; + int error; - if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) - return rval; - *vp = last == args->geo->leafblk + args->geo->fsbcount; + error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK); + if (error) + return error; + + *isleaf = false; + if (eof != args->geo->leafblk + args->geo->fsbcount) + return 0; + + *isleaf = true; return 0; } @@ -687,7 +709,7 @@ xfs_dir2_shrink_inode( /* * If the block isn't the last one in the directory, we're done. */ - if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0)) + if (dp->i_disk_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0)) return 0; bno = da; if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) { @@ -703,7 +725,7 @@ xfs_dir2_shrink_inode( /* * Set the size to the new last block. */ - dp->i_d.di_size = XFS_FSB_TO_B(mp, bno); + dp->i_disk_size = XFS_FSB_TO_B(mp, bno); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); return 0; } @@ -728,9 +750,9 @@ xfs_dir2_namecheck( xfs_dahash_t xfs_dir2_hashname( struct xfs_mount *mp, - struct xfs_name *name) + const struct xfs_name *name) { - if (unlikely(xfs_sb_version_hasasciici(&mp->m_sb))) + if (unlikely(xfs_has_asciici(mp))) return xfs_ascii_ci_hashname(name); return xfs_da_hashname(name->name, name->len); } @@ -741,7 +763,7 @@ xfs_dir2_compname( const unsigned char *name, int len) { - if (unlikely(xfs_sb_version_hasasciici(&args->dp->i_mount->m_sb))) + if (unlikely(xfs_has_asciici(args->dp->i_mount))) return xfs_ascii_ci_compname(args, name, len); return xfs_da_compname(args, name, len); } diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 033777e282f2..dd39f17dd9a9 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. @@ -21,7 +21,7 @@ struct xfs_dir2_data_unused; struct xfs_dir3_icfree_hdr; struct xfs_dir3_icleaf_hdr; -extern struct xfs_name xfs_name_dotdot; +extern const struct xfs_name xfs_name_dotdot; /* * Convert inode mode to directory entry filetype @@ -39,18 +39,16 @@ extern int xfs_dir_isempty(struct xfs_inode *dp); extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_inode *pdp); extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, xfs_ino_t inum, + const struct xfs_name *name, xfs_ino_t inum, xfs_extlen_t tot); extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, xfs_ino_t *inum, + const struct xfs_name *name, xfs_ino_t *inum, struct xfs_name *ci_name); extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name, xfs_ino_t ino, xfs_extlen_t tot); -extern bool xfs_dir2_sf_replace_needblock(struct xfs_inode *dp, - xfs_ino_t inum); extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, xfs_ino_t inum, + const struct xfs_name *name, xfs_ino_t inum, xfs_extlen_t tot); extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name); @@ -63,8 +61,8 @@ extern int xfs_dir2_sf_to_block(struct xfs_da_args *args); /* * Interface routines used by userspace utilities */ -extern int xfs_dir2_isblock(struct xfs_da_args *args, int *r); -extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r); +extern int xfs_dir2_isblock(struct xfs_da_args *args, bool *isblock); +extern int xfs_dir2_isleaf(struct xfs_da_args *args, bool *isleaf); extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, struct xfs_buf *bp); diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index d6ced59b9567..00f960a703b2 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -53,10 +53,10 @@ xfs_dir3_block_verify( if (!xfs_verify_magic(bp, hdr3->magic)) return __this_address; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp)) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; @@ -71,7 +71,7 @@ xfs_dir3_block_read_verify( struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; - if (xfs_sb_version_hascrc(&mp->m_sb) && + if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { @@ -96,7 +96,7 @@ xfs_dir3_block_write_verify( return; } - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (bip) @@ -114,6 +114,23 @@ const struct xfs_buf_ops xfs_dir3_block_buf_ops = { .verify_struct = xfs_dir3_block_verify, }; +static xfs_failaddr_t +xfs_dir3_block_header_check( + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = dp->i_mount; + + if (xfs_has_crc(mp)) { + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (be64_to_cpu(hdr3->owner) != dp->i_ino) + return __this_address; + } + + return NULL; +} + int xfs_dir3_block_read( struct xfs_trans *tp, @@ -121,12 +138,24 @@ xfs_dir3_block_read( struct xfs_buf **bpp) { struct xfs_mount *mp = dp->i_mount; + xfs_failaddr_t fa; int err; err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, 0, bpp, XFS_DATA_FORK, &xfs_dir3_block_buf_ops); - if (!err && tp && *bpp) - xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); + if (err || !*bpp) + return err; + + /* Check things that we can't do in the verifier. */ + fa = xfs_dir3_block_header_check(dp, *bpp); + if (fa) { + __xfs_buf_mark_corrupt(*bpp, fa); + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + return -EFSCORRUPTED; + } + + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); return err; } @@ -142,10 +171,10 @@ xfs_dir3_block_init( bp->b_ops = &xfs_dir3_block_buf_ops; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF); - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { memset(hdr3, 0, sizeof(*hdr3)); hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); - hdr3->blkno = cpu_to_be64(bp->b_bn); + hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp)); hdr3->owner = cpu_to_be64(dp->i_ino); uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); return; @@ -813,7 +842,7 @@ xfs_dir2_block_removename( * See if the size as a shortform is good enough. */ size = xfs_dir2_block_sfsize(dp, hdr, &sfh); - if (size > XFS_IFORK_DSIZE(dp)) + if (size > xfs_inode_data_fork_size(dp)) return 0; /* @@ -932,7 +961,7 @@ xfs_dir2_leaf_to_block( * been left behind during no-space-reservation operations. * These will show up in the leaf bests table. */ - while (dp->i_d.di_size > args->geo->blksize) { + while (dp->i_disk_size > args->geo->blksize) { int hdrsz; hdrsz = args->geo->data_entry_offset; @@ -1026,7 +1055,7 @@ xfs_dir2_leaf_to_block( * Now see if the resulting block can be shrunken to shortform. */ size = xfs_dir2_block_sfsize(dp, hdr, &sfh); - if (size > XFS_IFORK_DSIZE(dp)) + if (size > xfs_inode_data_fork_size(dp)) return 0; return xfs_dir2_block_to_sf(args, dbp, size, &sfh); @@ -1042,7 +1071,7 @@ xfs_dir2_sf_to_block( struct xfs_trans *tp = args->trans; struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK); struct xfs_da_geometry *geo = args->geo; xfs_dir2_db_t blkno; /* dir-relative block # (0) */ xfs_dir2_data_hdr_t *hdr; /* block header */ @@ -1067,15 +1096,15 @@ xfs_dir2_sf_to_block( trace_xfs_dir2_sf_to_block(args); - ASSERT(ifp->if_flags & XFS_IFINLINE); - ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); + ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data; - ASSERT(ifp->if_bytes == dp->i_d.di_size); + ASSERT(ifp->if_bytes == dp->i_disk_size); ASSERT(ifp->if_u1.if_data != NULL); - ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count)); - ASSERT(dp->i_d.di_nextents == 0); + ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count)); + ASSERT(dp->i_df.if_nextents == 0); /* * Copy the directory into a temporary buffer. @@ -1086,7 +1115,7 @@ xfs_dir2_sf_to_block( xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK); xfs_bmap_local_to_extents_empty(tp, dp, XFS_DATA_FORK); - dp->i_d.di_size = 0; + dp->i_disk_size = 0; /* * Add block 0 to the inode. diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index b9eba8213180..dbcf58979a59 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -29,7 +29,7 @@ xfs_dir2_data_bestfree_p( struct xfs_mount *mp, struct xfs_dir2_data_hdr *hdr) { - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) return ((struct xfs_dir3_data_hdr *)hdr)->best_free; return hdr->bestfree; } @@ -51,7 +51,7 @@ xfs_dir2_data_get_ftype( struct xfs_mount *mp, struct xfs_dir2_data_entry *dep) { - if (xfs_sb_version_hasftype(&mp->m_sb)) { + if (xfs_has_ftype(mp)) { uint8_t ftype = dep->name[dep->namelen]; if (likely(ftype < XFS_DIR3_FT_MAX)) @@ -70,7 +70,7 @@ xfs_dir2_data_put_ftype( ASSERT(ftype < XFS_DIR3_FT_MAX); ASSERT(dep->namelen != 0); - if (xfs_sb_version_hasftype(&mp->m_sb)) + if (xfs_has_ftype(mp)) dep->name[dep->namelen] = ftype; } @@ -218,7 +218,7 @@ __xfs_dir3_data_check( */ if (dep->namelen == 0) return __this_address; - if (xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))) + if (!xfs_verify_dir_ino(mp, be64_to_cpu(dep->inumber))) return __this_address; if (offset + xfs_dir2_data_entsize(mp, dep->namelen) > end) return __this_address; @@ -297,10 +297,10 @@ xfs_dir3_data_verify( if (!xfs_verify_magic(bp, hdr3->magic)) return __this_address; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp)) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; @@ -343,7 +343,7 @@ xfs_dir3_data_read_verify( struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; - if (xfs_sb_version_hascrc(&mp->m_sb) && + if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { @@ -368,7 +368,7 @@ xfs_dir3_data_write_verify( return; } - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (bip) @@ -394,6 +394,22 @@ static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { .verify_write = xfs_dir3_data_write_verify, }; +static xfs_failaddr_t +xfs_dir3_data_header_check( + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = dp->i_mount; + + if (xfs_has_crc(mp)) { + struct xfs_dir3_data_hdr *hdr3 = bp->b_addr; + + if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino) + return __this_address; + } + + return NULL; +} int xfs_dir3_data_read( @@ -403,12 +419,24 @@ xfs_dir3_data_read( unsigned int flags, struct xfs_buf **bpp) { + xfs_failaddr_t fa; int err; err = xfs_da_read_buf(tp, dp, bno, flags, bpp, XFS_DATA_FORK, &xfs_dir3_data_buf_ops); - if (!err && tp && *bpp) - xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF); + if (err || !*bpp) + return err; + + /* Check things that we can't do in the verifier. */ + fa = xfs_dir3_data_header_check(dp, *bpp); + if (fa) { + __xfs_buf_mark_corrupt(*bpp, fa); + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + return -EFSCORRUPTED; + } + + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF); return err; } @@ -689,12 +717,12 @@ xfs_dir3_data_init( * Initialize the header. */ hdr = bp->b_addr; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; memset(hdr3, 0, sizeof(*hdr3)); hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); - hdr3->blkno = cpu_to_be64(bp->b_bn); + hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp)); hdr3->owner = cpu_to_be64(dp->i_ino); uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index a131b520aac7..cb9e950a911d 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -37,7 +37,7 @@ xfs_dir2_leaf_hdr_from_disk( struct xfs_dir3_icleaf_hdr *to, struct xfs_dir2_leaf *from) { - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_leaf *from3 = (struct xfs_dir3_leaf *)from; to->forw = be32_to_cpu(from3->hdr.info.hdr.forw); @@ -68,7 +68,7 @@ xfs_dir2_leaf_hdr_to_disk( struct xfs_dir2_leaf *to, struct xfs_dir3_icleaf_hdr *from) { - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_leaf *to3 = (struct xfs_dir3_leaf *)to; ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC || @@ -108,12 +108,12 @@ xfs_dir3_leaf1_check( if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) { struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; - if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) + if (be64_to_cpu(leaf3->info.blkno) != xfs_buf_daddr(bp)) return __this_address; } else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC) return __this_address; - return xfs_dir3_leaf_check_int(dp->i_mount, &leafhdr, leaf); + return xfs_dir3_leaf_check_int(dp->i_mount, &leafhdr, leaf, false); } static inline void @@ -139,29 +139,34 @@ xfs_failaddr_t xfs_dir3_leaf_check_int( struct xfs_mount *mp, struct xfs_dir3_icleaf_hdr *hdr, - struct xfs_dir2_leaf *leaf) + struct xfs_dir2_leaf *leaf, + bool expensive_checking) { struct xfs_da_geometry *geo = mp->m_dir_geo; xfs_dir2_leaf_tail_t *ltp; int stale; int i; + bool isleaf1 = (hdr->magic == XFS_DIR2_LEAF1_MAGIC || + hdr->magic == XFS_DIR3_LEAF1_MAGIC); ltp = xfs_dir2_leaf_tail_p(geo, leaf); /* * XXX (dgc): This value is not restrictive enough. * Should factor in the size of the bests table as well. - * We can deduce a value for that from di_size. + * We can deduce a value for that from i_disk_size. */ if (hdr->count > geo->leaf_max_ents) return __this_address; /* Leaves and bests don't overlap in leaf format. */ - if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC || - hdr->magic == XFS_DIR3_LEAF1_MAGIC) && + if (isleaf1 && (char *)&hdr->ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp)) return __this_address; + if (!expensive_checking) + return NULL; + /* Check hash value order, count stale entries. */ for (i = stale = 0; i < hdr->count; i++) { if (i + 1 < hdr->count) { @@ -171,6 +176,10 @@ xfs_dir3_leaf_check_int( } if (hdr->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; + if (isleaf1 && xfs_dir2_dataptr_to_db(geo, + be32_to_cpu(hdr->ents[i].address)) >= + be32_to_cpu(ltp->bestcount)) + return __this_address; } if (hdr->stale != stale) return __this_address; @@ -195,7 +204,7 @@ xfs_dir3_leaf_verify( return fa; xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, bp->b_addr); - return xfs_dir3_leaf_check_int(mp, &leafhdr, bp->b_addr); + return xfs_dir3_leaf_check_int(mp, &leafhdr, bp->b_addr, true); } static void @@ -205,7 +214,7 @@ xfs_dir3_leaf_read_verify( struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; - if (xfs_sb_version_hascrc(&mp->m_sb) && + if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { @@ -230,7 +239,7 @@ xfs_dir3_leaf_write_verify( return; } - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (bip) @@ -304,7 +313,7 @@ xfs_dir3_leaf_init( ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC); - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; memset(leaf3, 0, sizeof(*leaf3)); @@ -312,7 +321,7 @@ xfs_dir3_leaf_init( leaf3->info.hdr.magic = (type == XFS_DIR2_LEAF1_MAGIC) ? cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC); - leaf3->info.blkno = cpu_to_be64(bp->b_bn); + leaf3->info.blkno = cpu_to_be64(xfs_buf_daddr(bp)); leaf3->info.owner = cpu_to_be64(owner); uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid); } else { @@ -1383,7 +1392,7 @@ xfs_dir2_leaf_removename( ltp = xfs_dir2_leaf_tail_p(geo, leaf); bestsp = xfs_dir2_leaf_bests_p(ltp); if (be16_to_cpu(bestsp[db]) != oldbest) { - xfs_buf_corruption_error(lbp); + xfs_buf_mark_corrupt(lbp); return -EFSCORRUPTED; } /* diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index a0cc5e240306..7a03aeb9f4c9 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -68,12 +68,12 @@ xfs_dir3_leafn_check( if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) { struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; - if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) + if (be64_to_cpu(leaf3->info.blkno) != xfs_buf_daddr(bp)) return __this_address; } else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC) return __this_address; - return xfs_dir3_leaf_check_int(dp->i_mount, &leafhdr, leaf); + return xfs_dir3_leaf_check_int(dp->i_mount, &leafhdr, leaf, false); } static inline void @@ -105,12 +105,12 @@ xfs_dir3_free_verify( if (!xfs_verify_magic(bp, hdr->magic)) return __this_address; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp)) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; @@ -128,7 +128,7 @@ xfs_dir3_free_read_verify( struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; - if (xfs_sb_version_hascrc(&mp->m_sb) && + if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { @@ -153,7 +153,7 @@ xfs_dir3_free_write_verify( return; } - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (bip) @@ -185,7 +185,7 @@ xfs_dir3_free_header_check( firstdb = (xfs_dir2_da_to_db(mp->m_dir_geo, fbno) - xfs_dir2_byte_to_db(mp->m_dir_geo, XFS_DIR2_FREE_OFFSET)) * maxbests; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; if (be32_to_cpu(hdr3->firstdb) != firstdb) @@ -194,6 +194,8 @@ xfs_dir3_free_header_check( return __this_address; if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused)) return __this_address; + if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino) + return __this_address; } else { struct xfs_dir2_free_hdr *hdr = bp->b_addr; @@ -226,8 +228,9 @@ __xfs_dir3_free_read( /* Check things that we can't do in the verifier. */ fa = xfs_dir3_free_header_check(dp, fbno, *bpp); if (fa) { - xfs_verifier_error(*bpp, -EFSCORRUPTED, fa); + __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); + *bpp = NULL; return -EFSCORRUPTED; } @@ -244,7 +247,7 @@ xfs_dir2_free_hdr_from_disk( struct xfs_dir3_icfree_hdr *to, struct xfs_dir2_free *from) { - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_free *from3 = (struct xfs_dir3_free *)from; to->magic = be32_to_cpu(from3->hdr.hdr.magic); @@ -271,7 +274,7 @@ xfs_dir2_free_hdr_to_disk( struct xfs_dir2_free *to, struct xfs_dir3_icfree_hdr *from) { - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_free *to3 = (struct xfs_dir3_free *)to; ASSERT(from->magic == XFS_DIR3_FREE_MAGIC); @@ -338,12 +341,12 @@ xfs_dir3_free_get_buf( memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr)); memset(&hdr, 0, sizeof(hdr)); - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; hdr.magic = XFS_DIR3_FREE_MAGIC; - hdr3->hdr.blkno = cpu_to_be64(bp->b_bn); + hdr3->hdr.blkno = cpu_to_be64(xfs_buf_daddr(bp)); hdr3->hdr.owner = cpu_to_be64(dp->i_ino); uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_meta_uuid); } else @@ -438,8 +441,8 @@ xfs_dir2_leaf_to_node( leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); if (be32_to_cpu(ltp->bestcount) > - (uint)dp->i_d.di_size / args->geo->blksize) { - xfs_buf_corruption_error(lbp); + (uint)dp->i_disk_size / args->geo->blksize) { + xfs_buf_mark_corrupt(lbp); return -EFSCORRUPTED; } @@ -513,7 +516,7 @@ xfs_dir2_leafn_add( * into other peoples memory */ if (index < 0) { - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; } @@ -800,7 +803,7 @@ xfs_dir2_leafn_lookup_for_entry( xfs_dir3_leaf_check(dp, bp); if (leafhdr.count <= 0) { - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; } @@ -2012,9 +2015,7 @@ xfs_dir2_node_addname( /* * Allocate and initialize the state (btree cursor). */ - state = xfs_da_state_alloc(); - state->args = args; - state->mp = args->dp->i_mount; + state = xfs_da_state_alloc(args); /* * Look up the name. We're not supposed to find it, but * this gives us the insertion point. @@ -2083,9 +2084,8 @@ xfs_dir2_node_lookup( /* * Allocate and initialize the btree cursor. */ - state = xfs_da_state_alloc(); - state->args = args; - state->mp = args->dp->i_mount; + state = xfs_da_state_alloc(args); + /* * Fill in the path to the entry in the cursor. */ @@ -2136,9 +2136,7 @@ xfs_dir2_node_removename( /* * Allocate and initialize the btree cursor. */ - state = xfs_da_state_alloc(); - state->args = args; - state->mp = args->dp->i_mount; + state = xfs_da_state_alloc(args); /* Look up the entry we're deleting, set up the cursor. */ error = xfs_da3_node_lookup_int(state, &rval); @@ -2203,9 +2201,7 @@ xfs_dir2_node_replace( /* * Allocate and initialize the btree cursor. */ - state = xfs_da_state_alloc(); - state->args = args; - state->mp = args->dp->i_mount; + state = xfs_da_state_alloc(args); /* * We have to save new inode number and ftype since diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 01ee0b926572..7404a9ff1a92 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. @@ -40,7 +40,7 @@ struct xfs_dir3_icfree_hdr { }; /* xfs_dir2.c */ -xfs_dahash_t xfs_ascii_ci_hashname(struct xfs_name *name); +xfs_dahash_t xfs_ascii_ci_hashname(const struct xfs_name *name); enum xfs_dacmp xfs_ascii_ci_compname(struct xfs_da_args *args, const unsigned char *name, int len); extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, @@ -127,7 +127,8 @@ xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr, extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state); extern xfs_failaddr_t xfs_dir3_leaf_check_int(struct xfs_mount *mp, - struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf); + struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf, + bool expensive_checks); /* xfs_dir2_node.c */ void xfs_dir2_free_hdr_from_disk(struct xfs_mount *mp, @@ -195,12 +196,13 @@ xfs_dir2_data_entsize( len = offsetof(struct xfs_dir2_data_entry, name[0]) + namelen + sizeof(xfs_dir2_data_off_t) /* tag */; - if (xfs_sb_version_hasftype(&mp->m_sb)) + if (xfs_has_ftype(mp)) len += sizeof(uint8_t); return round_up(len, XFS_DIR2_DATA_ALIGN); } -xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp, struct xfs_name *name); +xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp, + const struct xfs_name *name); enum xfs_dacmp xfs_dir2_compname(struct xfs_da_args *args, const unsigned char *name, int len); diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 7b7f6fb2ea3b..8cd37e6e9d38 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -48,7 +48,7 @@ xfs_dir2_sf_entsize( count += sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */ count += hdr->i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */ - if (xfs_sb_version_hasftype(&mp->m_sb)) + if (xfs_has_ftype(mp)) count += sizeof(uint8_t); return count; } @@ -76,7 +76,7 @@ xfs_dir2_sf_get_ino( { uint8_t *from = sfep->name + sfep->namelen; - if (xfs_sb_version_hasftype(&mp->m_sb)) + if (xfs_has_ftype(mp)) from++; if (!hdr->i8count) @@ -95,7 +95,7 @@ xfs_dir2_sf_put_ino( ASSERT(ino <= XFS_MAXINUMBER); - if (xfs_sb_version_hasftype(&mp->m_sb)) + if (xfs_has_ftype(mp)) to++; if (hdr->i8count) @@ -135,7 +135,7 @@ xfs_dir2_sf_get_ftype( struct xfs_mount *mp, struct xfs_dir2_sf_entry *sfep) { - if (xfs_sb_version_hasftype(&mp->m_sb)) { + if (xfs_has_ftype(mp)) { uint8_t ftype = sfep->name[sfep->namelen]; if (ftype < XFS_DIR3_FT_MAX) @@ -153,7 +153,7 @@ xfs_dir2_sf_put_ftype( { ASSERT(ftype < XFS_DIR3_FT_MAX); - if (xfs_sb_version_hasftype(&mp->m_sb)) + if (xfs_has_ftype(mp)) sfep->name[sfep->namelen] = ftype; } @@ -192,7 +192,7 @@ xfs_dir2_block_sfsize( * if there is a filetype field, add the extra byte to the namelen * for each entry that we see. */ - has_ftype = xfs_sb_version_hasftype(&mp->m_sb) ? 1 : 0; + has_ftype = xfs_has_ftype(mp) ? 1 : 0; count = i8count = namelen = 0; btp = xfs_dir2_block_tail_p(geo, hdr); @@ -237,7 +237,7 @@ xfs_dir2_block_sfsize( (i8count ? /* inumber */ count * XFS_INO64_SIZE : count * XFS_INO32_SIZE); - if (size > XFS_IFORK_DSIZE(dp)) + if (size > xfs_inode_data_fork_size(dp)) return size; /* size value is a failure */ } /* @@ -343,8 +343,8 @@ xfs_dir2_block_to_sf( */ ASSERT(dp->i_df.if_bytes == 0); xfs_init_local_fork(dp, XFS_DATA_FORK, sfp, size); - dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; - dp->i_d.di_size = size; + dp->i_df.if_format = XFS_DINODE_FMT_LOCAL; + dp->i_disk_size = size; logflags |= XFS_ILOG_DDATA; xfs_dir2_sf_check(args); @@ -367,7 +367,7 @@ xfs_dir2_sf_addname( xfs_inode_t *dp; /* incore directory inode */ int error; /* error return value */ int incr_isize; /* total change in size */ - int new_isize; /* di_size after adding name */ + int new_isize; /* size after adding name */ int objchange; /* changing to 8-byte inodes */ xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */ int pick; /* which algorithm to use */ @@ -378,12 +378,12 @@ xfs_dir2_sf_addname( ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT); dp = args->dp; - ASSERT(dp->i_df.if_flags & XFS_IFINLINE); - ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); - ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); + ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); + ASSERT(dp->i_df.if_bytes == dp->i_disk_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); + ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * Compute entry (and change in) size. */ @@ -401,12 +401,12 @@ xfs_dir2_sf_addname( objchange = 1; } - new_isize = (int)dp->i_d.di_size + incr_isize; + new_isize = (int)dp->i_disk_size + incr_isize; /* * Won't fit as shortform any more (due to size), * or the pick routine says it won't (due to offset values). */ - if (new_isize > XFS_IFORK_DSIZE(dp) || + if (new_isize > xfs_inode_data_fork_size(dp) || (pick = xfs_dir2_sf_addname_pick(args, objchange, &sfep, &offset)) == 0) { /* @@ -492,7 +492,7 @@ xfs_dir2_sf_addname_easy( sfp->count++; if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) sfp->i8count++; - dp->i_d.di_size = new_isize; + dp->i_disk_size = new_isize; xfs_dir2_sf_check(args); } @@ -519,7 +519,7 @@ xfs_dir2_sf_addname_hard( int nbytes; /* temp for byte copies */ xfs_dir2_data_aoff_t new_offset; /* next offset value */ xfs_dir2_data_aoff_t offset; /* current offset value */ - int old_isize; /* previous di_size */ + int old_isize; /* previous size */ xfs_dir2_sf_entry_t *oldsfep; /* entry in original dir */ xfs_dir2_sf_hdr_t *oldsfp; /* original shortform dir */ xfs_dir2_sf_entry_t *sfep; /* entry in new dir */ @@ -529,7 +529,7 @@ xfs_dir2_sf_addname_hard( * Copy the old directory to the stack buffer. */ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - old_isize = (int)dp->i_d.di_size; + old_isize = (int)dp->i_disk_size; buf = kmem_alloc(old_isize, 0); oldsfp = (xfs_dir2_sf_hdr_t *)buf; memcpy(oldsfp, sfp, old_isize); @@ -586,7 +586,7 @@ xfs_dir2_sf_addname_hard( memcpy(sfep, oldsfep, old_isize - nbytes); } kmem_free(buf); - dp->i_d.di_size = new_isize; + dp->i_disk_size = new_isize; xfs_dir2_sf_check(args); } @@ -697,7 +697,7 @@ xfs_dir2_sf_check( ASSERT(xfs_dir2_sf_get_ftype(mp, sfep) < XFS_DIR3_FT_MAX); } ASSERT(i8count == sfp->i8count); - ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size); + ASSERT((char *)sfep - (char *)sfp == dp->i_disk_size); ASSERT(offset + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + (uint)sizeof(xfs_dir2_block_tail_t) <= args->geo->blksize); @@ -710,11 +710,11 @@ xfs_dir2_sf_verify( struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); struct xfs_dir2_sf_hdr *sfp; struct xfs_dir2_sf_entry *sfep; struct xfs_dir2_sf_entry *next_sfep; char *endp; - struct xfs_ifork *ifp; xfs_ino_t ino; int i; int i8count; @@ -723,9 +723,8 @@ xfs_dir2_sf_verify( int error; uint8_t filetype; - ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL); + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data; size = ifp->if_bytes; @@ -822,18 +821,16 @@ xfs_dir2_sf_create( dp = args->dp; ASSERT(dp != NULL); - ASSERT(dp->i_d.di_size == 0); + ASSERT(dp->i_disk_size == 0); /* * If it's currently a zero-length extent file, * convert it to local format. */ - if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) { - dp->i_df.if_flags &= ~XFS_IFEXTENTS; /* just in case */ - dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + if (dp->i_df.if_format == XFS_DINODE_FMT_EXTENTS) { + dp->i_df.if_format = XFS_DINODE_FMT_LOCAL; xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); - dp->i_df.if_flags |= XFS_IFINLINE; } - ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_df.if_bytes == 0); i8count = pino > XFS_DIR2_MAX_SHORT_INUM; size = xfs_dir2_sf_hdr_size(i8count); @@ -851,7 +848,7 @@ xfs_dir2_sf_create( */ xfs_dir2_sf_put_parent_ino(sfp, pino); sfp->count = 0; - dp->i_d.di_size = size; + dp->i_disk_size = size; xfs_dir2_sf_check(args); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); return 0; @@ -868,7 +865,6 @@ xfs_dir2_sf_lookup( struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; int i; /* entry index */ - int error; xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ enum xfs_dacmp cmp; /* comparison result */ @@ -878,12 +874,12 @@ xfs_dir2_sf_lookup( xfs_dir2_sf_check(args); - ASSERT(dp->i_df.if_flags & XFS_IFINLINE); - ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); - ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); + ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); + ASSERT(dp->i_df.if_bytes == dp->i_disk_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); + ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * Special case for . */ @@ -932,8 +928,7 @@ xfs_dir2_sf_lookup( if (!ci_sfep) return -ENOENT; /* otherwise process the CI match as required by the caller */ - error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen); - return error; + return xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen); } /* @@ -955,8 +950,8 @@ xfs_dir2_sf_removename( trace_xfs_dir2_sf_removename(args); - ASSERT(dp->i_df.if_flags & XFS_IFINLINE); - oldsize = (int)dp->i_d.di_size; + ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); + oldsize = (int)dp->i_disk_size; ASSERT(oldsize >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == oldsize); ASSERT(dp->i_df.if_u1.if_data != NULL); @@ -996,7 +991,7 @@ xfs_dir2_sf_removename( * Fix up the header and file size. */ sfp->count--; - dp->i_d.di_size = newsize; + dp->i_disk_size = newsize; /* * Reallocate, making it smaller. */ @@ -1019,7 +1014,7 @@ xfs_dir2_sf_removename( /* * Check whether the sf dir replace operation need more blocks. */ -bool +static bool xfs_dir2_sf_replace_needblock( struct xfs_inode *dp, xfs_ino_t inum) @@ -1027,14 +1022,14 @@ xfs_dir2_sf_replace_needblock( int newsize; struct xfs_dir2_sf_hdr *sfp; - if (dp->i_d.di_format != XFS_DINODE_FMT_LOCAL) + if (dp->i_df.if_format != XFS_DINODE_FMT_LOCAL) return false; sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data; newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF; return inum > XFS_DIR2_MAX_SHORT_INUM && - sfp->i8count == 0 && newsize > XFS_IFORK_DSIZE(dp); + sfp->i8count == 0 && newsize > xfs_inode_data_fork_size(dp); } /* @@ -1054,12 +1049,12 @@ xfs_dir2_sf_replace( trace_xfs_dir2_sf_replace(args); - ASSERT(dp->i_df.if_flags & XFS_IFINLINE); - ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); - ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); + ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); + ASSERT(dp->i_df.if_bytes == dp->i_disk_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); + ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * New inode number is large, and need to convert to 8-byte inodes. @@ -1220,7 +1215,7 @@ xfs_dir2_sf_toino4( * Clean up the inode. */ kmem_free(buf); - dp->i_d.di_size = newsize; + dp->i_disk_size = newsize; xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); } @@ -1293,6 +1288,6 @@ xfs_dir2_sf_toino8( * Clean up the inode. */ kmem_free(buf); - dp->i_d.di_size = newsize; + dp->i_disk_size = newsize; xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); } diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index bedc1e752b60..15a362e2f5ea 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -22,7 +22,7 @@ xfs_calc_dquots_per_chunk( unsigned int nbblks) /* basic block units */ { ASSERT(nbblks > 0); - return BBTOB(nbblks) / sizeof(xfs_dqblk_t); + return BBTOB(nbblks) / sizeof(struct xfs_dqblk); } /* @@ -37,9 +37,10 @@ xfs_failaddr_t xfs_dquot_verify( struct xfs_mount *mp, struct xfs_disk_dquot *ddq, - xfs_dqid_t id, - uint type) /* used only during quotacheck */ + xfs_dqid_t id) /* used only during quotacheck */ { + __u8 ddq_type; + /* * We can encounter an uninitialized dquot buffer for 2 reasons: * 1. If we crash while deleting the quotainode(s), and those blks got @@ -60,11 +61,19 @@ xfs_dquot_verify( if (ddq->d_version != XFS_DQUOT_VERSION) return __this_address; - if (type && ddq->d_flags != type) + if (ddq->d_type & ~XFS_DQTYPE_ANY) return __this_address; - if (ddq->d_flags != XFS_DQ_USER && - ddq->d_flags != XFS_DQ_PROJ && - ddq->d_flags != XFS_DQ_GROUP) + ddq_type = ddq->d_type & XFS_DQTYPE_REC_MASK; + if (ddq_type != XFS_DQTYPE_USER && + ddq_type != XFS_DQTYPE_PROJ && + ddq_type != XFS_DQTYPE_GROUP) + return __this_address; + + if ((ddq->d_type & XFS_DQTYPE_BIGTIME) && + !xfs_has_bigtime(mp)) + return __this_address; + + if ((ddq->d_type & XFS_DQTYPE_BIGTIME) && !ddq->d_id) return __this_address; if (id != -1 && id != be32_to_cpu(ddq->d_id)) @@ -95,14 +104,13 @@ xfs_failaddr_t xfs_dqblk_verify( struct xfs_mount *mp, struct xfs_dqblk *dqb, - xfs_dqid_t id, - uint type) /* used only during quotacheck */ + xfs_dqid_t id) /* used only during quotacheck */ { - if (xfs_sb_version_hascrc(&mp->m_sb) && + if (xfs_has_crc(mp) && !uuid_equal(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - return xfs_dquot_verify(mp, &dqb->dd_diskdq, id, type); + return xfs_dquot_verify(mp, &dqb->dd_diskdq, id); } /* @@ -113,20 +121,20 @@ xfs_dqblk_repair( struct xfs_mount *mp, struct xfs_dqblk *dqb, xfs_dqid_t id, - uint type) + xfs_dqtype_t type) { /* * Typically, a repair is only requested by quotacheck. */ ASSERT(id != -1); - memset(dqb, 0, sizeof(xfs_dqblk_t)); + memset(dqb, 0, sizeof(struct xfs_dqblk)); dqb->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); dqb->dd_diskdq.d_version = XFS_DQUOT_VERSION; - dqb->dd_diskdq.d_flags = type; + dqb->dd_diskdq.d_type = type; dqb->dd_diskdq.d_id = cpu_to_be32(id); - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { uuid_copy(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid); xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); @@ -143,7 +151,7 @@ xfs_dquot_buf_verify_crc( int ndquots; int i; - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return true; /* @@ -205,7 +213,7 @@ xfs_dquot_buf_verify( if (i == 0) id = be32_to_cpu(ddq->d_id); - fa = xfs_dqblk_verify(mp, &dqb[i], id + i, 0); + fa = xfs_dqblk_verify(mp, &dqb[i], id + i); if (fa) { if (!readahead) xfs_buf_verifier_error(bp, -EFSCORRUPTED, @@ -287,3 +295,31 @@ const struct xfs_buf_ops xfs_dquot_buf_ra_ops = { .verify_read = xfs_dquot_buf_readahead_verify, .verify_write = xfs_dquot_buf_write_verify, }; + +/* Convert an on-disk timer value into an incore timer value. */ +time64_t +xfs_dquot_from_disk_ts( + struct xfs_disk_dquot *ddq, + __be32 dtimer) +{ + uint32_t t = be32_to_cpu(dtimer); + + if (t != 0 && (ddq->d_type & XFS_DQTYPE_BIGTIME)) + return xfs_dq_bigtime_to_unix(t); + + return t; +} + +/* Convert an incore timer value into an on-disk timer value. */ +__be32 +xfs_dquot_to_disk_ts( + struct xfs_dquot *dqp, + time64_t timer) +{ + uint32_t t = timer; + + if (timer != 0 && (dqp->q_type & XFS_DQTYPE_BIGTIME)) + t = xfs_dq_unix_to_bigtime(timer); + + return cpu_to_be32(t); +} diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 79e6c4fb1d8a..5362908164b0 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0+ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * Copyright (C) 2017 Oracle. @@ -55,7 +55,14 @@ #define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32 #define XFS_ERRTAG_FORCE_SUMMARY_RECALC 33 #define XFS_ERRTAG_IUNLINK_FALLBACK 34 -#define XFS_ERRTAG_MAX 35 +#define XFS_ERRTAG_BUF_IOERROR 35 +#define XFS_ERRTAG_REDUCE_MAX_IEXTENTS 36 +#define XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT 37 +#define XFS_ERRTAG_AG_RESV_FAIL 38 +#define XFS_ERRTAG_LARP 39 +#define XFS_ERRTAG_DA_LEAF_SPLIT 40 +#define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41 +#define XFS_ERRTAG_MAX 42 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -95,5 +102,12 @@ #define XFS_RANDOM_FORCE_SCRUB_REPAIR 1 #define XFS_RANDOM_FORCE_SUMMARY_RECALC 1 #define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_BUF_IOERROR XFS_RANDOM_DEFAULT +#define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1 +#define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1 +#define XFS_RANDOM_AG_RESV_FAIL 1 +#define XFS_RANDOM_LARP 1 +#define XFS_RANDOM_DA_LEAF_SPLIT 1 +#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 77e9fa385980..371dc07233e0 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. @@ -9,7 +9,7 @@ /* * XFS On Disk Format Definitions * - * This header file defines all the on-disk format definitions for + * This header file defines all the on-disk format definitions for * general XFS objects. Directory and attribute related objects are defined in * xfs_da_format.h, which log and log item formats are defined in * xfs_log_format.h. Everything else goes here. @@ -184,7 +184,7 @@ typedef struct xfs_sb { * Superblock - on disk version. Must match the in core version above. * Must be padded to 64 bit alignment. */ -typedef struct xfs_dsb { +struct xfs_dsb { __be32 sb_magicnum; /* magic number == XFS_SB_MAGIC */ __be32 sb_blocksize; /* logical block size, bytes */ __be64 sb_dblocks; /* number of data blocks */ @@ -263,8 +263,7 @@ typedef struct xfs_dsb { uuid_t sb_meta_uuid; /* metadata file system unique id */ /* must be padded to 64 bit alignment */ -} xfs_dsb_t; - +}; /* * Misc. Flags - warning - these will be cleared by xfs_repair unless @@ -280,37 +279,9 @@ typedef struct xfs_dsb { #define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS) -/* - * The first XFS version we support is a v4 superblock with V2 directories. - */ -static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp) -{ - if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)) - return false; - if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT)) - return false; - - /* check for unknown features in the fs */ - if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || - ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && - (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) - return false; - - return true; -} - -static inline bool xfs_sb_good_version(struct xfs_sb *sbp) -{ - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) - return true; - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) - return xfs_sb_good_v4_features(sbp); - return false; -} - -static inline bool xfs_sb_version_hasrealtime(struct xfs_sb *sbp) +static inline bool xfs_sb_is_v5(struct xfs_sb *sbp) { - return sbp->sb_rblocks > 0; + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; } /* @@ -322,9 +293,10 @@ static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp) return sbp->sb_bad_features2 != sbp->sb_features2; } -static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp) +static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp) { - return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT); + return xfs_sb_is_v5(sbp) || + (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); } static inline void xfs_sb_version_addattr(struct xfs_sb *sbp) @@ -332,87 +304,18 @@ static inline void xfs_sb_version_addattr(struct xfs_sb *sbp) sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT; } -static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp) -{ - return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT); -} - static inline void xfs_sb_version_addquota(struct xfs_sb *sbp) { sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT; } -static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || - (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT)); -} - -static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp) -{ - return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); -} - -static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || - (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); -} - -static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp) -{ - return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT); -} - -static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp) -{ - return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); -} - -static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || - (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); -} - -/* - * sb_features2 bit version macros. - */ -static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || - (xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)); -} - -static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || - (xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT)); -} - static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp) { sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT; } -static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp) -{ - sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; - if (!sbp->sb_features2) - sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; -} - -static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || - (xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT)); -} - -static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp) +static inline void xfs_sb_version_addprojid32(struct xfs_sb *sbp) { sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT; @@ -449,10 +352,12 @@ xfs_sb_has_compat_feature( #define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */ #define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */ #define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */ +#define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */ #define XFS_SB_FEAT_RO_COMPAT_ALL \ (XFS_SB_FEAT_RO_COMPAT_FINOBT | \ XFS_SB_FEAT_RO_COMPAT_RMAPBT | \ - XFS_SB_FEAT_RO_COMPAT_REFLINK) + XFS_SB_FEAT_RO_COMPAT_REFLINK| \ + XFS_SB_FEAT_RO_COMPAT_INOBTCNT) #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL static inline bool xfs_sb_has_ro_compat_feature( @@ -465,10 +370,16 @@ xfs_sb_has_ro_compat_feature( #define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ #define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */ #define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */ +#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */ +#define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4) /* needs xfs_repair */ +#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */ #define XFS_SB_FEAT_INCOMPAT_ALL \ (XFS_SB_FEAT_INCOMPAT_FTYPE| \ XFS_SB_FEAT_INCOMPAT_SPINODES| \ - XFS_SB_FEAT_INCOMPAT_META_UUID) + XFS_SB_FEAT_INCOMPAT_META_UUID| \ + XFS_SB_FEAT_INCOMPAT_BIGTIME| \ + XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR| \ + XFS_SB_FEAT_INCOMPAT_NREXT64) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool @@ -479,7 +390,9 @@ xfs_sb_has_incompat_feature( return (sbp->sb_features_incompat & feature) != 0; } -#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0 +#define XFS_SB_FEAT_INCOMPAT_LOG_XATTRS (1 << 0) /* Delayed Attributes */ +#define XFS_SB_FEAT_INCOMPAT_LOG_ALL \ + (XFS_SB_FEAT_INCOMPAT_LOG_XATTRS) #define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL static inline bool xfs_sb_has_incompat_log_feature( @@ -489,67 +402,27 @@ xfs_sb_has_incompat_log_feature( return (sbp->sb_features_log_incompat & feature) != 0; } -/* - * V5 superblock specific feature checks - */ -static inline bool xfs_sb_version_hascrc(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; -} - -static inline bool xfs_sb_version_has_pquotino(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; -} - -static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && - xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) || - (xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE)); -} - -static inline bool xfs_sb_version_hasfinobt(xfs_sb_t *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && - (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT); -} - -static inline bool xfs_sb_version_hassparseinodes(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && - xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_SPINODES); -} - -/* - * XFS_SB_FEAT_INCOMPAT_META_UUID indicates that the metadata UUID - * is stored separately from the user-visible UUID; this allows the - * user-visible UUID to be changed on V5 filesystems which have a - * filesystem UUID stamped into every piece of metadata. - */ -static inline bool xfs_sb_version_hasmetauuid(struct xfs_sb *sbp) +static inline void +xfs_sb_remove_incompat_log_features( + struct xfs_sb *sbp) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && - (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID); + sbp->sb_features_log_incompat &= ~XFS_SB_FEAT_INCOMPAT_LOG_ALL; } -static inline bool xfs_sb_version_hasrmapbt(struct xfs_sb *sbp) +static inline void +xfs_sb_add_incompat_log_features( + struct xfs_sb *sbp, + unsigned int features) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && - (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT); + sbp->sb_features_log_incompat |= features; } -static inline bool xfs_sb_version_hasreflink(struct xfs_sb *sbp) +static inline bool xfs_sb_version_haslogxattrs(struct xfs_sb *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && - (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK); + return xfs_sb_is_v5(sbp) && (sbp->sb_features_log_incompat & + XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); } -/* - * end of superblock version macros - */ - static inline bool xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) { @@ -560,7 +433,6 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) #define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */ #define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR) -#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr)) #define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d)) #define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \ @@ -588,7 +460,6 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) #define XFS_B_TO_FSB(mp,b) \ ((((uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog) #define XFS_B_TO_FSBT(mp,b) (((uint64_t)(b)) >> (mp)->m_sb.sb_blocklog) -#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask) /* * Allocation group header @@ -663,26 +534,26 @@ typedef struct xfs_agf { #define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc) -#define XFS_AGF_MAGICNUM 0x00000001 -#define XFS_AGF_VERSIONNUM 0x00000002 -#define XFS_AGF_SEQNO 0x00000004 -#define XFS_AGF_LENGTH 0x00000008 -#define XFS_AGF_ROOTS 0x00000010 -#define XFS_AGF_LEVELS 0x00000020 -#define XFS_AGF_FLFIRST 0x00000040 -#define XFS_AGF_FLLAST 0x00000080 -#define XFS_AGF_FLCOUNT 0x00000100 -#define XFS_AGF_FREEBLKS 0x00000200 -#define XFS_AGF_LONGEST 0x00000400 -#define XFS_AGF_BTREEBLKS 0x00000800 -#define XFS_AGF_UUID 0x00001000 -#define XFS_AGF_RMAP_BLOCKS 0x00002000 -#define XFS_AGF_REFCOUNT_BLOCKS 0x00004000 -#define XFS_AGF_REFCOUNT_ROOT 0x00008000 -#define XFS_AGF_REFCOUNT_LEVEL 0x00010000 -#define XFS_AGF_SPARE64 0x00020000 +#define XFS_AGF_MAGICNUM (1u << 0) +#define XFS_AGF_VERSIONNUM (1u << 1) +#define XFS_AGF_SEQNO (1u << 2) +#define XFS_AGF_LENGTH (1u << 3) +#define XFS_AGF_ROOTS (1u << 4) +#define XFS_AGF_LEVELS (1u << 5) +#define XFS_AGF_FLFIRST (1u << 6) +#define XFS_AGF_FLLAST (1u << 7) +#define XFS_AGF_FLCOUNT (1u << 8) +#define XFS_AGF_FREEBLKS (1u << 9) +#define XFS_AGF_LONGEST (1u << 10) +#define XFS_AGF_BTREEBLKS (1u << 11) +#define XFS_AGF_UUID (1u << 12) +#define XFS_AGF_RMAP_BLOCKS (1u << 13) +#define XFS_AGF_REFCOUNT_BLOCKS (1u << 14) +#define XFS_AGF_REFCOUNT_ROOT (1u << 15) +#define XFS_AGF_REFCOUNT_LEVEL (1u << 16) +#define XFS_AGF_SPARE64 (1u << 17) #define XFS_AGF_NUM_BITS 18 -#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) +#define XFS_AGF_ALL_BITS ((1u << XFS_AGF_NUM_BITS) - 1) #define XFS_AGF_FLAGS \ { XFS_AGF_MAGICNUM, "MAGICNUM" }, \ @@ -707,7 +578,6 @@ typedef struct xfs_agf { /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) #define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) -#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr)) /* * Size of the unlinked inode hash table in the agi. @@ -750,32 +620,35 @@ typedef struct xfs_agi { __be32 agi_free_root; /* root of the free inode btree */ __be32 agi_free_level;/* levels in free inode btree */ + __be32 agi_iblocks; /* inobt blocks used */ + __be32 agi_fblocks; /* finobt blocks used */ + /* structure must be padded to 64 bit alignment */ } xfs_agi_t; #define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc) -#define XFS_AGI_MAGICNUM (1 << 0) -#define XFS_AGI_VERSIONNUM (1 << 1) -#define XFS_AGI_SEQNO (1 << 2) -#define XFS_AGI_LENGTH (1 << 3) -#define XFS_AGI_COUNT (1 << 4) -#define XFS_AGI_ROOT (1 << 5) -#define XFS_AGI_LEVEL (1 << 6) -#define XFS_AGI_FREECOUNT (1 << 7) -#define XFS_AGI_NEWINO (1 << 8) -#define XFS_AGI_DIRINO (1 << 9) -#define XFS_AGI_UNLINKED (1 << 10) +#define XFS_AGI_MAGICNUM (1u << 0) +#define XFS_AGI_VERSIONNUM (1u << 1) +#define XFS_AGI_SEQNO (1u << 2) +#define XFS_AGI_LENGTH (1u << 3) +#define XFS_AGI_COUNT (1u << 4) +#define XFS_AGI_ROOT (1u << 5) +#define XFS_AGI_LEVEL (1u << 6) +#define XFS_AGI_FREECOUNT (1u << 7) +#define XFS_AGI_NEWINO (1u << 8) +#define XFS_AGI_DIRINO (1u << 9) +#define XFS_AGI_UNLINKED (1u << 10) #define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */ -#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1) -#define XFS_AGI_FREE_ROOT (1 << 11) -#define XFS_AGI_FREE_LEVEL (1 << 12) -#define XFS_AGI_NUM_BITS_R2 13 +#define XFS_AGI_ALL_BITS_R1 ((1u << XFS_AGI_NUM_BITS_R1) - 1) +#define XFS_AGI_FREE_ROOT (1u << 11) +#define XFS_AGI_FREE_LEVEL (1u << 12) +#define XFS_AGI_IBLOCKS (1u << 13) /* both inobt/finobt block counters */ +#define XFS_AGI_NUM_BITS_R2 14 /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) #define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) -#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr)) /* * The third a.g. block contains the a.g. freelist, an array @@ -783,21 +656,15 @@ typedef struct xfs_agi { */ #define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) #define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) -#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr)) - -#define XFS_BUF_TO_AGFL_BNO(mp, bp) \ - (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ - &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \ - (__be32 *)(bp)->b_addr) +#define XFS_BUF_TO_AGFL(bp) ((struct xfs_agfl *)((bp)->b_addr)) -typedef struct xfs_agfl { +struct xfs_agfl { __be32 agfl_magicnum; __be32 agfl_seqno; uuid_t agfl_uuid; __be64 agfl_lsn; __be32 agfl_crc; - __be32 agfl_bno[]; /* actually xfs_agfl_size(mp) */ -} __attribute__((packed)) xfs_agfl_t; +} __attribute__((packed)); #define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc) @@ -823,10 +690,87 @@ typedef struct xfs_agfl { ASSERT(xfs_daddr_to_agno(mp, d) == \ xfs_daddr_to_agno(mp, (d) + (len) - 1))) -typedef struct xfs_timestamp { +/* + * XFS Timestamps + * ============== + * + * Traditional ondisk inode timestamps consist of signed 32-bit counters for + * seconds and nanoseconds; time zero is the Unix epoch, Jan 1 00:00:00 UTC + * 1970, which means that the timestamp epoch is the same as the Unix epoch. + * Therefore, the ondisk min and max defined here can be used directly to + * constrain the incore timestamps on a Unix system. Note that we actually + * encode a __be64 value on disk. + * + * When the bigtime feature is enabled, ondisk inode timestamps become an + * unsigned 64-bit nanoseconds counter. This means that the bigtime inode + * timestamp epoch is the start of the classic timestamp range, which is + * Dec 13 20:45:52 UTC 1901. Because the epochs are not the same, callers + * /must/ use the bigtime conversion functions when encoding and decoding raw + * timestamps. + */ +typedef __be64 xfs_timestamp_t; + +/* Legacy timestamp encoding format. */ +struct xfs_legacy_timestamp { __be32 t_sec; /* timestamp seconds */ __be32 t_nsec; /* timestamp nanoseconds */ -} xfs_timestamp_t; +}; + +/* + * Smallest possible ondisk seconds value with traditional timestamps. This + * corresponds exactly with the incore timestamp Dec 13 20:45:52 UTC 1901. + */ +#define XFS_LEGACY_TIME_MIN ((int64_t)S32_MIN) + +/* + * Largest possible ondisk seconds value with traditional timestamps. This + * corresponds exactly with the incore timestamp Jan 19 03:14:07 UTC 2038. + */ +#define XFS_LEGACY_TIME_MAX ((int64_t)S32_MAX) + +/* + * Smallest possible ondisk seconds value with bigtime timestamps. This + * corresponds (after conversion to a Unix timestamp) with the traditional + * minimum timestamp of Dec 13 20:45:52 UTC 1901. + */ +#define XFS_BIGTIME_TIME_MIN ((int64_t)0) + +/* + * Largest supported ondisk seconds value with bigtime timestamps. This + * corresponds (after conversion to a Unix timestamp) with an incore timestamp + * of Jul 2 20:20:24 UTC 2486. + * + * We round down the ondisk limit so that the bigtime quota and inode max + * timestamps will be the same. + */ +#define XFS_BIGTIME_TIME_MAX ((int64_t)((-1ULL / NSEC_PER_SEC) & ~0x3ULL)) + +/* + * Bigtime epoch is set exactly to the minimum time value that a traditional + * 32-bit timestamp can represent when using the Unix epoch as a reference. + * Hence the Unix epoch is at a fixed offset into the supported bigtime + * timestamp range. + * + * The bigtime epoch also matches the minimum value an on-disk 32-bit XFS + * timestamp can represent so we will not lose any fidelity in converting + * to/from unix and bigtime timestamps. + * + * The following conversion factor converts a seconds counter from the Unix + * epoch to the bigtime epoch. + */ +#define XFS_BIGTIME_EPOCH_OFFSET (-(int64_t)S32_MIN) + +/* Convert a timestamp from the Unix epoch to the bigtime epoch. */ +static inline uint64_t xfs_unix_to_bigtime(time64_t unix_seconds) +{ + return (uint64_t)unix_seconds + XFS_BIGTIME_EPOCH_OFFSET; +} + +/* Convert a timestamp from the bigtime epoch to the Unix epoch. */ +static inline time64_t xfs_bigtime_to_unix(uint64_t ondisk_seconds) +{ + return (time64_t)ondisk_seconds - XFS_BIGTIME_EPOCH_OFFSET; +} /* * On-disk inode structure. @@ -838,15 +782,14 @@ typedef struct xfs_timestamp { * attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros * below. * - * There is a very similar struct icdinode in xfs_inode which matches the - * layout of the first 96 bytes of this structure, but is kept in native - * format instead of big endian. + * There is a very similar struct xfs_log_dinode which matches the layout of + * this structure, but is kept in native format instead of big endian. * * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed * padding field for v3 inodes. */ #define XFS_DINODE_MAGIC 0x494e /* 'IN' */ -typedef struct xfs_dinode { +struct xfs_dinode { __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ __be16 di_mode; /* mode and type of file */ __u8 di_version; /* inode version */ @@ -857,16 +800,41 @@ typedef struct xfs_dinode { __be32 di_nlink; /* number of links to file */ __be16 di_projid_lo; /* lower part of owner's project id */ __be16 di_projid_hi; /* higher part owner's project id */ - __u8 di_pad[6]; /* unused, zeroed space */ - __be16 di_flushiter; /* incremented on flush */ + union { + /* Number of data fork extents if NREXT64 is set */ + __be64 di_big_nextents; + + /* Padding for V3 inodes without NREXT64 set. */ + __be64 di_v3_pad; + + /* Padding and inode flush counter for V2 inodes. */ + struct { + __u8 di_v2_pad[6]; + __be16 di_flushiter; + }; + }; xfs_timestamp_t di_atime; /* time last accessed */ xfs_timestamp_t di_mtime; /* time last modified */ xfs_timestamp_t di_ctime; /* time created/inode modified */ __be64 di_size; /* number of bytes in file */ __be64 di_nblocks; /* # of direct & btree blocks used */ __be32 di_extsize; /* basic/minimum extent size for file */ - __be32 di_nextents; /* number of extents in data fork */ - __be16 di_anextents; /* number of extents in attribute fork*/ + union { + /* + * For V2 inodes and V3 inodes without NREXT64 set, this + * is the number of data and attr fork extents. + */ + struct { + __be32 di_nextents; + __be16 di_anextents; + } __packed; + + /* Number of attr fork extents if NREXT64 is set. */ + struct { + __be32 di_big_anextents; + __be16 di_nrext64_pad; + } __packed; + } __packed; __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */ __s8 di_aformat; /* format of attr fork's data */ __be32 di_dmevmask; /* DMIG event mask */ @@ -891,7 +859,7 @@ typedef struct xfs_dinode { uuid_t di_uuid; /* UUID of the filesystem */ /* structure must be padded to 64 bit alignment */ -} xfs_dinode_t; +}; #define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc) @@ -936,6 +904,56 @@ enum xfs_dinode_fmt { { XFS_DINODE_FMT_UUID, "uuid" } /* + * Max values for extnum and aextnum. + * + * The original on-disk extent counts were held in signed fields, resulting in + * maximum extent counts of 2^31 and 2^15 for the data and attr forks + * respectively. Similarly the maximum extent length is limited to 2^21 blocks + * by the 21-bit wide blockcount field of a BMBT extent record. + * + * The newly introduced data fork extent counter can hold a 64-bit value, + * however the maximum number of extents in a file is also limited to 2^54 + * extents by the 54-bit wide startoff field of a BMBT extent record. + * + * It is further limited by the maximum supported file size of 2^63 + * *bytes*. This leads to a maximum extent count for maximally sized filesystem + * blocks (64kB) of: + * + * 2^63 bytes / 2^16 bytes per block = 2^47 blocks + * + * Rounding up 47 to the nearest multiple of bits-per-byte results in 48. Hence + * 2^48 was chosen as the maximum data fork extent count. + * + * The maximum file size that can be represented by the data fork extent counter + * in the worst case occurs when all extents are 1 block in length and each + * block is 1KB in size. + * + * With XFS_MAX_EXTCNT_DATA_FORK_SMALL representing maximum extent count and + * with 1KB sized blocks, a file can reach upto, + * 1KB * (2^31) = 2TB + * + * This is much larger than the theoretical maximum size of a directory + * i.e. XFS_DIR2_SPACE_SIZE * XFS_DIR2_MAX_SPACES = ~96GB. + * + * Hence, a directory inode can never overflow its data fork extent counter. + */ +#define XFS_MAX_EXTCNT_DATA_FORK_LARGE ((xfs_extnum_t)((1ULL << 48) - 1)) +#define XFS_MAX_EXTCNT_ATTR_FORK_LARGE ((xfs_extnum_t)((1ULL << 32) - 1)) +#define XFS_MAX_EXTCNT_DATA_FORK_SMALL ((xfs_extnum_t)((1ULL << 31) - 1)) +#define XFS_MAX_EXTCNT_ATTR_FORK_SMALL ((xfs_extnum_t)((1ULL << 15) - 1)) + +/* + * When we upgrade an inode to the large extent counts, the maximum value by + * which the extent count can increase is bound by the change in size of the + * on-disk field. No upgrade operation should ever be adding more than a few + * tens of extents, so if we get a really large value it is a sign of a code bug + * or corruption. + */ +#define XFS_MAX_EXTCNT_UPGRADE_NR \ + min(XFS_MAX_EXTCNT_ATTR_FORK_LARGE - XFS_MAX_EXTCNT_ATTR_FORK_SMALL, \ + XFS_MAX_EXTCNT_DATA_FORK_LARGE - XFS_MAX_EXTCNT_DATA_FORK_SMALL) + +/* * Inode minimum and maximum sizes. */ #define XFS_DINODE_MIN_LOG 8 @@ -946,23 +964,22 @@ enum xfs_dinode_fmt { /* * Inode size for given fs. */ -#define XFS_LITINO(mp, version) \ - ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version))) +#define XFS_DINODE_SIZE(mp) \ + (xfs_has_v3inodes(mp) ? \ + sizeof(struct xfs_dinode) : \ + offsetof(struct xfs_dinode, di_crc)) +#define XFS_LITINO(mp) \ + ((mp)->m_sb.sb_inodesize - XFS_DINODE_SIZE(mp)) /* * Inode data & attribute fork sizes, per inode. */ -#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0) #define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3)) #define XFS_DFORK_DSIZE(dip,mp) \ - (XFS_DFORK_Q(dip) ? \ - XFS_DFORK_BOFF(dip) : \ - XFS_LITINO(mp, (dip)->di_version)) + ((dip)->di_forkoff ? XFS_DFORK_BOFF(dip) : XFS_LITINO(mp)) #define XFS_DFORK_ASIZE(dip,mp) \ - (XFS_DFORK_Q(dip) ? \ - XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \ - 0) + ((dip)->di_forkoff ? XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : 0) #define XFS_DFORK_SIZE(dip,mp,w) \ ((w) == XFS_DATA_FORK ? \ XFS_DFORK_DSIZE(dip, mp) : \ @@ -985,10 +1002,6 @@ enum xfs_dinode_fmt { ((w) == XFS_DATA_FORK ? \ (dip)->di_format : \ (dip)->di_aformat) -#define XFS_DFORK_NEXTENTS(dip,w) \ - ((w) == XFS_DATA_FORK ? \ - be32_to_cpu((dip)->di_nextents) : \ - be16_to_cpu((dip)->di_anextents)) /* * For block and character special files the 32bit dev_t is stored at the @@ -1054,12 +1067,31 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */ #define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */ #define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ +#define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */ +#define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */ + #define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) #define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT) #define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) +#define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT) +#define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT) #define XFS_DIFLAG2_ANY \ - (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE) + (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \ + XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64) + +static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) +{ + return dip->di_version >= 3 && + (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_BIGTIME)); +} + +static inline bool xfs_dinode_has_large_extent_counts( + const struct xfs_dinode *dip) +{ + return dip->di_version >= 3 && + (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_NREXT64)); +} /* * Inode number format: @@ -1142,16 +1174,111 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ #define XFS_DQUOT_VERSION (uint8_t)0x01 /* latest version number */ +#define XFS_DQTYPE_USER (1u << 0) /* user dquot record */ +#define XFS_DQTYPE_PROJ (1u << 1) /* project dquot record */ +#define XFS_DQTYPE_GROUP (1u << 2) /* group dquot record */ +#define XFS_DQTYPE_BIGTIME (1u << 7) /* large expiry timestamps */ + +/* bitmask to determine if this is a user/group/project dquot */ +#define XFS_DQTYPE_REC_MASK (XFS_DQTYPE_USER | \ + XFS_DQTYPE_PROJ | \ + XFS_DQTYPE_GROUP) + +#define XFS_DQTYPE_ANY (XFS_DQTYPE_REC_MASK | \ + XFS_DQTYPE_BIGTIME) + +/* + * XFS Quota Timers + * ================ + * + * Traditional quota grace period expiration timers are an unsigned 32-bit + * seconds counter; time zero is the Unix epoch, Jan 1 00:00:01 UTC 1970. + * Note that an expiration value of zero means that the quota limit has not + * been reached, and therefore no expiration has been set. Therefore, the + * ondisk min and max defined here can be used directly to constrain the incore + * quota expiration timestamps on a Unix system. + * + * When bigtime is enabled, we trade two bits of precision to expand the + * expiration timeout range to match that of big inode timestamps. The min and + * max recorded here are the on-disk limits, not a Unix timestamp. + * + * The grace period for each quota type is stored in the root dquot (id = 0) + * and is applied to a non-root dquot when it exceeds the soft or hard limits. + * The length of quota grace periods are unsigned 32-bit quantities measured in + * units of seconds. A value of zero means to use the default period. + */ + +/* + * Smallest possible ondisk quota expiration value with traditional timestamps. + * This corresponds exactly with the incore expiration Jan 1 00:00:01 UTC 1970. + */ +#define XFS_DQ_LEGACY_EXPIRY_MIN ((int64_t)1) + +/* + * Largest possible ondisk quota expiration value with traditional timestamps. + * This corresponds exactly with the incore expiration Feb 7 06:28:15 UTC 2106. + */ +#define XFS_DQ_LEGACY_EXPIRY_MAX ((int64_t)U32_MAX) + +/* + * Smallest possible ondisk quota expiration value with bigtime timestamps. + * This corresponds (after conversion to a Unix timestamp) with the incore + * expiration of Jan 1 00:00:04 UTC 1970. + */ +#define XFS_DQ_BIGTIME_EXPIRY_MIN (XFS_DQ_LEGACY_EXPIRY_MIN) + +/* + * Largest supported ondisk quota expiration value with bigtime timestamps. + * This corresponds (after conversion to a Unix timestamp) with an incore + * expiration of Jul 2 20:20:24 UTC 2486. + * + * The ondisk field supports values up to -1U, which corresponds to an incore + * expiration in 2514. This is beyond the maximum the bigtime inode timestamp, + * so we cap the maximum bigtime quota expiration to the max inode timestamp. + */ +#define XFS_DQ_BIGTIME_EXPIRY_MAX ((int64_t)4074815106U) + /* - * This is the main portion of the on-disk representation of quota - * information for a user. This is the q_core of the struct xfs_dquot that - * is kept in kernel memory. We pad this with some more expansion room - * to construct the on disk structure. + * The following conversion factors assist in converting a quota expiration + * timestamp between the incore and ondisk formats. + */ +#define XFS_DQ_BIGTIME_SHIFT (2) +#define XFS_DQ_BIGTIME_SLACK ((int64_t)(1ULL << XFS_DQ_BIGTIME_SHIFT) - 1) + +/* Convert an incore quota expiration timestamp to an ondisk bigtime value. */ +static inline uint32_t xfs_dq_unix_to_bigtime(time64_t unix_seconds) +{ + /* + * Round the expiration timestamp up to the nearest bigtime timestamp + * that we can store, to give users the most time to fix problems. + */ + return ((uint64_t)unix_seconds + XFS_DQ_BIGTIME_SLACK) >> + XFS_DQ_BIGTIME_SHIFT; +} + +/* Convert an ondisk bigtime quota expiration value to an incore timestamp. */ +static inline time64_t xfs_dq_bigtime_to_unix(uint32_t ondisk_seconds) +{ + return (time64_t)ondisk_seconds << XFS_DQ_BIGTIME_SHIFT; +} + +/* + * Default quota grace periods, ranging from zero (use the compiled defaults) + * to ~136 years. These are applied to a non-root dquot that has exceeded + * either limit. + */ +#define XFS_DQ_GRACE_MIN ((int64_t)0) +#define XFS_DQ_GRACE_MAX ((int64_t)U32_MAX) + +/* + * This is the main portion of the on-disk representation of quota information + * for a user. We pad this with some more expansion room to construct the on + * disk structure. */ struct xfs_disk_dquot { __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */ __u8 d_version; /* dquot version */ - __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */ + __u8 d_type; /* XFS_DQTYPE_USER/PROJ/GROUP */ __be32 d_id; /* user,project,group id */ __be64 d_blk_hardlimit;/* absolute limit on disk blks */ __be64 d_blk_softlimit;/* preferred limit on disk blks */ @@ -1177,7 +1304,7 @@ struct xfs_disk_dquot { * This is what goes on disk. This is separated from the xfs_disk_dquot because * carrying the unnecessary padding would be a waste of memory. */ -typedef struct xfs_dqblk { +struct xfs_dqblk { struct xfs_disk_dquot dd_diskdq; /* portion living incore as well */ char dd_fill[4];/* filling for posterity */ @@ -1187,11 +1314,27 @@ typedef struct xfs_dqblk { __be32 dd_crc; /* checksum */ __be64 dd_lsn; /* last modification in log */ uuid_t dd_uuid; /* location information */ -} xfs_dqblk_t; +}; #define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc) /* + * This defines the unit of allocation of dquots. + * + * Currently, it is just one file system block, and a 4K blk contains 30 + * (136 * 30 = 4080) dquots. It's probably not worth trying to make + * this more dynamic. + * + * However, if this number is changed, we have to make sure that we don't + * implicitly assume that we do allocations in chunks of a single filesystem + * block in the dquot/xqm code. + * + * This is part of the ondisk format because the structure size is not a power + * of two, which leaves slack at the end of the disk block. + */ +#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 + +/* * Remote symlink format and access functions. */ #define XFS_SYMLINK_MAGIC 0x58534c4d /* XSLM */ @@ -1218,7 +1361,7 @@ struct xfs_dsymlink_hdr { #define XFS_SYMLINK_MAPS 3 #define XFS_SYMLINK_BUF_SPACE(mp, bufsize) \ - ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \ + ((bufsize) - (xfs_has_crc((mp)) ? \ sizeof(struct xfs_dsymlink_hdr) : 0)) @@ -1421,20 +1564,6 @@ struct xfs_rmap_rec { #define RMAPBT_UNUSED_OFFSET_BITLEN 7 #define RMAPBT_OFFSET_BITLEN 54 -#define XFS_RMAP_ATTR_FORK (1 << 0) -#define XFS_RMAP_BMBT_BLOCK (1 << 1) -#define XFS_RMAP_UNWRITTEN (1 << 2) -#define XFS_RMAP_KEY_FLAGS (XFS_RMAP_ATTR_FORK | \ - XFS_RMAP_BMBT_BLOCK) -#define XFS_RMAP_REC_FLAGS (XFS_RMAP_UNWRITTEN) -struct xfs_rmap_irec { - xfs_agblock_t rm_startblock; /* extent start block */ - xfs_extlen_t rm_blockcount; /* extent length */ - uint64_t rm_owner; /* extent owner */ - uint64_t rm_offset; /* offset within the owner */ - unsigned int rm_flags; /* state flags */ -}; - /* * Key structure * @@ -1450,7 +1579,7 @@ struct xfs_rmap_key { typedef __be32 xfs_rmap_ptr_t; #define XFS_RMAP_BLOCK(mp) \ - (xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \ + (xfs_has_finobt(((mp))) ? \ XFS_FIBT_BLOCK(mp) + 1 : \ XFS_IBT_BLOCK(mp) + 1) @@ -1483,7 +1612,7 @@ unsigned int xfs_refc_block(struct xfs_mount *mp); * on the startblock. This speeds up mount time deletion of stale * staging extents because they're all at the right side of the tree. */ -#define XFS_REFC_COW_START ((xfs_agblock_t)(1U << 31)) +#define XFS_REFC_COWFLAG (1U << 31) #define REFCNTBT_COWFLAG_BITLEN 1 #define REFCNTBT_AGBLOCK_BITLEN 31 @@ -1497,12 +1626,6 @@ struct xfs_refcount_key { __be32 rc_startblock; /* starting block number */ }; -struct xfs_refcount_irec { - xfs_agblock_t rc_startblock; /* starting block number */ - xfs_extlen_t rc_blockcount; /* count of free blocks */ - xfs_nlink_t rc_refcount; /* number of inodes linked here */ -}; - #define MAXREFCOUNT ((xfs_nlink_t)~0U) #define MAXREFCEXTLEN ((xfs_extlen_t)~0U) @@ -1542,6 +1665,8 @@ typedef struct xfs_bmdr_block { #define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1) #define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1) +#define XFS_MAX_BMBT_EXTLEN ((xfs_extlen_t)(BMBT_BLOCKCOUNT_MASK)) + /* * bmbt records have a file offset (block) field that is 54 bits wide, so this * is the largest xfs_fileoff_t that we ever expect to see. @@ -1673,7 +1798,7 @@ struct xfs_acl_entry { struct xfs_acl { __be32 acl_cnt; - struct xfs_acl_entry acl_entry[0]; + struct xfs_acl_entry acl_entry[]; }; /* @@ -1682,7 +1807,7 @@ struct xfs_acl { * limited only by the maximum size of the xattr that stores the information. */ #define XFS_ACL_MAX_ENTRIES(mp) \ - (xfs_sb_version_hascrc(&mp->m_sb) \ + (xfs_has_crc(mp) \ ? (XFS_XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \ sizeof(struct xfs_acl_entry) \ : 25) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index ef95ca07d084..1cfd5bc6520a 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: LGPL-2.1 +/* SPDX-License-Identifier: LGPL-2.1 */ /* * Copyright (c) 1995-2005 Silicon Graphics, Inc. * All Rights Reserved. @@ -65,7 +65,7 @@ struct getbmapx { /* bmv_iflags values - set by XFS_IOC_GETBMAPX caller. */ #define BMV_IF_ATTRFORK 0x1 /* return attr fork rather than data */ -#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */ +#define BMV_IF_NO_DMAPI_READ 0x2 /* Deprecated */ #define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */ #define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */ #define BMV_IF_NO_HOLES 0x10 /* Do not return holes */ @@ -93,21 +93,6 @@ struct getbmapx { #define XFS_FMR_OWN_DEFECTIVE FMR_OWNER('X', 8) /* bad blocks */ /* - * Structure for XFS_IOC_FSSETDM. - * For use by backup and restore programs to set the XFS on-disk inode - * fields di_dmevmask and di_dmstate. These must be set to exactly and - * only values previously obtained via xfs_bulkstat! (Specifically the - * struct xfs_bstat fields bs_dmevmask and bs_dmstate.) - */ -#ifndef HAVE_FSDMIDATA -struct fsdmidata { - __u32 fsd_dmevmask; /* corresponds to di_dmevmask */ - __u16 fsd_padding; - __u16 fsd_dmstate; /* corresponds to di_dmstate */ -}; -#endif - -/* * File segment locking set data type for 64 bit access. * Also used for all the RESV/FREE interfaces. */ @@ -249,6 +234,9 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_SPINODES (1 << 18) /* sparse inode chunks */ #define XFS_FSOP_GEOM_FLAGS_RMAPBT (1 << 19) /* reverse mapping btree */ #define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */ +#define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */ +#define XFS_FSOP_GEOM_FLAGS_INOBTCNT (1 << 22) /* inobt btree counter */ +#define XFS_FSOP_GEOM_FLAGS_NREXT64 (1 << 23) /* large extent counters */ /* * Minimum and maximum sizes need for growth checks. @@ -266,6 +254,8 @@ typedef struct xfs_fsop_resblks { */ #define XFS_MIN_AG_BYTES (1ULL << 24) /* 16 MB */ #define XFS_MAX_AG_BYTES (1ULL << 40) /* 1 TB */ +#define XFS_MAX_AG_BLOCKS (XFS_MAX_AG_BYTES / XFS_MIN_BLOCKSIZE) +#define XFS_MAX_CRC_AG_BLOCKS (XFS_MAX_AG_BYTES / XFS_MIN_CRC_BLOCKSIZE) /* keep the maximum size under 2^31 by a small amount */ #define XFS_MAX_LOG_BYTES \ @@ -388,7 +378,7 @@ struct xfs_bulkstat { uint32_t bs_extsize_blks; /* extent size hint, blocks */ uint32_t bs_nlink; /* number of links */ - uint32_t bs_extents; /* number of extents */ + uint32_t bs_extents; /* 32-bit data fork extent counter */ uint32_t bs_aextents; /* attribute number of extents */ uint16_t bs_version; /* structure version */ uint16_t bs_forkoff; /* inode fork offset in bytes */ @@ -397,8 +387,9 @@ struct xfs_bulkstat { uint16_t bs_checked; /* checked inode metadata */ uint16_t bs_mode; /* type and mode */ uint16_t bs_pad2; /* zeroed */ + uint64_t bs_extents64; /* 64-bit data fork extent counter */ - uint64_t bs_pad[7]; /* zeroed */ + uint64_t bs_pad[6]; /* zeroed */ }; #define XFS_BULKSTAT_VERSION_V1 (1) @@ -470,17 +461,28 @@ struct xfs_bulk_ireq { * Only return results from the specified @agno. If @ino is zero, start * with the first inode of @agno. */ -#define XFS_BULK_IREQ_AGNO (1 << 0) +#define XFS_BULK_IREQ_AGNO (1U << 0) /* * Return bulkstat information for a single inode, where @ino value is a * special value, not a literal inode number. See the XFS_BULK_IREQ_SPECIAL_* * values below. Not compatible with XFS_BULK_IREQ_AGNO. */ -#define XFS_BULK_IREQ_SPECIAL (1 << 1) +#define XFS_BULK_IREQ_SPECIAL (1U << 1) -#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \ - XFS_BULK_IREQ_SPECIAL) +/* + * Return data fork extent count via xfs_bulkstat->bs_extents64 field and assign + * 0 to xfs_bulkstat->bs_extents when the flag is set. Otherwise, use + * xfs_bulkstat->bs_extents for returning data fork extent count and set + * xfs_bulkstat->bs_extents64 to 0. In the second case, return -EOVERFLOW and + * assign 0 to xfs_bulkstat->bs_extents if data fork extent count is larger than + * XFS_MAX_EXTCNT_DATA_FORK_OLD. + */ +#define XFS_BULK_IREQ_NREXT64 (1U << 2) + +#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \ + XFS_BULK_IREQ_SPECIAL | \ + XFS_BULK_IREQ_NREXT64) /* Operate on the root directory inode. */ #define XFS_BULK_IREQ_SPECIAL_ROOT (1) @@ -558,20 +560,44 @@ typedef struct xfs_fsop_handlereq { /* * Compound structures for passing args through Handle Request interfaces - * xfs_fssetdm_by_handle, xfs_attrlist_by_handle, xfs_attrmulti_by_handle - * - ioctls: XFS_IOC_FSSETDM_BY_HANDLE, XFS_IOC_ATTRLIST_BY_HANDLE, and - * XFS_IOC_ATTRMULTI_BY_HANDLE + * xfs_attrlist_by_handle, xfs_attrmulti_by_handle + * - ioctls: XFS_IOC_ATTRLIST_BY_HANDLE, and XFS_IOC_ATTRMULTI_BY_HANDLE */ -typedef struct xfs_fsop_setdm_handlereq { - struct xfs_fsop_handlereq hreq; /* handle information */ - struct fsdmidata __user *data; /* DMAPI data */ -} xfs_fsop_setdm_handlereq_t; +/* + * Flags passed in xfs_attr_multiop.am_flags for the attr ioctl interface. + * + * NOTE: Must match the values declared in libattr without the XFS_IOC_ prefix. + */ +#define XFS_IOC_ATTR_ROOT 0x0002 /* use attrs in root namespace */ +#define XFS_IOC_ATTR_SECURE 0x0008 /* use attrs in security namespace */ +#define XFS_IOC_ATTR_CREATE 0x0010 /* fail if attr already exists */ +#define XFS_IOC_ATTR_REPLACE 0x0020 /* fail if attr does not exist */ typedef struct xfs_attrlist_cursor { __u32 opaque[4]; } xfs_attrlist_cursor_t; +/* + * Define how lists of attribute names are returned to userspace from the + * XFS_IOC_ATTRLIST_BY_HANDLE ioctl. struct xfs_attrlist is the header at the + * beginning of the returned buffer, and a each entry in al_offset contains the + * relative offset of an xfs_attrlist_ent containing the actual entry. + * + * NOTE: struct xfs_attrlist must match struct attrlist defined in libattr, and + * struct xfs_attrlist_ent must match struct attrlist_ent defined in libattr. + */ +struct xfs_attrlist { + __s32 al_count; /* number of entries in attrlist */ + __s32 al_more; /* T/F: more attrs (do call again) */ + __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */ +}; + +struct xfs_attrlist_ent { /* data from attr_list() */ + __u32 a_valuelen; /* number bytes in value of attr */ + char a_name[1]; /* attr name (NULL terminated) */ +}; + typedef struct xfs_fsop_attrlist_handlereq { struct xfs_fsop_handlereq hreq; /* handle interface structure */ struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */ @@ -589,7 +615,7 @@ typedef struct xfs_attr_multiop { void __user *am_attrname; void __user *am_attrvalue; __u32 am_length; - __u32 am_flags; + __u32 am_flags; /* XFS_IOC_ATTR_* */ } xfs_attr_multiop_t; typedef struct xfs_fsop_attrmulti_handlereq { @@ -686,34 +712,34 @@ struct xfs_scrub_metadata { #define XFS_SCRUB_TYPE_NR 25 /* i: Repair this metadata. */ -#define XFS_SCRUB_IFLAG_REPAIR (1 << 0) +#define XFS_SCRUB_IFLAG_REPAIR (1u << 0) /* o: Metadata object needs repair. */ -#define XFS_SCRUB_OFLAG_CORRUPT (1 << 1) +#define XFS_SCRUB_OFLAG_CORRUPT (1u << 1) /* * o: Metadata object could be optimized. It's not corrupt, but * we could improve on it somehow. */ -#define XFS_SCRUB_OFLAG_PREEN (1 << 2) +#define XFS_SCRUB_OFLAG_PREEN (1u << 2) /* o: Cross-referencing failed. */ -#define XFS_SCRUB_OFLAG_XFAIL (1 << 3) +#define XFS_SCRUB_OFLAG_XFAIL (1u << 3) /* o: Metadata object disagrees with cross-referenced metadata. */ -#define XFS_SCRUB_OFLAG_XCORRUPT (1 << 4) +#define XFS_SCRUB_OFLAG_XCORRUPT (1u << 4) /* o: Scan was not complete. */ -#define XFS_SCRUB_OFLAG_INCOMPLETE (1 << 5) +#define XFS_SCRUB_OFLAG_INCOMPLETE (1u << 5) /* o: Metadata object looked funny but isn't corrupt. */ -#define XFS_SCRUB_OFLAG_WARNING (1 << 6) +#define XFS_SCRUB_OFLAG_WARNING (1u << 6) /* * o: IFLAG_REPAIR was set but metadata object did not need fixing or * optimization and has therefore not been altered. */ -#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1 << 7) +#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1u << 7) #define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR) #define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \ @@ -747,15 +773,15 @@ struct xfs_scrub_metadata { * For 'documentation' purposed more than anything else, * the "cmd #" field reflects the IRIX fcntl number. */ -#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64) -#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64) +/* XFS_IOC_ALLOCSP ------- deprecated 10 */ +/* XFS_IOC_FREESP -------- deprecated 11 */ #define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr) #define XFS_IOC_FSGETXATTR FS_IOC_FSGETXATTR #define XFS_IOC_FSSETXATTR FS_IOC_FSSETXATTR -#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) -#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) +/* XFS_IOC_ALLOCSP64 ----- deprecated 36 */ +/* XFS_IOC_FREESP64 ------ deprecated 37 */ #define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap) -#define XFS_IOC_FSSETDM _IOW ('X', 39, struct fsdmidata) +/* XFS_IOC_FSSETDM ------- deprecated 39 */ #define XFS_IOC_RESVSP _IOW ('X', 40, struct xfs_flock64) #define XFS_IOC_UNRESVSP _IOW ('X', 41, struct xfs_flock64) #define XFS_IOC_RESVSP64 _IOW ('X', 42, struct xfs_flock64) @@ -797,7 +823,7 @@ struct xfs_scrub_metadata { #define XFS_IOC_FREEZE _IOWR('X', 119, int) /* aka FIFREEZE */ #define XFS_IOC_THAW _IOWR('X', 120, int) /* aka FITHAW */ -#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) +/* XFS_IOC_FSSETDM_BY_HANDLE -- deprecated 121 */ #define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) #define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq) #define XFS_IOC_FSGEOMETRY_V4 _IOR ('X', 124, struct xfs_fsop_geom_v4) diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index 272005ac8c88..99e796256c5d 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0+ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Copyright (C) 2019 Oracle. All Rights Reserved. * Author: Darrick J. Wong <darrick.wong@oracle.com> diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index bf161e930f1d..94db50eb706a 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -10,7 +10,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_btree.h" @@ -27,6 +26,7 @@ #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_rmap.h" +#include "xfs_ag.h" /* * Lookup a record by ino in the btree given by cur. @@ -58,7 +58,7 @@ xfs_inobt_update( union xfs_btree_rec rec; rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino); - if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) { + if (xfs_has_sparseinodes(cur->bc_mp)) { rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask); rec.inobt.ir_u.sp.ir_count = irec->ir_count; rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount; @@ -74,11 +74,11 @@ xfs_inobt_update( void xfs_inobt_btrec_to_irec( struct xfs_mount *mp, - union xfs_btree_rec *rec, + const union xfs_btree_rec *rec, struct xfs_inobt_rec_incore *irec) { irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); - if (xfs_sb_version_hassparseinodes(&mp->m_sb)) { + if (xfs_has_sparseinodes(mp)) { irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask); irec->ir_count = rec->inobt.ir_u.sp.ir_count; irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount; @@ -105,7 +105,6 @@ xfs_inobt_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_private.a.agno; union xfs_btree_rec *rec; int error; uint64_t realfree; @@ -116,7 +115,7 @@ xfs_inobt_get_rec( xfs_inobt_btrec_to_irec(mp, rec, irec); - if (!xfs_verify_agino(mp, agno, irec->ir_startino)) + if (!xfs_verify_agino(cur->bc_ag.pag, irec->ir_startino)) goto out_bad_rec; if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT || irec->ir_count > XFS_INODES_PER_CHUNK) @@ -137,7 +136,8 @@ xfs_inobt_get_rec( out_bad_rec: xfs_warn(mp, "%s Inode BTree record corruption in AG %d detected!", - cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free", agno); + cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free", + cur->bc_ag.pag->pag_agno); xfs_warn(mp, "start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x", irec->ir_startino, irec->ir_count, irec->ir_freecount, @@ -172,18 +172,17 @@ xfs_inobt_insert( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, + struct xfs_perag *pag, xfs_agino_t newino, xfs_agino_t newlen, xfs_btnum_t btnum) { struct xfs_btree_cur *cur; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); - xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); xfs_agino_t thisino; int i; int error; - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum); + cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, btnum); for (thisino = newino; thisino < newino + newlen; @@ -215,10 +214,9 @@ xfs_inobt_insert( * Verify that the number of free inodes in the AGI is correct. */ #ifdef DEBUG -STATIC int +static int xfs_check_agi_freecount( - struct xfs_btree_cur *cur, - struct xfs_agi *agi) + struct xfs_btree_cur *cur) { if (cur->bc_nlevels == 1) { xfs_inobt_rec_incore_t rec; @@ -243,13 +241,13 @@ xfs_check_agi_freecount( } } while (i == 1); - if (!XFS_FORCED_SHUTDOWN(cur->bc_mp)) - ASSERT(freecount == be32_to_cpu(agi->agi_freecount)); + if (!xfs_is_shutdown(cur->bc_mp)) + ASSERT(freecount == cur->bc_ag.pag->pagi_freecount); } return 0; } #else -#define xfs_check_agi_freecount(cur, agi) 0 +#define xfs_check_agi_freecount(cur) 0 #endif /* @@ -304,7 +302,7 @@ xfs_ialloc_inode_init( * That means for v3 inode we log the entire buffer rather than just the * inode cores. */ - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_v3inodes(mp)) { version = 3; ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno)); @@ -339,7 +337,6 @@ xfs_ialloc_inode_init( xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) { int ioffset = i << mp->m_sb.sb_inodelog; - uint isize = xfs_dinode_size(version); free = xfs_make_iptr(mp, fbuf, i); free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); @@ -356,7 +353,7 @@ xfs_ialloc_inode_init( } else if (tp) { /* just log the inode core */ xfs_trans_log_buf(tp, fbuf, ioffset, - ioffset + isize - 1); + ioffset + XFS_DINODE_SIZE(mp) - 1); } } @@ -520,18 +517,17 @@ xfs_inobt_insert_sprec( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, + struct xfs_perag *pag, int btnum, struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */ bool merge) /* merge or replace */ { struct xfs_btree_cur *cur; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); - xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); int error; int i; struct xfs_inobt_rec_incore rec; - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum); + cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, btnum); /* the new record is pre-aligned so we know where to look */ error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i); @@ -578,14 +574,14 @@ xfs_inobt_insert_sprec( goto error; } - trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino, + trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino, rec.ir_holemask, nrec->ir_startino, nrec->ir_holemask); /* merge to nrec to output the updated record */ __xfs_inobt_rec_merge(nrec, &rec); - trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino, + trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino, nrec->ir_holemask); error = xfs_inobt_rec_check_count(mp, nrec); @@ -606,28 +602,28 @@ error: } /* - * Allocate new inodes in the allocation group specified by agbp. - * Return 0 for success, else error code. + * Allocate new inodes in the allocation group specified by agbp. Returns 0 if + * inodes were allocated in this AG; -EAGAIN if there was no space in this AG so + * the caller knows it can try another AG, a hard -ENOSPC when over the maximum + * inode count threshold, or the usual negative error code for other errors. */ STATIC int xfs_ialloc_ag_alloc( struct xfs_trans *tp, struct xfs_buf *agbp, - int *alloc) + struct xfs_perag *pag) { struct xfs_agi *agi; struct xfs_alloc_arg args; - xfs_agnumber_t agno; int error; xfs_agino_t newino; /* new first inode's number */ xfs_agino_t newlen; /* new number of inodes */ int isaligned = 0; /* inode allocation at stripe */ /* unit boundary */ /* init. to full chunk */ - uint16_t allocmask = (uint16_t) -1; struct xfs_inobt_rec_incore rec; - struct xfs_perag *pag; struct xfs_ino_geometry *igeo = M_IGEO(tp->t_mountp); + uint16_t allocmask = (uint16_t) -1; int do_sparse = 0; memset(&args, 0, sizeof(args)); @@ -638,9 +634,9 @@ xfs_ialloc_ag_alloc( #ifdef DEBUG /* randomly do sparse inode allocations */ - if (xfs_sb_version_hassparseinodes(&tp->t_mountp->m_sb) && + if (xfs_has_sparseinodes(tp->t_mountp) && igeo->ialloc_min_blks < igeo->ialloc_blks) - do_sparse = prandom_u32() & 1; + do_sparse = prandom_u32_max(2); #endif /* @@ -658,16 +654,15 @@ xfs_ialloc_ag_alloc( * chunk of inodes. If the filesystem is striped, this will fill * an entire stripe unit with inodes. */ - agi = XFS_BUF_TO_AGI(agbp); + agi = agbp->b_addr; newino = be32_to_cpu(agi->agi_newino); - agno = be32_to_cpu(agi->agi_seqno); args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + igeo->ialloc_blks; if (do_sparse) goto sparse_alloc; if (likely(newino != NULLAGINO && (args.agbno < be32_to_cpu(agi->agi_length)))) { - args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno); args.type = XFS_ALLOCTYPE_THIS_BNO; args.prod = 1; @@ -688,7 +683,7 @@ xfs_ialloc_ag_alloc( args.minalignslop = igeo->cluster_align - 1; /* Allow space for the inode btree to split. */ - args.minleft = igeo->inobt_maxlevels - 1; + args.minleft = igeo->inobt_maxlevels; if ((error = xfs_alloc_vextent(&args))) return error; @@ -716,7 +711,7 @@ xfs_ialloc_ag_alloc( */ isaligned = 0; if (igeo->ialloc_align) { - ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN)); + ASSERT(!xfs_has_noalign(args.mp)); args.alignment = args.mp->m_dalign; isaligned = 1; } else @@ -727,7 +722,7 @@ xfs_ialloc_ag_alloc( * For now, just allocate blocks up front. */ args.agbno = be32_to_cpu(agi->agi_root); - args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno); /* * Allocate a fixed-size extent of inodes. */ @@ -736,7 +731,7 @@ xfs_ialloc_ag_alloc( /* * Allow space for the inode btree to split. */ - args.minleft = igeo->inobt_maxlevels - 1; + args.minleft = igeo->inobt_maxlevels; if ((error = xfs_alloc_vextent(&args))) return error; } @@ -748,7 +743,7 @@ xfs_ialloc_ag_alloc( if (isaligned && args.fsbno == NULLFSBLOCK) { args.type = XFS_ALLOCTYPE_NEAR_BNO; args.agbno = be32_to_cpu(agi->agi_root); - args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno); args.alignment = igeo->cluster_align; if ((error = xfs_alloc_vextent(&args))) return error; @@ -758,13 +753,13 @@ xfs_ialloc_ag_alloc( * Finally, try a sparse allocation if the filesystem supports it and * the sparse allocation length is smaller than a full chunk. */ - if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) && + if (xfs_has_sparseinodes(args.mp) && igeo->ialloc_min_blks < igeo->ialloc_blks && args.fsbno == NULLFSBLOCK) { sparse_alloc: args.type = XFS_ALLOCTYPE_NEAR_BNO; args.agbno = be32_to_cpu(agi->agi_root); - args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno); args.alignment = args.mp->m_sb.sb_spino_align; args.prod = 1; @@ -795,10 +790,9 @@ sparse_alloc: allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1; } - if (args.fsbno == NULLFSBLOCK) { - *alloc = 0; - return 0; - } + if (args.fsbno == NULLFSBLOCK) + return -EAGAIN; + ASSERT(args.len == args.minlen); /* @@ -810,8 +804,8 @@ sparse_alloc: * rather than a linear progression to prevent the next generation * number from being easily guessable. */ - error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, agno, - args.agbno, args.len, prandom_u32()); + error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag->pag_agno, + args.agbno, args.len, get_random_u32()); if (error) return error; @@ -837,12 +831,12 @@ sparse_alloc: * if necessary. If a merge does occur, rec is updated to the * merged record. */ - error = xfs_inobt_insert_sprec(args.mp, tp, agbp, XFS_BTNUM_INO, - &rec, true); + error = xfs_inobt_insert_sprec(args.mp, tp, agbp, pag, + XFS_BTNUM_INO, &rec, true); if (error == -EFSCORRUPTED) { xfs_alert(args.mp, "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u", - XFS_AGINO_TO_INO(args.mp, agno, + XFS_AGINO_TO_INO(args.mp, pag->pag_agno, rec.ir_startino), rec.ir_holemask, rec.ir_count); xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE); @@ -861,22 +855,21 @@ sparse_alloc: * from the previous call. Set merge false to replace any * existing record with this one. */ - if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { - error = xfs_inobt_insert_sprec(args.mp, tp, agbp, - XFS_BTNUM_FINO, &rec, - false); + if (xfs_has_finobt(args.mp)) { + error = xfs_inobt_insert_sprec(args.mp, tp, agbp, pag, + XFS_BTNUM_FINO, &rec, false); if (error) return error; } } else { /* full chunk - insert new records to both btrees */ - error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, + error = xfs_inobt_insert(args.mp, tp, agbp, pag, newino, newlen, XFS_BTNUM_INO); if (error) return error; - if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { - error = xfs_inobt_insert(args.mp, tp, agbp, newino, + if (xfs_has_finobt(args.mp)) { + error = xfs_inobt_insert(args.mp, tp, agbp, pag, newino, newlen, XFS_BTNUM_FINO); if (error) return error; @@ -888,10 +881,8 @@ sparse_alloc: */ be32_add_cpu(&agi->agi_count, newlen); be32_add_cpu(&agi->agi_freecount, newlen); - pag = xfs_perag_get(args.mp, agno); pag->pagi_freecount += newlen; pag->pagi_count += newlen; - xfs_perag_put(pag); agi->agi_newino = cpu_to_be32(newino); /* @@ -904,143 +895,9 @@ sparse_alloc: */ xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen); - *alloc = 1; return 0; } -STATIC xfs_agnumber_t -xfs_ialloc_next_ag( - xfs_mount_t *mp) -{ - xfs_agnumber_t agno; - - spin_lock(&mp->m_agirotor_lock); - agno = mp->m_agirotor; - if (++mp->m_agirotor >= mp->m_maxagi) - mp->m_agirotor = 0; - spin_unlock(&mp->m_agirotor_lock); - - return agno; -} - -/* - * Select an allocation group to look for a free inode in, based on the parent - * inode and the mode. Return the allocation group buffer. - */ -STATIC xfs_agnumber_t -xfs_ialloc_ag_select( - xfs_trans_t *tp, /* transaction pointer */ - xfs_ino_t parent, /* parent directory inode number */ - umode_t mode) /* bits set to indicate file type */ -{ - xfs_agnumber_t agcount; /* number of ag's in the filesystem */ - xfs_agnumber_t agno; /* current ag number */ - int flags; /* alloc buffer locking flags */ - xfs_extlen_t ineed; /* blocks needed for inode allocation */ - xfs_extlen_t longest = 0; /* longest extent available */ - xfs_mount_t *mp; /* mount point structure */ - int needspace; /* file mode implies space allocated */ - xfs_perag_t *pag; /* per allocation group data */ - xfs_agnumber_t pagno; /* parent (starting) ag number */ - int error; - - /* - * Files of these types need at least one block if length > 0 - * (and they won't fit in the inode, but that's hard to figure out). - */ - needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode); - mp = tp->t_mountp; - agcount = mp->m_maxagi; - if (S_ISDIR(mode)) - pagno = xfs_ialloc_next_ag(mp); - else { - pagno = XFS_INO_TO_AGNO(mp, parent); - if (pagno >= agcount) - pagno = 0; - } - - ASSERT(pagno < agcount); - - /* - * Loop through allocation groups, looking for one with a little - * free space in it. Note we don't look for free inodes, exactly. - * Instead, we include whether there is a need to allocate inodes - * to mean that blocks must be allocated for them, - * if none are currently free. - */ - agno = pagno; - flags = XFS_ALLOC_FLAG_TRYLOCK; - for (;;) { - pag = xfs_perag_get(mp, agno); - if (!pag->pagi_inodeok) { - xfs_ialloc_next_ag(mp); - goto nextag; - } - - if (!pag->pagi_init) { - error = xfs_ialloc_pagi_init(mp, tp, agno); - if (error) - goto nextag; - } - - if (pag->pagi_freecount) { - xfs_perag_put(pag); - return agno; - } - - if (!pag->pagf_init) { - error = xfs_alloc_pagf_init(mp, tp, agno, flags); - if (error) - goto nextag; - } - - /* - * Check that there is enough free space for the file plus a - * chunk of inodes if we need to allocate some. If this is the - * first pass across the AGs, take into account the potential - * space needed for alignment of inode chunks when checking the - * longest contiguous free space in the AG - this prevents us - * from getting ENOSPC because we have free space larger than - * ialloc_blks but alignment constraints prevent us from using - * it. - * - * If we can't find an AG with space for full alignment slack to - * be taken into account, we must be near ENOSPC in all AGs. - * Hence we don't include alignment for the second pass and so - * if we fail allocation due to alignment issues then it is most - * likely a real ENOSPC condition. - */ - ineed = M_IGEO(mp)->ialloc_min_blks; - if (flags && ineed > 1) - ineed += M_IGEO(mp)->cluster_align; - longest = pag->pagf_longest; - if (!longest) - longest = pag->pagf_flcount > 0; - - if (pag->pagf_freeblks >= needspace + ineed && - longest >= ineed) { - xfs_perag_put(pag); - return agno; - } -nextag: - xfs_perag_put(pag); - /* - * No point in iterating over the rest, if we're shutting - * down. - */ - if (XFS_FORCED_SHUTDOWN(mp)) - return NULLAGNUMBER; - agno++; - if (agno >= agcount) - agno = 0; - if (agno == pagno) { - if (flags == 0) - return NULLAGNUMBER; - flags = 0; - } - } -} - /* * Try to retrieve the next record to the left/right from the current one. */ @@ -1126,15 +983,14 @@ STATIC int xfs_dialloc_ag_inobt( struct xfs_trans *tp, struct xfs_buf *agbp, + struct xfs_perag *pag, xfs_ino_t parent, xfs_ino_t *inop) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); - xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); - struct xfs_perag *pag; struct xfs_btree_cur *cur, *tcur; struct xfs_inobt_rec_incore rec, trec; xfs_ino_t ino; @@ -1143,14 +999,12 @@ xfs_dialloc_ag_inobt( int i, j; int searchdistance = 10; - pag = xfs_perag_get(mp, agno); - ASSERT(pag->pagi_init); ASSERT(pag->pagi_inodeok); ASSERT(pag->pagi_freecount > 0); restart_pagno: - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO); /* * If pagino is 0 (this is the root inode allocation) use newino. * This must work because we've just allocated some. @@ -1158,14 +1012,14 @@ xfs_dialloc_ag_inobt( if (!pagino) pagino = be32_to_cpu(agi->agi_newino); - error = xfs_check_agi_freecount(cur, agi); + error = xfs_check_agi_freecount(cur); if (error) goto error0; /* * If in the same AG as the parent, try to get near the parent. */ - if (pagno == agno) { + if (pagno == pag->pag_agno) { int doneleft; /* done, to the left */ int doneright; /* done, to the right */ @@ -1368,7 +1222,7 @@ alloc_inode: ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); - ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); + ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset); rec.ir_free &= ~XFS_INOBT_MASK(offset); rec.ir_freecount--; error = xfs_inobt_update(cur, &rec); @@ -1378,20 +1232,18 @@ alloc_inode: xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); pag->pagi_freecount--; - error = xfs_check_agi_freecount(cur, agi); + error = xfs_check_agi_freecount(cur); if (error) goto error0; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); - xfs_perag_put(pag); *inop = ino; return 0; error1: xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); error0: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - xfs_perag_put(pag); return error; } @@ -1575,19 +1427,18 @@ xfs_dialloc_ag_update_inobt( * The caller selected an AG for us, and made sure that free inodes are * available. */ -STATIC int +static int xfs_dialloc_ag( struct xfs_trans *tp, struct xfs_buf *agbp, + struct xfs_perag *pag, xfs_ino_t parent, xfs_ino_t *inop) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); - xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); - struct xfs_perag *pag; struct xfs_btree_cur *cur; /* finobt cursor */ struct xfs_btree_cur *icur; /* inobt cursor */ struct xfs_inobt_rec_incore rec; @@ -1596,10 +1447,8 @@ xfs_dialloc_ag( int offset; int i; - if (!xfs_sb_version_hasfinobt(&mp->m_sb)) - return xfs_dialloc_ag_inobt(tp, agbp, parent, inop); - - pag = xfs_perag_get(mp, agno); + if (!xfs_has_finobt(mp)) + return xfs_dialloc_ag_inobt(tp, agbp, pag, parent, inop); /* * If pagino is 0 (this is the root inode allocation) use newino. @@ -1608,9 +1457,9 @@ xfs_dialloc_ag( if (!pagino) pagino = be32_to_cpu(agi->agi_newino); - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO); + cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_FINO); - error = xfs_check_agi_freecount(cur, agi); + error = xfs_check_agi_freecount(cur); if (error) goto error_cur; @@ -1619,7 +1468,7 @@ xfs_dialloc_ag( * parent. If so, find the closest available inode to the parent. If * not, consider the agi hint or find the first free inode in the AG. */ - if (agno == pagno) + if (pag->pag_agno == pagno) error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec); else error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec); @@ -1631,7 +1480,7 @@ xfs_dialloc_ag( ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); - ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); + ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset); /* * Modify or remove the finobt record. @@ -1651,9 +1500,9 @@ xfs_dialloc_ag( * the original freecount. If all is well, make the equivalent update to * the inobt using the finobt record and offset information. */ - icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); + icur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO); - error = xfs_check_agi_freecount(icur, agi); + error = xfs_check_agi_freecount(icur); if (error) goto error_icur; @@ -1671,16 +1520,15 @@ xfs_dialloc_ag( xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); - error = xfs_check_agi_freecount(icur, agi); + error = xfs_check_agi_freecount(icur); if (error) goto error_icur; - error = xfs_check_agi_freecount(cur, agi); + error = xfs_check_agi_freecount(cur); if (error) goto error_icur; xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR); xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); - xfs_perag_put(pag); *inop = ino; return 0; @@ -1688,73 +1536,226 @@ error_icur: xfs_btree_del_cursor(icur, XFS_BTREE_ERROR); error_cur: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - xfs_perag_put(pag); + return error; +} + +static int +xfs_dialloc_roll( + struct xfs_trans **tpp, + struct xfs_buf *agibp) +{ + struct xfs_trans *tp = *tpp; + struct xfs_dquot_acct *dqinfo; + int error; + + /* + * Hold to on to the agibp across the commit so no other allocation can + * come in and take the free inodes we just allocated for our caller. + */ + xfs_trans_bhold(tp, agibp); + + /* + * We want the quota changes to be associated with the next transaction, + * NOT this one. So, detach the dqinfo from this and attach it to the + * next transaction. + */ + dqinfo = tp->t_dqinfo; + tp->t_dqinfo = NULL; + + error = xfs_trans_roll(&tp); + + /* Re-attach the quota info that we detached from prev trx. */ + tp->t_dqinfo = dqinfo; + + /* + * Join the buffer even on commit error so that the buffer is released + * when the caller cancels the transaction and doesn't have to handle + * this error case specially. + */ + xfs_trans_bjoin(tp, agibp); + *tpp = tp; + return error; +} + +static xfs_agnumber_t +xfs_ialloc_next_ag( + xfs_mount_t *mp) +{ + xfs_agnumber_t agno; + + spin_lock(&mp->m_agirotor_lock); + agno = mp->m_agirotor; + if (++mp->m_agirotor >= mp->m_maxagi) + mp->m_agirotor = 0; + spin_unlock(&mp->m_agirotor_lock); + + return agno; +} + +static bool +xfs_dialloc_good_ag( + struct xfs_trans *tp, + struct xfs_perag *pag, + umode_t mode, + int flags, + bool ok_alloc) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_extlen_t ineed; + xfs_extlen_t longest = 0; + int needspace; + int error; + + if (!pag->pagi_inodeok) + return false; + + if (!pag->pagi_init) { + error = xfs_ialloc_read_agi(pag, tp, NULL); + if (error) + return false; + } + + if (pag->pagi_freecount) + return true; + if (!ok_alloc) + return false; + + if (!pag->pagf_init) { + error = xfs_alloc_read_agf(pag, tp, flags, NULL); + if (error) + return false; + } + + /* + * Check that there is enough free space for the file plus a chunk of + * inodes if we need to allocate some. If this is the first pass across + * the AGs, take into account the potential space needed for alignment + * of inode chunks when checking the longest contiguous free space in + * the AG - this prevents us from getting ENOSPC because we have free + * space larger than ialloc_blks but alignment constraints prevent us + * from using it. + * + * If we can't find an AG with space for full alignment slack to be + * taken into account, we must be near ENOSPC in all AGs. Hence we + * don't include alignment for the second pass and so if we fail + * allocation due to alignment issues then it is most likely a real + * ENOSPC condition. + * + * XXX(dgc): this calculation is now bogus thanks to the per-ag + * reservations that xfs_alloc_fix_freelist() now does via + * xfs_alloc_space_available(). When the AG fills up, pagf_freeblks will + * be more than large enough for the check below to succeed, but + * xfs_alloc_space_available() will fail because of the non-zero + * metadata reservation and hence we won't actually be able to allocate + * more inodes in this AG. We do soooo much unnecessary work near ENOSPC + * because of this. + */ + ineed = M_IGEO(mp)->ialloc_min_blks; + if (flags && ineed > 1) + ineed += M_IGEO(mp)->cluster_align; + longest = pag->pagf_longest; + if (!longest) + longest = pag->pagf_flcount > 0; + needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode); + + if (pag->pagf_freeblks < needspace + ineed || longest < ineed) + return false; + return true; +} + +static int +xfs_dialloc_try_ag( + struct xfs_trans **tpp, + struct xfs_perag *pag, + xfs_ino_t parent, + xfs_ino_t *new_ino, + bool ok_alloc) +{ + struct xfs_buf *agbp; + xfs_ino_t ino; + int error; + + /* + * Then read in the AGI buffer and recheck with the AGI buffer + * lock held. + */ + error = xfs_ialloc_read_agi(pag, *tpp, &agbp); + if (error) + return error; + + if (!pag->pagi_freecount) { + if (!ok_alloc) { + error = -EAGAIN; + goto out_release; + } + + error = xfs_ialloc_ag_alloc(*tpp, agbp, pag); + if (error < 0) + goto out_release; + + /* + * We successfully allocated space for an inode cluster in this + * AG. Roll the transaction so that we can allocate one of the + * new inodes. + */ + ASSERT(pag->pagi_freecount > 0); + error = xfs_dialloc_roll(tpp, agbp); + if (error) + goto out_release; + } + + /* Allocate an inode in the found AG */ + error = xfs_dialloc_ag(*tpp, agbp, pag, parent, &ino); + if (!error) + *new_ino = ino; + return error; + +out_release: + xfs_trans_brelse(*tpp, agbp); return error; } /* - * Allocate an inode on disk. - * - * Mode is used to tell whether the new inode will need space, and whether it - * is a directory. + * Allocate an on-disk inode. * - * This function is designed to be called twice if it has to do an allocation - * to make more free inodes. On the first call, *IO_agbp should be set to NULL. - * If an inode is available without having to performn an allocation, an inode - * number is returned. In this case, *IO_agbp is set to NULL. If an allocation - * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp. - * The caller should then commit the current transaction, allocate a - * new transaction, and call xfs_dialloc() again, passing in the previous value - * of *IO_agbp. IO_agbp should be held across the transactions. Since the AGI - * buffer is locked across the two calls, the second call is guaranteed to have - * a free inode available. - * - * Once we successfully pick an inode its number is returned and the on-disk - * data structures are updated. The inode itself is not read in, since doing so - * would break ordering constraints with xfs_reclaim. + * Mode is used to tell whether the new inode is a directory and hence where to + * locate it. The on-disk inode that is allocated will be returned in @new_ino + * on success, otherwise an error will be set to indicate the failure (e.g. + * -ENOSPC). */ int xfs_dialloc( - struct xfs_trans *tp, + struct xfs_trans **tpp, xfs_ino_t parent, umode_t mode, - struct xfs_buf **IO_agbp, - xfs_ino_t *inop) + xfs_ino_t *new_ino) { - struct xfs_mount *mp = tp->t_mountp; - struct xfs_buf *agbp; + struct xfs_mount *mp = (*tpp)->t_mountp; xfs_agnumber_t agno; - int error; - int ialloced; - int noroom = 0; + int error = 0; xfs_agnumber_t start_agno; struct xfs_perag *pag; struct xfs_ino_geometry *igeo = M_IGEO(mp); - int okalloc = 1; - - if (*IO_agbp) { - /* - * If the caller passes in a pointer to the AGI buffer, - * continue where we left off before. In this case, we - * know that the allocation group has free inodes. - */ - agbp = *IO_agbp; - goto out_alloc; - } + bool ok_alloc = true; + int flags; + xfs_ino_t ino; /* - * We do not have an agbp, so select an initial allocation - * group for inode allocation. + * Directories, symlinks, and regular files frequently allocate at least + * one block, so factor that potential expansion when we examine whether + * an AG has enough space for file creation. */ - start_agno = xfs_ialloc_ag_select(tp, parent, mode); - if (start_agno == NULLAGNUMBER) { - *inop = NULLFSINO; - return 0; + if (S_ISDIR(mode)) + start_agno = xfs_ialloc_next_ag(mp); + else { + start_agno = XFS_INO_TO_AGNO(mp, parent); + if (start_agno >= mp->m_maxagi) + start_agno = 0; } /* * If we have already hit the ceiling of inode blocks then clear - * okalloc so we scan all available agi structures for a free + * ok_alloc so we scan all available agi structures for a free * inode. * * Read rough value of mp->m_icount by percpu_counter_read_positive, @@ -1763,8 +1764,7 @@ xfs_dialloc( if (igeo->maxicount && percpu_counter_read_positive(&mp->m_icount) + igeo->ialloc_inos > igeo->maxicount) { - noroom = 1; - okalloc = 0; + ok_alloc = false; } /* @@ -1773,85 +1773,34 @@ xfs_dialloc( * allocation groups upward, wrapping at the end. */ agno = start_agno; + flags = XFS_ALLOC_FLAG_TRYLOCK; for (;;) { pag = xfs_perag_get(mp, agno); - if (!pag->pagi_inodeok) { - xfs_ialloc_next_ag(mp); - goto nextag; + if (xfs_dialloc_good_ag(*tpp, pag, mode, flags, ok_alloc)) { + error = xfs_dialloc_try_ag(tpp, pag, parent, + &ino, ok_alloc); + if (error != -EAGAIN) + break; } - if (!pag->pagi_init) { - error = xfs_ialloc_pagi_init(mp, tp, agno); - if (error) - goto out_error; - } - - /* - * Do a first racy fast path check if this AG is usable. - */ - if (!pag->pagi_freecount && !okalloc) - goto nextag; - - /* - * Then read in the AGI buffer and recheck with the AGI buffer - * lock held. - */ - error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); - if (error) - goto out_error; - - if (pag->pagi_freecount) { - xfs_perag_put(pag); - goto out_alloc; - } - - if (!okalloc) - goto nextag_relse_buffer; - - - error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced); - if (error) { - xfs_trans_brelse(tp, agbp); - - if (error != -ENOSPC) - goto out_error; - - xfs_perag_put(pag); - *inop = NULLFSINO; - return 0; - } - - if (ialloced) { - /* - * We successfully allocated some inodes, return - * the current context to the caller so that it - * can commit the current transaction and call - * us again where we left off. - */ - ASSERT(pag->pagi_freecount > 0); - xfs_perag_put(pag); - - *IO_agbp = agbp; - *inop = NULLFSINO; - return 0; + if (xfs_is_shutdown(mp)) { + error = -EFSCORRUPTED; + break; } - -nextag_relse_buffer: - xfs_trans_brelse(tp, agbp); -nextag: - xfs_perag_put(pag); - if (++agno == mp->m_sb.sb_agcount) + if (++agno == mp->m_maxagi) agno = 0; if (agno == start_agno) { - *inop = NULLFSINO; - return noroom ? -ENOSPC : 0; + if (!flags) { + error = -ENOSPC; + break; + } + flags = 0; } + xfs_perag_put(pag); } -out_alloc: - *IO_agbp = NULL; - return xfs_dialloc_ag(tp, agbp, parent, inop); -out_error: + if (!error) + *new_ino = ino; xfs_perag_put(pag); return error; } @@ -1878,7 +1827,7 @@ xfs_difree_inode_chunk( if (!xfs_inobt_issparse(rec->ir_holemask)) { /* not sparse, calculate extent info directly */ - xfs_bmap_add_free(tp, XFS_AGB_TO_FSB(mp, agno, sagbno), + xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno), M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES); return; @@ -1923,7 +1872,7 @@ xfs_difree_inode_chunk( ASSERT(agbno % mp->m_sb.sb_spino_align == 0); ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); - xfs_bmap_add_free(tp, XFS_AGB_TO_FSB(mp, agno, agbno), + xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, &XFS_RMAP_OINFO_INODES); /* reset range to current bit and carry on... */ @@ -1939,13 +1888,12 @@ xfs_difree_inobt( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, + struct xfs_perag *pag, xfs_agino_t agino, struct xfs_icluster *xic, struct xfs_inobt_rec_incore *orec) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); - xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); - struct xfs_perag *pag; + struct xfs_agi *agi = agbp->b_addr; struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; int ilen; @@ -1959,9 +1907,9 @@ xfs_difree_inobt( /* * Initialize the cursor. */ - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO); - error = xfs_check_agi_freecount(cur, agi); + error = xfs_check_agi_freecount(cur); if (error) goto error0; @@ -2004,11 +1952,13 @@ xfs_difree_inobt( * remove the chunk if the block size is large enough for multiple inode * chunks (that might not be free). */ - if (!(mp->m_flags & XFS_MOUNT_IKEEP) && - rec.ir_free == XFS_INOBT_ALL_FREE && + if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { + struct xfs_perag *pag = agbp->b_pag; + xic->deleted = true; - xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); + xic->first_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, + rec.ir_startino); xic->alloc = xfs_inobt_irec_to_allocmask(&rec); /* @@ -2020,10 +1970,8 @@ xfs_difree_inobt( be32_add_cpu(&agi->agi_count, -ilen); be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); - pag = xfs_perag_get(mp, agno); pag->pagi_freecount -= ilen - 1; pag->pagi_count -= ilen; - xfs_perag_put(pag); xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); @@ -2033,7 +1981,7 @@ xfs_difree_inobt( goto error0; } - xfs_difree_inode_chunk(tp, agno, &rec); + xfs_difree_inode_chunk(tp, pag->pag_agno, &rec); } else { xic->deleted = false; @@ -2044,18 +1992,16 @@ xfs_difree_inobt( goto error0; } - /* + /* * Change the inode free counts and log the ag/sb changes. */ be32_add_cpu(&agi->agi_freecount, 1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); - pag = xfs_perag_get(mp, agno); pag->pagi_freecount++; - xfs_perag_put(pag); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); } - error = xfs_check_agi_freecount(cur, agi); + error = xfs_check_agi_freecount(cur); if (error) goto error0; @@ -2076,18 +2022,17 @@ xfs_difree_finobt( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, + struct xfs_perag *pag, xfs_agino_t agino, struct xfs_inobt_rec_incore *ibtrec) /* inobt record */ { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); - xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; int offset = agino - ibtrec->ir_startino; int error; int i; - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO); + cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_FINO); error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i); if (error) @@ -2151,9 +2096,8 @@ xfs_difree_finobt( * enough for multiple chunks. Leave the finobt record to remain in sync * with the inobt. */ - if (rec.ir_free == XFS_INOBT_ALL_FREE && - mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK && - !(mp->m_flags & XFS_MOUNT_IKEEP)) { + if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && + mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { error = xfs_btree_delete(cur, &i); if (error) goto error; @@ -2165,7 +2109,7 @@ xfs_difree_finobt( } out: - error = xfs_check_agi_freecount(cur, agi); + error = xfs_check_agi_freecount(cur); if (error) goto error; @@ -2185,36 +2129,33 @@ error: */ int xfs_difree( - struct xfs_trans *tp, /* transaction pointer */ - xfs_ino_t inode, /* inode to be freed */ - struct xfs_icluster *xic) /* cluster info if deleted */ + struct xfs_trans *tp, + struct xfs_perag *pag, + xfs_ino_t inode, + struct xfs_icluster *xic) { /* REFERENCED */ xfs_agblock_t agbno; /* block number containing inode */ struct xfs_buf *agbp; /* buffer for allocation group header */ xfs_agino_t agino; /* allocation group inode number */ - xfs_agnumber_t agno; /* allocation group number */ int error; /* error return value */ - struct xfs_mount *mp; /* mount structure for filesystem */ + struct xfs_mount *mp = tp->t_mountp; struct xfs_inobt_rec_incore rec;/* btree record */ - mp = tp->t_mountp; - /* * Break up inode number into its components. */ - agno = XFS_INO_TO_AGNO(mp, inode); - if (agno >= mp->m_sb.sb_agcount) { - xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).", - __func__, agno, mp->m_sb.sb_agcount); + if (pag->pag_agno != XFS_INO_TO_AGNO(mp, inode)) { + xfs_warn(mp, "%s: agno != pag->pag_agno (%d != %d).", + __func__, XFS_INO_TO_AGNO(mp, inode), pag->pag_agno); ASSERT(0); return -EINVAL; } agino = XFS_INO_TO_AGINO(mp, inode); - if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { + if (inode != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).", __func__, (unsigned long long)inode, - (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino)); + (unsigned long long)XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)); ASSERT(0); return -EINVAL; } @@ -2228,7 +2169,7 @@ xfs_difree( /* * Get the allocation group header. */ - error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + error = xfs_ialloc_read_agi(pag, tp, &agbp); if (error) { xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", __func__, error); @@ -2238,15 +2179,15 @@ xfs_difree( /* * Fix up the inode allocation btree. */ - error = xfs_difree_inobt(mp, tp, agbp, agino, xic, &rec); + error = xfs_difree_inobt(mp, tp, agbp, pag, agino, xic, &rec); if (error) goto error0; /* * Fix up the free inode btree. */ - if (xfs_sb_version_hasfinobt(&mp->m_sb)) { - error = xfs_difree_finobt(mp, tp, agbp, agino, &rec); + if (xfs_has_finobt(mp)) { + error = xfs_difree_finobt(mp, tp, agbp, pag, agino, &rec); if (error) goto error0; } @@ -2261,7 +2202,7 @@ STATIC int xfs_imap_lookup( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_agino_t agino, xfs_agblock_t agbno, xfs_agblock_t *chunk_agbno, @@ -2274,11 +2215,11 @@ xfs_imap_lookup( int error; int i; - error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + error = xfs_ialloc_read_agi(pag, tp, &agbp); if (error) { xfs_alert(mp, "%s: xfs_ialloc_read_agi() returned error %d, agno %d", - __func__, error, agno); + __func__, error, pag->pag_agno); return error; } @@ -2288,7 +2229,7 @@ xfs_imap_lookup( * we have a record, we need to ensure it contains the inode number * we are looking up. */ - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); if (!error) { if (i) @@ -2322,42 +2263,44 @@ xfs_imap_lookup( */ int xfs_imap( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_ino_t ino, /* inode to locate */ - struct xfs_imap *imap, /* location map structure */ - uint flags) /* flags for inode btree lookup */ + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_ino_t ino, /* inode to locate */ + struct xfs_imap *imap, /* location map structure */ + uint flags) /* flags for inode btree lookup */ { - xfs_agblock_t agbno; /* block number of inode in the alloc group */ - xfs_agino_t agino; /* inode number within alloc group */ - xfs_agnumber_t agno; /* allocation group number */ - xfs_agblock_t chunk_agbno; /* first block in inode chunk */ - xfs_agblock_t cluster_agbno; /* first block in inode cluster */ - int error; /* error code */ - int offset; /* index of inode in its buffer */ - xfs_agblock_t offset_agbno; /* blks from chunk start to inode */ + xfs_agblock_t agbno; /* block number of inode in the alloc group */ + xfs_agino_t agino; /* inode number within alloc group */ + xfs_agblock_t chunk_agbno; /* first block in inode chunk */ + xfs_agblock_t cluster_agbno; /* first block in inode cluster */ + int error; /* error code */ + int offset; /* index of inode in its buffer */ + xfs_agblock_t offset_agbno; /* blks from chunk start to inode */ + struct xfs_perag *pag; ASSERT(ino != NULLFSINO); /* * Split up the inode number into its parts. */ - agno = XFS_INO_TO_AGNO(mp, ino); + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); agino = XFS_INO_TO_AGINO(mp, ino); agbno = XFS_AGINO_TO_AGBNO(mp, agino); - if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks || - ino != XFS_AGINO_TO_INO(mp, agno, agino)) { + if (!pag || agbno >= mp->m_sb.sb_agblocks || + ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { + error = -EINVAL; #ifdef DEBUG /* * Don't output diagnostic information for untrusted inodes * as they can be invalid without implying corruption. */ if (flags & XFS_IGET_UNTRUSTED) - return -EINVAL; - if (agno >= mp->m_sb.sb_agcount) { + goto out_drop; + if (!pag) { xfs_alert(mp, "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)", - __func__, agno, mp->m_sb.sb_agcount); + __func__, XFS_INO_TO_AGNO(mp, ino), + mp->m_sb.sb_agcount); } if (agbno >= mp->m_sb.sb_agblocks) { xfs_alert(mp, @@ -2365,15 +2308,15 @@ xfs_imap( __func__, (unsigned long long)agbno, (unsigned long)mp->m_sb.sb_agblocks); } - if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) { + if (pag && ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { xfs_alert(mp, "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)", __func__, ino, - XFS_AGINO_TO_INO(mp, agno, agino)); + XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)); } xfs_stack_trace(); #endif /* DEBUG */ - return -EINVAL; + goto out_drop; } /* @@ -2384,10 +2327,10 @@ xfs_imap( * in all cases where an untrusted inode number is passed. */ if (flags & XFS_IGET_UNTRUSTED) { - error = xfs_imap_lookup(mp, tp, agno, agino, agbno, + error = xfs_imap_lookup(mp, tp, pag, agino, agbno, &chunk_agbno, &offset_agbno, flags); if (error) - return error; + goto out_drop; goto out_map; } @@ -2399,11 +2342,12 @@ xfs_imap( offset = XFS_INO_TO_OFFSET(mp, ino); ASSERT(offset < mp->m_sb.sb_inopblock); - imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno); + imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, agbno); imap->im_len = XFS_FSB_TO_BB(mp, 1); imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); - return 0; + error = 0; + goto out_drop; } /* @@ -2415,10 +2359,10 @@ xfs_imap( offset_agbno = agbno & M_IGEO(mp)->inoalign_mask; chunk_agbno = agbno - offset_agbno; } else { - error = xfs_imap_lookup(mp, tp, agno, agino, agbno, + error = xfs_imap_lookup(mp, tp, pag, agino, agbno, &chunk_agbno, &offset_agbno, flags); if (error) - return error; + goto out_drop; } out_map: @@ -2429,7 +2373,7 @@ out_map: offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + XFS_INO_TO_OFFSET(mp, ino); - imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno); + imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, cluster_agbno); imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster); imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); @@ -2446,9 +2390,14 @@ out_map: __func__, (unsigned long long) imap->im_blkno, (unsigned long long) imap->im_len, XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); - return -EINVAL; + error = -EINVAL; + goto out_drop; } - return 0; + error = 0; +out_drop: + if (pag) + xfs_perag_put(pag); + return error; } /* @@ -2465,9 +2414,9 @@ out_map: */ void xfs_ialloc_log_agi( - xfs_trans_t *tp, /* transaction pointer */ - xfs_buf_t *bp, /* allocation group header buffer */ - int fields) /* bitmask of fields to log */ + struct xfs_trans *tp, + struct xfs_buf *bp, + uint32_t fields) { int first; /* first byte number */ int last; /* last byte number */ @@ -2486,12 +2435,12 @@ xfs_ialloc_log_agi( offsetof(xfs_agi_t, agi_unlinked), offsetof(xfs_agi_t, agi_free_root), offsetof(xfs_agi_t, agi_free_level), + offsetof(xfs_agi_t, agi_iblocks), sizeof(xfs_agi_t) }; #ifdef DEBUG - xfs_agi_t *agi; /* allocation group header */ + struct xfs_agi *agi = bp->b_addr; - agi = XFS_BUF_TO_AGI(bp); ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); #endif @@ -2523,14 +2472,13 @@ xfs_agi_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; - struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); + struct xfs_agi *agi = bp->b_addr; int i; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (!xfs_log_check_lsn(mp, - be64_to_cpu(XFS_BUF_TO_AGI(bp)->agi_lsn))) + if (!xfs_log_check_lsn(mp, be64_to_cpu(agi->agi_lsn))) return __this_address; } @@ -2543,12 +2491,12 @@ xfs_agi_verify( return __this_address; if (be32_to_cpu(agi->agi_level) < 1 || - be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS) + be32_to_cpu(agi->agi_level) > M_IGEO(mp)->inobt_maxlevels) return __this_address; - if (xfs_sb_version_hasfinobt(&mp->m_sb) && + if (xfs_has_finobt(mp) && (be32_to_cpu(agi->agi_free_level) < 1 || - be32_to_cpu(agi->agi_free_level) > XFS_BTREE_MAXLEVELS)) + be32_to_cpu(agi->agi_free_level) > M_IGEO(mp)->inobt_maxlevels)) return __this_address; /* @@ -2577,7 +2525,7 @@ xfs_agi_read_verify( struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; - if (xfs_sb_version_hascrc(&mp->m_sb) && + if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { @@ -2593,6 +2541,7 @@ xfs_agi_write_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; + struct xfs_agi *agi = bp->b_addr; xfs_failaddr_t fa; fa = xfs_agi_verify(bp); @@ -2601,11 +2550,11 @@ xfs_agi_write_verify( return; } - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (bip) - XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn); + agi->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn); xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF); } @@ -2622,47 +2571,48 @@ const struct xfs_buf_ops xfs_agi_buf_ops = { */ int xfs_read_agi( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - struct xfs_buf **bpp) /* allocation group hdr buf */ + struct xfs_perag *pag, + struct xfs_trans *tp, + struct xfs_buf **agibpp) { + struct xfs_mount *mp = pag->pag_mount; int error; - trace_xfs_read_agi(mp, agno); + trace_xfs_read_agi(pag->pag_mount, pag->pag_agno); - ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops); + XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, agibpp, &xfs_agi_buf_ops); if (error) return error; if (tp) - xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_AGI_BUF); + xfs_trans_buf_set_type(tp, *agibpp, XFS_BLFT_AGI_BUF); - xfs_buf_set_ref(*bpp, XFS_AGI_REF); + xfs_buf_set_ref(*agibpp, XFS_AGI_REF); return 0; } +/* + * Read in the agi and initialise the per-ag data. If the caller supplies a + * @agibpp, return the locked AGI buffer to them, otherwise release it. + */ int xfs_ialloc_read_agi( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - struct xfs_buf **bpp) /* allocation group hdr buf */ + struct xfs_perag *pag, + struct xfs_trans *tp, + struct xfs_buf **agibpp) { - struct xfs_agi *agi; /* allocation group header */ - struct xfs_perag *pag; /* per allocation group data */ + struct xfs_buf *agibp; + struct xfs_agi *agi; int error; - trace_xfs_ialloc_read_agi(mp, agno); + trace_xfs_ialloc_read_agi(pag->pag_mount, pag->pag_agno); - error = xfs_read_agi(mp, tp, agno, bpp); + error = xfs_read_agi(pag, tp, &agibp); if (error) return error; - agi = XFS_BUF_TO_AGI(*bpp); - pag = xfs_perag_get(mp, agno); + agi = agibp->b_addr; if (!pag->pagi_init) { pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); pag->pagi_count = be32_to_cpu(agi->agi_count); @@ -2674,28 +2624,11 @@ xfs_ialloc_read_agi( * we are in the middle of a forced shutdown. */ ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || - XFS_FORCED_SHUTDOWN(mp)); - xfs_perag_put(pag); - return 0; -} - -/* - * Read in the agi to initialise the per-ag data in the mount structure - */ -int -xfs_ialloc_pagi_init( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_agnumber_t agno) /* allocation group number */ -{ - xfs_buf_t *bp = NULL; - int error; - - error = xfs_ialloc_read_agi(mp, tp, agno, &bp); - if (error) - return error; - if (bp) - xfs_trans_brelse(tp, bp); + xfs_is_shutdown(pag->pag_mount)); + if (agibpp) + *agibpp = agibp; + else + xfs_trans_brelse(tp, agibp); return 0; } @@ -2765,7 +2698,7 @@ struct xfs_ialloc_count_inodes { STATIC int xfs_ialloc_count_inodes_rec( struct xfs_btree_cur *cur, - union xfs_btree_rec *rec, + const union xfs_btree_rec *rec, void *priv) { struct xfs_inobt_rec_incore irec; @@ -2821,6 +2754,12 @@ xfs_ialloc_setup_geometry( uint64_t icount; uint inodes; + igeo->new_diflags2 = 0; + if (xfs_has_bigtime(mp)) + igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME; + if (xfs_has_large_extent_counts(mp)) + igeo->new_diflags2 |= XFS_DIFLAG2_NREXT64; + /* Compute inode btree geometry. */ igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog; igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1); @@ -2841,6 +2780,7 @@ xfs_ialloc_setup_geometry( inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG; igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr, inodes); + ASSERT(igeo->inobt_maxlevels <= xfs_iallocbt_maxlevels_ondisk()); /* * Set the maximum inode count for this filesystem, being careful not @@ -2873,7 +2813,7 @@ xfs_ialloc_setup_geometry( * cannot change the behavior. */ igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_v3inodes(mp)) { int new_size = igeo->inode_cluster_size_raw; new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; @@ -2891,7 +2831,7 @@ xfs_ialloc_setup_geometry( igeo->inodes_per_cluster = XFS_FSB_TO_INO(mp, igeo->blocks_per_cluster); /* Calculate inode cluster alignment. */ - if (xfs_sb_version_hasalign(&mp->m_sb) && + if (xfs_has_align(mp) && mp->m_sb.sb_inoalignmt >= igeo->blocks_per_cluster) igeo->cluster_align = mp->m_sb.sb_inoalignmt; else @@ -2939,15 +2879,15 @@ xfs_ialloc_calc_rootino( first_bno += xfs_alloc_min_freelist(mp, NULL); /* ...the free inode btree root... */ - if (xfs_sb_version_hasfinobt(&mp->m_sb)) + if (xfs_has_finobt(mp)) first_bno++; /* ...the reverse mapping btree root... */ - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (xfs_has_rmapbt(mp)) first_bno++; /* ...the reference count btree... */ - if (xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_has_reflink(mp)) first_bno++; /* @@ -2957,19 +2897,73 @@ xfs_ialloc_calc_rootino( * allocation group, or very odd geometries created by old mkfs * versions on very small filesystems. */ - if (mp->m_sb.sb_logstart && - XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == 0) + if (xfs_ag_contains_log(mp, 0)) first_bno += mp->m_sb.sb_logblocks; /* * Now round first_bno up to whatever allocation alignment is given * by the filesystem or was passed in. */ - if (xfs_sb_version_hasdalign(&mp->m_sb) && igeo->ialloc_align > 0) + if (xfs_has_dalign(mp) && igeo->ialloc_align > 0) first_bno = roundup(first_bno, sunit); - else if (xfs_sb_version_hasalign(&mp->m_sb) && + else if (xfs_has_align(mp) && mp->m_sb.sb_inoalignmt > 1) first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt); return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno)); } + +/* + * Ensure there are not sparse inode clusters that cross the new EOAG. + * + * This is a no-op for non-spinode filesystems since clusters are always fully + * allocated and checking the bnobt suffices. However, a spinode filesystem + * could have a record where the upper inodes are free blocks. If those blocks + * were removed from the filesystem, the inode record would extend beyond EOAG, + * which will be flagged as corruption. + */ +int +xfs_ialloc_check_shrink( + struct xfs_trans *tp, + xfs_agnumber_t agno, + struct xfs_buf *agibp, + xfs_agblock_t new_length) +{ + struct xfs_inobt_rec_incore rec; + struct xfs_btree_cur *cur; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_perag *pag; + xfs_agino_t agino = XFS_AGB_TO_AGINO(mp, new_length); + int has; + int error; + + if (!xfs_has_sparseinodes(mp)) + return 0; + + pag = xfs_perag_get(mp, agno); + cur = xfs_inobt_init_cursor(mp, tp, agibp, pag, XFS_BTNUM_INO); + + /* Look up the inobt record that would correspond to the new EOFS. */ + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has); + if (error || !has) + goto out; + + error = xfs_inobt_get_rec(cur, &rec, &has); + if (error) + goto out; + + if (!has) { + error = -EFSCORRUPTED; + goto out; + } + + /* If the record covers inodes that would be beyond EOFS, bail out. */ + if (rec.ir_startino + XFS_INODES_PER_CHUNK > agino) { + error = -ENOSPC; + goto out; + } +out: + xfs_btree_del_cursor(cur, error); + xfs_perag_put(pag); + return error; +} diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 72b3468b97b1..9bbbca6ac4ed 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -33,46 +33,14 @@ xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) } /* - * Allocate an inode on disk. - * Mode is used to tell whether the new inode will need space, and whether - * it is a directory. - * - * To work within the constraint of one allocation per transaction, - * xfs_dialloc() is designed to be called twice if it has to do an - * allocation to make more free inodes. If an inode is - * available without an allocation, agbp would be set to the current - * agbp and alloc_done set to false. - * If an allocation needed to be done, agbp would be set to the - * inode header of the allocation group and alloc_done set to true. - * The caller should then commit the current transaction and allocate a new - * transaction. xfs_dialloc() should then be called again with - * the agbp value returned from the previous call. - * - * Once we successfully pick an inode its number is returned and the - * on-disk data structures are updated. The inode itself is not read - * in, since doing so would break ordering constraints with xfs_reclaim. - * - * *agbp should be set to NULL on the first call, *alloc_done set to FALSE. + * Allocate an inode on disk. Mode is used to tell whether the new inode will + * need space, and whether it is a directory. */ -int /* error */ -xfs_dialloc( - struct xfs_trans *tp, /* transaction pointer */ - xfs_ino_t parent, /* parent inode (directory) */ - umode_t mode, /* mode bits for new inode */ - struct xfs_buf **agbp, /* buf for a.g. inode header */ - xfs_ino_t *inop); /* inode number allocated */ +int xfs_dialloc(struct xfs_trans **tpp, xfs_ino_t parent, umode_t mode, + xfs_ino_t *new_ino); -/* - * Free disk inode. Carefully avoids touching the incore inode, all - * manipulations incore are the caller's responsibility. - * The on-disk inode is not changed by this operation, only the - * btree (free inode mask) is changed. - */ -int /* error */ -xfs_difree( - struct xfs_trans *tp, /* transaction pointer */ - xfs_ino_t inode, /* inode to be freed */ - struct xfs_icluster *ifree); /* cluster info if deleted */ +int xfs_difree(struct xfs_trans *tp, struct xfs_perag *pag, + xfs_ino_t ino, struct xfs_icluster *ifree); /* * Return the location of the inode in imap, for mapping it into a buffer. @@ -92,27 +60,12 @@ void xfs_ialloc_log_agi( struct xfs_trans *tp, /* transaction pointer */ struct xfs_buf *bp, /* allocation group header buffer */ - int fields); /* bitmask of fields to log */ + uint32_t fields); /* bitmask of fields to log */ -/* - * Read in the allocation group header (inode allocation section) - */ -int /* error */ -xfs_ialloc_read_agi( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - struct xfs_buf **bpp); /* allocation group hdr buf */ - -/* - * Read in the allocation group header to initialise the per-ag data - * in the mount structure - */ -int -xfs_ialloc_pagi_init( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno); /* allocation group number */ +int xfs_read_agi(struct xfs_perag *pag, struct xfs_trans *tp, + struct xfs_buf **agibpp); +int xfs_ialloc_read_agi(struct xfs_perag *pag, struct xfs_trans *tp, + struct xfs_buf **agibpp); /* * Lookup a record by ino in the btree given by cur. @@ -134,11 +87,10 @@ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_agblock_t length, unsigned int gen); -int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, struct xfs_buf **bpp); union xfs_btree_rec; -void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, union xfs_btree_rec *rec, +void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, + const union xfs_btree_rec *rec, struct xfs_inobt_rec_incore *irec); int xfs_ialloc_has_inodes_at_extent(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, bool *exists); @@ -154,4 +106,7 @@ int xfs_ialloc_cluster_alignment(struct xfs_mount *mp); void xfs_ialloc_setup_geometry(struct xfs_mount *mp); xfs_ino_t xfs_ialloc_calc_rootino(struct xfs_mount *mp, int sunit); +int xfs_ialloc_check_shrink(struct xfs_trans *tp, xfs_agnumber_t agno, + struct xfs_buf *agibp, xfs_agblock_t new_length); + #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index b82992f795aa..8c83e265770c 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -12,6 +12,7 @@ #include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "xfs_ialloc.h" #include "xfs_ialloc_btree.h" #include "xfs_alloc.h" @@ -19,7 +20,9 @@ #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_rmap.h" +#include "xfs_ag.h" +static struct kmem_cache *xfs_inobt_cur_cache; STATIC int xfs_inobt_get_minrecs( @@ -34,18 +37,17 @@ xfs_inobt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agbp, cur->bc_private.a.agno, - cur->bc_btnum); + cur->bc_ag.agbp, cur->bc_ag.pag, cur->bc_btnum); } STATIC void xfs_inobt_set_root( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *nptr, - int inc) /* level change */ + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *nptr, + int inc) /* level change */ { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agi *agi = agbp->b_addr; agi->agi_root = nptr->s; be32_add_cpu(&agi->agi_level, inc); @@ -54,12 +56,12 @@ xfs_inobt_set_root( STATIC void xfs_finobt_set_root( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *nptr, - int inc) /* level change */ + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *nptr, + int inc) /* level change */ { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agi *agi = agbp->b_addr; agi->agi_free_root = nptr->s; be32_add_cpu(&agi->agi_free_level, inc); @@ -67,13 +69,32 @@ xfs_finobt_set_root( XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL); } +/* Update the inode btree block counter for this btree. */ +static inline void +xfs_inobt_mod_blockcount( + struct xfs_btree_cur *cur, + int howmuch) +{ + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agi *agi = agbp->b_addr; + + if (!xfs_has_inobtcounts(cur->bc_mp)) + return; + + if (cur->bc_btnum == XFS_BTNUM_FINO) + be32_add_cpu(&agi->agi_fblocks, howmuch); + else if (cur->bc_btnum == XFS_BTNUM_INO) + be32_add_cpu(&agi->agi_iblocks, howmuch); + xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_IBLOCKS); +} + STATIC int __xfs_inobt_alloc_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *start, - union xfs_btree_ptr *new, - int *stat, - enum xfs_ag_resv_type resv) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat, + enum xfs_ag_resv_type resv) { xfs_alloc_arg_t args; /* block allocation args */ int error; /* error return value */ @@ -83,7 +104,7 @@ __xfs_inobt_alloc_block( args.tp = cur->bc_tp; args.mp = cur->bc_mp; args.oinfo = XFS_RMAP_OINFO_INOBT; - args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_ag.pag->pag_agno, sbno); args.minlen = 1; args.maxlen = 1; args.prod = 1; @@ -102,25 +123,26 @@ __xfs_inobt_alloc_block( new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno)); *stat = 1; + xfs_inobt_mod_blockcount(cur, 1); return 0; } STATIC int xfs_inobt_alloc_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *start, - union xfs_btree_ptr *new, - int *stat) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) { return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE); } STATIC int xfs_finobt_alloc_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *start, - union xfs_btree_ptr *new, - int *stat) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) { if (cur->bc_mp->m_finobt_nores) return xfs_inobt_alloc_block(cur, start, new, stat); @@ -134,8 +156,9 @@ __xfs_inobt_free_block( struct xfs_buf *bp, enum xfs_ag_resv_type resv) { + xfs_inobt_mod_blockcount(cur, -1); return xfs_free_extent(cur->bc_tp, - XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1, + XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)), 1, &XFS_RMAP_OINFO_INOBT, resv); } @@ -167,18 +190,18 @@ xfs_inobt_get_maxrecs( STATIC void xfs_inobt_init_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { key->inobt.ir_startino = rec->inobt.ir_startino; } STATIC void xfs_inobt_init_high_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { - __u32 x; + __u32 x; x = be32_to_cpu(rec->inobt.ir_startino); x += XFS_INODES_PER_CHUNK - 1; @@ -191,7 +214,7 @@ xfs_inobt_init_rec_from_cur( union xfs_btree_rec *rec) { rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino); - if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) { + if (xfs_has_sparseinodes(cur->bc_mp)) { rec->inobt.ir_u.sp.ir_holemask = cpu_to_be16(cur->bc_rec.i.ir_holemask); rec->inobt.ir_u.sp.ir_count = cur->bc_rec.i.ir_count; @@ -212,9 +235,9 @@ xfs_inobt_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); + struct xfs_agi *agi = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno)); + ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno)); ptr->s = agi->agi_root; } @@ -224,16 +247,16 @@ xfs_finobt_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); + struct xfs_agi *agi = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno)); + ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno)); ptr->s = agi->agi_free_root; } STATIC int64_t xfs_inobt_key_diff( - struct xfs_btree_cur *cur, - union xfs_btree_key *key) + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) { return (int64_t)be32_to_cpu(key->inobt.ir_startino) - cur->bc_rec.i.ir_startino; @@ -241,9 +264,9 @@ xfs_inobt_key_diff( STATIC int64_t xfs_inobt_diff_two_keys( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { return (int64_t)be32_to_cpu(k1->inobt.ir_startino) - be32_to_cpu(k2->inobt.ir_startino); @@ -271,7 +294,7 @@ xfs_inobt_verify( * but beware of the landmine (i.e. need to check pag->pagi_init) if we * ever do. */ - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { fa = xfs_btree_sblock_v5hdr_verify(bp); if (fa) return fa; @@ -339,9 +362,9 @@ const struct xfs_buf_ops xfs_finobt_buf_ops = { STATIC int xfs_inobt_keys_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { return be32_to_cpu(k1->inobt.ir_startino) < be32_to_cpu(k2->inobt.ir_startino); @@ -349,9 +372,9 @@ xfs_inobt_keys_inorder( STATIC int xfs_inobt_recs_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_rec *r1, - union xfs_btree_rec *r2) + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) { return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <= be32_to_cpu(r2->inobt.ir_startino); @@ -400,46 +423,123 @@ static const struct xfs_btree_ops xfs_finobt_ops = { }; /* - * Allocate a new inode btree cursor. + * Initialize a new inode btree cursor. */ -struct xfs_btree_cur * /* new inode btree cursor */ -xfs_inobt_init_cursor( +static struct xfs_btree_cur * +xfs_inobt_init_common( struct xfs_mount *mp, /* file system mount point */ struct xfs_trans *tp, /* transaction pointer */ - struct xfs_buf *agbp, /* buffer for agi structure */ - xfs_agnumber_t agno, /* allocation group number */ + struct xfs_perag *pag, xfs_btnum_t btnum) /* ialloc or free ino btree */ { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); struct xfs_btree_cur *cur; - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); - - cur->bc_tp = tp; - cur->bc_mp = mp; - cur->bc_btnum = btnum; + cur = xfs_btree_alloc_cursor(mp, tp, btnum, + M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache); if (btnum == XFS_BTNUM_INO) { - cur->bc_nlevels = be32_to_cpu(agi->agi_level); - cur->bc_ops = &xfs_inobt_ops; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2); + cur->bc_ops = &xfs_inobt_ops; } else { - cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); - cur->bc_ops = &xfs_finobt_ops; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2); + cur->bc_ops = &xfs_finobt_ops; } - cur->bc_blocklog = mp->m_sb.sb_blocklog; - - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_private.a.agbp = agbp; - cur->bc_private.a.agno = agno; + /* take a reference for the cursor */ + atomic_inc(&pag->pag_ref); + cur->bc_ag.pag = pag; + return cur; +} + +/* Create an inode btree cursor. */ +struct xfs_btree_cur * +xfs_inobt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + struct xfs_perag *pag, + xfs_btnum_t btnum) +{ + struct xfs_btree_cur *cur; + struct xfs_agi *agi = agbp->b_addr; + + cur = xfs_inobt_init_common(mp, tp, pag, btnum); + if (btnum == XFS_BTNUM_INO) + cur->bc_nlevels = be32_to_cpu(agi->agi_level); + else + cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); + cur->bc_ag.agbp = agbp; + return cur; +} + +/* Create an inode btree cursor with a fake root for staging. */ +struct xfs_btree_cur * +xfs_inobt_stage_cursor( + struct xfs_mount *mp, + struct xbtree_afakeroot *afake, + struct xfs_perag *pag, + xfs_btnum_t btnum) +{ + struct xfs_btree_cur *cur; + cur = xfs_inobt_init_common(mp, NULL, pag, btnum); + xfs_btree_stage_afakeroot(cur, afake); return cur; } /* + * Install a new inobt btree root. Caller is responsible for invalidating + * and freeing the old btree blocks. + */ +void +xfs_inobt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + struct xfs_buf *agbp) +{ + struct xfs_agi *agi = agbp->b_addr; + struct xbtree_afakeroot *afake = cur->bc_ag.afake; + int fields; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + if (cur->bc_btnum == XFS_BTNUM_INO) { + fields = XFS_AGI_ROOT | XFS_AGI_LEVEL; + agi->agi_root = cpu_to_be32(afake->af_root); + agi->agi_level = cpu_to_be32(afake->af_levels); + if (xfs_has_inobtcounts(cur->bc_mp)) { + agi->agi_iblocks = cpu_to_be32(afake->af_blocks); + fields |= XFS_AGI_IBLOCKS; + } + xfs_ialloc_log_agi(tp, agbp, fields); + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_inobt_ops); + } else { + fields = XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL; + agi->agi_free_root = cpu_to_be32(afake->af_root); + agi->agi_free_level = cpu_to_be32(afake->af_levels); + if (xfs_has_inobtcounts(cur->bc_mp)) { + agi->agi_fblocks = cpu_to_be32(afake->af_blocks); + fields |= XFS_AGI_IBLOCKS; + } + xfs_ialloc_log_agi(tp, agbp, fields); + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_finobt_ops); + } +} + +/* Calculate number of records in an inode btree block. */ +static inline unsigned int +xfs_inobt_block_maxrecs( + unsigned int blocklen, + bool leaf) +{ + if (leaf) + return blocklen / sizeof(xfs_inobt_rec_t); + return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t)); +} + +/* * Calculate number of records in an inobt btree block. */ int @@ -449,10 +549,54 @@ xfs_inobt_maxrecs( int leaf) { blocklen -= XFS_INOBT_BLOCK_LEN(mp); + return xfs_inobt_block_maxrecs(blocklen, leaf); +} - if (leaf) - return blocklen / sizeof(xfs_inobt_rec_t); - return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t)); +/* + * Maximum number of inode btree records per AG. Pretend that we can fill an + * entire AG completely full of inodes except for the AG headers. + */ +#define XFS_MAX_INODE_RECORDS \ + ((XFS_MAX_AG_BYTES - (4 * BBSIZE)) / XFS_DINODE_MIN_SIZE) / \ + XFS_INODES_PER_CHUNK + +/* Compute the max possible height for the inode btree. */ +static inline unsigned int +xfs_inobt_maxlevels_ondisk(void) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = min(XFS_MIN_BLOCKSIZE - XFS_BTREE_SBLOCK_LEN, + XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN); + + minrecs[0] = xfs_inobt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = xfs_inobt_block_maxrecs(blocklen, false) / 2; + + return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_INODE_RECORDS); +} + +/* Compute the max possible height for the free inode btree. */ +static inline unsigned int +xfs_finobt_maxlevels_ondisk(void) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN; + + minrecs[0] = xfs_inobt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = xfs_inobt_block_maxrecs(blocklen, false) / 2; + + return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_INODE_RECORDS); +} + +/* Compute the max possible height for either inode btree. */ +unsigned int +xfs_iallocbt_maxlevels_ondisk(void) +{ + return max(xfs_inobt_maxlevels_ondisk(), + xfs_finobt_maxlevels_ondisk()); } /* @@ -539,10 +683,10 @@ xfs_inobt_rec_check_count( static xfs_extlen_t xfs_inobt_max_size( - struct xfs_mount *mp, - xfs_agnumber_t agno) + struct xfs_perag *pag) { - xfs_agblock_t agblocks = xfs_ag_block_count(mp, agno); + struct xfs_mount *mp = pag->pag_mount; + xfs_agblock_t agblocks = pag->block_count; /* Bail out if we're uninitialized, which can happen in mkfs. */ if (M_IGEO(mp)->inobt_mxr[0] == 0) @@ -553,8 +697,7 @@ xfs_inobt_max_size( * never be available for the kinds of things that would require btree * expansion. We therefore can pretend the space isn't there. */ - if (mp->m_sb.sb_logstart && - XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno) + if (xfs_ag_contains_log(mp, pag->pag_agno)) agblocks -= mp->m_sb.sb_logblocks; return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr, @@ -567,7 +710,7 @@ int xfs_inobt_cur( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_btnum_t which, struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp) @@ -578,16 +721,11 @@ xfs_inobt_cur( ASSERT(*agi_bpp == NULL); ASSERT(*curpp == NULL); - error = xfs_ialloc_read_agi(mp, tp, agno, agi_bpp); + error = xfs_ialloc_read_agi(pag, tp, agi_bpp); if (error) return error; - cur = xfs_inobt_init_cursor(mp, tp, *agi_bpp, agno, which); - if (!cur) { - xfs_trans_brelse(tp, *agi_bpp); - *agi_bpp = NULL; - return -ENOMEM; - } + cur = xfs_inobt_init_cursor(mp, tp, *agi_bpp, pag, which); *curpp = cur; return 0; } @@ -596,7 +734,7 @@ static int xfs_inobt_count_blocks( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_btnum_t btnum, xfs_extlen_t *tree_blocks) { @@ -604,7 +742,7 @@ xfs_inobt_count_blocks( struct xfs_btree_cur *cur = NULL; int error; - error = xfs_inobt_cur(mp, tp, agno, btnum, &cur, &agbp); + error = xfs_inobt_cur(mp, tp, pag, btnum, &cur, &agbp); if (error) return error; @@ -615,6 +753,27 @@ xfs_inobt_count_blocks( return error; } +/* Read finobt block count from AGI header. */ +static int +xfs_finobt_read_blocks( + struct xfs_perag *pag, + struct xfs_trans *tp, + xfs_extlen_t *tree_blocks) +{ + struct xfs_buf *agbp; + struct xfs_agi *agi; + int error; + + error = xfs_ialloc_read_agi(pag, tp, &agbp); + if (error) + return error; + + agi = agbp->b_addr; + *tree_blocks = be32_to_cpu(agi->agi_fblocks); + xfs_trans_brelse(tp, agbp); + return 0; +} + /* * Figure out how many blocks to reserve and how many are used by this btree. */ @@ -622,21 +781,25 @@ int xfs_finobt_calc_reserves( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used) { xfs_extlen_t tree_len = 0; int error; - if (!xfs_sb_version_hasfinobt(&mp->m_sb)) + if (!xfs_has_finobt(mp)) return 0; - error = xfs_inobt_count_blocks(mp, tp, agno, XFS_BTNUM_FINO, &tree_len); + if (xfs_has_inobtcounts(mp)) + error = xfs_finobt_read_blocks(pag, tp, &tree_len); + else + error = xfs_inobt_count_blocks(mp, tp, pag, XFS_BTNUM_FINO, + &tree_len); if (error) return error; - *ask += xfs_inobt_max_size(mp, agno); + *ask += xfs_inobt_max_size(pag); *used += tree_len; return 0; } @@ -649,3 +812,22 @@ xfs_iallocbt_calc_size( { return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr, len); } + +int __init +xfs_inobt_init_cur_cache(void) +{ + xfs_inobt_cur_cache = kmem_cache_create("xfs_inobt_cur", + xfs_btree_cur_sizeof(xfs_inobt_maxlevels_ondisk()), + 0, 0, NULL); + + if (!xfs_inobt_cur_cache) + return -ENOMEM; + return 0; +} + +void +xfs_inobt_destroy_cur_cache(void) +{ + kmem_cache_destroy(xfs_inobt_cur_cache); + xfs_inobt_cur_cache = NULL; +} diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index 951305ecaae1..26451cb76b98 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -13,12 +13,13 @@ struct xfs_buf; struct xfs_btree_cur; struct xfs_mount; +struct xfs_perag; /* * Btree block header size depends on a superblock flag. */ #define XFS_INOBT_BLOCK_LEN(mp) \ - (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + (xfs_has_crc(((mp))) ? \ XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN) /* @@ -45,9 +46,12 @@ struct xfs_mount; (maxrecs) * sizeof(xfs_inobt_key_t) + \ ((index) - 1) * sizeof(xfs_inobt_ptr_t))) -extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, - struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t, - xfs_btnum_t); +extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *mp, + struct xfs_trans *tp, struct xfs_buf *agbp, + struct xfs_perag *pag, xfs_btnum_t btnum); +struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_mount *mp, + struct xbtree_afakeroot *afake, struct xfs_perag *pag, + xfs_btnum_t btnum); extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); /* ir_holemask to inode allocation bitmap conversion */ @@ -61,11 +65,19 @@ int xfs_inobt_rec_check_count(struct xfs_mount *, #endif /* DEBUG */ int xfs_finobt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); + struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used); extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp, unsigned long long len); int xfs_inobt_cur(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, xfs_btnum_t btnum, + struct xfs_perag *pag, xfs_btnum_t btnum, struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp); +void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp, struct xfs_buf *agbp); + +unsigned int xfs_iallocbt_maxlevels_ondisk(void); + +int __init xfs_inobt_init_cur_cache(void); +void xfs_inobt_destroy_cur_cache(void); + #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index 52451809c478..773cf4349428 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -8,9 +8,9 @@ #include "xfs_format.h" #include "xfs_bit.h" #include "xfs_log_format.h" -#include "xfs_inode.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_inode.h" #include "xfs_trace.h" /* @@ -603,7 +603,7 @@ xfs_iext_realloc_root( if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF) new_size = NODE_SIZE; - new = kmem_realloc(ifp->if_u1.if_root, new_size, KM_NOFS); + new = krealloc(ifp->if_u1.if_root, new_size, GFP_NOFS | __GFP_NOFAIL); memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes); ifp->if_u1.if_root = new; cur->leaf = new; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 8afacfe4be0a..758aacd8166b 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -10,6 +10,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_ag.h" #include "xfs_inode.h" #include "xfs_errortag.h" #include "xfs_error.h" @@ -21,41 +22,6 @@ #include <linux/iversion.h> /* - * Check that none of the inode's in the buffer have a next - * unlinked field of 0. - */ -#if defined(DEBUG) -void -xfs_inobp_check( - xfs_mount_t *mp, - xfs_buf_t *bp) -{ - int i; - xfs_dinode_t *dip; - - for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) { - dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize); - if (!dip->di_next_unlinked) { - xfs_alert(mp, - "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.", - i, (long long)bp->b_bn); - } - } -} -#endif - -bool -xfs_dinode_good_version( - struct xfs_mount *mp, - __u8 version) -{ - if (xfs_sb_version_hascrc(&mp->m_sb)) - return version == 3; - - return version == 1 || version == 2; -} - -/* * If we are doing readahead on an inode buffer, we might be in log recovery * reading an inode allocation buffer that hasn't yet been replayed, and hence * has not had the inode cores stamped into it. Hence for readahead, the buffer @@ -64,10 +30,10 @@ xfs_dinode_good_version( * If the readahead buffer is invalid, we need to mark it with an error and * clear the DONE status of the buffer so that a followup read will re-read it * from disk. We don't report the error otherwise to avoid warnings during log - * recovery and we don't get unnecssary panics on debug kernels. We use EIO here + * recovery and we don't get unnecessary panics on debug kernels. We use EIO here * because all we want to do is say readahead failed; there is no-one to report * the error to, so this will distinguish it from a non-ra verifier failure. - * Changes to this readahead error behavour also need to be reflected in + * Changes to this readahead error behaviour also need to be reflected in * xfs_dquot_buf_readahead_verify(). */ static void @@ -76,25 +42,23 @@ xfs_inode_buf_verify( bool readahead) { struct xfs_mount *mp = bp->b_mount; - xfs_agnumber_t agno; int i; int ni; /* * Validate the magic number and version of every inode in the buffer */ - agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp)); ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; for (i = 0; i < ni; i++) { - int di_ok; - xfs_dinode_t *dip; - xfs_agino_t unlinked_ino; + struct xfs_dinode *dip; + xfs_agino_t unlinked_ino; + int di_ok; dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); unlinked_ino = be32_to_cpu(dip->di_next_unlinked); di_ok = xfs_verify_magic16(bp, dip->di_magic) && xfs_dinode_good_version(mp, dip->di_version) && - xfs_verify_agino_or_null(mp, agno, unlinked_ino); + xfs_verify_agino_or_null(bp->b_pag, unlinked_ino); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP))) { if (readahead) { @@ -106,7 +70,7 @@ xfs_inode_buf_verify( #ifdef DEBUG xfs_alert(mp, "bad inode magic/vsn daddr %lld #%d (magic=%x)", - (unsigned long long)bp->b_bn, i, + (unsigned long long)xfs_buf_daddr(bp), i, be16_to_cpu(dip->di_magic)); #endif xfs_buf_verifier_error(bp, -EFSCORRUPTED, @@ -159,72 +123,96 @@ const struct xfs_buf_ops xfs_inode_buf_ra_ops = { /* * This routine is called to map an inode to the buffer containing the on-disk * version of the inode. It returns a pointer to the buffer containing the - * on-disk inode in the bpp parameter, and in the dipp parameter it returns a - * pointer to the on-disk inode within that buffer. - * - * If a non-zero error is returned, then the contents of bpp and dipp are - * undefined. + * on-disk inode in the bpp parameter. */ int xfs_imap_to_bp( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_imap *imap, - struct xfs_dinode **dipp, - struct xfs_buf **bpp, - uint buf_flags, - uint iget_flags) + struct xfs_buf **bpp) { - struct xfs_buf *bp; - int error; - - buf_flags |= XBF_UNMAPPED; - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, - (int)imap->im_len, buf_flags, &bp, + return xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, + imap->im_len, XBF_UNMAPPED, bpp, &xfs_inode_buf_ops); - if (error) { - if (error == -EAGAIN) { - ASSERT(buf_flags & XBF_TRYLOCK); - return error; - } - xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.", - __func__, error); - return error; - } +} - *bpp = bp; - *dipp = xfs_buf_offset(bp, imap->im_boffset); - return 0; +static inline struct timespec64 xfs_inode_decode_bigtime(uint64_t ts) +{ + struct timespec64 tv; + uint32_t n; + + tv.tv_sec = xfs_bigtime_to_unix(div_u64_rem(ts, NSEC_PER_SEC, &n)); + tv.tv_nsec = n; + + return tv; } -void +/* Convert an ondisk timestamp to an incore timestamp. */ +struct timespec64 +xfs_inode_from_disk_ts( + struct xfs_dinode *dip, + const xfs_timestamp_t ts) +{ + struct timespec64 tv; + struct xfs_legacy_timestamp *lts; + + if (xfs_dinode_has_bigtime(dip)) + return xfs_inode_decode_bigtime(be64_to_cpu(ts)); + + lts = (struct xfs_legacy_timestamp *)&ts; + tv.tv_sec = (int)be32_to_cpu(lts->t_sec); + tv.tv_nsec = (int)be32_to_cpu(lts->t_nsec); + + return tv; +} + +int xfs_inode_from_disk( struct xfs_inode *ip, struct xfs_dinode *from) { - struct xfs_icdinode *to = &ip->i_d; struct inode *inode = VFS_I(ip); + int error; + xfs_failaddr_t fa; + ASSERT(ip->i_cowfp == NULL); + + fa = xfs_dinode_verify(ip->i_mount, ip->i_ino, from); + if (fa) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, "dinode", from, + sizeof(*from), fa); + return -EFSCORRUPTED; + } + + /* + * First get the permanent information that is needed to allocate an + * inode. If the inode is unused, mode is zero and we shouldn't mess + * with the uninitialized part of it. + */ + if (!xfs_has_v3inodes(ip->i_mount)) + ip->i_flushiter = be16_to_cpu(from->di_flushiter); + inode->i_generation = be32_to_cpu(from->di_gen); + inode->i_mode = be16_to_cpu(from->di_mode); + if (!inode->i_mode) + return 0; /* * Convert v1 inodes immediately to v2 inode format as this is the * minimum inode version format we support in the rest of the code. + * They will also be unconditionally written back to disk as v2 inodes. */ - to->di_version = from->di_version; - if (to->di_version == 1) { + if (unlikely(from->di_version == 1)) { set_nlink(inode, be16_to_cpu(from->di_onlink)); - to->di_projid = 0; - to->di_version = 2; + ip->i_projid = 0; } else { set_nlink(inode, be32_to_cpu(from->di_nlink)); - to->di_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 | + ip->i_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 | be16_to_cpu(from->di_projid_lo); } - to->di_format = from->di_format; - to->di_uid = be32_to_cpu(from->di_uid); - to->di_gid = be32_to_cpu(from->di_gid); - to->di_flushiter = be16_to_cpu(from->di_flushiter); + i_uid_write(inode, be32_to_cpu(from->di_uid)); + i_gid_write(inode, be32_to_cpu(from->di_gid)); /* * Time is signed, so need to convert to signed 32 bit before @@ -232,33 +220,80 @@ xfs_inode_from_disk( * a time before epoch is converted to a time long after epoch * on 64 bit systems. */ - inode->i_atime.tv_sec = (int)be32_to_cpu(from->di_atime.t_sec); - inode->i_atime.tv_nsec = (int)be32_to_cpu(from->di_atime.t_nsec); - inode->i_mtime.tv_sec = (int)be32_to_cpu(from->di_mtime.t_sec); - inode->i_mtime.tv_nsec = (int)be32_to_cpu(from->di_mtime.t_nsec); - inode->i_ctime.tv_sec = (int)be32_to_cpu(from->di_ctime.t_sec); - inode->i_ctime.tv_nsec = (int)be32_to_cpu(from->di_ctime.t_nsec); - inode->i_generation = be32_to_cpu(from->di_gen); - inode->i_mode = be16_to_cpu(from->di_mode); + inode->i_atime = xfs_inode_from_disk_ts(from, from->di_atime); + inode->i_mtime = xfs_inode_from_disk_ts(from, from->di_mtime); + inode->i_ctime = xfs_inode_from_disk_ts(from, from->di_ctime); + + ip->i_disk_size = be64_to_cpu(from->di_size); + ip->i_nblocks = be64_to_cpu(from->di_nblocks); + ip->i_extsize = be32_to_cpu(from->di_extsize); + ip->i_forkoff = from->di_forkoff; + ip->i_diflags = be16_to_cpu(from->di_flags); + ip->i_next_unlinked = be32_to_cpu(from->di_next_unlinked); + + if (from->di_dmevmask || from->di_dmstate) + xfs_iflags_set(ip, XFS_IPRESERVE_DM_FIELDS); - to->di_size = be64_to_cpu(from->di_size); - to->di_nblocks = be64_to_cpu(from->di_nblocks); - to->di_extsize = be32_to_cpu(from->di_extsize); - to->di_nextents = be32_to_cpu(from->di_nextents); - to->di_anextents = be16_to_cpu(from->di_anextents); - to->di_forkoff = from->di_forkoff; - to->di_aformat = from->di_aformat; - to->di_dmevmask = be32_to_cpu(from->di_dmevmask); - to->di_dmstate = be16_to_cpu(from->di_dmstate); - to->di_flags = be16_to_cpu(from->di_flags); - - if (to->di_version == 3) { + if (xfs_has_v3inodes(ip->i_mount)) { inode_set_iversion_queried(inode, be64_to_cpu(from->di_changecount)); - to->di_crtime.tv_sec = be32_to_cpu(from->di_crtime.t_sec); - to->di_crtime.tv_nsec = be32_to_cpu(from->di_crtime.t_nsec); - to->di_flags2 = be64_to_cpu(from->di_flags2); - to->di_cowextsize = be32_to_cpu(from->di_cowextsize); + ip->i_crtime = xfs_inode_from_disk_ts(from, from->di_crtime); + ip->i_diflags2 = be64_to_cpu(from->di_flags2); + ip->i_cowextsize = be32_to_cpu(from->di_cowextsize); + } + + error = xfs_iformat_data_fork(ip, from); + if (error) + return error; + if (from->di_forkoff) { + error = xfs_iformat_attr_fork(ip, from); + if (error) + goto out_destroy_data_fork; + } + if (xfs_is_reflink_inode(ip)) + xfs_ifork_init_cow(ip); + return 0; + +out_destroy_data_fork: + xfs_idestroy_fork(&ip->i_df); + return error; +} + +/* Convert an incore timestamp to an ondisk timestamp. */ +static inline xfs_timestamp_t +xfs_inode_to_disk_ts( + struct xfs_inode *ip, + const struct timespec64 tv) +{ + struct xfs_legacy_timestamp *lts; + xfs_timestamp_t ts; + + if (xfs_inode_has_bigtime(ip)) + return cpu_to_be64(xfs_inode_encode_bigtime(tv)); + + lts = (struct xfs_legacy_timestamp *)&ts; + lts->t_sec = cpu_to_be32(tv.tv_sec); + lts->t_nsec = cpu_to_be32(tv.tv_nsec); + + return ts; +} + +static inline void +xfs_inode_to_disk_iext_counters( + struct xfs_inode *ip, + struct xfs_dinode *to) +{ + if (xfs_inode_has_large_extent_counts(ip)) { + to->di_big_nextents = cpu_to_be64(xfs_ifork_nextents(&ip->i_df)); + to->di_big_anextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_af)); + /* + * We might be upgrading the inode to use larger extent counters + * than was previously used. Hence zero the unused field. + */ + to->di_nrext64_pad = cpu_to_be16(0); + } else { + to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df)); + to->di_anextents = cpu_to_be16(xfs_ifork_nextents(&ip->i_af)); } } @@ -268,107 +303,49 @@ xfs_inode_to_disk( struct xfs_dinode *to, xfs_lsn_t lsn) { - struct xfs_icdinode *from = &ip->i_d; struct inode *inode = VFS_I(ip); to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); to->di_onlink = 0; - to->di_version = from->di_version; - to->di_format = from->di_format; - to->di_uid = cpu_to_be32(from->di_uid); - to->di_gid = cpu_to_be32(from->di_gid); - to->di_projid_lo = cpu_to_be16(from->di_projid & 0xffff); - to->di_projid_hi = cpu_to_be16(from->di_projid >> 16); - - memset(to->di_pad, 0, sizeof(to->di_pad)); - to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec); - to->di_atime.t_nsec = cpu_to_be32(inode->i_atime.tv_nsec); - to->di_mtime.t_sec = cpu_to_be32(inode->i_mtime.tv_sec); - to->di_mtime.t_nsec = cpu_to_be32(inode->i_mtime.tv_nsec); - to->di_ctime.t_sec = cpu_to_be32(inode->i_ctime.tv_sec); - to->di_ctime.t_nsec = cpu_to_be32(inode->i_ctime.tv_nsec); + to->di_format = xfs_ifork_format(&ip->i_df); + to->di_uid = cpu_to_be32(i_uid_read(inode)); + to->di_gid = cpu_to_be32(i_gid_read(inode)); + to->di_projid_lo = cpu_to_be16(ip->i_projid & 0xffff); + to->di_projid_hi = cpu_to_be16(ip->i_projid >> 16); + + to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime); + to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime); + to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime); to->di_nlink = cpu_to_be32(inode->i_nlink); to->di_gen = cpu_to_be32(inode->i_generation); to->di_mode = cpu_to_be16(inode->i_mode); - to->di_size = cpu_to_be64(from->di_size); - to->di_nblocks = cpu_to_be64(from->di_nblocks); - to->di_extsize = cpu_to_be32(from->di_extsize); - to->di_nextents = cpu_to_be32(from->di_nextents); - to->di_anextents = cpu_to_be16(from->di_anextents); - to->di_forkoff = from->di_forkoff; - to->di_aformat = from->di_aformat; - to->di_dmevmask = cpu_to_be32(from->di_dmevmask); - to->di_dmstate = cpu_to_be16(from->di_dmstate); - to->di_flags = cpu_to_be16(from->di_flags); - - if (from->di_version == 3) { + to->di_size = cpu_to_be64(ip->i_disk_size); + to->di_nblocks = cpu_to_be64(ip->i_nblocks); + to->di_extsize = cpu_to_be32(ip->i_extsize); + to->di_forkoff = ip->i_forkoff; + to->di_aformat = xfs_ifork_format(&ip->i_af); + to->di_flags = cpu_to_be16(ip->i_diflags); + + if (xfs_has_v3inodes(ip->i_mount)) { + to->di_version = 3; to->di_changecount = cpu_to_be64(inode_peek_iversion(inode)); - to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.tv_sec); - to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.tv_nsec); - to->di_flags2 = cpu_to_be64(from->di_flags2); - to->di_cowextsize = cpu_to_be32(from->di_cowextsize); + to->di_crtime = xfs_inode_to_disk_ts(ip, ip->i_crtime); + to->di_flags2 = cpu_to_be64(ip->i_diflags2); + to->di_cowextsize = cpu_to_be32(ip->i_cowextsize); to->di_ino = cpu_to_be64(ip->i_ino); to->di_lsn = cpu_to_be64(lsn); memset(to->di_pad2, 0, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); - to->di_flushiter = 0; + to->di_v3_pad = 0; } else { - to->di_flushiter = cpu_to_be16(from->di_flushiter); + to->di_version = 2; + to->di_flushiter = cpu_to_be16(ip->i_flushiter); + memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad)); } -} -void -xfs_log_dinode_to_disk( - struct xfs_log_dinode *from, - struct xfs_dinode *to) -{ - to->di_magic = cpu_to_be16(from->di_magic); - to->di_mode = cpu_to_be16(from->di_mode); - to->di_version = from->di_version; - to->di_format = from->di_format; - to->di_onlink = 0; - to->di_uid = cpu_to_be32(from->di_uid); - to->di_gid = cpu_to_be32(from->di_gid); - to->di_nlink = cpu_to_be32(from->di_nlink); - to->di_projid_lo = cpu_to_be16(from->di_projid_lo); - to->di_projid_hi = cpu_to_be16(from->di_projid_hi); - memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); - - to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); - to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); - to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); - to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec); - to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec); - to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec); - - to->di_size = cpu_to_be64(from->di_size); - to->di_nblocks = cpu_to_be64(from->di_nblocks); - to->di_extsize = cpu_to_be32(from->di_extsize); - to->di_nextents = cpu_to_be32(from->di_nextents); - to->di_anextents = cpu_to_be16(from->di_anextents); - to->di_forkoff = from->di_forkoff; - to->di_aformat = from->di_aformat; - to->di_dmevmask = cpu_to_be32(from->di_dmevmask); - to->di_dmstate = cpu_to_be16(from->di_dmstate); - to->di_flags = cpu_to_be16(from->di_flags); - to->di_gen = cpu_to_be32(from->di_gen); - - if (from->di_version == 3) { - to->di_changecount = cpu_to_be64(from->di_changecount); - to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); - to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); - to->di_flags2 = cpu_to_be64(from->di_flags2); - to->di_cowextsize = cpu_to_be32(from->di_cowextsize); - to->di_ino = cpu_to_be64(from->di_ino); - to->di_lsn = cpu_to_be64(from->di_lsn); - memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); - uuid_copy(&to->di_uuid, &from->di_uuid); - to->di_flushiter = 0; - } else { - to->di_flushiter = cpu_to_be16(from->di_flushiter); - } + xfs_inode_to_disk_iext_counters(ip, to); } static xfs_failaddr_t @@ -377,20 +354,40 @@ xfs_dinode_verify_fork( struct xfs_mount *mp, int whichfork) { - uint32_t di_nextents = XFS_DFORK_NEXTENTS(dip, whichfork); + xfs_extnum_t di_nextents; + xfs_extnum_t max_extents; + mode_t mode = be16_to_cpu(dip->di_mode); + uint32_t fork_size = XFS_DFORK_SIZE(dip, mp, whichfork); + uint32_t fork_format = XFS_DFORK_FORMAT(dip, whichfork); - switch (XFS_DFORK_FORMAT(dip, whichfork)) { + di_nextents = xfs_dfork_nextents(dip, whichfork); + + /* + * For fork types that can contain local data, check that the fork + * format matches the size of local data contained within the fork. + * + * For all types, check that when the size says the should be in extent + * or btree format, the inode isn't claiming it is in local format. + */ + if (whichfork == XFS_DATA_FORK) { + if (S_ISDIR(mode) || S_ISLNK(mode)) { + if (be64_to_cpu(dip->di_size) <= fork_size && + fork_format != XFS_DINODE_FMT_LOCAL) + return __this_address; + } + + if (be64_to_cpu(dip->di_size) > fork_size && + fork_format == XFS_DINODE_FMT_LOCAL) + return __this_address; + } + + switch (fork_format) { case XFS_DINODE_FMT_LOCAL: /* - * no local regular files yet + * No local regular files yet. */ - if (whichfork == XFS_DATA_FORK) { - if (S_ISREG(be16_to_cpu(dip->di_mode))) - return __this_address; - if (be64_to_cpu(dip->di_size) > - XFS_DFORK_SIZE(dip, mp, whichfork)) - return __this_address; - } + if (S_ISREG(mode) && whichfork == XFS_DATA_FORK) + return __this_address; if (di_nextents) return __this_address; break; @@ -399,12 +396,11 @@ xfs_dinode_verify_fork( return __this_address; break; case XFS_DINODE_FMT_BTREE: - if (whichfork == XFS_ATTR_FORK) { - if (di_nextents > MAXAEXTNUM) - return __this_address; - } else if (di_nextents > MAXEXTNUM) { + max_extents = xfs_iext_max_nextents( + xfs_dinode_has_large_extent_counts(dip), + whichfork); + if (di_nextents > max_extents) return __this_address; - } break; default: return __this_address; @@ -417,7 +413,7 @@ xfs_dinode_verify_forkoff( struct xfs_dinode *dip, struct xfs_mount *mp) { - if (!XFS_DFORK_Q(dip)) + if (!dip->di_forkoff) return NULL; switch (dip->di_format) { @@ -428,7 +424,7 @@ xfs_dinode_verify_forkoff( case XFS_DINODE_FMT_LOCAL: /* fall through ... */ case XFS_DINODE_FMT_EXTENTS: /* fall through ... */ case XFS_DINODE_FMT_BTREE: - if (dip->di_forkoff >= (XFS_LITINO(mp, dip->di_version) >> 3)) + if (dip->di_forkoff >= (XFS_LITINO(mp) >> 3)) return __this_address; break; default: @@ -437,6 +433,24 @@ xfs_dinode_verify_forkoff( return NULL; } +static xfs_failaddr_t +xfs_dinode_verify_nrext64( + struct xfs_mount *mp, + struct xfs_dinode *dip) +{ + if (xfs_dinode_has_large_extent_counts(dip)) { + if (!xfs_has_large_extent_counts(mp)) + return __this_address; + if (dip->di_nrext64_pad != 0) + return __this_address; + } else if (dip->di_version >= 3) { + if (dip->di_v3_pad != 0) + return __this_address; + } + + return NULL; +} + xfs_failaddr_t xfs_dinode_verify( struct xfs_mount *mp, @@ -448,13 +462,16 @@ xfs_dinode_verify( uint16_t flags; uint64_t flags2; uint64_t di_size; + xfs_extnum_t nextents; + xfs_extnum_t naextents; + xfs_filblks_t nblocks; if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) return __this_address; /* Verify v3 integrity information first */ if (dip->di_version >= 3) { - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_v3inodes(mp)) return __this_address; if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, XFS_DINODE_CRC_OFF)) @@ -478,10 +495,19 @@ xfs_dinode_verify( if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) return __this_address; + fa = xfs_dinode_verify_nrext64(mp, dip); + if (fa) + return fa; + + nextents = xfs_dfork_data_extents(dip); + naextents = xfs_dfork_attr_extents(dip); + nblocks = be64_to_cpu(dip->di_nblocks); + /* Fork checks carried over from xfs_iformat_fork */ - if (mode && - be32_to_cpu(dip->di_nextents) + be16_to_cpu(dip->di_anextents) > - be64_to_cpu(dip->di_nblocks)) + if (mode && nextents + naextents > nblocks) + return __this_address; + + if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents) return __this_address; if (mode && XFS_DFORK_BOFF(dip) > mp->m_sb.sb_inodesize) @@ -520,7 +546,7 @@ xfs_dinode_verify( return __this_address; } - if (XFS_DFORK_Q(dip)) { + if (dip->di_forkoff) { fa = xfs_dinode_verify_fork(dip, mp, XFS_ATTR_FORK); if (fa) return fa; @@ -538,7 +564,7 @@ xfs_dinode_verify( default: return __this_address; } - if (dip->di_anextents) + if (naextents) return __this_address; } @@ -556,7 +582,7 @@ xfs_dinode_verify( /* don't allow reflink/cowextsize if we don't have reflink */ if ((flags2 & (XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)) && - !xfs_sb_version_hasreflink(&mp->m_sb)) + !xfs_has_reflink(mp)) return __this_address; /* only regular files get reflink */ @@ -567,16 +593,17 @@ xfs_dinode_verify( if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME)) return __this_address; - /* don't let reflink and dax mix */ - if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags2 & XFS_DIFLAG2_DAX)) - return __this_address; - /* COW extent size hint validation */ fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), mode, flags, flags2); if (fa) return fa; + /* bigtime iflag can only happen on bigtime filesystems */ + if (xfs_dinode_has_bigtime(dip) && + !xfs_has_bigtime(mp)) + return __this_address; + return NULL; } @@ -590,136 +617,26 @@ xfs_dinode_calc_crc( if (dip->di_version < 3) return; - ASSERT(xfs_sb_version_hascrc(&mp->m_sb)); + ASSERT(xfs_has_crc(mp)); crc = xfs_start_cksum_update((char *)dip, mp->m_sb.sb_inodesize, XFS_DINODE_CRC_OFF); dip->di_crc = xfs_end_cksum(crc); } /* - * Read the disk inode attributes into the in-core inode structure. - * - * For version 5 superblocks, if we are initialising a new inode and we are not - * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new - * inode core with a random generation number. If we are keeping inodes around, - * we need to read the inode cluster to get the existing generation number off - * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode - * format) then log recovery is dependent on the di_flushiter field being - * initialised from the current on-disk value and hence we must also read the - * inode off disk. - */ -int -xfs_iread( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_inode_t *ip, - uint iget_flags) -{ - xfs_buf_t *bp; - xfs_dinode_t *dip; - xfs_failaddr_t fa; - int error; - - /* - * Fill in the location information in the in-core inode. - */ - error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags); - if (error) - return error; - - /* shortcut IO on inode allocation if possible */ - if ((iget_flags & XFS_IGET_CREATE) && - xfs_sb_version_hascrc(&mp->m_sb) && - !(mp->m_flags & XFS_MOUNT_IKEEP)) { - VFS_I(ip)->i_generation = prandom_u32(); - ip->i_d.di_version = 3; - return 0; - } - - /* - * Get pointers to the on-disk inode and the buffer containing it. - */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags); - if (error) - return error; - - /* even unallocated inodes are verified */ - fa = xfs_dinode_verify(mp, ip->i_ino, dip); - if (fa) { - xfs_inode_verifier_error(ip, -EFSCORRUPTED, "dinode", dip, - sizeof(*dip), fa); - error = -EFSCORRUPTED; - goto out_brelse; - } - - /* - * If the on-disk inode is already linked to a directory - * entry, copy all of the inode into the in-core inode. - * xfs_iformat_fork() handles copying in the inode format - * specific information. - * Otherwise, just get the truly permanent information. - */ - if (dip->di_mode) { - xfs_inode_from_disk(ip, dip); - error = xfs_iformat_fork(ip, dip); - if (error) { -#ifdef DEBUG - xfs_alert(mp, "%s: xfs_iformat() returned error %d", - __func__, error); -#endif /* DEBUG */ - goto out_brelse; - } - } else { - /* - * Partial initialisation of the in-core inode. Just the bits - * that xfs_ialloc won't overwrite or relies on being correct. - */ - ip->i_d.di_version = dip->di_version; - VFS_I(ip)->i_generation = be32_to_cpu(dip->di_gen); - ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); - - /* - * Make sure to pull in the mode here as well in - * case the inode is released without being used. - * This ensures that xfs_inactive() will see that - * the inode is already free and not try to mess - * with the uninitialized part of it. - */ - VFS_I(ip)->i_mode = 0; - } - - ASSERT(ip->i_d.di_version >= 2); - ip->i_delayed_blks = 0; - - /* - * Mark the buffer containing the inode as something to keep - * around for a while. This helps to keep recently accessed - * meta-data in-core longer. - */ - xfs_buf_set_ref(bp, XFS_INO_REF); - - /* - * Use xfs_trans_brelse() to release the buffer containing the on-disk - * inode, because it was acquired with xfs_trans_read_buf() in - * xfs_imap_to_bp() above. If tp is NULL, this is just a normal - * brelse(). If we're within a transaction, then xfs_trans_brelse() - * will only release the buffer if it is not dirty within the - * transaction. It will be OK to release the buffer in this case, - * because inodes on disk are never destroyed and we will be locking the - * new in-core inode before putting it in the cache where other - * processes can find it. Thus we don't have to worry about the inode - * being changed just because we released the buffer. - */ - out_brelse: - xfs_trans_brelse(tp, bp); - return error; -} - -/* * Validate di_extsize hint. * - * The rules are documented at xfs_ioctl_setattr_check_extsize(). - * These functions must be kept in sync with each other. + * 1. Extent size hint is only valid for directories and regular files. + * 2. FS_XFLAG_EXTSIZE is only valid for regular files. + * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories. + * 4. Hint cannot be larger than MAXTEXTLEN. + * 5. Can be changed on directories at any time. + * 6. Hint value of 0 turns off hints, clears inode flags. + * 7. Extent size must be a multiple of the appropriate block size. + * For realtime files, this is the rt extent size. + * 8. For non-realtime files, the extent size hint must be limited + * to half the AG size to avoid alignment extending the extent beyond the + * limits of the AG. */ xfs_failaddr_t xfs_inode_validate_extsize( @@ -739,8 +656,34 @@ xfs_inode_validate_extsize( inherit_flag = (flags & XFS_DIFLAG_EXTSZINHERIT); extsize_bytes = XFS_FSB_TO_B(mp, extsize); + /* + * This comment describes a historic gap in this verifier function. + * + * For a directory with both RTINHERIT and EXTSZINHERIT flags set, this + * function has never checked that the extent size hint is an integer + * multiple of the realtime extent size. Since we allow users to set + * this combination on non-rt filesystems /and/ to change the rt + * extent size when adding a rt device to a filesystem, the net effect + * is that users can configure a filesystem anticipating one rt + * geometry and change their minds later. Directories do not use the + * extent size hint, so this is harmless for them. + * + * If a directory with a misaligned extent size hint is allowed to + * propagate that hint into a new regular realtime file, the result + * is that the inode cluster buffer verifier will trigger a corruption + * shutdown the next time it is run, because the verifier has always + * enforced the alignment rule for regular files. + * + * Because we allow administrators to set a new rt extent size when + * adding a rt section, we cannot add a check to this verifier because + * that will result a new source of directory corruption errors when + * reading an existing filesystem. Instead, we rely on callers to + * decide when alignment checks are appropriate, and fix things up as + * needed. + */ + if (rt_flag) - blocksize_bytes = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog; + blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); else blocksize_bytes = mp->m_sb.sb_blocksize; @@ -763,7 +706,7 @@ xfs_inode_validate_extsize( if (extsize_bytes % blocksize_bytes) return __this_address; - if (extsize > MAXEXTLEN) + if (extsize > XFS_MAX_BMBT_EXTLEN) return __this_address; if (!rt_flag && extsize > mp->m_sb.sb_agblocks / 2) @@ -775,8 +718,15 @@ xfs_inode_validate_extsize( /* * Validate di_cowextsize hint. * - * The rules are documented at xfs_ioctl_setattr_check_cowextsize(). - * These functions must be kept in sync with each other. + * 1. CoW extent size hint can only be set if reflink is enabled on the fs. + * The inode does not have to have any shared blocks, but it must be a v3. + * 2. FS_XFLAG_COWEXTSIZE is only valid for directories and regular files; + * for a directory, the hint is propagated to new files. + * 3. Can be changed on files & directories at any time. + * 4. Hint value of 0 turns off hints, clears inode flags. + * 5. Extent size must be a multiple of the appropriate block size. + * 6. The extent size hint must be limited to half the AG size to avoid + * alignment extending the extent beyond the limits of the AG. */ xfs_failaddr_t xfs_inode_validate_cowextsize( @@ -794,7 +744,7 @@ xfs_inode_validate_cowextsize( hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE); cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize); - if (hint_flag && !xfs_sb_version_hasreflink(&mp->m_sb)) + if (hint_flag && !xfs_has_reflink(mp)) return __this_address; if (hint_flag && !(S_ISDIR(mode) || S_ISREG(mode))) @@ -813,7 +763,7 @@ xfs_inode_validate_cowextsize( if (cowextsize_bytes % mp->m_sb.sb_blocksize) return __this_address; - if (cowextsize > MAXEXTLEN) + if (cowextsize > XFS_MAX_BMBT_EXTLEN) return __this_address; if (cowextsize > mp->m_sb.sb_agblocks / 2) diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index fd94b1078722..585ed5a110af 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -10,36 +10,6 @@ struct xfs_inode; struct xfs_dinode; /* - * In memory representation of the XFS inode. This is held in the in-core struct - * xfs_inode and represents the current on disk values but the structure is not - * in on-disk format. That is, this structure is always translated to on-disk - * format specific structures at the appropriate time. - */ -struct xfs_icdinode { - int8_t di_version; /* inode version */ - int8_t di_format; /* format of di_c data */ - uint16_t di_flushiter; /* incremented on flush */ - uint32_t di_uid; /* owner's user id */ - uint32_t di_gid; /* owner's group id */ - uint32_t di_projid; /* owner's project id */ - xfs_fsize_t di_size; /* number of bytes in file */ - xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ - xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ - xfs_extnum_t di_nextents; /* number of extents in data fork */ - xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ - uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ - int8_t di_aformat; /* format of attr fork's data */ - uint32_t di_dmevmask; /* DMIG event mask */ - uint16_t di_dmstate; /* DMIG state info */ - uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ - - uint64_t di_flags2; /* more random flags */ - uint32_t di_cowextsize; /* basic cow extent size for file */ - - struct timespec64 di_crtime; /* time created */ -}; - -/* * Inode location information. Stored in the inode and passed to * xfs_imap_to_bp() to get a buffer and dinode for a given inode. */ @@ -49,25 +19,12 @@ struct xfs_imap { unsigned short im_boffset; /* inode offset in block in bytes */ }; -int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, - struct xfs_imap *, struct xfs_dinode **, - struct xfs_buf **, uint, uint); -int xfs_iread(struct xfs_mount *, struct xfs_trans *, - struct xfs_inode *, uint); -void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); +int xfs_imap_to_bp(struct xfs_mount *mp, struct xfs_trans *tp, + struct xfs_imap *imap, struct xfs_buf **bpp); +void xfs_dinode_calc_crc(struct xfs_mount *mp, struct xfs_dinode *dip); void xfs_inode_to_disk(struct xfs_inode *ip, struct xfs_dinode *to, xfs_lsn_t lsn); -void xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); -void xfs_log_dinode_to_disk(struct xfs_log_dinode *from, - struct xfs_dinode *to); - -bool xfs_dinode_good_version(struct xfs_mount *mp, __u8 version); - -#if defined(DEBUG) -void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); -#else -#define xfs_inobp_check(mp, bp) -#endif /* DEBUG */ +int xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino, struct xfs_dinode *dip); @@ -77,4 +34,21 @@ xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp, uint32_t cowextsize, uint16_t mode, uint16_t flags, uint64_t flags2); +static inline uint64_t xfs_inode_encode_bigtime(struct timespec64 tv) +{ + return xfs_unix_to_bigtime(tv.tv_sec) * NSEC_PER_SEC + tv.tv_nsec; +} + +struct timespec64 xfs_inode_from_disk_ts(struct xfs_dinode *dip, + const xfs_timestamp_t ts); + +static inline bool +xfs_dinode_good_version(struct xfs_mount *mp, uint8_t version) +{ + if (xfs_has_v3inodes(mp)) + return version == 3; + return version == 1 || version == 2; +} + + #endif /* __XFS_INODE_BUF_H__ */ diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index ad2b9c313fd2..6b21760184d9 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -23,112 +23,10 @@ #include "xfs_da_btree.h" #include "xfs_dir2_priv.h" #include "xfs_attr_leaf.h" +#include "xfs_types.h" +#include "xfs_errortag.h" -kmem_zone_t *xfs_ifork_zone; - -STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); -STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); -STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); - -/* - * Copy inode type and data and attr format specific information from the - * on-disk inode to the in-core inode and fork structures. For fifos, devices, - * and sockets this means set i_rdev to the proper value. For files, - * directories, and symlinks this means to bring in the in-line data or extent - * pointers as well as the attribute fork. For a fork in B-tree format, only - * the root is immediately brought in-core. The rest will be read in later when - * first referenced (see xfs_iread_extents()). - */ -int -xfs_iformat_fork( - struct xfs_inode *ip, - struct xfs_dinode *dip) -{ - struct inode *inode = VFS_I(ip); - struct xfs_attr_shortform *atp; - int size; - int error = 0; - xfs_fsize_t di_size; - - switch (inode->i_mode & S_IFMT) { - case S_IFIFO: - case S_IFCHR: - case S_IFBLK: - case S_IFSOCK: - ip->i_d.di_size = 0; - inode->i_rdev = xfs_to_linux_dev_t(xfs_dinode_get_rdev(dip)); - break; - - case S_IFREG: - case S_IFLNK: - case S_IFDIR: - switch (dip->di_format) { - case XFS_DINODE_FMT_LOCAL: - di_size = be64_to_cpu(dip->di_size); - size = (int)di_size; - error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size); - break; - case XFS_DINODE_FMT_EXTENTS: - error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK); - break; - case XFS_DINODE_FMT_BTREE: - error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK); - break; - default: - xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, - dip, sizeof(*dip), __this_address); - return -EFSCORRUPTED; - } - break; - - default: - xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, - sizeof(*dip), __this_address); - return -EFSCORRUPTED; - } - if (error) - return error; - - if (xfs_is_reflink_inode(ip)) { - ASSERT(ip->i_cowfp == NULL); - xfs_ifork_init_cow(ip); - } - - if (!XFS_DFORK_Q(dip)) - return 0; - - ASSERT(ip->i_afp == NULL); - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_NOFS); - - switch (dip->di_aformat) { - case XFS_DINODE_FMT_LOCAL: - atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); - size = be16_to_cpu(atp->hdr.totsize); - - error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); - break; - case XFS_DINODE_FMT_EXTENTS: - error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK); - break; - case XFS_DINODE_FMT_BTREE: - error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); - break; - default: - xfs_inode_verifier_error(ip, error, __func__, dip, - sizeof(*dip), __this_address); - error = -EFSCORRUPTED; - break; - } - if (error) { - kmem_cache_free(xfs_ifork_zone, ip->i_afp); - ip->i_afp = NULL; - if (ip->i_cowfp) - kmem_cache_free(xfs_ifork_zone, ip->i_cowfp); - ip->i_cowfp = NULL; - xfs_idestroy_fork(ip, XFS_DATA_FORK); - } - return error; -} +struct kmem_cache *xfs_ifork_cache; void xfs_init_local_fork( @@ -137,8 +35,8 @@ xfs_init_local_fork( const void *data, int64_t size) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); - int mem_size = size, real_size = 0; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + int mem_size = size; bool zero_terminate; /* @@ -152,8 +50,7 @@ xfs_init_local_fork( mem_size++; if (size) { - real_size = roundup(mem_size, 4); - ifp->if_u1.if_data = kmem_alloc(real_size, KM_NOFS); + ifp->if_u1.if_data = kmem_alloc(mem_size, KM_NOFS); memcpy(ifp->if_u1.if_data, data, size); if (zero_terminate) ifp->if_u1.if_data[size] = '\0'; @@ -162,8 +59,6 @@ xfs_init_local_fork( } ifp->if_bytes = size; - ifp->if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT); - ifp->if_flags |= XFS_IFINLINE; } /* @@ -171,10 +66,10 @@ xfs_init_local_fork( */ STATIC int xfs_iformat_local( - xfs_inode_t *ip, - xfs_dinode_t *dip, - int whichfork, - int size) + struct xfs_inode *ip, + struct xfs_dinode *dip, + int whichfork, + int size) { /* * If the size is unreasonable, then something @@ -183,7 +78,7 @@ xfs_iformat_local( */ if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { xfs_warn(ip->i_mount, - "corrupt inode %Lu (bad size %d for local fork, size = %d).", + "corrupt inode %llu (bad size %d for local fork, size = %zd).", (unsigned long long) ip->i_ino, size, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); xfs_inode_verifier_error(ip, -EFSCORRUPTED, @@ -207,9 +102,9 @@ xfs_iformat_extents( int whichfork) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); int state = xfs_bmap_fork_to_state(whichfork); - int nex = XFS_DFORK_NEXTENTS(dip, whichfork); + xfs_extnum_t nex = xfs_dfork_nextents(dip, whichfork); int size = nex * sizeof(xfs_bmbt_rec_t); struct xfs_iext_cursor icur; struct xfs_bmbt_rec *dp; @@ -221,8 +116,8 @@ xfs_iformat_extents( * we just bail out rather than crash in kmem_alloc() or memcpy() below. */ if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) { - xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).", - (unsigned long long) ip->i_ino, nex); + xfs_warn(ip->i_mount, "corrupt inode %llu ((a)extents = %llu).", + ip->i_ino, nex); xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iformat_extents(1)", dip, sizeof(*dip), __this_address); @@ -253,7 +148,6 @@ xfs_iformat_extents( xfs_iext_next(ifp, &icur); } } - ifp->if_flags |= XFS_IFEXTENTS; return 0; } @@ -267,8 +161,8 @@ xfs_iformat_extents( */ STATIC int xfs_iformat_btree( - xfs_inode_t *ip, - xfs_dinode_t *dip, + struct xfs_inode *ip, + struct xfs_dinode *dip, int whichfork) { struct xfs_mount *mp = ip->i_mount; @@ -279,7 +173,7 @@ xfs_iformat_btree( int size; int level; - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); size = XFS_BMAP_BROOT_SPACE(mp, dfp); nrecs = be16_to_cpu(dfp->bb_numrecs); @@ -292,14 +186,13 @@ xfs_iformat_btree( * or the number of extents is greater than the number of * blocks. */ - if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= - XFS_IFORK_MAXEXT(ip, whichfork) || + if (unlikely(ifp->if_nextents <= XFS_IFORK_MAXEXT(ip, whichfork) || nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > XFS_DFORK_SIZE(dip, mp, whichfork) || - XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks) || - level == 0 || level > XFS_BTREE_MAXLEVELS) { - xfs_warn(mp, "corrupt inode %Lu (btree).", + ifp->if_nextents > ip->i_nblocks) || + level == 0 || level > XFS_BM_MAXLEVELS(mp, whichfork)) { + xfs_warn(mp, "corrupt inode %llu (btree).", (unsigned long long) ip->i_ino); xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iformat_btree", dfp, size, @@ -316,8 +209,6 @@ xfs_iformat_btree( */ xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), ifp->if_broot, size); - ifp->if_flags &= ~XFS_IFEXTENTS; - ifp->if_flags |= XFS_IFBROOT; ifp->if_bytes = 0; ifp->if_u1.if_root = NULL; @@ -325,6 +216,124 @@ xfs_iformat_btree( return 0; } +int +xfs_iformat_data_fork( + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + struct inode *inode = VFS_I(ip); + int error; + + /* + * Initialize the extent count early, as the per-format routines may + * depend on it. + */ + ip->i_df.if_format = dip->di_format; + ip->i_df.if_nextents = xfs_dfork_data_extents(dip); + + switch (inode->i_mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + ip->i_disk_size = 0; + inode->i_rdev = xfs_to_linux_dev_t(xfs_dinode_get_rdev(dip)); + return 0; + case S_IFREG: + case S_IFLNK: + case S_IFDIR: + switch (ip->i_df.if_format) { + case XFS_DINODE_FMT_LOCAL: + error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, + be64_to_cpu(dip->di_size)); + if (!error) + error = xfs_ifork_verify_local_data(ip); + return error; + case XFS_DINODE_FMT_EXTENTS: + return xfs_iformat_extents(ip, dip, XFS_DATA_FORK); + case XFS_DINODE_FMT_BTREE: + return xfs_iformat_btree(ip, dip, XFS_DATA_FORK); + default: + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, + dip, sizeof(*dip), __this_address); + return -EFSCORRUPTED; + } + break; + default: + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, + sizeof(*dip), __this_address); + return -EFSCORRUPTED; + } +} + +static uint16_t +xfs_dfork_attr_shortform_size( + struct xfs_dinode *dip) +{ + struct xfs_attr_shortform *atp = + (struct xfs_attr_shortform *)XFS_DFORK_APTR(dip); + + return be16_to_cpu(atp->hdr.totsize); +} + +void +xfs_ifork_init_attr( + struct xfs_inode *ip, + enum xfs_dinode_fmt format, + xfs_extnum_t nextents) +{ + ip->i_af.if_format = format; + ip->i_af.if_nextents = nextents; +} + +void +xfs_ifork_zap_attr( + struct xfs_inode *ip) +{ + xfs_idestroy_fork(&ip->i_af); + memset(&ip->i_af, 0, sizeof(struct xfs_ifork)); + ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS; +} + +int +xfs_iformat_attr_fork( + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + xfs_extnum_t naextents = xfs_dfork_attr_extents(dip); + int error = 0; + + /* + * Initialize the extent count early, as the per-format routines may + * depend on it. + */ + xfs_ifork_init_attr(ip, dip->di_aformat, naextents); + + switch (ip->i_af.if_format) { + case XFS_DINODE_FMT_LOCAL: + error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, + xfs_dfork_attr_shortform_size(dip)); + if (!error) + error = xfs_ifork_verify_local_attr(ip); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); + break; + default: + xfs_inode_verifier_error(ip, error, __func__, dip, + sizeof(*dip), __this_address); + error = -EFSCORRUPTED; + break; + } + + if (error) + xfs_ifork_zap_attr(ip); + return error; +} + /* * Reallocate the space for if_broot based on the number of records * being added or deleted as indicated in rec_diff. Move the records @@ -365,7 +374,7 @@ xfs_iroot_realloc( return; } - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); if (rec_diff > 0) { /* * If there wasn't any memory allocated before, just @@ -387,15 +396,15 @@ xfs_iroot_realloc( cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); new_max = cur_max + rec_diff; new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); - ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, - KM_NOFS); + ifp->if_broot = krealloc(ifp->if_broot, new_size, + GFP_NOFS | __GFP_NOFAIL); op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, ifp->if_broot_bytes); np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, (int)new_size); ifp->if_broot_bytes = (int)new_size; ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= - XFS_IFORK_SIZE(ip, whichfork)); + xfs_inode_fork_size(ip, whichfork)); memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t)); return; } @@ -422,7 +431,6 @@ xfs_iroot_realloc( XFS_BMBT_BLOCK_LEN(ip->i_mount)); } else { new_broot = NULL; - ifp->if_flags &= ~XFS_IFBROOT; } /* @@ -450,7 +458,7 @@ xfs_iroot_realloc( ifp->if_broot_bytes = (int)new_size; if (ifp->if_broot) ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= - XFS_IFORK_SIZE(ip, whichfork)); + xfs_inode_fork_size(ip, whichfork)); return; } @@ -476,11 +484,11 @@ xfs_idata_realloc( int64_t byte_diff, int whichfork) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); int64_t new_size = ifp->if_bytes + byte_diff; ASSERT(new_size >= 0); - ASSERT(new_size <= XFS_IFORK_SIZE(ip, whichfork)); + ASSERT(new_size <= xfs_inode_fork_size(ip, whichfork)); if (byte_diff == 0) return; @@ -492,50 +500,30 @@ xfs_idata_realloc( return; } - /* - * For inline data, the underlying buffer must be a multiple of 4 bytes - * in size so that it can be logged and stay on word boundaries. - * We enforce that here. - */ - ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data, - roundup(new_size, 4), KM_NOFS); + ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, new_size, + GFP_NOFS | __GFP_NOFAIL); ifp->if_bytes = new_size; } void xfs_idestroy_fork( - xfs_inode_t *ip, - int whichfork) + struct xfs_ifork *ifp) { - struct xfs_ifork *ifp; - - ifp = XFS_IFORK_PTR(ip, whichfork); if (ifp->if_broot != NULL) { kmem_free(ifp->if_broot); ifp->if_broot = NULL; } - /* - * If the format is local, then we can't have an extents - * array so just look for an inline data array. If we're - * not local then we may or may not have an extents list, - * so check and free it up if we do. - */ - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - if (ifp->if_u1.if_data != NULL) { - kmem_free(ifp->if_u1.if_data); - ifp->if_u1.if_data = NULL; - } - } else if ((ifp->if_flags & XFS_IFEXTENTS) && ifp->if_height) { - xfs_iext_destroy(ifp); - } - - if (whichfork == XFS_ATTR_FORK) { - kmem_cache_free(xfs_ifork_zone, ip->i_afp); - ip->i_afp = NULL; - } else if (whichfork == XFS_COW_FORK) { - kmem_cache_free(xfs_ifork_zone, ip->i_cowfp); - ip->i_cowfp = NULL; + switch (ifp->if_format) { + case XFS_DINODE_FMT_LOCAL: + kmem_free(ifp->if_u1.if_data); + ifp->if_u1.if_data = NULL; + break; + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + if (ifp->if_height) + xfs_iext_destroy(ifp); + break; } } @@ -555,7 +543,7 @@ xfs_iextents_copy( int whichfork) { int state = xfs_bmap_fork_to_state(whichfork); - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_iext_cursor icur; struct xfs_bmbt_irec rec; int64_t copied = 0; @@ -590,9 +578,9 @@ xfs_iextents_copy( */ void xfs_iflush_fork( - xfs_inode_t *ip, - xfs_dinode_t *dip, - xfs_inode_log_item_t *iip, + struct xfs_inode *ip, + struct xfs_dinode *dip, + struct xfs_inode_log_item *iip, int whichfork) { char *cp; @@ -607,7 +595,7 @@ xfs_iflush_fork( if (!iip) return; - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); /* * This can happen if we gave up in iformat in an error path, * for the attribute fork. @@ -618,22 +606,20 @@ xfs_iflush_fork( } cp = XFS_DFORK_PTR(dip, whichfork); mp = ip->i_mount; - switch (XFS_IFORK_FORMAT(ip, whichfork)) { + switch (ifp->if_format) { case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & dataflag[whichfork]) && (ifp->if_bytes > 0)) { ASSERT(ifp->if_u1.if_data != NULL); - ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); + ASSERT(ifp->if_bytes <= xfs_inode_fork_size(ip, whichfork)); memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); } break; case XFS_DINODE_FMT_EXTENTS: - ASSERT((ifp->if_flags & XFS_IFEXTENTS) || - !(iip->ili_fields & extflag[whichfork])); if ((iip->ili_fields & extflag[whichfork]) && (ifp->if_bytes > 0)) { - ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); + ASSERT(ifp->if_nextents > 0); (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, whichfork); } @@ -644,7 +630,7 @@ xfs_iflush_fork( (ifp->if_broot_bytes > 0)) { ASSERT(ifp->if_broot != NULL); ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= - XFS_IFORK_SIZE(ip, whichfork)); + xfs_inode_fork_size(ip, whichfork)); xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, (xfs_bmdr_block_t *)cp, XFS_DFORK_SIZE(dip, mp, whichfork)); @@ -674,7 +660,7 @@ xfs_iext_state_to_fork( if (state & BMAP_COWFORK) return ip->i_cowfp; else if (state & BMAP_ATTRFORK) - return ip->i_afp; + return &ip->i_af; return &ip->i_df; } @@ -688,51 +674,106 @@ xfs_ifork_init_cow( if (ip->i_cowfp) return; - ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone, - KM_NOFS); - ip->i_cowfp->if_flags = XFS_IFEXTENTS; - ip->i_cformat = XFS_DINODE_FMT_EXTENTS; - ip->i_cnextents = 0; + ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_cache, + GFP_NOFS | __GFP_NOFAIL); + ip->i_cowfp->if_format = XFS_DINODE_FMT_EXTENTS; } -/* Default fork content verifiers. */ -struct xfs_ifork_ops xfs_default_ifork_ops = { - .verify_attr = xfs_attr_shortform_verify, - .verify_dir = xfs_dir2_sf_verify, - .verify_symlink = xfs_symlink_shortform_verify, -}; - /* Verify the inline contents of the data fork of an inode. */ -xfs_failaddr_t -xfs_ifork_verify_data( - struct xfs_inode *ip, - struct xfs_ifork_ops *ops) +int +xfs_ifork_verify_local_data( + struct xfs_inode *ip) { - /* Non-local data fork, we're done. */ - if (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) - return NULL; + xfs_failaddr_t fa = NULL; - /* Check the inline data fork if there is one. */ switch (VFS_I(ip)->i_mode & S_IFMT) { case S_IFDIR: - return ops->verify_dir(ip); + fa = xfs_dir2_sf_verify(ip); + break; case S_IFLNK: - return ops->verify_symlink(ip); + fa = xfs_symlink_shortform_verify(ip); + break; default: - return NULL; + break; + } + + if (fa) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork", + ip->i_df.if_u1.if_data, ip->i_df.if_bytes, fa); + return -EFSCORRUPTED; } + + return 0; } /* Verify the inline contents of the attr fork of an inode. */ -xfs_failaddr_t -xfs_ifork_verify_attr( +int +xfs_ifork_verify_local_attr( + struct xfs_inode *ip) +{ + struct xfs_ifork *ifp = &ip->i_af; + xfs_failaddr_t fa; + + if (!xfs_inode_has_attr_fork(ip)) + fa = __this_address; + else + fa = xfs_attr_shortform_verify(ip); + + if (fa) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork", + ifp->if_u1.if_data, ifp->if_bytes, fa); + return -EFSCORRUPTED; + } + + return 0; +} + +int +xfs_iext_count_may_overflow( + struct xfs_inode *ip, + int whichfork, + int nr_to_add) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + uint64_t max_exts; + uint64_t nr_exts; + + if (whichfork == XFS_COW_FORK) + return 0; + + max_exts = xfs_iext_max_nextents(xfs_inode_has_large_extent_counts(ip), + whichfork); + + if (XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS)) + max_exts = 10; + + nr_exts = ifp->if_nextents + nr_to_add; + if (nr_exts < ifp->if_nextents || nr_exts > max_exts) + return -EFBIG; + + return 0; +} + +/* + * Upgrade this inode's extent counter fields to be able to handle a potential + * increase in the extent count by nr_to_add. Normally this is the same + * quantity that caused xfs_iext_count_may_overflow() to return -EFBIG. + */ +int +xfs_iext_count_upgrade( + struct xfs_trans *tp, struct xfs_inode *ip, - struct xfs_ifork_ops *ops) + uint nr_to_add) { - /* There has to be an attr fork allocated if aformat is local. */ - if (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) - return NULL; - if (!XFS_IFORK_PTR(ip, XFS_ATTR_FORK)) - return __this_address; - return ops->verify_attr(ip); + ASSERT(nr_to_add <= XFS_MAX_EXTCNT_UPGRADE_NR); + + if (!xfs_has_large_extent_counts(ip->i_mount) || + xfs_inode_has_large_extent_counts(ip) || + XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS)) + return -EFBIG; + + ip->i_diflags2 |= XFS_DIFLAG2_NREXT64; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + return 0; } diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 500333d0101e..d3943d6ad0b9 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -21,82 +21,154 @@ struct xfs_ifork { void *if_root; /* extent tree root */ char *if_data; /* inline file data */ } if_u1; + xfs_extnum_t if_nextents; /* # of extents in this fork */ short if_broot_bytes; /* bytes allocated for root */ - unsigned char if_flags; /* per-fork flags */ + int8_t if_format; /* format of this fork */ }; /* - * Per-fork incore inode flags. + * Worst-case increase in the fork extent count when we're adding a single + * extent to a fork and there's no possibility of splitting an existing mapping. */ -#define XFS_IFINLINE 0x01 /* Inline data is read in */ -#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */ -#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */ +#define XFS_IEXT_ADD_NOSPLIT_CNT (1) /* - * Fork handling. + * Punching out an extent from the middle of an existing extent can cause the + * extent count to increase by 1. + * i.e. | Old extent | Hole | Old extent | + */ +#define XFS_IEXT_PUNCH_HOLE_CNT (1) + +/* + * Adding/removing an xattr can cause XFS_DA_NODE_MAXDEPTH extents to + * be added. One extra extent for dabtree in case a local attr is + * large enough to cause a double split. It can also cause extent + * count to increase proportional to the size of a remote xattr's + * value. + */ +#define XFS_IEXT_ATTR_MANIP_CNT(rmt_blks) \ + (XFS_DA_NODE_MAXDEPTH + max(1, rmt_blks)) + +/* + * A write to a sub-interval of an existing unwritten extent causes the original + * extent to be split into 3 extents + * i.e. | Unwritten | Real | Unwritten | + * Hence extent count can increase by 2. + */ +#define XFS_IEXT_WRITE_UNWRITTEN_CNT (2) + + +/* + * Moving an extent to data fork can cause a sub-interval of an existing extent + * to be unmapped. This will increase extent count by 1. Mapping in the new + * extent can increase the extent count by 1 again i.e. + * | Old extent | New extent | Old extent | + * Hence number of extents increases by 2. */ +#define XFS_IEXT_REFLINK_END_COW_CNT (2) -#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0) -#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3)) - -#define XFS_IFORK_PTR(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - &(ip)->i_df : \ - ((w) == XFS_ATTR_FORK ? \ - (ip)->i_afp : \ - (ip)->i_cowfp)) -#define XFS_IFORK_DSIZE(ip) \ - (XFS_IFORK_Q(ip) ? \ - XFS_IFORK_BOFF(ip) : \ - XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version)) -#define XFS_IFORK_ASIZE(ip) \ - (XFS_IFORK_Q(ip) ? \ - XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \ - XFS_IFORK_BOFF(ip) : \ - 0) -#define XFS_IFORK_SIZE(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - XFS_IFORK_DSIZE(ip) : \ - ((w) == XFS_ATTR_FORK ? \ - XFS_IFORK_ASIZE(ip) : \ - 0)) -#define XFS_IFORK_FORMAT(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - (ip)->i_d.di_format : \ - ((w) == XFS_ATTR_FORK ? \ - (ip)->i_d.di_aformat : \ - (ip)->i_cformat)) -#define XFS_IFORK_FMT_SET(ip,w,n) \ - ((w) == XFS_DATA_FORK ? \ - ((ip)->i_d.di_format = (n)) : \ - ((w) == XFS_ATTR_FORK ? \ - ((ip)->i_d.di_aformat = (n)) : \ - ((ip)->i_cformat = (n)))) -#define XFS_IFORK_NEXTENTS(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - (ip)->i_d.di_nextents : \ - ((w) == XFS_ATTR_FORK ? \ - (ip)->i_d.di_anextents : \ - (ip)->i_cnextents)) -#define XFS_IFORK_NEXT_SET(ip,w,n) \ - ((w) == XFS_DATA_FORK ? \ - ((ip)->i_d.di_nextents = (n)) : \ - ((w) == XFS_ATTR_FORK ? \ - ((ip)->i_d.di_anextents = (n)) : \ - ((ip)->i_cnextents = (n)))) +/* + * Removing an initial range of source/donor file's extent and adding a new + * extent (from donor/source file) in its place will cause extent count to + * increase by 1. + */ +#define XFS_IEXT_SWAP_RMAP_CNT (1) + +/* + * Fork handling. + */ #define XFS_IFORK_MAXEXT(ip, w) \ - (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) + (xfs_inode_fork_size(ip, w) / sizeof(xfs_bmbt_rec_t)) + +static inline bool xfs_ifork_has_extents(struct xfs_ifork *ifp) +{ + return ifp->if_format == XFS_DINODE_FMT_EXTENTS || + ifp->if_format == XFS_DINODE_FMT_BTREE; +} + +static inline xfs_extnum_t xfs_ifork_nextents(struct xfs_ifork *ifp) +{ + if (!ifp) + return 0; + return ifp->if_nextents; +} -#define xfs_ifork_has_extents(ip, w) \ - (XFS_IFORK_FORMAT((ip), (w)) == XFS_DINODE_FMT_EXTENTS || \ - XFS_IFORK_FORMAT((ip), (w)) == XFS_DINODE_FMT_BTREE) +static inline int8_t xfs_ifork_format(struct xfs_ifork *ifp) +{ + if (!ifp) + return XFS_DINODE_FMT_EXTENTS; + return ifp->if_format; +} + +static inline xfs_extnum_t xfs_iext_max_nextents(bool has_large_extent_counts, + int whichfork) +{ + switch (whichfork) { + case XFS_DATA_FORK: + case XFS_COW_FORK: + if (has_large_extent_counts) + return XFS_MAX_EXTCNT_DATA_FORK_LARGE; + return XFS_MAX_EXTCNT_DATA_FORK_SMALL; + + case XFS_ATTR_FORK: + if (has_large_extent_counts) + return XFS_MAX_EXTCNT_ATTR_FORK_LARGE; + return XFS_MAX_EXTCNT_ATTR_FORK_SMALL; + + default: + ASSERT(0); + return 0; + } +} + +static inline xfs_extnum_t +xfs_dfork_data_extents( + struct xfs_dinode *dip) +{ + if (xfs_dinode_has_large_extent_counts(dip)) + return be64_to_cpu(dip->di_big_nextents); + return be32_to_cpu(dip->di_nextents); +} + +static inline xfs_extnum_t +xfs_dfork_attr_extents( + struct xfs_dinode *dip) +{ + if (xfs_dinode_has_large_extent_counts(dip)) + return be32_to_cpu(dip->di_big_anextents); + + return be16_to_cpu(dip->di_anextents); +} + +static inline xfs_extnum_t +xfs_dfork_nextents( + struct xfs_dinode *dip, + int whichfork) +{ + switch (whichfork) { + case XFS_DATA_FORK: + return xfs_dfork_data_extents(dip); + case XFS_ATTR_FORK: + return xfs_dfork_attr_extents(dip); + default: + ASSERT(0); + break; + } + + return 0; +} + +void xfs_ifork_zap_attr(struct xfs_inode *ip); +void xfs_ifork_init_attr(struct xfs_inode *ip, enum xfs_dinode_fmt format, + xfs_extnum_t nextents); struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); -int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *); +int xfs_iformat_data_fork(struct xfs_inode *, struct xfs_dinode *); +int xfs_iformat_attr_fork(struct xfs_inode *, struct xfs_dinode *); void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, struct xfs_inode_log_item *, int); -void xfs_idestroy_fork(struct xfs_inode *, int); +void xfs_idestroy_fork(struct xfs_ifork *ifp); void xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff, int whichfork); void xfs_iroot_realloc(struct xfs_inode *, int, int); @@ -176,22 +248,21 @@ static inline bool xfs_iext_peek_prev_extent(struct xfs_ifork *ifp, xfs_iext_get_extent((ifp), (ext), (got)); \ xfs_iext_next((ifp), (ext))) -extern struct kmem_zone *xfs_ifork_zone; +extern struct kmem_cache *xfs_ifork_cache; extern void xfs_ifork_init_cow(struct xfs_inode *ip); -typedef xfs_failaddr_t (*xfs_ifork_verifier_t)(struct xfs_inode *); - -struct xfs_ifork_ops { - xfs_ifork_verifier_t verify_symlink; - xfs_ifork_verifier_t verify_dir; - xfs_ifork_verifier_t verify_attr; -}; -extern struct xfs_ifork_ops xfs_default_ifork_ops; +int xfs_ifork_verify_local_data(struct xfs_inode *ip); +int xfs_ifork_verify_local_attr(struct xfs_inode *ip); +int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork, + int nr_to_add); +int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip, + uint nr_to_add); -xfs_failaddr_t xfs_ifork_verify_data(struct xfs_inode *ip, - struct xfs_ifork_ops *ops); -xfs_failaddr_t xfs_ifork_verify_attr(struct xfs_inode *ip, - struct xfs_ifork_ops *ops); +/* returns true if the fork has extents but they are not read in yet. */ +static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp) +{ + return ifp->if_format == XFS_DINODE_FMT_BTREE && ifp->if_height == 0; +} #endif /* __XFS_INODE_FORK_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 9bac0d2e56dc..f13e0809dc63 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -34,9 +34,6 @@ typedef uint32_t xlog_tid_t; #define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */ #define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */ #define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */ -#define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \ - (log)->l_mp->m_sb.sb_logsunit) -#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit) #define XLOG_HEADER_SIZE 512 @@ -44,10 +41,10 @@ typedef uint32_t xlog_tid_t; #define XFS_MIN_LOG_FACTOR 3 #define XLOG_REC_SHIFT(log) \ - BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ + BTOBB(1 << (xfs_has_logv2(log->l_mp) ? \ XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) #define XLOG_TOTAL_REC_SHIFT(log) \ - BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ + BTOBB(XLOG_MAX_ICLOGS << (xfs_has_logv2(log->l_mp) ? \ XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) /* get lsn fields */ @@ -72,7 +69,6 @@ static inline uint xlog_get_cycle(char *ptr) /* Log Clients */ #define XFS_TRANSACTION 0x69 -#define XFS_VOLUME 0x2 #define XFS_LOG 0xaa #define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */ @@ -117,7 +113,12 @@ struct xfs_unmount_log_format { #define XLOG_REG_TYPE_CUD_FORMAT 24 #define XLOG_REG_TYPE_BUI_FORMAT 25 #define XLOG_REG_TYPE_BUD_FORMAT 26 -#define XLOG_REG_TYPE_MAX 26 +#define XLOG_REG_TYPE_ATTRI_FORMAT 27 +#define XLOG_REG_TYPE_ATTRD_FORMAT 28 +#define XLOG_REG_TYPE_ATTR_NAME 29 +#define XLOG_REG_TYPE_ATTR_VALUE 30 +#define XLOG_REG_TYPE_MAX 30 + /* * Flags to log operation header @@ -240,6 +241,8 @@ typedef struct xfs_trans_header { #define XFS_LI_CUD 0x1243 #define XFS_LI_BUI 0x1244 /* bmbt update intent */ #define XFS_LI_BUD 0x1245 +#define XFS_LI_ATTRI 0x1246 /* attr set/remove intent*/ +#define XFS_LI_ATTRD 0x1247 /* attr set/remove done */ #define XFS_LI_TYPE_DESC \ { XFS_LI_EFI, "XFS_LI_EFI" }, \ @@ -255,7 +258,9 @@ typedef struct xfs_trans_header { { XFS_LI_CUI, "XFS_LI_CUI" }, \ { XFS_LI_CUD, "XFS_LI_CUD" }, \ { XFS_LI_BUI, "XFS_LI_BUI" }, \ - { XFS_LI_BUD, "XFS_LI_BUD" } + { XFS_LI_BUD, "XFS_LI_BUD" }, \ + { XFS_LI_ATTRI, "XFS_LI_ATTRI" }, \ + { XFS_LI_ATTRD, "XFS_LI_ATTRD" } /* * Inode Log Item Format definitions. @@ -368,10 +373,13 @@ static inline int xfs_ilog_fdata(int w) * directly mirrors the xfs_dinode structure as it must contain all the same * information. */ -typedef struct xfs_ictimestamp { +typedef uint64_t xfs_log_timestamp_t; + +/* Legacy timestamp encoding format. */ +struct xfs_log_legacy_timestamp { int32_t t_sec; /* timestamp seconds */ int32_t t_nsec; /* timestamp nanoseconds */ -} xfs_ictimestamp_t; +}; /* * Define the format of the inode core that is logged. This structure must be @@ -388,16 +396,41 @@ struct xfs_log_dinode { uint32_t di_nlink; /* number of links to file */ uint16_t di_projid_lo; /* lower part of owner's project id */ uint16_t di_projid_hi; /* higher part of owner's project id */ - uint8_t di_pad[6]; /* unused, zeroed space */ - uint16_t di_flushiter; /* incremented on flush */ - xfs_ictimestamp_t di_atime; /* time last accessed */ - xfs_ictimestamp_t di_mtime; /* time last modified */ - xfs_ictimestamp_t di_ctime; /* time created/inode modified */ + union { + /* Number of data fork extents if NREXT64 is set */ + uint64_t di_big_nextents; + + /* Padding for V3 inodes without NREXT64 set. */ + uint64_t di_v3_pad; + + /* Padding and inode flush counter for V2 inodes. */ + struct { + uint8_t di_v2_pad[6]; /* V2 inode zeroed space */ + uint16_t di_flushiter; /* V2 inode incremented on flush */ + }; + }; + xfs_log_timestamp_t di_atime; /* time last accessed */ + xfs_log_timestamp_t di_mtime; /* time last modified */ + xfs_log_timestamp_t di_ctime; /* time created/inode modified */ xfs_fsize_t di_size; /* number of bytes in file */ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ - xfs_extnum_t di_nextents; /* number of extents in data fork */ - xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ + union { + /* + * For V2 inodes and V3 inodes without NREXT64 set, this + * is the number of data and attr fork extents. + */ + struct { + uint32_t di_nextents; + uint16_t di_anextents; + } __packed; + + /* Number of attr fork extents if NREXT64 is set. */ + struct { + uint32_t di_big_anextents; + uint16_t di_nrext64_pad; + } __packed; + } __packed; uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ int8_t di_aformat; /* format of attr fork's data */ uint32_t di_dmevmask; /* DMIG event mask */ @@ -411,25 +444,32 @@ struct xfs_log_dinode { /* start of the extended dinode, writable fields */ uint32_t di_crc; /* CRC of the inode */ uint64_t di_changecount; /* number of attribute changes */ - xfs_lsn_t di_lsn; /* flush sequence */ + + /* + * The LSN we write to this field during formatting is not a reflection + * of the current on-disk LSN. It should never be used for recovery + * sequencing, nor should it be recovered into the on-disk inode at all. + * See xlog_recover_inode_commit_pass2() and xfs_log_dinode_to_disk() + * for details. + */ + xfs_lsn_t di_lsn; + uint64_t di_flags2; /* more random flags */ uint32_t di_cowextsize; /* basic cow extent size for file */ uint8_t di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ - xfs_ictimestamp_t di_crtime; /* time created */ + xfs_log_timestamp_t di_crtime; /* time created */ xfs_ino_t di_ino; /* inode number */ uuid_t di_uuid; /* UUID of the filesystem */ /* structure must be padded to 64 bit alignment */ }; -static inline uint xfs_log_dinode_size(int version) -{ - if (version == 3) - return sizeof(struct xfs_log_dinode); - return offsetof(struct xfs_log_dinode, di_next_unlinked); -} +#define xfs_log_dinode_size(mp) \ + (xfs_has_v3inodes((mp)) ? \ + sizeof(struct xfs_log_dinode) : \ + offsetof(struct xfs_log_dinode, di_next_unlinked)) /* * Buffer Log Format definitions @@ -573,25 +613,49 @@ typedef struct xfs_efi_log_format { uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_t efi_extents[1]; /* array of extents to free */ + xfs_extent_t efi_extents[]; /* array of extents to free */ } xfs_efi_log_format_t; +static inline size_t +xfs_efi_log_format_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efi_log_format) + + nr * sizeof(struct xfs_extent); +} + typedef struct xfs_efi_log_format_32 { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_32_t efi_extents[1]; /* array of extents to free */ + xfs_extent_32_t efi_extents[]; /* array of extents to free */ } __attribute__((packed)) xfs_efi_log_format_32_t; +static inline size_t +xfs_efi_log_format32_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efi_log_format_32) + + nr * sizeof(struct xfs_extent_32); +} + typedef struct xfs_efi_log_format_64 { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_64_t efi_extents[1]; /* array of extents to free */ + xfs_extent_64_t efi_extents[]; /* array of extents to free */ } xfs_efi_log_format_64_t; +static inline size_t +xfs_efi_log_format64_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efi_log_format_64) + + nr * sizeof(struct xfs_extent_64); +} + /* * This is the structure used to lay out an efd log item in the * log. The efd_extents array is a variable size array whose @@ -602,25 +666,49 @@ typedef struct xfs_efd_log_format { uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_t efd_extents[1]; /* array of extents freed */ + xfs_extent_t efd_extents[]; /* array of extents freed */ } xfs_efd_log_format_t; +static inline size_t +xfs_efd_log_format_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efd_log_format) + + nr * sizeof(struct xfs_extent); +} + typedef struct xfs_efd_log_format_32 { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_32_t efd_extents[1]; /* array of extents freed */ + xfs_extent_32_t efd_extents[]; /* array of extents freed */ } __attribute__((packed)) xfs_efd_log_format_32_t; +static inline size_t +xfs_efd_log_format32_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efd_log_format_32) + + nr * sizeof(struct xfs_extent_32); +} + typedef struct xfs_efd_log_format_64 { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_64_t efd_extents[1]; /* array of extents freed */ + xfs_extent_64_t efd_extents[]; /* array of extents freed */ } xfs_efd_log_format_64_t; +static inline size_t +xfs_efd_log_format64_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efd_log_format_64) + + nr * sizeof(struct xfs_extent_64); +} + /* * RUI/RUD (reverse mapping) log format definitions */ @@ -862,4 +950,44 @@ struct xfs_icreate_log { __be32 icl_gen; /* inode generation number to use */ }; +/* + * Flags for deferred attribute operations. + * Upper bits are flags, lower byte is type code + */ +#define XFS_ATTRI_OP_FLAGS_SET 1 /* Set the attribute */ +#define XFS_ATTRI_OP_FLAGS_REMOVE 2 /* Remove the attribute */ +#define XFS_ATTRI_OP_FLAGS_REPLACE 3 /* Replace the attribute */ +#define XFS_ATTRI_OP_FLAGS_TYPE_MASK 0xFF /* Flags type mask */ + +/* + * alfi_attr_filter captures the state of xfs_da_args.attr_filter, so it should + * never have any other bits set. + */ +#define XFS_ATTRI_FILTER_MASK (XFS_ATTR_ROOT | \ + XFS_ATTR_SECURE | \ + XFS_ATTR_INCOMPLETE) + +/* + * This is the structure used to lay out an attr log item in the + * log. + */ +struct xfs_attri_log_format { + uint16_t alfi_type; /* attri log item type */ + uint16_t alfi_size; /* size of this item */ + uint32_t __pad; /* pad to 64 bit aligned */ + uint64_t alfi_id; /* attri identifier */ + uint64_t alfi_ino; /* the inode for this attr operation */ + uint32_t alfi_op_flags; /* marks the op as a set or remove */ + uint32_t alfi_name_len; /* attr name length */ + uint32_t alfi_value_len; /* attr value length */ + uint32_t alfi_attr_filter;/* attr filter flags */ +}; + +struct xfs_attrd_log_format { + uint16_t alfd_type; /* attrd log item type */ + uint16_t alfd_size; /* size of this item */ + uint32_t __pad; /* pad to 64 bit aligned */ + uint64_t alfd_alf_id; /* id of corresponding attri */ +}; + #endif /* __XFS_LOG_FORMAT_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 3bf671637a91..2420865f3007 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -7,6 +7,75 @@ #define __XFS_LOG_RECOVER_H__ /* + * Each log item type (XFS_LI_*) gets its own xlog_recover_item_ops to + * define how recovery should work for that type of log item. + */ +struct xlog_recover_item; + +/* Sorting hat for log items as they're read in. */ +enum xlog_recover_reorder { + XLOG_REORDER_BUFFER_LIST, + XLOG_REORDER_ITEM_LIST, + XLOG_REORDER_INODE_BUFFER_LIST, + XLOG_REORDER_CANCEL_LIST, +}; + +struct xlog_recover_item_ops { + uint16_t item_type; /* XFS_LI_* type code. */ + + /* + * Help sort recovered log items into the order required to replay them + * correctly. Log item types that always use XLOG_REORDER_ITEM_LIST do + * not have to supply a function here. See the comment preceding + * xlog_recover_reorder_trans for more details about what the return + * values mean. + */ + enum xlog_recover_reorder (*reorder)(struct xlog_recover_item *item); + + /* Start readahead for pass2, if provided. */ + void (*ra_pass2)(struct xlog *log, struct xlog_recover_item *item); + + /* Do whatever work we need to do for pass1, if provided. */ + int (*commit_pass1)(struct xlog *log, struct xlog_recover_item *item); + + /* + * This function should do whatever work is needed for pass2 of log + * recovery, if provided. + * + * If the recovered item is an intent item, this function should parse + * the recovered item to construct an in-core log intent item and + * insert it into the AIL. The in-core log intent item should have 1 + * refcount so that the item is freed either (a) when we commit the + * recovered log item for the intent-done item; (b) replay the work and + * log a new intent-done item; or (c) recovery fails and we have to + * abort. + * + * If the recovered item is an intent-done item, this function should + * parse the recovered item to find the id of the corresponding intent + * log item. Next, it should find the in-core log intent item in the + * AIL and release it. + */ + int (*commit_pass2)(struct xlog *log, struct list_head *buffer_list, + struct xlog_recover_item *item, xfs_lsn_t lsn); +}; + +extern const struct xlog_recover_item_ops xlog_icreate_item_ops; +extern const struct xlog_recover_item_ops xlog_buf_item_ops; +extern const struct xlog_recover_item_ops xlog_inode_item_ops; +extern const struct xlog_recover_item_ops xlog_dquot_item_ops; +extern const struct xlog_recover_item_ops xlog_quotaoff_item_ops; +extern const struct xlog_recover_item_ops xlog_bui_item_ops; +extern const struct xlog_recover_item_ops xlog_bud_item_ops; +extern const struct xlog_recover_item_ops xlog_efi_item_ops; +extern const struct xlog_recover_item_ops xlog_efd_item_ops; +extern const struct xlog_recover_item_ops xlog_rui_item_ops; +extern const struct xlog_recover_item_ops xlog_rud_item_ops; +extern const struct xlog_recover_item_ops xlog_cui_item_ops; +extern const struct xlog_recover_item_ops xlog_cud_item_ops; +extern const struct xlog_recover_item_ops xlog_attri_item_ops; +extern const struct xlog_recover_item_ops xlog_attrd_item_ops; + +/* * Macros, structures, prototypes for internal log manager use. */ @@ -22,13 +91,13 @@ /* * item headers are in ri_buf[0]. Additional buffers follow. */ -typedef struct xlog_recover_item { +struct xlog_recover_item { struct list_head ri_list; - int ri_type; int ri_cnt; /* count of regions found */ int ri_total; /* total regions */ - xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */ -} xlog_recover_item_t; + struct xfs_log_iovec *ri_buf; /* ptr to regions buffer */ + const struct xlog_recover_item_ops *ri_ops; +}; struct xlog_recover { struct hlist_node r_list; @@ -41,14 +110,25 @@ struct xlog_recover { #define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr) -/* - * This is the number of entries in the l_buf_cancel_table used during - * recovery. - */ -#define XLOG_BC_TABLE_SIZE 64 - #define XLOG_RECOVER_CRCPASS 0 #define XLOG_RECOVER_PASS1 1 #define XLOG_RECOVER_PASS2 2 +void xlog_buf_readahead(struct xlog *log, xfs_daddr_t blkno, uint len, + const struct xfs_buf_ops *ops); +bool xlog_is_buffer_cancelled(struct xlog *log, xfs_daddr_t blkno, uint len); + +int xlog_recover_iget(struct xfs_mount *mp, xfs_ino_t ino, + struct xfs_inode **ipp); +void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type, + uint64_t intent_id); +int xlog_alloc_buf_cancel_table(struct xlog *log); +void xlog_free_buf_cancel_table(struct xlog *log); + +#ifdef DEBUG +void xlog_check_buf_cancel_table(struct xlog *log); +#else +#define xlog_check_buf_cancel_table(log) do { } while (0) +#endif + #endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c index 7f55eb3f3653..9975b93a7412 100644 --- a/fs/xfs/libxfs/xfs_log_rlimit.c +++ b/fs/xfs/libxfs/xfs_log_rlimit.c @@ -14,6 +14,7 @@ #include "xfs_trans_space.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" +#include "xfs_trace.h" /* * Calculate the maximum length in bytes that would be required for a local @@ -37,6 +38,65 @@ xfs_log_calc_max_attrsetm_res( } /* + * Compute an alternate set of log reservation sizes for use exclusively with + * minimum log size calculations. + */ +static void +xfs_log_calc_trans_resv_for_minlogblocks( + struct xfs_mount *mp, + struct xfs_trans_resv *resv) +{ + unsigned int rmap_maxlevels = mp->m_rmap_maxlevels; + + /* + * In the early days of rmap+reflink, we always set the rmap maxlevels + * to 9 even if the AG was small enough that it would never grow to + * that height. Transaction reservation sizes influence the minimum + * log size calculation, which influences the size of the log that mkfs + * creates. Use the old value here to ensure that newly formatted + * small filesystems will mount on older kernels. + */ + if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp)) + mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS; + + xfs_trans_resv_calc(mp, resv); + + if (xfs_has_reflink(mp)) { + /* + * In the early days of reflink, typical log operation counts + * were greatly overestimated. + */ + resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; + resv->tr_itruncate.tr_logcount = + XFS_ITRUNCATE_LOG_COUNT_REFLINK; + resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; + } else if (xfs_has_rmapbt(mp)) { + /* + * In the early days of non-reflink rmap, the impact of rmapbt + * updates on log counts were not taken into account at all. + */ + resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; + resv->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; + resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; + } + + /* + * In the early days of reflink, we did not use deferred refcount + * update log items, so log reservations must be recomputed using the + * old calculations. + */ + resv->tr_write.tr_logres = + xfs_calc_write_reservation_minlogsize(mp); + resv->tr_itruncate.tr_logres = + xfs_calc_itruncate_reservation_minlogsize(mp); + resv->tr_qm_dqalloc.tr_logres = + xfs_calc_qm_dqalloc_reservation_minlogsize(mp); + + /* Put everything back the way it was. This goes at the end. */ + mp->m_rmap_maxlevels = rmap_maxlevels; +} + +/* * Iterate over the log space reservation table to figure out and return * the maximum one in terms of the pre-calculated values which were done * at mount time. @@ -46,19 +106,25 @@ xfs_log_get_max_trans_res( struct xfs_mount *mp, struct xfs_trans_res *max_resp) { + struct xfs_trans_resv resv = {}; struct xfs_trans_res *resp; struct xfs_trans_res *end_resp; + unsigned int i; int log_space = 0; int attr_space; attr_space = xfs_log_calc_max_attrsetm_res(mp); - resp = (struct xfs_trans_res *)M_RES(mp); - end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1); - for (; resp < end_resp; resp++) { + xfs_log_calc_trans_resv_for_minlogblocks(mp, &resv); + + resp = (struct xfs_trans_res *)&resv; + end_resp = (struct xfs_trans_res *)(&resv + 1); + for (i = 0; resp < end_resp; i++, resp++) { int tmp = resp->tr_logcount > 1 ? resp->tr_logres * resp->tr_logcount : resp->tr_logres; + + trace_xfs_trans_resv_calc_minlogsize(mp, i, resp); if (log_space < tmp) { log_space = tmp; *max_resp = *resp; /* struct copy */ @@ -66,9 +132,10 @@ xfs_log_get_max_trans_res( } if (attr_space > log_space) { - *max_resp = M_RES(mp)->tr_attrsetm; /* struct copy */ + *max_resp = resv.tr_attrsetm; /* struct copy */ max_resp->tr_logres = attr_space; } + trace_xfs_log_get_max_trans_res(mp, max_resp); } /* @@ -92,7 +159,7 @@ xfs_log_calc_minimum_size( if (tres.tr_logcount > 1) max_logres *= tres.tr_logcount; - if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) + if (xfs_has_logv2(mp) && mp->m_sb.sb_logsunit > 1) lsunit = BTOBB(mp->m_sb.sb_logsunit); /* diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index b2113b17e53c..cb035da3f990 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -16,25 +16,24 @@ * and quota-limits. This is a waste in the common case, but hey ... */ typedef uint64_t xfs_qcnt_t; -typedef uint16_t xfs_qwarncnt_t; + +typedef uint8_t xfs_dqtype_t; + +#define XFS_DQTYPE_STRINGS \ + { XFS_DQTYPE_USER, "USER" }, \ + { XFS_DQTYPE_PROJ, "PROJ" }, \ + { XFS_DQTYPE_GROUP, "GROUP" }, \ + { XFS_DQTYPE_BIGTIME, "BIGTIME" } /* * flags for q_flags field in the dquot. */ -#define XFS_DQ_USER 0x0001 /* a user quota */ -#define XFS_DQ_PROJ 0x0002 /* project quota */ -#define XFS_DQ_GROUP 0x0004 /* a group quota */ -#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */ -#define XFS_DQ_FREEING 0x0010 /* dquot is being torn down */ - -#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) +#define XFS_DQFLAG_DIRTY (1u << 0) /* dquot is dirty */ +#define XFS_DQFLAG_FREEING (1u << 1) /* dquot is being torn down */ -#define XFS_DQ_FLAGS \ - { XFS_DQ_USER, "USER" }, \ - { XFS_DQ_PROJ, "PROJ" }, \ - { XFS_DQ_GROUP, "GROUP" }, \ - { XFS_DQ_DIRTY, "DIRTY" }, \ - { XFS_DQ_FREEING, "FREEING" } +#define XFS_DQFLAG_STRINGS \ + { XFS_DQFLAG_DIRTY, "DIRTY" }, \ + { XFS_DQFLAG_FREEING, "FREEING" } /* * We have the possibility of all three quota types being active at once, and @@ -60,65 +59,58 @@ typedef uint16_t xfs_qwarncnt_t; #define XFS_DQUOT_LOGRES(mp) \ ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6) -#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) -#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) -#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT) -#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) +#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) +#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) +#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT) +#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) #define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD) #define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD) #define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD) /* - * Incore only flags for quotaoff - these bits get cleared when quota(s) - * are in the process of getting turned off. These flags are in m_qflags but - * never in sb_qflags. - */ -#define XFS_UQUOTA_ACTIVE 0x1000 /* uquotas are being turned off */ -#define XFS_GQUOTA_ACTIVE 0x2000 /* gquotas are being turned off */ -#define XFS_PQUOTA_ACTIVE 0x4000 /* pquotas are being turned off */ -#define XFS_ALL_QUOTA_ACTIVE \ - (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE) - -/* - * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees - * quota will be not be switched off as long as that inode lock is held. - */ -#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \ - XFS_GQUOTA_ACTIVE | \ - XFS_PQUOTA_ACTIVE)) -#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE) -#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE) -#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE) - -/* * Flags to tell various functions what to do. Not all of these are meaningful * to a single function. None of these XFS_QMOPT_* flags are meant to have * persistent values (ie. their values can and will change between versions) */ -#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ -#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ -#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ -#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ -#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ -#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */ +#define XFS_QMOPT_UQUOTA (1u << 0) /* user dquot requested */ +#define XFS_QMOPT_GQUOTA (1u << 1) /* group dquot requested */ +#define XFS_QMOPT_PQUOTA (1u << 2) /* project dquot requested */ +#define XFS_QMOPT_FORCE_RES (1u << 3) /* ignore quota limits */ +#define XFS_QMOPT_SBVERSION (1u << 4) /* change superblock version num */ /* * flags to xfs_trans_mod_dquot to indicate which field needs to be * modified. */ -#define XFS_QMOPT_RES_REGBLKS 0x0010000 -#define XFS_QMOPT_RES_RTBLKS 0x0020000 -#define XFS_QMOPT_BCOUNT 0x0040000 -#define XFS_QMOPT_ICOUNT 0x0080000 -#define XFS_QMOPT_RTBCOUNT 0x0100000 -#define XFS_QMOPT_DELBCOUNT 0x0200000 -#define XFS_QMOPT_DELRTBCOUNT 0x0400000 -#define XFS_QMOPT_RES_INOS 0x0800000 +#define XFS_QMOPT_RES_REGBLKS (1u << 7) +#define XFS_QMOPT_RES_RTBLKS (1u << 8) +#define XFS_QMOPT_BCOUNT (1u << 9) +#define XFS_QMOPT_ICOUNT (1u << 10) +#define XFS_QMOPT_RTBCOUNT (1u << 11) +#define XFS_QMOPT_DELBCOUNT (1u << 12) +#define XFS_QMOPT_DELRTBCOUNT (1u << 13) +#define XFS_QMOPT_RES_INOS (1u << 14) /* * flags for dqalloc. */ -#define XFS_QMOPT_INHERIT 0x1000000 +#define XFS_QMOPT_INHERIT (1u << 31) + +#define XFS_QMOPT_FLAGS \ + { XFS_QMOPT_UQUOTA, "UQUOTA" }, \ + { XFS_QMOPT_PQUOTA, "PQUOTA" }, \ + { XFS_QMOPT_FORCE_RES, "FORCE_RES" }, \ + { XFS_QMOPT_SBVERSION, "SBVERSION" }, \ + { XFS_QMOPT_GQUOTA, "GQUOTA" }, \ + { XFS_QMOPT_INHERIT, "INHERIT" }, \ + { XFS_QMOPT_RES_REGBLKS, "RES_REGBLKS" }, \ + { XFS_QMOPT_RES_RTBLKS, "RES_RTBLKS" }, \ + { XFS_QMOPT_BCOUNT, "BCOUNT" }, \ + { XFS_QMOPT_ICOUNT, "ICOUNT" }, \ + { XFS_QMOPT_RTBCOUNT, "RTBCOUNT" }, \ + { XFS_QMOPT_DELBCOUNT, "DELBCOUNT" }, \ + { XFS_QMOPT_DELRTBCOUNT, "DELRTBCOUNT" }, \ + { XFS_QMOPT_RES_INOS, "RES_INOS" } /* * flags to xfs_trans_mod_dquot. @@ -137,12 +129,18 @@ typedef uint16_t xfs_qwarncnt_t; (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA) #define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) + extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp, - struct xfs_disk_dquot *ddq, xfs_dqid_t id, uint type); + struct xfs_disk_dquot *ddq, xfs_dqid_t id); extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp, - struct xfs_dqblk *dqb, xfs_dqid_t id, uint type); + struct xfs_dqblk *dqb, xfs_dqid_t id); extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); extern void xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb, - xfs_dqid_t id, uint type); + xfs_dqid_t id, xfs_dqtype_t type); + +struct xfs_dquot; +time64_t xfs_dquot_from_disk_ts(struct xfs_disk_dquot *ddq, + __be32 dtimer); +__be32 xfs_dquot_to_disk_ts(struct xfs_dquot *ddq, time64_t timer); #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 6e1665f2cb67..3f34bafe18dd 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -22,6 +22,9 @@ #include "xfs_bit.h" #include "xfs_refcount.h" #include "xfs_rmap.h" +#include "xfs_ag.h" + +struct kmem_cache *xfs_refcount_intent_cache; /* Allowable refcount adjustment amounts. */ enum xfs_refc_adjust_op { @@ -43,13 +46,16 @@ STATIC int __xfs_refcount_cow_free(struct xfs_btree_cur *rcur, int xfs_refcount_lookup_le( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, + xfs_refcount_encode_startblock(bno, domain), XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; + cur->bc_rec.rc.rc_domain = domain; return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); } @@ -60,13 +66,16 @@ xfs_refcount_lookup_le( int xfs_refcount_lookup_ge( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, + xfs_refcount_encode_startblock(bno, domain), XFS_LOOKUP_GE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; + cur->bc_rec.rc.rc_domain = domain; return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); } @@ -77,23 +86,36 @@ xfs_refcount_lookup_ge( int xfs_refcount_lookup_eq( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, + xfs_refcount_encode_startblock(bno, domain), XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; + cur->bc_rec.rc.rc_domain = domain; return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); } /* Convert on-disk record to in-core format. */ void xfs_refcount_btrec_to_irec( - union xfs_btree_rec *rec, + const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec) { - irec->rc_startblock = be32_to_cpu(rec->refc.rc_startblock); + uint32_t start; + + start = be32_to_cpu(rec->refc.rc_startblock); + if (start & XFS_REFC_COWFLAG) { + start &= ~XFS_REFC_COWFLAG; + irec->rc_domain = XFS_REFC_DOMAIN_COW; + } else { + irec->rc_domain = XFS_REFC_DOMAIN_SHARED; + } + + irec->rc_startblock = start; irec->rc_blockcount = be32_to_cpu(rec->refc.rc_blockcount); irec->rc_refcount = be32_to_cpu(rec->refc.rc_refcount); } @@ -108,48 +130,35 @@ xfs_refcount_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_private.a.agno; + struct xfs_perag *pag = cur->bc_ag.pag; union xfs_btree_rec *rec; int error; - xfs_agblock_t realstart; error = xfs_btree_get_rec(cur, &rec, stat); if (error || !*stat) return error; xfs_refcount_btrec_to_irec(rec, irec); - - agno = cur->bc_private.a.agno; if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) goto out_bad_rec; - /* handle special COW-staging state */ - realstart = irec->rc_startblock; - if (realstart & XFS_REFC_COW_START) { - if (irec->rc_refcount != 1) - goto out_bad_rec; - realstart &= ~XFS_REFC_COW_START; - } else if (irec->rc_refcount < 2) { + if (!xfs_refcount_check_domain(irec)) goto out_bad_rec; - } /* check for valid extent range, including overflow */ - if (!xfs_verify_agbno(mp, agno, realstart)) - goto out_bad_rec; - if (realstart > realstart + irec->rc_blockcount) - goto out_bad_rec; - if (!xfs_verify_agbno(mp, agno, realstart + irec->rc_blockcount - 1)) + if (!xfs_verify_agbext(pag, irec->rc_startblock, irec->rc_blockcount)) goto out_bad_rec; if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT) goto out_bad_rec; - trace_xfs_refcount_get(cur->bc_mp, cur->bc_private.a.agno, irec); + trace_xfs_refcount_get(cur->bc_mp, pag->pag_agno, irec); return 0; out_bad_rec: xfs_warn(mp, - "Refcount BTree record corruption in AG %d detected!", agno); + "Refcount BTree record corruption in AG %d detected!", + pag->pag_agno); xfs_warn(mp, "Start block 0x%x, block count 0x%x, references 0x%x", irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount); @@ -167,16 +176,21 @@ xfs_refcount_update( struct xfs_refcount_irec *irec) { union xfs_btree_rec rec; + uint32_t start; int error; - trace_xfs_refcount_update(cur->bc_mp, cur->bc_private.a.agno, irec); - rec.refc.rc_startblock = cpu_to_be32(irec->rc_startblock); + trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); + + start = xfs_refcount_encode_startblock(irec->rc_startblock, + irec->rc_domain); + rec.refc.rc_startblock = cpu_to_be32(start); rec.refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount); rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount); + error = xfs_btree_update(cur, &rec); if (error) trace_xfs_refcount_update_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -193,10 +207,13 @@ xfs_refcount_insert( { int error; - trace_xfs_refcount_insert(cur->bc_mp, cur->bc_private.a.agno, irec); + trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); + cur->bc_rec.rc.rc_startblock = irec->rc_startblock; cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount; cur->bc_rec.rc.rc_refcount = irec->rc_refcount; + cur->bc_rec.rc.rc_domain = irec->rc_domain; + error = xfs_btree_insert(cur, i); if (error) goto out_error; @@ -208,7 +225,7 @@ xfs_refcount_insert( out_error: if (error) trace_xfs_refcount_insert_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -234,7 +251,7 @@ xfs_refcount_delete( error = -EFSCORRUPTED; goto out_error; } - trace_xfs_refcount_delete(cur->bc_mp, cur->bc_private.a.agno, &irec); + trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.pag->pag_agno, &irec); error = xfs_btree_delete(cur, i); if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) { error = -EFSCORRUPTED; @@ -242,11 +259,12 @@ xfs_refcount_delete( } if (error) goto out_error; - error = xfs_refcount_lookup_ge(cur, irec.rc_startblock, &found_rec); + error = xfs_refcount_lookup_ge(cur, irec.rc_domain, irec.rc_startblock, + &found_rec); out_error: if (error) trace_xfs_refcount_delete_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -341,6 +359,7 @@ xfs_refc_next( STATIC int xfs_refcount_split_extent( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t agbno, bool *shape_changed) { @@ -349,7 +368,7 @@ xfs_refcount_split_extent( int error; *shape_changed = false; - error = xfs_refcount_lookup_le(cur, agbno, &found_rec); + error = xfs_refcount_lookup_le(cur, domain, agbno, &found_rec); if (error) goto out_error; if (!found_rec) @@ -362,11 +381,13 @@ xfs_refcount_split_extent( error = -EFSCORRUPTED; goto out_error; } + if (rcext.rc_domain != domain) + return 0; if (rcext.rc_startblock == agbno || xfs_refc_next(&rcext) <= agbno) return 0; *shape_changed = true; - trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, &rcext, agbno); /* Establish the right extent. */ @@ -391,7 +412,7 @@ xfs_refcount_split_extent( out_error: trace_xfs_refcount_split_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -411,7 +432,10 @@ xfs_refcount_merge_center_extents( int found_rec; trace_xfs_refcount_merge_center_extents(cur->bc_mp, - cur->bc_private.a.agno, left, center, right); + cur->bc_ag.pag->pag_agno, left, center, right); + + ASSERT(left->rc_domain == center->rc_domain); + ASSERT(right->rc_domain == center->rc_domain); /* * Make sure the center and right extents are not in the btree. @@ -421,8 +445,8 @@ xfs_refcount_merge_center_extents( * call removes the center and the second one removes the right * extent. */ - error = xfs_refcount_lookup_ge(cur, center->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_ge(cur, center->rc_domain, + center->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -449,8 +473,8 @@ xfs_refcount_merge_center_extents( } /* Enlarge the left extent. */ - error = xfs_refcount_lookup_le(cur, left->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_le(cur, left->rc_domain, + left->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -468,7 +492,7 @@ xfs_refcount_merge_center_extents( out_error: trace_xfs_refcount_merge_center_extents_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -487,12 +511,14 @@ xfs_refcount_merge_left_extent( int found_rec; trace_xfs_refcount_merge_left_extent(cur->bc_mp, - cur->bc_private.a.agno, left, cleft); + cur->bc_ag.pag->pag_agno, left, cleft); + + ASSERT(left->rc_domain == cleft->rc_domain); /* If the extent at agbno (cleft) wasn't synthesized, remove it. */ if (cleft->rc_refcount > 1) { - error = xfs_refcount_lookup_le(cur, cleft->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_le(cur, cleft->rc_domain, + cleft->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -510,8 +536,8 @@ xfs_refcount_merge_left_extent( } /* Enlarge the left extent. */ - error = xfs_refcount_lookup_le(cur, left->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_le(cur, left->rc_domain, + left->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -530,7 +556,7 @@ xfs_refcount_merge_left_extent( out_error: trace_xfs_refcount_merge_left_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -548,15 +574,17 @@ xfs_refcount_merge_right_extent( int found_rec; trace_xfs_refcount_merge_right_extent(cur->bc_mp, - cur->bc_private.a.agno, cright, right); + cur->bc_ag.pag->pag_agno, cright, right); + + ASSERT(right->rc_domain == cright->rc_domain); /* * If the extent ending at agbno+aglen (cright) wasn't synthesized, * remove it. */ if (cright->rc_refcount > 1) { - error = xfs_refcount_lookup_le(cur, cright->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_le(cur, cright->rc_domain, + cright->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -574,8 +602,8 @@ xfs_refcount_merge_right_extent( } /* Enlarge the right extent. */ - error = xfs_refcount_lookup_le(cur, right->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_le(cur, right->rc_domain, + right->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -594,12 +622,10 @@ xfs_refcount_merge_right_extent( out_error: trace_xfs_refcount_merge_right_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } -#define XFS_FIND_RCEXT_SHARED 1 -#define XFS_FIND_RCEXT_COW 2 /* * Find the left extent and the one after it (cleft). This function assumes * that we've already split any extent crossing agbno. @@ -609,16 +635,16 @@ xfs_refcount_find_left_extents( struct xfs_btree_cur *cur, struct xfs_refcount_irec *left, struct xfs_refcount_irec *cleft, + enum xfs_refc_domain domain, xfs_agblock_t agbno, - xfs_extlen_t aglen, - int flags) + xfs_extlen_t aglen) { struct xfs_refcount_irec tmp; int error; int found_rec; left->rc_startblock = cleft->rc_startblock = NULLAGBLOCK; - error = xfs_refcount_lookup_le(cur, agbno - 1, &found_rec); + error = xfs_refcount_lookup_le(cur, domain, agbno - 1, &found_rec); if (error) goto out_error; if (!found_rec) @@ -632,11 +658,9 @@ xfs_refcount_find_left_extents( goto out_error; } - if (xfs_refc_next(&tmp) != agbno) - return 0; - if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2) + if (tmp.rc_domain != domain) return 0; - if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1) + if (xfs_refc_next(&tmp) != agbno) return 0; /* We have a left extent; retrieve (or invent) the next right one */ *left = tmp; @@ -653,6 +677,9 @@ xfs_refcount_find_left_extents( goto out_error; } + if (tmp.rc_domain != domain) + goto not_found; + /* if tmp starts at the end of our range, just use that */ if (tmp.rc_startblock == agbno) *cleft = tmp; @@ -669,8 +696,10 @@ xfs_refcount_find_left_extents( cleft->rc_blockcount = min(aglen, tmp.rc_startblock - agbno); cleft->rc_refcount = 1; + cleft->rc_domain = domain; } } else { +not_found: /* * No extents, so pretend that there's one covering the whole * range. @@ -678,14 +707,15 @@ xfs_refcount_find_left_extents( cleft->rc_startblock = agbno; cleft->rc_blockcount = aglen; cleft->rc_refcount = 1; + cleft->rc_domain = domain; } - trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, left, cleft, agbno); return error; out_error: trace_xfs_refcount_find_left_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -698,16 +728,16 @@ xfs_refcount_find_right_extents( struct xfs_btree_cur *cur, struct xfs_refcount_irec *right, struct xfs_refcount_irec *cright, + enum xfs_refc_domain domain, xfs_agblock_t agbno, - xfs_extlen_t aglen, - int flags) + xfs_extlen_t aglen) { struct xfs_refcount_irec tmp; int error; int found_rec; right->rc_startblock = cright->rc_startblock = NULLAGBLOCK; - error = xfs_refcount_lookup_ge(cur, agbno + aglen, &found_rec); + error = xfs_refcount_lookup_ge(cur, domain, agbno + aglen, &found_rec); if (error) goto out_error; if (!found_rec) @@ -721,11 +751,9 @@ xfs_refcount_find_right_extents( goto out_error; } - if (tmp.rc_startblock != agbno + aglen) + if (tmp.rc_domain != domain) return 0; - if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2) - return 0; - if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1) + if (tmp.rc_startblock != agbno + aglen) return 0; /* We have a right extent; retrieve (or invent) the next left one */ *right = tmp; @@ -742,6 +770,9 @@ xfs_refcount_find_right_extents( goto out_error; } + if (tmp.rc_domain != domain) + goto not_found; + /* if tmp ends at the end of our range, just use that */ if (xfs_refc_next(&tmp) == agbno + aglen) *cright = tmp; @@ -758,8 +789,10 @@ xfs_refcount_find_right_extents( cright->rc_blockcount = right->rc_startblock - cright->rc_startblock; cright->rc_refcount = 1; + cright->rc_domain = domain; } } else { +not_found: /* * No extents, so pretend that there's one covering the whole * range. @@ -767,14 +800,15 @@ xfs_refcount_find_right_extents( cright->rc_startblock = agbno; cright->rc_blockcount = aglen; cright->rc_refcount = 1; + cright->rc_domain = domain; } - trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, cright, right, agbno + aglen); return error; out_error: trace_xfs_refcount_find_right_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -792,10 +826,10 @@ xfs_refc_valid( STATIC int xfs_refcount_merge_extents( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t *agbno, xfs_extlen_t *aglen, enum xfs_refc_adjust_op adjust, - int flags, bool *shape_changed) { struct xfs_refcount_irec left = {0}, cleft = {0}; @@ -810,12 +844,12 @@ xfs_refcount_merge_extents( * just below (agbno + aglen) [cright], and just above (agbno + aglen) * [right]. */ - error = xfs_refcount_find_left_extents(cur, &left, &cleft, *agbno, - *aglen, flags); + error = xfs_refcount_find_left_extents(cur, &left, &cleft, domain, + *agbno, *aglen); if (error) return error; - error = xfs_refcount_find_right_extents(cur, &right, &cright, *agbno, - *aglen, flags); + error = xfs_refcount_find_right_extents(cur, &right, &cright, domain, + *agbno, *aglen); if (error) return error; @@ -868,7 +902,7 @@ xfs_refcount_merge_extents( aglen); } - return error; + return 0; } /* @@ -883,25 +917,30 @@ xfs_refcount_still_have_space( { unsigned long overhead; - overhead = cur->bc_private.a.priv.refc.shape_changes * - xfs_allocfree_log_count(cur->bc_mp, 1); + /* + * Worst case estimate: full splits of the free space and rmap btrees + * to handle each of the shape changes to the refcount btree. + */ + overhead = xfs_allocfree_block_count(cur->bc_mp, + cur->bc_ag.refc.shape_changes); + overhead += cur->bc_mp->m_refc_maxlevels; overhead *= cur->bc_mp->m_sb.sb_blocksize; /* * Only allow 2 refcount extent updates per transaction if the * refcount continue update "error" has been injected. */ - if (cur->bc_private.a.priv.refc.nr_ops > 2 && + if (cur->bc_ag.refc.nr_ops > 2 && XFS_TEST_ERROR(false, cur->bc_mp, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE)) return false; - if (cur->bc_private.a.priv.refc.nr_ops == 0) + if (cur->bc_ag.refc.nr_ops == 0) return true; else if (overhead > cur->bc_tp->t_log_res) return false; return cur->bc_tp->t_log_res - overhead > - cur->bc_private.a.priv.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; + cur->bc_ag.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; } /* @@ -915,8 +954,7 @@ xfs_refcount_adjust_extents( struct xfs_btree_cur *cur, xfs_agblock_t *agbno, xfs_extlen_t *aglen, - enum xfs_refc_adjust_op adj, - struct xfs_owner_info *oinfo) + enum xfs_refc_adjust_op adj) { struct xfs_refcount_irec ext, tmp; int error; @@ -927,7 +965,8 @@ xfs_refcount_adjust_extents( if (*aglen == 0) return 0; - error = xfs_refcount_lookup_ge(cur, *agbno, &found_rec); + error = xfs_refcount_lookup_ge(cur, XFS_REFC_DOMAIN_SHARED, *agbno, + &found_rec); if (error) goto out_error; @@ -935,10 +974,11 @@ xfs_refcount_adjust_extents( error = xfs_refcount_get_rec(cur, &ext, &found_rec); if (error) goto out_error; - if (!found_rec) { + if (!found_rec || ext.rc_domain != XFS_REFC_DOMAIN_SHARED) { ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks; ext.rc_blockcount = 0; ext.rc_refcount = 0; + ext.rc_domain = XFS_REFC_DOMAIN_SHARED; } /* @@ -951,13 +991,16 @@ xfs_refcount_adjust_extents( tmp.rc_blockcount = min(*aglen, ext.rc_startblock - *agbno); tmp.rc_refcount = 1 + adj; + tmp.rc_domain = XFS_REFC_DOMAIN_SHARED; + trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_private.a.agno, &tmp); + cur->bc_ag.pag->pag_agno, &tmp); /* * Either cover the hole (increment) or * delete the range (decrement). */ + cur->bc_ag.refc.nr_ops++; if (tmp.rc_refcount) { error = xfs_refcount_insert(cur, &tmp, &found_tmp); @@ -968,27 +1011,41 @@ xfs_refcount_adjust_extents( error = -EFSCORRUPTED; goto out_error; } - cur->bc_private.a.priv.refc.nr_ops++; } else { fsbno = XFS_AGB_TO_FSB(cur->bc_mp, - cur->bc_private.a.agno, + cur->bc_ag.pag->pag_agno, tmp.rc_startblock); - xfs_bmap_add_free(cur->bc_tp, fsbno, - tmp.rc_blockcount, oinfo); + xfs_free_extent_later(cur->bc_tp, fsbno, + tmp.rc_blockcount, NULL); } (*agbno) += tmp.rc_blockcount; (*aglen) -= tmp.rc_blockcount; - error = xfs_refcount_lookup_ge(cur, *agbno, + /* Stop if there's nothing left to modify */ + if (*aglen == 0 || !xfs_refcount_still_have_space(cur)) + break; + + /* Move the cursor to the start of ext. */ + error = xfs_refcount_lookup_ge(cur, + XFS_REFC_DOMAIN_SHARED, *agbno, &found_rec); if (error) goto out_error; } - /* Stop if there's nothing left to modify */ - if (*aglen == 0 || !xfs_refcount_still_have_space(cur)) - break; + /* + * A previous step trimmed agbno/aglen such that the end of the + * range would not be in the middle of the record. If this is + * no longer the case, something is seriously wrong with the + * btree. Make sure we never feed the synthesized record into + * the processing loop below. + */ + if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount == 0) || + XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount > *aglen)) { + error = -EFSCORRUPTED; + goto out_error; + } /* * Adjust the reference count and either update the tree @@ -998,12 +1055,12 @@ xfs_refcount_adjust_extents( goto skip; ext.rc_refcount += adj; trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_private.a.agno, &ext); + cur->bc_ag.pag->pag_agno, &ext); + cur->bc_ag.refc.nr_ops++; if (ext.rc_refcount > 1) { error = xfs_refcount_update(cur, &ext); if (error) goto out_error; - cur->bc_private.a.priv.refc.nr_ops++; } else if (ext.rc_refcount == 1) { error = xfs_refcount_delete(cur, &found_rec); if (error) @@ -1012,14 +1069,13 @@ xfs_refcount_adjust_extents( error = -EFSCORRUPTED; goto out_error; } - cur->bc_private.a.priv.refc.nr_ops++; goto advloop; } else { fsbno = XFS_AGB_TO_FSB(cur->bc_mp, - cur->bc_private.a.agno, + cur->bc_ag.pag->pag_agno, ext.rc_startblock); - xfs_bmap_add_free(cur->bc_tp, fsbno, ext.rc_blockcount, - oinfo); + xfs_free_extent_later(cur->bc_tp, fsbno, + ext.rc_blockcount, NULL); } skip: @@ -1035,7 +1091,7 @@ advloop: return error; out_error: trace_xfs_refcount_modify_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -1047,8 +1103,7 @@ xfs_refcount_adjust( xfs_extlen_t aglen, xfs_agblock_t *new_agbno, xfs_extlen_t *new_aglen, - enum xfs_refc_adjust_op adj, - struct xfs_owner_info *oinfo) + enum xfs_refc_adjust_op adj) { bool shape_changed; int shape_changes = 0; @@ -1057,22 +1112,24 @@ xfs_refcount_adjust( *new_agbno = agbno; *new_aglen = aglen; if (adj == XFS_REFCOUNT_ADJUST_INCREASE) - trace_xfs_refcount_increase(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_increase(cur->bc_mp, cur->bc_ag.pag->pag_agno, agbno, aglen); else - trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_ag.pag->pag_agno, agbno, aglen); /* * Ensure that no rcextents cross the boundary of the adjustment range. */ - error = xfs_refcount_split_extent(cur, agbno, &shape_changed); + error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED, + agbno, &shape_changed); if (error) goto out_error; if (shape_changed) shape_changes++; - error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed); + error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED, + agbno + aglen, &shape_changed); if (error) goto out_error; if (shape_changed) @@ -1081,25 +1138,24 @@ xfs_refcount_adjust( /* * Try to merge with the left or right extents of the range. */ - error = xfs_refcount_merge_extents(cur, new_agbno, new_aglen, adj, - XFS_FIND_RCEXT_SHARED, &shape_changed); + error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_SHARED, + new_agbno, new_aglen, adj, &shape_changed); if (error) goto out_error; if (shape_changed) shape_changes++; if (shape_changes) - cur->bc_private.a.priv.refc.shape_changes++; + cur->bc_ag.refc.shape_changes++; /* Now that we've taken care of the ends, adjust the middle extents */ - error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, - adj, oinfo); + error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, adj); if (error) goto out_error; return 0; out_error: - trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -1115,13 +1171,39 @@ xfs_refcount_finish_one_cleanup( if (rcur == NULL) return; - agbp = rcur->bc_private.a.agbp; + agbp = rcur->bc_ag.agbp; xfs_btree_del_cursor(rcur, error); if (error) xfs_trans_brelse(tp, agbp); } /* + * Set up a continuation a deferred refcount operation by updating the intent. + * Checks to make sure we're not going to run off the end of the AG. + */ +static inline int +xfs_refcount_continue_op( + struct xfs_btree_cur *cur, + xfs_fsblock_t startblock, + xfs_agblock_t new_agbno, + xfs_extlen_t new_len, + xfs_fsblock_t *new_fsbno) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_perag *pag = cur->bc_ag.pag; + + if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, new_len))) + return -EFSCORRUPTED; + + *new_fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); + + ASSERT(xfs_verify_fsbext(mp, *new_fsbno, new_len)); + ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, *new_fsbno)); + + return 0; +} + +/* * Process one of the deferred refcount operations. We pass back the * btree cursor to maintain our lock on the btree between calls. * This saves time and eliminates a buffer deadlock between the @@ -1142,62 +1224,66 @@ xfs_refcount_finish_one( struct xfs_btree_cur *rcur; struct xfs_buf *agbp = NULL; int error = 0; - xfs_agnumber_t agno; xfs_agblock_t bno; xfs_agblock_t new_agbno; unsigned long nr_ops = 0; int shape_changes = 0; + struct xfs_perag *pag; - agno = XFS_FSB_TO_AGNO(mp, startblock); - ASSERT(agno != NULLAGNUMBER); + pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock)); bno = XFS_FSB_TO_AGBNO(mp, startblock); trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, startblock), type, XFS_FSB_TO_AGBNO(mp, startblock), blockcount); - if (XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_REFCOUNT_FINISH_ONE)) - return -EIO; + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) { + error = -EIO; + goto out_drop; + } /* * If we haven't gotten a cursor or the cursor AG doesn't match * the startblock, get one now. */ rcur = *pcur; - if (rcur != NULL && rcur->bc_private.a.agno != agno) { - nr_ops = rcur->bc_private.a.priv.refc.nr_ops; - shape_changes = rcur->bc_private.a.priv.refc.shape_changes; + if (rcur != NULL && rcur->bc_ag.pag != pag) { + nr_ops = rcur->bc_ag.refc.nr_ops; + shape_changes = rcur->bc_ag.refc.shape_changes; xfs_refcount_finish_one_cleanup(tp, rcur, 0); rcur = NULL; *pcur = NULL; } if (rcur == NULL) { - error = xfs_alloc_read_agf(tp->t_mountp, tp, agno, - XFS_ALLOC_FLAG_FREEING, &agbp); + error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_FREEING, + &agbp); if (error) - return error; + goto out_drop; - rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno); - if (!rcur) { - error = -ENOMEM; - goto out_cur; - } - rcur->bc_private.a.priv.refc.nr_ops = nr_ops; - rcur->bc_private.a.priv.refc.shape_changes = shape_changes; + rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); + rcur->bc_ag.refc.nr_ops = nr_ops; + rcur->bc_ag.refc.shape_changes = shape_changes; } *pcur = rcur; switch (type) { case XFS_REFCOUNT_INCREASE: error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, - new_len, XFS_REFCOUNT_ADJUST_INCREASE, NULL); - *new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno); + new_len, XFS_REFCOUNT_ADJUST_INCREASE); + if (error) + goto out_drop; + if (*new_len > 0) + error = xfs_refcount_continue_op(rcur, startblock, + new_agbno, *new_len, new_fsb); break; case XFS_REFCOUNT_DECREASE: error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, - new_len, XFS_REFCOUNT_ADJUST_DECREASE, NULL); - *new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno); + new_len, XFS_REFCOUNT_ADJUST_DECREASE); + if (error) + goto out_drop; + if (*new_len > 0) + error = xfs_refcount_continue_op(rcur, startblock, + new_agbno, *new_len, new_fsb); break; case XFS_REFCOUNT_ALLOC_COW: *new_fsb = startblock + blockcount; @@ -1214,13 +1300,10 @@ xfs_refcount_finish_one( error = -EFSCORRUPTED; } if (!error && *new_len > 0) - trace_xfs_refcount_finish_one_leftover(mp, agno, type, + trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno, type, bno, blockcount, new_agbno, *new_len); - return error; - -out_cur: - xfs_trans_brelse(tp, agbp); - +out_drop: + xfs_perag_put(pag); return error; } @@ -1241,8 +1324,8 @@ __xfs_refcount_add( type, XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), blockcount); - ri = kmem_alloc(sizeof(struct xfs_refcount_intent), - KM_NOFS); + ri = kmem_cache_alloc(xfs_refcount_intent_cache, + GFP_NOFS | __GFP_NOFAIL); INIT_LIST_HEAD(&ri->ri_list); ri->ri_type = type; ri->ri_startblock = startblock; @@ -1259,7 +1342,7 @@ xfs_refcount_increase_extent( struct xfs_trans *tp, struct xfs_bmbt_irec *PREV) { - if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb)) + if (!xfs_has_reflink(tp->t_mountp)) return; __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock, @@ -1274,7 +1357,7 @@ xfs_refcount_decrease_extent( struct xfs_trans *tp, struct xfs_bmbt_irec *PREV) { - if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb)) + if (!xfs_has_reflink(tp->t_mountp)) return; __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock, @@ -1303,7 +1386,7 @@ xfs_refcount_find_shared( int have; int error; - trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_ag.pag->pag_agno, agbno, aglen); /* By default, skip the whole range */ @@ -1311,7 +1394,8 @@ xfs_refcount_find_shared( *flen = 0; /* Try to find a refcount extent that crosses the start */ - error = xfs_refcount_lookup_le(cur, agbno, &have); + error = xfs_refcount_lookup_le(cur, XFS_REFC_DOMAIN_SHARED, agbno, + &have); if (error) goto out_error; if (!have) { @@ -1329,6 +1413,8 @@ xfs_refcount_find_shared( error = -EFSCORRUPTED; goto out_error; } + if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED) + goto done; /* If the extent ends before the start, look at the next one */ if (tmp.rc_startblock + tmp.rc_blockcount <= agbno) { @@ -1344,6 +1430,8 @@ xfs_refcount_find_shared( error = -EFSCORRUPTED; goto out_error; } + if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED) + goto done; } /* If the extent starts after the range we want, bail out */ @@ -1375,7 +1463,8 @@ xfs_refcount_find_shared( error = -EFSCORRUPTED; goto out_error; } - if (tmp.rc_startblock >= agbno + aglen || + if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED || + tmp.rc_startblock >= agbno + aglen || tmp.rc_startblock != *fbno + *flen) break; *flen = min(*flen + tmp.rc_blockcount, agbno + aglen - *fbno); @@ -1383,12 +1472,12 @@ xfs_refcount_find_shared( done: trace_xfs_refcount_find_shared_result(cur->bc_mp, - cur->bc_private.a.agno, *fbno, *flen); + cur->bc_ag.pag->pag_agno, *fbno, *flen); out_error: if (error) trace_xfs_refcount_find_shared_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -1459,17 +1548,23 @@ xfs_refcount_adjust_cow_extents( return 0; /* Find any overlapping refcount records */ - error = xfs_refcount_lookup_ge(cur, agbno, &found_rec); + error = xfs_refcount_lookup_ge(cur, XFS_REFC_DOMAIN_COW, agbno, + &found_rec); if (error) goto out_error; error = xfs_refcount_get_rec(cur, &ext, &found_rec); if (error) goto out_error; + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec && + ext.rc_domain != XFS_REFC_DOMAIN_COW)) { + error = -EFSCORRUPTED; + goto out_error; + } if (!found_rec) { - ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks + - XFS_REFC_COW_START; + ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks; ext.rc_blockcount = 0; ext.rc_refcount = 0; + ext.rc_domain = XFS_REFC_DOMAIN_COW; } switch (adj) { @@ -1484,8 +1579,10 @@ xfs_refcount_adjust_cow_extents( tmp.rc_startblock = agbno; tmp.rc_blockcount = aglen; tmp.rc_refcount = 1; + tmp.rc_domain = XFS_REFC_DOMAIN_COW; + trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_private.a.agno, &tmp); + cur->bc_ag.pag->pag_agno, &tmp); error = xfs_refcount_insert(cur, &tmp, &found_tmp); @@ -1513,7 +1610,7 @@ xfs_refcount_adjust_cow_extents( ext.rc_refcount = 0; trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_private.a.agno, &ext); + cur->bc_ag.pag->pag_agno, &ext); error = xfs_refcount_delete(cur, &found_rec); if (error) goto out_error; @@ -1529,7 +1626,7 @@ xfs_refcount_adjust_cow_extents( return error; out_error: trace_xfs_refcount_modify_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -1546,24 +1643,24 @@ xfs_refcount_adjust_cow( bool shape_changed; int error; - agbno += XFS_REFC_COW_START; - /* * Ensure that no rcextents cross the boundary of the adjustment range. */ - error = xfs_refcount_split_extent(cur, agbno, &shape_changed); + error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_COW, + agbno, &shape_changed); if (error) goto out_error; - error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed); + error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_COW, + agbno + aglen, &shape_changed); if (error) goto out_error; /* * Try to merge with the left or right extents of the range. */ - error = xfs_refcount_merge_extents(cur, &agbno, &aglen, adj, - XFS_FIND_RCEXT_COW, &shape_changed); + error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_COW, &agbno, + &aglen, adj, &shape_changed); if (error) goto out_error; @@ -1575,7 +1672,7 @@ xfs_refcount_adjust_cow( return 0; out_error: - trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -1589,7 +1686,7 @@ __xfs_refcount_cow_alloc( xfs_agblock_t agbno, xfs_extlen_t aglen) { - trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_private.a.agno, + trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno, aglen); /* Add refcount btree reservation */ @@ -1606,7 +1703,7 @@ __xfs_refcount_cow_free( xfs_agblock_t agbno, xfs_extlen_t aglen) { - trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_private.a.agno, + trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno, aglen); /* Remove refcount btree reservation */ @@ -1623,7 +1720,7 @@ xfs_refcount_alloc_cow_extent( { struct xfs_mount *mp = tp->t_mountp; - if (!xfs_sb_version_hasreflink(&mp->m_sb)) + if (!xfs_has_reflink(mp)) return; __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len); @@ -1642,7 +1739,7 @@ xfs_refcount_free_cow_extent( { struct xfs_mount *mp = tp->t_mountp; - if (!xfs_sb_version_hasreflink(&mp->m_sb)) + if (!xfs_has_reflink(mp)) return; /* Remove rmap entry */ @@ -1660,7 +1757,7 @@ struct xfs_refcount_recovery { STATIC int xfs_refcount_recover_extent( struct xfs_btree_cur *cur, - union xfs_btree_rec *rec, + const union xfs_btree_rec *rec, void *priv) { struct list_head *debris = priv; @@ -1670,10 +1767,18 @@ xfs_refcount_recover_extent( be32_to_cpu(rec->refc.rc_refcount) != 1)) return -EFSCORRUPTED; - rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), 0); + rr = kmalloc(sizeof(struct xfs_refcount_recovery), + GFP_KERNEL | __GFP_NOFAIL); + INIT_LIST_HEAD(&rr->rr_list); xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); - list_add_tail(&rr->rr_list, debris); + if (XFS_IS_CORRUPT(cur->bc_mp, + rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) { + kfree(rr); + return -EFSCORRUPTED; + } + + list_add_tail(&rr->rr_list, debris); return 0; } @@ -1681,7 +1786,7 @@ xfs_refcount_recover_extent( int xfs_refcount_recover_cow_leftovers( struct xfs_mount *mp, - xfs_agnumber_t agno) + struct xfs_perag *pag) { struct xfs_trans *tp; struct xfs_btree_cur *cur; @@ -1691,10 +1796,11 @@ xfs_refcount_recover_cow_leftovers( union xfs_btree_irec low; union xfs_btree_irec high; xfs_fsblock_t fsb; - xfs_agblock_t agbno; int error; - if (mp->m_sb.sb_agblocks >= XFS_REFC_COW_START) + /* reflink filesystems mustn't have AGs larger than 2^31-1 blocks */ + BUILD_BUG_ON(XFS_MAX_CRC_AG_BLOCKS >= XFS_REFC_COWFLAG); + if (mp->m_sb.sb_agblocks > XFS_MAX_CRC_AG_BLOCKS) return -EOPNOTSUPP; INIT_LIST_HEAD(&debris); @@ -1713,15 +1819,15 @@ xfs_refcount_recover_cow_leftovers( if (error) return error; - error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (error) goto out_trans; - cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno); + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); /* Find all the leftover CoW staging extents. */ memset(&low, 0, sizeof(low)); memset(&high, 0, sizeof(high)); - low.rc.rc_startblock = XFS_REFC_COW_START; + low.rc.rc_domain = high.rc.rc_domain = XFS_REFC_DOMAIN_COW; high.rc.rc_startblock = -1U; error = xfs_btree_query_range(cur, &low, &high, xfs_refcount_recover_extent, &debris); @@ -1738,23 +1844,24 @@ xfs_refcount_recover_cow_leftovers( if (error) goto out_free; - trace_xfs_refcount_recover_extent(mp, agno, &rr->rr_rrec); + trace_xfs_refcount_recover_extent(mp, pag->pag_agno, + &rr->rr_rrec); /* Free the orphan record */ - agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START; - fsb = XFS_AGB_TO_FSB(mp, agno, agbno); + fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, + rr->rr_rrec.rc_startblock); xfs_refcount_free_cow_extent(tp, fsb, rr->rr_rrec.rc_blockcount); /* Free the block. */ - xfs_bmap_add_free(tp, fsb, rr->rr_rrec.rc_blockcount, NULL); + xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL); error = xfs_trans_commit(tp); if (error) goto out_free; list_del(&rr->rr_list); - kmem_free(rr); + kfree(rr); } return error; @@ -1764,7 +1871,7 @@ out_free: /* Free the leftover list */ list_for_each_entry_safe(rr, n, &debris, rr_list) { list_del(&rr->rr_list); - kmem_free(rr); + kfree(rr); } return error; } @@ -1773,6 +1880,7 @@ out_free: int xfs_refcount_has_record( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t bno, xfs_extlen_t len, bool *exists) @@ -1784,6 +1892,24 @@ xfs_refcount_has_record( low.rc.rc_startblock = bno; memset(&high, 0xFF, sizeof(high)); high.rc.rc_startblock = bno + len - 1; + low.rc.rc_domain = high.rc.rc_domain = domain; return xfs_btree_has_record(cur, &low, &high, exists); } + +int __init +xfs_refcount_intent_init_cache(void) +{ + xfs_refcount_intent_cache = kmem_cache_create("xfs_refc_intent", + sizeof(struct xfs_refcount_intent), + 0, 0, NULL); + + return xfs_refcount_intent_cache != NULL ? 0 : -ENOMEM; +} + +void +xfs_refcount_intent_destroy_cache(void) +{ + kmem_cache_destroy(xfs_refcount_intent_cache); + xfs_refcount_intent_cache = NULL; +} diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 209795539c8d..452f30556f5a 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -6,15 +6,41 @@ #ifndef __XFS_REFCOUNT_H__ #define __XFS_REFCOUNT_H__ +struct xfs_trans; +struct xfs_mount; +struct xfs_perag; +struct xfs_btree_cur; +struct xfs_bmbt_irec; +struct xfs_refcount_irec; + extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur, - xfs_agblock_t bno, int *stat); + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat); extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur, - xfs_agblock_t bno, int *stat); + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat); extern int xfs_refcount_lookup_eq(struct xfs_btree_cur *cur, - xfs_agblock_t bno, int *stat); + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat); extern int xfs_refcount_get_rec(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *stat); +static inline uint32_t +xfs_refcount_encode_startblock( + xfs_agblock_t startblock, + enum xfs_refc_domain domain) +{ + uint32_t start; + + /* + * low level btree operations need to handle the generic btree range + * query functions (which set rc_domain == -1U), so we check that the + * domain is /not/ shared. + */ + start = startblock & ~XFS_REFC_COWFLAG; + if (domain != XFS_REFC_DOMAIN_SHARED) + start |= XFS_REFC_COWFLAG; + + return start; +} + enum xfs_refcount_intent_type { XFS_REFCOUNT_INCREASE = 1, XFS_REFCOUNT_DECREASE, @@ -25,10 +51,22 @@ enum xfs_refcount_intent_type { struct xfs_refcount_intent { struct list_head ri_list; enum xfs_refcount_intent_type ri_type; - xfs_fsblock_t ri_startblock; xfs_extlen_t ri_blockcount; + xfs_fsblock_t ri_startblock; }; +/* Check that the refcount is appropriate for the record domain. */ +static inline bool +xfs_refcount_check_domain( + const struct xfs_refcount_irec *irec) +{ + if (irec->rc_domain == XFS_REFC_DOMAIN_COW && irec->rc_refcount != 1) + return false; + if (irec->rc_domain == XFS_REFC_DOMAIN_SHARED && irec->rc_refcount < 2) + return false; + return true; +} + void xfs_refcount_increase_extent(struct xfs_trans *tp, struct xfs_bmbt_irec *irec); void xfs_refcount_decrease_extent(struct xfs_trans *tp, @@ -50,7 +88,7 @@ void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb, void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb, xfs_extlen_t len); extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, - xfs_agnumber_t agno); + struct xfs_perag *pag); /* * While we're adjusting the refcounts records of an extent, we have @@ -60,20 +98,29 @@ extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, * log (plus any key updates) so we'll conservatively assume 32 bytes * per record. We must also leave space for btree splits on both ends * of the range and space for the CUD and a new CUI. + * + * Each EFI that we attach to the transaction is assumed to consume ~32 bytes. + * This is a low estimate for an EFI tracking a single extent (16 bytes for the + * EFI header, 16 for the extent, and 12 for the xlog op header), but the + * estimate is acceptable if there's more than one extent being freed. + * In the worst case of freeing every other block during a refcount decrease + * operation, we amortize the space used for one EFI log item across 16 + * extents. */ #define XFS_REFCOUNT_ITEM_OVERHEAD 32 -static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res) -{ - return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD; -} - extern int xfs_refcount_has_record(struct xfs_btree_cur *cur, - xfs_agblock_t bno, xfs_extlen_t len, bool *exists); + enum xfs_refc_domain domain, xfs_agblock_t bno, + xfs_extlen_t len, bool *exists); union xfs_btree_rec; -extern void xfs_refcount_btrec_to_irec(union xfs_btree_rec *rec, +extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec); extern int xfs_refcount_insert(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *stat); +extern struct kmem_cache *xfs_refcount_intent_cache; + +int __init xfs_refcount_intent_init_cache(void); +void xfs_refcount_intent_destroy_cache(void); + #endif /* __XFS_REFCOUNT_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 38529dbacd55..e1f789866683 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -9,42 +9,44 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "xfs_refcount_btree.h" +#include "xfs_refcount.h" #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_bit.h" #include "xfs_rmap.h" +#include "xfs_ag.h" + +static struct kmem_cache *xfs_refcountbt_cur_cache; static struct xfs_btree_cur * xfs_refcountbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agbp, cur->bc_private.a.agno); + cur->bc_ag.agbp, cur->bc_ag.pag); } STATIC void xfs_refcountbt_set_root( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr, - int inc) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int inc) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); - xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); - struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; + struct xfs_perag *pag = agbp->b_pag; ASSERT(ptr->s != 0); agf->agf_refcount_root = ptr->s; be32_add_cpu(&agf->agf_refcount_level, inc); pag->pagf_refcount_level += inc; - xfs_perag_put(pag); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_ROOT | XFS_AGF_REFCOUNT_LEVEL); @@ -52,13 +54,13 @@ xfs_refcountbt_set_root( STATIC int xfs_refcountbt_alloc_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *start, - union xfs_btree_ptr *new, - int *stat) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; struct xfs_alloc_arg args; /* block allocation args */ int error; /* error return value */ @@ -66,7 +68,7 @@ xfs_refcountbt_alloc_block( args.tp = cur->bc_tp; args.mp = cur->bc_mp; args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, + args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, xfs_refc_block(args.mp)); args.oinfo = XFS_RMAP_OINFO_REFC; args.minlen = args.maxlen = args.prod = 1; @@ -75,13 +77,13 @@ xfs_refcountbt_alloc_block( error = xfs_alloc_vextent(&args); if (error) goto out_error; - trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_ag.pag->pag_agno, args.agbno, 1); if (args.fsbno == NULLFSBLOCK) { *stat = 0; return 0; } - ASSERT(args.agno == cur->bc_private.a.agno); + ASSERT(args.agno == cur->bc_ag.pag->pag_agno); ASSERT(args.len == 1); new->s = cpu_to_be32(args.agbno); @@ -101,12 +103,12 @@ xfs_refcountbt_free_block( struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); - xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); int error; - trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.pag->pag_agno, XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1); be32_add_cpu(&agf->agf_refcount_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); @@ -136,18 +138,18 @@ xfs_refcountbt_get_maxrecs( STATIC void xfs_refcountbt_init_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { key->refc.rc_startblock = rec->refc.rc_startblock; } STATIC void xfs_refcountbt_init_high_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { - __u32 x; + __u32 x; x = be32_to_cpu(rec->refc.rc_startblock); x += be32_to_cpu(rec->refc.rc_blockcount) - 1; @@ -159,7 +161,12 @@ xfs_refcountbt_init_rec_from_cur( struct xfs_btree_cur *cur, union xfs_btree_rec *rec) { - rec->refc.rc_startblock = cpu_to_be32(cur->bc_rec.rc.rc_startblock); + const struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + uint32_t start; + + start = xfs_refcount_encode_startblock(irec->rc_startblock, + irec->rc_domain); + rec->refc.rc_startblock = cpu_to_be32(start); rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount); rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount); } @@ -169,29 +176,32 @@ xfs_refcountbt_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_refcount_root; } STATIC int64_t xfs_refcountbt_key_diff( - struct xfs_btree_cur *cur, - union xfs_btree_key *key) + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) { - struct xfs_refcount_irec *rec = &cur->bc_rec.rc; - struct xfs_refcount_key *kp = &key->refc; + const struct xfs_refcount_key *kp = &key->refc; + const struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + uint32_t start; - return (int64_t)be32_to_cpu(kp->rc_startblock) - rec->rc_startblock; + start = xfs_refcount_encode_startblock(irec->rc_startblock, + irec->rc_domain); + return (int64_t)be32_to_cpu(kp->rc_startblock) - start; } STATIC int64_t xfs_refcountbt_diff_two_keys( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { return (int64_t)be32_to_cpu(k1->refc.rc_startblock) - be32_to_cpu(k2->refc.rc_startblock); @@ -210,7 +220,7 @@ xfs_refcountbt_verify( if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; - if (!xfs_sb_version_hasreflink(&mp->m_sb)) + if (!xfs_has_reflink(mp)) return __this_address; fa = xfs_btree_sblock_v5hdr_verify(bp); if (fa) @@ -270,9 +280,9 @@ const struct xfs_buf_ops xfs_refcountbt_buf_ops = { STATIC int xfs_refcountbt_keys_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { return be32_to_cpu(k1->refc.rc_startblock) < be32_to_cpu(k2->refc.rc_startblock); @@ -280,9 +290,9 @@ xfs_refcountbt_keys_inorder( STATIC int xfs_refcountbt_recs_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_rec *r1, - union xfs_btree_rec *r2) + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) { return be32_to_cpu(r1->refc.rc_startblock) + be32_to_cpu(r1->refc.rc_blockcount) <= @@ -311,42 +321,102 @@ static const struct xfs_btree_ops xfs_refcountbt_ops = { }; /* - * Allocate a new refcount btree cursor. + * Initialize a new refcount btree cursor. */ -struct xfs_btree_cur * -xfs_refcountbt_init_cursor( +static struct xfs_btree_cur * +xfs_refcountbt_init_common( struct xfs_mount *mp, struct xfs_trans *tp, - struct xfs_buf *agbp, - xfs_agnumber_t agno) + struct xfs_perag *pag) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); struct xfs_btree_cur *cur; - ASSERT(agno != NULLAGNUMBER); - ASSERT(agno < mp->m_sb.sb_agcount); - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); + ASSERT(pag->pag_agno < mp->m_sb.sb_agcount); - cur->bc_tp = tp; - cur->bc_mp = mp; - cur->bc_btnum = XFS_BTNUM_REFC; - cur->bc_blocklog = mp->m_sb.sb_blocklog; - cur->bc_ops = &xfs_refcountbt_ops; + cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_REFC, + mp->m_refc_maxlevels, xfs_refcountbt_cur_cache); cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2); - cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); - - cur->bc_private.a.agbp = agbp; - cur->bc_private.a.agno = agno; cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_private.a.priv.refc.nr_ops = 0; - cur->bc_private.a.priv.refc.shape_changes = 0; + /* take a reference for the cursor */ + atomic_inc(&pag->pag_ref); + cur->bc_ag.pag = pag; + + cur->bc_ag.refc.nr_ops = 0; + cur->bc_ag.refc.shape_changes = 0; + cur->bc_ops = &xfs_refcountbt_ops; + return cur; +} + +/* Create a btree cursor. */ +struct xfs_btree_cur * +xfs_refcountbt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + struct xfs_perag *pag) +{ + struct xfs_agf *agf = agbp->b_addr; + struct xfs_btree_cur *cur; + + cur = xfs_refcountbt_init_common(mp, tp, pag); + cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); + cur->bc_ag.agbp = agbp; + return cur; +} + +/* Create a btree cursor with a fake root for staging. */ +struct xfs_btree_cur * +xfs_refcountbt_stage_cursor( + struct xfs_mount *mp, + struct xbtree_afakeroot *afake, + struct xfs_perag *pag) +{ + struct xfs_btree_cur *cur; + cur = xfs_refcountbt_init_common(mp, NULL, pag); + xfs_btree_stage_afakeroot(cur, afake); return cur; } /* + * Swap in the new btree root. Once we pass this point the newly rebuilt btree + * is in place and we have to kill off all the old btree blocks. + */ +void +xfs_refcountbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + struct xfs_buf *agbp) +{ + struct xfs_agf *agf = agbp->b_addr; + struct xbtree_afakeroot *afake = cur->bc_ag.afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + agf->agf_refcount_root = cpu_to_be32(afake->af_root); + agf->agf_refcount_level = cpu_to_be32(afake->af_levels); + agf->agf_refcount_blocks = cpu_to_be32(afake->af_blocks); + xfs_alloc_log_agf(tp, agbp, XFS_AGF_REFCOUNT_BLOCKS | + XFS_AGF_REFCOUNT_ROOT | + XFS_AGF_REFCOUNT_LEVEL); + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_refcountbt_ops); +} + +/* Calculate number of records in a refcount btree block. */ +static inline unsigned int +xfs_refcountbt_block_maxrecs( + unsigned int blocklen, + bool leaf) +{ + if (leaf) + return blocklen / sizeof(struct xfs_refcount_rec); + return blocklen / (sizeof(struct xfs_refcount_key) + + sizeof(xfs_refcount_ptr_t)); +} + +/* * Calculate the number of records in a refcount btree block. */ int @@ -355,11 +425,22 @@ xfs_refcountbt_maxrecs( bool leaf) { blocklen -= XFS_REFCOUNT_BLOCK_LEN; + return xfs_refcountbt_block_maxrecs(blocklen, leaf); +} - if (leaf) - return blocklen / sizeof(struct xfs_refcount_rec); - return blocklen / (sizeof(struct xfs_refcount_key) + - sizeof(xfs_refcount_ptr_t)); +/* Compute the max possible height of the maximally sized refcount btree. */ +unsigned int +xfs_refcountbt_maxlevels_ondisk(void) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN; + + minrecs[0] = xfs_refcountbt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = xfs_refcountbt_block_maxrecs(blocklen, false) / 2; + + return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_CRC_AG_BLOCKS); } /* Compute the maximum height of a refcount btree. */ @@ -367,8 +448,14 @@ void xfs_refcountbt_compute_maxlevels( struct xfs_mount *mp) { + if (!xfs_has_reflink(mp)) { + mp->m_refc_maxlevels = 0; + return; + } + mp->m_refc_maxlevels = xfs_btree_compute_maxlevels( mp->m_refc_mnr, mp->m_sb.sb_agblocks); + ASSERT(mp->m_refc_maxlevels <= xfs_refcountbt_maxlevels_ondisk()); } /* Calculate the refcount btree size for some records. */ @@ -402,7 +489,7 @@ int xfs_refcountbt_calc_reserves( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used) { @@ -412,15 +499,14 @@ xfs_refcountbt_calc_reserves( xfs_extlen_t tree_len; int error; - if (!xfs_sb_version_hasreflink(&mp->m_sb)) + if (!xfs_has_reflink(mp)) return 0; - - error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (error) return error; - agf = XFS_BUF_TO_AGF(agbp); + agf = agbp->b_addr; agblocks = be32_to_cpu(agf->agf_length); tree_len = be32_to_cpu(agf->agf_refcount_blocks); xfs_trans_brelse(tp, agbp); @@ -430,8 +516,7 @@ xfs_refcountbt_calc_reserves( * never be available for the kinds of things that would require btree * expansion. We therefore can pretend the space isn't there. */ - if (mp->m_sb.sb_logstart && - XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno) + if (xfs_ag_contains_log(mp, pag->pag_agno)) agblocks -= mp->m_sb.sb_logblocks; *ask += xfs_refcountbt_max_size(mp, agblocks); @@ -439,3 +524,22 @@ xfs_refcountbt_calc_reserves( return error; } + +int __init +xfs_refcountbt_init_cur_cache(void) +{ + xfs_refcountbt_cur_cache = kmem_cache_create("xfs_refcbt_cur", + xfs_btree_cur_sizeof(xfs_refcountbt_maxlevels_ondisk()), + 0, 0, NULL); + + if (!xfs_refcountbt_cur_cache) + return -ENOMEM; + return 0; +} + +void +xfs_refcountbt_destroy_cur_cache(void) +{ + kmem_cache_destroy(xfs_refcountbt_cur_cache); + xfs_refcountbt_cur_cache = NULL; +} diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h index ba416f71c824..d66b37259bed 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.h +++ b/fs/xfs/libxfs/xfs_refcount_btree.h @@ -13,6 +13,8 @@ struct xfs_buf; struct xfs_btree_cur; struct xfs_mount; +struct xfs_perag; +struct xbtree_afakeroot; /* * Btree block header size @@ -45,7 +47,9 @@ struct xfs_mount; extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno); + struct xfs_perag *pag); +struct xfs_btree_cur *xfs_refcountbt_stage_cursor(struct xfs_mount *mp, + struct xbtree_afakeroot *afake, struct xfs_perag *pag); extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf); extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp); @@ -55,7 +59,15 @@ extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp, xfs_agblock_t agblocks); extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp, - struct xfs_trans *tp, xfs_agnumber_t agno, xfs_extlen_t *ask, + struct xfs_trans *tp, struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used); +void xfs_refcountbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp, struct xfs_buf *agbp); + +unsigned int xfs_refcountbt_maxlevels_ondisk(void); + +int __init xfs_refcountbt_init_cur_cache(void); +void xfs_refcountbt_destroy_cur_cache(void); + #endif /* __XFS_REFCOUNT_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index ff9412f113c4..b56aca1e7c66 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -11,6 +11,7 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" +#include "xfs_sb.h" #include "xfs_defer.h" #include "xfs_btree.h" #include "xfs_trans.h" @@ -21,6 +22,9 @@ #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_inode.h" +#include "xfs_ag.h" + +struct kmem_cache *xfs_rmap_intent_cache; /* * Lookup the first record less than or equal to [bno, len, owner, offset] @@ -30,18 +34,32 @@ int xfs_rmap_lookup_le( struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, uint64_t owner, uint64_t offset, unsigned int flags, + struct xfs_rmap_irec *irec, int *stat) { + int get_stat = 0; + int error; + cur->bc_rec.r.rm_startblock = bno; - cur->bc_rec.r.rm_blockcount = len; + cur->bc_rec.r.rm_blockcount = 0; cur->bc_rec.r.rm_owner = owner; cur->bc_rec.r.rm_offset = offset; cur->bc_rec.r.rm_flags = flags; - return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); + + error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); + if (error || !(*stat) || !irec) + return error; + + error = xfs_rmap_get_rec(cur, irec, &get_stat); + if (error) + return error; + if (!get_stat) + return -EFSCORRUPTED; + + return 0; } /* @@ -79,7 +97,7 @@ xfs_rmap_update( union xfs_btree_rec rec; int error; - trace_xfs_rmap_update(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_rmap_update(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); @@ -91,7 +109,7 @@ xfs_rmap_update( error = xfs_btree_update(cur, &rec); if (error) trace_xfs_rmap_update_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -107,7 +125,7 @@ xfs_rmap_insert( int i; int error; - trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_private.a.agno, agbno, + trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno, len, owner, offset, flags); error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); @@ -133,7 +151,7 @@ xfs_rmap_insert( done: if (error) trace_xfs_rmap_insert_error(rcur->bc_mp, - rcur->bc_private.a.agno, error, _RET_IP_); + rcur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -149,7 +167,7 @@ xfs_rmap_delete( int i; int error; - trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_private.a.agno, agbno, + trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno, len, owner, offset, flags); error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); @@ -170,15 +188,15 @@ xfs_rmap_delete( done: if (error) trace_xfs_rmap_delete_error(rcur->bc_mp, - rcur->bc_private.a.agno, error, _RET_IP_); + rcur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } /* Convert an internal btree record to an rmap record. */ int xfs_rmap_btrec_to_irec( - union xfs_btree_rec *rec, - struct xfs_rmap_irec *irec) + const union xfs_btree_rec *rec, + struct xfs_rmap_irec *irec) { irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock); irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount); @@ -197,7 +215,7 @@ xfs_rmap_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_private.a.agno; + struct xfs_perag *pag = cur->bc_ag.pag; union xfs_btree_rec *rec; int error; @@ -217,13 +235,8 @@ xfs_rmap_get_rec( goto out_bad_rec; } else { /* check for valid extent range, including overflow */ - if (!xfs_verify_agbno(mp, agno, irec->rm_startblock)) - goto out_bad_rec; - if (irec->rm_startblock > - irec->rm_startblock + irec->rm_blockcount) - goto out_bad_rec; - if (!xfs_verify_agbno(mp, agno, - irec->rm_startblock + irec->rm_blockcount - 1)) + if (!xfs_verify_agbext(pag, irec->rm_startblock, + irec->rm_blockcount)) goto out_bad_rec; } @@ -236,7 +249,7 @@ xfs_rmap_get_rec( out_bad_rec: xfs_warn(mp, "Reverse Mapping BTree record corruption in AG %d detected!", - agno); + pag->pag_agno); xfs_warn(mp, "Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x", irec->rm_owner, irec->rm_flags, irec->rm_startblock, @@ -247,20 +260,19 @@ out_bad_rec: struct xfs_find_left_neighbor_info { struct xfs_rmap_irec high; struct xfs_rmap_irec *irec; - int *stat; }; /* For each rmap given, figure out if it matches the key we want. */ STATIC int xfs_rmap_find_left_neighbor_helper( - struct xfs_btree_cur *cur, - struct xfs_rmap_irec *rec, - void *priv) + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) { struct xfs_find_left_neighbor_info *info = priv; trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp, - cur->bc_private.a.agno, rec->rm_startblock, + cur->bc_ag.pag->pag_agno, rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, rec->rm_offset, rec->rm_flags); @@ -272,7 +284,6 @@ xfs_rmap_find_left_neighbor_helper( return 0; *info->irec = *rec; - *info->stat = 1; return -ECANCELED; } @@ -281,7 +292,7 @@ xfs_rmap_find_left_neighbor_helper( * return a match with the same owner and adjacent physical and logical * block ranges. */ -int +STATIC int xfs_rmap_find_left_neighbor( struct xfs_btree_cur *cur, xfs_agblock_t bno, @@ -292,6 +303,7 @@ xfs_rmap_find_left_neighbor( int *stat) { struct xfs_find_left_neighbor_info info; + int found = 0; int error; *stat = 0; @@ -309,34 +321,57 @@ xfs_rmap_find_left_neighbor( info.high.rm_flags = flags; info.high.rm_blockcount = 0; info.irec = irec; - info.stat = stat; trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp, - cur->bc_private.a.agno, bno, 0, owner, offset, flags); + cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags); - error = xfs_rmap_query_range(cur, &info.high, &info.high, - xfs_rmap_find_left_neighbor_helper, &info); - if (error == -ECANCELED) - error = 0; - if (*stat) - trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, irec->rm_startblock, - irec->rm_blockcount, irec->rm_owner, - irec->rm_offset, irec->rm_flags); - return error; + /* + * Historically, we always used the range query to walk every reverse + * mapping that could possibly overlap the key that the caller asked + * for, and filter out the ones that don't. That is very slow when + * there are a lot of records. + * + * However, there are two scenarios where the classic btree search can + * produce correct results -- if the index contains a record that is an + * exact match for the lookup key; and if there are no other records + * between the record we want and the key we supplied. + * + * As an optimization, try a non-overlapped lookup first. This makes + * extent conversion and remap operations run a bit faster if the + * physical extents aren't being shared. If we don't find what we + * want, we fall back to the overlapped query. + */ + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec, + &found); + if (error) + return error; + if (found) + error = xfs_rmap_find_left_neighbor_helper(cur, irec, &info); + if (!error) + error = xfs_rmap_query_range(cur, &info.high, &info.high, + xfs_rmap_find_left_neighbor_helper, &info); + if (error != -ECANCELED) + return error; + + *stat = 1; + trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, + cur->bc_ag.pag->pag_agno, irec->rm_startblock, + irec->rm_blockcount, irec->rm_owner, irec->rm_offset, + irec->rm_flags); + return 0; } /* For each rmap given, figure out if it matches the key we want. */ STATIC int xfs_rmap_lookup_le_range_helper( - struct xfs_btree_cur *cur, - struct xfs_rmap_irec *rec, - void *priv) + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) { struct xfs_find_left_neighbor_info *info = priv; trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp, - cur->bc_private.a.agno, rec->rm_startblock, + cur->bc_ag.pag->pag_agno, rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, rec->rm_offset, rec->rm_flags); @@ -349,7 +384,6 @@ xfs_rmap_lookup_le_range_helper( return 0; *info->irec = *rec; - *info->stat = 1; return -ECANCELED; } @@ -370,6 +404,7 @@ xfs_rmap_lookup_le_range( int *stat) { struct xfs_find_left_neighbor_info info; + int found = 0; int error; info.high.rm_startblock = bno; @@ -382,20 +417,44 @@ xfs_rmap_lookup_le_range( info.high.rm_blockcount = 0; *stat = 0; info.irec = irec; - info.stat = stat; - - trace_xfs_rmap_lookup_le_range(cur->bc_mp, - cur->bc_private.a.agno, bno, 0, owner, offset, flags); - error = xfs_rmap_query_range(cur, &info.high, &info.high, - xfs_rmap_lookup_le_range_helper, &info); - if (error == -ECANCELED) - error = 0; - if (*stat) - trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_private.a.agno, irec->rm_startblock, - irec->rm_blockcount, irec->rm_owner, - irec->rm_offset, irec->rm_flags); - return error; + + trace_xfs_rmap_lookup_le_range(cur->bc_mp, cur->bc_ag.pag->pag_agno, + bno, 0, owner, offset, flags); + + /* + * Historically, we always used the range query to walk every reverse + * mapping that could possibly overlap the key that the caller asked + * for, and filter out the ones that don't. That is very slow when + * there are a lot of records. + * + * However, there are two scenarios where the classic btree search can + * produce correct results -- if the index contains a record that is an + * exact match for the lookup key; and if there are no other records + * between the record we want and the key we supplied. + * + * As an optimization, try a non-overlapped lookup first. This makes + * scrub run much faster on most filesystems because bmbt records are + * usually an exact match for rmap records. If we don't find what we + * want, we fall back to the overlapped query. + */ + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec, + &found); + if (error) + return error; + if (found) + error = xfs_rmap_lookup_le_range_helper(cur, irec, &info); + if (!error) + error = xfs_rmap_query_range(cur, &info.high, &info.high, + xfs_rmap_lookup_le_range_helper, &info); + if (error != -ECANCELED) + return error; + + *stat = 1; + trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, + cur->bc_ag.pag->pag_agno, irec->rm_startblock, + irec->rm_blockcount, irec->rm_owner, irec->rm_offset, + irec->rm_flags); + return 0; } /* @@ -498,7 +557,7 @@ xfs_rmap_unmap( (flags & XFS_RMAP_BMBT_BLOCK); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_unmap(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); /* @@ -506,7 +565,7 @@ xfs_rmap_unmap( * for the AG headers at rm_startblock == 0 created by mkfs/growfs that * will not ever be removed from the tree. */ - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, &i); + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, <rec, &i); if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { @@ -514,15 +573,8 @@ xfs_rmap_unmap( goto out_error; } - error = xfs_rmap_get_rec(cur, <rec, &i); - if (error) - goto out_error; - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto out_error; - } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_private.a.agno, ltrec.rm_startblock, + cur->bc_ag.pag->pag_agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset, ltrec.rm_flags); ltoff = ltrec.rm_offset; @@ -588,7 +640,7 @@ xfs_rmap_unmap( if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) { /* exact match, simply remove the record from rmap tree */ - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset, ltrec.rm_flags); @@ -666,7 +718,7 @@ xfs_rmap_unmap( else cur->bc_rec.r.rm_offset = offset + len; cur->bc_rec.r.rm_flags = flags; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, + trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, cur->bc_rec.r.rm_startblock, cur->bc_rec.r.rm_blockcount, cur->bc_rec.r.rm_owner, @@ -678,11 +730,11 @@ xfs_rmap_unmap( } out_done: - trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_unmap_done(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); out_error: if (error) - trace_xfs_rmap_unmap_error(mp, cur->bc_private.a.agno, + trace_xfs_rmap_unmap_error(mp, cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -694,7 +746,7 @@ int xfs_rmap_free( struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo) @@ -703,10 +755,10 @@ xfs_rmap_free( struct xfs_btree_cur *cur; int error; - if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (!xfs_has_rmapbt(mp)) return 0; - cur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno); + cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); error = xfs_rmap_unmap(cur, bno, len, false, oinfo); @@ -773,7 +825,7 @@ xfs_rmap_map( (flags & XFS_RMAP_BMBT_BLOCK); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_map(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); ASSERT(!xfs_rmap_should_skip_owner_update(oinfo)); @@ -782,20 +834,13 @@ xfs_rmap_map( * record for our insertion point. This will also give us the record for * start block contiguity tests. */ - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, <rec, &have_lt); if (error) goto out_error; if (have_lt) { - error = xfs_rmap_get_rec(cur, <rec, &have_lt); - if (error) - goto out_error; - if (XFS_IS_CORRUPT(mp, have_lt != 1)) { - error = -EFSCORRUPTED; - goto out_error; - } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_private.a.agno, ltrec.rm_startblock, + cur->bc_ag.pag->pag_agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset, ltrec.rm_flags); @@ -831,7 +876,7 @@ xfs_rmap_map( goto out_error; } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, gtrec.rm_startblock, + cur->bc_ag.pag->pag_agno, gtrec.rm_startblock, gtrec.rm_blockcount, gtrec.rm_owner, gtrec.rm_offset, gtrec.rm_flags); if (!xfs_rmap_is_mergeable(>rec, owner, flags)) @@ -870,7 +915,7 @@ xfs_rmap_map( * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr| */ ltrec.rm_blockcount += gtrec.rm_blockcount; - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, gtrec.rm_startblock, gtrec.rm_blockcount, gtrec.rm_owner, @@ -921,7 +966,7 @@ xfs_rmap_map( cur->bc_rec.r.rm_owner = owner; cur->bc_rec.r.rm_offset = offset; cur->bc_rec.r.rm_flags = flags; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len, owner, offset, flags); error = xfs_btree_insert(cur, &i); if (error) @@ -932,11 +977,11 @@ xfs_rmap_map( } } - trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_map_done(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); out_error: if (error) - trace_xfs_rmap_map_error(mp, cur->bc_private.a.agno, + trace_xfs_rmap_map_error(mp, cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -948,7 +993,7 @@ int xfs_rmap_alloc( struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo) @@ -957,10 +1002,10 @@ xfs_rmap_alloc( struct xfs_btree_cur *cur; int error; - if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (!xfs_has_rmapbt(mp)) return 0; - cur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno); + cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); error = xfs_rmap_map(cur, bno, len, false, oinfo); xfs_btree_del_cursor(cur, error); @@ -1010,7 +1055,7 @@ xfs_rmap_convert( (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))); oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0; new_endoff = offset + len; - trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_convert(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); /* @@ -1018,7 +1063,7 @@ xfs_rmap_convert( * record for our insertion point. This will also give us the record for * start block contiguity tests. */ - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i); + error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, &PREV, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { @@ -1026,15 +1071,8 @@ xfs_rmap_convert( goto done; } - error = xfs_rmap_get_rec(cur, &PREV, &i); - if (error) - goto done; - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto done; - } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_private.a.agno, PREV.rm_startblock, + cur->bc_ag.pag->pag_agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset, PREV.rm_flags); @@ -1076,7 +1114,7 @@ xfs_rmap_convert( goto done; } trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, LEFT.rm_startblock, + cur->bc_ag.pag->pag_agno, LEFT.rm_startblock, LEFT.rm_blockcount, LEFT.rm_owner, LEFT.rm_offset, LEFT.rm_flags); if (LEFT.rm_startblock + LEFT.rm_blockcount == bno && @@ -1114,7 +1152,7 @@ xfs_rmap_convert( goto done; } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, RIGHT.rm_startblock, + cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); if (bno + len == RIGHT.rm_startblock && @@ -1132,11 +1170,11 @@ xfs_rmap_convert( RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX) state &= ~RMAP_RIGHT_CONTIG; - trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state, + trace_xfs_rmap_convert_state(mp, cur->bc_ag.pag->pag_agno, state, _RET_IP_); /* reset the cursor back to PREV */ - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i); + error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, NULL, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { @@ -1162,7 +1200,7 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); @@ -1180,7 +1218,7 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset, PREV.rm_flags); @@ -1210,7 +1248,7 @@ xfs_rmap_convert( * Setting all of a previous oldext extent to newext. * The left neighbor is contiguous, the right is not. */ - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset, PREV.rm_flags); @@ -1247,7 +1285,7 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); @@ -1326,7 +1364,7 @@ xfs_rmap_convert( NEW.rm_blockcount = len; NEW.rm_flags = newext; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, + trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) @@ -1383,7 +1421,7 @@ xfs_rmap_convert( NEW.rm_blockcount = len; NEW.rm_flags = newext; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, + trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) @@ -1414,7 +1452,7 @@ xfs_rmap_convert( NEW = PREV; NEW.rm_blockcount = offset - PREV.rm_offset; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, + trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, NEW.rm_startblock, NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset, NEW.rm_flags); @@ -1441,7 +1479,7 @@ xfs_rmap_convert( /* new middle extent - newext */ cur->bc_rec.r.rm_flags &= ~XFS_RMAP_UNWRITTEN; cur->bc_rec.r.rm_flags |= newext; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) @@ -1465,12 +1503,12 @@ xfs_rmap_convert( ASSERT(0); } - trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_convert_done(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); done: if (error) trace_xfs_rmap_convert_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -1506,7 +1544,7 @@ xfs_rmap_convert_shared( (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))); oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0; new_endoff = offset + len; - trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_convert(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); /* @@ -1514,7 +1552,7 @@ xfs_rmap_convert_shared( * record for our insertion point. This will also give us the record for * start block contiguity tests. */ - error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags, + error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, oldext, &PREV, &i); if (error) goto done; @@ -1573,7 +1611,7 @@ xfs_rmap_convert_shared( goto done; } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, RIGHT.rm_startblock, + cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); if (xfs_rmap_is_mergeable(&RIGHT, owner, newext)) @@ -1589,7 +1627,7 @@ xfs_rmap_convert_shared( RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX) state &= ~RMAP_RIGHT_CONTIG; - trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state, + trace_xfs_rmap_convert_state(mp, cur->bc_ag.pag->pag_agno, state, _RET_IP_); /* * Switch out based on the FILLING and CONTIG state bits. @@ -1880,12 +1918,12 @@ xfs_rmap_convert_shared( ASSERT(0); } - trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_convert_done(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); done: if (error) trace_xfs_rmap_convert_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -1923,7 +1961,7 @@ xfs_rmap_unmap_shared( xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_unmap(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); /* @@ -2072,12 +2110,12 @@ xfs_rmap_unmap_shared( goto out_error; } - trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_unmap_done(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); out_error: if (error) trace_xfs_rmap_unmap_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -2112,7 +2150,7 @@ xfs_rmap_map_shared( xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_map(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); /* Is there a left record that abuts our range? */ @@ -2138,7 +2176,7 @@ xfs_rmap_map_shared( goto out_error; } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, gtrec.rm_startblock, + cur->bc_ag.pag->pag_agno, gtrec.rm_startblock, gtrec.rm_blockcount, gtrec.rm_owner, gtrec.rm_offset, gtrec.rm_flags); @@ -2231,12 +2269,12 @@ xfs_rmap_map_shared( goto out_error; } - trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_map_done(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); out_error: if (error) trace_xfs_rmap_map_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -2276,9 +2314,9 @@ struct xfs_rmap_query_range_info { /* Format btree record and pass to our callback. */ STATIC int xfs_rmap_query_range_helper( - struct xfs_btree_cur *cur, - union xfs_btree_rec *rec, - void *priv) + struct xfs_btree_cur *cur, + const union xfs_btree_rec *rec, + void *priv) { struct xfs_rmap_query_range_info *query = priv; struct xfs_rmap_irec irec; @@ -2294,8 +2332,8 @@ xfs_rmap_query_range_helper( int xfs_rmap_query_range( struct xfs_btree_cur *cur, - struct xfs_rmap_irec *low_rec, - struct xfs_rmap_irec *high_rec, + const struct xfs_rmap_irec *low_rec, + const struct xfs_rmap_irec *high_rec, xfs_rmap_query_range_fn fn, void *priv) { @@ -2336,7 +2374,7 @@ xfs_rmap_finish_one_cleanup( if (rcur == NULL) return; - agbp = rcur->bc_private.a.agbp; + agbp = rcur->bc_ag.agbp; xfs_btree_del_cursor(rcur, error); if (error) xfs_trans_brelse(tp, agbp); @@ -2362,31 +2400,32 @@ xfs_rmap_finish_one( struct xfs_btree_cur **pcur) { struct xfs_mount *mp = tp->t_mountp; + struct xfs_perag *pag; struct xfs_btree_cur *rcur; struct xfs_buf *agbp = NULL; int error = 0; - xfs_agnumber_t agno; struct xfs_owner_info oinfo; xfs_agblock_t bno; bool unwritten; - agno = XFS_FSB_TO_AGNO(mp, startblock); - ASSERT(agno != NULLAGNUMBER); + pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock)); bno = XFS_FSB_TO_AGBNO(mp, startblock); - trace_xfs_rmap_deferred(mp, agno, type, bno, owner, whichfork, + trace_xfs_rmap_deferred(mp, pag->pag_agno, type, bno, owner, whichfork, startoff, blockcount, state); - if (XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_RMAP_FINISH_ONE)) - return -EIO; + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) { + error = -EIO; + goto out_drop; + } + /* * If we haven't gotten a cursor or the cursor AG doesn't match * the startblock, get one now. */ rcur = *pcur; - if (rcur != NULL && rcur->bc_private.a.agno != agno) { + if (rcur != NULL && rcur->bc_ag.pag != pag) { xfs_rmap_finish_one_cleanup(tp, rcur, 0); rcur = NULL; *pcur = NULL; @@ -2397,17 +2436,15 @@ xfs_rmap_finish_one( * rmapbt, because a shape change could cause us to * allocate blocks. */ - error = xfs_free_extent_fix_freelist(tp, agno, &agbp); + error = xfs_free_extent_fix_freelist(tp, pag, &agbp); if (error) - return error; - if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) - return -EFSCORRUPTED; - - rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno); - if (!rcur) { - error = -ENOMEM; - goto out_cur; + goto out_drop; + if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) { + error = -EFSCORRUPTED; + goto out_drop; } + + rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); } *pcur = rcur; @@ -2445,11 +2482,8 @@ xfs_rmap_finish_one( ASSERT(0); error = -EFSCORRUPTED; } - return error; - -out_cur: - xfs_trans_brelse(tp, agbp); - +out_drop: + xfs_perag_put(pag); return error; } @@ -2461,7 +2495,7 @@ xfs_rmap_update_is_needed( struct xfs_mount *mp, int whichfork) { - return xfs_sb_version_hasrmapbt(&mp->m_sb) && whichfork != XFS_COW_FORK; + return xfs_has_rmapbt(mp) && whichfork != XFS_COW_FORK; } /* @@ -2487,7 +2521,7 @@ __xfs_rmap_add( bmap->br_blockcount, bmap->br_state); - ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_NOFS); + ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL); INIT_LIST_HEAD(&ri->ri_list); ri->ri_type = type; ri->ri_owner = owner; @@ -2505,12 +2539,15 @@ xfs_rmap_map_extent( int whichfork, struct xfs_bmbt_irec *PREV) { + enum xfs_rmap_intent_type type = XFS_RMAP_MAP; + if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) return; - __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? - XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino, - whichfork, PREV); + if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip)) + type = XFS_RMAP_MAP_SHARED; + + __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV); } /* Unmap an extent out of a file. */ @@ -2521,12 +2558,15 @@ xfs_rmap_unmap_extent( int whichfork, struct xfs_bmbt_irec *PREV) { + enum xfs_rmap_intent_type type = XFS_RMAP_UNMAP; + if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) return; - __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? - XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino, - whichfork, PREV); + if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip)) + type = XFS_RMAP_UNMAP_SHARED; + + __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV); } /* @@ -2543,12 +2583,15 @@ xfs_rmap_convert_extent( int whichfork, struct xfs_bmbt_irec *PREV) { + enum xfs_rmap_intent_type type = XFS_RMAP_CONVERT; + if (!xfs_rmap_update_is_needed(mp, whichfork)) return; - __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? - XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino, - whichfork, PREV); + if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip)) + type = XFS_RMAP_CONVERT_SHARED; + + __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV); } /* Schedule the creation of an rmap for non-file data. */ @@ -2668,7 +2711,7 @@ xfs_rmap_record_exists( ASSERT(XFS_RMAP_NON_INODE_OWNER(owner) || (flags & XFS_RMAP_BMBT_BLOCK)); - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &irec, &has_record); if (error) return error; @@ -2677,14 +2720,6 @@ xfs_rmap_record_exists( return 0; } - error = xfs_rmap_get_rec(cur, &irec, &has_record); - if (error) - return error; - if (!has_record) { - *has_rmap = false; - return 0; - } - *has_rmap = (irec.rm_owner == owner && irec.rm_startblock <= bno && irec.rm_startblock + irec.rm_blockcount >= bno + len); return 0; @@ -2694,14 +2729,13 @@ struct xfs_rmap_key_state { uint64_t owner; uint64_t offset; unsigned int flags; - bool has_rmap; }; /* For each rmap given, figure out if it doesn't match the key we want. */ STATIC int xfs_rmap_has_other_keys_helper( struct xfs_btree_cur *cur, - struct xfs_rmap_irec *rec, + const struct xfs_rmap_irec *rec, void *priv) { struct xfs_rmap_key_state *rks = priv; @@ -2709,7 +2743,6 @@ xfs_rmap_has_other_keys_helper( if (rks->owner == rec->rm_owner && rks->offset == rec->rm_offset && ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags) return 0; - rks->has_rmap = true; return -ECANCELED; } @@ -2731,7 +2764,7 @@ xfs_rmap_has_other_keys( int error; xfs_owner_info_unpack(oinfo, &rks.owner, &rks.offset, &rks.flags); - rks.has_rmap = false; + *has_rmap = false; low.rm_startblock = bno; memset(&high, 0xFF, sizeof(high)); @@ -2739,11 +2772,12 @@ xfs_rmap_has_other_keys( error = xfs_rmap_query_range(cur, &low, &high, xfs_rmap_has_other_keys_helper, &rks); - if (error < 0) - return error; + if (error == -ECANCELED) { + *has_rmap = true; + return 0; + } - *has_rmap = rks.has_rmap; - return 0; + return error; } const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = { @@ -2773,3 +2807,20 @@ const struct xfs_owner_info XFS_RMAP_OINFO_REFC = { const struct xfs_owner_info XFS_RMAP_OINFO_COW = { .oi_owner = XFS_RMAP_OWN_COW, }; + +int __init +xfs_rmap_intent_init_cache(void) +{ + xfs_rmap_intent_cache = kmem_cache_create("xfs_rmap_intent", + sizeof(struct xfs_rmap_intent), + 0, 0, NULL); + + return xfs_rmap_intent_cache != NULL ? 0 : -ENOMEM; +} + +void +xfs_rmap_intent_destroy_cache(void) +{ + kmem_cache_destroy(xfs_rmap_intent_cache); + xfs_rmap_intent_cache = NULL; +} diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index abe633403fd1..54741a591a17 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -6,6 +6,8 @@ #ifndef __XFS_RMAP_H__ #define __XFS_RMAP_H__ +struct xfs_perag; + static inline void xfs_rmap_ino_bmbt_owner( struct xfs_owner_info *oi, @@ -113,15 +115,15 @@ xfs_owner_info_pack( } int xfs_rmap_alloc(struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, + struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo); int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, + struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo); int xfs_rmap_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, uint64_t owner, uint64_t offset, - unsigned int flags, int *stat); + uint64_t owner, uint64_t offset, unsigned int flags, + struct xfs_rmap_irec *irec, int *stat); int xfs_rmap_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner, uint64_t offset, unsigned int flags, int *stat); @@ -132,12 +134,13 @@ int xfs_rmap_get_rec(struct xfs_btree_cur *cur, struct xfs_rmap_irec *irec, int *stat); typedef int (*xfs_rmap_query_range_fn)( - struct xfs_btree_cur *cur, - struct xfs_rmap_irec *rec, - void *priv); + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv); int xfs_rmap_query_range(struct xfs_btree_cur *cur, - struct xfs_rmap_irec *low_rec, struct xfs_rmap_irec *high_rec, + const struct xfs_rmap_irec *low_rec, + const struct xfs_rmap_irec *high_rec, xfs_rmap_query_range_fn fn, void *priv); int xfs_rmap_query_all(struct xfs_btree_cur *cur, xfs_rmap_query_range_fn fn, void *priv); @@ -156,8 +159,8 @@ enum xfs_rmap_intent_type { struct xfs_rmap_intent { struct list_head ri_list; enum xfs_rmap_intent_type ri_type; - uint64_t ri_owner; int ri_whichfork; + uint64_t ri_owner; struct xfs_bmbt_irec ri_bmap; }; @@ -181,16 +184,13 @@ int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type, xfs_fsblock_t startblock, xfs_filblks_t blockcount, xfs_exntst_t state, struct xfs_btree_cur **pcur); -int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno, - uint64_t owner, uint64_t offset, unsigned int flags, - struct xfs_rmap_irec *irec, int *stat); int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, uint64_t owner, uint64_t offset, unsigned int flags, struct xfs_rmap_irec *irec, int *stat); int xfs_rmap_compare(const struct xfs_rmap_irec *a, const struct xfs_rmap_irec *b); union xfs_btree_rec; -int xfs_rmap_btrec_to_irec(union xfs_btree_rec *rec, +int xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_rmap_irec *irec); int xfs_rmap_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, bool *exists); @@ -212,4 +212,9 @@ extern const struct xfs_owner_info XFS_RMAP_OINFO_INODES; extern const struct xfs_owner_info XFS_RMAP_OINFO_REFC; extern const struct xfs_owner_info XFS_RMAP_OINFO_COW; +extern struct kmem_cache *xfs_rmap_intent_cache; + +int __init xfs_rmap_intent_init_cache(void); +void xfs_rmap_intent_destroy_cache(void); + #endif /* __XFS_RMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index fc78efa52c94..7f83f62e51e0 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -9,18 +9,21 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_trans.h" #include "xfs_alloc.h" #include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" #include "xfs_trace.h" #include "xfs_error.h" #include "xfs_extent_busy.h" +#include "xfs_ag.h" #include "xfs_ag_resv.h" +static struct kmem_cache *xfs_rmapbt_cur_cache; + /* * Reverse map btree. * @@ -51,65 +54,60 @@ xfs_rmapbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agbp, cur->bc_private.a.agno); + cur->bc_ag.agbp, cur->bc_ag.pag); } STATIC void xfs_rmapbt_set_root( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr, - int inc) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int inc) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); - xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; int btnum = cur->bc_btnum; - struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); ASSERT(ptr->s != 0); agf->agf_roots[btnum] = ptr->s; be32_add_cpu(&agf->agf_levels[btnum], inc); - pag->pagf_levels[btnum] += inc; - xfs_perag_put(pag); + cur->bc_ag.pag->pagf_levels[btnum] += inc; xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); } STATIC int xfs_rmapbt_alloc_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *start, - union xfs_btree_ptr *new, - int *stat) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; + struct xfs_perag *pag = cur->bc_ag.pag; int error; xfs_agblock_t bno; /* Allocate the new block from the freelist. If we can't, give up. */ - error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, + error = xfs_alloc_get_freelist(pag, cur->bc_tp, cur->bc_ag.agbp, &bno, 1); if (error) return error; - trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno, - bno, 1); + trace_xfs_rmapbt_alloc_block(cur->bc_mp, pag->pag_agno, bno, 1); if (bno == NULLAGBLOCK) { *stat = 0; return 0; } - xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, - false); + xfs_extent_busy_reuse(cur->bc_mp, pag, bno, 1, false); - xfs_trans_agbtree_delta(cur->bc_tp, 1); new->s = cpu_to_be32(bno); be32_add_cpu(&agf->agf_rmap_blocks, 1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); - xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_private.a.agno); + xfs_ag_resv_rmapbt_alloc(cur->bc_mp, pag->pag_agno); *stat = 1; return 0; @@ -120,26 +118,25 @@ xfs_rmapbt_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; + struct xfs_perag *pag = cur->bc_ag.pag; xfs_agblock_t bno; int error; - bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); - trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno, + bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp)); + trace_xfs_rmapbt_free_block(cur->bc_mp, pag->pag_agno, bno, 1); be32_add_cpu(&agf->agf_rmap_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); - error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); + error = xfs_alloc_put_freelist(pag, cur->bc_tp, agbp, NULL, bno, 1); if (error) return error; - xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, + xfs_extent_busy_insert(cur->bc_tp, pag, bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); - xfs_trans_agbtree_delta(cur->bc_tp, -1); - - xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_private.a.agno); + xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1); return 0; } @@ -161,8 +158,8 @@ xfs_rmapbt_get_maxrecs( STATIC void xfs_rmapbt_init_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { key->rmap.rm_startblock = rec->rmap.rm_startblock; key->rmap.rm_owner = rec->rmap.rm_owner; @@ -178,11 +175,11 @@ xfs_rmapbt_init_key_from_rec( */ STATIC void xfs_rmapbt_init_high_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { - uint64_t off; - int adj; + uint64_t off; + int adj; adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1; @@ -215,22 +212,22 @@ xfs_rmapbt_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_roots[cur->bc_btnum]; } STATIC int64_t xfs_rmapbt_key_diff( - struct xfs_btree_cur *cur, - union xfs_btree_key *key) + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) { - struct xfs_rmap_irec *rec = &cur->bc_rec.r; - struct xfs_rmap_key *kp = &key->rmap; - __u64 x, y; - int64_t d; + struct xfs_rmap_irec *rec = &cur->bc_rec.r; + const struct xfs_rmap_key *kp = &key->rmap; + __u64 x, y; + int64_t d; d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock; if (d) @@ -254,14 +251,14 @@ xfs_rmapbt_key_diff( STATIC int64_t xfs_rmapbt_diff_two_keys( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { - struct xfs_rmap_key *kp1 = &k1->rmap; - struct xfs_rmap_key *kp2 = &k2->rmap; - int64_t d; - __u64 x, y; + const struct xfs_rmap_key *kp1 = &k1->rmap; + const struct xfs_rmap_key *kp2 = &k2->rmap; + int64_t d; + __u64 x, y; d = (int64_t)be32_to_cpu(kp1->rm_startblock) - be32_to_cpu(kp2->rm_startblock); @@ -309,7 +306,7 @@ xfs_rmapbt_verify( if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; - if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (!xfs_has_rmapbt(mp)) return __this_address; fa = xfs_btree_sblock_v5hdr_verify(bp); if (fa) @@ -369,9 +366,9 @@ const struct xfs_buf_ops xfs_rmapbt_buf_ops = { STATIC int xfs_rmapbt_keys_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { uint32_t x; uint32_t y; @@ -399,9 +396,9 @@ xfs_rmapbt_keys_inorder( STATIC int xfs_rmapbt_recs_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_rec *r1, - union xfs_btree_rec *r2) + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) { uint32_t x; uint32_t y; @@ -448,37 +445,95 @@ static const struct xfs_btree_ops xfs_rmapbt_ops = { .recs_inorder = xfs_rmapbt_recs_inorder, }; -/* - * Allocate a new allocation btree cursor. - */ -struct xfs_btree_cur * -xfs_rmapbt_init_cursor( +static struct xfs_btree_cur * +xfs_rmapbt_init_common( struct xfs_mount *mp, struct xfs_trans *tp, - struct xfs_buf *agbp, - xfs_agnumber_t agno) + struct xfs_perag *pag) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); struct xfs_btree_cur *cur; - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); - cur->bc_tp = tp; - cur->bc_mp = mp; /* Overlapping btree; 2 keys per pointer. */ - cur->bc_btnum = XFS_BTNUM_RMAP; + cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RMAP, + mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache); cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING; - cur->bc_blocklog = mp->m_sb.sb_blocklog; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); cur->bc_ops = &xfs_rmapbt_ops; + + /* take a reference for the cursor */ + atomic_inc(&pag->pag_ref); + cur->bc_ag.pag = pag; + + return cur; +} + +/* Create a new reverse mapping btree cursor. */ +struct xfs_btree_cur * +xfs_rmapbt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + struct xfs_perag *pag) +{ + struct xfs_agf *agf = agbp->b_addr; + struct xfs_btree_cur *cur; + + cur = xfs_rmapbt_init_common(mp, tp, pag); cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); + cur->bc_ag.agbp = agbp; + return cur; +} - cur->bc_private.a.agbp = agbp; - cur->bc_private.a.agno = agno; +/* Create a new reverse mapping btree cursor with a fake root for staging. */ +struct xfs_btree_cur * +xfs_rmapbt_stage_cursor( + struct xfs_mount *mp, + struct xbtree_afakeroot *afake, + struct xfs_perag *pag) +{ + struct xfs_btree_cur *cur; + cur = xfs_rmapbt_init_common(mp, NULL, pag); + xfs_btree_stage_afakeroot(cur, afake); return cur; } /* + * Install a new reverse mapping btree root. Caller is responsible for + * invalidating and freeing the old btree blocks. + */ +void +xfs_rmapbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + struct xfs_buf *agbp) +{ + struct xfs_agf *agf = agbp->b_addr; + struct xbtree_afakeroot *afake = cur->bc_ag.afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root); + agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels); + agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks); + xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS | + XFS_AGF_RMAP_BLOCKS); + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_rmapbt_ops); +} + +/* Calculate number of records in a reverse mapping btree block. */ +static inline unsigned int +xfs_rmapbt_block_maxrecs( + unsigned int blocklen, + bool leaf) +{ + if (leaf) + return blocklen / sizeof(struct xfs_rmap_rec); + return blocklen / + (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t)); +} + +/* * Calculate number of records in an rmap btree block. */ int @@ -487,11 +542,33 @@ xfs_rmapbt_maxrecs( int leaf) { blocklen -= XFS_RMAP_BLOCK_LEN; + return xfs_rmapbt_block_maxrecs(blocklen, leaf); +} - if (leaf) - return blocklen / sizeof(struct xfs_rmap_rec); - return blocklen / - (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t)); +/* Compute the max possible height for reverse mapping btrees. */ +unsigned int +xfs_rmapbt_maxlevels_ondisk(void) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN; + + minrecs[0] = xfs_rmapbt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = xfs_rmapbt_block_maxrecs(blocklen, false) / 2; + + /* + * Compute the asymptotic maxlevels for an rmapbt on any reflink fs. + * + * On a reflink filesystem, each AG block can have up to 2^32 (per the + * refcount record format) owners, which means that theoretically we + * could face up to 2^64 rmap records. However, we're likely to run + * out of blocks in the AG long before that happens, which means that + * we must compute the max height based on what the btree will look + * like if it consumes almost all the blocks in the AG due to maximal + * sharing factor. + */ + return xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS); } /* Compute the maximum height of an rmap btree. */ @@ -499,26 +576,36 @@ void xfs_rmapbt_compute_maxlevels( struct xfs_mount *mp) { - /* - * On a non-reflink filesystem, the maximum number of rmap - * records is the number of blocks in the AG, hence the max - * rmapbt height is log_$maxrecs($agblocks). However, with - * reflink each AG block can have up to 2^32 (per the refcount - * record format) owners, which means that theoretically we - * could face up to 2^64 rmap records. - * - * That effectively means that the max rmapbt height must be - * XFS_BTREE_MAXLEVELS. "Fortunately" we'll run out of AG - * blocks to feed the rmapbt long before the rmapbt reaches - * maximum height. The reflink code uses ag_resv_critical to - * disallow reflinking when less than 10% of the per-AG metadata - * block reservation since the fallback is a regular file copy. - */ - if (xfs_sb_version_hasreflink(&mp->m_sb)) - mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS; - else + if (!xfs_has_rmapbt(mp)) { + mp->m_rmap_maxlevels = 0; + return; + } + + if (xfs_has_reflink(mp)) { + /* + * Compute the asymptotic maxlevels for an rmap btree on a + * filesystem that supports reflink. + * + * On a reflink filesystem, each AG block can have up to 2^32 + * (per the refcount record format) owners, which means that + * theoretically we could face up to 2^64 rmap records. + * However, we're likely to run out of blocks in the AG long + * before that happens, which means that we must compute the + * max height based on what the btree will look like if it + * consumes almost all the blocks in the AG due to maximal + * sharing factor. + */ + mp->m_rmap_maxlevels = xfs_btree_space_to_height(mp->m_rmap_mnr, + mp->m_sb.sb_agblocks); + } else { + /* + * If there's no block sharing, compute the maximum rmapbt + * height assuming one rmap record per AG block. + */ mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels( mp->m_rmap_mnr, mp->m_sb.sb_agblocks); + } + ASSERT(mp->m_rmap_maxlevels <= xfs_rmapbt_maxlevels_ondisk()); } /* Calculate the refcount btree size for some records. */ @@ -552,7 +639,7 @@ int xfs_rmapbt_calc_reserves( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used) { @@ -562,14 +649,14 @@ xfs_rmapbt_calc_reserves( xfs_extlen_t tree_len; int error; - if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (!xfs_has_rmapbt(mp)) return 0; - error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (error) return error; - agf = XFS_BUF_TO_AGF(agbp); + agf = agbp->b_addr; agblocks = be32_to_cpu(agf->agf_length); tree_len = be32_to_cpu(agf->agf_rmap_blocks); xfs_trans_brelse(tp, agbp); @@ -579,8 +666,7 @@ xfs_rmapbt_calc_reserves( * never be available for the kinds of things that would require btree * expansion. We therefore can pretend the space isn't there. */ - if (mp->m_sb.sb_logstart && - XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno) + if (xfs_ag_contains_log(mp, pag->pag_agno)) agblocks -= mp->m_sb.sb_logblocks; /* Reserve 1% of the AG or enough for 1 block per record. */ @@ -589,3 +675,22 @@ xfs_rmapbt_calc_reserves( return error; } + +int __init +xfs_rmapbt_init_cur_cache(void) +{ + xfs_rmapbt_cur_cache = kmem_cache_create("xfs_rmapbt_cur", + xfs_btree_cur_sizeof(xfs_rmapbt_maxlevels_ondisk()), + 0, 0, NULL); + + if (!xfs_rmapbt_cur_cache) + return -ENOMEM; + return 0; +} + +void +xfs_rmapbt_destroy_cur_cache(void) +{ + kmem_cache_destroy(xfs_rmapbt_cur_cache); + xfs_rmapbt_cur_cache = NULL; +} diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index 820d668b063d..3244715dd111 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -9,6 +9,7 @@ struct xfs_buf; struct xfs_btree_cur; struct xfs_mount; +struct xbtree_afakeroot; /* rmaps only exist on crc enabled filesystems */ #define XFS_RMAP_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN @@ -42,7 +43,11 @@ struct xfs_mount; struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *bp, - xfs_agnumber_t agno); + struct xfs_perag *pag); +struct xfs_btree_cur *xfs_rmapbt_stage_cursor(struct xfs_mount *mp, + struct xbtree_afakeroot *afake, struct xfs_perag *pag); +void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp, struct xfs_buf *agbp); int xfs_rmapbt_maxrecs(int blocklen, int leaf); extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp); @@ -52,6 +57,11 @@ extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp, xfs_agblock_t agblocks); extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); + struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used); -#endif /* __XFS_RMAP_BTREE_H__ */ +unsigned int xfs_rmapbt_maxlevels_ondisk(void); + +int __init xfs_rmapbt_init_cur_cache(void); +void xfs_rmapbt_destroy_cur_cache(void); + +#endif /* __XFS_RMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index f42c74cb8be5..fa180ab66b73 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -56,9 +56,9 @@ xfs_rtbuf_get( xfs_trans_t *tp, /* transaction pointer */ xfs_rtblock_t block, /* block number in bitmap or summary */ int issum, /* is summary not bitmap */ - xfs_buf_t **bpp) /* output: buffer for the block */ + struct xfs_buf **bpp) /* output: buffer for the block */ { - xfs_buf_t *bp; /* block buffer, result */ + struct xfs_buf *bp; /* block buffer, result */ xfs_inode_t *ip; /* bitmap or summary inode */ xfs_bmbt_irec_t map; int nmap = 1; @@ -66,11 +66,11 @@ xfs_rtbuf_get( ip = issum ? mp->m_rsumip : mp->m_rbmip; - error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK); + error = xfs_bmapi_read(ip, block, 1, &map, &nmap, 0); if (error) return error; - if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_real_extent(&map))) + if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map))) return -EFSCORRUPTED; ASSERT(map.br_startblock != NULLFSBLOCK); @@ -101,7 +101,7 @@ xfs_rtfind_back( xfs_rtword_t *b; /* current word in buffer */ int bit; /* bit number in the word */ xfs_rtblock_t block; /* bitmap block number */ - xfs_buf_t *bp; /* buf for the block */ + struct xfs_buf *bp; /* buf for the block */ xfs_rtword_t *bufp; /* starting word in buffer */ int error; /* error value */ xfs_rtblock_t firstbit; /* first useful bit in the word */ @@ -276,7 +276,7 @@ xfs_rtfind_forw( xfs_rtword_t *b; /* current word in buffer */ int bit; /* bit number in the word */ xfs_rtblock_t block; /* bitmap block number */ - xfs_buf_t *bp; /* buf for the block */ + struct xfs_buf *bp; /* buf for the block */ xfs_rtword_t *bufp; /* starting word in buffer */ int error; /* error value */ xfs_rtblock_t i; /* current bit number rel. to start */ @@ -447,11 +447,11 @@ xfs_rtmodify_summary_int( int log, /* log2 of extent size */ xfs_rtblock_t bbno, /* bitmap block number */ int delta, /* change to make to summary info */ - xfs_buf_t **rbpp, /* in/out: summary block buffer */ + struct xfs_buf **rbpp, /* in/out: summary block buffer */ xfs_fsblock_t *rsb, /* in/out: summary block number */ xfs_suminfo_t *sum) /* out: summary info for this block */ { - xfs_buf_t *bp; /* buffer for the summary block */ + struct xfs_buf *bp; /* buffer for the summary block */ int error; /* error value */ xfs_fsblock_t sb; /* summary fsblock */ int so; /* index into the summary file */ @@ -517,7 +517,7 @@ xfs_rtmodify_summary( int log, /* log2 of extent size */ xfs_rtblock_t bbno, /* bitmap block number */ int delta, /* change to make to summary info */ - xfs_buf_t **rbpp, /* in/out: summary block buffer */ + struct xfs_buf **rbpp, /* in/out: summary block buffer */ xfs_fsblock_t *rsb) /* in/out: summary block number */ { return xfs_rtmodify_summary_int(mp, tp, log, bbno, @@ -539,7 +539,7 @@ xfs_rtmodify_range( xfs_rtword_t *b; /* current word in buffer */ int bit; /* bit number in the word */ xfs_rtblock_t block; /* bitmap block number */ - xfs_buf_t *bp; /* buf for the block */ + struct xfs_buf *bp; /* buf for the block */ xfs_rtword_t *bufp; /* starting word in buffer */ int error; /* error value */ xfs_rtword_t *first; /* first used word in the buffer */ @@ -690,7 +690,7 @@ xfs_rtfree_range( xfs_trans_t *tp, /* transaction pointer */ xfs_rtblock_t start, /* starting block to free */ xfs_extlen_t len, /* length to free */ - xfs_buf_t **rbpp, /* in/out: summary block buffer */ + struct xfs_buf **rbpp, /* in/out: summary block buffer */ xfs_fsblock_t *rsb) /* in/out: summary block number */ { xfs_rtblock_t end; /* end of the freed extent */ @@ -773,7 +773,7 @@ xfs_rtcheck_range( xfs_rtword_t *b; /* current word in buffer */ int bit; /* bit number in the word */ xfs_rtblock_t block; /* bitmap block number */ - xfs_buf_t *bp; /* buf for the block */ + struct xfs_buf *bp; /* buf for the block */ xfs_rtword_t *bufp; /* starting word in buffer */ int error; /* error value */ xfs_rtblock_t i; /* current bit number rel. to start */ @@ -969,7 +969,7 @@ xfs_rtfree_extent( int error; /* error value */ xfs_mount_t *mp; /* file system mount structure */ xfs_fsblock_t sb; /* summary file block number */ - xfs_buf_t *sumbp = NULL; /* summary file block buffer */ + struct xfs_buf *sumbp = NULL; /* summary file block buffer */ mp = tp->t_mountp; @@ -997,8 +997,8 @@ xfs_rtfree_extent( */ if (tp->t_frextents_delta + mp->m_sb.sb_frextents == mp->m_sb.sb_rextents) { - if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) - mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; + if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) + mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM; *(uint64_t *)&VFS_I(mp->m_rbmip)->i_atime = 0; xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); } @@ -1008,17 +1008,17 @@ xfs_rtfree_extent( /* Find all the free records within a given range. */ int xfs_rtalloc_query_range( + struct xfs_mount *mp, struct xfs_trans *tp, - struct xfs_rtalloc_rec *low_rec, - struct xfs_rtalloc_rec *high_rec, + const struct xfs_rtalloc_rec *low_rec, + const struct xfs_rtalloc_rec *high_rec, xfs_rtalloc_query_range_fn fn, void *priv) { struct xfs_rtalloc_rec rec; - struct xfs_mount *mp = tp->t_mountp; xfs_rtblock_t rtstart; xfs_rtblock_t rtend; - xfs_rtblock_t rem; + xfs_rtblock_t high_key; int is_free; int error = 0; @@ -1027,13 +1027,12 @@ xfs_rtalloc_query_range( if (low_rec->ar_startext >= mp->m_sb.sb_rextents || low_rec->ar_startext == high_rec->ar_startext) return 0; - if (high_rec->ar_startext > mp->m_sb.sb_rextents) - high_rec->ar_startext = mp->m_sb.sb_rextents; + + high_key = min(high_rec->ar_startext, mp->m_sb.sb_rextents - 1); /* Iterate the bitmap, looking for discrepancies. */ rtstart = low_rec->ar_startext; - rem = high_rec->ar_startext - rtstart; - while (rem) { + while (rtstart <= high_key) { /* Is the first block free? */ error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend, &is_free); @@ -1041,8 +1040,7 @@ xfs_rtalloc_query_range( break; /* How long does the extent go for? */ - error = xfs_rtfind_forw(mp, tp, rtstart, - high_rec->ar_startext - 1, &rtend); + error = xfs_rtfind_forw(mp, tp, rtstart, high_key, &rtend); if (error) break; @@ -1050,12 +1048,11 @@ xfs_rtalloc_query_range( rec.ar_startext = rtstart; rec.ar_extcount = rtend - rtstart + 1; - error = fn(tp, &rec, priv); + error = fn(mp, tp, &rec, priv); if (error) break; } - rem -= rtend - rtstart + 1; rtstart = rtend + 1; } @@ -1065,6 +1062,7 @@ xfs_rtalloc_query_range( /* Find all the free records. */ int xfs_rtalloc_query_all( + struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtalloc_query_range_fn fn, void *priv) @@ -1072,10 +1070,10 @@ xfs_rtalloc_query_all( struct xfs_rtalloc_rec keys[2]; keys[0].ar_startext = 0; - keys[1].ar_startext = tp->t_mountp->m_sb.sb_rextents - 1; + keys[1].ar_startext = mp->m_sb.sb_rextents - 1; keys[0].ar_extcount = keys[1].ar_extcount = 0; - return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv); + return xfs_rtalloc_query_range(mp, tp, &keys[0], &keys[1], fn, priv); } /* Is the given extent all free? */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 2f60fc3c99a0..a20cade590e9 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -15,7 +15,6 @@ #include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_error.h" -#include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_bmap_btree.h" @@ -25,70 +24,157 @@ #include "xfs_refcount_btree.h" #include "xfs_da_format.h" #include "xfs_health.h" +#include "xfs_ag.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. */ /* - * Reference counting access wrappers to the perag structures. - * Because we never free per-ag structures, the only thing we - * have to protect against changes is the tree structure itself. + * Check that all the V4 feature bits that the V5 filesystem format requires are + * correctly set. */ -struct xfs_perag * -xfs_perag_get( - struct xfs_mount *mp, - xfs_agnumber_t agno) +static bool +xfs_sb_validate_v5_features( + struct xfs_sb *sbp) { - struct xfs_perag *pag; - int ref = 0; + /* We must not have any unknown V4 feature bits set */ + if (sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) + return false; - rcu_read_lock(); - pag = radix_tree_lookup(&mp->m_perag_tree, agno); - if (pag) { - ASSERT(atomic_read(&pag->pag_ref) >= 0); - ref = atomic_inc_return(&pag->pag_ref); - } - rcu_read_unlock(); - trace_xfs_perag_get(mp, agno, ref, _RET_IP_); - return pag; + /* + * The CRC bit is considered an invalid V4 flag, so we have to add it + * manually to the OKBITS mask. + */ + if (sbp->sb_features2 & ~(XFS_SB_VERSION2_OKBITS | + XFS_SB_VERSION2_CRCBIT)) + return false; + + /* Now check all the required V4 feature flags are set. */ + +#define V5_VERS_FLAGS (XFS_SB_VERSION_NLINKBIT | \ + XFS_SB_VERSION_ALIGNBIT | \ + XFS_SB_VERSION_LOGV2BIT | \ + XFS_SB_VERSION_EXTFLGBIT | \ + XFS_SB_VERSION_DIRV2BIT | \ + XFS_SB_VERSION_MOREBITSBIT) + +#define V5_FEAT_FLAGS (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ + XFS_SB_VERSION2_ATTR2BIT | \ + XFS_SB_VERSION2_PROJID32BIT | \ + XFS_SB_VERSION2_CRCBIT) + + if ((sbp->sb_versionnum & V5_VERS_FLAGS) != V5_VERS_FLAGS) + return false; + if ((sbp->sb_features2 & V5_FEAT_FLAGS) != V5_FEAT_FLAGS) + return false; + return true; } /* - * search from @first to find the next perag with the given tag set. + * We support all XFS versions newer than a v4 superblock with V2 directories. */ -struct xfs_perag * -xfs_perag_get_tag( - struct xfs_mount *mp, - xfs_agnumber_t first, - int tag) +bool +xfs_sb_good_version( + struct xfs_sb *sbp) { - struct xfs_perag *pag; - int found; - int ref; - - rcu_read_lock(); - found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, - (void **)&pag, first, 1, tag); - if (found <= 0) { - rcu_read_unlock(); - return NULL; - } - ref = atomic_inc_return(&pag->pag_ref); - rcu_read_unlock(); - trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_); - return pag; + /* + * All v5 filesystems are supported, but we must check that all the + * required v4 feature flags are enabled correctly as the code checks + * those flags and not for v5 support. + */ + if (xfs_sb_is_v5(sbp)) + return xfs_sb_validate_v5_features(sbp); + + /* We must not have any unknown v4 feature bits set */ + if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || + ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && + (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) + return false; + + /* versions prior to v4 are not supported */ + if (XFS_SB_VERSION_NUM(sbp) < XFS_SB_VERSION_4) + return false; + + /* V4 filesystems need v2 directories and unwritten extents */ + if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)) + return false; + if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT)) + return false; + + /* It's a supported v4 filesystem */ + return true; } -void -xfs_perag_put( - struct xfs_perag *pag) +uint64_t +xfs_sb_version_to_features( + struct xfs_sb *sbp) { - int ref; + uint64_t features = 0; + + /* optional V4 features */ + if (sbp->sb_rblocks > 0) + features |= XFS_FEAT_REALTIME; + if (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT) + features |= XFS_FEAT_NLINK; + if (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT) + features |= XFS_FEAT_ATTR; + if (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT) + features |= XFS_FEAT_QUOTA; + if (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT) + features |= XFS_FEAT_ALIGN; + if (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT) + features |= XFS_FEAT_LOGV2; + if (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT) + features |= XFS_FEAT_DALIGN; + if (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT) + features |= XFS_FEAT_EXTFLG; + if (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT) + features |= XFS_FEAT_SECTOR; + if (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT) + features |= XFS_FEAT_ASCIICI; + if (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) { + if (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT) + features |= XFS_FEAT_LAZYSBCOUNT; + if (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT) + features |= XFS_FEAT_ATTR2; + if (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT) + features |= XFS_FEAT_PROJID32; + if (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE) + features |= XFS_FEAT_FTYPE; + } - ASSERT(atomic_read(&pag->pag_ref) > 0); - ref = atomic_dec_return(&pag->pag_ref); - trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); + if (!xfs_sb_is_v5(sbp)) + return features; + + /* Always on V5 features */ + features |= XFS_FEAT_ALIGN | XFS_FEAT_LOGV2 | XFS_FEAT_EXTFLG | + XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_ATTR2 | XFS_FEAT_PROJID32 | + XFS_FEAT_V3INODES | XFS_FEAT_CRC | XFS_FEAT_PQUOTINO; + + /* Optional V5 features */ + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT) + features |= XFS_FEAT_FINOBT; + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT) + features |= XFS_FEAT_RMAPBT; + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK) + features |= XFS_FEAT_REFLINK; + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT) + features |= XFS_FEAT_INOBTCNT; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE) + features |= XFS_FEAT_FTYPE; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES) + features |= XFS_FEAT_SPINODES; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID) + features |= XFS_FEAT_META_UUID; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_BIGTIME) + features |= XFS_FEAT_BIGTIME; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR) + features |= XFS_FEAT_NEEDSREPAIR; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NREXT64) + features |= XFS_FEAT_NREXT64; + + return features; } /* Check all the superblock fields we care about when reading one in. */ @@ -97,7 +183,7 @@ xfs_validate_sb_read( struct xfs_mount *mp, struct xfs_sb *sbp) { - if (XFS_SB_VERSION_NUM(sbp) != XFS_SB_VERSION_5) + if (!xfs_sb_is_v5(sbp)) return 0; /* @@ -117,7 +203,7 @@ xfs_validate_sb_read( "Superblock has unknown read-only compatible features (0x%x) enabled.", (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); - if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + if (!xfs_is_readonly(mp)) { xfs_warn(mp, "Attempted to mount read-only compatible filesystem read-write."); xfs_warn(mp, @@ -156,7 +242,7 @@ xfs_validate_sb_write( * secondary superblocks, so allow this usage to continue because * we never read counters from such superblocks. */ - if (XFS_BUF_ADDR(bp) == XFS_SB_DADDR && !sbp->sb_inprogress && + if (xfs_buf_daddr(bp) == XFS_SB_DADDR && !sbp->sb_inprogress && (sbp->sb_fdblocks > sbp->sb_dblocks || !xfs_verify_icount(mp, sbp->sb_icount) || sbp->sb_ifree > sbp->sb_icount)) { @@ -164,7 +250,7 @@ xfs_validate_sb_write( return -EFSCORRUPTED; } - if (XFS_SB_VERSION_NUM(sbp) != XFS_SB_VERSION_5) + if (!xfs_sb_is_v5(sbp)) return 0; /* @@ -220,49 +306,64 @@ xfs_validate_sb_common( struct xfs_buf *bp, struct xfs_sb *sbp) { - struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + struct xfs_dsb *dsb = bp->b_addr; uint32_t agcount = 0; uint32_t rem; + bool has_dalign; if (!xfs_verify_magic(bp, dsb->sb_magicnum)) { - xfs_warn(mp, "bad magic number"); + xfs_warn(mp, +"Superblock has bad magic number 0x%x. Not an XFS filesystem?", + be32_to_cpu(dsb->sb_magicnum)); return -EWRONGFS; } if (!xfs_sb_good_version(sbp)) { - xfs_warn(mp, "bad version"); + xfs_warn(mp, +"Superblock has unknown features enabled or corrupted feature masks."); return -EWRONGFS; } - if (xfs_sb_version_has_pquotino(sbp)) { - if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) { + /* + * Validate feature flags and state + */ + if (xfs_sb_is_v5(sbp)) { + if (sbp->sb_blocksize < XFS_MIN_CRC_BLOCKSIZE) { xfs_notice(mp, - "Version 5 of Super block has XFS_OQUOTA bits."); +"Block size (%u bytes) too small for Version 5 superblock (minimum %d bytes)", + sbp->sb_blocksize, XFS_MIN_CRC_BLOCKSIZE); return -EFSCORRUPTED; } - } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | - XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) { + + /* V5 has a separate project quota inode */ + if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) { xfs_notice(mp, -"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits."); + "Version 5 of Super block has XFS_OQUOTA bits."); return -EFSCORRUPTED; - } + } - /* - * Full inode chunks must be aligned to inode chunk size when - * sparse inodes are enabled to support the sparse chunk - * allocation algorithm and prevent overlapping inode records. - */ - if (xfs_sb_version_hassparseinodes(sbp)) { - uint32_t align; + /* + * Full inode chunks must be aligned to inode chunk size when + * sparse inodes are enabled to support the sparse chunk + * allocation algorithm and prevent overlapping inode records. + */ + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES) { + uint32_t align; - align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize - >> sbp->sb_blocklog; - if (sbp->sb_inoalignmt != align) { - xfs_warn(mp, + align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize + >> sbp->sb_blocklog; + if (sbp->sb_inoalignmt != align) { + xfs_warn(mp, "Inode block alignment (%u) must match chunk size (%u) for sparse inodes.", - sbp->sb_inoalignmt, align); - return -EINVAL; + sbp->sb_inoalignmt, align); + return -EINVAL; + } } + } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | + XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) { + xfs_notice(mp, +"Superblock earlier than Version 5 has XFS_{P|G}QUOTA_{ENFD|CHKD} bits."); + return -EFSCORRUPTED; } if (unlikely( @@ -328,39 +429,52 @@ xfs_validate_sb_common( return -EFSCORRUPTED; } - if (sbp->sb_unit) { - if (!xfs_sb_version_hasdalign(sbp) || - sbp->sb_unit > sbp->sb_width || - (sbp->sb_width % sbp->sb_unit) != 0) { - xfs_notice(mp, "SB stripe unit sanity check failed"); - return -EFSCORRUPTED; - } - } else if (xfs_sb_version_hasdalign(sbp)) { - xfs_notice(mp, "SB stripe alignment sanity check failed"); - return -EFSCORRUPTED; - } else if (sbp->sb_width) { - xfs_notice(mp, "SB stripe width sanity check failed"); + /* Validate the realtime geometry; stolen from xfs_repair */ + if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || + sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) { + xfs_notice(mp, + "realtime extent sanity check failed"); return -EFSCORRUPTED; } + if (sbp->sb_rblocks == 0) { + if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 || + sbp->sb_rextslog != 0 || sbp->sb_frextents != 0) { + xfs_notice(mp, + "realtime zeroed geometry check failed"); + return -EFSCORRUPTED; + } + } else { + uint64_t rexts; + uint64_t rbmblocks; - if (xfs_sb_version_hascrc(&mp->m_sb) && - sbp->sb_blocksize < XFS_MIN_CRC_BLOCKSIZE) { - xfs_notice(mp, "v5 SB sanity check failed"); - return -EFSCORRUPTED; + rexts = div_u64(sbp->sb_rblocks, sbp->sb_rextsize); + rbmblocks = howmany_64(sbp->sb_rextents, + NBBY * sbp->sb_blocksize); + + if (sbp->sb_rextents != rexts || + sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents) || + sbp->sb_rbmblocks != rbmblocks) { + xfs_notice(mp, + "realtime geometry sanity check failed"); + return -EFSCORRUPTED; + } } /* - * Until this is fixed only page-sized or smaller data blocks work. + * Either (sb_unit and !hasdalign) or (!sb_unit and hasdalign) + * would imply the image is corrupted. */ - if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { - xfs_warn(mp, - "File system with blocksize %d bytes. " - "Only pagesize (%ld) or less will currently work.", - sbp->sb_blocksize, PAGE_SIZE); - return -ENOSYS; + has_dalign = sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT; + if (!!sbp->sb_unit ^ has_dalign) { + xfs_notice(mp, "SB stripe alignment sanity check failed"); + return -EFSCORRUPTED; } + if (!xfs_validate_stripe_geometry(mp, XFS_FSB_TO_B(mp, sbp->sb_unit), + XFS_FSB_TO_B(mp, sbp->sb_width), 0, false)) + return -EFSCORRUPTED; + /* * Currently only very few inode sizes are supported. */ @@ -376,22 +490,6 @@ xfs_validate_sb_common( return -ENOSYS; } - if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || - xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { - xfs_warn(mp, - "file system too large to be mounted on this system."); - return -EFBIG; - } - - /* - * Don't touch the filesystem if a user tool thinks it owns the primary - * superblock. mkfs doesn't clear the flag from secondary supers, so - * we don't check them at all. - */ - if (XFS_BUF_ADDR(bp) == XFS_SB_DADDR && sbp->sb_inprogress) { - xfs_warn(mp, "Offline file system operation in progress!"); - return -EFSCORRUPTED; - } return 0; } @@ -420,7 +518,7 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp) * We need to do these manipilations only if we are working * with an older version of on-disk superblock. */ - if (xfs_sb_version_has_pquotino(sbp)) + if (xfs_sb_is_v5(sbp)) return; if (sbp->sb_qflags & XFS_OQUOTA_ENFD) @@ -450,7 +548,7 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp) static void __xfs_sb_from_disk( struct xfs_sb *to, - xfs_dsb_t *from, + struct xfs_dsb *from, bool convert_xquota) { to->sb_magicnum = be32_to_cpu(from->sb_magicnum); @@ -513,7 +611,8 @@ __xfs_sb_from_disk( * sb_meta_uuid is only on disk if it differs from sb_uuid and the * feature flag is set; if not set we keep it only in memory. */ - if (xfs_sb_version_hasmetauuid(to)) + if (xfs_sb_is_v5(to) && + (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID)) uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid); else uuid_copy(&to->sb_meta_uuid, &from->sb_uuid); @@ -525,7 +624,7 @@ __xfs_sb_from_disk( void xfs_sb_from_disk( struct xfs_sb *to, - xfs_dsb_t *from) + struct xfs_dsb *from) { __xfs_sb_from_disk(to, from, true); } @@ -538,7 +637,12 @@ xfs_sb_quota_to_disk( uint16_t qflags = from->sb_qflags; to->sb_uquotino = cpu_to_be64(from->sb_uquotino); - if (xfs_sb_version_has_pquotino(from)) { + + /* + * The in-memory superblock quota state matches the v5 on-disk format so + * just write them out and return + */ + if (xfs_sb_is_v5(from)) { to->sb_qflags = cpu_to_be16(from->sb_qflags); to->sb_gquotino = cpu_to_be64(from->sb_gquotino); to->sb_pquotino = cpu_to_be64(from->sb_pquotino); @@ -546,9 +650,9 @@ xfs_sb_quota_to_disk( } /* - * The in-core version of sb_qflags do not have XFS_OQUOTA_* - * flags, whereas the on-disk version does. So, convert incore - * XFS_{PG}QUOTA_* flags to on-disk XFS_OQUOTA_* flags. + * For older superblocks (v4), the in-core version of sb_qflags do not + * have XFS_OQUOTA_* flags, whereas the on-disk version does. So, + * convert incore XFS_{PG}QUOTA_* flags to on-disk XFS_OQUOTA_* flags. */ qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD | XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD); @@ -568,7 +672,7 @@ xfs_sb_quota_to_disk( * disk. If neither are active, we should NULL the inode. * * In all cases, the separate pquotino must remain 0 because it - * it beyond the "end" of the valid non-pquotino superblock. + * is beyond the "end" of the valid non-pquotino superblock. */ if (from->sb_qflags & XFS_GQUOTA_ACCT) to->sb_gquotino = cpu_to_be64(from->sb_gquotino); @@ -648,19 +752,20 @@ xfs_sb_to_disk( to->sb_features2 = cpu_to_be32(from->sb_features2); to->sb_bad_features2 = cpu_to_be32(from->sb_bad_features2); - if (xfs_sb_version_hascrc(from)) { - to->sb_features_compat = cpu_to_be32(from->sb_features_compat); - to->sb_features_ro_compat = - cpu_to_be32(from->sb_features_ro_compat); - to->sb_features_incompat = - cpu_to_be32(from->sb_features_incompat); - to->sb_features_log_incompat = - cpu_to_be32(from->sb_features_log_incompat); - to->sb_spino_align = cpu_to_be32(from->sb_spino_align); - to->sb_lsn = cpu_to_be64(from->sb_lsn); - if (xfs_sb_version_hasmetauuid(from)) - uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid); - } + if (!xfs_sb_is_v5(from)) + return; + + to->sb_features_compat = cpu_to_be32(from->sb_features_compat); + to->sb_features_ro_compat = + cpu_to_be32(from->sb_features_ro_compat); + to->sb_features_incompat = + cpu_to_be32(from->sb_features_incompat); + to->sb_features_log_incompat = + cpu_to_be32(from->sb_features_log_incompat); + to->sb_spino_align = cpu_to_be32(from->sb_spino_align); + to->sb_lsn = cpu_to_be64(from->sb_lsn); + if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID) + uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid); } /* @@ -681,7 +786,7 @@ xfs_sb_read_verify( { struct xfs_sb sb; struct xfs_mount *mp = bp->b_mount; - struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + struct xfs_dsb *dsb = bp->b_addr; int error; /* @@ -695,8 +800,8 @@ xfs_sb_read_verify( if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) { /* Only fail bad secondaries on a known V5 filesystem */ - if (bp->b_bn == XFS_SB_DADDR || - xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_buf_daddr(bp) == XFS_SB_DADDR || + xfs_has_crc(mp)) { error = -EFSBADCRC; goto out_error; } @@ -707,7 +812,7 @@ xfs_sb_read_verify( * Check all the superblock fields. Don't byteswap the xquota flags * because _verify_common checks the on-disk values. */ - __xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false); + __xfs_sb_from_disk(&sb, dsb, false); error = xfs_validate_sb_common(mp, bp, &sb); if (error) goto out_error; @@ -730,7 +835,7 @@ static void xfs_sb_quiet_read_verify( struct xfs_buf *bp) { - struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + struct xfs_dsb *dsb = bp->b_addr; if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) { /* XFS filesystem, verify noisily! */ @@ -748,13 +853,14 @@ xfs_sb_write_verify( struct xfs_sb sb; struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; + struct xfs_dsb *dsb = bp->b_addr; int error; /* * Check all the superblock fields. Don't byteswap the xquota flags * because _verify_common checks the on-disk values. */ - __xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false); + __xfs_sb_from_disk(&sb, dsb, false); error = xfs_validate_sb_common(mp, bp, &sb); if (error) goto out_error; @@ -762,11 +868,11 @@ xfs_sb_write_verify( if (error) goto out_error; - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_sb_is_v5(&sb)) return; if (bip) - XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + dsb->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn); xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF); return; @@ -839,78 +945,6 @@ xfs_sb_mount_common( } /* - * xfs_initialize_perag_data - * - * Read in each per-ag structure so we can count up the number of - * allocated inodes, free inodes and used filesystem blocks as this - * information is no longer persistent in the superblock. Once we have - * this information, write it into the in-core superblock structure. - */ -int -xfs_initialize_perag_data( - struct xfs_mount *mp, - xfs_agnumber_t agcount) -{ - xfs_agnumber_t index; - xfs_perag_t *pag; - xfs_sb_t *sbp = &mp->m_sb; - uint64_t ifree = 0; - uint64_t ialloc = 0; - uint64_t bfree = 0; - uint64_t bfreelst = 0; - uint64_t btree = 0; - uint64_t fdblocks; - int error = 0; - - for (index = 0; index < agcount; index++) { - /* - * read the agf, then the agi. This gets us - * all the information we need and populates the - * per-ag structures for us. - */ - error = xfs_alloc_pagf_init(mp, NULL, index, 0); - if (error) - return error; - - error = xfs_ialloc_pagi_init(mp, NULL, index); - if (error) - return error; - pag = xfs_perag_get(mp, index); - ifree += pag->pagi_freecount; - ialloc += pag->pagi_count; - bfree += pag->pagf_freeblks; - bfreelst += pag->pagf_flcount; - btree += pag->pagf_btreeblks; - xfs_perag_put(pag); - } - fdblocks = bfree + bfreelst + btree; - - /* - * If the new summary counts are obviously incorrect, fail the - * mount operation because that implies the AGFs are also corrupt. - * Clear FS_COUNTERS so that we don't unmount with a dirty log, which - * will prevent xfs_repair from fixing anything. - */ - if (fdblocks > sbp->sb_dblocks || ifree > ialloc) { - xfs_alert(mp, "AGF corruption. Please run xfs_repair."); - error = -EFSCORRUPTED; - goto out; - } - - /* Overwrite incore superblock counters with just-read data */ - spin_lock(&mp->m_sb_lock); - sbp->sb_ifree = ifree; - sbp->sb_icount = ialloc; - sbp->sb_fdblocks = fdblocks; - spin_unlock(&mp->m_sb_lock); - - xfs_reinit_percpu_counters(mp); -out: - xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS); - return error; -} - -/* * xfs_log_sb() can be used to copy arbitrary changes to the in-core superblock * into the superblock buffer to be logged. It does not provide the higher * level of locking that is needed to protect the in-core superblock from @@ -921,13 +955,28 @@ xfs_log_sb( struct xfs_trans *tp) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_buf *bp = xfs_trans_getsb(tp, mp); + struct xfs_buf *bp = xfs_trans_getsb(tp); - mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); - mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); - mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); + /* + * Lazy sb counters don't update the in-core superblock so do that now. + * If this is at unmount, the counters will be exactly correct, but at + * any other time they will only be ballpark correct because of + * reservations that have been taken out percpu counters. If we have an + * unclean shutdown, this will be corrected by log recovery rebuilding + * the counters from the AGF block counts. + * + * Do not update sb_frextents here because it is not part of the lazy + * sb counters, despite having a percpu counter. It is always kept + * consistent with the ondisk rtbitmap by xfs_trans_apply_sb_deltas() + * and hence we don't need have to update it here. + */ + if (xfs_has_lazysbcount(mp)) { + mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); + mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); + mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); + } - xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); + xfs_sb_to_disk(bp->b_addr, &mp->m_sb); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1); } @@ -976,17 +1025,18 @@ int xfs_update_secondary_sbs( struct xfs_mount *mp) { - xfs_agnumber_t agno; + struct xfs_perag *pag; + xfs_agnumber_t agno = 1; int saved_error = 0; int error = 0; LIST_HEAD (buffer_list); /* update secondary superblocks. */ - for (agno = 1; agno < mp->m_sb.sb_agcount; agno++) { + for_each_perag_from(mp, agno, pag) { struct xfs_buf *bp; error = xfs_buf_get(mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_SB_DADDR), + XFS_AG_DADDR(mp, pag->pag_agno, XFS_SB_DADDR), XFS_FSS_TO_BB(mp, 1), &bp); /* * If we get an error reading or writing alternate superblocks, @@ -998,7 +1048,7 @@ xfs_update_secondary_sbs( if (error) { xfs_warn(mp, "error allocating secondary superblock for ag %d", - agno); + pag->pag_agno); if (!saved_error) saved_error = error; continue; @@ -1007,7 +1057,7 @@ xfs_update_secondary_sbs( bp->b_ops = &xfs_sb_buf_ops; xfs_buf_oneshot(bp); xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); + xfs_sb_to_disk(bp->b_addr, &mp->m_sb); xfs_buf_delwri_queue(bp, &buffer_list); xfs_buf_relse(bp); @@ -1019,7 +1069,7 @@ xfs_update_secondary_sbs( if (error) { xfs_warn(mp, "write error %d updating a secondary superblock near ag %d", - error, agno); + error, pag->pag_agno); if (!saved_error) saved_error = error; continue; @@ -1051,7 +1101,7 @@ xfs_sync_sb_buf( if (error) return error; - bp = xfs_trans_getsb(tp, mp); + bp = xfs_trans_getsb(tp); xfs_log_sb(tp); xfs_trans_bhold(tp, bp); xfs_trans_set_sync(tp); @@ -1069,10 +1119,12 @@ out: void xfs_fs_geometry( - struct xfs_sb *sbp, + struct xfs_mount *mp, struct xfs_fsop_geom *geo, int struct_version) { + struct xfs_sb *sbp = &mp->m_sb; + memset(geo, 0, sizeof(struct xfs_fsop_geom)); geo->blocksize = sbp->sb_blocksize; @@ -1103,47 +1155,53 @@ xfs_fs_geometry( geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK | XFS_FSOP_GEOM_FLAGS_DIRV2 | XFS_FSOP_GEOM_FLAGS_EXTFLG; - if (xfs_sb_version_hasattr(sbp)) + if (xfs_has_attr(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR; - if (xfs_sb_version_hasquota(sbp)) + if (xfs_has_quota(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_QUOTA; - if (xfs_sb_version_hasalign(sbp)) + if (xfs_has_align(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_IALIGN; - if (xfs_sb_version_hasdalign(sbp)) + if (xfs_has_dalign(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_DALIGN; - if (xfs_sb_version_hassector(sbp)) - geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR; - if (xfs_sb_version_hasasciici(sbp)) + if (xfs_has_asciici(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_DIRV2CI; - if (xfs_sb_version_haslazysbcount(sbp)) + if (xfs_has_lazysbcount(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_LAZYSB; - if (xfs_sb_version_hasattr2(sbp)) + if (xfs_has_attr2(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR2; - if (xfs_sb_version_hasprojid32bit(sbp)) + if (xfs_has_projid32(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_PROJID32; - if (xfs_sb_version_hascrc(sbp)) + if (xfs_has_crc(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_V5SB; - if (xfs_sb_version_hasftype(sbp)) + if (xfs_has_ftype(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_FTYPE; - if (xfs_sb_version_hasfinobt(sbp)) + if (xfs_has_finobt(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_FINOBT; - if (xfs_sb_version_hassparseinodes(sbp)) + if (xfs_has_sparseinodes(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_SPINODES; - if (xfs_sb_version_hasrmapbt(sbp)) + if (xfs_has_rmapbt(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_RMAPBT; - if (xfs_sb_version_hasreflink(sbp)) + if (xfs_has_reflink(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_REFLINK; - if (xfs_sb_version_hassector(sbp)) + if (xfs_has_bigtime(mp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_BIGTIME; + if (xfs_has_inobtcounts(mp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_INOBTCNT; + if (xfs_has_sector(mp)) { + geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR; geo->logsectsize = sbp->sb_logsectsize; - else + } else { geo->logsectsize = BBSIZE; + } + if (xfs_has_large_extent_counts(mp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64; geo->rtsectsize = sbp->sb_blocksize; geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp); if (struct_version < 4) return; - if (xfs_sb_version_haslogv2(sbp)) + if (xfs_has_logv2(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_LOGV2; geo->logsunit = sbp->sb_logsunit; @@ -1198,3 +1256,61 @@ xfs_sb_get_secondary( *bpp = bp; return 0; } + +/* + * sunit, swidth, sectorsize(optional with 0) should be all in bytes, + * so users won't be confused by values in error messages. + */ +bool +xfs_validate_stripe_geometry( + struct xfs_mount *mp, + __s64 sunit, + __s64 swidth, + int sectorsize, + bool silent) +{ + if (swidth > INT_MAX) { + if (!silent) + xfs_notice(mp, +"stripe width (%lld) is too large", swidth); + return false; + } + + if (sunit > swidth) { + if (!silent) + xfs_notice(mp, +"stripe unit (%lld) is larger than the stripe width (%lld)", sunit, swidth); + return false; + } + + if (sectorsize && (int)sunit % sectorsize) { + if (!silent) + xfs_notice(mp, +"stripe unit (%lld) must be a multiple of the sector size (%d)", + sunit, sectorsize); + return false; + } + + if (sunit && !swidth) { + if (!silent) + xfs_notice(mp, +"invalid stripe unit (%lld) and stripe width of 0", sunit); + return false; + } + + if (!sunit && swidth) { + if (!silent) + xfs_notice(mp, +"invalid stripe width (%lld) and stripe unit of 0", swidth); + return false; + } + + if (sunit && (int)swidth % (int)sunit) { + if (!silent) + xfs_notice(mp, +"stripe width (%lld) must be a multiple of the stripe unit (%lld)", + swidth, sunit); + return false; + } + return true; +} diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index 92465a9a5162..a5e14740ec9a 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -13,15 +13,6 @@ struct xfs_trans; struct xfs_fsop_geom; struct xfs_perag; -/* - * perag get/put wrappers for ref counting - */ -extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t); -extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t, - int tag); -extern void xfs_perag_put(struct xfs_perag *pag); -extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t); - extern void xfs_log_sb(struct xfs_trans *tp); extern int xfs_sync_sb(struct xfs_mount *mp, bool wait); extern int xfs_sync_sb_buf(struct xfs_mount *mp); @@ -29,11 +20,13 @@ extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp); extern void xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from); extern void xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from); extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp); +extern bool xfs_sb_good_version(struct xfs_sb *sbp); +extern uint64_t xfs_sb_version_to_features(struct xfs_sb *sbp); extern int xfs_update_secondary_sbs(struct xfs_mount *mp); #define XFS_FS_GEOM_MAX_STRUCT_VER (4) -extern void xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo, +extern void xfs_fs_geometry(struct xfs_mount *mp, struct xfs_fsop_geom *geo, int struct_version); extern int xfs_sb_read_secondary(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, @@ -42,4 +35,7 @@ extern int xfs_sb_get_secondary(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp); +extern bool xfs_validate_stripe_geometry(struct xfs_mount *mp, + __s64 sunit, __s64 swidth, int sectorsize, bool silent); + #endif /* __XFS_SB_H__ */ diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index c45acbd3add9..c4381388c0c1 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -22,30 +22,26 @@ struct xfs_inode; * Buffer verifier operations are widely used, including userspace tools */ extern const struct xfs_buf_ops xfs_agf_buf_ops; -extern const struct xfs_buf_ops xfs_agi_buf_ops; -extern const struct xfs_buf_ops xfs_agf_buf_ops; extern const struct xfs_buf_ops xfs_agfl_buf_ops; -extern const struct xfs_buf_ops xfs_bnobt_buf_ops; -extern const struct xfs_buf_ops xfs_cntbt_buf_ops; -extern const struct xfs_buf_ops xfs_rmapbt_buf_ops; -extern const struct xfs_buf_ops xfs_refcountbt_buf_ops; +extern const struct xfs_buf_ops xfs_agi_buf_ops; extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops; extern const struct xfs_buf_ops xfs_bmbt_buf_ops; +extern const struct xfs_buf_ops xfs_bnobt_buf_ops; +extern const struct xfs_buf_ops xfs_cntbt_buf_ops; extern const struct xfs_buf_ops xfs_da3_node_buf_ops; extern const struct xfs_buf_ops xfs_dquot_buf_ops; -extern const struct xfs_buf_ops xfs_symlink_buf_ops; -extern const struct xfs_buf_ops xfs_agi_buf_ops; -extern const struct xfs_buf_ops xfs_inobt_buf_ops; +extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops; extern const struct xfs_buf_ops xfs_finobt_buf_ops; +extern const struct xfs_buf_ops xfs_inobt_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; -extern const struct xfs_buf_ops xfs_dquot_buf_ops; -extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops; +extern const struct xfs_buf_ops xfs_refcountbt_buf_ops; +extern const struct xfs_buf_ops xfs_rmapbt_buf_ops; +extern const struct xfs_buf_ops xfs_rtbuf_ops; extern const struct xfs_buf_ops xfs_sb_buf_ops; extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; -extern const struct xfs_buf_ops xfs_rtbuf_ops; /* log size calculation functions */ int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes); @@ -58,13 +54,23 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, /* * Values for t_flags. */ -#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */ -#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */ -#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */ -#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ -#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ -#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ -#define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */ +/* Transaction needs to be logged */ +#define XFS_TRANS_DIRTY (1u << 0) +/* Superblock is dirty and needs to be logged */ +#define XFS_TRANS_SB_DIRTY (1u << 1) +/* Transaction took a permanent log reservation */ +#define XFS_TRANS_PERM_LOG_RES (1u << 2) +/* Synchronous transaction commit needed */ +#define XFS_TRANS_SYNC (1u << 3) +/* Transaction can use reserve block pool */ +#define XFS_TRANS_RESERVE (1u << 4) +/* Transaction should avoid VFS level superblock write accounting */ +#define XFS_TRANS_NO_WRITECOUNT (1u << 5) +/* Transaction has freed blocks returned to it's reservation */ +#define XFS_TRANS_RES_FDBLKS (1u << 6) +/* Transaction contains an intent done log item */ +#define XFS_TRANS_HAS_INTENT_DONE (1u << 7) + /* * LOWMODE is used by the allocator to activate the lowspace algorithm - when * free space is running low the extent allocator may choose to allocate an @@ -175,6 +181,13 @@ struct xfs_ino_geometry { unsigned int ialloc_align; unsigned int agino_log; /* #bits for agino in inum */ + + /* precomputed default inode attribute fork offset */ + unsigned int attr_fork_offset; + + /* precomputed value for di_flags2 */ + uint64_t new_diflags2; + }; #endif /* __XFS_SHARED_H__ */ diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index 3b8260ca7d1b..bdc777b9ec4a 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -42,7 +42,7 @@ xfs_symlink_hdr_set( { struct xfs_dsymlink_hdr *dsl = bp->b_addr; - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return 0; memset(dsl, 0, sizeof(struct xfs_dsymlink_hdr)); @@ -51,7 +51,7 @@ xfs_symlink_hdr_set( dsl->sl_bytes = cpu_to_be32(size); uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid); dsl->sl_owner = cpu_to_be64(ino); - dsl->sl_blkno = cpu_to_be64(bp->b_bn); + dsl->sl_blkno = cpu_to_be64(xfs_buf_daddr(bp)); bp->b_ops = &xfs_symlink_buf_ops; return sizeof(struct xfs_dsymlink_hdr); @@ -89,13 +89,13 @@ xfs_symlink_verify( struct xfs_mount *mp = bp->b_mount; struct xfs_dsymlink_hdr *dsl = bp->b_addr; - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return __this_address; if (!xfs_verify_magic(bp, dsl->sl_magic)) return __this_address; if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (bp->b_bn != be64_to_cpu(dsl->sl_blkno)) + if (xfs_buf_daddr(bp) != be64_to_cpu(dsl->sl_blkno)) return __this_address; if (be32_to_cpu(dsl->sl_offset) + be32_to_cpu(dsl->sl_bytes) >= XFS_SYMLINK_MAXLEN) @@ -116,7 +116,7 @@ xfs_symlink_read_verify( xfs_failaddr_t fa; /* no verification of non-crc buffers */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF)) @@ -137,7 +137,7 @@ xfs_symlink_write_verify( xfs_failaddr_t fa; /* no verification of non-crc buffers */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; fa = xfs_symlink_verify(bp); @@ -173,7 +173,7 @@ xfs_symlink_local_to_remote( xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF); - if (!xfs_sb_version_hascrc(&mp->m_sb)) { + if (!xfs_has_crc(mp)) { bp->b_ops = NULL; memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); @@ -204,20 +204,16 @@ xfs_failaddr_t xfs_symlink_shortform_verify( struct xfs_inode *ip) { - char *sfp; - char *endp; - struct xfs_ifork *ifp; - int size; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + char *sfp = (char *)ifp->if_u1.if_data; + int size = ifp->if_bytes; + char *endp = sfp + size; - ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL); - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - sfp = (char *)ifp->if_u1.if_data; - size = ifp->if_bytes; - endp = sfp + size; + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); /* * Zero length symlinks should never occur in memory as they are - * never alllowed to exist on disk. + * never allowed to exist on disk. */ if (!size) return __this_address; diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 2b8ccb5b975d..8b5547073379 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -8,6 +8,8 @@ #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" @@ -27,7 +29,7 @@ xfs_trans_ijoin( struct xfs_inode *ip, uint lock_flags) { - xfs_inode_log_item_t *iip; + struct xfs_inode_log_item *iip; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (ip->i_itemp == NULL) @@ -36,6 +38,7 @@ xfs_trans_ijoin( ASSERT(iip->ili_lock_flags == 0); iip->ili_lock_flags = lock_flags; + ASSERT(!xfs_iflags_test(ip, XFS_ISTALE)); /* * Get a log_item_desc to point at the new item. @@ -67,28 +70,39 @@ xfs_trans_ichgtime( if (flags & XFS_ICHGTIME_CHG) inode->i_ctime = tv; if (flags & XFS_ICHGTIME_CREATE) - ip->i_d.di_crtime = tv; + ip->i_crtime = tv; } /* - * This is called to mark the fields indicated in fieldmask as needing - * to be logged when the transaction is committed. The inode must - * already be associated with the given transaction. + * This is called to mark the fields indicated in fieldmask as needing to be + * logged when the transaction is committed. The inode must already be + * associated with the given transaction. * - * The values for fieldmask are defined in xfs_inode_item.h. We always - * log all of the core inode if any of it has changed, and we always log - * all of the inline data/extents/b-tree root if any of them has changed. + * The values for fieldmask are defined in xfs_inode_item.h. We always log all + * of the core inode if any of it has changed, and we always log all of the + * inline data/extents/b-tree root if any of them has changed. + * + * Grab and pin the cluster buffer associated with this inode to avoid RMW + * cycles at inode writeback time. Avoid the need to add error handling to every + * xfs_trans_log_inode() call by shutting down on read error. This will cause + * transactions to fail and everything to error out, just like if we return a + * read error in a dirty transaction and cancel it. */ void xfs_trans_log_inode( - xfs_trans_t *tp, - xfs_inode_t *ip, - uint flags) + struct xfs_trans *tp, + struct xfs_inode *ip, + uint flags) { - struct inode *inode = VFS_I(ip); + struct xfs_inode_log_item *iip = ip->i_itemp; + struct inode *inode = VFS_I(ip); + uint iversion_flags = 0; - ASSERT(ip->i_itemp != NULL); + ASSERT(iip); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(!xfs_iflags_test(ip, XFS_ISTALE)); + + tp->t_flags |= XFS_TRANS_DIRTY; /* * Don't bother with i_lock for the I_DIRTY_TIME check here, as races @@ -96,22 +110,13 @@ xfs_trans_log_inode( * to log the timestamps, or will clear already cleared fields in the * worst case. */ - if (inode->i_state & (I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED)) { + if (inode->i_state & I_DIRTY_TIME) { spin_lock(&inode->i_lock); - inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED); + inode->i_state &= ~I_DIRTY_TIME; spin_unlock(&inode->i_lock); } /* - * Record the specific change for fdatasync optimisation. This - * allows fdatasync to skip log forces for inodes that are only - * timestamp dirty. We do this before the change count so that - * the core being logged in this case does not impact on fdatasync - * behaviour. - */ - ip->i_itemp->ili_fsync_fields |= flags; - - /* * First time we log the inode in a transaction, bump the inode change * counter if it is configured for this to occur. While we have the * inode locked exclusively for metadata modification, we can usually @@ -120,23 +125,89 @@ xfs_trans_log_inode( * set however, then go ahead and bump the i_version counter * unconditionally. */ - if (!test_and_set_bit(XFS_LI_DIRTY, &ip->i_itemp->ili_item.li_flags) && - IS_I_VERSION(VFS_I(ip))) { - if (inode_maybe_inc_iversion(VFS_I(ip), flags & XFS_ILOG_CORE)) - flags |= XFS_ILOG_CORE; + if (!test_and_set_bit(XFS_LI_DIRTY, &iip->ili_item.li_flags)) { + if (IS_I_VERSION(inode) && + inode_maybe_inc_iversion(inode, flags & XFS_ILOG_CORE)) + iversion_flags = XFS_ILOG_CORE; } - tp->t_flags |= XFS_TRANS_DIRTY; + /* + * If we're updating the inode core or the timestamps and it's possible + * to upgrade this inode to bigtime format, do so now. + */ + if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) && + xfs_has_bigtime(ip->i_mount) && + !xfs_inode_has_bigtime(ip)) { + ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME; + flags |= XFS_ILOG_CORE; + } + + /* + * Inode verifiers do not check that the extent size hint is an integer + * multiple of the rt extent size on a directory with both rtinherit + * and extszinherit flags set. If we're logging a directory that is + * misconfigured in this way, clear the hint. + */ + if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && + (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && + (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) { + ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | + XFS_DIFLAG_EXTSZINHERIT); + ip->i_extsize = 0; + flags |= XFS_ILOG_CORE; + } + + /* + * Record the specific change for fdatasync optimisation. This allows + * fdatasync to skip log forces for inodes that are only timestamp + * dirty. + */ + spin_lock(&iip->ili_lock); + iip->ili_fsync_fields |= flags; + + if (!iip->ili_item.li_buf) { + struct xfs_buf *bp; + int error; + + /* + * We hold the ILOCK here, so this inode is not going to be + * flushed while we are here. Further, because there is no + * buffer attached to the item, we know that there is no IO in + * progress, so nothing will clear the ili_fields while we read + * in the buffer. Hence we can safely drop the spin lock and + * read the buffer knowing that the state will not change from + * here. + */ + spin_unlock(&iip->ili_lock); + error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &bp); + if (error) { + xfs_force_shutdown(ip->i_mount, SHUTDOWN_META_IO_ERROR); + return; + } + + /* + * We need an explicit buffer reference for the log item but + * don't want the buffer to remain attached to the transaction. + * Hold the buffer but release the transaction reference once + * we've attached the inode log item to the buffer log item + * list. + */ + xfs_buf_hold(bp); + spin_lock(&iip->ili_lock); + iip->ili_item.li_buf = bp; + bp->b_flags |= _XBF_INODES; + list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list); + xfs_trans_brelse(tp, bp); + } /* - * Always OR in the bits from the ili_last_fields field. - * This is to coordinate with the xfs_iflush() and xfs_iflush_done() - * routines in the eventual clearing of the ili_fields bits. - * See the big comment in xfs_iflush() for an explanation of - * this coordination mechanism. + * Always OR in the bits from the ili_last_fields field. This is to + * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines + * in the eventual clearing of the ili_fields bits. See the big comment + * in xfs_iflush() for an explanation of this coordination mechanism. */ - flags |= ip->i_itemp->ili_last_fields; - ip->i_itemp->ili_fields |= flags; + iip->ili_fields |= (flags | iip->ili_last_fields | iversion_flags); + spin_unlock(&iip->ili_lock); } int diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 7a9c04920505..5b2f27cbdb80 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -56,30 +56,40 @@ xfs_calc_buf_res( * Per-extent log reservation for the btree changes involved in freeing or * allocating an extent. In classic XFS there were two trees that will be * modified (bnobt + cntbt). With rmap enabled, there are three trees - * (rmapbt). With reflink, there are four trees (refcountbt). The number of - * blocks reserved is based on the formula: + * (rmapbt). The number of blocks reserved is based on the formula: * * num trees * ((2 blocks/level * max depth) - 1) * * Keep in mind that max depth is calculated separately for each type of tree. */ uint -xfs_allocfree_log_count( +xfs_allocfree_block_count( struct xfs_mount *mp, uint num_ops) { uint blocks; - blocks = num_ops * 2 * (2 * mp->m_ag_maxlevels - 1); - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + blocks = num_ops * 2 * (2 * mp->m_alloc_maxlevels - 1); + if (xfs_has_rmapbt(mp)) blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1); - if (xfs_sb_version_hasreflink(&mp->m_sb)) - blocks += num_ops * (2 * mp->m_refc_maxlevels - 1); return blocks; } /* + * Per-extent log reservation for refcount btree changes. These are never done + * in the same transaction as an allocation or a free, so we compute them + * separately. + */ +static unsigned int +xfs_refcountbt_block_count( + struct xfs_mount *mp, + unsigned int num_ops) +{ + return num_ops * (2 * mp->m_refc_maxlevels - 1); +} + +/* * Logging inodes is really tricksy. They are logged in memory format, * which means that what we write into the log doesn't directly translate into * the amount of space they use on disk. @@ -136,7 +146,7 @@ xfs_calc_inobt_res( { return xfs_calc_buf_res(M_IGEO(mp)->inobt_maxlevels, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -155,7 +165,7 @@ STATIC uint xfs_calc_finobt_res( struct xfs_mount *mp) { - if (!xfs_sb_version_hasfinobt(&mp->m_sb)) + if (!xfs_has_finobt(mp)) return 0; return xfs_calc_inobt_res(mp); @@ -183,11 +193,11 @@ xfs_calc_inode_chunk_res( { uint res, size = 0; - res = xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + res = xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); if (alloc) { /* icreate tx uses ordered buffers */ - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_v3inodes(mp)) return res; size = XFS_FSB_TO_B(mp, 1); } @@ -199,18 +209,18 @@ xfs_calc_inode_chunk_res( /* * Per-extent log reservation for the btree changes involved in freeing or * allocating a realtime extent. We have to be able to log as many rtbitmap - * blocks as needed to mark inuse MAXEXTLEN blocks' worth of realtime extents, - * as well as the realtime summary block. + * blocks as needed to mark inuse XFS_BMBT_MAX_EXTLEN blocks' worth of realtime + * extents, as well as the realtime summary block. */ static unsigned int -xfs_rtalloc_log_count( +xfs_rtalloc_block_count( struct xfs_mount *mp, unsigned int num_ops) { unsigned int blksz = XFS_FSB_TO_B(mp, 1); unsigned int rtbmp_bytes; - rtbmp_bytes = (MAXEXTLEN / mp->m_sb.sb_rextsize) / NBBY; + rtbmp_bytes = (XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize) / NBBY; return (howmany(rtbmp_bytes, blksz) + 1) * num_ops; } @@ -233,6 +243,28 @@ xfs_rtalloc_log_count( * register overflow from temporaries in the calculations. */ +/* + * Compute the log reservation required to handle the refcount update + * transaction. Refcount updates are always done via deferred log items. + * + * This is calculated as: + * Data device refcount updates (t1): + * the agfs of the ags containing the blocks: nr_ops * sector size + * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size + */ +static unsigned int +xfs_calc_refcountbt_reservation( + struct xfs_mount *mp, + unsigned int nr_ops) +{ + unsigned int blksz = XFS_FSB_TO_B(mp, 1); + + if (!xfs_has_reflink(mp)) + return 0; + + return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz); +} /* * In a write transaction we can allocate a maximum of 2 @@ -247,7 +279,7 @@ xfs_rtalloc_log_count( * the inode's bmap btree: max depth * block size * the agfs of the ags from which the extents are allocated: 2 * sector * the superblock free block counter: sector size - * the realtime bitmap: ((MAXEXTLEN / rtextsize) / NBBY) bytes + * the realtime bitmap: ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes * the realtime summary: 1 block * the allocation btrees: 2 trees * (2 * max depth - 1) * block size * And the bmap_finish transaction can free bmap blocks in a join (t3): @@ -255,34 +287,65 @@ xfs_rtalloc_log_count( * the agfls of the ags containing the blocks: 2 * sector size * the super block free block counter: sector size * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + * And any refcount updates that happen in a separate transaction (t4). */ STATIC uint xfs_calc_write_reservation( - struct xfs_mount *mp) + struct xfs_mount *mp, + bool for_minlogsize) { - unsigned int t1, t2, t3; + unsigned int t1, t2, t3, t4; unsigned int blksz = XFS_FSB_TO_B(mp, 1); t1 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz); - if (xfs_sb_version_hasrealtime(&mp->m_sb)) { + if (xfs_has_realtime(mp)) { t2 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 1), blksz) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), blksz); + xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 1), blksz) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), blksz); } else { t2 = 0; } t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz); - return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + /* + * In the early days of reflink, we included enough reservation to log + * two refcountbt splits for each transaction. The codebase runs + * refcountbt updates in separate transactions now, so to compute the + * minimum log size, add the refcountbtree splits back to t1 and t3 and + * do not account them separately as t4. Reflink did not support + * realtime when the reservations were established, so no adjustment to + * t2 is needed. + */ + if (for_minlogsize) { + unsigned int adj = 0; + + if (xfs_has_reflink(mp)) + adj = xfs_calc_buf_res( + xfs_refcountbt_block_count(mp, 2), + blksz); + t1 += adj; + t3 += adj; + return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + } + + t4 = xfs_calc_refcountbt_reservation(mp, 1); + return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3)); +} + +unsigned int +xfs_calc_write_reservation_minlogsize( + struct xfs_mount *mp) +{ + return xfs_calc_write_reservation(mp, true); } /* @@ -299,38 +362,67 @@ xfs_calc_write_reservation( * the agf for each of the ags: 2 * sector size * the agfl for each of the ags: 2 * sector size * the super block to reflect the freed blocks: sector size - * the realtime bitmap: 2 exts * ((MAXEXTLEN / rtextsize) / NBBY) bytes + * the realtime bitmap: + * 2 exts * ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes * the realtime summary: 2 exts * 1 block * worst case split in allocation btrees per extent assuming 2 extents: * 2 exts * 2 trees * (2 * max depth - 1) * block size + * And any refcount updates that happen in a separate transaction (t4). */ STATIC uint xfs_calc_itruncate_reservation( - struct xfs_mount *mp) + struct xfs_mount *mp, + bool for_minlogsize) { - unsigned int t1, t2, t3; + unsigned int t1, t2, t3, t4; unsigned int blksz = XFS_FSB_TO_B(mp, 1); t1 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz); t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), blksz); + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), blksz); - if (xfs_sb_version_hasrealtime(&mp->m_sb)) { + if (xfs_has_realtime(mp)) { t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 2), blksz) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 2), blksz) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz); } else { t3 = 0; } - return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + /* + * In the early days of reflink, we included enough reservation to log + * four refcountbt splits in the same transaction as bnobt/cntbt + * updates. The codebase runs refcountbt updates in separate + * transactions now, so to compute the minimum log size, add the + * refcount btree splits back here and do not compute them separately + * as t4. Reflink did not support realtime when the reservations were + * established, so do not adjust t3. + */ + if (for_minlogsize) { + if (xfs_has_reflink(mp)) + t2 += xfs_calc_buf_res( + xfs_refcountbt_block_count(mp, 4), + blksz); + + return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + } + + t4 = xfs_calc_refcountbt_reservation(mp, 2); + return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3)); +} + +unsigned int +xfs_calc_itruncate_reservation_minlogsize( + struct xfs_mount *mp) +{ + return xfs_calc_itruncate_reservation(mp, true); } /* * In renaming a files we can modify: - * the four inodes involved: 4 * inode size + * the five inodes involved: 5 * inode size * the two directory btrees: 2 * (max depth + v2) * dir block size * the two directory bmap btrees: 2 * max depth * block size * And the bmap_finish transaction can free dir and bmap blocks (two sets @@ -345,11 +437,11 @@ xfs_calc_rename_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - max((xfs_calc_inode_res(mp, 4) + + max((xfs_calc_inode_res(mp, 5) + xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 3), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3), XFS_FSB_TO_B(mp, 1)))); } @@ -389,7 +481,7 @@ xfs_calc_link_reservation( xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)))); } @@ -423,11 +515,11 @@ xfs_calc_remove_reservation( { return XFS_DQUOT_LOGRES(mp) + xfs_calc_iunlink_add_reservation(mp) + - max((xfs_calc_inode_res(mp, 1) + + max((xfs_calc_inode_res(mp, 2) + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), XFS_FSB_TO_B(mp, 1)))); } @@ -572,7 +664,7 @@ xfs_calc_growdata_reservation( struct xfs_mount *mp) { return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -594,7 +686,7 @@ xfs_calc_growrtalloc_reservation( xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), XFS_FSB_TO_B(mp, 1)) + xfs_calc_inode_res(mp, 1) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -670,7 +762,7 @@ xfs_calc_addafork_reservation( xfs_calc_buf_res(1, mp->m_dir_geo->blksize) + xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -693,7 +785,7 @@ xfs_calc_attrinval_reservation( xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), XFS_FSB_TO_B(mp, 1)))); } @@ -760,7 +852,7 @@ xfs_calc_attrrm_reservation( XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)), (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), XFS_FSB_TO_B(mp, 1)))); } @@ -791,34 +883,19 @@ xfs_calc_qm_setqlim_reservation(void) */ STATIC uint xfs_calc_qm_dqalloc_reservation( - struct xfs_mount *mp) + struct xfs_mount *mp, + bool for_minlogsize) { - return xfs_calc_write_reservation(mp) + + return xfs_calc_write_reservation(mp, for_minlogsize) + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1); } -/* - * Turning off quotas. - * the quota off logitems: sizeof(struct xfs_qoff_logitem) * 2 - * the superblock for the quota flags: sector size - */ -STATIC uint -xfs_calc_qm_quotaoff_reservation( +unsigned int +xfs_calc_qm_dqalloc_reservation_minlogsize( struct xfs_mount *mp) { - return sizeof(struct xfs_qoff_logitem) * 2 + - xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); -} - -/* - * End of turning off quotas. - * the quota off logitems: sizeof(struct xfs_qoff_logitem) * 2 - */ -STATIC uint -xfs_calc_qm_quotaoff_end_reservation(void) -{ - return sizeof(struct xfs_qoff_logitem) * 2; + return xfs_calc_qm_dqalloc_reservation(mp, true); } /* @@ -837,23 +914,18 @@ xfs_trans_resv_calc( struct xfs_mount *mp, struct xfs_trans_resv *resp) { + int logcount_adj = 0; + /* * The following transactions are logged in physical format and * require a permanent reservation on space. */ - resp->tr_write.tr_logres = xfs_calc_write_reservation(mp); - if (xfs_sb_version_hasreflink(&mp->m_sb)) - resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; - else - resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; + resp->tr_write.tr_logres = xfs_calc_write_reservation(mp, false); + resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES; - resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp); - if (xfs_sb_version_hasreflink(&mp->m_sb)) - resp->tr_itruncate.tr_logcount = - XFS_ITRUNCATE_LOG_COUNT_REFLINK; - else - resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; + resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp, false); + resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp); @@ -909,11 +981,9 @@ xfs_trans_resv_calc( resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT; resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; - resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp); - if (xfs_sb_version_hasreflink(&mp->m_sb)) - resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; - else - resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; + resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp, + false); + resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; /* @@ -923,13 +993,6 @@ xfs_trans_resv_calc( resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(); resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT; - resp->tr_qm_quotaoff.tr_logres = xfs_calc_qm_quotaoff_reservation(mp); - resp->tr_qm_quotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT; - - resp->tr_qm_equotaoff.tr_logres = - xfs_calc_qm_quotaoff_end_reservation(); - resp->tr_qm_equotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT; - resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp); resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT; @@ -946,4 +1009,20 @@ xfs_trans_resv_calc( resp->tr_clearagi.tr_logres = xfs_calc_clear_agi_bucket_reservation(mp); resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp); resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp); + + /* + * Add one logcount for BUI items that appear with rmap or reflink, + * one logcount for refcount intent items, and one logcount for rmap + * intent items. + */ + if (xfs_has_reflink(mp) || xfs_has_rmapbt(mp)) + logcount_adj++; + if (xfs_has_reflink(mp)) + logcount_adj++; + if (xfs_has_rmapbt(mp)) + logcount_adj++; + + resp->tr_itruncate.tr_logcount += logcount_adj; + resp->tr_write.tr_logcount += logcount_adj; + resp->tr_qm_dqalloc.tr_logcount += logcount_adj; } diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index 7241ab28cf84..0554b9d775d2 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -46,8 +46,6 @@ struct xfs_trans_resv { struct xfs_trans_res tr_growrtfree; /* grow realtime freeing */ struct xfs_trans_res tr_qm_setqlim; /* adjust quota limits */ struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */ - struct xfs_trans_res tr_qm_quotaoff; /* turn quota off */ - struct xfs_trans_res tr_qm_equotaoff;/* end of turn quota off */ struct xfs_trans_res tr_sb; /* modify superblock */ struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */ }; @@ -75,7 +73,6 @@ struct xfs_trans_resv { #define XFS_DEFAULT_LOG_COUNT 1 #define XFS_DEFAULT_PERM_LOG_COUNT 2 #define XFS_ITRUNCATE_LOG_COUNT 2 -#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8 #define XFS_INACTIVE_LOG_COUNT 2 #define XFS_CREATE_LOG_COUNT 2 #define XFS_CREATE_TMPFILE_LOG_COUNT 2 @@ -85,13 +82,24 @@ struct xfs_trans_resv { #define XFS_LINK_LOG_COUNT 2 #define XFS_RENAME_LOG_COUNT 2 #define XFS_WRITE_LOG_COUNT 2 -#define XFS_WRITE_LOG_COUNT_REFLINK 8 #define XFS_ADDAFORK_LOG_COUNT 2 #define XFS_ATTRINVAL_LOG_COUNT 1 #define XFS_ATTRSET_LOG_COUNT 3 #define XFS_ATTRRM_LOG_COUNT 3 +/* + * Original log operation counts were overestimated in the early days of + * reflink. These are retained here purely for minimum log size calculations + * and must not be used for runtime reservations. + */ +#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8 +#define XFS_WRITE_LOG_COUNT_REFLINK 8 + void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp); -uint xfs_allocfree_log_count(struct xfs_mount *mp, uint num_ops); +uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops); + +unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp); +unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp); +unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp); #endif /* __XFS_TRANS_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index 88221c7a04cc..87b31c69a773 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -17,6 +17,13 @@ /* Adding one rmap could split every level up to the top of the tree. */ #define XFS_RMAPADD_SPACE_RES(mp) ((mp)->m_rmap_maxlevels) +/* + * Note that we historically set m_rmap_maxlevels to 9 when reflink is enabled, + * so we must preserve this behavior to avoid changing the transaction space + * reservations and minimum log size calculations for existing filesystems. + */ +#define XFS_OLD_REFLINK_RMAP_MAXLEVELS 9 + /* Blocks we might need to add "b" rmaps to a tree. */ #define XFS_NRMAPADD_SPACE_RES(mp, b)\ (((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \ @@ -57,8 +64,7 @@ XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK) #define XFS_IALLOC_SPACE_RES(mp) \ (M_IGEO(mp)->ialloc_blks + \ - (xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1 * \ - (M_IGEO(mp)->inobt_maxlevels - 1))) + ((xfs_has_finobt(mp) ? 2 : 1) * M_IGEO(mp)->inobt_maxlevels)) /* * Space reservation values for various transactions. @@ -75,7 +81,7 @@ #define XFS_DIOSTRAT_SPACE_RES(mp, v) \ (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v)) #define XFS_GROWFS_SPACE_RES(mp) \ - (2 * (mp)->m_ag_maxlevels) + (2 * (mp)->m_alloc_maxlevels) #define XFS_GROWFSRT_SPACE_RES(mp,b) \ ((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK)) #define XFS_LINK_SPACE_RES(mp,nl) \ @@ -94,8 +100,7 @@ #define XFS_SYMLINK_SPACE_RES(mp,nl,b) \ (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b)) #define XFS_IFREE_SPACE_RES(mp) \ - (xfs_sb_version_hasfinobt(&mp->m_sb) ? \ - M_IGEO(mp)->inobt_maxlevels : 0) + (xfs_has_finobt(mp) ? M_IGEO(mp)->inobt_maxlevels : 0) #endif /* __XFS_TRANS_SPACE_H__ */ diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c index 4f595546a639..5c2765934732 100644 --- a/fs/xfs/libxfs/xfs_types.c +++ b/fs/xfs/libxfs/xfs_types.c @@ -11,26 +11,15 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" +#include "xfs_ag.h" -/* Find the size of the AG, in blocks. */ -xfs_agblock_t -xfs_ag_block_count( - struct xfs_mount *mp, - xfs_agnumber_t agno) -{ - ASSERT(agno < mp->m_sb.sb_agcount); - - if (agno < mp->m_sb.sb_agcount - 1) - return mp->m_sb.sb_agblocks; - return mp->m_sb.sb_dblocks - (agno * mp->m_sb.sb_agblocks); -} /* * Verify that an AG block number pointer neither points outside the AG * nor points at static metadata. */ -bool -xfs_verify_agbno( +static inline bool +xfs_verify_agno_agbno( struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno) @@ -49,7 +38,7 @@ xfs_verify_agbno( * Verify that an FS block number pointer neither points outside the * filesystem nor points at static AG metadata. */ -bool +inline bool xfs_verify_fsbno( struct xfs_mount *mp, xfs_fsblock_t fsbno) @@ -58,43 +47,38 @@ xfs_verify_fsbno( if (agno >= mp->m_sb.sb_agcount) return false; - return xfs_verify_agbno(mp, agno, XFS_FSB_TO_AGBNO(mp, fsbno)); + return xfs_verify_agno_agbno(mp, agno, XFS_FSB_TO_AGBNO(mp, fsbno)); } -/* Calculate the first and last possible inode number in an AG. */ -void -xfs_agino_range( +/* + * Verify that a data device extent is fully contained inside the filesystem, + * does not cross an AG boundary, and does not point at static metadata. + */ +bool +xfs_verify_fsbext( struct xfs_mount *mp, - xfs_agnumber_t agno, - xfs_agino_t *first, - xfs_agino_t *last) + xfs_fsblock_t fsbno, + xfs_fsblock_t len) { - xfs_agblock_t bno; - xfs_agblock_t eoag; + if (fsbno + len <= fsbno) + return false; - eoag = xfs_ag_block_count(mp, agno); + if (!xfs_verify_fsbno(mp, fsbno)) + return false; - /* - * Calculate the first inode, which will be in the first - * cluster-aligned block after the AGFL. - */ - bno = round_up(XFS_AGFL_BLOCK(mp) + 1, M_IGEO(mp)->cluster_align); - *first = XFS_AGB_TO_AGINO(mp, bno); - - /* - * Calculate the last inode, which will be at the end of the - * last (aligned) cluster that can be allocated in the AG. - */ - bno = round_down(eoag, M_IGEO(mp)->cluster_align); - *last = XFS_AGB_TO_AGINO(mp, bno) - 1; + if (!xfs_verify_fsbno(mp, fsbno + len - 1)) + return false; + + return XFS_FSB_TO_AGNO(mp, fsbno) == + XFS_FSB_TO_AGNO(mp, fsbno + len - 1); } /* * Verify that an AG inode number pointer neither points outside the AG * nor points at static metadata. */ -bool -xfs_verify_agino( +static inline bool +xfs_verify_agno_agino( struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino) @@ -107,23 +91,10 @@ xfs_verify_agino( } /* - * Verify that an AG inode number pointer neither points outside the AG - * nor points at static metadata, or is NULLAGINO. - */ -bool -xfs_verify_agino_or_null( - struct xfs_mount *mp, - xfs_agnumber_t agno, - xfs_agino_t agino) -{ - return agino == NULLAGINO || xfs_verify_agino(mp, agno, agino); -} - -/* * Verify that an FS inode number pointer neither points outside the * filesystem nor points at static AG metadata. */ -bool +inline bool xfs_verify_ino( struct xfs_mount *mp, xfs_ino_t ino) @@ -135,17 +106,17 @@ xfs_verify_ino( return false; if (XFS_AGINO_TO_INO(mp, agno, agino) != ino) return false; - return xfs_verify_agino(mp, agno, agino); + return xfs_verify_agno_agino(mp, agno, agino); } /* Is this an internal inode number? */ -bool +inline bool xfs_internal_inum( struct xfs_mount *mp, xfs_ino_t ino) { return ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || - (xfs_sb_version_hasquota(&mp->m_sb) && + (xfs_has_quota(mp) && xfs_is_quota_inode(&mp->m_sb, ino)); } @@ -167,7 +138,7 @@ xfs_verify_dir_ino( * Verify that an realtime block number pointer doesn't point off the * end of the realtime device. */ -bool +inline bool xfs_verify_rtbno( struct xfs_mount *mp, xfs_rtblock_t rtbno) @@ -175,25 +146,38 @@ xfs_verify_rtbno( return rtbno < mp->m_sb.sb_rblocks; } +/* Verify that a realtime device extent is fully contained inside the volume. */ +bool +xfs_verify_rtext( + struct xfs_mount *mp, + xfs_rtblock_t rtbno, + xfs_rtblock_t len) +{ + if (rtbno + len <= rtbno) + return false; + + if (!xfs_verify_rtbno(mp, rtbno)) + return false; + + return xfs_verify_rtbno(mp, rtbno + len - 1); +} + /* Calculate the range of valid icount values. */ -void +inline void xfs_icount_range( struct xfs_mount *mp, unsigned long long *min, unsigned long long *max) { unsigned long long nr_inos = 0; + struct xfs_perag *pag; xfs_agnumber_t agno; /* root, rtbitmap, rtsum all live in the first chunk */ *min = XFS_INODES_PER_CHUNK; - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - xfs_agino_t first, last; - - xfs_agino_range(mp, agno, &first, &last); - nr_inos += last - first + 1; - } + for_each_perag(mp, agno, pag) + nr_inos += pag->agino_max - pag->agino_min + 1; *max = nr_inos; } @@ -219,3 +203,28 @@ xfs_verify_dablk( return dabno <= max_dablk; } + +/* Check that a file block offset does not exceed the maximum. */ +bool +xfs_verify_fileoff( + struct xfs_mount *mp, + xfs_fileoff_t off) +{ + return off <= XFS_MAX_FILEOFF; +} + +/* Check that a range of file block offsets do not exceed the maximum. */ +bool +xfs_verify_fileext( + struct xfs_mount *mp, + xfs_fileoff_t off, + xfs_fileoff_t len) +{ + if (off + len <= off) + return false; + + if (!xfs_verify_fileoff(mp, off)) + return false; + + return xfs_verify_fileoff(mp, off + len - 1); +} diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 397d94775440..5ebdda7e1078 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -12,8 +12,8 @@ typedef uint32_t xfs_agblock_t; /* blockno in alloc. group */ typedef uint32_t xfs_agino_t; /* inode # within allocation grp */ typedef uint32_t xfs_extlen_t; /* extent length in blocks */ typedef uint32_t xfs_agnumber_t; /* allocation group number */ -typedef int32_t xfs_extnum_t; /* # of extents in a file */ -typedef int16_t xfs_aextnum_t; /* # extents in an attribute fork */ +typedef uint64_t xfs_extnum_t; /* # of extents in a file */ +typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */ typedef int64_t xfs_fsize_t; /* bytes in a file */ typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */ @@ -21,6 +21,7 @@ typedef int32_t xfs_suminfo_t; /* type of bitmap summary info */ typedef uint32_t xfs_rtword_t; /* word type for bitmap manipulations */ typedef int64_t xfs_lsn_t; /* log sequence number */ +typedef int64_t xfs_csn_t; /* CIL sequence number */ typedef uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ typedef uint32_t xfs_dahash_t; /* dir/attr hash value */ @@ -56,13 +57,6 @@ typedef void * xfs_failaddr_t; #define NULLAGINO ((xfs_agino_t)-1) /* - * Max values for extlen, extnum, aextnum. - */ -#define MAXEXTLEN ((xfs_extlen_t)0x001fffff) /* 21 bits */ -#define MAXEXTNUM ((xfs_extnum_t)0x7fffffff) /* signed int */ -#define MAXAEXTNUM ((xfs_aextnum_t)0x7fff) /* signed short */ - -/* * Minimum and maximum blocksize and sectorsize. * The blocksize upper limit is pretty much arbitrary. * The sectorsize upper limit is due to sizeof(sb_sectsize). @@ -86,6 +80,11 @@ typedef void * xfs_failaddr_t; #define XFS_ATTR_FORK 1 #define XFS_COW_FORK 2 +#define XFS_WHICHFORK_STRINGS \ + { XFS_DATA_FORK, "data" }, \ + { XFS_ATTR_FORK, "attr" }, \ + { XFS_COW_FORK, "cow" } + /* * Min numbers of data/attr fork btree root pointers. */ @@ -167,6 +166,36 @@ typedef struct xfs_bmbt_irec xfs_exntst_t br_state; /* extent state */ } xfs_bmbt_irec_t; +enum xfs_refc_domain { + XFS_REFC_DOMAIN_SHARED = 0, + XFS_REFC_DOMAIN_COW, +}; + +#define XFS_REFC_DOMAIN_STRINGS \ + { XFS_REFC_DOMAIN_SHARED, "shared" }, \ + { XFS_REFC_DOMAIN_COW, "cow" } + +struct xfs_refcount_irec { + xfs_agblock_t rc_startblock; /* starting block number */ + xfs_extlen_t rc_blockcount; /* count of free blocks */ + xfs_nlink_t rc_refcount; /* number of inodes linked here */ + enum xfs_refc_domain rc_domain; /* shared or cow staging extent? */ +}; + +#define XFS_RMAP_ATTR_FORK (1 << 0) +#define XFS_RMAP_BMBT_BLOCK (1 << 1) +#define XFS_RMAP_UNWRITTEN (1 << 2) +#define XFS_RMAP_KEY_FLAGS (XFS_RMAP_ATTR_FORK | \ + XFS_RMAP_BMBT_BLOCK) +#define XFS_RMAP_REC_FLAGS (XFS_RMAP_UNWRITTEN) +struct xfs_rmap_irec { + xfs_agblock_t rm_startblock; /* extent start block */ + xfs_extlen_t rm_blockcount; /* extent length */ + uint64_t rm_owner; /* extent owner */ + uint64_t rm_offset; /* offset within the owner */ + unsigned int rm_flags; /* state flags */ +}; + /* per-AG block reservation types */ enum xfs_ag_resv_type { XFS_AG_RESV_NONE = 0, @@ -180,24 +209,22 @@ enum xfs_ag_resv_type { */ struct xfs_mount; -xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno); -bool xfs_verify_agbno(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t agbno); bool xfs_verify_fsbno(struct xfs_mount *mp, xfs_fsblock_t fsbno); +bool xfs_verify_fsbext(struct xfs_mount *mp, xfs_fsblock_t fsbno, + xfs_fsblock_t len); -void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t *first, xfs_agino_t *last); -bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t agino); -bool xfs_verify_agino_or_null(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t agino); bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); +bool xfs_verify_rtext(struct xfs_mount *mp, xfs_rtblock_t rtbno, + xfs_rtblock_t len); bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount); bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off); void xfs_icount_range(struct xfs_mount *mp, unsigned long long *min, unsigned long long *max); +bool xfs_verify_fileoff(struct xfs_mount *mp, xfs_fileoff_t off); +bool xfs_verify_fileext(struct xfs_mount *mp, xfs_fileoff_t off, + xfs_fileoff_t len); #endif /* __XFS_TYPES_H__ */ |