From ad431025aecda85d3ebef5e4a3aca5c1c681d0c7 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Mon, 1 Oct 2018 14:10:39 -0400 Subject: ext4: generalize extents status tree search functions Ext4 contains a few functions that are used to search for delayed extents or blocks in the extents status tree. Rather than duplicate code to add new functions to search for extents with different status values, such as written or a combination of delayed and unwritten, generalize the existing code to search for caller-specified extents status values. Also, move this code into extents_status.c where it is better associated with the data structures it operates upon, and where it can be more readily used to implement new extents status tree functions that might want a broader scope for i_es_lock. Three missing static specifiers in RFC version of patch reported and fixed by Fengguang Wu . Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o --- include/trace/events/ext4.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 0e31eb136c57..7849b7f8fd9d 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -2270,7 +2270,7 @@ TRACE_EVENT(ext4_es_remove_extent, __entry->lblk, __entry->len) ); -TRACE_EVENT(ext4_es_find_delayed_extent_range_enter, +TRACE_EVENT(ext4_es_find_extent_range_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk), TP_ARGS(inode, lblk), @@ -2292,7 +2292,7 @@ TRACE_EVENT(ext4_es_find_delayed_extent_range_enter, (unsigned long) __entry->ino, __entry->lblk) ); -TRACE_EVENT(ext4_es_find_delayed_extent_range_exit, +TRACE_EVENT(ext4_es_find_extent_range_exit, TP_PROTO(struct inode *inode, struct extent_status *es), TP_ARGS(inode, es), -- cgit v1.2.3-59-g8ed1b From 0b02f4c0d6d9e2c611dfbdd4317193e9dca740e6 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Mon, 1 Oct 2018 14:19:37 -0400 Subject: ext4: fix reserved cluster accounting at delayed write time The code in ext4_da_map_blocks sometimes reserves space for more delayed allocated clusters than it should, resulting in premature ENOSPC, exceeded quota, and inaccurate free space reporting. Fix this by checking for written and unwritten blocks shared in the same cluster with the newly delayed allocated block. A cluster reservation should not be made for a cluster for which physical space has already been allocated. Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/extents.c | 79 +++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/extents_status.c | 53 ++++++++++++++++++++++++++++++ fs/ext4/extents_status.h | 12 +++++++ fs/ext4/inode.c | 79 ++++++++++++++++++++++++++++++++++----------- include/trace/events/ext4.h | 35 ++++++++++++++++++++ 6 files changed, 241 insertions(+), 18 deletions(-) (limited to 'include/trace') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index fc0f41dbf90b..d85fd5c8a2c4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3155,6 +3155,7 @@ extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, ext4_lblk_t count, int mark_unwritten,int *err); +extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu); /* move_extent.c */ extern void ext4_double_down_write_data_sem(struct inode *first, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 95796f00e4e6..26481e543312 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5930,3 +5930,82 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, } return replaced_count; } + +/* + * ext4_clu_mapped - determine whether any block in a logical cluster has + * been mapped to a physical cluster + * + * @inode - file containing the logical cluster + * @lclu - logical cluster of interest + * + * Returns 1 if any block in the logical cluster is mapped, signifying + * that a physical cluster has been allocated for it. Otherwise, + * returns 0. Can also return negative error codes. Derived from + * ext4_ext_map_blocks(). + */ +int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_ext_path *path; + int depth, mapped = 0, err = 0; + struct ext4_extent *extent; + ext4_lblk_t first_lblk, first_lclu, last_lclu; + + /* search for the extent closest to the first block in the cluster */ + path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + goto out; + } + + depth = ext_depth(inode); + + /* + * A consistent leaf must not be empty. This situation is possible, + * though, _during_ tree modification, and it's why an assert can't + * be put in ext4_find_extent(). + */ + if (unlikely(path[depth].p_ext == NULL && depth != 0)) { + EXT4_ERROR_INODE(inode, + "bad extent address - lblock: %lu, depth: %d, pblock: %lld", + (unsigned long) EXT4_C2B(sbi, lclu), + depth, path[depth].p_block); + err = -EFSCORRUPTED; + goto out; + } + + extent = path[depth].p_ext; + + /* can't be mapped if the extent tree is empty */ + if (extent == NULL) + goto out; + + first_lblk = le32_to_cpu(extent->ee_block); + first_lclu = EXT4_B2C(sbi, first_lblk); + + /* + * Three possible outcomes at this point - found extent spanning + * the target cluster, to the left of the target cluster, or to the + * right of the target cluster. The first two cases are handled here. + * The last case indicates the target cluster is not mapped. + */ + if (lclu >= first_lclu) { + last_lclu = EXT4_B2C(sbi, first_lblk + + ext4_ext_get_actual_len(extent) - 1); + if (lclu <= last_lclu) { + mapped = 1; + } else { + first_lblk = ext4_ext_next_allocated_block(path); + first_lclu = EXT4_B2C(sbi, first_lblk); + if (lclu == first_lclu) + mapped = 1; + } + } + +out: + ext4_ext_drop_refs(path); + kfree(path); + + return err ? err : mapped; +} diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 194785ce890a..c5d456e12062 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1552,3 +1552,56 @@ bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk) return ret; } + +/* + * ext4_es_insert_delayed_block - adds a delayed block to the extents status + * tree, adding a pending reservation where + * needed + * + * @inode - file containing the newly added block + * @lblk - logical block to be added + * @allocated - indicates whether a physical cluster has been allocated for + * the logical cluster that contains the block + * + * Returns 0 on success, negative error code on failure. + */ +int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, + bool allocated) +{ + struct extent_status newes; + int err = 0; + + es_debug("add [%u/1) delayed to extent status tree of inode %lu\n", + lblk, inode->i_ino); + + newes.es_lblk = lblk; + newes.es_len = 1; + ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED); + trace_ext4_es_insert_delayed_block(inode, &newes, allocated); + + ext4_es_insert_extent_check(inode, &newes); + + write_lock(&EXT4_I(inode)->i_es_lock); + + err = __es_remove_extent(inode, lblk, lblk); + if (err != 0) + goto error; +retry: + err = __es_insert_extent(inode, &newes); + if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb), + 128, EXT4_I(inode))) + goto retry; + if (err != 0) + goto error; + + if (allocated) + __insert_pending(inode, lblk); + +error: + write_unlock(&EXT4_I(inode)->i_es_lock); + + ext4_es_print_tree(inode); + ext4_print_pending_tree(inode); + + return err; +} diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 379b7171c67c..9d3c676ec623 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -178,6 +178,16 @@ static inline int ext4_es_is_hole(struct extent_status *es) return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; } +static inline int ext4_es_is_mapped(struct extent_status *es) +{ + return (ext4_es_is_written(es) || ext4_es_is_unwritten(es)); +} + +static inline int ext4_es_is_delonly(struct extent_status *es) +{ + return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es)); +} + static inline void ext4_es_set_referenced(struct extent_status *es) { es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; @@ -232,5 +242,7 @@ extern void ext4_exit_pending(void); extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); +extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, + bool allocated); #endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b83bf3308b5e..57c6dd38f071 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1780,6 +1780,65 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); } +/* + * ext4_insert_delayed_block - adds a delayed block to the extents status + * tree, incrementing the reserved cluster/block + * count or making a pending reservation + * where needed + * + * @inode - file containing the newly added block + * @lblk - logical block to be added + * + * Returns 0 on success, negative error code on failure. + */ +static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int ret; + bool allocated = false; + + /* + * If the cluster containing lblk is shared with a delayed, + * written, or unwritten extent in a bigalloc file system, it's + * already been accounted for and does not need to be reserved. + * A pending reservation must be made for the cluster if it's + * shared with a written or unwritten extent and doesn't already + * have one. Written and unwritten extents can be purged from the + * extents status tree if the system is under memory pressure, so + * it's necessary to examine the extent tree if a search of the + * extents status tree doesn't get a match. + */ + if (sbi->s_cluster_ratio == 1) { + ret = ext4_da_reserve_space(inode); + if (ret != 0) /* ENOSPC */ + goto errout; + } else { /* bigalloc */ + if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) { + if (!ext4_es_scan_clu(inode, + &ext4_es_is_mapped, lblk)) { + ret = ext4_clu_mapped(inode, + EXT4_B2C(sbi, lblk)); + if (ret < 0) + goto errout; + if (ret == 0) { + ret = ext4_da_reserve_space(inode); + if (ret != 0) /* ENOSPC */ + goto errout; + } else { + allocated = true; + } + } else { + allocated = true; + } + } + } + + ret = ext4_es_insert_delayed_block(inode, lblk, allocated); + +errout: + return ret; +} + /* * This function is grabs code from the very beginning of * ext4_map_blocks, but assumes that the caller is from delayed write @@ -1864,25 +1923,9 @@ add_delayed: * XXX: __block_prepare_write() unmaps passed block, * is it OK? */ - /* - * If the block was allocated from previously allocated cluster, - * then we don't need to reserve it again. However we still need - * to reserve metadata for every block we're going to write. - */ - if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 || - !ext4_es_scan_clu(inode, - &ext4_es_is_delayed, map->m_lblk)) { - ret = ext4_da_reserve_space(inode); - if (ret) { - /* not enough space to reserve */ - retval = ret; - goto out_unlock; - } - } - ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - ~0, EXTENT_STATUS_DELAYED); - if (ret) { + ret = ext4_insert_delayed_block(inode, map->m_lblk); + if (ret != 0) { retval = ret; goto out_unlock; } diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 7849b7f8fd9d..6d7a943f849c 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -2512,6 +2512,41 @@ TRACE_EVENT(ext4_es_shrink, __entry->scan_time, __entry->nr_skipped, __entry->retried) ); +TRACE_EVENT(ext4_es_insert_delayed_block, + TP_PROTO(struct inode *inode, struct extent_status *es, + bool allocated), + + TP_ARGS(inode, es, allocated), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ext4_lblk_t, lblk ) + __field( ext4_lblk_t, len ) + __field( ext4_fsblk_t, pblk ) + __field( char, status ) + __field( bool, allocated ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->lblk = es->es_lblk; + __entry->len = es->es_len; + __entry->pblk = ext4_es_pblock(es); + __entry->status = ext4_es_status(es); + __entry->allocated = allocated; + ), + + TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s " + "allocated %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->lblk, __entry->len, + __entry->pblk, show_extent_status(__entry->status), + __entry->allocated) +); + /* fsmap traces */ DECLARE_EVENT_CLASS(ext4_fsmap_class, TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, -- cgit v1.2.3-59-g8ed1b From 9fe671496b6c286f9033aedfc1718d67721da0ae Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Mon, 1 Oct 2018 14:25:08 -0400 Subject: ext4: adjust reserved cluster count when removing extents Modify ext4_ext_remove_space() and the code it calls to correct the reserved cluster count for pending reservations (delayed allocated clusters shared with allocated blocks) when a block range is removed from the extent tree. Pending reservations may be found for the clusters at the ends of written or unwritten extents when a block range is removed. If a physical cluster at the end of an extent is freed, it's necessary to increment the reserved cluster count to maintain correct accounting if the corresponding logical cluster is shared with at least one delayed and unwritten extent as found in the extents status tree. Add a new function, ext4_rereserve_cluster(), to reapply a reservation on a delayed allocated cluster sharing blocks with a freed allocated cluster. To avoid ENOSPC on reservation, a flag is applied to ext4_free_blocks() to briefly defer updating the freeclusters counter when an allocated cluster is freed. This prevents another thread from allocating the freed block before the reservation can be reapplied. Redefine the partial cluster object as a struct to carry more state information and to clarify the code using it. Adjust the conditional code structure in ext4_ext_remove_space to reduce the indentation level in the main body of the code to improve readability. Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/ext4_extents.h | 13 ++ fs/ext4/extents.c | 284 +++++++++++++++++++++++++++----------------- fs/ext4/mballoc.c | 14 ++- include/trace/events/ext4.h | 60 ++++++---- 5 files changed, 238 insertions(+), 134 deletions(-) (limited to 'include/trace') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d85fd5c8a2c4..0bdbbd151d2c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -628,6 +628,7 @@ enum { #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 +#define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040 /* * ioctl commands diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index adf6668b596f..98bd0e9ee7df 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -119,6 +119,19 @@ struct ext4_ext_path { struct buffer_head *p_bh; }; +/* + * Used to record a portion of a cluster found at the beginning or end + * of an extent while traversing the extent tree during space removal. + * A partial cluster may be removed if it does not contain blocks shared + * with extents that aren't being deleted (tofree state). Otherwise, + * it cannot be removed (nofree state). + */ +struct partial_cluster { + ext4_fsblk_t pclu; /* physical cluster number */ + ext4_lblk_t lblk; /* logical block number within logical cluster */ + enum {initial, tofree, nofree} state; +}; + /* * structure for external API */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index b52ac813ca20..240b6dea5441 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2490,106 +2490,157 @@ static inline int get_default_free_blocks_flags(struct inode *inode) return 0; } +/* + * ext4_rereserve_cluster - increment the reserved cluster count when + * freeing a cluster with a pending reservation + * + * @inode - file containing the cluster + * @lblk - logical block in cluster to be reserved + * + * Increments the reserved cluster count and adjusts quota in a bigalloc + * file system when freeing a partial cluster containing at least one + * delayed and unwritten block. A partial cluster meeting that + * requirement will have a pending reservation. If so, the + * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to + * defer reserved and allocated space accounting to a subsequent call + * to this function. + */ +static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + + dquot_reclaim_block(inode, EXT4_C2B(sbi, 1)); + + spin_lock(&ei->i_block_reservation_lock); + ei->i_reserved_data_blocks++; + percpu_counter_add(&sbi->s_dirtyclusters_counter, 1); + spin_unlock(&ei->i_block_reservation_lock); + + percpu_counter_add(&sbi->s_freeclusters_counter, 1); + ext4_remove_pending(inode, lblk); +} + static int ext4_remove_blocks(handle_t *handle, struct inode *inode, struct ext4_extent *ex, - long long *partial_cluster, + struct partial_cluster *partial, ext4_lblk_t from, ext4_lblk_t to) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned short ee_len = ext4_ext_get_actual_len(ex); - ext4_fsblk_t pblk; - int flags = get_default_free_blocks_flags(inode); + ext4_fsblk_t last_pblk, pblk; + ext4_lblk_t num; + int flags; + + /* only extent tail removal is allowed */ + if (from < le32_to_cpu(ex->ee_block) || + to != le32_to_cpu(ex->ee_block) + ee_len - 1) { + ext4_error(sbi->s_sb, + "strange request: removal(2) %u-%u from %u:%u", + from, to, le32_to_cpu(ex->ee_block), ee_len); + return 0; + } + +#ifdef EXTENTS_STATS + spin_lock(&sbi->s_ext_stats_lock); + sbi->s_ext_blocks += ee_len; + sbi->s_ext_extents++; + if (ee_len < sbi->s_ext_min) + sbi->s_ext_min = ee_len; + if (ee_len > sbi->s_ext_max) + sbi->s_ext_max = ee_len; + if (ext_depth(inode) > sbi->s_depth_max) + sbi->s_depth_max = ext_depth(inode); + spin_unlock(&sbi->s_ext_stats_lock); +#endif + + trace_ext4_remove_blocks(inode, ex, from, to, partial); /* - * For bigalloc file systems, we never free a partial cluster - * at the beginning of the extent. Instead, we make a note - * that we tried freeing the cluster, and check to see if we - * need to free it on a subsequent call to ext4_remove_blocks, - * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. + * if we have a partial cluster, and it's different from the + * cluster of the last block in the extent, we free it */ - flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; + last_pblk = ext4_ext_pblock(ex) + ee_len - 1; + + if (partial->state != initial && + partial->pclu != EXT4_B2C(sbi, last_pblk)) { + if (partial->state == tofree) { + flags = get_default_free_blocks_flags(inode); + if (ext4_is_pending(inode, partial->lblk)) + flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(sbi, partial->pclu), + sbi->s_cluster_ratio, flags); + if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) + ext4_rereserve_cluster(inode, partial->lblk); + } + partial->state = initial; + } + + num = le32_to_cpu(ex->ee_block) + ee_len - from; + pblk = ext4_ext_pblock(ex) + ee_len - num; - trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster); /* - * If we have a partial cluster, and it's different from the - * cluster of the last block, we need to explicitly free the - * partial cluster here. + * We free the partial cluster at the end of the extent (if any), + * unless the cluster is used by another extent (partial_cluster + * state is nofree). If a partial cluster exists here, it must be + * shared with the last block in the extent. */ - pblk = ext4_ext_pblock(ex) + ee_len - 1; - if (*partial_cluster > 0 && - *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { + flags = get_default_free_blocks_flags(inode); + + /* partial, left end cluster aligned, right end unaligned */ + if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) && + (EXT4_LBLK_CMASK(sbi, to) >= from) && + (partial->state != nofree)) { + if (ext4_is_pending(inode, to)) + flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, - EXT4_C2B(sbi, *partial_cluster), + EXT4_PBLK_CMASK(sbi, last_pblk), sbi->s_cluster_ratio, flags); - *partial_cluster = 0; + if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) + ext4_rereserve_cluster(inode, to); + partial->state = initial; + flags = get_default_free_blocks_flags(inode); } -#ifdef EXTENTS_STATS - { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - spin_lock(&sbi->s_ext_stats_lock); - sbi->s_ext_blocks += ee_len; - sbi->s_ext_extents++; - if (ee_len < sbi->s_ext_min) - sbi->s_ext_min = ee_len; - if (ee_len > sbi->s_ext_max) - sbi->s_ext_max = ee_len; - if (ext_depth(inode) > sbi->s_depth_max) - sbi->s_depth_max = ext_depth(inode); - spin_unlock(&sbi->s_ext_stats_lock); - } -#endif - if (from >= le32_to_cpu(ex->ee_block) - && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { - /* tail removal */ - ext4_lblk_t num; - long long first_cluster; - - num = le32_to_cpu(ex->ee_block) + ee_len - from; - pblk = ext4_ext_pblock(ex) + ee_len - num; - /* - * Usually we want to free partial cluster at the end of the - * extent, except for the situation when the cluster is still - * used by any other extent (partial_cluster is negative). - */ - if (*partial_cluster < 0 && - *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1)) - flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; + flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; - ext_debug("free last %u blocks starting %llu partial %lld\n", - num, pblk, *partial_cluster); - ext4_free_blocks(handle, inode, NULL, pblk, num, flags); - /* - * If the block range to be freed didn't start at the - * beginning of a cluster, and we removed the entire - * extent and the cluster is not used by any other extent, - * save the partial cluster here, since we might need to - * delete if we determine that the truncate or punch hole - * operation has removed all of the blocks in the cluster. - * If that cluster is used by another extent, preserve its - * negative value so it isn't freed later on. - * - * If the whole extent wasn't freed, we've reached the - * start of the truncated/punched region and have finished - * removing blocks. If there's a partial cluster here it's - * shared with the remainder of the extent and is no longer - * a candidate for removal. - */ - if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) { - first_cluster = (long long) EXT4_B2C(sbi, pblk); - if (first_cluster != -*partial_cluster) - *partial_cluster = first_cluster; - } else { - *partial_cluster = 0; + /* + * For bigalloc file systems, we never free a partial cluster + * at the beginning of the extent. Instead, we check to see if we + * need to free it on a subsequent call to ext4_remove_blocks, + * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. + */ + flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; + ext4_free_blocks(handle, inode, NULL, pblk, num, flags); + + /* reset the partial cluster if we've freed past it */ + if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk)) + partial->state = initial; + + /* + * If we've freed the entire extent but the beginning is not left + * cluster aligned and is not marked as ineligible for freeing we + * record the partial cluster at the beginning of the extent. It + * wasn't freed by the preceding ext4_free_blocks() call, and we + * need to look farther to the left to determine if it's to be freed + * (not shared with another extent). Else, reset the partial + * cluster - we're either done freeing or the beginning of the + * extent is left cluster aligned. + */ + if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) { + if (partial->state == initial) { + partial->pclu = EXT4_B2C(sbi, pblk); + partial->lblk = from; + partial->state = tofree; } - } else - ext4_error(sbi->s_sb, "strange request: removal(2) " - "%u-%u from %u:%u", - from, to, le32_to_cpu(ex->ee_block), ee_len); + } else { + partial->state = initial; + } + return 0; } - /* * ext4_ext_rm_leaf() Removes the extents associated with the * blocks appearing between "start" and "end". Both "start" @@ -2608,7 +2659,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, static int ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - long long *partial_cluster, + struct partial_cluster *partial, ext4_lblk_t start, ext4_lblk_t end) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -2640,7 +2691,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); - trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); + trace_ext4_ext_rm_leaf(inode, start, ex, partial); while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { @@ -2671,8 +2722,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, */ if (sbi->s_cluster_ratio > 1) { pblk = ext4_ext_pblock(ex); - *partial_cluster = - -(long long) EXT4_B2C(sbi, pblk); + partial->pclu = EXT4_B2C(sbi, pblk); + partial->state = nofree; } ex--; ex_ee_block = le32_to_cpu(ex->ee_block); @@ -2714,8 +2765,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (err) goto out; - err = ext4_remove_blocks(handle, inode, ex, partial_cluster, - a, b); + err = ext4_remove_blocks(handle, inode, ex, partial, a, b); if (err) goto out; @@ -2769,18 +2819,23 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, * If there's a partial cluster and at least one extent remains in * the leaf, free the partial cluster if it isn't shared with the * current extent. If it is shared with the current extent - * we zero partial_cluster because we've reached the start of the + * we reset the partial cluster because we've reached the start of the * truncated/punched region and we're done removing blocks. */ - if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) { + if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) { pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; - if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { + if (partial->pclu != EXT4_B2C(sbi, pblk)) { + int flags = get_default_free_blocks_flags(inode); + + if (ext4_is_pending(inode, partial->lblk)) + flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, - EXT4_C2B(sbi, *partial_cluster), - sbi->s_cluster_ratio, - get_default_free_blocks_flags(inode)); + EXT4_C2B(sbi, partial->pclu), + sbi->s_cluster_ratio, flags); + if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) + ext4_rereserve_cluster(inode, partial->lblk); } - *partial_cluster = 0; + partial->state = initial; } /* if this leaf is free, then we should @@ -2819,10 +2874,14 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int depth = ext_depth(inode); struct ext4_ext_path *path = NULL; - long long partial_cluster = 0; + struct partial_cluster partial; handle_t *handle; int i = 0, err = 0; + partial.pclu = 0; + partial.lblk = 0; + partial.state = initial; + ext_debug("truncate since %u to %u\n", start, end); /* probably first extent we're gonna free will be last in block */ @@ -2882,8 +2941,8 @@ again: */ if (sbi->s_cluster_ratio > 1) { pblk = ext4_ext_pblock(ex) + end - ee_block + 2; - partial_cluster = - -(long long) EXT4_B2C(sbi, pblk); + partial.pclu = EXT4_B2C(sbi, pblk); + partial.state = nofree; } /* @@ -2911,9 +2970,10 @@ again: &ex); if (err) goto out; - if (pblk) - partial_cluster = - -(long long) EXT4_B2C(sbi, pblk); + if (pblk) { + partial.pclu = EXT4_B2C(sbi, pblk); + partial.state = nofree; + } } } /* @@ -2948,8 +3008,7 @@ again: if (i == depth) { /* this is leaf block */ err = ext4_ext_rm_leaf(handle, inode, path, - &partial_cluster, start, - end); + &partial, start, end); /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; @@ -3021,21 +3080,24 @@ again: } } - trace_ext4_ext_remove_space_done(inode, start, end, depth, - partial_cluster, path->p_hdr->eh_entries); + trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial, + path->p_hdr->eh_entries); /* - * If we still have something in the partial cluster and we have removed - * even the first extent, then we should free the blocks in the partial - * cluster as well. (This code will only run when there are no leaves - * to the immediate left of the truncated/punched region.) + * if there's a partial cluster and we have removed the first extent + * in the file, then we also free the partial cluster, if any */ - if (partial_cluster > 0 && err == 0) { - /* don't zero partial_cluster since it's not used afterwards */ + if (partial.state == tofree && err == 0) { + int flags = get_default_free_blocks_flags(inode); + + if (ext4_is_pending(inode, partial.lblk)) + flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, - EXT4_C2B(sbi, partial_cluster), - sbi->s_cluster_ratio, - get_default_free_blocks_flags(inode)); + EXT4_C2B(sbi, partial.pclu), + sbi->s_cluster_ratio, flags); + if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) + ext4_rereserve_cluster(inode, partial.lblk); + partial.state = initial; } /* TODO: flexible tree reduction should be here */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e29fce2fbf25..e2248083cdca 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4915,9 +4915,17 @@ do_more: &sbi->s_flex_groups[flex_group].free_clusters); } - if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) - dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); - percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); + /* + * on a bigalloc file system, defer the s_freeclusters_counter + * update to the caller (ext4_remove_space and friends) so they + * can determine if a cluster freed here should be rereserved + */ + if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) { + if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); + percpu_counter_add(&sbi->s_freeclusters_counter, + count_clusters); + } ext4_mb_unload_buddy(&e4b); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 6d7a943f849c..698e0d8a5ca4 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -17,6 +17,7 @@ struct mpage_da_data; struct ext4_map_blocks; struct extent_status; struct ext4_fsmap; +struct partial_cluster; #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) @@ -2035,21 +2036,23 @@ TRACE_EVENT(ext4_ext_show_extent, ); TRACE_EVENT(ext4_remove_blocks, - TP_PROTO(struct inode *inode, struct ext4_extent *ex, - ext4_lblk_t from, ext4_fsblk_t to, - long long partial_cluster), + TP_PROTO(struct inode *inode, struct ext4_extent *ex, + ext4_lblk_t from, ext4_fsblk_t to, + struct partial_cluster *pc), - TP_ARGS(inode, ex, from, to, partial_cluster), + TP_ARGS(inode, ex, from, to, pc), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, from ) __field( ext4_lblk_t, to ) - __field( long long, partial ) __field( ext4_fsblk_t, ee_pblk ) __field( ext4_lblk_t, ee_lblk ) __field( unsigned short, ee_len ) + __field( ext4_fsblk_t, pc_pclu ) + __field( ext4_lblk_t, pc_lblk ) + __field( int, pc_state) ), TP_fast_assign( @@ -2057,14 +2060,16 @@ TRACE_EVENT(ext4_remove_blocks, __entry->ino = inode->i_ino; __entry->from = from; __entry->to = to; - __entry->partial = partial_cluster; __entry->ee_pblk = ext4_ext_pblock(ex); __entry->ee_lblk = le32_to_cpu(ex->ee_block); __entry->ee_len = ext4_ext_get_actual_len(ex); + __entry->pc_pclu = pc->pclu; + __entry->pc_lblk = pc->lblk; + __entry->pc_state = pc->state; ), TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]" - "from %u to %u partial_cluster %lld", + "from %u to %u partial [pclu %lld lblk %u state %d]", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->ee_lblk, @@ -2072,45 +2077,53 @@ TRACE_EVENT(ext4_remove_blocks, (unsigned short) __entry->ee_len, (unsigned) __entry->from, (unsigned) __entry->to, - (long long) __entry->partial) + (long long) __entry->pc_pclu, + (unsigned int) __entry->pc_lblk, + (int) __entry->pc_state) ); TRACE_EVENT(ext4_ext_rm_leaf, TP_PROTO(struct inode *inode, ext4_lblk_t start, struct ext4_extent *ex, - long long partial_cluster), + struct partial_cluster *pc), - TP_ARGS(inode, start, ex, partial_cluster), + TP_ARGS(inode, start, ex, pc), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( long long, partial ) __field( ext4_lblk_t, start ) __field( ext4_lblk_t, ee_lblk ) __field( ext4_fsblk_t, ee_pblk ) __field( short, ee_len ) + __field( ext4_fsblk_t, pc_pclu ) + __field( ext4_lblk_t, pc_lblk ) + __field( int, pc_state) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->partial = partial_cluster; __entry->start = start; __entry->ee_lblk = le32_to_cpu(ex->ee_block); __entry->ee_pblk = ext4_ext_pblock(ex); __entry->ee_len = ext4_ext_get_actual_len(ex); + __entry->pc_pclu = pc->pclu; + __entry->pc_lblk = pc->lblk; + __entry->pc_state = pc->state; ), TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]" - "partial_cluster %lld", + "partial [pclu %lld lblk %u state %d]", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->start, (unsigned) __entry->ee_lblk, (unsigned long long) __entry->ee_pblk, (unsigned short) __entry->ee_len, - (long long) __entry->partial) + (long long) __entry->pc_pclu, + (unsigned int) __entry->pc_lblk, + (int) __entry->pc_state) ); TRACE_EVENT(ext4_ext_rm_idx, @@ -2168,9 +2181,9 @@ TRACE_EVENT(ext4_ext_remove_space, TRACE_EVENT(ext4_ext_remove_space_done, TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end, - int depth, long long partial, __le16 eh_entries), + int depth, struct partial_cluster *pc, __le16 eh_entries), - TP_ARGS(inode, start, end, depth, partial, eh_entries), + TP_ARGS(inode, start, end, depth, pc, eh_entries), TP_STRUCT__entry( __field( dev_t, dev ) @@ -2178,7 +2191,9 @@ TRACE_EVENT(ext4_ext_remove_space_done, __field( ext4_lblk_t, start ) __field( ext4_lblk_t, end ) __field( int, depth ) - __field( long long, partial ) + __field( ext4_fsblk_t, pc_pclu ) + __field( ext4_lblk_t, pc_lblk ) + __field( int, pc_state ) __field( unsigned short, eh_entries ) ), @@ -2188,18 +2203,23 @@ TRACE_EVENT(ext4_ext_remove_space_done, __entry->start = start; __entry->end = end; __entry->depth = depth; - __entry->partial = partial; + __entry->pc_pclu = pc->pclu; + __entry->pc_lblk = pc->lblk; + __entry->pc_state = pc->state; __entry->eh_entries = le16_to_cpu(eh_entries); ), - TP_printk("dev %d,%d ino %lu since %u end %u depth %d partial %lld " + TP_printk("dev %d,%d ino %lu since %u end %u depth %d " + "partial [pclu %lld lblk %u state %d] " "remaining_entries %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->start, (unsigned) __entry->end, __entry->depth, - (long long) __entry->partial, + (long long) __entry->pc_pclu, + (unsigned int) __entry->pc_lblk, + (int) __entry->pc_state, (unsigned short) __entry->eh_entries) ); -- cgit v1.2.3-59-g8ed1b