diff options
Diffstat (limited to 'fs/ext4/extents.c')
-rw-r--r-- | fs/ext4/extents.c | 1109 |
1 files changed, 882 insertions, 227 deletions
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 35f65cf4f318..74292a71b384 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -37,7 +37,6 @@ #include <linux/quotaops.h> #include <linux/string.h> #include <linux/slab.h> -#include <linux/falloc.h> #include <asm/uaccess.h> #include <linux/fiemap.h> #include "ext4_jbd2.h" @@ -51,8 +50,8 @@ */ #define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ due to ENOSPC */ -#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ -#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ +#define EXT4_EXT_MARK_UNWRIT1 0x2 /* mark first half unwritten */ +#define EXT4_EXT_MARK_UNWRIT2 0x4 /* mark second half unwritten */ #define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */ #define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */ @@ -144,6 +143,7 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, { if (path->p_bh) { /* path points to block */ + BUFFER_TRACE(path->p_bh, "get_write_access"); return ext4_journal_get_write_access(handle, path->p_bh); } /* path points to leaf/index in inode body */ @@ -161,6 +161,8 @@ int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { int err; + + WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); if (path->p_bh) { ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh)); /* path points to block */ @@ -360,8 +362,10 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) { ext4_fsblk_t block = ext4_ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); + ext4_lblk_t lblock = le32_to_cpu(ext->ee_block); + ext4_lblk_t last = lblock + len - 1; - if (len == 0) + if (lblock > last) return 0; return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); } @@ -387,11 +391,26 @@ static int ext4_valid_extent_entries(struct inode *inode, if (depth == 0) { /* leaf entries */ struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + ext4_fsblk_t pblock = 0; + ext4_lblk_t lblock = 0; + ext4_lblk_t prev = 0; + int len = 0; while (entries) { if (!ext4_valid_extent(inode, ext)) return 0; + + /* Check for overlapping extents */ + lblock = le32_to_cpu(ext->ee_block); + len = ext4_ext_get_actual_len(ext); + if ((lblock <= prev) && prev) { + pblock = ext4_ext_pblock(ext); + es->s_last_error_block = cpu_to_le64(pblock); + return 0; + } ext++; entries--; + prev = lblock + len - 1; } } else { struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); @@ -508,7 +527,7 @@ __read_extent_tree_block(const char *function, unsigned int line, lblk - prev, ~0, EXTENT_STATUS_HOLE); - if (ext4_ext_is_uninitialized(ex)) + if (ext4_ext_is_unwritten(ex)) status = EXTENT_STATUS_UNWRITTEN; ext4_es_cache_extent(inode, lblk, len, ext4_ext_pblock(ex), status); @@ -604,7 +623,7 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) } else if (path->p_ext) { ext_debug(" %d:[%d]%d:%llu ", le32_to_cpu(path->p_ext->ee_block), - ext4_ext_is_uninitialized(path->p_ext), + ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext), ext4_ext_pblock(path->p_ext)); } else @@ -630,7 +649,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), - ext4_ext_is_uninitialized(ex), + ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); } ext_debug("\n"); @@ -661,7 +680,7 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), - ext4_ext_is_uninitialized(ex), + ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), newblock); ex++; @@ -786,7 +805,7 @@ ext4_ext_binsearch(struct inode *inode, ext_debug(" -> %d:%llu:[%d]%d ", le32_to_cpu(path->p_ext->ee_block), ext4_ext_pblock(path->p_ext), - ext4_ext_is_uninitialized(path->p_ext), + ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext)); #ifdef CHECK_BINSEARCH @@ -1670,11 +1689,11 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, /* * Make sure that both extents are initialized. We don't merge - * uninitialized extents so that we can be sure that end_io code has + * unwritten extents so that we can be sure that end_io code has * the extent that was written properly split out and conversion to * initialized is trivial. */ - if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2)) + if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2)) return 0; ext1_ee_len = ext4_ext_get_actual_len(ex1); @@ -1691,6 +1710,11 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, */ if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) return 0; + if (ext4_ext_is_unwritten(ex1) && + (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || + atomic_read(&EXT4_I(inode)->i_unwritten) || + (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN))) + return 0; #ifdef AGGRESSIVE_TEST if (ext1_ee_len >= 4) return 0; @@ -1714,7 +1738,7 @@ static int ext4_ext_try_to_merge_right(struct inode *inode, { struct ext4_extent_header *eh; unsigned int depth, len; - int merge_done = 0; + int merge_done = 0, unwritten; depth = ext_depth(inode); BUG_ON(path[depth].p_hdr == NULL); @@ -1724,8 +1748,11 @@ static int ext4_ext_try_to_merge_right(struct inode *inode, if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) break; /* merge with next extent! */ + unwritten = ext4_ext_is_unwritten(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(ex + 1)); + if (unwritten) + ext4_ext_mark_unwritten(ex); if (ex + 1 < EXT_LAST_EXTENT(eh)) { len = (EXT_LAST_EXTENT(eh) - ex - 1) @@ -1783,8 +1810,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle, brelse(path[1].p_bh); ext4_free_blocks(handle, inode, NULL, blk, 1, - EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET | - EXT4_FREE_BLOCKS_RESERVE); + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } /* @@ -1834,8 +1860,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, depth = ext_depth(inode); if (!path[depth].p_ext) goto out; - b2 = le32_to_cpu(path[depth].p_ext->ee_block); - b2 &= ~(sbi->s_cluster_ratio - 1); + b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block)); /* * get the next allocated block if the extent in the path @@ -1845,7 +1870,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, b2 = ext4_ext_next_allocated_block(path); if (b2 == EXT_MAX_BLOCKS) goto out; - b2 &= ~(sbi->s_cluster_ratio - 1); + b2 = EXT4_LBLK_CMASK(sbi, b2); } /* check for wrap through zero on extent logical start block*/ @@ -1880,7 +1905,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *npath = NULL; int depth, len, err; ext4_lblk_t next; - int mb_flags = 0; + int mb_flags = 0, unwritten; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); @@ -1920,19 +1945,21 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, if (ext4_can_extents_be_merged(inode, ex, newext)) { ext_debug("append [%d]%d block to %u:[%d]%d" "(from %llu)\n", - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), - ext4_ext_is_uninitialized(ex), + ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) return err; - + unwritten = ext4_ext_is_unwritten(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); + if (unwritten) + ext4_ext_mark_unwritten(ex); eh = path[depth].p_hdr; nearex = ex; goto merge; @@ -1944,10 +1971,10 @@ prepend: ext_debug("prepend %u[%d]%d block to %u:[%d]%d" "(from %llu)\n", le32_to_cpu(newext->ee_block), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), - ext4_ext_is_uninitialized(ex), + ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, @@ -1955,10 +1982,13 @@ prepend: if (err) return err; + unwritten = ext4_ext_is_unwritten(ex); ex->ee_block = newext->ee_block; ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); + if (unwritten) + ext4_ext_mark_unwritten(ex); eh = path[depth].p_hdr; nearex = ex; goto merge; @@ -2018,7 +2048,7 @@ has_space: ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext)); nearex = EXT_FIRST_EXTENT(eh); } else { @@ -2029,7 +2059,7 @@ has_space: "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), nearex); nearex++; @@ -2040,7 +2070,7 @@ has_space: "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), nearex); } @@ -2050,7 +2080,7 @@ has_space: "move %d extents from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), len, nearex, nearex + 1); memmove(nearex + 1, nearex, @@ -2172,7 +2202,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode, es.es_lblk = le32_to_cpu(ex->ee_block); es.es_len = ext4_ext_get_actual_len(ex); es.es_pblk = ext4_ext_pblock(ex); - if (ext4_ext_is_uninitialized(ex)) + if (ext4_ext_is_unwritten(ex)) flags |= FIEMAP_EXTENT_UNWRITTEN; } @@ -2504,7 +2534,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, * extent, we have to mark the cluster as used (store negative * cluster number in partial_cluster). */ - unaligned = pblk & (sbi->s_cluster_ratio - 1); + unaligned = EXT4_PBLK_COFF(sbi, pblk); if (unaligned && (ee_len == num) && (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk)))) *partial_cluster = EXT4_B2C(sbi, pblk); @@ -2548,7 +2578,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, unsigned num; ext4_lblk_t ex_ee_block; unsigned short ex_ee_len; - unsigned uninitialized = 0; + unsigned unwritten = 0; struct ext4_extent *ex; ext4_fsblk_t pblk; @@ -2569,18 +2599,39 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); + /* + * If we're starting with an extent other than the last one in the + * node, we need to see if it shares a cluster with the extent to + * the right (towards the end of the file). If its leftmost cluster + * is this extent's rightmost cluster and it is not cluster aligned, + * we'll mark it as a partial that is not to be deallocated. + */ + + if (ex != EXT_LAST_EXTENT(eh)) { + ext4_fsblk_t current_pblk, right_pblk; + long long current_cluster, right_cluster; + + current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; + current_cluster = (long long)EXT4_B2C(sbi, current_pblk); + right_pblk = ext4_ext_pblock(ex + 1); + right_cluster = (long long)EXT4_B2C(sbi, right_pblk); + if (current_cluster == right_cluster && + EXT4_PBLK_COFF(sbi, right_pblk)) + *partial_cluster = -right_cluster; + } + trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { - if (ext4_ext_is_uninitialized(ex)) - uninitialized = 1; + if (ext4_ext_is_unwritten(ex)) + unwritten = 1; else - uninitialized = 0; + unwritten = 0; ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, - uninitialized, ex_ee_len); + unwritten, ex_ee_len); path[depth].p_ext = ex; a = ex_ee_block > start ? ex_ee_block : start; @@ -2598,7 +2649,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, * accidentally freeing it later on */ pblk = ext4_ext_pblock(ex); - if (pblk & (sbi->s_cluster_ratio - 1)) + if (EXT4_PBLK_COFF(sbi, pblk)) *partial_cluster = -((long long)EXT4_B2C(sbi, pblk)); ex--; @@ -2652,11 +2703,11 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex->ee_len = cpu_to_le16(num); /* - * Do not mark uninitialized if all the blocks in the + * Do not mark unwritten if all the blocks in the * extent have been removed. */ - if (uninitialized && num) - ext4_ext_mark_uninitialized(ex); + if (unwritten && num) + ext4_ext_mark_unwritten(ex); /* * If the extent was completely released, * we need to remove it from the leaf @@ -2694,10 +2745,15 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, err = ext4_ext_correct_indexes(handle, inode, path); /* - * Free the partial cluster only if the current extent does not - * reference it. Otherwise we might free used cluster. + * If there's a partial cluster and at least one extent remains in + * the leaf, free the partial cluster if it isn't shared with the + * current extent. If there's a partial cluster and no extents + * remain in the leaf, it can't be freed here. It can only be + * freed when it's possible to determine if it's not shared with + * any other extent - when the next leaf is processed or when space + * removal is complete. */ - if (*partial_cluster > 0 && + if (*partial_cluster > 0 && eh->eh_entries && (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != *partial_cluster)) { int flags = get_default_free_blocks_flags(inode); @@ -2800,9 +2856,9 @@ again: end < ee_block + ext4_ext_get_actual_len(ex) - 1) { int split_flag = 0; - if (ext4_ext_is_uninitialized(ex)) - split_flag = EXT4_EXT_MARK_UNINIT1 | - EXT4_EXT_MARK_UNINIT2; + if (ext4_ext_is_unwritten(ex)) + split_flag = EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2; /* * Split the extent in two so that 'end' is the last @@ -3059,7 +3115,7 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) * @path: the path to the extent * @split: the logical block where the extent is splitted. * @split_flags: indicates if the extent could be zeroout if split fails, and - * the states(init or uninit) of new extents. + * the states(init or unwritten) of new extents. * @flags: flags used to insert new extent to extent tree. * * @@ -3101,10 +3157,10 @@ static int ext4_split_extent_at(handle_t *handle, newblock = split - ee_block + ext4_ext_pblock(ex); BUG_ON(split < ee_block || split >= (ee_block + ee_len)); - BUG_ON(!ext4_ext_is_uninitialized(ex) && + BUG_ON(!ext4_ext_is_unwritten(ex) && split_flag & (EXT4_EXT_MAY_ZEROOUT | - EXT4_EXT_MARK_UNINIT1 | - EXT4_EXT_MARK_UNINIT2)); + EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -3116,8 +3172,8 @@ static int ext4_split_extent_at(handle_t *handle, * then we just change the state of the extent, and splitting * is not needed. */ - if (split_flag & EXT4_EXT_MARK_UNINIT2) - ext4_ext_mark_uninitialized(ex); + if (split_flag & EXT4_EXT_MARK_UNWRIT2) + ext4_ext_mark_unwritten(ex); else ext4_ext_mark_initialized(ex); @@ -3131,8 +3187,8 @@ static int ext4_split_extent_at(handle_t *handle, /* case a */ memcpy(&orig_ex, ex, sizeof(orig_ex)); ex->ee_len = cpu_to_le16(split - ee_block); - if (split_flag & EXT4_EXT_MARK_UNINIT1) - ext4_ext_mark_uninitialized(ex); + if (split_flag & EXT4_EXT_MARK_UNWRIT1) + ext4_ext_mark_unwritten(ex); /* * path may lead to new leaf, not to original leaf any more @@ -3146,8 +3202,8 @@ static int ext4_split_extent_at(handle_t *handle, ex2->ee_block = cpu_to_le32(split); ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); ext4_ext_store_pblock(ex2, newblock); - if (split_flag & EXT4_EXT_MARK_UNINIT2) - ext4_ext_mark_uninitialized(ex2); + if (split_flag & EXT4_EXT_MARK_UNWRIT2) + ext4_ext_mark_unwritten(ex2); err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { @@ -3198,7 +3254,7 @@ out: fix_extent_len: ex->ee_len = orig_ex.ee_len; - ext4_ext_dirty(handle, inode, path + depth); + ext4_ext_dirty(handle, inode, path + path->p_depth); return err; } @@ -3224,7 +3280,7 @@ static int ext4_split_extent(handle_t *handle, struct ext4_extent *ex; unsigned int ee_len, depth; int err = 0; - int uninitialized; + int unwritten; int split_flag1, flags1; int allocated = map->m_len; @@ -3232,14 +3288,14 @@ static int ext4_split_extent(handle_t *handle, ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - uninitialized = ext4_ext_is_uninitialized(ex); + unwritten = ext4_ext_is_unwritten(ex); if (map->m_lblk + map->m_len < ee_block + ee_len) { split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT; flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; - if (uninitialized) - split_flag1 |= EXT4_EXT_MARK_UNINIT1 | - EXT4_EXT_MARK_UNINIT2; + if (unwritten) + split_flag1 |= EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2; if (split_flag & EXT4_EXT_DATA_VALID2) split_flag1 |= EXT4_EXT_DATA_VALID1; err = ext4_split_extent_at(handle, inode, path, @@ -3259,15 +3315,20 @@ static int ext4_split_extent(handle_t *handle, return PTR_ERR(path); depth = ext_depth(inode); ex = path[depth].p_ext; - uninitialized = ext4_ext_is_uninitialized(ex); + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + return -EIO; + } + unwritten = ext4_ext_is_unwritten(ex); split_flag1 = 0; if (map->m_lblk >= ee_block) { split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; - if (uninitialized) { - split_flag1 |= EXT4_EXT_MARK_UNINIT1; + if (unwritten) { + split_flag1 |= EXT4_EXT_MARK_UNWRIT1; split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | - EXT4_EXT_MARK_UNINIT2); + EXT4_EXT_MARK_UNWRIT2); } err = ext4_split_extent_at(handle, inode, path, map->m_lblk, split_flag1, flags); @@ -3282,16 +3343,16 @@ out: /* * This function is called by ext4_ext_map_blocks() if someone tries to write - * to an uninitialized extent. It may result in splitting the uninitialized + * to an unwritten extent. It may result in splitting the unwritten * extent into multiple extents (up to three - one initialized and two - * uninitialized). + * unwritten). * There are three possibilities: * a> There is no split required: Entire extent should be initialized * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * * Pre-conditions: - * - The extent pointed to by 'path' is uninitialized. + * - The extent pointed to by 'path' is unwritten. * - The extent pointed to by 'path' contains a superset * of the logical span [map->m_lblk, map->m_lblk + map->m_len). * @@ -3337,12 +3398,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); /* Pre-conditions */ - BUG_ON(!ext4_ext_is_uninitialized(ex)); + BUG_ON(!ext4_ext_is_unwritten(ex)); BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); /* * Attempt to transfer newly initialized blocks from the currently - * uninitialized extent to its neighbor. This is much cheaper + * unwritten extent to its neighbor. This is much cheaper * than an insertion followed by a merge as those involve costly * memmove() calls. Transferring to the left is the common case in * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE) @@ -3378,7 +3439,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, * - C4: abut_ex can receive the additional blocks without * overflowing the (initialized) length limit. */ - if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/ + if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ ((prev_lblk + prev_len) == ee_block) && /*C2*/ ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ @@ -3393,7 +3454,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ex->ee_block = cpu_to_le32(ee_block + map_len); ext4_ext_store_pblock(ex, ee_pblk + map_len); ex->ee_len = cpu_to_le16(ee_len - map_len); - ext4_ext_mark_uninitialized(ex); /* Restore the flag */ + ext4_ext_mark_unwritten(ex); /* Restore the flag */ /* Extend abut_ex by 'map_len' blocks */ abut_ex->ee_len = cpu_to_le16(prev_len + map_len); @@ -3424,7 +3485,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, * - C4: abut_ex can receive the additional blocks without * overflowing the (initialized) length limit. */ - if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/ + if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ ((map->m_lblk + map_len) == next_lblk) && /*C2*/ ((ee_pblk + ee_len) == next_pblk) && /*C3*/ (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ @@ -3439,7 +3500,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, abut_ex->ee_block = cpu_to_le32(next_lblk - map_len); ext4_ext_store_pblock(abut_ex, next_pblk - map_len); ex->ee_len = cpu_to_le16(ee_len - map_len); - ext4_ext_mark_uninitialized(ex); /* Restore the flag */ + ext4_ext_mark_unwritten(ex); /* Restore the flag */ /* Extend abut_ex by 'map_len' blocks */ abut_ex->ee_len = cpu_to_le16(next_len + map_len); @@ -3461,7 +3522,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, WARN_ON(map->m_lblk < ee_block); /* * It is safe to convert extent to initialized via explicit - * zeroout only if extent is fully insde i_size or new_size. + * zeroout only if extent is fully inside i_size or new_size. */ split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; @@ -3544,26 +3605,28 @@ out: /* * This function is called by ext4_ext_map_blocks() from * ext4_get_blocks_dio_write() when DIO to write - * to an uninitialized extent. + * to an unwritten extent. * - * Writing to an uninitialized extent may result in splitting the uninitialized - * extent into multiple initialized/uninitialized extents (up to three) + * Writing to an unwritten extent may result in splitting the unwritten + * extent into multiple initialized/unwritten extents (up to three) * There are three possibilities: - * a> There is no split required: Entire extent should be uninitialized + * a> There is no split required: Entire extent should be unwritten * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * + * This works the same way in the case of initialized -> unwritten conversion. + * * One of more index blocks maybe needed if the extent tree grow after - * the uninitialized extent split. To prevent ENOSPC occur at the IO - * complete, we need to split the uninitialized extent before DIO submit - * the IO. The uninitialized extent called at this time will be split - * into three uninitialized extent(at most). After IO complete, the part + * the unwritten extent split. To prevent ENOSPC occur at the IO + * complete, we need to split the unwritten extent before DIO submit + * the IO. The unwritten extent called at this time will be split + * into three unwritten extent(at most). After IO complete, the part * being filled will be convert to initialized by the end_io callback function * via ext4_convert_unwritten_extents(). * - * Returns the size of uninitialized extent to be written on success. + * Returns the size of unwritten extent to be written on success. */ -static int ext4_split_unwritten_extents(handle_t *handle, +static int ext4_split_convert_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path, @@ -3575,9 +3638,9 @@ static int ext4_split_unwritten_extents(handle_t *handle, unsigned int ee_len; int split_flag = 0, depth; - ext_debug("ext4_split_unwritten_extents: inode %lu, logical" - "block %llu, max_blocks %u\n", inode->i_ino, - (unsigned long long)map->m_lblk, map->m_len); + ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n", + __func__, inode->i_ino, + (unsigned long long)map->m_lblk, map->m_len); eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; @@ -3592,14 +3655,79 @@ static int ext4_split_unwritten_extents(handle_t *handle, ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; - split_flag |= EXT4_EXT_MARK_UNINIT2; - if (flags & EXT4_GET_BLOCKS_CONVERT) - split_flag |= EXT4_EXT_DATA_VALID2; + /* Convert to unwritten */ + if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { + split_flag |= EXT4_EXT_DATA_VALID1; + /* Convert to initialized */ + } else if (flags & EXT4_GET_BLOCKS_CONVERT) { + split_flag |= ee_block + ee_len <= eof_block ? + EXT4_EXT_MAY_ZEROOUT : 0; + split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); + } flags |= EXT4_GET_BLOCKS_PRE_IO; return ext4_split_extent(handle, inode, path, map, split_flag, flags); } +static int ext4_convert_initialized_extents(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path) +{ + struct ext4_extent *ex; + ext4_lblk_t ee_block; + unsigned int ee_len; + int depth; + int err = 0; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + + ext_debug("%s: inode %lu, logical" + "block %llu, max_blocks %u\n", __func__, inode->i_ino, + (unsigned long long)ee_block, ee_len); + + if (ee_block != map->m_lblk || ee_len > map->m_len) { + err = ext4_split_convert_extents(handle, inode, map, path, + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); + if (err < 0) + goto out; + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; + } + depth = ext_depth(inode); + ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + err = -EIO; + goto out; + } + } + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* first mark the extent as unwritten */ + ext4_ext_mark_unwritten(ex); + + /* note: ext4_ext_correct_indexes() isn't needed here because + * borders are not changed + */ + ext4_ext_try_to_merge(handle, inode, path, ex); + + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + path->p_depth); +out: + ext4_ext_show_leaf(inode, path); + return err; +} + + static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, @@ -3633,8 +3761,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, inode->i_ino, (unsigned long long)ee_block, ee_len, (unsigned long long)map->m_lblk, map->m_len); #endif - err = ext4_split_unwritten_extents(handle, inode, map, path, - EXT4_GET_BLOCKS_CONVERT); + err = ext4_split_convert_extents(handle, inode, map, path, + EXT4_GET_BLOCKS_CONVERT); if (err < 0) goto out; ext4_ext_drop_refs(path); @@ -3753,7 +3881,7 @@ int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_lblk_t lblk_start, lblk_end; - lblk_start = lblk & (~(sbi->s_cluster_ratio - 1)); + lblk_start = EXT4_LBLK_CMASK(sbi, lblk); lblk_end = lblk_start + sbi->s_cluster_ratio - 1; return ext4_find_delalloc_range(inode, lblk_start, lblk_end); @@ -3812,9 +3940,9 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks); /* Check towards left side */ - c_offset = lblk_start & (sbi->s_cluster_ratio - 1); + c_offset = EXT4_LBLK_COFF(sbi, lblk_start); if (c_offset) { - lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1)); + lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start); lblk_to = lblk_from + c_offset - 1; if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) @@ -3822,7 +3950,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, } /* Now check towards right. */ - c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1); + c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks); if (allocated_clusters && c_offset) { lblk_from = lblk_start + num_blks; lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; @@ -3835,7 +3963,39 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, } static int -ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, +ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path, int flags, + unsigned int allocated, ext4_fsblk_t newblock) +{ + int ret = 0; + int err = 0; + + /* + * Make sure that the extent is no bigger than we support with + * unwritten extent + */ + if (map->m_len > EXT_UNWRITTEN_MAX_LEN) + map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; + + ret = ext4_convert_initialized_extents(handle, inode, map, + path); + if (ret >= 0) { + ext4_update_inode_fsync_trans(handle, inode, 1); + err = check_eofblocks_fl(handle, inode, map->m_lblk, + path, map->m_len); + } else + err = ret; + map->m_flags |= EXT4_MAP_UNWRITTEN; + if (allocated > map->m_len) + allocated = map->m_len; + map->m_len = allocated; + + return err ? err : allocated; +} + +static int +ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path, int flags, unsigned int allocated, ext4_fsblk_t newblock) @@ -3844,25 +4004,25 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, int err = 0; ext4_io_end_t *io = ext4_inode_aio(inode); - ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical " + ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical " "block %llu, max_blocks %u, flags %x, allocated %u\n", inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, flags, allocated); ext4_ext_show_leaf(inode, path); /* - * When writing into uninitialized space, we should not fail to + * When writing into unwritten space, we should not fail to * allocate metadata blocks for the new extent block if needed. */ flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; - trace_ext4_ext_handle_uninitialized_extents(inode, map, flags, + trace_ext4_ext_handle_unwritten_extents(inode, map, flags, allocated, newblock); /* get_block() before submit the IO, split the extent */ - if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { - ret = ext4_split_unwritten_extents(handle, inode, map, - path, flags); + if (flags & EXT4_GET_BLOCKS_PRE_IO) { + ret = ext4_split_convert_extents(handle, inode, map, + path, flags | EXT4_GET_BLOCKS_CONVERT); if (ret <= 0) goto out; /* @@ -3875,12 +4035,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); map->m_flags |= EXT4_MAP_UNWRITTEN; - if (ext4_should_dioread_nolock(inode)) - map->m_flags |= EXT4_MAP_UNINIT; goto out; } /* IO end_io complete, convert the filled extent to written */ - if ((flags & EXT4_GET_BLOCKS_CONVERT)) { + if (flags & EXT4_GET_BLOCKS_CONVERT) { ret = ext4_convert_unwritten_extents_endio(handle, inode, map, path); if (ret >= 0) { @@ -3890,6 +4048,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, } else err = ret; map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = newblock; if (allocated > map->m_len) allocated = map->m_len; map->m_len = allocated; @@ -3900,7 +4059,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, * repeat fallocate creation request * we already have an unwritten extent */ - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) { + if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { map->m_flags |= EXT4_MAP_UNWRITTEN; goto map_out; } @@ -3976,10 +4135,6 @@ out1: map->m_pblk = newblock; map->m_len = allocated; out2: - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } return err ? err : allocated; } @@ -4030,7 +4185,7 @@ static int get_implied_cluster_alloc(struct super_block *sb, struct ext4_ext_path *path) { struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1); + ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); ext4_lblk_t ex_cluster_start, ex_cluster_end; ext4_lblk_t rr_cluster_start; ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); @@ -4048,8 +4203,7 @@ static int get_implied_cluster_alloc(struct super_block *sb, (rr_cluster_start == ex_cluster_start)) { if (rr_cluster_start == ex_cluster_end) ee_start += ee_len - 1; - map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) + - c_offset; + map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset; map->m_len = min(map->m_len, (unsigned) sbi->s_cluster_ratio - c_offset); /* @@ -4112,7 +4266,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_extent newex, *ex, *ex2; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_fsblk_t newblock = 0; - int free_on_err = 0, err = 0, depth; + int free_on_err = 0, err = 0, depth, ret; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; @@ -4154,8 +4308,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t ee_start = ext4_ext_pblock(ex); unsigned short ee_len; + /* - * Uninitialized extents are treated as holes, except that + * unwritten extents are treated as holes, except that * we split out initialized portions during a write. */ ee_len = ext4_ext_get_actual_len(ex); @@ -4170,13 +4325,27 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, ee_block, ee_len, newblock); - if (!ext4_ext_is_uninitialized(ex)) + /* + * If the extent is initialized check whether the + * caller wants to convert it to unwritten. + */ + if ((!ext4_ext_is_unwritten(ex)) && + (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { + allocated = ext4_ext_convert_initialized_extent( + handle, inode, map, path, flags, + allocated, newblock); + goto out2; + } else if (!ext4_ext_is_unwritten(ex)) goto out; - allocated = ext4_ext_handle_uninitialized_extents( + ret = ext4_ext_handle_unwritten_extents( handle, inode, map, path, flags, allocated, newblock); - goto out3; + if (ret < 0) + err = ret; + else + allocated = ret; + goto out2; } } @@ -4203,7 +4372,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, */ map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; newex.ee_block = cpu_to_le32(map->m_lblk); - cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1); + cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); /* * If we are doing bigalloc, check to see if the extent returned @@ -4241,15 +4410,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* * See if request is beyond maximum number of blocks we can have in * a single extent. For an initialized extent this limit is - * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is - * EXT_UNINIT_MAX_LEN. + * EXT_INIT_MAX_LEN and for an unwritten extent this limit is + * EXT_UNWRITTEN_MAX_LEN. */ if (map->m_len > EXT_INIT_MAX_LEN && - !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) + !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) map->m_len = EXT_INIT_MAX_LEN; - else if (map->m_len > EXT_UNINIT_MAX_LEN && - (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) - map->m_len = EXT_UNINIT_MAX_LEN; + else if (map->m_len > EXT_UNWRITTEN_MAX_LEN && + (flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) + map->m_len = EXT_UNWRITTEN_MAX_LEN; /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ newex.ee_len = cpu_to_le16(map->m_len); @@ -4271,7 +4440,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * needed so that future calls to get_implied_cluster_alloc() * work correctly. */ - offset = map->m_lblk & (sbi->s_cluster_ratio - 1); + offset = EXT4_LBLK_COFF(sbi, map->m_lblk); ar.len = EXT4_NUM_B2C(sbi, offset+allocated); ar.goal -= offset; ar.logical -= offset; @@ -4297,21 +4466,19 @@ got_allocated_blocks: /* try to insert new extent into found leaf and return */ ext4_ext_store_pblock(&newex, newblock + offset); newex.ee_len = cpu_to_le16(ar.len); - /* Mark uninitialized */ - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ - ext4_ext_mark_uninitialized(&newex); + /* Mark unwritten */ + if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){ + ext4_ext_mark_unwritten(&newex); map->m_flags |= EXT4_MAP_UNWRITTEN; /* * io_end structure was created for every IO write to an - * uninitialized extent. To avoid unnecessary conversion, + * unwritten extent. To avoid unnecessary conversion, * here we flag the IO that really needs the conversion. * For non asycn direct IO case, flag the inode state * that we need to perform conversion when IO is done. */ - if ((flags & EXT4_GET_BLOCKS_PRE_IO)) + if (flags & EXT4_GET_BLOCKS_PRE_IO) set_unwritten = 1; - if (ext4_should_dioread_nolock(inode)) - map->m_flags |= EXT4_MAP_UNINIT; } err = 0; @@ -4438,9 +4605,9 @@ got_allocated_blocks: /* * Cache the extent and update transaction to commit on fdatasync only - * when it is _not_ an uninitialized extent. + * when it is _not_ an unwritten extent. */ - if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) + if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0) ext4_update_inode_fsync_trans(handle, inode, 1); else ext4_update_inode_fsync_trans(handle, inode, 0); @@ -4457,7 +4624,6 @@ out2: kfree(path); } -out3: trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated); ext4_es_lru_add(inode); @@ -4498,34 +4664,233 @@ retry: ext4_std_error(inode->i_sb, err); } -static void ext4_falloc_update_inode(struct inode *inode, - int mode, loff_t new_size, int update_ctime) +static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, + ext4_lblk_t len, loff_t new_size, + int flags, int mode) { - struct timespec now; + struct inode *inode = file_inode(file); + handle_t *handle; + int ret = 0; + int ret2 = 0; + int retries = 0; + struct ext4_map_blocks map; + unsigned int credits; + loff_t epos; + + map.m_lblk = offset; + map.m_len = len; + /* + * Don't normalize the request if it can fit in one extent so + * that it doesn't get unnecessarily split into multiple + * extents. + */ + if (len <= EXT_UNWRITTEN_MAX_LEN) + flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; - if (update_ctime) { - now = current_fs_time(inode->i_sb); - if (!timespec_equal(&inode->i_ctime, &now)) - inode->i_ctime = now; + /* + * credits to insert 1 extent into extent tree + */ + credits = ext4_chunk_trans_blocks(inode, len); + +retry: + while (ret >= 0 && len) { + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } + ret = ext4_map_blocks(handle, inode, &map, flags); + if (ret <= 0) { + ext4_debug("inode #%lu: block %u: len %u: " + "ext4_ext_map_blocks returned %d", + inode->i_ino, map.m_lblk, + map.m_len, ret); + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + break; + } + map.m_lblk += ret; + map.m_len = len = len - ret; + epos = (loff_t)map.m_lblk << inode->i_blkbits; + inode->i_ctime = ext4_current_time(inode); + if (new_size) { + if (epos > new_size) + epos = new_size; + if (ext4_update_inode_size(inode, epos) & 0x1) + inode->i_mtime = inode->i_ctime; + } else { + if (epos > inode->i_size) + ext4_set_inode_flag(inode, + EXT4_INODE_EOFBLOCKS); + } + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + if (ret2) + break; } + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) { + ret = 0; + goto retry; + } + + return ret > 0 ? ret2 : ret; +} + +static long ext4_zero_range(struct file *file, loff_t offset, + loff_t len, int mode) +{ + struct inode *inode = file_inode(file); + handle_t *handle = NULL; + unsigned int max_blocks; + loff_t new_size = 0; + int ret = 0; + int flags; + int credits; + int partial_begin, partial_end; + loff_t start, end; + ext4_lblk_t lblk; + struct address_space *mapping = inode->i_mapping; + unsigned int blkbits = inode->i_blkbits; + + trace_ext4_zero_range(inode, offset, len, mode); + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + /* Call ext4_force_commit to flush all data in case of data=journal. */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + if (ret) + return ret; + } + /* - * Update only when preallocation was requested beyond - * the file size. + * Write out all dirty pages to avoid race conditions + * Then release them. */ - if (!(mode & FALLOC_FL_KEEP_SIZE)) { - if (new_size > i_size_read(inode)) - i_size_write(inode, new_size); - if (new_size > EXT4_I(inode)->i_disksize) - ext4_update_i_disksize(inode, new_size); - } else { + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + ret = filemap_write_and_wait_range(mapping, offset, + offset + len - 1); + if (ret) + return ret; + } + + /* + * Round up offset. This is not fallocate, we neet to zero out + * blocks, so convert interior block aligned part of the range to + * unwritten and possibly manually zero out unaligned parts of the + * range. + */ + start = round_up(offset, 1 << blkbits); + end = round_down((offset + len), 1 << blkbits); + + if (start < offset || end > offset + len) + return -EINVAL; + partial_begin = offset & ((1 << blkbits) - 1); + partial_end = (offset + len) & ((1 << blkbits) - 1); + + lblk = start >> blkbits; + max_blocks = (end >> blkbits); + if (max_blocks < lblk) + max_blocks = 0; + else + max_blocks -= lblk; + + flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT | + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN; + if (mode & FALLOC_FL_KEEP_SIZE) + flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + + mutex_lock(&inode->i_mutex); + + /* + * Indirect files do not support unwritten extnets + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + ret = -EOPNOTSUPP; + goto out_mutex; + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + ret = inode_newsize_ok(inode, new_size); + if (ret) + goto out_mutex; + /* + * If we have a partial block after EOF we have to allocate + * the entire block. + */ + if (partial_end) + max_blocks += 1; + } + + if (max_blocks > 0) { + + /* Now release the pages and zero block aligned part of pages*/ + truncate_pagecache_range(inode, start, end - 1); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + /* - * Mark that we allocate beyond EOF so the subsequent truncate - * can proceed even if the new size is the same as i_size. + * Remove entire range from the extent status tree. */ - if (new_size > i_size_read(inode)) + ret = ext4_es_remove_extent(inode, lblk, max_blocks); + if (ret) + goto out_dio; + + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, + flags, mode); + if (ret) + goto out_dio; + } + if (!partial_begin && !partial_end) + goto out_dio; + + /* + * In worst case we have to writeout two nonadjacent unwritten + * blocks and update the inode + */ + credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1; + if (ext4_should_journal_data(inode)) + credits += 2; + handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + ext4_std_error(inode->i_sb, ret); + goto out_dio; + } + + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + if (new_size) { + ext4_update_inode_size(inode, new_size); + } else { + /* + * Mark that we allocate beyond EOF so the subsequent truncate + * can proceed even if the new size is the same as i_size. + */ + if ((offset + len) > i_size_read(inode)) ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); } + ext4_mark_inode_dirty(handle, inode); + + /* Zero out partial block at the edges of the range */ + ret = ext4_zero_partial_blocks(handle, inode, offset, len); + + if (file->f_flags & O_SYNC) + ext4_handle_sync(handle); + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; } /* @@ -4538,18 +4903,16 @@ static void ext4_falloc_update_inode(struct inode *inode, long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); - handle_t *handle; - loff_t new_size; + loff_t new_size = 0; unsigned int max_blocks; int ret = 0; - int ret2 = 0; - int retries = 0; int flags; - struct ext4_map_blocks map; - unsigned int credits, blkbits = inode->i_blkbits; + ext4_lblk_t lblk; + unsigned int blkbits = inode->i_blkbits; /* Return error if mode is not supported */ - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) @@ -4566,83 +4929,48 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; + if (mode & FALLOC_FL_COLLAPSE_RANGE) + return ext4_collapse_range(inode, offset, len); + + if (mode & FALLOC_FL_ZERO_RANGE) + return ext4_zero_range(file, offset, len, mode); + trace_ext4_fallocate_enter(inode, offset, len, mode); - map.m_lblk = offset >> blkbits; + lblk = offset >> blkbits; /* * We can't just convert len to max_blocks because * If blocksize = 4096 offset = 3072 and len = 2048 */ max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - - map.m_lblk; - /* - * credits to insert 1 extent into extent tree - */ - credits = ext4_chunk_trans_blocks(inode, max_blocks); - mutex_lock(&inode->i_mutex); - ret = inode_newsize_ok(inode, (len + offset)); - if (ret) { - mutex_unlock(&inode->i_mutex); - trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); - return ret; - } - flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT; + - lblk; + + flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; - /* - * Don't normalize the request if it can fit in one extent so - * that it doesn't get unnecessarily split into multiple - * extents. - */ - if (len <= EXT_UNINIT_MAX_LEN << blkbits) - flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; -retry: - while (ret >= 0 && ret < max_blocks) { - map.m_lblk = map.m_lblk + ret; - map.m_len = max_blocks = max_blocks - ret; - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, - credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - break; - } - ret = ext4_map_blocks(handle, inode, &map, flags); - if (ret <= 0) { -#ifdef EXT4FS_DEBUG - ext4_warning(inode->i_sb, - "inode #%lu: block %u: len %u: " - "ext4_ext_map_blocks returned %d", - inode->i_ino, map.m_lblk, - map.m_len, ret); -#endif - ext4_mark_inode_dirty(handle, inode); - ret2 = ext4_journal_stop(handle); - break; - } - if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len, - blkbits) >> blkbits)) - new_size = offset + len; - else - new_size = ((loff_t) map.m_lblk + ret) << blkbits; + mutex_lock(&inode->i_mutex); - ext4_falloc_update_inode(inode, mode, new_size, - (map.m_flags & EXT4_MAP_NEW)); - ext4_mark_inode_dirty(handle, inode); - if ((file->f_flags & O_SYNC) && ret >= max_blocks) - ext4_handle_sync(handle); - ret2 = ext4_journal_stop(handle); - if (ret2) - break; + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + ret = inode_newsize_ok(inode, new_size); + if (ret) + goto out; } - if (ret == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) { - ret = 0; - goto retry; + + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, + flags, mode); + if (ret) + goto out; + + if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { + ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal, + EXT4_I(inode)->i_sync_tid); } +out: mutex_unlock(&inode->i_mutex); - trace_ext4_fallocate_exit(inode, offset, max_blocks, - ret > 0 ? ret2 : ret); - return ret > 0 ? ret2 : ret; + trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); + return ret; } /* @@ -4853,3 +5181,330 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, ext4_es_lru_add(inode); return error; } + +/* + * ext4_access_path: + * Function to access the path buffer for marking it dirty. + * It also checks if there are sufficient credits left in the journal handle + * to update path. + */ +static int +ext4_access_path(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + int credits, err; + + if (!ext4_handle_valid(handle)) + return 0; + + /* + * Check if need to extend journal credits + * 3 for leaf, sb, and inode plus 2 (bmap and group + * descriptor) for each block group; assume two block + * groups + */ + if (handle->h_buffer_credits < 7) { + credits = ext4_writepage_trans_blocks(inode); + err = ext4_ext_truncate_extend_restart(handle, inode, credits); + /* EAGAIN is success */ + if (err && err != -EAGAIN) + return err; + } + + err = ext4_ext_get_access(handle, inode, path); + return err; +} + +/* + * ext4_ext_shift_path_extents: + * Shift the extents of a path structure lying between path[depth].p_ext + * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift + * from starting block for each extent. + */ +static int +ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, + struct inode *inode, handle_t *handle, + ext4_lblk_t *start) +{ + int depth, err = 0; + struct ext4_extent *ex_start, *ex_last; + bool update = 0; + depth = path->p_depth; + + while (depth >= 0) { + if (depth == path->p_depth) { + ex_start = path[depth].p_ext; + if (!ex_start) + return -EIO; + + ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); + if (!ex_last) + return -EIO; + + err = ext4_access_path(handle, inode, path + depth); + if (err) + goto out; + + if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) + update = 1; + + *start = le32_to_cpu(ex_last->ee_block) + + ext4_ext_get_actual_len(ex_last); + + while (ex_start <= ex_last) { + le32_add_cpu(&ex_start->ee_block, -shift); + /* Try to merge to the left. */ + if ((ex_start > + EXT_FIRST_EXTENT(path[depth].p_hdr)) && + ext4_ext_try_to_merge_right(inode, + path, ex_start - 1)) + ex_last--; + else + ex_start++; + } + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + if (--depth < 0 || !update) + break; + } + + /* Update index too */ + err = ext4_access_path(handle, inode, path + depth); + if (err) + goto out; + + le32_add_cpu(&path[depth].p_idx->ei_block, -shift); + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + /* we are done if current index is not a starting index */ + if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr)) + break; + + depth--; + } + +out: + return err; +} + +/* + * ext4_ext_shift_extents: + * All the extents which lies in the range from start to the last allocated + * block for the file are shifted downwards by shift blocks. + * On success, 0 is returned, error otherwise. + */ +static int +ext4_ext_shift_extents(struct inode *inode, handle_t *handle, + ext4_lblk_t start, ext4_lblk_t shift) +{ + struct ext4_ext_path *path; + int ret = 0, depth; + struct ext4_extent *extent; + ext4_lblk_t stop_block, current_block; + ext4_lblk_t ex_start, ex_end; + + /* Let path point to the last extent */ + path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + + depth = path->p_depth; + extent = path[depth].p_ext; + if (!extent) { + ext4_ext_drop_refs(path); + kfree(path); + return ret; + } + + stop_block = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); + ext4_ext_drop_refs(path); + kfree(path); + + /* Nothing to shift, if hole is at the end of file */ + if (start >= stop_block) + return ret; + + /* + * Don't start shifting extents until we make sure the hole is big + * enough to accomodate the shift. + */ + path = ext4_ext_find_extent(inode, start - 1, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = path->p_depth; + extent = path[depth].p_ext; + if (extent) { + ex_start = le32_to_cpu(extent->ee_block); + ex_end = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); + } else { + ex_start = 0; + ex_end = 0; + } + ext4_ext_drop_refs(path); + kfree(path); + + if ((start == ex_start && shift > ex_start) || + (shift > start - ex_end)) + return -EINVAL; + + /* Its safe to start updating extents */ + while (start < stop_block) { + path = ext4_ext_find_extent(inode, start, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = path->p_depth; + extent = path[depth].p_ext; + if (!extent) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) start); + return -EIO; + } + + current_block = le32_to_cpu(extent->ee_block); + if (start > current_block) { + /* Hole, move to the next extent */ + ret = mext_next_extent(inode, path, &extent); + if (ret != 0) { + ext4_ext_drop_refs(path); + kfree(path); + if (ret == 1) + ret = 0; + break; + } + } + ret = ext4_ext_shift_path_extents(path, shift, inode, + handle, &start); + ext4_ext_drop_refs(path); + kfree(path); + if (ret) + break; + } + + return ret; +} + +/* + * ext4_collapse_range: + * This implements the fallocate's collapse range functionality for ext4 + * Returns: 0 and non-zero on error. + */ +int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) +{ + struct super_block *sb = inode->i_sb; + ext4_lblk_t punch_start, punch_stop; + handle_t *handle; + unsigned int credits; + loff_t new_size, ioffset; + int ret; + + /* Collapse range works only on fs block size aligned offsets. */ + if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) || + len & (EXT4_CLUSTER_SIZE(sb) - 1)) + return -EINVAL; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + trace_ext4_collapse_range(inode, offset, len); + + punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); + punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); + + /* Call ext4_force_commit to flush all data in case of data=journal. */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + if (ret) + return ret; + } + + /* + * Need to round down offset to be aligned with page size boundary + * for page size > block size. + */ + ioffset = round_down(offset, PAGE_SIZE); + + /* Write out all dirty pages */ + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, + LLONG_MAX); + if (ret) + return ret; + + /* Take mutex lock */ + mutex_lock(&inode->i_mutex); + + /* + * There is no need to overlap collapse range with EOF, in which case + * it is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) { + ret = -EINVAL; + goto out_mutex; + } + + /* Currently just for extent based files */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + ret = -EOPNOTSUPP; + goto out_mutex; + } + + truncate_pagecache(inode, ioffset); + + /* Wait for existing dio to complete */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + credits = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_dio; + } + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + ret = ext4_es_remove_extent(inode, punch_start, + EXT_MAX_BLOCKS - punch_start); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + ext4_discard_preallocations(inode); + + ret = ext4_ext_shift_extents(inode, handle, punch_stop, + punch_stop - punch_start); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + new_size = i_size_read(inode) - len; + i_size_write(inode, new_size); + EXT4_I(inode)->i_disksize = new_size; + + up_write(&EXT4_I(inode)->i_data_sem); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + +out_stop: + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; +} |