/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * alloc.c
 *
 * Extent allocs and frees
 *
 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>

#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>

#include "ocfs2.h"

#include "alloc.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "inode.h"
#include "journal.h"
#include "localalloc.h"
#include "suballoc.h"
#include "sysfile.h"
#include "file.h"
#include "super.h"
#include "uptodate.h"

#include "buffer_head_io.h"

static int ocfs2_extent_contig(struct inode *inode,
			       struct ocfs2_extent_rec *ext,
			       u64 blkno);

static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
				     handle_t *handle,
				     struct inode *inode,
				     int wanted,
				     struct ocfs2_alloc_context *meta_ac,
				     struct buffer_head *bhs[]);

static int ocfs2_add_branch(struct ocfs2_super *osb,
			    handle_t *handle,
			    struct inode *inode,
			    struct buffer_head *fe_bh,
			    struct buffer_head *eb_bh,
			    struct buffer_head *last_eb_bh,
			    struct ocfs2_alloc_context *meta_ac);

static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
				  handle_t *handle,
				  struct inode *inode,
				  struct buffer_head *fe_bh,
				  struct ocfs2_alloc_context *meta_ac,
				  struct buffer_head **ret_new_eb_bh);

static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
				  handle_t *handle,
				  struct inode *inode,
				  struct buffer_head *fe_bh,
				  u64 blkno,
				  u32 new_clusters);

static int ocfs2_find_branch_target(struct ocfs2_super *osb,
				    struct inode *inode,
				    struct buffer_head *fe_bh,
				    struct buffer_head **target_bh);

static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
				       struct inode *inode,
				       struct ocfs2_dinode *fe,
				       unsigned int new_i_clusters,
				       struct buffer_head *old_last_eb,
				       struct buffer_head **new_last_eb);

static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);

static int ocfs2_extent_contig(struct inode *inode,
			       struct ocfs2_extent_rec *ext,
			       u64 blkno)
{
	return blkno == (le64_to_cpu(ext->e_blkno) +
			 ocfs2_clusters_to_blocks(inode->i_sb,
						  le32_to_cpu(ext->e_clusters)));
}
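/*
 * Example (illustrative, not from the original source): with 4k
 * clusters on 512 byte blocks (8 blocks per cluster), a record with
 * e_blkno == 800 and e_clusters == 4 covers blocks 800..831, so only
 * a new allocation starting at block 832 counts as contiguous.
 */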
/*
 * How many free extents have we got before we need more metadata?
 */
int ocfs2_num_free_extents(struct ocfs2_super *osb,
			   struct inode *inode,
			   struct ocfs2_dinode *fe)
{
	int retval;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_block *eb;
	struct buffer_head *eb_bh = NULL;

	mlog_entry_void();

	if (!OCFS2_IS_VALID_DINODE(fe)) {
		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
		retval = -EIO;
		goto bail;
	}

	if (fe->i_last_eb_blk) {
		retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
					  &eb_bh, OCFS2_BH_CACHED, inode);
		if (retval < 0) {
			mlog_errno(retval);
			goto bail;
		}
		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
		el = &eb->h_list;
	} else
		el = &fe->id2.i_list;

	BUG_ON(el->l_tree_depth != 0);

	retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
bail:
	if (eb_bh)
		brelse(eb_bh);

	mlog_exit(retval);
	return retval;
}

/* expects array to already be allocated
 *
 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
 * l_count for you
 */
static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
				     handle_t *handle,
				     struct inode *inode,
				     int wanted,
				     struct ocfs2_alloc_context *meta_ac,
				     struct buffer_head *bhs[])
{
	int count, status, i;
	u16 suballoc_bit_start;
	u32 num_got;
	u64 first_blkno;
	struct ocfs2_extent_block *eb;

	mlog_entry_void();

	count = 0;
	while (count < wanted) {
		status = ocfs2_claim_metadata(osb,
					      handle,
					      meta_ac,
					      wanted - count,
					      &suballoc_bit_start,
					      &num_got,
					      &first_blkno);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		for (i = count; i < (num_got + count); i++) {
			bhs[i] = sb_getblk(osb->sb, first_blkno);
			if (bhs[i] == NULL) {
				status = -EIO;
				mlog_errno(status);
				goto bail;
			}
			ocfs2_set_new_buffer_uptodate(inode, bhs[i]);

			status = ocfs2_journal_access(handle, inode, bhs[i],
						      OCFS2_JOURNAL_ACCESS_CREATE);
			if (status < 0) {
				mlog_errno(status);
				goto bail;
			}

			memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
			eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
			/* Ok, setup the minimal stuff here. */
			strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
			eb->h_blkno = cpu_to_le64(first_blkno);
			eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
			/* we always use slot zero's suballocator */
			eb->h_suballoc_slot = 0;
#else
			eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
#endif
			eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
			eb->h_list.l_count =
				cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));

			suballoc_bit_start++;
			first_blkno++;

			/* We'll also be dirtied by the caller, so
			 * this isn't absolutely necessary. */
			status = ocfs2_journal_dirty(handle, bhs[i]);
			if (status < 0) {
				mlog_errno(status);
				goto bail;
			}
		}

		count += num_got;
	}

	status = 0;
bail:
	if (status < 0) {
		for (i = 0; i < wanted; i++) {
			if (bhs[i])
				brelse(bhs[i]);
			bhs[i] = NULL;
		}
	}
	mlog_exit(status);
	return status;
}
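/*
 * Example (illustrative, hypothetical numbers): ocfs2_num_free_extents()
 * looks at the rightmost list only - if that list had, say,
 * l_count == 200 and l_next_free_rec == 197, three more extents could
 * be inserted before new metadata blocks must be claimed via
 * ocfs2_create_new_meta_bhs() above.
 */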
/*
 * Add an entire tree branch to our inode. eb_bh is the extent block
 * to start at, if we don't want to start the branch at the dinode
 * structure.
 *
 * last_eb_bh is required as we have to update its next_leaf pointer
 * for the new last extent block.
 *
 * the new branch will be 'empty' in the sense that every block will
 * contain a single record with e_clusters == 0.
 */
static int ocfs2_add_branch(struct ocfs2_super *osb,
			    handle_t *handle,
			    struct inode *inode,
			    struct buffer_head *fe_bh,
			    struct buffer_head *eb_bh,
			    struct buffer_head *last_eb_bh,
			    struct ocfs2_alloc_context *meta_ac)
{
	int status, new_blocks, i;
	u64 next_blkno, new_last_eb_blk;
	struct buffer_head *bh;
	struct buffer_head **new_eb_bhs = NULL;
	struct ocfs2_dinode *fe;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list  *eb_el;
	struct ocfs2_extent_list  *el;

	mlog_entry_void();

	BUG_ON(!last_eb_bh);

	fe = (struct ocfs2_dinode *) fe_bh->b_data;

	if (eb_bh) {
		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
		el = &eb->h_list;
	} else
		el = &fe->id2.i_list;

	/* we never add a branch to a leaf. */
	BUG_ON(!el->l_tree_depth);

	new_blocks = le16_to_cpu(el->l_tree_depth);

	/* allocate the number of new eb blocks we need */
	new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
			     GFP_KERNEL);
	if (!new_eb_bhs) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
					   meta_ac, new_eb_bhs);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
	 * linked with the rest of the tree.
	 * conversely, new_eb_bhs[0] is the new bottommost leaf.
	 *
	 * when we leave the loop, new_last_eb_blk will point to the
	 * newest leaf, and next_blkno will point to the topmost extent
	 * block. */
	next_blkno = new_last_eb_blk = 0;
	for (i = 0; i < new_blocks; i++) {
		bh = new_eb_bhs[i];
		eb = (struct ocfs2_extent_block *) bh->b_data;
		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
			status = -EIO;
			goto bail;
		}
		eb_el = &eb->h_list;

		status = ocfs2_journal_access(handle, inode, bh,
					      OCFS2_JOURNAL_ACCESS_CREATE);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		eb->h_next_leaf_blk = 0;
		eb_el->l_tree_depth = cpu_to_le16(i);
		eb_el->l_next_free_rec = cpu_to_le16(1);
		eb_el->l_recs[0].e_cpos = fe->i_clusters;
		eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
		eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
		if (!eb_el->l_tree_depth)
			new_last_eb_blk = le64_to_cpu(eb->h_blkno);

		status = ocfs2_journal_dirty(handle, bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		next_blkno = le64_to_cpu(eb->h_blkno);
	}

	/* This is a bit hairy. We want to update up to three blocks
	 * here without leaving any of them in an inconsistent state
	 * in case of error. We don't have to worry about
	 * journal_dirty erroring as it won't unless we've aborted the
	 * handle (in which case we would never be here) so reserving
	 * the write with journal_access is all we need to do. */
	status = ocfs2_journal_access(handle, inode, last_eb_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	status = ocfs2_journal_access(handle, inode, fe_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	if (eb_bh) {
		status = ocfs2_journal_access(handle, inode, eb_bh,
					      OCFS2_JOURNAL_ACCESS_WRITE);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* Link the new branch into the rest of the tree (el will
	 * either be on the fe, or the extent block passed in). */
	i = le16_to_cpu(el->l_next_free_rec);
	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
	el->l_recs[i].e_cpos = fe->i_clusters;
	el->l_recs[i].e_clusters = 0;
	le16_add_cpu(&el->l_next_free_rec, 1);

	/* fe needs a new last extent block pointer, as does the
	 * next_leaf on the previously last-extent-block. */
	fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);

	eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);

	status = ocfs2_journal_dirty(handle, last_eb_bh);
	if (status < 0)
		mlog_errno(status);
	status = ocfs2_journal_dirty(handle, fe_bh);
	if (status < 0)
		mlog_errno(status);
	if (eb_bh) {
		status = ocfs2_journal_dirty(handle, eb_bh);
		if (status < 0)
			mlog_errno(status);
	}

	status = 0;
bail:
	if (new_eb_bhs) {
		for (i = 0; i < new_blocks; i++)
			if (new_eb_bhs[i])
				brelse(new_eb_bhs[i]);
		kfree(new_eb_bhs);
	}

	mlog_exit(status);
	return status;
}

/*
 * adds another level to the allocation tree.
 * returns back the new extent block so you can add a branch to it
 * after this call.
 */
static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
				  handle_t *handle,
				  struct inode *inode,
				  struct buffer_head *fe_bh,
				  struct ocfs2_alloc_context *meta_ac,
				  struct buffer_head **ret_new_eb_bh)
{
	int status, i;
	struct buffer_head *new_eb_bh = NULL;
	struct ocfs2_dinode *fe;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list  *fe_el;
	struct ocfs2_extent_list  *eb_el;

	mlog_entry_void();

	status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
					   &new_eb_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
		status = -EIO;
		goto bail;
	}

	eb_el = &eb->h_list;
	fe = (struct ocfs2_dinode *) fe_bh->b_data;
	fe_el = &fe->id2.i_list;

	status = ocfs2_journal_access(handle, inode, new_eb_bh,
				      OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* copy the fe data into the new extent block */
	eb_el->l_tree_depth = fe_el->l_tree_depth;
	eb_el->l_next_free_rec = fe_el->l_next_free_rec;
	for (i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
		eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
		eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
		eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
	}

	status = ocfs2_journal_dirty(handle, new_eb_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_journal_access(handle, inode, fe_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* update fe now */
	le16_add_cpu(&fe_el->l_tree_depth, 1);
	fe_el->l_recs[0].e_cpos = 0;
	fe_el->l_recs[0].e_blkno = eb->h_blkno;
	fe_el->l_recs[0].e_clusters = fe->i_clusters;
	for (i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
		fe_el->l_recs[i].e_cpos = 0;
		fe_el->l_recs[i].e_clusters = 0;
		fe_el->l_recs[i].e_blkno = 0;
	}
	fe_el->l_next_free_rec = cpu_to_le16(1);

	/* If this is our 1st tree depth shift, then last_eb_blk
	 * becomes the allocated extent block */
	if (fe_el->l_tree_depth == cpu_to_le16(1))
		fe->i_last_eb_blk = eb->h_blkno;

	status = ocfs2_journal_dirty(handle, fe_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	*ret_new_eb_bh = new_eb_bh;
	new_eb_bh = NULL;
	status = 0;
bail:
	if (new_eb_bh)
		brelse(new_eb_bh);

	mlog_exit(status);
	return status;
}
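/*
 * Example (illustrative): shifting a depth-0 inode whose embedded
 * list holds records R0..Rn copies R0..Rn into the freshly claimed
 * extent block, then leaves the dinode at depth 1 with a single
 * record (e_cpos 0, e_blkno of the new block, e_clusters ==
 * i_clusters) pointing down at it.
 */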
/*
 * Expects the tree to already have room in the rightmost leaf for the
 * extent.  Updates all the extent blocks (and the dinode) on the way
 * down.
 */
static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
				  handle_t *handle,
				  struct inode *inode,
				  struct buffer_head *fe_bh,
				  u64 start_blk,
				  u32 new_clusters)
{
	int status, i, num_bhs = 0;
	u64 next_blkno;
	u16 next_free;
	struct buffer_head **eb_bhs = NULL;
	struct ocfs2_dinode *fe;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list  *el;

	mlog_entry_void();

	status = ocfs2_journal_access(handle, inode, fe_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	fe = (struct ocfs2_dinode *) fe_bh->b_data;
	el = &fe->id2.i_list;
	if (el->l_tree_depth) {
		/* This is another operation where we want to be
		 * careful about our tree updates. An error here means
		 * none of the previous changes we made should roll
		 * forward. As a result, we have to record the buffers
		 * for this part of the tree in an array and reserve a
		 * journal write to them before making any changes. */
		num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
		eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
				 GFP_KERNEL);
		if (!eb_bhs) {
			status = -ENOMEM;
			mlog_errno(status);
			goto bail;
		}

		i = 0;
		while (el->l_tree_depth) {
			next_free = le16_to_cpu(el->l_next_free_rec);
			if (next_free == 0) {
				ocfs2_error(inode->i_sb,
					    "Dinode %llu has a bad extent list",
					    (unsigned long long)OCFS2_I(inode)->ip_blkno);
				status = -EIO;
				goto bail;
			}
			next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);

			BUG_ON(i >= num_bhs);
			status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
						  OCFS2_BH_CACHED, inode);
			if (status < 0) {
				mlog_errno(status);
				goto bail;
			}
			eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
			if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
				OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
								 eb);
				status = -EIO;
				goto bail;
			}

			status = ocfs2_journal_access(handle, inode, eb_bhs[i],
						      OCFS2_JOURNAL_ACCESS_WRITE);
			if (status < 0) {
				mlog_errno(status);
				goto bail;
			}

			el = &eb->h_list;
			i++;
			/* When we leave this loop, eb_bhs[num_bhs - 1] will
			 * hold the bottom-most leaf extent block. */
		}
		BUG_ON(el->l_tree_depth);

		el = &fe->id2.i_list;
		/* If we have tree depth, then the fe update is
		 * trivial, and we want to switch el out for the
		 * bottom-most leaf in order to update it with the
		 * actual extent data below. */
		next_free = le16_to_cpu(el->l_next_free_rec);
		if (next_free == 0) {
			ocfs2_error(inode->i_sb,
				    "Dinode %llu has a bad extent list",
				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
			status = -EIO;
			goto bail;
		}
		le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
			     new_clusters);

		/* (num_bhs - 1) to avoid the leaf */
		for (i = 0; i < (num_bhs - 1); i++) {
			eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
			el = &eb->h_list;

			/* finally, make our actual change to the
			 * intermediate extent blocks. */
			next_free = le16_to_cpu(el->l_next_free_rec);
			le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
				     new_clusters);

			status = ocfs2_journal_dirty(handle, eb_bhs[i]);
			if (status < 0)
				mlog_errno(status);
		}
		BUG_ON(i != (num_bhs - 1));
		/* note that the leaf block wasn't touched in
		 * the loop above */
		eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
		el = &eb->h_list;
		BUG_ON(el->l_tree_depth);
	}

	/* yay, we can finally add the actual extent now! */
	i = le16_to_cpu(el->l_next_free_rec) - 1;
	if (le16_to_cpu(el->l_next_free_rec) &&
	    ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) {
		le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters);
	} else if (le16_to_cpu(el->l_next_free_rec) &&
		   (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) {
		/* having an empty extent at eof is legal. */
		if (el->l_recs[i].e_cpos != fe->i_clusters) {
			ocfs2_error(inode->i_sb,
				    "Dinode %llu trailing extent is bad: "
				    "cpos (%u) != number of clusters (%u)",
				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
				    le32_to_cpu(el->l_recs[i].e_cpos),
				    le32_to_cpu(fe->i_clusters));
			status = -EIO;
			goto bail;
		}
		el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
		el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
	} else {
		/* No contiguous record, or no empty record at eof, so
		 * we add a new one. */

		BUG_ON(le16_to_cpu(el->l_next_free_rec) >=
		       le16_to_cpu(el->l_count));

		i = le16_to_cpu(el->l_next_free_rec);
		el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
		el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
		el->l_recs[i].e_cpos = fe->i_clusters;
		le16_add_cpu(&el->l_next_free_rec, 1);
	}

	/*
	 * extent_map errors are not fatal, so they are ignored outside
	 * of flushing the thing.
	 */
	status = ocfs2_extent_map_append(inode, &el->l_recs[i],
					 new_clusters);
	if (status) {
		mlog_errno(status);
		ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters));
	}

	status = ocfs2_journal_dirty(handle, fe_bh);
	if (status < 0)
		mlog_errno(status);
	if (fe->id2.i_list.l_tree_depth) {
		status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]);
		if (status < 0)
			mlog_errno(status);
	}

	status = 0;
bail:
	if (eb_bhs) {
		for (i = 0; i < num_bhs; i++)
			if (eb_bhs[i])
				brelse(eb_bhs[i]);
		kfree(eb_bhs);
	}

	mlog_exit(status);
	return status;
}
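/*
 * Example (illustrative): appending 4 clusters at block B to a leaf
 * whose last record ends exactly at B simply grows that record's
 * e_clusters by 4; an empty record at eof is reused in place;
 * otherwise a fresh record (e_cpos == i_clusters, e_blkno == B,
 * e_clusters == 4) is consumed from the leaf.
 */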
/*
 * Should only be called when there is no space left in any of the
 * leaf nodes. What we want to do is find the lowest tree depth
 * non-leaf extent block with room for new records. There are three
 * valid results of this search:
 *
 * 1) a lowest extent block is found, then we pass it back in
 *    *lowest_eb_bh and return '0'
 *
 * 2) the search fails to find anything, but the dinode has room. We
 *    pass NULL back in *lowest_eb_bh, but still return '0'
 *
 * 3) the search fails to find anything AND the dinode is full, in
 *    which case we return > 0
 *
 * return status < 0 indicates an error.
 */
static int ocfs2_find_branch_target(struct ocfs2_super *osb,
				    struct inode *inode,
				    struct buffer_head *fe_bh,
				    struct buffer_head **target_bh)
{
	int status = 0, i;
	u64 blkno;
	struct ocfs2_dinode *fe;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list  *el;
	struct buffer_head *bh = NULL;
	struct buffer_head *lowest_bh = NULL;

	mlog_entry_void();

	*target_bh = NULL;

	fe = (struct ocfs2_dinode *) fe_bh->b_data;
	el = &fe->id2.i_list;

	while (le16_to_cpu(el->l_tree_depth) > 1) {
		if (le16_to_cpu(el->l_next_free_rec) == 0) {
			ocfs2_error(inode->i_sb, "Dinode %llu has empty "
				    "extent list (next_free_rec == 0)",
				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
			status = -EIO;
			goto bail;
		}
		i = le16_to_cpu(el->l_next_free_rec) - 1;
		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
		if (!blkno) {
			ocfs2_error(inode->i_sb, "Dinode %llu has extent "
				    "list where extent # %d has no physical "
				    "block start",
				    (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
			status = -EIO;
			goto bail;
		}

		if (bh) {
			brelse(bh);
			bh = NULL;
		}

		status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
					  inode);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		eb = (struct ocfs2_extent_block *) bh->b_data;
		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
			status = -EIO;
			goto bail;
		}
		el = &eb->h_list;

		if (le16_to_cpu(el->l_next_free_rec) <
		    le16_to_cpu(el->l_count)) {
			if (lowest_bh)
				brelse(lowest_bh);
			lowest_bh = bh;
			get_bh(lowest_bh);
		}
	}

	/* If we didn't find one and the fe doesn't have any room,
	 * then return '1' */
	if (!lowest_bh
	    && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
		status = 1;

	*target_bh = lowest_bh;
bail:
	if (bh)
		brelse(bh);

	mlog_exit(status);
	return status;
}

/* the caller needs to update fe->i_clusters */
int ocfs2_insert_extent(struct ocfs2_super *osb,
			handle_t *handle,
			struct inode *inode,
			struct buffer_head *fe_bh,
			u64 start_blk,
			u32 new_clusters,
			struct ocfs2_alloc_context *meta_ac)
{
	int status, i, shift;
	struct buffer_head *last_eb_bh = NULL;
	struct buffer_head *bh = NULL;
	struct ocfs2_dinode *fe;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list  *el;

	mlog_entry_void();

	mlog(0, "add %u clusters starting at block %llu to inode %llu\n",
	     new_clusters, (unsigned long long)start_blk,
	     (unsigned long long)OCFS2_I(inode)->ip_blkno);

	fe = (struct ocfs2_dinode *) fe_bh->b_data;
	el = &fe->id2.i_list;

	if (el->l_tree_depth) {
		/* jump to end of tree */
		status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
					  &last_eb_bh, OCFS2_BH_CACHED, inode);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
		el = &eb->h_list;
	}

	/* Can we allocate without adding/shifting tree bits? */
	i = le16_to_cpu(el->l_next_free_rec) - 1;
	if (le16_to_cpu(el->l_next_free_rec) == 0
	    || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count))
	    || le32_to_cpu(el->l_recs[i].e_clusters) == 0
	    || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk))
		goto out_add;

	mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing "
	     "tree now.\n");

	shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
	if (shift < 0) {
		status = shift;
		mlog_errno(status);
		goto bail;
	}

	/* We traveled all the way to the bottom of the allocation tree
	 * and didn't find room for any more extents - we need to add
	 * another tree level */
	if (shift) {
		/* if we hit a leaf, we'd better be empty :) */
		BUG_ON(le16_to_cpu(el->l_next_free_rec) !=
		       le16_to_cpu(el->l_count));
		BUG_ON(bh);
		mlog(0, "ocfs2_allocate_extent: need to shift tree depth "
		     "(current = %u)\n",
		     le16_to_cpu(fe->id2.i_list.l_tree_depth));

		/* ocfs2_shift_tree_depth will return us a buffer with
		 * the new extent block (so we can pass that to
		 * ocfs2_add_branch). */
		status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
						meta_ac, &bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
		/* Special case: we have room now if we shifted from
		 * tree_depth 0 */
		if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1))
			goto out_add;
	}

	/* call ocfs2_add_branch to add the final part of the tree with
	 * the new data. */
	mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh);
	status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
				  meta_ac);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

out_add:
	/* Finally, we can add clusters. */
	status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh,
					start_blk, new_clusters);
	if (status < 0)
		mlog_errno(status);

bail:
	if (bh)
		brelse(bh);

	if (last_eb_bh)
		brelse(last_eb_bh);

	mlog_exit(status);
	return status;
}

static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
{
	struct buffer_head *tl_bh = osb->osb_tl_bh;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;

	di = (struct ocfs2_dinode *) tl_bh->b_data;
	tl = &di->id2.i_dealloc;

	mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
			"slot %d, invalid truncate log parameters: used = "
			"%u, count = %u\n", osb->slot_num,
			le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
	return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
}

static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
					   unsigned int new_start)
{
	unsigned int tail_index;
	unsigned int current_tail;

	/* No records, nothing to coalesce */
	if (!le16_to_cpu(tl->tl_used))
		return 0;

	tail_index = le16_to_cpu(tl->tl_used) - 1;
	current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
	current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);

	return current_tail == new_start;
}
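/*
 * Example (illustrative): if the tail record logs a run starting at
 * cluster 1000 with t_clusters == 50, a new free run starting at
 * cluster 1050 coalesces into that record instead of consuming a new
 * truncate log slot.
 */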
static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
				     handle_t *handle,
				     u64 start_blk,
				     unsigned int num_clusters)
{
	int status, index;
	unsigned int start_cluster, tl_count;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct buffer_head *tl_bh = osb->osb_tl_bh;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;

	mlog_entry("start_blk = %llu, num_clusters = %u\n",
		   (unsigned long long)start_blk, num_clusters);

	BUG_ON(mutex_trylock(&tl_inode->i_mutex));

	start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);

	di = (struct ocfs2_dinode *) tl_bh->b_data;
	tl = &di->id2.i_dealloc;
	if (!OCFS2_IS_VALID_DINODE(di)) {
		OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
		status = -EIO;
		goto bail;
	}

	tl_count = le16_to_cpu(tl->tl_count);
	mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
			tl_count == 0,
			"Truncate record count on #%llu invalid "
			"wanted %u, actual %u\n",
			(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
			ocfs2_truncate_recs_per_inode(osb->sb),
			le16_to_cpu(tl->tl_count));

	/* Caller should have known to flush before calling us. */
	index = le16_to_cpu(tl->tl_used);
	if (index >= tl_count) {
		status = -ENOSPC;
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_journal_access(handle, tl_inode, tl_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	mlog(0, "Log truncate of %u clusters starting at cluster %u to "
	     "%llu (index = %d)\n", num_clusters, start_cluster,
	     (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);

	if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
		/*
		 * Move index back to the record we are coalescing with.
		 * ocfs2_truncate_log_can_coalesce() guarantees nonzero
		 */
		index--;

		num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
		mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
		     index, le32_to_cpu(tl->tl_recs[index].t_start),
		     num_clusters);
	} else {
		tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
		tl->tl_used = cpu_to_le16(index + 1);
	}
	tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);

	status = ocfs2_journal_dirty(handle, tl_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

bail:
	mlog_exit(status);
	return status;
}

static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
					 handle_t *handle,
					 struct inode *data_alloc_inode,
					 struct buffer_head *data_alloc_bh)
{
	int status = 0;
	int i;
	unsigned int num_clusters;
	u64 start_blk;
	struct ocfs2_truncate_rec rec;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct buffer_head *tl_bh = osb->osb_tl_bh;

	mlog_entry_void();

	di = (struct ocfs2_dinode *) tl_bh->b_data;
	tl = &di->id2.i_dealloc;
	i = le16_to_cpu(tl->tl_used) - 1;
	while (i >= 0) {
		/* Caller has given us at least enough credits to
		 * update the truncate log dinode */
		status = ocfs2_journal_access(handle, tl_inode, tl_bh,
					      OCFS2_JOURNAL_ACCESS_WRITE);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		tl->tl_used = cpu_to_le16(i);

		status = ocfs2_journal_dirty(handle, tl_bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		/* TODO: Perhaps we can calculate the bulk of the
		 * credits up front rather than extending like
		 * this. */
		status = ocfs2_extend_trans(handle,
					    OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		rec = tl->tl_recs[i];
		start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
						     le32_to_cpu(rec.t_start));
		num_clusters = le32_to_cpu(rec.t_clusters);

		/* if start_blk is not set, we ignore the record as
		 * invalid. */
		if (start_blk) {
			mlog(0, "free record %d, start = %u, clusters = %u\n",
			     i, le32_to_cpu(rec.t_start), num_clusters);

			status = ocfs2_free_clusters(handle, data_alloc_inode,
						     data_alloc_bh, start_blk,
						     num_clusters);
			if (status < 0) {
				mlog_errno(status);
				goto bail;
			}
		}
		i--;
	}

bail:
	mlog_exit(status);
	return status;
}

/* Expects you to already be holding tl_inode->i_mutex */
static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
{
	int status;
	unsigned int num_to_flush;
	handle_t *handle;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct inode *data_alloc_inode = NULL;
	struct buffer_head *tl_bh = osb->osb_tl_bh;
	struct buffer_head *data_alloc_bh = NULL;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;

	mlog_entry_void();

	BUG_ON(mutex_trylock(&tl_inode->i_mutex));

	di = (struct ocfs2_dinode *) tl_bh->b_data;
	tl = &di->id2.i_dealloc;
	if (!OCFS2_IS_VALID_DINODE(di)) {
		OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
		status = -EIO;
		goto out;
	}

	num_to_flush = le16_to_cpu(tl->tl_used);
	mlog(0, "Flush %u records from truncate log #%llu\n",
	     num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
	if (!num_to_flush) {
		status = 0;
		goto out;
	}

	data_alloc_inode = ocfs2_get_system_file_inode(osb,
						       GLOBAL_BITMAP_SYSTEM_INODE,
						       OCFS2_INVALID_SLOT);
	if (!data_alloc_inode) {
		status = -EINVAL;
		mlog(ML_ERROR, "Could not get bitmap inode!\n");
		goto out;
	}

	mutex_lock(&data_alloc_inode->i_mutex);

	status = ocfs2_meta_lock(data_alloc_inode, &data_alloc_bh, 1);
	if (status < 0) {
		mlog_errno(status);
		goto out_mutex;
	}

	handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		mlog_errno(status);
		goto out_unlock;
	}

	status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
					       data_alloc_bh);
	if (status < 0)
		mlog_errno(status);

	ocfs2_commit_trans(osb, handle);

out_unlock:
	brelse(data_alloc_bh);
	ocfs2_meta_unlock(data_alloc_inode, 1);

out_mutex:
	mutex_unlock(&data_alloc_inode->i_mutex);
	iput(data_alloc_inode);

out:
	mlog_exit(status);
	return status;
}

int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
{
	int status;
	struct inode *tl_inode = osb->osb_tl_inode;

	mutex_lock(&tl_inode->i_mutex);
	status = __ocfs2_flush_truncate_log(osb);
	mutex_unlock(&tl_inode->i_mutex);

	return status;
}

static void ocfs2_truncate_log_worker(struct work_struct *work)
{
	int status;
	struct ocfs2_super *osb =
		container_of(work, struct ocfs2_super,
			     osb_truncate_log_wq.work);

	mlog_entry_void();

	status = ocfs2_flush_truncate_log(osb);
	if (status < 0)
		mlog_errno(status);

	mlog_exit(status);
}

#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
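/*
 * Illustrative note: the function below arms the flush worker two
 * seconds out. With cancel set, any already-queued flush is dropped
 * and re-queued, so a stream of back-to-back truncates keeps pushing
 * the flush into the future; with cancel clear, an already-pending
 * flush is left on its original deadline.
 */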
void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
				       int cancel)
{
	if (osb->osb_tl_inode) {
		/* We want to push off log flushes while truncates are
		 * still running. */
		if (cancel)
			cancel_delayed_work(&osb->osb_truncate_log_wq);

		queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
				   OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
	}
}

static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
				       int slot_num,
				       struct inode **tl_inode,
				       struct buffer_head **tl_bh)
{
	int status;
	struct inode *inode = NULL;
	struct buffer_head *bh = NULL;

	inode = ocfs2_get_system_file_inode(osb,
					    TRUNCATE_LOG_SYSTEM_INODE,
					    slot_num);
	if (!inode) {
		status = -EINVAL;
		mlog(ML_ERROR, "Could not load truncate log inode!\n");
		goto bail;
	}

	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
				  OCFS2_BH_CACHED, inode);
	if (status < 0) {
		iput(inode);
		mlog_errno(status);
		goto bail;
	}

	*tl_inode = inode;
	*tl_bh    = bh;
bail:
	mlog_exit(status);
	return status;
}

/* called during the 1st stage of node recovery. we stamp a clean
 * truncate log and pass back a copy for processing later. if the
 * truncate log does not require processing, *tl_copy is set to
 * NULL. */
int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
				      int slot_num,
				      struct ocfs2_dinode **tl_copy)
{
	int status;
	struct inode *tl_inode = NULL;
	struct buffer_head *tl_bh = NULL;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;

	*tl_copy = NULL;

	mlog(0, "recover truncate log from slot %d\n", slot_num);

	status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	di = (struct ocfs2_dinode *) tl_bh->b_data;
	tl = &di->id2.i_dealloc;
	if (!OCFS2_IS_VALID_DINODE(di)) {
		OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
		status = -EIO;
		goto bail;
	}

	if (le16_to_cpu(tl->tl_used)) {
		mlog(0, "We'll have %u logs to recover\n",
		     le16_to_cpu(tl->tl_used));

		*tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
		if (!(*tl_copy)) {
			status = -ENOMEM;
			mlog_errno(status);
			goto bail;
		}

		/* Assuming the write-out below goes well, this copy
		 * will be passed back to recovery for processing. */
		memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);

		/* All we need to do to clear the truncate log is set
		 * tl_used. */
		tl->tl_used = 0;

		status = ocfs2_write_block(osb, tl_bh, tl_inode);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

bail:
	if (tl_inode)
		iput(tl_inode);
	if (tl_bh)
		brelse(tl_bh);

	if (status < 0 && (*tl_copy)) {
		kfree(*tl_copy);
		*tl_copy = NULL;
	}

	mlog_exit(status);
	return status;
}

int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
					 struct ocfs2_dinode *tl_copy)
{
	int status = 0;
	int i;
	unsigned int clusters, num_recs, start_cluster;
	u64 start_blk;
	handle_t *handle;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct ocfs2_truncate_log *tl;

	mlog_entry_void();

	if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
		mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
		return -EINVAL;
	}

	tl = &tl_copy->id2.i_dealloc;
	num_recs = le16_to_cpu(tl->tl_used);
	mlog(0, "cleanup %u records from %llu\n", num_recs,
	     (unsigned long long)le64_to_cpu(tl_copy->i_blkno));

	mutex_lock(&tl_inode->i_mutex);
	for (i = 0; i < num_recs; i++) {
		if (ocfs2_truncate_log_needs_flush(osb)) {
			status = __ocfs2_flush_truncate_log(osb);
			if (status < 0) {
				mlog_errno(status);
				goto bail_up;
			}
		}

		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
		if (IS_ERR(handle)) {
			status = PTR_ERR(handle);
			mlog_errno(status);
			goto bail_up;
		}

		clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
		start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
		start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);

		status = ocfs2_truncate_log_append(osb, handle,
						   start_blk, clusters);
		ocfs2_commit_trans(osb, handle);
		if (status < 0) {
			mlog_errno(status);
			goto bail_up;
		}
	}

bail_up:
	mutex_unlock(&tl_inode->i_mutex);

	mlog_exit(status);
	return status;
}

void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
{
	int status;
	struct inode *tl_inode = osb->osb_tl_inode;

	mlog_entry_void();

	if (tl_inode) {
		cancel_delayed_work(&osb->osb_truncate_log_wq);
		flush_workqueue(ocfs2_wq);

		status = ocfs2_flush_truncate_log(osb);
		if (status < 0)
			mlog_errno(status);

		brelse(osb->osb_tl_bh);
		iput(osb->osb_tl_inode);
	}

	mlog_exit_void();
}

int ocfs2_truncate_log_init(struct ocfs2_super *osb)
{
	int status;
	struct inode *tl_inode = NULL;
	struct buffer_head *tl_bh = NULL;

	mlog_entry_void();

	status = ocfs2_get_truncate_log_info(osb, osb->slot_num,
					     &tl_inode, &tl_bh);
	if (status < 0)
		mlog_errno(status);

	/* ocfs2_truncate_log_shutdown keys on the existence of
	 * osb->osb_tl_inode so we don't set any of the osb variables
	 * until we're sure all is well. */
	INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
			  ocfs2_truncate_log_worker);
	osb->osb_tl_bh    = tl_bh;
	osb->osb_tl_inode = tl_inode;

	mlog_exit(status);
	return status;
}
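/*
 * Example recovery flow (illustrative): while recovering a dead
 * node's slot, ocfs2_begin_truncate_log_recovery() copies that slot's
 * pending records and stamps the on-disk log clean; the copy is later
 * handed to ocfs2_complete_truncate_log_recovery(), which re-appends
 * each record to the local truncate log so the normal flush path can
 * free the clusters.
 */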
/* This function will figure out whether the currently last extent
 * block will be deleted, and if it will, what the new last extent
 * block will be so we can update its h_next_leaf_blk field, as well
 * as the dinode's i_last_eb_blk */
static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
				       struct inode *inode,
				       struct ocfs2_dinode *fe,
				       u32 new_i_clusters,
				       struct buffer_head *old_last_eb,
				       struct buffer_head **new_last_eb)
{
	int i, status = 0;
	u64 block = 0;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;
	struct buffer_head *bh = NULL;

	*new_last_eb = NULL;

	if (!OCFS2_IS_VALID_DINODE(fe)) {
		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
		status = -EIO;
		goto bail;
	}

	/* we have no tree, so of course, no last_eb. */
	if (!fe->id2.i_list.l_tree_depth)
		goto bail;

	/* trunc to zero special case - this makes tree_depth = 0
	 * regardless of what it is. */
	if (!new_i_clusters)
		goto bail;

	eb = (struct ocfs2_extent_block *) old_last_eb->b_data;
	el = &(eb->h_list);
	BUG_ON(!el->l_next_free_rec);

	/* Make sure that this guy will actually be empty after we
	 * clear away the data. */
	if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
		goto bail;

	/* Ok, at this point, we know that last_eb will definitely
	 * change, so let's traverse the tree and find the second to
	 * last extent block. */
	el = &(fe->id2.i_list);
	/* go down the tree, */
	do {
		for (i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) {
			if (le32_to_cpu(el->l_recs[i].e_cpos) <
			    new_i_clusters) {
				block = le64_to_cpu(el->l_recs[i].e_blkno);
				break;
			}
		}
		BUG_ON(i < 0);

		if (bh) {
			brelse(bh);
			bh = NULL;
		}

		status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED,
					  inode);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
		eb = (struct ocfs2_extent_block *) bh->b_data;
		el = &eb->h_list;
		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
			status = -EIO;
			goto bail;
		}
	} while (el->l_tree_depth);

	*new_last_eb = bh;
	get_bh(*new_last_eb);
	mlog(0, "returning block %llu\n",
	     (unsigned long long)le64_to_cpu(eb->h_blkno));
bail:
	if (bh)
		brelse(bh);

	return status;
}

static int ocfs2_do_truncate(struct ocfs2_super *osb,
			     unsigned int clusters_to_del,
			     struct inode *inode,
			     struct buffer_head *fe_bh,
			     struct buffer_head *old_last_eb_bh,
			     handle_t *handle,
			     struct ocfs2_truncate_context *tc)
{
	int status, i, depth;
	struct ocfs2_dinode *fe;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_block *last_eb = NULL;
	struct ocfs2_extent_list *el;
	struct buffer_head *eb_bh = NULL;
	struct buffer_head *last_eb_bh = NULL;
	u64 next_eb = 0;
	u64 delete_blk = 0;

	fe = (struct ocfs2_dinode *) fe_bh->b_data;

	status = ocfs2_find_new_last_ext_blk(osb,
					     inode,
					     fe,
					     le32_to_cpu(fe->i_clusters) -
					     clusters_to_del,
					     old_last_eb_bh,
					     &last_eb_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	if (last_eb_bh)
		last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;

	status = ocfs2_journal_access(handle, inode, fe_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	el = &(fe->id2.i_list);

	spin_lock(&OCFS2_I(inode)->ip_lock);
	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
				      clusters_to_del;
	spin_unlock(&OCFS2_I(inode)->ip_lock);
	le32_add_cpu(&fe->i_clusters, -clusters_to_del);
	fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
	fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);

	i = le16_to_cpu(el->l_next_free_rec) - 1;

	BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
	le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
	/* tree depth zero, we can just delete the clusters, otherwise
	 * we need to record the offset of the next level extent block
	 * as we may overwrite it. */
	if (!el->l_tree_depth)
		delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
			+ ocfs2_clusters_to_blocks(osb->sb,
					le32_to_cpu(el->l_recs[i].e_clusters));
	else
		next_eb = le64_to_cpu(el->l_recs[i].e_blkno);

	if (!el->l_recs[i].e_clusters) {
		/* if we deleted the whole extent record, then clear
		 * out the other fields and update the extent
		 * list. For depth > 0 trees, we've already recorded
		 * the extent block in 'next_eb' */
		el->l_recs[i].e_cpos = 0;
		el->l_recs[i].e_blkno = 0;
		BUG_ON(!el->l_next_free_rec);
		le16_add_cpu(&el->l_next_free_rec, -1);
	}

	depth = le16_to_cpu(el->l_tree_depth);
	if (!fe->i_clusters) {
		/* trunc to zero is a special case. */
		el->l_tree_depth = 0;
		fe->i_last_eb_blk = 0;
	} else if (last_eb)
		fe->i_last_eb_blk = last_eb->h_blkno;

	status = ocfs2_journal_dirty(handle, fe_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	if (last_eb) {
		/* If there will be a new last extent block, then by
		 * definition, there cannot be any leaves to the right
		 * of it. */
		status = ocfs2_journal_access(handle, inode, last_eb_bh,
					      OCFS2_JOURNAL_ACCESS_WRITE);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
		last_eb->h_next_leaf_blk = 0;
		status = ocfs2_journal_dirty(handle, last_eb_bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* if our tree depth > 0, update all the tree blocks below us. */
	while (depth) {
		mlog(0, "traveling tree (depth = %d, next_eb = %llu)\n",
		     depth, (unsigned long long)next_eb);
		status = ocfs2_read_block(osb, next_eb, &eb_bh,
					  OCFS2_BH_CACHED, inode);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
		eb = (struct ocfs2_extent_block *)eb_bh->b_data;
		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
			status = -EIO;
			goto bail;
		}
		el = &(eb->h_list);

		status = ocfs2_journal_access(handle, inode, eb_bh,
					      OCFS2_JOURNAL_ACCESS_WRITE);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
		BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1));

		i = le16_to_cpu(el->l_next_free_rec) - 1;

		mlog(0, "extent block %llu, before: record %d: "
		     "(%u, %u, %llu), next = %u\n",
		     (unsigned long long)le64_to_cpu(eb->h_blkno), i,
		     le32_to_cpu(el->l_recs[i].e_cpos),
		     le32_to_cpu(el->l_recs[i].e_clusters),
		     (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
		     le16_to_cpu(el->l_next_free_rec));

		BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
		le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);

		next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
		/* bottom-most block requires us to delete data.*/
		if (!el->l_tree_depth)
			delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
				+ ocfs2_clusters_to_blocks(osb->sb,
					le32_to_cpu(el->l_recs[i].e_clusters));
		if (!el->l_recs[i].e_clusters) {
			el->l_recs[i].e_cpos = 0;
			el->l_recs[i].e_blkno = 0;
			BUG_ON(!el->l_next_free_rec);
			le16_add_cpu(&el->l_next_free_rec, -1);
		}
		mlog(0, "extent block %llu, after: record %d: "
		     "(%u, %u, %llu), next = %u\n",
		     (unsigned long long)le64_to_cpu(eb->h_blkno), i,
		     le32_to_cpu(el->l_recs[i].e_cpos),
		     le32_to_cpu(el->l_recs[i].e_clusters),
		     (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
		     le16_to_cpu(el->l_next_free_rec));

		status = ocfs2_journal_dirty(handle, eb_bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		if (!el->l_next_free_rec) {
			mlog(0, "deleting this extent block.\n");

			ocfs2_remove_from_cache(inode, eb_bh);

			BUG_ON(el->l_recs[0].e_clusters);
			BUG_ON(el->l_recs[0].e_cpos);
			BUG_ON(el->l_recs[0].e_blkno);
			if (eb->h_suballoc_slot == 0) {
				/*
				 * This code only understands how to
				 * lock the suballocator in slot 0,
				 * which is fine because allocation is
				 * only ever done out of that
				 * suballocator too. A future version
				 * might change that however, so avoid
				 * a free if we don't know how to
				 * handle it. This way an fs incompat
				 * bit will not be necessary.
				 */
				status = ocfs2_free_extent_block(handle,
								 tc->tc_ext_alloc_inode,
								 tc->tc_ext_alloc_bh,
								 eb);
				if (status < 0) {
					mlog_errno(status);
					goto bail;
				}
			}
		}
		brelse(eb_bh);
		eb_bh = NULL;
		depth--;
	}

	BUG_ON(!delete_blk);
	status = ocfs2_truncate_log_append(osb, handle, delete_blk,
					   clusters_to_del);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	status = 0;
bail:
	if (!status)
		ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters));
	else
		ocfs2_extent_map_drop(inode, 0);
	mlog_exit(status);
	return status;
}

/*
 * It is expected that, by the time you call this function,
 * inode->i_size and fe->i_size have been adjusted.
 *
 * WARNING: This will kfree the truncate context
 */
int ocfs2_commit_truncate(struct ocfs2_super *osb,
			  struct inode *inode,
			  struct buffer_head *fe_bh,
			  struct ocfs2_truncate_context *tc)
{
	int status, i, credits, tl_sem = 0;
	u32 clusters_to_del, target_i_clusters;
	u64 last_eb = 0;
	struct ocfs2_dinode *fe;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;
	struct buffer_head *last_eb_bh;
	handle_t *handle = NULL;
	struct inode *tl_inode = osb->osb_tl_inode;

	mlog_entry_void();

	down_write(&OCFS2_I(inode)->ip_alloc_sem);

	target_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
						     i_size_read(inode));

	last_eb_bh = tc->tc_last_eb_bh;
	tc->tc_last_eb_bh = NULL;

	fe = (struct ocfs2_dinode *) fe_bh->b_data;

	if (fe->id2.i_list.l_tree_depth) {
		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
		el = &eb->h_list;
	} else
		el = &fe->id2.i_list;
	last_eb = le64_to_cpu(fe->i_last_eb_blk);
start:
	mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, "
	     "last_eb = %llu, fe->i_last_eb_blk = %llu, "
	     "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n",
	     le32_to_cpu(fe->i_clusters), (unsigned long long)last_eb,
	     (unsigned long long)le64_to_cpu(fe->i_last_eb_blk),
	     le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh);

	if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) {
		mlog(0, "last_eb changed!\n");
		BUG_ON(!fe->id2.i_list.l_tree_depth);
		last_eb = le64_to_cpu(fe->i_last_eb_blk);
		/* i_last_eb_blk may have changed, read it if
		 * necessary. We don't have to worry about the
		 * truncate to zero case here (where there becomes no
		 * last_eb) because we never loop back after our work
		 * is done. */
		if (last_eb_bh) {
			brelse(last_eb_bh);
			last_eb_bh = NULL;
		}

		status = ocfs2_read_block(osb, last_eb, &last_eb_bh,
					  OCFS2_BH_CACHED, inode);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
			status = -EIO;
			goto bail;
		}
		el = &(eb->h_list);
	}

	/* by now, el will point to the extent list on the bottom most
	 * portion of this tree. */
	i = le16_to_cpu(el->l_next_free_rec) - 1;
	if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters)
		clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
	else
		clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
				   le32_to_cpu(el->l_recs[i].e_cpos)) -
				  target_i_clusters;

	mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del);

	mutex_lock(&tl_inode->i_mutex);
	tl_sem = 1;
	/* ocfs2_truncate_log_needs_flush guarantees us at least one
	 * record is free for use. If there isn't any, we flush to get
	 * an empty truncate log. */
	if (ocfs2_truncate_log_needs_flush(osb)) {
		status = __ocfs2_flush_truncate_log(osb);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
						fe, el);
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(status);
		goto bail;
	}

	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
	if (status < 0)
		mlog_errno(status);

	status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh,
				   last_eb_bh, handle, tc);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	mutex_unlock(&tl_inode->i_mutex);
	tl_sem = 0;

	ocfs2_commit_trans(osb, handle);
	handle = NULL;

	BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters);
	if (le32_to_cpu(fe->i_clusters) > target_i_clusters)
		goto start;
bail:
	up_write(&OCFS2_I(inode)->ip_alloc_sem);

	ocfs2_schedule_truncate_log_flush(osb, 1);

	if (tl_sem)
		mutex_unlock(&tl_inode->i_mutex);

	if (handle)
		ocfs2_commit_trans(osb, handle);

	if (last_eb_bh)
		brelse(last_eb_bh);

	/* This will drop the ext_alloc cluster lock for us */
	ocfs2_free_truncate_context(tc);

	mlog_exit(status);
	return status;
}

/*
 * Expects the inode to already be locked. This will figure out which
 * inodes need to be locked and will put them on the returned truncate
 * context.
 */
int ocfs2_prepare_truncate(struct ocfs2_super *osb,
			   struct inode *inode,
			   struct buffer_head *fe_bh,
			   struct ocfs2_truncate_context **tc)
{
	int status, metadata_delete;
	unsigned int new_i_clusters;
	struct ocfs2_dinode *fe;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;
	struct buffer_head *last_eb_bh = NULL;
	struct inode *ext_alloc_inode = NULL;
	struct buffer_head *ext_alloc_bh = NULL;

	mlog_entry_void();

	*tc = NULL;

	new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
						  i_size_read(inode));
	fe = (struct ocfs2_dinode *) fe_bh->b_data;

	mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size = "
	     "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
	     (unsigned long long)le64_to_cpu(fe->i_size));

	if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
		ocfs2_error(inode->i_sb, "Dinode %llu has cluster count "
			    "%u and size %llu whereas struct inode has "
			    "cluster count %u and size %llu which caused an "
			    "invalid truncate to %u clusters.",
			    (unsigned long long)le64_to_cpu(fe->i_blkno),
			    le32_to_cpu(fe->i_clusters),
			    (unsigned long long)le64_to_cpu(fe->i_size),
			    OCFS2_I(inode)->ip_clusters, i_size_read(inode),
			    new_i_clusters);
		mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
		status = -EIO;
		goto bail;
	}

	*tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
	if (!(*tc)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	metadata_delete = 0;
	if (fe->id2.i_list.l_tree_depth) {
		/* If we have a tree, then the truncate may result in
		 * metadata deletes. Figure this out from the
		 * rightmost leaf block.*/
		status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
					  &last_eb_bh, OCFS2_BH_CACHED, inode);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);

			brelse(last_eb_bh);
			status = -EIO;
			goto bail;
		}
		el = &(eb->h_list);
		if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters)
			metadata_delete = 1;
	}

	(*tc)->tc_last_eb_bh = last_eb_bh;
" "locking allocator.\n"); ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0); if (!ext_alloc_inode) { status = -ENOMEM; mlog_errno(status); goto bail; } mutex_lock(&ext_alloc_inode->i_mutex); (*tc)->tc_ext_alloc_inode = ext_alloc_inode; status = ocfs2_meta_lock(ext_alloc_inode, &ext_alloc_bh, 1); if (status < 0) { mlog_errno(status); goto bail; } (*tc)->tc_ext_alloc_bh = ext_alloc_bh; (*tc)->tc_ext_alloc_locked = 1; } status = 0; bail: if (status < 0) { if (*tc) ocfs2_free_truncate_context(*tc); *tc = NULL; } mlog_exit_void(); return status; } static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) { if (tc->tc_ext_alloc_inode) { if (tc->tc_ext_alloc_locked) ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1); mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex); iput(tc->tc_ext_alloc_inode); } if (tc->tc_ext_alloc_bh) brelse(tc->tc_ext_alloc_bh); if (tc->tc_last_eb_bh) brelse(tc->tc_last_eb_bh); kfree(tc); }