diff options
Diffstat (limited to 'fs')
59 files changed, 5451 insertions, 1260 deletions
@@ -1420,8 +1420,7 @@ static void bio_pair_end_2(struct bio *bi, int err) } /* - * split a bio - only worry about a bio with a single page - * in it's iovec + * split a bio - only worry about a bio with a single page in its iovec */ struct bio_pair *bio_split(struct bio *bi, int first_sectors) { diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index c84ca1f5259a..51bfdfc8fcda 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -20,7 +20,6 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/freezer.h> -#include <linux/ftrace.h> #include "async-thread.h" #define WORK_QUEUED_BIT 0 @@ -195,6 +194,9 @@ again_locked: if (!list_empty(&worker->pending)) continue; + if (kthread_should_stop()) + break; + /* still no more work?, sleep for real */ spin_lock_irq(&worker->lock); set_current_state(TASK_INTERRUPTIBLE); @@ -208,7 +210,8 @@ again_locked: worker->working = 0; spin_unlock_irq(&worker->lock); - schedule(); + if (!kthread_should_stop()) + schedule(); } __set_current_state(TASK_RUNNING); } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index dbb724124633..e5b2533b691a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1244,9 +1244,9 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, * readahead one full node of leaves, finding things that are close * to the block in 'slot', and triggering ra on them. */ -static noinline void reada_for_search(struct btrfs_root *root, - struct btrfs_path *path, - int level, int slot, u64 objectid) +static void reada_for_search(struct btrfs_root *root, + struct btrfs_path *path, + int level, int slot, u64 objectid) { struct extent_buffer *node; struct btrfs_disk_key disk_key; @@ -1447,6 +1447,117 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) } /* + * helper function for btrfs_search_slot. The goal is to find a block + * in cache without setting the path to blocking. 
If we find the block + * we return zero and the path is unchanged. + * + * If we can't find the block, we set the path blocking and do some + * reada. -EAGAIN is returned and the search must be repeated. + */ +static int +read_block_for_search(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *p, + struct extent_buffer **eb_ret, int level, int slot, + struct btrfs_key *key) +{ + u64 blocknr; + u64 gen; + u32 blocksize; + struct extent_buffer *b = *eb_ret; + struct extent_buffer *tmp; + + blocknr = btrfs_node_blockptr(b, slot); + gen = btrfs_node_ptr_generation(b, slot); + blocksize = btrfs_level_size(root, level - 1); + + tmp = btrfs_find_tree_block(root, blocknr, blocksize); + if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + *eb_ret = tmp; + return 0; + } + + /* + * reduce lock contention at high levels + * of the btree by dropping locks before + * we read. + */ + btrfs_release_path(NULL, p); + if (tmp) + free_extent_buffer(tmp); + if (p->reada) + reada_for_search(root, p, level, slot, key->objectid); + + tmp = read_tree_block(root, blocknr, blocksize, gen); + if (tmp) + free_extent_buffer(tmp); + return -EAGAIN; +} + +/* + * helper function for btrfs_search_slot. This does all of the checks + * for node-level blocks and does any balancing required based on + * the ins_len. + * + * If no extra work was required, zero is returned. 
If we had to + * drop the path, -EAGAIN is returned and btrfs_search_slot must + * start over + */ +static int +setup_nodes_for_search(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *p, + struct extent_buffer *b, int level, int ins_len) +{ + int ret; + if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= + BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { + int sret; + + sret = reada_for_balance(root, p, level); + if (sret) + goto again; + + btrfs_set_path_blocking(p); + sret = split_node(trans, root, p, level); + btrfs_clear_path_blocking(p, NULL); + + BUG_ON(sret > 0); + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + } else if (ins_len < 0 && btrfs_header_nritems(b) < + BTRFS_NODEPTRS_PER_BLOCK(root) / 4) { + int sret; + + sret = reada_for_balance(root, p, level); + if (sret) + goto again; + + btrfs_set_path_blocking(p); + sret = balance_level(trans, root, p, level); + btrfs_clear_path_blocking(p, NULL); + + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + if (!b) { + btrfs_release_path(NULL, p); + goto again; + } + BUG_ON(btrfs_header_nritems(b) == 1); + } + return 0; + +again: + ret = -EAGAIN; +done: + return ret; +} + +/* * look for key in the tree. 
path is filled in with nodes along the way * if key is found, we return zero and you can find the item in the leaf * level of the path (level 0) @@ -1464,16 +1575,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root ins_len, int cow) { struct extent_buffer *b; - struct extent_buffer *tmp; int slot; int ret; int level; - int should_reada = p->reada; int lowest_unlock = 1; - int blocksize; u8 lowest_level = 0; - u64 blocknr; - u64 gen; lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); @@ -1502,7 +1608,11 @@ again: if (cow) { int wret; - /* is a cow on this block not required */ + /* + * if we don't really need to cow this block + * then we don't want to set the path blocking, + * so we test it here + */ if (btrfs_header_generation(b) == trans->transid && btrfs_header_owner(b) == root->root_key.objectid && !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { @@ -1557,51 +1667,15 @@ cow_done: if (ret && slot > 0) slot -= 1; p->slots[level] = slot; - if ((p->search_for_split || ins_len > 0) && - btrfs_header_nritems(b) >= - BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { - int sret; - - sret = reada_for_balance(root, p, level); - if (sret) - goto again; - - btrfs_set_path_blocking(p); - sret = split_node(trans, root, p, level); - btrfs_clear_path_blocking(p, NULL); - - BUG_ON(sret > 0); - if (sret) { - ret = sret; - goto done; - } - b = p->nodes[level]; - slot = p->slots[level]; - } else if (ins_len < 0 && - btrfs_header_nritems(b) < - BTRFS_NODEPTRS_PER_BLOCK(root) / 4) { - int sret; - - sret = reada_for_balance(root, p, level); - if (sret) - goto again; - - btrfs_set_path_blocking(p); - sret = balance_level(trans, root, p, level); - btrfs_clear_path_blocking(p, NULL); + ret = setup_nodes_for_search(trans, root, p, b, level, + ins_len); + if (ret == -EAGAIN) + goto again; + else if (ret) + goto done; + b = p->nodes[level]; + slot = p->slots[level]; - if (sret) { - ret = sret; - goto done; - } - b = p->nodes[level]; - if (!b) { - 
btrfs_release_path(NULL, p); - goto again; - } - slot = p->slots[level]; - BUG_ON(btrfs_header_nritems(b) == 1); - } unlock_up(p, level, lowest_unlock); /* this is only true while dropping a snapshot */ @@ -1610,44 +1684,11 @@ cow_done: goto done; } - blocknr = btrfs_node_blockptr(b, slot); - gen = btrfs_node_ptr_generation(b, slot); - blocksize = btrfs_level_size(root, level - 1); + ret = read_block_for_search(trans, root, p, + &b, level, slot, key); + if (ret == -EAGAIN) + goto again; - tmp = btrfs_find_tree_block(root, blocknr, blocksize); - if (tmp && btrfs_buffer_uptodate(tmp, gen)) { - b = tmp; - } else { - /* - * reduce lock contention at high levels - * of the btree by dropping locks before - * we read. - */ - if (level > 0) { - btrfs_release_path(NULL, p); - if (tmp) - free_extent_buffer(tmp); - if (should_reada) - reada_for_search(root, p, - level, slot, - key->objectid); - - tmp = read_tree_block(root, blocknr, - blocksize, gen); - if (tmp) - free_extent_buffer(tmp); - goto again; - } else { - btrfs_set_path_blocking(p); - if (tmp) - free_extent_buffer(tmp); - if (should_reada) - reada_for_search(root, p, - level, slot, - key->objectid); - b = read_node_slot(root, b, slot); - } - } if (!p->skip_locking) { int lret; @@ -2116,8 +2157,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root BUG_ON(!path->nodes[level]); lower = path->nodes[level]; nritems = btrfs_header_nritems(lower); - if (slot > nritems) - BUG(); + BUG_ON(slot > nritems); if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root)) BUG(); if (slot != nritems) { @@ -4086,28 +4126,44 @@ next: int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) { int slot; - int level = 1; + int level; struct extent_buffer *c; - struct extent_buffer *next = NULL; + struct extent_buffer *next; struct btrfs_key key; u32 nritems; int ret; + int old_spinning = path->leave_spinning; + int force_blocking = 0; nritems = btrfs_header_nritems(path->nodes[0]); if (nritems == 0) return 1; - 
btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); + /* + * we take the blocks in an order that upsets lockdep. Using + * blocking mode is the only way around it. + */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + force_blocking = 1; +#endif + btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); +again: + level = 1; + next = NULL; btrfs_release_path(root, path); + path->keep_locks = 1; + + if (!force_blocking) + path->leave_spinning = 1; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); path->keep_locks = 0; if (ret < 0) return ret; - btrfs_set_path_blocking(path); nritems = btrfs_header_nritems(path->nodes[0]); /* * by releasing the path above we dropped all our locks. A balance @@ -4117,19 +4173,24 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) */ if (nritems > 0 && path->slots[0] < nritems - 1) { path->slots[0]++; + ret = 0; goto done; } while (level < BTRFS_MAX_LEVEL) { - if (!path->nodes[level]) - return 1; + if (!path->nodes[level]) { + ret = 1; + goto done; + } slot = path->slots[level] + 1; c = path->nodes[level]; if (slot >= btrfs_header_nritems(c)) { level++; - if (level == BTRFS_MAX_LEVEL) - return 1; + if (level == BTRFS_MAX_LEVEL) { + ret = 1; + goto done; + } continue; } @@ -4138,16 +4199,22 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) free_extent_buffer(next); } - /* the path was set to blocking above */ - if (level == 1 && (path->locks[1] || path->skip_locking) && - path->reada) - reada_for_search(root, path, level, slot, 0); + next = c; + ret = read_block_for_search(NULL, root, path, &next, level, + slot, &key); + if (ret == -EAGAIN) + goto again; - next = read_node_slot(root, c, slot); if (!path->skip_locking) { - btrfs_assert_tree_locked(c); - btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); + ret = btrfs_try_spin_lock(next); + if (!ret) { + btrfs_set_path_blocking(path); + btrfs_tree_lock(next); + if (!force_blocking) + btrfs_clear_path_blocking(path, next); + } + if 
(force_blocking) + btrfs_set_lock_blocking(next); } break; } @@ -4157,27 +4224,42 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) c = path->nodes[level]; if (path->locks[level]) btrfs_tree_unlock(c); + free_extent_buffer(c); path->nodes[level] = next; path->slots[level] = 0; if (!path->skip_locking) path->locks[level] = 1; + if (!level) break; - btrfs_set_path_blocking(path); - if (level == 1 && path->locks[1] && path->reada) - reada_for_search(root, path, level, slot, 0); - next = read_node_slot(root, next, 0); + ret = read_block_for_search(NULL, root, path, &next, level, + 0, &key); + if (ret == -EAGAIN) + goto again; + if (!path->skip_locking) { btrfs_assert_tree_locked(path->nodes[level]); - btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); + ret = btrfs_try_spin_lock(next); + if (!ret) { + btrfs_set_path_blocking(path); + btrfs_tree_lock(next); + if (!force_blocking) + btrfs_clear_path_blocking(path, next); + } + if (force_blocking) + btrfs_set_lock_blocking(next); } } + ret = 0; done: unlock_up(path, 0, 1); - return 0; + path->leave_spinning = old_spinning; + if (!old_spinning) + btrfs_set_path_blocking(path); + + return ret; } /* diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9417713542a2..ad96495dedc5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -143,12 +143,15 @@ static int btrfs_csum_sizes[] = { 4, 0 }; #define BTRFS_FT_MAX 9 /* - * the key defines the order in the tree, and so it also defines (optimal) - * block layout. objectid corresonds to the inode number. The flags - * tells us things about the object, and is a kind of stream selector. - * so for a given inode, keys with flags of 1 might refer to the inode - * data, flags of 2 may point to file data in the btree and flags == 3 - * may point to extents. + * The key defines the order in the tree, and so it also defines (optimal) + * block layout. + * + * objectid corresponds to the inode number. 
+ * + * type tells us things about the object, and is a kind of stream selector. + * so for a given inode, keys with type of 1 might refer to the inode data, + * type of 2 may point to file data in the btree and type == 3 may point to + * extents. * * offset is the starting byte offset for this key in the stream. * @@ -200,7 +203,7 @@ struct btrfs_dev_item { /* * starting byte of this partition on the device, - * to allowr for stripe alignment in the future + * to allow for stripe alignment in the future */ __le64 start_offset; @@ -633,18 +636,35 @@ struct btrfs_space_info { struct rw_semaphore groups_sem; }; -struct btrfs_free_space { - struct rb_node bytes_index; - struct rb_node offset_index; - u64 offset; - u64 bytes; +/* + * free clusters are used to claim free space in relatively large chunks, + * allowing us to do less seeky writes. They are used for all metadata + * allocations and data allocations in ssd mode. + */ +struct btrfs_free_cluster { + spinlock_t lock; + spinlock_t refill_lock; + struct rb_root root; + + /* largest extent in this cluster */ + u64 max_size; + + /* first extent starting offset */ + u64 window_start; + + struct btrfs_block_group_cache *block_group; + /* + * when a cluster is allocated from a block group, we put the + * cluster onto a list in the block group so that it can + * be freed before the block group is freed. + */ + struct list_head block_group_list; }; struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; spinlock_t lock; - struct mutex alloc_mutex; struct mutex cache_mutex; u64 pinned; u64 reserved; @@ -656,6 +676,7 @@ struct btrfs_block_group_cache { struct btrfs_space_info *space_info; /* free space cache stuff */ + spinlock_t tree_lock; struct rb_root free_space_bytes; struct rb_root free_space_offset; @@ -667,6 +688,11 @@ struct btrfs_block_group_cache { /* usage count */ atomic_t count; + + /* List of struct btrfs_free_clusters for this block group. 
+ * Today it will only have one thing on it, but that may change + */ + struct list_head cluster_list; }; struct btrfs_leaf_ref_tree { @@ -728,7 +754,6 @@ struct btrfs_fs_info { struct mutex tree_log_mutex; struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; - struct mutex pinned_mutex; struct mutex chunk_mutex; struct mutex drop_mutex; struct mutex volume_mutex; @@ -839,8 +864,12 @@ struct btrfs_fs_info { spinlock_t delalloc_lock; spinlock_t new_trans_lock; u64 delalloc_bytes; - u64 last_alloc; - u64 last_data_alloc; + + /* data_alloc_cluster is only used in ssd mode */ + struct btrfs_free_cluster data_alloc_cluster; + + /* all metadata allocations go through this cluster */ + struct btrfs_free_cluster meta_alloc_cluster; spinlock_t ref_cache_lock; u64 total_ref_cache_size; @@ -932,7 +961,6 @@ struct btrfs_root { }; /* - * inode items have the data typically returned from stat and store other * info about object characteristics. There is one for every file and dir in * the FS @@ -963,7 +991,7 @@ struct btrfs_root { #define BTRFS_EXTENT_CSUM_KEY 128 /* - * root items point to tree roots. There are typically in the root + * root items point to tree roots. 
They are typically in the root * tree used by the super block to find all the other trees */ #define BTRFS_ROOT_ITEM_KEY 132 @@ -1010,6 +1038,8 @@ struct btrfs_root { #define BTRFS_MOUNT_SSD (1 << 3) #define BTRFS_MOUNT_DEGRADED (1 << 4) #define BTRFS_MOUNT_COMPRESS (1 << 5) +#define BTRFS_MOUNT_NOTREELOG (1 << 6) +#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -1748,6 +1778,7 @@ static inline struct dentry *fdentry(struct file *file) } /* extent-tree.c */ +void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); @@ -2174,21 +2205,4 @@ int btrfs_check_acl(struct inode *inode, int mask); int btrfs_init_acl(struct inode *inode, struct inode *dir); int btrfs_acl_chmod(struct inode *inode); -/* free-space-cache.c */ -int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, - u64 bytenr, u64 size); -int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes); -int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, - u64 bytenr, u64 size); -int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes); -void btrfs_remove_free_space_cache(struct btrfs_block_group_cache - *block_group); -struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache - *block_group, u64 offset, - u64 bytes); -void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, - u64 bytes); -u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); #endif diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index cbf7dc8ae3ec..d6c01c096a40 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -18,7 +18,6 @@ #include 
<linux/sched.h> #include <linux/sort.h> -#include <linux/ftrace.h> #include "ctree.h" #include "delayed-ref.h" #include "transaction.h" diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 92d73929d381..92caa8035f36 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -38,6 +38,7 @@ #include "locking.h" #include "ref-cache.h" #include "tree-log.h" +#include "free-space-cache.h" static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); @@ -1412,8 +1413,6 @@ static int bio_ready_for_csum(struct bio *bio) ret = extent_range_uptodate(io_tree, start + length, start + buf_len - 1); - if (ret == 1) - return ret; return ret; } @@ -1647,12 +1646,15 @@ struct btrfs_root *open_ctree(struct super_block *sb, mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); mutex_init(&fs_info->drop_mutex); - mutex_init(&fs_info->pinned_mutex); mutex_init(&fs_info->chunk_mutex); mutex_init(&fs_info->transaction_kthread_mutex); mutex_init(&fs_info->cleaner_mutex); mutex_init(&fs_info->volume_mutex); mutex_init(&fs_info->tree_reloc_mutex); + + btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); + btrfs_init_free_cluster(&fs_info->data_alloc_cluster); + init_waitqueue_head(&fs_info->transaction_throttle); init_waitqueue_head(&fs_info->transaction_wait); init_waitqueue_head(&fs_info->async_submit_wait); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f5e7cae63d80..178df4c67de4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -31,6 +31,7 @@ #include "volumes.h" #include "locking.h" #include "ref-cache.h" +#include "free-space-cache.h" #define PENDING_EXTENT_INSERT 0 #define PENDING_EXTENT_DELETE 1 @@ -166,7 +167,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group, u64 extent_start, extent_end, size; int ret; - mutex_lock(&info->pinned_mutex); while (start < end) { ret = find_first_extent_bit(&info->pinned_extents, start, &extent_start, 
&extent_end, @@ -192,7 +192,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group, ret = btrfs_add_free_space(block_group, start, size); BUG_ON(ret); } - mutex_unlock(&info->pinned_mutex); return 0; } @@ -291,8 +290,8 @@ next: block_group->key.objectid + block_group->key.offset); - remove_sb_from_cache(root, block_group); block_group->cached = 1; + remove_sb_from_cache(root, block_group); ret = 0; err: btrfs_free_path(path); @@ -326,7 +325,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group( return cache; } -static inline void put_block_group(struct btrfs_block_group_cache *cache) +void btrfs_put_block_group(struct btrfs_block_group_cache *cache) { if (atomic_dec_and_test(&cache->count)) kfree(cache); @@ -399,12 +398,12 @@ again: div_factor(cache->key.offset, factor)) { group_start = cache->key.objectid; spin_unlock(&cache->lock); - put_block_group(cache); + btrfs_put_block_group(cache); goto found; } } spin_unlock(&cache->lock); - put_block_group(cache); + btrfs_put_block_group(cache); cond_resched(); } if (!wrapped) { @@ -1594,7 +1593,7 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) if (!block_group || block_group->ro) readonly = 1; if (block_group) - put_block_group(block_group); + btrfs_put_block_group(block_group); return readonly; } @@ -2018,7 +2017,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, WARN_ON(ret); } } - put_block_group(cache); + btrfs_put_block_group(cache); total -= num_bytes; bytenr += num_bytes; } @@ -2035,7 +2034,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) return 0; bytenr = cache->key.objectid; - put_block_group(cache); + btrfs_put_block_group(cache); return bytenr; } @@ -2047,7 +2046,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, struct btrfs_block_group_cache *cache; struct btrfs_fs_info *fs_info = root->fs_info; - WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex)); if (pin) { 
set_extent_dirty(&fs_info->pinned_extents, bytenr, bytenr + num - 1, GFP_NOFS); @@ -2055,7 +2053,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, clear_extent_dirty(&fs_info->pinned_extents, bytenr, bytenr + num - 1, GFP_NOFS); } - mutex_unlock(&root->fs_info->pinned_mutex); while (num > 0) { cache = btrfs_lookup_block_group(fs_info, bytenr); @@ -2081,7 +2078,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, if (cache->cached) btrfs_add_free_space(cache, bytenr, len); } - put_block_group(cache); + btrfs_put_block_group(cache); bytenr += len; num -= len; } @@ -2112,7 +2109,7 @@ static int update_reserved_extents(struct btrfs_root *root, } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - put_block_group(cache); + btrfs_put_block_group(cache); bytenr += len; num -= len; } @@ -2127,7 +2124,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents; int ret; - mutex_lock(&root->fs_info->pinned_mutex); while (1) { ret = find_first_extent_bit(pinned_extents, last, &start, &end, EXTENT_DIRTY); @@ -2136,7 +2132,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) set_extent_dirty(copy, start, end, GFP_NOFS); last = end + 1; } - mutex_unlock(&root->fs_info->pinned_mutex); return 0; } @@ -2149,7 +2144,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, int ret; while (1) { - mutex_lock(&root->fs_info->pinned_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY); if (ret) @@ -2163,7 +2157,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, cond_resched(); } - mutex_unlock(&root->fs_info->pinned_mutex); return ret; } @@ -2205,7 +2198,6 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, free_extent_buffer(buf); pinit: btrfs_set_path_blocking(path); - mutex_lock(&root->fs_info->pinned_mutex); /* unlocks the pinned mutex */ 
btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); @@ -2511,8 +2503,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, */ if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID && owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { - mutex_lock(&root->fs_info->pinned_mutex); - /* unlocks the pinned mutex */ btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); update_reserved_extents(root, bytenr, num_bytes, 0); @@ -2554,228 +2544,237 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, { int ret = 0; struct btrfs_root *root = orig_root->fs_info->extent_root; - u64 total_needed = num_bytes; - u64 *last_ptr = NULL; - u64 last_wanted = 0; + struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; - int chunk_alloc_done = 0; int empty_cluster = 2 * 1024 * 1024; int allowed_chunk_alloc = 0; - struct list_head *head = NULL, *cur = NULL; - int loop = 0; - int extra_loop = 0; struct btrfs_space_info *space_info; + int last_ptr_loop = 0; + int loop = 0; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); ins->objectid = 0; ins->offset = 0; + space_info = __find_space_info(root->fs_info, data); + if (orig_root->ref_cows || empty_size) allowed_chunk_alloc = 1; if (data & BTRFS_BLOCK_GROUP_METADATA) { - last_ptr = &root->fs_info->last_alloc; + last_ptr = &root->fs_info->meta_alloc_cluster; if (!btrfs_test_opt(root, SSD)) empty_cluster = 64 * 1024; } - if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) - last_ptr = &root->fs_info->last_data_alloc; + if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) { + last_ptr = &root->fs_info->data_alloc_cluster; + } if (last_ptr) { - if (*last_ptr) { - hint_byte = *last_ptr; - last_wanted = *last_ptr; - } else - empty_size += empty_cluster; - } else { - empty_cluster = 0; + spin_lock(&last_ptr->lock); + if (last_ptr->block_group) + hint_byte = last_ptr->window_start; + spin_unlock(&last_ptr->lock); } + 
search_start = max(search_start, first_logical_byte(root, 0)); search_start = max(search_start, hint_byte); - if (last_wanted && search_start != last_wanted) { - last_wanted = 0; - empty_size += empty_cluster; + if (!last_ptr) { + empty_cluster = 0; + loop = 1; } - total_needed += empty_size; - block_group = btrfs_lookup_block_group(root->fs_info, search_start); - if (!block_group) - block_group = btrfs_lookup_first_block_group(root->fs_info, - search_start); - space_info = __find_space_info(root->fs_info, data); + if (search_start == hint_byte) { + block_group = btrfs_lookup_block_group(root->fs_info, + search_start); + if (block_group && block_group_bits(block_group, data)) { + down_read(&space_info->groups_sem); + goto have_block_group; + } else if (block_group) { + btrfs_put_block_group(block_group); + } + } +search: down_read(&space_info->groups_sem); - while (1) { - struct btrfs_free_space *free_space; - /* - * the only way this happens if our hint points to a block - * group thats not of the proper type, while looping this - * should never happen - */ - if (empty_size) - extra_loop = 1; + list_for_each_entry(block_group, &space_info->block_groups, list) { + u64 offset; - if (!block_group) - goto new_group_no_lock; + atomic_inc(&block_group->count); + search_start = block_group->key.objectid; +have_block_group: if (unlikely(!block_group->cached)) { mutex_lock(&block_group->cache_mutex); ret = cache_block_group(root, block_group); mutex_unlock(&block_group->cache_mutex); - if (ret) + if (ret) { + btrfs_put_block_group(block_group); break; + } } - mutex_lock(&block_group->alloc_mutex); - if (unlikely(!block_group_bits(block_group, data))) - goto new_group; - if (unlikely(block_group->ro)) - goto new_group; + goto loop; - free_space = btrfs_find_free_space(block_group, search_start, - total_needed); - if (free_space) { - u64 start = block_group->key.objectid; - u64 end = block_group->key.objectid + - block_group->key.offset; + if (last_ptr) { + /* + * the refill 
lock keeps out other + * people trying to start a new cluster + */ + spin_lock(&last_ptr->refill_lock); + offset = btrfs_alloc_from_cluster(block_group, last_ptr, + num_bytes, search_start); + if (offset) { + /* we have a block, we're done */ + spin_unlock(&last_ptr->refill_lock); + goto checks; + } - search_start = stripe_align(root, free_space->offset); + spin_lock(&last_ptr->lock); + /* + * whoops, this cluster doesn't actually point to + * this block group. Get a ref on the block + * group is does point to and try again + */ + if (!last_ptr_loop && last_ptr->block_group && + last_ptr->block_group != block_group) { + + btrfs_put_block_group(block_group); + block_group = last_ptr->block_group; + atomic_inc(&block_group->count); + spin_unlock(&last_ptr->lock); + spin_unlock(&last_ptr->refill_lock); + + last_ptr_loop = 1; + search_start = block_group->key.objectid; + goto have_block_group; + } + spin_unlock(&last_ptr->lock); - /* move on to the next group */ - if (search_start + num_bytes >= search_end) - goto new_group; + /* + * this cluster didn't work out, free it and + * start over + */ + btrfs_return_cluster_to_free_space(NULL, last_ptr); - /* move on to the next group */ - if (search_start + num_bytes > end) - goto new_group; + last_ptr_loop = 0; - if (last_wanted && search_start != last_wanted) { - total_needed += empty_cluster; - empty_size += empty_cluster; - last_wanted = 0; + /* allocate a cluster in this block group */ + ret = btrfs_find_space_cluster(trans, + block_group, last_ptr, + offset, num_bytes, + empty_cluster + empty_size); + if (ret == 0) { /* - * if search_start is still in this block group - * then we just re-search this block group + * now pull our allocation out of this + * cluster */ - if (search_start >= start && - search_start < end) { - mutex_unlock(&block_group->alloc_mutex); - continue; + offset = btrfs_alloc_from_cluster(block_group, + last_ptr, num_bytes, + search_start); + if (offset) { + /* we found one, proceed */ + 
spin_unlock(&last_ptr->refill_lock); + goto checks; } - - /* else we go to the next block group */ - goto new_group; } - - if (exclude_nr > 0 && - (search_start + num_bytes > exclude_start && - search_start < exclude_start + exclude_nr)) { - search_start = exclude_start + exclude_nr; - /* - * if search_start is still in this block group - * then we just re-search this block group - */ - if (search_start >= start && - search_start < end) { - mutex_unlock(&block_group->alloc_mutex); - last_wanted = 0; - continue; - } - - /* else we go to the next block group */ - goto new_group; + /* + * at this point we either didn't find a cluster + * or we weren't able to allocate a block from our + * cluster. Free the cluster we've been trying + * to use, and go to the next block group + */ + if (loop < 2) { + btrfs_return_cluster_to_free_space(NULL, + last_ptr); + spin_unlock(&last_ptr->refill_lock); + goto loop; } + spin_unlock(&last_ptr->refill_lock); + } - ins->objectid = search_start; - ins->offset = num_bytes; + offset = btrfs_find_space_for_alloc(block_group, search_start, + num_bytes, empty_size); + if (!offset) + goto loop; +checks: + search_start = stripe_align(root, offset); + + /* move on to the next group */ + if (search_start + num_bytes >= search_end) { + btrfs_add_free_space(block_group, offset, num_bytes); + goto loop; + } - btrfs_remove_free_space_lock(block_group, search_start, - num_bytes); - /* we are all good, lets return */ - mutex_unlock(&block_group->alloc_mutex); - break; + /* move on to the next group */ + if (search_start + num_bytes > + block_group->key.objectid + block_group->key.offset) { + btrfs_add_free_space(block_group, offset, num_bytes); + goto loop; } -new_group: - mutex_unlock(&block_group->alloc_mutex); - put_block_group(block_group); - block_group = NULL; -new_group_no_lock: - /* don't try to compare new allocations against the - * last allocation any more - */ - last_wanted = 0; - /* - * Here's how this works. 
- * loop == 0: we were searching a block group via a hint - * and didn't find anything, so we start at - * the head of the block groups and keep searching - * loop == 1: we're searching through all of the block groups - * if we hit the head again we have searched - * all of the block groups for this space and we - * need to try and allocate, if we cant error out. - * loop == 2: we allocated more space and are looping through - * all of the block groups again. - */ - if (loop == 0) { - head = &space_info->block_groups; - cur = head->next; - loop++; - } else if (loop == 1 && cur == head) { - int keep_going; - - /* at this point we give up on the empty_size - * allocations and just try to allocate the min - * space. - * - * The extra_loop field was set if an empty_size - * allocation was attempted above, and if this - * is try we need to try the loop again without - * the additional empty_size. + if (exclude_nr > 0 && + (search_start + num_bytes > exclude_start && + search_start < exclude_start + exclude_nr)) { + search_start = exclude_start + exclude_nr; + + btrfs_add_free_space(block_group, offset, num_bytes); + /* + * if search_start is still in this block group + * then we just re-search this block group */ - total_needed -= empty_size; - empty_size = 0; - keep_going = extra_loop; - loop++; + if (search_start >= block_group->key.objectid && + search_start < (block_group->key.objectid + + block_group->key.offset)) + goto have_block_group; + goto loop; + } - if (allowed_chunk_alloc && !chunk_alloc_done) { - up_read(&space_info->groups_sem); - ret = do_chunk_alloc(trans, root, num_bytes + - 2 * 1024 * 1024, data, 1); - down_read(&space_info->groups_sem); - if (ret < 0) - goto loop_check; - head = &space_info->block_groups; - /* - * we've allocated a new chunk, keep - * trying - */ - keep_going = 1; - chunk_alloc_done = 1; - } else if (!allowed_chunk_alloc) { - space_info->force_alloc = 1; - } -loop_check: - if (keep_going) { - cur = head->next; - extra_loop = 0; - } 
else { - break; - } - } else if (cur == head) { - break; + ins->objectid = search_start; + ins->offset = num_bytes; + + if (offset < search_start) + btrfs_add_free_space(block_group, offset, + search_start - offset); + BUG_ON(offset > search_start); + + /* we are all good, lets return */ + break; +loop: + btrfs_put_block_group(block_group); + } + up_read(&space_info->groups_sem); + + /* loop == 0, try to find a clustered alloc in every block group + * loop == 1, try again after forcing a chunk allocation + * loop == 2, set empty_size and empty_cluster to 0 and try again + */ + if (!ins->objectid && loop < 3 && + (empty_size || empty_cluster || allowed_chunk_alloc)) { + if (loop >= 2) { + empty_size = 0; + empty_cluster = 0; } - block_group = list_entry(cur, struct btrfs_block_group_cache, - list); - atomic_inc(&block_group->count); + if (allowed_chunk_alloc) { + ret = do_chunk_alloc(trans, root, num_bytes + + 2 * 1024 * 1024, data, 1); + allowed_chunk_alloc = 0; + } else { + space_info->force_alloc = 1; + } - search_start = block_group->key.objectid; - cur = cur->next; + if (loop < 3) { + loop++; + goto search; + } + ret = -ENOSPC; + } else if (!ins->objectid) { + ret = -ENOSPC; } /* we found what we needed */ @@ -2783,21 +2782,10 @@ loop_check: if (!(data & BTRFS_BLOCK_GROUP_DATA)) trans->block_group = block_group->key.objectid; - if (last_ptr) - *last_ptr = ins->objectid + ins->offset; + btrfs_put_block_group(block_group); ret = 0; - } else if (!ret) { - printk(KERN_ERR "btrfs searching for %llu bytes, " - "num_bytes %llu, loop %d, allowed_alloc %d\n", - (unsigned long long)total_needed, - (unsigned long long)num_bytes, - loop, allowed_chunk_alloc); - ret = -ENOSPC; } - if (block_group) - put_block_group(block_group); - up_read(&space_info->groups_sem); return ret; } @@ -2902,7 +2890,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) ret = btrfs_discard_extent(root, start, len); btrfs_add_free_space(cache, start, len); - 
put_block_group(cache); + btrfs_put_block_group(cache); update_reserved_extents(root, start, len, 0); return ret; @@ -3040,7 +3028,7 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, ret = btrfs_remove_free_space(block_group, ins->objectid, ins->offset); BUG_ON(ret); - put_block_group(block_group); + btrfs_put_block_group(block_group); ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, ref_generation, owner, ins, 1); return ret; @@ -5729,7 +5717,7 @@ next: WARN_ON(block_group->reserved > 0); WARN_ON(btrfs_block_group_used(&block_group->item) > 0); spin_unlock(&block_group->lock); - put_block_group(block_group); + btrfs_put_block_group(block_group); ret = 0; out: btrfs_free_path(path); @@ -5856,9 +5844,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); - mutex_init(&cache->alloc_mutex); + spin_lock_init(&cache->tree_lock); mutex_init(&cache->cache_mutex); INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); read_extent_buffer(leaf, &cache->item, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(cache->item)); @@ -5912,9 +5901,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); - mutex_init(&cache->alloc_mutex); + spin_lock_init(&cache->tree_lock); mutex_init(&cache->cache_mutex); INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); btrfs_set_block_group_used(&cache->item, bytes_used); btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); @@ -5974,8 +5964,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&block_group->space_info->lock); block_group->space_info->full = 0; - put_block_group(block_group); - put_block_group(block_group); + btrfs_put_block_group(block_group); + btrfs_put_block_group(block_group); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 
if (ret > 0) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 08085af089e2..eb2bee8b7fbf 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2884,25 +2884,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, disko = 0; flags = 0; - switch (em->block_start) { - case EXTENT_MAP_LAST_BYTE: + if (em->block_start == EXTENT_MAP_LAST_BYTE) { end = 1; flags |= FIEMAP_EXTENT_LAST; - break; - case EXTENT_MAP_HOLE: + } else if (em->block_start == EXTENT_MAP_HOLE) { flags |= FIEMAP_EXTENT_UNWRITTEN; - break; - case EXTENT_MAP_INLINE: + } else if (em->block_start == EXTENT_MAP_INLINE) { flags |= (FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED); - break; - case EXTENT_MAP_DELALLOC: + } else if (em->block_start == EXTENT_MAP_DELALLOC) { flags |= (FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN); - break; - default: + } else { disko = em->block_start; - break; } if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 50da69da20ce..b187917b36fa 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -234,7 +234,6 @@ int add_extent_mapping(struct extent_map_tree *tree, rb = tree_insert(&tree->map, em->start, &em->rb_node); if (rb) { ret = -EEXIST; - free_extent_map(merge); goto out; } atomic_inc(&em->refs); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d1e5f0e84c58..768b9523662d 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -18,6 +18,15 @@ #include <linux/sched.h> #include "ctree.h" +#include "free-space-cache.h" +#include "transaction.h" + +struct btrfs_free_space { + struct rb_node bytes_index; + struct rb_node offset_index; + u64 offset; + u64 bytes; +}; static int tree_insert_offset(struct rb_root *root, u64 offset, struct rb_node *node) @@ -68,14 +77,24 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes, } /* - * searches the tree for the given 
offset. If contains is set we will return - * the free space that contains the given offset. If contains is not set we - * will return the free space that starts at or after the given offset and is - * at least bytes long. + * searches the tree for the given offset. + * + * fuzzy == 1: this is used for allocations where we are given a hint of where + * to look for free space. Because the hint may not be completely on an offset + * mark, or the hint may no longer point to free space we need to fudge our + * results a bit. So we look for free space starting at or after offset with at + * least bytes size. We prefer to find as close to the given offset as we can. + * Also if the offset is within a free space range, then we will return the free + * space that contains the given offset, which means we can return a free space + * chunk with an offset before the provided offset. + * + * fuzzy == 0: this is just a normal tree search. Give us the free space that + * starts at the given offset which is at least bytes size, and if its not there + * return NULL. 
*/ static struct btrfs_free_space *tree_search_offset(struct rb_root *root, u64 offset, u64 bytes, - int contains) + int fuzzy) { struct rb_node *n = root->rb_node; struct btrfs_free_space *entry, *ret = NULL; @@ -84,13 +103,14 @@ static struct btrfs_free_space *tree_search_offset(struct rb_root *root, entry = rb_entry(n, struct btrfs_free_space, offset_index); if (offset < entry->offset) { - if (!contains && + if (fuzzy && (!ret || entry->offset < ret->offset) && (bytes <= entry->bytes)) ret = entry; n = n->rb_left; } else if (offset > entry->offset) { - if ((entry->offset + entry->bytes - 1) >= offset && + if (fuzzy && + (entry->offset + entry->bytes - 1) >= offset && bytes <= entry->bytes) { ret = entry; break; @@ -171,6 +191,7 @@ static int link_free_space(struct btrfs_block_group_cache *block_group, int ret = 0; + BUG_ON(!info->bytes); ret = tree_insert_offset(&block_group->free_space_offset, info->offset, &info->offset_index); if (ret) @@ -184,108 +205,70 @@ static int link_free_space(struct btrfs_block_group_cache *block_group, return ret; } -static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) { struct btrfs_free_space *right_info; struct btrfs_free_space *left_info; struct btrfs_free_space *info = NULL; - struct btrfs_free_space *alloc_info; int ret = 0; - alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); - if (!alloc_info) + info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); + if (!info) return -ENOMEM; + info->offset = offset; + info->bytes = bytes; + + spin_lock(&block_group->tree_lock); + /* * first we want to see if there is free space adjacent to the range we * are adding, if there is remove that struct and add a new one to * cover the entire range */ right_info = tree_search_offset(&block_group->free_space_offset, - offset+bytes, 0, 1); + offset+bytes, 0, 0); left_info = 
tree_search_offset(&block_group->free_space_offset, offset-1, 0, 1); - if (right_info && right_info->offset == offset+bytes) { + if (right_info) { unlink_free_space(block_group, right_info); - info = right_info; - info->offset = offset; - info->bytes += bytes; - } else if (right_info && right_info->offset != offset+bytes) { - printk(KERN_ERR "btrfs adding space in the middle of an " - "existing free space area. existing: " - "offset=%llu, bytes=%llu. new: offset=%llu, " - "bytes=%llu\n", (unsigned long long)right_info->offset, - (unsigned long long)right_info->bytes, - (unsigned long long)offset, - (unsigned long long)bytes); - BUG(); + info->bytes += right_info->bytes; + kfree(right_info); } - if (left_info) { + if (left_info && left_info->offset + left_info->bytes == offset) { unlink_free_space(block_group, left_info); - - if (unlikely((left_info->offset + left_info->bytes) != - offset)) { - printk(KERN_ERR "btrfs free space to the left " - "of new free space isn't " - "quite right. existing: offset=%llu, " - "bytes=%llu. 
new: offset=%llu, bytes=%llu\n", - (unsigned long long)left_info->offset, - (unsigned long long)left_info->bytes, - (unsigned long long)offset, - (unsigned long long)bytes); - BUG(); - } - - if (info) { - info->offset = left_info->offset; - info->bytes += left_info->bytes; - kfree(left_info); - } else { - info = left_info; - info->bytes += bytes; - } + info->offset = left_info->offset; + info->bytes += left_info->bytes; + kfree(left_info); } - if (info) { - ret = link_free_space(block_group, info); - if (!ret) - info = NULL; - goto out; - } - - info = alloc_info; - alloc_info = NULL; - info->offset = offset; - info->bytes = bytes; - ret = link_free_space(block_group, info); if (ret) kfree(info); -out: + + spin_unlock(&block_group->tree_lock); + if (ret) { printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); - if (ret == -EEXIST) - BUG(); + BUG_ON(ret == -EEXIST); } - kfree(alloc_info); - return ret; } -static int -__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) { struct btrfs_free_space *info; int ret = 0; + spin_lock(&block_group->tree_lock); + info = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); - if (info && info->offset == offset) { if (info->bytes < bytes) { printk(KERN_ERR "Found free space at %llu, size %llu," @@ -295,12 +278,14 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, (unsigned long long)bytes); WARN_ON(1); ret = -EINVAL; + spin_unlock(&block_group->tree_lock); goto out; } unlink_free_space(block_group, info); if (info->bytes == bytes) { kfree(info); + spin_unlock(&block_group->tree_lock); goto out; } @@ -308,6 +293,7 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, info->bytes -= bytes; ret = link_free_space(block_group, info); + spin_unlock(&block_group->tree_lock); BUG_ON(ret); } else if (info && info->offset < offset && 
info->offset + info->bytes >= offset + bytes) { @@ -333,70 +319,33 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, */ kfree(info); } - + spin_unlock(&block_group->tree_lock); /* step two, insert a new info struct to cover anything * before the hole */ - ret = __btrfs_add_free_space(block_group, old_start, - offset - old_start); + ret = btrfs_add_free_space(block_group, old_start, + offset - old_start); BUG_ON(ret); } else { + spin_unlock(&block_group->tree_lock); + if (!info) { + printk(KERN_ERR "couldn't find space %llu to free\n", + (unsigned long long)offset); + printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n", + block_group->cached, block_group->key.objectid, + block_group->key.offset); + btrfs_dump_free_space(block_group, bytes); + } else if (info) { + printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, " + "but wanted offset=%llu bytes=%llu\n", + info->offset, info->bytes, offset, bytes); + } WARN_ON(1); } out: return ret; } -int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) -{ - int ret; - struct btrfs_free_space *sp; - - mutex_lock(&block_group->alloc_mutex); - ret = __btrfs_add_free_space(block_group, offset, bytes); - sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); - BUG_ON(!sp); - mutex_unlock(&block_group->alloc_mutex); - - return ret; -} - -int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) -{ - int ret; - struct btrfs_free_space *sp; - - ret = __btrfs_add_free_space(block_group, offset, bytes); - sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); - BUG_ON(!sp); - - return ret; -} - -int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) -{ - int ret = 0; - - mutex_lock(&block_group->alloc_mutex); - ret = __btrfs_remove_free_space(block_group, offset, bytes); - mutex_unlock(&block_group->alloc_mutex); - - return ret; -} - -int 
btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) -{ - int ret; - - ret = __btrfs_remove_free_space(block_group, offset, bytes); - - return ret; -} - void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, u64 bytes) { @@ -408,6 +357,8 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, info = rb_entry(n, struct btrfs_free_space, offset_index); if (info->bytes >= bytes) count++; + printk(KERN_ERR "entry offset %llu, bytes %llu\n", info->offset, + info->bytes); } printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" "\n", count); @@ -428,68 +379,337 @@ u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group) return ret; } +/* + * for a given cluster, put all of its extents back into the free + * space cache. If the block group passed doesn't match the block group + * pointed to by the cluster, someone else raced in and freed the + * cluster already. In that case, we just return without changing anything + */ +static int +__btrfs_return_cluster_to_free_space( + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster) +{ + struct btrfs_free_space *entry; + struct rb_node *node; + + spin_lock(&cluster->lock); + if (cluster->block_group != block_group) + goto out; + + cluster->window_start = 0; + node = rb_first(&cluster->root); + while(node) { + entry = rb_entry(node, struct btrfs_free_space, offset_index); + node = rb_next(&entry->offset_index); + rb_erase(&entry->offset_index, &cluster->root); + link_free_space(block_group, entry); + } + list_del_init(&cluster->block_group_list); + + btrfs_put_block_group(cluster->block_group); + cluster->block_group = NULL; + cluster->root.rb_node = NULL; +out: + spin_unlock(&cluster->lock); + return 0; +} + void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) { struct btrfs_free_space *info; struct rb_node *node; + struct btrfs_free_cluster *cluster; + 
struct btrfs_free_cluster *safe; + + spin_lock(&block_group->tree_lock); + + list_for_each_entry_safe(cluster, safe, &block_group->cluster_list, + block_group_list) { + + WARN_ON(cluster->block_group != block_group); + __btrfs_return_cluster_to_free_space(block_group, cluster); + } - mutex_lock(&block_group->alloc_mutex); while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { info = rb_entry(node, struct btrfs_free_space, bytes_index); unlink_free_space(block_group, info); kfree(info); if (need_resched()) { - mutex_unlock(&block_group->alloc_mutex); + spin_unlock(&block_group->tree_lock); cond_resched(); - mutex_lock(&block_group->alloc_mutex); + spin_lock(&block_group->tree_lock); } } - mutex_unlock(&block_group->alloc_mutex); + spin_unlock(&block_group->tree_lock); } -#if 0 -static struct btrfs_free_space *btrfs_find_free_space_offset(struct - btrfs_block_group_cache - *block_group, u64 offset, - u64 bytes) +u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes, u64 empty_size) { - struct btrfs_free_space *ret; + struct btrfs_free_space *entry = NULL; + u64 ret = 0; - mutex_lock(&block_group->alloc_mutex); - ret = tree_search_offset(&block_group->free_space_offset, offset, - bytes, 0); - mutex_unlock(&block_group->alloc_mutex); + spin_lock(&block_group->tree_lock); + entry = tree_search_offset(&block_group->free_space_offset, offset, + bytes + empty_size, 1); + if (!entry) + entry = tree_search_bytes(&block_group->free_space_bytes, + offset, bytes + empty_size); + if (entry) { + unlink_free_space(block_group, entry); + ret = entry->offset; + entry->offset += bytes; + entry->bytes -= bytes; + + if (!entry->bytes) + kfree(entry); + else + link_free_space(block_group, entry); + } + spin_unlock(&block_group->tree_lock); return ret; } -static struct btrfs_free_space *btrfs_find_free_space_bytes(struct - btrfs_block_group_cache - *block_group, u64 offset, - u64 bytes) +/* + * given a cluster, put all of its 
extents back into the free space + * cache. If a block group is passed, this function will only free + * a cluster that belongs to the passed block group. + * + * Otherwise, it'll get a reference on the block group pointed to by the + * cluster and remove the cluster from it. + */ +int btrfs_return_cluster_to_free_space( + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster) { - struct btrfs_free_space *ret; + int ret; - mutex_lock(&block_group->alloc_mutex); + /* first, get a safe pointer to the block group */ + spin_lock(&cluster->lock); + if (!block_group) { + block_group = cluster->block_group; + if (!block_group) { + spin_unlock(&cluster->lock); + return 0; + } + } else if (cluster->block_group != block_group) { + /* someone else has already freed it don't redo their work */ + spin_unlock(&cluster->lock); + return 0; + } + atomic_inc(&block_group->count); + spin_unlock(&cluster->lock); - ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes); - mutex_unlock(&block_group->alloc_mutex); + /* now return any extents the cluster had on it */ + spin_lock(&block_group->tree_lock); + ret = __btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&block_group->tree_lock); + /* finally drop our ref */ + btrfs_put_block_group(block_group); return ret; } -#endif -struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache - *block_group, u64 offset, - u64 bytes) +/* + * given a cluster, try to allocate 'bytes' from it, returns 0 + * if it couldn't find anything suitably large, or a logical disk offset + * if things worked out + */ +u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, u64 bytes, + u64 min_start) +{ + struct btrfs_free_space *entry = NULL; + struct rb_node *node; + u64 ret = 0; + + spin_lock(&cluster->lock); + if (bytes > cluster->max_size) + goto out; + + if (cluster->block_group != block_group) + goto out; + + node = 
rb_first(&cluster->root); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + + while(1) { + if (entry->bytes < bytes || entry->offset < min_start) { + struct rb_node *node; + + node = rb_next(&entry->offset_index); + if (!node) + break; + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + continue; + } + ret = entry->offset; + + entry->offset += bytes; + entry->bytes -= bytes; + + if (entry->bytes == 0) { + rb_erase(&entry->offset_index, &cluster->root); + kfree(entry); + } + break; + } +out: + spin_unlock(&cluster->lock); + return ret; +} + +/* + * here we try to find a cluster of blocks in a block group. The goal + * is to find at least bytes free and up to empty_size + bytes free. + * We might not find them all in one contiguous area. + * + * returns zero and sets up cluster if things worked out, otherwise + * it returns -enospc + */ +int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 empty_size) { - struct btrfs_free_space *ret = NULL; + struct btrfs_free_space *entry = NULL; + struct rb_node *node; + struct btrfs_free_space *next; + struct btrfs_free_space *last; + u64 min_bytes; + u64 window_start; + u64 window_free; + u64 max_extent = 0; + int total_retries = 0; + int ret; + + /* for metadata, allow allocates with more holes */ + if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { + /* + * we want to do larger allocations when we are + * flushing out the delayed refs, it helps prevent + * making more work as we go along. 
+ */ + if (trans->transaction->delayed_refs.flushing) + min_bytes = max(bytes, (bytes + empty_size) >> 1); + else + min_bytes = max(bytes, (bytes + empty_size) >> 4); + } else + min_bytes = max(bytes, (bytes + empty_size) >> 2); + + spin_lock(&block_group->tree_lock); + spin_lock(&cluster->lock); + + /* someone already found a cluster, hooray */ + if (cluster->block_group) { + ret = 0; + goto out; + } +again: + min_bytes = min(min_bytes, bytes + empty_size); + entry = tree_search_bytes(&block_group->free_space_bytes, + offset, min_bytes); + if (!entry) { + ret = -ENOSPC; + goto out; + } + window_start = entry->offset; + window_free = entry->bytes; + last = entry; + max_extent = entry->bytes; + + while(1) { + /* out window is just right, lets fill it */ + if (window_free >= bytes + empty_size) + break; - ret = tree_search_offset(&block_group->free_space_offset, offset, - bytes, 0); - if (!ret) - ret = tree_search_bytes(&block_group->free_space_bytes, - offset, bytes); + node = rb_next(&last->offset_index); + if (!node) { + ret = -ENOSPC; + goto out; + } + next = rb_entry(node, struct btrfs_free_space, offset_index); + + /* + * we haven't filled the empty size and the window is + * very large. 
reset and try again + */ + if (next->offset - window_start > (bytes + empty_size) * 2) { + entry = next; + window_start = entry->offset; + window_free = entry->bytes; + last = entry; + max_extent = 0; + total_retries++; + if (total_retries % 256 == 0) { + if (min_bytes >= (bytes + empty_size)) { + ret = -ENOSPC; + goto out; + } + /* + * grow our allocation a bit, we're not having + * much luck + */ + min_bytes *= 2; + goto again; + } + } else { + last = next; + window_free += next->bytes; + if (entry->bytes > max_extent) + max_extent = entry->bytes; + } + } + + cluster->window_start = entry->offset; + + /* + * now we've found our entries, pull them out of the free space + * cache and put them into the cluster rbtree + * + * The cluster includes an rbtree, but only uses the offset index + * of each free space cache entry. + */ + while(1) { + node = rb_next(&entry->offset_index); + unlink_free_space(block_group, entry); + ret = tree_insert_offset(&cluster->root, entry->offset, + &entry->offset_index); + BUG_ON(ret); + + if (!node || entry == last) + break; + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + } + ret = 0; + cluster->max_size = max_extent; + atomic_inc(&block_group->count); + list_add_tail(&cluster->block_group_list, &block_group->cluster_list); + cluster->block_group = block_group; +out: + spin_unlock(&cluster->lock); + spin_unlock(&block_group->tree_lock); return ret; } + +/* + * simple code to zero out a cluster + */ +void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) +{ + spin_lock_init(&cluster->lock); + spin_lock_init(&cluster->refill_lock); + cluster->root.rb_node = NULL; + cluster->max_size = 0; + INIT_LIST_HEAD(&cluster->block_group_list); + cluster->block_group = NULL; +} + diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h new file mode 100644 index 000000000000..ab0bdc0a63ce --- /dev/null +++ b/fs/btrfs/free-space-cache.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2009 Oracle. 
All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_FREE_SPACE_CACHE +#define __BTRFS_FREE_SPACE_CACHE + +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size); +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size); +void btrfs_remove_free_space_cache(struct btrfs_block_group_cache + *block_group); +u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes, u64 empty_size); +void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, + u64 bytes); +u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); +int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 empty_size); +void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster); +u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, u64 bytes, + u64 min_start); +int btrfs_return_cluster_to_free_space( + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster); +#endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 06d8db5afb08..a0d1dd492a58 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ 
-3481,8 +3481,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if (dir) { ret = btrfs_set_inode_index(dir, index); - if (ret) + if (ret) { + iput(inode); return ERR_PTR(ret); + } } /* * index_cnt is ignored for everything but a dir, @@ -3565,6 +3567,7 @@ fail: if (dir) BTRFS_I(dir)->index_cnt--; btrfs_free_path(path); + iput(inode); return ERR_PTR(ret); } diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index a5310c0f41e2..1c36e5cd8f55 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -60,8 +60,8 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb) /* * unfortunately, many of the places that currently set a lock to blocking - * don't end up blocking for every long, and often they don't block - * at all. For a dbench 50 run, if we don't spin one the blocking bit + * don't end up blocking for very long, and often they don't block + * at all. For a dbench 50 run, if we don't spin on the blocking bit * at all, the context switch rate can jump up to 400,000/sec or more. 
* * So, we're still stuck with this crummy spin on the blocking bit, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 19a4daf03ccb..9744af9d71e9 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -24,6 +24,7 @@ #include <linux/highmem.h> #include <linux/time.h> #include <linux/init.h> +#include <linux/seq_file.h> #include <linux/string.h> #include <linux/smp_lock.h> #include <linux/backing-dev.h> @@ -66,7 +67,8 @@ static void btrfs_put_super(struct super_block *sb) enum { Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, - Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err, + Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog, + Opt_flushoncommit, Opt_err, }; static match_table_t tokens = { @@ -83,6 +85,8 @@ static match_table_t tokens = { {Opt_compress, "compress"}, {Opt_ssd, "ssd"}, {Opt_noacl, "noacl"}, + {Opt_notreelog, "notreelog"}, + {Opt_flushoncommit, "flushoncommit"}, {Opt_err, NULL}, }; @@ -222,6 +226,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_noacl: root->fs_info->sb->s_flags &= ~MS_POSIXACL; break; + case Opt_notreelog: + printk(KERN_INFO "btrfs: disabling tree log\n"); + btrfs_set_opt(info->mount_opt, NOTREELOG); + break; + case Opt_flushoncommit: + printk(KERN_INFO "btrfs: turning on flush-on-commit\n"); + btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); + break; default: break; } @@ -363,9 +375,8 @@ fail_close: int btrfs_sync_fs(struct super_block *sb, int wait) { struct btrfs_trans_handle *trans; - struct btrfs_root *root; + struct btrfs_root *root = btrfs_sb(sb); int ret; - root = btrfs_sb(sb); if (sb->s_flags & MS_RDONLY) return 0; @@ -385,6 +396,41 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return ret; } +static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + struct btrfs_root *root = btrfs_sb(vfs->mnt_sb); + struct btrfs_fs_info *info = root->fs_info; + + if 
(btrfs_test_opt(root, DEGRADED)) + seq_puts(seq, ",degraded"); + if (btrfs_test_opt(root, NODATASUM)) + seq_puts(seq, ",nodatasum"); + if (btrfs_test_opt(root, NODATACOW)) + seq_puts(seq, ",nodatacow"); + if (btrfs_test_opt(root, NOBARRIER)) + seq_puts(seq, ",nobarrier"); + if (info->max_extent != (u64)-1) + seq_printf(seq, ",max_extent=%llu", info->max_extent); + if (info->max_inline != 8192 * 1024) + seq_printf(seq, ",max_inline=%llu", info->max_inline); + if (info->alloc_start != 0) + seq_printf(seq, ",alloc_start=%llu", info->alloc_start); + if (info->thread_pool_size != min_t(unsigned long, + num_online_cpus() + 2, 8)) + seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); + if (btrfs_test_opt(root, COMPRESS)) + seq_puts(seq, ",compress"); + if (btrfs_test_opt(root, SSD)) + seq_puts(seq, ",ssd"); + if (btrfs_test_opt(root, NOTREELOG)) + seq_puts(seq, ",no-treelog"); + if (btrfs_test_opt(root, FLUSHONCOMMIT)) + seq_puts(seq, ",flush-on-commit"); + if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) + seq_puts(seq, ",noacl"); + return 0; +} + static void btrfs_write_super(struct super_block *sb) { sb->s_dirt = 0; @@ -630,7 +676,7 @@ static struct super_operations btrfs_super_ops = { .put_super = btrfs_put_super, .write_super = btrfs_write_super, .sync_fs = btrfs_sync_fs, - .show_options = generic_show_options, + .show_options = btrfs_show_options, .write_inode = btrfs_write_inode, .dirty_inode = btrfs_dirty_inode, .alloc_inode = btrfs_alloc_inode, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 664782c6a2df..2869b3361eb6 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -53,8 +53,6 @@ static noinline int join_transaction(struct btrfs_root *root) GFP_NOFS); BUG_ON(!cur_trans); root->fs_info->generation++; - root->fs_info->last_alloc = 0; - root->fs_info->last_data_alloc = 0; cur_trans->num_writers = 1; cur_trans->num_joined = 0; cur_trans->transid = root->fs_info->generation; @@ -974,6 +972,7 @@ int 
btrfs_commit_transaction(struct btrfs_trans_handle *trans, int ret; int should_grow = 0; unsigned long now = get_seconds(); + int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); btrfs_run_ordered_operations(root, 0); @@ -1053,7 +1052,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_unlock(&root->fs_info->trans_mutex); - if (snap_pending) { + if (flush_on_commit || snap_pending) { + if (flush_on_commit) + btrfs_start_delalloc_inodes(root); ret = btrfs_wait_ordered_extents(root, 1); BUG_ON(ret); } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index fc9b87a7975b..25f20ea11f27 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -262,11 +262,9 @@ static int process_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct walk_control *wc, u64 gen) { - if (wc->pin) { - mutex_lock(&log->fs_info->pinned_mutex); + if (wc->pin) btrfs_update_pinned_extents(log->fs_info->extent_root, eb->start, eb->len, 1); - } if (btrfs_buffer_uptodate(eb, gen)) { if (wc->write) @@ -1224,8 +1222,7 @@ insert: ret = insert_one_name(trans, root, path, key->objectid, key->offset, name, name_len, log_type, &log_key); - if (ret && ret != -ENOENT) - BUG(); + BUG_ON(ret && ret != -ENOENT); goto out; } @@ -2900,6 +2897,11 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, sb = inode->i_sb; + if (btrfs_test_opt(root, NOTREELOG)) { + ret = 1; + goto end_no_trans; + } + if (root->fs_info->last_trans_log_full_commit > root->fs_info->last_trans_committed) { ret = 1; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index dd06e18e5aac..e0913e469728 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -20,6 +20,7 @@ #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/random.h> +#include <linux/iocontext.h> #include <asm/div64.h> #include "compat.h" #include "ctree.h" @@ -145,8 +146,9 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) int again = 0; unsigned long num_run = 0; unsigned 
long limit; + unsigned long last_waited = 0; - bdi = device->bdev->bd_inode->i_mapping->backing_dev_info; + bdi = blk_get_backing_dev_info(device->bdev); fs_info = device->dev_root->fs_info; limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; @@ -207,7 +209,32 @@ loop_lock: if (pending && bdi_write_congested(bdi) && num_run > 16 && fs_info->fs_devices->open_devices > 1) { struct bio *old_head; + struct io_context *ioc; + ioc = current->io_context; + + /* + * the main goal here is that we don't want to + * block if we're going to be able to submit + * more requests without blocking. + * + * This code does two great things, it pokes into + * the elevator code from a filesystem _and_ + * it makes assumptions about how batching works. + */ + if (ioc && ioc->nr_batch_requests > 0 && + time_before(jiffies, ioc->last_waited + HZ/50UL) && + (last_waited == 0 || + ioc->last_waited == last_waited)) { + /* + * we want to go through our batch of + * requests and stop. So, we copy out + * the ioc->last_waited time and test + * against it before looping + */ + last_waited = ioc->last_waited; + continue; + } spin_lock(&device->io_lock); old_head = device->pending_bios; @@ -231,6 +258,18 @@ loop_lock: if (device->pending_bios) goto loop_lock; spin_unlock(&device->io_lock); + + /* + * IO has already been through a long path to get here. Checksumming, + * async helper threads, perhaps compression. We've done a pretty + * good job of collecting a batch of IO and should just unplug + * the device right away. + * + * This will help anyone who is waiting on the IO, they might have + * already unplugged, but managed to do so before the bio they + * cared about found its way down here. 
+ */ + blk_run_backing_dev(bdi, NULL); done: return 0; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 86c44e9ae110..2185de72ff7d 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -76,7 +76,7 @@ struct btrfs_device { struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ - /* the device with this id has the most recent coyp of the super */ + /* the device with this id has the most recent copy of the super */ u64 latest_devid; u64 latest_trans; u64 num_devices; diff --git a/fs/compat.c b/fs/compat.c index 1c859dae758f..3f84d5f15889 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1236,7 +1236,7 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, asmlinkage ssize_t compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, u32 pos_high, u32 pos_low) + unsigned long vlen, u32 pos_low, u32 pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; struct file *file; @@ -1293,7 +1293,7 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, asmlinkage ssize_t compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, u32 pos_high, u32 pos_low) + unsigned long vlen, u32 pos_low, u32 pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; struct file *file; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 81ae9ea3c6e1..0662ba6de85a 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -30,6 +30,7 @@ static struct vfsmount *debugfs_mount; static int debugfs_mount_count; +static bool debugfs_registered; static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev) { @@ -496,6 +497,16 @@ exit: } EXPORT_SYMBOL_GPL(debugfs_rename); +/** + * debugfs_initialized - Tells whether debugfs has been registered + */ +bool debugfs_initialized(void) +{ + return debugfs_registered; +} +EXPORT_SYMBOL_GPL(debugfs_initialized); + + static struct kobject *debug_kobj; static int __init 
debugfs_init(void) @@ -509,11 +520,16 @@ static int __init debugfs_init(void) retval = register_filesystem(&debug_fs_type); if (retval) kobject_put(debug_kobj); + else + debugfs_registered = true; + return retval; } static void __exit debugfs_exit(void) { + debugfs_registered = false; + simple_release_fs(&debugfs_mount, &debugfs_mount_count); unregister_filesystem(&debug_fs_type); kobject_put(debug_kobj); diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 7505482a08fa..418b6f3b0ae8 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -18,7 +18,7 @@ config EXT4_FS filesystem; while there will be some performance gains from the delayed allocation and inode table readahead, the best performance gains will require enabling ext4 features in the - filesystem, or formating a new filesystem as an ext4 + filesystem, or formatting a new filesystem as an ext4 filesystem initially. To compile this file system support as a module, choose M here. The diff --git a/fs/file_table.c b/fs/file_table.c index b74a8e1da913..54018fe48840 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -169,7 +169,6 @@ struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry, fmode_t mode, const struct file_operations *fop) { struct file *file; - struct path; file = get_empty_filp(); if (!file) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index eed480639902..91013ff7dd53 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -435,7 +435,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * If older_than_this is non-NULL, then only write out inodes which * had their first dirtying at a time earlier than *older_than_this. * - * If we're a pdlfush thread, then implement pdflush collision avoidance + * If we're a pdflush thread, then implement pdflush collision avoidance * against the entire list. * * If `bdi' is non-zero then we're being asked to writeback a specific queue. 
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 06da05261e04..8b8eebc5614b 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1032,6 +1032,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) fuse_put_request(fc, req); return -ENOMEM; } + req->out.argpages = 1; req->num_pages = 1; req->pages[0] = page; fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4e340fedf768..2b25133524a3 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -386,7 +386,6 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, req->in.numargs = 1; req->in.args[0].size = sizeof(struct fuse_read_in); req->in.args[0].value = inarg; - req->out.argpages = 1; req->out.argvar = 1; req->out.numargs = 1; req->out.args[0].size = count; @@ -453,6 +452,7 @@ static int fuse_readpage(struct file *file, struct page *page) attr_ver = fuse_get_attr_version(fc); req->out.page_zeroing = 1; + req->out.argpages = 1; req->num_pages = 1; req->pages[0] = page; num_read = fuse_send_read(req, file, inode, pos, count, NULL); @@ -510,6 +510,8 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file, struct fuse_conn *fc = get_fuse_conn(inode); loff_t pos = page_offset(req->pages[0]); size_t count = req->num_pages << PAGE_CACHE_SHIFT; + + req->out.argpages = 1; req->out.page_zeroing = 1; fuse_read_fill(req, file, inode, pos, count, FUSE_READ); req->misc.read.attr_ver = fuse_get_attr_version(fc); @@ -621,7 +623,6 @@ static void fuse_write_fill(struct fuse_req *req, struct file *file, inarg->flags = file ? 
file->f_flags : 0; req->in.h.opcode = FUSE_WRITE; req->in.h.nodeid = get_node_id(inode); - req->in.argpages = 1; req->in.numargs = 2; if (fc->minor < 9) req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; @@ -695,6 +696,7 @@ static int fuse_buffered_write(struct file *file, struct inode *inode, if (IS_ERR(req)) return PTR_ERR(req); + req->in.argpages = 1; req->num_pages = 1; req->pages[0] = page; req->page_offset = offset; @@ -771,6 +773,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, size_t count = 0; int err; + req->in.argpages = 1; req->page_offset = offset; do { @@ -935,21 +938,28 @@ static void fuse_release_user_pages(struct fuse_req *req, int write) } static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, - unsigned nbytes, int write) + unsigned *nbytesp, int write) { + unsigned nbytes = *nbytesp; unsigned long user_addr = (unsigned long) buf; unsigned offset = user_addr & ~PAGE_MASK; int npages; - /* This doesn't work with nfsd */ - if (!current->mm) - return -EPERM; + /* Special case for kernel I/O: can copy directly into the buffer */ + if (segment_eq(get_fs(), KERNEL_DS)) { + if (write) + req->in.args[1].value = (void *) user_addr; + else + req->out.args[0].value = (void *) user_addr; + + return 0; + } nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ); down_read(¤t->mm->mmap_sem); - npages = get_user_pages(current, current->mm, user_addr, npages, write, + npages = get_user_pages(current, current->mm, user_addr, npages, !write, 0, req->pages, NULL); up_read(¤t->mm->mmap_sem); if (npages < 0) @@ -957,6 +967,15 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, req->num_pages = npages; req->page_offset = offset; + + if (write) + req->in.argpages = 1; + else + req->out.argpages = 1; + + nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset; + *nbytesp = 
min(*nbytesp, nbytes); + return 0; } @@ -979,15 +998,13 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf, while (count) { size_t nres; - size_t nbytes_limit = min(count, nmax); - size_t nbytes; - int err = fuse_get_user_pages(req, buf, nbytes_limit, !write); + size_t nbytes = min(count, nmax); + int err = fuse_get_user_pages(req, buf, &nbytes, write); if (err) { res = err; break; } - nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset; - nbytes = min(nbytes_limit, nbytes); + if (write) nres = fuse_send_write(req, file, inode, pos, nbytes, current->files); @@ -1163,6 +1180,7 @@ static int fuse_writepage_locked(struct page *page) fuse_write_fill(req, NULL, ff, inode, page_offset(page), 0, 1); copy_highpage(tmp_page, page); + req->in.argpages = 1; req->num_pages = 1; req->pages[0] = tmp_page; req->page_offset = 0; @@ -1274,6 +1292,15 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } +static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma) +{ + /* Can't provide the coherency needed for MAP_SHARED */ + if (vma->vm_flags & VM_MAYSHARE) + return -ENODEV; + + return generic_file_mmap(file, vma); +} + static int convert_fuse_file_lock(const struct fuse_file_lock *ffl, struct file_lock *fl) { @@ -1908,6 +1935,7 @@ static const struct file_operations fuse_direct_io_file_operations = { .llseek = fuse_file_llseek, .read = fuse_direct_read, .write = fuse_direct_write, + .mmap = fuse_direct_mmap, .open = fuse_open, .flush = fuse_flush, .release = fuse_release, @@ -1917,7 +1945,7 @@ static const struct file_operations fuse_direct_io_file_operations = { .unlocked_ioctl = fuse_file_ioctl, .compat_ioctl = fuse_file_compat_ioctl, .poll = fuse_file_poll, - /* no mmap and splice_read */ + /* no splice_read */ }; static const struct address_space_operations fuse_file_aops = { diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 19e3a96aa02c..678a067d9251 100644 --- a/fs/ocfs2/alloc.c +++ 
b/fs/ocfs2/alloc.c @@ -294,6 +294,55 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters, }; +static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + dx_root->dr_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + return le64_to_cpu(dx_root->dr_last_eb_blk); +} + +static void ocfs2_dx_root_update_clusters(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + le32_add_cpu(&dx_root->dr_clusters, clusters); +} + +static int ocfs2_dx_root_sanity_check(struct inode *inode, + struct ocfs2_extent_tree *et) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root)); + + return 0; +} + +static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + et->et_root_el = &dx_root->dr_list; +} + +static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = { + .eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk, + .eo_update_clusters = ocfs2_dx_root_update_clusters, + .eo_sanity_check = ocfs2_dx_root_sanity_check, + .eo_fill_root_el = ocfs2_dx_root_fill_root_el, +}; + static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, struct inode *inode, struct buffer_head *bh, @@ -339,6 +388,14 @@ void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, &ocfs2_xattr_value_et_ops); } +void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_dr, + NULL, &ocfs2_dx_root_et_ops); +} + static inline void 
ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et, u64 new_last_eb_blk) { diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index cceff5c37f47..353254ba29e1 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -75,6 +75,9 @@ struct ocfs2_xattr_value_buf; void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, struct inode *inode, struct ocfs2_xattr_value_buf *vb); +void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh); /* * Read an extent block into *bh. If *bh is NULL, a bh will be diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 8e1709a679b7..b2c52b3a1484 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1956,15 +1956,16 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping, } const struct address_space_operations ocfs2_aops = { - .readpage = ocfs2_readpage, - .readpages = ocfs2_readpages, - .writepage = ocfs2_writepage, - .write_begin = ocfs2_write_begin, - .write_end = ocfs2_write_end, - .bmap = ocfs2_bmap, - .sync_page = block_sync_page, - .direct_IO = ocfs2_direct_IO, - .invalidatepage = ocfs2_invalidatepage, - .releasepage = ocfs2_releasepage, - .migratepage = buffer_migrate_page, + .readpage = ocfs2_readpage, + .readpages = ocfs2_readpages, + .writepage = ocfs2_writepage, + .write_begin = ocfs2_write_begin, + .write_end = ocfs2_write_end, + .bmap = ocfs2_bmap, + .sync_page = block_sync_page, + .direct_IO = ocfs2_direct_IO, + .invalidatepage = ocfs2_invalidatepage, + .releasepage = ocfs2_releasepage, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, }; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 04697ba7f73e..4f85eceab376 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -33,6 +33,7 @@ #include <linux/random.h> #include <linux/crc32.h> #include <linux/time.h> +#include <linux/debugfs.h> #include "heartbeat.h" #include "tcp.h" @@ -60,6 
+61,11 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; static LIST_HEAD(o2hb_node_events); static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); +#define O2HB_DEBUG_DIR "o2hb" +#define O2HB_DEBUG_LIVENODES "livenodes" +static struct dentry *o2hb_debug_dir; +static struct dentry *o2hb_debug_livenodes; + static LIST_HEAD(o2hb_all_regions); static struct o2hb_callback { @@ -905,7 +911,77 @@ static int o2hb_thread(void *data) return 0; } -void o2hb_init(void) +#ifdef CONFIG_DEBUG_FS +static int o2hb_debug_open(struct inode *inode, struct file *file) +{ + unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + char *buf = NULL; + int i = -1; + int out = 0; + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + goto bail; + + o2hb_fill_node_map(map, sizeof(map)); + + while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) + out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); + out += snprintf(buf + out, PAGE_SIZE - out, "\n"); + + i_size_write(inode, out); + + file->private_data = buf; + + return 0; +bail: + return -ENOMEM; +} + +static int o2hb_debug_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static ssize_t o2hb_debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + return simple_read_from_buffer(buf, nbytes, ppos, file->private_data, + i_size_read(file->f_mapping->host)); +} +#else +static int o2hb_debug_open(struct inode *inode, struct file *file) +{ + return 0; +} +static int o2hb_debug_release(struct inode *inode, struct file *file) +{ + return 0; +} +static ssize_t o2hb_debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + return 0; +} +#endif /* CONFIG_DEBUG_FS */ + +static struct file_operations o2hb_debug_fops = { + .open = o2hb_debug_open, + .release = o2hb_debug_release, + .read = o2hb_debug_read, + .llseek = generic_file_llseek, +}; + +void o2hb_exit(void) +{ + if (o2hb_debug_livenodes) + 
debugfs_remove(o2hb_debug_livenodes); + if (o2hb_debug_dir) + debugfs_remove(o2hb_debug_dir); +} + +int o2hb_init(void) { int i; @@ -918,6 +994,24 @@ void o2hb_init(void) INIT_LIST_HEAD(&o2hb_node_events); memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); + + o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); + if (!o2hb_debug_dir) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES, + S_IFREG|S_IRUSR, + o2hb_debug_dir, NULL, + &o2hb_debug_fops); + if (!o2hb_debug_livenodes) { + mlog_errno(-ENOMEM); + debugfs_remove(o2hb_debug_dir); + return -ENOMEM; + } + + return 0; } /* if we're already in a callback then we're already serialized by the sem */ diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index e511339886b3..2f1649253b49 100644 --- a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h @@ -75,7 +75,8 @@ void o2hb_unregister_callback(const char *region_uuid, struct o2hb_callback_func *hc); void o2hb_fill_node_map(unsigned long *map, unsigned bytes); -void o2hb_init(void); +void o2hb_exit(void); +int o2hb_init(void); int o2hb_check_node_heartbeating(u8 node_num); int o2hb_check_node_heartbeating_from_callback(u8 node_num); int o2hb_check_local_node_heartbeating(void); diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 70e8fa9e2539..7ee6188bc79a 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -881,6 +881,7 @@ static void __exit exit_o2nm(void) o2cb_sys_shutdown(); o2net_exit(); + o2hb_exit(); } static int __init init_o2nm(void) @@ -889,11 +890,13 @@ static int __init init_o2nm(void) cluster_print_version(); - o2hb_init(); + ret = o2hb_init(); + if (ret) + goto out; ret = o2net_init(); if (ret) - goto out; + goto out_o2hb; ret = o2net_register_hb_callbacks(); if (ret) @@ -916,6 +919,8 @@ out_callbacks: o2net_unregister_hb_callbacks(); out_o2net: o2net_exit(); +out_o2hb: + 
o2hb_exit(); out: return ret; } diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index f2c4098cf337..e71160cda110 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -41,6 +41,7 @@ #include <linux/slab.h> #include <linux/highmem.h> #include <linux/quotaops.h> +#include <linux/sort.h> #define MLOG_MASK_PREFIX ML_NAMEI #include <cluster/masklog.h> @@ -58,6 +59,7 @@ #include "namei.h" #include "suballoc.h" #include "super.h" +#include "sysfile.h" #include "uptodate.h" #include "buffer_head_io.h" @@ -71,11 +73,6 @@ static unsigned char ocfs2_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; -static int ocfs2_extend_dir(struct ocfs2_super *osb, - struct inode *dir, - struct buffer_head *parent_fe_bh, - unsigned int blocks_wanted, - struct buffer_head **new_de_bh); static int ocfs2_do_extend_dir(struct super_block *sb, handle_t *handle, struct inode *dir, @@ -83,22 +80,36 @@ static int ocfs2_do_extend_dir(struct super_block *sb, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, struct buffer_head **new_bh); +static int ocfs2_dir_indexed(struct inode *inode); /* * These are distinct checks because future versions of the file system will * want to have a trailing dirent structure independent of indexing. */ -static int ocfs2_dir_has_trailer(struct inode *dir) +static int ocfs2_supports_dir_trailer(struct inode *dir) { + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) return 0; - return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb)); + return ocfs2_meta_ecc(osb) || ocfs2_dir_indexed(dir); } -static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb) +/* + * "new' here refers to the point at which we're creating a new + * directory via "mkdir()", but also when we're expanding an inline + * directory. In either case, we don't yet have the indexing bit set + * on the directory, so the standard checks will fail in when metaecc + * is turned off. 
Only directory-initialization type functions should + * use this then. Everything else wants ocfs2_supports_dir_trailer() + */ +static int ocfs2_new_dir_wants_trailer(struct inode *dir) { - return ocfs2_meta_ecc(osb); + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + + return ocfs2_meta_ecc(osb) || + ocfs2_supports_indexed_dirs(osb); } static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb) @@ -130,7 +141,7 @@ static int ocfs2_skip_dir_trailer(struct inode *dir, { unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer); - if (!ocfs2_dir_has_trailer(dir)) + if (!ocfs2_supports_dir_trailer(dir)) return 0; if (offset != toff) @@ -140,7 +151,7 @@ static int ocfs2_skip_dir_trailer(struct inode *dir, } static void ocfs2_init_dir_trailer(struct inode *inode, - struct buffer_head *bh) + struct buffer_head *bh, u16 rec_len) { struct ocfs2_dir_block_trailer *trailer; @@ -150,6 +161,153 @@ static void ocfs2_init_dir_trailer(struct inode *inode, cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer)); trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno); trailer->db_blkno = cpu_to_le64(bh->b_blocknr); + trailer->db_free_rec_len = cpu_to_le16(rec_len); +} +/* + * Link an unindexed block with a dir trailer structure into the index free + * list. This function will modify dirdata_bh, but assumes you've already + * passed it to the journal. 
+ */ +static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle, + struct buffer_head *dx_root_bh, + struct buffer_head *dirdata_bh) +{ + int ret; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dir_block_trailer *trailer; + + ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + trailer = ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb); + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + + trailer->db_free_next = dx_root->dr_free_blk; + dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr); + + ocfs2_journal_dirty(handle, dx_root_bh); + +out: + return ret; +} + +static int ocfs2_free_list_at_root(struct ocfs2_dir_lookup_result *res) +{ + return res->dl_prev_leaf_bh == NULL; +} + +void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res) +{ + brelse(res->dl_dx_root_bh); + brelse(res->dl_leaf_bh); + brelse(res->dl_dx_leaf_bh); + brelse(res->dl_prev_leaf_bh); +} + +static int ocfs2_dir_indexed(struct inode *inode) +{ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INDEXED_DIR_FL) + return 1; + return 0; +} + +static inline int ocfs2_dx_root_inline(struct ocfs2_dx_root_block *dx_root) +{ + return dx_root->dr_flags & OCFS2_DX_FLAG_INLINE; +} + +/* + * Hashing code adapted from ext3 + */ +#define DELTA 0x9E3779B9 + +static void TEA_transform(__u32 buf[4], __u32 const in[]) +{ + __u32 sum = 0; + __u32 b0 = buf[0], b1 = buf[1]; + __u32 a = in[0], b = in[1], c = in[2], d = in[3]; + int n = 16; + + do { + sum += DELTA; + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); + } while (--n); + + buf[0] += b0; + buf[1] += b1; +} + +static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + 
val = msg[i] + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +static void ocfs2_dx_dir_name_hash(struct inode *dir, const char *name, int len, + struct ocfs2_dx_hinfo *hinfo) +{ + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + const char *p; + __u32 in[8], buf[4]; + + /* + * XXX: Is this really necessary, if the index is never looked + * at by readdir? Is a hash value of '0' a bad idea? + */ + if ((len == 1 && !strncmp(".", name, 1)) || + (len == 2 && !strncmp("..", name, 2))) { + buf[0] = buf[1] = 0; + goto out; + } + +#ifdef OCFS2_DEBUG_DX_DIRS + /* + * This makes it very easy to debug indexing problems. We + * should never allow this to be selected without hand editing + * this file though. + */ + buf[0] = buf[1] = len; + goto out; +#endif + + memcpy(buf, osb->osb_dx_seed, sizeof(buf)); + + p = name; + while (len > 0) { + str2hashbuf(p, len, in, 4); + TEA_transform(buf, in); + len -= 16; + p += 16; + } + +out: + hinfo->major_hash = buf[0]; + hinfo->minor_hash = buf[1]; } /* @@ -312,6 +470,52 @@ static int ocfs2_validate_dir_block(struct super_block *sb, } /* + * Validate a directory trailer. + * + * We check the trailer here rather than in ocfs2_validate_dir_block() + * because that function doesn't have the inode to test. 
+ */ +static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh) +{ + int rc = 0; + struct ocfs2_dir_block_trailer *trailer; + + trailer = ocfs2_trailer_from_bh(bh, dir->i_sb); + if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { + rc = -EINVAL; + ocfs2_error(dir->i_sb, + "Invalid dirblock #%llu: " + "signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + trailer->db_signature); + goto out; + } + if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) { + rc = -EINVAL; + ocfs2_error(dir->i_sb, + "Directory block #%llu has an invalid " + "db_blkno of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); + goto out; + } + if (le64_to_cpu(trailer->db_parent_dinode) != + OCFS2_I(dir)->ip_blkno) { + rc = -EINVAL; + ocfs2_error(dir->i_sb, + "Directory block #%llu on dinode " + "#%llu has an invalid parent_dinode " + "of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); + goto out; + } +out: + return rc; +} + +/* * This function forces all errors to -EIO for consistency with its * predecessor, ocfs2_bread(). We haven't audited what returning the * real error codes would do to callers. We log the real codes with @@ -322,7 +526,6 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block, { int rc = 0; struct buffer_head *tmp = *bh; - struct ocfs2_dir_block_trailer *trailer; rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags, ocfs2_validate_dir_block); @@ -331,42 +534,13 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block, goto out; } - /* - * We check the trailer here rather than in - * ocfs2_validate_dir_block() because that function doesn't have - * the inode to test. 
- */ if (!(flags & OCFS2_BH_READAHEAD) && - ocfs2_dir_has_trailer(inode)) { - trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb); - if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { - rc = -EINVAL; - ocfs2_error(inode->i_sb, - "Invalid dirblock #%llu: " - "signature = %.*s\n", - (unsigned long long)tmp->b_blocknr, 7, - trailer->db_signature); - goto out; - } - if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) { - rc = -EINVAL; - ocfs2_error(inode->i_sb, - "Directory block #%llu has an invalid " - "db_blkno of %llu", - (unsigned long long)tmp->b_blocknr, - (unsigned long long)le64_to_cpu(trailer->db_blkno)); - goto out; - } - if (le64_to_cpu(trailer->db_parent_dinode) != - OCFS2_I(inode)->ip_blkno) { - rc = -EINVAL; - ocfs2_error(inode->i_sb, - "Directory block #%llu on dinode " - "#%llu has an invalid parent_dinode " - "of %llu", - (unsigned long long)tmp->b_blocknr, - (unsigned long long)OCFS2_I(inode)->ip_blkno, - (unsigned long long)le64_to_cpu(trailer->db_blkno)); + ocfs2_supports_dir_trailer(inode)) { + rc = ocfs2_check_dir_trailer(inode, tmp); + if (rc) { + if (!*bh) + brelse(tmp); + mlog_errno(rc); goto out; } } @@ -379,6 +553,141 @@ out: return rc ? -EIO : 0; } +/* + * Read the block at 'phys' which belongs to this directory + * inode. This function does no virtual->physical block translation - + * what's passed in is assumed to be a valid directory block. 
+ */ +static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys, + struct buffer_head **bh) +{ + int ret; + struct buffer_head *tmp = *bh; + + ret = ocfs2_read_block(dir, phys, &tmp, ocfs2_validate_dir_block); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (ocfs2_supports_dir_trailer(dir)) { + ret = ocfs2_check_dir_trailer(dir, tmp); + if (ret) { + if (!*bh) + brelse(tmp); + mlog_errno(ret); + goto out; + } + } + + if (!ret && !*bh) + *bh = tmp; +out: + return ret; +} + +static int ocfs2_validate_dx_root(struct super_block *sb, + struct buffer_head *bh) +{ + int ret; + struct ocfs2_dx_root_block *dx_root; + + BUG_ON(!buffer_uptodate(bh)); + + dx_root = (struct ocfs2_dx_root_block *) bh->b_data; + + ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_root->dr_check); + if (ret) { + mlog(ML_ERROR, + "Checksum failed for dir index root block %llu\n", + (unsigned long long)bh->b_blocknr); + return ret; + } + + if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) { + ocfs2_error(sb, + "Dir Index Root # %llu has bad signature %.*s", + (unsigned long long)le64_to_cpu(dx_root->dr_blkno), + 7, dx_root->dr_signature); + return -EINVAL; + } + + return 0; +} + +static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di, + struct buffer_head **dx_root_bh) +{ + int ret; + u64 blkno = le64_to_cpu(di->i_dx_root); + struct buffer_head *tmp = *dx_root_bh; + + ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_root); + + /* If ocfs2_read_block() got us a new bh, pass it up. 
*/ + if (!ret && !*dx_root_bh) + *dx_root_bh = tmp; + + return ret; +} + +static int ocfs2_validate_dx_leaf(struct super_block *sb, + struct buffer_head *bh) +{ + int ret; + struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)bh->b_data; + + BUG_ON(!buffer_uptodate(bh)); + + ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_leaf->dl_check); + if (ret) { + mlog(ML_ERROR, + "Checksum failed for dir index leaf block %llu\n", + (unsigned long long)bh->b_blocknr); + return ret; + } + + if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) { + ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s", + 7, dx_leaf->dl_signature); + return -EROFS; + } + + return 0; +} + +static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno, + struct buffer_head **dx_leaf_bh) +{ + int ret; + struct buffer_head *tmp = *dx_leaf_bh; + + ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_leaf); + + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!ret && !*dx_leaf_bh) + *dx_leaf_bh = tmp; + + return ret; +} + +/* + * Read a series of dx_leaf blocks. This expects all buffer_head + * pointers to be NULL on function entry. 
+ */ +static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num, + struct buffer_head **dx_leaf_bhs) +{ + int ret; + + ret = ocfs2_read_blocks(dir, start, num, dx_leaf_bhs, 0, + ocfs2_validate_dx_leaf); + if (ret) + mlog_errno(ret); + + return ret; +} + static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen, struct inode *dir, struct ocfs2_dir_entry **res_dir) @@ -480,39 +789,340 @@ cleanup_and_exit: return ret; } +static int ocfs2_dx_dir_lookup_rec(struct inode *inode, + struct ocfs2_extent_list *el, + u32 major_hash, + u32 *ret_cpos, + u64 *ret_phys_blkno, + unsigned int *ret_clen) +{ + int ret = 0, i, found; + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_rec *rec = NULL; + + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(inode, el, major_hash, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "btree tree block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + found = 0; + for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { + rec = &el->l_recs[i]; + + if (le32_to_cpu(rec->e_cpos) <= major_hash) { + found = 1; + break; + } + } + + if (!found) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0) in btree", inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } + + if (ret_phys_blkno) + *ret_phys_blkno = le64_to_cpu(rec->e_blkno); + if (ret_cpos) + *ret_cpos = le32_to_cpu(rec->e_cpos); + if (ret_clen) + *ret_clen = le16_to_cpu(rec->e_leaf_clusters); + +out: + brelse(eb_bh); + return ret; +} + +/* + * Returns the block index, from the start of the cluster which this + * hash belongs too. 
+ */ +static inline unsigned int __ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb, + u32 minor_hash) +{ + return minor_hash & osb->osb_dx_mask; +} + +static inline unsigned int ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb, + struct ocfs2_dx_hinfo *hinfo) +{ + return __ocfs2_dx_dir_hash_idx(osb, hinfo->minor_hash); +} + +static int ocfs2_dx_dir_lookup(struct inode *inode, + struct ocfs2_extent_list *el, + struct ocfs2_dx_hinfo *hinfo, + u32 *ret_cpos, + u64 *ret_phys_blkno) +{ + int ret = 0; + unsigned int cend, uninitialized_var(clen); + u32 uninitialized_var(cpos); + u64 uninitialized_var(blkno); + u32 name_hash = hinfo->major_hash; + + ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno, + &clen); + if (ret) { + mlog_errno(ret); + goto out; + } + + cend = cpos + clen; + if (name_hash >= cend) { + /* We want the last cluster */ + blkno += ocfs2_clusters_to_blocks(inode->i_sb, clen - 1); + cpos += clen - 1; + } else { + blkno += ocfs2_clusters_to_blocks(inode->i_sb, + name_hash - cpos); + cpos = name_hash; + } + + /* + * We now have the cluster which should hold our entry. To + * find the exact block from the start of the cluster to + * search, we take the lower bits of the hash. 
+ */ + blkno += ocfs2_dx_dir_hash_idx(OCFS2_SB(inode->i_sb), hinfo); + + if (ret_phys_blkno) + *ret_phys_blkno = blkno; + if (ret_cpos) + *ret_cpos = cpos; + +out: + + return ret; +} + +static int ocfs2_dx_dir_search(const char *name, int namelen, + struct inode *dir, + struct ocfs2_dx_root_block *dx_root, + struct ocfs2_dir_lookup_result *res) +{ + int ret, i, found; + u64 uninitialized_var(phys); + struct buffer_head *dx_leaf_bh = NULL; + struct ocfs2_dx_leaf *dx_leaf; + struct ocfs2_dx_entry *dx_entry = NULL; + struct buffer_head *dir_ent_bh = NULL; + struct ocfs2_dir_entry *dir_ent = NULL; + struct ocfs2_dx_hinfo *hinfo = &res->dl_hinfo; + struct ocfs2_extent_list *dr_el; + struct ocfs2_dx_entry_list *entry_list; + + ocfs2_dx_dir_name_hash(dir, name, namelen, &res->dl_hinfo); + + if (ocfs2_dx_root_inline(dx_root)) { + entry_list = &dx_root->dr_entries; + goto search; + } + + dr_el = &dx_root->dr_list; + + ret = ocfs2_dx_dir_lookup(dir, dr_el, hinfo, NULL, &phys); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog(0, "Dir %llu: name: \"%.*s\", lookup of hash: %u.0x%x " + "returns: %llu\n", + (unsigned long long)OCFS2_I(dir)->ip_blkno, + namelen, name, hinfo->major_hash, hinfo->minor_hash, + (unsigned long long)phys); + + ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data; + + mlog(0, "leaf info: num_used: %d, count: %d\n", + le16_to_cpu(dx_leaf->dl_list.de_num_used), + le16_to_cpu(dx_leaf->dl_list.de_count)); + + entry_list = &dx_leaf->dl_list; + +search: + /* + * Empty leaf is legal, so no need to check for that. + */ + found = 0; + for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) { + dx_entry = &entry_list->de_entries[i]; + + if (hinfo->major_hash != le32_to_cpu(dx_entry->dx_major_hash) + || hinfo->minor_hash != le32_to_cpu(dx_entry->dx_minor_hash)) + continue; + + /* + * Search unindexed leaf block now. 
We're not + * guaranteed to find anything. + */ + ret = ocfs2_read_dir_block_direct(dir, + le64_to_cpu(dx_entry->dx_dirent_blk), + &dir_ent_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * XXX: We should check the unindexed block here, + * before using it. + */ + + found = ocfs2_search_dirblock(dir_ent_bh, dir, name, namelen, + 0, dir_ent_bh->b_data, + dir->i_sb->s_blocksize, &dir_ent); + if (found == 1) + break; + + if (found == -1) { + /* This means we found a bad directory entry. */ + ret = -EIO; + mlog_errno(ret); + goto out; + } + + brelse(dir_ent_bh); + dir_ent_bh = NULL; + } + + if (found <= 0) { + ret = -ENOENT; + goto out; + } + + res->dl_leaf_bh = dir_ent_bh; + res->dl_entry = dir_ent; + res->dl_dx_leaf_bh = dx_leaf_bh; + res->dl_dx_entry = dx_entry; + + ret = 0; +out: + if (ret) { + brelse(dx_leaf_bh); + brelse(dir_ent_bh); + } + return ret; +} + +static int ocfs2_find_entry_dx(const char *name, int namelen, + struct inode *dir, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + struct buffer_head *dx_root_bh = NULL; + struct ocfs2_dx_root_block *dx_root; + + ret = ocfs2_read_inode_block(dir, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + + ret = ocfs2_read_dx_root(dir, di, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; + + ret = ocfs2_dx_dir_search(name, namelen, dir, dx_root, lookup); + if (ret) { + if (ret != -ENOENT) + mlog_errno(ret); + goto out; + } + + lookup->dl_dx_root_bh = dx_root_bh; + dx_root_bh = NULL; +out: + brelse(di_bh); + brelse(dx_root_bh); + return ret; +} + /* * Try to find an entry of the provided name within 'dir'. * - * If nothing was found, NULL is returned. Otherwise, a buffer_head - * and pointer to the dir entry are passed back. + * If nothing was found, -ENOENT is returned. 
Otherwise, zero is + * returned and the struct 'res' will contain information useful to + * other directory manipulation functions. * * Caller can NOT assume anything about the contents of the - * buffer_head - it is passed back only so that it can be passed into - * any one of the manipulation functions (add entry, delete entry, - * etc). As an example, bh in the extent directory case is a data - * block, in the inline-data case it actually points to an inode. + * buffer_heads - they are passed back only so that it can be passed + * into any one of the manipulation functions (add entry, delete + * entry, etc). As an example, bh in the extent directory case is a + * data block, in the inline-data case it actually points to an inode, + * in the indexed directory case, multiple buffers are involved. */ -struct buffer_head *ocfs2_find_entry(const char *name, int namelen, - struct inode *dir, - struct ocfs2_dir_entry **res_dir) +int ocfs2_find_entry(const char *name, int namelen, + struct inode *dir, struct ocfs2_dir_lookup_result *lookup) { - *res_dir = NULL; + struct buffer_head *bh; + struct ocfs2_dir_entry *res_dir = NULL; + if (ocfs2_dir_indexed(dir)) + return ocfs2_find_entry_dx(name, namelen, dir, lookup); + + /* + * The unindexed dir code only uses part of the lookup + * structure, so there's no reason to push it down further + * than this. + */ if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) - return ocfs2_find_entry_id(name, namelen, dir, res_dir); + bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir); + else + bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir); + + if (bh == NULL) + return -ENOENT; - return ocfs2_find_entry_el(name, namelen, dir, res_dir); + lookup->dl_leaf_bh = bh; + lookup->dl_entry = res_dir; + return 0; } /* * Update inode number and type of a previously found directory entry. 
*/ int ocfs2_update_entry(struct inode *dir, handle_t *handle, - struct buffer_head *de_bh, struct ocfs2_dir_entry *de, + struct ocfs2_dir_lookup_result *res, struct inode *new_entry_inode) { int ret; ocfs2_journal_access_func access = ocfs2_journal_access_db; + struct ocfs2_dir_entry *de = res->dl_entry; + struct buffer_head *de_bh = res->dl_leaf_bh; /* * The same code works fine for both inline-data and extent @@ -538,6 +1148,10 @@ out: return ret; } +/* + * __ocfs2_delete_entry deletes a directory entry by merging it with the + * previous entry + */ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, struct ocfs2_dir_entry *de_del, struct buffer_head *bh, char *first_de, @@ -587,6 +1201,181 @@ bail: return status; } +static unsigned int ocfs2_figure_dirent_hole(struct ocfs2_dir_entry *de) +{ + unsigned int hole; + + if (le64_to_cpu(de->inode) == 0) + hole = le16_to_cpu(de->rec_len); + else + hole = le16_to_cpu(de->rec_len) - + OCFS2_DIR_REC_LEN(de->name_len); + + return hole; +} + +static int ocfs2_find_max_rec_len(struct super_block *sb, + struct buffer_head *dirblock_bh) +{ + int size, this_hole, largest_hole = 0; + char *trailer, *de_buf, *limit, *start = dirblock_bh->b_data; + struct ocfs2_dir_entry *de; + + trailer = (char *)ocfs2_trailer_from_bh(dirblock_bh, sb); + size = ocfs2_dir_trailer_blk_off(sb); + limit = start + size; + de_buf = start; + de = (struct ocfs2_dir_entry *)de_buf; + do { + if (de_buf != trailer) { + this_hole = ocfs2_figure_dirent_hole(de); + if (this_hole > largest_hole) + largest_hole = this_hole; + } + + de_buf += le16_to_cpu(de->rec_len); + de = (struct ocfs2_dir_entry *)de_buf; + } while (de_buf < limit); + + if (largest_hole >= OCFS2_DIR_MIN_REC_LEN) + return largest_hole; + return 0; +} + +static void ocfs2_dx_list_remove_entry(struct ocfs2_dx_entry_list *entry_list, + int index) +{ + int num_used = le16_to_cpu(entry_list->de_num_used); + + if (num_used == 1 || index == (num_used - 1)) + goto clear; + + 
memmove(&entry_list->de_entries[index], + &entry_list->de_entries[index + 1], + (num_used - index - 1)*sizeof(struct ocfs2_dx_entry)); +clear: + num_used--; + memset(&entry_list->de_entries[num_used], 0, + sizeof(struct ocfs2_dx_entry)); + entry_list->de_num_used = cpu_to_le16(num_used); +} + +static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret, index, max_rec_len, add_to_free_list = 0; + struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; + struct buffer_head *leaf_bh = lookup->dl_leaf_bh; + struct ocfs2_dx_leaf *dx_leaf; + struct ocfs2_dx_entry *dx_entry = lookup->dl_dx_entry; + struct ocfs2_dir_block_trailer *trailer; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dx_entry_list *entry_list; + + /* + * This function gets a bit messy because we might have to + * modify the root block, regardless of whether the indexed + * entries are stored inline. + */ + + /* + * *Only* set 'entry_list' here, based on where we're looking + * for the indexed entries. Later, we might still want to + * journal both blocks, based on free list state. + */ + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + if (ocfs2_dx_root_inline(dx_root)) { + entry_list = &dx_root->dr_entries; + } else { + dx_leaf = (struct ocfs2_dx_leaf *) lookup->dl_dx_leaf_bh->b_data; + entry_list = &dx_leaf->dl_list; + } + + /* Neither of these are a disk corruption - that should have + * been caught by lookup, before we got here. 
*/ + BUG_ON(le16_to_cpu(entry_list->de_count) <= 0); + BUG_ON(le16_to_cpu(entry_list->de_num_used) <= 0); + + index = (char *)dx_entry - (char *)entry_list->de_entries; + index /= sizeof(*dx_entry); + + if (index >= le16_to_cpu(entry_list->de_num_used)) { + mlog(ML_ERROR, "Dir %llu: Bad dx_entry ptr idx %d, (%p, %p)\n", + (unsigned long long)OCFS2_I(dir)->ip_blkno, index, + entry_list, dx_entry); + return -EIO; + } + + /* + * We know that removal of this dirent will leave enough room + * for a new one, so add this block to the free list if it + * isn't already there. + */ + trailer = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb); + if (trailer->db_free_rec_len == 0) + add_to_free_list = 1; + + /* + * Add the block holding our index into the journal before + * removing the unindexed entry. If we get an error return + * from __ocfs2_delete_entry(), then it hasn't removed the + * entry yet. Likewise, successful return means we *must* + * remove the indexed entry. + * + * We're also careful to journal the root tree block here as + * the entry count needs to be updated. Also, we might be + * adding to the start of the free list. 
+ */ + ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (!ocfs2_dx_root_inline(dx_root)) { + ret = ocfs2_journal_access_dl(handle, dir, + lookup->dl_dx_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + mlog(0, "Dir %llu: delete entry at index: %d\n", + (unsigned long long)OCFS2_I(dir)->ip_blkno, index); + + ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry, + leaf_bh, leaf_bh->b_data, leaf_bh->b_size); + if (ret) { + mlog_errno(ret); + goto out; + } + + max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, leaf_bh); + trailer->db_free_rec_len = cpu_to_le16(max_rec_len); + if (add_to_free_list) { + trailer->db_free_next = dx_root->dr_free_blk; + dx_root->dr_free_blk = cpu_to_le64(leaf_bh->b_blocknr); + ocfs2_journal_dirty(handle, dx_root_bh); + } + + /* leaf_bh was journal_accessed for us in __ocfs2_delete_entry */ + ocfs2_journal_dirty(handle, leaf_bh); + + le32_add_cpu(&dx_root->dr_num_entries, -1); + ocfs2_journal_dirty(handle, dx_root_bh); + + ocfs2_dx_list_remove_entry(entry_list, index); + + if (!ocfs2_dx_root_inline(dx_root)) + ocfs2_journal_dirty(handle, lookup->dl_dx_leaf_bh); + +out: + return ret; +} + static inline int ocfs2_delete_entry_id(handle_t *handle, struct inode *dir, struct ocfs2_dir_entry *de_del, @@ -624,18 +1413,22 @@ static inline int ocfs2_delete_entry_el(handle_t *handle, } /* - * ocfs2_delete_entry deletes a directory entry by merging it with the - * previous entry + * Delete a directory entry. Hide the details of directory + * implementation from the caller. 
*/ int ocfs2_delete_entry(handle_t *handle, struct inode *dir, - struct ocfs2_dir_entry *de_del, - struct buffer_head *bh) + struct ocfs2_dir_lookup_result *res) { + if (ocfs2_dir_indexed(dir)) + return ocfs2_delete_entry_dx(handle, dir, res); + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) - return ocfs2_delete_entry_id(handle, dir, de_del, bh); + return ocfs2_delete_entry_id(handle, dir, res->dl_entry, + res->dl_leaf_bh); - return ocfs2_delete_entry_el(handle, dir, de_del, bh); + return ocfs2_delete_entry_el(handle, dir, res->dl_entry, + res->dl_leaf_bh); } /* @@ -663,18 +1456,166 @@ static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de, return 0; } +static void ocfs2_dx_dir_leaf_insert_tail(struct ocfs2_dx_leaf *dx_leaf, + struct ocfs2_dx_entry *dx_new_entry) +{ + int i; + + i = le16_to_cpu(dx_leaf->dl_list.de_num_used); + dx_leaf->dl_list.de_entries[i] = *dx_new_entry; + + le16_add_cpu(&dx_leaf->dl_list.de_num_used, 1); +} + +static void ocfs2_dx_entry_list_insert(struct ocfs2_dx_entry_list *entry_list, + struct ocfs2_dx_hinfo *hinfo, + u64 dirent_blk) +{ + int i; + struct ocfs2_dx_entry *dx_entry; + + i = le16_to_cpu(entry_list->de_num_used); + dx_entry = &entry_list->de_entries[i]; + + memset(dx_entry, 0, sizeof(*dx_entry)); + dx_entry->dx_major_hash = cpu_to_le32(hinfo->major_hash); + dx_entry->dx_minor_hash = cpu_to_le32(hinfo->minor_hash); + dx_entry->dx_dirent_blk = cpu_to_le64(dirent_blk); + + le16_add_cpu(&entry_list->de_num_used, 1); +} + +static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle, + struct ocfs2_dx_hinfo *hinfo, + u64 dirent_blk, + struct buffer_head *dx_leaf_bh) +{ + int ret; + struct ocfs2_dx_leaf *dx_leaf; + + ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data; + ocfs2_dx_entry_list_insert(&dx_leaf->dl_list, hinfo, dirent_blk); + 
ocfs2_journal_dirty(handle, dx_leaf_bh); + +out: + return ret; +} + +static void ocfs2_dx_inline_root_insert(struct inode *dir, handle_t *handle, + struct ocfs2_dx_hinfo *hinfo, + u64 dirent_blk, + struct ocfs2_dx_root_block *dx_root) +{ + ocfs2_dx_entry_list_insert(&dx_root->dr_entries, hinfo, dirent_blk); +} + +static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret = 0; + struct ocfs2_dx_root_block *dx_root; + struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; + + ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_root = (struct ocfs2_dx_root_block *)lookup->dl_dx_root_bh->b_data; + if (ocfs2_dx_root_inline(dx_root)) { + ocfs2_dx_inline_root_insert(dir, handle, + &lookup->dl_hinfo, + lookup->dl_leaf_bh->b_blocknr, + dx_root); + } else { + ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &lookup->dl_hinfo, + lookup->dl_leaf_bh->b_blocknr, + lookup->dl_dx_leaf_bh); + if (ret) + goto out; + } + + le32_add_cpu(&dx_root->dr_num_entries, 1); + ocfs2_journal_dirty(handle, dx_root_bh); + +out: + return ret; +} + +static void ocfs2_remove_block_from_free_list(struct inode *dir, + handle_t *handle, + struct ocfs2_dir_lookup_result *lookup) +{ + struct ocfs2_dir_block_trailer *trailer, *prev; + struct ocfs2_dx_root_block *dx_root; + struct buffer_head *bh; + + trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb); + + if (ocfs2_free_list_at_root(lookup)) { + bh = lookup->dl_dx_root_bh; + dx_root = (struct ocfs2_dx_root_block *)bh->b_data; + dx_root->dr_free_blk = trailer->db_free_next; + } else { + bh = lookup->dl_prev_leaf_bh; + prev = ocfs2_trailer_from_bh(bh, dir->i_sb); + prev->db_free_next = trailer->db_free_next; + } + + trailer->db_free_rec_len = cpu_to_le16(0); + trailer->db_free_next = cpu_to_le64(0); + + ocfs2_journal_dirty(handle, bh); + ocfs2_journal_dirty(handle, lookup->dl_leaf_bh); +} + +/* + 
* This expects that a journal write has been reserved on + * lookup->dl_prev_leaf_bh or lookup->dl_dx_root_bh + */ +static void ocfs2_recalc_free_list(struct inode *dir, handle_t *handle, + struct ocfs2_dir_lookup_result *lookup) +{ + int max_rec_len; + struct ocfs2_dir_block_trailer *trailer; + + /* Walk dl_leaf_bh to figure out what the new free rec_len is. */ + max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, lookup->dl_leaf_bh); + if (max_rec_len) { + /* + * There's still room in this block, so no need to remove it + * from the free list. In this case, we just want to update + * the rec len accounting. + */ + trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb); + trailer->db_free_rec_len = cpu_to_le16(max_rec_len); + ocfs2_journal_dirty(handle, lookup->dl_leaf_bh); + } else { + ocfs2_remove_block_from_free_list(dir, handle, lookup); + } +} + /* we don't always have a dentry for what we want to add, so people * like orphan dir can call this instead. * - * If you pass me insert_bh, I'll skip the search of the other dir - * blocks and put the record in there. + * The lookup context must have been filled from + * ocfs2_prepare_dir_for_insert. 
*/ int __ocfs2_add_entry(handle_t *handle, struct inode *dir, const char *name, int namelen, struct inode *inode, u64 blkno, struct buffer_head *parent_fe_bh, - struct buffer_head *insert_bh) + struct ocfs2_dir_lookup_result *lookup) { unsigned long offset; unsigned short rec_len; @@ -683,6 +1624,7 @@ int __ocfs2_add_entry(handle_t *handle, struct super_block *sb = dir->i_sb; int retval, status; unsigned int size = sb->s_blocksize; + struct buffer_head *insert_bh = lookup->dl_leaf_bh; char *data_start = insert_bh->b_data; mlog_entry_void(); @@ -690,7 +1632,31 @@ int __ocfs2_add_entry(handle_t *handle, if (!namelen) return -EINVAL; - if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + if (ocfs2_dir_indexed(dir)) { + struct buffer_head *bh; + + /* + * An indexed dir may require that we update the free space + * list. Reserve a write to the previous node in the list so + * that we don't fail later. + * + * XXX: This can be either a dx_root_block, or an unindexed + * directory tree leaf block. 
+ */ + if (ocfs2_free_list_at_root(lookup)) { + bh = lookup->dl_dx_root_bh; + retval = ocfs2_journal_access_dr(handle, dir, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + } else { + bh = lookup->dl_prev_leaf_bh; + retval = ocfs2_journal_access_db(handle, dir, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + } + if (retval) { + mlog_errno(retval); + return retval; + } + } else if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { data_start = di->id2.i_data.id_data; size = i_size_read(dir); @@ -737,10 +1703,22 @@ int __ocfs2_add_entry(handle_t *handle, status = ocfs2_journal_access_di(handle, dir, insert_bh, OCFS2_JOURNAL_ACCESS_WRITE); - else + else { status = ocfs2_journal_access_db(handle, dir, insert_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + OCFS2_JOURNAL_ACCESS_WRITE); + + if (ocfs2_dir_indexed(dir)) { + status = ocfs2_dx_dir_insert(dir, + handle, + lookup); + if (status) { + mlog_errno(status); + goto bail; + } + } + } + /* By now the buffer is marked for journaling */ offset += le16_to_cpu(de->rec_len); if (le64_to_cpu(de->inode)) { @@ -761,6 +1739,9 @@ int __ocfs2_add_entry(handle_t *handle, de->name_len = namelen; memcpy(de->name, name, namelen); + if (ocfs2_dir_indexed(dir)) + ocfs2_recalc_free_list(dir, handle, lookup); + dir->i_version++; status = ocfs2_journal_dirty(handle, insert_bh); retval = 0; @@ -870,6 +1851,10 @@ out: return 0; } +/* + * NOTE: This function can be called against unindexed directories, + * and indexed ones. 
+ */ static int ocfs2_dir_foreach_blk_el(struct inode *inode, u64 *f_version, loff_t *f_pos, void *priv, @@ -1071,31 +2056,22 @@ int ocfs2_find_files_on_disk(const char *name, int namelen, u64 *blkno, struct inode *inode, - struct buffer_head **dirent_bh, - struct ocfs2_dir_entry **dirent) + struct ocfs2_dir_lookup_result *lookup) { int status = -ENOENT; - mlog_entry("(name=%.*s, blkno=%p, inode=%p, dirent_bh=%p, dirent=%p)\n", - namelen, name, blkno, inode, dirent_bh, dirent); + mlog(0, "name=%.*s, blkno=%p, inode=%llu\n", namelen, name, blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); - *dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent); - if (!*dirent_bh || !*dirent) { - status = -ENOENT; + status = ocfs2_find_entry(name, namelen, inode, lookup); + if (status) goto leave; - } - *blkno = le64_to_cpu((*dirent)->inode); + *blkno = le64_to_cpu(lookup->dl_entry->inode); status = 0; leave: - if (status < 0) { - *dirent = NULL; - brelse(*dirent_bh); - *dirent_bh = NULL; - } - mlog_exit(status); return status; } @@ -1107,11 +2083,10 @@ int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, int namelen, u64 *blkno) { int ret; - struct buffer_head *bh = NULL; - struct ocfs2_dir_entry *dirent = NULL; + struct ocfs2_dir_lookup_result lookup = { NULL, }; - ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &bh, &dirent); - brelse(bh); + ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &lookup); + ocfs2_free_dir_lookup_result(&lookup); return ret; } @@ -1128,20 +2103,18 @@ int ocfs2_check_dir_for_entry(struct inode *dir, int namelen) { int ret; - struct buffer_head *dirent_bh = NULL; - struct ocfs2_dir_entry *dirent = NULL; + struct ocfs2_dir_lookup_result lookup = { NULL, }; mlog_entry("dir %llu, name '%.*s'\n", (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name); ret = -EEXIST; - dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent); - if (dirent_bh) + if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) goto bail; 
ret = 0; bail: - brelse(dirent_bh); + ocfs2_free_dir_lookup_result(&lookup); mlog_exit(ret); return ret; @@ -1151,6 +2124,7 @@ struct ocfs2_empty_dir_priv { unsigned seen_dot; unsigned seen_dot_dot; unsigned seen_other; + unsigned dx_dir; }; static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len, loff_t pos, u64 ino, unsigned type) @@ -1160,6 +2134,13 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len, /* * Check the positions of "." and ".." records to be sure * they're in the correct place. + * + * Indexed directories don't need to proceed past the first + * two entries, so we end the scan after seeing '..'. Despite + * that, we allow the scan to proceed In the event that we + * have a corrupted indexed directory (no dot or dot dot + * entries). This allows us to double check for existing + * entries which might not have been found in the index. */ if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) { p->seen_dot = 1; @@ -1169,16 +2150,57 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len, if (name_len == 2 && !strncmp("..", name, 2) && pos == OCFS2_DIR_REC_LEN(1)) { p->seen_dot_dot = 1; + + if (p->dx_dir && p->seen_dot) + return 1; + return 0; } p->seen_other = 1; return 1; } + +static int ocfs2_empty_dir_dx(struct inode *inode, + struct ocfs2_empty_dir_priv *priv) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct buffer_head *dx_root_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_dx_root_block *dx_root; + + priv->dx_dir = 1; + + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + di = (struct ocfs2_dinode *)di_bh->b_data; + + ret = ocfs2_read_dx_root(inode, di, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + + if (le32_to_cpu(dx_root->dr_num_entries) != 2) + priv->seen_other = 1; + +out: + brelse(di_bh); + brelse(dx_root_bh); + return ret; 
+} + /* * routine to check that the specified directory is empty (for rmdir) * * Returns 1 if dir is empty, zero otherwise. + * + * XXX: This is a performance problem for unindexed directories. */ int ocfs2_empty_dir(struct inode *inode) { @@ -1188,6 +2210,16 @@ int ocfs2_empty_dir(struct inode *inode) memset(&priv, 0, sizeof(priv)); + if (ocfs2_dir_indexed(inode)) { + ret = ocfs2_empty_dir_dx(inode, &priv); + if (ret) + mlog_errno(ret); + /* + * We still run ocfs2_dir_foreach to get the checks + * for "." and "..". + */ + } + ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir); if (ret) mlog_errno(ret); @@ -1280,7 +2312,8 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, struct inode *parent, struct inode *inode, struct buffer_head *fe_bh, - struct ocfs2_alloc_context *data_ac) + struct ocfs2_alloc_context *data_ac, + struct buffer_head **ret_new_bh) { int status; unsigned int size = osb->sb->s_blocksize; @@ -1289,7 +2322,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, mlog_entry_void(); - if (ocfs2_supports_dir_trailer(osb)) + if (ocfs2_new_dir_wants_trailer(inode)) size = ocfs2_dir_trailer_blk_off(parent->i_sb); status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh, @@ -1310,8 +2343,19 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, memset(new_bh->b_data, 0, osb->sb->s_blocksize); de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size); - if (ocfs2_supports_dir_trailer(osb)) - ocfs2_init_dir_trailer(inode, new_bh); + if (ocfs2_new_dir_wants_trailer(inode)) { + int size = le16_to_cpu(de->rec_len); + + /* + * Figure out the size of the hole left over after + * insertion of '.' and '..'. The trailer wants this + * information. 
+ */ + size -= OCFS2_DIR_REC_LEN(2); + size -= sizeof(struct ocfs2_dir_block_trailer); + + ocfs2_init_dir_trailer(inode, new_bh, size); + } status = ocfs2_journal_dirty(handle, new_bh); if (status < 0) { @@ -1329,6 +2373,10 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, } status = 0; + if (ret_new_bh) { + *ret_new_bh = new_bh; + new_bh = NULL; + } bail: brelse(new_bh); @@ -1336,20 +2384,427 @@ bail: return status; } +static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb, + handle_t *handle, struct inode *dir, + struct buffer_head *di_bh, + struct buffer_head *dirdata_bh, + struct ocfs2_alloc_context *meta_ac, + int dx_inline, u32 num_entries, + struct buffer_head **ret_dx_root_bh) +{ + int ret; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + u16 dr_suballoc_bit; + u64 dr_blkno; + unsigned int num_bits; + struct buffer_head *dx_root_bh = NULL; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dir_block_trailer *trailer = + ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb); + + ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit, + &num_bits, &dr_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog(0, "Dir %llu, attach new index block: %llu\n", + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)dr_blkno); + + dx_root_bh = sb_getblk(osb->sb, dr_blkno); + if (dx_root_bh == NULL) { + ret = -EIO; + goto out; + } + ocfs2_set_new_buffer_uptodate(dir, dx_root_bh); + + ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + memset(dx_root, 0, osb->sb->s_blocksize); + strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE); + dx_root->dr_suballoc_slot = cpu_to_le16(osb->slot_num); + dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit); + dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation); + dx_root->dr_blkno = cpu_to_le64(dr_blkno); 
+ dx_root->dr_dir_blkno = cpu_to_le64(OCFS2_I(dir)->ip_blkno); + dx_root->dr_num_entries = cpu_to_le32(num_entries); + if (le16_to_cpu(trailer->db_free_rec_len)) + dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr); + else + dx_root->dr_free_blk = cpu_to_le64(0); + + if (dx_inline) { + dx_root->dr_flags |= OCFS2_DX_FLAG_INLINE; + dx_root->dr_entries.de_count = + cpu_to_le16(ocfs2_dx_entries_per_root(osb->sb)); + } else { + dx_root->dr_list.l_count = + cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb)); + } + + ret = ocfs2_journal_dirty(handle, dx_root_bh); + if (ret) + mlog_errno(ret); + + ret = ocfs2_journal_access_di(handle, dir, di_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out; + } + + di->i_dx_root = cpu_to_le64(dr_blkno); + + OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL; + di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); + + ret = ocfs2_journal_dirty(handle, di_bh); + if (ret) + mlog_errno(ret); + + *ret_dx_root_bh = dx_root_bh; + dx_root_bh = NULL; + +out: + brelse(dx_root_bh); + return ret; +} + +static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb, + handle_t *handle, struct inode *dir, + struct buffer_head **dx_leaves, + int num_dx_leaves, u64 start_blk) +{ + int ret, i; + struct ocfs2_dx_leaf *dx_leaf; + struct buffer_head *bh; + + for (i = 0; i < num_dx_leaves; i++) { + bh = sb_getblk(osb->sb, start_blk + i); + if (bh == NULL) { + ret = -EIO; + goto out; + } + dx_leaves[i] = bh; + + ocfs2_set_new_buffer_uptodate(dir, bh); + + ret = ocfs2_journal_access_dl(handle, dir, bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + dx_leaf = (struct ocfs2_dx_leaf *) bh->b_data; + + memset(dx_leaf, 0, osb->sb->s_blocksize); + strcpy(dx_leaf->dl_signature, OCFS2_DX_LEAF_SIGNATURE); + dx_leaf->dl_fs_generation = cpu_to_le32(osb->fs_generation); + dx_leaf->dl_blkno = cpu_to_le64(bh->b_blocknr); + dx_leaf->dl_list.de_count = + 
cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb)); + + mlog(0, + "Dir %llu, format dx_leaf: %llu, entry count: %u\n", + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)bh->b_blocknr, + le16_to_cpu(dx_leaf->dl_list.de_count)); + + ocfs2_journal_dirty(handle, bh); + } + + ret = 0; +out: + return ret; +} + +/* + * Allocates and formats a new cluster for use in an indexed dir + * leaf. This version will not do the extent insert, so that it can be + * used by operations which need careful ordering. + */ +static int __ocfs2_dx_dir_new_cluster(struct inode *dir, + u32 cpos, handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct buffer_head **dx_leaves, + int num_dx_leaves, u64 *ret_phys_blkno) +{ + int ret; + u32 phys, num; + u64 phys_blkno; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + + /* + * XXX: For create, this should claim cluster for the index + * *before* the unindexed insert so that we have a better + * chance of contiguousness as the directory grows in number + * of entries. + */ + ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Format the new cluster first. That way, we're inserting + * valid data. 
+ */ + phys_blkno = ocfs2_clusters_to_blocks(osb->sb, phys); + ret = ocfs2_dx_dir_format_cluster(osb, handle, dir, dx_leaves, + num_dx_leaves, phys_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + *ret_phys_blkno = phys_blkno; +out: + return ret; +} + +static int ocfs2_dx_dir_new_cluster(struct inode *dir, + struct ocfs2_extent_tree *et, + u32 cpos, handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **dx_leaves, + int num_dx_leaves) +{ + int ret; + u64 phys_blkno; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + + ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves, + num_dx_leaves, &phys_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_insert_extent(osb, handle, dir, et, cpos, phys_blkno, 1, 0, + meta_ac); + if (ret) + mlog_errno(ret); +out: + return ret; +} + +static struct buffer_head **ocfs2_dx_dir_kmalloc_leaves(struct super_block *sb, + int *ret_num_leaves) +{ + int num_dx_leaves = ocfs2_clusters_to_blocks(sb, 1); + struct buffer_head **dx_leaves; + + dx_leaves = kcalloc(num_dx_leaves, sizeof(struct buffer_head *), + GFP_NOFS); + if (dx_leaves && ret_num_leaves) + *ret_num_leaves = num_dx_leaves; + + return dx_leaves; +} + +static int ocfs2_fill_new_dir_dx(struct ocfs2_super *osb, + handle_t *handle, + struct inode *parent, + struct inode *inode, + struct buffer_head *di_bh, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac) +{ + int ret; + struct buffer_head *leaf_bh = NULL; + struct buffer_head *dx_root_bh = NULL; + struct ocfs2_dx_hinfo hinfo; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dx_entry_list *entry_list; + + /* + * Our strategy is to create the directory as though it were + * unindexed, then add the index block. This works with very + * little complication since the state of a new directory is a + * very well known quantity. + * + * Essentially, we have two dirents ("." 
and ".."), in the 1st + * block which need indexing. These are easily inserted into + * the index block. + */ + + ret = ocfs2_fill_new_dir_el(osb, handle, parent, inode, di_bh, + data_ac, &leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, leaf_bh, + meta_ac, 1, 2, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + entry_list = &dx_root->dr_entries; + + /* Buffer has been journaled for us by ocfs2_dx_dir_attach_index */ + ocfs2_dx_dir_name_hash(inode, ".", 1, &hinfo); + ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr); + + ocfs2_dx_dir_name_hash(inode, "..", 2, &hinfo); + ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr); + +out: + brelse(dx_root_bh); + brelse(leaf_bh); + return ret; +} + int ocfs2_fill_new_dir(struct ocfs2_super *osb, handle_t *handle, struct inode *parent, struct inode *inode, struct buffer_head *fe_bh, - struct ocfs2_alloc_context *data_ac) + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac) + { BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL); if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh); + if (ocfs2_supports_indexed_dirs(osb)) + return ocfs2_fill_new_dir_dx(osb, handle, parent, inode, fe_bh, + data_ac, meta_ac); + return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh, - data_ac); + data_ac, NULL); +} + +static int ocfs2_dx_dir_index_block(struct inode *dir, + handle_t *handle, + struct buffer_head **dx_leaves, + int num_dx_leaves, + u32 *num_dx_entries, + struct buffer_head *dirent_bh) +{ + int ret, namelen, i; + char *de_buf, *limit; + struct ocfs2_dir_entry *de; + struct buffer_head *dx_leaf_bh; + struct ocfs2_dx_hinfo hinfo; + u64 dirent_blk = dirent_bh->b_blocknr; + + de_buf = dirent_bh->b_data; + limit = de_buf + dir->i_sb->s_blocksize; + 
+ while (de_buf < limit) { + de = (struct ocfs2_dir_entry *)de_buf; + + namelen = de->name_len; + if (!namelen || !de->inode) + goto inc; + + ocfs2_dx_dir_name_hash(dir, de->name, namelen, &hinfo); + + i = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), &hinfo); + dx_leaf_bh = dx_leaves[i]; + + ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &hinfo, + dirent_blk, dx_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + *num_dx_entries = *num_dx_entries + 1; + +inc: + de_buf += le16_to_cpu(de->rec_len); + } + +out: + return ret; +} + +/* + * XXX: This expects dx_root_bh to already be part of the transaction. + */ +static void ocfs2_dx_dir_index_root_block(struct inode *dir, + struct buffer_head *dx_root_bh, + struct buffer_head *dirent_bh) +{ + char *de_buf, *limit; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dir_entry *de; + struct ocfs2_dx_hinfo hinfo; + u64 dirent_blk = dirent_bh->b_blocknr; + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + + de_buf = dirent_bh->b_data; + limit = de_buf + dir->i_sb->s_blocksize; + + while (de_buf < limit) { + de = (struct ocfs2_dir_entry *)de_buf; + + if (!de->name_len || !de->inode) + goto inc; + + ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo); + + mlog(0, + "dir: %llu, major: 0x%x minor: 0x%x, index: %u, name: %.*s\n", + (unsigned long long)dir->i_ino, hinfo.major_hash, + hinfo.minor_hash, + le16_to_cpu(dx_root->dr_entries.de_num_used), + de->name_len, de->name); + + ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo, + dirent_blk); + + le32_add_cpu(&dx_root->dr_num_entries, 1); +inc: + de_buf += le16_to_cpu(de->rec_len); + } +} + +/* + * Count the number of inline directory entries in di_bh and compare + * them against the number of entries we can hold in an inline dx root + * block. 
+ */ +static int ocfs2_new_dx_should_be_inline(struct inode *dir, + struct buffer_head *di_bh) +{ + int dirent_count = 0; + char *de_buf, *limit; + struct ocfs2_dir_entry *de; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + de_buf = di->id2.i_data.id_data; + limit = de_buf + i_size_read(dir); + + while (de_buf < limit) { + de = (struct ocfs2_dir_entry *)de_buf; + + if (de->name_len && de->inode) + dirent_count++; + + de_buf += le16_to_cpu(de->rec_len); + } + + /* We are careful to leave room for one extra record. */ + return dirent_count < ocfs2_dx_entries_per_root(dir->i_sb); } /* @@ -1358,18 +2813,26 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb, * expansion from an inline directory to one with extents. The first dir block * in that case is taken from the inline data portion of the inode block. * + * This will also return the largest amount of contiguous space for a dirent + * in the block. That value is *not* necessarily the last dirent, even after + * expansion. The directory indexing code wants this value for free space + * accounting. We do this here since we're already walking the entire dir + * block. + * * We add the dir trailer if this filesystem wants it. 
*/ -static void ocfs2_expand_last_dirent(char *start, unsigned int old_size, - struct super_block *sb) +static unsigned int ocfs2_expand_last_dirent(char *start, unsigned int old_size, + struct inode *dir) { + struct super_block *sb = dir->i_sb; struct ocfs2_dir_entry *de; struct ocfs2_dir_entry *prev_de; char *de_buf, *limit; unsigned int new_size = sb->s_blocksize; - unsigned int bytes; + unsigned int bytes, this_hole; + unsigned int largest_hole = 0; - if (ocfs2_supports_dir_trailer(OCFS2_SB(sb))) + if (ocfs2_new_dir_wants_trailer(dir)) new_size = ocfs2_dir_trailer_blk_off(sb); bytes = new_size - old_size; @@ -1378,12 +2841,26 @@ static void ocfs2_expand_last_dirent(char *start, unsigned int old_size, de_buf = start; de = (struct ocfs2_dir_entry *)de_buf; do { + this_hole = ocfs2_figure_dirent_hole(de); + if (this_hole > largest_hole) + largest_hole = this_hole; + prev_de = de; de_buf += le16_to_cpu(de->rec_len); de = (struct ocfs2_dir_entry *)de_buf; } while (de_buf < limit); le16_add_cpu(&prev_de->rec_len, bytes); + + /* We need to double check this after modification of the final + * dirent. 
*/ + this_hole = ocfs2_figure_dirent_hole(prev_de); + if (this_hole > largest_hole) + largest_hole = this_hole; + + if (largest_hole >= OCFS2_DIR_MIN_REC_LEN) + return largest_hole; + return 0; } /* @@ -1396,29 +2873,61 @@ static void ocfs2_expand_last_dirent(char *start, unsigned int old_size, */ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, unsigned int blocks_wanted, + struct ocfs2_dir_lookup_result *lookup, struct buffer_head **first_block_bh) { - u32 alloc, bit_off, len; + u32 alloc, dx_alloc, bit_off, len, num_dx_entries = 0; struct super_block *sb = dir->i_sb; - int ret, credits = ocfs2_inline_to_extents_credits(sb); - u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits; + int ret, i, num_dx_leaves = 0, dx_inline = 0, + credits = ocfs2_inline_to_extents_credits(sb); + u64 dx_insert_blkno, blkno, + bytes = blocks_wanted << sb->s_blocksize_bits; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct ocfs2_inode_info *oi = OCFS2_I(dir); struct ocfs2_alloc_context *data_ac; + struct ocfs2_alloc_context *meta_ac = NULL; struct buffer_head *dirdata_bh = NULL; + struct buffer_head *dx_root_bh = NULL; + struct buffer_head **dx_leaves = NULL; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; handle_t *handle; struct ocfs2_extent_tree et; - int did_quota = 0; + struct ocfs2_extent_tree dx_et; + int did_quota = 0, bytes_allocated = 0; ocfs2_init_dinode_extent_tree(&et, dir, di_bh); alloc = ocfs2_clusters_for_bytes(sb, bytes); + dx_alloc = 0; + + if (ocfs2_supports_indexed_dirs(osb)) { + credits += ocfs2_add_dir_index_credits(sb); + + dx_inline = ocfs2_new_dx_should_be_inline(dir, di_bh); + if (!dx_inline) { + /* Add one more cluster for an index leaf */ + dx_alloc++; + dx_leaves = ocfs2_dx_dir_kmalloc_leaves(sb, + &num_dx_leaves); + if (!dx_leaves) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + } + + /* This gets us the dx_root */ + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); + if (ret) { + 
mlog_errno(ret); + goto out; + } + } /* - * We should never need more than 2 clusters for this - - * maximum dirent size is far less than one block. In fact, - * the only time we'd need more than one cluster is if + * We should never need more than 2 clusters for the unindexed + * tree - maximum dirent size is far less than one block. In + * fact, the only time we'd need more than one cluster is if * blocksize == clustersize and the dirent won't fit in the * extra space that the expansion to a single block gives. As * of today, that only happens on 4k/4k file systems. @@ -1435,7 +2944,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, /* * Prepare for worst case allocation scenario of two separate - * extents. + * extents in the unindexed tree. */ if (alloc == 2) credits += OCFS2_SUBALLOC_ALLOC; @@ -1448,11 +2957,29 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, } if (vfs_dq_alloc_space_nodirty(dir, - ocfs2_clusters_to_bytes(osb->sb, alloc))) { + ocfs2_clusters_to_bytes(osb->sb, + alloc + dx_alloc))) { ret = -EDQUOT; goto out_commit; } did_quota = 1; + + if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { + /* + * Allocate our index cluster first, to maximize the + * possibility that unindexed leaves grow + * contiguously. + */ + ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, + dx_leaves, num_dx_leaves, + &dx_insert_blkno); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1); + } + /* * Try to claim as many clusters as the bitmap can give though * if we only get one now, that's enough to continue. 
The rest @@ -1463,6 +2990,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, mlog_errno(ret); goto out_commit; } + bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1); /* * Operations are carefully ordered so that we set up the new @@ -1489,9 +3017,16 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir)); memset(dirdata_bh->b_data + i_size_read(dir), 0, sb->s_blocksize - i_size_read(dir)); - ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), sb); - if (ocfs2_supports_dir_trailer(osb)) - ocfs2_init_dir_trailer(dir, dirdata_bh); + i = ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), dir); + if (ocfs2_new_dir_wants_trailer(dir)) { + /* + * Prepare the dir trailer up front. It will otherwise look + * like a valid dirent. Even if inserting the index fails + * (unlikely), then all we'll have done is given first dir + * block a small amount of fragmentation. + */ + ocfs2_init_dir_trailer(dir, dirdata_bh, i); + } ret = ocfs2_journal_dirty(handle, dirdata_bh); if (ret) { @@ -1499,6 +3034,24 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, goto out_commit; } + if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { + /* + * Dx dirs with an external cluster need to do this up + * front. Inline dx root's get handled later, after + * we've allocated our root block. We get passed back + * a total number of items so that dr_num_entries can + * be correctly set once the dx_root has been + * allocated. + */ + ret = ocfs2_dx_dir_index_block(dir, handle, dx_leaves, + num_dx_leaves, &num_dx_entries, + dirdata_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + /* * Set extent, i_size, etc on the directory. 
After this, the * inode should contain the same exact dirents as before and @@ -1551,6 +3104,27 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, goto out_commit; } + if (ocfs2_supports_indexed_dirs(osb)) { + ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh, + dirdata_bh, meta_ac, dx_inline, + num_dx_entries, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + if (dx_inline) { + ocfs2_dx_dir_index_root_block(dir, dx_root_bh, + dirdata_bh); + } else { + ocfs2_init_dx_root_extent_tree(&dx_et, dir, dx_root_bh); + ret = ocfs2_insert_extent(osb, handle, dir, &dx_et, 0, + dx_insert_blkno, 1, 0, NULL); + if (ret) + mlog_errno(ret); + } + } + /* * We asked for two clusters, but only got one in the 1st * pass. Claim the 2nd cluster as a separate extent. @@ -1570,15 +3144,32 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, mlog_errno(ret); goto out_commit; } + bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1); } *first_block_bh = dirdata_bh; dirdata_bh = NULL; + if (ocfs2_supports_indexed_dirs(osb)) { + unsigned int off; + + if (!dx_inline) { + /* + * We need to return the correct block within the + * cluster which should hold our entry. 
+ */ + off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), + &lookup->dl_hinfo); + get_bh(dx_leaves[off]); + lookup->dl_dx_leaf_bh = dx_leaves[off]; + } + lookup->dl_dx_root_bh = dx_root_bh; + dx_root_bh = NULL; + } out_commit: if (ret < 0 && did_quota) - vfs_dq_free_space_nodirty(dir, - ocfs2_clusters_to_bytes(osb->sb, 2)); + vfs_dq_free_space_nodirty(dir, bytes_allocated); + ocfs2_commit_trans(osb, handle); out_sem: @@ -1587,8 +3178,17 @@ out_sem: out: if (data_ac) ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + if (dx_leaves) { + for (i = 0; i < num_dx_leaves; i++) + brelse(dx_leaves[i]); + kfree(dx_leaves); + } brelse(dirdata_bh); + brelse(dx_root_bh); return ret; } @@ -1658,11 +3258,14 @@ bail: * is to be turned into an extent based one. The size of the dirent to * insert might be larger than the space gained by growing to just one * block, so we may have to grow the inode by two blocks in that case. + * + * If the directory is already indexed, dx_root_bh must be provided. */ static int ocfs2_extend_dir(struct ocfs2_super *osb, struct inode *dir, struct buffer_head *parent_fe_bh, unsigned int blocks_wanted, + struct ocfs2_dir_lookup_result *lookup, struct buffer_head **new_de_bh) { int status = 0; @@ -1677,17 +3280,29 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, struct ocfs2_dir_entry * de; struct super_block *sb = osb->sb; struct ocfs2_extent_tree et; + struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; mlog_entry_void(); if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + /* + * This would be a code error as an inline directory should + * never have an index root. + */ + BUG_ON(dx_root_bh); + status = ocfs2_expand_inline_dir(dir, parent_fe_bh, - blocks_wanted, &new_bh); + blocks_wanted, lookup, + &new_bh); if (status) { mlog_errno(status); goto bail; } + /* Expansion from inline to an indexed directory will + * have given us this. 
*/ + dx_root_bh = lookup->dl_dx_root_bh; + if (blocks_wanted == 1) { /* * If the new dirent will fit inside the space @@ -1751,6 +3366,10 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, } do_extend: + if (ocfs2_dir_indexed(dir)) + credits++; /* For attaching the new dirent block to the + * dx_root */ + down_write(&OCFS2_I(dir)->ip_alloc_sem); drop_alloc_sem = 1; @@ -1781,9 +3400,19 @@ do_extend: de = (struct ocfs2_dir_entry *) new_bh->b_data; de->inode = 0; - if (ocfs2_dir_has_trailer(dir)) { + if (ocfs2_supports_dir_trailer(dir)) { de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb)); - ocfs2_init_dir_trailer(dir, new_bh); + + ocfs2_init_dir_trailer(dir, new_bh, le16_to_cpu(de->rec_len)); + + if (ocfs2_dir_indexed(dir)) { + status = ocfs2_dx_dir_link_trailer(dir, handle, + dx_root_bh, new_bh); + if (status) { + mlog_errno(status); + goto bail; + } + } } else { de->rec_len = cpu_to_le16(sb->s_blocksize); } @@ -1839,7 +3468,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh, * This calculates how many free bytes we'd have in block zero, should * this function force expansion to an extent tree. 
*/ - if (ocfs2_supports_dir_trailer(OCFS2_SB(sb))) + if (ocfs2_new_dir_wants_trailer(dir)) free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir); else free_space = dir->i_sb->s_blocksize - i_size_read(dir); @@ -1970,12 +3599,766 @@ bail: return status; } +static int dx_leaf_sort_cmp(const void *a, const void *b) +{ + const struct ocfs2_dx_entry *entry1 = a; + const struct ocfs2_dx_entry *entry2 = b; + u32 major_hash1 = le32_to_cpu(entry1->dx_major_hash); + u32 major_hash2 = le32_to_cpu(entry2->dx_major_hash); + u32 minor_hash1 = le32_to_cpu(entry1->dx_minor_hash); + u32 minor_hash2 = le32_to_cpu(entry2->dx_minor_hash); + + if (major_hash1 > major_hash2) + return 1; + if (major_hash1 < major_hash2) + return -1; + + /* + * It is not strictly necessary to sort by minor + */ + if (minor_hash1 > minor_hash2) + return 1; + if (minor_hash1 < minor_hash2) + return -1; + return 0; +} + +static void dx_leaf_sort_swap(void *a, void *b, int size) +{ + struct ocfs2_dx_entry *entry1 = a; + struct ocfs2_dx_entry *entry2 = b; + struct ocfs2_dx_entry tmp; + + BUG_ON(size != sizeof(*entry1)); + + tmp = *entry1; + *entry1 = *entry2; + *entry2 = tmp; +} + +static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf) +{ + struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list; + int i, num = le16_to_cpu(dl_list->de_num_used); + + for (i = 0; i < (num - 1); i++) { + if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) != + le32_to_cpu(dl_list->de_entries[i + 1].dx_major_hash)) + return 0; + } + + return 1; +} + +/* + * Find the optimal value to split this leaf on. This expects the leaf + * entries to be in sorted order. + * + * leaf_cpos is the cpos of the leaf we're splitting. insert_hash is + * the hash we want to insert. + * + * This function is only concerned with the major hash - that which + * determines which cluster an item belongs to. 
+ */ +static int ocfs2_dx_dir_find_leaf_split(struct ocfs2_dx_leaf *dx_leaf, + u32 leaf_cpos, u32 insert_hash, + u32 *split_hash) +{ + struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list; + int i, num_used = le16_to_cpu(dl_list->de_num_used); + int allsame; + + /* + * There's a couple rare, but nasty corner cases we have to + * check for here. All of them involve a leaf where all value + * have the same hash, which is what we look for first. + * + * Most of the time, all of the above is false, and we simply + * pick the median value for a split. + */ + allsame = ocfs2_dx_leaf_same_major(dx_leaf); + if (allsame) { + u32 val = le32_to_cpu(dl_list->de_entries[0].dx_major_hash); + + if (val == insert_hash) { + /* + * No matter where we would choose to split, + * the new entry would want to occupy the same + * block as these. Since there's no space left + * in their existing block, we know there + * won't be space after the split. + */ + return -ENOSPC; + } + + if (val == leaf_cpos) { + /* + * Because val is the same as leaf_cpos (which + * is the smallest value this leaf can have), + * yet is not equal to insert_hash, then we + * know that insert_hash *must* be larger than + * val (and leaf_cpos). At least cpos+1 in value. + * + * We also know then, that there cannot be an + * adjacent extent (otherwise we'd be looking + * at it). Choosing this value gives us a + * chance to get some contiguousness. + */ + *split_hash = leaf_cpos + 1; + return 0; + } + + if (val > insert_hash) { + /* + * val can not be the same as insert hash, and + * also must be larger than leaf_cpos. Also, + * we know that there can't be a leaf between + * cpos and val, otherwise the entries with + * hash 'val' would be there. 
+ */ + *split_hash = val; + return 0; + } + + *split_hash = insert_hash; + return 0; + } + + /* + * Since the records are sorted and the checks above + * guaranteed that not all records in this block are the same, + * we simple travel forward, from the median, and pick the 1st + * record whose value is larger than leaf_cpos. + */ + for (i = (num_used / 2); i < num_used; i++) + if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) > + leaf_cpos) + break; + + BUG_ON(i == num_used); /* Should be impossible */ + *split_hash = le32_to_cpu(dl_list->de_entries[i].dx_major_hash); + return 0; +} + +/* + * Transfer all entries in orig_dx_leaves whose major hash is equal to or + * larger than split_hash into new_dx_leaves. We use a temporary + * buffer (tmp_dx_leaf) to make the changes to the original leaf blocks. + * + * Since the block offset inside a leaf (cluster) is a constant mask + * of minor_hash, we can optimize - an item at block offset X within + * the original cluster, will be at offset X within the new cluster. 
+ */ +static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash, + handle_t *handle, + struct ocfs2_dx_leaf *tmp_dx_leaf, + struct buffer_head **orig_dx_leaves, + struct buffer_head **new_dx_leaves, + int num_dx_leaves) +{ + int i, j, num_used; + u32 major_hash; + struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf; + struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list; + struct ocfs2_dx_entry *dx_entry; + + tmp_list = &tmp_dx_leaf->dl_list; + + for (i = 0; i < num_dx_leaves; i++) { + orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data; + orig_list = &orig_dx_leaf->dl_list; + new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data; + new_list = &new_dx_leaf->dl_list; + + num_used = le16_to_cpu(orig_list->de_num_used); + + memcpy(tmp_dx_leaf, orig_dx_leaf, dir->i_sb->s_blocksize); + tmp_list->de_num_used = cpu_to_le16(0); + memset(&tmp_list->de_entries, 0, sizeof(*dx_entry)*num_used); + + for (j = 0; j < num_used; j++) { + dx_entry = &orig_list->de_entries[j]; + major_hash = le32_to_cpu(dx_entry->dx_major_hash); + if (major_hash >= split_hash) + ocfs2_dx_dir_leaf_insert_tail(new_dx_leaf, + dx_entry); + else + ocfs2_dx_dir_leaf_insert_tail(tmp_dx_leaf, + dx_entry); + } + memcpy(orig_dx_leaf, tmp_dx_leaf, dir->i_sb->s_blocksize); + + ocfs2_journal_dirty(handle, orig_dx_leaves[i]); + ocfs2_journal_dirty(handle, new_dx_leaves[i]); + } +} + +static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb, + struct ocfs2_dx_root_block *dx_root) +{ + int credits = ocfs2_clusters_to_blocks(osb->sb, 2); + + credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list, 1); + credits += ocfs2_quota_trans_credits(osb->sb); + return credits; +} + +/* + * Find the median value in dx_leaf_bh and allocate a new leaf to move + * half our entries into. 
+ */ +static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, + struct buffer_head *dx_root_bh, + struct buffer_head *dx_leaf_bh, + struct ocfs2_dx_hinfo *hinfo, u32 leaf_cpos, + u64 leaf_blkno) +{ + struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data; + int credits, ret, i, num_used, did_quota = 0; + u32 cpos, split_hash, insert_hash = hinfo->major_hash; + u64 orig_leaves_start; + int num_dx_leaves; + struct buffer_head **orig_dx_leaves = NULL; + struct buffer_head **new_dx_leaves = NULL; + struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL; + struct ocfs2_extent_tree et; + handle_t *handle = NULL; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dx_leaf *tmp_dx_leaf = NULL; + + mlog(0, "DX Dir: %llu, rebalance leaf leaf_blkno: %llu insert: %u\n", + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)leaf_blkno, insert_hash); + + ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh); + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + /* + * XXX: This is a rather large limit. We should use a more + * realistic value. 
+ */ + if (le32_to_cpu(dx_root->dr_clusters) == UINT_MAX) + return -ENOSPC; + + num_used = le16_to_cpu(dx_leaf->dl_list.de_num_used); + if (num_used < le16_to_cpu(dx_leaf->dl_list.de_count)) { + mlog(ML_ERROR, "DX Dir: %llu, Asked to rebalance empty leaf: " + "%llu, %d\n", (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)leaf_blkno, num_used); + ret = -EIO; + goto out; + } + + orig_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves); + if (!orig_dx_leaves) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + new_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, NULL); + if (!new_dx_leaves) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_lock_allocators(dir, &et, 1, 0, &data_ac, &meta_ac); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + credits = ocfs2_dx_dir_rebalance_credits(osb, dx_root); + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto out; + } + + if (vfs_dq_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(dir->i_sb, 1))) { + ret = -EDQUOT; + goto out_commit; + } + did_quota = 1; + + ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * This block is changing anyway, so we can sort it in place. + */ + sort(dx_leaf->dl_list.de_entries, num_used, + sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp, + dx_leaf_sort_swap); + + ret = ocfs2_journal_dirty(handle, dx_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash, + &split_hash); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + mlog(0, "Split leaf (%u) at %u, insert major hash is %u\n", + leaf_cpos, split_hash, insert_hash); + + /* + * We have to carefully order operations here. 
There are items + * which want to be in the new cluster before insert, but in + * order to put those items in the new cluster, we alter the + * old cluster. A failure to insert gets nasty. + * + * So, start by reserving writes to the old + * cluster. ocfs2_dx_dir_new_cluster will reserve writes on + * the new cluster for us, before inserting it. The insert + * won't happen if there's an error before that. Once the + * insert is done then, we can transfer from one leaf into the + * other without fear of hitting any error. + */ + + /* + * The leaf transfer wants some scratch space so that we don't + * wind up doing a bunch of expensive memmove(). + */ + tmp_dx_leaf = kmalloc(osb->sb->s_blocksize, GFP_NOFS); + if (!tmp_dx_leaf) { + ret = -ENOMEM; + mlog_errno(ret); + goto out_commit; + } + + orig_leaves_start = ocfs2_block_to_cluster_start(dir->i_sb, leaf_blkno); + ret = ocfs2_read_dx_leaves(dir, orig_leaves_start, num_dx_leaves, + orig_dx_leaves); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + for (i = 0; i < num_dx_leaves; i++) { + ret = ocfs2_journal_access_dl(handle, dir, orig_dx_leaves[i], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + cpos = split_hash; + ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle, + data_ac, meta_ac, new_dx_leaves, + num_dx_leaves); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf, + orig_dx_leaves, new_dx_leaves, num_dx_leaves); + +out_commit: + if (ret < 0 && did_quota) + vfs_dq_free_space_nodirty(dir, + ocfs2_clusters_to_bytes(dir->i_sb, 1)); + + ocfs2_commit_trans(osb, handle); + +out: + if (orig_dx_leaves || new_dx_leaves) { + for (i = 0; i < num_dx_leaves; i++) { + if (orig_dx_leaves) + brelse(orig_dx_leaves[i]); + if (new_dx_leaves) + brelse(new_dx_leaves[i]); + } + kfree(orig_dx_leaves); + kfree(new_dx_leaves); + } + + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + if (data_ac) + 
ocfs2_free_alloc_context(data_ac); + + kfree(tmp_dx_leaf); + return ret; +} + +static int ocfs2_find_dir_space_dx(struct ocfs2_super *osb, struct inode *dir, + struct buffer_head *di_bh, + struct buffer_head *dx_root_bh, + const char *name, int namelen, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret, rebalanced = 0; + struct ocfs2_dx_root_block *dx_root; + struct buffer_head *dx_leaf_bh = NULL; + struct ocfs2_dx_leaf *dx_leaf; + u64 blkno; + u32 leaf_cpos; + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + +restart_search: + ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, &lookup->dl_hinfo, + &leaf_cpos, &blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_dx_leaf(dir, blkno, &dx_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data; + + if (le16_to_cpu(dx_leaf->dl_list.de_num_used) >= + le16_to_cpu(dx_leaf->dl_list.de_count)) { + if (rebalanced) { + /* + * Rebalancing should have provided us with + * space in an appropriate leaf. + * + * XXX: Is this an abnormal condition then? + * Should we print a message here? + */ + ret = -ENOSPC; + goto out; + } + + ret = ocfs2_dx_dir_rebalance(osb, dir, dx_root_bh, dx_leaf_bh, + &lookup->dl_hinfo, leaf_cpos, + blkno); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + /* + * Restart the lookup. The rebalance might have + * changed which block our item fits into. Mark our + * progress, so we only execute this once. 
+ */ + brelse(dx_leaf_bh); + dx_leaf_bh = NULL; + rebalanced = 1; + goto restart_search; + } + + lookup->dl_dx_leaf_bh = dx_leaf_bh; + dx_leaf_bh = NULL; + +out: + brelse(dx_leaf_bh); + return ret; +} + +static int ocfs2_search_dx_free_list(struct inode *dir, + struct buffer_head *dx_root_bh, + int namelen, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret = -ENOSPC; + struct buffer_head *leaf_bh = NULL, *prev_leaf_bh = NULL; + struct ocfs2_dir_block_trailer *db; + u64 next_block; + int rec_len = OCFS2_DIR_REC_LEN(namelen); + struct ocfs2_dx_root_block *dx_root; + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + next_block = le64_to_cpu(dx_root->dr_free_blk); + + while (next_block) { + brelse(prev_leaf_bh); + prev_leaf_bh = leaf_bh; + leaf_bh = NULL; + + ret = ocfs2_read_dir_block_direct(dir, next_block, &leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + db = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb); + if (rec_len <= le16_to_cpu(db->db_free_rec_len)) { + lookup->dl_leaf_bh = leaf_bh; + lookup->dl_prev_leaf_bh = prev_leaf_bh; + leaf_bh = NULL; + prev_leaf_bh = NULL; + break; + } + + next_block = le64_to_cpu(db->db_free_next); + } + + if (!next_block) + ret = -ENOSPC; + +out: + + brelse(leaf_bh); + brelse(prev_leaf_bh); + return ret; +} + +static int ocfs2_expand_inline_dx_root(struct inode *dir, + struct buffer_head *dx_root_bh) +{ + int ret, num_dx_leaves, i, j, did_quota = 0; + struct buffer_head **dx_leaves = NULL; + struct ocfs2_extent_tree et; + u64 insert_blkno; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + handle_t *handle = NULL; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dx_entry_list *entry_list; + struct ocfs2_dx_entry *dx_entry; + struct ocfs2_dx_leaf *target_leaf; + + ret = ocfs2_reserve_clusters(osb, 1, &data_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves); + if (!dx_leaves) { + ret = 
-ENOMEM; + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, ocfs2_calc_dxi_expand_credits(osb->sb)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + if (vfs_dq_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(osb->sb, 1))) { + ret = -EDQUOT; + goto out_commit; + } + did_quota = 1; + + /* + * We do this up front, before the allocation, so that a + * failure to add the dx_root_bh to the journal won't result + * us losing clusters. + */ + ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, dx_leaves, + num_dx_leaves, &insert_blkno); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * Transfer the entries from our dx_root into the appropriate + * block + */ + dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; + entry_list = &dx_root->dr_entries; + + for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) { + dx_entry = &entry_list->de_entries[i]; + + j = __ocfs2_dx_dir_hash_idx(osb, + le32_to_cpu(dx_entry->dx_minor_hash)); + target_leaf = (struct ocfs2_dx_leaf *)dx_leaves[j]->b_data; + + ocfs2_dx_dir_leaf_insert_tail(target_leaf, dx_entry); + + /* Each leaf has been passed to the journal already + * via __ocfs2_dx_dir_new_cluster() */ + } + + dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE; + memset(&dx_root->dr_list, 0, osb->sb->s_blocksize - + offsetof(struct ocfs2_dx_root_block, dr_list)); + dx_root->dr_list.l_count = + cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb)); + + /* This should never fail considering we start with an empty + * dx_root. 
*/ + ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh); + ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, + insert_blkno, 1, 0, NULL); + if (ret) + mlog_errno(ret); + did_quota = 0; + + ocfs2_journal_dirty(handle, dx_root_bh); + +out_commit: + if (ret < 0 && did_quota) + vfs_dq_free_space_nodirty(dir, + ocfs2_clusters_to_bytes(dir->i_sb, 1)); + + ocfs2_commit_trans(osb, handle); + +out: + if (data_ac) + ocfs2_free_alloc_context(data_ac); + + if (dx_leaves) { + for (i = 0; i < num_dx_leaves; i++) + brelse(dx_leaves[i]); + kfree(dx_leaves); + } + return ret; +} + +static int ocfs2_inline_dx_has_space(struct buffer_head *dx_root_bh) +{ + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dx_entry_list *entry_list; + + dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; + entry_list = &dx_root->dr_entries; + + if (le16_to_cpu(entry_list->de_num_used) >= + le16_to_cpu(entry_list->de_count)) + return -ENOSPC; + + return 0; +} + +static int ocfs2_prepare_dx_dir_for_insert(struct inode *dir, + struct buffer_head *di_bh, + const char *name, + int namelen, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret, free_dx_root = 1; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct buffer_head *dx_root_bh = NULL; + struct buffer_head *leaf_bh = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_dx_root_block *dx_root; + + ret = ocfs2_read_dx_root(dir, di, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + if (le32_to_cpu(dx_root->dr_num_entries) == OCFS2_DX_ENTRIES_MAX) { + ret = -ENOSPC; + mlog_errno(ret); + goto out; + } + + if (ocfs2_dx_root_inline(dx_root)) { + ret = ocfs2_inline_dx_has_space(dx_root_bh); + + if (ret == 0) + goto search_el; + + /* + * We ran out of room in the root block. Expand it to + * an extent, then allow ocfs2_find_dir_space_dx to do + * the rest. 
+ */ + ret = ocfs2_expand_inline_dx_root(dir, dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* + * Insert preparation for an indexed directory is split into two + * steps. The call to find_dir_space_dx reserves room in the index for + * an additional item. If we run out of space there, it's a real error + * we can't continue on. + */ + ret = ocfs2_find_dir_space_dx(osb, dir, di_bh, dx_root_bh, name, + namelen, lookup); + if (ret) { + mlog_errno(ret); + goto out; + } + +search_el: + /* + * Next, we need to find space in the unindexed tree. This call + * searches using the free space linked list. If the unindexed tree + * lacks sufficient space, we'll expand it below. The expansion code + * is smart enough to add any new blocks to the free space list. + */ + ret = ocfs2_search_dx_free_list(dir, dx_root_bh, namelen, lookup); + if (ret && ret != -ENOSPC) { + mlog_errno(ret); + goto out; + } + + /* Do this up here - ocfs2_extend_dir might need the dx_root */ + lookup->dl_dx_root_bh = dx_root_bh; + free_dx_root = 0; + + if (ret == -ENOSPC) { + ret = ocfs2_extend_dir(osb, dir, di_bh, 1, lookup, &leaf_bh); + + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We make the assumption here that new leaf blocks are added + * to the front of our free list. + */ + lookup->dl_prev_leaf_bh = NULL; + lookup->dl_leaf_bh = leaf_bh; + } + +out: + if (free_dx_root) + brelse(dx_root_bh); + return ret; +} + +/* + * Get a directory ready for insert. Any directory allocation required + * happens here. Success returns zero, and enough context in the dir + * lookup result that ocfs2_add_entry() will be able complete the task + * with minimal performance impact. 
+ */ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, struct inode *dir, struct buffer_head *parent_fe_bh, const char *name, int namelen, - struct buffer_head **ret_de_bh) + struct ocfs2_dir_lookup_result *lookup) { int ret; unsigned int blocks_wanted = 1; @@ -1984,14 +4367,34 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, mlog(0, "getting ready to insert namelen %d into dir %llu\n", namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno); - *ret_de_bh = NULL; - if (!namelen) { ret = -EINVAL; mlog_errno(ret); goto out; } + /* + * Do this up front to reduce confusion. + * + * The directory might start inline, then be turned into an + * indexed one, in which case we'd need to hash deep inside + * ocfs2_find_dir_space_id(). Since + * ocfs2_prepare_dx_dir_for_insert() also needs this hash + * done, there seems no point in spreading out the calls. We + * can optimize away the case where the file system doesn't + * support indexing. + */ + if (ocfs2_supports_indexed_dirs(osb)) + ocfs2_dx_dir_name_hash(dir, name, namelen, &lookup->dl_hinfo); + + if (ocfs2_dir_indexed(dir)) { + ret = ocfs2_prepare_dx_dir_for_insert(dir, parent_fe_bh, + name, namelen, lookup); + if (ret) + mlog_errno(ret); + goto out; + } + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name, namelen, &bh, &blocks_wanted); @@ -2010,7 +4413,7 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, BUG_ON(bh); ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted, - &bh); + lookup, &bh); if (ret) { if (ret != -ENOSPC) mlog_errno(ret); @@ -2020,9 +4423,154 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, BUG_ON(!bh); } - *ret_de_bh = bh; + lookup->dl_leaf_bh = bh; bh = NULL; out: brelse(bh); return ret; } + +static int ocfs2_dx_dir_remove_index(struct inode *dir, + struct buffer_head *di_bh, + struct buffer_head *dx_root_bh) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct 
ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_dx_root_block *dx_root; + struct inode *dx_alloc_inode = NULL; + struct buffer_head *dx_alloc_bh = NULL; + handle_t *handle; + u64 blk; + u16 bit; + u64 bg_blkno; + + dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; + + dx_alloc_inode = ocfs2_get_system_file_inode(osb, + EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(dx_root->dr_suballoc_slot)); + if (!dx_alloc_inode) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + mutex_lock(&dx_alloc_inode->i_mutex); + + ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_mutex; + } + + handle = ocfs2_start_trans(osb, OCFS2_DX_ROOT_REMOVE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_journal_access_di(handle, dir, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL; + di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); + di->i_dx_root = cpu_to_le64(0ULL); + + ocfs2_journal_dirty(handle, di_bh); + + blk = le64_to_cpu(dx_root->dr_blkno); + bit = le16_to_cpu(dx_root->dr_suballoc_bit); + bg_blkno = ocfs2_which_suballoc_group(blk, bit); + ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh, + bit, bg_blkno, 1); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out_unlock: + ocfs2_inode_unlock(dx_alloc_inode, 1); + +out_mutex: + mutex_unlock(&dx_alloc_inode->i_mutex); + brelse(dx_alloc_bh); +out: + iput(dx_alloc_inode); + return ret; +} + +int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh) +{ + int ret; + unsigned int uninitialized_var(clen); + u32 major_hash = UINT_MAX, p_cpos, uninitialized_var(cpos); + u64 uninitialized_var(blkno); + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct buffer_head *dx_root_bh = NULL; + struct ocfs2_dx_root_block 
*dx_root; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_extent_tree et; + + ocfs2_init_dealloc_ctxt(&dealloc); + + if (!ocfs2_dir_indexed(dir)) + return 0; + + ret = ocfs2_read_dx_root(dir, di, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + + if (ocfs2_dx_root_inline(dx_root)) + goto remove_index; + + ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh); + + /* XXX: What if dr_clusters is too large? */ + while (le32_to_cpu(dx_root->dr_clusters)) { + ret = ocfs2_dx_dir_lookup_rec(dir, &dx_root->dr_list, + major_hash, &cpos, &blkno, &clen); + if (ret) { + mlog_errno(ret); + goto out; + } + + p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno); + + ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, + &dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (cpos == 0) + break; + + major_hash = cpos - 1; + } + +remove_index: + ret = ocfs2_dx_dir_remove_index(dir, di_bh, dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_remove_from_cache(dir, dx_root_bh); +out: + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + + brelse(dx_root_bh); + return ret; +} diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h index c511e2e18e9f..e683f3deb645 100644 --- a/fs/ocfs2/dir.h +++ b/fs/ocfs2/dir.h @@ -26,44 +26,70 @@ #ifndef OCFS2_DIR_H #define OCFS2_DIR_H -struct buffer_head *ocfs2_find_entry(const char *name, - int namelen, - struct inode *dir, - struct ocfs2_dir_entry **res_dir); +struct ocfs2_dx_hinfo { + u32 major_hash; + u32 minor_hash; +}; + +struct ocfs2_dir_lookup_result { + struct buffer_head *dl_leaf_bh; /* Unindexed leaf + * block */ + struct ocfs2_dir_entry *dl_entry; /* Target dirent in + * unindexed leaf */ + + struct buffer_head *dl_dx_root_bh; /* Root of indexed + * tree */ + + struct buffer_head *dl_dx_leaf_bh; /* Indexed leaf block */ + struct 
ocfs2_dx_entry *dl_dx_entry; /* Target dx_entry in + * indexed leaf */ + struct ocfs2_dx_hinfo dl_hinfo; /* Name hash results */ + + struct buffer_head *dl_prev_leaf_bh;/* Previous entry in + * dir free space + * list. NULL if + * previous entry is + * dx root block. */ +}; + +void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res); + +int ocfs2_find_entry(const char *name, int namelen, + struct inode *dir, + struct ocfs2_dir_lookup_result *lookup); int ocfs2_delete_entry(handle_t *handle, struct inode *dir, - struct ocfs2_dir_entry *de_del, - struct buffer_head *bh); + struct ocfs2_dir_lookup_result *res); int __ocfs2_add_entry(handle_t *handle, struct inode *dir, const char *name, int namelen, struct inode *inode, u64 blkno, struct buffer_head *parent_fe_bh, - struct buffer_head *insert_bh); + struct ocfs2_dir_lookup_result *lookup); static inline int ocfs2_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode, u64 blkno, struct buffer_head *parent_fe_bh, - struct buffer_head *insert_bh) + struct ocfs2_dir_lookup_result *lookup) { return __ocfs2_add_entry(handle, dentry->d_parent->d_inode, dentry->d_name.name, dentry->d_name.len, - inode, blkno, parent_fe_bh, insert_bh); + inode, blkno, parent_fe_bh, lookup); } int ocfs2_update_entry(struct inode *dir, handle_t *handle, - struct buffer_head *de_bh, struct ocfs2_dir_entry *de, + struct ocfs2_dir_lookup_result *res, struct inode *new_entry_inode); int ocfs2_check_dir_for_entry(struct inode *dir, const char *name, int namelen); int ocfs2_empty_dir(struct inode *inode); + int ocfs2_find_files_on_disk(const char *name, int namelen, u64 *blkno, struct inode *inode, - struct buffer_head **dirent_bh, - struct ocfs2_dir_entry **dirent); + struct ocfs2_dir_lookup_result *res); int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, int namelen, u64 *blkno); int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir); @@ -74,14 +100,17 @@ int 
ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, struct buffer_head *parent_fe_bh, const char *name, int namelen, - struct buffer_head **ret_de_bh); + struct ocfs2_dir_lookup_result *lookup); struct ocfs2_alloc_context; int ocfs2_fill_new_dir(struct ocfs2_super *osb, handle_t *handle, struct inode *parent, struct inode *inode, struct buffer_head *fe_bh, - struct ocfs2_alloc_context *data_ac); + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac); + +int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh); struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize, void *data); diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index bb53714813ab..0102be35980c 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -52,16 +52,12 @@ enum dlm_mle_type { DLM_MLE_BLOCK, DLM_MLE_MASTER, - DLM_MLE_MIGRATION -}; - -struct dlm_lock_name { - u8 len; - u8 name[DLM_LOCKID_NAME_MAX]; + DLM_MLE_MIGRATION, + DLM_MLE_NUM_TYPES }; struct dlm_master_list_entry { - struct list_head list; + struct hlist_node master_hash_node; struct list_head hb_events; struct dlm_ctxt *dlm; spinlock_t spinlock; @@ -78,10 +74,10 @@ struct dlm_master_list_entry { enum dlm_mle_type type; struct o2hb_callback_func mle_hb_up; struct o2hb_callback_func mle_hb_down; - union { - struct dlm_lock_resource *res; - struct dlm_lock_name name; - } u; + struct dlm_lock_resource *mleres; + unsigned char mname[DLM_LOCKID_NAME_MAX]; + unsigned int mnamelen; + unsigned int mnamehash; }; enum dlm_ast_type { @@ -151,13 +147,14 @@ struct dlm_ctxt unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; struct dlm_recovery_ctxt reco; spinlock_t master_lock; - struct list_head master_list; + struct hlist_head **master_hash; struct list_head mle_hb_events; /* these give a really vague idea of the system load */ - atomic_t local_resources; - atomic_t remote_resources; - atomic_t unknown_resources; + atomic_t 
mle_tot_count[DLM_MLE_NUM_TYPES]; + atomic_t mle_cur_count[DLM_MLE_NUM_TYPES]; + atomic_t res_tot_count; + atomic_t res_cur_count; struct dlm_debug_ctxt *dlm_debug_ctxt; struct dentry *dlm_debugfs_subroot; @@ -195,6 +192,13 @@ static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE); } +static inline struct hlist_head *dlm_master_hash(struct dlm_ctxt *dlm, + unsigned i) +{ + return dlm->master_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + + (i % DLM_BUCKETS_PER_PAGE); +} + /* these keventd work queue items are for less-frequently * called functions that cannot be directly called from the * net message handlers for some reason, usually because @@ -848,9 +852,7 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, unsigned int len); int dlm_is_host_down(int errno); -void dlm_change_lockres_owner(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res, - u8 owner); + struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, const char *lockid, int namelen, @@ -1008,6 +1010,9 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res) DLM_LOCK_RES_MIGRATING)); } +void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle); +void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle); + /* create/destroy slab caches */ int dlm_init_master_caches(void); void dlm_destroy_master_caches(void); @@ -1110,6 +1115,23 @@ static inline int dlm_node_iter_next(struct dlm_node_iter *iter) return bit; } +static inline void dlm_set_lockres_owner(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 owner) +{ + assert_spin_locked(&res->spinlock); + + res->owner = owner; +} +static inline void dlm_change_lockres_owner(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 owner) +{ + assert_spin_locked(&res->spinlock); + + if (owner != res->owner) + 
dlm_set_lockres_owner(dlm, res, owner); +} #endif /* DLMCOMMON_H */ diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index b32f60a5acfb..df52f706f669 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -287,18 +287,8 @@ static int stringify_nodemap(unsigned long *nodemap, int maxnodes, static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len) { int out = 0; - unsigned int namelen; - const char *name; char *mle_type; - if (mle->type != DLM_MLE_MASTER) { - namelen = mle->u.name.len; - name = mle->u.name.name; - } else { - namelen = mle->u.res->lockname.len; - name = mle->u.res->lockname.name; - } - if (mle->type == DLM_MLE_BLOCK) mle_type = "BLK"; else if (mle->type == DLM_MLE_MASTER) @@ -306,7 +296,7 @@ static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len) else mle_type = "MIG"; - out += stringify_lockname(name, namelen, buf + out, len - out); + out += stringify_lockname(mle->mname, mle->mnamelen, buf + out, len - out); out += snprintf(buf + out, len - out, "\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n", mle_type, mle->master, mle->new_master, @@ -501,23 +491,33 @@ static struct file_operations debug_purgelist_fops = { static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) { struct dlm_master_list_entry *mle; - int out = 0; - unsigned long total = 0; + struct hlist_head *bucket; + struct hlist_node *list; + int i, out = 0; + unsigned long total = 0, longest = 0, bktcnt; out += snprintf(db->buf + out, db->len - out, "Dumping MLEs for Domain: %s\n", dlm->name); spin_lock(&dlm->master_lock); - list_for_each_entry(mle, &dlm->master_list, list) { - ++total; - if (db->len - out < 200) - continue; - out += dump_mle(mle, db->buf + out, db->len - out); + for (i = 0; i < DLM_HASH_BUCKETS; i++) { + bucket = dlm_master_hash(dlm, i); + hlist_for_each(list, bucket) { + mle = hlist_entry(list, struct dlm_master_list_entry, + master_hash_node); + ++total; + ++bktcnt; + if (db->len - out 
< 200) + continue; + out += dump_mle(mle, db->buf + out, db->len - out); + } + longest = max(longest, bktcnt); + bktcnt = 0; } spin_unlock(&dlm->master_lock); out += snprintf(db->buf + out, db->len - out, - "Total on list: %ld\n", total); + "Total: %ld, Longest: %ld\n", total, longest); return out; } @@ -756,12 +756,8 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) int out = 0; struct dlm_reco_node_data *node; char *state; - int lres, rres, ures, tres; - - lres = atomic_read(&dlm->local_resources); - rres = atomic_read(&dlm->remote_resources); - ures = atomic_read(&dlm->unknown_resources); - tres = lres + rres + ures; + int cur_mles = 0, tot_mles = 0; + int i; spin_lock(&dlm->spinlock); @@ -804,21 +800,48 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) db->buf + out, db->len - out); out += snprintf(db->buf + out, db->len - out, "\n"); - /* Mastered Resources Total: xxx Locally: xxx Remotely: ... */ + /* Lock Resources: xxx (xxx) */ + out += snprintf(db->buf + out, db->len - out, + "Lock Resources: %d (%d)\n", + atomic_read(&dlm->res_cur_count), + atomic_read(&dlm->res_tot_count)); + + for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) + tot_mles += atomic_read(&dlm->mle_tot_count[i]); + + for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) + cur_mles += atomic_read(&dlm->mle_cur_count[i]); + + /* MLEs: xxx (xxx) */ + out += snprintf(db->buf + out, db->len - out, + "MLEs: %d (%d)\n", cur_mles, tot_mles); + + /* Blocking: xxx (xxx) */ + out += snprintf(db->buf + out, db->len - out, + " Blocking: %d (%d)\n", + atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]), + atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK])); + + /* Mastery: xxx (xxx) */ + out += snprintf(db->buf + out, db->len - out, + " Mastery: %d (%d)\n", + atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]), + atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER])); + + /* Migration: xxx (xxx) */ out += snprintf(db->buf + out, db->len - out, - "Mastered Resources Total: %d Locally: 
%d " - "Remotely: %d Unknown: %d\n", - tres, lres, rres, ures); + " Migration: %d (%d)\n", + atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]), + atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION])); /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */ out += snprintf(db->buf + out, db->len - out, "Lists: Dirty=%s Purge=%s PendingASTs=%s " - "PendingBASTs=%s Master=%s\n", + "PendingBASTs=%s\n", (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"), (list_empty(&dlm->purge_list) ? "Empty" : "InUse"), (list_empty(&dlm->pending_asts) ? "Empty" : "InUse"), - (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"), - (list_empty(&dlm->master_list) ? "Empty" : "InUse")); + (list_empty(&dlm->pending_basts) ? "Empty" : "InUse")); /* Purge Count: xxx Refs: xxx */ out += snprintf(db->buf + out, db->len - out, diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index d8d578f45613..4d9e6b288dd8 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -304,6 +304,9 @@ static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) if (dlm->lockres_hash) dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); + if (dlm->master_hash) + dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES); + if (dlm->name) kfree(dlm->name); @@ -1534,12 +1537,27 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, for (i = 0; i < DLM_HASH_BUCKETS; i++) INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i)); + dlm->master_hash = (struct hlist_head **) + dlm_alloc_pagevec(DLM_HASH_PAGES); + if (!dlm->master_hash) { + mlog_errno(-ENOMEM); + dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); + kfree(dlm->name); + kfree(dlm); + dlm = NULL; + goto leave; + } + + for (i = 0; i < DLM_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(dlm_master_hash(dlm, i)); + strcpy(dlm->name, domain); dlm->key = key; dlm->node_num = o2nm_this_node(); ret = dlm_create_debugfs_subroot(dlm); if (ret < 0) { + dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES); dlm_free_pagevec((void 
**)dlm->lockres_hash, DLM_HASH_PAGES); kfree(dlm->name); kfree(dlm); @@ -1579,7 +1597,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, init_waitqueue_head(&dlm->reco.event); init_waitqueue_head(&dlm->ast_wq); init_waitqueue_head(&dlm->migration_wq); - INIT_LIST_HEAD(&dlm->master_list); INIT_LIST_HEAD(&dlm->mle_hb_events); dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN; @@ -1587,9 +1604,13 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, dlm->reco.new_master = O2NM_INVALID_NODE_NUM; dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; - atomic_set(&dlm->local_resources, 0); - atomic_set(&dlm->remote_resources, 0); - atomic_set(&dlm->unknown_resources, 0); + + atomic_set(&dlm->res_tot_count, 0); + atomic_set(&dlm->res_cur_count, 0); + for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) { + atomic_set(&dlm->mle_tot_count[i], 0); + atomic_set(&dlm->mle_cur_count[i], 0); + } spin_lock_init(&dlm->work_lock); INIT_LIST_HEAD(&dlm->work_list); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 0a2813947853..f8b653fcd4dd 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -73,22 +73,13 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm, const char *name, unsigned int namelen) { - struct dlm_lock_resource *res; - if (dlm != mle->dlm) return 0; - if (mle->type == DLM_MLE_BLOCK || - mle->type == DLM_MLE_MIGRATION) { - if (namelen != mle->u.name.len || - memcmp(name, mle->u.name.name, namelen)!=0) - return 0; - } else { - res = mle->u.res; - if (namelen != res->lockname.len || - memcmp(res->lockname.name, name, namelen) != 0) - return 0; - } + if (namelen != mle->mnamelen || + memcmp(name, mle->mname, namelen) != 0) + return 0; + return 1; } @@ -283,7 +274,7 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle, mle->dlm = dlm; mle->type = type; - INIT_LIST_HEAD(&mle->list); + INIT_HLIST_NODE(&mle->master_hash_node); INIT_LIST_HEAD(&mle->hb_events); memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); 
spin_lock_init(&mle->spinlock); @@ -295,19 +286,27 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle, mle->new_master = O2NM_MAX_NODES; mle->inuse = 0; + BUG_ON(mle->type != DLM_MLE_BLOCK && + mle->type != DLM_MLE_MASTER && + mle->type != DLM_MLE_MIGRATION); + if (mle->type == DLM_MLE_MASTER) { BUG_ON(!res); - mle->u.res = res; - } else if (mle->type == DLM_MLE_BLOCK) { - BUG_ON(!name); - memcpy(mle->u.name.name, name, namelen); - mle->u.name.len = namelen; - } else /* DLM_MLE_MIGRATION */ { + mle->mleres = res; + memcpy(mle->mname, res->lockname.name, res->lockname.len); + mle->mnamelen = res->lockname.len; + mle->mnamehash = res->lockname.hash; + } else { BUG_ON(!name); - memcpy(mle->u.name.name, name, namelen); - mle->u.name.len = namelen; + mle->mleres = NULL; + memcpy(mle->mname, name, namelen); + mle->mnamelen = namelen; + mle->mnamehash = dlm_lockid_hash(name, namelen); } + atomic_inc(&dlm->mle_tot_count[mle->type]); + atomic_inc(&dlm->mle_cur_count[mle->type]); + /* copy off the node_map and register hb callbacks on our copy */ memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map)); memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map)); @@ -318,6 +317,24 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle, __dlm_mle_attach_hb_events(dlm, mle); } +void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle) +{ + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + + if (!hlist_unhashed(&mle->master_hash_node)) + hlist_del_init(&mle->master_hash_node); +} + +void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle) +{ + struct hlist_head *bucket; + + assert_spin_locked(&dlm->master_lock); + + bucket = dlm_master_hash(dlm, mle->mnamehash); + hlist_add_head(&mle->master_hash_node, bucket); +} /* returns 1 if found, 0 if not */ static int dlm_find_mle(struct dlm_ctxt *dlm, @@ -325,10 +342,17 @@ static int dlm_find_mle(struct dlm_ctxt *dlm, char *name, unsigned 
int namelen) { struct dlm_master_list_entry *tmpmle; + struct hlist_head *bucket; + struct hlist_node *list; + unsigned int hash; assert_spin_locked(&dlm->master_lock); - list_for_each_entry(tmpmle, &dlm->master_list, list) { + hash = dlm_lockid_hash(name, namelen); + bucket = dlm_master_hash(dlm, hash); + hlist_for_each(list, bucket) { + tmpmle = hlist_entry(list, struct dlm_master_list_entry, + master_hash_node); if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) continue; dlm_get_mle(tmpmle); @@ -408,24 +432,20 @@ static void dlm_mle_release(struct kref *kref) mle = container_of(kref, struct dlm_master_list_entry, mle_refs); dlm = mle->dlm; - if (mle->type != DLM_MLE_MASTER) { - mlog(0, "calling mle_release for %.*s, type %d\n", - mle->u.name.len, mle->u.name.name, mle->type); - } else { - mlog(0, "calling mle_release for %.*s, type %d\n", - mle->u.res->lockname.len, - mle->u.res->lockname.name, mle->type); - } assert_spin_locked(&dlm->spinlock); assert_spin_locked(&dlm->master_lock); + mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname, + mle->type); + /* remove from list if not already */ - if (!list_empty(&mle->list)) - list_del_init(&mle->list); + __dlm_unlink_mle(dlm, mle); /* detach the mle from the domain node up/down events */ __dlm_mle_detach_hb_events(dlm, mle); + atomic_dec(&dlm->mle_cur_count[mle->type]); + /* NOTE: kfree under spinlock here. * if this is bad, we can move this to a freelist. 
*/ kmem_cache_free(dlm_mle_cache, mle); @@ -465,43 +485,6 @@ void dlm_destroy_master_caches(void) kmem_cache_destroy(dlm_lockres_cache); } -static void dlm_set_lockres_owner(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res, - u8 owner) -{ - assert_spin_locked(&res->spinlock); - - mlog_entry("%.*s, %u\n", res->lockname.len, res->lockname.name, owner); - - if (owner == dlm->node_num) - atomic_inc(&dlm->local_resources); - else if (owner == DLM_LOCK_RES_OWNER_UNKNOWN) - atomic_inc(&dlm->unknown_resources); - else - atomic_inc(&dlm->remote_resources); - - res->owner = owner; -} - -void dlm_change_lockres_owner(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res, u8 owner) -{ - assert_spin_locked(&res->spinlock); - - if (owner == res->owner) - return; - - if (res->owner == dlm->node_num) - atomic_dec(&dlm->local_resources); - else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) - atomic_dec(&dlm->unknown_resources); - else - atomic_dec(&dlm->remote_resources); - - dlm_set_lockres_owner(dlm, res, owner); -} - - static void dlm_lockres_release(struct kref *kref) { struct dlm_lock_resource *res; @@ -527,6 +510,8 @@ static void dlm_lockres_release(struct kref *kref) } spin_unlock(&dlm->track_lock); + atomic_dec(&dlm->res_cur_count); + dlm_put(dlm); if (!hlist_unhashed(&res->hash_node) || @@ -607,6 +592,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, kref_init(&res->refs); + atomic_inc(&dlm->res_tot_count); + atomic_inc(&dlm->res_cur_count); + /* just for consistency */ spin_lock(&res->spinlock); dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); @@ -843,7 +831,7 @@ lookup: alloc_mle = NULL; dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0); set_bit(dlm->node_num, mle->maybe_map); - list_add(&mle->list, &dlm->master_list); + __dlm_insert_mle(dlm, mle); /* still holding the dlm spinlock, check the recovery map * to see if there are any nodes that still need to be @@ -1270,7 +1258,7 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, 
res->lockname.len, res->lockname.name); mle->type = DLM_MLE_MASTER; - mle->u.res = res; + mle->mleres = res; } } } @@ -1315,14 +1303,8 @@ static int dlm_do_master_request(struct dlm_lock_resource *res, BUG_ON(mle->type == DLM_MLE_MIGRATION); - if (mle->type != DLM_MLE_MASTER) { - request.namelen = mle->u.name.len; - memcpy(request.name, mle->u.name.name, request.namelen); - } else { - request.namelen = mle->u.res->lockname.len; - memcpy(request.name, mle->u.res->lockname.name, - request.namelen); - } + request.namelen = (u8)mle->mnamelen; + memcpy(request.name, mle->mname, request.namelen); again: ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request, @@ -1575,7 +1557,7 @@ way_up_top: // "add the block.\n"); dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen); set_bit(request->node_idx, mle->maybe_map); - list_add(&mle->list, &dlm->master_list); + __dlm_insert_mle(dlm, mle); response = DLM_MASTER_RESP_NO; } else { // mlog(0, "mle was found\n"); @@ -1967,7 +1949,7 @@ ok: assert->node_idx, rr, extra_ref, mle->inuse); dlm_print_one_mle(mle); } - list_del_init(&mle->list); + __dlm_unlink_mle(dlm, mle); __dlm_mle_detach_hb_events(dlm, mle); __dlm_put_mle(mle); if (extra_ref) { @@ -3159,10 +3141,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, tmp->master = master; atomic_set(&tmp->woken, 1); wake_up(&tmp->wq); - /* remove it from the list so that only one - * mle will be found */ - list_del_init(&tmp->list); - /* this was obviously WRONG. mle is uninited here. should be tmp. 
*/ + /* remove it so that only one mle will be found */ + __dlm_unlink_mle(dlm, tmp); __dlm_mle_detach_hb_events(dlm, tmp); ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; mlog(0, "%s:%.*s: master=%u, newmaster=%u, " @@ -3181,137 +3161,164 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, mle->master = master; /* do this for consistency with other mle types */ set_bit(new_master, mle->maybe_map); - list_add(&mle->list, &dlm->master_list); + __dlm_insert_mle(dlm, mle); return ret; } - -void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) +/* + * Sets the owner of the lockres, associated to the mle, to UNKNOWN + */ +static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle) { - struct dlm_master_list_entry *mle, *next; struct dlm_lock_resource *res; - unsigned int hash; - mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node); -top: - assert_spin_locked(&dlm->spinlock); + /* Find the lockres associated to the mle and set its owner to UNK */ + res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen, + mle->mnamehash); + if (res) { + spin_unlock(&dlm->master_lock); - /* clean the master list */ - spin_lock(&dlm->master_lock); - list_for_each_entry_safe(mle, next, &dlm->master_list, list) { - BUG_ON(mle->type != DLM_MLE_BLOCK && - mle->type != DLM_MLE_MASTER && - mle->type != DLM_MLE_MIGRATION); - - /* MASTER mles are initiated locally. the waiting - * process will notice the node map change - * shortly. let that happen as normal. */ - if (mle->type == DLM_MLE_MASTER) - continue; + /* move lockres onto recovery list */ + spin_lock(&res->spinlock); + dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); + dlm_move_lockres_to_recovery_list(dlm, res); + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + /* about to get rid of mle, detach from heartbeat */ + __dlm_mle_detach_hb_events(dlm, mle); - /* BLOCK mles are initiated by other nodes. 
- * need to clean up if the dead node would have - * been the master. */ - if (mle->type == DLM_MLE_BLOCK) { - int bit; + /* dump the mle */ + spin_lock(&dlm->master_lock); + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + } - spin_lock(&mle->spinlock); - bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); - if (bit != dead_node) { - mlog(0, "mle found, but dead node %u would " - "not have been master\n", dead_node); - spin_unlock(&mle->spinlock); - } else { - /* must drop the refcount by one since the - * assert_master will never arrive. this - * may result in the mle being unlinked and - * freed, but there may still be a process - * waiting in the dlmlock path which is fine. */ - mlog(0, "node %u was expected master\n", - dead_node); - atomic_set(&mle->woken, 1); - spin_unlock(&mle->spinlock); - wake_up(&mle->wq); - /* do not need events any longer, so detach - * from heartbeat */ - __dlm_mle_detach_hb_events(dlm, mle); - __dlm_put_mle(mle); - } - continue; - } + return res; +} - /* everything else is a MIGRATION mle */ - - /* the rule for MIGRATION mles is that the master - * becomes UNKNOWN if *either* the original or - * the new master dies. all UNKNOWN lockreses - * are sent to whichever node becomes the recovery - * master. the new master is responsible for - * determining if there is still a master for - * this lockres, or if he needs to take over - * mastery. either way, this node should expect - * another message to resolve this. */ - if (mle->master != dead_node && - mle->new_master != dead_node) - continue; +static void dlm_clean_migration_mle(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle) +{ + __dlm_mle_detach_hb_events(dlm, mle); - /* if we have reached this point, this mle needs to - * be removed from the list and freed. */ + spin_lock(&mle->spinlock); + __dlm_unlink_mle(dlm, mle); + atomic_set(&mle->woken, 1); + spin_unlock(&mle->spinlock); - /* remove from the list early. 
NOTE: unlinking - * list_head while in list_for_each_safe */ - __dlm_mle_detach_hb_events(dlm, mle); - spin_lock(&mle->spinlock); - list_del_init(&mle->list); + wake_up(&mle->wq); +} + +static void dlm_clean_block_mle(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, u8 dead_node) +{ + int bit; + + BUG_ON(mle->type != DLM_MLE_BLOCK); + + spin_lock(&mle->spinlock); + bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); + if (bit != dead_node) { + mlog(0, "mle found, but dead node %u would not have been " + "master\n", dead_node); + spin_unlock(&mle->spinlock); + } else { + /* Must drop the refcount by one since the assert_master will + * never arrive. This may result in the mle being unlinked and + * freed, but there may still be a process waiting in the + * dlmlock path which is fine. */ + mlog(0, "node %u was expected master\n", dead_node); atomic_set(&mle->woken, 1); spin_unlock(&mle->spinlock); wake_up(&mle->wq); - mlog(0, "%s: node %u died during migration from " - "%u to %u!\n", dlm->name, dead_node, - mle->master, mle->new_master); - /* if there is a lockres associated with this - * mle, find it and set its owner to UNKNOWN */ - hash = dlm_lockid_hash(mle->u.name.name, mle->u.name.len); - res = __dlm_lookup_lockres(dlm, mle->u.name.name, - mle->u.name.len, hash); - if (res) { - /* unfortunately if we hit this rare case, our - * lock ordering is messed. we need to drop - * the master lock so that we can take the - * lockres lock, meaning that we will have to - * restart from the head of list. 
*/ - spin_unlock(&dlm->master_lock); + /* Do not need events any longer, so detach from heartbeat */ + __dlm_mle_detach_hb_events(dlm, mle); + __dlm_put_mle(mle); + } +} - /* move lockres onto recovery list */ - spin_lock(&res->spinlock); - dlm_set_lockres_owner(dlm, res, - DLM_LOCK_RES_OWNER_UNKNOWN); - dlm_move_lockres_to_recovery_list(dlm, res); - spin_unlock(&res->spinlock); - dlm_lockres_put(res); +void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) +{ + struct dlm_master_list_entry *mle; + struct dlm_lock_resource *res; + struct hlist_head *bucket; + struct hlist_node *list; + unsigned int i; - /* about to get rid of mle, detach from heartbeat */ - __dlm_mle_detach_hb_events(dlm, mle); + mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node); +top: + assert_spin_locked(&dlm->spinlock); - /* dump the mle */ - spin_lock(&dlm->master_lock); - __dlm_put_mle(mle); - spin_unlock(&dlm->master_lock); + /* clean the master list */ + spin_lock(&dlm->master_lock); + for (i = 0; i < DLM_HASH_BUCKETS; i++) { + bucket = dlm_master_hash(dlm, i); + hlist_for_each(list, bucket) { + mle = hlist_entry(list, struct dlm_master_list_entry, + master_hash_node); + + BUG_ON(mle->type != DLM_MLE_BLOCK && + mle->type != DLM_MLE_MASTER && + mle->type != DLM_MLE_MIGRATION); + + /* MASTER mles are initiated locally. The waiting + * process will notice the node map change shortly. + * Let that happen as normal. */ + if (mle->type == DLM_MLE_MASTER) + continue; + + /* BLOCK mles are initiated by other nodes. Need to + * clean up if the dead node would have been the + * master. */ + if (mle->type == DLM_MLE_BLOCK) { + dlm_clean_block_mle(dlm, mle, dead_node); + continue; + } - /* restart */ - goto top; - } + /* Everything else is a MIGRATION mle */ + + /* The rule for MIGRATION mles is that the master + * becomes UNKNOWN if *either* the original or the new + * master dies. All UNKNOWN lockres' are sent to + * whichever node becomes the recovery master. 
The new + * master is responsible for determining if there is + * still a master for this lockres, or if he needs to + * take over mastery. Either way, this node should + * expect another message to resolve this. */ + + if (mle->master != dead_node && + mle->new_master != dead_node) + continue; + + /* If we have reached this point, this mle needs to be + * removed from the list and freed. */ + dlm_clean_migration_mle(dlm, mle); + + mlog(0, "%s: node %u died during migration from " + "%u to %u!\n", dlm->name, dead_node, mle->master, + mle->new_master); + + /* If we find a lockres associated with the mle, we've + * hit this rare case that messes up our lock ordering. + * If so, we need to drop the master lock so that we can + * take the lockres lock, meaning that we will have to + * restart from the head of list. */ + res = dlm_reset_mleres_owner(dlm, mle); + if (res) + /* restart */ + goto top; - /* this may be the last reference */ - __dlm_put_mle(mle); + /* This may be the last reference */ + __dlm_put_mle(mle); + } } spin_unlock(&dlm->master_lock); } - int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, u8 old_master) { diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 4060bb328bc8..d490b66ad9d7 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -162,12 +162,28 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); if (!__dlm_lockres_unused(res)) { - spin_unlock(&res->spinlock); mlog(0, "%s:%.*s: tried to purge but not unused\n", dlm->name, res->lockname.len, res->lockname.name); - return -ENOTEMPTY; + __dlm_print_one_lock_resource(res); + spin_unlock(&res->spinlock); + BUG(); } + + if (res->state & DLM_LOCK_RES_MIGRATING) { + mlog(0, "%s:%.*s: Delay dropref as this lockres is " + "being remastered\n", dlm->name, res->lockname.len, + res->lockname.name); + /* Re-add the lockres to the end of the purge list */ + if (!list_empty(&res->purge)) { + 
list_del_init(&res->purge); + list_add_tail(&res->purge, &dlm->purge_list); + } + spin_unlock(&res->spinlock); + return 0; + } + master = (res->owner == dlm->node_num); + if (!master) res->state |= DLM_LOCK_RES_DROPPING_REF; spin_unlock(&res->spinlock); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 7219a86d34cc..e15fc7d50827 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -244,6 +244,10 @@ static struct ocfs2_lock_res_ops ocfs2_rename_lops = { .flags = 0, }; +static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = { + .flags = 0, +}; + static struct ocfs2_lock_res_ops ocfs2_dentry_lops = { .get_osb = ocfs2_get_dentry_osb, .post_unlock = ocfs2_dentry_post_unlock, @@ -622,6 +626,17 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res, &ocfs2_rename_lops, osb); } +static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res, + struct ocfs2_super *osb) +{ + /* nfs_sync lockres doesn't come from a slab so we call init + * once on it manually. */ + ocfs2_lock_res_init_once(res); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_NFS_SYNC, 0, 0, res->l_name); + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_NFS_SYNC, + &ocfs2_nfs_sync_lops, osb); +} + void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, struct ocfs2_file_private *fp) { @@ -2417,6 +2432,34 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb) ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX); } +int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex) +{ + int status; + struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres; + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (ocfs2_mount_local(osb)) + return 0; + + status = ocfs2_cluster_lock(osb, lockres, ex ? 
LKM_EXMODE : LKM_PRMODE, + 0, 0); + if (status < 0) + mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status); + + return status; +} + +void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex) +{ + struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres; + + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, lockres, + ex ? LKM_EXMODE : LKM_PRMODE); +} + int ocfs2_dentry_lock(struct dentry *dentry, int ex) { int ret; @@ -2798,6 +2841,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) local: ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); + ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb); osb->cconn = conn; @@ -2833,6 +2877,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb, ocfs2_lock_res_free(&osb->osb_super_lockres); ocfs2_lock_res_free(&osb->osb_rename_lockres); + ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres); ocfs2_cluster_disconnect(osb->cconn, hangup_pending); osb->cconn = NULL; @@ -3015,6 +3060,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb) { ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres); ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres); + ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres); } int ocfs2_drop_inode_locks(struct inode *inode) diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index 3f8d9986b8e0..e1fd5721cd7f 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -115,6 +115,8 @@ void ocfs2_super_unlock(struct ocfs2_super *osb, int ex); int ocfs2_rename_lock(struct ocfs2_super *osb); void ocfs2_rename_unlock(struct ocfs2_super *osb); +int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex); +void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex); int ocfs2_dentry_lock(struct dentry *dentry, int ex); void ocfs2_dentry_unlock(struct dentry *dentry, int ex); int ocfs2_file_lock(struct file *file, int ex, int trylock); diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 
2f27b332d8b3..de3da8eb558c 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -31,6 +31,7 @@ #include "ocfs2.h" +#include "alloc.h" #include "dir.h" #include "dlmglue.h" #include "dcache.h" @@ -38,6 +39,7 @@ #include "inode.h" #include "buffer_head_io.h" +#include "suballoc.h" struct ocfs2_inode_handle { @@ -49,29 +51,97 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb, struct ocfs2_inode_handle *handle) { struct inode *inode; + struct ocfs2_super *osb = OCFS2_SB(sb); + u64 blkno = handle->ih_blkno; + int status, set; struct dentry *result; mlog_entry("(0x%p, 0x%p)\n", sb, handle); - if (handle->ih_blkno == 0) { - mlog_errno(-ESTALE); - return ERR_PTR(-ESTALE); + if (blkno == 0) { + mlog(0, "nfs wants inode with blkno: 0\n"); + result = ERR_PTR(-ESTALE); + goto bail; + } + + inode = ocfs2_ilookup(sb, blkno); + /* + * If the inode exists in memory, we only need to check it's + * generation number + */ + if (inode) + goto check_gen; + + /* + * This will synchronize us against ocfs2_delete_inode() on + * all nodes + */ + status = ocfs2_nfs_sync_lock(osb, 1); + if (status < 0) { + mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status); + goto check_err; + } + + status = ocfs2_test_inode_bit(osb, blkno, &set); + if (status < 0) { + if (status == -EINVAL) { + /* + * The blkno NFS gave us doesn't even show up + * as an inode, we return -ESTALE to be + * nice + */ + mlog(0, "test inode bit failed %d\n", status); + status = -ESTALE; + } else { + mlog(ML_ERROR, "test inode bit failed %d\n", status); + } + goto unlock_nfs_sync; + } + + /* If the inode allocator bit is clear, this inode must be stale */ + if (!set) { + mlog(0, "inode %llu suballoc bit is clear\n", blkno); + status = -ESTALE; + goto unlock_nfs_sync; } - inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0); + inode = ocfs2_iget(osb, blkno, 0, 0); - if (IS_ERR(inode)) - return (void *)inode; +unlock_nfs_sync: + ocfs2_nfs_sync_unlock(osb, 1); +check_err: + if (status < 0) { + if 
(status == -ESTALE) { + mlog(0, "stale inode ino: %llu generation: %u\n", + blkno, handle->ih_generation); + } + result = ERR_PTR(status); + goto bail; + } + + if (IS_ERR(inode)) { + mlog_errno(PTR_ERR(inode)); + result = (void *)inode; + goto bail; + } + +check_gen: if (handle->ih_generation != inode->i_generation) { iput(inode); - return ERR_PTR(-ESTALE); + mlog(0, "stale inode ino: %llu generation: %u\n", blkno, + handle->ih_generation); + result = ERR_PTR(-ESTALE); + goto bail; } result = d_obtain_alias(inode); if (!IS_ERR(result)) result->d_op = &ocfs2_dentry_ops; + else + mlog_errno(PTR_ERR(result)); +bail: mlog_exit_ptr(result); return result; } diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 229e707bc050..10e1fa87396a 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -38,6 +38,7 @@ #include "ocfs2.h" #include "alloc.h" +#include "dir.h" #include "blockcheck.h" #include "dlmglue.h" #include "extent_map.h" @@ -112,6 +113,17 @@ void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi) oi->ip_attr |= OCFS2_DIRSYNC_FL; } +struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno) +{ + struct ocfs2_find_inode_args args; + + args.fi_blkno = blkno; + args.fi_flags = 0; + args.fi_ino = ino_from_blkno(sb, blkno); + args.fi_sysfile_type = 0; + + return ilookup5(sb, blkno, ocfs2_find_actor, &args); +} struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, int sysfile_type) { @@ -275,7 +287,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)le64_to_cpu(fe->i_blkno)); - inode->i_nlink = le16_to_cpu(fe->i_links_count); + inode->i_nlink = ocfs2_read_links_count(fe); if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) { OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; @@ -351,6 +363,8 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, ocfs2_set_inode_flags(inode); + OCFS2_I(inode)->ip_last_used_slot = 0; + 
OCFS2_I(inode)->ip_last_used_group = 0; mlog_exit_void(); } @@ -606,7 +620,7 @@ static int ocfs2_remove_inode(struct inode *inode, } handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS + - ocfs2_quota_trans_credits(inode->i_sb)); + ocfs2_quota_trans_credits(inode->i_sb)); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); @@ -740,6 +754,15 @@ static int ocfs2_wipe_inode(struct inode *inode, goto bail_unlock_dir; } + /* Remove any dir index tree */ + if (S_ISDIR(inode->i_mode)) { + status = ocfs2_dx_dir_truncate(inode, di_bh); + if (status) { + mlog_errno(status); + goto bail_unlock_dir; + } + } + /*Free extended attribute resources associated with this inode.*/ status = ocfs2_xattr_remove(inode, di_bh); if (status < 0) { @@ -949,6 +972,17 @@ void ocfs2_delete_inode(struct inode *inode) goto bail; } + /* + * Synchronize us against ocfs2_get_dentry. We take this in + * shared mode so that all nodes can still concurrently + * process deletes. + */ + status = ocfs2_nfs_sync_lock(OCFS2_SB(inode->i_sb), 0); + if (status < 0) { + mlog(ML_ERROR, "getting nfs sync lock(PR) failed %d\n", status); + ocfs2_cleanup_delete_inode(inode, 0); + goto bail_unblock; + } /* Lock down the inode. This gives us an up to date view of * it's metadata (for verification), and allows us to * serialize delete_inode on multiple nodes. @@ -962,7 +996,7 @@ void ocfs2_delete_inode(struct inode *inode) if (status != -ENOENT) mlog_errno(status); ocfs2_cleanup_delete_inode(inode, 0); - goto bail_unblock; + goto bail_unlock_nfs_sync; } /* Query the cluster. 
This will be the final decision made @@ -1005,6 +1039,10 @@ void ocfs2_delete_inode(struct inode *inode) bail_unlock_inode: ocfs2_inode_unlock(inode, 1); brelse(di_bh); + +bail_unlock_nfs_sync: + ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0); + bail_unblock: status = sigprocmask(SIG_SETMASK, &oldset, NULL); if (status < 0) @@ -1205,7 +1243,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle, spin_unlock(&OCFS2_I(inode)->ip_lock); fe->i_size = cpu_to_le64(i_size_read(inode)); - fe->i_links_count = cpu_to_le16(inode->i_nlink); + ocfs2_set_links_count(fe, inode->i_nlink); fe->i_uid = cpu_to_le32(inode->i_uid); fe->i_gid = cpu_to_le32(inode->i_gid); fe->i_mode = cpu_to_le16(inode->i_mode); @@ -1242,7 +1280,7 @@ void ocfs2_refresh_inode(struct inode *inode, OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features); ocfs2_set_inode_flags(inode); i_size_write(inode, le64_to_cpu(fe->i_size)); - inode->i_nlink = le16_to_cpu(fe->i_links_count); + inode->i_nlink = ocfs2_read_links_count(fe); inode->i_uid = le32_to_cpu(fe->i_uid); inode->i_gid = le32_to_cpu(fe->i_gid); inode->i_mode = le16_to_cpu(fe->i_mode); diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index eb3c302b38d3..ea71525aad41 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -72,6 +72,10 @@ struct ocfs2_inode_info struct inode vfs_inode; struct jbd2_inode ip_jinode; + + /* Only valid if the inode is the dir. 
*/ + u32 ip_last_used_slot; + u64 ip_last_used_group; }; /* @@ -124,6 +128,7 @@ void ocfs2_drop_inode(struct inode *inode); /* Flags for ocfs2_iget() */ #define OCFS2_FI_FLAG_SYSFILE 0x1 #define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2 +struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff); struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags, int sysfile_type); int ocfs2_inode_init_private(struct inode *inode); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 57d7d25a2b9a..a20a0f1e37fd 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -65,6 +65,11 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb, static int ocfs2_recover_orphans(struct ocfs2_super *osb, int slot); static int ocfs2_commit_thread(void *arg); +static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, + int slot_num, + struct ocfs2_dinode *la_dinode, + struct ocfs2_dinode *tl_dinode, + struct ocfs2_quota_recovery *qrec); static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb) { @@ -76,18 +81,97 @@ static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb) return __ocfs2_wait_on_mount(osb, 1); } - - /* - * The recovery_list is a simple linked list of node numbers to recover. - * It is protected by the recovery_lock. 
+ * This replay_map is to track online/offline slots, so we could recover + * offline slots during recovery and mount */ -struct ocfs2_recovery_map { - unsigned int rm_used; - unsigned int *rm_entries; +enum ocfs2_replay_state { + REPLAY_UNNEEDED = 0, /* Replay is not needed, so ignore this map */ + REPLAY_NEEDED, /* Replay slots marked in rm_replay_slots */ + REPLAY_DONE /* Replay was already queued */ }; +struct ocfs2_replay_map { + unsigned int rm_slots; + enum ocfs2_replay_state rm_state; + unsigned char rm_replay_slots[0]; +}; + +void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state) +{ + if (!osb->replay_map) + return; + + /* If we've already queued the replay, we don't have any more to do */ + if (osb->replay_map->rm_state == REPLAY_DONE) + return; + + osb->replay_map->rm_state = state; +} + +int ocfs2_compute_replay_slots(struct ocfs2_super *osb) +{ + struct ocfs2_replay_map *replay_map; + int i, node_num; + + /* If replay map is already set, we don't do it again */ + if (osb->replay_map) + return 0; + + replay_map = kzalloc(sizeof(struct ocfs2_replay_map) + + (osb->max_slots * sizeof(char)), GFP_KERNEL); + + if (!replay_map) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + spin_lock(&osb->osb_lock); + + replay_map->rm_slots = osb->max_slots; + replay_map->rm_state = REPLAY_UNNEEDED; + + /* set rm_replay_slots for offline slot(s) */ + for (i = 0; i < replay_map->rm_slots; i++) { + if (ocfs2_slot_to_node_num_locked(osb, i, &node_num) == -ENOENT) + replay_map->rm_replay_slots[i] = 1; + } + + osb->replay_map = replay_map; + spin_unlock(&osb->osb_lock); + return 0; +} + +void ocfs2_queue_replay_slots(struct ocfs2_super *osb) +{ + struct ocfs2_replay_map *replay_map = osb->replay_map; + int i; + + if (!replay_map) + return; + + if (replay_map->rm_state != REPLAY_NEEDED) + return; + + for (i = 0; i < replay_map->rm_slots; i++) + if (replay_map->rm_replay_slots[i]) + ocfs2_queue_recovery_completion(osb->journal, i, NULL, + NULL, NULL); + 
replay_map->rm_state = REPLAY_DONE; +} + +void ocfs2_free_replay_slots(struct ocfs2_super *osb) +{ + struct ocfs2_replay_map *replay_map = osb->replay_map; + + if (!osb->replay_map) + return; + + kfree(replay_map); + osb->replay_map = NULL; +} + int ocfs2_recovery_init(struct ocfs2_super *osb) { struct ocfs2_recovery_map *rm; @@ -496,6 +580,22 @@ static struct ocfs2_triggers dq_triggers = { }, }; +static struct ocfs2_triggers dr_triggers = { + .ot_triggers = { + .t_commit = ocfs2_commit_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check), +}; + +static struct ocfs2_triggers dl_triggers = { + .ot_triggers = { + .t_commit = ocfs2_commit_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check), +}; + static int __ocfs2_journal_access(handle_t *handle, struct inode *inode, struct buffer_head *bh, @@ -600,6 +700,20 @@ int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode, type); } +int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, inode, bh, &dr_triggers, + type); +} + +int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, inode, bh, &dl_triggers, + type); +} + int ocfs2_journal_access(handle_t *handle, struct inode *inode, struct buffer_head *bh, int type) { @@ -1176,24 +1290,24 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, } /* Called by the mount code to queue recovery the last part of - * recovery for it's own slot. */ + * recovery for it's own and offline slot(s). */ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) { struct ocfs2_journal *journal = osb->journal; - if (osb->dirty) { - /* No need to queue up our truncate_log as regular - * cleanup will catch that. 
*/ - ocfs2_queue_recovery_completion(journal, - osb->slot_num, - osb->local_alloc_copy, - NULL, - NULL); - ocfs2_schedule_truncate_log_flush(osb, 0); + /* No need to queue up our truncate_log as regular cleanup will catch + * that */ + ocfs2_queue_recovery_completion(journal, osb->slot_num, + osb->local_alloc_copy, NULL, NULL); + ocfs2_schedule_truncate_log_flush(osb, 0); - osb->local_alloc_copy = NULL; - osb->dirty = 0; - } + osb->local_alloc_copy = NULL; + osb->dirty = 0; + + /* queue to recover orphan slots for all offline slots */ + ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); + ocfs2_queue_replay_slots(osb); + ocfs2_free_replay_slots(osb); } void ocfs2_complete_quota_recovery(struct ocfs2_super *osb) @@ -1236,6 +1350,14 @@ restart: goto bail; } + status = ocfs2_compute_replay_slots(osb); + if (status < 0) + mlog_errno(status); + + /* queue recovery for our own slot */ + ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, + NULL, NULL); + spin_lock(&osb->osb_lock); while (rm->rm_used) { /* It's always safe to remove entry zero, as we won't @@ -1301,11 +1423,8 @@ skip_recovery: ocfs2_super_unlock(osb, 1); - /* We always run recovery on our own orphan dir - the dead - * node(s) may have disallowd a previos inode delete. Re-processing - * is therefore required. 
*/ - ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, - NULL, NULL); + /* queue recovery for offline slots */ + ocfs2_queue_replay_slots(osb); bail: mutex_lock(&osb->recovery_lock); @@ -1314,6 +1433,7 @@ bail: goto restart; } + ocfs2_free_replay_slots(osb); osb->recovery_thread_task = NULL; mb(); /* sync with ocfs2_recovery_thread_running */ wake_up(&osb->recovery_event); @@ -1465,6 +1585,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, goto done; } + /* we need to run complete recovery for offline orphan slots */ + ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); + mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 172850a9a12a..619dd7f6c053 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -38,6 +38,17 @@ enum ocfs2_journal_state { struct ocfs2_super; struct ocfs2_dinode; +/* + * The recovery_list is a simple linked list of node numbers to recover. + * It is protected by the recovery_lock. + */ + +struct ocfs2_recovery_map { + unsigned int rm_used; + unsigned int *rm_entries; +}; + + struct ocfs2_journal { enum ocfs2_journal_state j_state; /* Journals current state */ @@ -139,6 +150,7 @@ void ocfs2_wait_for_recovery(struct ocfs2_super *osb); int ocfs2_recovery_init(struct ocfs2_super *osb); void ocfs2_recovery_exit(struct ocfs2_super *osb); +int ocfs2_compute_replay_slots(struct ocfs2_super *osb); /* * Journal Control: * Initialize, Load, Shutdown, Wipe a journal. 
@@ -266,6 +278,12 @@ int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode, /* dirblock */ int ocfs2_journal_access_db(handle_t *handle, struct inode *inode, struct buffer_head *bh, int type); +/* ocfs2_dx_root_block */ +int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type); +/* ocfs2_dx_leaf */ +int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type); /* Anything that has no ecc */ int ocfs2_journal_access(handle_t *handle, struct inode *inode, struct buffer_head *bh, int type); @@ -368,14 +386,29 @@ static inline int ocfs2_remove_extent_credits(struct super_block *sb) } /* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + - * bitmap block for the new bit) */ -#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) + * bitmap block for the new bit) dx_root update for free list */ +#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2 + 1) + +static inline int ocfs2_add_dir_index_credits(struct super_block *sb) +{ + /* 1 block for index, 2 allocs (data, metadata), 1 clusters + * worth of blocks for initial extent. 
*/ + return 1 + 2 * OCFS2_SUBALLOC_ALLOC + + ocfs2_clusters_to_blocks(sb, 1); +} -/* parent fe, parent block, new file entry, inode alloc fe, inode alloc - * group descriptor + mkdir/symlink blocks + quota update */ -static inline int ocfs2_mknod_credits(struct super_block *sb) +/* parent fe, parent block, new file entry, index leaf, inode alloc fe, inode + * alloc group descriptor + mkdir/symlink blocks + dir blocks + xattr + * blocks + quota update */ +static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir, + int xattr_credits) { - return 3 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS + + int dir_credits = OCFS2_DIR_LINK_ADDITIONAL_CREDITS; + + if (is_dir) + dir_credits += ocfs2_add_dir_index_credits(sb); + + return 4 + OCFS2_SUBALLOC_ALLOC + dir_credits + xattr_credits + ocfs2_quota_trans_credits(sb); } @@ -388,31 +421,31 @@ static inline int ocfs2_mknod_credits(struct super_block *sb) #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) /* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota - * update on dir */ + * update on dir + index leaf + dx root update for free list */ static inline int ocfs2_link_credits(struct super_block *sb) { - return 2*OCFS2_INODE_UPDATE_CREDITS + 1 + + return 2*OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_quota_trans_credits(sb); } /* inode + dir inode (if we unlink a dir), + dir entry block + orphan - * dir inode link */ + * dir inode link + dir inode index leaf + dir index root */ static inline int ocfs2_unlink_credits(struct super_block *sb) { /* The quota update from ocfs2_link_credits is unused here... 
*/ - return 2 * OCFS2_INODE_UPDATE_CREDITS + 1 + ocfs2_link_credits(sb); + return 2 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_link_credits(sb); } /* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry + - * inode alloc group descriptor */ -#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1) + * inode alloc group descriptor + orphan dir index leaf */ +#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3) /* dinode update, old dir dinode update, new dir dinode update, old * dir dir entry, new dir dir entry, dir entry update for renaming - * directory + target unlink */ + * directory + target unlink + 3 x dir index leaves */ static inline int ocfs2_rename_credits(struct super_block *sb) { - return 3 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_unlink_credits(sb); + return 3 * OCFS2_INODE_UPDATE_CREDITS + 6 + ocfs2_unlink_credits(sb); } /* global bitmap dinode, group desc., relinked group, @@ -422,6 +455,20 @@ static inline int ocfs2_rename_credits(struct super_block *sb) + OCFS2_INODE_UPDATE_CREDITS \ + OCFS2_XATTR_BLOCK_UPDATE_CREDITS) +/* inode update, removal of dx root block from allocator */ +#define OCFS2_DX_ROOT_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \ + OCFS2_SUBALLOC_FREE) + +static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb) +{ + int credits = 1 + OCFS2_SUBALLOC_ALLOC; + + credits += ocfs2_clusters_to_blocks(sb, 1); + credits += ocfs2_quota_trans_credits(sb); + + return credits; +} + /* * Please note that the caller must make sure that root_el is the root * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise @@ -457,7 +504,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb, static inline int ocfs2_calc_symlink_credits(struct super_block *sb) { - int blocks = ocfs2_mknod_credits(sb); + int blocks = ocfs2_mknod_credits(sb, 0, 0); /* links can be longer than one block so we may update many * within our single allocated extent. 
*/ diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index ec70cdbe77fc..bac7e6abaf47 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -28,7 +28,6 @@ #include <linux/slab.h> #include <linux/highmem.h> #include <linux/bitops.h> -#include <linux/debugfs.h> #define MLOG_MASK_PREFIX ML_DISK_ALLOC #include <cluster/masklog.h> @@ -75,84 +74,6 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, struct inode *local_alloc_inode); -#ifdef CONFIG_OCFS2_FS_STATS - -static int ocfs2_la_debug_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - -#define LA_DEBUG_BUF_SZ PAGE_CACHE_SIZE -#define LA_DEBUG_VER 1 -static ssize_t ocfs2_la_debug_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) -{ - static DEFINE_MUTEX(la_debug_mutex); - struct ocfs2_super *osb = file->private_data; - int written, ret; - char *buf = osb->local_alloc_debug_buf; - - mutex_lock(&la_debug_mutex); - memset(buf, 0, LA_DEBUG_BUF_SZ); - - written = snprintf(buf, LA_DEBUG_BUF_SZ, - "0x%x\t0x%llx\t%u\t%u\t0x%x\n", - LA_DEBUG_VER, - (unsigned long long)osb->la_last_gd, - osb->local_alloc_default_bits, - osb->local_alloc_bits, osb->local_alloc_state); - - ret = simple_read_from_buffer(userbuf, count, ppos, buf, written); - - mutex_unlock(&la_debug_mutex); - return ret; -} - -static const struct file_operations ocfs2_la_debug_fops = { - .open = ocfs2_la_debug_open, - .read = ocfs2_la_debug_read, -}; - -static void ocfs2_init_la_debug(struct ocfs2_super *osb) -{ - osb->local_alloc_debug_buf = kmalloc(LA_DEBUG_BUF_SZ, GFP_NOFS); - if (!osb->local_alloc_debug_buf) - return; - - osb->local_alloc_debug = debugfs_create_file("local_alloc_stats", - S_IFREG|S_IRUSR, - osb->osb_debug_root, - osb, - &ocfs2_la_debug_fops); - if (!osb->local_alloc_debug) { - kfree(osb->local_alloc_debug_buf); - osb->local_alloc_debug_buf = NULL; - } -} - 
-static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb) -{ - if (osb->local_alloc_debug) - debugfs_remove(osb->local_alloc_debug); - - if (osb->local_alloc_debug_buf) - kfree(osb->local_alloc_debug_buf); - - osb->local_alloc_debug_buf = NULL; - osb->local_alloc_debug = NULL; -} -#else /* CONFIG_OCFS2_FS_STATS */ -static void ocfs2_init_la_debug(struct ocfs2_super *osb) -{ - return; -} -static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb) -{ - return; -} -#endif - static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb) { return (osb->local_alloc_state == OCFS2_LA_THROTTLED || @@ -226,8 +147,6 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) mlog_entry_void(); - ocfs2_init_la_debug(osb); - if (osb->local_alloc_bits == 0) goto bail; @@ -299,9 +218,6 @@ bail: if (inode) iput(inode); - if (status < 0) - ocfs2_shutdown_la_debug(osb); - mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits); mlog_exit(status); @@ -331,8 +247,6 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) cancel_delayed_work(&osb->la_enable_wq); flush_workqueue(ocfs2_wq); - ocfs2_shutdown_la_debug(osb); - if (osb->local_alloc_state == OCFS2_LA_UNUSED) goto out; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 4b11762f249e..2220f93f668b 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -80,14 +80,14 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, struct inode **ret_orphan_dir, struct inode *inode, char *name, - struct buffer_head **de_bh); + struct ocfs2_dir_lookup_result *lookup); static int ocfs2_orphan_add(struct ocfs2_super *osb, handle_t *handle, struct inode *inode, struct ocfs2_dinode *fe, char *name, - struct buffer_head *de_bh, + struct ocfs2_dir_lookup_result *lookup, struct inode *orphan_dir_inode); static int ocfs2_create_symlink_data(struct ocfs2_super *osb, @@ -228,17 +228,18 @@ static int ocfs2_mknod(struct inode *dir, struct ocfs2_super *osb; struct ocfs2_dinode *dirfe; struct buffer_head *new_fe_bh = NULL; - 
struct buffer_head *de_bh = NULL; struct inode *inode = NULL; struct ocfs2_alloc_context *inode_ac = NULL; struct ocfs2_alloc_context *data_ac = NULL; - struct ocfs2_alloc_context *xattr_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; int want_clusters = 0; + int want_meta = 0; int xattr_credits = 0; struct ocfs2_security_xattr_info si = { .enable = 1, }; int did_quota_inode = 0; + struct ocfs2_dir_lookup_result lookup = { NULL, }; mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, (unsigned long)dev, dentry->d_name.len, @@ -254,13 +255,13 @@ static int ocfs2_mknod(struct inode *dir, return status; } - if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) { + if (S_ISDIR(mode) && (dir->i_nlink >= ocfs2_link_max(osb))) { status = -EMLINK; goto leave; } dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; - if (!dirfe->i_links_count) { + if (!ocfs2_read_links_count(dirfe)) { /* can't make a file in a deleted directory. */ status = -ENOENT; goto leave; @@ -274,7 +275,7 @@ static int ocfs2_mknod(struct inode *dir, /* get a spot inside the dir. */ status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, dentry->d_name.name, - dentry->d_name.len, &de_bh); + dentry->d_name.len, &lookup); if (status < 0) { mlog_errno(status); goto leave; @@ -308,17 +309,29 @@ static int ocfs2_mknod(struct inode *dir, /* calculate meta data/clusters for setting security and acl xattr */ status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode, - &si, &want_clusters, - &xattr_credits, &xattr_ac); + &si, &want_clusters, + &xattr_credits, &want_meta); if (status < 0) { mlog_errno(status); goto leave; } /* Reserve a cluster if creating an extent based directory. 
*/ - if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) + if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) { want_clusters += 1; + /* Dir indexing requires extra space as well */ + if (ocfs2_supports_indexed_dirs(osb)) + want_meta++; + } + + status = ocfs2_reserve_new_metadata_blocks(osb, want_meta, &meta_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac); if (status < 0) { if (status != -ENOSPC) @@ -326,8 +339,9 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } - handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb) + - xattr_credits); + handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, + S_ISDIR(mode), + xattr_credits)); if (IS_ERR(handle)) { status = PTR_ERR(handle); handle = NULL; @@ -355,7 +369,7 @@ static int ocfs2_mknod(struct inode *dir, if (S_ISDIR(mode)) { status = ocfs2_fill_new_dir(osb, handle, dir, inode, - new_fe_bh, data_ac); + new_fe_bh, data_ac, meta_ac); if (status < 0) { mlog_errno(status); goto leave; @@ -367,7 +381,7 @@ static int ocfs2_mknod(struct inode *dir, mlog_errno(status); goto leave; } - le16_add_cpu(&dirfe->i_links_count, 1); + ocfs2_add_links_count(dirfe, 1); status = ocfs2_journal_dirty(handle, parent_fe_bh); if (status < 0) { mlog_errno(status); @@ -377,7 +391,7 @@ static int ocfs2_mknod(struct inode *dir, } status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh, - xattr_ac, data_ac); + meta_ac, data_ac); if (status < 0) { mlog_errno(status); goto leave; @@ -385,7 +399,7 @@ static int ocfs2_mknod(struct inode *dir, if (si.enable) { status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si, - xattr_ac, data_ac); + meta_ac, data_ac); if (status < 0) { mlog_errno(status); goto leave; @@ -394,7 +408,7 @@ static int ocfs2_mknod(struct inode *dir, status = ocfs2_add_entry(handle, dentry, inode, OCFS2_I(inode)->ip_blkno, parent_fe_bh, - de_bh); + &lookup); if (status < 0) { 
mlog_errno(status); goto leave; @@ -423,11 +437,12 @@ leave: mlog(0, "Disk is full\n"); brelse(new_fe_bh); - brelse(de_bh); brelse(parent_fe_bh); kfree(si.name); kfree(si.value); + ocfs2_free_dir_lookup_result(&lookup); + if ((status < 0) && inode) { clear_nlink(inode); iput(inode); @@ -439,8 +454,8 @@ leave: if (data_ac) ocfs2_free_alloc_context(data_ac); - if (xattr_ac) - ocfs2_free_alloc_context(xattr_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); mlog_exit(status); @@ -462,6 +477,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, struct ocfs2_extent_list *fel; u64 fe_blkno = 0; u16 suballoc_bit; + u16 feat; mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, inode->i_mode, (unsigned long)dev, dentry->d_name.len, @@ -469,8 +485,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, *new_fe_bh = NULL; - status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit, - &fe_blkno); + status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh, + inode_ac, &suballoc_bit, &fe_blkno); if (status < 0) { mlog_errno(status); goto leave; @@ -513,7 +529,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, fe->i_mode = cpu_to_le16(inode->i_mode); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); - fe->i_links_count = cpu_to_le16(inode->i_nlink); + + ocfs2_set_links_count(fe, inode->i_nlink); fe->i_last_eb_blk = 0; strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); @@ -525,11 +542,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, fe->i_dtime = 0; /* - * If supported, directories start with inline data. + * If supported, directories start with inline data. If inline + * isn't supported, but indexing is, we start them as indexed. 
*/ + feat = le16_to_cpu(fe->i_dyn_features); if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) { - u16 feat = le16_to_cpu(fe->i_dyn_features); - fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL); fe->id2.i_data.id_count = cpu_to_le16( @@ -608,9 +625,9 @@ static int ocfs2_link(struct dentry *old_dentry, int err; struct buffer_head *fe_bh = NULL; struct buffer_head *parent_fe_bh = NULL; - struct buffer_head *de_bh = NULL; struct ocfs2_dinode *fe = NULL; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct ocfs2_dir_lookup_result lookup = { NULL, }; mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino, old_dentry->d_name.len, old_dentry->d_name.name, @@ -638,7 +655,7 @@ static int ocfs2_link(struct dentry *old_dentry, err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, dentry->d_name.name, - dentry->d_name.len, &de_bh); + dentry->d_name.len, &lookup); if (err < 0) { mlog_errno(err); goto out; @@ -652,7 +669,7 @@ static int ocfs2_link(struct dentry *old_dentry, } fe = (struct ocfs2_dinode *) fe_bh->b_data; - if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) { + if (ocfs2_read_links_count(fe) >= ocfs2_link_max(osb)) { err = -EMLINK; goto out_unlock_inode; } @@ -674,13 +691,13 @@ static int ocfs2_link(struct dentry *old_dentry, inc_nlink(inode); inode->i_ctime = CURRENT_TIME; - fe->i_links_count = cpu_to_le16(inode->i_nlink); + ocfs2_set_links_count(fe, inode->i_nlink); fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); err = ocfs2_journal_dirty(handle, fe_bh); if (err < 0) { - le16_add_cpu(&fe->i_links_count, -1); + ocfs2_add_links_count(fe, -1); drop_nlink(inode); mlog_errno(err); goto out_commit; @@ -688,9 +705,9 @@ static int ocfs2_link(struct dentry *old_dentry, err = ocfs2_add_entry(handle, dentry, inode, OCFS2_I(inode)->ip_blkno, - parent_fe_bh, de_bh); + parent_fe_bh, &lookup); if (err) { - le16_add_cpu(&fe->i_links_count, -1); + ocfs2_add_links_count(fe, -1); 
drop_nlink(inode); mlog_errno(err); goto out_commit; @@ -714,10 +731,11 @@ out_unlock_inode: out: ocfs2_inode_unlock(dir, 1); - brelse(de_bh); brelse(fe_bh); brelse(parent_fe_bh); + ocfs2_free_dir_lookup_result(&lookup); + mlog_exit(err); return err; @@ -766,10 +784,9 @@ static int ocfs2_unlink(struct inode *dir, struct buffer_head *fe_bh = NULL; struct buffer_head *parent_node_bh = NULL; handle_t *handle = NULL; - struct ocfs2_dir_entry *dirent = NULL; - struct buffer_head *dirent_bh = NULL; char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; - struct buffer_head *orphan_entry_bh = NULL; + struct ocfs2_dir_lookup_result lookup = { NULL, }; + struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, dentry->d_name.len, dentry->d_name.name); @@ -791,8 +808,8 @@ static int ocfs2_unlink(struct inode *dir, } status = ocfs2_find_files_on_disk(dentry->d_name.name, - dentry->d_name.len, &blkno, - dir, &dirent_bh, &dirent); + dentry->d_name.len, &blkno, dir, + &lookup); if (status < 0) { if (status != -ENOENT) mlog_errno(status); @@ -817,10 +834,7 @@ static int ocfs2_unlink(struct inode *dir, child_locked = 1; if (S_ISDIR(inode->i_mode)) { - if (!ocfs2_empty_dir(inode)) { - status = -ENOTEMPTY; - goto leave; - } else if (inode->i_nlink != 2) { + if (inode->i_nlink != 2 || !ocfs2_empty_dir(inode)) { status = -ENOTEMPTY; goto leave; } @@ -836,8 +850,7 @@ static int ocfs2_unlink(struct inode *dir, if (inode_is_unlinkable(inode)) { status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode, - orphan_name, - &orphan_entry_bh); + orphan_name, &orphan_insert); if (status < 0) { mlog_errno(status); goto leave; @@ -863,7 +876,7 @@ static int ocfs2_unlink(struct inode *dir, if (inode_is_unlinkable(inode)) { status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name, - orphan_entry_bh, orphan_dir); + &orphan_insert, orphan_dir); if (status < 0) { mlog_errno(status); goto leave; @@ -871,7 +884,7 @@ static int ocfs2_unlink(struct inode *dir, 
} /* delete the name from the parent dir */ - status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh); + status = ocfs2_delete_entry(handle, dir, &lookup); if (status < 0) { mlog_errno(status); goto leave; @@ -880,7 +893,7 @@ static int ocfs2_unlink(struct inode *dir, if (S_ISDIR(inode->i_mode)) drop_nlink(inode); drop_nlink(inode); - fe->i_links_count = cpu_to_le16(inode->i_nlink); + ocfs2_set_links_count(fe, inode->i_nlink); status = ocfs2_journal_dirty(handle, fe_bh); if (status < 0) { @@ -916,9 +929,10 @@ leave: } brelse(fe_bh); - brelse(dirent_bh); brelse(parent_node_bh); - brelse(orphan_entry_bh); + + ocfs2_free_dir_lookup_result(&orphan_insert); + ocfs2_free_dir_lookup_result(&lookup); mlog_exit(status); @@ -1004,8 +1018,8 @@ static int ocfs2_rename(struct inode *old_dir, struct inode *new_dir, struct dentry *new_dentry) { - int status = 0, rename_lock = 0, parents_locked = 0; - int old_child_locked = 0, new_child_locked = 0; + int status = 0, rename_lock = 0, parents_locked = 0, target_exists = 0; + int old_child_locked = 0, new_child_locked = 0, update_dot_dot = 0; struct inode *old_inode = old_dentry->d_inode; struct inode *new_inode = new_dentry->d_inode; struct inode *orphan_dir = NULL; @@ -1020,13 +1034,13 @@ static int ocfs2_rename(struct inode *old_dir, handle_t *handle = NULL; struct buffer_head *old_dir_bh = NULL; struct buffer_head *new_dir_bh = NULL; - struct ocfs2_dir_entry *old_inode_dot_dot_de = NULL, *old_de = NULL, - *new_de = NULL; - struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above - struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir, - // this is the 1st dirent bh nlink_t old_dir_nlink = old_dir->i_nlink; struct ocfs2_dinode *old_di; + struct ocfs2_dir_lookup_result old_inode_dot_dot_res = { NULL, }; + struct ocfs2_dir_lookup_result target_lookup_res = { NULL, }; + struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, }; + struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; + struct 
ocfs2_dir_lookup_result target_insert = { NULL, }; /* At some point it might be nice to break this function up a * bit. */ @@ -1108,9 +1122,10 @@ static int ocfs2_rename(struct inode *old_dir, if (S_ISDIR(old_inode->i_mode)) { u64 old_inode_parent; + update_dot_dot = 1; status = ocfs2_find_files_on_disk("..", 2, &old_inode_parent, - old_inode, &old_inode_de_bh, - &old_inode_dot_dot_de); + old_inode, + &old_inode_dot_dot_res); if (status) { status = -EIO; goto bail; @@ -1122,7 +1137,7 @@ static int ocfs2_rename(struct inode *old_dir, } if (!new_inode && new_dir != old_dir && - new_dir->i_nlink >= OCFS2_LINK_MAX) { + new_dir->i_nlink >= ocfs2_link_max(osb)) { status = -EMLINK; goto bail; } @@ -1151,8 +1166,8 @@ static int ocfs2_rename(struct inode *old_dir, * to delete it */ status = ocfs2_find_files_on_disk(new_dentry->d_name.name, new_dentry->d_name.len, - &newfe_blkno, new_dir, &new_de_bh, - &new_de); + &newfe_blkno, new_dir, + &target_lookup_res); /* The only error we allow here is -ENOENT because the new * file not existing is perfectly valid. */ if ((status < 0) && (status != -ENOENT)) { @@ -1161,8 +1176,10 @@ static int ocfs2_rename(struct inode *old_dir, mlog_errno(status); goto bail; } + if (status == 0) + target_exists = 1; - if (!new_de && new_inode) { + if (!target_exists && new_inode) { /* * Target was unlinked by another node while we were * waiting to get to ocfs2_rename(). There isn't @@ -1175,7 +1192,7 @@ static int ocfs2_rename(struct inode *old_dir, /* In case we need to overwrite an existing file, we blow it * away first */ - if (new_de) { + if (target_exists) { /* VFS didn't think there existed an inode here, but * someone else in the cluster must have raced our * rename to create one. Today we error cleanly, in @@ -1216,8 +1233,8 @@ static int ocfs2_rename(struct inode *old_dir, newfe = (struct ocfs2_dinode *) newfe_bh->b_data; - mlog(0, "aha rename over existing... 
new_de=%p new_blkno=%llu " - "newfebh=%p bhblocknr=%llu\n", new_de, + mlog(0, "aha rename over existing... new_blkno=%llu " + "newfebh=%p bhblocknr=%llu\n", (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ? (unsigned long long)newfe_bh->b_blocknr : 0ULL); @@ -1225,7 +1242,7 @@ static int ocfs2_rename(struct inode *old_dir, status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, new_inode, orphan_name, - &orphan_entry_bh); + &orphan_insert); if (status < 0) { mlog_errno(status); goto bail; @@ -1243,7 +1260,7 @@ static int ocfs2_rename(struct inode *old_dir, status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh, new_dentry->d_name.name, new_dentry->d_name.len, - &insert_entry_bh); + &target_insert); if (status < 0) { mlog_errno(status); goto bail; @@ -1258,10 +1275,10 @@ static int ocfs2_rename(struct inode *old_dir, goto bail; } - if (new_de) { + if (target_exists) { if (S_ISDIR(new_inode->i_mode)) { - if (!ocfs2_empty_dir(new_inode) || - new_inode->i_nlink != 2) { + if (new_inode->i_nlink != 2 || + !ocfs2_empty_dir(new_inode)) { status = -ENOTEMPTY; goto bail; } @@ -1274,10 +1291,10 @@ static int ocfs2_rename(struct inode *old_dir, } if (S_ISDIR(new_inode->i_mode) || - (newfe->i_links_count == cpu_to_le16(1))){ + (ocfs2_read_links_count(newfe) == 1)) { status = ocfs2_orphan_add(osb, handle, new_inode, newfe, orphan_name, - orphan_entry_bh, orphan_dir); + &orphan_insert, orphan_dir); if (status < 0) { mlog_errno(status); goto bail; @@ -1285,8 +1302,8 @@ static int ocfs2_rename(struct inode *old_dir, } /* change the dirent to point to the correct inode */ - status = ocfs2_update_entry(new_dir, handle, new_de_bh, - new_de, old_inode); + status = ocfs2_update_entry(new_dir, handle, &target_lookup_res, + old_inode); if (status < 0) { mlog_errno(status); goto bail; @@ -1294,9 +1311,9 @@ static int ocfs2_rename(struct inode *old_dir, new_dir->i_version++; if (S_ISDIR(new_inode->i_mode)) - newfe->i_links_count = 0; + ocfs2_set_links_count(newfe, 0); else - 
le16_add_cpu(&newfe->i_links_count, -1); + ocfs2_add_links_count(newfe, -1); status = ocfs2_journal_dirty(handle, newfe_bh); if (status < 0) { @@ -1307,7 +1324,7 @@ static int ocfs2_rename(struct inode *old_dir, /* if the name was not found in new_dir, add it now */ status = ocfs2_add_entry(handle, new_dentry, old_inode, OCFS2_I(old_inode)->ip_blkno, - new_dir_bh, insert_entry_bh); + new_dir_bh, &target_insert); } old_inode->i_ctime = CURRENT_TIME; @@ -1334,15 +1351,13 @@ static int ocfs2_rename(struct inode *old_dir, * because the insert might have changed the type of directory * we're dealing with. */ - old_de_bh = ocfs2_find_entry(old_dentry->d_name.name, - old_dentry->d_name.len, - old_dir, &old_de); - if (!old_de_bh) { - status = -EIO; + status = ocfs2_find_entry(old_dentry->d_name.name, + old_dentry->d_name.len, old_dir, + &old_entry_lookup); + if (status) goto bail; - } - status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh); + status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup); if (status < 0) { mlog_errno(status); goto bail; @@ -1353,9 +1368,10 @@ static int ocfs2_rename(struct inode *old_dir, new_inode->i_ctime = CURRENT_TIME; } old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; - if (old_inode_de_bh) { - status = ocfs2_update_entry(old_inode, handle, old_inode_de_bh, - old_inode_dot_dot_de, new_dir); + + if (update_dot_dot) { + status = ocfs2_update_entry(old_inode, handle, + &old_inode_dot_dot_res, new_dir); old_dir->i_nlink--; if (new_inode) { new_inode->i_nlink--; @@ -1391,14 +1407,13 @@ static int ocfs2_rename(struct inode *old_dir, } else { struct ocfs2_dinode *fe; status = ocfs2_journal_access_di(handle, old_dir, - old_dir_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + old_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); fe = (struct ocfs2_dinode *) old_dir_bh->b_data; - fe->i_links_count = cpu_to_le16(old_dir->i_nlink); + ocfs2_set_links_count(fe, old_dir->i_nlink); status = ocfs2_journal_dirty(handle, old_dir_bh); } } - 
ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); status = 0; bail: @@ -1429,13 +1444,17 @@ bail: if (new_inode) iput(new_inode); + + ocfs2_free_dir_lookup_result(&target_lookup_res); + ocfs2_free_dir_lookup_result(&old_entry_lookup); + ocfs2_free_dir_lookup_result(&old_inode_dot_dot_res); + ocfs2_free_dir_lookup_result(&orphan_insert); + ocfs2_free_dir_lookup_result(&target_insert); + brelse(newfe_bh); brelse(old_inode_bh); brelse(old_dir_bh); brelse(new_dir_bh); - brelse(new_de_bh); - brelse(old_de_bh); - brelse(old_inode_de_bh); brelse(orphan_entry_bh); brelse(insert_entry_bh); @@ -1558,7 +1577,6 @@ static int ocfs2_symlink(struct inode *dir, struct inode *inode = NULL; struct super_block *sb; struct buffer_head *new_fe_bh = NULL; - struct buffer_head *de_bh = NULL; struct buffer_head *parent_fe_bh = NULL; struct ocfs2_dinode *fe = NULL; struct ocfs2_dinode *dirfe; @@ -1572,6 +1590,7 @@ static int ocfs2_symlink(struct inode *dir, .enable = 1, }; int did_quota = 0, did_quota_inode = 0; + struct ocfs2_dir_lookup_result lookup = { NULL, }; mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, dentry, symname, dentry->d_name.len, dentry->d_name.name); @@ -1592,7 +1611,7 @@ static int ocfs2_symlink(struct inode *dir, } dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; - if (!dirfe->i_links_count) { + if (!ocfs2_read_links_count(dirfe)) { /* can't make a file in a deleted directory. 
*/ status = -ENOENT; goto bail; @@ -1605,7 +1624,7 @@ static int ocfs2_symlink(struct inode *dir, status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, dentry->d_name.name, - dentry->d_name.len, &de_bh); + dentry->d_name.len, &lookup); if (status < 0) { mlog_errno(status); goto bail; @@ -1744,7 +1763,7 @@ static int ocfs2_symlink(struct inode *dir, status = ocfs2_add_entry(handle, dentry, inode, le64_to_cpu(fe->i_blkno), parent_fe_bh, - de_bh); + &lookup); if (status < 0) { mlog_errno(status); goto bail; @@ -1772,9 +1791,9 @@ bail: brelse(new_fe_bh); brelse(parent_fe_bh); - brelse(de_bh); kfree(si.name); kfree(si.value); + ocfs2_free_dir_lookup_result(&lookup); if (inode_ac) ocfs2_free_alloc_context(inode_ac); if (data_ac) @@ -1826,7 +1845,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, struct inode **ret_orphan_dir, struct inode *inode, char *name, - struct buffer_head **de_bh) + struct ocfs2_dir_lookup_result *lookup) { struct inode *orphan_dir_inode; struct buffer_head *orphan_dir_bh = NULL; @@ -1857,7 +1876,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, orphan_dir_bh, name, - OCFS2_ORPHAN_NAMELEN, de_bh); + OCFS2_ORPHAN_NAMELEN, lookup); if (status < 0) { ocfs2_inode_unlock(orphan_dir_inode, 1); @@ -1884,7 +1903,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, struct inode *inode, struct ocfs2_dinode *fe, char *name, - struct buffer_head *de_bh, + struct ocfs2_dir_lookup_result *lookup, struct inode *orphan_dir_inode) { struct buffer_head *orphan_dir_bh = NULL; @@ -1910,8 +1929,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, * underneath us... 
*/ orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; if (S_ISDIR(inode->i_mode)) - le16_add_cpu(&orphan_fe->i_links_count, 1); - orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); + ocfs2_add_links_count(orphan_fe, 1); + orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); status = ocfs2_journal_dirty(handle, orphan_dir_bh); if (status < 0) { @@ -1922,7 +1941,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, status = __ocfs2_add_entry(handle, orphan_dir_inode, name, OCFS2_ORPHAN_NAMELEN, inode, OCFS2_I(inode)->ip_blkno, - orphan_dir_bh, de_bh); + orphan_dir_bh, lookup); if (status < 0) { mlog_errno(status); goto leave; @@ -1955,8 +1974,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, char name[OCFS2_ORPHAN_NAMELEN + 1]; struct ocfs2_dinode *orphan_fe; int status = 0; - struct buffer_head *target_de_bh = NULL; - struct ocfs2_dir_entry *target_de = NULL; + struct ocfs2_dir_lookup_result lookup = { NULL, }; mlog_entry_void(); @@ -1971,17 +1989,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, OCFS2_ORPHAN_NAMELEN); /* find it's spot in the orphan directory */ - target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, - orphan_dir_inode, &target_de); - if (!target_de_bh) { - status = -ENOENT; + status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode, + &lookup); + if (status) { mlog_errno(status); goto leave; } /* remove it from the orphan directory */ - status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de, - target_de_bh); + status = ocfs2_delete_entry(handle, orphan_dir_inode, &lookup); if (status < 0) { mlog_errno(status); goto leave; @@ -1997,8 +2013,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, /* do the i_nlink dance! 
:) */ orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; if (S_ISDIR(inode->i_mode)) - le16_add_cpu(&orphan_fe->i_links_count, -1); - orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); + ocfs2_add_links_count(orphan_fe, -1); + orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); status = ocfs2_journal_dirty(handle, orphan_dir_bh); if (status < 0) { @@ -2007,7 +2023,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, } leave: - brelse(target_de_bh); + ocfs2_free_dir_lookup_result(&lookup); mlog_exit(status); return status; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 946d3c34b90b..1386281950db 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -209,6 +209,7 @@ enum ocfs2_mount_options struct ocfs2_journal; struct ocfs2_slot_info; struct ocfs2_recovery_map; +struct ocfs2_replay_map; struct ocfs2_quota_recovery; struct ocfs2_dentry_lock; struct ocfs2_super @@ -264,6 +265,7 @@ struct ocfs2_super atomic_t vol_state; struct mutex recovery_lock; struct ocfs2_recovery_map *recovery_map; + struct ocfs2_replay_map *replay_map; struct task_struct *recovery_thread_task; int disable_recovery; wait_queue_head_t checkpoint_event; @@ -287,11 +289,6 @@ struct ocfs2_super u64 la_last_gd; -#ifdef CONFIG_OCFS2_FS_STATS - struct dentry *local_alloc_debug; - char *local_alloc_debug_buf; -#endif - /* Next three fields are for local node slot recovery during * mount. */ int dirty; @@ -305,9 +302,11 @@ struct ocfs2_super struct ocfs2_cluster_connection *cconn; struct ocfs2_lock_res osb_super_lockres; struct ocfs2_lock_res osb_rename_lockres; + struct ocfs2_lock_res osb_nfs_sync_lockres; struct ocfs2_dlm_debug *osb_dlm_debug; struct dentry *osb_debug_root; + struct dentry *osb_ctxt; wait_queue_head_t recovery_event; @@ -344,6 +343,12 @@ struct ocfs2_super /* used to protect metaecc calculation check of xattr. */ spinlock_t osb_xattr_lock; + + unsigned int osb_dx_mask; + u32 osb_dx_seed[4]; + + /* the group we used to allocate inodes. 
*/ + u64 osb_inode_alloc_group; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) @@ -402,6 +407,51 @@ static inline int ocfs2_meta_ecc(struct ocfs2_super *osb) return 0; } +static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS) + return 1; + return 0; +} + +static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb) +{ + if (ocfs2_supports_indexed_dirs(osb)) + return OCFS2_DX_LINK_MAX; + return OCFS2_LINK_MAX; +} + +static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di) +{ + u32 nlink = le16_to_cpu(di->i_links_count); + u32 hi = le16_to_cpu(di->i_links_count_hi); + + if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL)) + nlink |= (hi << OCFS2_LINKS_HI_SHIFT); + + return nlink; +} + +static inline void ocfs2_set_links_count(struct ocfs2_dinode *di, u32 nlink) +{ + u16 lo, hi; + + lo = nlink; + hi = nlink >> OCFS2_LINKS_HI_SHIFT; + + di->i_links_count = cpu_to_le16(lo); + di->i_links_count_hi = cpu_to_le16(hi); +} + +static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n) +{ + u32 links = ocfs2_read_links_count(di); + + links += n; + + ocfs2_set_links_count(di, links); +} + /* set / clear functions because cluster events can make these happen * in parallel so we want the transitions to be atomic. 
this also * means that any future flags osb_flags must be protected by spinlock @@ -482,6 +532,12 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) #define OCFS2_IS_VALID_DIR_TRAILER(ptr) \ (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE)) +#define OCFS2_IS_VALID_DX_ROOT(ptr) \ + (!strcmp((ptr)->dr_signature, OCFS2_DX_ROOT_SIGNATURE)) + +#define OCFS2_IS_VALID_DX_LEAF(ptr) \ + (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE)) + static inline unsigned long ino_from_blkno(struct super_block *sb, u64 blkno) { @@ -532,6 +588,16 @@ static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb, return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits; } +static inline u64 ocfs2_block_to_cluster_start(struct super_block *sb, + u64 blocks) +{ + int bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits; + unsigned int clusters; + + clusters = ocfs2_blocks_to_clusters(sb, blocks); + return (u64)clusters << bits; +} + static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb, u64 bytes) { diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 2332ef740f4f..7ab6e9e5e77c 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -66,6 +66,8 @@ #define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" #define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" #define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1" +#define OCFS2_DX_ROOT_SIGNATURE "DXDIR01" +#define OCFS2_DX_LEAF_SIGNATURE "DXLEAF1" /* Compatibility flags */ #define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ @@ -95,7 +97,8 @@ | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ | OCFS2_FEATURE_INCOMPAT_XATTR \ - | OCFS2_FEATURE_INCOMPAT_META_ECC) + | OCFS2_FEATURE_INCOMPAT_META_ECC \ + | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS) #define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) @@ -151,6 +154,9 @@ /* Support for extended attributes */ #define 
OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 +/* Support for indexed directories */ +#define OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS 0x0400 + /* Metadata checksum and error correction */ #define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800 @@ -411,8 +417,12 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { #define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \ OCFS2_DIR_ROUND) & \ ~OCFS2_DIR_ROUND) +#define OCFS2_DIR_MIN_REC_LEN OCFS2_DIR_REC_LEN(1) #define OCFS2_LINK_MAX 32000 +#define OCFS2_DX_LINK_MAX ((1U << 31) - 1U) +#define OCFS2_LINKS_HI_SHIFT 16 +#define OCFS2_DX_ENTRIES_MAX (0xffffffffU) #define S_SHIFT 12 static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { @@ -628,8 +638,9 @@ struct ocfs2_super_block { /*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size for this fs*/ __le16 s_reserved0; - __le32 s_reserved1; -/*C0*/ __le64 s_reserved2[16]; /* Fill out superblock */ + __le32 s_dx_seed[3]; /* seed[0-2] for dx dir hash. + * s_uuid_hash serves as seed[3]. */ +/*C0*/ __le64 s_reserved2[15]; /* Fill out superblock */ /*140*/ /* @@ -679,7 +690,7 @@ struct ocfs2_dinode { belongs to */ __le16 i_suballoc_bit; /* Bit offset in suballocator block group */ -/*10*/ __le16 i_reserved0; +/*10*/ __le16 i_links_count_hi; /* High 16 bits of links count */ __le16 i_xattr_inline_size; __le32 i_clusters; /* Cluster count */ __le32 i_uid; /* Owner UID */ @@ -705,7 +716,8 @@ struct ocfs2_dinode { __le16 i_dyn_features; __le64 i_xattr_loc; /*80*/ struct ocfs2_block_check i_check; /* Error checking */ -/*88*/ __le64 i_reserved2[6]; +/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */ + __le64 i_reserved2[5]; /*B8*/ union { __le64 i_pad1; /* Generic way to refer to this 64bit union */ @@ -781,6 +793,90 @@ struct ocfs2_dir_block_trailer { /*40*/ }; + /* + * A directory entry in the indexed tree. We don't store the full name here, + * but instead provide a pointer to the full dirent in the unindexed tree.
+ * + * We also store name_len here so as to reduce the number of leaf blocks we + * need to search in case of collisions. + */ +struct ocfs2_dx_entry { + __le32 dx_major_hash; /* Used to find logical + * cluster in index */ + __le32 dx_minor_hash; /* Lower bits used to find + * block in cluster */ + __le64 dx_dirent_blk; /* Physical block in unindexed + * tree holding this dirent. */ +}; + +struct ocfs2_dx_entry_list { + __le32 de_reserved; + __le16 de_count; /* Maximum number of entries + * possible in de_entries */ + __le16 de_num_used; /* Current number of + * de_entries entries */ + struct ocfs2_dx_entry de_entries[0]; /* Indexed dir entries + * in a packed array of + * length de_num_used */ +}; + +#define OCFS2_DX_FLAG_INLINE 0x01 + +/* + * A directory indexing block. Each indexed directory has one of these, + * pointed to by ocfs2_dinode. + * + * This block stores an indexed btree root, and a set of free space + * start-of-list pointers. + */ +struct ocfs2_dx_root_block { + __u8 dr_signature[8]; /* Signature for verification */ + struct ocfs2_block_check dr_check; /* Error checking */ + __le16 dr_suballoc_slot; /* Slot suballocator this + * block belongs to. */ + __le16 dr_suballoc_bit; /* Bit offset in suballocator + * block group */ + __le32 dr_fs_generation; /* Must match super block */ + __le64 dr_blkno; /* Offset on disk, in blocks */ + __le64 dr_last_eb_blk; /* Pointer to last + * extent block */ + __le32 dr_clusters; /* Clusters allocated + * to the indexed tree. */ + __u8 dr_flags; /* OCFS2_DX_FLAG_* flags */ + __u8 dr_reserved0; + __le16 dr_reserved1; + __le64 dr_dir_blkno; /* Pointer to parent inode */ + __le32 dr_num_entries; /* Total number of + * names stored in + * this directory.*/ + __le32 dr_reserved2; + __le64 dr_free_blk; /* Pointer to head of free + * unindexed block list. */ + __le64 dr_reserved3[15]; + union { + struct ocfs2_extent_list dr_list; /* Keep this aligned to 128 + * bits for maximum space + * efficiency. 
*/ + struct ocfs2_dx_entry_list dr_entries; /* In-root-block list of + * entries. We grow out + * to extents if this + * gets too big. */ + }; +}; + +/* + * The header of a leaf block in the indexed tree. + */ +struct ocfs2_dx_leaf { + __u8 dl_signature[8];/* Signature for verification */ + struct ocfs2_block_check dl_check; /* Error checking */ + __le64 dl_blkno; /* Offset on disk, in blocks */ + __le32 dl_fs_generation;/* Must match super block */ + __le32 dl_reserved0; + __le64 dl_reserved1; + struct ocfs2_dx_entry_list dl_list; +}; + /* * On disk allocator group structure for OCFS2 */ @@ -1112,6 +1208,16 @@ static inline int ocfs2_extent_recs_per_inode_with_xattr( return size / sizeof(struct ocfs2_extent_rec); } +static inline int ocfs2_extent_recs_per_dx_root(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dx_root_block, dr_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) { int size; @@ -1132,6 +1238,26 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb) return size / sizeof(struct ocfs2_extent_rec); } +static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dx_leaf, dl_list.de_entries); + + return size / sizeof(struct ocfs2_dx_entry); +} + +static inline int ocfs2_dx_entries_per_root(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dx_root_block, dr_entries.de_entries); + + return size / sizeof(struct ocfs2_dx_entry); +} + static inline u16 ocfs2_local_alloc_size(struct super_block *sb) { u16 size; diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index eb6f50c9ceca..a53ce87481bf 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -47,6 +47,7 @@ enum ocfs2_lock_type { OCFS2_LOCK_TYPE_OPEN, OCFS2_LOCK_TYPE_FLOCK, OCFS2_LOCK_TYPE_QINFO, + 
OCFS2_LOCK_TYPE_NFS_SYNC, OCFS2_NUM_LOCK_TYPES }; @@ -81,6 +82,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) case OCFS2_LOCK_TYPE_QINFO: c = 'Q'; break; + case OCFS2_LOCK_TYPE_NFS_SYNC: + c = 'Y'; + break; default: c = '\0'; } diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index a69628603e18..b4ca5911caaf 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -48,7 +48,8 @@ #include "buffer_head_io.h" #define NOT_ALLOC_NEW_GROUP 0 -#define ALLOC_NEW_GROUP 1 +#define ALLOC_NEW_GROUP 0x1 +#define ALLOC_GROUPS_FROM_GLOBAL 0x2 #define OCFS2_MAX_INODES_TO_STEAL 1024 @@ -64,7 +65,9 @@ static int ocfs2_block_group_fill(handle_t *handle, static int ocfs2_block_group_alloc(struct ocfs2_super *osb, struct inode *alloc_inode, struct buffer_head *bh, - u64 max_block); + u64 max_block, + u64 *last_alloc_group, + int flags); static int ocfs2_cluster_group_search(struct inode *inode, struct buffer_head *group_bh, @@ -116,6 +119,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode, u16 *bg_bit_off); static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, u32 bits_wanted, u64 max_block, + int flags, struct ocfs2_alloc_context **ac); void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) @@ -403,7 +407,9 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl) static int ocfs2_block_group_alloc(struct ocfs2_super *osb, struct inode *alloc_inode, struct buffer_head *bh, - u64 max_block) + u64 max_block, + u64 *last_alloc_group, + int flags) { int status, credits; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; @@ -423,7 +429,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, cl = &fe->id2.i_chain; status = ocfs2_reserve_clusters_with_limit(osb, le16_to_cpu(cl->cl_cpg), - max_block, &ac); + max_block, flags, &ac); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -440,6 +446,11 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, goto 
bail; } + if (last_alloc_group && *last_alloc_group != 0) { + mlog(0, "use old allocation group %llu for block group alloc\n", + (unsigned long long)*last_alloc_group); + ac->ac_last_group = *last_alloc_group; + } status = ocfs2_claim_clusters(osb, handle, ac, @@ -514,6 +525,11 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); status = 0; + + /* save the new last alloc group so that the caller can cache it. */ + if (last_alloc_group) + *last_alloc_group = ac->ac_last_group; + bail: if (handle) ocfs2_commit_trans(osb, handle); @@ -531,7 +547,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, struct ocfs2_alloc_context *ac, int type, u32 slot, - int alloc_new_group) + u64 *last_alloc_group, + int flags) { int status; u32 bits_wanted = ac->ac_bits_wanted; @@ -587,7 +604,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, goto bail; } - if (alloc_new_group != ALLOC_NEW_GROUP) { + if (!(flags & ALLOC_NEW_GROUP)) { mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, " "and we don't alloc a new group for it.\n", slot, bits_wanted, free_bits); @@ -596,7 +613,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, } status = ocfs2_block_group_alloc(osb, alloc_inode, bh, - ac->ac_max_block); + ac->ac_max_block, + last_alloc_group, flags); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -640,7 +658,7 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, status = ocfs2_reserve_suballoc_bits(osb, (*ac), EXTENT_ALLOC_SYSTEM_INODE, - slot, ALLOC_NEW_GROUP); + slot, NULL, ALLOC_NEW_GROUP); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -686,7 +704,8 @@ static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb, status = ocfs2_reserve_suballoc_bits(osb, ac, INODE_ALLOC_SYSTEM_INODE, - slot, NOT_ALLOC_NEW_GROUP); + slot, NULL, + NOT_ALLOC_NEW_GROUP); if (status >= 0) { ocfs2_set_inode_steal_slot(osb, 
slot); break; @@ -703,6 +722,7 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb, { int status; s16 slot = ocfs2_get_inode_steal_slot(osb); + u64 alloc_group; *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); if (!(*ac)) { @@ -738,12 +758,22 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb, goto inode_steal; atomic_set(&osb->s_num_inodes_stolen, 0); + alloc_group = osb->osb_inode_alloc_group; status = ocfs2_reserve_suballoc_bits(osb, *ac, INODE_ALLOC_SYSTEM_INODE, - osb->slot_num, ALLOC_NEW_GROUP); + osb->slot_num, + &alloc_group, + ALLOC_NEW_GROUP | + ALLOC_GROUPS_FROM_GLOBAL); if (status >= 0) { status = 0; + spin_lock(&osb->osb_lock); + osb->osb_inode_alloc_group = alloc_group; + spin_unlock(&osb->osb_lock); + mlog(0, "after reservation, new allocation group is " + "%llu\n", (unsigned long long)alloc_group); + /* * Some inodes must be freed by us, so try to allocate * from our own next time. @@ -790,7 +820,7 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, status = ocfs2_reserve_suballoc_bits(osb, ac, GLOBAL_BITMAP_SYSTEM_INODE, - OCFS2_INVALID_SLOT, + OCFS2_INVALID_SLOT, NULL, ALLOC_NEW_GROUP); if (status < 0 && status != -ENOSPC) { mlog_errno(status); @@ -806,6 +836,7 @@ bail: * things a bit. 
*/ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, u32 bits_wanted, u64 max_block, + int flags, struct ocfs2_alloc_context **ac) { int status; @@ -823,7 +854,8 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, (*ac)->ac_max_block = max_block; status = -ENOSPC; - if (ocfs2_alloc_should_use_local(osb, bits_wanted)) { + if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) && + ocfs2_alloc_should_use_local(osb, bits_wanted)) { status = ocfs2_reserve_local_alloc_bits(osb, bits_wanted, *ac); @@ -861,7 +893,8 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb, u32 bits_wanted, struct ocfs2_alloc_context **ac) { - return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac); + return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, + ALLOC_NEW_GROUP, ac); } /* @@ -1618,8 +1651,41 @@ bail: return status; } +static void ocfs2_init_inode_ac_group(struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *ac) +{ + struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data; + /* + * Try to allocate inodes from some specific group. + * + * If the parent dir has recorded the last group used in allocation, + * cool, use it. Otherwise if we try to allocate new inode from the + * same slot the parent dir belongs to, use the same chunk. + * + * We are very careful here to avoid the mistake of setting + * ac_last_group to a group descriptor from a different (unlocked) slot. 
+ */ + if (OCFS2_I(dir)->ip_last_used_group && + OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot) + ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group; + else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot) + ac->ac_last_group = ocfs2_which_suballoc_group( + le64_to_cpu(fe->i_blkno), + le16_to_cpu(fe->i_suballoc_bit)); +} + +static inline void ocfs2_save_inode_ac_group(struct inode *dir, + struct ocfs2_alloc_context *ac) +{ + OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group; + OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; +} + int ocfs2_claim_new_inode(struct ocfs2_super *osb, handle_t *handle, + struct inode *dir, + struct buffer_head *parent_fe_bh, struct ocfs2_alloc_context *ac, u16 *suballoc_bit, u64 *fe_blkno) @@ -1635,6 +1701,8 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb, BUG_ON(ac->ac_bits_wanted != 1); BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); + ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); + status = ocfs2_claim_suballoc_bits(osb, ac, handle, @@ -1653,6 +1721,7 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb, *fe_blkno = bg_blkno + (u64) (*suballoc_bit); ac->ac_bits_given++; + ocfs2_save_inode_ac_group(dir, ac); status = 0; bail: mlog_exit(status); @@ -2116,3 +2185,162 @@ out: return ret; } + +/* + * Read the inode specified by blkno to get suballoc_slot and + * suballoc_bit. 
+ */ +static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, + u16 *suballoc_slot, u16 *suballoc_bit) +{ + int status; + struct buffer_head *inode_bh = NULL; + struct ocfs2_dinode *inode_fe; + + mlog_entry("blkno: %llu\n", blkno); + + /* dirty read disk */ + status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh); + if (status < 0) { + mlog(ML_ERROR, "read block %llu failed %d\n", blkno, status); + goto bail; + } + + inode_fe = (struct ocfs2_dinode *) inode_bh->b_data; + if (!OCFS2_IS_VALID_DINODE(inode_fe)) { + mlog(ML_ERROR, "invalid inode %llu requested\n", blkno); + status = -EINVAL; + goto bail; + } + + if (le16_to_cpu(inode_fe->i_suballoc_slot) != OCFS2_INVALID_SLOT && + (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) { + mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n", + blkno, (u32)le16_to_cpu(inode_fe->i_suballoc_slot)); + status = -EINVAL; + goto bail; + } + + if (suballoc_slot) + *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot); + if (suballoc_bit) + *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit); + +bail: + brelse(inode_bh); + + mlog_exit(status); + return status; +} + +/* + * test whether bit is SET in allocator bitmap or not. on success, 0 + * is returned and *res is 1 for SET; 0 otherwise. when fails, errno + * is returned and *res is meaningless. 
Call this after you have + * cluster locked against suballoc, or you may get a result based on + * non-up2date contents + */ +static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, + struct inode *suballoc, + struct buffer_head *alloc_bh, u64 blkno, + u16 bit, int *res) +{ + struct ocfs2_dinode *alloc_fe; + struct ocfs2_group_desc *group; + struct buffer_head *group_bh = NULL; + u64 bg_blkno; + int status; + + mlog_entry("blkno: %llu bit: %u\n", blkno, (unsigned int)bit); + + alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data; + if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) { + mlog(ML_ERROR, "suballoc bit %u out of range of %u\n", + (unsigned int)bit, + ocfs2_bits_per_group(&alloc_fe->id2.i_chain)); + status = -EINVAL; + goto bail; + } + + bg_blkno = ocfs2_which_suballoc_group(blkno, bit); + status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno, + &group_bh); + if (status < 0) { + mlog(ML_ERROR, "read group %llu failed %d\n", bg_blkno, status); + goto bail; + } + + group = (struct ocfs2_group_desc *) group_bh->b_data; + *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap); + +bail: + brelse(group_bh); + + mlog_exit(status); + return status; +} + +/* + * Test if the bit representing this inode (blkno) is set in the + * suballocator. + * + * On success, 0 is returned and *res is 1 for SET; 0 otherwise. + * + * In the event of failure, a negative value is returned and *res is + * meaningless. + * + * Callers must make sure to hold nfs_sync_lock to prevent + * ocfs2_delete_inode() on another node from accessing the same + * suballocator concurrently. 
+ */ +int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) +{ + int status; + u16 suballoc_bit = 0, suballoc_slot = 0; + struct inode *inode_alloc_inode; + struct buffer_head *alloc_bh = NULL; + + mlog_entry("blkno: %llu", blkno); + + status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, + &suballoc_bit); + if (status < 0) { + mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status); + goto bail; + } + + inode_alloc_inode = + ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, + suballoc_slot); + if (!inode_alloc_inode) { + /* the error code could be inaccurate, but we are not able to + * get the correct one. */ + status = -EINVAL; + mlog(ML_ERROR, "unable to get alloc inode in slot %u\n", + (u32)suballoc_slot); + goto bail; + } + + mutex_lock(&inode_alloc_inode->i_mutex); + status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); + if (status < 0) { + mutex_unlock(&inode_alloc_inode->i_mutex); + mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", + (u32)suballoc_slot, status); + goto bail; + } + + status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh, + blkno, suballoc_bit, res); + if (status < 0) + mlog(ML_ERROR, "test suballoc bit failed %d\n", status); + + ocfs2_inode_unlock(inode_alloc_inode, 0); + mutex_unlock(&inode_alloc_inode->i_mutex); + + iput(inode_alloc_inode); + brelse(alloc_bh); +bail: + mlog_exit(status); + return status; +} diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index e3c13c77f9e8..8c9a78a43164 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -88,6 +88,8 @@ int ocfs2_claim_metadata(struct ocfs2_super *osb, u64 *blkno_start); int ocfs2_claim_new_inode(struct ocfs2_super *osb, handle_t *handle, + struct inode *dir, + struct buffer_head *parent_fe_bh, struct ocfs2_alloc_context *ac, u16 *suballoc_bit, u64 *fe_blkno); @@ -186,4 +188,6 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et, u32 clusters_to_add, u32 extents_to_split, 
struct ocfs2_alloc_context **data_ac, struct ocfs2_alloc_context **meta_ac); + +int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res); #endif /* _CHAINALLOC_H_ */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 7ac83a81ee55..79ff8d9d37e0 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -201,6 +201,170 @@ static const match_table_t tokens = { {Opt_err, NULL} }; +#ifdef CONFIG_DEBUG_FS +static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) +{ + int out = 0; + int i; + struct ocfs2_cluster_connection *cconn = osb->cconn; + struct ocfs2_recovery_map *rm = osb->recovery_map; + + out += snprintf(buf + out, len - out, + "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n", + "Device", osb->dev_str, osb->uuid_str, + osb->fs_generation, osb->vol_label); + + out += snprintf(buf + out, len - out, + "%10s => State: %d Flags: 0x%lX\n", "Volume", + atomic_read(&osb->vol_state), osb->osb_flags); + + out += snprintf(buf + out, len - out, + "%10s => Block: %lu Cluster: %d\n", "Sizes", + osb->sb->s_blocksize, osb->s_clustersize); + + out += snprintf(buf + out, len - out, + "%10s => Compat: 0x%X Incompat: 0x%X " + "ROcompat: 0x%X\n", + "Features", osb->s_feature_compat, + osb->s_feature_incompat, osb->s_feature_ro_compat); + + out += snprintf(buf + out, len - out, + "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount", + osb->s_mount_opt, osb->s_atime_quantum); + + out += snprintf(buf + out, len - out, + "%10s => Stack: %s Name: %*s Version: %d.%d\n", + "Cluster", + (*osb->osb_cluster_stack == '\0' ? 
+ "o2cb" : osb->osb_cluster_stack), + cconn->cc_namelen, cconn->cc_name, + cconn->cc_version.pv_major, cconn->cc_version.pv_minor); + + spin_lock(&osb->dc_task_lock); + out += snprintf(buf + out, len - out, + "%10s => Pid: %d Count: %lu WakeSeq: %lu " + "WorkSeq: %lu\n", "DownCnvt", + task_pid_nr(osb->dc_task), osb->blocked_lock_count, + osb->dc_wake_sequence, osb->dc_work_sequence); + spin_unlock(&osb->dc_task_lock); + + spin_lock(&osb->osb_lock); + out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:", + "Recovery", + (osb->recovery_thread_task ? + task_pid_nr(osb->recovery_thread_task) : -1)); + if (rm->rm_used == 0) + out += snprintf(buf + out, len - out, " None\n"); + else { + for (i = 0; i < rm->rm_used; i++) + out += snprintf(buf + out, len - out, " %d", + rm->rm_entries[i]); + out += snprintf(buf + out, len - out, "\n"); + } + spin_unlock(&osb->osb_lock); + + out += snprintf(buf + out, len - out, + "%10s => Pid: %d Interval: %lu Needs: %d\n", "Commit", + task_pid_nr(osb->commit_task), osb->osb_commit_interval, + atomic_read(&osb->needs_checkpoint)); + + out += snprintf(buf + out, len - out, + "%10s => State: %d NumTxns: %d TxnId: %lu\n", + "Journal", osb->journal->j_state, + atomic_read(&osb->journal->j_num_trans), + osb->journal->j_trans_id); + + out += snprintf(buf + out, len - out, + "%10s => GlobalAllocs: %d LocalAllocs: %d " + "SubAllocs: %d LAWinMoves: %d SAExtends: %d\n", + "Stats", + atomic_read(&osb->alloc_stats.bitmap_data), + atomic_read(&osb->alloc_stats.local_data), + atomic_read(&osb->alloc_stats.bg_allocs), + atomic_read(&osb->alloc_stats.moves), + atomic_read(&osb->alloc_stats.bg_extends)); + + out += snprintf(buf + out, len - out, + "%10s => State: %u Descriptor: %llu Size: %u bits " + "Default: %u bits\n", + "LocalAlloc", osb->local_alloc_state, + (unsigned long long)osb->la_last_gd, + osb->local_alloc_bits, osb->local_alloc_default_bits); + + spin_lock(&osb->osb_lock); + out += snprintf(buf + out, len - out, + "%10s => Slot: %d 
NumStolen: %d\n", "Steal", + osb->s_inode_steal_slot, + atomic_read(&osb->s_num_inodes_stolen)); + spin_unlock(&osb->osb_lock); + + out += snprintf(buf + out, len - out, "%10s => %3s %10s\n", + "Slots", "Num", "RecoGen"); + + for (i = 0; i < osb->max_slots; ++i) { + out += snprintf(buf + out, len - out, + "%10s %c %3d %10d\n", + " ", + (i == osb->slot_num ? '*' : ' '), + i, osb->slot_recovery_generations[i]); + } + + return out; +} + +static int ocfs2_osb_debug_open(struct inode *inode, struct file *file) +{ + struct ocfs2_super *osb = inode->i_private; + char *buf = NULL; + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + goto bail; + + i_size_write(inode, ocfs2_osb_dump(osb, buf, PAGE_SIZE)); + + file->private_data = buf; + + return 0; +bail: + return -ENOMEM; +} + +static int ocfs2_debug_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static ssize_t ocfs2_debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + return simple_read_from_buffer(buf, nbytes, ppos, file->private_data, + i_size_read(file->f_mapping->host)); +} +#else +static int ocfs2_osb_debug_open(struct inode *inode, struct file *file) +{ + return 0; +} +static int ocfs2_debug_release(struct inode *inode, struct file *file) +{ + return 0; +} +static ssize_t ocfs2_debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + return 0; +} +#endif /* CONFIG_DEBUG_FS */ + +static struct file_operations ocfs2_osb_debug_fops = { + .open = ocfs2_osb_debug_open, + .release = ocfs2_debug_release, + .read = ocfs2_debug_read, + .llseek = generic_file_llseek, +}; + /* * write_super and sync_fs ripped right out of ext3. 
*/ @@ -926,6 +1090,16 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) goto read_super_error; } + osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR, + osb->osb_debug_root, + osb, + &ocfs2_osb_debug_fops); + if (!osb->osb_ctxt) { + status = -EINVAL; + mlog_errno(status); + goto read_super_error; + } + status = ocfs2_mount_volume(sb); if (osb->root_inode) inode = igrab(osb->root_inode); @@ -1620,6 +1794,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) osb = OCFS2_SB(sb); BUG_ON(!osb); + debugfs_remove(osb->osb_ctxt); + ocfs2_disable_quotas(osb); ocfs2_shutdown_local_alloc(osb); @@ -1742,6 +1918,12 @@ static int ocfs2_initialize_super(struct super_block *sb, bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); + osb->osb_dx_mask = (1 << (cbits - bbits)) - 1; + + for (i = 0; i < 3; i++) + osb->osb_dx_seed[i] = le32_to_cpu(di->id2.i_super.s_dx_seed[i]); + osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash); + osb->sb = sb; /* Save off for ocfs2_rw_direct */ osb->s_sectsize_bits = blksize_bits(sector_size); @@ -2130,6 +2312,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) * lock, and it's marked as dirty, set the bit in the recover * map and launch a recovery thread for it. 
*/ status = ocfs2_mark_dead_nodes(osb); + if (status < 0) { + mlog_errno(status); + goto finally; + } + + status = ocfs2_compute_replay_slots(osb); if (status < 0) mlog_errno(status); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 2563df89fc2a..15631019dc63 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -512,7 +512,7 @@ int ocfs2_calc_xattr_init(struct inode *dir, struct ocfs2_security_xattr_info *si, int *want_clusters, int *xattr_credits, - struct ocfs2_alloc_context **xattr_ac) + int *want_meta) { int ret = 0; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); @@ -554,11 +554,7 @@ int ocfs2_calc_xattr_init(struct inode *dir, if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) || (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) { - ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac); - if (ret) { - mlog_errno(ret); - return ret; - } + *want_meta = *want_meta + 1; *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; } diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 5a1ebc789f7e..1ca7e9a1b7bc 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -68,7 +68,7 @@ int ocfs2_calc_security_init(struct inode *, int *, int *, struct ocfs2_alloc_context **); int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *, int, struct ocfs2_security_xattr_info *, - int *, int *, struct ocfs2_alloc_context **); + int *, int *, int *); /* * xattrs can live inside an inode, as part of an external xattr block, diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 38e337d51ced..99e33ef40be4 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -19,6 +19,7 @@ #include <linux/kmod.h> #include <linux/ctype.h> #include <linux/genhd.h> +#include <linux/blktrace_api.h> #include "check.h" @@ -294,6 +295,9 @@ static struct attribute_group part_attr_group = { static struct attribute_group *part_attr_groups[] = { &part_attr_group, +#ifdef CONFIG_BLK_DEV_IO_TRACE + &blk_trace_attr_group, 
+#endif NULL }; diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index b446d7ad0b0d..7e14d1a04001 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -76,7 +76,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) /* * display a list of all the REGIONs the kernel knows about - * - nommu kernals have a single flat list + * - nommu kernels have a single flat list */ static int nommu_region_list_show(struct seq_file *m, void *_p) { diff --git a/fs/read_write.c b/fs/read_write.c index 6d5d8ff238aa..9d1e76bb9ee1 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -731,10 +731,16 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, return ret; } +static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) +{ +#define HALF_LONG_BITS (BITS_PER_LONG / 2) + return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; +} + SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, - unsigned long, vlen, u32, pos_high, u32, pos_low) + unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { - loff_t pos = ((loff_t)pos_high << 32) | pos_low; + loff_t pos = pos_from_hilo(pos_h, pos_l); struct file *file; ssize_t ret = -EBADF; int fput_needed; @@ -757,9 +763,9 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, } SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, - unsigned long, vlen, u32, pos_high, u32, pos_low) + unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { - loff_t pos = ((loff_t)pos_high << 32) | pos_low; + loff_t pos = pos_from_hilo(pos_h, pos_l); struct file *file; ssize_t ret = -EBADF; int fput_needed; diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index e35b54d5059d..830e3f76f442 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -22,7 +22,7 @@ config UBIFS_FS_ADVANCED_COMPR depends on UBIFS_FS help This option allows to explicitly choose which compressions, if any, - are enabled in 
UBIFS. Removing compressors means inbility to read + are enabled in UBIFS. Removing compressors means inability to read existing file systems. If unsure, say 'N'. @@ -32,7 +32,7 @@ config UBIFS_FS_LZO depends on UBIFS_FS default y help - LZO compressor is generally faster then zlib but compresses worse. + LZO compressor is generally faster than zlib but compresses worse. Say 'Y' if unsure. config UBIFS_FS_ZLIB |