diff options
| author | 2016-03-30 10:36:06 -0700 | |
|---|---|---|
| committer | 2016-03-30 10:36:06 -0700 | |
| commit | 1809de7e7d37c585e01a1bcc583ea92b78fc759d (patch) | |
| tree | 76c5b35c2b04eafce86a1a729c02ab705eba44bc /fs/btrfs | |
| parent | ARM: OMAP2+: Use srst_udelay for USB on dm814x (diff) | |
| parent | ARM: OMAP2+: hwmod: Fix updating of sysconfig register (diff) | |
| download | linux-dev-1809de7e7d37c585e01a1bcc583ea92b78fc759d.tar.xz linux-dev-1809de7e7d37c585e01a1bcc583ea92b78fc759d.zip | |
Merge tag 'for-v4.6-rc/omap-fixes-a' of git://git.kernel.org/pub/scm/linux/kernel/git/pjw/omap-pending into omap-for-v4.6/fixes
ARM: OMAP2+: first hwmod fix for v4.6-rc
Fix a longstanding bug in the hwmod code that could cause
hardware SYSCONFIG register values to not match the kernel's
idea of what they should be, and that could result in lower
performance during IP block idle entry.
Basic build, boot, and PM test logs are available here:
http://www.pwsan.com/omap/testlogs/omap-hwmod-fixes-a-for-v4.6-rc/20160326231727/
Diffstat (limited to 'fs/btrfs')
43 files changed, 1348 insertions, 788 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 88d9af3d4581..5fb60ea7eee2 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -328,8 +328,8 @@ static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq, list_add_tail(&work->ordered_list, &wq->ordered_list); spin_unlock_irqrestore(&wq->list_lock, flags); } - queue_work(wq->normal_wq, &work->normal_work); trace_btrfs_work_queued(work); + queue_work(wq->normal_wq, &work->normal_work); } void btrfs_queue_work(struct btrfs_workqueue *wq, diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index b90cd3776f8e..80e8472d618b 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -148,8 +148,7 @@ int __init btrfs_prelim_ref_init(void) void btrfs_prelim_ref_exit(void) { - if (btrfs_prelim_ref_cache) - kmem_cache_destroy(btrfs_prelim_ref_cache); + kmem_cache_destroy(btrfs_prelim_ref_cache); } /* @@ -566,17 +565,14 @@ static void __merge_refs(struct list_head *head, int mode) struct __prelim_ref *pos2 = pos1, *tmp; list_for_each_entry_safe_continue(pos2, tmp, head, list) { - struct __prelim_ref *xchg, *ref1 = pos1, *ref2 = pos2; + struct __prelim_ref *ref1 = pos1, *ref2 = pos2; struct extent_inode_elem *eie; if (!ref_for_same_block(ref1, ref2)) continue; if (mode == 1) { - if (!ref1->parent && ref2->parent) { - xchg = ref1; - ref1 = ref2; - ref2 = xchg; - } + if (!ref1->parent && ref2->parent) + swap(ref1, ref2); } else { if (ref1->parent != ref2->parent) continue; @@ -1406,7 +1402,8 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, read_extent_buffer(eb, dest + bytes_left, name_off, name_len); if (eb != eb_in) { - btrfs_tree_read_unlock_blocking(eb); + if (!path->skip_locking) + btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); } ret = btrfs_find_item(fs_root, path, parent, 0, @@ -1426,9 +1423,10 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, eb = path->nodes[0]; /* make sure we can use eb after releasing the path */ if (eb != eb_in) { - atomic_inc(&eb->refs); - btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + if (!path->skip_locking) + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + path->nodes[0] = NULL; + path->locks[0] = 0; } btrfs_release_path(path); iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 861d472564c1..e34a71b3e225 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -95,6 +95,7 @@ #include <linux/genhd.h> #include <linux/blkdev.h> #include <linux/vmalloc.h> +#include <linux/string.h> #include "ctree.h" #include "disk-io.h" #include "hash.h" @@ -105,6 +106,7 @@ #include "locking.h" #include "check-integrity.h" #include "rcu-string.h" +#include "compression.h" #define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000 #define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 @@ -176,7 +178,7 @@ struct btrfsic_block { * Elements of this type are allocated dynamically and required because * each block object can refer to and can be ref from multiple blocks. * The key to lookup them in the hashtable is the dev_bytenr of - * the block ref to plus the one from the block refered from. + * the block ref to plus the one from the block referred from. * The fact that they are searchable via a hashtable and that a * ref_cnt is maintained is not required for the btrfs integrity * check algorithm itself, it is only used to make the output more @@ -3076,7 +3078,7 @@ int btrfsic_mount(struct btrfs_root *root, list_for_each_entry(device, dev_head, dev_list) { struct btrfsic_dev_state *ds; - char *p; + const char *p; if (!device->bdev || !device->name) continue; @@ -3092,11 +3094,7 @@ int btrfsic_mount(struct btrfs_root *root, ds->state = state; bdevname(ds->bdev, ds->name); ds->name[BDEVNAME_SIZE - 1] = '\0'; - for (p = ds->name; *p != '\0'; p++); - while (p > ds->name && *p != '/') - p--; - if (*p == '/') - p++; + p = kbasename(ds->name); strlcpy(ds->name, p, sizeof(ds->name)); btrfsic_dev_state_hashtable_add(ds, &btrfsic_dev_state_hashtable); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c473c42d7d6c..3346cd8f9910 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -637,11 +637,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, faili = nr_pages - 1; cb->nr_pages = nr_pages; - /* In the parent-locked case, we only locked the range we are - * interested in. In all other cases, we can opportunistically - * cache decompressed data that goes beyond the requested range. */ - if (!(bio_flags & EXTENT_BIO_PARENT_LOCKED)) - add_ra_bio_pages(inode, em_start + em_len, cb); + add_ra_bio_pages(inode, em_start + em_len, cb); /* include any pages we added in add_ra-bio_pages */ uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 13a4dc0436c9..f49d8b8c0f00 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -48,6 +48,15 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt, unsigned long pg_index, unsigned long pg_offset); + +enum btrfs_compression_type { + BTRFS_COMPRESS_NONE = 0, + BTRFS_COMPRESS_ZLIB = 1, + BTRFS_COMPRESS_LZO = 2, + BTRFS_COMPRESS_TYPES = 2, + BTRFS_COMPRESS_LAST = 3, +}; + struct btrfs_compress_op { struct list_head *(*alloc_workspace)(void); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 769e0ff1b4ce..77592931ab4f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -311,7 +311,7 @@ struct tree_mod_root { struct tree_mod_elem { struct rb_node node; - u64 index; /* shifted logical */ + u64 logical; u64 seq; enum mod_log_op op; @@ -435,11 +435,11 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, /* * key order of the log: - * index -> sequence + * node/leaf start address -> sequence * - * the index is the shifted logical of the *new* root node for root replace - * operations, or the shifted logical of the affected block for all other - * operations. + * The 'start address' is the logical address of the *new* root node + * for root replace operations, or the logical address of the affected + * block for all other operations. * * Note: must be called with write lock (tree_mod_log_write_lock). */ @@ -460,9 +460,9 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) while (*new) { cur = container_of(*new, struct tree_mod_elem, node); parent = *new; - if (cur->index < tm->index) + if (cur->logical < tm->logical) new = &((*new)->rb_left); - else if (cur->index > tm->index) + else if (cur->logical > tm->logical) new = &((*new)->rb_right); else if (cur->seq < tm->seq) new = &((*new)->rb_left); @@ -523,7 +523,7 @@ alloc_tree_mod_elem(struct extent_buffer *eb, int slot, if (!tm) return NULL; - tm->index = eb->start >> PAGE_CACHE_SHIFT; + tm->logical = eb->start; if (op != MOD_LOG_KEY_ADD) { btrfs_node_key(eb, &tm->key, slot); tm->blockptr = btrfs_node_blockptr(eb, slot); @@ -588,7 +588,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, goto free_tms; } - tm->index = eb->start >> PAGE_CACHE_SHIFT; + tm->logical = eb->start; tm->slot = src_slot; tm->move.dst_slot = dst_slot; tm->move.nr_items = nr_items; @@ -699,7 +699,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, goto free_tms; } - tm->index = new_root->start >> PAGE_CACHE_SHIFT; + tm->logical = new_root->start; tm->old_root.logical = old_root->start; tm->old_root.level = btrfs_header_level(old_root); tm->generation = btrfs_header_generation(old_root); @@ -739,16 +739,15 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, struct rb_node *node; struct tree_mod_elem *cur = NULL; struct tree_mod_elem *found = NULL; - u64 index = start >> PAGE_CACHE_SHIFT; tree_mod_log_read_lock(fs_info); tm_root = &fs_info->tree_mod_log; node = tm_root->rb_node; while (node) { cur = container_of(node, struct tree_mod_elem, node); - if (cur->index < index) { + if (cur->logical < start) { node = node->rb_left; - } else if (cur->index > index) { + } else if (cur->logical > start) { node = node->rb_right; } else if (cur->seq < min_seq) { node = node->rb_left; @@ -1230,9 +1229,10 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, return NULL; /* - * the very last operation that's logged for a root is the replacement - * operation (if it is replaced at all). this has the index of the *new* - * root, making it the very first operation that's logged for this root. + * the very last operation that's logged for a root is the + * replacement operation (if it is replaced at all). this has + * the logical address of the *new* root, making it the very + * first operation that's logged for this root. */ while (1) { tm = tree_mod_log_search_oldest(fs_info, root_logical, @@ -1336,7 +1336,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, if (!next) break; tm = container_of(next, struct tree_mod_elem, node); - if (tm->index != first_tm->index) + if (tm->logical != first_tm->logical) break; } tree_mod_log_read_unlock(fs_info); @@ -5361,7 +5361,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; } - tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS); + tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL); if (!tmp_buf) { ret = -ENOMEM; goto out; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index bfe4a337fb4d..84a6a5b3384a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -100,6 +100,9 @@ struct btrfs_ordered_sum; /* tracks free space in block groups. */ #define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL +/* device stats in the device tree */ +#define BTRFS_DEV_STATS_OBJECTID 0ULL + /* for storing balance parameters in the root tree */ #define BTRFS_BALANCE_OBJECTID -4ULL @@ -715,14 +718,6 @@ struct btrfs_timespec { __le32 nsec; } __attribute__ ((__packed__)); -enum btrfs_compression_type { - BTRFS_COMPRESS_NONE = 0, - BTRFS_COMPRESS_ZLIB = 1, - BTRFS_COMPRESS_LZO = 2, - BTRFS_COMPRESS_TYPES = 2, - BTRFS_COMPRESS_LAST = 3, -}; - struct btrfs_inode_item { /* nfs style generation number */ __le64 generation; @@ -793,7 +788,7 @@ struct btrfs_root_item { /* * This generation number is used to test if the new fields are valid - * and up to date while reading the root item. Everytime the root item + * and up to date while reading the root item. Every time the root item * is written out, the "generation" field is copied into this field. If * anyone ever mounted the fs with an older kernel, we will have * mismatching generation values here and thus must invalidate the @@ -1002,8 +997,10 @@ struct btrfs_dev_replace { pid_t lock_owner; atomic_t nesting_level; struct mutex lock_finishing_cancel_unmount; - struct mutex lock_management_lock; - struct mutex lock; + rwlock_t lock; + atomic_t read_locks; + atomic_t blocking_readers; + wait_queue_head_t read_lock_wq; struct btrfs_scrub_progress scrub_progress; }; @@ -1222,10 +1219,10 @@ struct btrfs_space_info { * we've called update_block_group and dropped the bytes_used counter * and increased the bytes_pinned counter. However this means that * bytes_pinned does not reflect the bytes that will be pinned once the - * delayed refs are flushed, so this counter is inc'ed everytime we call - * btrfs_free_extent so it is a realtime count of what will be freed - * once the transaction is committed. It will be zero'ed everytime the - * transaction commits. + * delayed refs are flushed, so this counter is inc'ed every time we + * call btrfs_free_extent so it is a realtime count of what will be + * freed once the transaction is committed. It will be zero'ed every + * time the transaction commits. */ struct percpu_counter total_bytes_pinned; @@ -1822,6 +1819,9 @@ struct btrfs_fs_info { spinlock_t reada_lock; struct radix_tree_root reada_tree; + /* readahead works cnt */ + atomic_t reada_works_cnt; + /* Extent buffer radix tree */ spinlock_t buffer_lock; struct radix_tree_root buffer_radix; @@ -2185,13 +2185,43 @@ struct btrfs_ioctl_defrag_range_args { */ #define BTRFS_QGROUP_RELATION_KEY 246 +/* + * Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY. + */ #define BTRFS_BALANCE_ITEM_KEY 248 /* - * Persistantly stores the io stats in the device tree. - * One key for all stats, (0, BTRFS_DEV_STATS_KEY, devid). + * The key type for tree items that are stored persistently, but do not need to + * exist for extended period of time. The items can exist in any tree. + * + * [subtype, BTRFS_TEMPORARY_ITEM_KEY, data] + * + * Existing items: + * + * - balance status item + * (BTRFS_BALANCE_OBJECTID, BTRFS_TEMPORARY_ITEM_KEY, 0) */ -#define BTRFS_DEV_STATS_KEY 249 +#define BTRFS_TEMPORARY_ITEM_KEY 248 + +/* + * Obsolete name, see BTRFS_PERSISTENT_ITEM_KEY + */ +#define BTRFS_DEV_STATS_KEY 249 + +/* + * The key type for tree items that are stored persistently and usually exist + * for a long period, eg. filesystem lifetime. The item kinds can be status + * information, stats or preference values. The item can exist in any tree. + * + * [subtype, BTRFS_PERSISTENT_ITEM_KEY, data] + * + * Existing items: + * + * - device statistics, store IO stats in the device tree, one key for all + * stats + * (BTRFS_DEV_STATS_OBJECTID, BTRFS_DEV_STATS_KEY, 0) + */ +#define BTRFS_PERSISTENT_ITEM_KEY 249 /* * Persistantly stores the device replace state in the device tree. @@ -2241,7 +2271,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) #define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) -#define BTRFS_MOUNT_RECOVERY (1 << 18) +#define BTRFS_MOUNT_USEBACKUPROOT (1 << 18) #define BTRFS_MOUNT_SKIP_BALANCE (1 << 19) #define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20) #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) @@ -2250,9 +2280,10 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24) #define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25) #define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26) +#define BTRFS_MOUNT_NOLOGREPLAY (1 << 27) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) -#define BTRFS_DEFAULT_MAX_INLINE (8192) +#define BTRFS_DEFAULT_MAX_INLINE (2048) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -2353,6 +2384,9 @@ struct btrfs_map_token { unsigned long offset; }; +#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ + ((bytes) >> (fs_info)->sb->s_blocksize_bits) + static inline void btrfs_init_map_token (struct btrfs_map_token *token) { token->kaddr = NULL; @@ -3448,8 +3482,7 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes); static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, unsigned num_items) { - return (root->nodesize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * - 2 * num_items; + return root->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; } /* @@ -4027,7 +4060,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *dir, u64 objectid, const char *name, int name_len); -int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, +int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, int front); int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -4089,6 +4122,7 @@ void btrfs_test_inode_set_ops(struct inode *inode); /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_update_iflags(struct inode *inode); void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); int btrfs_is_empty_uuid(u8 *uuid); @@ -4151,7 +4185,8 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info); ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); /* super.c */ -int btrfs_parse_options(struct btrfs_root *root, char *options); +int btrfs_parse_options(struct btrfs_root *root, char *options, + unsigned long new_flags); int btrfs_sync_fs(struct super_block *sb, int wait); #ifdef CONFIG_PRINTK @@ -4525,8 +4560,8 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, struct btrfs_key *start, struct btrfs_key *end); int btrfs_reada_wait(void *handle); void btrfs_reada_detach(void *handle); -int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, - u64 start, int err); +int btree_readahead_hook(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, u64 start, int err); static inline int is_fstree(u64 rootid) { diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0be47e4b8136..6cef0062f929 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -43,8 +43,7 @@ int __init btrfs_delayed_inode_init(void) void btrfs_delayed_inode_exit(void) { - if (delayed_node_cache) - kmem_cache_destroy(delayed_node_cache); + kmem_cache_destroy(delayed_node_cache); } static inline void btrfs_init_delayed_node( @@ -651,9 +650,14 @@ static int btrfs_delayed_inode_reserve_metadata( goto out; ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - if (!WARN_ON(ret)) + if (!ret) goto out; + if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + btrfs_debug(root->fs_info, + "block rsv migrate returned %d", ret); + WARN_ON(1); + } /* * Ok this is a problem, let's just steal from the global rsv * since this really shouldn't happen that often. @@ -1689,7 +1693,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list, * */ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, - struct list_head *ins_list) + struct list_head *ins_list, bool *emitted) { struct btrfs_dir_item *di; struct btrfs_delayed_item *curr, *next; @@ -1733,6 +1737,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, if (over) return 1; + *emitted = true; } return 0; } diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index f70119f25421..0167853c84ae 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -144,7 +144,7 @@ void btrfs_put_delayed_items(struct list_head *ins_list, int btrfs_should_delete_dir_index(struct list_head *del_list, u64 index); int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, - struct list_head *ins_list); + struct list_head *ins_list, bool *emitted); /* for init */ int __init btrfs_delayed_inode_init(void); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 914ac13bd92f..430b3689b112 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -929,14 +929,10 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) void btrfs_delayed_ref_exit(void) { - if (btrfs_delayed_ref_head_cachep) - kmem_cache_destroy(btrfs_delayed_ref_head_cachep); - if (btrfs_delayed_tree_ref_cachep) - kmem_cache_destroy(btrfs_delayed_tree_ref_cachep); - if (btrfs_delayed_data_ref_cachep) - kmem_cache_destroy(btrfs_delayed_data_ref_cachep); - if (btrfs_delayed_extent_op_cachep) - kmem_cache_destroy(btrfs_delayed_extent_op_cachep); + kmem_cache_destroy(btrfs_delayed_ref_head_cachep); + kmem_cache_destroy(btrfs_delayed_tree_ref_cachep); + kmem_cache_destroy(btrfs_delayed_data_ref_cachep); + kmem_cache_destroy(btrfs_delayed_extent_op_cachep); } int btrfs_delayed_ref_init(void) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index cbb7dbfb3fff..a1d6652e0c47 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -202,13 +202,13 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, struct btrfs_dev_replace_item *ptr; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - btrfs_dev_replace_lock(dev_replace); + btrfs_dev_replace_lock(dev_replace, 0); if (!dev_replace->is_valid || !dev_replace->item_needs_writeback) { - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 0); return 0; } - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 0); key.objectid = 0; key.type = BTRFS_DEV_REPLACE_KEY; @@ -264,7 +264,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_replace_item); - btrfs_dev_replace_lock(dev_replace); + btrfs_dev_replace_lock(dev_replace, 1); if (dev_replace->srcdev) btrfs_set_dev_replace_src_devid(eb, ptr, dev_replace->srcdev->devid); @@ -287,7 +287,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, btrfs_set_dev_replace_cursor_right(eb, ptr, dev_replace->cursor_right); dev_replace->item_needs_writeback = 0; - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 1); btrfs_mark_buffer_dirty(eb); @@ -356,7 +356,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, return PTR_ERR(trans); } - btrfs_dev_replace_lock(dev_replace); + btrfs_dev_replace_lock(dev_replace, 1); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: @@ -395,7 +395,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, dev_replace->is_valid = 1; dev_replace->item_needs_writeback = 1; args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 1); ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); if (ret) @@ -407,7 +407,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_dev_replace_lock(dev_replace); + btrfs_dev_replace_lock(dev_replace, 1); goto leave; } @@ -433,7 +433,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, leave: dev_replace->srcdev = NULL; dev_replace->tgtdev = NULL; - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 1); btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); return ret; } @@ -471,18 +471,18 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, /* don't allow cancel or unmount to disturb the finishing procedure */ mutex_lock(&dev_replace->lock_finishing_cancel_unmount); - btrfs_dev_replace_lock(dev_replace); + btrfs_dev_replace_lock(dev_replace, 0); /* was the operation canceled, or is it finished? */ if (dev_replace->replace_state != BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 0); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return 0; } tgt_device = dev_replace->tgtdev; src_device = dev_replace->srcdev; - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 0); /* * flush all outstanding I/O and inode extent mappings before the @@ -507,7 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, /* keep away write_all_supers() during the finishing procedure */ mutex_lock(&root->fs_info->fs_devices->device_list_mutex); mutex_lock(&root->fs_info->chunk_mutex); - btrfs_dev_replace_lock(dev_replace); + btrfs_dev_replace_lock(dev_replace, 1); dev_replace->replace_state = scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; @@ -528,7 +528,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, rcu_str_deref(src_device->name), src_device->devid, rcu_str_deref(tgt_device->name), scrub_ret); - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 1); mutex_unlock(&root->fs_info->chunk_mutex); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); mutex_unlock(&uuid_mutex); @@ -565,7 +565,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); fs_info->fs_devices->rw_devices++; - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 1); btrfs_rm_dev_replace_blocked(fs_info); @@ -649,7 +649,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; struct btrfs_device *srcdev; - btrfs_dev_replace_lock(dev_replace); + btrfs_dev_replace_lock(dev_replace, 0); /* even if !dev_replace_is_valid, the values are good enough for * the replace_status ioctl */ args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; @@ -675,7 +675,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, div_u64(btrfs_device_get_total_bytes(srcdev), 1000)); break; } - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 0); } int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, @@ -698,13 +698,13 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) return -EROFS; mutex_lock(&dev_replace->lock_finishing_cancel_unmount); - btrfs_dev_replace_lock(dev_replace); + btrfs_dev_replace_lock(dev_replace, 1); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 1); goto leave; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: @@ -717,7 +717,7 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; dev_replace->time_stopped = get_seconds(); dev_replace->item_needs_writeback = 1; - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 1); btrfs_scrub_cancel(fs_info); trans = btrfs_start_transaction(root, 0); @@ -740,7 +740,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; mutex_lock(&dev_replace->lock_finishing_cancel_unmount); - btrfs_dev_replace_lock(dev_replace); + btrfs_dev_replace_lock(dev_replace, 1); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: @@ -756,7 +756,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) break; } - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 1); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); } @@ -766,12 +766,12 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) struct task_struct *task; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - btrfs_dev_replace_lock(dev_replace); + btrfs_dev_replace_lock(dev_replace, 1); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 1); return 0; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: break; @@ -784,10 +784,10 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing"); btrfs_info(fs_info, "you may cancel the operation after 'mount -o degraded'"); - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 1); return 0; } - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 1); WARN_ON(atomic_xchg( &fs_info->mutually_exclusive_operation_running, 1)); @@ -802,7 +802,7 @@ static int btrfs_dev_replace_kthread(void *data) struct btrfs_ioctl_dev_replace_args *status_args; u64 progress; - status_args = kzalloc(sizeof(*status_args), GFP_NOFS); + status_args = kzalloc(sizeof(*status_args), GFP_KERNEL); if (status_args) { btrfs_dev_replace_status(fs_info, status_args); progress = status_args->status.progress_1000; @@ -858,55 +858,65 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) * not called and the the filesystem is remounted * in degraded state. This does not stop the * dev_replace procedure. It needs to be canceled - * manually if the cancelation is wanted. + * manually if the cancellation is wanted. */ break; } return 1; } -void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace) +void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw) { - /* the beginning is just an optimization for the typical case */ - if (atomic_read(&dev_replace->nesting_level) == 0) { -acquire_lock: - /* this is not a nested case where the same thread - * is trying to acqurire the same lock twice */ - mutex_lock(&dev_replace->lock); - mutex_lock(&dev_replace->lock_management_lock); - dev_replace->lock_owner = current->pid; - atomic_inc(&dev_replace->nesting_level); - mutex_unlock(&dev_replace->lock_management_lock); - return; + if (rw == 1) { + /* write */ +again: + wait_event(dev_replace->read_lock_wq, + atomic_read(&dev_replace->blocking_readers) == 0); + write_lock(&dev_replace->lock); + if (atomic_read(&dev_replace->blocking_readers)) { + write_unlock(&dev_replace->lock); + goto again; + } + } else { + read_lock(&dev_replace->lock); + atomic_inc(&dev_replace->read_locks); } +} - mutex_lock(&dev_replace->lock_management_lock); - if (atomic_read(&dev_replace->nesting_level) > 0 && - dev_replace->lock_owner == current->pid) { - WARN_ON(!mutex_is_locked(&dev_replace->lock)); - atomic_inc(&dev_replace->nesting_level); - mutex_unlock(&dev_replace->lock_management_lock); - return; +void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw) +{ + if (rw == 1) { + /* write */ + ASSERT(atomic_read(&dev_replace->blocking_readers) == 0); + write_unlock(&dev_replace->lock); + } else { + ASSERT(atomic_read(&dev_replace->read_locks) > 0); + atomic_dec(&dev_replace->read_locks); + read_unlock(&dev_replace->lock); } +} - mutex_unlock(&dev_replace->lock_management_lock); - goto acquire_lock; +/* inc blocking cnt and release read lock */ +void btrfs_dev_replace_set_lock_blocking( + struct btrfs_dev_replace *dev_replace) +{ + /* only set blocking for read lock */ + ASSERT(atomic_read(&dev_replace->read_locks) > 0); + atomic_inc(&dev_replace->blocking_readers); + read_unlock(&dev_replace->lock); } -void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace) +/* acquire read lock and dec blocking cnt */ +void btrfs_dev_replace_clear_lock_blocking( + struct btrfs_dev_replace *dev_replace) { - WARN_ON(!mutex_is_locked(&dev_replace->lock)); - mutex_lock(&dev_replace->lock_management_lock); - WARN_ON(atomic_read(&dev_replace->nesting_level) < 1); - WARN_ON(dev_replace->lock_owner != current->pid); - atomic_dec(&dev_replace->nesting_level); - if (atomic_read(&dev_replace->nesting_level) == 0) { - dev_replace->lock_owner = 0; - mutex_unlock(&dev_replace->lock_management_lock); - mutex_unlock(&dev_replace->lock); - } else { - mutex_unlock(&dev_replace->lock_management_lock); - } + /* only set blocking for read lock */ + ASSERT(atomic_read(&dev_replace->read_locks) > 0); + ASSERT(atomic_read(&dev_replace->blocking_readers) > 0); + read_lock(&dev_replace->lock); + if (atomic_dec_and_test(&dev_replace->blocking_readers) && + waitqueue_active(&dev_replace->read_lock_wq)) + wake_up(&dev_replace->read_lock_wq); } void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 20035cbbf021..29e3ef5f96bd 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -34,8 +34,11 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace); +void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw); +void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw); +void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace); +void btrfs_dev_replace_clear_lock_blocking( + struct btrfs_dev_replace *dev_replace); static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index dd08e29f5117..4b02591b0301 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -50,6 +50,7 @@ #include "raid56.h" #include "sysfs.h" #include "qgroup.h" +#include "compression.h" #ifdef CONFIG_X86 #include <asm/cpufeature.h> @@ -110,8 +111,7 @@ int __init btrfs_end_io_wq_init(void) void btrfs_end_io_wq_exit(void) { - if (btrfs_end_io_wq_cache) - kmem_cache_destroy(btrfs_end_io_wq_cache); + kmem_cache_destroy(btrfs_end_io_wq_cache); } /* @@ -182,6 +182,7 @@ static struct btrfs_lockdep_keyset { { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" }, { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" }, { .id = BTRFS_UUID_TREE_OBJECTID, .name_stem = "uuid" }, + { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, .name_stem = "free-space" }, { .id = 0, .name_stem = "tree" }, }; @@ -611,6 +612,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, int found_level; struct extent_buffer *eb; struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; int reads_done; @@ -636,21 +638,21 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { - btrfs_err_rl(eb->fs_info, "bad tree block start %llu %llu", - found_start, eb->start); + btrfs_err_rl(fs_info, "bad tree block start %llu %llu", + found_start, eb->start); ret = -EIO; goto err; } - if (check_tree_block_fsid(root->fs_info, eb)) { - btrfs_err_rl(eb->fs_info, "bad fsid on block %llu", - eb->start); + if (check_tree_block_fsid(fs_info, eb)) { + btrfs_err_rl(fs_info, "bad fsid on block %llu", + eb->start); ret = -EIO; goto err; } found_level = btrfs_header_level(eb); if (found_level >= BTRFS_MAX_LEVEL) { - btrfs_err(root->fs_info, "bad tree block level %d", - (int)btrfs_header_level(eb)); + btrfs_err(fs_info, "bad tree block level %d", + (int)btrfs_header_level(eb)); ret = -EIO; goto err; } @@ -658,7 +660,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, found_level); - ret = csum_tree_block(root->fs_info, eb, 1); + ret = csum_tree_block(fs_info, eb, 1); if (ret) { ret = -EIO; goto err; @@ -679,7 +681,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, err: if (reads_done && test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(root, eb, eb->start, ret); + btree_readahead_hook(fs_info, eb, eb->start, ret); if (ret) { /* @@ -698,14 +700,13 @@ out: static int btree_io_failed_hook(struct page *page, int failed_mirror) { struct extent_buffer *eb; - struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; eb = (struct extent_buffer *)page->private; set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = failed_mirror; atomic_dec(&eb->io_pages); if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(root, eb, eb->start, -EIO); + btree_readahead_hook(eb->fs_info, eb, eb->start, -EIO); return -EIO; /* we fixed nothing */ } @@ -815,7 +816,7 @@ static void run_one_async_done(struct btrfs_work *work) waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); - /* If an error occured we just want to clean up the bio and move on */ + /* If an error occurred we just want to clean up the bio and move on */ if (async->error) { async->bio->bi_error = async->error; bio_endio(async->bio); @@ -930,7 +931,7 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags) if (bio_flags & EXTENT_BIO_TREE_LOG) return 0; #ifdef CONFIG_X86 - if (static_cpu_has_safe(X86_FEATURE_XMM4_2)) + if (static_cpu_has(X86_FEATURE_XMM4_2)) return 0; #endif return 1; @@ -1295,9 +1296,10 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize, spin_lock_init(&root->root_item_lock); } -static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) +static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, + gfp_t flags) { - struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS); + struct btrfs_root *root = kzalloc(sizeof(*root), flags); if (root) root->fs_info = fs_info; return root; @@ -1309,7 +1311,7 @@ struct btrfs_root *btrfs_alloc_dummy_root(void) { struct btrfs_root *root; - root = btrfs_alloc_root(NULL); + root = btrfs_alloc_root(NULL, GFP_KERNEL); if (!root) return ERR_PTR(-ENOMEM); __setup_root(4096, 4096, 4096, root, NULL, 1); @@ -1331,7 +1333,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, int ret = 0; uuid_le uuid; - root = btrfs_alloc_root(fs_info); + root = btrfs_alloc_root(fs_info, GFP_KERNEL); if (!root) return ERR_PTR(-ENOMEM); @@ -1407,7 +1409,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root = fs_info->tree_root; struct extent_buffer *leaf; - root = btrfs_alloc_root(fs_info); + root = btrfs_alloc_root(fs_info, GFP_NOFS); if (!root) return ERR_PTR(-ENOMEM); @@ -1505,7 +1507,7 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, if (!path) return ERR_PTR(-ENOMEM); - root = btrfs_alloc_root(fs_info); + root = btrfs_alloc_root(fs_info, GFP_NOFS); if (!root) { ret = -ENOMEM; goto alloc_fail; @@ -1787,7 +1789,6 @@ static int cleaner_kthread(void *arg) int again; struct btrfs_trans_handle *trans; - set_freezable(); do { again = 0; @@ -2272,9 +2273,11 @@ static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info) fs_info->dev_replace.lock_owner = 0; atomic_set(&fs_info->dev_replace.nesting_level, 0); mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); - mutex_init(&fs_info->dev_replace.lock_management_lock); - mutex_init(&fs_info->dev_replace.lock); + rwlock_init(&fs_info->dev_replace.lock); + atomic_set(&fs_info->dev_replace.read_locks, 0); + atomic_set(&fs_info->dev_replace.blocking_readers, 0); init_waitqueue_head(&fs_info->replace_wait); + init_waitqueue_head(&fs_info->dev_replace.read_lock_wq); } static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) @@ -2385,7 +2388,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, return -EIO; } - log_tree_root = btrfs_alloc_root(fs_info); + log_tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); if (!log_tree_root) return -ENOMEM; @@ -2510,8 +2513,8 @@ int open_ctree(struct super_block *sb, int backup_index = 0; int max_active; - tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); - chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); + tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); + chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL); if (!tree_root || !chunk_root) { err = -ENOMEM; goto fail; @@ -2603,6 +2606,7 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->nr_async_bios, 0); atomic_set(&fs_info->defrag_running, 0); atomic_set(&fs_info->qgroup_op_seq, 0); + atomic_set(&fs_info->reada_works_cnt, 0); atomic64_set(&fs_info->tree_mod_seq, 0); fs_info->sb = sb; fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; @@ -2622,7 +2626,7 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->ordered_roots); spin_lock_init(&fs_info->ordered_root_lock); fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), - GFP_NOFS); + GFP_KERNEL); if (!fs_info->delayed_root) { err = -ENOMEM; goto fail_iput; @@ -2750,7 +2754,7 @@ int open_ctree(struct super_block *sb, */ fs_info->compress_type = BTRFS_COMPRESS_ZLIB; - ret = btrfs_parse_options(tree_root, options); + ret = btrfs_parse_options(tree_root, options, sb->s_flags); if (ret) { err = ret; goto fail_alloc; @@ -3029,8 +3033,9 @@ retry_root_backup: if (ret) goto fail_trans_kthread; - /* do not make disk changes in broken FS */ - if (btrfs_super_log_root(disk_super) != 0) { + /* do not make disk changes in broken FS or nologreplay is given */ + if (btrfs_super_log_root(disk_super) != 0 && + !btrfs_test_opt(tree_root, NOLOGREPLAY)) { ret = btrfs_replay_log(fs_info, fs_devices); if (ret) { err = ret; @@ -3146,6 +3151,12 @@ retry_root_backup: fs_info->open = 1; + /* + * backuproot only affect mount behavior, and if open_ctree succeeded, + * no need to keep the flag + */ + btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT); + return 0; fail_qgroup: @@ -3200,7 +3211,7 @@ fail: return err; recovery_tree_root: - if (!btrfs_test_opt(tree_root, RECOVERY)) + if (!btrfs_test_opt(tree_root, USEBACKUPROOT)) goto fail_tree_roots; free_root_pointers(fs_info, 0); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e2287c7c10be..53e12977bfd0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4838,7 +4838,7 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, u64 thresh = div_factor_fine(space_info->total_bytes, 98); /* If we're just plain full then async reclaim just slows us down. */ - if (space_info->bytes_used >= thresh) + if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) return 0; return (used >= thresh && !btrfs_fs_closing(fs_info) && @@ -5373,27 +5373,33 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) block_rsv->size = min_t(u64, num_bytes, SZ_512M); - num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + - sinfo->bytes_reserved + sinfo->bytes_readonly + - sinfo->bytes_may_use; - - if (sinfo->total_bytes > num_bytes) { - num_bytes = sinfo->total_bytes - num_bytes; - block_rsv->reserved += num_bytes; - sinfo->bytes_may_use += num_bytes; - trace_btrfs_space_reservation(fs_info, "space_info", - sinfo->flags, num_bytes, 1); - } - - if (block_rsv->reserved >= block_rsv->size) { + if (block_rsv->reserved < block_rsv->size) { + num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + + sinfo->bytes_reserved + sinfo->bytes_readonly + + sinfo->bytes_may_use; + if (sinfo->total_bytes > num_bytes) { + num_bytes = sinfo->total_bytes - num_bytes; + num_bytes = min(num_bytes, + block_rsv->size - block_rsv->reserved); + block_rsv->reserved += num_bytes; + sinfo->bytes_may_use += num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + sinfo->flags, num_bytes, + 1); + } + } else if (block_rsv->reserved > block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; sinfo->bytes_may_use -= num_bytes; trace_btrfs_space_reservation(fs_info, "space_info", sinfo->flags, num_bytes, 0); block_rsv->reserved = block_rsv->size; - block_rsv->full = 1; } + if (block_rsv->reserved == block_rsv->size) + block_rsv->full = 1; + else + block_rsv->full = 0; + spin_unlock(&block_rsv->lock); spin_unlock(&sinfo->lock); } @@ -5752,7 +5758,7 @@ out_fail: /* * This is tricky, but first we need to figure out how much we - * free'd from any free-ers that occured during this + * free'd from any free-ers that occurred during this * reservation, so we reset ->csum_bytes to the csum_bytes * before we dropped our lock, and then call the free for the * number of bytes that were freed while we were trying our @@ -7018,7 +7024,7 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, int delalloc) { - struct btrfs_block_group_cache *used_bg; + struct btrfs_block_group_cache *used_bg = NULL; bool locked = false; again: spin_lock(&cluster->refill_lock); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2e7c97a3f344..76a0c8597d98 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -206,10 +206,8 @@ void extent_io_exit(void) * destroy caches. */ rcu_barrier(); - if (extent_state_cache) - kmem_cache_destroy(extent_state_cache); - if (extent_buffer_cache) - kmem_cache_destroy(extent_buffer_cache); + kmem_cache_destroy(extent_state_cache); + kmem_cache_destroy(extent_buffer_cache); if (btrfs_bioset) bioset_free(btrfs_bioset); } @@ -232,7 +230,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) if (!state) return state; state->state = 0; - state->private = 0; + state->failrec = NULL; RB_CLEAR_NODE(&state->rb_node); btrfs_leak_debug_add(&state->leak_list, &states); atomic_set(&state->refs, 1); @@ -1844,7 +1842,8 @@ out: * set the private field for a given byte offset in the tree. If there isn't * an extent_state there already, this does nothing. */ -static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) +static noinline int set_state_failrec(struct extent_io_tree *tree, u64 start, + struct io_failure_record *failrec) { struct rb_node *node; struct extent_state *state; @@ -1865,13 +1864,14 @@ static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private ret = -ENOENT; goto out; } - state->private = private; + state->failrec = failrec; out: spin_unlock(&tree->lock); return ret; } -int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) +static noinline int get_state_failrec(struct extent_io_tree *tree, u64 start, + struct io_failure_record **failrec) { struct rb_node *node; struct extent_state *state; @@ -1892,7 +1892,7 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) ret = -ENOENT; goto out; } - *private = state->private; + *failrec = state->failrec; out: spin_unlock(&tree->lock); return ret; @@ -1972,7 +1972,7 @@ int free_io_failure(struct inode *inode, struct io_failure_record *rec) int err = 0; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - set_state_private(failure_tree, rec->start, 0); + set_state_failrec(failure_tree, rec->start, NULL); ret = clear_extent_bits(failure_tree, rec->start, rec->start + rec->len - 1, EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); @@ -2089,7 +2089,6 @@ int clean_io_failure(struct inode *inode, u64 start, struct page *page, unsigned int pg_offset) { u64 private; - u64 private_failure; struct io_failure_record *failrec; struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct extent_state *state; @@ -2102,12 +2101,11 @@ int clean_io_failure(struct inode *inode, u64 start, struct page *page, if (!ret) return 0; - ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start, - &private_failure); + ret = get_state_failrec(&BTRFS_I(inode)->io_failure_tree, start, + &failrec); if (ret) return 0; - failrec = (struct io_failure_record *)(unsigned long) private_failure; BUG_ON(!failrec->this_mirror); if (failrec->in_validation) { @@ -2167,7 +2165,7 @@ void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end) next = next_state(state); - failrec = (struct io_failure_record *)(unsigned long)state->private; + failrec = state->failrec; free_extent_state(state); kfree(failrec); @@ -2177,10 +2175,9 @@ void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end) } int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, - struct io_failure_record **failrec_ret) + struct io_failure_record **failrec_ret) { struct io_failure_record *failrec; - u64 private; struct extent_map *em; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; @@ -2188,7 +2185,7 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, int ret; u64 logical; - ret = get_state_private(failure_tree, start, &private); + ret = get_state_failrec(failure_tree, start, &failrec); if (ret) { failrec = kzalloc(sizeof(*failrec), GFP_NOFS); if (!failrec) @@ -2237,8 +2234,7 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, ret = set_extent_bits(failure_tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); if (ret >= 0) - ret = set_state_private(failure_tree, start, - (u64)(unsigned long)failrec); + ret = set_state_failrec(failure_tree, start, failrec); /* set the bits in the inode's tree */ if (ret >= 0) ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED, @@ -2248,7 +2244,6 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, return ret; } } else { - failrec = (struct io_failure_record *)(unsigned long)private; pr_debug("Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d\n", failrec->logical, failrec->start, failrec->len, failrec->in_validation); @@ -2897,12 +2892,11 @@ static int __do_readpage(struct extent_io_tree *tree, struct block_device *bdev; int ret; int nr = 0; - int parent_locked = *bio_flags & EXTENT_BIO_PARENT_LOCKED; size_t pg_offset = 0; size_t iosize; size_t disk_io_size; size_t blocksize = inode->i_sb->s_blocksize; - unsigned long this_bio_flag = *bio_flags & EXTENT_BIO_PARENT_LOCKED; + unsigned long this_bio_flag = 0; set_page_extent_mapped(page); @@ -2942,18 +2936,16 @@ static int __do_readpage(struct extent_io_tree *tree, kunmap_atomic(userpage); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); - if (!parent_locked) - unlock_extent_cached(tree, cur, - cur + iosize - 1, - &cached, GFP_NOFS); + unlock_extent_cached(tree, cur, + cur + iosize - 1, + &cached, GFP_NOFS); break; } em = __get_extent_map(inode, page, pg_offset, cur, end - cur + 1, get_extent, em_cached); if (IS_ERR_OR_NULL(em)) { SetPageError(page); - if (!parent_locked) - unlock_extent(tree, cur, end); + unlock_extent(tree, cur, end); break; } extent_offset = cur - em->start; @@ -3038,12 +3030,9 @@ static int __do_readpage(struct extent_io_tree *tree, set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); - if (parent_locked) - free_extent_state(cached); - else - unlock_extent_cached(tree, cur, - cur + iosize - 1, - &cached, GFP_NOFS); + unlock_extent_cached(tree, cur, + cur + iosize - 1, + &cached, GFP_NOFS); cur = cur + iosize; pg_offset += iosize; continue; @@ -3052,8 +3041,7 @@ static int __do_readpage(struct extent_io_tree *tree, if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1, NULL)) { check_page_uptodate(tree, page); - if (!parent_locked) - unlock_extent(tree, cur, cur + iosize - 1); + unlock_extent(tree, cur, cur + iosize - 1); cur = cur + iosize; pg_offset += iosize; continue; @@ -3063,8 +3051,7 @@ static int __do_readpage(struct extent_io_tree *tree, */ if (block_start == EXTENT_MAP_INLINE) { SetPageError(page); - if (!parent_locked) - unlock_extent(tree, cur, cur + iosize - 1); + unlock_extent(tree, cur, cur + iosize - 1); cur = cur + iosize; pg_offset += iosize; continue; @@ -3083,8 +3070,7 @@ static int __do_readpage(struct extent_io_tree *tree, *bio_flags = this_bio_flag; } else { SetPageError(page); - if (!parent_locked) - unlock_extent(tree, cur, cur + iosize - 1); + unlock_extent(tree, cur, cur + iosize - 1); } cur = cur + iosize; pg_offset += iosize; @@ -3186,7 +3172,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, while (1) { lock_extent(tree, start, end); - ordered = btrfs_lookup_ordered_extent(inode, start); + ordered = btrfs_lookup_ordered_range(inode, start, + PAGE_CACHE_SIZE); if (!ordered) break; unlock_extent(tree, start, end); @@ -3213,20 +3200,6 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, return ret; } -int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent, int mirror_num) -{ - struct bio *bio = NULL; - unsigned long bio_flags = EXTENT_BIO_PARENT_LOCKED; - int ret; - - ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num, - &bio_flags, READ, NULL); - if (bio) - ret = submit_one_bio(READ, bio, mirror_num, bio_flags); - return ret; -} - static noinline void update_nr_written(struct page *page, struct writeback_control *wbc, unsigned long nr_written) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 0377413bd4b9..5dbf92e68fbd 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -29,7 +29,6 @@ */ #define EXTENT_BIO_COMPRESSED 1 #define EXTENT_BIO_TREE_LOG 2 -#define EXTENT_BIO_PARENT_LOCKED 4 #define EXTENT_BIO_FLAG_SHIFT 16 /* these are bit numbers for test/set bit */ @@ -62,6 +61,7 @@ struct extent_state; struct btrfs_root; struct btrfs_io_bio; +struct io_failure_record; typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, struct bio *bio, int mirror_num, @@ -112,8 +112,7 @@ struct extent_state { atomic_t refs; unsigned state; - /* for use by the FS */ - u64 private; + struct io_failure_record *failrec; #ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; @@ -210,8 +209,6 @@ static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, int mirror_num); -int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent, int mirror_num); int __init extent_io_init(void); void extent_io_exit(void); @@ -345,7 +342,6 @@ int extent_readpages(struct extent_io_tree *tree, get_extent_t get_extent); int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent); -int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); void set_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 84fb56d5c018..318b048eb254 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -4,6 +4,7 @@ #include <linux/hardirq.h> #include "ctree.h" #include "extent_map.h" +#include "compression.h" static struct kmem_cache *extent_map_cache; @@ -20,8 +21,7 @@ int __init extent_map_init(void) void extent_map_exit(void) { - if (extent_map_cache) - kmem_cache_destroy(extent_map_cache); + kmem_cache_destroy(extent_map_cache); } /** @@ -62,7 +62,7 @@ struct extent_map *alloc_extent_map(void) /** * free_extent_map - drop reference count of an extent_map - * @em: extent map beeing releasead + * @em: extent map being releasead * * Drops the reference out on @em by one and free the structure * if the reference count hits zero. @@ -422,7 +422,7 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, /** * remove_extent_mapping - removes an extent_map from the extent tree * @tree: extent tree to remove from - * @em: extent map beeing removed + * @em: extent map being removed * * Removes @em from @tree. No reference counts are dropped, and no checks * are done to see if the range is in use diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index a67e1c828d0f..b5baf5bdc8e1 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -25,6 +25,7 @@ #include "transaction.h" #include "volumes.h" #include "print-tree.h" +#include "compression.h" #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ @@ -172,6 +173,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, u64 item_start_offset = 0; u64 item_last_offset = 0; u64 disk_bytenr; + u64 page_bytes_left; u32 diff; int nblocks; int bio_index = 0; @@ -220,6 +222,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, disk_bytenr = (u64)bio->bi_iter.bi_sector << 9; if (dio) offset = logical_offset; + + page_bytes_left = bvec->bv_len; while (bio_index < bio->bi_vcnt) { if (!dio) offset = page_offset(bvec->bv_page) + bvec->bv_offset; @@ -243,7 +247,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, if (BTRFS_I(inode)->root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { set_extent_bits(io_tree, offset, - offset + bvec->bv_len - 1, + offset + root->sectorsize - 1, EXTENT_NODATASUM, GFP_NOFS); } else { btrfs_info(BTRFS_I(inode)->root->fs_info, @@ -281,13 +285,29 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, found: csum += count * csum_size; nblocks -= count; - bio_index += count; + while (count--) { - disk_bytenr += bvec->bv_len; - offset += bvec->bv_len; - bvec++; + disk_bytenr += root->sectorsize; + offset += root->sectorsize; + page_bytes_left -= root->sectorsize; + if (!page_bytes_left) { + bio_index++; + /* + * make sure we're still inside the + * bio before we update page_bytes_left + */ + if (bio_index >= bio->bi_vcnt) { + WARN_ON_ONCE(count); + goto done; + } + bvec++; + page_bytes_left = bvec->bv_len; + } + } } + +done: btrfs_free_path(path); return 0; } @@ -432,6 +452,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio_vec *bvec = bio->bi_io_vec; int bio_index = 0; int index; + int nr_sectors; + int i; unsigned long total_bytes = 0; unsigned long this_sum_bytes = 0; u64 offset; @@ -459,41 +481,56 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, if (!contig) offset = page_offset(bvec->bv_page) + bvec->bv_offset; - if (offset >= ordered->file_offset + ordered->len || - offset < ordered->file_offset) { - unsigned long bytes_left; - sums->len = this_sum_bytes; - this_sum_bytes = 0; - btrfs_add_ordered_sum(inode, ordered, sums); - btrfs_put_ordered_extent(ordered); + data = kmap_atomic(bvec->bv_page); - bytes_left = bio->bi_iter.bi_size - total_bytes; + nr_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, + bvec->bv_len + root->sectorsize + - 1); + + for (i = 0; i < nr_sectors; i++) { + if (offset >= ordered->file_offset + ordered->len || + offset < ordered->file_offset) { + unsigned long bytes_left; + + kunmap_atomic(data); + sums->len = this_sum_bytes; + this_sum_bytes = 0; + btrfs_add_ordered_sum(inode, ordered, sums); + btrfs_put_ordered_extent(ordered); + + bytes_left = bio->bi_iter.bi_size - total_bytes; + + sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), + GFP_NOFS); + BUG_ON(!sums); /* -ENOMEM */ + sums->len = bytes_left; + ordered = btrfs_lookup_ordered_extent(inode, + offset); + ASSERT(ordered); /* Logic error */ + sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9) + + total_bytes; + index = 0; + + data = kmap_atomic(bvec->bv_page); + } - sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), - GFP_NOFS); - BUG_ON(!sums); /* -ENOMEM */ - sums->len = bytes_left; - ordered = btrfs_lookup_ordered_extent(inode, offset); - BUG_ON(!ordered); /* Logic error */ - sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9) + - total_bytes; - index = 0; + sums->sums[index] = ~(u32)0; + sums->sums[index] + = btrfs_csum_data(data + bvec->bv_offset + + (i * root->sectorsize), + sums->sums[index], + root->sectorsize); + btrfs_csum_final(sums->sums[index], + (char *)(sums->sums + index)); + index++; + offset += root->sectorsize; + this_sum_bytes += root->sectorsize; + total_bytes += root->sectorsize; } - data = kmap_atomic(bvec->bv_page); - sums->sums[index] = ~(u32)0; - sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset, - sums->sums[index], - bvec->bv_len); kunmap_atomic(data); - btrfs_csum_final(sums->sums[index], - (char *)(sums->sums + index)); bio_index++; - index++; - total_bytes += bvec->bv_len; - this_sum_bytes += bvec->bv_len; - offset += bvec->bv_len; bvec++; } this_sum_bytes = 0; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 098bb8f690c9..15a09cb156ce 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -41,6 +41,7 @@ #include "locking.h" #include "volumes.h" #include "qgroup.h" +#include "compression.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* @@ -498,7 +499,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, loff_t isize = i_size_read(inode); start_pos = pos & ~((u64)root->sectorsize - 1); - num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize); + num_bytes = round_up(write_bytes + pos - start_pos, root->sectorsize); end_of_last_block = start_pos + num_bytes - 1; err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, @@ -1379,16 +1380,19 @@ fail: static noinline int lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, + size_t write_bytes, u64 *lockstart, u64 *lockend, struct extent_state **cached_state) { + struct btrfs_root *root = BTRFS_I(inode)->root; u64 start_pos; u64 last_pos; int i; int ret = 0; - start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1); - last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1; + start_pos = round_down(pos, root->sectorsize); + last_pos = start_pos + + round_up(pos + write_bytes - start_pos, root->sectorsize) - 1; if (start_pos < inode->i_size) { struct btrfs_ordered_extent *ordered; @@ -1503,6 +1507,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, while (iov_iter_count(i) > 0) { size_t offset = pos & (PAGE_CACHE_SIZE - 1); + size_t sector_offset; size_t write_bytes = min(iov_iter_count(i), nrptrs * (size_t)PAGE_CACHE_SIZE - offset); @@ -1511,6 +1516,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, size_t reserve_bytes; size_t dirty_pages; size_t copied; + size_t dirty_sectors; + size_t num_sectors; WARN_ON(num_pages > nrptrs); @@ -1523,29 +1530,29 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, break; } - reserve_bytes = num_pages << PAGE_CACHE_SHIFT; + sector_offset = pos & (root->sectorsize - 1); + reserve_bytes = round_up(write_bytes + sector_offset, + root->sectorsize); - if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | - BTRFS_INODE_PREALLOC)) { - ret = check_can_nocow(inode, pos, &write_bytes); - if (ret < 0) - break; - if (ret > 0) { - /* - * For nodata cow case, no need to reserve - * data space. - */ - only_release_metadata = true; - /* - * our prealloc extent may be smaller than - * write_bytes, so scale down. - */ - num_pages = DIV_ROUND_UP(write_bytes + offset, - PAGE_CACHE_SIZE); - reserve_bytes = num_pages << PAGE_CACHE_SHIFT; - goto reserve_metadata; - } + if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC)) && + check_can_nocow(inode, pos, &write_bytes) > 0) { + /* + * For nodata cow case, no need to reserve + * data space. + */ + only_release_metadata = true; + /* + * our prealloc extent may be smaller than + * write_bytes, so scale down. + */ + num_pages = DIV_ROUND_UP(write_bytes + offset, + PAGE_CACHE_SIZE); + reserve_bytes = round_up(write_bytes + sector_offset, + root->sectorsize); + goto reserve_metadata; } + ret = btrfs_check_data_free_space(inode, pos, write_bytes); if (ret < 0) break; @@ -1576,8 +1583,8 @@ again: break; ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages, - pos, &lockstart, &lockend, - &cached_state); + pos, write_bytes, &lockstart, + &lockend, &cached_state); if (ret < 0) { if (ret == -EAGAIN) goto again; @@ -1612,9 +1619,16 @@ again: * we still have an outstanding extent for the chunk we actually * managed to copy. */ - if (num_pages > dirty_pages) { - release_bytes = (num_pages - dirty_pages) << - PAGE_CACHE_SHIFT; + num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, + reserve_bytes); + dirty_sectors = round_up(copied + sector_offset, + root->sectorsize); + dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, + dirty_sectors); + + if (num_sectors > dirty_sectors) { + release_bytes = (write_bytes - copied) + & ~((u64)root->sectorsize - 1); if (copied > 0) { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; @@ -1633,7 +1647,8 @@ again: } } - release_bytes = dirty_pages << PAGE_CACHE_SHIFT; + release_bytes = round_up(copied + sector_offset, + root->sectorsize); if (copied > 0) ret = btrfs_dirty_pages(root, inode, pages, @@ -1654,8 +1669,7 @@ again: if (only_release_metadata && copied > 0) { lockstart = round_down(pos, root->sectorsize); - lockend = lockstart + - (dirty_pages << PAGE_CACHE_SHIFT) - 1; + lockend = round_up(pos + copied, root->sectorsize) - 1; set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, EXTENT_NORESERVE, NULL, @@ -1761,6 +1775,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, ssize_t err; loff_t pos; size_t count; + loff_t oldsize; + int clean_page = 0; inode_lock(inode); err = generic_write_checks(iocb, from); @@ -1799,14 +1815,17 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, pos = iocb->ki_pos; count = iov_iter_count(from); start_pos = round_down(pos, root->sectorsize); - if (start_pos > i_size_read(inode)) { + oldsize = i_size_read(inode); + if (start_pos > oldsize) { /* Expand hole size to cover write data, preventing empty gap */ end_pos = round_up(pos + count, root->sectorsize); - err = btrfs_cont_expand(inode, i_size_read(inode), end_pos); + err = btrfs_cont_expand(inode, oldsize, end_pos); if (err) { inode_unlock(inode); goto out; } + if (start_pos > round_up(oldsize, root->sectorsize)) + clean_page = 1; } if (sync) @@ -1818,6 +1837,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, num_written = __btrfs_buffered_write(file, from, pos); if (num_written > 0) iocb->ki_pos = pos + num_written; + if (clean_page) + pagecache_isize_extended(inode, oldsize, + i_size_read(inode)); } inode_unlock(inode); @@ -1825,7 +1847,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, /* * We also have to set last_sub_trans to the current log transid, * otherwise subsequent syncs to a file that's been synced in this - * transaction will appear to have already occured. + * transaction will appear to have already occurred. */ spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->last_sub_trans = root->log_transid; @@ -1996,10 +2018,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ smp_mb(); if (btrfs_inode_in_log(inode, root->fs_info->generation) || - (BTRFS_I(inode)->last_trans <= - root->fs_info->last_trans_committed && - (full_sync || - !btrfs_have_ordered_extents_in_range(inode, start, len)))) { + (full_sync && BTRFS_I(inode)->last_trans <= + root->fs_info->last_trans_committed) || + (!btrfs_have_ordered_extents_in_range(inode, start, len) && + BTRFS_I(inode)->last_trans + <= root->fs_info->last_trans_committed)) { /* * We'v had everything committed since the last time we were * modified so clear this flag in case it was set for whatever @@ -2293,10 +2316,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) int ret = 0; int err = 0; unsigned int rsv_count; - bool same_page; + bool same_block; bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); u64 ino_size; - bool truncated_page = false; + bool truncated_block = false; bool updated_inode = false; ret = btrfs_wait_ordered_range(inode, offset, len); @@ -2304,7 +2327,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) return ret; inode_lock(inode); - ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); + ino_size = round_up(inode->i_size, root->sectorsize); ret = find_first_non_hole(inode, &offset, &len); if (ret < 0) goto out_only_mutex; @@ -2317,31 +2340,30 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); lockend = round_down(offset + len, BTRFS_I(inode)->root->sectorsize) - 1; - same_page = ((offset >> PAGE_CACHE_SHIFT) == - ((offset + len - 1) >> PAGE_CACHE_SHIFT)); - + same_block = (BTRFS_BYTES_TO_BLKS(root->fs_info, offset)) + == (BTRFS_BYTES_TO_BLKS(root->fs_info, offset + len - 1)); /* - * We needn't truncate any page which is beyond the end of the file + * We needn't truncate any block which is beyond the end of the file * because we are sure there is no data there. */ /* - * Only do this if we are in the same page and we aren't doing the - * entire page. + * Only do this if we are in the same block and we aren't doing the + * entire block. */ - if (same_page && len < PAGE_CACHE_SIZE) { + if (same_block && len < root->sectorsize) { if (offset < ino_size) { - truncated_page = true; - ret = btrfs_truncate_page(inode, offset, len, 0); + truncated_block = true; + ret = btrfs_truncate_block(inode, offset, len, 0); } else { ret = 0; } goto out_only_mutex; } - /* zero back part of the first page */ + /* zero back part of the first block */ if (offset < ino_size) { - truncated_page = true; - ret = btrfs_truncate_page(inode, offset, 0, 0); + truncated_block = true; + ret = btrfs_truncate_block(inode, offset, 0, 0); if (ret) { inode_unlock(inode); return ret; @@ -2376,9 +2398,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (!ret) { /* zero the front end of the last page */ if (tail_start + tail_len < ino_size) { - truncated_page = true; - ret = btrfs_truncate_page(inode, - tail_start + tail_len, 0, 1); + truncated_block = true; + ret = btrfs_truncate_block(inode, + tail_start + tail_len, + 0, 1); if (ret) goto out_only_mutex; } @@ -2544,7 +2567,7 @@ out_trans: goto out_free; inode_inc_iversion(inode); - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); trans->block_rsv = &root->fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); @@ -2558,7 +2581,7 @@ out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, GFP_NOFS); out_only_mutex: - if (!updated_inode && truncated_page && !ret && !err) { + if (!updated_inode && truncated_block && !ret && !err) { /* * If we only end up zeroing part of a page, we still need to * update the inode item, so that all the time fields are @@ -2611,7 +2634,7 @@ static int add_falloc_range(struct list_head *head, u64 start, u64 len) return 0; } insert: - range = kmalloc(sizeof(*range), GFP_NOFS); + range = kmalloc(sizeof(*range), GFP_KERNEL); if (!range) return -ENOMEM; range->start = start; @@ -2678,10 +2701,10 @@ static long btrfs_fallocate(struct file *file, int mode, } else if (offset + len > inode->i_size) { /* * If we are fallocating from the end of the file onward we - * need to zero out the end of the page if i_size lands in the - * middle of a page. + * need to zero out the end of the block if i_size lands in the + * middle of a block. */ - ret = btrfs_truncate_page(inode, inode->i_size, 0, 0); + ret = btrfs_truncate_block(inode, inode->i_size, 0, 0); if (ret) goto out; } @@ -2712,7 +2735,7 @@ static long btrfs_fallocate(struct file *file, int mode, btrfs_put_ordered_extent(ordered); unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state, GFP_NOFS); + &cached_state, GFP_KERNEL); /* * we can't wait on the range with the transaction * running or with the extent lock held @@ -2794,7 +2817,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (IS_ERR(trans)) { ret = PTR_ERR(trans); } else { - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_fs_time(inode->i_sb); i_size_write(inode, actual_end); btrfs_ordered_update_i_size(inode, actual_end, NULL); ret = btrfs_update_inode(trans, root, inode); @@ -2806,7 +2829,7 @@ static long btrfs_fallocate(struct file *file, int mode, } out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state, GFP_NOFS); + &cached_state, GFP_KERNEL); out: /* * As we waited the extent range, the data_rsv_map must be empty @@ -2939,8 +2962,7 @@ const struct file_operations btrfs_file_operations = { void btrfs_auto_defrag_exit(void) { - if (btrfs_inode_defrag_cachep) - kmem_cache_destroy(btrfs_inode_defrag_cachep); + kmem_cache_destroy(btrfs_inode_defrag_cachep); } int btrfs_auto_defrag_init(void) diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 393e36bd5845..53dbeaf6ce94 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -153,6 +153,20 @@ static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize) static unsigned long *alloc_bitmap(u32 bitmap_size) { + void *mem; + + /* + * The allocation size varies, observed numbers were < 4K up to 16K. + * Using vmalloc unconditionally would be too heavy, we'll try + * contiguous allocations first. + */ + if (bitmap_size <= PAGE_SIZE) + return kzalloc(bitmap_size, GFP_NOFS); + + mem = kzalloc(bitmap_size, GFP_NOFS | __GFP_NOWARN); + if (mem) + return mem; + return __vmalloc(bitmap_size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); } @@ -289,7 +303,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, ret = 0; out: - vfree(bitmap); + kvfree(bitmap); if (ret) btrfs_abort_transaction(trans, root, ret); return ret; @@ -438,7 +452,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, ret = 0; out: - vfree(bitmap); + kvfree(bitmap); if (ret) btrfs_abort_transaction(trans, root, ret); return ret; diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index e50316c4af15..1f0ec19b23f6 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -556,6 +556,9 @@ int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid) mutex_lock(&root->objectid_mutex); if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { + btrfs_warn(root->fs_info, + "the objectid of root %llu reaches its highest value", + root->root_key.objectid); ret = -ENOSPC; goto out; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e28f3d4691af..41a5688ffdfe 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -263,7 +263,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, data_len = compressed_size; if (start > 0 || - actual_end > PAGE_CACHE_SIZE || + actual_end > root->sectorsize || data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) || (!compressed_size && (actual_end & (root->sectorsize - 1)) == 0) || @@ -2002,7 +2002,8 @@ again: if (PagePrivate2(page)) goto out; - ordered = btrfs_lookup_ordered_extent(inode, page_start); + ordered = btrfs_lookup_ordered_range(inode, page_start, + PAGE_CACHE_SIZE); if (ordered) { unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state, GFP_NOFS); @@ -4013,7 +4014,8 @@ err: btrfs_i_size_write(dir, dir->i_size - name_len * 2); inode_inc_iversion(inode); inode_inc_iversion(dir); - inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; + inode->i_ctime = dir->i_mtime = + dir->i_ctime = current_fs_time(inode->i_sb); ret = btrfs_update_inode(trans, root, dir); out: return ret; @@ -4156,7 +4158,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_i_size_write(dir, dir->i_size - name_len * 2); inode_inc_iversion(dir); - dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb); ret = btrfs_update_inode_fallback(trans, root, dir); if (ret) btrfs_abort_transaction(trans, root, ret); @@ -4211,11 +4213,20 @@ static int truncate_space_check(struct btrfs_trans_handle *trans, { int ret; + /* + * This is only used to apply pressure to the enospc system, we don't + * intend to use this reservation at all. + */ bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted); + bytes_deleted *= root->nodesize; ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv, bytes_deleted, BTRFS_RESERVE_NO_FLUSH); - if (!ret) + if (!ret) { + trace_btrfs_space_reservation(root->fs_info, "transaction", + trans->transid, + bytes_deleted, 1); trans->bytes_reserved += bytes_deleted; + } return ret; } @@ -4248,7 +4259,8 @@ static int truncate_inline_extent(struct inode *inode, * read the extent item from disk (data not in the page cache). */ btrfs_release_path(path); - return btrfs_truncate_page(inode, offset, page_end - offset, 0); + return btrfs_truncate_block(inode, offset, page_end - offset, + 0); } btrfs_set_file_extent_ram_bytes(leaf, fi, size); @@ -4601,17 +4613,17 @@ error: } /* - * btrfs_truncate_page - read, zero a chunk and write a page + * btrfs_truncate_block - read, zero a chunk and write a block * @inode - inode that we're zeroing * @from - the offset to start zeroing * @len - the length to zero, 0 to zero the entire range respective to the * offset * @front - zero up to the offset instead of from the offset on * - * This will find the page for the "from" offset and cow the page and zero the + * This will find the block for the "from" offset and cow the block and zero the * part we want to zero. This is used with truncate and hole punching. */ -int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, +int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, int front) { struct address_space *mapping = inode->i_mapping; @@ -4622,18 +4634,19 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, char *kaddr; u32 blocksize = root->sectorsize; pgoff_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned offset = from & (blocksize - 1); struct page *page; gfp_t mask = btrfs_alloc_write_mask(mapping); int ret = 0; - u64 page_start; - u64 page_end; + u64 block_start; + u64 block_end; if ((offset & (blocksize - 1)) == 0 && (!len || ((len & (blocksize - 1)) == 0))) goto out; + ret = btrfs_delalloc_reserve_space(inode, - round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE); + round_down(from, blocksize), blocksize); if (ret) goto out; @@ -4641,14 +4654,14 @@ again: page = find_or_create_page(mapping, index, mask); if (!page) { btrfs_delalloc_release_space(inode, - round_down(from, PAGE_CACHE_SIZE), - PAGE_CACHE_SIZE); + round_down(from, blocksize), + blocksize); ret = -ENOMEM; goto out; } - page_start = page_offset(page); - page_end = page_start + PAGE_CACHE_SIZE - 1; + block_start = round_down(from, blocksize); + block_end = block_start + blocksize - 1; if (!PageUptodate(page)) { ret = btrfs_readpage(NULL, page); @@ -4665,12 +4678,12 @@ again: } wait_on_page_writeback(page); - lock_extent_bits(io_tree, page_start, page_end, &cached_state); + lock_extent_bits(io_tree, block_start, block_end, &cached_state); set_page_extent_mapped(page); - ordered = btrfs_lookup_ordered_extent(inode, page_start); + ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { - unlock_extent_cached(io_tree, page_start, page_end, + unlock_extent_cached(io_tree, block_start, block_end, &cached_state, GFP_NOFS); unlock_page(page); page_cache_release(page); @@ -4679,39 +4692,41 @@ again: goto again; } - clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, + clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); - ret = btrfs_set_extent_delalloc(inode, page_start, page_end, + ret = btrfs_set_extent_delalloc(inode, block_start, block_end, &cached_state); if (ret) { - unlock_extent_cached(io_tree, page_start, page_end, + unlock_extent_cached(io_tree, block_start, block_end, &cached_state, GFP_NOFS); goto out_unlock; } - if (offset != PAGE_CACHE_SIZE) { + if (offset != blocksize) { if (!len) - len = PAGE_CACHE_SIZE - offset; + len = blocksize - offset; kaddr = kmap(page); if (front) - memset(kaddr, 0, offset); + memset(kaddr + (block_start - page_offset(page)), + 0, offset); else - memset(kaddr + offset, 0, len); + memset(kaddr + (block_start - page_offset(page)) + offset, + 0, len); flush_dcache_page(page); kunmap(page); } ClearPageChecked(page); set_page_dirty(page); - unlock_extent_cached(io_tree, page_start, page_end, &cached_state, + unlock_extent_cached(io_tree, block_start, block_end, &cached_state, GFP_NOFS); out_unlock: if (ret) - btrfs_delalloc_release_space(inode, page_start, - PAGE_CACHE_SIZE); + btrfs_delalloc_release_space(inode, block_start, + blocksize); unlock_page(page); page_cache_release(page); out: @@ -4782,11 +4797,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) int err = 0; /* - * If our size started in the middle of a page we need to zero out the - * rest of the page before we expand the i_size, otherwise we could + * If our size started in the middle of a block we need to zero out the + * rest of the block before we expand the i_size, otherwise we could * expose stale data. */ - err = btrfs_truncate_page(inode, oldsize, 0, 0); + err = btrfs_truncate_block(inode, oldsize, 0, 0); if (err) return err; @@ -4895,7 +4910,6 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) } if (newsize > oldsize) { - truncate_pagecache(inode, newsize); /* * Don't do an expanding truncate while snapshoting is ongoing. * This is to ensure the snapshot captures a fully consistent @@ -4918,6 +4932,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) i_size_write(inode, newsize); btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); + pagecache_isize_extended(inode, oldsize, newsize); ret = btrfs_update_inode(trans, root, inode); btrfs_end_write_no_snapshoting(root); btrfs_end_transaction(trans, root); @@ -5588,7 +5603,7 @@ static struct inode *new_simple_dir(struct super_block *s, inode->i_op = &btrfs_dir_ro_inode_operations; inode->i_fop = &simple_dir_operations; inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; - inode->i_mtime = CURRENT_TIME; + inode->i_mtime = current_fs_time(inode->i_sb); inode->i_atime = inode->i_mtime; inode->i_ctime = inode->i_mtime; BTRFS_I(inode)->i_otime = inode->i_mtime; @@ -5717,6 +5732,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) char *name_ptr; int name_len; int is_curr = 0; /* ctx->pos points to the current index? */ + bool emitted; /* FIXME, use a real flag for deciding about the key type */ if (root->fs_info->tree_root == root) @@ -5745,6 +5761,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) if (ret < 0) goto err; + emitted = false; while (1) { leaf = path->nodes[0]; slot = path->slots[0]; @@ -5788,7 +5805,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) if (name_len <= sizeof(tmp_name)) { name_ptr = tmp_name; } else { - name_ptr = kmalloc(name_len, GFP_NOFS); + name_ptr = kmalloc(name_len, GFP_KERNEL); if (!name_ptr) { ret = -ENOMEM; goto err; @@ -5824,6 +5841,7 @@ skip: if (over) goto nopos; + emitted = true; di_len = btrfs_dir_name_len(leaf, di) + btrfs_dir_data_len(leaf, di) + sizeof(*di); di_cur += di_len; @@ -5836,11 +5854,20 @@ next: if (key_type == BTRFS_DIR_INDEX_KEY) { if (is_curr) ctx->pos++; - ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); + ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list, &emitted); if (ret) goto nopos; } + /* + * If we haven't emitted any dir entry, we must not touch ctx->pos as + * it was was set to the termination value in previous call. We assume + * that "." and ".." were emitted if we reach this point and set the + * termination value as well for an empty directory. + */ + if (ctx->pos > 2 && !emitted) + goto nopos; + /* Reached end of directory/root. Bump pos past the last item. */ ctx->pos++; @@ -6160,7 +6187,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, inode_init_owner(inode, dir, mode); inode_set_bytes(inode, 0); - inode->i_mtime = CURRENT_TIME; + inode->i_mtime = current_fs_time(inode->i_sb); inode->i_atime = inode->i_mtime; inode->i_ctime = inode->i_mtime; BTRFS_I(inode)->i_otime = inode->i_mtime; @@ -6273,7 +6300,8 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, btrfs_i_size_write(parent_inode, parent_inode->i_size + name_len * 2); inode_inc_iversion(parent_inode); - parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; + parent_inode->i_mtime = parent_inode->i_ctime = + current_fs_time(parent_inode->i_sb); ret = btrfs_update_inode(trans, root, parent_inode); if (ret) btrfs_abort_transaction(trans, root, ret); @@ -6491,7 +6519,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, BTRFS_I(inode)->dir_index = 0ULL; inc_nlink(inode); inode_inc_iversion(inode); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_fs_time(inode->i_sb); ihold(inode); set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); @@ -7116,21 +7144,41 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, if (ret) return ERR_PTR(ret); - em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, - ins.offset, ins.offset, ins.offset, 0); - if (IS_ERR(em)) { - btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); - return em; - } - + /* + * Create the ordered extent before the extent map. This is to avoid + * races with the fast fsync path that would lead to it logging file + * extent items that point to disk extents that were not yet written to. + * The fast fsync path collects ordered extents into a local list and + * then collects all the new extent maps, so we must create the ordered + * extent first and make sure the fast fsync path collects any new + * ordered extents after collecting new extent maps as well. + * The fsync path simply can not rely on inode_dio_wait() because it + * causes deadlock with AIO. + */ ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, ins.offset, ins.offset, 0); if (ret) { btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); - free_extent_map(em); return ERR_PTR(ret); } + em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, + ins.offset, ins.offset, ins.offset, 0); + if (IS_ERR(em)) { + struct btrfs_ordered_extent *oe; + + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); + oe = btrfs_lookup_ordered_extent(inode, start); + ASSERT(oe); + if (WARN_ON(!oe)) + return em; + set_bit(BTRFS_ORDERED_IOERR, &oe->flags); + set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags); + btrfs_remove_ordered_extent(inode, oe); + /* Once for our lookup and once for the ordered extents tree. */ + btrfs_put_ordered_extent(oe); + btrfs_put_ordered_extent(oe); + } return em; } @@ -7382,7 +7430,26 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, cached_state, GFP_NOFS); if (ordered) { - btrfs_start_ordered_extent(inode, ordered, 1); + /* + * If we are doing a DIO read and the ordered extent we + * found is for a buffered write, we can not wait for it + * to complete and retry, because if we do so we can + * deadlock with concurrent buffered writes on page + * locks. This happens only if our DIO read covers more + * than one extent map, if at this point has already + * created an ordered extent for a previous extent map + * and locked its range in the inode's io tree, and a + * concurrent write against that previous extent map's + * range and this range started (we unlock the ranges + * in the io tree only when the bios complete and + * buffered writes always lock pages before attempting + * to lock range in the io tree). + */ + if (writing || + test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) + btrfs_start_ordered_extent(inode, ordered, 1); + else + ret = -ENOTBLK; btrfs_put_ordered_extent(ordered); } else { /* @@ -7399,9 +7466,11 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, * that page. */ ret = -ENOTBLK; - break; } + if (ret) + break; + cond_resched(); } @@ -7732,9 +7801,9 @@ static int btrfs_check_dio_repairable(struct inode *inode, } static int dio_read_error(struct inode *inode, struct bio *failed_bio, - struct page *page, u64 start, u64 end, - int failed_mirror, bio_end_io_t *repair_endio, - void *repair_arg) + struct page *page, unsigned int pgoff, + u64 start, u64 end, int failed_mirror, + bio_end_io_t *repair_endio, void *repair_arg) { struct io_failure_record *failrec; struct bio *bio; @@ -7755,7 +7824,9 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, return -EIO; } - if (failed_bio->bi_vcnt > 1) + if ((failed_bio->bi_vcnt > 1) + || (failed_bio->bi_io_vec->bv_len + > BTRFS_I(inode)->root->sectorsize)) read_mode = READ_SYNC | REQ_FAILFAST_DEV; else read_mode = READ_SYNC; @@ -7763,7 +7834,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, isector = start - btrfs_io_bio(failed_bio)->logical; isector >>= inode->i_sb->s_blocksize_bits; bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, - 0, isector, repair_endio, repair_arg); + pgoff, isector, repair_endio, repair_arg); if (!bio) { free_io_failure(inode, failrec); return -EIO; @@ -7793,12 +7864,17 @@ struct btrfs_retry_complete { static void btrfs_retry_endio_nocsum(struct bio *bio) { struct btrfs_retry_complete *done = bio->bi_private; + struct inode *inode; struct bio_vec *bvec; int i; if (bio->bi_error) goto end; + ASSERT(bio->bi_vcnt == 1); + inode = bio->bi_io_vec->bv_page->mapping->host; + ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize); + done->uptodate = 1; bio_for_each_segment_all(bvec, bio, i) clean_io_failure(done->inode, done->start, bvec->bv_page, 0); @@ -7810,25 +7886,35 @@ end: static int __btrfs_correct_data_nocsum(struct inode *inode, struct btrfs_io_bio *io_bio) { + struct btrfs_fs_info *fs_info; struct bio_vec *bvec; struct btrfs_retry_complete done; u64 start; + unsigned int pgoff; + u32 sectorsize; + int nr_sectors; int i; int ret; + fs_info = BTRFS_I(inode)->root->fs_info; + sectorsize = BTRFS_I(inode)->root->sectorsize; + start = io_bio->logical; done.inode = inode; bio_for_each_segment_all(bvec, &io_bio->bio, i) { -try_again: + nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); + pgoff = bvec->bv_offset; + +next_block_or_try_again: done.uptodate = 0; done.start = start; init_completion(&done.done); - ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start, - start + bvec->bv_len - 1, - io_bio->mirror_num, - btrfs_retry_endio_nocsum, &done); + ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, + pgoff, start, start + sectorsize - 1, + io_bio->mirror_num, + btrfs_retry_endio_nocsum, &done); if (ret) return ret; @@ -7836,10 +7922,15 @@ try_again: if (!done.uptodate) { /* We might have another mirror, so try again */ - goto try_again; + goto next_block_or_try_again; } - start += bvec->bv_len; + start += sectorsize; + + if (nr_sectors--) { + pgoff += sectorsize; + goto next_block_or_try_again; + } } return 0; @@ -7849,7 +7940,9 @@ static void btrfs_retry_endio(struct bio *bio) { struct btrfs_retry_complete *done = bio->bi_private; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + struct inode *inode; struct bio_vec *bvec; + u64 start; int uptodate; int ret; int i; @@ -7858,13 +7951,20 @@ static void btrfs_retry_endio(struct bio *bio) goto end; uptodate = 1; + + start = done->start; + + ASSERT(bio->bi_vcnt == 1); + inode = bio->bi_io_vec->bv_page->mapping->host; + ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize); + bio_for_each_segment_all(bvec, bio, i) { ret = __readpage_endio_check(done->inode, io_bio, i, - bvec->bv_page, 0, - done->start, bvec->bv_len); + bvec->bv_page, bvec->bv_offset, + done->start, bvec->bv_len); if (!ret) clean_io_failure(done->inode, done->start, - bvec->bv_page, 0); + bvec->bv_page, bvec->bv_offset); else uptodate = 0; } @@ -7878,20 +7978,34 @@ end: static int __btrfs_subio_endio_read(struct inode *inode, struct btrfs_io_bio *io_bio, int err) { + struct btrfs_fs_info *fs_info; struct bio_vec *bvec; struct btrfs_retry_complete done; u64 start; u64 offset = 0; + u32 sectorsize; + int nr_sectors; + unsigned int pgoff; + int csum_pos; int i; int ret; + fs_info = BTRFS_I(inode)->root->fs_info; + sectorsize = BTRFS_I(inode)->root->sectorsize; + err = 0; start = io_bio->logical; done.inode = inode; bio_for_each_segment_all(bvec, &io_bio->bio, i) { - ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, - 0, start, bvec->bv_len); + nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); + + pgoff = bvec->bv_offset; +next_block: + csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset); + ret = __readpage_endio_check(inode, io_bio, csum_pos, + bvec->bv_page, pgoff, start, + sectorsize); if (likely(!ret)) goto next; try_again: @@ -7899,10 +8013,10 @@ try_again: done.start = start; init_completion(&done.done); - ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start, - start + bvec->bv_len - 1, - io_bio->mirror_num, - btrfs_retry_endio, &done); + ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, + pgoff, start, start + sectorsize - 1, + io_bio->mirror_num, + btrfs_retry_endio, &done); if (ret) { err = ret; goto next; @@ -7915,8 +8029,15 @@ try_again: goto try_again; } next: - offset += bvec->bv_len; - start += bvec->bv_len; + offset += sectorsize; + start += sectorsize; + + ASSERT(nr_sectors); + + if (--nr_sectors) { + pgoff += sectorsize; + goto next_block; + } } return err; @@ -7954,6 +8075,7 @@ static void btrfs_endio_direct_read(struct bio *bio) kfree(dip); + dio_bio->bi_error = bio->bi_error; dio_end_io(dio_bio, bio->bi_error); if (io_bio->end_io) @@ -8008,6 +8130,7 @@ static void btrfs_endio_direct_write(struct bio *bio) kfree(dip); + dio_bio->bi_error = bio->bi_error; dio_end_io(dio_bio, bio->bi_error); bio_put(bio); } @@ -8168,9 +8291,11 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, u64 file_offset = dip->logical_offset; u64 submit_len = 0; u64 map_length; - int nr_pages = 0; - int ret; + u32 blocksize = root->sectorsize; int async_submit = 0; + int nr_sectors; + int ret; + int i; map_length = orig_bio->bi_iter.bi_size; ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, @@ -8200,9 +8325,12 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, atomic_inc(&dip->pending_bios); while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { - if (map_length < submit_len + bvec->bv_len || - bio_add_page(bio, bvec->bv_page, bvec->bv_len, - bvec->bv_offset) < bvec->bv_len) { + nr_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, bvec->bv_len); + i = 0; +next_block: + if (unlikely(map_length < submit_len + blocksize || + bio_add_page(bio, bvec->bv_page, blocksize, + bvec->bv_offset + (i * blocksize)) < blocksize)) { /* * inc the count before we submit the bio so * we know the end IO handler won't happen before @@ -8223,7 +8351,6 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, file_offset += submit_len; submit_len = 0; - nr_pages = 0; bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); @@ -8241,9 +8368,14 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, bio_put(bio); goto out_err; } + + goto next_block; } else { - submit_len += bvec->bv_len; - nr_pages++; + submit_len += blocksize; + if (--nr_sectors) { + i++; + goto next_block; + } bvec++; } } @@ -8608,6 +8740,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + u64 start; + u64 end; int inode_evicting = inode->i_state & I_FREEING; /* @@ -8627,14 +8761,18 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, if (!inode_evicting) lock_extent_bits(tree, page_start, page_end, &cached_state); - ordered = btrfs_lookup_ordered_extent(inode, page_start); +again: + start = page_start; + ordered = btrfs_lookup_ordered_range(inode, start, + page_end - start + 1); if (ordered) { + end = min(page_end, ordered->file_offset + ordered->len - 1); /* * IO on this page will never be started, so we need * to account for any ordered extents now */ if (!inode_evicting) - clear_extent_bit(tree, page_start, page_end, + clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 0, &cached_state, @@ -8651,22 +8789,26 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, spin_lock_irq(&tree->lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); - new_len = page_start - ordered->file_offset; + new_len = start - ordered->file_offset; if (new_len < ordered->truncated_len) ordered->truncated_len = new_len; spin_unlock_irq(&tree->lock); if (btrfs_dec_test_ordered_pending(inode, &ordered, - page_start, - PAGE_CACHE_SIZE, 1)) + start, + end - start + 1, 1)) btrfs_finish_ordered_io(ordered); } btrfs_put_ordered_extent(ordered); if (!inode_evicting) { cached_state = NULL; - lock_extent_bits(tree, page_start, page_end, + lock_extent_bits(tree, start, end, &cached_state); } + + start = end + 1; + if (start < page_end) + goto again; } /* @@ -8727,15 +8869,28 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) loff_t size; int ret; int reserved = 0; + u64 reserved_space; u64 page_start; u64 page_end; + u64 end; + + reserved_space = PAGE_CACHE_SIZE; sb_start_pagefault(inode->i_sb); page_start = page_offset(page); page_end = page_start + PAGE_CACHE_SIZE - 1; + end = page_end; + /* + * Reserving delalloc space after obtaining the page lock can lead to + * deadlock. For example, if a dirty page is locked by this function + * and the call to btrfs_delalloc_reserve_space() ends up triggering + * dirty page write out, then the btrfs_writepage() function could + * end up waiting indefinitely to get a lock on the page currently + * being processed by btrfs_page_mkwrite() function. + */ ret = btrfs_delalloc_reserve_space(inode, page_start, - PAGE_CACHE_SIZE); + reserved_space); if (!ret) { ret = file_update_time(vma->vm_file); reserved = 1; @@ -8769,7 +8924,7 @@ again: * we can't set the delalloc bits if there are pending ordered * extents. Drop our locks and wait for them to finish */ - ordered = btrfs_lookup_ordered_extent(inode, page_start); + ordered = btrfs_lookup_ordered_range(inode, page_start, page_end); if (ordered) { unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); @@ -8779,6 +8934,18 @@ again: goto again; } + if (page->index == ((size - 1) >> PAGE_CACHE_SHIFT)) { + reserved_space = round_up(size - page_start, root->sectorsize); + if (reserved_space < PAGE_CACHE_SIZE) { + end = page_start + reserved_space - 1; + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); + btrfs_delalloc_release_space(inode, page_start, + PAGE_CACHE_SIZE - reserved_space); + } + } + /* * XXX - page_mkwrite gets called every time the page is dirtied, even * if it was already dirty, so for space accounting reasons we need to @@ -8786,12 +8953,12 @@ again: * is probably a better way to do this, but for now keep consistent with * prepare_pages in the normal write path. */ - clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, + clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); - ret = btrfs_set_extent_delalloc(inode, page_start, page_end, + ret = btrfs_set_extent_delalloc(inode, page_start, end, &cached_state); if (ret) { unlock_extent_cached(io_tree, page_start, page_end, @@ -8830,7 +8997,7 @@ out_unlock: } unlock_page(page); out: - btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE); + btrfs_delalloc_release_space(inode, page_start, reserved_space); out_noreserve: sb_end_pagefault(inode->i_sb); return ret; @@ -9156,16 +9323,11 @@ void btrfs_destroy_cachep(void) * destroy cache. */ rcu_barrier(); - if (btrfs_inode_cachep) - kmem_cache_destroy(btrfs_inode_cachep); - if (btrfs_trans_handle_cachep) - kmem_cache_destroy(btrfs_trans_handle_cachep); - if (btrfs_transaction_cachep) - kmem_cache_destroy(btrfs_transaction_cachep); - if (btrfs_path_cachep) - kmem_cache_destroy(btrfs_path_cachep); - if (btrfs_free_space_cachep) - kmem_cache_destroy(btrfs_free_space_cachep); + kmem_cache_destroy(btrfs_inode_cachep); + kmem_cache_destroy(btrfs_trans_handle_cachep); + kmem_cache_destroy(btrfs_transaction_cachep); + kmem_cache_destroy(btrfs_path_cachep); + kmem_cache_destroy(btrfs_free_space_cachep); } int btrfs_init_cachep(void) @@ -9216,7 +9378,6 @@ static int btrfs_getattr(struct vfsmount *mnt, generic_fillattr(inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; - stat->blksize = PAGE_CACHE_SIZE; spin_lock(&BTRFS_I(inode)->lock); delalloc_bytes = BTRFS_I(inode)->delalloc_bytes; @@ -9234,7 +9395,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = d_inode(new_dentry); struct inode *old_inode = d_inode(old_dentry); - struct timespec ctime = CURRENT_TIME; u64 index = 0; u64 root_objectid; int ret; @@ -9331,9 +9491,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); - old_dir->i_ctime = old_dir->i_mtime = ctime; - new_dir->i_ctime = new_dir->i_mtime = ctime; - old_inode->i_ctime = ctime; + old_dir->i_ctime = old_dir->i_mtime = + new_dir->i_ctime = new_dir->i_mtime = + old_inode->i_ctime = current_fs_time(old_dir->i_sb); if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); @@ -9358,7 +9518,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_inode) { inode_inc_iversion(new_inode); - new_inode->i_ctime = CURRENT_TIME; + new_inode->i_ctime = current_fs_time(new_inode->i_sb); if (unlikely(btrfs_ino(new_inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { root_objectid = BTRFS_I(new_inode)->location.objectid; @@ -9836,7 +9996,7 @@ next: *alloc_hint = ins.objectid + ins.offset; inode_inc_iversion(inode); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_fs_time(inode->i_sb); BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; if (!(mode & FALLOC_FL_KEEP_SIZE) && (actual_len > inode->i_size) && diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 952172ca7e45..053e677839fe 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -59,6 +59,8 @@ #include "props.h" #include "sysfs.h" #include "qgroup.h" +#include "tree-log.h" +#include "compression.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI @@ -347,7 +349,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) btrfs_update_iflags(inode); inode_inc_iversion(inode); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_fs_time(inode->i_sb); ret = btrfs_update_inode(trans, root, inode); btrfs_end_transaction(trans, root); @@ -443,7 +445,7 @@ static noinline int create_subvol(struct inode *dir, struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *new_root; struct btrfs_block_rsv block_rsv; - struct timespec cur_time = CURRENT_TIME; + struct timespec cur_time = current_fs_time(dir->i_sb); struct inode *inode; int ret; int err; @@ -844,10 +846,6 @@ static noinline int btrfs_mksubvol(struct path *parent, if (IS_ERR(dentry)) goto out_unlock; - error = -EEXIST; - if (d_really_is_positive(dentry)) - goto out_dput; - error = btrfs_may_create(dir, dentry); if (error) goto out_dput; @@ -2097,8 +2095,6 @@ static noinline int search_ioctl(struct inode *inode, key.offset = (u64)-1; root = btrfs_read_fs_root_no_name(info, &key); if (IS_ERR(root)) { - btrfs_err(info, "could not find root %llu", - sk->tree_id); btrfs_free_path(path); return -ENOENT; } @@ -2476,6 +2472,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; + btrfs_record_snapshot_destroy(trans, dir); + ret = btrfs_unlink_subvol(trans, root, dir, dest->root_key.objectid, dentry->d_name.name, @@ -2794,24 +2792,29 @@ out: static struct page *extent_same_get_page(struct inode *inode, pgoff_t index) { struct page *page; - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; page = grab_cache_page(inode->i_mapping, index); if (!page) - return NULL; + return ERR_PTR(-ENOMEM); if (!PageUptodate(page)) { - if (extent_read_full_page_nolock(tree, page, btrfs_get_extent, - 0)) - return NULL; + int ret; + + ret = btrfs_readpage(NULL, page); + if (ret) + return ERR_PTR(ret); lock_page(page); if (!PageUptodate(page)) { unlock_page(page); page_cache_release(page); - return NULL; + return ERR_PTR(-EIO); + } + if (page->mapping != inode->i_mapping) { + unlock_page(page); + page_cache_release(page); + return ERR_PTR(-EAGAIN); } } - unlock_page(page); return page; } @@ -2823,17 +2826,31 @@ static int gather_extent_pages(struct inode *inode, struct page **pages, pgoff_t index = off >> PAGE_CACHE_SHIFT; for (i = 0; i < num_pages; i++) { +again: pages[i] = extent_same_get_page(inode, index + i); - if (!pages[i]) - return -ENOMEM; + if (IS_ERR(pages[i])) { + int err = PTR_ERR(pages[i]); + + if (err == -EAGAIN) + goto again; + pages[i] = NULL; + return err; + } } return 0; } -static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) +static int lock_extent_range(struct inode *inode, u64 off, u64 len, + bool retry_range_locking) { - /* do any pending delalloc/csum calc on src, one way or - another, and lock file content */ + /* + * Do any pending delalloc/csum calculations on inode, one way or + * another, and lock file content. + * The locking order is: + * + * 1) pages + * 2) range in the inode's io tree + */ while (1) { struct btrfs_ordered_extent *ordered; lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); @@ -2851,8 +2868,11 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); if (ordered) btrfs_put_ordered_extent(ordered); + if (!retry_range_locking) + return -EAGAIN; btrfs_wait_ordered_range(inode, off, len); } + return 0; } static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) @@ -2877,15 +2897,24 @@ static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); } -static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) +static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len, + bool retry_range_locking) { + int ret; + if (inode1 < inode2) { swap(inode1, inode2); swap(loff1, loff2); } - lock_extent_range(inode1, loff1, len); - lock_extent_range(inode2, loff2, len); + ret = lock_extent_range(inode1, loff1, len, retry_range_locking); + if (ret) + return ret; + ret = lock_extent_range(inode2, loff2, len, retry_range_locking); + if (ret) + unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, + loff1 + len - 1); + return ret; } struct cmp_pages { @@ -2901,11 +2930,15 @@ static void btrfs_cmp_data_free(struct cmp_pages *cmp) for (i = 0; i < cmp->num_pages; i++) { pg = cmp->src_pages[i]; - if (pg) + if (pg) { + unlock_page(pg); page_cache_release(pg); + } pg = cmp->dst_pages[i]; - if (pg) + if (pg) { + unlock_page(pg); page_cache_release(pg); + } } kfree(cmp->src_pages); kfree(cmp->dst_pages); @@ -2925,8 +2958,8 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, * of the array is bounded by len, which is in turn bounded by * BTRFS_MAX_DEDUPE_LEN. */ - src_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS); - dst_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS); + src_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL); + dst_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL); if (!src_pgarr || !dst_pgarr) { kfree(src_pgarr); kfree(dst_pgarr); @@ -2966,6 +2999,8 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, src_page = cmp->src_pages[i]; dst_page = cmp->dst_pages[i]; + ASSERT(PageLocked(src_page)); + ASSERT(PageLocked(dst_page)); addr = kmap_atomic(src_page); dst_addr = kmap_atomic(dst_page); @@ -3031,6 +3066,9 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, ret = extent_same_check_offsets(src, loff, &len, olen); if (ret) goto out_unlock; + ret = extent_same_check_offsets(src, dst_loff, &len, olen); + if (ret) + goto out_unlock; /* * Single inode case wants the same checks, except we @@ -3078,14 +3116,46 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, goto out_unlock; } +again: ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp); if (ret) goto out_unlock; if (same_inode) - lock_extent_range(src, same_lock_start, same_lock_len); + ret = lock_extent_range(src, same_lock_start, same_lock_len, + false); else - btrfs_double_extent_lock(src, loff, dst, dst_loff, len); + ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len, + false); + /* + * If one of the inodes has dirty pages in the respective range or + * ordered extents, we need to flush dellaloc and wait for all ordered + * extents in the range. We must unlock the pages and the ranges in the + * io trees to avoid deadlocks when flushing delalloc (requires locking + * pages) and when waiting for ordered extents to complete (they require + * range locking). + */ + if (ret == -EAGAIN) { + /* + * Ranges in the io trees already unlocked. Now unlock all + * pages before waiting for all IO to complete. + */ + btrfs_cmp_data_free(&cmp); + if (same_inode) { + btrfs_wait_ordered_range(src, same_lock_start, + same_lock_len); + } else { + btrfs_wait_ordered_range(src, loff, len); + btrfs_wait_ordered_range(dst, dst_loff, len); + } + goto again; + } + ASSERT(ret == 0); + if (WARN_ON(ret)) { + /* ranges in the io trees already unlocked */ + btrfs_cmp_data_free(&cmp); + return ret; + } /* pass original length for comparison so we stay within i_size */ ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp); @@ -3148,7 +3218,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, inode_inc_iversion(inode); if (!no_time_update) - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); /* * We round up to the block size at eof when determining which * extents to clone above, but shouldn't round up the file size. @@ -3795,9 +3865,15 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, u64 lock_start = min_t(u64, off, destoff); u64 lock_len = max_t(u64, off, destoff) + len - lock_start; - lock_extent_range(src, lock_start, lock_len); + ret = lock_extent_range(src, lock_start, lock_len, true); } else { - btrfs_double_extent_lock(src, off, inode, destoff, len); + ret = btrfs_double_extent_lock(src, off, inode, destoff, len, + true); + } + ASSERT(ret == 0); + if (WARN_ON(ret)) { + /* ranges in the io trees already unlocked */ + goto out_unlock; } ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); @@ -3814,8 +3890,9 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, * Truncate page cache pages so that future reads will see the cloned * data immediately and not the previous data. */ - truncate_inode_pages_range(&inode->i_data, destoff, - PAGE_CACHE_ALIGN(destoff + len) - 1); + truncate_inode_pages_range(&inode->i_data, + round_down(destoff, PAGE_CACHE_SIZE), + round_up(destoff + len, PAGE_CACHE_SIZE) - 1); out_unlock: if (!same_inode) btrfs_double_inode_unlock(src, inode); @@ -4956,7 +5033,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root_item *root_item = &root->root_item; struct btrfs_trans_handle *trans; - struct timespec ct = CURRENT_TIME; + struct timespec ct = current_fs_time(inode->i_sb); int ret = 0; int received_uuid_changed; @@ -5187,8 +5264,7 @@ out_unlock: .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \ .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix } -static int btrfs_ioctl_get_supported_features(struct file *file, - void __user *arg) +int btrfs_ioctl_get_supported_features(void __user *arg) { static const struct btrfs_ioctl_feature_flags features[3] = { INIT_FEATURE_FLAGS(SUPP), @@ -5467,7 +5543,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SET_FSLABEL: return btrfs_ioctl_set_fslabel(file, argp); case BTRFS_IOC_GET_SUPPORTED_FEATURES: - return btrfs_ioctl_get_supported_features(file, argp); + return btrfs_ioctl_get_supported_features(argp); case BTRFS_IOC_GET_FEATURES: return btrfs_ioctl_get_features(file, argp); case BTRFS_IOC_SET_FEATURES: diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 8c27292ea9ea..0de7da5a610d 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -25,6 +25,7 @@ #include "btrfs_inode.h" #include "extent_io.h" #include "disk-io.h" +#include "compression.h" static struct kmem_cache *btrfs_ordered_extent_cache; @@ -1009,7 +1010,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, for (; node; node = rb_prev(node)) { test = rb_entry(node, struct btrfs_ordered_extent, rb_node); - /* We treat this entry as if it doesnt exist */ + /* We treat this entry as if it doesn't exist */ if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags)) continue; if (test->file_offset + test->len <= disk_i_size) @@ -1114,6 +1115,5 @@ int __init ordered_data_init(void) void ordered_data_exit(void) { - if (btrfs_ordered_extent_cache) - kmem_cache_destroy(btrfs_ordered_extent_cache); + kmem_cache_destroy(btrfs_ordered_extent_cache); } diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 647ab12fdf5d..147dc6ca5de1 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -295,8 +295,27 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) btrfs_dev_extent_chunk_offset(l, dev_extent), btrfs_dev_extent_length(l, dev_extent)); break; - case BTRFS_DEV_STATS_KEY: - printk(KERN_INFO "\t\tdevice stats\n"); + case BTRFS_PERSISTENT_ITEM_KEY: + printk(KERN_INFO "\t\tpersistent item objectid %llu offset %llu\n", + key.objectid, key.offset); + switch (key.objectid) { + case BTRFS_DEV_STATS_OBJECTID: + printk(KERN_INFO "\t\tdevice stats\n"); + break; + default: + printk(KERN_INFO "\t\tunknown persistent item\n"); + } + break; + case BTRFS_TEMPORARY_ITEM_KEY: + printk(KERN_INFO "\t\ttemporary item objectid %llu offset %llu\n", + key.objectid, key.offset); + switch (key.objectid) { + case BTRFS_BALANCE_OBJECTID: + printk(KERN_INFO "\t\tbalance status\n"); + break; + default: + printk(KERN_INFO "\t\tunknown temporary item\n"); + } break; case BTRFS_DEV_REPLACE_KEY: printk(KERN_INFO "\t\tdev replace\n"); diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index f9e60231f685..36992128c746 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -22,6 +22,7 @@ #include "hash.h" #include "transaction.h" #include "xattr.h" +#include "compression.h" #define BTRFS_PROP_HANDLERS_HT_BITS 8 static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 619f92963e27..b892914968c1 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -72,7 +72,7 @@ struct reada_extent { spinlock_t lock; struct reada_zone *zones[BTRFS_MAX_MIRRORS]; int nzones; - struct btrfs_device *scheduled_for; + int scheduled; }; struct reada_zone { @@ -101,67 +101,53 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info); static void __reada_start_machine(struct btrfs_fs_info *fs_info); static int reada_add_block(struct reada_control *rc, u64 logical, - struct btrfs_key *top, int level, u64 generation); + struct btrfs_key *top, u64 generation); /* recurses */ /* in case of err, eb might be NULL */ -static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, - u64 start, int err) +static void __readahead_hook(struct btrfs_fs_info *fs_info, + struct reada_extent *re, struct extent_buffer *eb, + u64 start, int err) { int level = 0; int nritems; int i; u64 bytenr; u64 generation; - struct reada_extent *re; - struct btrfs_fs_info *fs_info = root->fs_info; struct list_head list; - unsigned long index = start >> PAGE_CACHE_SHIFT; - struct btrfs_device *for_dev; if (eb) level = btrfs_header_level(eb); - /* find extent */ - spin_lock(&fs_info->reada_lock); - re = radix_tree_lookup(&fs_info->reada_tree, index); - if (re) - re->refcnt++; - spin_unlock(&fs_info->reada_lock); - - if (!re) - return -1; - spin_lock(&re->lock); /* * just take the full list from the extent. afterwards we * don't need the lock anymore */ list_replace_init(&re->extctl, &list); - for_dev = re->scheduled_for; - re->scheduled_for = NULL; + re->scheduled = 0; spin_unlock(&re->lock); - if (err == 0) { - nritems = level ? btrfs_header_nritems(eb) : 0; - generation = btrfs_header_generation(eb); - /* - * FIXME: currently we just set nritems to 0 if this is a leaf, - * effectively ignoring the content. In a next step we could - * trigger more readahead depending from the content, e.g. - * fetch the checksums for the extents in the leaf. - */ - } else { - /* - * this is the error case, the extent buffer has not been - * read correctly. We won't access anything from it and - * just cleanup our data structures. Effectively this will - * cut the branch below this node from read ahead. - */ - nritems = 0; - generation = 0; - } + /* + * this is the error case, the extent buffer has not been + * read correctly. We won't access anything from it and + * just cleanup our data structures. Effectively this will + * cut the branch below this node from read ahead. + */ + if (err) + goto cleanup; + /* + * FIXME: currently we just set nritems to 0 if this is a leaf, + * effectively ignoring the content. In a next step we could + * trigger more readahead depending from the content, e.g. + * fetch the checksums for the extents in the leaf. + */ + if (!level) + goto cleanup; + + nritems = btrfs_header_nritems(eb); + generation = btrfs_header_generation(eb); for (i = 0; i < nritems; i++) { struct reada_extctl *rec; u64 n_gen; @@ -188,19 +174,20 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, */ #ifdef DEBUG if (rec->generation != generation) { - btrfs_debug(root->fs_info, - "generation mismatch for (%llu,%d,%llu) %llu != %llu", - key.objectid, key.type, key.offset, - rec->generation, generation); + btrfs_debug(fs_info, + "generation mismatch for (%llu,%d,%llu) %llu != %llu", + key.objectid, key.type, key.offset, + rec->generation, generation); } #endif if (rec->generation == generation && btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 && btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0) - reada_add_block(rc, bytenr, &next_key, - level - 1, n_gen); + reada_add_block(rc, bytenr, &next_key, n_gen); } } + +cleanup: /* * free extctl records */ @@ -222,26 +209,37 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, reada_extent_put(fs_info, re); /* one ref for each entry */ } - reada_extent_put(fs_info, re); /* our ref */ - if (for_dev) - atomic_dec(&for_dev->reada_in_flight); - return 0; + return; } /* * start is passed separately in case eb in NULL, which may be the case with * failed I/O */ -int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, - u64 start, int err) +int btree_readahead_hook(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, u64 start, int err) { - int ret; + int ret = 0; + struct reada_extent *re; - ret = __readahead_hook(root, eb, start, err); + /* find extent */ + spin_lock(&fs_info->reada_lock); + re = radix_tree_lookup(&fs_info->reada_tree, + start >> PAGE_CACHE_SHIFT); + if (re) + re->refcnt++; + spin_unlock(&fs_info->reada_lock); + if (!re) { + ret = -1; + goto start_machine; + } - reada_start_machine(root->fs_info); + __readahead_hook(fs_info, re, eb, start, err); + reada_extent_put(fs_info, re); /* our ref */ +start_machine: + reada_start_machine(fs_info); return ret; } @@ -260,18 +258,14 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->reada_lock); ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, logical >> PAGE_CACHE_SHIFT, 1); - if (ret == 1) + if (ret == 1 && logical >= zone->start && logical <= zone->end) { kref_get(&zone->refcnt); - spin_unlock(&fs_info->reada_lock); - - if (ret == 1) { - if (logical >= zone->start && logical < zone->end) - return zone; - spin_lock(&fs_info->reada_lock); - kref_put(&zone->refcnt, reada_zone_release); spin_unlock(&fs_info->reada_lock); + return zone; } + spin_unlock(&fs_info->reada_lock); + cache = btrfs_lookup_block_group(fs_info, logical); if (!cache) return NULL; @@ -280,7 +274,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, end = start + cache->key.offset - 1; btrfs_put_block_group(cache); - zone = kzalloc(sizeof(*zone), GFP_NOFS); + zone = kzalloc(sizeof(*zone), GFP_KERNEL); if (!zone) return NULL; @@ -307,8 +301,10 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, kfree(zone); ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, logical >> PAGE_CACHE_SHIFT, 1); - if (ret == 1) + if (ret == 1 && logical >= zone->start && logical <= zone->end) kref_get(&zone->refcnt); + else + zone = NULL; } spin_unlock(&fs_info->reada_lock); @@ -317,7 +313,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, static struct reada_extent *reada_find_extent(struct btrfs_root *root, u64 logical, - struct btrfs_key *top, int level) + struct btrfs_key *top) { int ret; struct reada_extent *re = NULL; @@ -330,9 +326,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, u64 length; int real_stripes; int nzones = 0; - int i; unsigned long index = logical >> PAGE_CACHE_SHIFT; int dev_replace_is_ongoing; + int have_zone = 0; spin_lock(&fs_info->reada_lock); re = radix_tree_lookup(&fs_info->reada_tree, index); @@ -343,7 +339,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, if (re) return re; - re = kzalloc(sizeof(*re), GFP_NOFS); + re = kzalloc(sizeof(*re), GFP_KERNEL); if (!re) return NULL; @@ -375,11 +371,16 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, struct reada_zone *zone; dev = bbio->stripes[nzones].dev; + + /* cannot read ahead on missing device. */ + if (!dev->bdev) + continue; + zone = reada_find_zone(fs_info, dev, logical, bbio); if (!zone) - break; + continue; - re->zones[nzones] = zone; + re->zones[re->nzones++] = zone; spin_lock(&zone->lock); if (!zone->elems) kref_get(&zone->refcnt); @@ -389,14 +390,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, kref_put(&zone->refcnt, reada_zone_release); spin_unlock(&fs_info->reada_lock); } - re->nzones = nzones; - if (nzones == 0) { + if (re->nzones == 0) { /* not a single zone found, error and out */ goto error; } /* insert extent in reada_tree + all per-device trees, all or nothing */ - btrfs_dev_replace_lock(&fs_info->dev_replace); + btrfs_dev_replace_lock(&fs_info->dev_replace, 0); spin_lock(&fs_info->reada_lock); ret = radix_tree_insert(&fs_info->reada_tree, index, re); if (ret == -EEXIST) { @@ -404,19 +404,20 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, BUG_ON(!re_exist); re_exist->refcnt++; spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_unlock(&fs_info->dev_replace); + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); goto error; } if (ret) { spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_unlock(&fs_info->dev_replace); + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); goto error; } prev_dev = NULL; dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing( &fs_info->dev_replace); - for (i = 0; i < nzones; ++i) { - dev = bbio->stripes[i].dev; + for (nzones = 0; nzones < re->nzones; ++nzones) { + dev = re->zones[nzones]->device; + if (dev == prev_dev) { /* * in case of DUP, just add the first zone. As both @@ -427,15 +428,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, */ continue; } - if (!dev->bdev) { - /* - * cannot read ahead on missing device, but for RAID5/6, - * REQ_GET_READ_MIRRORS return 1. So don't skip missing - * device for such case. - */ - if (nzones > 1) - continue; - } + if (!dev->bdev) + continue; + if (dev_replace_is_ongoing && dev == fs_info->dev_replace.tgtdev) { /* @@ -447,8 +442,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, prev_dev = dev; ret = radix_tree_insert(&dev->reada_extents, index, re); if (ret) { - while (--i >= 0) { - dev = bbio->stripes[i].dev; + while (--nzones >= 0) { + dev = re->zones[nzones]->device; BUG_ON(dev == NULL); /* ignore whether the entry was inserted */ radix_tree_delete(&dev->reada_extents, index); @@ -456,21 +451,24 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, BUG_ON(fs_info == NULL); radix_tree_delete(&fs_info->reada_tree, index); spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_unlock(&fs_info->dev_replace); + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); goto error; } + have_zone = 1; } spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_unlock(&fs_info->dev_replace); + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + + if (!have_zone) + goto error; btrfs_put_bbio(bbio); return re; error: - while (nzones) { + for (nzones = 0; nzones < re->nzones; ++nzones) { struct reada_zone *zone; - --nzones; zone = re->zones[nzones]; kref_get(&zone->refcnt); spin_lock(&zone->lock); @@ -531,8 +529,6 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info, kref_put(&zone->refcnt, reada_zone_release); spin_unlock(&fs_info->reada_lock); } - if (re->scheduled_for) - atomic_dec(&re->scheduled_for->reada_in_flight); kfree(re); } @@ -556,17 +552,17 @@ static void reada_control_release(struct kref *kref) } static int reada_add_block(struct reada_control *rc, u64 logical, - struct btrfs_key *top, int level, u64 generation) + struct btrfs_key *top, u64 generation) { struct btrfs_root *root = rc->root; struct reada_extent *re; struct reada_extctl *rec; - re = reada_find_extent(root, logical, top, level); /* takes one ref */ + re = reada_find_extent(root, logical, top); /* takes one ref */ if (!re) return -1; - rec = kzalloc(sizeof(*rec), GFP_NOFS); + rec = kzalloc(sizeof(*rec), GFP_KERNEL); if (!rec) { reada_extent_put(root->fs_info, re); return -ENOMEM; @@ -662,7 +658,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, u64 logical; int ret; int i; - int need_kick = 0; spin_lock(&fs_info->reada_lock); if (dev->reada_curr_zone == NULL) { @@ -679,7 +674,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, */ ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, dev->reada_next >> PAGE_CACHE_SHIFT, 1); - if (ret == 0 || re->logical >= dev->reada_curr_zone->end) { + if (ret == 0 || re->logical > dev->reada_curr_zone->end) { ret = reada_pick_zone(dev); if (!ret) { spin_unlock(&fs_info->reada_lock); @@ -698,6 +693,15 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, spin_unlock(&fs_info->reada_lock); + spin_lock(&re->lock); + if (re->scheduled || list_empty(&re->extctl)) { + spin_unlock(&re->lock); + reada_extent_put(fs_info, re); + return 0; + } + re->scheduled = 1; + spin_unlock(&re->lock); + /* * find mirror num */ @@ -709,29 +713,20 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, } logical = re->logical; - spin_lock(&re->lock); - if (re->scheduled_for == NULL) { - re->scheduled_for = dev; - need_kick = 1; - } - spin_unlock(&re->lock); - - reada_extent_put(fs_info, re); - - if (!need_kick) - return 0; - atomic_inc(&dev->reada_in_flight); ret = reada_tree_block_flagged(fs_info->extent_root, logical, mirror_num, &eb); if (ret) - __readahead_hook(fs_info->extent_root, NULL, logical, ret); + __readahead_hook(fs_info, re, NULL, logical, ret); else if (eb) - __readahead_hook(fs_info->extent_root, eb, eb->start, ret); + __readahead_hook(fs_info, re, eb, eb->start, ret); if (eb) free_extent_buffer(eb); + atomic_dec(&dev->reada_in_flight); + reada_extent_put(fs_info, re); + return 1; } @@ -752,6 +747,8 @@ static void reada_start_machine_worker(struct btrfs_work *work) set_task_ioprio(current, BTRFS_IOPRIO_READA); __reada_start_machine(fs_info); set_task_ioprio(current, old_ioprio); + + atomic_dec(&fs_info->reada_works_cnt); } static void __reada_start_machine(struct btrfs_fs_info *fs_info) @@ -783,15 +780,19 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info) * enqueue to workers to finish it. This will distribute the load to * the cores. */ - for (i = 0; i < 2; ++i) + for (i = 0; i < 2; ++i) { reada_start_machine(fs_info); + if (atomic_read(&fs_info->reada_works_cnt) > + BTRFS_MAX_MIRRORS * 2) + break; + } } static void reada_start_machine(struct btrfs_fs_info *fs_info) { struct reada_machine_work *rmw; - rmw = kzalloc(sizeof(*rmw), GFP_NOFS); + rmw = kzalloc(sizeof(*rmw), GFP_KERNEL); if (!rmw) { /* FIXME we cannot handle this properly right now */ BUG(); @@ -801,6 +802,7 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info) rmw->fs_info = fs_info; btrfs_queue_work(fs_info->readahead_workers, &rmw->work); + atomic_inc(&fs_info->reada_works_cnt); } #ifdef DEBUG @@ -848,10 +850,9 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) if (ret == 0) break; printk(KERN_DEBUG - " re: logical %llu size %u empty %d for %lld", + " re: logical %llu size %u empty %d scheduled %d", re->logical, fs_info->tree_root->nodesize, - list_empty(&re->extctl), re->scheduled_for ? - re->scheduled_for->devid : -1); + list_empty(&re->extctl), re->scheduled); for (i = 0; i < re->nzones; ++i) { printk(KERN_CONT " zone %llu-%llu devs", @@ -878,27 +879,21 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) index, 1); if (ret == 0) break; - if (!re->scheduled_for) { + if (!re->scheduled) { index = (re->logical >> PAGE_CACHE_SHIFT) + 1; continue; } printk(KERN_DEBUG - "re: logical %llu size %u list empty %d for %lld", + "re: logical %llu size %u list empty %d scheduled %d", re->logical, fs_info->tree_root->nodesize, - list_empty(&re->extctl), - re->scheduled_for ? re->scheduled_for->devid : -1); + list_empty(&re->extctl), re->scheduled); for (i = 0; i < re->nzones; ++i) { printk(KERN_CONT " zone %llu-%llu devs", re->zones[i]->start, re->zones[i]->end); - for (i = 0; i < re->nzones; ++i) { - printk(KERN_CONT " zone %llu-%llu devs", - re->zones[i]->start, - re->zones[i]->end); - for (j = 0; j < re->zones[i]->ndevs; ++j) { - printk(KERN_CONT " %lld", - re->zones[i]->devs[j]->devid); - } + for (j = 0; j < re->zones[i]->ndevs; ++j) { + printk(KERN_CONT " %lld", + re->zones[i]->devs[j]->devid); } } printk(KERN_CONT "\n"); @@ -917,7 +912,6 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, struct reada_control *rc; u64 start; u64 generation; - int level; int ret; struct extent_buffer *node; static struct btrfs_key max_key = { @@ -926,7 +920,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, .offset = (u64)-1 }; - rc = kzalloc(sizeof(*rc), GFP_NOFS); + rc = kzalloc(sizeof(*rc), GFP_KERNEL); if (!rc) return ERR_PTR(-ENOMEM); @@ -940,11 +934,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, node = btrfs_root_node(root); start = node->start; - level = btrfs_header_level(node); generation = btrfs_header_generation(node); free_extent_buffer(node); - ret = reada_add_block(rc, start, &max_key, level, generation); + ret = reada_add_block(rc, start, &max_key, generation); if (ret) { kfree(rc); return ERR_PTR(ret); @@ -959,8 +952,11 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, int btrfs_reada_wait(void *handle) { struct reada_control *rc = handle; + struct btrfs_fs_info *fs_info = rc->root->fs_info; while (atomic_read(&rc->elems)) { + if (!atomic_read(&fs_info->reada_works_cnt)) + reada_start_machine(fs_info); wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, 5 * HZ); dump_devs(rc->root->fs_info, @@ -977,9 +973,13 @@ int btrfs_reada_wait(void *handle) int btrfs_reada_wait(void *handle) { struct reada_control *rc = handle; + struct btrfs_fs_info *fs_info = rc->root->fs_info; while (atomic_read(&rc->elems)) { - wait_event(rc->wait, atomic_read(&rc->elems) == 0); + if (!atomic_read(&fs_info->reada_works_cnt)) + reada_start_machine(fs_info); + wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, + (HZ + 9) / 10); } kref_put(&rc->refcnt, reada_control_release); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index fd1c4d982463..2bd0011450df 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -575,7 +575,8 @@ static int is_cowonly_root(u64 root_objectid) root_objectid == BTRFS_TREE_LOG_OBJECTID || root_objectid == BTRFS_CSUM_TREE_OBJECTID || root_objectid == BTRFS_UUID_TREE_OBJECTID || - root_objectid == BTRFS_QUOTA_TREE_OBJECTID) + root_objectid == BTRFS_QUOTA_TREE_OBJECTID || + root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) return 1; return 0; } diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 7cf8509deda7..9fcd6dfc3266 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -310,8 +310,16 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); err = btrfs_insert_fs_root(root->fs_info, root); + /* + * The root might have been inserted already, as before we look + * for orphan roots, log replay might have happened, which + * triggers a transaction commit and qgroup accounting, which + * in turn reads and inserts fs roots while doing backref + * walking. + */ + if (err == -EEXIST) + err = 0; if (err) { - BUG_ON(err == -EEXIST); btrfs_free_fs_root(root); break; } @@ -488,7 +496,7 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_root_item *item = &root->root_item; - struct timespec ct = CURRENT_TIME; + struct timespec ct = current_fs_time(root->fs_info->sb); spin_lock(&root->root_item_lock); btrfs_set_root_ctransid(item, trans->transid); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 92bf5ee732fb..39dbdcbf4d13 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -461,7 +461,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; int ret; - sctx = kzalloc(sizeof(*sctx), GFP_NOFS); + sctx = kzalloc(sizeof(*sctx), GFP_KERNEL); if (!sctx) goto nomem; atomic_set(&sctx->refs, 1); @@ -472,7 +472,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { struct scrub_bio *sbio; - sbio = kzalloc(sizeof(*sbio), GFP_NOFS); + sbio = kzalloc(sizeof(*sbio), GFP_KERNEL); if (!sbio) goto nomem; sctx->bios[i] = sbio; @@ -611,7 +611,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) u64 flags = 0; u64 ref_root; u32 item_size; - u8 ref_level; + u8 ref_level = 0; int ret; WARN_ON(sblock->page_count < 1); @@ -1654,7 +1654,7 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, again: if (!wr_ctx->wr_curr_bio) { wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio), - GFP_NOFS); + GFP_KERNEL); if (!wr_ctx->wr_curr_bio) { mutex_unlock(&wr_ctx->wr_lock); return -ENOMEM; @@ -1671,7 +1671,8 @@ again: sbio->dev = wr_ctx->tgtdev; bio = sbio->bio; if (!bio) { - bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio); + bio = btrfs_io_bio_alloc(GFP_KERNEL, + wr_ctx->pages_per_wr_bio); if (!bio) { mutex_unlock(&wr_ctx->wr_lock); return -ENOMEM; @@ -2076,7 +2077,8 @@ again: sbio->dev = spage->dev; bio = sbio->bio; if (!bio) { - bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio); + bio = btrfs_io_bio_alloc(GFP_KERNEL, + sctx->pages_per_rd_bio); if (!bio) return -ENOMEM; sbio->bio = bio; @@ -2241,7 +2243,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, struct scrub_block *sblock; int index; - sblock = kzalloc(sizeof(*sblock), GFP_NOFS); + sblock = kzalloc(sizeof(*sblock), GFP_KERNEL); if (!sblock) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2259,7 +2261,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, struct scrub_page *spage; u64 l = min_t(u64, len, PAGE_SIZE); - spage = kzalloc(sizeof(*spage), GFP_NOFS); + spage = kzalloc(sizeof(*spage), GFP_KERNEL); if (!spage) { leave_nomem: spin_lock(&sctx->stat_lock); @@ -2286,7 +2288,7 @@ leave_nomem: spage->have_csum = 0; } sblock->page_count++; - spage->page = alloc_page(GFP_NOFS); + spage->page = alloc_page(GFP_KERNEL); if (!spage->page) goto leave_nomem; len -= l; @@ -2541,7 +2543,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity, struct scrub_block *sblock; int index; - sblock = kzalloc(sizeof(*sblock), GFP_NOFS); + sblock = kzalloc(sizeof(*sblock), GFP_KERNEL); if (!sblock) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2561,7 +2563,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity, struct scrub_page *spage; u64 l = min_t(u64, len, PAGE_SIZE); - spage = kzalloc(sizeof(*spage), GFP_NOFS); + spage = kzalloc(sizeof(*spage), GFP_KERNEL); if (!spage) { leave_nomem: spin_lock(&sctx->stat_lock); @@ -2591,7 +2593,7 @@ leave_nomem: spage->have_csum = 0; } sblock->page_count++; - spage->page = alloc_page(GFP_NOFS); + spage->page = alloc_page(GFP_KERNEL); if (!spage->page) goto leave_nomem; len -= l; @@ -3857,16 +3859,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return -EIO; } - btrfs_dev_replace_lock(&fs_info->dev_replace); + btrfs_dev_replace_lock(&fs_info->dev_replace, 0); if (dev->scrub_device || (!is_dev_replace && btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { - btrfs_dev_replace_unlock(&fs_info->dev_replace); + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); return -EINPROGRESS; } - btrfs_dev_replace_unlock(&fs_info->dev_replace); + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); ret = scrub_workers_get(fs_info, is_dev_replace); if (ret) { diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 63a6152be04b..19b7bf4284ee 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -34,6 +34,7 @@ #include "disk-io.h" #include "btrfs_inode.h" #include "transaction.h" +#include "compression.h" static int g_verbose = 0; @@ -304,7 +305,7 @@ static struct fs_path *fs_path_alloc(void) { struct fs_path *p; - p = kmalloc(sizeof(*p), GFP_NOFS); + p = kmalloc(sizeof(*p), GFP_KERNEL); if (!p) return NULL; p->reversed = 0; @@ -363,11 +364,11 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) * First time the inline_buf does not suffice */ if (p->buf == p->inline_buf) { - tmp_buf = kmalloc(len, GFP_NOFS); + tmp_buf = kmalloc(len, GFP_KERNEL); if (tmp_buf) memcpy(tmp_buf, p->buf, old_buf_len); } else { - tmp_buf = krealloc(p->buf, len, GFP_NOFS); + tmp_buf = krealloc(p->buf, len, GFP_KERNEL); } if (!tmp_buf) return -ENOMEM; @@ -995,7 +996,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, * values are small. */ buf_len = PATH_MAX; - buf = kmalloc(buf_len, GFP_NOFS); + buf = kmalloc(buf_len, GFP_KERNEL); if (!buf) { ret = -ENOMEM; goto out; @@ -1042,7 +1043,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, buf = NULL; } else { char *tmp = krealloc(buf, buf_len, - GFP_NOFS | __GFP_NOWARN); + GFP_KERNEL | __GFP_NOWARN); if (!tmp) kfree(buf); @@ -1303,7 +1304,7 @@ static int find_extent_clone(struct send_ctx *sctx, /* We only use this path under the commit sem */ tmp_path->need_commit_sem = 0; - backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS); + backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_KERNEL); if (!backref_ctx) { ret = -ENOMEM; goto out; @@ -1984,7 +1985,7 @@ static int name_cache_insert(struct send_ctx *sctx, nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)nce->ino); if (!nce_head) { - nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS); + nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL); if (!nce_head) { kfree(nce); return -ENOMEM; @@ -2179,7 +2180,7 @@ out_cache: /* * Store the result of the lookup in the name cache. */ - nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS); + nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_KERNEL); if (!nce) { ret = -ENOMEM; goto out; @@ -2315,7 +2316,7 @@ static int send_subvol_begin(struct send_ctx *sctx) if (!path) return -ENOMEM; - name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_NOFS); + name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL); if (!name) { btrfs_free_path(path); return -ENOMEM; @@ -2730,7 +2731,7 @@ static int __record_ref(struct list_head *head, u64 dir, { struct recorded_ref *ref; - ref = kmalloc(sizeof(*ref), GFP_NOFS); + ref = kmalloc(sizeof(*ref), GFP_KERNEL); if (!ref) return -ENOMEM; @@ -2755,7 +2756,7 @@ static int dup_ref(struct recorded_ref *ref, struct list_head *list) { struct recorded_ref *new; - new = kmalloc(sizeof(*ref), GFP_NOFS); + new = kmalloc(sizeof(*ref), GFP_KERNEL); if (!new) return -ENOMEM; @@ -2818,7 +2819,7 @@ add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) struct rb_node *parent = NULL; struct orphan_dir_info *entry, *odi; - odi = kmalloc(sizeof(*odi), GFP_NOFS); + odi = kmalloc(sizeof(*odi), GFP_KERNEL); if (!odi) return ERR_PTR(-ENOMEM); odi->ino = dir_ino; @@ -2973,7 +2974,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized) struct rb_node *parent = NULL; struct waiting_dir_move *entry, *dm; - dm = kmalloc(sizeof(*dm), GFP_NOFS); + dm = kmalloc(sizeof(*dm), GFP_KERNEL); if (!dm) return -ENOMEM; dm->ino = ino; @@ -3040,7 +3041,7 @@ static int add_pending_dir_move(struct send_ctx *sctx, int exists = 0; int ret; - pm = kmalloc(sizeof(*pm), GFP_NOFS); + pm = kmalloc(sizeof(*pm), GFP_KERNEL); if (!pm) return -ENOMEM; pm->parent_ino = parent_ino; @@ -4280,7 +4281,7 @@ static int __find_xattr(int num, struct btrfs_key *di_key, strncmp(name, ctx->name, name_len) == 0) { ctx->found_idx = num; ctx->found_data_len = data_len; - ctx->found_data = kmemdup(data, data_len, GFP_NOFS); + ctx->found_data = kmemdup(data, data_len, GFP_KERNEL); if (!ctx->found_data) return -ENOMEM; return 1; @@ -4481,7 +4482,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) while (index <= last_index) { unsigned cur_len = min_t(unsigned, len, PAGE_CACHE_SIZE - pg_offset); - page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL); if (!page) { ret = -ENOMEM; break; @@ -5989,7 +5990,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; } - sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS); + sctx = kzalloc(sizeof(struct send_ctx), GFP_KERNEL); if (!sctx) { ret = -ENOMEM; goto out; @@ -5997,7 +5998,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) INIT_LIST_HEAD(&sctx->new_refs); INIT_LIST_HEAD(&sctx->deleted_refs); - INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS); + INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL); INIT_LIST_HEAD(&sctx->name_cache_list); sctx->flags = arg->flags; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d41e09fe8e38..00b8f37cc306 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -303,7 +303,8 @@ enum { Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard, Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow, - Opt_datasum, Opt_treelog, Opt_noinode_cache, + Opt_datasum, Opt_treelog, Opt_noinode_cache, Opt_usebackuproot, + Opt_nologreplay, Opt_norecovery, #ifdef CONFIG_BTRFS_DEBUG Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, #endif @@ -335,6 +336,8 @@ static const match_table_t tokens = { {Opt_noacl, "noacl"}, {Opt_notreelog, "notreelog"}, {Opt_treelog, "treelog"}, + {Opt_nologreplay, "nologreplay"}, + {Opt_norecovery, "norecovery"}, {Opt_flushoncommit, "flushoncommit"}, {Opt_noflushoncommit, "noflushoncommit"}, {Opt_ratio, "metadata_ratio=%d"}, @@ -352,7 +355,8 @@ static const match_table_t tokens = { {Opt_inode_cache, "inode_cache"}, {Opt_noinode_cache, "noinode_cache"}, {Opt_no_space_cache, "nospace_cache"}, - {Opt_recovery, "recovery"}, + {Opt_recovery, "recovery"}, /* deprecated */ + {Opt_usebackuproot, "usebackuproot"}, {Opt_skip_balance, "skip_balance"}, {Opt_check_integrity, "check_int"}, {Opt_check_integrity_including_extent_data, "check_int_data"}, @@ -373,7 +377,8 @@ static const match_table_t tokens = { * reading in a new superblock is parsed here. * XXX JDM: This needs to be cleaned up for remount. */ -int btrfs_parse_options(struct btrfs_root *root, char *options) +int btrfs_parse_options(struct btrfs_root *root, char *options, + unsigned long new_flags) { struct btrfs_fs_info *info = root->fs_info; substring_t args[MAX_OPT_ARGS]; @@ -393,8 +398,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) else if (cache_gen) btrfs_set_opt(info->mount_opt, SPACE_CACHE); + /* + * Even the options are empty, we still need to do extra check + * against new flags + */ if (!options) - goto out; + goto check; /* * strsep changes the string, duplicate it because parse_options @@ -606,6 +615,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_clear_and_info(root, NOTREELOG, "enabling tree log"); break; + case Opt_norecovery: + case Opt_nologreplay: + btrfs_set_and_info(root, NOLOGREPLAY, + "disabling log replay at mount time"); + break; case Opt_flushoncommit: btrfs_set_and_info(root, FLUSHONCOMMIT, "turning on flush-on-commit"); @@ -696,8 +710,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) "disabling auto defrag"); break; case Opt_recovery: - btrfs_info(root->fs_info, "enabling auto recovery"); - btrfs_set_opt(info->mount_opt, RECOVERY); + btrfs_warn(root->fs_info, + "'recovery' is deprecated, use 'usebackuproot' instead"); + case Opt_usebackuproot: + btrfs_info(root->fs_info, + "trying to use backup root at mount time"); + btrfs_set_opt(info->mount_opt, USEBACKUPROOT); break; case Opt_skip_balance: btrfs_set_opt(info->mount_opt, SKIP_BALANCE); @@ -792,6 +810,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) break; } } +check: + /* + * Extra check for current option against current flag + */ + if (btrfs_test_opt(root, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) { + btrfs_err(root->fs_info, + "nologreplay must be used with ro mount option"); + ret = -EINVAL; + } out: if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) && !btrfs_test_opt(root, FREE_SPACE_TREE) && @@ -1202,6 +1229,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",ssd"); if (btrfs_test_opt(root, NOTREELOG)) seq_puts(seq, ",notreelog"); + if (btrfs_test_opt(root, NOLOGREPLAY)) + seq_puts(seq, ",nologreplay"); if (btrfs_test_opt(root, FLUSHONCOMMIT)) seq_puts(seq, ",flushoncommit"); if (btrfs_test_opt(root, DISCARD)) @@ -1228,8 +1257,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",inode_cache"); if (btrfs_test_opt(root, SKIP_BALANCE)) seq_puts(seq, ",skip_balance"); - if (btrfs_test_opt(root, RECOVERY)) - seq_puts(seq, ",recovery"); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA)) seq_puts(seq, ",check_int_data"); @@ -1685,7 +1712,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } } - ret = btrfs_parse_options(root, data); + ret = btrfs_parse_options(root, data, *flags); if (ret) { ret = -EINVAL; goto restore; @@ -2163,6 +2190,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, break; ret = !(fs_devices->num_devices == fs_devices->total_devices); break; + case BTRFS_IOC_GET_SUPPORTED_FEATURES: + ret = btrfs_ioctl_get_supported_features((void __user*)arg); + break; } kfree(vol); @@ -2261,7 +2291,7 @@ static void btrfs_interface_exit(void) misc_deregister(&btrfs_misc); } -static void btrfs_print_info(void) +static void btrfs_print_mod_info(void) { printk(KERN_INFO "Btrfs loaded" #ifdef CONFIG_BTRFS_DEBUG @@ -2363,7 +2393,7 @@ static int __init init_btrfs_fs(void) btrfs_init_lockdep(); - btrfs_print_info(); + btrfs_print_mod_info(); err = btrfs_run_sanity_tests(); if (err) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index e0ac85949067..539e7b5e3f86 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -202,6 +202,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); +BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(mixed_backref), @@ -213,6 +214,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(raid56), BTRFS_FEAT_ATTR_PTR(skinny_metadata), BTRFS_FEAT_ATTR_PTR(no_holes), + BTRFS_FEAT_ATTR_PTR(free_space_tree), NULL }; @@ -780,6 +782,39 @@ failure: return error; } + +/* + * Change per-fs features in /sys/fs/btrfs/UUID/features to match current + * values in superblock. Call after any changes to incompat/compat_ro flags + */ +void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, + u64 bit, enum btrfs_feature_set set) +{ + struct btrfs_fs_devices *fs_devs; + struct kobject *fsid_kobj; + u64 features; + int ret; + + if (!fs_info) + return; + + features = get_features(fs_info, set); + ASSERT(bit & supported_feature_masks[set]); + + fs_devs = fs_info->fs_devices; + fsid_kobj = &fs_devs->fsid_kobj; + + if (!fsid_kobj->state_initialized) + return; + + /* + * FIXME: this is too heavy to update just one value, ideally we'd like + * to use sysfs_update_group but some refactoring is needed first. + */ + sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group); + ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group); +} + static int btrfs_init_debugfs(void) { #ifdef CONFIG_DEBUG_FS diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 9c09522125a6..d7da1a4c2f6c 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -56,7 +56,7 @@ static struct btrfs_feature_attr btrfs_attr_##_name = { \ #define BTRFS_FEAT_ATTR_COMPAT(name, feature) \ BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature) #define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \ - BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT, feature) + BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT_RO, feature) #define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \ BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature) @@ -90,4 +90,7 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, struct kobject *parent); int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); +void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, + u64 bit, enum btrfs_feature_set set); + #endif /* _BTRFS_SYSFS_H_ */ diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index b1d920b30070..f54bf450bad3 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -82,18 +82,18 @@ void btrfs_destroy_test_fs(void) struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void) { struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info), - GFP_NOFS); + GFP_KERNEL); if (!fs_info) return fs_info; fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices), - GFP_NOFS); + GFP_KERNEL); if (!fs_info->fs_devices) { kfree(fs_info); return NULL; } fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block), - GFP_NOFS); + GFP_KERNEL); if (!fs_info->super_copy) { kfree(fs_info->fs_devices); kfree(fs_info); @@ -137,7 +137,6 @@ static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) void **slot; spin_lock(&fs_info->buffer_lock); -restart: radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { struct extent_buffer *eb; @@ -147,7 +146,7 @@ restart: /* Shouldn't happen but that kind of thinking creates CVE's */ if (radix_tree_exception(eb)) { if (radix_tree_deref_retry(eb)) - goto restart; + slot = radix_tree_iter_retry(&iter); continue; } spin_unlock(&fs_info->buffer_lock); @@ -180,21 +179,15 @@ btrfs_alloc_dummy_block_group(unsigned long length) { struct btrfs_block_group_cache *cache; - cache = kzalloc(sizeof(*cache), GFP_NOFS); + cache = kzalloc(sizeof(*cache), GFP_KERNEL); if (!cache) return NULL; cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); + GFP_KERNEL); if (!cache->free_space_ctl) { kfree(cache); return NULL; } - cache->fs_info = btrfs_alloc_dummy_fs_info(); - if (!cache->fs_info) { - kfree(cache->free_space_ctl); - kfree(cache); - return NULL; - } cache->key.objectid = 0; cache->key.offset = length; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index e29fa297e053..669b58201e36 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -94,7 +94,7 @@ static int test_find_delalloc(void) * test. */ for (index = 0; index < (total_dirty >> PAGE_CACHE_SHIFT); index++) { - page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL); if (!page) { test_msg("Failed to allocate test page\n"); ret = -ENOMEM; @@ -113,7 +113,7 @@ static int test_find_delalloc(void) * |--- delalloc ---| * |--- search ---| */ - set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_NOFS); + set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_KERNEL); start = 0; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -144,7 +144,7 @@ static int test_find_delalloc(void) test_msg("Couldn't find the locked page\n"); goto out_bits; } - set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_NOFS); + set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_KERNEL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -199,7 +199,7 @@ static int test_find_delalloc(void) * * We are re-using our test_start from above since it works out well. */ - set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_NOFS); + set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_KERNEL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -262,7 +262,7 @@ static int test_find_delalloc(void) } ret = 0; out_bits: - clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_NOFS); + clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_KERNEL); out: if (locked_page) page_cache_release(locked_page); @@ -360,7 +360,7 @@ static int test_eb_bitmaps(void) test_msg("Running extent buffer bitmap tests\n"); - bitmap = kmalloc(len, GFP_NOFS); + bitmap = kmalloc(len, GFP_KERNEL); if (!bitmap) { test_msg("Couldn't allocate test bitmap\n"); return -ENOMEM; diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index d05fe1ab4808..7cea4462acd5 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -485,6 +485,7 @@ static int run_test(test_func_t test_func, int bitmaps) cache->bitmap_low_thresh = 0; cache->bitmap_high_thresh = (u32)-1; cache->needs_free_space = 1; + cache->fs_info = root->fs_info; btrfs_init_dummy_trans(&trans); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 5de55fdd28bc..863a6a3af1f8 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -22,6 +22,7 @@ #include "../disk-io.h" #include "../extent_io.h" #include "../volumes.h" +#include "../compression.h" static void insert_extent(struct btrfs_root *root, u64 start, u64 len, u64 ram_bytes, u64 offset, u64 disk_bytenr, @@ -974,7 +975,7 @@ static int test_extent_accounting(void) (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095, EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0, - NULL, GFP_NOFS); + NULL, GFP_KERNEL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); goto out; @@ -1045,7 +1046,7 @@ static int test_extent_accounting(void) BTRFS_MAX_EXTENT_SIZE+8191, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, - NULL, GFP_NOFS); + NULL, GFP_KERNEL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); goto out; @@ -1079,7 +1080,7 @@ static int test_extent_accounting(void) ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, - NULL, GFP_NOFS); + NULL, GFP_KERNEL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); goto out; @@ -1096,7 +1097,7 @@ out: clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, - NULL, GFP_NOFS); + NULL, GFP_KERNEL); iput(inode); btrfs_free_dummy_root(root); return ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b6031ce474f7..43885e51b882 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -637,6 +637,8 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv( trans->block_rsv = &root->fs_info->trans_block_rsv; trans->bytes_reserved = num_bytes; + trace_btrfs_space_reservation(root->fs_info, "transaction", + trans->transid, num_bytes, 1); return trans; } @@ -1333,7 +1335,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct dentry *dentry; struct extent_buffer *tmp; struct extent_buffer *old; - struct timespec cur_time = CURRENT_TIME; + struct timespec cur_time; int ret = 0; u64 to_reserve = 0; u64 index = 0; @@ -1375,12 +1377,16 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, rsv = trans->block_rsv; trans->block_rsv = &pending->block_rsv; trans->bytes_reserved = trans->block_rsv->reserved; - + trace_btrfs_space_reservation(root->fs_info, "transaction", + trans->transid, + trans->bytes_reserved, 1); dentry = pending->dentry; parent_inode = pending->dir; parent_root = BTRFS_I(parent_inode)->root; record_root_in_trans(trans, parent_root); + cur_time = current_fs_time(parent_inode->i_sb); + /* * insert the directory item */ @@ -1523,7 +1529,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_i_size_write(parent_inode, parent_inode->i_size + dentry->d_name.len * 2); - parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; + parent_inode->i_mtime = parent_inode->i_ctime = + current_fs_time(parent_inode->i_sb); ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode); if (ret) { btrfs_abort_transaction(trans, root, ret); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 323e12cc9d2f..24d03c751149 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -26,6 +26,7 @@ #include "print-tree.h" #include "backref.h" #include "hash.h" +#include "compression.h" /* magic values for the inode_only field in btrfs_log_inode: * @@ -1045,7 +1046,7 @@ again: /* * NOTE: we have searched root tree and checked the - * coresponding ref, it does not need to check again. + * corresponding ref, it does not need to check again. */ *search_done = 1; } @@ -4127,7 +4128,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *path, struct list_head *logged_list, - struct btrfs_log_ctx *ctx) + struct btrfs_log_ctx *ctx, + const u64 start, + const u64 end) { struct extent_map *em, *n; struct list_head extents; @@ -4166,7 +4169,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, } list_sort(NULL, &extents, extent_cmp); - + /* + * Collect any new ordered extents within the range. This is to + * prevent logging file extent items without waiting for the disk + * location they point to being written. We do this only to deal + * with races against concurrent lockless direct IO writes. + */ + btrfs_get_logged_extents(inode, logged_list, start, end); process: while (!list_empty(&extents)) { em = list_entry(extents.next, struct extent_map, list); @@ -4492,7 +4501,22 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, mutex_lock(&BTRFS_I(inode)->log_mutex); - btrfs_get_logged_extents(inode, &logged_list, start, end); + /* + * Collect ordered extents only if we are logging data. This is to + * ensure a subsequent request to log this inode in LOG_INODE_ALL mode + * will process the ordered extents if they still exists at the time, + * because when we collect them we test and set for the flag + * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the + * same ordered extents. The consequence for the LOG_INODE_ALL log mode + * not processing the ordered extents is that we end up logging the + * corresponding file extent items, based on the extent maps in the + * inode's extent_map_tree's modified_list, without logging the + * respective checksums (since the may still be only attached to the + * ordered extents and have not been inserted in the csum tree by + * btrfs_finish_ordered_io() yet). + */ + if (inode_only == LOG_INODE_ALL) + btrfs_get_logged_extents(inode, &logged_list, start, end); /* * a brute force approach to making sure we get the most uptodate @@ -4701,7 +4725,7 @@ log_extents: goto out_unlock; } ret = btrfs_log_changed_extents(trans, root, inode, dst_path, - &logged_list, ctx); + &logged_list, ctx, start, end); if (ret) { err = ret; goto out_unlock; @@ -4764,6 +4788,42 @@ out_unlock: } /* + * Check if we must fallback to a transaction commit when logging an inode. + * This must be called after logging the inode and is used only in the context + * when fsyncing an inode requires the need to log some other inode - in which + * case we can't lock the i_mutex of each other inode we need to log as that + * can lead to deadlocks with concurrent fsync against other inodes (as we can + * log inodes up or down in the hierarchy) or rename operations for example. So + * we take the log_mutex of the inode after we have logged it and then check for + * its last_unlink_trans value - this is safe because any task setting + * last_unlink_trans must take the log_mutex and it must do this before it does + * the actual unlink operation, so if we do this check before a concurrent task + * sets last_unlink_trans it means we've logged a consistent version/state of + * all the inode items, otherwise we are not sure and must do a transaction + * commit (the concurrent task migth have only updated last_unlink_trans before + * we logged the inode or it might have also done the unlink). + */ +static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + bool ret = false; + + mutex_lock(&BTRFS_I(inode)->log_mutex); + if (BTRFS_I(inode)->last_unlink_trans > fs_info->last_trans_committed) { + /* + * Make sure any commits to the log are forced to be full + * commits. + */ + btrfs_set_log_full_commit(fs_info, trans); + ret = true; + } + mutex_unlock(&BTRFS_I(inode)->log_mutex); + + return ret; +} + +/* * follow the dentry parent pointers up the chain and see if any * of the directories in it require a full commit before they can * be logged. Returns zero if nothing special needs to be done or 1 if @@ -4776,7 +4836,6 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, u64 last_committed) { int ret = 0; - struct btrfs_root *root; struct dentry *old_parent = NULL; struct inode *orig_inode = inode; @@ -4808,14 +4867,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, BTRFS_I(inode)->logged_trans = trans->transid; smp_mb(); - if (BTRFS_I(inode)->last_unlink_trans > last_committed) { - root = BTRFS_I(inode)->root; - - /* - * make sure any commits to the log are forced - * to be full commits - */ - btrfs_set_log_full_commit(root->fs_info, trans); + if (btrfs_must_commit_transaction(trans, inode)) { ret = 1; break; } @@ -4974,6 +5026,9 @@ process_leaf: btrfs_release_path(path); ret = btrfs_log_inode(trans, root, di_inode, log_mode, 0, LLONG_MAX, ctx); + if (!ret && + btrfs_must_commit_transaction(trans, di_inode)) + ret = 1; iput(di_inode); if (ret) goto next_dir_inode; @@ -5088,6 +5143,9 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, ret = btrfs_log_inode(trans, root, dir_inode, LOG_INODE_ALL, 0, LLONG_MAX, ctx); + if (!ret && + btrfs_must_commit_transaction(trans, dir_inode)) + ret = 1; iput(dir_inode); if (ret) goto out; @@ -5439,6 +5497,9 @@ error: * They revolve around files there were unlinked from the directory, and * this function updates the parent directory so that a full commit is * properly done if it is fsync'd later after the unlinks are done. + * + * Must be called before the unlink operations (updates to the subvolume tree, + * inodes, etc) are done. */ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, struct inode *dir, struct inode *inode, @@ -5454,8 +5515,11 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, * into the file. When the file is logged we check it and * don't log the parents if the file is fully on disk. */ - if (S_ISREG(inode->i_mode)) + if (S_ISREG(inode->i_mode)) { + mutex_lock(&BTRFS_I(inode)->log_mutex); BTRFS_I(inode)->last_unlink_trans = trans->transid; + mutex_unlock(&BTRFS_I(inode)->log_mutex); + } /* * if this directory was already logged any new @@ -5486,7 +5550,29 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, return; record: + mutex_lock(&BTRFS_I(dir)->log_mutex); BTRFS_I(dir)->last_unlink_trans = trans->transid; + mutex_unlock(&BTRFS_I(dir)->log_mutex); +} + +/* + * Make sure that if someone attempts to fsync the parent directory of a deleted + * snapshot, it ends up triggering a transaction commit. This is to guarantee + * that after replaying the log tree of the parent directory's root we will not + * see the snapshot anymore and at log replay time we will not see any log tree + * corresponding to the deleted snapshot's root, which could lead to replaying + * it after replaying the log tree of the parent directory (which would replay + * the snapshot delete operation). + * + * Must be called before the actual snapshot destroy operation (updates to the + * parent root and tree of tree roots trees, etc) are done. + */ +void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, + struct inode *dir) +{ + mutex_lock(&BTRFS_I(dir)->log_mutex); + BTRFS_I(dir)->last_unlink_trans = trans->transid; + mutex_unlock(&BTRFS_I(dir)->log_mutex); } /* diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 6916a781ea02..a9f1b75d080d 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -79,6 +79,8 @@ int btrfs_pin_log_trans(struct btrfs_root *root); void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, struct inode *dir, struct inode *inode, int for_rename); +void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, + struct inode *dir); int btrfs_log_new_name(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *old_dir, struct dentry *parent); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 366b335946fa..e2b54d546b7c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -138,7 +138,7 @@ static struct btrfs_fs_devices *__alloc_fs_devices(void) { struct btrfs_fs_devices *fs_devs; - fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS); + fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL); if (!fs_devs) return ERR_PTR(-ENOMEM); @@ -220,7 +220,7 @@ static struct btrfs_device *__alloc_device(void) { struct btrfs_device *dev; - dev = kzalloc(sizeof(*dev), GFP_NOFS); + dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return ERR_PTR(-ENOMEM); @@ -733,7 +733,8 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) * uuid mutex so nothing we touch in here is going to disappear. */ if (orig_dev->name) { - name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); + name = rcu_string_strdup(orig_dev->name->str, + GFP_KERNEL); if (!name) { kfree(device); goto error; @@ -1714,12 +1715,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } while (read_seqretry(&root->fs_info->profiles_lock, seq)); num_devices = root->fs_info->fs_devices->num_devices; - btrfs_dev_replace_lock(&root->fs_info->dev_replace); + btrfs_dev_replace_lock(&root->fs_info->dev_replace, 0); if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { WARN_ON(num_devices < 1); num_devices--; } - btrfs_dev_replace_unlock(&root->fs_info->dev_replace); + btrfs_dev_replace_unlock(&root->fs_info->dev_replace, 0); if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; @@ -2287,7 +2288,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) goto error; } - name = rcu_string_strdup(device_path, GFP_NOFS); + name = rcu_string_strdup(device_path, GFP_KERNEL); if (!name) { kfree(device); ret = -ENOMEM; @@ -2748,7 +2749,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, em->start + em->len < chunk_offset) { /* * This is a logic error, but we don't want to just rely on the - * user having built with ASSERT enabled, so if ASSERT doens't + * user having built with ASSERT enabled, so if ASSERT doesn't * do anything we still error out. */ ASSERT(0); @@ -2966,7 +2967,7 @@ static int insert_balance_item(struct btrfs_root *root, } key.objectid = BTRFS_BALANCE_OBJECTID; - key.type = BTRFS_BALANCE_ITEM_KEY; + key.type = BTRFS_TEMPORARY_ITEM_KEY; key.offset = 0; ret = btrfs_insert_empty_item(trans, root, path, &key, @@ -3015,7 +3016,7 @@ static int del_balance_item(struct btrfs_root *root) } key.objectid = BTRFS_BALANCE_OBJECTID; - key.type = BTRFS_BALANCE_ITEM_KEY; + key.type = BTRFS_TEMPORARY_ITEM_KEY; key.offset = 0; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); @@ -3686,12 +3687,12 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } num_devices = fs_info->fs_devices->num_devices; - btrfs_dev_replace_lock(&fs_info->dev_replace); + btrfs_dev_replace_lock(&fs_info->dev_replace, 0); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { BUG_ON(num_devices < 1); num_devices--; } - btrfs_dev_replace_unlock(&fs_info->dev_replace); + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; if (num_devices == 1) allowed |= BTRFS_BLOCK_GROUP_DUP; @@ -3867,7 +3868,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) return -ENOMEM; key.objectid = BTRFS_BALANCE_OBJECTID; - key.type = BTRFS_BALANCE_ITEM_KEY; + key.type = BTRFS_TEMPORARY_ITEM_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); @@ -4118,7 +4119,7 @@ out: * Callback for btrfs_uuid_tree_iterate(). * returns: * 0 check succeeded, the entry is not outdated. - * < 0 if an error occured. + * < 0 if an error occurred. * > 0 if the check failed, which means the caller shall remove the entry. */ static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, @@ -5062,10 +5063,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) ret = 1; free_extent_map(em); - btrfs_dev_replace_lock(&fs_info->dev_replace); + btrfs_dev_replace_lock(&fs_info->dev_replace, 0); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) ret++; - btrfs_dev_replace_unlock(&fs_info->dev_replace); + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); return ret; } @@ -5325,10 +5326,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (!bbio_ret) goto out; - btrfs_dev_replace_lock(dev_replace); + btrfs_dev_replace_lock(dev_replace, 0); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); if (!dev_replace_is_ongoing) - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 0); + else + btrfs_dev_replace_set_lock_blocking(dev_replace); if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) && @@ -5751,8 +5754,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, bbio->mirror_num = map->num_stripes + 1; } out: - if (dev_replace_is_ongoing) - btrfs_dev_replace_unlock(dev_replace); + if (dev_replace_is_ongoing) { + btrfs_dev_replace_clear_lock_blocking(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 0); + } free_extent_map(em); return ret; } @@ -6705,8 +6710,8 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) int item_size; struct btrfs_dev_stats_item *ptr; - key.objectid = 0; - key.type = BTRFS_DEV_STATS_KEY; + key.objectid = BTRFS_DEV_STATS_OBJECTID; + key.type = BTRFS_PERSISTENT_ITEM_KEY; key.offset = device->devid; ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); if (ret) { @@ -6753,8 +6758,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, int ret; int i; - key.objectid = 0; - key.type = BTRFS_DEV_STATS_KEY; + key.objectid = BTRFS_DEV_STATS_OBJECTID; + key.type = BTRFS_PERSISTENT_ITEM_KEY; key.offset = device->devid; path = btrfs_alloc_path(); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 6c68d6356197..145d2b89e62d 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -249,7 +249,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, goto out; inode_inc_iversion(inode); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_fs_time(inode->i_sb); set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); @@ -260,16 +260,12 @@ out: ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) { - struct btrfs_key key, found_key; + struct btrfs_key key; struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_dir_item *di; - int ret = 0, slot; + int ret = 0; size_t total_size = 0, size_left = size; - unsigned long name_ptr; - size_t name_len; /* * ok we want all objects associated with this id. @@ -291,6 +287,13 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) goto err; while (1) { + struct extent_buffer *leaf; + int slot; + struct btrfs_dir_item *di; + struct btrfs_key found_key; + u32 item_size; + u32 cur; + leaf = path->nodes[0]; slot = path->slots[0]; @@ -316,31 +319,45 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) if (found_key.type > BTRFS_XATTR_ITEM_KEY) break; if (found_key.type < BTRFS_XATTR_ITEM_KEY) - goto next; + goto next_item; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); - if (verify_dir_item(root, leaf, di)) - goto next; - - name_len = btrfs_dir_name_len(leaf, di); - total_size += name_len + 1; + item_size = btrfs_item_size_nr(leaf, slot); + cur = 0; + while (cur < item_size) { + u16 name_len = btrfs_dir_name_len(leaf, di); + u16 data_len = btrfs_dir_data_len(leaf, di); + u32 this_len = sizeof(*di) + name_len + data_len; + unsigned long name_ptr = (unsigned long)(di + 1); + + if (verify_dir_item(root, leaf, di)) { + ret = -EIO; + goto err; + } - /* we are just looking for how big our buffer needs to be */ - if (!size) - goto next; + total_size += name_len + 1; + /* + * We are just looking for how big our buffer needs to + * be. + */ + if (!size) + goto next; - if (!buffer || (name_len + 1) > size_left) { - ret = -ERANGE; - goto err; - } + if (!buffer || (name_len + 1) > size_left) { + ret = -ERANGE; + goto err; + } - name_ptr = (unsigned long)(di + 1); - read_extent_buffer(leaf, buffer, name_ptr, name_len); - buffer[name_len] = '\0'; + read_extent_buffer(leaf, buffer, name_ptr, name_len); + buffer[name_len] = '\0'; - size_left -= name_len + 1; - buffer += name_len + 1; + size_left -= name_len + 1; + buffer += name_len + 1; next: + cur += this_len; + di = (struct btrfs_dir_item *)((char *)di + this_len); + } +next_item: path->slots[0]++; } ret = total_size; |
