diff options
author | 2018-10-24 17:15:26 +0100 | |
---|---|---|
committer | 2018-10-24 17:15:26 +0100 | |
commit | 318b067a5dd649d198c2ba00cf7408d778fc00b4 (patch) | |
tree | 0df4a0f1c3e3cce5c7ced0e65fbc1bd7d97219d4 /fs/btrfs/tree-log.c | |
parent | Merge branch 'work.tty-ioctl' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs (diff) | |
parent | btrfs: switch return_bigger to bool in find_ref_head (diff) | |
download | wireguard-linux-318b067a5dd649d198c2ba00cf7408d778fc00b4.tar.xz wireguard-linux-318b067a5dd649d198c2ba00cf7408d778fc00b4.zip |
Merge tag 'for-4.20-part1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba:
"This is the first batch with fixes and some nice performance
improvements.
Preliminary results show eg. more files/sec in fsmark, better perf on
multi-threaded workloads (filebench, dbench), fewer context switches
and overall better memory allocation characteristics (multiple
benchmarks).
Apart from general performance, there's an improvement for qgroups +
balance workload that's been troubling our users.
Note for stable: there are 20+ patches tagged for stable, out of 90.
Not all of them apply cleanly on all stable versions but the conflicts
are mostly due to simple cleanups and resolving should be obvious. The
fixes are otherwise independent.
Performance improvements:
- transition between blocking and spinning modes of path is gone,
which originally resulted to more unnecessary wakeups and updates
to the path locks, the effects are measurable and improve latency
and scalability
- qgroups: first batch of changes that should speedup balancing with
qgroups on, skip quota accounting on unchanged subtrees, overall
gain is about 30+% in runtime
- use rb-tree with cached first node for several structures, small
improvement to avoid pointer chasing
Fixes:
- trim
- fix: some blockgroups could have been missed if their logical
address was past the total filesystem size (ie. after a lot of
balancing)
- better error reporting, after processing blockgroups and whole
device
- fix: continue trimming block groups after an error is
encountered
- check for trim support of the device earlier and avoid some
unnecessary work
- less interaction with transaction commit that improves latency
on slower storage (eg. image files over NFS)
- fsync
- fix warning when replaying log after fsync of a O_TMPFILE
- fix wrong dentries after fsync of file that got its parent
replaced
- qgroups: fix rescan that might misc some dirty groups
- don't clean dirty pages during buffered writes, this could lead to
lost updates in some corner cases
- some block groups could have been delayed in creation, if the
allocation triggered another one
- error handling improvements
Cleanups:
- removed unused struct members and variables
- function return type cleanups
- delayed refs code refactoring
- protect against deadlock that could be caused by crafted image that
tries to allocate from a tree that's locked already"
* tag 'for-4.20-part1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (93 commits)
btrfs: switch return_bigger to bool in find_ref_head
btrfs: remove fs_info from btrfs_should_throttle_delayed_refs
btrfs: remove fs_info from btrfs_check_space_for_delayed_refs
btrfs: delayed-ref: pass delayed_refs directly to btrfs_delayed_ref_lock
btrfs: delayed-ref: pass delayed_refs directly to btrfs_select_ref_head
btrfs: qgroup: move the qgroup->members check out from (!qgroup)'s else branch
btrfs: relocation: Remove redundant tree level check
btrfs: relocation: Cleanup while loop using rbtree_postorder_for_each_entry_safe
btrfs: qgroup: Avoid calling qgroup functions if qgroup is not enabled
Btrfs: fix wrong dentries after fsync of file that got its parent replaced
Btrfs: fix warning when replaying log after fsync of a tmpfile
btrfs: drop min_size from evict_refill_and_join
btrfs: assert on non-empty delayed iputs
btrfs: make sure we create all new block groups
btrfs: reset max_extent_size on clear in a bitmap
btrfs: protect space cache inode alloc with GFP_NOFS
btrfs: release metadata before running delayed refs
Btrfs: kill btrfs_clear_path_blocking
btrfs: dev-replace: remove pointless assert in write unlock
btrfs: dev-replace: move replace members out of fs_info
...
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r-- | fs/btrfs/tree-log.c | 86 |
1 files changed, 66 insertions, 20 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 3c2ae0e4f25a..0dba09334a16 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -205,14 +205,11 @@ static int join_running_log_trans(struct btrfs_root *root) * until you call btrfs_end_log_trans() or it makes any future * log transactions wait until you call btrfs_end_log_trans() */ -int btrfs_pin_log_trans(struct btrfs_root *root) +void btrfs_pin_log_trans(struct btrfs_root *root) { - int ret = -ENOENT; - mutex_lock(&root->log_mutex); atomic_inc(&root->log_writers); mutex_unlock(&root->log_mutex); - return ret; } /* @@ -258,6 +255,13 @@ struct walk_control { /* what stage of the replay code we're currently in */ int stage; + /* + * Ignore any items from the inode currently being processed. Needs + * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in + * the LOG_WALK_REPLAY_INODES stage. + */ + bool ignore_cur_inode; + /* the root we are currently replaying */ struct btrfs_root *replay_dest; @@ -2487,6 +2491,20 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, inode_item = btrfs_item_ptr(eb, i, struct btrfs_inode_item); + /* + * If we have a tmpfile (O_TMPFILE) that got fsync'ed + * and never got linked before the fsync, skip it, as + * replaying it is pointless since it would be deleted + * later. We skip logging tmpfiles, but it's always + * possible we are replaying a log created with a kernel + * that used to log tmpfiles. + */ + if (btrfs_inode_nlink(eb, inode_item) == 0) { + wc->ignore_cur_inode = true; + continue; + } else { + wc->ignore_cur_inode = false; + } ret = replay_xattr_deletes(wc->trans, root, log, path, key.objectid); if (ret) @@ -2524,16 +2542,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, root->fs_info->sectorsize); ret = btrfs_drop_extents(wc->trans, root, inode, from, (u64)-1, 1); - /* - * If the nlink count is zero here, the iput - * will free the inode. We bump it to make - * sure it doesn't get freed until the link - * count fixup is done. - */ if (!ret) { - if (inode->i_nlink == 0) - inc_nlink(inode); - /* Update link count and nbytes. */ + /* Update the inode's nbytes. */ ret = btrfs_update_inode(wc->trans, root, inode); } @@ -2548,6 +2558,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, break; } + if (wc->ignore_cur_inode) + continue; + if (key.type == BTRFS_DIR_INDEX_KEY && wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { ret = replay_one_dir_item(wc->trans, root, path, @@ -3196,9 +3209,12 @@ static void free_log_tree(struct btrfs_trans_handle *trans, }; ret = walk_log_tree(trans, log, &wc); - /* I don't think this can happen but just in case */ - if (ret) - btrfs_abort_transaction(trans, ret); + if (ret) { + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(log->fs_info, ret, NULL); + } while (1) { ret = find_first_extent_bit(&log->dirty_log_pages, @@ -5564,9 +5580,33 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, dir_inode = btrfs_iget(fs_info->sb, &inode_key, root, NULL); - /* If parent inode was deleted, skip it. */ - if (IS_ERR(dir_inode)) - continue; + /* + * If the parent inode was deleted, return an error to + * fallback to a transaction commit. This is to prevent + * getting an inode that was moved from one parent A to + * a parent B, got its former parent A deleted and then + * it got fsync'ed, from existing at both parents after + * a log replay (and the old parent still existing). + * Example: + * + * mkdir /mnt/A + * mkdir /mnt/B + * touch /mnt/B/bar + * sync + * mv /mnt/B/bar /mnt/A/bar + * mv -T /mnt/A /mnt/B + * fsync /mnt/B/bar + * <power fail> + * + * If we ignore the old parent B which got deleted, + * after a log replay we would have file bar linked + * at both parents and the old parent B would still + * exist. + */ + if (IS_ERR(dir_inode)) { + ret = PTR_ERR(dir_inode); + goto out; + } if (ctx) ctx->log_new_dentries = false; @@ -5640,7 +5680,13 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; - if (btrfs_inode_in_log(inode, trans->transid)) { + /* + * Skip already logged inodes or inodes corresponding to tmpfiles + * (since logging them is pointless, a link count of 0 means they + * will never be accessible). + */ + if (btrfs_inode_in_log(inode, trans->transid) || + inode->vfs_inode.i_nlink == 0) { ret = BTRFS_NO_LOG_SYNC; goto end_no_trans; } |