diff options
Diffstat (limited to 'fs')
121 files changed, 2040 insertions, 1413 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index eb3589edf485..0576eaeb60b9 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -239,13 +239,13 @@ static int v9fs_xattr_get_acl(const struct xattr_handler *handler, } static int v9fs_xattr_set_acl(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { int retval; struct posix_acl *acl; struct v9fs_session_info *v9ses; - struct inode *inode = d_inode(dentry); v9ses = v9fs_dentry2v9ses(dentry); /* diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 18c62bae9591..a6bd349bab23 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -147,8 +147,9 @@ static int v9fs_xattr_handler_get(const struct xattr_handler *handler, } static int v9fs_xattr_handler_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { const char *full_name = xattr_full_name(handler, name); diff --git a/fs/Kconfig b/fs/Kconfig index 6725f59c18e6..b8fcb416be72 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -52,6 +52,7 @@ config FS_DAX_PMD depends on FS_DAX depends on ZONE_DEVICE depends on TRANSPARENT_HUGEPAGE + depends on BROKEN endif # BLOCK diff --git a/fs/afs/write.c b/fs/afs/write.c index 65de439bdc4f..14d506efd1aa 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -643,10 +643,6 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from) return 0; result = generic_file_write_iter(iocb, from); - if (IS_ERR_VALUE(result)) { - _leave(" = %zd", result); - return result; - } _leave(" = %zd", result); return result; diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 72e35b721608..3ba385eaa26e 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -100,8 +100,8 @@ static int bad_inode_setattr(struct dentry *direntry, struct iattr *attrs) return -EIO; } -static int bad_inode_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +static int bad_inode_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, int flags) { return -EIO; } diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 2fab9f130e51..64b331ae3428 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -127,11 +127,8 @@ static int set_brk(unsigned long start, unsigned long end) { start = PAGE_ALIGN(start); end = PAGE_ALIGN(end); - if (end > start) { - unsigned long addr; - addr = vm_brk(start, end - start); - if (BAD_ADDR(addr)) - return addr; + if (end > start) + return vm_brk(start, end - start); } return 0; } @@ -275,7 +272,7 @@ static int load_aout_binary(struct linux_binprm * bprm) map_size = ex.a_text+ex.a_data; #endif error = vm_brk(text_addr & PAGE_MASK, map_size); - if (error != (text_addr & PAGE_MASK)) + if (error) return error; error = read_code(bprm->file, text_addr, pos, @@ -298,7 +295,7 @@ static int load_aout_binary(struct linux_binprm * bprm) if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); - if (IS_ERR_VALUE(error)) + if (error) return error; read_code(bprm->file, N_TXTADDR(ex), fd_offset, @@ -382,7 +379,7 @@ static int load_aout_library(struct file *file) file); } retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); - if (IS_ERR_VALUE(retval)) + if (retval) goto out; read_code(file, start_addr, N_TXTOFF(ex), @@ -402,9 +399,8 @@ static int load_aout_library(struct file *file) len = PAGE_ALIGN(ex.a_text + ex.a_data); bss = ex.a_text + ex.a_data + ex.a_bss; if (bss > len) { - error = vm_brk(start_addr + len, bss - len); - retval = error; - if (error != start_addr + len) + retval = vm_brk(start_addr + len, bss - len); + if (retval) goto out; } retval = 0; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 938fc4ede764..e158b22ef32f 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -96,10 +96,9 @@ static int set_brk(unsigned long start, unsigned long end) start = ELF_PAGEALIGN(start); end = ELF_PAGEALIGN(end); if (end > start) { - unsigned long addr; - addr = vm_brk(start, end - start); - if (BAD_ADDR(addr)) - return addr; + int error = vm_brk(start, end - start); + if (error) + return error; } current->mm->start_brk = current->mm->brk = end; return 0; @@ -629,7 +628,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, /* Map the last of the bss segment */ error = vm_brk(elf_bss, last_bss - elf_bss); - if (BAD_ADDR(error)) + if (error) goto out; } @@ -1178,7 +1177,7 @@ static int load_elf_library(struct file *file) bss = eppnt->p_memsz + eppnt->p_vaddr; if (bss > len) { error = vm_brk(len, bss - len); - if (BAD_ADDR(error)) + if (error) goto out_free_ph; } error = 0; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index f723cd3a455c..caf9e39bb82b 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -337,7 +337,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp) "(%d != %d)", (unsigned) r, curid, id); goto failed; } else if ( ! p->lib_list[id].loaded && - IS_ERR_VALUE(load_flat_shared_library(id, p))) { + load_flat_shared_library(id, p) < 0) { printk("BINFMT_FLAT: failed to load library %d", id); goto failed; } @@ -837,7 +837,7 @@ static int load_flat_shared_library(int id, struct lib_info *libs) res = prepare_binprm(&bprm); - if (!IS_ERR_VALUE(res)) + if (!res) res = load_flat_file(&bprm, libs, id, NULL); abort_creds(bprm.cred); @@ -883,7 +883,7 @@ static int load_flat_binary(struct linux_binprm * bprm) stack_len += FLAT_STACK_ALIGN - 1; /* reserve for upcoming alignment */ res = load_flat_file(bprm, &libinfo, 0, &stack_len); - if (IS_ERR_VALUE(res)) + if (res < 0) return res; /* Update data segment pointers for all libraries */ diff --git a/fs/block_dev.c b/fs/block_dev.c index 1089dbf25925..71ccab1d22c6 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -51,6 +51,18 @@ struct block_device *I_BDEV(struct inode *inode) } EXPORT_SYMBOL(I_BDEV); +void __vfs_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk_ratelimited("%sVFS (%s): %pV\n", prefix, sb->s_id, &vaf); + va_end(args); +} + static void bdev_write_inode(struct block_device *bdev) { struct inode *inode = bdev->bd_inode; @@ -489,7 +501,7 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax) sector += get_start_sect(bdev); if (sector % (PAGE_SIZE / 512)) return -EINVAL; - avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn); + avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size); if (!avail) return -ERANGE; if (avail > 0 && avail & ~PAGE_MASK) @@ -498,6 +510,75 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax) } EXPORT_SYMBOL_GPL(bdev_direct_access); +/** + * bdev_dax_supported() - Check if the device supports dax for filesystem + * @sb: The superblock of the device + * @blocksize: The block size of the device + * + * This is a library function for filesystems to check if the block device + * can be mounted with dax option. + * + * Return: negative errno if unsupported, 0 if supported. + */ +int bdev_dax_supported(struct super_block *sb, int blocksize) +{ + struct blk_dax_ctl dax = { + .sector = 0, + .size = PAGE_SIZE, + }; + int err; + + if (blocksize != PAGE_SIZE) { + vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax"); + return -EINVAL; + } + + err = bdev_direct_access(sb->s_bdev, &dax); + if (err < 0) { + switch (err) { + case -EOPNOTSUPP: + vfs_msg(sb, KERN_ERR, + "error: device does not support dax"); + break; + case -EINVAL: + vfs_msg(sb, KERN_ERR, + "error: unaligned partition for dax"); + break; + default: + vfs_msg(sb, KERN_ERR, + "error: dax access failed (%d)", err); + } + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(bdev_dax_supported); + +/** + * bdev_dax_capable() - Return if the raw device is capable for dax + * @bdev: The device for raw block device access + */ +bool bdev_dax_capable(struct block_device *bdev) +{ + struct blk_dax_ctl dax = { + .size = PAGE_SIZE, + }; + + if (!IS_ENABLED(CONFIG_FS_DAX)) + return false; + + dax.sector = 0; + if (bdev_direct_access(bdev, &dax) < 0) + return false; + + dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512); + if (bdev_direct_access(bdev, &dax) < 0) + return false; + + return true; +} + /* * pseudo-fs */ @@ -1160,33 +1241,6 @@ void bd_set_size(struct block_device *bdev, loff_t size) } EXPORT_SYMBOL(bd_set_size); -static bool blkdev_dax_capable(struct block_device *bdev) -{ - struct gendisk *disk = bdev->bd_disk; - - if (!disk->fops->direct_access || !IS_ENABLED(CONFIG_FS_DAX)) - return false; - - /* - * If the partition is not aligned on a page boundary, we can't - * do dax I/O to it. - */ - if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) - || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) - return false; - - /* - * If the device has known bad blocks, force all I/O through the - * driver / page cache. - * - * TODO: support finer grained dax error handling - */ - if (disk->bb && disk->bb->count) - return false; - - return true; -} - static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); /* @@ -1266,7 +1320,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (!ret) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); - if (!blkdev_dax_capable(bdev)) + if (!bdev_dax_capable(bdev)) bdev->bd_inode->i_flags &= ~S_DAX; } @@ -1303,7 +1357,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_clear; } bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); - if (!blkdev_dax_capable(bdev)) + if (!bdev_dax_capable(bdev)) bdev->bd_inode->i_flags &= ~S_DAX; } } else { diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index d3090187fd76..8bb3509099e8 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1939,7 +1939,7 @@ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, * from ipath->fspath->val[i]. * when it returns, there are ipath->fspath->elem_cnt number of paths available * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the - * number of missed paths in recored in ipath->fspath->elem_missed, otherwise, + * number of missed paths is recorded in ipath->fspath->elem_missed, otherwise, * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would * have been needed to return all paths. */ diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 1da5753d886d..4919aedb5fc1 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -313,7 +313,7 @@ struct btrfs_dio_private { struct bio *dio_bio; /* - * The original bio may be splited to several sub-bios, this is + * The original bio may be split to several sub-bios, this is * done during endio of sub-bios */ int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int); diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 516e19d1d202..b677a6ea6001 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1939,7 +1939,7 @@ again: /* * Clear all references of this block. Do not free * the block itself even if is not referenced anymore - * because it still carries valueable information + * because it still carries valuable information * like whether it was ever written and IO completed. */ list_for_each_entry_safe(l, tmp, &block->ref_to_list, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index decd0a3f5d61..427c36b430a6 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -156,7 +156,7 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root) /* * RCU really hurts here, we could free up the root node because - * it was cow'ed but we may not get the new root node yet so do + * it was COWed but we may not get the new root node yet so do * the inc_not_zero dance and if it doesn't work then * synchronize_rcu and try again. */ @@ -955,7 +955,7 @@ int btrfs_block_can_be_shared(struct btrfs_root *root, struct extent_buffer *buf) { /* - * Tree blocks not in refernece counted trees and tree roots + * Tree blocks not in reference counted trees and tree roots * are never shared. If a block was allocated after the last * snapshot and the block was not allocated by tree relocation, * we know the block is not shared. @@ -1270,7 +1270,7 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, /* * tm is a pointer to the first operation to rewind within eb. then, all - * previous operations will be rewinded (until we reach something older than + * previous operations will be rewound (until we reach something older than * time_seq). */ static void @@ -1345,7 +1345,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, } /* - * Called with eb read locked. If the buffer cannot be rewinded, the same buffer + * Called with eb read locked. If the buffer cannot be rewound, the same buffer * is returned. If rewind operations happen, a fresh buffer is returned. The * returned buffer is always read-locked. If the returned buffer is not the * input buffer, the lock on the input buffer is released and the input buffer @@ -1516,7 +1516,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, * 3) the root is not forced COW. * * What is forced COW: - * when we create snapshot during commiting the transaction, + * when we create snapshot during committing the transaction, * after we've finished coping src root, we must COW the shared * block to ensure the metadata consistency. */ @@ -1531,7 +1531,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, /* * cows a single block, see __btrfs_cow_block for the real work. - * This version of it has extra checks so that a block isn't cow'd more than + * This version of it has extra checks so that a block isn't COWed more than * once per transaction, as long as it hasn't been written yet */ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, @@ -2986,7 +2986,7 @@ again: btrfs_unlock_up_safe(p, level + 1); /* - * Since we can unwind eb's we want to do a real search every + * Since we can unwind ebs we want to do a real search every * time. */ prev_cmp = -1; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ddcc58f03c79..101c3cfd3f7c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -89,7 +89,7 @@ static const int btrfs_csum_sizes[] = { 4 }; /* four bytes for CRC32 */ #define BTRFS_EMPTY_DIR_SIZE 0 -/* spefic to btrfs_map_block(), therefore not in include/linux/blk_types.h */ +/* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */ #define REQ_GET_READ_MIRRORS (1 << 30) /* ioprio of readahead is set to idle */ @@ -431,7 +431,7 @@ struct btrfs_space_info { * bytes_pinned does not reflect the bytes that will be pinned once the * delayed refs are flushed, so this counter is inc'ed every time we * call btrfs_free_extent so it is a realtime count of what will be - * freed once the transaction is committed. It will be zero'ed every + * freed once the transaction is committed. It will be zeroed every * time the transaction commits. */ struct percpu_counter total_bytes_pinned; @@ -1401,7 +1401,7 @@ static inline void btrfs_init_map_token (struct btrfs_map_token *token) token->kaddr = NULL; } -/* some macros to generate set/get funcs for the struct fields. This +/* some macros to generate set/get functions for the struct fields. This * assumes there is a lefoo_to_cpu for every type, so lets make a simple * one for u8: */ diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index c24b653c7343..5fca9534a271 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -188,7 +188,7 @@ struct btrfs_delayed_ref_root { /* * To make qgroup to skip given root. - * This is for snapshot, as btrfs_qgroup_inherit() will manully + * This is for snapshot, as btrfs_qgroup_inherit() will manually * modify counters for snapshot and its source, so we should skip * the snapshot in new_root/old_roots or it will get calculated twice */ diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 85f12e6e28d2..63ef9cdf0144 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -450,7 +450,7 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_root *root, } /* - * blocked until all flighting bios are finished. + * blocked until all in-flight bios operations are finished. */ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 91d123938cef..6628fca9f4ed 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -384,7 +384,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, /* * Things reading via commit roots that don't have normal protection, * like send, can have a really old block in cache that may point at a - * block that has been free'd and re-allocated. So don't clear uptodate + * block that has been freed and re-allocated. So don't clear uptodate * if we find an eb that is under IO (dirty/writeback) because we could * end up reading in the stale data and then writing it back out and * making everybody very sad. @@ -418,7 +418,7 @@ static int btrfs_check_super_csum(char *raw_disk_sb) /* * The super_block structure does not span the whole * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space - * is filled with zeros and is included in the checkum. + * is filled with zeros and is included in the checksum. */ crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE, crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); @@ -600,7 +600,7 @@ static noinline int check_leaf(struct btrfs_root *root, /* * Check to make sure that we don't point outside of the leaf, - * just incase all the items are consistent to eachother, but + * just in case all the items are consistent to each other, but * all point outside of the leaf. */ if (btrfs_item_end_nr(leaf, slot) > @@ -3022,7 +3022,7 @@ retry_root_backup: } /* - * Mount does not set all options immediatelly, we can do it now and do + * Mount does not set all options immediately, we can do it now and do * not have to wait for transaction commit */ btrfs_apply_pending_changes(fs_info); @@ -3255,7 +3255,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) btrfs_warn_rl_in_rcu(device->dev_root->fs_info, "lost page write due to IO error on %s", rcu_str_deref(device->name)); - /* note, we dont' set_buffer_write_io_error because we have + /* note, we don't set_buffer_write_io_error because we have * our own ways of dealing with the IO errors */ clear_buffer_uptodate(bh); @@ -4367,7 +4367,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, if (ret) break; - clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); + clear_extent_bits(dirty_pages, start, end, mark); while (start <= end) { eb = btrfs_find_tree_block(root->fs_info, start); start += root->nodesize; @@ -4402,7 +4402,7 @@ again: if (ret) break; - clear_extent_dirty(unpin, start, end, GFP_NOFS); + clear_extent_dirty(unpin, start, end); btrfs_error_unpin_extent_range(root, start, end); cond_resched(); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9424864fd01a..a400951e8678 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -231,9 +231,9 @@ static int add_excluded_extent(struct btrfs_root *root, { u64 end = start + num_bytes - 1; set_extent_bits(&root->fs_info->freed_extents[0], - start, end, EXTENT_UPTODATE, GFP_NOFS); + start, end, EXTENT_UPTODATE); set_extent_bits(&root->fs_info->freed_extents[1], - start, end, EXTENT_UPTODATE, GFP_NOFS); + start, end, EXTENT_UPTODATE); return 0; } @@ -246,9 +246,9 @@ static void free_excluded_extents(struct btrfs_root *root, end = start + cache->key.offset - 1; clear_extent_bits(&root->fs_info->freed_extents[0], - start, end, EXTENT_UPTODATE, GFP_NOFS); + start, end, EXTENT_UPTODATE); clear_extent_bits(&root->fs_info->freed_extents[1], - start, end, EXTENT_UPTODATE, GFP_NOFS); + start, end, EXTENT_UPTODATE); } static int exclude_super_stripes(struct btrfs_root *root, @@ -980,7 +980,7 @@ out_free: * event that tree block loses its owner tree's reference and do the * back refs conversion. * - * When a tree block is COW'd through a tree, there are four cases: + * When a tree block is COWed through a tree, there are four cases: * * The reference count of the block is one and the tree is the block's * owner tree. Nothing to do in this case. @@ -2595,7 +2595,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } /* - * Need to drop our head ref lock and re-aqcuire the + * Need to drop our head ref lock and re-acquire the * delayed ref lock and then re-check to make sure * nobody got added. */ @@ -2747,7 +2747,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) /* * We don't ever fill up leaves all the way so multiply by 2 just to be - * closer to what we're really going to want to ouse. + * closer to what we're really going to want to use. */ return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); } @@ -2851,7 +2851,7 @@ static void delayed_ref_async_start(struct btrfs_work *work) } /* - * trans->sync means that when we call end_transaciton, we won't + * trans->sync means that when we call end_transaction, we won't * wait on delayed refs */ trans->sync = true; @@ -4296,7 +4296,7 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, * Called if we need to clear a data reservation for this inode * Normally in a error case. * - * This one will handle the per-indoe data rsv map for accurate reserved + * This one will handle the per-inode data rsv map for accurate reserved * space framework. */ void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) @@ -4967,7 +4967,7 @@ void btrfs_init_async_reclaim_work(struct work_struct *work) * @orig_bytes - the number of bytes we want * @flush - whether or not we can flush to make our reservation * - * This will reserve orgi_bytes number of bytes from the space info associated + * This will reserve orig_bytes number of bytes from the space info associated * with the block_rsv. If there is not enough space it will make an attempt to * flush out space to make room. It will do this by flushing delalloc if * possible or committing the transaction. If flush is 0 then no attempts to @@ -5572,7 +5572,7 @@ void btrfs_orphan_release_metadata(struct inode *inode) * common file/directory operations, they change two fs/file trees * and root tree, the number of items that the qgroup reserves is * different with the free space reservation. So we can not use - * the space reseravtion mechanism in start_transaction(). + * the space reservation mechanism in start_transaction(). */ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, @@ -5621,7 +5621,7 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, /** * drop_outstanding_extent - drop an outstanding extent * @inode: the inode we're dropping the extent for - * @num_bytes: the number of bytes we're relaseing. + * @num_bytes: the number of bytes we're releasing. * * This is called when we are freeing up an outstanding extent, either called * after an error or after an extent is written. This will return the number of @@ -5647,7 +5647,7 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) drop_inode_space = 1; /* - * If we have more or the same amount of outsanding extents than we have + * If we have more or the same amount of outstanding extents than we have * reserved then we need to leave the reserved extents count alone. */ if (BTRFS_I(inode)->outstanding_extents >= @@ -5661,8 +5661,8 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) } /** - * calc_csum_metadata_size - return the amount of metada space that must be - * reserved/free'd for the given bytes. + * calc_csum_metadata_size - return the amount of metadata space that must be + * reserved/freed for the given bytes. * @inode: the inode we're manipulating * @num_bytes: the number of bytes in question * @reserve: 1 if we are reserving space, 0 if we are freeing space @@ -5814,7 +5814,7 @@ out_fail: /* * This is tricky, but first we need to figure out how much we - * free'd from any free-ers that occurred during this + * freed from any free-ers that occurred during this * reservation, so we reset ->csum_bytes to the csum_bytes * before we dropped our lock, and then call the free for the * number of bytes that were freed while we were trying our @@ -5836,7 +5836,7 @@ out_fail: /* * Now reset ->csum_bytes to what it should be. If bytes is - * more than to_free then we would have free'd more space had we + * more than to_free then we would have freed more space had we * not had an artificially high ->csum_bytes, so we need to free * the remainder. If bytes is the same or less then we don't * need to do anything, the other free-ers did the correct @@ -6515,7 +6515,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, ret = btrfs_discard_extent(root, start, end + 1 - start, NULL); - clear_extent_dirty(unpin, start, end, GFP_NOFS); + clear_extent_dirty(unpin, start, end); unpin_extent_range(root, start, end, true); mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); @@ -7578,7 +7578,7 @@ loop: if (loop == LOOP_CACHING_NOWAIT) { /* * We want to skip the LOOP_CACHING_WAIT step if we - * don't have any unached bgs and we've alrelady done a + * don't have any uncached bgs and we've already done a * full search through. */ if (orig_have_caching_bg || !full_search) @@ -7982,7 +7982,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, /* * Mixed block groups will exclude before processing the log so we only - * need to do the exlude dance if this fs isn't mixed. + * need to do the exclude dance if this fs isn't mixed. */ if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) { ret = __exclude_logged_extent(root, ins->objectid, ins->offset); @@ -8032,7 +8032,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, buf->start + buf->len - 1, GFP_NOFS); else set_extent_new(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1, GFP_NOFS); + buf->start + buf->len - 1); } else { buf->log_index = -1; set_extent_dirty(&trans->transaction->dirty_pages, buf->start, @@ -9426,7 +9426,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) u64 free_bytes = 0; int factor; - /* It's df, we don't care if it's racey */ + /* It's df, we don't care if it's racy */ if (list_empty(&sinfo->ro_bgs)) return 0; @@ -10635,14 +10635,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) */ mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, - EXTENT_DIRTY, GFP_NOFS); + EXTENT_DIRTY); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); btrfs_dec_block_group_ro(root, block_group); goto end_trans; } ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, - EXTENT_DIRTY, GFP_NOFS); + EXTENT_DIRTY); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); btrfs_dec_block_group_ro(root, block_group); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2f83448d34fe..3cd57825c75f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -726,14 +726,6 @@ next: start = last_end + 1; if (start <= end && state && !need_resched()) goto hit_next; - goto search_again; - -out: - spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); - - return 0; search_again: if (start > end) @@ -742,6 +734,14 @@ search_again: if (gfpflags_allow_blocking(mask)) cond_resched(); goto again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return 0; + } static void wait_on_state(struct extent_io_tree *tree, @@ -873,8 +873,14 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, bits |= EXTENT_FIRST_DELALLOC; again: if (!prealloc && gfpflags_allow_blocking(mask)) { + /* + * Don't care for allocation failure here because we might end + * up not needing the pre-allocated extent state at all, which + * is the case if we only have in the tree extent states that + * cover our input range and don't cover too any other range. + * If we end up needing a new extent state we allocate it later. + */ prealloc = alloc_extent_state(mask); - BUG_ON(!prealloc); } spin_lock(&tree->lock); @@ -1037,7 +1043,13 @@ hit_next: goto out; } - goto search_again; +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (gfpflags_allow_blocking(mask)) + cond_resched(); + goto again; out: spin_unlock(&tree->lock); @@ -1046,13 +1058,6 @@ out: return err; -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - if (gfpflags_allow_blocking(mask)) - cond_resched(); - goto again; } int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, @@ -1073,17 +1078,18 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, * @bits: the bits to set in this range * @clear_bits: the bits to clear in this range * @cached_state: state that we're going to cache - * @mask: the allocation mask * * This will go through and set bits for the given range. If any states exist * already in this range they are set with the given bit and cleared of the * clear_bits. This is only meant to be used by things that are mergeable, ie * converting from say DELALLOC to DIRTY. This is not meant to be used with * boundary bits like LOCK. + * + * All allocations are done with GFP_NOFS. */ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, unsigned clear_bits, - struct extent_state **cached_state, gfp_t mask) + struct extent_state **cached_state) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -1098,7 +1104,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, btrfs_debug_check_extent_io_range(tree, start, end); again: - if (!prealloc && gfpflags_allow_blocking(mask)) { + if (!prealloc) { /* * Best effort, don't worry if extent state allocation fails * here for the first iteration. We might have a cached state @@ -1106,7 +1112,7 @@ again: * extent state allocations are needed. We'll only know this * after locking the tree. */ - prealloc = alloc_extent_state(mask); + prealloc = alloc_extent_state(GFP_NOFS); if (!prealloc && !first_iteration) return -ENOMEM; } @@ -1263,7 +1269,13 @@ hit_next: goto out; } - goto search_again; +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + cond_resched(); + first_iteration = false; + goto again; out: spin_unlock(&tree->lock); @@ -1271,21 +1283,11 @@ out: free_extent_state(prealloc); return err; - -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - if (gfpflags_allow_blocking(mask)) - cond_resched(); - first_iteration = false; - goto again; } /* wrappers around set/clear extent bit */ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask, - struct extent_changeset *changeset) + unsigned bits, struct extent_changeset *changeset) { /* * We don't support EXTENT_LOCKED yet, as current changeset will @@ -1295,7 +1297,7 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ BUG_ON(bits & EXTENT_LOCKED); - return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, mask, + return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, changeset); } @@ -1308,8 +1310,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, } int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask, - struct extent_changeset *changeset) + unsigned bits, struct extent_changeset *changeset) { /* * Don't support EXTENT_LOCKED case, same reason as @@ -1317,7 +1318,7 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ BUG_ON(bits & EXTENT_LOCKED); - return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask, + return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS, changeset); } @@ -1975,13 +1976,13 @@ int free_io_failure(struct inode *inode, struct io_failure_record *rec) set_state_failrec(failure_tree, rec->start, NULL); ret = clear_extent_bits(failure_tree, rec->start, rec->start + rec->len - 1, - EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + EXTENT_LOCKED | EXTENT_DIRTY); if (ret) err = ret; ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, rec->start + rec->len - 1, - EXTENT_DAMAGED, GFP_NOFS); + EXTENT_DAMAGED); if (ret && !err) err = ret; @@ -2232,13 +2233,12 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, /* set the bits in the private failure tree */ ret = set_extent_bits(failure_tree, start, end, - EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + EXTENT_LOCKED | EXTENT_DIRTY); if (ret >= 0) ret = set_state_failrec(failure_tree, start, failrec); /* set the bits in the inode's tree */ if (ret >= 0) - ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED, - GFP_NOFS); + ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED); if (ret < 0) { kfree(failrec); return ret; @@ -4389,8 +4389,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret < 0) { btrfs_free_path(path); return ret; + } else { + WARN_ON(!ret); + if (ret == 1) + ret = 0; } - WARN_ON(!ret); + path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); found_type = found_key.type; @@ -4601,7 +4605,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb) if (mapped) spin_unlock(&page->mapping->private_lock); - /* One for when we alloced the page */ + /* One for when we allocated the page */ put_page(page); } while (index != 0); } @@ -5761,7 +5765,7 @@ int try_release_extent_buffer(struct page *page) struct extent_buffer *eb; /* - * We need to make sure noboody is attaching this page to an eb right + * We need to make sure nobody is attaching this page to an eb right * now. */ spin_lock(&page->mapping->private_lock); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 981f402bf754..1baf19c9b79d 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -220,8 +220,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int filled, struct extent_state *cached_state); int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask, - struct extent_changeset *changeset); + unsigned bits, struct extent_changeset *changeset); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int wake, int delete, struct extent_state **cached, gfp_t mask); @@ -240,27 +239,27 @@ static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start, } static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, - u64 end, unsigned bits, gfp_t mask) + u64 end, unsigned bits) { int wake = 0; if (bits & EXTENT_LOCKED) wake = 1; - return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask); + return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, + GFP_NOFS); } int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask, - struct extent_changeset *changeset); + unsigned bits, struct extent_changeset *changeset); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask); static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, - u64 end, unsigned bits, gfp_t mask) + u64 end, unsigned bits) { - return set_extent_bit(tree, start, end, bits, NULL, NULL, mask); + return set_extent_bit(tree, start, end, bits, NULL, NULL, GFP_NOFS); } static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, @@ -278,37 +277,38 @@ static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, } static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, - u64 end, gfp_t mask) + u64 end) { return clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); + EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); } int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, unsigned clear_bits, - struct extent_state **cached_state, gfp_t mask); + struct extent_state **cached_state); static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state, gfp_t mask) + u64 end, struct extent_state **cached_state) { return set_extent_bit(tree, start, end, EXTENT_DELALLOC | EXTENT_UPTODATE, - NULL, cached_state, mask); + NULL, cached_state, GFP_NOFS); } static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state, gfp_t mask) + u64 end, struct extent_state **cached_state) { return set_extent_bit(tree, start, end, EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, - NULL, cached_state, mask); + NULL, cached_state, GFP_NOFS); } static inline int set_extent_new(struct extent_io_tree *tree, u64 start, - u64 end, gfp_t mask) + u64 end) { - return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, mask); + return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, + GFP_NOFS); } static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 318b048eb254..e0715fcfb11e 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -62,7 +62,7 @@ struct extent_map *alloc_extent_map(void) /** * free_extent_map - drop reference count of an extent_map - * @em: extent map being releasead + * @em: extent map being released * * Drops the reference out on @em by one and free the structure * if the reference count hits zero. diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 7a7d6e253cfc..62a81ee13a5f 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -248,7 +248,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, BTRFS_DATA_RELOC_TREE_OBJECTID) { set_extent_bits(io_tree, offset, offset + root->sectorsize - 1, - EXTENT_NODATASUM, GFP_NOFS); + EXTENT_NODATASUM); } else { btrfs_info(BTRFS_I(inode)->root->fs_info, "no csum found for inode %llu start %llu", diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index c98805c35bab..e0c9bd3fb02d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1596,6 +1596,13 @@ again: copied = btrfs_copy_from_user(pos, write_bytes, pages, i); + num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, + reserve_bytes); + dirty_sectors = round_up(copied + sector_offset, + root->sectorsize); + dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, + dirty_sectors); + /* * if we have trouble faulting in the pages, fall * back to one page at a time @@ -1605,6 +1612,7 @@ again: if (copied == 0) { force_page_uptodate = true; + dirty_sectors = 0; dirty_pages = 0; } else { force_page_uptodate = false; @@ -1615,20 +1623,19 @@ again: /* * If we had a short copy we need to release the excess delaloc * bytes we reserved. We need to increment outstanding_extents - * because btrfs_delalloc_release_space will decrement it, but + * because btrfs_delalloc_release_space and + * btrfs_delalloc_release_metadata will decrement it, but * we still have an outstanding extent for the chunk we actually * managed to copy. */ - num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, - reserve_bytes); - dirty_sectors = round_up(copied + sector_offset, - root->sectorsize); - dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, - dirty_sectors); - if (num_sectors > dirty_sectors) { - release_bytes = (write_bytes - copied) - & ~((u64)root->sectorsize - 1); + /* + * we round down because we don't want to count + * any partial blocks actually sent through the + * IO machines + */ + release_bytes = round_down(release_bytes - copied, + root->sectorsize); if (copied > 0) { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; @@ -2022,7 +2029,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) BTRFS_I(inode)->last_trans <= root->fs_info->last_trans_committed)) { /* - * We'v had everything committed since the last time we were + * We've had everything committed since the last time we were * modified so clear this flag in case it was set for whatever * reason, it's no longer relevant. */ @@ -2370,7 +2377,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) /* Check the aligned pages after the first unaligned page, * if offset != orig_start, which means the first unaligned page - * including serveral following pages are already in holes, + * including several following pages are already in holes, * the extra check can be skipped */ if (offset == orig_start) { /* after truncate page, check hole again */ diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 5e6062c26129..c6dc1183f542 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1983,7 +1983,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, /* * If this block group has some small extents we don't want to * use up all of our free slots in the cache with them, we want - * to reserve them to larger extents, however if we have plent + * to reserve them to larger extents, however if we have plenty * of cache left then go ahead an dadd them, no sense in adding * the overhead of a bitmap if we don't have to. */ diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 33178c490ace..3af651c2bbc7 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -123,7 +123,7 @@ int btrfs_return_cluster_to_free_space( int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen); -/* Support functions for runnint our sanity tests */ +/* Support functions for running our sanity tests */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int test_add_free_space_entry(struct btrfs_block_group_cache *cache, u64 offset, u64 bytes, bool bitmap); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 91419ef79b00..270499598ed4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -455,7 +455,7 @@ again: /* * skip compression for a small file range(<=blocksize) that - * isn't an inline extent, since it dosen't save disk space at all. + * isn't an inline extent, since it doesn't save disk space at all. */ if (total_compressed <= blocksize && (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) @@ -1978,7 +1978,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, { WARN_ON((end & (PAGE_SIZE - 1)) == 0); return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, - cached_state, GFP_NOFS); + cached_state); } /* see btrfs_writepage_start_hook for details on why this is required */ @@ -3119,8 +3119,7 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { - clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, - GFP_NOFS); + clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM); return 0; } @@ -3722,7 +3721,7 @@ cache_index: * and doesn't have an inode ref with the name "bar" anymore. * * Setting last_unlink_trans to last_trans is a pessimistic approach, - * but it guarantees correctness at the expense of ocassional full + * but it guarantees correctness at the expense of occasional full * transaction commits on fsync if our inode is a directory, or if our * inode is not a directory, logging its parent unnecessarily. */ @@ -4978,7 +4977,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * be instantly completed which will give us extents that need * to be truncated. If we fail to get an orphan inode down we * could have left over extents that were never meant to live, - * so we need to garuntee from this point on that everything + * so we need to guarantee from this point on that everything * will be consistent. */ ret = btrfs_orphan_add(trans, inode); @@ -5248,7 +5247,7 @@ void btrfs_evict_inode(struct inode *inode) } /* - * We can't just steal from the global reserve, we need tomake + * We can't just steal from the global reserve, we need to make * sure there is room to do it, if not we need to commit and try * again. */ @@ -7433,7 +7432,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, cached_state); /* * We're concerned with the entire range that we're going to be - * doing DIO to, so we need to make sure theres no ordered + * doing DIO to, so we need to make sure there's no ordered * extents in this range. */ ordered = btrfs_lookup_ordered_range(inode, lockstart, @@ -7595,7 +7594,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (current->journal_info) { /* * Need to pull our outstanding extents and set journal_info to NULL so - * that anything that needs to check if there's a transction doesn't get + * that anything that needs to check if there's a transaction doesn't get * confused. */ dio_data = current->journal_info; @@ -7628,7 +7627,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, * decompress it, so there will be buffering required no matter what we * do, so go ahead and fallback to buffered. * - * We return -ENOTBLK because thats what makes DIO go ahead and go back + * We return -ENOTBLK because that's what makes DIO go ahead and go back * to buffered IO. Don't blame me, this is the price we pay for using * the generic code. */ @@ -9041,7 +9040,7 @@ static int btrfs_truncate(struct inode *inode) return ret; /* - * Yes ladies and gentelment, this is indeed ugly. The fact is we have + * Yes ladies and gentlemen, this is indeed ugly. The fact is we have * 3 things going on here * * 1) We need to reserve space for our orphan item and the space to @@ -9055,15 +9054,15 @@ static int btrfs_truncate(struct inode *inode) * space reserved in case it uses space during the truncate (thank you * very much snapshotting). * - * And we need these to all be seperate. The fact is we can use alot of + * And we need these to all be separate. The fact is we can use a lot of * space doing the truncate, and we have no earthly idea how much space - * we will use, so we need the truncate reservation to be seperate so it + * we will use, so we need the truncate reservation to be separate so it * doesn't end up using space reserved for updating the inode or * removing the orphan item. We also need to be able to stop the * transaction and start a new one, which means we need to be able to * update the inode several times, and we have no idea of knowing how * many times that will be, so we can't just reserve 1 item for the - * entirety of the opration, so that has to be done seperately as well. + * entirety of the operation, so that has to be done separately as well. * Then there is the orphan item, which does indeed need to be held on * to for the whole operation, and we need nobody to touch this reserved * space except the orphan code. diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4e700694b741..05173563e4a6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -296,7 +296,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } } else { /* - * Revert back under same assuptions as above + * Revert back under same assumptions as above */ if (S_ISREG(mode)) { if (inode->i_size == 0) @@ -465,7 +465,7 @@ static noinline int create_subvol(struct inode *dir, /* * Don't create subvolume whose level is not zero. Or qgroup will be - * screwed up since it assume subvolme qgroup's level to be 0. + * screwed up since it assumes subvolume qgroup's level to be 0. */ if (btrfs_qgroup_level(objectid)) { ret = -ENOSPC; @@ -780,7 +780,7 @@ free_pending: * a. be owner of dir, or * b. be owner of victim, or * c. have CAP_FOWNER capability - * 6. If the victim is append-only or immutable we can't do antyhing with + * 6. If the victim is append-only or immutable we can't do anything with * links pointing to it. * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR. * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR. @@ -846,11 +846,9 @@ static noinline int btrfs_mksubvol(struct path *parent, struct dentry *dentry; int error; - inode_lock_nested(dir, I_MUTEX_PARENT); - // XXX: should've been - // mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); - // if (error == -EINTR) - // return error; + error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); + if (error == -EINTR) + return error; dentry = lookup_one_len(name, parent->dentry, namelen); error = PTR_ERR(dentry); @@ -1239,7 +1237,7 @@ again: set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, - &cached_state, GFP_NOFS); + &cached_state); unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, &cached_state, @@ -2377,11 +2375,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out; - inode_lock_nested(dir, I_MUTEX_PARENT); - // XXX: should've been - // err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); - // if (err == -EINTR) - // goto out_drop_write; + err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); + if (err == -EINTR) + goto out_drop_write; dentry = lookup_one_len(vol_args->name, parent, namelen); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); @@ -2571,7 +2567,7 @@ out_dput: dput(dentry); out_unlock_dir: inode_unlock(dir); -//out_drop_write: +out_drop_write: mnt_drop_write_file(file); out: kfree(vol_args); @@ -4654,7 +4650,7 @@ again: } /* - * mut. excl. ops lock is locked. Three possibilites: + * mut. excl. ops lock is locked. Three possibilities: * (1) some other op is running * (2) balance is running * (3) balance is paused -- special case (think resume) @@ -5571,7 +5567,7 @@ long btrfs_ioctl(struct file *file, unsigned int ret = btrfs_sync_fs(file_inode(file)->i_sb, 1); /* * The transaction thread may want to do more work, - * namely it pokes the cleaner ktread that will start + * namely it pokes the cleaner kthread that will start * processing uncleaned subvols. */ wake_up_process(root->fs_info->transaction_kthread); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 8ef12623d65c..2049c9be85ee 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -58,7 +58,7 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */ -#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ +#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to preallocated extent */ #define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */ diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 9e119552ed32..9d4c05b14f6e 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -85,7 +85,7 @@ struct btrfs_qgroup { /* * temp variables for accounting operations - * Refer to qgroup_shared_accouting() for details. + * Refer to qgroup_shared_accounting() for details. */ u64 old_refcnt; u64 new_refcnt; @@ -499,7 +499,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) } /* * we call btrfs_free_qgroup_config() when umounting - * filesystem and disabling quota, so we set qgroup_ulit + * filesystem and disabling quota, so we set qgroup_ulist * to be null here to avoid double free. */ ulist_free(fs_info->qgroup_ulist); @@ -1036,7 +1036,7 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info, /* * The easy accounting, if we are adding/removing the only ref for an extent - * then this qgroup and all of the parent qgroups get their refrence and + * then this qgroup and all of the parent qgroups get their reference and * exclusive counts adjusted. * * Caller should hold fs_info->qgroup_lock. @@ -1436,7 +1436,7 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, /* * No need to do lock, since this function will only be called in - * btrfs_commmit_transaction(). + * btrfs_commit_transaction(). */ node = rb_first(&delayed_refs->dirty_extent_root); while (node) { @@ -1557,7 +1557,7 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, * A: cur_old_roots < nr_old_roots (not exclusive before) * !A: cur_old_roots == nr_old_roots (possible exclusive before) * B: cur_new_roots < nr_new_roots (not exclusive now) - * !B: cur_new_roots == nr_new_roots (possible exclsuive now) + * !B: cur_new_roots == nr_new_roots (possible exclusive now) * * Results: * +: Possible sharing -> exclusive -: Possible exclusive -> sharing @@ -1851,7 +1851,7 @@ out: } /* - * Copy the acounting information between qgroups. This is necessary + * Copy the accounting information between qgroups. This is necessary * when a snapshot or a subvolume is created. Throwing an error will * cause a transaction abort so we take extra care here to only error * when a readonly fs is a reasonable outcome. @@ -2340,7 +2340,7 @@ out: mutex_unlock(&fs_info->qgroup_rescan_lock); /* - * only update status, since the previous part has alreay updated the + * only update status, since the previous part has already updated the * qgroup info. */ trans = btrfs_start_transaction(fs_info->quota_root, 1); @@ -2542,8 +2542,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len) changeset.bytes_changed = 0; changeset.range_changed = ulist_alloc(GFP_NOFS); ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start, - start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS, - &changeset); + start + len -1, EXTENT_QGROUP_RESERVED, &changeset); trace_btrfs_qgroup_reserve_data(inode, start, len, changeset.bytes_changed, QGROUP_RESERVE); @@ -2580,8 +2579,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, return -ENOMEM; ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, - start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS, - &changeset); + start + len -1, EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) goto out; @@ -2672,7 +2670,7 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) } /* - * Check qgroup reserved space leaking, normally at destory inode + * Check qgroup reserved space leaking, normally at destroy inode * time */ void btrfs_qgroup_check_reserved_leak(struct inode *inode) @@ -2688,7 +2686,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) return; ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, - EXTENT_QGROUP_RESERVED, GFP_NOFS, &changeset); + EXTENT_QGROUP_RESERVED, &changeset); WARN_ON(ret < 0); if (WARN_ON(changeset.bytes_changed)) { diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 0b7792e02dd5..f8b6d411a034 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -576,7 +576,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, * we can't merge with cached rbios, since the * idea is that when we merge the destination * rbio is going to run our IO for us. We can - * steal from cached rbio's though, other functions + * steal from cached rbios though, other functions * handle that. */ if (test_bit(RBIO_CACHE_BIT, &last->flags) || @@ -2368,7 +2368,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); } - /* Check scrubbing pairty and repair it */ + /* Check scrubbing parity and repair it */ p = rbio_stripe_page(rbio, rbio->scrubp, pagenr); parity = kmap(p); if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE)) @@ -2493,7 +2493,7 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) /* * Here means we got one corrupted data stripe and one * corrupted parity on RAID6, if the corrupted parity - * is scrubbing parity, luckly, use the other one to repair + * is scrubbing parity, luckily, use the other one to repair * the data, or we can not repair the data stripe. */ if (failp != rbio->scrubp) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 1cfd35cfac76..0477dca154ed 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -668,8 +668,8 @@ int find_inline_backref(struct extent_buffer *leaf, int slot, * roots of b-trees that reference the tree block. * * the basic idea of this function is check backrefs of a given block - * to find upper level blocks that refernece the block, and then check - * bakcrefs of these upper level blocks recursively. the recursion stop + * to find upper level blocks that reference the block, and then check + * backrefs of these upper level blocks recursively. the recursion stop * when tree root is reached or backrefs for the block is cached. * * NOTE: if we find backrefs for a block are cached, we know backrefs @@ -1160,7 +1160,7 @@ out: if (!RB_EMPTY_NODE(&upper->rb_node)) continue; - /* Add this guy's upper edges to the list to proces */ + /* Add this guy's upper edges to the list to process */ list_for_each_entry(edge, &upper->upper, list[LOWER]) list_add_tail(&edge->list[UPPER], &list); if (list_empty(&upper->upper)) @@ -2396,7 +2396,7 @@ again: } /* - * we keep the old last snapshod transid in rtranid when we + * we keep the old last snapshot transid in rtranid when we * created the relocation tree. */ last_snap = btrfs_root_rtransid(&reloc_root->root_item); @@ -2616,7 +2616,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, * only one thread can access block_rsv at this point, * so we don't need hold lock to protect block_rsv. * we expand more reservation size here to allow enough - * space for relocation and we will return eailer in + * space for relocation and we will return earlier in * enospc case. */ rc->block_rsv->size = tmp + rc->extent_root->nodesize * @@ -2814,7 +2814,7 @@ static void mark_block_processed(struct reloc_control *rc, u64 bytenr, u32 blocksize) { set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1, - EXTENT_DIRTY, GFP_NOFS); + EXTENT_DIRTY); } static void __mark_block_processed(struct reloc_control *rc, @@ -3182,7 +3182,7 @@ static int relocate_file_extent_cluster(struct inode *inode, page_start + offset == cluster->boundary[nr]) { set_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, - EXTENT_BOUNDARY, GFP_NOFS); + EXTENT_BOUNDARY); nr++; } @@ -4059,8 +4059,7 @@ restart: } btrfs_release_path(path); - clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, - GFP_NOFS); + clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); if (trans) { btrfs_end_transaction_throttle(trans, rc->extent_root); @@ -4591,7 +4590,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, /* * called before creating snapshot. it calculates metadata reservation - * requried for relocating tree blocks in the snapshot + * required for relocating tree blocks in the snapshot */ void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve) diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index b2b14e7115f1..f1c30861d062 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -71,9 +71,9 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, * search_key: the key to search * path: the path we search * root_item: the root item of the tree we look for - * root_key: the reak key of the tree we look for + * root_key: the root key of the tree we look for * - * If ->offset of 'seach_key' is -1ULL, it means we are not sure the offset + * If ->offset of 'search_key' is -1ULL, it means we are not sure the offset * of the search key, just lookup the root with the highest offset for a * given objectid. * diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index fa35cdc46494..46d847f66e4b 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -745,7 +745,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) * sure we read the bad mirror. */ ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, - EXTENT_DAMAGED, GFP_NOFS); + EXTENT_DAMAGED); if (ret) { /* set_extent_bits should give proper error */ WARN_ON(ret > 0); @@ -763,7 +763,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) end, EXTENT_DAMAGED, 0, NULL); if (!corrected) clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, - EXTENT_DAMAGED, GFP_NOFS); + EXTENT_DAMAGED); } out: @@ -1044,7 +1044,7 @@ nodatasum_case: /* * !is_metadata and !have_csum, this means that the data - * might not be COW'ed, that it might be modified + * might not be COWed, that it might be modified * concurrently. The general strategy to work on the * commit root does not help in the case when COW is not * used. @@ -1125,7 +1125,7 @@ nodatasum_case: * the 2nd page of mirror #1 faces I/O errors, and the 2nd page * of mirror #2 is readable but the final checksum test fails, * then the 2nd page of mirror #3 could be tried, whether now - * the final checksum succeedes. But this would be a rare + * the final checksum succeeds. But this would be a rare * exception and is therefore not implemented. At least it is * avoided that the good copy is overwritten. * A more useful improvement would be to pick the sectors @@ -2181,7 +2181,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; u64 length = sblock->page_count * PAGE_SIZE; u64 logical = sblock->pagev[0]->logical; - struct btrfs_bio *bbio; + struct btrfs_bio *bbio = NULL; struct bio *bio; struct btrfs_raid_bio *rbio; int ret; @@ -2982,6 +2982,7 @@ again: extent_len); mapped_length = extent_len; + bbio = NULL; ret = btrfs_map_block(fs_info, READ, extent_logical, &mapped_length, &bbio, 0); if (!ret) { diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 6a8c86074aa4..b71dd298385c 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1831,7 +1831,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, /* * If we have a parent root we need to verify that the parent dir was - * not delted and then re-created, if it was then we have no overwrite + * not deleted and then re-created, if it was then we have no overwrite * and we can just unlink this entry. */ if (sctx->parent_root) { @@ -4192,9 +4192,9 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, return -ENOMEM; /* - * This hack is needed because empty acl's are stored as zero byte + * This hack is needed because empty acls are stored as zero byte * data in xattrs. Problem with that is, that receiving these zero byte - * acl's will fail later. To fix this, we send a dummy acl list that + * acls will fail later. To fix this, we send a dummy acl list that * only contains the version number and no entries. */ if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) || diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index e05619f241be..875c757e73e2 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -36,7 +36,7 @@ static inline void put_unaligned_le8(u8 val, void *p) * * The end result is that anyone who #includes ctree.h gets a * declaration for the btrfs_set_foo functions and btrfs_foo functions, - * which are wappers of btrfs_set_token_#bits functions and + * which are wrappers of btrfs_set_token_#bits functions and * btrfs_get_token_#bits functions, which are defined in this file. * * These setget functions do all the extent_buffer related mapping diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index bf71071ab6f6..4e59a91a11e0 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -112,7 +112,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) * Note that a running device replace operation is not * canceled here although there is no way to update * the progress. It would add the risk of a deadlock, - * therefore the canceling is ommited. The only penalty + * therefore the canceling is omitted. The only penalty * is that some I/O remains active until the procedure * completes. The next time when the filesystem is * mounted writeable again, the device replace @@ -1877,7 +1877,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) int ret; /* - * We aren't under the device list lock, so this is racey-ish, but good + * We aren't under the device list lock, so this is racy-ish, but good * enough for our purposes. */ nr_devices = fs_info->fs_devices->open_devices; @@ -1896,7 +1896,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) if (!devices_info) return -ENOMEM; - /* calc min stripe number for data space alloction */ + /* calc min stripe number for data space allocation */ type = btrfs_get_alloc_profile(root, 1); if (type & BTRFS_BLOCK_GROUP_RAID0) { min_stripes = 2; @@ -1932,7 +1932,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) avail_space *= BTRFS_STRIPE_LEN; /* - * In order to avoid overwritting the superblock on the drive, + * In order to avoid overwriting the superblock on the drive, * btrfs starts at an offset of at least 1MB when doing chunk * allocation. */ diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 70948b13bc81..55724607f79b 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -113,7 +113,7 @@ static int test_find_delalloc(void) * |--- delalloc ---| * |--- search ---| */ - set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_KERNEL); + set_extent_delalloc(&tmp, 0, 4095, NULL); start = 0; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -144,7 +144,7 @@ static int test_find_delalloc(void) test_msg("Couldn't find the locked page\n"); goto out_bits; } - set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_KERNEL); + set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -176,7 +176,7 @@ static int test_find_delalloc(void) locked_page = find_lock_page(inode->i_mapping, test_start >> PAGE_SHIFT); if (!locked_page) { - test_msg("Could'nt find the locked page\n"); + test_msg("Couldn't find the locked page\n"); goto out_bits; } start = test_start; @@ -199,7 +199,7 @@ static int test_find_delalloc(void) * * We are re-using our test_start from above since it works out well. */ - set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_KERNEL); + set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -262,7 +262,7 @@ static int test_find_delalloc(void) } ret = 0; out_bits: - clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_KERNEL); + clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1); out: if (locked_page) put_page(locked_page); diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 514247515312..0eeb8f3d6b67 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -25,7 +25,7 @@ #define BITS_PER_BITMAP (PAGE_SIZE * 8) /* - * This test just does basic sanity checking, making sure we can add an exten + * This test just does basic sanity checking, making sure we can add an extent * entry and remove space from either end and the middle, and make sure we can * remove space that covers adjacent extent entries. */ @@ -396,8 +396,9 @@ static int check_cache_empty(struct btrfs_block_group_cache *cache) * wasn't optimal as they could be spread all over the block group while under * concurrency (extra overhead and fragmentation). * - * This stealing approach is benefical, since we always prefer to allocate from - * extent entries, both for clustered and non-clustered allocation requests. + * This stealing approach is beneficial, since we always prefer to allocate + * from extent entries, both for clustered and non-clustered allocation + * requests. */ static int test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 863a6a3af1f8..8a25fe8b7c45 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -264,7 +264,7 @@ static noinline int test_btrfs_get_extent(void) /* * We will just free a dummy node if it's ref count is 2 so we need an - * extra ref so our searches don't accidently release our page. + * extra ref so our searches don't accidentally release our page. */ extent_buffer_get(root->node); btrfs_set_header_nritems(root->node, 0); diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 8ea5d34bc5a2..8aa4ded31326 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -234,7 +234,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root) } /* - * Since the test trans doesn't havee the complicated delayed refs, + * Since the test trans doesn't have the complicated delayed refs, * we can only call btrfs_qgroup_account_extent() directly to test * quota. */ diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5b0b758a3f79..f6e24cb423ae 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -944,7 +944,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root, err = convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, - mark, &cached_state, GFP_NOFS); + mark, &cached_state); /* * convert_extent_bit can return -ENOMEM, which is most of the * time a temporary error. So when it happens, ignore the error diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 72be51f7ca2f..9fe0ec2bf0fe 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -144,7 +144,7 @@ struct btrfs_pending_snapshot { /* block reservation for the operation */ struct btrfs_block_rsv block_rsv; u64 qgroup_reserved; - /* extra metadata reseration for relocation */ + /* extra metadata reservation for relocation */ int error; bool readonly; struct list_head list; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8aaca5c6af94..b7665af471d8 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2330,7 +2330,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, break; /* for regular files, make sure corresponding - * orhpan item exist. extents past the new EOF + * orphan item exist. extents past the new EOF * will be truncated later by orphan cleanup. */ if (S_ISREG(mode)) { @@ -3001,7 +3001,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans, break; clear_extent_bits(&log->dirty_log_pages, start, end, - EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); + EXTENT_DIRTY | EXTENT_NEW); } /* @@ -4914,7 +4914,7 @@ out_unlock: * the actual unlink operation, so if we do this check before a concurrent task * sets last_unlink_trans it means we've logged a consistent version/state of * all the inode items, otherwise we are not sure and must do a transaction - * commit (the concurrent task migth have only updated last_unlink_trans before + * commit (the concurrent task might have only updated last_unlink_trans before * we logged the inode or it might have also done the unlink). */ static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, @@ -4973,7 +4973,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, while (1) { /* * If we are logging a directory then we start with our inode, - * not our parents inode, so we need to skipp setting the + * not our parent's inode, so we need to skip setting the * logged_trans so that further down in the log code we don't * think this inode has already been logged. */ @@ -5357,7 +5357,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, log_dentries = true; /* - * On unlink we must make sure all our current and old parent directores + * On unlink we must make sure all our current and old parent directory * inodes are fully logged. This is to prevent leaving dangling * directory index entries in directories that were our parents but are * not anymore. Not doing this results in old parent directory being diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 91feb2bdefee..b1434bb57e36 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -28,7 +28,7 @@ * } * ulist_free(ulist); * - * This assumes the graph nodes are adressable by u64. This stems from the + * This assumes the graph nodes are addressable by u64. This stems from the * usage for tree enumeration in btrfs, where the logical addresses are * 64 bit. * diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2b88127bba5b..bdc62561ede8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2190,7 +2190,7 @@ static int btrfs_prepare_sprout(struct btrfs_root *root) } /* - * strore the expected generation for seed devices in device items. + * Store the expected generation for seed devices in device items. */ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, struct btrfs_root *root) @@ -3387,7 +3387,7 @@ static int should_balance_chunk(struct btrfs_root *root, } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { /* * Same logic as the 'limit' filter; the minimum cannot be - * determined here because we do not have the global informatoin + * determined here because we do not have the global information * about the count of all chunks that satisfy the filters. */ if (bargs->limit_max == 0) @@ -6076,7 +6076,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) { atomic_inc(&bbio->error); if (atomic_dec_and_test(&bbio->stripes_pending)) { - /* Shoud be the original bio. */ + /* Should be the original bio. */ WARN_ON(bio != bbio->orig_bio); btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; @@ -6560,7 +6560,7 @@ int btrfs_read_sys_array(struct btrfs_root *root) set_extent_buffer_uptodate(sb); btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); /* - * The sb extent buffer is artifical and just used to read the system array. + * The sb extent buffer is artificial and just used to read the system array. * set_extent_buffer_uptodate() call does not properly mark all it's * pages up-to-date when the page is larger: extent does not cover the * whole page and consequently check_page_uptodate does not find all @@ -6630,13 +6630,13 @@ int btrfs_read_sys_array(struct btrfs_root *root) sb_array_offset += len; cur_offset += len; } - free_extent_buffer(sb); + free_extent_buffer_stale(sb); return ret; out_short_read: printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n", len, cur_offset); - free_extent_buffer(sb); + free_extent_buffer_stale(sb); return -EIO; } diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 3bfb252206c7..d1a177a3dbe8 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -380,23 +380,21 @@ static int btrfs_xattr_handler_get(const struct xattr_handler *handler, } static int btrfs_xattr_handler_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, - int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); - name = xattr_full_name(handler, name); return __btrfs_setxattr(NULL, inode, name, buffer, size, flags); } static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, - struct dentry *dentry, + struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { name = xattr_full_name(handler, name); - return btrfs_set_prop(d_inode(dentry), name, value, size, flags); + return btrfs_set_prop(inode, name, value, size, flags); } static const struct xattr_handler btrfs_security_xattr_handler = { diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 43098cd9602b..eeb71e5de27a 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -257,12 +257,12 @@ static int ceph_readpage(struct file *filp, struct page *page) /* * Finish an async read(ahead) op. */ -static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) +static void finish_read(struct ceph_osd_request *req) { struct inode *inode = req->r_inode; struct ceph_osd_data *osd_data; - int rc = req->r_result; - int bytes = le32_to_cpu(msg->hdr.data_len); + int rc = req->r_result <= 0 ? req->r_result : 0; + int bytes = req->r_result >= 0 ? req->r_result : 0; int num_pages; int i; @@ -376,8 +376,6 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) req->r_callback = finish_read; req->r_inode = inode; - ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); - dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); ret = ceph_osdc_start_request(osdc, req, false); if (ret < 0) @@ -546,11 +544,21 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) truncate_seq, truncate_size, &inode->i_mtime, &page, 1); if (err < 0) { - dout("writepage setting page/mapping error %d %p\n", err, page); + struct writeback_control tmp_wbc; + if (!wbc) + wbc = &tmp_wbc; + if (err == -ERESTARTSYS) { + /* killed by SIGKILL */ + dout("writepage interrupted page %p\n", page); + redirty_page_for_writepage(wbc, page); + end_page_writeback(page); + goto out; + } + dout("writepage setting page/mapping error %d %p\n", + err, page); SetPageError(page); mapping_set_error(&inode->i_data, err); - if (wbc) - wbc->pages_skipped++; + wbc->pages_skipped++; } else { dout("writepage cleaned page %p\n", page); err = 0; /* vfs expects us to return 0 */ @@ -571,12 +579,16 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) BUG_ON(!inode); ihold(inode); err = writepage_nounlock(page, wbc); + if (err == -ERESTARTSYS) { + /* direct memory reclaimer was killed by SIGKILL. return 0 + * to prevent caller from setting mapping/page error */ + err = 0; + } unlock_page(page); iput(inode); return err; } - /* * lame release_pages helper. release_pages() isn't exported to * modules. @@ -600,8 +612,7 @@ static void ceph_release_pages(struct page **pages, int num) * If we get an error, set the mapping error bit, but not the individual * page error bits. */ -static void writepages_finish(struct ceph_osd_request *req, - struct ceph_msg *msg) +static void writepages_finish(struct ceph_osd_request *req) { struct inode *inode = req->r_inode; struct ceph_inode_info *ci = ceph_inode(inode); @@ -615,7 +626,6 @@ static void writepages_finish(struct ceph_osd_request *req, struct ceph_fs_client *fsc = ceph_inode_to_client(inode); bool remove_page; - dout("writepages_finish %p rc %d\n", inode, rc); if (rc < 0) mapping_set_error(mapping, rc); @@ -650,6 +660,9 @@ static void writepages_finish(struct ceph_osd_request *req, clear_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); + if (rc < 0) + SetPageError(page); + ceph_put_snap_context(page_snap_context(page)); page->private = 0; ClearPagePrivate(page); @@ -718,8 +731,11 @@ static int ceph_writepages_start(struct address_space *mapping, (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { - pr_warn("writepage_start %p on forced umount\n", inode); - truncate_pagecache(inode, 0); + if (ci->i_wrbuffer_ref > 0) { + pr_warn_ratelimited( + "writepage_start %p %lld forced umount\n", + inode, ceph_ino(inode)); + } mapping_set_error(mapping, -EIO); return -EIO; /* we're in a forced umount, don't write! */ } @@ -1063,10 +1079,7 @@ new_request: pages = NULL; } - vino = ceph_vino(inode); - ceph_osdc_build_request(req, offset, snapc, vino.snap, - &inode->i_mtime); - + req->r_mtime = inode->i_mtime; rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); BUG_ON(rc); req = NULL; @@ -1099,8 +1112,7 @@ release_pvec_pages: mapping->writeback_index = index; out: - if (req) - ceph_osdc_put_request(req); + ceph_osdc_put_request(req); ceph_put_snap_context(snapc); dout("writepages done, rc = %d\n", rc); return rc; @@ -1134,6 +1146,7 @@ static int ceph_update_writeable_page(struct file *file, struct page *page) { struct inode *inode = file_inode(file); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); loff_t page_off = pos & PAGE_MASK; int pos_in_page = pos & ~PAGE_MASK; @@ -1142,6 +1155,12 @@ static int ceph_update_writeable_page(struct file *file, int r; struct ceph_snap_context *snapc, *oldest; + if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + dout(" page %p forced umount\n", page); + unlock_page(page); + return -EIO; + } + retry_locked: /* writepages currently holds page lock, but if we change that later, */ wait_on_page_writeback(page); @@ -1165,7 +1184,7 @@ retry_locked: snapc = ceph_get_snap_context(snapc); unlock_page(page); ceph_queue_writeback(inode); - r = wait_event_interruptible(ci->i_cap_wq, + r = wait_event_killable(ci->i_cap_wq, context_is_writeable_or_written(inode, snapc)); ceph_put_snap_context(snapc); if (r == -ERESTARTSYS) @@ -1311,6 +1330,17 @@ const struct address_space_operations ceph_aops = { .direct_IO = ceph_direct_io, }; +static void ceph_block_sigs(sigset_t *oldset) +{ + sigset_t mask; + siginitsetinv(&mask, sigmask(SIGKILL)); + sigprocmask(SIG_BLOCK, &mask, oldset); +} + +static void ceph_restore_sigs(sigset_t *oldset) +{ + sigprocmask(SIG_SETMASK, oldset, NULL); +} /* * vm ops @@ -1323,6 +1353,9 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct page *pinned_page = NULL; loff_t off = vmf->pgoff << PAGE_SHIFT; int want, got, ret; + sigset_t oldset; + + ceph_block_sigs(&oldset); dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE); @@ -1330,17 +1363,12 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; - while (1) { - got = 0; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, - -1, &got, &pinned_page); - if (ret == 0) - break; - if (ret != -ERESTARTSYS) { - WARN_ON(1); - return VM_FAULT_SIGBUS; - } - } + + got = 0; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); + if (ret < 0) + goto out_restore; + dout("filemap_fault %p %llu~%zd got cap refs on %s\n", inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got)); @@ -1357,7 +1385,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ceph_put_cap_refs(ci, got); if (ret != -EAGAIN) - return ret; + goto out_restore; /* read inline data */ if (off >= PAGE_SIZE) { @@ -1371,15 +1399,18 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ~__GFP_FS)); if (!page) { ret = VM_FAULT_OOM; - goto out; + goto out_inline; } ret1 = __ceph_do_getattr(inode, page, CEPH_STAT_CAP_INLINE_DATA, true); if (ret1 < 0 || off >= i_size_read(inode)) { unlock_page(page); put_page(page); - ret = VM_FAULT_SIGBUS; - goto out; + if (ret1 < 0) + ret = ret1; + else + ret = VM_FAULT_SIGBUS; + goto out_inline; } if (ret1 < PAGE_SIZE) zero_user_segment(page, ret1, PAGE_SIZE); @@ -1388,10 +1419,15 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) SetPageUptodate(page); vmf->page = page; ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; +out_inline: + dout("filemap_fault %p %llu~%zd read inline data ret %d\n", + inode, off, (size_t)PAGE_SIZE, ret); } -out: - dout("filemap_fault %p %llu~%zd read inline data ret %d\n", - inode, off, (size_t)PAGE_SIZE, ret); +out_restore: + ceph_restore_sigs(&oldset); + if (ret < 0) + ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS; + return ret; } @@ -1409,10 +1445,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) loff_t size = i_size_read(inode); size_t len; int want, got, ret; + sigset_t oldset; prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) - return VM_FAULT_SIGBUS; + return VM_FAULT_OOM; + + ceph_block_sigs(&oldset); if (ci->i_inline_version != CEPH_INLINE_NONE) { struct page *locked_page = NULL; @@ -1423,10 +1462,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = ceph_uninline_data(vma->vm_file, locked_page); if (locked_page) unlock_page(locked_page); - if (ret < 0) { - ret = VM_FAULT_SIGBUS; + if (ret < 0) goto out_free; - } } if (off + PAGE_SIZE <= size) @@ -1440,45 +1477,36 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_BUFFER; - while (1) { - got = 0; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, - &got, NULL); - if (ret == 0) - break; - if (ret != -ERESTARTSYS) { - WARN_ON(1); - ret = VM_FAULT_SIGBUS; - goto out_free; - } - } + + got = 0; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, + &got, NULL); + if (ret < 0) + goto out_free; + dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", inode, off, len, ceph_cap_string(got)); /* Update time before taking page lock */ file_update_time(vma->vm_file); - lock_page(page); + do { + lock_page(page); - ret = VM_FAULT_NOPAGE; - if ((off > size) || - (page->mapping != inode->i_mapping)) { - unlock_page(page); - goto out; - } + if ((off > size) || (page->mapping != inode->i_mapping)) { + unlock_page(page); + ret = VM_FAULT_NOPAGE; + break; + } + + ret = ceph_update_writeable_page(vma->vm_file, off, len, page); + if (ret >= 0) { + /* success. we'll keep the page locked. */ + set_page_dirty(page); + ret = VM_FAULT_LOCKED; + } + } while (ret == -EAGAIN); - ret = ceph_update_writeable_page(vma->vm_file, off, len, page); - if (ret >= 0) { - /* success. we'll keep the page locked. */ - set_page_dirty(page); - ret = VM_FAULT_LOCKED; - } else { - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else - ret = VM_FAULT_SIGBUS; - } -out: if (ret == VM_FAULT_LOCKED || ci->i_inline_version != CEPH_INLINE_NONE) { int dirty; @@ -1495,8 +1523,10 @@ out: inode, off, len, ceph_cap_string(got), ret); ceph_put_cap_refs(ci, got); out_free: + ceph_restore_sigs(&oldset); ceph_free_cap_flush(prealloc_cf); - + if (ret < 0) + ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS; return ret; } @@ -1614,7 +1644,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) goto out; } - ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); + req->r_mtime = inode->i_mtime; err = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!err) err = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1657,7 +1687,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) goto out_put; } - ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); + req->r_mtime = inode->i_mtime; err = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!err) err = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1758,9 +1788,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) rd_req->r_flags = CEPH_OSD_FLAG_READ; osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0); rd_req->r_base_oloc.pool = pool; - snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name), - "%llx.00000000", ci->i_vino.ino); - rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name); + ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino); + + err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS); + if (err) + goto out_unlock; wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL, 1, false, GFP_NOFS); @@ -1769,11 +1801,14 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) goto out_unlock; } - wr_req->r_flags = CEPH_OSD_FLAG_WRITE | - CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; + wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK; osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); - wr_req->r_base_oloc.pool = pool; - wr_req->r_base_oid = rd_req->r_base_oid; + ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc); + ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid); + + err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS); + if (err) + goto out_unlock; /* one page should be large enough for STAT data */ pages = ceph_alloc_page_vector(1, GFP_KERNEL); @@ -1784,12 +1819,9 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE, 0, false, true); - ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP, - &ci->vfs_inode.i_mtime); err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); - ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP, - &ci->vfs_inode.i_mtime); + wr_req->r_mtime = ci->vfs_inode.i_mtime; err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); if (!err) @@ -1823,10 +1855,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) out_unlock: up_write(&mdsc->pool_perm_rwsem); - if (rd_req) - ceph_osdc_put_request(rd_req); - if (wr_req) - ceph_osdc_put_request(wr_req); + ceph_osdc_put_request(rd_req); + ceph_osdc_put_request(wr_req); out: if (!err) err = have; diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index a351480dbabc..c052b5bf219b 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -236,7 +236,7 @@ static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int unlock_page(page); } -static inline int cache_valid(struct ceph_inode_info *ci) +static inline bool cache_valid(struct ceph_inode_info *ci) { return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) && (ci->i_fscache_gen == ci->i_rdcache_gen)); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index cfaeef18cbca..c17b5d76d75e 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1656,7 +1656,7 @@ retry_locked: */ if ((!is_delayed || mdsc->stopping) && !S_ISDIR(inode->i_mode) && /* ignore readdir cache */ - ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ + !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */ inode->i_data.nrpages && /* have cached pages */ (revoking & (CEPH_CAP_FILE_CACHE| CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */ @@ -1698,8 +1698,8 @@ retry_locked: revoking = cap->implemented & ~cap->issued; dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n", - cap->mds, cap, ceph_cap_string(cap->issued), - ceph_cap_string(cap_used), + cap->mds, cap, ceph_cap_string(cap_used), + ceph_cap_string(cap->issued), ceph_cap_string(cap->implemented), ceph_cap_string(revoking)); @@ -2317,7 +2317,7 @@ again: /* make sure file is actually open */ file_wanted = __ceph_caps_file_wanted(ci); - if ((file_wanted & need) == 0) { + if ((file_wanted & need) != need) { dout("try_get_cap_refs need %s file_wanted %s, EBADF\n", ceph_cap_string(need), ceph_cap_string(file_wanted)); *err = -EBADF; @@ -2412,12 +2412,26 @@ again: goto out_unlock; } - if (!__ceph_is_any_caps(ci) && - ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { - dout("get_cap_refs %p forced umount\n", inode); - *err = -EIO; - ret = 1; - goto out_unlock; + if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) { + int mds_wanted; + if (ACCESS_ONCE(mdsc->fsc->mount_state) == + CEPH_MOUNT_SHUTDOWN) { + dout("get_cap_refs %p forced umount\n", inode); + *err = -EIO; + ret = 1; + goto out_unlock; + } + mds_wanted = __ceph_caps_mds_wanted(ci); + if ((mds_wanted & need) != need) { + dout("get_cap_refs %p caps were dropped" + " (session killed?)\n", inode); + *err = -ESTALE; + ret = 1; + goto out_unlock; + } + if ((mds_wanted & file_wanted) == + (file_wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR))) + ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED; } dout("get_cap_refs %p have %s needed %s\n", inode, @@ -2487,7 +2501,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, if (err == -EAGAIN) continue; if (err < 0) - return err; + ret = err; } else { ret = wait_event_interruptible(ci->i_cap_wq, try_get_cap_refs(ci, need, want, endoff, @@ -2496,8 +2510,15 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, continue; if (err < 0) ret = err; - if (ret < 0) - return ret; + } + if (ret < 0) { + if (err == -ESTALE) { + /* session was killed, try renew caps */ + ret = ceph_renew_caps(&ci->vfs_inode); + if (ret == 0) + continue; + } + return ret; } if (ci->i_inline_version != CEPH_INLINE_NONE && @@ -2807,7 +2828,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */ ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && - !ci->i_wrbuffer_ref) { + !(ci->i_wrbuffer_ref || ci->i_wb_ref)) { if (try_nonblocking_invalidate(inode)) { /* there were locked pages.. invalidate later in a separate thread. */ @@ -3226,6 +3247,8 @@ retry: if (target < 0) { __ceph_remove_cap(cap, false); + if (!ci->i_auth_cap) + ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; goto out_unlock; } diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 31f831471ed2..39ff678e567f 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -109,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p) path ? path : ""); spin_unlock(&req->r_old_dentry->d_lock); kfree(path); - } else if (req->r_path2) { + } else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) { if (req->r_ino2.ino) seq_printf(s, " #%llx/%s", req->r_ino2.ino, req->r_path2); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 3ab1192d2029..6e0fedf6713b 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -70,16 +70,42 @@ out_unlock: } /* - * for readdir, we encode the directory frag and offset within that - * frag into f_pos. + * for f_pos for readdir: + * - hash order: + * (0xff << 52) | ((24 bits hash) << 28) | + * (the nth entry has hash collision); + * - frag+name order; + * ((frag value) << 28) | (the nth entry in frag); */ +#define OFFSET_BITS 28 +#define OFFSET_MASK ((1 << OFFSET_BITS) - 1) +#define HASH_ORDER (0xffull << (OFFSET_BITS + 24)) +loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order) +{ + loff_t fpos = ((loff_t)high << 28) | (loff_t)off; + if (hash_order) + fpos |= HASH_ORDER; + return fpos; +} + +static bool is_hash_order(loff_t p) +{ + return (p & HASH_ORDER) == HASH_ORDER; +} + static unsigned fpos_frag(loff_t p) { - return p >> 32; + return p >> OFFSET_BITS; } + +static unsigned fpos_hash(loff_t p) +{ + return ceph_frag_value(fpos_frag(p)); +} + static unsigned fpos_off(loff_t p) { - return p & 0xffffffff; + return p & OFFSET_MASK; } static int fpos_cmp(loff_t l, loff_t r) @@ -111,6 +137,50 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name, return 0; } + +static struct dentry * +__dcache_find_get_entry(struct dentry *parent, u64 idx, + struct ceph_readdir_cache_control *cache_ctl) +{ + struct inode *dir = d_inode(parent); + struct dentry *dentry; + unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1; + loff_t ptr_pos = idx * sizeof(struct dentry *); + pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT; + + if (ptr_pos >= i_size_read(dir)) + return NULL; + + if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) { + ceph_readdir_cache_release(cache_ctl); + cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff); + if (!cache_ctl->page) { + dout(" page %lu not found\n", ptr_pgoff); + return ERR_PTR(-EAGAIN); + } + /* reading/filling the cache are serialized by + i_mutex, no need to use page lock */ + unlock_page(cache_ctl->page); + cache_ctl->dentries = kmap(cache_ctl->page); + } + + cache_ctl->index = idx & idx_mask; + + rcu_read_lock(); + spin_lock(&parent->d_lock); + /* check i_size again here, because empty directory can be + * marked as complete while not holding the i_mutex. */ + if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir)) + dentry = cache_ctl->dentries[cache_ctl->index]; + else + dentry = NULL; + spin_unlock(&parent->d_lock); + if (dentry && !lockref_get_not_dead(&dentry->d_lockref)) + dentry = NULL; + rcu_read_unlock(); + return dentry ? : ERR_PTR(-EAGAIN); +} + /* * When possible, we try to satisfy a readdir by peeking at the * dcache. We make this work by carefully ordering dentries on @@ -130,75 +200,68 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, struct inode *dir = d_inode(parent); struct dentry *dentry, *last = NULL; struct ceph_dentry_info *di; - unsigned nsize = PAGE_SIZE / sizeof(struct dentry *); - int err = 0; - loff_t ptr_pos = 0; struct ceph_readdir_cache_control cache_ctl = {}; + u64 idx = 0; + int err = 0; - dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos); + dout("__dcache_readdir %p v%u at %llx\n", dir, shared_gen, ctx->pos); + + /* search start position */ + if (ctx->pos > 2) { + u64 count = div_u64(i_size_read(dir), sizeof(struct dentry *)); + while (count > 0) { + u64 step = count >> 1; + dentry = __dcache_find_get_entry(parent, idx + step, + &cache_ctl); + if (!dentry) { + /* use linar search */ + idx = 0; + break; + } + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out; + } + di = ceph_dentry(dentry); + spin_lock(&dentry->d_lock); + if (fpos_cmp(di->offset, ctx->pos) < 0) { + idx += step + 1; + count -= step + 1; + } else { + count = step; + } + spin_unlock(&dentry->d_lock); + dput(dentry); + } - /* we can calculate cache index for the first dirfrag */ - if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) { - cache_ctl.index = fpos_off(ctx->pos) - 2; - BUG_ON(cache_ctl.index < 0); - ptr_pos = cache_ctl.index * sizeof(struct dentry *); + dout("__dcache_readdir %p cache idx %llu\n", dir, idx); } - while (true) { - pgoff_t pgoff; - bool emit_dentry; - if (ptr_pos >= i_size_read(dir)) { + for (;;) { + bool emit_dentry = false; + dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl); + if (!dentry) { fi->flags |= CEPH_F_ATEND; err = 0; break; } - - err = -EAGAIN; - pgoff = ptr_pos >> PAGE_SHIFT; - if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) { - ceph_readdir_cache_release(&cache_ctl); - cache_ctl.page = find_lock_page(&dir->i_data, pgoff); - if (!cache_ctl.page) { - dout(" page %lu not found\n", pgoff); - break; - } - /* reading/filling the cache are serialized by - * i_mutex, no need to use page lock */ - unlock_page(cache_ctl.page); - cache_ctl.dentries = kmap(cache_ctl.page); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out; } - rcu_read_lock(); - spin_lock(&parent->d_lock); - /* check i_size again here, because empty directory can be - * marked as complete while not holding the i_mutex. */ - if (ceph_dir_is_complete_ordered(dir) && - ptr_pos < i_size_read(dir)) - dentry = cache_ctl.dentries[cache_ctl.index % nsize]; - else - dentry = NULL; - spin_unlock(&parent->d_lock); - if (dentry && !lockref_get_not_dead(&dentry->d_lockref)) - dentry = NULL; - rcu_read_unlock(); - if (!dentry) - break; - - emit_dentry = false; di = ceph_dentry(dentry); spin_lock(&dentry->d_lock); if (di->lease_shared_gen == shared_gen && d_really_is_positive(dentry) && - ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR && - ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH && fpos_cmp(ctx->pos, di->offset) <= 0) { emit_dentry = true; } spin_unlock(&dentry->d_lock); if (emit_dentry) { - dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, + dout(" %llx dentry %p %pd %p\n", di->offset, dentry, dentry, d_inode(dentry)); ctx->pos = di->offset; if (!dir_emit(ctx, dentry->d_name.name, @@ -218,10 +281,8 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, } else { dput(dentry); } - - cache_ctl.index++; - ptr_pos += sizeof(struct dentry *); } +out: ceph_readdir_cache_release(&cache_ctl); if (last) { int ret; @@ -235,6 +296,16 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, return err; } +static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos) +{ + if (!fi->last_readdir) + return true; + if (is_hash_order(pos)) + return !ceph_frag_contains_value(fi->frag, fpos_hash(pos)); + else + return fi->frag != fpos_frag(pos); +} + static int ceph_readdir(struct file *file, struct dir_context *ctx) { struct ceph_file_info *fi = file->private_data; @@ -242,13 +313,12 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; - unsigned frag = fpos_frag(ctx->pos); - int off = fpos_off(ctx->pos); + int i; int err; u32 ftype; struct ceph_mds_reply_info_parsed *rinfo; - dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off); + dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos); if (fi->flags & CEPH_F_ATEND) return 0; @@ -260,7 +330,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) inode->i_mode >> 12)) return 0; ctx->pos = 1; - off = 1; } if (ctx->pos == 1) { ino_t ino = parent_ino(file->f_path.dentry); @@ -270,7 +339,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) inode->i_mode >> 12)) return 0; ctx->pos = 2; - off = 2; } /* can we use the dcache? */ @@ -285,8 +353,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) err = __dcache_readdir(file, ctx, shared_gen); if (err != -EAGAIN) return err; - frag = fpos_frag(ctx->pos); - off = fpos_off(ctx->pos); } else { spin_unlock(&ci->i_ceph_lock); } @@ -294,8 +360,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* proceed with a normal readdir */ more: /* do we have the correct frag content buffered? */ - if (fi->frag != frag || fi->last_readdir == NULL) { + if (need_send_readdir(fi, ctx->pos)) { struct ceph_mds_request *req; + unsigned frag; int op = ceph_snap(inode) == CEPH_SNAPDIR ? CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; @@ -305,6 +372,13 @@ more: fi->last_readdir = NULL; } + if (is_hash_order(ctx->pos)) { + frag = ceph_choose_frag(ci, fpos_hash(ctx->pos), + NULL, NULL); + } else { + frag = fpos_frag(ctx->pos); + } + dout("readdir fetching %llx.%llx frag %x offset '%s'\n", ceph_vinop(inode), frag, fi->last_name); req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); @@ -331,6 +405,8 @@ more: req->r_readdir_cache_idx = fi->readdir_cache_idx; req->r_readdir_offset = fi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); + req->r_args.readdir.flags = + cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS); req->r_inode = inode; ihold(inode); @@ -340,22 +416,26 @@ more: ceph_mdsc_put_request(req); return err; } - dout("readdir got and parsed readdir result=%d" - " on frag %x, end=%d, complete=%d\n", err, frag, + dout("readdir got and parsed readdir result=%d on " + "frag %x, end=%d, complete=%d, hash_order=%d\n", + err, frag, (int)req->r_reply_info.dir_end, - (int)req->r_reply_info.dir_complete); - + (int)req->r_reply_info.dir_complete, + (int)req->r_reply_info.hash_order); - /* note next offset and last dentry name */ rinfo = &req->r_reply_info; if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { frag = le32_to_cpu(rinfo->dir_dir->frag); - off = req->r_readdir_offset; - fi->next_offset = off; + if (!rinfo->hash_order) { + fi->next_offset = req->r_readdir_offset; + /* adjust ctx->pos to beginning of frag */ + ctx->pos = ceph_make_fpos(frag, + fi->next_offset, + false); + } } fi->frag = frag; - fi->offset = fi->next_offset; fi->last_readdir = req; if (req->r_did_prepopulate) { @@ -363,7 +443,8 @@ more: if (fi->readdir_cache_idx < 0) { /* preclude from marking dir ordered */ fi->dir_ordered_count = 0; - } else if (ceph_frag_is_leftmost(frag) && off == 2) { + } else if (ceph_frag_is_leftmost(frag) && + fi->next_offset == 2) { /* note dir version at start of readdir so * we can tell if any dentries get dropped */ fi->dir_release_count = req->r_dir_release_cnt; @@ -377,65 +458,87 @@ more: fi->dir_release_count = 0; } - if (req->r_reply_info.dir_end) { - kfree(fi->last_name); - fi->last_name = NULL; - if (ceph_frag_is_rightmost(frag)) - fi->next_offset = 2; - else - fi->next_offset = 0; - } else { - err = note_last_dentry(fi, - rinfo->dir_dname[rinfo->dir_nr-1], - rinfo->dir_dname_len[rinfo->dir_nr-1], - fi->next_offset + rinfo->dir_nr); + /* note next offset and last dentry name */ + if (rinfo->dir_nr > 0) { + struct ceph_mds_reply_dir_entry *rde = + rinfo->dir_entries + (rinfo->dir_nr-1); + unsigned next_offset = req->r_reply_info.dir_end ? + 2 : (fpos_off(rde->offset) + 1); + err = note_last_dentry(fi, rde->name, rde->name_len, + next_offset); if (err) return err; + } else if (req->r_reply_info.dir_end) { + fi->next_offset = 2; + /* keep last name */ } } rinfo = &fi->last_readdir->r_reply_info; - dout("readdir frag %x num %d off %d chunkoff %d\n", frag, - rinfo->dir_nr, off, fi->offset); - - ctx->pos = ceph_make_fpos(frag, off); - while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { - struct ceph_mds_reply_inode *in = - rinfo->dir_in[off - fi->offset].in; + dout("readdir frag %x num %d pos %llx chunk first %llx\n", + fi->frag, rinfo->dir_nr, ctx->pos, + rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL); + + i = 0; + /* search start position */ + if (rinfo->dir_nr > 0) { + int step, nr = rinfo->dir_nr; + while (nr > 0) { + step = nr >> 1; + if (rinfo->dir_entries[i + step].offset < ctx->pos) { + i += step + 1; + nr -= step + 1; + } else { + nr = step; + } + } + } + for (; i < rinfo->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; struct ceph_vino vino; ino_t ino; - dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", - off, off - fi->offset, rinfo->dir_nr, ctx->pos, - rinfo->dir_dname_len[off - fi->offset], - rinfo->dir_dname[off - fi->offset], in); - BUG_ON(!in); - ftype = le32_to_cpu(in->mode) >> 12; - vino.ino = le64_to_cpu(in->ino); - vino.snap = le64_to_cpu(in->snapid); + BUG_ON(rde->offset < ctx->pos); + + ctx->pos = rde->offset; + dout("readdir (%d/%d) -> %llx '%.*s' %p\n", + i, rinfo->dir_nr, ctx->pos, + rde->name_len, rde->name, &rde->inode.in); + + BUG_ON(!rde->inode.in); + ftype = le32_to_cpu(rde->inode.in->mode) >> 12; + vino.ino = le64_to_cpu(rde->inode.in->ino); + vino.snap = le64_to_cpu(rde->inode.in->snapid); ino = ceph_vino_to_ino(vino); - if (!dir_emit(ctx, - rinfo->dir_dname[off - fi->offset], - rinfo->dir_dname_len[off - fi->offset], - ceph_translate_ino(inode->i_sb, ino), ftype)) { + + if (!dir_emit(ctx, rde->name, rde->name_len, + ceph_translate_ino(inode->i_sb, ino), ftype)) { dout("filldir stopping us...\n"); return 0; } - off++; ctx->pos++; } - if (fi->last_name) { + if (fi->next_offset > 2) { ceph_mdsc_put_request(fi->last_readdir); fi->last_readdir = NULL; goto more; } /* more frags? */ - if (!ceph_frag_is_rightmost(frag)) { - frag = ceph_frag_next(frag); - off = 0; - ctx->pos = ceph_make_fpos(frag, off); + if (!ceph_frag_is_rightmost(fi->frag)) { + unsigned frag = ceph_frag_next(fi->frag); + if (is_hash_order(ctx->pos)) { + loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag), + fi->next_offset, true); + if (new_pos > ctx->pos) + ctx->pos = new_pos; + /* keep last_name */ + } else { + ctx->pos = ceph_make_fpos(frag, fi->next_offset, false); + kfree(fi->last_name); + fi->last_name = NULL; + } dout("readdir next frag is %x\n", frag); goto more; } @@ -467,7 +570,7 @@ more: return 0; } -static void reset_readdir(struct ceph_file_info *fi, unsigned frag) +static void reset_readdir(struct ceph_file_info *fi) { if (fi->last_readdir) { ceph_mdsc_put_request(fi->last_readdir); @@ -477,18 +580,38 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag) fi->last_name = NULL; fi->dir_release_count = 0; fi->readdir_cache_idx = -1; - if (ceph_frag_is_leftmost(frag)) - fi->next_offset = 2; /* compensate for . and .. */ - else - fi->next_offset = 0; + fi->next_offset = 2; /* compensate for . and .. */ fi->flags &= ~CEPH_F_ATEND; } +/* + * discard buffered readdir content on seekdir(0), or seek to new frag, + * or seek prior to current chunk + */ +static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) +{ + struct ceph_mds_reply_info_parsed *rinfo; + loff_t chunk_offset; + if (new_pos == 0) + return true; + if (is_hash_order(new_pos)) { + /* no need to reset last_name for a forward seek when + * dentries are sotred in hash order */ + } else if (fi->frag |= fpos_frag(new_pos)) { + return true; + } + rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL; + if (!rinfo || !rinfo->dir_nr) + return true; + chunk_offset = rinfo->dir_entries[0].offset; + return new_pos < chunk_offset || + is_hash_order(new_pos) != is_hash_order(chunk_offset); +} + static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) { struct ceph_file_info *fi = file->private_data; struct inode *inode = file->f_mapping->host; - loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset); loff_t retval; inode_lock(inode); @@ -505,25 +628,22 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) } if (offset >= 0) { + if (need_reset_readdir(fi, offset)) { + dout("dir_llseek dropping %p content\n", file); + reset_readdir(fi); + } else if (is_hash_order(offset) && offset > file->f_pos) { + /* for hash offset, we don't know if a forward seek + * is within same frag */ + fi->dir_release_count = 0; + fi->readdir_cache_idx = -1; + } + if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; fi->flags &= ~CEPH_F_ATEND; } retval = offset; - - if (offset == 0 || - fpos_frag(offset) != fi->frag || - fpos_off(offset) < fi->offset) { - /* discard buffered readdir content on seekdir(0), or - * seek to new frag, or seek prior to current chunk */ - dout("dir_llseek dropping %p content\n", file); - reset_readdir(fi, fpos_frag(offset)); - } else if (fpos_cmp(offset, old_offset) > 0) { - /* reset dir_release_count if we did a forward seek */ - fi->dir_release_count = 0; - fi->readdir_cache_idx = -1; - } } out: inode_unlock(inode); @@ -591,7 +711,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, return dentry; } -static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) +static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) { return ceph_ino(inode) == CEPH_INO_ROOT && strncmp(dentry->d_name.name, ".ceph", 5) == 0; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 4f1dc7120916..a888df6f2d71 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -192,6 +192,59 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) } /* + * try renew caps after session gets killed. + */ +int ceph_renew_caps(struct inode *inode) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_request *req; + int err, flags, wanted; + + spin_lock(&ci->i_ceph_lock); + wanted = __ceph_caps_file_wanted(ci); + if (__ceph_is_any_real_caps(ci) && + (!(wanted & CEPH_CAP_ANY_WR) == 0 || ci->i_auth_cap)) { + int issued = __ceph_caps_issued(ci, NULL); + spin_unlock(&ci->i_ceph_lock); + dout("renew caps %p want %s issued %s updating mds_wanted\n", + inode, ceph_cap_string(wanted), ceph_cap_string(issued)); + ceph_check_caps(ci, 0, NULL); + return 0; + } + spin_unlock(&ci->i_ceph_lock); + + flags = 0; + if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR)) + flags = O_RDWR; + else if (wanted & CEPH_CAP_FILE_RD) + flags = O_RDONLY; + else if (wanted & CEPH_CAP_FILE_WR) + flags = O_WRONLY; +#ifdef O_LAZY + if (wanted & CEPH_CAP_FILE_LAZYIO) + flags |= O_LAZY; +#endif + + req = prepare_open_request(inode->i_sb, flags, 0); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + req->r_fmode = -1; + + err = ceph_mdsc_do_request(mdsc, NULL, req); + ceph_mdsc_put_request(req); +out: + dout("renew caps %p open result=%d\n", inode, err); + return err < 0 ? err : 0; +} + +/* * If we already have the requisite capabilities, we can satisfy * the open request locally (no need to request new caps from the * MDS). We do, however, need to inform the MDS (asynchronously) @@ -616,8 +669,7 @@ static void ceph_aio_complete(struct inode *inode, kfree(aio_req); } -static void ceph_aio_complete_req(struct ceph_osd_request *req, - struct ceph_msg *msg) +static void ceph_aio_complete_req(struct ceph_osd_request *req) { int rc = req->r_result; struct inode *inode = req->r_inode; @@ -714,14 +766,21 @@ static void ceph_aio_retry_work(struct work_struct *work) req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE; - req->r_base_oloc = orig_req->r_base_oloc; - req->r_base_oid = orig_req->r_base_oid; + ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); + ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); + + ret = ceph_osdc_alloc_messages(req, GFP_NOFS); + if (ret) { + ceph_osdc_put_request(req); + req = orig_req; + goto out; + } req->r_ops[0] = orig_req->r_ops[0]; osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); - ceph_osdc_build_request(req, req->r_ops[0].extent.offset, - snapc, CEPH_NOSNAP, &aio_req->mtime); + req->r_mtime = aio_req->mtime; + req->r_data_offset = req->r_ops[0].extent.offset; ceph_osdc_put_request(orig_req); @@ -733,7 +792,7 @@ static void ceph_aio_retry_work(struct work_struct *work) out: if (ret < 0) { req->r_result = ret; - ceph_aio_complete_req(req, NULL); + ceph_aio_complete_req(req); } ceph_put_snap_context(snapc); @@ -764,6 +823,8 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) list_add_tail(&req->r_unsafe_item, &ci->i_unsafe_writes); spin_unlock(&ci->i_unsafe_lock); + + complete_all(&req->r_completion); } else { spin_lock(&ci->i_unsafe_lock); list_del_init(&req->r_unsafe_item); @@ -875,14 +936,12 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, (pos+len) | (PAGE_SIZE - 1)); osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); + req->r_mtime = mtime; } - osd_req_op_extent_osd_data_pages(req, 0, pages, len, start, false, false); - ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); - if (aio_req) { aio_req->total_len += len; aio_req->num_reqs++; @@ -956,7 +1015,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, req, false); if (ret < 0) { req->r_result = ret; - ceph_aio_complete_req(req, NULL); + ceph_aio_complete_req(req); } } return -EIOCBQUEUED; @@ -1067,9 +1126,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, true); - /* BUG_ON(vino.snap != CEPH_NOSNAP); */ - ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); - + req->r_mtime = mtime; ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!ret) ret = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1524,9 +1581,7 @@ static int ceph_zero_partial_object(struct inode *inode, goto out; } - ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap, - &inode->i_mtime); - + req->r_mtime = inode->i_mtime; ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!ret) { ret = ceph_osdc_wait_request(&fsc->client->osdc, req); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index e669cfa9d793..f059b5997072 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -11,6 +11,7 @@ #include <linux/xattr.h> #include <linux/posix_acl.h> #include <linux/random.h> +#include <linux/sort.h> #include "super.h" #include "mds_client.h" @@ -254,6 +255,9 @@ static int ceph_fill_dirfrag(struct inode *inode, diri_auth = ci->i_auth_cap->mds; spin_unlock(&ci->i_ceph_lock); + if (mds == -1) /* CDIR_AUTH_PARENT */ + mds = diri_auth; + mutex_lock(&ci->i_fragtree_mutex); if (ndist == 0 && mds == diri_auth) { /* no delegation info needed. */ @@ -300,20 +304,38 @@ out: return err; } +static int frag_tree_split_cmp(const void *l, const void *r) +{ + struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l; + struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r; + return ceph_frag_compare(ls->frag, rs->frag); +} + +static bool is_frag_child(u32 f, struct ceph_inode_frag *frag) +{ + if (!frag) + return f == ceph_frag_make(0, 0); + if (ceph_frag_bits(f) != ceph_frag_bits(frag->frag) + frag->split_by) + return false; + return ceph_frag_contains_value(frag->frag, ceph_frag_value(f)); +} + static int ceph_fill_fragtree(struct inode *inode, struct ceph_frag_tree_head *fragtree, struct ceph_mds_reply_dirfrag *dirinfo) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_inode_frag *frag; + struct ceph_inode_frag *frag, *prev_frag = NULL; struct rb_node *rb_node; - int i; - u32 id, nsplits; + unsigned i, split_by, nsplits; + u32 id; bool update = false; mutex_lock(&ci->i_fragtree_mutex); nsplits = le32_to_cpu(fragtree->nsplits); - if (nsplits) { + if (nsplits != ci->i_fragtree_nsplits) { + update = true; + } else if (nsplits) { i = prandom_u32() % nsplits; id = le32_to_cpu(fragtree->splits[i].frag); if (!__ceph_find_frag(ci, id)) @@ -332,10 +354,22 @@ static int ceph_fill_fragtree(struct inode *inode, if (!update) goto out_unlock; + if (nsplits > 1) { + sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]), + frag_tree_split_cmp, NULL); + } + dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode)); rb_node = rb_first(&ci->i_fragtree); for (i = 0; i < nsplits; i++) { id = le32_to_cpu(fragtree->splits[i].frag); + split_by = le32_to_cpu(fragtree->splits[i].by); + if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) { + pr_err("fill_fragtree %llx.%llx invalid split %d/%u, " + "frag %x split by %d\n", ceph_vinop(inode), + i, nsplits, id, split_by); + continue; + } frag = NULL; while (rb_node) { frag = rb_entry(rb_node, struct ceph_inode_frag, node); @@ -347,8 +381,14 @@ static int ceph_fill_fragtree(struct inode *inode, break; } rb_node = rb_next(rb_node); - rb_erase(&frag->node, &ci->i_fragtree); - kfree(frag); + /* delete stale split/leaf node */ + if (frag->split_by > 0 || + !is_frag_child(frag->frag, prev_frag)) { + rb_erase(&frag->node, &ci->i_fragtree); + if (frag->split_by > 0) + ci->i_fragtree_nsplits--; + kfree(frag); + } frag = NULL; } if (!frag) { @@ -356,14 +396,23 @@ static int ceph_fill_fragtree(struct inode *inode, if (IS_ERR(frag)) continue; } - frag->split_by = le32_to_cpu(fragtree->splits[i].by); + if (frag->split_by == 0) + ci->i_fragtree_nsplits++; + frag->split_by = split_by; dout(" frag %x split by %d\n", frag->frag, frag->split_by); + prev_frag = frag; } while (rb_node) { frag = rb_entry(rb_node, struct ceph_inode_frag, node); rb_node = rb_next(rb_node); - rb_erase(&frag->node, &ci->i_fragtree); - kfree(frag); + /* delete stale split/leaf node */ + if (frag->split_by > 0 || + !is_frag_child(frag->frag, prev_frag)) { + rb_erase(&frag->node, &ci->i_fragtree); + if (frag->split_by > 0) + ci->i_fragtree_nsplits--; + kfree(frag); + } } out_unlock: mutex_unlock(&ci->i_fragtree_mutex); @@ -513,6 +562,7 @@ void ceph_destroy_inode(struct inode *inode) rb_erase(n, &ci->i_fragtree); kfree(frag); } + ci->i_fragtree_nsplits = 0; __ceph_destroy_xattrs(ci); if (ci->i_xattrs.blob) @@ -533,6 +583,11 @@ int ceph_drop_inode(struct inode *inode) return 1; } +static inline blkcnt_t calc_inode_blocks(u64 size) +{ + return (size + (1<<9) - 1) >> 9; +} + /* * Helpers to fill in size, ctime, mtime, and atime. We have to be * careful because either the client or MDS may have more up to date @@ -555,7 +610,7 @@ int ceph_fill_file_size(struct inode *inode, int issued, size = 0; } i_size_write(inode, size); - inode->i_blocks = (size + (1<<9) - 1) >> 9; + inode->i_blocks = calc_inode_blocks(size); ci->i_reported_size = size; if (truncate_seq != ci->i_truncate_seq) { dout("truncate_seq %u -> %u\n", @@ -814,9 +869,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page, spin_unlock(&ci->i_ceph_lock); - err = -EINVAL; - if (WARN_ON(symlen != i_size_read(inode))) - goto out; + if (symlen != i_size_read(inode)) { + pr_err("fill_inode %llx.%llx BAD symlink " + "size %lld\n", ceph_vinop(inode), + i_size_read(inode)); + i_size_write(inode, symlen); + inode->i_blocks = calc_inode_blocks(symlen); + } err = -ENOMEM; sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); @@ -1309,12 +1368,13 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, int i, err = 0; for (i = 0; i < rinfo->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; struct ceph_vino vino; struct inode *in; int rc; - vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); - vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); + vino.ino = le64_to_cpu(rde->inode.in->ino); + vino.snap = le64_to_cpu(rde->inode.in->snapid); in = ceph_get_inode(req->r_dentry->d_sb, vino); if (IS_ERR(in)) { @@ -1322,14 +1382,14 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, dout("new_inode badness got %d\n", err); continue; } - rc = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, + rc = fill_inode(in, NULL, &rde->inode, NULL, session, req->r_request_started, -1, &req->r_caps_reservation); if (rc < 0) { pr_err("fill_inode badness on %p got %d\n", in, rc); err = rc; - continue; } + iput(in); } return err; @@ -1387,6 +1447,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session) { struct dentry *parent = req->r_dentry; + struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct qstr dname; struct dentry *dn; @@ -1394,22 +1455,27 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, int err = 0, skipped = 0, ret, i; struct inode *snapdir = NULL; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; - struct ceph_dentry_info *di; u32 frag = le32_to_cpu(rhead->args.readdir.frag); + u32 last_hash = 0; + u32 fpos_offset; struct ceph_readdir_cache_control cache_ctl = {}; if (req->r_aborted) return readdir_prepopulate_inodes_only(req, session); + if (rinfo->hash_order && req->r_path2) { + last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + req->r_path2, strlen(req->r_path2)); + last_hash = ceph_frag_value(last_hash); + } + if (rinfo->dir_dir && le32_to_cpu(rinfo->dir_dir->frag) != frag) { dout("readdir_prepopulate got new frag %x -> %x\n", frag, le32_to_cpu(rinfo->dir_dir->frag)); frag = le32_to_cpu(rinfo->dir_dir->frag); - if (ceph_frag_is_leftmost(frag)) + if (!rinfo->hash_order) req->r_readdir_offset = 2; - else - req->r_readdir_offset = 0; } if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { @@ -1427,24 +1493,37 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) { /* note dir version at start of readdir so we can tell * if any dentries get dropped */ - struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count); req->r_readdir_cache_idx = 0; } cache_ctl.index = req->r_readdir_cache_idx; + fpos_offset = req->r_readdir_offset; /* FIXME: release caps/leases if error occurs */ for (i = 0; i < rinfo->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; struct ceph_vino vino; - dname.name = rinfo->dir_dname[i]; - dname.len = rinfo->dir_dname_len[i]; + dname.name = rde->name; + dname.len = rde->name_len; dname.hash = full_name_hash(dname.name, dname.len); - vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); - vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); + vino.ino = le64_to_cpu(rde->inode.in->ino); + vino.snap = le64_to_cpu(rde->inode.in->snapid); + + if (rinfo->hash_order) { + u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + rde->name, rde->name_len); + hash = ceph_frag_value(hash); + if (hash != last_hash) + fpos_offset = 2; + last_hash = hash; + rde->offset = ceph_make_fpos(hash, fpos_offset++, true); + } else { + rde->offset = ceph_make_fpos(frag, fpos_offset++, false); + } retry_lookup: dn = d_lookup(parent, &dname); @@ -1490,7 +1569,7 @@ retry_lookup: } } - ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, + ret = fill_inode(in, NULL, &rde->inode, NULL, session, req->r_request_started, -1, &req->r_caps_reservation); if (ret < 0) { @@ -1523,11 +1602,9 @@ retry_lookup: dn = realdn; } - di = dn->d_fsdata; - di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); + ceph_dentry(dn)->offset = rde->offset; - update_dentry_lease(dn, rinfo->dir_dlease[i], - req->r_session, + update_dentry_lease(dn, rde->lease, req->r_session, req->r_request_started); if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { @@ -1562,7 +1639,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) spin_lock(&ci->i_ceph_lock); dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); i_size_write(inode, size); - inode->i_blocks = (size + (1 << 9) - 1) >> 9; + inode->i_blocks = calc_inode_blocks(size); /* tell the MDS if we are approaching max_size */ if ((size << 1) >= ci->i_max_size && @@ -1624,10 +1701,21 @@ static void ceph_invalidate_work(struct work_struct *work) struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, i_pg_inv_work); struct inode *inode = &ci->vfs_inode; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); u32 orig_gen; int check = 0; mutex_lock(&ci->i_truncate_mutex); + + if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n", + inode, ceph_ino(inode)); + mapping_set_error(inode->i_mapping, -EIO); + truncate_pagecache(inode, 0); + mutex_unlock(&ci->i_truncate_mutex); + goto out; + } + spin_lock(&ci->i_ceph_lock); dout("invalidate_pages %p gen %d revoking %d\n", inode, ci->i_rdcache_gen, ci->i_rdcache_revoking); @@ -1641,7 +1729,9 @@ static void ceph_invalidate_work(struct work_struct *work) orig_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); - truncate_pagecache(inode, 0); + if (invalidate_inode_pages2(inode->i_mapping) < 0) { + pr_err("invalidate_pages %p fails\n", inode); + } spin_lock(&ci->i_ceph_lock); if (orig_gen == ci->i_rdcache_gen && @@ -1920,8 +2010,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size > inode->i_size) { i_size_write(inode, attr->ia_size); - inode->i_blocks = - (attr->ia_size + (1 << 9) - 1) >> 9; + inode->i_blocks = calc_inode_blocks(attr->ia_size); inode->i_ctime = attr->ia_ctime; ci->i_reported_size = attr->ia_size; dirtied |= CEPH_CAP_FILE_EXCL; diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index f851d8d70158..be6b1657b1af 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -193,12 +193,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) if (copy_from_user(&dl, arg, sizeof(dl))) return -EFAULT; - down_read(&osdc->map_sem); + down_read(&osdc->lock); r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, &dl.object_no, &dl.object_offset, &olen); if (r < 0) { - up_read(&osdc->map_sem); + up_read(&osdc->lock); return -EIO; } dl.file_offset -= dl.object_offset; @@ -213,15 +213,15 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) ceph_ino(inode), dl.object_no); oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); - ceph_oid_set_name(&oid, dl.object_name); + ceph_oid_printf(&oid, "%s", dl.object_name); - r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); + r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid); if (r < 0) { - up_read(&osdc->map_sem); + up_read(&osdc->lock); return r; } - dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); + dl.osd = ceph_pg_to_acting_primary(osdc->osdmap, &pgid); if (dl.osd >= 0) { struct ceph_entity_addr *a = ceph_osd_addr(osdc->osdmap, dl.osd); @@ -230,7 +230,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) } else { memset(&dl.osd_addr, 0, sizeof(dl.osd_addr)); } - up_read(&osdc->map_sem); + up_read(&osdc->lock); /* send result back to user */ if (copy_to_user(arg, &dl, sizeof(dl))) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 85b8517f17a0..2103b823bec0 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -181,17 +181,18 @@ static int parse_reply_info_dir(void **p, void *end, ceph_decode_need(p, end, sizeof(num) + 2, bad); num = ceph_decode_32(p); - info->dir_end = ceph_decode_8(p); - info->dir_complete = ceph_decode_8(p); + { + u16 flags = ceph_decode_16(p); + info->dir_end = !!(flags & CEPH_READDIR_FRAG_END); + info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE); + info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER); + } if (num == 0) goto done; - BUG_ON(!info->dir_in); - info->dir_dname = (void *)(info->dir_in + num); - info->dir_dname_len = (void *)(info->dir_dname + num); - info->dir_dlease = (void *)(info->dir_dname_len + num); - if ((unsigned long)(info->dir_dlease + num) > - (unsigned long)info->dir_in + info->dir_buf_size) { + BUG_ON(!info->dir_entries); + if ((unsigned long)(info->dir_entries + num) > + (unsigned long)info->dir_entries + info->dir_buf_size) { pr_err("dir contents are larger than expected\n"); WARN_ON(1); goto bad; @@ -199,21 +200,23 @@ static int parse_reply_info_dir(void **p, void *end, info->dir_nr = num; while (num) { + struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i; /* dentry */ ceph_decode_need(p, end, sizeof(u32)*2, bad); - info->dir_dname_len[i] = ceph_decode_32(p); - ceph_decode_need(p, end, info->dir_dname_len[i], bad); - info->dir_dname[i] = *p; - *p += info->dir_dname_len[i]; - dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i], - info->dir_dname[i]); - info->dir_dlease[i] = *p; + rde->name_len = ceph_decode_32(p); + ceph_decode_need(p, end, rde->name_len, bad); + rde->name = *p; + *p += rde->name_len; + dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name); + rde->lease = *p; *p += sizeof(struct ceph_mds_reply_lease); /* inode */ - err = parse_reply_info_in(p, end, &info->dir_in[i], features); + err = parse_reply_info_in(p, end, &rde->inode, features); if (err < 0) goto out_bad; + /* ceph_readdir_prepopulate() will update it */ + rde->offset = 0; i++; num--; } @@ -345,9 +348,9 @@ out_bad: static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) { - if (!info->dir_in) + if (!info->dir_entries) return; - free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size)); + free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size)); } @@ -567,51 +570,23 @@ void ceph_mdsc_release_request(struct kref *kref) kfree(req); } +DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node) + /* * lookup session, bump ref if found. * * called under mdsc->mutex. */ -static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc, - u64 tid) +static struct ceph_mds_request * +lookup_get_request(struct ceph_mds_client *mdsc, u64 tid) { struct ceph_mds_request *req; - struct rb_node *n = mdsc->request_tree.rb_node; - - while (n) { - req = rb_entry(n, struct ceph_mds_request, r_node); - if (tid < req->r_tid) - n = n->rb_left; - else if (tid > req->r_tid) - n = n->rb_right; - else { - ceph_mdsc_get_request(req); - return req; - } - } - return NULL; -} -static void __insert_request(struct ceph_mds_client *mdsc, - struct ceph_mds_request *new) -{ - struct rb_node **p = &mdsc->request_tree.rb_node; - struct rb_node *parent = NULL; - struct ceph_mds_request *req = NULL; + req = lookup_request(&mdsc->request_tree, tid); + if (req) + ceph_mdsc_get_request(req); - while (*p) { - parent = *p; - req = rb_entry(parent, struct ceph_mds_request, r_node); - if (new->r_tid < req->r_tid) - p = &(*p)->rb_left; - else if (new->r_tid > req->r_tid) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&new->r_node, parent, p); - rb_insert_color(&new->r_node, &mdsc->request_tree); + return req; } /* @@ -630,7 +605,7 @@ static void __register_request(struct ceph_mds_client *mdsc, req->r_num_caps); dout("__register_request %p tid %lld\n", req, req->r_tid); ceph_mdsc_get_request(req); - __insert_request(mdsc, req); + insert_request(&mdsc->request_tree, req); req->r_uid = current_fsuid(); req->r_gid = current_fsgid(); @@ -663,8 +638,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc, } } - rb_erase(&req->r_node, &mdsc->request_tree); - RB_CLEAR_NODE(&req->r_node); + erase_request(&mdsc->request_tree, req); if (req->r_unsafe_dir && req->r_got_unsafe) { struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); @@ -868,12 +842,14 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 int metadata_bytes = 0; int metadata_key_count = 0; struct ceph_options *opt = mdsc->fsc->client->options; + struct ceph_mount_options *fsopt = mdsc->fsc->mount_options; void *p; const char* metadata[][2] = { {"hostname", utsname()->nodename}, {"kernel_version", utsname()->release}, - {"entity_id", opt->name ? opt->name : ""}, + {"entity_id", opt->name ? : ""}, + {"root", fsopt->server_path ? : "/"}, {NULL, NULL} }; @@ -1149,9 +1125,11 @@ out: static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { + struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg; struct ceph_inode_info *ci = ceph_inode(inode); LIST_HEAD(to_remove); - int drop = 0; + bool drop = false; + bool invalidate = false; dout("removing cap %p, ci is %p, inode is %p\n", cap, ci, &ci->vfs_inode); @@ -1159,8 +1137,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, __ceph_remove_cap(cap, false); if (!ci->i_auth_cap) { struct ceph_cap_flush *cf; - struct ceph_mds_client *mdsc = - ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = fsc->mdsc; + + ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; + + if (ci->i_wrbuffer_ref > 0 && + ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) + invalidate = true; while (true) { struct rb_node *n = rb_first(&ci->i_cap_flush_tree); @@ -1183,7 +1166,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, inode, ceph_ino(inode)); ci->i_dirty_caps = 0; list_del_init(&ci->i_dirty_item); - drop = 1; + drop = true; } if (!list_empty(&ci->i_flushing_item)) { pr_warn_ratelimited( @@ -1193,7 +1176,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ci->i_flushing_caps = 0; list_del_init(&ci->i_flushing_item); mdsc->num_cap_flushing--; - drop = 1; + drop = true; } spin_unlock(&mdsc->cap_dirty_lock); @@ -1210,7 +1193,11 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, list_del(&cf->list); ceph_free_cap_flush(cf); } - while (drop--) + + wake_up_all(&ci->i_cap_wq); + if (invalidate) + ceph_queue_invalidate(inode); + if (drop) iput(inode); return 0; } @@ -1220,12 +1207,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, */ static void remove_session_caps(struct ceph_mds_session *session) { + struct ceph_fs_client *fsc = session->s_mdsc->fsc; + struct super_block *sb = fsc->sb; dout("remove_session_caps on %p\n", session); - iterate_session_caps(session, remove_session_caps_cb, NULL); + iterate_session_caps(session, remove_session_caps_cb, fsc); spin_lock(&session->s_cap_lock); if (session->s_nr_caps > 0) { - struct super_block *sb = session->s_mdsc->fsc->sb; struct inode *inode; struct ceph_cap *cap, *prev = NULL; struct ceph_vino vino; @@ -1270,13 +1258,13 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, { struct ceph_inode_info *ci = ceph_inode(inode); - wake_up_all(&ci->i_cap_wq); if (arg) { spin_lock(&ci->i_ceph_lock); ci->i_wanted_max_size = 0; ci->i_requested_max_size = 0; spin_unlock(&ci->i_ceph_lock); } + wake_up_all(&ci->i_cap_wq); return 0; } @@ -1671,8 +1659,7 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, struct ceph_inode_info *ci = ceph_inode(dir); struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; - size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) + - sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease); + size_t size = sizeof(struct ceph_mds_reply_dir_entry); int order, num_entries; spin_lock(&ci->i_ceph_lock); @@ -1683,14 +1670,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, order = get_order(size * num_entries); while (order >= 0) { - rinfo->dir_in = (void*)__get_free_pages(GFP_KERNEL | - __GFP_NOWARN, - order); - if (rinfo->dir_in) + rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL | + __GFP_NOWARN, + order); + if (rinfo->dir_entries) break; order--; } - if (!rinfo->dir_in) + if (!rinfo->dir_entries) return -ENOMEM; num_entries = (PAGE_SIZE << order) / size; @@ -1722,6 +1709,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) INIT_LIST_HEAD(&req->r_unsafe_target_item); req->r_fmode = -1; kref_init(&req->r_kref); + RB_CLEAR_NODE(&req->r_node); INIT_LIST_HEAD(&req->r_wait); init_completion(&req->r_completion); init_completion(&req->r_safe_completion); @@ -2414,7 +2402,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* get request, session */ tid = le64_to_cpu(msg->hdr.tid); mutex_lock(&mdsc->mutex); - req = __lookup_request(mdsc, tid); + req = lookup_get_request(mdsc, tid); if (!req) { dout("handle_reply on unknown tid %llu\n", tid); mutex_unlock(&mdsc->mutex); @@ -2604,7 +2592,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, fwd_seq = ceph_decode_32(&p); mutex_lock(&mdsc->mutex); - req = __lookup_request(mdsc, tid); + req = lookup_get_request(mdsc, tid); if (!req) { dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); goto out; /* dup reply? */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index ee69a537dba5..e7d38aac7109 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -47,6 +47,14 @@ struct ceph_mds_reply_info_in { u32 pool_ns_len; }; +struct ceph_mds_reply_dir_entry { + char *name; + u32 name_len; + struct ceph_mds_reply_lease *lease; + struct ceph_mds_reply_info_in inode; + loff_t offset; +}; + /* * parsed info about an mds reply, including information about * either: 1) the target inode and/or its parent directory and dentry, @@ -73,11 +81,10 @@ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_dirfrag *dir_dir; size_t dir_buf_size; int dir_nr; - char **dir_dname; - u32 *dir_dname_len; - struct ceph_mds_reply_lease **dir_dlease; - struct ceph_mds_reply_info_in *dir_in; - u8 dir_complete, dir_end; + bool dir_complete; + bool dir_end; + bool hash_order; + struct ceph_mds_reply_dir_entry *dir_entries; }; /* for create results */ diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 261531e55e9d..8c3591a7fbae 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -54,16 +54,21 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) const void *start = *p; int i, j, n; int err = -EINVAL; - u16 version; + u8 mdsmap_v, mdsmap_cv; m = kzalloc(sizeof(*m), GFP_NOFS); if (m == NULL) return ERR_PTR(-ENOMEM); - ceph_decode_16_safe(p, end, version, bad); - if (version > 3) { - pr_warn("got mdsmap version %d > 3, failing", version); - goto bad; + ceph_decode_need(p, end, 1 + 1, bad); + mdsmap_v = ceph_decode_8(p); + mdsmap_cv = ceph_decode_8(p); + if (mdsmap_v >= 4) { + u32 mdsmap_len; + ceph_decode_32_safe(p, end, mdsmap_len, bad); + if (end < *p + mdsmap_len) + goto bad; + end = *p + mdsmap_len; } ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); @@ -87,16 +92,29 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) u32 namelen; s32 mds, inc, state; u64 state_seq; - u8 infoversion; + u8 info_v; + void *info_end = NULL; struct ceph_entity_addr addr; u32 num_export_targets; void *pexport_targets = NULL; struct ceph_timespec laggy_since; struct ceph_mds_info *info; - ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); + ceph_decode_need(p, end, sizeof(u64) + 1, bad); global_id = ceph_decode_64(p); - infoversion = ceph_decode_8(p); + info_v= ceph_decode_8(p); + if (info_v >= 4) { + u32 info_len; + u8 info_cv; + ceph_decode_need(p, end, 1 + sizeof(u32), bad); + info_cv = ceph_decode_8(p); + info_len = ceph_decode_32(p); + info_end = *p + info_len; + if (info_end > end) + goto bad; + } + + ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); *p += sizeof(u64); namelen = ceph_decode_32(p); /* skip mds name */ *p += namelen; @@ -115,7 +133,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) *p += sizeof(u32); ceph_decode_32_safe(p, end, namelen, bad); *p += namelen; - if (infoversion >= 2) { + if (info_v >= 2) { ceph_decode_32_safe(p, end, num_export_targets, bad); pexport_targets = *p; *p += num_export_targets * sizeof(u32); @@ -123,6 +141,12 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) num_export_targets = 0; } + if (info_end && *p != info_end) { + if (*p > info_end) + goto bad; + *p = info_end; + } + dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", i+1, n, global_id, mds, inc, ceph_pr_addr(&addr.in_addr), @@ -163,6 +187,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_cas_pg_pool = ceph_decode_64(p); /* ok, we don't care about the rest. */ + *p = end; dout("mdsmap_decode success epoch %u\n", m->m_epoch); return m; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index f12d5e2955c2..91e02481ce06 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -108,6 +108,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait) * mount options */ enum { + Opt_mds_namespace, Opt_wsize, Opt_rsize, Opt_rasize, @@ -143,6 +144,7 @@ enum { }; static match_table_t fsopt_tokens = { + {Opt_mds_namespace, "mds_namespace=%d"}, {Opt_wsize, "wsize=%d"}, {Opt_rsize, "rsize=%d"}, {Opt_rasize, "rasize=%d"}, @@ -212,6 +214,9 @@ static int parse_fsopt_token(char *c, void *private) break; /* misc */ + case Opt_mds_namespace: + fsopt->mds_namespace = intval; + break; case Opt_wsize: fsopt->wsize = intval; break; @@ -297,6 +302,7 @@ static void destroy_mount_options(struct ceph_mount_options *args) { dout("destroy_mount_options %p\n", args); kfree(args->snapdir_name); + kfree(args->server_path); kfree(args); } @@ -328,14 +334,17 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, if (ret) return ret; + ret = strcmp_null(fsopt1->server_path, fsopt2->server_path); + if (ret) + return ret; + return ceph_compare_options(new_opt, fsc->client); } static int parse_mount_options(struct ceph_mount_options **pfsopt, struct ceph_options **popt, int flags, char *options, - const char *dev_name, - const char **path) + const char *dev_name) { struct ceph_mount_options *fsopt; const char *dev_name_end; @@ -367,6 +376,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; fsopt->congestion_kb = default_congestion_kb(); + fsopt->mds_namespace = CEPH_FS_CLUSTER_ID_NONE; /* * Distinguish the server list from the path in "dev_name". @@ -380,12 +390,13 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, */ dev_name_end = strchr(dev_name, '/'); if (dev_name_end) { - /* skip over leading '/' for path */ - *path = dev_name_end + 1; + fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL); + if (!fsopt->server_path) { + err = -ENOMEM; + goto out; + } } else { - /* path is empty */ dev_name_end = dev_name + strlen(dev_name); - *path = dev_name_end; } err = -EINVAL; dev_name_end--; /* back up to ':' separator */ @@ -395,7 +406,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, goto out; } dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); - dout("server path '%s'\n", *path); + if (fsopt->server_path) + dout("server path '%s'\n", fsopt->server_path); *popt = ceph_parse_options(options, dev_name, dev_name_end, parse_fsopt_token, (void *)fsopt); @@ -457,6 +469,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",noacl"); #endif + if (fsopt->mds_namespace != CEPH_FS_CLUSTER_ID_NONE) + seq_printf(m, ",mds_namespace=%d", fsopt->mds_namespace); if (fsopt->wsize) seq_printf(m, ",wsize=%d", fsopt->wsize); if (fsopt->rsize != CEPH_RSIZE_DEFAULT) @@ -511,9 +525,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, { struct ceph_fs_client *fsc; const u64 supported_features = - CEPH_FEATURE_FLOCK | - CEPH_FEATURE_DIRLAYOUTHASH | - CEPH_FEATURE_MDS_INLINE_DATA; + CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH | + CEPH_FEATURE_MDSENC | CEPH_FEATURE_MDS_INLINE_DATA; const u64 required_features = 0; int page_count; size_t size; @@ -530,6 +543,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, goto fail; } fsc->client->extra_mon_dispatch = extra_mon_dispatch; + fsc->client->monc.fs_cluster_id = fsopt->mds_namespace; ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true); fsc->mount_options = fsopt; @@ -785,8 +799,7 @@ out: /* * mount: join the ceph cluster, and open root directory. */ -static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, - const char *path) +static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) { int err; unsigned long started = jiffies; /* note the start time */ @@ -815,11 +828,12 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, goto fail; } - if (path[0] == 0) { + if (!fsc->mount_options->server_path) { root = fsc->sb->s_root; dget(root); } else { - dout("mount opening base mountpoint\n"); + const char *path = fsc->mount_options->server_path + 1; + dout("mount opening path %s\n", path); root = open_root_dentry(fsc, path, started); if (IS_ERR(root)) { err = PTR_ERR(root); @@ -935,7 +949,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, struct dentry *res; int err; int (*compare_super)(struct super_block *, void *) = ceph_compare_super; - const char *path = NULL; struct ceph_mount_options *fsopt = NULL; struct ceph_options *opt = NULL; @@ -944,7 +957,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, #ifdef CONFIG_CEPH_FS_POSIX_ACL flags |= MS_POSIXACL; #endif - err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); + err = parse_mount_options(&fsopt, &opt, flags, data, dev_name); if (err < 0) { res = ERR_PTR(err); goto out_final; @@ -987,7 +1000,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, } } - res = ceph_real_mount(fsc, path); + res = ceph_real_mount(fsc); if (IS_ERR(res)) goto out_splat; dout("root %p inode %p ino %llx.%llx\n", res, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 7b99eb756477..0130a8592191 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -62,6 +62,7 @@ struct ceph_mount_options { int cap_release_safety; int max_readdir; /* max readdir result (entires) */ int max_readdir_bytes; /* max readdir result (bytes) */ + int mds_namespace; /* * everything above this point can be memcmp'd; everything below @@ -69,6 +70,7 @@ struct ceph_mount_options { */ char *snapdir_name; /* default ".snap" */ + char *server_path; /* default "/" */ }; struct ceph_fs_client { @@ -295,6 +297,7 @@ struct ceph_inode_info { u64 i_files, i_subdirs; struct rb_root i_fragtree; + int i_fragtree_nsplits; struct mutex i_fragtree_mutex; struct ceph_inode_xattrs_info i_xattrs; @@ -469,6 +472,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ #define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ #define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ +#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */ static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, long long release_count, @@ -537,11 +541,6 @@ static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) return (struct ceph_dentry_info *)dentry->d_fsdata; } -static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) -{ - return ((loff_t)frag << 32) | (loff_t)off; -} - /* * caps helpers */ @@ -632,7 +631,6 @@ struct ceph_file_info { struct ceph_mds_request *last_readdir; /* readdir: position within a frag */ - unsigned offset; /* offset of last chunk, adjusted for . and .. */ unsigned next_offset; /* offset of next chunk (last_name's + 1) */ char *last_name; /* last entry in previous chunk */ long long dir_release_count; @@ -927,6 +925,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); /* file.c */ extern const struct file_operations ceph_file_fops; +extern int ceph_renew_caps(struct inode *inode); extern int ceph_open(struct inode *inode, struct file *file); extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned flags, umode_t mode, @@ -942,6 +941,7 @@ extern const struct inode_operations ceph_snapdir_iops; extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, ceph_snapdir_dentry_ops; +extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order); extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); extern int ceph_handle_snapdir(struct ceph_mds_request *req, struct dentry *dentry, int err); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 0d66722c6a52..4870b29df224 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -77,7 +77,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, char buf[128]; dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); - down_read(&osdc->map_sem); + down_read(&osdc->lock); pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); if (pool_name) { size_t len = strlen(pool_name); @@ -109,7 +109,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, ret = -ERANGE; } } - up_read(&osdc->map_sem); + up_read(&osdc->lock); return ret; } @@ -143,13 +143,13 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, s64 pool = ceph_file_layout_pg_pool(ci->i_layout); const char *pool_name; - down_read(&osdc->map_sem); + down_read(&osdc->lock); pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); if (pool_name) ret = snprintf(val, size, "%s", pool_name); else ret = snprintf(val, size, "%lld", (unsigned long long)pool); - up_read(&osdc->map_sem); + up_read(&osdc->lock); return ret; } @@ -862,6 +862,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, struct ceph_mds_request *req; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_pagelist *pagelist = NULL; + int op = CEPH_MDS_OP_SETXATTR; int err; if (size > 0) { @@ -875,20 +876,21 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, if (err) goto out; } else if (!value) { - flags |= CEPH_XATTR_REMOVE; + if (flags & CEPH_XATTR_REPLACE) + op = CEPH_MDS_OP_RMXATTR; + else + flags |= CEPH_XATTR_REMOVE; } dout("setxattr value=%.*s\n", (int)size, value); /* do request */ - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, - USE_AUTH_MDS); + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; } - req->r_args.setxattr.flags = cpu_to_le32(flags); req->r_path2 = kstrdup(name, GFP_NOFS); if (!req->r_path2) { ceph_mdsc_put_request(req); @@ -896,8 +898,11 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, goto out; } - req->r_pagelist = pagelist; - pagelist = NULL; + if (op == CEPH_MDS_OP_SETXATTR) { + req->r_args.setxattr.flags = cpu_to_le32(flags); + req->r_pagelist = pagelist; + pagelist = NULL; + } req->r_inode = inode; ihold(inode); @@ -1051,12 +1056,13 @@ static int ceph_get_xattr_handler(const struct xattr_handler *handler, } static int ceph_set_xattr_handler(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; - return __ceph_setxattr(d_inode(dentry), name, value, size, flags); + return __ceph_setxattr(inode, name, value, size, flags); } const struct xattr_handler ceph_other_xattr_handler = { diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index c8b77aa24a1d..5e23f64c0804 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -39,8 +39,9 @@ enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT }; static int cifs_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { int rc = -EOPNOTSUPP; unsigned int xid; @@ -99,12 +100,12 @@ static int cifs_xattr_set(const struct xattr_handler *handler, if (value && pTcon->ses->server->ops->set_acl) rc = pTcon->ses->server->ops->set_acl(pacl, - size, d_inode(dentry), + size, inode, full_path, CIFS_ACL_DACL); else rc = -EOPNOTSUPP; if (rc == 0) /* force revalidate of the inode */ - CIFS_I(d_inode(dentry))->time = 0; + CIFS_I(inode)->time = 0; kfree(pacl); } #endif /* CONFIG_CIFS_ACL */ @@ -32,14 +32,43 @@ #include <linux/pfn_t.h> #include <linux/sizes.h> -#define RADIX_DAX_MASK 0xf -#define RADIX_DAX_SHIFT 4 -#define RADIX_DAX_PTE (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY) -#define RADIX_DAX_PMD (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY) -#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK) +/* + * We use lowest available bit in exceptional entry for locking, other two + * bits to determine entry type. In total 3 special bits. + */ +#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 3) +#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) +#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) +#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD) +#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK) #define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT)) #define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ - RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE))) + RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \ + RADIX_TREE_EXCEPTIONAL_ENTRY)) + +/* We choose 4096 entries - same as per-zone page wait tables */ +#define DAX_WAIT_TABLE_BITS 12 +#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) + +wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; + +static int __init init_dax_wait_table(void) +{ + int i; + + for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++) + init_waitqueue_head(wait_table + i); + return 0; +} +fs_initcall(init_dax_wait_table); + +static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, + pgoff_t index) +{ + unsigned long hash = hash_long((unsigned long)mapping ^ index, + DAX_WAIT_TABLE_BITS); + return wait_table + hash; +} static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) { @@ -87,50 +116,6 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n) return page; } -/* - * dax_clear_sectors() is called from within transaction context from XFS, - * and hence this means the stack from this point must follow GFP_NOFS - * semantics for all operations. - */ -int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size) -{ - struct blk_dax_ctl dax = { - .sector = _sector, - .size = _size, - }; - - might_sleep(); - do { - long count, sz; - - count = dax_map_atomic(bdev, &dax); - if (count < 0) - return count; - sz = min_t(long, count, SZ_128K); - clear_pmem(dax.addr, sz); - dax.size -= sz; - dax.sector += sz / 512; - dax_unmap_atomic(bdev, &dax); - cond_resched(); - } while (dax.size); - - wmb_pmem(); - return 0; -} -EXPORT_SYMBOL_GPL(dax_clear_sectors); - -/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */ -static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first, - loff_t pos, loff_t end) -{ - loff_t final = end - pos + first; /* The final byte of the buffer */ - - if (first > 0) - clear_pmem(addr, first); - if (final < size) - clear_pmem(addr + final, size - final); -} - static bool buffer_written(struct buffer_head *bh) { return buffer_mapped(bh) && !buffer_unwritten(bh); @@ -169,6 +154,9 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, struct blk_dax_ctl dax = { .addr = (void __pmem *) ERR_PTR(-EIO), }; + unsigned blkbits = inode->i_blkbits; + sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1) + >> blkbits; if (rw == READ) end = min(end, i_size_read(inode)); @@ -176,7 +164,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, while (pos < end) { size_t len; if (pos == max) { - unsigned blkbits = inode->i_blkbits; long page = pos >> PAGE_SHIFT; sector_t block = page << (PAGE_SHIFT - blkbits); unsigned first = pos - (block << blkbits); @@ -192,6 +179,13 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, bh->b_size = 1 << blkbits; bh_max = pos - first + bh->b_size; bdev = bh->b_bdev; + /* + * We allow uninitialized buffers for writes + * beyond EOF as those cannot race with faults + */ + WARN_ON_ONCE( + (buffer_new(bh) && block < file_blks) || + (rw == WRITE && buffer_unwritten(bh))); } else { unsigned done = bh->b_size - (bh_max - (pos - first)); @@ -211,11 +205,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, rc = map_len; break; } - if (buffer_unwritten(bh) || buffer_new(bh)) { - dax_new_buf(dax.addr, map_len, first, - pos, end); - need_wmb = true; - } dax.addr += first; size = map_len - first; } @@ -276,15 +265,8 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, memset(&bh, 0, sizeof(bh)); bh.b_bdev = inode->i_sb->s_bdev; - if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) { - struct address_space *mapping = inode->i_mapping; + if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) inode_lock(inode); - retval = filemap_write_and_wait_range(mapping, pos, end - 1); - if (retval) { - inode_unlock(inode); - goto out; - } - } /* Protects against truncate */ if (!(flags & DIO_SKIP_DIO_COUNT)) @@ -305,12 +287,268 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, if (!(flags & DIO_SKIP_DIO_COUNT)) inode_dio_end(inode); - out: return retval; } EXPORT_SYMBOL_GPL(dax_do_io); /* + * DAX radix tree locking + */ +struct exceptional_entry_key { + struct address_space *mapping; + unsigned long index; +}; + +struct wait_exceptional_entry_queue { + wait_queue_t wait; + struct exceptional_entry_key key; +}; + +static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode, + int sync, void *keyp) +{ + struct exceptional_entry_key *key = keyp; + struct wait_exceptional_entry_queue *ewait = + container_of(wait, struct wait_exceptional_entry_queue, wait); + + if (key->mapping != ewait->key.mapping || + key->index != ewait->key.index) + return 0; + return autoremove_wake_function(wait, mode, sync, NULL); +} + +/* + * Check whether the given slot is locked. The function must be called with + * mapping->tree_lock held + */ +static inline int slot_locked(struct address_space *mapping, void **slot) +{ + unsigned long entry = (unsigned long) + radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + return entry & RADIX_DAX_ENTRY_LOCK; +} + +/* + * Mark the given slot is locked. The function must be called with + * mapping->tree_lock held + */ +static inline void *lock_slot(struct address_space *mapping, void **slot) +{ + unsigned long entry = (unsigned long) + radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + + entry |= RADIX_DAX_ENTRY_LOCK; + radix_tree_replace_slot(slot, (void *)entry); + return (void *)entry; +} + +/* + * Mark the given slot is unlocked. The function must be called with + * mapping->tree_lock held + */ +static inline void *unlock_slot(struct address_space *mapping, void **slot) +{ + unsigned long entry = (unsigned long) + radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + + entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; + radix_tree_replace_slot(slot, (void *)entry); + return (void *)entry; +} + +/* + * Lookup entry in radix tree, wait for it to become unlocked if it is + * exceptional entry and return it. The caller must call + * put_unlocked_mapping_entry() when he decided not to lock the entry or + * put_locked_mapping_entry() when he locked the entry and now wants to + * unlock it. + * + * The function must be called with mapping->tree_lock held. + */ +static void *get_unlocked_mapping_entry(struct address_space *mapping, + pgoff_t index, void ***slotp) +{ + void *ret, **slot; + struct wait_exceptional_entry_queue ewait; + wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index); + + init_wait(&ewait.wait); + ewait.wait.func = wake_exceptional_entry_func; + ewait.key.mapping = mapping; + ewait.key.index = index; + + for (;;) { + ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, + &slot); + if (!ret || !radix_tree_exceptional_entry(ret) || + !slot_locked(mapping, slot)) { + if (slotp) + *slotp = slot; + return ret; + } + prepare_to_wait_exclusive(wq, &ewait.wait, + TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&mapping->tree_lock); + schedule(); + finish_wait(wq, &ewait.wait); + spin_lock_irq(&mapping->tree_lock); + } +} + +/* + * Find radix tree entry at given index. If it points to a page, return with + * the page locked. If it points to the exceptional entry, return with the + * radix tree entry locked. If the radix tree doesn't contain given index, + * create empty exceptional entry for the index and return with it locked. + * + * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For + * persistent memory the benefit is doubtful. We can add that later if we can + * show it helps. + */ +static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index) +{ + void *ret, **slot; + +restart: + spin_lock_irq(&mapping->tree_lock); + ret = get_unlocked_mapping_entry(mapping, index, &slot); + /* No entry for given index? Make sure radix tree is big enough. */ + if (!ret) { + int err; + + spin_unlock_irq(&mapping->tree_lock); + err = radix_tree_preload( + mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); + if (err) + return ERR_PTR(err); + ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | + RADIX_DAX_ENTRY_LOCK); + spin_lock_irq(&mapping->tree_lock); + err = radix_tree_insert(&mapping->page_tree, index, ret); + radix_tree_preload_end(); + if (err) { + spin_unlock_irq(&mapping->tree_lock); + /* Someone already created the entry? */ + if (err == -EEXIST) + goto restart; + return ERR_PTR(err); + } + /* Good, we have inserted empty locked entry into the tree. */ + mapping->nrexceptional++; + spin_unlock_irq(&mapping->tree_lock); + return ret; + } + /* Normal page in radix tree? */ + if (!radix_tree_exceptional_entry(ret)) { + struct page *page = ret; + + get_page(page); + spin_unlock_irq(&mapping->tree_lock); + lock_page(page); + /* Page got truncated? Retry... */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + put_page(page); + goto restart; + } + return page; + } + ret = lock_slot(mapping, slot); + spin_unlock_irq(&mapping->tree_lock); + return ret; +} + +void dax_wake_mapping_entry_waiter(struct address_space *mapping, + pgoff_t index, bool wake_all) +{ + wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index); + + /* + * Checking for locked entry and prepare_to_wait_exclusive() happens + * under mapping->tree_lock, ditto for entry handling in our callers. + * So at this point all tasks that could have seen our entry locked + * must be in the waitqueue and the following check will see them. + */ + if (waitqueue_active(wq)) { + struct exceptional_entry_key key; + + key.mapping = mapping; + key.index = index; + __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); + } +} + +void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index) +{ + void *ret, **slot; + + spin_lock_irq(&mapping->tree_lock); + ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); + if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) || + !slot_locked(mapping, slot))) { + spin_unlock_irq(&mapping->tree_lock); + return; + } + unlock_slot(mapping, slot); + spin_unlock_irq(&mapping->tree_lock); + dax_wake_mapping_entry_waiter(mapping, index, false); +} + +static void put_locked_mapping_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + if (!radix_tree_exceptional_entry(entry)) { + unlock_page(entry); + put_page(entry); + } else { + dax_unlock_mapping_entry(mapping, index); + } +} + +/* + * Called when we are done with radix tree entry we looked up via + * get_unlocked_mapping_entry() and which we didn't lock in the end. + */ +static void put_unlocked_mapping_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + if (!radix_tree_exceptional_entry(entry)) + return; + + /* We have to wake up next waiter for the radix tree entry lock */ + dax_wake_mapping_entry_waiter(mapping, index, false); +} + +/* + * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree + * entry to get unlocked before deleting it. + */ +int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) +{ + void *entry; + + spin_lock_irq(&mapping->tree_lock); + entry = get_unlocked_mapping_entry(mapping, index, NULL); + /* + * This gets called from truncate / punch_hole path. As such, the caller + * must hold locks protecting against concurrent modifications of the + * radix tree (usually fs-private i_mmap_sem for writing). Since the + * caller has seen exceptional entry for this index, we better find it + * at that index as well... + */ + if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) { + spin_unlock_irq(&mapping->tree_lock); + return 0; + } + radix_tree_delete(&mapping->page_tree, index); + mapping->nrexceptional--; + spin_unlock_irq(&mapping->tree_lock); + dax_wake_mapping_entry_waiter(mapping, index, true); + + return 1; +} + +/* * The user has performed a load from a hole in the file. Allocating * a new page in the file would cause excessive storage usage for * workloads with sparse files. We allocate a page cache page instead. @@ -318,24 +556,24 @@ EXPORT_SYMBOL_GPL(dax_do_io); * otherwise it will simply fall out of the page cache under memory * pressure without ever having been dirtied. */ -static int dax_load_hole(struct address_space *mapping, struct page *page, - struct vm_fault *vmf) +static int dax_load_hole(struct address_space *mapping, void *entry, + struct vm_fault *vmf) { - unsigned long size; - struct inode *inode = mapping->host; - if (!page) - page = find_or_create_page(mapping, vmf->pgoff, - GFP_KERNEL | __GFP_ZERO); - if (!page) - return VM_FAULT_OOM; - /* Recheck i_size under page lock to avoid truncate race */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (vmf->pgoff >= size) { - unlock_page(page); - put_page(page); - return VM_FAULT_SIGBUS; + struct page *page; + + /* Hole page already exists? Return it... */ + if (!radix_tree_exceptional_entry(entry)) { + vmf->page = entry; + return VM_FAULT_LOCKED; } + /* This will replace locked radix tree entry with a hole page */ + page = find_or_create_page(mapping, vmf->pgoff, + vmf->gfp_mask | __GFP_ZERO); + if (!page) { + put_locked_mapping_entry(mapping, vmf->pgoff, entry); + return VM_FAULT_OOM; + } vmf->page = page; return VM_FAULT_LOCKED; } @@ -359,77 +597,72 @@ static int copy_user_bh(struct page *to, struct inode *inode, return 0; } -#define NO_SECTOR -1 #define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT)) -static int dax_radix_entry(struct address_space *mapping, pgoff_t index, - sector_t sector, bool pmd_entry, bool dirty) +static void *dax_insert_mapping_entry(struct address_space *mapping, + struct vm_fault *vmf, + void *entry, sector_t sector) { struct radix_tree_root *page_tree = &mapping->page_tree; - pgoff_t pmd_index = DAX_PMD_INDEX(index); - int type, error = 0; - void *entry; + int error = 0; + bool hole_fill = false; + void *new_entry; + pgoff_t index = vmf->pgoff; - WARN_ON_ONCE(pmd_entry && !dirty); - if (dirty) + if (vmf->flags & FAULT_FLAG_WRITE) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - spin_lock_irq(&mapping->tree_lock); - - entry = radix_tree_lookup(page_tree, pmd_index); - if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) { - index = pmd_index; - goto dirty; + /* Replacing hole page with block mapping? */ + if (!radix_tree_exceptional_entry(entry)) { + hole_fill = true; + /* + * Unmap the page now before we remove it from page cache below. + * The page is locked so it cannot be faulted in again. + */ + unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, + PAGE_SIZE, 0); + error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM); + if (error) + return ERR_PTR(error); } - entry = radix_tree_lookup(page_tree, index); - if (entry) { - type = RADIX_DAX_TYPE(entry); - if (WARN_ON_ONCE(type != RADIX_DAX_PTE && - type != RADIX_DAX_PMD)) { - error = -EIO; + spin_lock_irq(&mapping->tree_lock); + new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) | + RADIX_DAX_ENTRY_LOCK); + if (hole_fill) { + __delete_from_page_cache(entry, NULL); + /* Drop pagecache reference */ + put_page(entry); + error = radix_tree_insert(page_tree, index, new_entry); + if (error) { + new_entry = ERR_PTR(error); goto unlock; } + mapping->nrexceptional++; + } else { + void **slot; + void *ret; - if (!pmd_entry || type == RADIX_DAX_PMD) - goto dirty; - - /* - * We only insert dirty PMD entries into the radix tree. This - * means we don't need to worry about removing a dirty PTE - * entry and inserting a clean PMD entry, thus reducing the - * range we would flush with a follow-up fsync/msync call. - */ - radix_tree_delete(&mapping->page_tree, index); - mapping->nrexceptional--; - } - - if (sector == NO_SECTOR) { - /* - * This can happen during correct operation if our pfn_mkwrite - * fault raced against a hole punch operation. If this - * happens the pte that was hole punched will have been - * unmapped and the radix tree entry will have been removed by - * the time we are called, but the call will still happen. We - * will return all the way up to wp_pfn_shared(), where the - * pte_same() check will fail, eventually causing page fault - * to be retried by the CPU. - */ - goto unlock; + ret = __radix_tree_lookup(page_tree, index, NULL, &slot); + WARN_ON_ONCE(ret != entry); + radix_tree_replace_slot(slot, new_entry); } - - error = radix_tree_insert(page_tree, index, - RADIX_DAX_ENTRY(sector, pmd_entry)); - if (error) - goto unlock; - - mapping->nrexceptional++; - dirty: - if (dirty) + if (vmf->flags & FAULT_FLAG_WRITE) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); unlock: spin_unlock_irq(&mapping->tree_lock); - return error; + if (hole_fill) { + radix_tree_preload_end(); + /* + * We don't need hole page anymore, it has been replaced with + * locked radix tree entry now. + */ + if (mapping->a_ops->freepage) + mapping->a_ops->freepage(entry); + unlock_page(entry); + put_page(entry); + } + return new_entry; } static int dax_writeback_one(struct block_device *bdev, @@ -555,56 +788,29 @@ int dax_writeback_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); -static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, +static int dax_insert_mapping(struct address_space *mapping, + struct buffer_head *bh, void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = (unsigned long)vmf->virtual_address; - struct address_space *mapping = inode->i_mapping; struct block_device *bdev = bh->b_bdev; struct blk_dax_ctl dax = { - .sector = to_sector(bh, inode), + .sector = to_sector(bh, mapping->host), .size = bh->b_size, }; - pgoff_t size; - int error; - - i_mmap_lock_read(mapping); - - /* - * Check truncate didn't happen while we were allocating a block. - * If it did, this block may or may not be still allocated to the - * file. We can't tell the filesystem to free it because we can't - * take i_mutex here. In the worst case, the file still has blocks - * allocated past the end of the file. - */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (unlikely(vmf->pgoff >= size)) { - error = -EIO; - goto out; - } + void *ret; + void *entry = *entryp; - if (dax_map_atomic(bdev, &dax) < 0) { - error = PTR_ERR(dax.addr); - goto out; - } - - if (buffer_unwritten(bh) || buffer_new(bh)) { - clear_pmem(dax.addr, PAGE_SIZE); - wmb_pmem(); - } + if (dax_map_atomic(bdev, &dax) < 0) + return PTR_ERR(dax.addr); dax_unmap_atomic(bdev, &dax); - error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false, - vmf->flags & FAULT_FLAG_WRITE); - if (error) - goto out; - - error = vm_insert_mixed(vma, vaddr, dax.pfn); + ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector); + if (IS_ERR(ret)) + return PTR_ERR(ret); + *entryp = ret; - out: - i_mmap_unlock_read(mapping); - - return error; + return vm_insert_mixed(vma, vaddr, dax.pfn); } /** @@ -612,24 +818,18 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault * @get_block: The filesystem method used to translate file offsets to blocks - * @complete_unwritten: The filesystem method used to convert unwritten blocks - * to written so the data written to them is exposed. This is required for - * required by write faults for filesystems that will return unwritten - * extent mappings from @get_block, but it is optional for reads as - * dax_insert_mapping() will always zero unwritten blocks. If the fs does - * not support unwritten extents, the it should pass NULL. * * When a page fault occurs, filesystems may call this helper in their * fault handler for DAX files. __dax_fault() assumes the caller has done all * the necessary locking for the page fault to proceed successfully. */ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block, dax_iodone_t complete_unwritten) + get_block_t get_block) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - struct page *page; + void *entry; struct buffer_head bh; unsigned long vaddr = (unsigned long)vmf->virtual_address; unsigned blkbits = inode->i_blkbits; @@ -638,6 +838,11 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, int error; int major = 0; + /* + * Check whether offset isn't beyond end of file now. Caller is supposed + * to hold locks serializing us with truncate / punch hole so this is + * a reliable test. + */ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) return VM_FAULT_SIGBUS; @@ -647,49 +852,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_SIZE; - repeat: - page = find_get_page(mapping, vmf->pgoff); - if (page) { - if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { - put_page(page); - return VM_FAULT_RETRY; - } - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - put_page(page); - goto repeat; - } - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (unlikely(vmf->pgoff >= size)) { - /* - * We have a struct page covering a hole in the file - * from a read fault and we've raced with a truncate - */ - error = -EIO; - goto unlock_page; - } + entry = grab_mapping_entry(mapping, vmf->pgoff); + if (IS_ERR(entry)) { + error = PTR_ERR(entry); + goto out; } error = get_block(inode, block, &bh, 0); if (!error && (bh.b_size < PAGE_SIZE)) error = -EIO; /* fs corruption? */ if (error) - goto unlock_page; - - if (!buffer_mapped(&bh) && !vmf->cow_page) { - if (vmf->flags & FAULT_FLAG_WRITE) { - error = get_block(inode, block, &bh, 1); - count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); - major = VM_FAULT_MAJOR; - if (!error && (bh.b_size < PAGE_SIZE)) - error = -EIO; - if (error) - goto unlock_page; - } else { - return dax_load_hole(mapping, page, vmf); - } - } + goto unlock_entry; if (vmf->cow_page) { struct page *new_page = vmf->cow_page; @@ -698,53 +871,35 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, else clear_user_highpage(new_page, vaddr); if (error) - goto unlock_page; - vmf->page = page; - if (!page) { - i_mmap_lock_read(mapping); - /* Check we didn't race with truncate */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> - PAGE_SHIFT; - if (vmf->pgoff >= size) { - i_mmap_unlock_read(mapping); - error = -EIO; - goto out; - } + goto unlock_entry; + if (!radix_tree_exceptional_entry(entry)) { + vmf->page = entry; + return VM_FAULT_LOCKED; } - return VM_FAULT_LOCKED; + vmf->entry = entry; + return VM_FAULT_DAX_LOCKED; } - /* Check we didn't race with a read fault installing a new page */ - if (!page && major) - page = find_lock_page(mapping, vmf->pgoff); - - if (page) { - unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, - PAGE_SIZE, 0); - delete_from_page_cache(page); - unlock_page(page); - put_page(page); - page = NULL; - } - - /* - * If we successfully insert the new mapping over an unwritten extent, - * we need to ensure we convert the unwritten extent. If there is an - * error inserting the mapping, the filesystem needs to leave it as - * unwritten to prevent exposure of the stale underlying data to - * userspace, but we still need to call the completion function so - * the private resources on the mapping buffer can be released. We - * indicate what the callback should do via the uptodate variable, same - * as for normal BH based IO completions. - */ - error = dax_insert_mapping(inode, &bh, vma, vmf); - if (buffer_unwritten(&bh)) { - if (complete_unwritten) - complete_unwritten(&bh, !error); - else - WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE)); + if (!buffer_mapped(&bh)) { + if (vmf->flags & FAULT_FLAG_WRITE) { + error = get_block(inode, block, &bh, 1); + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + major = VM_FAULT_MAJOR; + if (!error && (bh.b_size < PAGE_SIZE)) + error = -EIO; + if (error) + goto unlock_entry; + } else { + return dax_load_hole(mapping, entry, vmf); + } } + /* Filesystem should not return unwritten buffers to us! */ + WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); + error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf); + unlock_entry: + put_locked_mapping_entry(mapping, vmf->pgoff, entry); out: if (error == -ENOMEM) return VM_FAULT_OOM | major; @@ -752,13 +907,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if ((error < 0) && (error != -EBUSY)) return VM_FAULT_SIGBUS | major; return VM_FAULT_NOPAGE | major; - - unlock_page: - if (page) { - unlock_page(page); - put_page(page); - } - goto out; } EXPORT_SYMBOL(__dax_fault); @@ -772,7 +920,7 @@ EXPORT_SYMBOL(__dax_fault); * fault handler for DAX files. */ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block, dax_iodone_t complete_unwritten) + get_block_t get_block) { int result; struct super_block *sb = file_inode(vma->vm_file)->i_sb; @@ -781,7 +929,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, sb_start_pagefault(sb); file_update_time(vma->vm_file); } - result = __dax_fault(vma, vmf, get_block, complete_unwritten); + result = __dax_fault(vma, vmf, get_block); if (vmf->flags & FAULT_FLAG_WRITE) sb_end_pagefault(sb); @@ -789,7 +937,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } EXPORT_SYMBOL_GPL(dax_fault); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) /* * The 'colour' (ie low bits) within a PMD of a page offset. This comes up * more often than one might expect in the below function. @@ -815,8 +963,7 @@ static void __dax_dbg(struct buffer_head *bh, unsigned long address, #define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd") int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, get_block_t get_block, - dax_iodone_t complete_unwritten) + pmd_t *pmd, unsigned int flags, get_block_t get_block) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; @@ -828,7 +975,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, struct block_device *bdev; pgoff_t size, pgoff; sector_t block; - int error, result = 0; + int result = 0; bool alloc = false; /* dax pmd mappings require pfn_t_devmap() */ @@ -875,6 +1022,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, if (get_block(inode, block, &bh, 1) != 0) return VM_FAULT_SIGBUS; alloc = true; + WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); } bdev = bh.b_bdev; @@ -900,26 +1048,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, truncate_pagecache_range(inode, lstart, lend); } - i_mmap_lock_read(mapping); - - /* - * If a truncate happened while we were allocating blocks, we may - * leave blocks allocated to the file that are beyond EOF. We can't - * take i_mutex here, so just leave them hanging; they'll be freed - * when the file is deleted. - */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (pgoff >= size) { - result = VM_FAULT_SIGBUS; - goto out; - } - if ((pgoff | PG_PMD_COLOUR) >= size) { - dax_pmd_dbg(&bh, address, - "offset + huge page size > file size"); - goto fallback; - } - - if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) { + if (!write && !buffer_mapped(&bh)) { spinlock_t *ptl; pmd_t entry; struct page *zero_page = get_huge_zero_page(); @@ -954,8 +1083,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, long length = dax_map_atomic(bdev, &dax); if (length < 0) { - result = VM_FAULT_SIGBUS; - goto out; + dax_pmd_dbg(&bh, address, "dax-error fallback"); + goto fallback; } if (length < PMD_SIZE) { dax_pmd_dbg(&bh, address, "dax-length too small"); @@ -973,14 +1102,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, dax_pmd_dbg(&bh, address, "pfn not in memmap"); goto fallback; } - - if (buffer_unwritten(&bh) || buffer_new(&bh)) { - clear_pmem(dax.addr, PMD_SIZE); - wmb_pmem(); - count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); - result |= VM_FAULT_MAJOR; - } dax_unmap_atomic(bdev, &dax); /* @@ -999,13 +1120,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, * the write to insert a dirty entry. */ if (write) { - error = dax_radix_entry(mapping, pgoff, dax.sector, - true, true); - if (error) { - dax_pmd_dbg(&bh, address, - "PMD radix insertion failed"); - goto fallback; - } + /* + * We should insert radix-tree entry and dirty it here. + * For now this is broken... + */ } dev_dbg(part_to_dev(bdev->bd_part), @@ -1018,11 +1136,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, } out: - i_mmap_unlock_read(mapping); - - if (buffer_unwritten(&bh)) - complete_unwritten(&bh, !(result & VM_FAULT_ERROR)); - return result; fallback: @@ -1042,8 +1155,7 @@ EXPORT_SYMBOL_GPL(__dax_pmd_fault); * pmd_fault handler for DAX files. */ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, get_block_t get_block, - dax_iodone_t complete_unwritten) + pmd_t *pmd, unsigned int flags, get_block_t get_block) { int result; struct super_block *sb = file_inode(vma->vm_file)->i_sb; @@ -1052,8 +1164,7 @@ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, sb_start_pagefault(sb); file_update_time(vma->vm_file); } - result = __dax_pmd_fault(vma, address, pmd, flags, get_block, - complete_unwritten); + result = __dax_pmd_fault(vma, address, pmd, flags, get_block); if (flags & FAULT_FLAG_WRITE) sb_end_pagefault(sb); @@ -1070,27 +1181,59 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault); int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct file *file = vma->vm_file; - int error; - - /* - * We pass NO_SECTOR to dax_radix_entry() because we expect that a - * RADIX_DAX_PTE entry already exists in the radix tree from a - * previous call to __dax_fault(). We just want to look up that PTE - * entry using vmf->pgoff and make sure the dirty tag is set. This - * saves us from having to make a call to get_block() here to look - * up the sector. - */ - error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, - true); + struct address_space *mapping = file->f_mapping; + void *entry; + pgoff_t index = vmf->pgoff; - if (error == -ENOMEM) - return VM_FAULT_OOM; - if (error) - return VM_FAULT_SIGBUS; + spin_lock_irq(&mapping->tree_lock); + entry = get_unlocked_mapping_entry(mapping, index, NULL); + if (!entry || !radix_tree_exceptional_entry(entry)) + goto out; + radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); + put_unlocked_mapping_entry(mapping, index, entry); +out: + spin_unlock_irq(&mapping->tree_lock); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); +static bool dax_range_is_aligned(struct block_device *bdev, + unsigned int offset, unsigned int length) +{ + unsigned short sector_size = bdev_logical_block_size(bdev); + + if (!IS_ALIGNED(offset, sector_size)) + return false; + if (!IS_ALIGNED(length, sector_size)) + return false; + + return true; +} + +int __dax_zero_page_range(struct block_device *bdev, sector_t sector, + unsigned int offset, unsigned int length) +{ + struct blk_dax_ctl dax = { + .sector = sector, + .size = PAGE_SIZE, + }; + + if (dax_range_is_aligned(bdev, offset, length)) { + sector_t start_sector = dax.sector + (offset >> 9); + + return blkdev_issue_zeroout(bdev, start_sector, + length >> 9, GFP_NOFS, true); + } else { + if (dax_map_atomic(bdev, &dax) < 0) + return PTR_ERR(dax.addr); + clear_pmem(dax.addr + offset, length); + wmb_pmem(); + dax_unmap_atomic(bdev, &dax); + } + return 0; +} +EXPORT_SYMBOL_GPL(__dax_zero_page_range); + /** * dax_zero_page_range - zero a range within a page of a DAX file * @inode: The file being truncated @@ -1102,12 +1245,6 @@ EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); * page in a DAX file. This is intended for hole-punch operations. If * you are truncating a file, the helper function dax_truncate_page() may be * more convenient. - * - * We work in terms of PAGE_SIZE here for commonality with - * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem - * took care of disposing of the unnecessary blocks. Even if the filesystem - * block size is smaller than PAGE_SIZE, we have to zero the rest of the page - * since the file might be mmapped. */ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, get_block_t get_block) @@ -1126,23 +1263,11 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_SIZE; err = get_block(inode, index, &bh, 0); - if (err < 0) + if (err < 0 || !buffer_written(&bh)) return err; - if (buffer_written(&bh)) { - struct block_device *bdev = bh.b_bdev; - struct blk_dax_ctl dax = { - .sector = to_sector(&bh, inode), - .size = PAGE_SIZE, - }; - if (dax_map_atomic(bdev, &dax) < 0) - return PTR_ERR(dax.addr); - clear_pmem(dax.addr + offset, length); - wmb_pmem(); - dax_unmap_atomic(bdev, &dax); - } - - return 0; + return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode), + offset, length); } EXPORT_SYMBOL_GPL(dax_zero_page_range); @@ -1154,12 +1279,6 @@ EXPORT_SYMBOL_GPL(dax_zero_page_range); * * Similar to block_truncate_page(), this function can be called by a * filesystem when it is truncating a DAX file to handle the partial page. - * - * We work in terms of PAGE_SIZE here for commonality with - * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem - * took care of disposing of the unnecessary blocks. Even if the filesystem - * block size is smaller than PAGE_SIZE, we have to zero the rest of the page - * since the file might be mmapped. */ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) { diff --git a/fs/direct-io.c b/fs/direct-io.c index 3bf3f20f8ecc..f3b4408be590 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -628,11 +628,11 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, map_bh->b_size = fs_count << i_blkbits; /* - * For writes inside i_size on a DIO_SKIP_HOLES filesystem we - * forbid block creations: only overwrites are permitted. - * We will return early to the caller once we see an - * unmapped buffer head returned, and the caller will fall - * back to buffered I/O. + * For writes that could fill holes inside i_size on a + * DIO_SKIP_HOLES filesystem we forbid block creations: only + * overwrites are permitted. We will return early to the caller + * once we see an unmapped buffer head returned, and the caller + * will fall back to buffered I/O. * * Otherwise the decision is left to the get_blocks method, * which may decide to handle it or also return an unmapped @@ -640,8 +640,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, */ create = dio->rw & WRITE; if (dio->flags & DIO_SKIP_HOLES) { - if (sdio->block_in_file < (i_size_read(dio->inode) >> - sdio->blkbits)) + if (fs_startblk <= ((i_size_read(dio->inode) - 1) >> + i_blkbits)) create = 0; } diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index ebd40f46ed4c..0d8eb3455b34 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1141,12 +1141,13 @@ ecryptfs_write_metadata_to_contents(struct inode *ecryptfs_inode, static int ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry, + struct inode *ecryptfs_inode, char *page_virt, size_t size) { int rc; - rc = ecryptfs_setxattr(ecryptfs_dentry, ECRYPTFS_XATTR_NAME, page_virt, - size, 0); + rc = ecryptfs_setxattr(ecryptfs_dentry, ecryptfs_inode, + ECRYPTFS_XATTR_NAME, page_virt, size, 0); return rc; } @@ -1215,8 +1216,8 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry, goto out_free; } if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) - rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt, - size); + rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, ecryptfs_inode, + virt, size); else rc = ecryptfs_write_metadata_to_contents(ecryptfs_inode, virt, virt_len); diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 3ec495db7e82..4ba1547bb9ad 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -609,8 +609,8 @@ ssize_t ecryptfs_getxattr_lower(struct dentry *lower_dentry, struct inode *lower_inode, const char *name, void *value, size_t size); int -ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags); +ecryptfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, + const void *value, size_t size, int flags); int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode); #ifdef CONFIG_ECRYPT_FS_MESSAGING int ecryptfs_process_response(struct ecryptfs_daemon *daemon, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 318b04689d76..9d153b6a1d72 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -1001,7 +1001,8 @@ static int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, } int -ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, +ecryptfs_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, int flags) { int rc = 0; @@ -1014,8 +1015,8 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, } rc = vfs_setxattr(lower_dentry, name, value, size, flags); - if (!rc && d_really_is_positive(dentry)) - fsstack_copy_attr_all(d_inode(dentry), d_inode(lower_dentry)); + if (!rc && inode) + fsstack_copy_attr_all(inode, d_inode(lower_dentry)); out: return rc; } diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 148d11b514fb..9c3437c8a5b1 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -442,7 +442,8 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) if (size < 0) size = 8; put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt); - rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME, + rc = lower_inode->i_op->setxattr(lower_dentry, lower_inode, + ECRYPTFS_XATTR_NAME, xattr_virt, size, 0); inode_unlock(lower_inode); if (rc) diff --git a/fs/ext2/file.c b/fs/ext2/file.c index c1400b109805..868c02317b05 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -51,7 +51,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } down_read(&ei->dax_sem); - ret = __dax_fault(vma, vmf, ext2_get_block, NULL); + ret = __dax_fault(vma, vmf, ext2_get_block); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) @@ -72,7 +72,7 @@ static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, } down_read(&ei->dax_sem); - ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL); + ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block); up_read(&ei->dax_sem); if (flags & FAULT_FLAG_WRITE) diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index b675610391b8..fcbe58641e40 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -26,6 +26,7 @@ #include <linux/highuid.h> #include <linux/pagemap.h> #include <linux/dax.h> +#include <linux/blkdev.h> #include <linux/quotaops.h> #include <linux/writeback.h> #include <linux/buffer_head.h> @@ -737,19 +738,18 @@ static int ext2_get_blocks(struct inode *inode, * so that it's not found by another thread before it's * initialised */ - err = dax_clear_sectors(inode->i_sb->s_bdev, - le32_to_cpu(chain[depth-1].key) << - (inode->i_blkbits - 9), - 1 << inode->i_blkbits); + err = sb_issue_zeroout(inode->i_sb, + le32_to_cpu(chain[depth-1].key), count, + GFP_NOFS); if (err) { mutex_unlock(&ei->truncate_mutex); goto cleanup; } - } + } else + set_buffer_new(bh_result); ext2_splice_branch(inode, iblock, partial, indirect_blks, count); mutex_unlock(&ei->truncate_mutex); - set_buffer_new(bh_result); got_it: map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); if (count > blocks_to_boundary) diff --git a/fs/ext2/super.c b/fs/ext2/super.c index b78caf25f746..1d9379568aa8 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -922,16 +922,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); if (sbi->s_mount_opt & EXT2_MOUNT_DAX) { - if (blocksize != PAGE_SIZE) { - ext2_msg(sb, KERN_ERR, - "error: unsupported blocksize for dax"); + err = bdev_dax_supported(sb, blocksize); + if (err) goto failed_mount; - } - if (!sb->s_bdev->bd_disk->fops->direct_access) { - ext2_msg(sb, KERN_ERR, - "error: device does not support dax"); - goto failed_mount; - } } /* If the blocksize doesn't match, re-read the thing.. */ diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index 7fd3b867ce65..7b9e9c1842d5 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -18,10 +18,11 @@ ext2_xattr_security_get(const struct xattr_handler *handler, static int ext2_xattr_security_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name, + return ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, name, value, size, flags); } diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 0f85705ff519..65049b71af13 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -25,10 +25,11 @@ ext2_xattr_trusted_get(const struct xattr_handler *handler, static int ext2_xattr_trusted_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name, + return ext2_xattr_set(inode, EXT2_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index 1fafd27037cc..fb2f992ae763 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -29,13 +29,14 @@ ext2_xattr_user_get(const struct xattr_handler *handler, static int ext2_xattr_user_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - if (!test_opt(dentry->d_sb, XATTR_USER)) + if (!test_opt(inode->i_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_USER, + return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index d478110c32a6..df44c877892a 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -202,7 +202,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; else - result = __dax_fault(vma, vmf, ext4_dax_get_block, NULL); + result = __dax_fault(vma, vmf, ext4_dax_get_block); if (write) { if (!IS_ERR(handle)) @@ -238,7 +238,7 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, result = VM_FAULT_SIGBUS; else result = __dax_pmd_fault(vma, addr, pmd, flags, - ext4_dax_get_block, NULL); + ext4_dax_get_block); if (write) { if (!IS_ERR(handle)) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 20c5d52253b4..3822a5aedc61 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3417,16 +3417,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (sbi->s_mount_opt & EXT4_MOUNT_DAX) { - if (blocksize != PAGE_SIZE) { - ext4_msg(sb, KERN_ERR, - "error: unsupported blocksize for dax"); - goto failed_mount; - } - if (!sb->s_bdev->bd_disk->fops->direct_access) { - ext4_msg(sb, KERN_ERR, - "error: device does not support dax"); + err = bdev_dax_supported(sb, blocksize); + if (err) goto failed_mount; - } } if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 123a7d010efe..a8921112030d 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -22,10 +22,11 @@ ext4_xattr_security_get(const struct xattr_handler *handler, static int ext4_xattr_security_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY, + return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY, name, value, size, flags); } diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index 60652fa24cbc..c7765c735714 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -29,10 +29,11 @@ ext4_xattr_trusted_get(const struct xattr_handler *handler, static int ext4_xattr_trusted_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED, + return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index 17a446ffecd3..ca20e423034b 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -30,12 +30,13 @@ ext4_xattr_user_get(const struct xattr_handler *handler, static int ext4_xattr_user_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - if (!test_opt(dentry->d_sb, XATTR_USER)) + if (!test_opt(inode->i_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_USER, + return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 00ea56797258..e3decae3acfb 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -50,10 +50,11 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler, } static int f2fs_xattr_generic_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, const void *value, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, size_t size, int flags) { - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); switch (handler->flags) { case F2FS_XATTR_INDEX_USER: @@ -69,7 +70,7 @@ static int f2fs_xattr_generic_set(const struct xattr_handler *handler, default: return -EINVAL; } - return f2fs_setxattr(d_inode(dentry), handler->flags, name, + return f2fs_setxattr(inode, handler->flags, name, value, size, NULL, flags); } @@ -95,11 +96,10 @@ static int f2fs_xattr_advise_get(const struct xattr_handler *handler, } static int f2fs_xattr_advise_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, const void *value, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, size_t size, int flags) { - struct inode *inode = d_inode(dentry); - if (!inode_owner_or_capable(inode)) return -EPERM; if (value == NULL) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b9419058108f..ccd4971cc6c1 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1719,10 +1719,10 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, return fuse_update_attributes(inode, stat, NULL, NULL); } -static int fuse_setxattr(struct dentry *entry, const char *name, - const void *value, size_t size, int flags) +static int fuse_setxattr(struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); struct fuse_setxattr_in inarg; diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 4a01f30e9995..271d93905bac 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -783,12 +783,15 @@ static int get_leaf_nr(struct gfs2_inode *dip, u32 index, u64 *leaf_out) { __be64 *hash; + int error; hash = gfs2_dir_get_hash_table(dip); - if (IS_ERR(hash)) - return PTR_ERR(hash); - *leaf_out = be64_to_cpu(*(hash + index)); - return 0; + error = PTR_ERR_OR_ZERO(hash); + + if (!error) + *leaf_out = be64_to_cpu(*(hash + index)); + + return error; } static int get_first_leaf(struct gfs2_inode *dip, u32 index, @@ -798,7 +801,7 @@ static int get_first_leaf(struct gfs2_inode *dip, u32 index, int error; error = get_leaf_nr(dip, index, &leaf_no); - if (!IS_ERR_VALUE(error)) + if (!error) error = get_leaf(dip, leaf_no, bh_out); return error; @@ -1014,7 +1017,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) index = name->hash >> (32 - dip->i_depth); error = get_leaf_nr(dip, index, &leaf_no); - if (IS_ERR_VALUE(error)) + if (error) return error; /* Get the old leaf block */ diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index f42ab53bd30d..3a2853504084 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1251,10 +1251,10 @@ int __gfs2_xattr_set(struct inode *inode, const char *name, } static int gfs2_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int ret; diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c index 064f92f17efc..d9a86919fdf6 100644 --- a/fs/hfs/attr.c +++ b/fs/hfs/attr.c @@ -13,10 +13,10 @@ #include "hfs_fs.h" #include "btree.h" -int hfs_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +int hfs_setxattr(struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); struct hfs_find_data fd; hfs_cat_rec rec; struct hfs_cat_file *file; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index fa3eed86837c..ee2f385811c8 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -212,7 +212,7 @@ extern void hfs_evict_inode(struct inode *); extern void hfs_delete_inode(struct inode *); /* attr.c */ -extern int hfs_setxattr(struct dentry *dentry, const char *name, +extern int hfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags); extern ssize_t hfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size); diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 4f118d282a7a..d37bb88dc746 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -424,7 +424,7 @@ static int copy_name(char *buffer, const char *xattr_name, int name_len) return len; } -int hfsplus_setxattr(struct dentry *dentry, const char *name, +int hfsplus_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags, const char *prefix, size_t prefixlen) { @@ -437,8 +437,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name, return -ENOMEM; strcpy(xattr_name, prefix); strcpy(xattr_name + prefixlen, name); - res = __hfsplus_setxattr(d_inode(dentry), xattr_name, value, size, - flags); + res = __hfsplus_setxattr(inode, xattr_name, value, size, flags); kfree(xattr_name); return res; } @@ -864,8 +863,9 @@ static int hfsplus_osx_getxattr(const struct xattr_handler *handler, } static int hfsplus_osx_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { /* * Don't allow setting properly prefixed attributes @@ -880,7 +880,7 @@ static int hfsplus_osx_setxattr(const struct xattr_handler *handler, * creates), so we pass the name through unmodified (after * ensuring it doesn't conflict with another namespace). */ - return __hfsplus_setxattr(d_inode(dentry), name, buffer, size, flags); + return __hfsplus_setxattr(inode, name, buffer, size, flags); } const struct xattr_handler hfsplus_xattr_osx_handler = { diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h index d04ba6f58df2..68f6b539371f 100644 --- a/fs/hfsplus/xattr.h +++ b/fs/hfsplus/xattr.h @@ -21,7 +21,7 @@ extern const struct xattr_handler *hfsplus_xattr_handlers[]; int __hfsplus_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags); -int hfsplus_setxattr(struct dentry *dentry, const char *name, +int hfsplus_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags, const char *prefix, size_t prefixlen); diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c index ae2ca8c2e335..37b3efa733ef 100644 --- a/fs/hfsplus/xattr_security.c +++ b/fs/hfsplus/xattr_security.c @@ -23,10 +23,11 @@ static int hfsplus_security_getxattr(const struct xattr_handler *handler, } static int hfsplus_security_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return hfsplus_setxattr(dentry, name, buffer, size, flags, + return hfsplus_setxattr(inode, name, buffer, size, flags, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); } diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c index eae2947060aa..94519d6c627d 100644 --- a/fs/hfsplus/xattr_trusted.c +++ b/fs/hfsplus/xattr_trusted.c @@ -21,10 +21,11 @@ static int hfsplus_trusted_getxattr(const struct xattr_handler *handler, } static int hfsplus_trusted_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return hfsplus_setxattr(dentry, name, buffer, size, flags, + return hfsplus_setxattr(inode, name, buffer, size, flags, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c index 3c9eec3e4c7b..fae6c0ea0030 100644 --- a/fs/hfsplus/xattr_user.c +++ b/fs/hfsplus/xattr_user.c @@ -21,10 +21,11 @@ static int hfsplus_user_getxattr(const struct xattr_handler *handler, } static int hfsplus_user_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return hfsplus_setxattr(dentry, name, buffer, size, flags, + return hfsplus_setxattr(inode, name, buffer, size, flags, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); } diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index 3ed9a4b49778..c2332e30f218 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -57,10 +57,11 @@ static int jffs2_security_getxattr(const struct xattr_handler *handler, } static int jffs2_security_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY, + return do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, buffer, size, flags); } diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c index 4ebecff1d922..5d6030826c52 100644 --- a/fs/jffs2/xattr_trusted.c +++ b/fs/jffs2/xattr_trusted.c @@ -25,10 +25,11 @@ static int jffs2_trusted_getxattr(const struct xattr_handler *handler, } static int jffs2_trusted_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED, + return do_jffs2_setxattr(inode, JFFS2_XPREFIX_TRUSTED, name, buffer, size, flags); } diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c index bce249e1b277..9d027b4abcf9 100644 --- a/fs/jffs2/xattr_user.c +++ b/fs/jffs2/xattr_user.c @@ -25,10 +25,11 @@ static int jffs2_user_getxattr(const struct xattr_handler *handler, } static int jffs2_user_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_USER, + return do_jffs2_setxattr(inode, JFFS2_XPREFIX_USER, name, buffer, size, flags); } diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index beb182b503b3..0bf3c33aedff 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -943,11 +943,10 @@ static int jfs_xattr_get(const struct xattr_handler *handler, } static int jfs_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); - name = xattr_full_name(handler, name); return __jfs_xattr_set(inode, name, value, size, flags); } @@ -962,11 +961,10 @@ static int jfs_xattr_get_os2(const struct xattr_handler *handler, } static int jfs_xattr_set_os2(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); - if (is_known_namespace(name)) return -EOPNOTSUPP; return __jfs_xattr_set(inode, name, value, size, flags); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 1719649d7ad7..63b925d5ba1e 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -160,10 +160,11 @@ static int kernfs_node_setsecdata(struct kernfs_node *kn, void **secdata, return 0; } -int kernfs_iop_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +int kernfs_iop_setxattr(struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *kn = inode->i_private; struct kernfs_iattrs *attrs; void *secdata; int error; @@ -175,11 +176,11 @@ int kernfs_iop_setxattr(struct dentry *dentry, const char *name, if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; - error = security_inode_setsecurity(d_inode(dentry), suffix, + error = security_inode_setsecurity(inode, suffix, value, size, flags); if (error) return error; - error = security_inode_getsecctx(d_inode(dentry), + error = security_inode_getsecctx(inode, &secdata, &secdata_len); if (error) return error; diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 45c9192c276e..37159235ac10 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -81,7 +81,8 @@ int kernfs_iop_permission(struct inode *inode, int mask); int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr); int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); -int kernfs_iop_setxattr(struct dentry *dentry, const char *name, const void *value, +int kernfs_iop_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, int flags); int kernfs_iop_removexattr(struct dentry *dentry, const char *name); ssize_t kernfs_iop_getxattr(struct dentry *dentry, struct inode *inode, diff --git a/fs/libfs.c b/fs/libfs.c index 8765ff1adc07..3db2721144c2 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1118,8 +1118,9 @@ static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr) return -EPERM; } -static int empty_dir_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +static int empty_dir_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { return -EOPNOTSUPP; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 223982eb38c9..de97567795a5 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5015,12 +5015,11 @@ static int nfs4_do_set_security_label(struct inode *inode, } static int -nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen) +nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen) { struct nfs4_label ilabel, *olabel = NULL; struct nfs_fattr fattr; struct rpc_cred *cred; - struct inode *inode = d_inode(dentry); int status; if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) @@ -6281,11 +6280,11 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp) #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler, - struct dentry *dentry, const char *key, - const void *buf, size_t buflen, - int flags) + struct dentry *unused, struct inode *inode, + const char *key, const void *buf, + size_t buflen, int flags) { - return nfs4_proc_set_acl(d_inode(dentry), buf, buflen); + return nfs4_proc_set_acl(inode, buf, buflen); } static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler, @@ -6303,12 +6302,12 @@ static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry) #ifdef CONFIG_NFS_V4_SECURITY_LABEL static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler, - struct dentry *dentry, const char *key, - const void *buf, size_t buflen, - int flags) + struct dentry *unused, struct inode *inode, + const char *key, const void *buf, + size_t buflen, int flags) { if (security_ismaclabel(key)) - return nfs4_set_security_label(dentry, buf, buflen); + return nfs4_set_security_label(inode, buf, buflen); return -EOPNOTSUPP; } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 5075592df145..9679f4749364 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -66,7 +66,7 @@ #define OPENOWNER_POOL_SIZE 8 const nfs4_stateid zero_stateid = { - .data = { 0 }, + { .data = { 0 } }, .type = NFS4_SPECIAL_STATEID_TYPE, }; static DEFINE_MUTEX(nfs_clid_init_mutex); diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index a8d15beee5cb..6aaf3e351391 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -272,10 +272,21 @@ struct o2hb_region { struct delayed_work hr_write_timeout_work; unsigned long hr_last_timeout_start; + /* negotiate timer, used to negotiate extending hb timeout. */ + struct delayed_work hr_nego_timeout_work; + unsigned long hr_nego_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + /* Used during o2hb_check_slot to hold a copy of the block * being checked because we temporarily have to zero out the * crc field. */ struct o2hb_disk_heartbeat_block *hr_tmp_block; + + /* Message key for negotiate timeout message. */ + unsigned int hr_key; + struct list_head hr_handler_list; + + /* last hb status, 0 for success, other value for error. */ + int hr_last_hb_status; }; struct o2hb_bio_wait_ctxt { @@ -284,6 +295,17 @@ struct o2hb_bio_wait_ctxt { int wc_error; }; +#define O2HB_NEGO_TIMEOUT_MS (O2HB_MAX_WRITE_TIMEOUT_MS/2) + +enum { + O2HB_NEGO_TIMEOUT_MSG = 1, + O2HB_NEGO_APPROVE_MSG = 2, +}; + +struct o2hb_nego_msg { + u8 node_num; +}; + static void o2hb_write_timeout(struct work_struct *work) { int failed, quorum; @@ -319,7 +341,7 @@ static void o2hb_write_timeout(struct work_struct *work) o2quo_disk_timeout(); } -static void o2hb_arm_write_timeout(struct o2hb_region *reg) +static void o2hb_arm_timeout(struct o2hb_region *reg) { /* Arm writeout only after thread reaches steady state */ if (atomic_read(®->hr_steady_iterations) != 0) @@ -334,14 +356,132 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg) spin_unlock(&o2hb_live_lock); } cancel_delayed_work(®->hr_write_timeout_work); - reg->hr_last_timeout_start = jiffies; schedule_delayed_work(®->hr_write_timeout_work, msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS)); + + cancel_delayed_work(®->hr_nego_timeout_work); + /* negotiate timeout must be less than write timeout. */ + schedule_delayed_work(®->hr_nego_timeout_work, + msecs_to_jiffies(O2HB_NEGO_TIMEOUT_MS)); + memset(reg->hr_nego_node_bitmap, 0, sizeof(reg->hr_nego_node_bitmap)); } -static void o2hb_disarm_write_timeout(struct o2hb_region *reg) +static void o2hb_disarm_timeout(struct o2hb_region *reg) { cancel_delayed_work_sync(®->hr_write_timeout_work); + cancel_delayed_work_sync(®->hr_nego_timeout_work); +} + +static int o2hb_send_nego_msg(int key, int type, u8 target) +{ + struct o2hb_nego_msg msg; + int status, ret; + + msg.node_num = o2nm_this_node(); +again: + ret = o2net_send_message(type, key, &msg, sizeof(msg), + target, &status); + + if (ret == -EAGAIN || ret == -ENOMEM) { + msleep(100); + goto again; + } + + return ret; +} + +static void o2hb_nego_timeout(struct work_struct *work) +{ + unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + int master_node, i, ret; + struct o2hb_region *reg; + + reg = container_of(work, struct o2hb_region, hr_nego_timeout_work.work); + /* don't negotiate timeout if last hb failed since it is very + * possible io failed. Should let write timeout fence self. + */ + if (reg->hr_last_hb_status) + return; + + o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); + /* lowest node as master node to make negotiate decision. */ + master_node = find_next_bit(live_node_bitmap, O2NM_MAX_NODES, 0); + + if (master_node == o2nm_this_node()) { + if (!test_bit(master_node, reg->hr_nego_node_bitmap)) { + printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s).\n", + o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, + config_item_name(®->hr_item), reg->hr_dev_name); + set_bit(master_node, reg->hr_nego_node_bitmap); + } + if (memcmp(reg->hr_nego_node_bitmap, live_node_bitmap, + sizeof(reg->hr_nego_node_bitmap))) { + /* check negotiate bitmap every second to do timeout + * approve decision. + */ + schedule_delayed_work(®->hr_nego_timeout_work, + msecs_to_jiffies(1000)); + + return; + } + + printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%s) is down.\n", + config_item_name(®->hr_item), reg->hr_dev_name); + /* approve negotiate timeout request. */ + o2hb_arm_timeout(reg); + + i = -1; + while ((i = find_next_bit(live_node_bitmap, + O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { + if (i == master_node) + continue; + + mlog(ML_HEARTBEAT, "send NEGO_APPROVE msg to node %d\n", i); + ret = o2hb_send_nego_msg(reg->hr_key, + O2HB_NEGO_APPROVE_MSG, i); + if (ret) + mlog(ML_ERROR, "send NEGO_APPROVE msg to node %d fail %d\n", + i, ret); + } + } else { + /* negotiate timeout with master node. */ + printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s), negotiate timeout with node %d.\n", + o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, config_item_name(®->hr_item), + reg->hr_dev_name, master_node); + ret = o2hb_send_nego_msg(reg->hr_key, O2HB_NEGO_TIMEOUT_MSG, + master_node); + if (ret) + mlog(ML_ERROR, "send NEGO_TIMEOUT msg to node %d fail %d\n", + master_node, ret); + } +} + +static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct o2hb_region *reg = data; + struct o2hb_nego_msg *nego_msg; + + nego_msg = (struct o2hb_nego_msg *)msg->buf; + printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%s).\n", + nego_msg->node_num, config_item_name(®->hr_item), reg->hr_dev_name); + if (nego_msg->node_num < O2NM_MAX_NODES) + set_bit(nego_msg->node_num, reg->hr_nego_node_bitmap); + else + mlog(ML_ERROR, "got nego timeout message from bad node.\n"); + + return 0; +} + +static int o2hb_nego_approve_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct o2hb_region *reg = data; + + printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%s).\n", + config_item_name(®->hr_item), reg->hr_dev_name); + o2hb_arm_timeout(reg); + return 0; } static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc) @@ -1032,7 +1172,8 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) /* Skip disarming the timeout if own slot has stale/bad data */ if (own_slot_ok) { o2hb_set_quorum_device(reg); - o2hb_arm_write_timeout(reg); + o2hb_arm_timeout(reg); + reg->hr_last_timeout_start = jiffies; } bail: @@ -1096,6 +1237,7 @@ static int o2hb_thread(void *data) before_hb = ktime_get_real(); ret = o2hb_do_disk_heartbeat(reg); + reg->hr_last_hb_status = ret; after_hb = ktime_get_real(); @@ -1114,7 +1256,7 @@ static int o2hb_thread(void *data) } } - o2hb_disarm_write_timeout(reg); + o2hb_disarm_timeout(reg); /* unclean stop is only used in very bad situation */ for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++) @@ -1451,6 +1593,7 @@ static void o2hb_region_release(struct config_item *item) list_del(®->hr_all_item); spin_unlock(&o2hb_live_lock); + o2net_unregister_handler_list(®->hr_handler_list); kfree(reg); } @@ -1762,6 +1905,7 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, } INIT_DELAYED_WORK(®->hr_write_timeout_work, o2hb_write_timeout); + INIT_DELAYED_WORK(®->hr_nego_timeout_work, o2hb_nego_timeout); /* * A node is considered live after it has beat LIVE_THRESHOLD @@ -1995,13 +2139,37 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g config_item_init_type_name(®->hr_item, name, &o2hb_region_type); + /* this is the same way to generate msg key as dlm, for local heartbeat, + * name is also the same, so make initial crc value different to avoid + * message key conflict. + */ + reg->hr_key = crc32_le(reg->hr_region_num + O2NM_MAX_REGIONS, + name, strlen(name)); + INIT_LIST_HEAD(®->hr_handler_list); + ret = o2net_register_handler(O2HB_NEGO_TIMEOUT_MSG, reg->hr_key, + sizeof(struct o2hb_nego_msg), + o2hb_nego_timeout_handler, + reg, NULL, ®->hr_handler_list); + if (ret) + goto free; + + ret = o2net_register_handler(O2HB_NEGO_APPROVE_MSG, reg->hr_key, + sizeof(struct o2hb_nego_msg), + o2hb_nego_approve_handler, + reg, NULL, ®->hr_handler_list); + if (ret) + goto unregister_handler; + ret = o2hb_debug_region_init(reg, o2hb_debug_dir); if (ret) { config_item_put(®->hr_item); - goto free; + goto unregister_handler; } return ®->hr_item; + +unregister_handler: + o2net_unregister_handler_list(®->hr_handler_list); free: kfree(reg); return ERR_PTR(ret); diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index b95e7df5b76a..94b18369b1cc 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -44,6 +44,9 @@ * version here in tcp_internal.h should not need to be bumped for * filesystem locking changes. * + * New in version 12 + * - Negotiate hb timeout when storage is down. + * * New in version 11 * - Negotiation of filesystem locking in the dlm join. * @@ -75,7 +78,7 @@ * - full 64 bit i_size in the metadata lock lvbs * - introduction of "rw" lock and pushing meta/data locking down */ -#define O2NET_PROTOCOL_VERSION 11ULL +#define O2NET_PROTOCOL_VERSION 12ULL struct o2net_handshake { __be64 protocol_version; __be64 connector_id; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 0748777f2e2a..c56a7679df93 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -176,12 +176,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, } if (is_bad_inode(inode)) { iput(inode); - if ((flags & OCFS2_FI_FLAG_FILECHECK_CHK) || - (flags & OCFS2_FI_FLAG_FILECHECK_FIX)) - /* Return OCFS2_FILECHECK_ERR_XXX related errno */ - inode = ERR_PTR(rc); - else - inode = ERR_PTR(-ESTALE); + inode = ERR_PTR(rc); goto bail; } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index ad16995c9e7a..d2053853951e 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7254,10 +7254,11 @@ static int ocfs2_xattr_security_get(const struct xattr_handler *handler, } static int ocfs2_xattr_security_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY, + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, name, value, size, flags); } @@ -7325,10 +7326,11 @@ static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler, } static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED, + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value, size, flags); } @@ -7354,15 +7356,16 @@ static int ocfs2_xattr_user_get(const struct xattr_handler *handler, } static int ocfs2_xattr_user_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; - return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_USER, + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index 99c19545752c..5893ddde0e4b 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -448,13 +448,14 @@ out_unlock: } static int orangefs_xattr_set_default(const struct xattr_handler *handler, - struct dentry *dentry, + struct dentry *unused, + struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { - return orangefs_inode_setxattr(dentry->d_inode, + return orangefs_inode_setxattr(inode, ORANGEFS_XATTR_NAME_DEFAULT_PREFIX, name, buffer, @@ -478,13 +479,14 @@ static int orangefs_xattr_get_default(const struct xattr_handler *handler, } static int orangefs_xattr_set_trusted(const struct xattr_handler *handler, - struct dentry *dentry, + struct dentry *unused, + struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { - return orangefs_inode_setxattr(dentry->d_inode, + return orangefs_inode_setxattr(inode, ORANGEFS_XATTR_NAME_TRUSTED_PREFIX, name, buffer, diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index cc514da6f3e7..80aa6f1eb336 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -336,7 +336,6 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, struct dentry *upperdir; struct dentry *upperdentry; const struct cred *old_cred; - struct cred *override_cred; char *link = NULL; if (WARN_ON(!workdir)) @@ -357,28 +356,7 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, return PTR_ERR(link); } - err = -ENOMEM; - override_cred = prepare_creds(); - if (!override_cred) - goto out_free_link; - - override_cred->fsuid = stat->uid; - override_cred->fsgid = stat->gid; - /* - * CAP_SYS_ADMIN for copying up extended attributes - * CAP_DAC_OVERRIDE for create - * CAP_FOWNER for chmod, timestamp update - * CAP_FSETID for chmod - * CAP_CHOWN for chown - * CAP_MKNOD for mknod - */ - cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); - cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); - cap_raise(override_cred->cap_effective, CAP_FOWNER); - cap_raise(override_cred->cap_effective, CAP_FSETID); - cap_raise(override_cred->cap_effective, CAP_CHOWN); - cap_raise(override_cred->cap_effective, CAP_MKNOD); - old_cred = override_creds(override_cred); + old_cred = ovl_override_creds(dentry->d_sb); err = -EIO; if (lock_rename(workdir, upperdir) != NULL) { @@ -401,9 +379,7 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, out_unlock: unlock_rename(workdir, upperdir); revert_creds(old_cred); - put_cred(override_cred); -out_free_link: if (link) free_page((unsigned long) link); diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index b3fc0a35bf62..22f0253a3567 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -405,28 +405,13 @@ static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev, err = ovl_create_upper(dentry, inode, &stat, link, hardlink); } else { const struct cred *old_cred; - struct cred *override_cred; - err = -ENOMEM; - override_cred = prepare_creds(); - if (!override_cred) - goto out_iput; - - /* - * CAP_SYS_ADMIN for setting opaque xattr - * CAP_DAC_OVERRIDE for create in workdir, rename - * CAP_FOWNER for removing whiteout from sticky dir - */ - cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); - cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); - cap_raise(override_cred->cap_effective, CAP_FOWNER); - old_cred = override_creds(override_cred); + old_cred = ovl_override_creds(dentry->d_sb); err = ovl_create_over_whiteout(dentry, inode, &stat, link, hardlink); revert_creds(old_cred); - put_cred(override_cred); } if (!err) @@ -662,32 +647,11 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) if (OVL_TYPE_PURE_UPPER(type)) { err = ovl_remove_upper(dentry, is_dir); } else { - const struct cred *old_cred; - struct cred *override_cred; - - err = -ENOMEM; - override_cred = prepare_creds(); - if (!override_cred) - goto out_drop_write; - - /* - * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir - * CAP_DAC_OVERRIDE for create in workdir, rename - * CAP_FOWNER for removing whiteout from sticky dir - * CAP_FSETID for chmod of opaque dir - * CAP_CHOWN for chown of opaque dir - */ - cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); - cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); - cap_raise(override_cred->cap_effective, CAP_FOWNER); - cap_raise(override_cred->cap_effective, CAP_FSETID); - cap_raise(override_cred->cap_effective, CAP_CHOWN); - old_cred = override_creds(override_cred); + const struct cred *old_cred = ovl_override_creds(dentry->d_sb); err = ovl_remove_and_whiteout(dentry, is_dir); revert_creds(old_cred); - put_cred(override_cred); } out_drop_write: ovl_drop_write(dentry); @@ -725,7 +689,6 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, bool new_is_dir = false; struct dentry *opaquedir = NULL; const struct cred *old_cred = NULL; - struct cred *override_cred = NULL; err = -EINVAL; if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE)) @@ -794,26 +757,8 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, old_opaque = !OVL_TYPE_PURE_UPPER(old_type); new_opaque = !OVL_TYPE_PURE_UPPER(new_type); - if (old_opaque || new_opaque) { - err = -ENOMEM; - override_cred = prepare_creds(); - if (!override_cred) - goto out_drop_write; - - /* - * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir - * CAP_DAC_OVERRIDE for create in workdir - * CAP_FOWNER for removing whiteout from sticky dir - * CAP_FSETID for chmod of opaque dir - * CAP_CHOWN for chown of opaque dir - */ - cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); - cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); - cap_raise(override_cred->cap_effective, CAP_FOWNER); - cap_raise(override_cred->cap_effective, CAP_FSETID); - cap_raise(override_cred->cap_effective, CAP_CHOWN); - old_cred = override_creds(override_cred); - } + if (old_opaque || new_opaque) + old_cred = ovl_override_creds(old->d_sb); if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) { opaquedir = ovl_check_empty_and_clear(new); @@ -943,10 +888,8 @@ out_dput_old: out_unlock: unlock_rename(new_upperdir, old_upperdir); out_revert_creds: - if (old_opaque || new_opaque) { + if (old_opaque || new_opaque) revert_creds(old_cred); - put_cred(override_cred); - } out_drop_write: ovl_drop_write(old); out: diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index c7b31a03dc9c..0ed7c4012437 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -210,8 +210,9 @@ static bool ovl_is_private_xattr(const char *name) return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0; } -int ovl_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +int ovl_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { int err; struct dentry *upperdentry; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 99ec4b035237..4bd9b5ba8f42 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -153,6 +153,7 @@ void ovl_drop_write(struct dentry *dentry); bool ovl_dentry_is_opaque(struct dentry *dentry); void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque); bool ovl_is_whiteout(struct dentry *dentry); +const struct cred *ovl_override_creds(struct super_block *sb); void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry); struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); @@ -171,8 +172,9 @@ int ovl_check_d_type_supported(struct path *realpath); /* inode.c */ int ovl_setattr(struct dentry *dentry, struct iattr *attr); int ovl_permission(struct inode *inode, int mask); -int ovl_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags); +int ovl_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags); ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size); ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index da186ee4f846..cf37fc76fc9f 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -36,6 +36,7 @@ struct ovl_dir_cache { struct ovl_readdir_data { struct dir_context ctx; + struct dentry *dentry; bool is_lowest; struct rb_root root; struct list_head *list; @@ -206,21 +207,10 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) struct ovl_cache_entry *p; struct dentry *dentry; const struct cred *old_cred; - struct cred *override_cred; - - override_cred = prepare_creds(); - if (!override_cred) - return -ENOMEM; - /* - * CAP_DAC_OVERRIDE for lookup - */ - cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); - old_cred = override_creds(override_cred); + old_cred = ovl_override_creds(rdd->dentry->d_sb); - inode_lock(dir->d_inode); - err = 0; - // XXX: err = mutex_lock_killable(&dir->d_inode->i_mutex); + err = down_write_killable(&dir->d_inode->i_rwsem); if (!err) { while (rdd->first_maybe_whiteout) { p = rdd->first_maybe_whiteout; @@ -234,7 +224,6 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) inode_unlock(dir->d_inode); } revert_creds(old_cred); - put_cred(override_cred); return err; } @@ -290,6 +279,7 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list) struct path realpath; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_merge, + .dentry = dentry, .list = list, .root = RB_ROOT, .is_lowest = false, diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index ed53ae0fe868..ce02f46029da 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -42,6 +42,8 @@ struct ovl_fs { long lower_namelen; /* pathnames of lower and upper dirs, for show_options */ struct ovl_config config; + /* creds of process who forced instantiation of super block */ + const struct cred *creator_cred; }; struct ovl_dir_cache; @@ -265,6 +267,13 @@ bool ovl_is_whiteout(struct dentry *dentry) return inode && IS_WHITEOUT(inode); } +const struct cred *ovl_override_creds(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + return override_creds(ofs->creator_cred); +} + static bool ovl_is_opaquedir(struct dentry *dentry) { int res; @@ -603,6 +612,7 @@ static void ovl_put_super(struct super_block *sb) kfree(ufs->config.lowerdir); kfree(ufs->config.upperdir); kfree(ufs->config.workdir); + put_cred(ufs->creator_cred); kfree(ufs); } @@ -1064,16 +1074,19 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) /* * Upper should support d_type, else whiteouts are visible. * Given workdir and upper are on same fs, we can do - * iterate_dir() on workdir. + * iterate_dir() on workdir. This check requires successful + * creation of workdir in previous step. */ - err = ovl_check_d_type_supported(&workpath); - if (err < 0) - goto out_put_workdir; + if (ufs->workdir) { + err = ovl_check_d_type_supported(&workpath); + if (err < 0) + goto out_put_workdir; - if (!err) { - pr_err("overlayfs: upper fs needs to support d_type.\n"); - err = -EINVAL; - goto out_put_workdir; + if (!err) { + pr_err("overlayfs: upper fs needs to support d_type.\n"); + err = -EINVAL; + goto out_put_workdir; + } } } @@ -1108,10 +1121,14 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) else sb->s_d_op = &ovl_dentry_operations; + ufs->creator_cred = prepare_creds(); + if (!ufs->creator_cred) + goto out_put_lower_mnt; + err = -ENOMEM; oe = ovl_alloc_entry(numlower); if (!oe) - goto out_put_lower_mnt; + goto out_put_cred; root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, oe)); if (!root_dentry) @@ -1144,6 +1161,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) out_free_oe: kfree(oe); +out_put_cred: + put_cred(ufs->creator_cred); out_put_lower_mnt: for (i = 0; i < ufs->numlower; i++) mntput(ufs->lower_mnt[i]); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 2c60f17e7d92..8a4a266beff3 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -822,10 +822,10 @@ posix_acl_xattr_get(const struct xattr_handler *handler, static int posix_acl_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_backing_inode(dentry); struct posix_acl *acl = NULL; int ret; diff --git a/fs/readdir.c b/fs/readdir.c index 68ef06efe6bc..9d0212c374d6 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -35,13 +35,13 @@ int iterate_dir(struct file *file, struct dir_context *ctx) if (res) goto out; - if (shared) + if (shared) { inode_lock_shared(inode); - else - inode_lock(inode); - // res = mutex_lock_killable(&inode->i_mutex); - // if (res) - // goto out; + } else { + res = down_write_killable(&inode->i_rwsem); + if (res) + goto out; + } res = -ENOENT; if (!IS_DEADDIR(inode)) { diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 86aeb9dd805a..e4cbb7719906 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -20,13 +20,14 @@ security_get(const struct xattr_handler *handler, struct dentry *unused, } static int -security_set(const struct xattr_handler *handler, struct dentry *dentry, - const char *name, const void *buffer, size_t size, int flags) +security_set(const struct xattr_handler *handler, struct dentry *unused, + struct inode *inode, const char *name, const void *buffer, + size_t size, int flags) { - if (IS_PRIVATE(d_inode(dentry))) + if (IS_PRIVATE(inode)) return -EPERM; - return reiserfs_xattr_set(d_inode(dentry), + return reiserfs_xattr_set(inode, xattr_full_name(handler, name), buffer, size, flags); } diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index 31837f031f59..f15a5f9e84ce 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -19,13 +19,14 @@ trusted_get(const struct xattr_handler *handler, struct dentry *unused, } static int -trusted_set(const struct xattr_handler *handler, struct dentry *dentry, - const char *name, const void *buffer, size_t size, int flags) +trusted_set(const struct xattr_handler *handler, struct dentry *unused, + struct inode *inode, const char *name, const void *buffer, + size_t size, int flags) { - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry))) + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode)) return -EPERM; - return reiserfs_xattr_set(d_inode(dentry), + return reiserfs_xattr_set(inode, xattr_full_name(handler, name), buffer, size, flags); } diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index f7c39731684b..dc59df43b2db 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -17,12 +17,13 @@ user_get(const struct xattr_handler *handler, struct dentry *unused, } static int -user_set(const struct xattr_handler *handler, struct dentry *dentry, - const char *name, const void *buffer, size_t size, int flags) +user_set(const struct xattr_handler *handler, struct dentry *unused, + struct inode *inode, const char *name, const void *buffer, + size_t size, int flags) { - if (!reiserfs_xattrs_user(dentry->d_sb)) + if (!reiserfs_xattrs_user(inode->i_sb)) return -EOPNOTSUPP; - return reiserfs_xattr_set(d_inode(dentry), + return reiserfs_xattr_set(inode, xattr_full_name(handler, name), buffer, size, flags); } diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 595ca0debe11..69e287e20732 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -260,7 +260,7 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) pr_err("\txattr_names %u\n", ui->xattr_names); pr_err("\tdirty %u\n", ui->dirty); pr_err("\txattr %u\n", ui->xattr); - pr_err("\tbulk_read %u\n", ui->xattr); + pr_err("\tbulk_read %u\n", ui->bulk_read); pr_err("\tsynced_i_size %llu\n", (unsigned long long)ui->synced_i_size); pr_err("\tui_size %llu\n", diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 6c277eb6aef9..b5fc27969e9d 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -579,11 +579,10 @@ static int ubifs_xattr_get(const struct xattr_handler *handler, } static int ubifs_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); - dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd", name, inode->i_ino, dentry, size); diff --git a/fs/xattr.c b/fs/xattr.c index fc81e771488a..4beafc43daa5 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -100,7 +100,7 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, if (issec) inode->i_flags &= ~S_NOSEC; if (inode->i_op->setxattr) { - error = inode->i_op->setxattr(dentry, name, value, size, flags); + error = inode->i_op->setxattr(dentry, inode, name, value, size, flags); if (!error) { fsnotify_xattr(dentry); security_inode_post_setxattr(dentry, name, value, @@ -745,7 +745,8 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) * Find the handler for the prefix and dispatch its set() operation. */ int -generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) +generic_setxattr(struct dentry *dentry, struct inode *inode, const char *name, + const void *value, size_t size, int flags) { const struct xattr_handler *handler; @@ -754,7 +755,7 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (IS_ERR(handler)) return PTR_ERR(handler); - return handler->set(handler, dentry, name, value, size, flags); + return handler->set(handler, dentry, inode, name, value, size, flags); } /* @@ -769,7 +770,8 @@ generic_removexattr(struct dentry *dentry, const char *name) handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (IS_ERR(handler)) return PTR_ERR(handler); - return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE); + return handler->set(handler, dentry, d_inode(dentry), name, NULL, + 0, XATTR_REPLACE); } EXPORT_SYMBOL(generic_getxattr); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 613ea2d7ac19..586bb64e674b 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -72,18 +72,11 @@ xfs_zero_extent( struct xfs_mount *mp = ip->i_mount; xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb); sector_t block = XFS_BB_TO_FSBT(mp, sector); - ssize_t size = XFS_FSB_TO_B(mp, count_fsb); - - if (IS_DAX(VFS_I(ip))) - return dax_clear_sectors(xfs_find_bdev_for_inode(VFS_I(ip)), - sector, size); - - /* - * let the block layer decide on the fastest method of - * implementing the zeroing. - */ - return sb_issue_zeroout(mp->m_super, block, count_fsb, GFP_NOFS); + return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)), + block << (mp->m_super->s_blocksize_bits - 9), + count_fsb << (mp->m_super->s_blocksize_bits - 9), + GFP_NOFS, true); } /* diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 44af22897c8b..47fc63295422 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1551,7 +1551,7 @@ xfs_filemap_page_mkwrite( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL); + ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault); } else { ret = block_page_mkwrite(vma, vmf, xfs_get_blocks); ret = block_page_mkwrite_return(ret); @@ -1585,7 +1585,7 @@ xfs_filemap_fault( * changes to xfs_get_blocks_direct() to map unwritten extent * ioend for conversion on read-only mappings. */ - ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL); + ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault); } else ret = filemap_fault(vma, vmf); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); @@ -1622,8 +1622,7 @@ xfs_filemap_pmd_fault( } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault, - NULL); + ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (flags & FAULT_FLAG_WRITE) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 416421d7ff10..11ea5d51db56 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1555,14 +1555,12 @@ xfs_fs_fill_super( if (mp->m_flags & XFS_MOUNT_DAX) { xfs_warn(mp, - "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - if (sb->s_blocksize != PAGE_SIZE) { - xfs_alert(mp, - "Filesystem block size invalid for DAX Turning DAX off."); - mp->m_flags &= ~XFS_MOUNT_DAX; - } else if (!sb->s_bdev->bd_disk->fops->direct_access) { + "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); + + error = bdev_dax_supported(sb, sb->s_blocksize); + if (error) { xfs_alert(mp, - "Block device does not support DAX Turning DAX off."); + "DAX unsupported by block device. Turning off DAX."); mp->m_flags &= ~XFS_MOUNT_DAX; } } diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index ec58ff094b1d..ea62245fee26 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -74,11 +74,12 @@ xfs_forget_acl( } static int -xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry, - const char *name, const void *value, size_t size, int flags) +xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused, + struct inode *inode, const char *name, const void *value, + size_t size, int flags) { int xflags = handler->flags; - struct xfs_inode *ip = XFS_I(d_inode(dentry)); + struct xfs_inode *ip = XFS_I(inode); int error; /* Convert Linux syscall to XFS internal ATTR flags */ @@ -92,7 +93,7 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry, error = xfs_attr_set(ip, (unsigned char *)name, (void *)value, size, xflags); if (!error) - xfs_forget_acl(d_inode(dentry), name, xflags); + xfs_forget_acl(inode, name, xflags); return error; } |