Diffstat (limited to 'fs/fuse')
-rw-r--r-- | fs/fuse/Kconfig     |   18
-rw-r--r-- | fs/fuse/Makefile    |    6
-rw-r--r-- | fs/fuse/acl.c       |   21
-rw-r--r-- | fs/fuse/control.c   |   29
-rw-r--r-- | fs/fuse/cuse.c      |   35
-rw-r--r-- | fs/fuse/dax.c       | 1390
-rw-r--r-- | fs/fuse/dev.c       |  296
-rw-r--r-- | fs/fuse/dir.c       |  569
-rw-r--r-- | fs/fuse/file.c      | 1332
-rw-r--r-- | fs/fuse/fuse_i.h    |  321
-rw-r--r-- | fs/fuse/inode.c     |  885
-rw-r--r-- | fs/fuse/ioctl.c     |  504
-rw-r--r-- | fs/fuse/readdir.c   |   37
-rw-r--r-- | fs/fuse/virtio_fs.c |  557
-rw-r--r-- | fs/fuse/xattr.c     |   64
15 files changed, 4541 insertions(+), 1523 deletions(-)
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index eb2a585572dc..038ed0b9aaa5 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -8,11 +8,11 @@ config FUSE_FS There's also a companion library: libfuse2. This library is available from the FUSE homepage: - <http://fuse.sourceforge.net/> + <https://github.com/libfuse/> although chances are your distribution already has that library installed if you've installed the "fuse" package itself. - See <file:Documentation/filesystems/fuse.txt> for more information. + See <file:Documentation/filesystems/fuse.rst> for more information. See <file:Documentation/Changes> for needed library/utility version. If you want to develop a userspace FS, or if you want to use @@ -38,3 +38,17 @@ config VIRTIO_FS If you want to share files between guests or with the host, answer Y or M. + +config FUSE_DAX + bool "Virtio Filesystem Direct Host Memory Access support" + default y + select INTERVAL_TREE + depends on VIRTIO_FS + depends on FS_DAX + depends on DAX + help + This allows bypassing guest page cache and allows mapping host page + cache directly in guest address space. + + If you want to allow mounting a Virtio Filesystem with the "dax" + option, answer Y. diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 3e8cebfb59b7..0c48b35c058d 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -7,5 +7,7 @@ obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o obj-$(CONFIG_VIRTIO_FS) += virtiofs.o -fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o -virtiofs-y += virtio_fs.o +fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o +fuse-$(CONFIG_FUSE_DAX) += dax.o + +virtiofs-y := virtio_fs.o diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index 5a48cee6d7d3..337cb29a8dd5 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -11,7 +11,7 @@ #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> -struct posix_acl *fuse_get_acl(struct inode *inode, int type) +struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu) { struct fuse_conn *fc = get_fuse_conn(inode); int size; @@ -19,6 +19,12 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type) void *value = NULL; struct posix_acl *acl; + if (rcu) + return ERR_PTR(-ECHILD); + + if (fuse_is_bad(inode)) + return ERR_PTR(-EIO); + if (!fc->posix_acl || fc->no_getxattr) return NULL; @@ -47,12 +53,16 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type) return acl; } -int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type) +int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { struct fuse_conn *fc = get_fuse_conn(inode); const char *name; int ret; + if (fuse_is_bad(inode)) + return -EIO; + if (!fc->posix_acl || fc->no_setxattr) return -EOPNOTSUPP; @@ -64,6 +74,7 @@ int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type) return -EINVAL; if (acl) { + unsigned int extra_flags = 0; /* * Fuse userspace is responsible for updating access * permissions in the inode, if needed. 
fuse_setxattr @@ -87,7 +98,11 @@ int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type) return ret; } - ret = fuse_setxattr(inode, name, value, size, 0); + if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) && + !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) + extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID; + + ret = fuse_setxattr(inode, name, value, size, 0, extra_flags); kfree(value); } else { ret = fuse_removexattr(inode, name); diff --git a/fs/fuse/control.c b/fs/fuse/control.c index c23f6f243ad4..247ef4f76761 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -120,7 +120,7 @@ static ssize_t fuse_conn_max_background_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - unsigned uninitialized_var(val); + unsigned val; ssize_t ret; ret = fuse_conn_limit_write(file, buf, count, ppos, &val, @@ -162,7 +162,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - unsigned uninitialized_var(val); + unsigned val; struct fuse_conn *fc; ssize_t ret; @@ -174,18 +174,11 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, if (!fc) goto out; + down_read(&fc->killsb); spin_lock(&fc->bg_lock); fc->congestion_threshold = val; - if (fc->sb) { - if (fc->num_background < fc->congestion_threshold) { - clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); - clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); - } else { - set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); - set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); - } - } spin_unlock(&fc->bg_lock); + up_read(&fc->killsb); fuse_conn_put(fc); out: return ret; @@ -265,7 +258,7 @@ int fuse_ctl_add_conn(struct fuse_conn *fc) struct dentry *parent; char name[32]; - if (!fuse_control_sb) + if (!fuse_control_sb || fc->no_control) return 0; parent = fuse_control_sb->s_root; @@ -303,7 +296,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc) { int i; - if (!fuse_control_sb) + if (!fuse_control_sb || fc->no_control) return; for (i = fc->ctl_ndents - 1; i >= 0; i--) { @@ -318,7 +311,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc) drop_nlink(d_inode(fuse_control_sb->s_root)); } -static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fctx) +static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fsc) { static const struct tree_descr empty_descr = {""}; struct fuse_conn *fc; @@ -344,18 +337,18 @@ static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fctx) return 0; } -static int fuse_ctl_get_tree(struct fs_context *fc) +static int fuse_ctl_get_tree(struct fs_context *fsc) { - return get_tree_single(fc, fuse_ctl_fill_super); + return get_tree_single(fsc, fuse_ctl_fill_super); } static const struct fs_context_operations fuse_ctl_context_ops = { .get_tree = fuse_ctl_get_tree, }; -static int fuse_ctl_init_fs_context(struct fs_context *fc) +static int fuse_ctl_init_fs_context(struct fs_context *fsc) { - fc->ops = &fuse_ctl_context_ops; + fsc->ops = &fuse_ctl_context_ops; return 0; } diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 030f094910c3..c7d882a9fe33 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -57,6 +57,7 @@ struct cuse_conn { struct list_head list; /* linked on cuse_conntbl */ + struct fuse_mount fm; /* Dummy mount referencing fc */ struct fuse_conn fc; /* fuse connection */ struct cdev *cdev; /* associated character device */ struct device *dev; /* device representing @cdev */ @@ -134,7 +135,7 @@ static int cuse_open(struct inode *inode, struct 
file *file) * Generic permission check is already done against the chrdev * file, proceed to open. */ - rc = fuse_do_open(&cc->fc, 0, file, 0); + rc = fuse_do_open(&cc->fm, 0, file, 0); if (rc) fuse_conn_put(&cc->fc); return rc; @@ -143,10 +144,10 @@ static int cuse_open(struct inode *inode, struct file *file) static int cuse_release(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; fuse_sync_release(NULL, ff, file->f_flags); - fuse_conn_put(fc); + fuse_conn_put(fm->fc); return 0; } @@ -155,7 +156,7 @@ static long cuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fuse_file *ff = file->private_data; - struct cuse_conn *cc = fc_to_cc(ff->fc); + struct cuse_conn *cc = fc_to_cc(ff->fm->fc); unsigned int flags = 0; if (cc->unrestricted_ioctl) @@ -168,7 +169,7 @@ static long cuse_file_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fuse_file *ff = file->private_data; - struct cuse_conn *cc = fc_to_cc(ff->fc); + struct cuse_conn *cc = fc_to_cc(ff->fm->fc); unsigned int flags = FUSE_IOCTL_COMPAT; if (cc->unrestricted_ioctl) @@ -270,7 +271,7 @@ static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp) static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo) { char *end = p + len; - char *uninitialized_var(key), *uninitialized_var(val); + char *key, *val; int rc; while (true) { @@ -313,9 +314,10 @@ struct cuse_init_args { * required data structures for it. Please read the comment at the * top of this file for high level overview. */ -static void cuse_process_init_reply(struct fuse_conn *fc, +static void cuse_process_init_reply(struct fuse_mount *fm, struct fuse_args *args, int error) { + struct fuse_conn *fc = fm->fc; struct cuse_init_args *ia = container_of(args, typeof(*ia), ap.args); struct fuse_args_pages *ap = &ia->ap; struct cuse_conn *cc = fc_to_cc(fc), *pos; @@ -424,7 +426,7 @@ static int cuse_send_init(struct cuse_conn *cc) { int rc; struct page *page; - struct fuse_conn *fc = &cc->fc; + struct fuse_mount *fm = &cc->fm; struct cuse_init_args *ia; struct fuse_args_pages *ap; @@ -460,7 +462,7 @@ static int cuse_send_init(struct cuse_conn *cc) ia->desc.length = ap->args.out_args[1].size; ap->args.end = cuse_process_init_reply; - rc = fuse_simple_background(fc, &ap->args, GFP_KERNEL); + rc = fuse_simple_background(fm, &ap->args, GFP_KERNEL); if (rc) { kfree(ia); err_free_page: @@ -506,22 +508,21 @@ static int cuse_channel_open(struct inode *inode, struct file *file) * Limit the cuse channel to requests that can * be represented in file->f_cred->user_ns. 
*/ - fuse_conn_init(&cc->fc, file->f_cred->user_ns, &fuse_dev_fiq_ops, NULL); + fuse_conn_init(&cc->fc, &cc->fm, file->f_cred->user_ns, + &fuse_dev_fiq_ops, NULL); + cc->fc.release = cuse_fc_release; fud = fuse_dev_alloc_install(&cc->fc); - if (!fud) { - kfree(cc); + fuse_conn_put(&cc->fc); + if (!fud) return -ENOMEM; - } INIT_LIST_HEAD(&cc->list); - cc->fc.release = cuse_fc_release; cc->fc.initialized = 1; rc = cuse_send_init(cc); if (rc) { fuse_dev_free(fud); - fuse_conn_put(&cc->fc); return rc; } file->private_data = fud; @@ -558,8 +559,6 @@ static int cuse_channel_release(struct inode *inode, struct file *file) unregister_chrdev_region(cc->cdev->dev, 1); cdev_del(cc->cdev); } - /* Base reference is now owned by "fud" */ - fuse_conn_put(&cc->fc); rc = fuse_dev_release(inode, file); /* puts the base reference */ @@ -624,6 +623,8 @@ static int __init cuse_init(void) cuse_channel_fops.owner = THIS_MODULE; cuse_channel_fops.open = cuse_channel_open; cuse_channel_fops.release = cuse_channel_release; + /* CUSE is not prepared for FUSE_DEV_IOC_CLONE */ + cuse_channel_fops.unlocked_ioctl = NULL; cuse_class = class_create(THIS_MODULE, "cuse"); if (IS_ERR(cuse_class)) diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c new file mode 100644 index 000000000000..e23e802a8013 --- /dev/null +++ b/fs/fuse/dax.c @@ -0,0 +1,1390 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * dax: direct host memory access + * Copyright (C) 2020 Red Hat, Inc. + */ + +#include "fuse_i.h" + +#include <linux/delay.h> +#include <linux/dax.h> +#include <linux/uio.h> +#include <linux/pagemap.h> +#include <linux/pfn_t.h> +#include <linux/iomap.h> +#include <linux/interval_tree.h> + +/* + * Default memory range size. A power of 2 so it agrees with common FUSE_INIT + * map_alignment values 4KB and 64KB. + */ +#define FUSE_DAX_SHIFT 21 +#define FUSE_DAX_SZ (1 << FUSE_DAX_SHIFT) +#define FUSE_DAX_PAGES (FUSE_DAX_SZ / PAGE_SIZE) + +/* Number of ranges reclaimer will try to free in one invocation */ +#define FUSE_DAX_RECLAIM_CHUNK (10) + +/* + * Dax memory reclaim threshold in percetage of total ranges. When free + * number of free ranges drops below this threshold, reclaim can trigger + * Default is 20% + */ +#define FUSE_DAX_RECLAIM_THRESHOLD (20) + +/** Translation information for file offsets to DAX window offsets */ +struct fuse_dax_mapping { + /* Pointer to inode where this memory range is mapped */ + struct inode *inode; + + /* Will connect in fcd->free_ranges to keep track of free memory */ + struct list_head list; + + /* For interval tree in file/inode */ + struct interval_tree_node itn; + + /* Will connect in fc->busy_ranges to keep track busy memory */ + struct list_head busy_list; + + /** Position in DAX window */ + u64 window_offset; + + /** Length of mapping, in bytes */ + loff_t length; + + /* Is this mapping read-only or read-write */ + bool writable; + + /* reference count when the mapping is used by dax iomap. 
*/ + refcount_t refcnt; +}; + +/* Per-inode dax map */ +struct fuse_inode_dax { + /* Semaphore to protect modifications to the dmap tree */ + struct rw_semaphore sem; + + /* Sorted rb tree of struct fuse_dax_mapping elements */ + struct rb_root_cached tree; + unsigned long nr; +}; + +struct fuse_conn_dax { + /* DAX device */ + struct dax_device *dev; + + /* Lock protecting accessess to members of this structure */ + spinlock_t lock; + + /* List of memory ranges which are busy */ + unsigned long nr_busy_ranges; + struct list_head busy_ranges; + + /* Worker to free up memory ranges */ + struct delayed_work free_work; + + /* Wait queue for a dax range to become free */ + wait_queue_head_t range_waitq; + + /* DAX Window Free Ranges */ + long nr_free_ranges; + struct list_head free_ranges; + + unsigned long nr_ranges; +}; + +static inline struct fuse_dax_mapping * +node_to_dmap(struct interval_tree_node *node) +{ + if (!node) + return NULL; + + return container_of(node, struct fuse_dax_mapping, itn); +} + +static struct fuse_dax_mapping * +alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode); + +static void +__kick_dmap_free_worker(struct fuse_conn_dax *fcd, unsigned long delay_ms) +{ + unsigned long free_threshold; + + /* If number of free ranges are below threshold, start reclaim */ + free_threshold = max_t(unsigned long, fcd->nr_ranges * FUSE_DAX_RECLAIM_THRESHOLD / 100, + 1); + if (fcd->nr_free_ranges < free_threshold) + queue_delayed_work(system_long_wq, &fcd->free_work, + msecs_to_jiffies(delay_ms)); +} + +static void kick_dmap_free_worker(struct fuse_conn_dax *fcd, + unsigned long delay_ms) +{ + spin_lock(&fcd->lock); + __kick_dmap_free_worker(fcd, delay_ms); + spin_unlock(&fcd->lock); +} + +static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn_dax *fcd) +{ + struct fuse_dax_mapping *dmap; + + spin_lock(&fcd->lock); + dmap = list_first_entry_or_null(&fcd->free_ranges, + struct fuse_dax_mapping, list); + if (dmap) { + list_del_init(&dmap->list); + WARN_ON(fcd->nr_free_ranges <= 0); + fcd->nr_free_ranges--; + } + __kick_dmap_free_worker(fcd, 0); + spin_unlock(&fcd->lock); + + return dmap; +} + +/* This assumes fcd->lock is held */ +static void __dmap_remove_busy_list(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + list_del_init(&dmap->busy_list); + WARN_ON(fcd->nr_busy_ranges == 0); + fcd->nr_busy_ranges--; +} + +static void dmap_remove_busy_list(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + spin_lock(&fcd->lock); + __dmap_remove_busy_list(fcd, dmap); + spin_unlock(&fcd->lock); +} + +/* This assumes fcd->lock is held */ +static void __dmap_add_to_free_pool(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + list_add_tail(&dmap->list, &fcd->free_ranges); + fcd->nr_free_ranges++; + wake_up(&fcd->range_waitq); +} + +static void dmap_add_to_free_pool(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + /* Return fuse_dax_mapping to free list */ + spin_lock(&fcd->lock); + __dmap_add_to_free_pool(fcd, dmap); + spin_unlock(&fcd->lock); +} + +static int fuse_setup_one_mapping(struct inode *inode, unsigned long start_idx, + struct fuse_dax_mapping *dmap, bool writable, + bool upgrade) +{ + struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_conn_dax *fcd = fm->fc->dax; + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_setupmapping_in inarg; + loff_t offset = start_idx << FUSE_DAX_SHIFT; + FUSE_ARGS(args); + ssize_t err; + + WARN_ON(fcd->nr_free_ranges < 0); + + /* Ask fuse daemon to 
setup mapping */ + memset(&inarg, 0, sizeof(inarg)); + inarg.foffset = offset; + inarg.fh = -1; + inarg.moffset = dmap->window_offset; + inarg.len = FUSE_DAX_SZ; + inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ; + if (writable) + inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE; + args.opcode = FUSE_SETUPMAPPING; + args.nodeid = fi->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + err = fuse_simple_request(fm, &args); + if (err < 0) + return err; + dmap->writable = writable; + if (!upgrade) { + /* + * We don't take a reference on inode. inode is valid right now + * and when inode is going away, cleanup logic should first + * cleanup dmap entries. + */ + dmap->inode = inode; + dmap->itn.start = dmap->itn.last = start_idx; + /* Protected by fi->dax->sem */ + interval_tree_insert(&dmap->itn, &fi->dax->tree); + fi->dax->nr++; + spin_lock(&fcd->lock); + list_add_tail(&dmap->busy_list, &fcd->busy_ranges); + fcd->nr_busy_ranges++; + spin_unlock(&fcd->lock); + } + return 0; +} + +static int fuse_send_removemapping(struct inode *inode, + struct fuse_removemapping_in *inargp, + struct fuse_removemapping_one *remove_one) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_mount *fm = get_fuse_mount(inode); + FUSE_ARGS(args); + + args.opcode = FUSE_REMOVEMAPPING; + args.nodeid = fi->nodeid; + args.in_numargs = 2; + args.in_args[0].size = sizeof(*inargp); + args.in_args[0].value = inargp; + args.in_args[1].size = inargp->count * sizeof(*remove_one); + args.in_args[1].value = remove_one; + return fuse_simple_request(fm, &args); +} + +static int dmap_removemapping_list(struct inode *inode, unsigned int num, + struct list_head *to_remove) +{ + struct fuse_removemapping_one *remove_one, *ptr; + struct fuse_removemapping_in inarg; + struct fuse_dax_mapping *dmap; + int ret, i = 0, nr_alloc; + + nr_alloc = min_t(unsigned int, num, FUSE_REMOVEMAPPING_MAX_ENTRY); + remove_one = kmalloc_array(nr_alloc, sizeof(*remove_one), GFP_NOFS); + if (!remove_one) + return -ENOMEM; + + ptr = remove_one; + list_for_each_entry(dmap, to_remove, list) { + ptr->moffset = dmap->window_offset; + ptr->len = dmap->length; + ptr++; + i++; + num--; + if (i >= nr_alloc || num == 0) { + memset(&inarg, 0, sizeof(inarg)); + inarg.count = i; + ret = fuse_send_removemapping(inode, &inarg, + remove_one); + if (ret) + goto out; + ptr = remove_one; + i = 0; + } + } +out: + kfree(remove_one); + return ret; +} + +/* + * Cleanup dmap entry and add back to free list. This should be called with + * fcd->lock held. + */ +static void dmap_reinit_add_to_free_pool(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + pr_debug("fuse: freeing memory range start_idx=0x%lx end_idx=0x%lx window_offset=0x%llx length=0x%llx\n", + dmap->itn.start, dmap->itn.last, dmap->window_offset, + dmap->length); + __dmap_remove_busy_list(fcd, dmap); + dmap->inode = NULL; + dmap->itn.start = dmap->itn.last = 0; + __dmap_add_to_free_pool(fcd, dmap); +} + +/* + * Free inode dmap entries whose range falls inside [start, end]. + * Does not take any locks. At this point of time it should only be + * called from evict_inode() path where we know all dmap entries can be + * reclaimed. 
+ */ +static void inode_reclaim_dmap_range(struct fuse_conn_dax *fcd, + struct inode *inode, + loff_t start, loff_t end) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_dax_mapping *dmap, *n; + int err, num = 0; + LIST_HEAD(to_remove); + unsigned long start_idx = start >> FUSE_DAX_SHIFT; + unsigned long end_idx = end >> FUSE_DAX_SHIFT; + struct interval_tree_node *node; + + while (1) { + node = interval_tree_iter_first(&fi->dax->tree, start_idx, + end_idx); + if (!node) + break; + dmap = node_to_dmap(node); + /* inode is going away. There should not be any users of dmap */ + WARN_ON(refcount_read(&dmap->refcnt) > 1); + interval_tree_remove(&dmap->itn, &fi->dax->tree); + num++; + list_add(&dmap->list, &to_remove); + } + + /* Nothing to remove */ + if (list_empty(&to_remove)) + return; + + WARN_ON(fi->dax->nr < num); + fi->dax->nr -= num; + err = dmap_removemapping_list(inode, num, &to_remove); + if (err && err != -ENOTCONN) { + pr_warn("Failed to removemappings. start=0x%llx end=0x%llx\n", + start, end); + } + spin_lock(&fcd->lock); + list_for_each_entry_safe(dmap, n, &to_remove, list) { + list_del_init(&dmap->list); + dmap_reinit_add_to_free_pool(fcd, dmap); + } + spin_unlock(&fcd->lock); +} + +static int dmap_removemapping_one(struct inode *inode, + struct fuse_dax_mapping *dmap) +{ + struct fuse_removemapping_one forget_one; + struct fuse_removemapping_in inarg; + + memset(&inarg, 0, sizeof(inarg)); + inarg.count = 1; + memset(&forget_one, 0, sizeof(forget_one)); + forget_one.moffset = dmap->window_offset; + forget_one.len = dmap->length; + + return fuse_send_removemapping(inode, &inarg, &forget_one); +} + +/* + * It is called from evict_inode() and by that time inode is going away. So + * this function does not take any locks like fi->dax->sem for traversing + * that fuse inode interval tree. If that lock is taken then lock validator + * complains of deadlock situation w.r.t fs_reclaim lock. + */ +void fuse_dax_inode_cleanup(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + /* + * fuse_evict_inode() has already called truncate_inode_pages_final() + * before we arrive here. So we should not have to worry about any + * pages/exception entries still associated with inode. + */ + inode_reclaim_dmap_range(fc->dax, inode, 0, -1); + WARN_ON(fi->dax->nr); +} + +static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length) +{ + iomap->addr = IOMAP_NULL_ADDR; + iomap->length = length; + iomap->type = IOMAP_HOLE; +} + +static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length, + struct iomap *iomap, struct fuse_dax_mapping *dmap, + unsigned int flags) +{ + loff_t offset, len; + loff_t i_size = i_size_read(inode); + + offset = pos - (dmap->itn.start << FUSE_DAX_SHIFT); + len = min(length, dmap->length - offset); + + /* If length is beyond end of file, truncate further */ + if (pos + len > i_size) + len = i_size - pos; + + if (len > 0) { + iomap->addr = dmap->window_offset + offset; + iomap->length = len; + if (flags & IOMAP_FAULT) + iomap->length = ALIGN(len, PAGE_SIZE); + iomap->type = IOMAP_MAPPED; + /* + * increace refcnt so that reclaim code knows this dmap is in + * use. This assumes fi->dax->sem mutex is held either + * shared/exclusive. 
+ */ + refcount_inc(&dmap->refcnt); + + /* iomap->private should be NULL */ + WARN_ON_ONCE(iomap->private); + iomap->private = dmap; + } else { + /* Mapping beyond end of file is hole */ + fuse_fill_iomap_hole(iomap, length); + } +} + +static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos, + loff_t length, unsigned int flags, + struct iomap *iomap) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_conn_dax *fcd = fc->dax; + struct fuse_dax_mapping *dmap, *alloc_dmap = NULL; + int ret; + bool writable = flags & IOMAP_WRITE; + unsigned long start_idx = pos >> FUSE_DAX_SHIFT; + struct interval_tree_node *node; + + /* + * Can't do inline reclaim in fault path. We call + * dax_layout_busy_page() before we free a range. And + * fuse_wait_dax_page() drops mapping->invalidate_lock and requires it. + * In fault path we enter with mapping->invalidate_lock held and can't + * drop it. Also in fault path we hold mapping->invalidate_lock shared + * and not exclusive, so that creates further issues with + * fuse_wait_dax_page(). Hence return -EAGAIN and fuse_dax_fault() + * will wait for a memory range to become free and retry. + */ + if (flags & IOMAP_FAULT) { + alloc_dmap = alloc_dax_mapping(fcd); + if (!alloc_dmap) + return -EAGAIN; + } else { + alloc_dmap = alloc_dax_mapping_reclaim(fcd, inode); + if (IS_ERR(alloc_dmap)) + return PTR_ERR(alloc_dmap); + } + + /* If we are here, we should have memory allocated */ + if (WARN_ON(!alloc_dmap)) + return -EIO; + + /* + * Take write lock so that only one caller can try to setup mapping + * and other waits. + */ + down_write(&fi->dax->sem); + /* + * We dropped lock. Check again if somebody else setup + * mapping already. + */ + node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx); + if (node) { + dmap = node_to_dmap(node); + fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); + dmap_add_to_free_pool(fcd, alloc_dmap); + up_write(&fi->dax->sem); + return 0; + } + + /* Setup one mapping */ + ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, alloc_dmap, + writable, false); + if (ret < 0) { + dmap_add_to_free_pool(fcd, alloc_dmap); + up_write(&fi->dax->sem); + return ret; + } + fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags); + up_write(&fi->dax->sem); + return 0; +} + +static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos, + loff_t length, unsigned int flags, + struct iomap *iomap) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_dax_mapping *dmap; + int ret; + unsigned long idx = pos >> FUSE_DAX_SHIFT; + struct interval_tree_node *node; + + /* + * Take exclusive lock so that only one caller can try to setup + * mapping and others wait. + */ + down_write(&fi->dax->sem); + node = interval_tree_iter_first(&fi->dax->tree, idx, idx); + + /* We are holding either inode lock or invalidate_lock, and that should + * ensure that dmap can't be truncated. We are holding a reference + * on dmap and that should make sure it can't be reclaimed. So dmap + * should still be there in tree despite the fact we dropped and + * re-acquired the fi->dax->sem lock. + */ + ret = -EIO; + if (WARN_ON(!node)) + goto out_err; + + dmap = node_to_dmap(node); + + /* We took an extra reference on dmap to make sure its not reclaimd. + * Now we hold fi->dax->sem lock and that reference is not needed + * anymore. Drop it. + */ + if (refcount_dec_and_test(&dmap->refcnt)) { + /* refcount should not hit 0. 
This object only goes + * away when fuse connection goes away + */ + WARN_ON_ONCE(1); + } + + /* Maybe another thread already upgraded mapping while we were not + * holding lock. + */ + if (dmap->writable) { + ret = 0; + goto out_fill_iomap; + } + + ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, dmap, true, + true); + if (ret < 0) + goto out_err; +out_fill_iomap: + fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); +out_err: + up_write(&fi->dax->sem); + return ret; +} + +/* This is just for DAX and the mapping is ephemeral, do not use it for other + * purposes since there is no block device with a permanent mapping. + */ +static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length, + unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_dax_mapping *dmap; + bool writable = flags & IOMAP_WRITE; + unsigned long start_idx = pos >> FUSE_DAX_SHIFT; + struct interval_tree_node *node; + + /* We don't support FIEMAP */ + if (WARN_ON(flags & IOMAP_REPORT)) + return -EIO; + + iomap->offset = pos; + iomap->flags = 0; + iomap->bdev = NULL; + iomap->dax_dev = fc->dax->dev; + + /* + * Both read/write and mmap path can race here. So we need something + * to make sure if we are setting up mapping, then other path waits + * + * For now, use a semaphore for this. It probably needs to be + * optimized later. + */ + down_read(&fi->dax->sem); + node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx); + if (node) { + dmap = node_to_dmap(node); + if (writable && !dmap->writable) { + /* Upgrade read-only mapping to read-write. This will + * require exclusive fi->dax->sem lock as we don't want + * two threads to be trying to this simultaneously + * for same dmap. So drop shared lock and acquire + * exclusive lock. + * + * Before dropping fi->dax->sem lock, take reference + * on dmap so that its not freed by range reclaim. + */ + refcount_inc(&dmap->refcnt); + up_read(&fi->dax->sem); + pr_debug("%s: Upgrading mapping at offset 0x%llx length 0x%llx\n", + __func__, pos, length); + return fuse_upgrade_dax_mapping(inode, pos, length, + flags, iomap); + } else { + fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); + up_read(&fi->dax->sem); + return 0; + } + } else { + up_read(&fi->dax->sem); + pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n", + __func__, pos, length); + if (pos >= i_size_read(inode)) + goto iomap_hole; + + return fuse_setup_new_dax_mapping(inode, pos, length, flags, + iomap); + } + + /* + * If read beyond end of file happens, fs code seems to return + * it as hole + */ +iomap_hole: + fuse_fill_iomap_hole(iomap, length); + pr_debug("%s returning hole mapping. pos=0x%llx length_asked=0x%llx length_returned=0x%llx\n", + __func__, pos, length, iomap->length); + return 0; +} + +static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length, + ssize_t written, unsigned int flags, + struct iomap *iomap) +{ + struct fuse_dax_mapping *dmap = iomap->private; + + if (dmap) { + if (refcount_dec_and_test(&dmap->refcnt)) { + /* refcount should not hit 0. This object only goes + * away when fuse connection goes away + */ + WARN_ON_ONCE(1); + } + } + + /* DAX writes beyond end-of-file aren't handled using iomap, so the + * file size is unchanged and there is nothing to do here. 
+ */ + return 0; +} + +static const struct iomap_ops fuse_iomap_ops = { + .iomap_begin = fuse_iomap_begin, + .iomap_end = fuse_iomap_end, +}; + +static void fuse_wait_dax_page(struct inode *inode) +{ + filemap_invalidate_unlock(inode->i_mapping); + schedule(); + filemap_invalidate_lock(inode->i_mapping); +} + +/* Should be called with mapping->invalidate_lock held exclusively */ +static int __fuse_dax_break_layouts(struct inode *inode, bool *retry, + loff_t start, loff_t end) +{ + struct page *page; + + page = dax_layout_busy_page_range(inode->i_mapping, start, end); + if (!page) + return 0; + + *retry = true; + return ___wait_var_event(&page->_refcount, + atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE, + 0, 0, fuse_wait_dax_page(inode)); +} + +/* dmap_end == 0 leads to unmapping of whole file */ +int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, + u64 dmap_end) +{ + bool retry; + int ret; + + do { + retry = false; + ret = __fuse_dax_break_layouts(inode, &retry, dmap_start, + dmap_end); + } while (ret == 0 && retry); + + return ret; +} + +ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + inode_lock_shared(inode); + } + + ret = dax_iomap_rw(iocb, to, &fuse_iomap_ops); + inode_unlock_shared(inode); + + /* TODO file_accessed(iocb->f_filp) */ + return ret; +} + +static bool file_extending_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + return (iov_iter_rw(from) == WRITE && + ((iocb->ki_pos) >= i_size_read(inode) || + (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode)))); +} + +static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); + ssize_t ret; + + ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); + + fuse_write_update_attr(inode, iocb->ki_pos, ret); + return ret; +} + +ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) + return -EAGAIN; + } else { + inode_lock(inode); + } + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto out; + + ret = file_remove_privs(iocb->ki_filp); + if (ret) + goto out; + /* TODO file_update_time() but we don't want metadata I/O */ + + /* Do not use dax for file extending writes as write and on + * disk i_size increase are not atomic otherwise. 
+ */ + if (file_extending_write(iocb, from)) + ret = fuse_dax_direct_write(iocb, from); + else + ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops); + +out: + inode_unlock(inode); + + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; +} + +static int fuse_dax_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + + return dax_writeback_mapping_range(mapping, fc->dax->dev, wbc); +} + +static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, + enum page_entry_size pe_size, bool write) +{ + vm_fault_t ret; + struct inode *inode = file_inode(vmf->vma->vm_file); + struct super_block *sb = inode->i_sb; + pfn_t pfn; + int error = 0; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_conn_dax *fcd = fc->dax; + bool retry = false; + + if (write) + sb_start_pagefault(sb); +retry: + if (retry && !(fcd->nr_free_ranges > 0)) + wait_event(fcd->range_waitq, (fcd->nr_free_ranges > 0)); + + /* + * We need to serialize against not only truncate but also against + * fuse dax memory range reclaim. While a range is being reclaimed, + * we do not want any read/write/mmap to make progress and try + * to populate page cache or access memory we are trying to free. + */ + filemap_invalidate_lock_shared(inode->i_mapping); + ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops); + if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) { + error = 0; + retry = true; + filemap_invalidate_unlock_shared(inode->i_mapping); + goto retry; + } + + if (ret & VM_FAULT_NEEDDSYNC) + ret = dax_finish_sync_fault(vmf, pe_size, pfn); + filemap_invalidate_unlock_shared(inode->i_mapping); + + if (write) + sb_end_pagefault(sb); + + return ret; +} + +static vm_fault_t fuse_dax_fault(struct vm_fault *vmf) +{ + return __fuse_dax_fault(vmf, PE_SIZE_PTE, + vmf->flags & FAULT_FLAG_WRITE); +} + +static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf, + enum page_entry_size pe_size) +{ + return __fuse_dax_fault(vmf, pe_size, vmf->flags & FAULT_FLAG_WRITE); +} + +static vm_fault_t fuse_dax_page_mkwrite(struct vm_fault *vmf) +{ + return __fuse_dax_fault(vmf, PE_SIZE_PTE, true); +} + +static vm_fault_t fuse_dax_pfn_mkwrite(struct vm_fault *vmf) +{ + return __fuse_dax_fault(vmf, PE_SIZE_PTE, true); +} + +static const struct vm_operations_struct fuse_dax_vm_ops = { + .fault = fuse_dax_fault, + .huge_fault = fuse_dax_huge_fault, + .page_mkwrite = fuse_dax_page_mkwrite, + .pfn_mkwrite = fuse_dax_pfn_mkwrite, +}; + +int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); + vma->vm_ops = &fuse_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + return 0; +} + +static int dmap_writeback_invalidate(struct inode *inode, + struct fuse_dax_mapping *dmap) +{ + int ret; + loff_t start_pos = dmap->itn.start << FUSE_DAX_SHIFT; + loff_t end_pos = (start_pos + FUSE_DAX_SZ - 1); + + ret = filemap_fdatawrite_range(inode->i_mapping, start_pos, end_pos); + if (ret) { + pr_debug("fuse: filemap_fdatawrite_range() failed. 
err=%d start_pos=0x%llx, end_pos=0x%llx\n", + ret, start_pos, end_pos); + return ret; + } + + ret = invalidate_inode_pages2_range(inode->i_mapping, + start_pos >> PAGE_SHIFT, + end_pos >> PAGE_SHIFT); + if (ret) + pr_debug("fuse: invalidate_inode_pages2_range() failed err=%d\n", + ret); + + return ret; +} + +static int reclaim_one_dmap_locked(struct inode *inode, + struct fuse_dax_mapping *dmap) +{ + int ret; + struct fuse_inode *fi = get_fuse_inode(inode); + + /* + * igrab() was done to make sure inode won't go under us, and this + * further avoids the race with evict(). + */ + ret = dmap_writeback_invalidate(inode, dmap); + if (ret) + return ret; + + /* Remove dax mapping from inode interval tree now */ + interval_tree_remove(&dmap->itn, &fi->dax->tree); + fi->dax->nr--; + + /* It is possible that umount/shutdown has killed the fuse connection + * and worker thread is trying to reclaim memory in parallel. Don't + * warn in that case. + */ + ret = dmap_removemapping_one(inode, dmap); + if (ret && ret != -ENOTCONN) { + pr_warn("Failed to remove mapping. offset=0x%llx len=0x%llx ret=%d\n", + dmap->window_offset, dmap->length, ret); + } + return 0; +} + +/* Find first mapped dmap for an inode and return file offset. Caller needs + * to hold fi->dax->sem lock either shared or exclusive. + */ +static struct fuse_dax_mapping *inode_lookup_first_dmap(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_dax_mapping *dmap; + struct interval_tree_node *node; + + for (node = interval_tree_iter_first(&fi->dax->tree, 0, -1); node; + node = interval_tree_iter_next(node, 0, -1)) { + dmap = node_to_dmap(node); + /* still in use. */ + if (refcount_read(&dmap->refcnt) > 1) + continue; + + return dmap; + } + + return NULL; +} + +/* + * Find first mapping in the tree and free it and return it. Do not add + * it back to free pool. + */ +static struct fuse_dax_mapping * +inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode, + bool *retry) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_dax_mapping *dmap; + u64 dmap_start, dmap_end; + unsigned long start_idx; + int ret; + struct interval_tree_node *node; + + filemap_invalidate_lock(inode->i_mapping); + + /* Lookup a dmap and corresponding file offset to reclaim. */ + down_read(&fi->dax->sem); + dmap = inode_lookup_first_dmap(inode); + if (dmap) { + start_idx = dmap->itn.start; + dmap_start = start_idx << FUSE_DAX_SHIFT; + dmap_end = dmap_start + FUSE_DAX_SZ - 1; + } + up_read(&fi->dax->sem); + + if (!dmap) + goto out_mmap_sem; + /* + * Make sure there are no references to inode pages using + * get_user_pages() + */ + ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end); + if (ret) { + pr_debug("fuse: fuse_dax_break_layouts() failed. err=%d\n", + ret); + dmap = ERR_PTR(ret); + goto out_mmap_sem; + } + + down_write(&fi->dax->sem); + node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx); + /* Range already got reclaimed by somebody else */ + if (!node) { + if (retry) + *retry = true; + goto out_write_dmap_sem; + } + + dmap = node_to_dmap(node); + /* still in use. */ + if (refcount_read(&dmap->refcnt) > 1) { + dmap = NULL; + if (retry) + *retry = true; + goto out_write_dmap_sem; + } + + ret = reclaim_one_dmap_locked(inode, dmap); + if (ret < 0) { + dmap = ERR_PTR(ret); + goto out_write_dmap_sem; + } + + /* Clean up dmap. 
Do not add back to free list */ + dmap_remove_busy_list(fcd, dmap); + dmap->inode = NULL; + dmap->itn.start = dmap->itn.last = 0; + + pr_debug("fuse: %s: inline reclaimed memory range. inode=%p, window_offset=0x%llx, length=0x%llx\n", + __func__, inode, dmap->window_offset, dmap->length); + +out_write_dmap_sem: + up_write(&fi->dax->sem); +out_mmap_sem: + filemap_invalidate_unlock(inode->i_mapping); + return dmap; +} + +static struct fuse_dax_mapping * +alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode) +{ + struct fuse_dax_mapping *dmap; + struct fuse_inode *fi = get_fuse_inode(inode); + + while (1) { + bool retry = false; + + dmap = alloc_dax_mapping(fcd); + if (dmap) + return dmap; + + dmap = inode_inline_reclaim_one_dmap(fcd, inode, &retry); + /* + * Either we got a mapping or it is an error, return in both + * the cases. + */ + if (dmap) + return dmap; + + /* If we could not reclaim a mapping because it + * had a reference or some other temporary failure, + * Try again. We want to give up inline reclaim only + * if there is no range assigned to this node. Otherwise + * if a deadlock is possible if we sleep with + * mapping->invalidate_lock held and worker to free memory + * can't make progress due to unavailability of + * mapping->invalidate_lock. So sleep only if fi->dax->nr=0 + */ + if (retry) + continue; + /* + * There are no mappings which can be reclaimed. Wait for one. + * We are not holding fi->dax->sem. So it is possible + * that range gets added now. But as we are not holding + * mapping->invalidate_lock, worker should still be able to + * free up a range and wake us up. + */ + if (!fi->dax->nr && !(fcd->nr_free_ranges > 0)) { + if (wait_event_killable_exclusive(fcd->range_waitq, + (fcd->nr_free_ranges > 0))) { + return ERR_PTR(-EINTR); + } + } + } +} + +static int lookup_and_reclaim_dmap_locked(struct fuse_conn_dax *fcd, + struct inode *inode, + unsigned long start_idx) +{ + int ret; + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_dax_mapping *dmap; + struct interval_tree_node *node; + + /* Find fuse dax mapping at file offset inode. */ + node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx); + + /* Range already got cleaned up by somebody else */ + if (!node) + return 0; + dmap = node_to_dmap(node); + + /* still in use. */ + if (refcount_read(&dmap->refcnt) > 1) + return 0; + + ret = reclaim_one_dmap_locked(inode, dmap); + if (ret < 0) + return ret; + + /* Cleanup dmap entry and add back to free list */ + spin_lock(&fcd->lock); + dmap_reinit_add_to_free_pool(fcd, dmap); + spin_unlock(&fcd->lock); + return ret; +} + +/* + * Free a range of memory. + * Locking: + * 1. Take mapping->invalidate_lock to block dax faults. + * 2. Take fi->dax->sem to protect interval tree and also to make sure + * read/write can not reuse a dmap which we might be freeing. + */ +static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd, + struct inode *inode, + unsigned long start_idx, + unsigned long end_idx) +{ + int ret; + struct fuse_inode *fi = get_fuse_inode(inode); + loff_t dmap_start = start_idx << FUSE_DAX_SHIFT; + loff_t dmap_end = (dmap_start + FUSE_DAX_SZ) - 1; + + filemap_invalidate_lock(inode->i_mapping); + ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end); + if (ret) { + pr_debug("virtio_fs: fuse_dax_break_layouts() failed. 
err=%d\n", + ret); + goto out_mmap_sem; + } + + down_write(&fi->dax->sem); + ret = lookup_and_reclaim_dmap_locked(fcd, inode, start_idx); + up_write(&fi->dax->sem); +out_mmap_sem: + filemap_invalidate_unlock(inode->i_mapping); + return ret; +} + +static int try_to_free_dmap_chunks(struct fuse_conn_dax *fcd, + unsigned long nr_to_free) +{ + struct fuse_dax_mapping *dmap, *pos, *temp; + int ret, nr_freed = 0; + unsigned long start_idx = 0, end_idx = 0; + struct inode *inode = NULL; + + /* Pick first busy range and free it for now*/ + while (1) { + if (nr_freed >= nr_to_free) + break; + + dmap = NULL; + spin_lock(&fcd->lock); + + if (!fcd->nr_busy_ranges) { + spin_unlock(&fcd->lock); + return 0; + } + + list_for_each_entry_safe(pos, temp, &fcd->busy_ranges, + busy_list) { + /* skip this range if it's in use. */ + if (refcount_read(&pos->refcnt) > 1) + continue; + + inode = igrab(pos->inode); + /* + * This inode is going away. That will free + * up all the ranges anyway, continue to + * next range. + */ + if (!inode) + continue; + /* + * Take this element off list and add it tail. If + * this element can't be freed, it will help with + * selecting new element in next iteration of loop. + */ + dmap = pos; + list_move_tail(&dmap->busy_list, &fcd->busy_ranges); + start_idx = end_idx = dmap->itn.start; + break; + } + spin_unlock(&fcd->lock); + if (!dmap) + return 0; + + ret = lookup_and_reclaim_dmap(fcd, inode, start_idx, end_idx); + iput(inode); + if (ret) + return ret; + nr_freed++; + } + return 0; +} + +static void fuse_dax_free_mem_worker(struct work_struct *work) +{ + int ret; + struct fuse_conn_dax *fcd = container_of(work, struct fuse_conn_dax, + free_work.work); + ret = try_to_free_dmap_chunks(fcd, FUSE_DAX_RECLAIM_CHUNK); + if (ret) { + pr_debug("fuse: try_to_free_dmap_chunks() failed with err=%d\n", + ret); + } + + /* If number of free ranges are still below threshold, requeue */ + kick_dmap_free_worker(fcd, 1); +} + +static void fuse_free_dax_mem_ranges(struct list_head *mem_list) +{ + struct fuse_dax_mapping *range, *temp; + + /* Free All allocated elements */ + list_for_each_entry_safe(range, temp, mem_list, list) { + list_del(&range->list); + if (!list_empty(&range->busy_list)) + list_del(&range->busy_list); + kfree(range); + } +} + +void fuse_dax_conn_free(struct fuse_conn *fc) +{ + if (fc->dax) { + fuse_free_dax_mem_ranges(&fc->dax->free_ranges); + kfree(fc->dax); + } +} + +static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd) +{ + long nr_pages, nr_ranges; + struct fuse_dax_mapping *range; + int ret, id; + size_t dax_size = -1; + unsigned long i; + + init_waitqueue_head(&fcd->range_waitq); + INIT_LIST_HEAD(&fcd->free_ranges); + INIT_LIST_HEAD(&fcd->busy_ranges); + INIT_DELAYED_WORK(&fcd->free_work, fuse_dax_free_mem_worker); + + id = dax_read_lock(); + nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), + DAX_ACCESS, NULL, NULL); + dax_read_unlock(id); + if (nr_pages < 0) { + pr_debug("dax_direct_access() returned %ld\n", nr_pages); + return nr_pages; + } + + nr_ranges = nr_pages/FUSE_DAX_PAGES; + pr_debug("%s: dax mapped %ld pages. nr_ranges=%ld\n", + __func__, nr_pages, nr_ranges); + + for (i = 0; i < nr_ranges; i++) { + range = kzalloc(sizeof(struct fuse_dax_mapping), GFP_KERNEL); + ret = -ENOMEM; + if (!range) + goto out_err; + + /* TODO: This offset only works if virtio-fs driver is not + * having some memory hidden at the beginning. 
This needs + * better handling + */ + range->window_offset = i * FUSE_DAX_SZ; + range->length = FUSE_DAX_SZ; + INIT_LIST_HEAD(&range->busy_list); + refcount_set(&range->refcnt, 1); + list_add_tail(&range->list, &fcd->free_ranges); + } + + fcd->nr_free_ranges = nr_ranges; + fcd->nr_ranges = nr_ranges; + return 0; +out_err: + /* Free All allocated elements */ + fuse_free_dax_mem_ranges(&fcd->free_ranges); + return ret; +} + +int fuse_dax_conn_alloc(struct fuse_conn *fc, enum fuse_dax_mode dax_mode, + struct dax_device *dax_dev) +{ + struct fuse_conn_dax *fcd; + int err; + + fc->dax_mode = dax_mode; + + if (!dax_dev) + return 0; + + fcd = kzalloc(sizeof(*fcd), GFP_KERNEL); + if (!fcd) + return -ENOMEM; + + spin_lock_init(&fcd->lock); + fcd->dev = dax_dev; + err = fuse_dax_mem_range_init(fcd); + if (err) { + kfree(fcd); + return err; + } + + fc->dax = fcd; + return 0; +} + +bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + + fi->dax = NULL; + if (fc->dax) { + fi->dax = kzalloc(sizeof(*fi->dax), GFP_KERNEL_ACCOUNT); + if (!fi->dax) + return false; + + init_rwsem(&fi->dax->sem); + fi->dax->tree = RB_ROOT_CACHED; + } + + return true; +} + +static const struct address_space_operations fuse_dax_file_aops = { + .writepages = fuse_dax_writepages, + .direct_IO = noop_direct_IO, + .dirty_folio = noop_dirty_folio, +}; + +static bool fuse_should_enable_dax(struct inode *inode, unsigned int flags) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + enum fuse_dax_mode dax_mode = fc->dax_mode; + + if (dax_mode == FUSE_DAX_NEVER) + return false; + + /* + * fc->dax may be NULL in 'inode' mode when filesystem device doesn't + * support DAX, in which case it will silently fallback to 'never' mode. 
+ */ + if (!fc->dax) + return false; + + if (dax_mode == FUSE_DAX_ALWAYS) + return true; + + /* dax_mode is FUSE_DAX_INODE* */ + return fc->inode_dax && (flags & FUSE_ATTR_DAX); +} + +void fuse_dax_inode_init(struct inode *inode, unsigned int flags) +{ + if (!fuse_should_enable_dax(inode, flags)) + return; + + inode->i_flags |= S_DAX; + inode->i_data.a_ops = &fuse_dax_file_aops; +} + +void fuse_dax_dontcache(struct inode *inode, unsigned int flags) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + if (fuse_is_inode_dax_mode(fc->dax_mode) && + ((bool) IS_DAX(inode) != (bool) (flags & FUSE_ATTR_DAX))) + d_mark_dontcache(inode); +} + +bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment) +{ + if (fc->dax && (map_alignment > FUSE_DAX_SHIFT)) { + pr_warn("FUSE: map_alignment %u incompatible with dax mem range size %u\n", + map_alignment, FUSE_DAX_SZ); + return false; + } + return true; +} + +void fuse_dax_cancel_work(struct fuse_conn *fc) +{ + struct fuse_conn_dax *fcd = fc->dax; + + if (fcd) + cancel_delayed_work_sync(&fcd->free_work); + +} +EXPORT_SYMBOL_GPL(fuse_dax_cancel_work); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 97eec7522bf2..b4a6e0a1b945 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -40,20 +40,21 @@ static struct fuse_dev *fuse_get_dev(struct file *file) return READ_ONCE(file->private_data); } -static void fuse_request_init(struct fuse_req *req) +static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req) { INIT_LIST_HEAD(&req->list); INIT_LIST_HEAD(&req->intr_entry); init_waitqueue_head(&req->waitq); refcount_set(&req->count, 1); __set_bit(FR_PENDING, &req->flags); + req->fm = fm; } -static struct fuse_req *fuse_request_alloc(gfp_t flags) +static struct fuse_req *fuse_request_alloc(struct fuse_mount *fm, gfp_t flags) { struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags); if (req) - fuse_request_init(req); + fuse_request_init(fm, req); return req; } @@ -90,7 +91,7 @@ static void fuse_drop_waiting(struct fuse_conn *fc) { /* * lockess check of fc->connected is okay, because atomic_dec_and_test() - * provides a memory barrier mached with the one in fuse_wait_aborted() + * provides a memory barrier matched with the one in fuse_wait_aborted() * to ensure no wake-up is missed. 
*/ if (atomic_dec_and_test(&fc->num_waiting) && @@ -100,10 +101,11 @@ static void fuse_drop_waiting(struct fuse_conn *fc) } } -static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); +static void fuse_put_request(struct fuse_req *req); -static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background) +static struct fuse_req *fuse_get_req(struct fuse_mount *fm, bool for_background) { + struct fuse_conn *fc = fm->fc; struct fuse_req *req; int err; atomic_inc(&fc->num_waiting); @@ -125,7 +127,7 @@ static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background) if (fc->conn_error) goto out; - req = fuse_request_alloc(GFP_KERNEL); + req = fuse_request_alloc(fm, GFP_KERNEL); err = -ENOMEM; if (!req) { if (for_background) @@ -143,7 +145,7 @@ static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background) if (unlikely(req->in.h.uid == ((uid_t)-1) || req->in.h.gid == ((gid_t)-1))) { - fuse_put_request(fc, req); + fuse_put_request(req); return ERR_PTR(-EOVERFLOW); } return req; @@ -153,8 +155,10 @@ static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background) return ERR_PTR(err); } -static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_put_request(struct fuse_req *req) { + struct fuse_conn *fc = req->fm->fc; + if (refcount_dec_and_test(&req->count)) { if (test_bit(FR_BACKGROUND, &req->flags)) { /* @@ -273,8 +277,10 @@ static void flush_bg_queue(struct fuse_conn *fc) * the 'end' callback is called if given, else the reference to the * request is released */ -void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req) +void fuse_request_end(struct fuse_req *req) { + struct fuse_mount *fm = req->fm; + struct fuse_conn *fc = fm->fc; struct fuse_iqueue *fiq = &fc->iq; if (test_and_set_bit(FR_FINISHED, &req->flags)) @@ -282,10 +288,10 @@ void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req) /* * test_and_set_bit() implies smp_mb() between bit - * changing and below intr_entry check. Pairs with + * changing and below FR_INTERRUPTED check. Pairs with * smp_mb() from queue_interrupt(). */ - if (!list_empty(&req->intr_entry)) { + if (test_bit(FR_INTERRUPTED, &req->flags)) { spin_lock(&fiq->lock); list_del_init(&req->intr_entry); spin_unlock(&fiq->lock); @@ -309,10 +315,6 @@ void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req) wake_up(&fc->blocked_waitq); } - if (fc->num_background == fc->congestion_threshold && fc->sb) { - clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); - clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); - } fc->num_background--; fc->active_background--; flush_bg_queue(fc); @@ -323,14 +325,16 @@ void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req) } if (test_bit(FR_ASYNC, &req->flags)) - req->args->end(fc, req->args, req->out.h.error); + req->args->end(fm, req->args, req->out.h.error); put_request: - fuse_put_request(fc, req); + fuse_put_request(req); } EXPORT_SYMBOL_GPL(fuse_request_end); -static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) +static int queue_interrupt(struct fuse_req *req) { + struct fuse_iqueue *fiq = &req->fm->fc->iq; + spin_lock(&fiq->lock); /* Check for we've sent request to interrupt this req */ if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) { @@ -342,7 +346,7 @@ static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) list_add_tail(&req->intr_entry, &fiq->interrupts); /* * Pairs with smp_mb() implied by test_and_set_bit() - * from request_end(). 
+ * from fuse_request_end(). */ smp_mb(); if (test_bit(FR_FINISHED, &req->flags)) { @@ -357,8 +361,9 @@ static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) return 0; } -static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) +static void request_wait_answer(struct fuse_req *req) { + struct fuse_conn *fc = req->fm->fc; struct fuse_iqueue *fiq = &fc->iq; int err; @@ -373,7 +378,7 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) /* matches barrier in fuse_dev_do_read() */ smp_mb__after_atomic(); if (test_bit(FR_SENT, &req->flags)) - queue_interrupt(fiq, req); + queue_interrupt(req); } if (!test_bit(FR_FORCE, &req->flags)) { @@ -402,9 +407,9 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags)); } -static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) +static void __fuse_request_send(struct fuse_req *req) { - struct fuse_iqueue *fiq = &fc->iq; + struct fuse_iqueue *fiq = &req->fm->fc->iq; BUG_ON(test_bit(FR_BACKGROUND, &req->flags)); spin_lock(&fiq->lock); @@ -418,7 +423,7 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) __fuse_get_request(req); queue_request_and_unlock(fiq, req); - request_wait_answer(fc, req); + request_wait_answer(req); /* Pairs with smp_wmb() in fuse_request_end() */ smp_rmb(); } @@ -457,8 +462,10 @@ static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args) } } -static void fuse_force_creds(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_force_creds(struct fuse_req *req) { + struct fuse_conn *fc = req->fm->fc; + req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid()); req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid()); req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); @@ -473,23 +480,24 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) __set_bit(FR_ASYNC, &req->flags); } -ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) +ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args) { + struct fuse_conn *fc = fm->fc; struct fuse_req *req; ssize_t ret; if (args->force) { atomic_inc(&fc->num_waiting); - req = fuse_request_alloc(GFP_KERNEL | __GFP_NOFAIL); + req = fuse_request_alloc(fm, GFP_KERNEL | __GFP_NOFAIL); if (!args->nocreds) - fuse_force_creds(fc, req); + fuse_force_creds(req); __set_bit(FR_WAITING, &req->flags); __set_bit(FR_FORCE, &req->flags); } else { WARN_ON(args->nocreds); - req = fuse_get_req(fc, false); + req = fuse_get_req(fm, false); if (IS_ERR(req)) return PTR_ERR(req); } @@ -500,20 +508,21 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) if (!args->noreply) __set_bit(FR_ISREPLY, &req->flags); - __fuse_request_send(fc, req); + __fuse_request_send(req); ret = req->out.h.error; if (!ret && args->out_argvar) { BUG_ON(args->out_numargs == 0); ret = args->out_args[args->out_numargs - 1].size; } - fuse_put_request(fc, req); + fuse_put_request(req); return ret; } -static bool fuse_request_queue_background(struct fuse_conn *fc, - struct fuse_req *req) +static bool fuse_request_queue_background(struct fuse_req *req) { + struct fuse_mount *fm = req->fm; + struct fuse_conn *fc = fm->fc; bool queued = false; WARN_ON(!test_bit(FR_BACKGROUND, &req->flags)); @@ -527,10 +536,6 @@ static bool fuse_request_queue_background(struct fuse_conn *fc, fc->num_background++; if (fc->num_background == fc->max_background) 
fc->blocked = 1; - if (fc->num_background == fc->congestion_threshold && fc->sb) { - set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); - set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); - } list_add_tail(&req->list, &fc->bg_queue); flush_bg_queue(fc); queued = true; @@ -540,28 +545,28 @@ static bool fuse_request_queue_background(struct fuse_conn *fc, return queued; } -int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, +int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, gfp_t gfp_flags) { struct fuse_req *req; if (args->force) { WARN_ON(!args->nocreds); - req = fuse_request_alloc(gfp_flags); + req = fuse_request_alloc(fm, gfp_flags); if (!req) return -ENOMEM; __set_bit(FR_BACKGROUND, &req->flags); } else { WARN_ON(args->nocreds); - req = fuse_get_req(fc, true); + req = fuse_get_req(fm, true); if (IS_ERR(req)) return PTR_ERR(req); } fuse_args_to_req(req, args); - if (!fuse_request_queue_background(fc, req)) { - fuse_put_request(fc, req); + if (!fuse_request_queue_background(req)) { + fuse_put_request(req); return -ENOTCONN; } @@ -569,14 +574,14 @@ int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, } EXPORT_SYMBOL_GPL(fuse_simple_background); -static int fuse_simple_notify_reply(struct fuse_conn *fc, +static int fuse_simple_notify_reply(struct fuse_mount *fm, struct fuse_args *args, u64 unique) { struct fuse_req *req; - struct fuse_iqueue *fiq = &fc->iq; + struct fuse_iqueue *fiq = &fm->fc->iq; int err = 0; - req = fuse_get_req(fc, false); + req = fuse_get_req(fm, false); if (IS_ERR(req)) return PTR_ERR(req); @@ -591,7 +596,7 @@ static int fuse_simple_notify_reply(struct fuse_conn *fc, } else { err = -ENODEV; spin_unlock(&fiq->lock); - fuse_put_request(fc, req); + fuse_put_request(req); } return err; @@ -725,14 +730,13 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) } } else { size_t off; - err = iov_iter_get_pages(cs->iter, &page, PAGE_SIZE, 1, &off); + err = iov_iter_get_pages2(cs->iter, &page, PAGE_SIZE, 1, &off); if (err < 0) return err; BUG_ON(!err); cs->len = err; cs->offset = off; cs->pg = page; - iov_iter_advance(cs->iter, err); } return lock_request(cs->req); @@ -743,7 +747,7 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) { unsigned ncpy = min(*size, cs->len); if (val) { - void *pgaddr = kmap_atomic(cs->pg); + void *pgaddr = kmap_local_page(cs->pg); void *buf = pgaddr + cs->offset; if (cs->write) @@ -751,7 +755,7 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) else memcpy(*val, buf, ncpy); - kunmap_atomic(pgaddr); + kunmap_local(pgaddr); *val += ncpy; } *size -= ncpy; @@ -764,16 +768,17 @@ static int fuse_check_page(struct page *page) { if (page_mapcount(page) || page->mapping != NULL || - page_count(page) != 1 || (page->flags & PAGE_FLAGS_CHECK_AT_PREP & ~(1 << PG_locked | 1 << PG_referenced | 1 << PG_uptodate | 1 << PG_lru | 1 << PG_active | - 1 << PG_reclaim))) { - pr_warn("trying to steal weird page\n"); - pr_warn(" page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping); + 1 << PG_workingset | + 1 << PG_reclaim | + 1 << PG_waiters | + LRU_GEN_MASK | LRU_REFS_MASK))) { + dump_page(page, "fuse: trying to steal weird page"); return 1; } return 0; @@ -786,15 +791,16 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) struct page *newpage; struct pipe_buffer *buf = cs->pipebufs; + get_page(oldpage); err = 
unlock_request(cs->req); if (err) - return err; + goto out_put_old; fuse_copy_finish(cs); err = pipe_buf_confirm(cs->pipe, buf); if (err) - return err; + goto out_put_old; BUG_ON(!cs->nr_segs); cs->currbuf = buf; @@ -805,7 +811,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (cs->len != PAGE_SIZE) goto out_fallback; - if (pipe_buf_steal(cs->pipe, buf) != 0) + if (!pipe_buf_try_steal(cs->pipe, buf)) goto out_fallback; newpage = buf->page; @@ -831,16 +837,18 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (WARN_ON(PageMlocked(oldpage))) goto out_fallback_unlock; - err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL); - if (err) { - unlock_page(newpage); - return err; - } + replace_page_cache_page(oldpage, newpage); get_page(newpage); if (!(buf->flags & PIPE_BUF_FLAG_LRU)) - lru_cache_add_file(newpage); + lru_cache_add(newpage); + + /* + * Release while we have extra ref on stolen page. Otherwise + * anon_pipe_buf_release() might think the page can be reused. + */ + pipe_buf_release(cs->pipe, buf); err = 0; spin_lock(&cs->req->waitq.lock); @@ -853,14 +861,19 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (err) { unlock_page(newpage); put_page(newpage); - return err; + goto out_put_old; } unlock_page(oldpage); + /* Drop ref for ap->pages[] array */ put_page(oldpage); cs->len = 0; - return 0; + err = 0; +out_put_old: + /* Drop ref obtained in this function */ + put_page(oldpage); + return err; out_fallback_unlock: unlock_page(newpage); @@ -869,10 +882,10 @@ out_fallback: cs->offset = buf->offset; err = lock_request(cs->req); - if (err) - return err; + if (!err) + err = 1; - return 1; + goto out_put_old; } static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, @@ -884,14 +897,16 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, if (cs->nr_segs >= cs->pipe->max_usage) return -EIO; + get_page(page); err = unlock_request(cs->req); - if (err) + if (err) { + put_page(page); return err; + } fuse_copy_finish(cs); buf = cs->pipebufs; - get_page(page); buf->page = page; buf->offset = offset; buf->len = count; @@ -918,7 +933,17 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, while (count) { if (cs->write && cs->pipebufs && page) { - return fuse_ref_page(cs, page, offset, count); + /* + * Can't control lifetime of pipe buffers, so always + * copy user pages. 
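The reworked fuse_try_move_page() above pins oldpage with get_page() before unlock_request() and drops that pin on every exit through out_put_old, so a concurrent abort cannot free the page while the copy helpers are still using it. The same pattern, reduced to a self-contained userspace analogue (all names invented):

#include <stdatomic.h>

struct pinned_buf {
	atomic_int refs;
	/* ... payload ... */
};

static void buf_get(struct pinned_buf *b) { atomic_fetch_add(&b->refs, 1); }
static void buf_put(struct pinned_buf *b) { atomic_fetch_sub(&b->refs, 1); }

/* Keep the object alive across the window where its protecting lock is
 * dropped; release the extra reference on every return path. */
static int process_unlocked(struct pinned_buf *b, int (*drop_lock)(void),
			    int (*work)(struct pinned_buf *))
{
	int err;

	buf_get(b);		/* extra ref survives whatever happens after unlock */
	err = drop_lock();
	if (err)
		goto out_put;

	err = work(b);
out_put:
	buf_put(b);		/* balanced on success and on every error path */
	return err;
}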
+ */ + if (cs->req->args->user_pages) { + err = fuse_copy_fill(cs); + if (err) + return err; + } else { + return fuse_ref_page(cs, page, offset, count); + } } else if (!cs->len) { if (cs->move_pages && page && offset == 0 && count == PAGE_SIZE) { @@ -932,10 +957,10 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, } } if (page) { - void *mapaddr = kmap_atomic(page); + void *mapaddr = kmap_local_page(page); void *buf = mapaddr + offset; offset += fuse_copy_do(cs, &buf, &count); - kunmap_atomic(mapaddr); + kunmap_local(mapaddr); } else offset += fuse_copy_do(cs, NULL, &count); } @@ -1251,10 +1276,19 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, /* SETXATTR is special, since it may contain too large data */ if (args->opcode == FUSE_SETXATTR) req->out.h.error = -E2BIG; - fuse_request_end(fc, req); + fuse_request_end(req); goto restart; } spin_lock(&fpq->lock); + /* + * Must not put request on fpq->io queue after having been shut down by + * fuse_abort_conn() + */ + if (!fpq->connected) { + req->out.h.error = err = -ECONNABORTED; + goto out_end; + + } list_add(&req->list, &fpq->io); spin_unlock(&fpq->lock); cs->req = req; @@ -1285,8 +1319,8 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, /* matches barrier in request_wait_answer() */ smp_mb__after_atomic(); if (test_bit(FR_INTERRUPTED, &req->flags)) - queue_interrupt(fiq, req); - fuse_put_request(fc, req); + queue_interrupt(req); + fuse_put_request(req); return reqsize; @@ -1294,7 +1328,7 @@ out_end: if (!test_bit(FR_PRIVATE, &req->flags)) list_del_init(&req->list); spin_unlock(&fpq->lock); - fuse_request_end(fc, req); + fuse_request_end(req); return err; err_unlock: @@ -1322,7 +1356,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to) if (!fud) return -EPERM; - if (!iter_is_iovec(to)) + if (!user_backed_iter(to)) return -EINVAL; fuse_copy_init(&cs, 1, to); @@ -1417,11 +1451,8 @@ static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size, fuse_copy_finish(cs); down_read(&fc->killsb); - err = -ENOENT; - if (fc->sb) { - err = fuse_reverse_inval_inode(fc->sb, outarg.ino, - outarg.off, outarg.len); - } + err = fuse_reverse_inval_inode(fc, outarg.ino, + outarg.off, outarg.len); up_read(&fc->killsb); return err; @@ -1467,9 +1498,7 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, buf[outarg.namelen] = 0; down_read(&fc->killsb); - err = -ENOENT; - if (fc->sb) - err = fuse_reverse_inval_entry(fc->sb, outarg.parent, 0, &name); + err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name); up_read(&fc->killsb); kfree(buf); return err; @@ -1517,10 +1546,7 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, buf[outarg.namelen] = 0; down_read(&fc->killsb); - err = -ENOENT; - if (fc->sb) - err = fuse_reverse_inval_entry(fc->sb, outarg.parent, - outarg.child, &name); + err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name); up_read(&fc->killsb); kfree(buf); return err; @@ -1562,10 +1588,7 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, down_read(&fc->killsb); err = -ENOENT; - if (!fc->sb) - goto out_up_killsb; - - inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid); + inode = fuse_ilookup(fc, nodeid, NULL); if (!inode) goto out_up_killsb; @@ -1576,7 +1599,7 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, end = outarg.offset + outarg.size; if (end > file_size) { file_size = end; - fuse_write_update_size(inode, 
file_size); + fuse_write_update_attr(inode, file_size, outarg.size); } num = outarg.size; @@ -1622,7 +1645,7 @@ struct fuse_retrieve_args { struct fuse_notify_retrieve_in inarg; }; -static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_args *args, +static void fuse_retrieve_end(struct fuse_mount *fm, struct fuse_args *args, int error) { struct fuse_retrieve_args *ra = @@ -1632,7 +1655,7 @@ static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_args *args, kfree(ra); } -static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, +static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, struct fuse_notify_retrieve_out *outarg) { int err; @@ -1643,6 +1666,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, unsigned int offset; size_t total_len = 0; unsigned int num_pages; + struct fuse_conn *fc = fm->fc; struct fuse_retrieve_args *ra; size_t args_size = sizeof(*ra); struct fuse_args_pages *ap; @@ -1704,9 +1728,9 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, args->in_args[0].value = &ra->inarg; args->in_args[1].size = total_len; - err = fuse_simple_notify_reply(fc, args, outarg->notify_unique); + err = fuse_simple_notify_reply(fm, args, outarg->notify_unique); if (err) - fuse_retrieve_end(fc, args, err); + fuse_retrieve_end(fm, args, err); return err; } @@ -1715,7 +1739,9 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_retrieve_out outarg; + struct fuse_mount *fm; struct inode *inode; + u64 nodeid; int err; err = -EINVAL; @@ -1730,14 +1756,12 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size, down_read(&fc->killsb); err = -ENOENT; - if (fc->sb) { - u64 nodeid = outarg.nodeid; + nodeid = outarg.nodeid; - inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid); - if (inode) { - err = fuse_retrieve(fc, inode, &outarg); - iput(inode); - } + inode = fuse_ilookup(fc, nodeid, &fm); + if (inode) { + err = fuse_retrieve(fm, inode, &outarg); + iput(inode); } up_read(&fc->killsb); @@ -1851,7 +1875,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, } err = -EINVAL; - if (oh.error <= -1000 || oh.error > 0) + if (oh.error <= -512 || oh.error > 0) goto copy_finish; spin_lock(&fpq->lock); @@ -1876,9 +1900,9 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, else if (oh.error == -ENOSYS) fc->no_interrupt = 1; else if (oh.error == -EAGAIN) - err = queue_interrupt(&fc->iq, req); + err = queue_interrupt(req); - fuse_put_request(fc, req); + fuse_put_request(req); goto copy_finish; } @@ -1908,7 +1932,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, list_del_init(&req->list); spin_unlock(&fpq->lock); - fuse_request_end(fc, req); + fuse_request_end(req); out: return err ? 
err : nbytes; @@ -1925,7 +1949,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from) if (!fud) return -EPERM; - if (!iter_is_iovec(from)) + if (!user_backed_iter(from)) return -EINVAL; fuse_copy_init(&cs, 0, from); @@ -1977,8 +2001,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, struct pipe_buffer *ibuf; struct pipe_buffer *obuf; - BUG_ON(nbuf >= pipe->ring_size); - BUG_ON(tail == head); + if (WARN_ON(nbuf >= count || tail == head)) + goto out_free; + ibuf = &pipe->bufs[tail & mask]; obuf = &bufs[nbuf]; @@ -2014,8 +2039,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, pipe_lock(pipe); out_free: - for (idx = 0; idx < nbuf; idx++) - pipe_buf_release(pipe, &bufs[idx]); + for (idx = 0; idx < nbuf; idx++) { + struct pipe_buffer *buf = &bufs[idx]; + + if (buf->ops) + pipe_buf_release(pipe, buf); + } pipe_unlock(pipe); kvfree(bufs); @@ -2045,7 +2074,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait) } /* Abort all requests on the given list (pending or processing) */ -static void end_requests(struct fuse_conn *fc, struct list_head *head) +static void end_requests(struct list_head *head) { while (!list_empty(head)) { struct fuse_req *req; @@ -2053,7 +2082,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) req->out.h.error = -ECONNABORTED; clear_bit(FR_SENT, &req->flags); list_del_init(&req->list); - fuse_request_end(fc, req); + fuse_request_end(req); } } @@ -2081,7 +2110,7 @@ static void end_polls(struct fuse_conn *fc) * The same effect is usually achievable through killing the filesystem daemon * and all users of the filesystem. The exception is the combination of an * asynchronous request and the tricky deadlock (see - * Documentation/filesystems/fuse.txt). + * Documentation/filesystems/fuse.rst). * * Aborting requests under I/O goes as follows: 1: Separate out unlocked * requests, they should be finished off immediately. Locked requests will be @@ -2148,7 +2177,7 @@ void fuse_abort_conn(struct fuse_conn *fc) wake_up_all(&fc->blocked_waitq); spin_unlock(&fc->lock); - end_requests(fc, &to_end); + end_requests(&to_end); } else { spin_unlock(&fc->lock); } @@ -2178,7 +2207,7 @@ int fuse_dev_release(struct inode *inode, struct file *file) list_splice_init(&fpq->processing[i], &to_end); spin_unlock(&fpq->lock); - end_requests(fc, &to_end); + end_requests(&to_end); /* Are we the last open device? */ if (atomic_dec_and_test(&fc->dev_count)) { @@ -2222,19 +2251,18 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new) static long fuse_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - int err = -ENOTTY; + int res; + int oldfd; + struct fuse_dev *fud = NULL; - if (cmd == FUSE_DEV_IOC_CLONE) { - int oldfd; - - err = -EFAULT; - if (!get_user(oldfd, (__u32 __user *) arg)) { + switch (cmd) { + case FUSE_DEV_IOC_CLONE: + res = -EFAULT; + if (!get_user(oldfd, (__u32 __user *)arg)) { struct file *old = fget(oldfd); - err = -EINVAL; + res = -EINVAL; if (old) { - struct fuse_dev *fud = NULL; - /* * Check against file->f_op because CUSE * uses the same ioctl handler. 
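The ioctl being restructured into a switch here, FUSE_DEV_IOC_CLONE, is how a multithreaded daemon attaches extra /dev/fuse file descriptors to an existing session so each worker can read requests independently. A minimal sketch of the userspace side (error handling trimmed, function name invented):

#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fuse.h>		/* FUSE_DEV_IOC_CLONE */

/* Clone an already mounted /dev/fuse session onto a fresh fd for a worker. */
static int clone_fuse_session(int session_fd)
{
	uint32_t oldfd = session_fd;
	int fd = open("/dev/fuse", O_RDWR | O_CLOEXEC);

	if (fd < 0)
		return -1;
	if (ioctl(fd, FUSE_DEV_IOC_CLONE, &oldfd) < 0) {
		close(fd);
		return -1;
	}
	return fd;	/* the worker now read()s requests and write()s replies here */
}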
@@ -2245,14 +2273,18 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, if (fud) { mutex_lock(&fuse_mutex); - err = fuse_device_clone(fud->fc, file); + res = fuse_device_clone(fud->fc, file); mutex_unlock(&fuse_mutex); } fput(old); } } + break; + default: + res = -ENOTTY; + break; } - return err; + return res; } const struct file_operations fuse_dev_operations = { diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index de1e2fde60bd..bb97a384dc5d 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -10,12 +10,22 @@ #include <linux/pagemap.h> #include <linux/file.h> +#include <linux/fs_context.h> +#include <linux/moduleparam.h> #include <linux/sched.h> #include <linux/namei.h> #include <linux/slab.h> #include <linux/xattr.h> #include <linux/iversion.h> #include <linux/posix_acl.h> +#include <linux/security.h> +#include <linux/types.h> +#include <linux/kernel.h> + +static bool __read_mostly allow_sys_admin_access; +module_param(allow_sys_admin_access, bool, 0644); +MODULE_PARM_DESC(allow_sys_admin_access, + "Allow users with CAP_SYS_ADMIN in initial userns to bypass allow_other access check"); static void fuse_advise_use_readdirplus(struct inode *dir) { @@ -115,7 +125,7 @@ u64 entry_attr_timeout(struct fuse_entry_out *o) return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); } -static void fuse_invalidate_attr_mask(struct inode *inode, u32 mask) +void fuse_invalidate_attr_mask(struct inode *inode, u32 mask) { set_mask_bits(&get_fuse_inode(inode)->inval_mask, 0, mask); } @@ -196,15 +206,15 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) { struct inode *inode; struct dentry *parent; - struct fuse_conn *fc; + struct fuse_mount *fm; struct fuse_inode *fi; int ret; inode = d_inode_rcu(entry); - if (inode && is_bad_inode(inode)) + if (inode && fuse_is_bad(inode)) goto invalid; else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || - (flags & LOOKUP_REVAL)) { + (flags & (LOOKUP_EXCL | LOOKUP_REVAL))) { struct fuse_entry_out outarg; FUSE_ARGS(args); struct fuse_forget_link *forget; @@ -218,27 +228,29 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) if (flags & LOOKUP_RCU) goto out; - fc = get_fuse_conn(inode); + fm = get_fuse_mount(inode); forget = fuse_alloc_forget(); ret = -ENOMEM; if (!forget) goto out; - attr_version = fuse_get_attr_version(fc); + attr_version = fuse_get_attr_version(fm->fc); parent = dget_parent(entry); - fuse_lookup_init(fc, &args, get_node_id(d_inode(parent)), + fuse_lookup_init(fm->fc, &args, get_node_id(d_inode(parent)), &entry->d_name, &outarg); - ret = fuse_simple_request(fc, &args); + ret = fuse_simple_request(fm, &args); dput(parent); /* Zero nodeid is same as -ENOENT */ if (!ret && !outarg.nodeid) ret = -ENOENT; if (!ret) { fi = get_fuse_inode(inode); - if (outarg.nodeid != get_node_id(inode)) { - fuse_queue_forget(fc, forget, outarg.nodeid, 1); + if (outarg.nodeid != get_node_id(inode) || + (bool) IS_AUTOMOUNT(inode) != (bool) (outarg.attr.flags & FUSE_ATTR_SUBMOUNT)) { + fuse_queue_forget(fm->fc, forget, + outarg.nodeid, 1); goto invalid; } spin_lock(&fi->lock); @@ -249,7 +261,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) if (ret == -ENOMEM) goto out; if (ret || fuse_invalid_attr(&outarg.attr) || - (outarg.attr.mode ^ inode->i_mode) & S_IFMT) + fuse_stale_inode(inode, outarg.generation, &outarg.attr)) goto invalid; forget_all_cached_acls(inode); @@ -298,6 +310,33 @@ static int fuse_dentry_delete(const struct dentry *dentry) return 
time_before64(fuse_dentry_time(dentry), get_jiffies_64()); } +/* + * Create a fuse_mount object with a new superblock (with path->dentry + * as the root), and return that mount so it can be auto-mounted on + * @path. + */ +static struct vfsmount *fuse_dentry_automount(struct path *path) +{ + struct fs_context *fsc; + struct vfsmount *mnt; + struct fuse_inode *mp_fi = get_fuse_inode(d_inode(path->dentry)); + + fsc = fs_context_for_submount(path->mnt->mnt_sb->s_type, path->dentry); + if (IS_ERR(fsc)) + return ERR_CAST(fsc); + + /* Pass the FUSE inode of the mount for fuse_get_tree_submount() */ + fsc->fs_private = mp_fi; + + /* Create the submount */ + mnt = fc_mount(fsc); + if (!IS_ERR(mnt)) + mntget(mnt); + + put_fs_context(fsc); + return mnt; +} + const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, .d_delete = fuse_dentry_delete, @@ -305,6 +344,7 @@ const struct dentry_operations fuse_dentry_operations = { .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, #endif + .d_automount = fuse_dentry_automount, }; const struct dentry_operations fuse_root_dentry_operations = { @@ -329,7 +369,7 @@ bool fuse_invalid_attr(struct fuse_attr *attr) int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name, struct fuse_entry_out *outarg, struct inode **inode) { - struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_mount *fm = get_fuse_mount_super(sb); FUSE_ARGS(args); struct fuse_forget_link *forget; u64 attr_version; @@ -346,10 +386,10 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name if (!forget) goto out; - attr_version = fuse_get_attr_version(fc); + attr_version = fuse_get_attr_version(fm->fc); - fuse_lookup_init(fc, &args, nodeid, name, outarg); - err = fuse_simple_request(fc, &args); + fuse_lookup_init(fm->fc, &args, nodeid, name, outarg); + err = fuse_simple_request(fm, &args); /* Zero nodeid is same as -ENOENT, but with valid timeout */ if (err || !outarg->nodeid) goto out_put_forget; @@ -365,7 +405,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name attr_version); err = -ENOMEM; if (!*inode) { - fuse_queue_forget(fc, forget, outarg->nodeid, 1); + fuse_queue_forget(fm->fc, forget, outarg->nodeid, 1); goto out; } err = 0; @@ -386,6 +426,9 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, bool outarg_valid = true; bool locked; + if (fuse_is_bad(dir)) + return ERR_PTR(-EIO); + locked = fuse_lock_inode(dir); err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name, &outarg, &inode); @@ -422,6 +465,62 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, return ERR_PTR(err); } +static int get_security_context(struct dentry *entry, umode_t mode, + void **security_ctx, u32 *security_ctxlen) +{ + struct fuse_secctx *fctx; + struct fuse_secctx_header *header; + void *ctx = NULL, *ptr; + u32 ctxlen, total_len = sizeof(*header); + int err, nr_ctx = 0; + const char *name; + size_t namelen; + + err = security_dentry_init_security(entry, mode, &entry->d_name, + &name, &ctx, &ctxlen); + if (err) { + if (err != -EOPNOTSUPP) + goto out_err; + /* No LSM is supporting this security hook. 
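get_security_context() below packs the LSM-provided label into the extra argument that FUSE_CREATE, FUSE_MKNOD and FUSE_SYMLINK gain when the connection negotiated security contexts (fm->fc->init_security in the surrounding hunks). The layout is a fuse_secctx_header followed by aligned records of fuse_secctx, xattr name and value; a sketch of how a server might walk that blob (the structs are the FUSE uapi ones, the parser itself is illustrative):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <linux/fuse.h>		/* struct fuse_secctx_header, struct fuse_secctx */

/* Walk the security-context blob appended to a CREATE/MKNOD/SYMLINK request. */
static void dump_secctx_blob(const void *blob)
{
	const struct fuse_secctx_header *hdr = blob;
	const uint8_t *p = (const uint8_t *)(hdr + 1);
	uint32_t i;

	for (i = 0; i < hdr->nr_secctx; i++) {
		const struct fuse_secctx *ctx = (const void *)p;
		const char *name = (const char *)(ctx + 1);	/* NUL-terminated xattr name */
		size_t rec = sizeof(*ctx) + strlen(name) + 1 + ctx->size;

		printf("  %s: %u byte value\n", name, ctx->size);

		p += (rec + 7) & ~(size_t)7;	/* records are 8-byte aligned (FUSE_REC_ALIGN) */
	}
}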
Ignore error */ + ctxlen = 0; + ctx = NULL; + } + + if (ctxlen) { + nr_ctx = 1; + namelen = strlen(name) + 1; + err = -EIO; + if (WARN_ON(namelen > XATTR_NAME_MAX + 1 || ctxlen > S32_MAX)) + goto out_err; + total_len += FUSE_REC_ALIGN(sizeof(*fctx) + namelen + ctxlen); + } + + err = -ENOMEM; + header = ptr = kzalloc(total_len, GFP_KERNEL); + if (!ptr) + goto out_err; + + header->nr_secctx = nr_ctx; + header->size = total_len; + ptr += sizeof(*header); + if (nr_ctx) { + fctx = ptr; + fctx->size = ctxlen; + ptr += sizeof(*fctx); + + strcpy(ptr, name); + ptr += namelen; + + memcpy(ptr, ctx, ctxlen); + } + *security_ctxlen = total_len; + *security_ctx = header; + err = 0; +out_err: + kfree(ctx); + return err; +} + /* * Atomic create+open operation * @@ -429,12 +528,12 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, * 'mknod' + 'open' requests. */ static int fuse_create_open(struct inode *dir, struct dentry *entry, - struct file *file, unsigned flags, - umode_t mode) + struct file *file, unsigned int flags, + umode_t mode, u32 opcode) { int err; struct inode *inode; - struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); struct fuse_forget_link *forget; struct fuse_create_in inarg; @@ -442,6 +541,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, struct fuse_entry_out outentry; struct fuse_inode *fi; struct fuse_file *ff; + void *security_ctx = NULL; + u32 security_ctxlen; + bool trunc = flags & O_TRUNC; /* Userspace expects S_IFREG in create mode */ BUG_ON((mode & S_IFMT) != S_IFREG); @@ -452,11 +554,11 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, goto out_err; err = -ENOMEM; - ff = fuse_file_alloc(fc); + ff = fuse_file_alloc(fm); if (!ff) goto out_put_forget_req; - if (!fc->dont_mask) + if (!fm->fc->dont_mask) mode &= ~current_umask(); flags &= ~O_NOCTTY; @@ -465,7 +567,13 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, inarg.flags = flags; inarg.mode = mode; inarg.umask = current_umask(); - args.opcode = FUSE_CREATE; + + if (fm->fc->handle_killpriv_v2 && trunc && + !(flags & O_EXCL) && !capable(CAP_FSETID)) { + inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; + } + + args.opcode = opcode; args.nodeid = get_node_id(dir); args.in_numargs = 2; args.in_args[0].size = sizeof(inarg); @@ -477,7 +585,20 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, args.out_args[0].value = &outentry; args.out_args[1].size = sizeof(outopen); args.out_args[1].value = &outopen; - err = fuse_simple_request(fc, &args); + + if (fm->fc->init_security) { + err = get_security_context(entry, mode, &security_ctx, + &security_ctxlen); + if (err) + goto out_put_forget_req; + + args.in_numargs = 3; + args.in_args[2].size = security_ctxlen; + args.in_args[2].value = security_ctx; + } + + err = fuse_simple_request(fm, &args); + kfree(security_ctx); if (err) goto out_free_ff; @@ -494,7 +615,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, if (!inode) { flags &= ~(O_CREAT | O_EXCL | O_TRUNC); fuse_sync_release(NULL, ff, flags); - fuse_queue_forget(fc, forget, outentry.nodeid, 1); + fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1); err = -ENOMEM; goto out_err; } @@ -509,6 +630,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, } else { file->private_data = ff; fuse_finish_open(inode, file); + if (fm->fc->atomic_o_trunc && trunc) + truncate_pagecache(inode, 0); + else if (!(ff->open_flags & 
FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); } return err; @@ -520,7 +645,8 @@ out_err: return err; } -static int fuse_mknod(struct inode *, struct dentry *, umode_t, dev_t); +static int fuse_mknod(struct user_namespace *, struct inode *, struct dentry *, + umode_t, dev_t); static int fuse_atomic_open(struct inode *dir, struct dentry *entry, struct file *file, unsigned flags, umode_t mode) @@ -529,6 +655,9 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, struct fuse_conn *fc = get_fuse_conn(dir); struct dentry *res = NULL; + if (fuse_is_bad(dir)) + return -EIO; + if (d_in_lookup(entry)) { res = fuse_lookup(dir, entry, 0); if (IS_ERR(res)) @@ -547,7 +676,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, if (fc->no_create) goto mknod; - err = fuse_create_open(dir, entry, file, flags, mode); + err = fuse_create_open(dir, entry, file, flags, mode, FUSE_CREATE); if (err == -ENOSYS) { fc->no_create = 1; goto mknod; @@ -557,7 +686,7 @@ out_dput: return err; mknod: - err = fuse_mknod(dir, entry, mode, 0); + err = fuse_mknod(&init_user_ns, dir, entry, mode, 0); if (err) goto out_dput; no_open: @@ -567,7 +696,7 @@ no_open: /* * Code shared between mknod, mkdir, symlink and link */ -static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, +static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, struct inode *dir, struct dentry *entry, umode_t mode) { @@ -576,6 +705,11 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, struct dentry *d; int err; struct fuse_forget_link *forget; + void *security_ctx = NULL; + u32 security_ctxlen; + + if (fuse_is_bad(dir)) + return -EIO; forget = fuse_alloc_forget(); if (!forget) @@ -586,7 +720,22 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, args->out_numargs = 1; args->out_args[0].size = sizeof(outarg); args->out_args[0].value = &outarg; - err = fuse_simple_request(fc, args); + + if (fm->fc->init_security && args->opcode != FUSE_LINK) { + err = get_security_context(entry, mode, &security_ctx, + &security_ctxlen); + if (err) + goto out_put_forget_req; + + BUG_ON(args->in_numargs != 2); + + args->in_numargs = 3; + args->in_args[2].size = security_ctxlen; + args->in_args[2].value = security_ctx; + } + + err = fuse_simple_request(fm, args); + kfree(security_ctx); if (err) goto out_put_forget_req; @@ -600,7 +749,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, &outarg.attr, entry_attr_timeout(&outarg), 0); if (!inode) { - fuse_queue_forget(fc, forget, outarg.nodeid, 1); + fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1); return -ENOMEM; } kfree(forget); @@ -624,14 +773,14 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, return err; } -static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, - dev_t rdev) +static int fuse_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *entry, umode_t mode, dev_t rdev) { struct fuse_mknod_in inarg; - struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); - if (!fc->dont_mask) + if (!fm->fc->dont_mask) mode &= ~current_umask(); memset(&inarg, 0, sizeof(inarg)); @@ -644,22 +793,40 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, args.in_args[0].value = &inarg; args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = 
entry->d_name.name; - return create_new_entry(fc, &args, dir, entry, mode); + return create_new_entry(fm, &args, dir, entry, mode); } -static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode, - bool excl) +static int fuse_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *entry, umode_t mode, bool excl) { - return fuse_mknod(dir, entry, mode, 0); + return fuse_mknod(&init_user_ns, dir, entry, mode, 0); } -static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) +static int fuse_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct file *file, umode_t mode) { - struct fuse_mkdir_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); + int err; + + if (fc->no_tmpfile) + return -EOPNOTSUPP; + + err = fuse_create_open(dir, file->f_path.dentry, file, file->f_flags, mode, FUSE_TMPFILE); + if (err == -ENOSYS) { + fc->no_tmpfile = 1; + err = -EOPNOTSUPP; + } + return err; +} + +static int fuse_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *entry, umode_t mode) +{ + struct fuse_mkdir_in inarg; + struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); - if (!fc->dont_mask) + if (!fm->fc->dont_mask) mode &= ~current_umask(); memset(&inarg, 0, sizeof(inarg)); @@ -671,13 +838,13 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) args.in_args[0].value = &inarg; args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; - return create_new_entry(fc, &args, dir, entry, S_IFDIR); + return create_new_entry(fm, &args, dir, entry, S_IFDIR); } -static int fuse_symlink(struct inode *dir, struct dentry *entry, - const char *link) +static int fuse_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *entry, const char *link) { - struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_mount *fm = get_fuse_mount(dir); unsigned len = strlen(link) + 1; FUSE_ARGS(args); @@ -687,48 +854,72 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, args.in_args[0].value = entry->d_name.name; args.in_args[1].size = len; args.in_args[1].value = link; - return create_new_entry(fc, &args, dir, entry, S_IFLNK); + return create_new_entry(fm, &args, dir, entry, S_IFLNK); } -void fuse_update_ctime(struct inode *inode) +void fuse_flush_time_update(struct inode *inode) +{ + int err = sync_inode_metadata(inode, 1); + + mapping_set_error(inode->i_mapping, err); +} + +static void fuse_update_ctime_in_cache(struct inode *inode) { if (!IS_NOCMTIME(inode)) { inode->i_ctime = current_time(inode); mark_inode_dirty_sync(inode); + fuse_flush_time_update(inode); } } +void fuse_update_ctime(struct inode *inode) +{ + fuse_invalidate_attr_mask(inode, STATX_CTIME); + fuse_update_ctime_in_cache(inode); +} + +static void fuse_entry_unlinked(struct dentry *entry) +{ + struct inode *inode = d_inode(entry); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); + /* + * If i_nlink == 0 then unlink doesn't make sense, yet this can + * happen if userspace filesystem is careless. 
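The new fuse_tmpfile() above uses the same capability-probing idiom as no_create, no_access and the other no_* flags: try the opcode once and, if the server answers -ENOSYS, latch a flag so the operation is refused locally from then on. The idiom in isolation (self-contained, names invented, negative errno returns as in the kernel):

#include <errno.h>
#include <stdbool.h>

struct op_state {
	bool unsupported;	/* sticky: set after the first -ENOSYS */
};

static int try_op(struct op_state *st, int (*send_to_server)(void *), void *arg)
{
	int err;

	if (st->unsupported)
		return -EOPNOTSUPP;	/* don't ask the server again */

	err = send_to_server(arg);
	if (err == -ENOSYS) {		/* server has no handler for this opcode */
		st->unsupported = true;
		err = -EOPNOTSUPP;
	}
	return err;
}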
It would be + * difficult to enforce correct nlink usage so just ignore this + * condition here + */ + if (S_ISDIR(inode->i_mode)) + clear_nlink(inode); + else if (inode->i_nlink > 0) + drop_nlink(inode); + spin_unlock(&fi->lock); + fuse_invalidate_entry_cache(entry); + fuse_update_ctime(inode); +} + static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; - struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); + if (fuse_is_bad(dir)) + return -EIO; + args.opcode = FUSE_UNLINK; args.nodeid = get_node_id(dir); args.in_numargs = 1; args.in_args[0].size = entry->d_name.len + 1; args.in_args[0].value = entry->d_name.name; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (!err) { - struct inode *inode = d_inode(entry); - struct fuse_inode *fi = get_fuse_inode(inode); - - spin_lock(&fi->lock); - fi->attr_version = atomic64_inc_return(&fc->attr_version); - /* - * If i_nlink == 0 then unlink doesn't make sense, yet this can - * happen if userspace filesystem is careless. It would be - * difficult to enforce correct nlink usage so just ignore this - * condition here - */ - if (inode->i_nlink > 0) - drop_nlink(inode); - spin_unlock(&fi->lock); - fuse_invalidate_attr(inode); fuse_dir_changed(dir); - fuse_invalidate_entry_cache(entry); - fuse_update_ctime(inode); + fuse_entry_unlinked(entry); } else if (err == -EINTR) fuse_invalidate_entry(entry); return err; @@ -737,19 +928,21 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) static int fuse_rmdir(struct inode *dir, struct dentry *entry) { int err; - struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); + if (fuse_is_bad(dir)) + return -EIO; + args.opcode = FUSE_RMDIR; args.nodeid = get_node_id(dir); args.in_numargs = 1; args.in_args[0].size = entry->d_name.len + 1; args.in_args[0].value = entry->d_name.name; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (!err) { - clear_nlink(d_inode(entry)); fuse_dir_changed(dir); - fuse_invalidate_entry_cache(entry); + fuse_entry_unlinked(entry); } else if (err == -EINTR) fuse_invalidate_entry(entry); return err; @@ -761,7 +954,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, { int err; struct fuse_rename2_in inarg; - struct fuse_conn *fc = get_fuse_conn(olddir); + struct fuse_mount *fm = get_fuse_mount(olddir); FUSE_ARGS(args); memset(&inarg, 0, argsize); @@ -776,27 +969,21 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, args.in_args[1].value = oldent->d_name.name; args.in_args[2].size = newent->d_name.len + 1; args.in_args[2].value = newent->d_name.name; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (!err) { /* ctime changes */ - fuse_invalidate_attr(d_inode(oldent)); fuse_update_ctime(d_inode(oldent)); - if (flags & RENAME_EXCHANGE) { - fuse_invalidate_attr(d_inode(newent)); + if (flags & RENAME_EXCHANGE) fuse_update_ctime(d_inode(newent)); - } fuse_dir_changed(olddir); if (olddir != newdir) fuse_dir_changed(newdir); /* newent will end up negative */ - if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) { - fuse_invalidate_attr(d_inode(newent)); - fuse_invalidate_entry_cache(newent); - fuse_update_ctime(d_inode(newent)); - } + if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) + fuse_entry_unlinked(newent); } else if (err == -EINTR) { /* If request was interrupted, DEITY only knows 
if the rename actually took place. If the invalidation @@ -811,13 +998,16 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, return err; } -static int fuse_rename2(struct inode *olddir, struct dentry *oldent, - struct inode *newdir, struct dentry *newent, - unsigned int flags) +static int fuse_rename2(struct user_namespace *mnt_userns, struct inode *olddir, + struct dentry *oldent, struct inode *newdir, + struct dentry *newent, unsigned int flags) { struct fuse_conn *fc = get_fuse_conn(olddir); int err; + if (fuse_is_bad(olddir)) + return -EIO; + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; @@ -847,7 +1037,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, int err; struct fuse_link_in inarg; struct inode *inode = d_inode(entry); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); memset(&inarg, 0, sizeof(inarg)); @@ -858,26 +1048,12 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, args.in_args[0].value = &inarg; args.in_args[1].size = newent->d_name.len + 1; args.in_args[1].value = newent->d_name.name; - err = create_new_entry(fc, &args, newdir, newent, inode->i_mode); - /* Contrary to "normal" filesystems it can happen that link - makes two "logical" inodes point to the same "physical" - inode. We invalidate the attributes of the old one, so it - will reflect changes in the backing inode (link count, - etc.) - */ - if (!err) { - struct fuse_inode *fi = get_fuse_inode(inode); - - spin_lock(&fi->lock); - fi->attr_version = atomic64_inc_return(&fc->attr_version); - if (likely(inode->i_nlink < UINT_MAX)) - inc_nlink(inode); - spin_unlock(&fi->lock); - fuse_invalidate_attr(inode); - fuse_update_ctime(inode); - } else if (err == -EINTR) { + err = create_new_entry(fm, &args, newdir, newent, inode->i_mode); + if (!err) + fuse_update_ctime_in_cache(inode); + else if (err == -EINTR) fuse_invalidate_attr(inode); - } + return err; } @@ -887,15 +1063,6 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, unsigned int blkbits; struct fuse_conn *fc = get_fuse_conn(inode); - /* see the comment in fuse_change_attributes() */ - if (fc->writeback_cache && S_ISREG(inode->i_mode)) { - attr->size = i_size_read(inode); - attr->mtime = inode->i_mtime.tv_sec; - attr->mtimensec = inode->i_mtime.tv_nsec; - attr->ctime = inode->i_ctime.tv_sec; - attr->ctimensec = inode->i_ctime.tv_nsec; - } - stat->dev = inode->i_sb->s_dev; stat->ino = attr->ino; stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); @@ -926,11 +1093,11 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, int err; struct fuse_getattr_in inarg; struct fuse_attr_out outarg; - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); u64 attr_version; - attr_version = fuse_get_attr_version(fc); + attr_version = fuse_get_attr_version(fm->fc); memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); @@ -949,11 +1116,11 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (!err) { if (fuse_invalid_attr(&outarg.attr) || - (inode->i_mode ^ outarg.attr.mode) & S_IFMT) { - make_bad_inode(inode); + inode_wrong_type(inode, outarg.attr.mode)) { + fuse_make_bad(inode); err = -EIO; } else { 
fuse_change_attributes(inode, &outarg.attr, @@ -973,12 +1140,14 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, struct fuse_inode *fi = get_fuse_inode(inode); int err = 0; bool sync; + u32 inval_mask = READ_ONCE(fi->inval_mask); + u32 cache_mask = fuse_get_cache_mask(inode); if (flags & AT_STATX_FORCE_SYNC) sync = true; else if (flags & AT_STATX_DONT_SYNC) sync = false; - else if (request_mask & READ_ONCE(fi->inval_mask)) + else if (request_mask & inval_mask & ~cache_mask) sync = true; else sync = time_before64(fi->i_time, get_jiffies_64()); @@ -987,7 +1156,7 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, forget_all_cached_acls(inode); err = fuse_do_getattr(inode, stat, file); } else if (stat) { - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); stat->mode = fi->orig_i_mode; stat->ino = fi->orig_ino; } @@ -995,14 +1164,12 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, return err; } -int fuse_update_attributes(struct inode *inode, struct file *file) +int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask) { - /* Do *not* need to get atime for internal purposes */ - return fuse_update_get_attr(inode, file, NULL, - STATX_BASIC_STATS & ~STATX_ATIME, 0); + return fuse_update_get_attr(inode, file, NULL, mask, 0); } -int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, +int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, u64 child_nodeid, struct qstr *name) { int err = -ENOTDIR; @@ -1010,11 +1177,11 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, struct dentry *dir; struct dentry *entry; - parent = ilookup5(sb, parent_nodeid, fuse_inode_eq, &parent_nodeid); + parent = fuse_ilookup(fc, parent_nodeid, NULL); if (!parent) return -ENOENT; - inode_lock(parent); + inode_lock_nested(parent, I_MUTEX_PARENT); if (!S_ISDIR(parent->i_mode)) goto unlock; @@ -1085,6 +1252,9 @@ int fuse_allow_current_process(struct fuse_conn *fc) { const struct cred *cred; + if (allow_sys_admin_access && capable(CAP_SYS_ADMIN)) + return 1; + if (fc->allow_other) return current_in_userns(fc->user_ns); @@ -1102,14 +1272,14 @@ int fuse_allow_current_process(struct fuse_conn *fc) static int fuse_access(struct inode *inode, int mask) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_access_in inarg; int err; BUG_ON(mask & MAY_NOT_BLOCK); - if (fc->no_access) + if (fm->fc->no_access) return 0; memset(&inarg, 0, sizeof(inarg)); @@ -1119,9 +1289,9 @@ static int fuse_access(struct inode *inode, int mask) args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { - fc->no_access = 1; + fm->fc->no_access = 1; err = 0; } return err; @@ -1149,12 +1319,16 @@ static int fuse_perm_getattr(struct inode *inode, int mask) * access request is sent. Execute permission is still checked * locally based on file mode. 
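The fuse_update_get_attr() change above decides whether stat()/statx() can be served from cached attributes or needs a FUSE_GETATTR round trip: a forced sync, requested bits that are both invalidated and not maintained by the local cache, or an expired attribute timeout all trigger the request. A compact standalone restatement of that decision (the helper name and the jiffies stand-in are invented, the masks mirror the hunk):

#define _GNU_SOURCE
#include <fcntl.h>		/* AT_STATX_FORCE_SYNC, AT_STATX_DONT_SYNC */
#include <stdbool.h>
#include <stdint.h>

static bool need_remote_getattr(uint32_t request_mask, uint32_t inval_mask,
				uint32_t cache_mask, uint64_t attr_valid_until,
				uint64_t now, unsigned int flags)
{
	if (flags & AT_STATX_FORCE_SYNC)
		return true;			/* caller insists on fresh attributes */
	if (flags & AT_STATX_DONT_SYNC)
		return false;			/* caller accepts whatever is cached */
	if (request_mask & inval_mask & ~cache_mask)
		return true;			/* wanted fields were invalidated and are
						 * not kept authoritative locally */
	return now >= attr_valid_until;		/* attribute timeout expired */
}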
*/ -static int fuse_permission(struct inode *inode, int mask) +static int fuse_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { struct fuse_conn *fc = get_fuse_conn(inode); bool refreshed = false; int err = 0; + if (fuse_is_bad(inode)) + return -EIO; + if (!fuse_allow_current_process(fc)) return -EACCES; @@ -1177,7 +1351,7 @@ static int fuse_permission(struct inode *inode, int mask) } if (fc->default_permissions) { - err = generic_permission(inode, mask); + err = generic_permission(&init_user_ns, inode, mask); /* If permission is denied, try to refresh file attributes. This is also needed, because the root @@ -1185,7 +1359,8 @@ static int fuse_permission(struct inode *inode, int mask) if (err == -EACCES && !refreshed) { err = fuse_perm_getattr(inode, mask); if (!err) - err = generic_permission(inode, mask); + err = generic_permission(&init_user_ns, + inode, mask); } /* Note: the opposite of the above test does not @@ -1209,7 +1384,7 @@ static int fuse_permission(struct inode *inode, int mask) static int fuse_readlink_page(struct inode *inode, struct page *page) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_page_desc desc = { .length = PAGE_SIZE - 1 }; struct fuse_args_pages ap = { .num_pages = 1, @@ -1226,7 +1401,7 @@ static int fuse_readlink_page(struct inode *inode, struct page *page) ap.args.page_zeroing = true; ap.args.out_numargs = 1; ap.args.out_args[0].size = desc.length; - res = fuse_simple_request(fc, &ap.args); + res = fuse_simple_request(fm, &ap.args); fuse_invalidate_atime(inode); @@ -1250,7 +1425,7 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, int err; err = -EIO; - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) goto out_err; if (fc->cache_symlinks) @@ -1298,7 +1473,7 @@ static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end, struct fuse_conn *fc = get_fuse_conn(inode); int err; - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) return -EIO; if (fc->no_fsyncdir) @@ -1454,7 +1629,7 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args, */ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_setattr_in inarg; struct fuse_attr_out outarg; @@ -1465,7 +1640,7 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) inarg.valid = FATTR_MTIME; inarg.mtime = inode->i_mtime.tv_sec; inarg.mtimensec = inode->i_mtime.tv_nsec; - if (fc->minor >= 23) { + if (fm->fc->minor >= 23) { inarg.valid |= FATTR_CTIME; inarg.ctime = inode->i_ctime.tv_sec; inarg.ctimensec = inode->i_ctime.tv_nsec; @@ -1474,9 +1649,9 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) inarg.valid |= FATTR_FH; inarg.fh = ff->fh; } - fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); + fuse_setattr_fill(fm->fc, &args, inode, &inarg, &outarg); - return fuse_simple_request(fc, &args); + return fuse_simple_request(fm, &args); } /* @@ -1491,24 +1666,43 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, struct file *file) { struct inode *inode = d_inode(dentry); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_conn *fc = fm->fc; struct fuse_inode *fi = get_fuse_inode(inode); + struct address_space *mapping = inode->i_mapping; FUSE_ARGS(args); struct fuse_setattr_in inarg; struct fuse_attr_out outarg; bool is_truncate 
= false; - bool is_wb = fc->writeback_cache; + bool is_wb = fc->writeback_cache && S_ISREG(inode->i_mode); loff_t oldsize; int err; - bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode); + bool trust_local_cmtime = is_wb; + bool fault_blocked = false; if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; - err = setattr_prepare(dentry, attr); + err = setattr_prepare(&init_user_ns, dentry, attr); if (err) return err; + if (attr->ia_valid & ATTR_SIZE) { + if (WARN_ON(!S_ISREG(inode->i_mode))) + return -EIO; + is_truncate = true; + } + + if (FUSE_IS_DAX(inode) && is_truncate) { + filemap_invalidate_lock(mapping); + fault_blocked = true; + err = fuse_dax_break_layouts(inode, 0, 0); + if (err) { + filemap_invalidate_unlock(mapping); + return err; + } + } + if (attr->ia_valid & ATTR_OPEN) { /* This is coming from open(..., ... | O_TRUNC); */ WARN_ON(!(attr->ia_valid & ATTR_SIZE)); @@ -1521,19 +1715,13 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, */ i_size_write(inode, 0); truncate_pagecache(inode, 0); - return 0; + goto out; } file = NULL; } - if (attr->ia_valid & ATTR_SIZE) { - if (WARN_ON(!S_ISREG(inode->i_mode))) - return -EIO; - is_truncate = true; - } - /* Flush dirty data/metadata before non-truncate SETATTR */ - if (is_wb && S_ISREG(inode->i_mode) && + if (is_wb && attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_MTIME_SET | ATTR_TIMES_SET)) { @@ -1560,13 +1748,23 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, inarg.valid |= FATTR_FH; inarg.fh = ff->fh; } + + /* Kill suid/sgid for non-directory chown unconditionally */ + if (fc->handle_killpriv_v2 && !S_ISDIR(inode->i_mode) && + attr->ia_valid & (ATTR_UID | ATTR_GID)) + inarg.valid |= FATTR_KILL_SUIDGID; + if (attr->ia_valid & ATTR_SIZE) { /* For mandatory locking in truncate */ inarg.valid |= FATTR_LOCKOWNER; inarg.lock_owner = fuse_lock_owner_id(fc, current->files); + + /* Kill suid/sgid for truncate only if no CAP_FSETID */ + if (fc->handle_killpriv_v2 && !capable(CAP_FSETID)) + inarg.valid |= FATTR_KILL_SUIDGID; } fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err) { if (err == -EINTR) fuse_invalidate_attr(inode); @@ -1574,8 +1772,8 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, } if (fuse_invalid_attr(&outarg.attr) || - (inode->i_mode ^ outarg.attr.mode) & S_IFMT) { - make_bad_inode(inode); + inode_wrong_type(inode, outarg.attr.mode)) { + fuse_make_bad(inode); err = -EIO; goto error; } @@ -1591,10 +1789,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, } fuse_change_attributes_common(inode, &outarg.attr, - attr_timeout(&outarg)); + attr_timeout(&outarg), + fuse_get_cache_mask(inode)); oldsize = inode->i_size; /* see the comment in fuse_change_attributes() */ - if (!is_wb || is_truncate || !S_ISREG(inode->i_mode)) + if (!is_wb || is_truncate) i_size_write(inode, outarg.attr.size); if (is_truncate) { @@ -1605,15 +1804,19 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, /* * Only call invalidate_inode_pages2() after removing - * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. + * FUSE_NOWRITE, otherwise fuse_launder_folio() would deadlock. 
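The FATTR_KILL_SUIDGID handling added above is easy to lose in the hunk: with handle_killpriv_v2 the client asks the server to clear suid/sgid on a chown of a non-directory unconditionally, and on a truncate only when the caller lacks CAP_FSETID. The decision on its own (the IA_* bits are local stand-ins for the kernel-internal ATTR_* flags; FATTR_KILL_SUIDGID is from the FUSE uapi header):

#include <stdbool.h>
#include <stdint.h>
#include <linux/fuse.h>		/* FATTR_KILL_SUIDGID */

/* Local stand-ins for the kernel's ATTR_* iattr bits. */
#define IA_UID	0x1u
#define IA_GID	0x2u
#define IA_SIZE	0x4u

static uint32_t killpriv_v2_bits(bool handle_killpriv_v2, bool is_dir,
				 unsigned int ia_valid, bool has_cap_fsetid)
{
	if (!handle_killpriv_v2)
		return 0;
	if (!is_dir && (ia_valid & (IA_UID | IA_GID)))
		return FATTR_KILL_SUIDGID;	/* chown: always drop suid/sgid */
	if ((ia_valid & IA_SIZE) && !has_cap_fsetid)
		return FATTR_KILL_SUIDGID;	/* truncate without CAP_FSETID */
	return 0;
}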
*/ if ((is_truncate || !is_wb) && S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { truncate_pagecache(inode, outarg.attr.size); - invalidate_inode_pages2(inode->i_mapping); + invalidate_inode_pages2(mapping); } clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); +out: + if (fault_blocked) + filemap_invalidate_unlock(mapping); + return 0; error: @@ -1621,16 +1824,23 @@ error: fuse_release_nowrite(inode); clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + + if (fault_blocked) + filemap_invalidate_unlock(mapping); return err; } -static int fuse_setattr(struct dentry *entry, struct iattr *attr) +static int fuse_setattr(struct user_namespace *mnt_userns, struct dentry *entry, + struct iattr *attr) { struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); struct file *file = (attr->ia_valid & ATTR_FILE) ? attr->ia_file : NULL; int ret; + if (fuse_is_bad(inode)) + return -EIO; + if (!fuse_allow_current_process(get_fuse_conn(inode))) return -EACCES; @@ -1644,7 +1854,7 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr) * * This should be done on write(), truncate() and chown(). */ - if (!fc->handle_killpriv) { + if (!fc->handle_killpriv && !fc->handle_killpriv_v2) { /* * ia_mode calculation may have used stale i_mode. * Refresh and recalculate. @@ -1683,14 +1893,28 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr) return ret; } -static int fuse_getattr(const struct path *path, struct kstat *stat, +static int fuse_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); struct fuse_conn *fc = get_fuse_conn(inode); - if (!fuse_allow_current_process(fc)) + if (fuse_is_bad(inode)) + return -EIO; + + if (!fuse_allow_current_process(fc)) { + if (!request_mask) { + /* + * If user explicitly requested *nothing* then don't + * error out, but return st_dev only. 
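The fuse_getattr() special case described just above (and implemented in the lines that follow) means a process that allow_other would normally lock out can still issue a statx() that requests no fields and learn the device number of the mount. A userspace probe of that behaviour (the path is hypothetical):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	struct statx stx;

	/* Ask for zero attribute fields; on a FUSE mount owned by another
	 * user this is expected to succeed and report st_dev only. */
	if (statx(AT_FDCWD, "/mnt/otherfuse/file", 0, 0, &stx) != 0) {
		perror("statx");
		return 1;
	}
	printf("dev=%u:%u returned_mask=%#x\n",
	       stx.stx_dev_major, stx.stx_dev_minor, stx.stx_mask);
	return 0;
}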
+ */ + stat->result_mask = 0; + stat->dev = inode->i_sb->s_dev; + return 0; + } return -EACCES; + } return fuse_update_get_attr(inode, NULL, stat, request_mask, flags); } @@ -1706,12 +1930,15 @@ static const struct inode_operations fuse_dir_inode_operations = { .setattr = fuse_setattr, .create = fuse_create, .atomic_open = fuse_atomic_open, + .tmpfile = fuse_tmpfile, .mknod = fuse_mknod, .permission = fuse_permission, .getattr = fuse_getattr, .listxattr = fuse_listxattr, .get_acl = fuse_get_acl, .set_acl = fuse_set_acl, + .fileattr_get = fuse_fileattr_get, + .fileattr_set = fuse_fileattr_set, }; static const struct file_operations fuse_dir_operations = { @@ -1732,6 +1959,8 @@ static const struct inode_operations fuse_common_inode_operations = { .listxattr = fuse_listxattr, .get_acl = fuse_get_acl, .set_acl = fuse_set_acl, + .fileattr_get = fuse_fileattr_get, + .fileattr_set = fuse_fileattr_set, }; static const struct inode_operations fuse_symlink_inode_operations = { @@ -1760,20 +1989,20 @@ void fuse_init_dir(struct inode *inode) fi->rdc.version = 0; } -static int fuse_symlink_readpage(struct file *null, struct page *page) +static int fuse_symlink_read_folio(struct file *null, struct folio *folio) { - int err = fuse_readlink_page(page->mapping->host, page); + int err = fuse_readlink_page(folio->mapping->host, &folio->page); if (!err) - SetPageUptodate(page); + folio_mark_uptodate(folio); - unlock_page(page); + folio_unlock(folio); return err; } static const struct address_space_operations fuse_symlink_aops = { - .readpage = fuse_symlink_readpage, + .read_folio = fuse_symlink_read_folio, }; void fuse_init_symlink(struct inode *inode) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 9d67b830fb7a..71bfb663aac5 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -14,33 +14,28 @@ #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/module.h> -#include <linux/compat.h> #include <linux/swap.h> #include <linux/falloc.h> #include <linux/uio.h> +#include <linux/fs.h> -static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, - struct fuse_page_desc **desc) -{ - struct page **pages; - - pages = kzalloc(npages * (sizeof(struct page *) + - sizeof(struct fuse_page_desc)), flags); - *desc = (void *) (pages + npages); - - return pages; -} - -static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, - int opcode, struct fuse_open_out *outargp) +static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, + unsigned int open_flags, int opcode, + struct fuse_open_out *outargp) { struct fuse_open_in inarg; FUSE_ARGS(args); memset(&inarg, 0, sizeof(inarg)); - inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); - if (!fc->atomic_o_trunc) + inarg.flags = open_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); + if (!fm->fc->atomic_o_trunc) inarg.flags &= ~O_TRUNC; + + if (fm->fc->handle_killpriv_v2 && + (inarg.flags & O_TRUNC) && !capable(CAP_FSETID)) { + inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; + } + args.opcode = opcode; args.nodeid = nodeid; args.in_numargs = 1; @@ -50,7 +45,7 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, args.out_args[0].size = sizeof(*outargp); args.out_args[0].value = outargp; - return fuse_simple_request(fc, &args); + return fuse_simple_request(fm, &args); } struct fuse_release_args { @@ -59,7 +54,7 @@ struct fuse_release_args { struct inode *inode; }; -struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) +struct fuse_file *fuse_file_alloc(struct fuse_mount *fm) { struct fuse_file *ff; @@ 
-67,7 +62,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) if (unlikely(!ff)) return NULL; - ff->fc = fc; + ff->fm = fm; ff->release_args = kzalloc(sizeof(*ff->release_args), GFP_KERNEL_ACCOUNT); if (!ff->release_args) { @@ -81,7 +76,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) RB_CLEAR_NODE(&ff->polled_node); init_waitqueue_head(&ff->poll_wait); - ff->kh = atomic64_inc_return(&fc->khctr); + ff->kh = atomic64_inc_return(&fm->fc->khctr); return ff; } @@ -99,7 +94,7 @@ static struct fuse_file *fuse_file_get(struct fuse_file *ff) return ff; } -static void fuse_release_end(struct fuse_conn *fc, struct fuse_args *args, +static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args, int error) { struct fuse_release_args *ra = container_of(args, typeof(*ra), args); @@ -113,31 +108,32 @@ static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) if (refcount_dec_and_test(&ff->count)) { struct fuse_args *args = &ff->release_args->args; - if (isdir ? ff->fc->no_opendir : ff->fc->no_open) { + if (isdir ? ff->fm->fc->no_opendir : ff->fm->fc->no_open) { /* Do nothing when client does not implement 'open' */ - fuse_release_end(ff->fc, args, 0); + fuse_release_end(ff->fm, args, 0); } else if (sync) { - fuse_simple_request(ff->fc, args); - fuse_release_end(ff->fc, args, 0); + fuse_simple_request(ff->fm, args); + fuse_release_end(ff->fm, args, 0); } else { args->end = fuse_release_end; - if (fuse_simple_background(ff->fc, args, + if (fuse_simple_background(ff->fm, args, GFP_KERNEL | __GFP_NOFAIL)) - fuse_release_end(ff->fc, args, -ENOTCONN); + fuse_release_end(ff->fm, args, -ENOTCONN); } kfree(ff); } } -int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, - bool isdir) +struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, + unsigned int open_flags, bool isdir) { + struct fuse_conn *fc = fm->fc; struct fuse_file *ff; int opcode = isdir ? 
FUSE_OPENDIR : FUSE_OPEN; - ff = fuse_file_alloc(fc); + ff = fuse_file_alloc(fm); if (!ff) - return -ENOMEM; + return ERR_PTR(-ENOMEM); ff->fh = 0; /* Default for no-open */ @@ -146,14 +142,14 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, struct fuse_open_out outarg; int err; - err = fuse_send_open(fc, nodeid, file, opcode, &outarg); + err = fuse_send_open(fm, nodeid, open_flags, opcode, &outarg); if (!err) { ff->fh = outarg.fh; ff->open_flags = outarg.open_flags; } else if (err != -ENOSYS) { fuse_file_free(ff); - return err; + return ERR_PTR(err); } else { if (isdir) fc->no_opendir = 1; @@ -166,9 +162,19 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, ff->open_flags &= ~FOPEN_DIRECT_IO; ff->nodeid = nodeid; - file->private_data = ff; - return 0; + return ff; +} + +int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, + bool isdir) +{ + struct fuse_file *ff = fuse_file_open(fm, nodeid, file->f_flags, isdir); + + if (!IS_ERR(ff)) + file->private_data = ff; + + return PTR_ERR_OR_ZERO(ff); } EXPORT_SYMBOL_GPL(fuse_do_open); @@ -192,12 +198,11 @@ void fuse_finish_open(struct inode *inode, struct file *file) struct fuse_file *ff = file->private_data; struct fuse_conn *fc = get_fuse_conn(inode); - if (!(ff->open_flags & FOPEN_KEEP_CACHE)) - invalidate_inode_pages2(inode->i_mapping); if (ff->open_flags & FOPEN_STREAM) stream_open(inode, file); else if (ff->open_flags & FOPEN_NONSEEKABLE) nonseekable_open(inode, file); + if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { struct fuse_inode *fi = get_fuse_inode(inode); @@ -205,9 +210,8 @@ void fuse_finish_open(struct inode *inode, struct file *file) fi->attr_version = atomic64_inc_return(&fc->attr_version); i_size_write(inode, 0); spin_unlock(&fi->lock); - fuse_invalidate_attr(inode); - if (fc->writeback_cache) - file_update_time(file); + file_update_time(file); + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); } if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) fuse_link_write_file(file); @@ -215,38 +219,62 @@ void fuse_finish_open(struct inode *inode, struct file *file) int fuse_open_common(struct inode *inode, struct file *file, bool isdir) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_conn *fc = fm->fc; int err; bool is_wb_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc && fc->writeback_cache; + bool dax_truncate = (file->f_flags & O_TRUNC) && + fc->atomic_o_trunc && FUSE_IS_DAX(inode); + + if (fuse_is_bad(inode)) + return -EIO; err = generic_file_open(inode, file); if (err) return err; - if (is_wb_truncate) { + if (is_wb_truncate || dax_truncate) inode_lock(inode); - fuse_set_nowrite(inode); + + if (dax_truncate) { + filemap_invalidate_lock(inode->i_mapping); + err = fuse_dax_break_layouts(inode, 0, 0); + if (err) + goto out_inode_unlock; } - err = fuse_do_open(fc, get_node_id(inode), file, isdir); + if (is_wb_truncate || dax_truncate) + fuse_set_nowrite(inode); + err = fuse_do_open(fm, get_node_id(inode), file, isdir); if (!err) fuse_finish_open(inode, file); - if (is_wb_truncate) { + if (is_wb_truncate || dax_truncate) fuse_release_nowrite(inode); - inode_unlock(inode); + if (!err) { + struct fuse_file *ff = file->private_data; + + if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) + truncate_pagecache(inode, 0); + else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); } + if (dax_truncate) + filemap_invalidate_unlock(inode->i_mapping); 
+out_inode_unlock: + if (is_wb_truncate || dax_truncate) + inode_unlock(inode); return err; } static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, - int flags, int opcode) + unsigned int flags, int opcode) { - struct fuse_conn *fc = ff->fc; + struct fuse_conn *fc = ff->fm->fc; struct fuse_release_args *ra = ff->release_args; /* Inode is NULL on error path of fuse_create_open() */ @@ -273,22 +301,21 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, ra->args.nocreds = true; } -void fuse_release_common(struct file *file, bool isdir) +void fuse_file_release(struct inode *inode, struct fuse_file *ff, + unsigned int open_flags, fl_owner_t id, bool isdir) { - struct fuse_inode *fi = get_fuse_inode(file_inode(file)); - struct fuse_file *ff = file->private_data; + struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_release_args *ra = ff->release_args; int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; - fuse_prepare_release(fi, ff, file->f_flags, opcode); + fuse_prepare_release(fi, ff, open_flags, opcode); if (ff->flock) { ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; - ra->inarg.lock_owner = fuse_lock_owner_id(ff->fc, - (fl_owner_t) file); + ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id); } /* Hold inode until release is finished */ - ra->inode = igrab(file_inode(file)); + ra->inode = igrab(inode); /* * Normally this will send the RELEASE request, however if @@ -299,7 +326,13 @@ void fuse_release_common(struct file *file, bool isdir) * synchronous RELEASE is allowed (and desirable) in this case * because the server can be trusted not to screw up. */ - fuse_file_put(ff, ff->fc->destroy, isdir); + fuse_file_put(ff, ff->fm->fc->destroy, isdir); +} + +void fuse_release_common(struct file *file, bool isdir) +{ + fuse_file_release(file_inode(file), file->private_data, file->f_flags, + (fl_owner_t) file, isdir); } static int fuse_open(struct inode *inode, struct file *file) @@ -311,7 +344,10 @@ static int fuse_release(struct inode *inode, struct file *file) { struct fuse_conn *fc = get_fuse_conn(inode); - /* see fuse_vma_close() for !writeback_cache case */ + /* + * Dirty pages might remain despite write_inode_now() call from + * fuse_flush() due to writes racing with the close. 
+ */ if (fc->writeback_cache) write_inode_now(inode, 1); @@ -321,7 +357,8 @@ static int fuse_release(struct inode *inode, struct file *file) return 0; } -void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags) +void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, + unsigned int flags) { WARN_ON(refcount_read(&ff->count) > 1); fuse_prepare_release(fi, ff, flags, FUSE_RELEASE); @@ -357,26 +394,33 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) struct fuse_writepage_args { struct fuse_io_args ia; - struct list_head writepages_entry; + struct rb_node writepages_entry; struct list_head queue_entry; struct fuse_writepage_args *next; struct inode *inode; + struct fuse_sync_bucket *bucket; }; static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, pgoff_t idx_from, pgoff_t idx_to) { - struct fuse_writepage_args *wpa; + struct rb_node *n; + + n = fi->writepages.rb_node; - list_for_each_entry(wpa, &fi->writepages, writepages_entry) { + while (n) { + struct fuse_writepage_args *wpa; pgoff_t curr_index; + wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry); WARN_ON(get_fuse_inode(wpa->inode) != fi); curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT; - if (idx_from < curr_index + wpa->ia.ap.num_pages && - curr_index <= idx_to) { + if (idx_from >= curr_index + wpa->ia.ap.num_pages) + n = n->rb_right; + else if (idx_to < curr_index) + n = n->rb_left; + else return wpa; - } } return NULL; } @@ -436,16 +480,16 @@ static void fuse_sync_writes(struct inode *inode) static int fuse_flush(struct file *file, fl_owner_t id) { struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_file *ff = file->private_data; struct fuse_flush_in inarg; FUSE_ARGS(args); int err; - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) return -EIO; - if (fc->no_flush) + if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache) return 0; err = write_inode_now(inode, 1); @@ -460,9 +504,13 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (err) return err; + err = 0; + if (fm->fc->no_flush) + goto inval_attr_out; + memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; - inarg.lock_owner = fuse_lock_owner_id(fc, id); + inarg.lock_owner = fuse_lock_owner_id(fm->fc, id); args.opcode = FUSE_FLUSH; args.nodeid = get_node_id(inode); args.in_numargs = 1; @@ -470,11 +518,19 @@ static int fuse_flush(struct file *file, fl_owner_t id) args.in_args[0].value = &inarg; args.force = true; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { - fc->no_flush = 1; + fm->fc->no_flush = 1; err = 0; } + +inval_attr_out: + /* + * In memory i_blocks is not maintained by fuse, if writeback cache is + * enabled, i_blocks from cached attr may not be accurate. 
+ */ + if (!err && fm->fc->writeback_cache) + fuse_invalidate_attr_mask(inode, STATX_BLOCKS); return err; } @@ -482,7 +538,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, int datasync, int opcode) { struct inode *inode = file->f_mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_file *ff = file->private_data; FUSE_ARGS(args); struct fuse_fsync_in inarg; @@ -495,7 +551,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; - return fuse_simple_request(fc, &args); + return fuse_simple_request(fm, &args); } static int fuse_fsync(struct file *file, loff_t start, loff_t end, @@ -505,7 +561,7 @@ static int fuse_fsync(struct file *file, loff_t start, loff_t end, struct fuse_conn *fc = get_fuse_conn(inode); int err; - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) return -EIO; inode_lock(inode); @@ -602,7 +658,7 @@ static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io) * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1. * * An example: - * User requested DIO read of 64K. It was splitted into two 32K fuse requests, + * User requested DIO read of 64K. It was split into two 32K fuse requests, * both submitted asynchronously. The first of them was ACKed by userspace as * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The * second request was ACKed as short, e.g. only 1K was read, resulting in @@ -640,7 +696,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) spin_unlock(&fi->lock); } - io->iocb->ki_complete(io->iocb, res, 0); + io->iocb->ki_complete(io->iocb, res); } kref_put(&io->refcnt, fuse_io_release); @@ -670,7 +726,7 @@ static void fuse_io_free(struct fuse_io_args *ia) kfree(ia); } -static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args, +static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args, int err) { struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); @@ -699,7 +755,7 @@ static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args, fuse_io_free(ia); } -static ssize_t fuse_async_req_send(struct fuse_conn *fc, +static ssize_t fuse_async_req_send(struct fuse_mount *fm, struct fuse_io_args *ia, size_t num_bytes) { ssize_t err; @@ -712,9 +768,10 @@ static ssize_t fuse_async_req_send(struct fuse_conn *fc, spin_unlock(&io->lock); ia->ap.args.end = fuse_aio_complete_req; - err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL); + ia->ap.args.may_block = io->should_dirty; + err = fuse_simple_background(fm, &ia->ap.args, GFP_KERNEL); if (err) - fuse_aio_complete_req(fc, &ia->ap.args, err); + fuse_aio_complete_req(fm, &ia->ap.args, err); return num_bytes; } @@ -724,18 +781,18 @@ static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count, { struct file *file = ia->io->iocb->ki_filp; struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; fuse_read_args_fill(ia, file, pos, count, FUSE_READ); if (owner != NULL) { ia->read.in.read_flags |= FUSE_READ_LOCKOWNER; - ia->read.in.lock_owner = fuse_lock_owner_id(fc, owner); + ia->read.in.lock_owner = fuse_lock_owner_id(fm->fc, owner); } if (ia->io->async) - return fuse_async_req_send(fc, ia, count); + return fuse_async_req_send(fm, ia, count); - return fuse_simple_request(fc, &ia->ap.args); + return fuse_simple_request(fm, 
&ia->ap.args); } static void fuse_read_update_size(struct inode *inode, loff_t size, @@ -745,7 +802,7 @@ static void fuse_read_update_size(struct inode *inode, loff_t size, struct fuse_inode *fi = get_fuse_inode(inode); spin_lock(&fi->lock); - if (attr_ver == fi->attr_version && size < inode->i_size && + if (attr_ver >= fi->attr_version && size < inode->i_size && !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { fi->attr_version = atomic64_inc_return(&fc->attr_version); i_size_write(inode, size); @@ -758,21 +815,12 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, { struct fuse_conn *fc = get_fuse_conn(inode); - if (fc->writeback_cache) { - /* - * A hole in a file. Some data after the hole are in page cache, - * but have not reached the client fs yet. So, the hole is not - * present there. - */ - int i; - int start_idx = num_read >> PAGE_SHIFT; - size_t off = num_read & (PAGE_SIZE - 1); - - for (i = start_idx; i < ap->num_pages; i++) { - zero_user_segment(ap->pages[i], off, PAGE_SIZE); - off = 0; - } - } else { + /* + * If writeback_cache is enabled, a short read means there's a hole in + * the file. Some data after the hole is in page cache, but has not + * reached the client fs yet. So the hole is not present there. + */ + if (!fc->writeback_cache) { loff_t pos = page_offset(ap->pages[0]) + num_read; fuse_read_update_size(inode, pos, attr_ver); } @@ -781,7 +829,7 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, static int fuse_do_readpage(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); loff_t pos = page_offset(page); struct fuse_page_desc desc = { .length = PAGE_SIZE }; struct fuse_io_args ia = { @@ -801,14 +849,14 @@ static int fuse_do_readpage(struct file *file, struct page *page) */ fuse_wait_on_page_writeback(inode, page->index); - attr_ver = fuse_get_attr_version(fc); + attr_ver = fuse_get_attr_version(fm->fc); /* Don't overflow end offset */ if (pos + (desc.length - 1) == LLONG_MAX) desc.length--; fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ); - res = fuse_simple_request(fc, &ia.ap.args); + res = fuse_simple_request(fm, &ia.ap.args); if (res < 0) return res; /* @@ -822,13 +870,14 @@ static int fuse_do_readpage(struct file *file, struct page *page) return 0; } -static int fuse_readpage(struct file *file, struct page *page) +static int fuse_read_folio(struct file *file, struct folio *folio) { + struct page *page = &folio->page; struct inode *inode = page->mapping->host; int err; err = -EIO; - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) goto out; err = fuse_do_readpage(file, page); @@ -838,7 +887,7 @@ static int fuse_readpage(struct file *file, struct page *page) return err; } -static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args, +static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, int err) { int i; @@ -882,7 +931,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args, static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) { struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; struct fuse_args_pages *ap = &ia->ap; loff_t pos = page_offset(ap->pages[0]); size_t count = ap->num_pages << PAGE_SHIFT; @@ -901,98 +950,62 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) WARN_ON((loff_t) (pos + 
count) < 0); fuse_read_args_fill(ia, file, pos, count, FUSE_READ); - ia->read.attr_ver = fuse_get_attr_version(fc); - if (fc->async_read) { + ia->read.attr_ver = fuse_get_attr_version(fm->fc); + if (fm->fc->async_read) { ia->ff = fuse_file_get(ff); ap->args.end = fuse_readpages_end; - err = fuse_simple_background(fc, &ap->args, GFP_KERNEL); + err = fuse_simple_background(fm, &ap->args, GFP_KERNEL); if (!err) return; } else { - res = fuse_simple_request(fc, &ap->args); + res = fuse_simple_request(fm, &ap->args); err = res < 0 ? res : 0; } - fuse_readpages_end(fc, &ap->args, err); + fuse_readpages_end(fm, &ap->args, err); } -struct fuse_fill_data { - struct fuse_io_args *ia; - struct file *file; - struct inode *inode; - unsigned int nr_pages; - unsigned int max_pages; -}; - -static int fuse_readpages_fill(void *_data, struct page *page) +static void fuse_readahead(struct readahead_control *rac) { - struct fuse_fill_data *data = _data; - struct fuse_io_args *ia = data->ia; - struct fuse_args_pages *ap = &ia->ap; - struct inode *inode = data->inode; + struct inode *inode = rac->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); + unsigned int i, max_pages, nr_pages = 0; - fuse_wait_on_page_writeback(inode, page->index); + if (fuse_is_bad(inode)) + return; - if (ap->num_pages && - (ap->num_pages == fc->max_pages || - (ap->num_pages + 1) * PAGE_SIZE > fc->max_read || - ap->pages[ap->num_pages - 1]->index + 1 != page->index)) { - data->max_pages = min_t(unsigned int, data->nr_pages, - fc->max_pages); - fuse_send_readpages(ia, data->file); - data->ia = ia = fuse_io_alloc(NULL, data->max_pages); - if (!ia) { - unlock_page(page); - return -ENOMEM; - } - ap = &ia->ap; - } + max_pages = min_t(unsigned int, fc->max_pages, + fc->max_read / PAGE_SIZE); - if (WARN_ON(ap->num_pages >= data->max_pages)) { - unlock_page(page); - fuse_io_free(ia); - return -EIO; - } + for (;;) { + struct fuse_io_args *ia; + struct fuse_args_pages *ap; - get_page(page); - ap->pages[ap->num_pages] = page; - ap->descs[ap->num_pages].length = PAGE_SIZE; - ap->num_pages++; - data->nr_pages--; - return 0; -} - -static int fuse_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) -{ - struct inode *inode = mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_fill_data data; - int err; - - err = -EIO; - if (is_bad_inode(inode)) - goto out; - - data.file = file; - data.inode = inode; - data.nr_pages = nr_pages; - data.max_pages = min_t(unsigned int, nr_pages, fc->max_pages); -; - data.ia = fuse_io_alloc(NULL, data.max_pages); - err = -ENOMEM; - if (!data.ia) - goto out; + if (fc->num_background >= fc->congestion_threshold && + rac->ra->async_size >= readahead_count(rac)) + /* + * Congested and only async pages left, so skip the + * rest. 
+ */ + break; - err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); - if (!err) { - if (data.ia->ap.num_pages) - fuse_send_readpages(data.ia, file); - else - fuse_io_free(data.ia); + nr_pages = readahead_count(rac) - nr_pages; + if (nr_pages > max_pages) + nr_pages = max_pages; + if (nr_pages == 0) + break; + ia = fuse_io_alloc(NULL, nr_pages); + if (!ia) + return; + ap = &ia->ap; + nr_pages = __readahead_batch(rac, ap->pages, nr_pages); + for (i = 0; i < nr_pages; i++) { + fuse_wait_on_page_writeback(inode, + readahead_index(rac) + i); + ap->descs[i].length = PAGE_SIZE; + } + ap->num_pages = nr_pages; + fuse_send_readpages(ia, rac->file); } -out: - return err; } static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -1008,7 +1021,7 @@ static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) if (fc->auto_inval_data || (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) { int err; - err = fuse_update_attributes(inode, iocb->ki_filp); + err = fuse_update_attributes(inode, iocb->ki_filp, STATX_SIZE); if (err) return err; } @@ -1027,7 +1040,7 @@ static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff, args->opcode = FUSE_WRITE; args->nodeid = ff->nodeid; args->in_numargs = 2; - if (ff->fc->minor < 9) + if (ff->fm->fc->minor < 9) args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; else args->in_args[0].size = sizeof(ia->write.in); @@ -1042,7 +1055,7 @@ static unsigned int fuse_write_flags(struct kiocb *iocb) { unsigned int flags = iocb->ki_filp->f_flags; - if (iocb->ki_flags & IOCB_DSYNC) + if (iocb_is_dsync(iocb)) flags |= O_DSYNC; if (iocb->ki_flags & IOCB_SYNC) flags |= O_SYNC; @@ -1056,7 +1069,7 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, struct kiocb *iocb = ia->io->iocb; struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; struct fuse_write_in *inarg = &ia->write.in; ssize_t err; @@ -1064,20 +1077,20 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, inarg->flags = fuse_write_flags(iocb); if (owner != NULL) { inarg->write_flags |= FUSE_WRITE_LOCKOWNER; - inarg->lock_owner = fuse_lock_owner_id(fc, owner); + inarg->lock_owner = fuse_lock_owner_id(fm->fc, owner); } if (ia->io->async) - return fuse_async_req_send(fc, ia, count); + return fuse_async_req_send(fm, ia, count); - err = fuse_simple_request(fc, &ia->ap.args); + err = fuse_simple_request(fm, &ia->ap.args); if (!err && ia->write.out.size > count) err = -EIO; return err ?: ia->write.out.size; } -bool fuse_write_update_size(struct inode *inode, loff_t pos) +bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); @@ -1085,12 +1098,14 @@ bool fuse_write_update_size(struct inode *inode, loff_t pos) spin_lock(&fi->lock); fi->attr_version = atomic64_inc_return(&fc->attr_version); - if (pos > inode->i_size) { + if (written > 0 && pos > inode->i_size) { i_size_write(inode, pos); ret = true; } spin_unlock(&fi->lock); + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); + return ret; } @@ -1101,8 +1116,9 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, struct fuse_args_pages *ap = &ia->ap; struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; unsigned int offset, i; + bool short_write; int err; for 
(i = 0; i < ap->num_pages; i++) @@ -1110,37 +1126,45 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, fuse_write_args_fill(ia, ff, pos, count); ia->write.in.flags = fuse_write_flags(iocb); + if (fm->fc->handle_killpriv_v2 && !capable(CAP_FSETID)) + ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID; - err = fuse_simple_request(fc, &ap->args); + err = fuse_simple_request(fm, &ap->args); if (!err && ia->write.out.size > count) err = -EIO; + short_write = ia->write.out.size < count; offset = ap->descs[0].offset; count = ia->write.out.size; for (i = 0; i < ap->num_pages; i++) { struct page *page = ap->pages[i]; - if (!err && !offset && count >= PAGE_SIZE) - SetPageUptodate(page); - - if (count > PAGE_SIZE - offset) - count -= PAGE_SIZE - offset; - else - count = 0; - offset = 0; - - unlock_page(page); + if (err) { + ClearPageUptodate(page); + } else { + if (count >= PAGE_SIZE - offset) + count -= PAGE_SIZE - offset; + else { + if (short_write) + ClearPageUptodate(page); + count = 0; + } + offset = 0; + } + if (ia->write.page_locked && (i == ap->num_pages - 1)) + unlock_page(page); put_page(page); } return err; } -static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap, +static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, struct address_space *mapping, struct iov_iter *ii, loff_t pos, unsigned int max_pages) { + struct fuse_args_pages *ap = &ia->ap; struct fuse_conn *fc = get_fuse_conn(mapping->host); unsigned offset = pos & (PAGE_SIZE - 1); size_t count = 0; @@ -1160,25 +1184,23 @@ static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap, again: err = -EFAULT; - if (iov_iter_fault_in_readable(ii, bytes)) + if (fault_in_iov_iter_readable(ii, bytes)) break; err = -ENOMEM; - page = grab_cache_page_write_begin(mapping, index, 0); + page = grab_cache_page_write_begin(mapping, index); if (!page) break; if (mapping_writably_mapped(mapping)) flush_dcache_page(page); - tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); + tmp = copy_page_from_iter_atomic(page, offset, bytes, ii); flush_dcache_page(page); - iov_iter_advance(ii, tmp); if (!tmp) { unlock_page(page); put_page(page); - bytes = min(bytes, iov_iter_single_seg_count(ii)); goto again; } @@ -1193,6 +1215,16 @@ static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap, if (offset == PAGE_SIZE) offset = 0; + /* If we copied full page, mark it uptodate */ + if (tmp == PAGE_SIZE) + SetPageUptodate(page); + + if (PageUptodate(page)) { + unlock_page(page); + } else { + ia->write.page_locked = true; + break; + } if (!fc->big_writes) break; } while (iov_iter_count(ii) && count < fc->max_write && @@ -1236,7 +1268,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, break; } - count = fuse_fill_write_pages(ap, mapping, ii, pos, nr_pages); + count = fuse_fill_write_pages(&ia, mapping, ii, pos, nr_pages); if (count <= 0) { err = count; } else { @@ -1256,11 +1288,8 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, kfree(ap->pages); } while (!err && iov_iter_count(ii)); - if (res > 0) - fuse_write_update_size(inode, pos); - + fuse_write_update_attr(inode, pos, res); clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); - fuse_invalidate_attr(inode); return res > 0 ? 
res : err; } @@ -1273,17 +1302,25 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t written_buffered = 0; struct inode *inode = mapping->host; ssize_t err; + struct fuse_conn *fc = get_fuse_conn(inode); loff_t endbyte = 0; - if (get_fuse_conn(inode)->writeback_cache) { + if (fc->writeback_cache) { /* Update size (EOF optimization) and mode (SUID clearing) */ - err = fuse_update_attributes(mapping->host, file); + err = fuse_update_attributes(mapping->host, file, + STATX_SIZE | STATX_MODE); if (err) return err; + if (fc->handle_killpriv_v2 && + should_remove_suid(file_dentry(file))) { + goto writethrough; + } + return generic_file_write_iter(iocb, from); } +writethrough: inode_lock(inode); /* We can write back this queue in page reclaim */ @@ -1341,16 +1378,6 @@ out: return written ? written : err; } -static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs, - unsigned int index, - unsigned int nr_pages) -{ - int i; - - for (i = index; i < index + nr_pages; i++) - descs[i].length = PAGE_SIZE - descs[i].offset; -} - static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii) { return (unsigned long)ii->iov->iov_base + ii->iov_offset; @@ -1387,18 +1414,17 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, while (nbytes < *nbytesp && ap->num_pages < max_pages) { unsigned npages; size_t start; - ret = iov_iter_get_pages(ii, &ap->pages[ap->num_pages], + ret = iov_iter_get_pages2(ii, &ap->pages[ap->num_pages], *nbytesp - nbytes, max_pages - ap->num_pages, &start); if (ret < 0) break; - iov_iter_advance(ii, ret); nbytes += ret; ret += start; - npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE; + npages = DIV_ROUND_UP(ret, PAGE_SIZE); ap->descs[ap->num_pages].offset = start; fuse_page_descs_length_init(ap->descs, ap->num_pages, npages); @@ -1408,6 +1434,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, (PAGE_SIZE - ret) & (PAGE_SIZE - 1); } + ap->args.user_pages = true; if (write) ap->args.in_pages = true; else @@ -1426,7 +1453,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, struct file *file = io->iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_conn *fc = ff->fm->fc; size_t nmax = write ? 
fc->max_write : fc->max_read; loff_t pos = *ppos; size_t count = iov_iter_count(iter); @@ -1442,7 +1469,6 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (!ia) return -ENOMEM; - ia->io = io; if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { if (!write) inode_lock(inode); @@ -1451,7 +1477,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, inode_unlock(inode); } - io->should_dirty = !write && iter_is_iovec(iter); + io->should_dirty = !write && user_backed_iter(iter); while (count) { ssize_t nres; fl_owner_t owner = current->files; @@ -1464,7 +1490,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (write) { if (!capable(CAP_FSETID)) - ia->write.in.write_flags |= FUSE_WRITE_KILL_PRIV; + ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID; nres = fuse_send_write(ia, pos, nbytes, owner); } else { @@ -1552,11 +1578,9 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) } else { res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); + fuse_write_update_attr(inode, iocb->ki_pos, res); } } - fuse_invalidate_attr(inode); - if (res > 0) - fuse_write_update_size(inode, iocb->ki_pos); inode_unlock(inode); return res; @@ -1566,10 +1590,14 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); - if (is_bad_inode(file_inode(file))) + if (fuse_is_bad(inode)) return -EIO; + if (FUSE_IS_DAX(inode)) + return fuse_dax_read_iter(iocb, to); + if (!(ff->open_flags & FOPEN_DIRECT_IO)) return fuse_cache_read_iter(iocb, to); else @@ -1580,10 +1608,14 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); - if (is_bad_inode(file_inode(file))) + if (fuse_is_bad(inode)) return -EIO; + if (FUSE_IS_DAX(inode)) + return fuse_dax_write_iter(iocb, from); + if (!(ff->open_flags & FOPEN_DIRECT_IO)) return fuse_cache_write_iter(iocb, from); else @@ -1595,6 +1627,9 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa) struct fuse_args_pages *ap = &wpa->ia.ap; int i; + if (wpa->bucket) + fuse_sync_bucket_dec(wpa->bucket); + for (i = 0; i < ap->num_pages; i++) __free_page(ap->pages[i]); @@ -1605,7 +1640,7 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa) kfree(wpa); } -static void fuse_writepage_finish(struct fuse_conn *fc, +static void fuse_writepage_finish(struct fuse_mount *fm, struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; @@ -1614,7 +1649,6 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - list_del(&wpa->writepages_entry); for (i = 0; i < ap->num_pages; i++) { dec_wb_stat(&bdi->wb, WB_WRITEBACK); dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP); @@ -1624,7 +1658,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, } /* Called under fi->lock, may release and reacquire it */ -static void fuse_send_writepage(struct fuse_conn *fc, +static void fuse_send_writepage(struct fuse_mount *fm, struct fuse_writepage_args *wpa, loff_t size) __releases(fi->lock) __acquires(fi->lock) @@ -1650,10 +1684,10 @@ __acquires(fi->lock) args->force = true; args->nocreds = true; - err = fuse_simple_background(fc, args, GFP_ATOMIC); + err = fuse_simple_background(fm, args, GFP_ATOMIC); 
if (err == -ENOMEM) { spin_unlock(&fi->lock); - err = fuse_simple_background(fc, args, GFP_NOFS | __GFP_NOFAIL); + err = fuse_simple_background(fm, args, GFP_NOFS | __GFP_NOFAIL); spin_lock(&fi->lock); } @@ -1665,7 +1699,8 @@ __acquires(fi->lock) out_free: fi->writectr--; - fuse_writepage_finish(fc, wpa); + rb_erase(&wpa->writepages_entry, &fi->writepages); + fuse_writepage_finish(fm, wpa); spin_unlock(&fi->lock); /* After fuse_writepage_finish() aux request list is private */ @@ -1689,7 +1724,7 @@ void fuse_flush_writepages(struct inode *inode) __releases(fi->lock) __acquires(fi->lock) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_inode *fi = get_fuse_inode(inode); loff_t crop = i_size_read(inode); struct fuse_writepage_args *wpa; @@ -1698,29 +1733,76 @@ __acquires(fi->lock) wpa = list_entry(fi->queued_writes.next, struct fuse_writepage_args, queue_entry); list_del_init(&wpa->queue_entry); - fuse_send_writepage(fc, wpa, crop); + fuse_send_writepage(fm, wpa, crop); } } -static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, +static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root, + struct fuse_writepage_args *wpa) +{ + pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT; + pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1; + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + + WARN_ON(!wpa->ia.ap.num_pages); + while (*p) { + struct fuse_writepage_args *curr; + pgoff_t curr_index; + + parent = *p; + curr = rb_entry(parent, struct fuse_writepage_args, + writepages_entry); + WARN_ON(curr->inode != wpa->inode); + curr_index = curr->ia.write.in.offset >> PAGE_SHIFT; + + if (idx_from >= curr_index + curr->ia.ap.num_pages) + p = &(*p)->rb_right; + else if (idx_to < curr_index) + p = &(*p)->rb_left; + else + return curr; + } + + rb_link_node(&wpa->writepages_entry, parent, p); + rb_insert_color(&wpa->writepages_entry, root); + return NULL; +} + +static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa) +{ + WARN_ON(fuse_insert_writeback(root, wpa)); +} + +static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, int error) { struct fuse_writepage_args *wpa = container_of(args, typeof(*wpa), ia.ap.args); struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); mapping_set_error(inode->i_mapping, error); + /* + * A writeback finished and this might have updated mtime/ctime on + * server making local mtime/ctime stale. Hence invalidate attrs. + * Do this only if writeback_cache is not enabled. If writeback_cache + * is enabled, we trust local ctime/mtime. 
+ */ + if (!fc->writeback_cache) + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY); spin_lock(&fi->lock); + rb_erase(&wpa->writepages_entry, &fi->writepages); while (wpa->next) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_write_in *inarg = &wpa->ia.write.in; struct fuse_writepage_args *next = wpa->next; wpa->next = next->next; next->next = NULL; next->ia.ff = fuse_file_get(wpa->ia.ff); - list_add(&next->writepages_entry, &fi->writepages); + tree_insert(&fi->writepages, next); /* * Skip fuse_flush_writepages() to make it easy to crop requests @@ -1745,46 +1827,53 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, * no invocations of fuse_writepage_end() while we're in * fuse_set_nowrite..fuse_release_nowrite section. */ - fuse_send_writepage(fc, next, inarg->offset + inarg->size); + fuse_send_writepage(fm, next, inarg->offset + inarg->size); } fi->writectr--; - fuse_writepage_finish(fc, wpa); + fuse_writepage_finish(fm, wpa); spin_unlock(&fi->lock); fuse_writepage_free(wpa); } -static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, - struct fuse_inode *fi) +static struct fuse_file *__fuse_write_file_get(struct fuse_inode *fi) { - struct fuse_file *ff = NULL; + struct fuse_file *ff; spin_lock(&fi->lock); - if (!list_empty(&fi->write_files)) { - ff = list_entry(fi->write_files.next, struct fuse_file, - write_entry); + ff = list_first_entry_or_null(&fi->write_files, struct fuse_file, + write_entry); + if (ff) fuse_file_get(ff); - } spin_unlock(&fi->lock); return ff; } -static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, - struct fuse_inode *fi) +static struct fuse_file *fuse_write_file_get(struct fuse_inode *fi) { - struct fuse_file *ff = __fuse_write_file_get(fc, fi); + struct fuse_file *ff = __fuse_write_file_get(fi); WARN_ON(!ff); return ff; } int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) { - struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_file *ff; int err; - ff = __fuse_write_file_get(fc, fi); + /* + * Inode is always written before the last reference is dropped and + * hence this should not be reached from reclaim. + * + * Writing back the inode from reclaim can deadlock if the request + * processing itself needs an allocation. Allocations triggering + * reclaim while serving a request can't be prevented, because it can + * involve any number of unrelated userspace processes. 
+ */ + WARN_ON(wbc->for_reclaim); + + ff = __fuse_write_file_get(fi); err = fuse_flush_times(inode, ff); if (ff) fuse_file_put(ff, false, false); @@ -1811,6 +1900,20 @@ static struct fuse_writepage_args *fuse_writepage_args_alloc(void) } +static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, + struct fuse_writepage_args *wpa) +{ + if (!fc->sync_fs) + return; + + rcu_read_lock(); + /* Prevent resurrection of dead bucket in unlikely race with syncfs */ + do { + wpa->bucket = rcu_dereference(fc->curr_bucket); + } while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count))); + rcu_read_unlock(); +} + static int fuse_writepage_locked(struct page *page) { struct address_space *mapping = page->mapping; @@ -1834,10 +1937,11 @@ static int fuse_writepage_locked(struct page *page) goto err_free; error = -EIO; - wpa->ia.ff = fuse_write_file_get(fc, fi); + wpa->ia.ff = fuse_write_file_get(fi); if (!wpa->ia.ff) goto err_nofile; + fuse_writepage_add_to_bucket(fc, wpa); fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0); copy_highpage(tmp_page, page); @@ -1855,7 +1959,7 @@ static int fuse_writepage_locked(struct page *page) inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); spin_lock(&fi->lock); - list_add(&wpa->writepages_entry, &fi->writepages); + tree_insert(&fi->writepages, wpa); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); @@ -1876,6 +1980,7 @@ err: static int fuse_writepage(struct page *page, struct writeback_control *wbc) { + struct fuse_conn *fc = get_fuse_conn(page->mapping->host); int err; if (fuse_page_is_writeback(page->mapping->host, page->index)) { @@ -1891,6 +1996,10 @@ static int fuse_writepage(struct page *page, struct writeback_control *wbc) return 0; } + if (wbc->sync_mode == WB_SYNC_NONE && + fc->num_background >= fc->congestion_threshold) + return AOP_WRITEPAGE_ACTIVATE; + err = fuse_writepage_locked(page); unlock_page(page); @@ -1950,14 +2059,14 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) } /* - * First recheck under fi->lock if the offending offset is still under - * writeback. If yes, then iterate auxiliary write requests, to see if there's + * Check under fi->lock if the page is under writeback, and insert it onto the + * rb_tree if not. Otherwise iterate auxiliary write requests, to see if there's * one already added for a page at this offset. If there's none, then insert * this new request onto the auxiliary list, otherwise reuse the existing one by - * copying the new page contents over to the old temporary page. + * swapping the new temp page with the old one. 
*/ -static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa, - struct page *page) +static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, + struct page *page) { struct fuse_inode *fi = get_fuse_inode(new_wpa->inode); struct fuse_writepage_args *tmp; @@ -1965,17 +2074,15 @@ static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa, struct fuse_args_pages *new_ap = &new_wpa->ia.ap; WARN_ON(new_ap->num_pages != 0); + new_ap->num_pages = 1; spin_lock(&fi->lock); - list_del(&new_wpa->writepages_entry); - old_wpa = fuse_find_writeback(fi, page->index, page->index); + old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa); if (!old_wpa) { - list_add(&new_wpa->writepages_entry, &fi->writepages); spin_unlock(&fi->lock); - return false; + return true; } - new_ap->num_pages = 1; for (tmp = old_wpa->next; tmp; tmp = tmp->next) { pgoff_t curr_index; @@ -2004,7 +2111,41 @@ static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa, fuse_writepage_free(new_wpa); } - return true; + return false; +} + +static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, + struct fuse_args_pages *ap, + struct fuse_fill_wb_data *data) +{ + WARN_ON(!ap->num_pages); + + /* + * Being under writeback is unlikely but possible. For example direct + * read to an mmaped fuse file will set the page dirty twice; once when + * the pages are faulted with get_user_pages(), and then after the read + * completed. + */ + if (fuse_page_is_writeback(data->inode, page->index)) + return true; + + /* Reached max pages */ + if (ap->num_pages == fc->max_pages) + return true; + + /* Reached max write bytes */ + if ((ap->num_pages + 1) * PAGE_SIZE > fc->max_write) + return true; + + /* Discontinuity */ + if (data->orig_pages[ap->num_pages - 1]->index + 1 != page->index) + return true; + + /* Need to grow the pages array? If so, did the expansion fail? */ + if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data)) + return true; + + return false; } static int fuse_writepages_fill(struct page *page, @@ -2017,35 +2158,18 @@ static int fuse_writepages_fill(struct page *page, struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); struct page *tmp_page; - bool is_writeback; int err; if (!data->ff) { err = -EIO; - data->ff = fuse_write_file_get(fc, fi); + data->ff = fuse_write_file_get(fi); if (!data->ff) goto out_unlock; } - /* - * Being under writeback is unlikely but possible. For example direct - * read to an mmaped fuse file will set the page dirty twice; once when - * the pages are faulted with get_user_pages(), and then after the read - * completed. 
- */ - is_writeback = fuse_page_is_writeback(inode, page->index); - - if (wpa && ap->num_pages && - (is_writeback || ap->num_pages == fc->max_pages || - (ap->num_pages + 1) * PAGE_SIZE > fc->max_write || - data->orig_pages[ap->num_pages - 1]->index + 1 != page->index)) { + if (wpa && fuse_writepage_need_send(fc, page, ap, data)) { fuse_writepages_send(data); data->wpa = NULL; - } else if (wpa && ap->num_pages == data->max_pages) { - if (!fuse_pages_realloc(data)) { - fuse_writepages_send(data); - data->wpa = NULL; - } } err = -ENOMEM; @@ -2073,6 +2197,8 @@ static int fuse_writepages_fill(struct page *page, __free_page(tmp_page); goto out_unlock; } + fuse_writepage_add_to_bucket(fc, wpa); + data->max_pages = 1; ap = &wpa->ia.ap; @@ -2083,12 +2209,6 @@ static int fuse_writepages_fill(struct page *page, ap->args.end = fuse_writepage_end; ap->num_pages = 0; wpa->inode = inode; - - spin_lock(&fi->lock); - list_add(&wpa->writepages_entry, &fi->writepages); - spin_unlock(&fi->lock); - - data->wpa = wpa; } set_page_writeback(page); @@ -2096,26 +2216,25 @@ static int fuse_writepages_fill(struct page *page, ap->pages[ap->num_pages] = tmp_page; ap->descs[ap->num_pages].offset = 0; ap->descs[ap->num_pages].length = PAGE_SIZE; + data->orig_pages[ap->num_pages] = page; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); err = 0; - if (is_writeback && fuse_writepage_in_flight(wpa, page)) { + if (data->wpa) { + /* + * Protected by fi->lock against concurrent access by + * fuse_page_is_writeback(). + */ + spin_lock(&fi->lock); + ap->num_pages++; + spin_unlock(&fi->lock); + } else if (fuse_writepage_add(wpa, page)) { + data->wpa = wpa; + } else { end_page_writeback(page); - data->wpa = NULL; - goto out_unlock; } - data->orig_pages[ap->num_pages] = page; - - /* - * Protected by fi->lock against concurrent access by - * fuse_page_is_writeback(). - */ - spin_lock(&fi->lock); - ap->num_pages++; - spin_unlock(&fi->lock); - out_unlock: unlock_page(page); @@ -2131,9 +2250,13 @@ static int fuse_writepages(struct address_space *mapping, int err; err = -EIO; - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) goto out; + if (wbc->sync_mode == WB_SYNC_NONE && + fc->num_background >= fc->congestion_threshold) + return 0; + data.inode = inode; data.wpa = NULL; data.ff = NULL; @@ -2147,10 +2270,8 @@ static int fuse_writepages(struct address_space *mapping, err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); if (data.wpa) { - /* Ignore errors if we can write at least one page */ WARN_ON(!data.wpa->ia.ap.num_pages); fuse_writepages_send(&data); - err = 0; } if (data.ff) fuse_file_put(data.ff, false, false); @@ -2165,8 +2286,7 @@ out: * but how to implement it without killing performance need more thinking. 
*/ static int fuse_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { pgoff_t index = pos >> PAGE_SHIFT; struct fuse_conn *fc = get_fuse_conn(file_inode(file)); @@ -2176,7 +2296,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping, WARN_ON(!fc->writeback_cache); - page = grab_cache_page_write_begin(mapping, index, flags); + page = grab_cache_page_write_begin(mapping, index); if (!page) goto error; @@ -2219,15 +2339,18 @@ static int fuse_write_end(struct file *file, struct address_space *mapping, if (!copied) goto unlock; + pos += copied; if (!PageUptodate(page)) { /* Zero any unwritten bytes at the end of the page */ - size_t endoff = (pos + copied) & ~PAGE_MASK; + size_t endoff = pos & ~PAGE_MASK; if (endoff) zero_user_segment(page, endoff, PAGE_SIZE); SetPageUptodate(page); } - fuse_write_update_size(inode, pos + copied); + if (pos > inode->i_size) + i_size_write(inode, pos); + set_page_dirty(page); unlock: @@ -2237,25 +2360,31 @@ unlock: return copied; } -static int fuse_launder_page(struct page *page) +static int fuse_launder_folio(struct folio *folio) { int err = 0; - if (clear_page_dirty_for_io(page)) { - struct inode *inode = page->mapping->host; - err = fuse_writepage_locked(page); + if (folio_clear_dirty_for_io(folio)) { + struct inode *inode = folio->mapping->host; + + /* Serialize with pending writeback for the same page */ + fuse_wait_on_page_writeback(inode, folio->index); + err = fuse_writepage_locked(&folio->page); if (!err) - fuse_wait_on_page_writeback(inode, page->index); + fuse_wait_on_page_writeback(inode, folio->index); } return err; } /* - * Write back dirty pages now, because there may not be any suitable - * open files later + * Write back dirty data/metadata now (there may not be any suitable + * open files later for data) */ static void fuse_vma_close(struct vm_area_struct *vma) { - filemap_write_and_wait(vma->vm_file->f_mapping); + int err; + + err = write_inode_now(vma->vm_file->f_mapping->host, 1); + mapping_set_error(vma->vm_file->f_mapping, err); } /* @@ -2300,6 +2429,10 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) { struct fuse_file *ff = file->private_data; + /* DAX mmap is superior to direct_io mmap */ + if (FUSE_IS_DAX(file_inode(file))) + return fuse_dax_mmap(file, vma); + if (ff->open_flags & FOPEN_DIRECT_IO) { /* Can't provide the coherency needed for MAP_SHARED */ if (vma->vm_flags & VM_MAYSHARE) @@ -2378,7 +2511,7 @@ static void fuse_lk_fill(struct fuse_args *args, struct file *file, static int fuse_getlk(struct file *file, struct file_lock *fl) { struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_lk_in inarg; struct fuse_lk_out outarg; @@ -2388,9 +2521,9 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (!err) - err = convert_fuse_file_lock(fc, &outarg.lk, fl); + err = convert_fuse_file_lock(fm->fc, &outarg.lk, fl); return err; } @@ -2398,12 +2531,12 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) { struct inode *inode = file_inode(file); - 
struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_lk_in inarg; int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL; - pid_t pid_nr = pid_nr_ns(pid, fc->pid_ns); + pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns); int err; if (fl->fl_lmops && fl->fl_lmops->lm_grant) { @@ -2416,7 +2549,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) return 0; fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg); - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); /* locking is restartable */ if (err == -EINTR) @@ -2470,13 +2603,13 @@ static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl) static sector_t fuse_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_bmap_in inarg; struct fuse_bmap_out outarg; int err; - if (!inode->i_sb->s_bdev || fc->no_bmap) + if (!inode->i_sb->s_bdev || fm->fc->no_bmap) return 0; memset(&inarg, 0, sizeof(inarg)); @@ -2490,9 +2623,9 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) - fc->no_bmap = 1; + fm->fc->no_bmap = 1; return err ? 0 : outarg.block; } @@ -2500,7 +2633,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_file *ff = file->private_data; FUSE_ARGS(args); struct fuse_lseek_in inarg = { @@ -2511,7 +2644,7 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) struct fuse_lseek_out outarg; int err; - if (fc->no_lseek) + if (fm->fc->no_lseek) goto fallback; args.opcode = FUSE_LSEEK; @@ -2522,10 +2655,10 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err) { if (err == -ENOSYS) { - fc->no_lseek = 1; + fm->fc->no_lseek = 1; goto fallback; } return err; @@ -2534,7 +2667,7 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes); fallback: - err = fuse_update_attributes(inode, file); + err = fuse_update_attributes(inode, file, STATX_SIZE); if (!err) return generic_file_llseek(file, offset, whence); else @@ -2554,7 +2687,7 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) break; case SEEK_END: inode_lock(inode); - retval = fuse_update_attributes(inode, file); + retval = fuse_update_attributes(inode, file, STATX_SIZE); if (!retval) retval = generic_file_llseek(file, offset, whence); inode_unlock(inode); @@ -2573,354 +2706,6 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) } /* - * CUSE servers compiled on 32bit broke on 64bit kernels because the - * ABI was defined to be 'struct iovec' which is different on 32bit - * and 64bit. 
Fortunately we can determine which structure the server - * used from the size of the reply. - */ -static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src, - size_t transferred, unsigned count, - bool is_compat) -{ -#ifdef CONFIG_COMPAT - if (count * sizeof(struct compat_iovec) == transferred) { - struct compat_iovec *ciov = src; - unsigned i; - - /* - * With this interface a 32bit server cannot support - * non-compat (i.e. ones coming from 64bit apps) ioctl - * requests - */ - if (!is_compat) - return -EINVAL; - - for (i = 0; i < count; i++) { - dst[i].iov_base = compat_ptr(ciov[i].iov_base); - dst[i].iov_len = ciov[i].iov_len; - } - return 0; - } -#endif - - if (count * sizeof(struct iovec) != transferred) - return -EIO; - - memcpy(dst, src, transferred); - return 0; -} - -/* Make sure iov_length() won't overflow */ -static int fuse_verify_ioctl_iov(struct fuse_conn *fc, struct iovec *iov, - size_t count) -{ - size_t n; - u32 max = fc->max_pages << PAGE_SHIFT; - - for (n = 0; n < count; n++, iov++) { - if (iov->iov_len > (size_t) max) - return -ENOMEM; - max -= iov->iov_len; - } - return 0; -} - -static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst, - void *src, size_t transferred, unsigned count, - bool is_compat) -{ - unsigned i; - struct fuse_ioctl_iovec *fiov = src; - - if (fc->minor < 16) { - return fuse_copy_ioctl_iovec_old(dst, src, transferred, - count, is_compat); - } - - if (count * sizeof(struct fuse_ioctl_iovec) != transferred) - return -EIO; - - for (i = 0; i < count; i++) { - /* Did the server supply an inappropriate value? */ - if (fiov[i].base != (unsigned long) fiov[i].base || - fiov[i].len != (unsigned long) fiov[i].len) - return -EIO; - - dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base; - dst[i].iov_len = (size_t) fiov[i].len; - -#ifdef CONFIG_COMPAT - if (is_compat && - (ptr_to_compat(dst[i].iov_base) != fiov[i].base || - (compat_size_t) dst[i].iov_len != fiov[i].len)) - return -EIO; -#endif - } - - return 0; -} - - -/* - * For ioctls, there is no generic way to determine how much memory - * needs to be read and/or written. Furthermore, ioctls are allowed - * to dereference the passed pointer, so the parameter requires deep - * copying but FUSE has no idea whatsoever about what to copy in or - * out. - * - * This is solved by allowing FUSE server to retry ioctl with - * necessary in/out iovecs. Let's assume the ioctl implementation - * needs to read in the following structure. - * - * struct a { - * char *buf; - * size_t buflen; - * } - * - * On the first callout to FUSE server, inarg->in_size and - * inarg->out_size will be NULL; then, the server completes the ioctl - * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and - * the actual iov array to - * - * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } } - * - * which tells FUSE to copy in the requested area and retry the ioctl. - * On the second round, the server has access to the structure and - * from that it can tell what to look for next, so on the invocation, - * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to - * - * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) }, - * { .iov_base = a.buf, .iov_len = a.buflen } } - * - * FUSE will copy both struct a and the pointed buffer from the - * process doing the ioctl and retry ioctl with both struct a and the - * buffer. - * - * This time, FUSE server has everything it needs and completes ioctl - * without FUSE_IOCTL_RETRY which finishes the ioctl call. 
- * - * Copying data out works the same way. - * - * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel - * automatically initializes in and out iovs by decoding @cmd with - * _IOC_* macros and the server is not allowed to request RETRY. This - * limits ioctl data transfers to well-formed ioctls and is the forced - * behavior for all FUSE servers. - */ -long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, - unsigned int flags) -{ - struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; - struct fuse_ioctl_in inarg = { - .fh = ff->fh, - .cmd = cmd, - .arg = arg, - .flags = flags - }; - struct fuse_ioctl_out outarg; - struct iovec *iov_page = NULL; - struct iovec *in_iov = NULL, *out_iov = NULL; - unsigned int in_iovs = 0, out_iovs = 0, max_pages; - size_t in_size, out_size, c; - ssize_t transferred; - int err, i; - struct iov_iter ii; - struct fuse_args_pages ap = {}; - -#if BITS_PER_LONG == 32 - inarg.flags |= FUSE_IOCTL_32BIT; -#else - if (flags & FUSE_IOCTL_COMPAT) { - inarg.flags |= FUSE_IOCTL_32BIT; -#ifdef CONFIG_X86_X32 - if (in_x32_syscall()) - inarg.flags |= FUSE_IOCTL_COMPAT_X32; -#endif - } -#endif - - /* assume all the iovs returned by client always fits in a page */ - BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); - - err = -ENOMEM; - ap.pages = fuse_pages_alloc(fc->max_pages, GFP_KERNEL, &ap.descs); - iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); - if (!ap.pages || !iov_page) - goto out; - - fuse_page_descs_length_init(ap.descs, 0, fc->max_pages); - - /* - * If restricted, initialize IO parameters as encoded in @cmd. - * RETRY from server is not allowed. - */ - if (!(flags & FUSE_IOCTL_UNRESTRICTED)) { - struct iovec *iov = iov_page; - - iov->iov_base = (void __user *)arg; - iov->iov_len = _IOC_SIZE(cmd); - - if (_IOC_DIR(cmd) & _IOC_WRITE) { - in_iov = iov; - in_iovs = 1; - } - - if (_IOC_DIR(cmd) & _IOC_READ) { - out_iov = iov; - out_iovs = 1; - } - } - - retry: - inarg.in_size = in_size = iov_length(in_iov, in_iovs); - inarg.out_size = out_size = iov_length(out_iov, out_iovs); - - /* - * Out data can be used either for actual out data or iovs, - * make sure there always is at least one page. 
- */ - out_size = max_t(size_t, out_size, PAGE_SIZE); - max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE); - - /* make sure there are enough buffer pages and init request with them */ - err = -ENOMEM; - if (max_pages > fc->max_pages) - goto out; - while (ap.num_pages < max_pages) { - ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); - if (!ap.pages[ap.num_pages]) - goto out; - ap.num_pages++; - } - - - /* okay, let's send it to the client */ - ap.args.opcode = FUSE_IOCTL; - ap.args.nodeid = ff->nodeid; - ap.args.in_numargs = 1; - ap.args.in_args[0].size = sizeof(inarg); - ap.args.in_args[0].value = &inarg; - if (in_size) { - ap.args.in_numargs++; - ap.args.in_args[1].size = in_size; - ap.args.in_pages = true; - - err = -EFAULT; - iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size); - for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { - c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii); - if (c != PAGE_SIZE && iov_iter_count(&ii)) - goto out; - } - } - - ap.args.out_numargs = 2; - ap.args.out_args[0].size = sizeof(outarg); - ap.args.out_args[0].value = &outarg; - ap.args.out_args[1].size = out_size; - ap.args.out_pages = true; - ap.args.out_argvar = true; - - transferred = fuse_simple_request(fc, &ap.args); - err = transferred; - if (transferred < 0) - goto out; - - /* did it ask for retry? */ - if (outarg.flags & FUSE_IOCTL_RETRY) { - void *vaddr; - - /* no retry if in restricted mode */ - err = -EIO; - if (!(flags & FUSE_IOCTL_UNRESTRICTED)) - goto out; - - in_iovs = outarg.in_iovs; - out_iovs = outarg.out_iovs; - - /* - * Make sure things are in boundary, separate checks - * are to protect against overflow. - */ - err = -ENOMEM; - if (in_iovs > FUSE_IOCTL_MAX_IOV || - out_iovs > FUSE_IOCTL_MAX_IOV || - in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) - goto out; - - vaddr = kmap_atomic(ap.pages[0]); - err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr, - transferred, in_iovs + out_iovs, - (flags & FUSE_IOCTL_COMPAT) != 0); - kunmap_atomic(vaddr); - if (err) - goto out; - - in_iov = iov_page; - out_iov = in_iov + in_iovs; - - err = fuse_verify_ioctl_iov(fc, in_iov, in_iovs); - if (err) - goto out; - - err = fuse_verify_ioctl_iov(fc, out_iov, out_iovs); - if (err) - goto out; - - goto retry; - } - - err = -EIO; - if (transferred > inarg.out_size) - goto out; - - err = -EFAULT; - iov_iter_init(&ii, READ, out_iov, out_iovs, transferred); - for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { - c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii); - if (c != PAGE_SIZE && iov_iter_count(&ii)) - goto out; - } - err = 0; - out: - free_page((unsigned long) iov_page); - while (ap.num_pages) - __free_page(ap.pages[--ap.num_pages]); - kfree(ap.pages); - - return err ? 
err : outarg.result; -} -EXPORT_SYMBOL_GPL(fuse_do_ioctl); - -long fuse_ioctl_common(struct file *file, unsigned int cmd, - unsigned long arg, unsigned int flags) -{ - struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); - - if (!fuse_allow_current_process(fc)) - return -EACCES; - - if (is_bad_inode(inode)) - return -EIO; - - return fuse_do_ioctl(file, cmd, arg, flags); -} - -static long fuse_file_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - return fuse_ioctl_common(file, cmd, arg, 0); -} - -static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT); -} - -/* * All files which have been polled are linked to RB tree * fuse_conn->polled_files which is indexed by kh. Walk the tree and * find the matching one. @@ -2961,7 +2746,7 @@ static void fuse_register_polled_file(struct fuse_conn *fc, { spin_lock(&fc->lock); if (RB_EMPTY_NODE(&ff->polled_node)) { - struct rb_node **link, *uninitialized_var(parent); + struct rb_node **link, *parent; link = fuse_find_polled_node(fc, ff->kh, &parent); BUG_ON(*link); @@ -2974,13 +2759,13 @@ static void fuse_register_polled_file(struct fuse_conn *fc, __poll_t fuse_file_poll(struct file *file, poll_table *wait) { struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh }; struct fuse_poll_out outarg; FUSE_ARGS(args); int err; - if (fc->no_poll) + if (fm->fc->no_poll) return DEFAULT_POLLMASK; poll_wait(file, &ff->poll_wait, wait); @@ -2992,7 +2777,7 @@ __poll_t fuse_file_poll(struct file *file, poll_table *wait) */ if (waitqueue_active(&ff->poll_wait)) { inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY; - fuse_register_polled_file(fc, ff); + fuse_register_polled_file(fm->fc, ff); } args.opcode = FUSE_POLL; @@ -3003,12 +2788,12 @@ __poll_t fuse_file_poll(struct file *file, poll_table *wait) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (!err) return demangle_poll(outarg.revents); if (err == -ENOSYS) { - fc->no_poll = 1; + fm->fc->no_poll = 1; return DEFAULT_POLLMASK; } return EPOLLERR; @@ -3065,11 +2850,10 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) ssize_t ret = 0; struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; - bool async_dio = ff->fc->async_dio; loff_t pos = 0; struct inode *inode; loff_t i_size; - size_t count = iov_iter_count(iter); + size_t count = iov_iter_count(iter), shortened = 0; loff_t offset = iocb->ki_pos; struct fuse_io_priv *io; @@ -3077,17 +2861,9 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) inode = file->f_mapping->host; i_size = i_size_read(inode); - if ((iov_iter_rw(iter) == READ) && (offset > i_size)) + if ((iov_iter_rw(iter) == READ) && (offset >= i_size)) return 0; - /* optimization for short read */ - if (async_dio && iov_iter_rw(iter) != WRITE && offset + count > i_size) { - if (offset >= i_size) - return 0; - iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset)); - count = iov_iter_count(iter); - } - io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL); if (!io) return -ENOMEM; @@ -3103,15 +2879,22 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) * By default, we want to optimize all I/Os with async request * submission to the client filesystem if supported. 
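The fuse_direct_IO() changes around this point move the short-read optimization to after io->async is known: an asynchronous read crossing EOF is clamped to a rounded-up length, and the number of trimmed bytes is remembered so the iterator can be re-expanded with iov_iter_reexpand() once the request finishes. A compilable userspace sketch of only the clamping arithmetic, assuming the offset is already below i_size (round_up_to(), clamp_read() and the 4096-byte granule are illustrative stand-ins, not kernel helpers):

#include <stddef.h>
#include <stdio.h>

static size_t round_up_to(size_t n, size_t granule)
{
	return (n + granule - 1) / granule * granule;
}

/* Clamp a read at EOF, remembering how many bytes were trimmed. */
static size_t clamp_read(size_t count, long long offset, long long i_size,
			 size_t granule, size_t *shortened)
{
	*shortened = 0;
	if (offset + (long long)count > i_size) {
		size_t allowed = round_up_to((size_t)(i_size - offset), granule);

		if (allowed < count) {
			*shortened = count - allowed;
			count = allowed;
		}
	}
	return count;
}

int main(void)
{
	size_t shortened;
	size_t c = clamp_read(65536, 102400, 112640, 4096, &shortened);

	printf("read %zu bytes, trimmed %zu\n", c, shortened);
	return 0;
}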
*/ - io->async = async_dio; + io->async = ff->fm->fc->async_dio; io->iocb = iocb; io->blocking = is_sync_kiocb(iocb); + /* optimization for short read */ + if (io->async && !io->write && offset + count > i_size) { + iov_iter_truncate(iter, fuse_round_up(ff->fm->fc, i_size - offset)); + shortened = count - iov_iter_count(iter); + count -= shortened; + } + /* * We cannot asynchronously extend the size of a file. * In such case the aio will behave exactly like sync io. */ - if ((offset + count > i_size) && iov_iter_rw(iter) == WRITE) + if ((offset + count > i_size) && io->write) io->blocking = true; if (io->async && io->blocking) { @@ -3125,10 +2908,11 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (iov_iter_rw(iter) == WRITE) { ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE); - fuse_invalidate_attr(inode); + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); } else { ret = __fuse_direct_read(io, iter, &pos); } + iov_iter_reexpand(iter, iov_iter_count(iter) + shortened); if (io->async) { bool blocking = io->blocking; @@ -3146,9 +2930,8 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) kref_put(&io->refcnt, fuse_io_release); if (iov_iter_rw(iter) == WRITE) { - if (ret > 0) - fuse_write_update_size(inode, pos); - else if (ret < 0 && offset + count > i_size) + fuse_write_update_attr(inode, pos, ret); + if (ret < 0 && offset + count > i_size) fuse_do_truncate(file); } @@ -3157,7 +2940,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end) { - int err = filemap_write_and_wait_range(inode->i_mapping, start, end); + int err = filemap_write_and_wait_range(inode->i_mapping, start, LLONG_MAX); if (!err) fuse_sync_writes(inode); @@ -3171,7 +2954,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, struct fuse_file *ff = file->private_data; struct inode *inode = file_inode(file); struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; FUSE_ARGS(args); struct fuse_fallocate_in inarg = { .fh = ff->fh, @@ -3181,17 +2964,28 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, }; int err; bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || - (mode & FALLOC_FL_PUNCH_HOLE); + (mode & (FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_ZERO_RANGE)); + + bool block_faults = FUSE_IS_DAX(inode) && lock_inode; - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; - if (fc->no_fallocate) + if (fm->fc->no_fallocate) return -EOPNOTSUPP; if (lock_inode) { inode_lock(inode); - if (mode & FALLOC_FL_PUNCH_HOLE) { + if (block_faults) { + filemap_invalidate_lock(inode->i_mapping); + err = fuse_dax_break_layouts(inode, 0, 0); + if (err) + goto out; + } + + if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) { loff_t endbyte = offset + length - 1; err = fuse_writeback_range(inode, offset, endbyte); @@ -3207,6 +3001,10 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, goto out; } + err = file_modified(file); + if (err) + goto out; + if (!(mode & FALLOC_FL_KEEP_SIZE)) set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); @@ -3215,9 +3013,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, 
&args); if (err == -ENOSYS) { - fc->no_fallocate = 1; + fm->fc->no_fallocate = 1; err = -EOPNOTSUPP; } if (err) @@ -3225,24 +3023,27 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, /* we could have extended the file */ if (!(mode & FALLOC_FL_KEEP_SIZE)) { - bool changed = fuse_write_update_size(inode, offset + length); - - if (changed && fc->writeback_cache) + if (fuse_write_update_attr(inode, offset + length, length)) file_update_time(file); } - if (mode & FALLOC_FL_PUNCH_HOLE) + if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) truncate_pagecache_range(inode, offset, offset + length - 1); - fuse_invalidate_attr(inode); + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); out: if (!(mode & FALLOC_FL_KEEP_SIZE)) clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + if (block_faults) + filemap_invalidate_unlock(inode->i_mapping); + if (lock_inode) inode_unlock(inode); + fuse_flush_time_update(inode); + return err; } @@ -3255,7 +3056,8 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); struct fuse_inode *fi_out = get_fuse_inode(inode_out); - struct fuse_conn *fc = ff_in->fc; + struct fuse_mount *fm = ff_in->fm; + struct fuse_conn *fc = fm->fc; FUSE_ARGS(args); struct fuse_copy_file_range_in inarg = { .fh_in = ff_in->fh, @@ -3279,13 +3081,11 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) return -EXDEV; - if (fc->writeback_cache) { - inode_lock(inode_in); - err = fuse_writeback_range(inode_in, pos_in, pos_in + len); - inode_unlock(inode_in); - if (err) - return err; - } + inode_lock(inode_in); + err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1); + inode_unlock(inode_in); + if (err) + return err; inode_lock(inode_out); @@ -3293,11 +3093,27 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, if (err) goto out; - if (fc->writeback_cache) { - err = fuse_writeback_range(inode_out, pos_out, pos_out + len); - if (err) - goto out; - } + /* + * Write out dirty pages in the destination file before sending the COPY + * request to userspace. After the request is completed, truncate off + * pages (including partial ones) from the cache that have been copied, + * since these contain stale data at that point. + * + * This should be mostly correct, but if the COPY writes to partial + * pages (at the start or end) and the parts not covered by the COPY are + * written through a memory map after calling fuse_writeback_range(), + * then these partial page modifications will be lost on truncation. + * + * It is unlikely that someone would rely on such mixed style + * modifications. Yet this does give less guarantees than if the + * copying was performed with write(2). + * + * To fix this a mapping->invalidate_lock could be used to prevent new + * faults while the copy is ongoing. 
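The copy_file_range comment above ends by noting that pages covered by the server-side copy, including partial ones at either edge, are dropped from the destination page cache after the request completes. The page-aligned range being truncated can be restated in plain C (userspace illustration; PAGE_SIZE and copied_cache_range() stand in for the kernel macros and the truncate_inode_pages_range() call):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096u

static void copied_cache_range(uint64_t pos_out, uint64_t copied,
			       uint64_t *start, uint64_t *end)
{
	*start = pos_out / PAGE_SIZE * PAGE_SIZE;		/* ALIGN_DOWN */
	*end = (pos_out + copied + PAGE_SIZE - 1)
			/ PAGE_SIZE * PAGE_SIZE - 1;		/* ALIGN() - 1 */
}

int main(void)
{
	uint64_t start, end;

	copied_cache_range(5000, 3000, &start, &end);
	printf("truncate page cache [%llu, %llu]\n",
	       (unsigned long long)start, (unsigned long long)end);
	return 0;
}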
+ */ + err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1); + if (err) + goto out; if (is_unstable) set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); @@ -3310,7 +3126,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { fc->no_copy_file_range = 1; err = -EOPNOTSUPP; @@ -3318,12 +3134,12 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, if (err) goto out; - if (fc->writeback_cache) { - fuse_write_update_size(inode_out, pos_out + outarg.size); - file_update_time(file_out); - } + truncate_inode_pages_range(inode_out->i_mapping, + ALIGN_DOWN(pos_out, PAGE_SIZE), + ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1); - fuse_invalidate_attr(inode_out); + file_update_time(file_out); + fuse_write_update_attr(inode_out, pos_out + outarg.size, outarg.size); err = outarg.size; out: @@ -3333,6 +3149,8 @@ out: inode_unlock(inode_out); file_accessed(file_in); + fuse_flush_time_update(inode_out); + return err; } @@ -3361,6 +3179,7 @@ static const struct file_operations fuse_file_operations = { .release = fuse_release, .fsync = fuse_fsync, .lock = fuse_file_lock, + .get_unmapped_area = thp_get_unmapped_area, .flock = fuse_file_flock, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, @@ -3372,19 +3191,19 @@ static const struct file_operations fuse_file_operations = { }; static const struct address_space_operations fuse_file_aops = { - .readpage = fuse_readpage, + .read_folio = fuse_read_folio, + .readahead = fuse_readahead, .writepage = fuse_writepage, .writepages = fuse_writepages, - .launder_page = fuse_launder_page, - .readpages = fuse_readpages, - .set_page_dirty = __set_page_dirty_nobuffers, + .launder_folio = fuse_launder_folio, + .dirty_folio = filemap_dirty_folio, .bmap = fuse_bmap, .direct_IO = fuse_direct_IO, .write_begin = fuse_write_begin, .write_end = fuse_write_end, }; -void fuse_init_file_inode(struct inode *inode) +void fuse_init_file_inode(struct inode *inode, unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); @@ -3395,5 +3214,8 @@ void fuse_init_file_inode(struct inode *inode) INIT_LIST_HEAD(&fi->queued_writes); fi->writectr = 0; init_waitqueue_head(&fi->page_waitq); - INIT_LIST_HEAD(&fi->writepages); + fi->writepages = RB_ROOT; + + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_inode_init(inode, flags); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index ca344bf71404..98a9cf531873 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -111,7 +111,7 @@ struct fuse_inode { wait_queue_head_t page_waitq; /* List of writepage requestst (pending or sent) */ - struct list_head writepages; + struct rb_root writepages; }; /* readdir cache (directory only) */ @@ -148,6 +148,13 @@ struct fuse_inode { /** Lock to protect write related fields */ spinlock_t lock; + +#ifdef CONFIG_FUSE_DAX + /* + * Dax specific inode data + */ + struct fuse_inode_dax *dax; +#endif }; /** FUSE inode state bits */ @@ -158,15 +165,18 @@ enum { FUSE_I_INIT_RDPLUS, /** An operation changing file size is in progress */ FUSE_I_SIZE_UNSTABLE, + /* Bad inode */ + FUSE_I_BAD, }; struct fuse_conn; +struct fuse_mount; struct fuse_release_args; /** FUSE specific file data */ struct fuse_file { /** Fuse connection for this file */ - struct fuse_conn *fc; + struct fuse_mount *fm; /* Argument space reserved for release */ struct 
fuse_release_args *release_args; @@ -246,12 +256,14 @@ struct fuse_args { bool nocreds:1; bool in_pages:1; bool out_pages:1; + bool user_pages:1; bool out_argvar:1; bool page_zeroing:1; bool page_replace:1; + bool may_block:1; struct fuse_in_arg in_args[3]; struct fuse_arg out_args[2]; - void (*end)(struct fuse_conn *fc, struct fuse_args *args, int error); + void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error); }; struct fuse_args_pages { @@ -359,6 +371,9 @@ struct fuse_req { /** virtio-fs's physically contiguous buffer for in and out args */ void *argbuf; #endif + + /** fuse_mount this request belongs to */ + struct fuse_mount *fm; }; struct fuse_iqueue; @@ -466,8 +481,21 @@ struct fuse_dev { struct list_head entry; }; +enum fuse_dax_mode { + FUSE_DAX_INODE_DEFAULT, /* default */ + FUSE_DAX_ALWAYS, /* "-o dax=always" */ + FUSE_DAX_NEVER, /* "-o dax=never" */ + FUSE_DAX_INODE_USER, /* "-o dax=inode" */ +}; + +static inline bool fuse_is_inode_dax_mode(enum fuse_dax_mode mode) +{ + return mode == FUSE_DAX_INODE_DEFAULT || mode == FUSE_DAX_INODE_USER; +} + struct fuse_fs_context { int fd; + struct file *file; unsigned int rootmode; kuid_t user_id; kgid_t group_id; @@ -481,21 +509,32 @@ struct fuse_fs_context { bool destroy:1; bool no_control:1; bool no_force_umount:1; - bool no_mount_options:1; + bool legacy_opts_show:1; + enum fuse_dax_mode dax_mode; unsigned int max_read; unsigned int blksize; const char *subtype; + /* DAX device, may be NULL */ + struct dax_device *dax_dev; + /* fuse_dev pointer to fill in, should contain NULL on entry */ void **fudptr; }; +struct fuse_sync_bucket { + /* count is a possible scalability bottleneck */ + atomic_t count; + wait_queue_head_t waitq; + struct rcu_head rcu; +}; + /** * A Fuse connection. * - * This structure is created, when the filesystem is mounted, and is - * destroyed, when the client device is closed and the filesystem is - * unmounted. + * This structure is created, when the root filesystem is mounted, and + * is destroyed, when the client device is closed and the last + * fuse_mount is destroyed. */ struct fuse_conn { /** Lock protecting accessess to members of this structure */ @@ -527,9 +566,12 @@ struct fuse_conn { /** Maximum write size */ unsigned max_write; - /** Maxmum number of pages that can be used in a single request */ + /** Maximum number of pages that can be used in a single request */ unsigned int max_pages; + /** Constrain ->max_pages to this value during feature negotiation */ + unsigned int max_pages_limit; + /** Input queue */ struct fuse_iqueue iq; @@ -585,7 +627,7 @@ struct fuse_conn { /** Connection successful. Only set in INIT */ unsigned conn_init:1; - /** Do readpages asynchronously? Only set in INIT */ + /** Do readahead asynchronously? Only set in INIT */ unsigned async_read:1; /** Return an unique read error after abort. Only set in INIT */ @@ -609,6 +651,17 @@ struct fuse_conn { /** cache READLINK responses in page cache */ unsigned cache_symlinks:1; + /* show legacy mount options */ + unsigned int legacy_opts_show:1; + + /* + * fs kills suid/sgid/cap on write/chown/trunc. suid is killed on + * write/trunc only if caller did not have CAP_FSETID. sgid is killed + * on write/truncate only if caller did not have CAP_FSETID as well as + * file has group execute permission. 
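The comment just above spells out the FUSE_HANDLE_KILLPRIV_V2 semantics for write and truncate. A small userspace restatement of that rule (killpriv_v2_mode() is hypothetical; with this feature the clearing is performed by the FUSE server, the kernel merely stops doing it itself):

#include <stdbool.h>
#include <sys/stat.h>

/* Mode bits left after a write/truncate, depending on whether the caller
 * had CAP_FSETID, per the rule quoted above. */
static mode_t killpriv_v2_mode(mode_t mode, bool caller_has_fsetid)
{
	if (caller_has_fsetid)
		return mode;

	mode &= ~S_ISUID;		/* suid is always cleared */
	if (mode & S_IXGRP)		/* sgid only if group-executable */
		mode &= ~S_ISGID;
	return mode;
}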
+ */ + unsigned handle_killpriv_v2:1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction @@ -632,6 +685,9 @@ struct fuse_conn { /** Is setxattr not implemented by fs? */ unsigned no_setxattr:1; + /** Does file server support extended setxattr */ + unsigned setxattr_ext:1; + /** Is getxattr not implemented by fs? */ unsigned no_getxattr:1; @@ -677,7 +733,7 @@ struct fuse_conn { /** Use enhanced/automatic page cache invalidation. */ unsigned auto_inval_data:1; - /** Filesystem is fully reponsible for page cache invalidation. */ + /** Filesystem is fully responsible for page cache invalidation. */ unsigned explicit_inval_data:1; /** Does the filesystem support readdirplus? */ @@ -716,8 +772,20 @@ struct fuse_conn { /** Do not allow MNT_FORCE umount */ unsigned int no_force_umount:1; - /* Do not show mount options */ - unsigned int no_mount_options:1; + /* Auto-mount submounts announced by the server */ + unsigned int auto_submounts:1; + + /* Propagate syncfs() to server */ + unsigned int sync_fs:1; + + /* Initialize security xattrs when creating a new inode */ + unsigned int init_security:1; + + /* Does the filesystem support per inode DAX? */ + unsigned int inode_dax:1; + + /* Is tmpfile not implemented by fs? */ + unsigned int no_tmpfile:1; /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -725,10 +793,10 @@ struct fuse_conn { /** Negotiated minor version */ unsigned minor; - /** Entry on the fuse_conn_list */ + /** Entry on the fuse_mount_list */ struct list_head entry; - /** Device ID from super block */ + /** Device ID from the root super block */ dev_t dev; /** Dentries in the control filesystem */ @@ -746,24 +814,69 @@ struct fuse_conn { /** Called on final put */ void (*release)(struct fuse_conn *); - /** Super block for this connection. */ - struct super_block *sb; - - /** Read/write semaphore to hold when accessing sb. */ + /** + * Read/write semaphore to hold when accessing the sb of any + * fuse_mount belonging to this connection + */ struct rw_semaphore killsb; /** List of device instances belonging to this connection */ struct list_head devices; + +#ifdef CONFIG_FUSE_DAX + /* Dax mode */ + enum fuse_dax_mode dax_mode; + + /* Dax specific conn data, non-NULL if DAX is enabled */ + struct fuse_conn_dax *dax; +#endif + + /** List of filesystems using this connection */ + struct list_head mounts; + + /* New writepages go into this bucket */ + struct fuse_sync_bucket __rcu *curr_bucket; }; -static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) +/* + * Represents a mounted filesystem, potentially a submount. + * + * This object allows sharing a fuse_conn between separate mounts to + * allow submounts with dedicated superblocks and thus separate device + * IDs. + */ +struct fuse_mount { + /* Underlying (potentially shared) connection to the FUSE server */ + struct fuse_conn *fc; + + /* + * Super block for this connection (fc->killsb must be held when + * accessing this). 
+ */ + struct super_block *sb; + + /* Entry on fc->mounts */ + struct list_head fc_entry; +}; + +static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb) { return sb->s_fs_info; } +static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) +{ + return get_fuse_mount_super(sb)->fc; +} + +static inline struct fuse_mount *get_fuse_mount(struct inode *inode) +{ + return get_fuse_mount_super(inode->i_sb); +} + static inline struct fuse_conn *get_fuse_conn(struct inode *inode) { - return get_fuse_conn_super(inode->i_sb); + return get_fuse_mount_super(inode->i_sb)->fc; } static inline struct fuse_inode *get_fuse_inode(struct inode *inode) @@ -786,6 +899,55 @@ static inline u64 fuse_get_attr_version(struct fuse_conn *fc) return atomic64_read(&fc->attr_version); } +static inline bool fuse_stale_inode(const struct inode *inode, int generation, + struct fuse_attr *attr) +{ + return inode->i_generation != generation || + inode_wrong_type(inode, attr->mode); +} + +static inline void fuse_make_bad(struct inode *inode) +{ + remove_inode_hash(inode); + set_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state); +} + +static inline bool fuse_is_bad(struct inode *inode) +{ + return unlikely(test_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state)); +} + +static inline struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, + struct fuse_page_desc **desc) +{ + struct page **pages; + + pages = kzalloc(npages * (sizeof(struct page *) + + sizeof(struct fuse_page_desc)), flags); + *desc = (void *) (pages + npages); + + return pages; +} + +static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs, + unsigned int index, + unsigned int nr_pages) +{ + int i; + + for (i = index; i < index + nr_pages; i++) + descs[i].length = PAGE_SIZE - descs[i].offset; +} + +static inline void fuse_sync_bucket_dec(struct fuse_sync_bucket *bucket) +{ + /* Need RCU protection to prevent use after free after the decrement */ + rcu_read_lock(); + if (atomic_dec_and_test(&bucket->count)) + wake_up(&bucket->waitq); + rcu_read_unlock(); +} + /** Device operations */ extern const struct file_operations fuse_dev_operations; @@ -793,11 +955,6 @@ extern const struct dentry_operations fuse_dentry_operations; extern const struct dentry_operations fuse_root_dentry_operations; /** - * Inode to nodeid comparison. 
- */ -int fuse_inode_eq(struct inode *inode, void *_nodeidp); - -/** * Get a filled in inode */ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, @@ -831,6 +988,7 @@ struct fuse_io_args { struct { struct fuse_write_in in; struct fuse_write_out out; + bool page_locked; } write; }; struct fuse_args_pages ap; @@ -847,11 +1005,12 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, */ int fuse_open_common(struct inode *inode, struct file *file, bool isdir); -struct fuse_file *fuse_file_alloc(struct fuse_conn *fc); +struct fuse_file *fuse_file_alloc(struct fuse_mount *fm); void fuse_file_free(struct fuse_file *ff); void fuse_finish_open(struct inode *inode, struct file *file); -void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags); +void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, + unsigned int flags); /** * Send RELEASE or RELEASEDIR request @@ -873,7 +1032,7 @@ int fuse_notify_poll_wakeup(struct fuse_conn *fc, /** * Initialize file operations on a regular file */ -void fuse_init_file_inode(struct inode *inode); +void fuse_init_file_inode(struct inode *inode, unsigned int flags); /** * Initialize inode operations on regular files and special files @@ -897,7 +1056,9 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, u64 attr_valid, u64 attr_version); void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, - u64 attr_valid); + u64 attr_valid, u32 cache_mask); + +u32 fuse_get_cache_mask(struct inode *inode); /** * Initialize the client device @@ -915,14 +1076,14 @@ void __exit fuse_ctl_cleanup(void); /** * Simple request sending that does request allocation and freeing */ -ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args); -int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, +ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args); +int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, gfp_t gfp_flags); /** * End a finished request */ -void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req); +void fuse_request_end(struct fuse_req *req); /* Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc); @@ -931,7 +1092,15 @@ void fuse_wait_aborted(struct fuse_conn *fc); /** * Invalidate inode attributes */ + +/* Attributes possibly changed on data modification */ +#define FUSE_STATX_MODIFY (STATX_MTIME | STATX_CTIME | STATX_BLOCKS) + +/* Attributes possibly changed on data and/or size modification */ +#define FUSE_STATX_MODSIZE (FUSE_STATX_MODIFY | STATX_SIZE) + void fuse_invalidate_attr(struct inode *inode); +void fuse_invalidate_attr_mask(struct inode *inode, u32 mask); void fuse_invalidate_entry_cache(struct dentry *entry); @@ -948,7 +1117,8 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); /** * Initialize fuse_conn */ -void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, +void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, + struct user_namespace *user_ns, const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv); /** @@ -960,7 +1130,7 @@ struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc); struct fuse_dev *fuse_dev_alloc(void); void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc); void fuse_dev_free(struct fuse_dev *fud); -void fuse_send_init(struct fuse_conn *fc); +void fuse_send_init(struct fuse_mount *fm); /** * Fill in superblock and initialize fuse connection @@ -969,12 +1139,25 @@ void 
fuse_send_init(struct fuse_conn *fc); */ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx); -/** - * Disassociate fuse connection from superblock and kill the superblock +/* + * Remove the mount from the connection * - * Calls kill_anon_super(), do not use with bdev mounts. + * Returns whether this was the last mount + */ +bool fuse_mount_remove(struct fuse_mount *fm); + +/* + * Setup context ops for submounts + */ +int fuse_init_fs_context_submount(struct fs_context *fsc); + +/* + * Shut down the connection (possibly sending DESTROY request). */ -void fuse_kill_sb_anon(struct super_block *sb); +void fuse_conn_destroy(struct fuse_mount *fm); + +/* Drop the connection and free the fuse mount */ +void fuse_mount_destroy(struct fuse_mount *fm); /** * Add connection to control filesystem @@ -1000,9 +1183,10 @@ int fuse_allow_current_process(struct fuse_conn *fc); u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); +void fuse_flush_time_update(struct inode *inode); void fuse_update_ctime(struct inode *inode); -int fuse_update_attributes(struct inode *inode, struct file *file); +int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask); void fuse_flush_writepages(struct inode *inode); @@ -1010,9 +1194,19 @@ void fuse_set_nowrite(struct inode *inode); void fuse_release_nowrite(struct inode *inode); /** + * Scan all fuse_mounts belonging to fc to find the first where + * ilookup5() returns a result. Return that result and the + * respective fuse_mount in *fm (unless fm is NULL). + * + * The caller must hold fc->killsb. + */ +struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid, + struct fuse_mount **fm); + +/** * File-system tells the kernel to invalidate cache for the given node id. */ -int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, +int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, loff_t offset, loff_t len); /** @@ -1025,10 +1219,10 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, * - is a file or oan empty directory * then the dentry is unhashed (d_delete()). 
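Several prototypes above now take a statx-style attribute mask, paired with the FUSE_STATX_MODIFY / FUSE_STATX_MODSIZE definitions earlier in this header, so that only the attributes an operation can actually change get invalidated or refreshed. A hedged sketch of picking such a mask (write_inval_mask() is illustrative, not a kernel helper):

#include <linux/stat.h>
#include <stdbool.h>
#include <stdint.h>

#define FUSE_STATX_MODIFY	(STATX_MTIME | STATX_CTIME | STATX_BLOCKS)
#define FUSE_STATX_MODSIZE	(FUSE_STATX_MODIFY | STATX_SIZE)

/* A write that may have grown the file also invalidates the cached size. */
static uint32_t write_inval_mask(bool may_have_extended)
{
	return may_have_extended ? FUSE_STATX_MODSIZE : FUSE_STATX_MODIFY;
}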
*/ -int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, +int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, u64 child_nodeid, struct qstr *name); -int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, +int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, bool isdir); /** @@ -1050,7 +1244,7 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd, __poll_t fuse_file_poll(struct file *file, poll_table *wait); int fuse_dev_release(struct inode *inode, struct file *file); -bool fuse_write_update_size(struct inode *inode, loff_t pos); +bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written); int fuse_flush_times(struct inode *inode, struct fuse_file *ff); int fuse_write_inode(struct inode *inode, struct writeback_control *wbc); @@ -1064,7 +1258,7 @@ void fuse_unlock_inode(struct inode *inode, bool locked); bool fuse_lock_inode(struct inode *inode); int fuse_setxattr(struct inode *inode, const char *name, const void *value, - size_t size, int flags); + size_t size, int flags, unsigned int extra_flags); ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, size_t size); ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size); @@ -1074,9 +1268,9 @@ extern const struct xattr_handler *fuse_acl_xattr_handlers[]; extern const struct xattr_handler *fuse_no_acl_xattr_handlers[]; struct posix_acl; -struct posix_acl *fuse_get_acl(struct inode *inode, int type); -int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type); - +struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu); +int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); /* readdir.c */ int fuse_readdir(struct file *file, struct dir_context *ctx); @@ -1092,4 +1286,37 @@ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args); u64 fuse_get_unique(struct fuse_iqueue *fiq); void fuse_free_conn(struct fuse_conn *fc); +/* dax.c */ + +#define FUSE_IS_DAX(inode) (IS_ENABLED(CONFIG_FUSE_DAX) && IS_DAX(inode)) + +ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to); +ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from); +int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma); +int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, u64 dmap_end); +int fuse_dax_conn_alloc(struct fuse_conn *fc, enum fuse_dax_mode mode, + struct dax_device *dax_dev); +void fuse_dax_conn_free(struct fuse_conn *fc); +bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi); +void fuse_dax_inode_init(struct inode *inode, unsigned int flags); +void fuse_dax_inode_cleanup(struct inode *inode); +void fuse_dax_dontcache(struct inode *inode, unsigned int flags); +bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment); +void fuse_dax_cancel_work(struct fuse_conn *fc); + +/* ioctl.c */ +long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg); +int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int fuse_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa); + +/* file.c */ + +struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, + unsigned int open_flags, bool isdir); +void fuse_file_release(struct inode *inode, struct fuse_file *ff, + unsigned int open_flags, fl_owner_t id, bool 
isdir); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 95d712d44ca1..6b3beda16c1b 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -23,6 +23,7 @@ #include <linux/exportfs.h> #include <linux/posix_acl.h> #include <linux/pid_namespace.h> +#include <uapi/linux/magic.h> MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Filesystem in Userspace"); @@ -50,8 +51,6 @@ MODULE_PARM_DESC(max_user_congthresh, "Global limit for the maximum congestion threshold an " "unprivileged user can set"); -#define FUSE_SUPER_MAGIC 0x65735546 - #define FUSE_DEFAULT_BLKSIZE 512 /** Maximum number of outstanding background requests */ @@ -73,7 +72,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) { struct fuse_inode *fi; - fi = kmem_cache_alloc(fuse_inode_cachep, GFP_KERNEL); + fi = alloc_inode_sb(sb, fuse_inode_cachep, GFP_KERNEL); if (!fi) return NULL; @@ -87,12 +86,19 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) mutex_init(&fi->mutex); spin_lock_init(&fi->lock); fi->forget = fuse_alloc_forget(); - if (!fi->forget) { - kmem_cache_free(fuse_inode_cachep, fi); - return NULL; - } + if (!fi->forget) + goto out_free; + + if (IS_ENABLED(CONFIG_FUSE_DAX) && !fuse_dax_inode_alloc(sb, fi)) + goto out_free_forget; return &fi->inode; + +out_free_forget: + kfree(fi->forget); +out_free: + kmem_cache_free(fuse_inode_cachep, fi); + return NULL; } static void fuse_free_inode(struct inode *inode) @@ -101,6 +107,9 @@ static void fuse_free_inode(struct inode *inode) mutex_destroy(&fi->mutex); kfree(fi->forget); +#ifdef CONFIG_FUSE_DAX + kfree(fi->dax); +#endif kmem_cache_free(fuse_inode_cachep, fi); } @@ -108,23 +117,34 @@ static void fuse_evict_inode(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); + /* Will write inode on close/munmap and in all other dirtiers */ + WARN_ON(inode->i_state & I_DIRTY_INODE); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (inode->i_sb->s_flags & SB_ACTIVE) { struct fuse_conn *fc = get_fuse_conn(inode); - fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup); - fi->forget = NULL; + + if (FUSE_IS_DAX(inode)) + fuse_dax_inode_cleanup(inode); + if (fi->nlookup) { + fuse_queue_forget(fc, fi->forget, fi->nodeid, + fi->nlookup); + fi->forget = NULL; + } } - if (S_ISREG(inode->i_mode) && !is_bad_inode(inode)) { + if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) { WARN_ON(!list_empty(&fi->write_files)); WARN_ON(!list_empty(&fi->queued_writes)); } } -static int fuse_remount_fs(struct super_block *sb, int *flags, char *data) +static int fuse_reconfigure(struct fs_context *fsc) { + struct super_block *sb = fsc->root->d_sb; + sync_filesystem(sb); - if (*flags & SB_MANDLOCK) + if (fsc->sb_flags & SB_MANDLOCK) return -EINVAL; return 0; @@ -143,7 +163,7 @@ static ino_t fuse_squash_ino(u64 ino64) } void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, - u64 attr_valid) + u64 attr_valid, u32 cache_mask) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); @@ -160,12 +180,20 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_uid = make_kuid(fc->user_ns, attr->uid); inode->i_gid = make_kgid(fc->user_ns, attr->gid); inode->i_blocks = attr->blocks; + + /* Sanitize nsecs */ + attr->atimensec = min_t(u32, attr->atimensec, NSEC_PER_SEC - 1); + attr->mtimensec = min_t(u32, attr->mtimensec, NSEC_PER_SEC - 1); + attr->ctimensec = min_t(u32, attr->ctimensec, NSEC_PER_SEC - 1); + 
inode->i_atime.tv_sec = attr->atime; inode->i_atime.tv_nsec = attr->atimensec; /* mtime from server may be stale due to local buffered write */ - if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) { + if (!(cache_mask & STATX_MTIME)) { inode->i_mtime.tv_sec = attr->mtime; inode->i_mtime.tv_nsec = attr->mtimensec; + } + if (!(cache_mask & STATX_CTIME)) { inode->i_ctime.tv_sec = attr->ctime; inode->i_ctime.tv_nsec = attr->ctimensec; } @@ -185,6 +213,26 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_mode &= ~S_ISVTX; fi->orig_ino = attr->ino; + + /* + * We are refreshing inode data and it is possible that another + * client set suid/sgid or security.capability xattr. So clear + * S_NOSEC. Ideally, we could have cleared it only if suid/sgid + * was set or if security.capability xattr was set. But we don't + * know if security.capability has been set or not. So clear it + * anyway. Its less efficient but should be safe. + */ + inode->i_flags &= ~S_NOSEC; +} + +u32 fuse_get_cache_mask(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) + return 0; + + return STATX_MTIME | STATX_CTIME | STATX_SIZE; } void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, @@ -192,11 +240,29 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - bool is_wb = fc->writeback_cache; + u32 cache_mask; loff_t oldsize; struct timespec64 old_mtime; spin_lock(&fi->lock); + /* + * In case of writeback_cache enabled, writes update mtime, ctime and + * may update i_size. In these cases trust the cached value in the + * inode. + */ + cache_mask = fuse_get_cache_mask(inode); + if (cache_mask & STATX_SIZE) + attr->size = i_size_read(inode); + + if (cache_mask & STATX_MTIME) { + attr->mtime = inode->i_mtime.tv_sec; + attr->mtimensec = inode->i_mtime.tv_nsec; + } + if (cache_mask & STATX_CTIME) { + attr->ctime = inode->i_ctime.tv_sec; + attr->ctimensec = inode->i_ctime.tv_nsec; + } + if ((attr_version != 0 && fi->attr_version > attr_version) || test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { spin_unlock(&fi->lock); @@ -204,7 +270,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, } old_mtime = inode->i_mtime; - fuse_change_attributes_common(inode, attr, attr_valid); + fuse_change_attributes_common(inode, attr, attr_valid, cache_mask); oldsize = inode->i_size; /* @@ -212,11 +278,11 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, * extend local i_size without keeping userspace server in sync. So, * attr->size coming from server can be stale. We cannot trust it. 
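fuse_get_cache_mask() above marks size/mtime/ctime as cache-owned when writeback caching is active on a regular file, and fuse_change_attributes() then refuses to let server-supplied values overwrite them. A tiny userspace illustration of that gating for the size field (merge_size() is an illustrative name only):

#include <linux/stat.h>
#include <stdint.h>

/* Apply a server-reported size only when the size is not cache-owned. */
static void merge_size(uint32_t cache_mask, uint64_t *cached_size,
		       uint64_t server_size)
{
	if (!(cache_mask & STATX_SIZE))
		*cached_size = server_size;
}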
*/ - if (!is_wb || !S_ISREG(inode->i_mode)) + if (!(cache_mask & STATX_SIZE)) i_size_write(inode, attr->size); spin_unlock(&fi->lock); - if (!is_wb && S_ISREG(inode->i_mode)) { + if (!cache_mask && S_ISREG(inode->i_mode)) { bool inval = false; if (oldsize != attr->size) { @@ -240,6 +306,9 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, if (inval) invalidate_inode_pages2(inode->i_mapping); } + + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_dontcache(inode, attr->flags); } static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) @@ -252,7 +321,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) inode->i_ctime.tv_nsec = attr->ctimensec; if (S_ISREG(inode->i_mode)) { fuse_init_common(inode); - fuse_init_file_inode(inode); + fuse_init_file_inode(inode, attr->flags); } else if (S_ISDIR(inode->i_mode)) fuse_init_dir(inode); else if (S_ISLNK(inode->i_mode)) @@ -266,7 +335,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) BUG(); } -int fuse_inode_eq(struct inode *inode, void *_nodeidp) +static int fuse_inode_eq(struct inode *inode, void *_nodeidp) { u64 nodeid = *(u64 *) _nodeidp; if (get_node_id(inode) == nodeid) @@ -290,7 +359,26 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, struct fuse_inode *fi; struct fuse_conn *fc = get_fuse_conn_super(sb); - retry: + /* + * Auto mount points get their node id from the submount root, which is + * not a unique identifier within this filesystem. + * + * To avoid conflicts, do not place submount points into the inode hash + * table. + */ + if (fc->auto_submounts && (attr->flags & FUSE_ATTR_SUBMOUNT) && + S_ISDIR(attr->mode)) { + inode = new_inode(sb); + if (!inode) + return NULL; + + fuse_init_inode(inode, attr); + get_fuse_inode(inode)->nodeid = nodeid; + inode->i_flags |= S_AUTOMOUNT; + goto done; + } + +retry: inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid); if (!inode) return NULL; @@ -302,13 +390,13 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, inode->i_generation = generation; fuse_init_inode(inode, attr); unlock_new_inode(inode); - } else if ((inode->i_mode ^ attr->mode) & S_IFMT) { - /* Inode has changed type, any I/O on the old should fail */ - make_bad_inode(inode); + } else if (fuse_stale_inode(inode, generation, attr)) { + /* nodeid was reused, any I/O on the old inode should fail */ + fuse_make_bad(inode); iput(inode); goto retry; } - +done: fi = get_fuse_inode(inode); spin_lock(&fi->lock); fi->nlookup++; @@ -318,17 +406,45 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, return inode; } -int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, +struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid, + struct fuse_mount **fm) +{ + struct fuse_mount *fm_iter; + struct inode *inode; + + WARN_ON(!rwsem_is_locked(&fc->killsb)); + list_for_each_entry(fm_iter, &fc->mounts, fc_entry) { + if (!fm_iter->sb) + continue; + + inode = ilookup5(fm_iter->sb, nodeid, fuse_inode_eq, &nodeid); + if (inode) { + if (fm) + *fm = fm_iter; + return inode; + } + } + + return NULL; +} + +int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, loff_t offset, loff_t len) { + struct fuse_inode *fi; struct inode *inode; pgoff_t pg_start; pgoff_t pg_end; - inode = ilookup5(sb, nodeid, fuse_inode_eq, &nodeid); + inode = fuse_ilookup(fc, nodeid, NULL); if (!inode) return -ENOENT; + fi = get_fuse_inode(inode); + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); + 
spin_unlock(&fi->lock); + fuse_invalidate_attr(inode); forget_all_cached_acls(inode); if (offset >= 0) { @@ -366,34 +482,28 @@ static void fuse_umount_begin(struct super_block *sb) { struct fuse_conn *fc = get_fuse_conn_super(sb); - if (!fc->no_force_umount) - fuse_abort_conn(fc); + if (fc->no_force_umount) + return; + + fuse_abort_conn(fc); + + // Only retire block-device-based superblocks. + if (sb->s_bdev != NULL) + retire_super(sb); } -static void fuse_send_destroy(struct fuse_conn *fc) +static void fuse_send_destroy(struct fuse_mount *fm) { - if (fc->conn_init) { + if (fm->fc->conn_init) { FUSE_ARGS(args); args.opcode = FUSE_DESTROY; args.force = true; args.nocreds = true; - fuse_simple_request(fc, &args); + fuse_simple_request(fm, &args); } } -static void fuse_put_super(struct super_block *sb) -{ - struct fuse_conn *fc = get_fuse_conn_super(sb); - - mutex_lock(&fuse_mutex); - list_del(&fc->entry); - fuse_ctl_remove_conn(fc); - mutex_unlock(&fuse_mutex); - - fuse_conn_put(fc); -} - static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr) { stbuf->f_type = FUSE_SUPER_MAGIC; @@ -411,12 +521,12 @@ static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; - struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_mount *fm = get_fuse_mount_super(sb); FUSE_ARGS(args); struct fuse_statfs_out outarg; int err; - if (!fuse_allow_current_process(fc)) { + if (!fuse_allow_current_process(fm->fc)) { buf->f_type = FUSE_SUPER_MAGIC; return 0; } @@ -428,12 +538,104 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (!err) convert_fuse_statfs(buf, &outarg.st); return err; } +static struct fuse_sync_bucket *fuse_sync_bucket_alloc(void) +{ + struct fuse_sync_bucket *bucket; + + bucket = kzalloc(sizeof(*bucket), GFP_KERNEL | __GFP_NOFAIL); + if (bucket) { + init_waitqueue_head(&bucket->waitq); + /* Initial active count */ + atomic_set(&bucket->count, 1); + } + return bucket; +} + +static void fuse_sync_fs_writes(struct fuse_conn *fc) +{ + struct fuse_sync_bucket *bucket, *new_bucket; + int count; + + new_bucket = fuse_sync_bucket_alloc(); + spin_lock(&fc->lock); + bucket = rcu_dereference_protected(fc->curr_bucket, 1); + count = atomic_read(&bucket->count); + WARN_ON(count < 1); + /* No outstanding writes? */ + if (count == 1) { + spin_unlock(&fc->lock); + kfree(new_bucket); + return; + } + + /* + * Completion of new bucket depends on completion of this bucket, so add + * one more count. + */ + atomic_inc(&new_bucket->count); + rcu_assign_pointer(fc->curr_bucket, new_bucket); + spin_unlock(&fc->lock); + /* + * Drop initial active count. At this point if all writes in this and + * ancestor buckets complete, the count will go to zero and this task + * will be woken up. + */ + atomic_dec(&bucket->count); + + wait_event(bucket->waitq, atomic_read(&bucket->count) == 0); + + /* Drop temp count on descendant bucket */ + fuse_sync_bucket_dec(new_bucket); + kfree_rcu(bucket, rcu); +} + +static int fuse_sync_fs(struct super_block *sb, int wait) +{ + struct fuse_mount *fm = get_fuse_mount_super(sb); + struct fuse_conn *fc = fm->fc; + struct fuse_syncfs_in inarg; + FUSE_ARGS(args); + int err; + + /* + * Userspace cannot handle the wait == 0 case. 
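fuse_sync_fs_writes() above drains in-flight writeback by installing a fresh bucket and waiting for the old bucket's count to reach zero. A simplified, single-waiter userspace model of the same counting scheme, assuming pthreads instead of wait queues and ignoring the RCU and bucket-chaining details (all names are illustrative):

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct sync_bucket {
	atomic_int count;		/* 1 initial ref + in-flight writes */
	pthread_mutex_t lock;
	pthread_cond_t drained;
};

static struct sync_bucket *bucket_alloc(void)
{
	struct sync_bucket *b = calloc(1, sizeof(*b));

	atomic_init(&b->count, 1);	/* initial active count */
	pthread_mutex_init(&b->lock, NULL);
	pthread_cond_init(&b->drained, NULL);
	return b;
}

/* A writer pins the current bucket with atomic_fetch_add(&b->count, 1)
 * before starting and calls bucket_dec() when its writeback completes. */
static void bucket_dec(struct sync_bucket *b)
{
	if (atomic_fetch_sub(&b->count, 1) == 1) {
		pthread_mutex_lock(&b->lock);
		pthread_cond_broadcast(&b->drained);
		pthread_mutex_unlock(&b->lock);
	}
}

/* syncfs side: swap in a new bucket, then wait for the old one to drain. */
static struct sync_bucket *sync_writes(struct sync_bucket **curr)
{
	struct sync_bucket *old = *curr;

	*curr = bucket_alloc();
	bucket_dec(old);		/* drop the initial reference */

	pthread_mutex_lock(&old->lock);
	while (atomic_load(&old->count) > 0)
		pthread_cond_wait(&old->drained, &old->lock);
	pthread_mutex_unlock(&old->lock);

	return old;			/* caller frees it */
}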
Avoid a + * gratuitous roundtrip. + */ + if (!wait) + return 0; + + /* The filesystem is being unmounted. Nothing to do. */ + if (!sb->s_root) + return 0; + + if (!fc->sync_fs) + return 0; + + fuse_sync_fs_writes(fc); + + memset(&inarg, 0, sizeof(inarg)); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.opcode = FUSE_SYNCFS; + args.nodeid = get_node_id(sb->s_root->d_inode); + args.out_numargs = 0; + + err = fuse_simple_request(fm, &args); + if (err == -ENOSYS) { + fc->sync_fs = 0; + err = 0; + } + + return err; +} + enum { OPT_SOURCE, OPT_SUBTYPE, @@ -462,27 +664,38 @@ static const struct fs_parameter_spec fuse_fs_parameters[] = { {} }; -static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param) +static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param) { struct fs_parse_result result; - struct fuse_fs_context *ctx = fc->fs_private; + struct fuse_fs_context *ctx = fsc->fs_private; int opt; - opt = fs_parse(fc, fuse_fs_parameters, param, &result); + if (fsc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + /* + * Ignore options coming from mount(MS_REMOUNT) for backward + * compatibility. + */ + if (fsc->oldapi) + return 0; + + return invalfc(fsc, "No changes allowed in reconfigure"); + } + + opt = fs_parse(fsc, fuse_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case OPT_SOURCE: - if (fc->source) - return invalfc(fc, "Multiple sources specified"); - fc->source = param->string; + if (fsc->source) + return invalfc(fsc, "Multiple sources specified"); + fsc->source = param->string; param->string = NULL; break; case OPT_SUBTYPE: if (ctx->subtype) - return invalfc(fc, "Multiple subtypes specified"); + return invalfc(fsc, "Multiple subtypes specified"); ctx->subtype = param->string; param->string = NULL; return 0; @@ -494,22 +707,22 @@ static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param) case OPT_ROOTMODE: if (!fuse_valid_type(result.uint_32)) - return invalfc(fc, "Invalid rootmode"); + return invalfc(fsc, "Invalid rootmode"); ctx->rootmode = result.uint_32; ctx->rootmode_present = true; break; case OPT_USER_ID: - ctx->user_id = make_kuid(fc->user_ns, result.uint_32); + ctx->user_id = make_kuid(fsc->user_ns, result.uint_32); if (!uid_valid(ctx->user_id)) - return invalfc(fc, "Invalid user_id"); + return invalfc(fsc, "Invalid user_id"); ctx->user_id_present = true; break; case OPT_GROUP_ID: - ctx->group_id = make_kgid(fc->user_ns, result.uint_32); + ctx->group_id = make_kgid(fsc->user_ns, result.uint_32); if (!gid_valid(ctx->group_id)) - return invalfc(fc, "Invalid group_id"); + return invalfc(fsc, "Invalid group_id"); ctx->group_id_present = true; break; @@ -527,7 +740,7 @@ static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param) case OPT_BLKSIZE: if (!ctx->is_bdev) - return invalfc(fc, "blksize only supported for fuseblk"); + return invalfc(fsc, "blksize only supported for fuseblk"); ctx->blksize = result.uint_32; break; @@ -538,9 +751,9 @@ static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param) return 0; } -static void fuse_free_fc(struct fs_context *fc) +static void fuse_free_fsc(struct fs_context *fsc) { - struct fuse_fs_context *ctx = fc->fs_private; + struct fuse_fs_context *ctx = fsc->fs_private; if (ctx) { kfree(ctx->subtype); @@ -553,19 +766,29 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) struct super_block *sb = root->d_sb; struct fuse_conn *fc = get_fuse_conn_super(sb); - if 
(fc->no_mount_options) - return 0; + if (fc->legacy_opts_show) { + seq_printf(m, ",user_id=%u", + from_kuid_munged(fc->user_ns, fc->user_id)); + seq_printf(m, ",group_id=%u", + from_kgid_munged(fc->user_ns, fc->group_id)); + if (fc->default_permissions) + seq_puts(m, ",default_permissions"); + if (fc->allow_other) + seq_puts(m, ",allow_other"); + if (fc->max_read != ~0) + seq_printf(m, ",max_read=%u", fc->max_read); + if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE) + seq_printf(m, ",blksize=%lu", sb->s_blocksize); + } +#ifdef CONFIG_FUSE_DAX + if (fc->dax_mode == FUSE_DAX_ALWAYS) + seq_puts(m, ",dax=always"); + else if (fc->dax_mode == FUSE_DAX_NEVER) + seq_puts(m, ",dax=never"); + else if (fc->dax_mode == FUSE_DAX_INODE_USER) + seq_puts(m, ",dax=inode"); +#endif - seq_printf(m, ",user_id=%u", from_kuid_munged(fc->user_ns, fc->user_id)); - seq_printf(m, ",group_id=%u", from_kgid_munged(fc->user_ns, fc->group_id)); - if (fc->default_permissions) - seq_puts(m, ",default_permissions"); - if (fc->allow_other) - seq_puts(m, ",allow_other"); - if (fc->max_read != ~0) - seq_printf(m, ",max_read=%u", fc->max_read); - if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE) - seq_printf(m, ",blksize=%lu", sb->s_blocksize); return 0; } @@ -595,7 +818,8 @@ static void fuse_pqueue_init(struct fuse_pqueue *fpq) fpq->connected = 1; } -void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, +void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, + struct user_namespace *user_ns, const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv) { memset(fc, 0, sizeof(*fc)); @@ -622,6 +846,11 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); fc->user_ns = get_user_ns(user_ns); fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; + fc->max_pages_limit = FUSE_MAX_MAX_PAGES; + + INIT_LIST_HEAD(&fc->mounts); + list_add(&fm->fc_entry, &fc->mounts); + fm->fc = fc; } EXPORT_SYMBOL_GPL(fuse_conn_init); @@ -629,11 +858,19 @@ void fuse_conn_put(struct fuse_conn *fc) { if (refcount_dec_and_test(&fc->count)) { struct fuse_iqueue *fiq = &fc->iq; + struct fuse_sync_bucket *bucket; + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_conn_free(fc); if (fiq->ops->release) fiq->ops->release(fiq); put_pid_ns(fc->pid_ns); put_user_ns(fc->user_ns); + bucket = rcu_dereference_protected(fc->curr_bucket, 1); + if (bucket) { + WARN_ON(atomic_read(&bucket->count) != 1); + kfree(bucket); + } fc->release(fc); } } @@ -776,14 +1013,13 @@ static struct dentry *fuse_get_parent(struct dentry *child) struct inode *inode; struct dentry *parent; struct fuse_entry_out outarg; - const struct qstr name = QSTR_INIT("..", 2); int err; if (!fc->export_support) return ERR_PTR(-ESTALE); err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode), - &name, &outarg, &inode); + &dotdot_name, &outarg, &inode); if (err) { if (err == -ENOENT) return ERR_PTR(-ESTALE); @@ -810,10 +1046,9 @@ static const struct super_operations fuse_super_operations = { .evict_inode = fuse_evict_inode, .write_inode = fuse_write_inode, .drop_inode = generic_delete_inode, - .remount_fs = fuse_remount_fs, - .put_super = fuse_put_super, .umount_begin = fuse_umount_begin, .statfs = fuse_statfs, + .sync_fs = fuse_sync_fs, .show_options = fuse_show_options, }; @@ -876,84 +1111,104 @@ struct fuse_init_args { struct fuse_init_out out; }; -static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args, +static void process_init_reply(struct fuse_mount *fm, struct 
fuse_args *args, int error) { + struct fuse_conn *fc = fm->fc; struct fuse_init_args *ia = container_of(args, typeof(*ia), args); struct fuse_init_out *arg = &ia->out; + bool ok = true; if (error || arg->major != FUSE_KERNEL_VERSION) - fc->conn_error = 1; + ok = false; else { unsigned long ra_pages; process_init_limits(fc, arg); if (arg->minor >= 6) { + u64 flags = arg->flags | (u64) arg->flags2 << 32; + ra_pages = arg->max_readahead / PAGE_SIZE; - if (arg->flags & FUSE_ASYNC_READ) + if (flags & FUSE_ASYNC_READ) fc->async_read = 1; - if (!(arg->flags & FUSE_POSIX_LOCKS)) + if (!(flags & FUSE_POSIX_LOCKS)) fc->no_lock = 1; if (arg->minor >= 17) { - if (!(arg->flags & FUSE_FLOCK_LOCKS)) + if (!(flags & FUSE_FLOCK_LOCKS)) fc->no_flock = 1; } else { - if (!(arg->flags & FUSE_POSIX_LOCKS)) + if (!(flags & FUSE_POSIX_LOCKS)) fc->no_flock = 1; } - if (arg->flags & FUSE_ATOMIC_O_TRUNC) + if (flags & FUSE_ATOMIC_O_TRUNC) fc->atomic_o_trunc = 1; if (arg->minor >= 9) { /* LOOKUP has dependency on proto version */ - if (arg->flags & FUSE_EXPORT_SUPPORT) + if (flags & FUSE_EXPORT_SUPPORT) fc->export_support = 1; } - if (arg->flags & FUSE_BIG_WRITES) + if (flags & FUSE_BIG_WRITES) fc->big_writes = 1; - if (arg->flags & FUSE_DONT_MASK) + if (flags & FUSE_DONT_MASK) fc->dont_mask = 1; - if (arg->flags & FUSE_AUTO_INVAL_DATA) + if (flags & FUSE_AUTO_INVAL_DATA) fc->auto_inval_data = 1; - else if (arg->flags & FUSE_EXPLICIT_INVAL_DATA) + else if (flags & FUSE_EXPLICIT_INVAL_DATA) fc->explicit_inval_data = 1; - if (arg->flags & FUSE_DO_READDIRPLUS) { + if (flags & FUSE_DO_READDIRPLUS) { fc->do_readdirplus = 1; - if (arg->flags & FUSE_READDIRPLUS_AUTO) + if (flags & FUSE_READDIRPLUS_AUTO) fc->readdirplus_auto = 1; } - if (arg->flags & FUSE_ASYNC_DIO) + if (flags & FUSE_ASYNC_DIO) fc->async_dio = 1; - if (arg->flags & FUSE_WRITEBACK_CACHE) + if (flags & FUSE_WRITEBACK_CACHE) fc->writeback_cache = 1; - if (arg->flags & FUSE_PARALLEL_DIROPS) + if (flags & FUSE_PARALLEL_DIROPS) fc->parallel_dirops = 1; - if (arg->flags & FUSE_HANDLE_KILLPRIV) + if (flags & FUSE_HANDLE_KILLPRIV) fc->handle_killpriv = 1; if (arg->time_gran && arg->time_gran <= 1000000000) - fc->sb->s_time_gran = arg->time_gran; - if ((arg->flags & FUSE_POSIX_ACL)) { + fm->sb->s_time_gran = arg->time_gran; + if ((flags & FUSE_POSIX_ACL)) { fc->default_permissions = 1; fc->posix_acl = 1; - fc->sb->s_xattr = fuse_acl_xattr_handlers; + fm->sb->s_xattr = fuse_acl_xattr_handlers; } - if (arg->flags & FUSE_CACHE_SYMLINKS) + if (flags & FUSE_CACHE_SYMLINKS) fc->cache_symlinks = 1; - if (arg->flags & FUSE_ABORT_ERROR) + if (flags & FUSE_ABORT_ERROR) fc->abort_err = 1; - if (arg->flags & FUSE_MAX_PAGES) { + if (flags & FUSE_MAX_PAGES) { fc->max_pages = - min_t(unsigned int, FUSE_MAX_MAX_PAGES, + min_t(unsigned int, fc->max_pages_limit, max_t(unsigned int, arg->max_pages, 1)); } + if (IS_ENABLED(CONFIG_FUSE_DAX)) { + if (flags & FUSE_MAP_ALIGNMENT && + !fuse_dax_check_alignment(fc, arg->map_alignment)) { + ok = false; + } + if (flags & FUSE_HAS_INODE_DAX) + fc->inode_dax = 1; + } + if (flags & FUSE_HANDLE_KILLPRIV_V2) { + fc->handle_killpriv_v2 = 1; + fm->sb->s_flags |= SB_NOSEC; + } + if (flags & FUSE_SETXATTR_EXT) + fc->setxattr_ext = 1; + if (flags & FUSE_SECURITY_CTX) + fc->init_security = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; fc->no_flock = 1; } - fc->sb->s_bdi->ra_pages = - min(fc->sb->s_bdi->ra_pages, ra_pages); + fm->sb->s_bdi->ra_pages = + min(fm->sb->s_bdi->ra_pages, ra_pages); fc->minor = arg->minor; fc->max_write = 
arg->minor < 5 ? 4096 : arg->max_write; fc->max_write = max_t(unsigned, 4096, fc->max_write); @@ -961,20 +1216,26 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args, } kfree(ia); + if (!ok) { + fc->conn_init = 0; + fc->conn_error = 1; + } + fuse_set_initialized(fc); wake_up_all(&fc->blocked_waitq); } -void fuse_send_init(struct fuse_conn *fc) +void fuse_send_init(struct fuse_mount *fm) { struct fuse_init_args *ia; + u64 flags; ia = kzalloc(sizeof(*ia), GFP_KERNEL | __GFP_NOFAIL); ia->in.major = FUSE_KERNEL_VERSION; ia->in.minor = FUSE_KERNEL_MINOR_VERSION; - ia->in.max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE; - ia->in.flags |= + ia->in.max_readahead = fm->sb->s_bdi->ra_pages * PAGE_SIZE; + flags = FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | @@ -983,7 +1244,21 @@ void fuse_send_init(struct fuse_conn *fc) FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | - FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA; + FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | + FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | + FUSE_SECURITY_CTX; +#ifdef CONFIG_FUSE_DAX + if (fm->fc->dax) + flags |= FUSE_MAP_ALIGNMENT; + if (fuse_is_inode_dax_mode(fm->fc->dax_mode)) + flags |= FUSE_HAS_INODE_DAX; +#endif + if (fm->fc->auto_submounts) + flags |= FUSE_SUBMOUNTS; + + ia->in.flags = flags; + ia->in.flags2 = flags >> 32; + ia->args.opcode = FUSE_INIT; ia->args.in_numargs = 1; ia->args.in_args[0].size = sizeof(ia->in); @@ -999,8 +1274,8 @@ void fuse_send_init(struct fuse_conn *fc) ia->args.nocreds = true; ia->args.end = process_init_reply; - if (fuse_simple_background(fc, &ia->args, GFP_KERNEL) != 0) - process_init_reply(fc, &ia->args, -ENOTCONN); + if (fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0) + process_init_reply(fm, &ia->args, -ENOTCONN); } EXPORT_SYMBOL_GPL(fuse_send_init); @@ -1030,9 +1305,9 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) if (err) return err; - sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; /* fuse does it's own writeback accounting */ - sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT; + sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT; + sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT; /* * For a single fuse filesystem use max 1% of dirty + @@ -1111,10 +1386,141 @@ void fuse_dev_free(struct fuse_dev *fud) } EXPORT_SYMBOL_GPL(fuse_dev_free); +static void fuse_fill_attr_from_inode(struct fuse_attr *attr, + const struct fuse_inode *fi) +{ + *attr = (struct fuse_attr){ + .ino = fi->inode.i_ino, + .size = fi->inode.i_size, + .blocks = fi->inode.i_blocks, + .atime = fi->inode.i_atime.tv_sec, + .mtime = fi->inode.i_mtime.tv_sec, + .ctime = fi->inode.i_ctime.tv_sec, + .atimensec = fi->inode.i_atime.tv_nsec, + .mtimensec = fi->inode.i_mtime.tv_nsec, + .ctimensec = fi->inode.i_ctime.tv_nsec, + .mode = fi->inode.i_mode, + .nlink = fi->inode.i_nlink, + .uid = fi->inode.i_uid.val, + .gid = fi->inode.i_gid.val, + .rdev = fi->inode.i_rdev, + .blksize = 1u << fi->inode.i_blkbits, + }; +} + +static void fuse_sb_defaults(struct super_block *sb) +{ + sb->s_magic = FUSE_SUPER_MAGIC; + sb->s_op = &fuse_super_operations; + sb->s_xattr = fuse_xattr_handlers; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_time_gran = 1; + sb->s_export_op = &fuse_export_operations; + 
sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE; + if (sb->s_user_ns != &init_user_ns) + sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; + sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); + + /* + * If we are not in the initial user namespace posix + * acls must be translated. + */ + if (sb->s_user_ns != &init_user_ns) + sb->s_xattr = fuse_no_acl_xattr_handlers; +} + +static int fuse_fill_super_submount(struct super_block *sb, + struct fuse_inode *parent_fi) +{ + struct fuse_mount *fm = get_fuse_mount_super(sb); + struct super_block *parent_sb = parent_fi->inode.i_sb; + struct fuse_attr root_attr; + struct inode *root; + + fuse_sb_defaults(sb); + fm->sb = sb; + + WARN_ON(sb->s_bdi != &noop_backing_dev_info); + sb->s_bdi = bdi_get(parent_sb->s_bdi); + + sb->s_xattr = parent_sb->s_xattr; + sb->s_time_gran = parent_sb->s_time_gran; + sb->s_blocksize = parent_sb->s_blocksize; + sb->s_blocksize_bits = parent_sb->s_blocksize_bits; + sb->s_subtype = kstrdup(parent_sb->s_subtype, GFP_KERNEL); + if (parent_sb->s_subtype && !sb->s_subtype) + return -ENOMEM; + + fuse_fill_attr_from_inode(&root_attr, parent_fi); + root = fuse_iget(sb, parent_fi->nodeid, 0, &root_attr, 0, 0); + /* + * This inode is just a duplicate, so it is not looked up and + * its nlookup should not be incremented. fuse_iget() does + * that, though, so undo it here. + */ + get_fuse_inode(root)->nlookup--; + sb->s_d_op = &fuse_dentry_operations; + sb->s_root = d_make_root(root); + if (!sb->s_root) + return -ENOMEM; + + return 0; +} + +/* Filesystem context private data holds the FUSE inode of the mount point */ +static int fuse_get_tree_submount(struct fs_context *fsc) +{ + struct fuse_mount *fm; + struct fuse_inode *mp_fi = fsc->fs_private; + struct fuse_conn *fc = get_fuse_conn(&mp_fi->inode); + struct super_block *sb; + int err; + + fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL); + if (!fm) + return -ENOMEM; + + fm->fc = fuse_conn_get(fc); + fsc->s_fs_info = fm; + sb = sget_fc(fsc, NULL, set_anon_super_fc); + if (fsc->s_fs_info) + fuse_mount_destroy(fm); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + /* Initialize superblock, making @mp_fi its root */ + err = fuse_fill_super_submount(sb, mp_fi); + if (err) { + deactivate_locked_super(sb); + return err; + } + + down_write(&fc->killsb); + list_add_tail(&fm->fc_entry, &fc->mounts); + up_write(&fc->killsb); + + sb->s_flags |= SB_ACTIVE; + fsc->root = dget(sb->s_root); + + return 0; +} + +static const struct fs_context_operations fuse_context_submount_ops = { + .get_tree = fuse_get_tree_submount, +}; + +int fuse_init_fs_context_submount(struct fs_context *fsc) +{ + fsc->ops = &fuse_context_submount_ops; + return 0; +} +EXPORT_SYMBOL_GPL(fuse_init_fs_context_submount); + int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) { - struct fuse_dev *fud; - struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_dev *fud = NULL; + struct fuse_mount *fm = get_fuse_mount_super(sb); + struct fuse_conn *fc = fm->fc; struct inode *root; struct dentry *root_dentry; int err; @@ -1123,7 +1529,8 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) if (sb->s_flags & SB_MANDLOCK) goto err; - sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); + rcu_assign_pointer(fc->curr_bucket, fuse_sync_bucket_alloc()); + fuse_sb_defaults(sb); if (ctx->is_bdev) { #ifdef CONFIG_BLOCK @@ -1138,29 +1545,21 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) sb->s_subtype = ctx->subtype; ctx->subtype = NULL; - sb->s_magic = FUSE_SUPER_MAGIC; - 
sb->s_op = &fuse_super_operations; - sb->s_xattr = fuse_xattr_handlers; - sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_time_gran = 1; - sb->s_export_op = &fuse_export_operations; - sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE; - if (sb->s_user_ns != &init_user_ns) - sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; - - /* - * If we are not in the initial user namespace posix - * acls must be translated. - */ - if (sb->s_user_ns != &init_user_ns) - sb->s_xattr = fuse_no_acl_xattr_handlers; + if (IS_ENABLED(CONFIG_FUSE_DAX)) { + err = fuse_dax_conn_alloc(fc, ctx->dax_mode, ctx->dax_dev); + if (err) + goto err; + } - fud = fuse_dev_alloc_install(fc); - if (!fud) - goto err; + if (ctx->fudptr) { + err = -ENOMEM; + fud = fuse_dev_alloc_install(fc); + if (!fud) + goto err_free_dax; + } fc->dev = sb->s_dev; - fc->sb = sb; + fm->sb = sb; err = fuse_bdi_init(fc, sb); if (err) goto err_dev_free; @@ -1174,11 +1573,11 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) fc->allow_other = ctx->allow_other; fc->user_id = ctx->user_id; fc->group_id = ctx->group_id; - fc->max_read = max_t(unsigned, 4096, ctx->max_read); + fc->legacy_opts_show = ctx->legacy_opts_show; + fc->max_read = max_t(unsigned int, 4096, ctx->max_read); fc->destroy = ctx->destroy; fc->no_control = ctx->no_control; fc->no_force_umount = ctx->no_force_umount; - fc->no_mount_options = ctx->no_mount_options; err = -ENOMEM; root = fuse_get_root_inode(sb, ctx->rootmode); @@ -1191,7 +1590,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) mutex_lock(&fuse_mutex); err = -EINVAL; - if (*ctx->fudptr) + if (ctx->fudptr && *ctx->fudptr) goto err_unlock; err = fuse_ctl_add_conn(fc); @@ -1200,7 +1599,8 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) list_add_tail(&fc->entry, &fuse_conn_list); sb->s_root = root_dentry; - *ctx->fudptr = fud; + if (ctx->fudptr) + *ctx->fudptr = fud; mutex_unlock(&fuse_mutex); return 0; @@ -1208,7 +1608,11 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) mutex_unlock(&fuse_mutex); dput(root_dentry); err_dev_free: - fuse_dev_free(fud); + if (fud) + fuse_dev_free(fud); + err_free_dax: + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_conn_free(fc); err: return err; } @@ -1217,80 +1621,117 @@ EXPORT_SYMBOL_GPL(fuse_fill_super_common); static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) { struct fuse_fs_context *ctx = fsc->fs_private; - struct file *file; int err; - struct fuse_conn *fc; - err = -EINVAL; - file = fget(ctx->fd); - if (!file) - goto err; + if (!ctx->file || !ctx->rootmode_present || + !ctx->user_id_present || !ctx->group_id_present) + return -EINVAL; /* * Require mount to happen from the same user namespace which * opened /dev/fuse to prevent potential attacks. 
*/ - if ((file->f_op != &fuse_dev_operations) || - (file->f_cred->user_ns != sb->s_user_ns)) - goto err_fput; - ctx->fudptr = &file->private_data; - - fc = kmalloc(sizeof(*fc), GFP_KERNEL); - err = -ENOMEM; - if (!fc) - goto err_fput; - - fuse_conn_init(fc, sb->s_user_ns, &fuse_dev_fiq_ops, NULL); - fc->release = fuse_free_conn; - sb->s_fs_info = fc; + if ((ctx->file->f_op != &fuse_dev_operations) || + (ctx->file->f_cred->user_ns != sb->s_user_ns)) + return -EINVAL; + ctx->fudptr = &ctx->file->private_data; err = fuse_fill_super_common(sb, ctx); if (err) - goto err_put_conn; - /* - * atomic_dec_and_test() in fput() provides the necessary - * memory barrier for file->private_data to be visible on all - * CPUs after this - */ - fput(file); - fuse_send_init(get_fuse_conn_super(sb)); + return err; + /* file->private_data shall be visible on all CPUs after this */ + smp_mb(); + fuse_send_init(get_fuse_mount_super(sb)); return 0; +} - err_put_conn: - fuse_conn_put(fc); - sb->s_fs_info = NULL; - err_fput: - fput(file); - err: - return err; +/* + * This is the path where user supplied an already initialized fuse dev. In + * this case never create a new super if the old one is gone. + */ +static int fuse_set_no_super(struct super_block *sb, struct fs_context *fsc) +{ + return -ENOTCONN; } -static int fuse_get_tree(struct fs_context *fc) +static int fuse_test_super(struct super_block *sb, struct fs_context *fsc) { - struct fuse_fs_context *ctx = fc->fs_private; - if (!ctx->fd_present || !ctx->rootmode_present || - !ctx->user_id_present || !ctx->group_id_present) - return -EINVAL; + return fsc->sget_key == get_fuse_conn_super(sb); +} -#ifdef CONFIG_BLOCK - if (ctx->is_bdev) - return get_tree_bdev(fc, fuse_fill_super); -#endif +static int fuse_get_tree(struct fs_context *fsc) +{ + struct fuse_fs_context *ctx = fsc->fs_private; + struct fuse_dev *fud; + struct fuse_conn *fc; + struct fuse_mount *fm; + struct super_block *sb; + int err; - return get_tree_nodev(fc, fuse_fill_super); + fc = kmalloc(sizeof(*fc), GFP_KERNEL); + if (!fc) + return -ENOMEM; + + fm = kzalloc(sizeof(*fm), GFP_KERNEL); + if (!fm) { + kfree(fc); + return -ENOMEM; + } + + fuse_conn_init(fc, fm, fsc->user_ns, &fuse_dev_fiq_ops, NULL); + fc->release = fuse_free_conn; + + fsc->s_fs_info = fm; + + if (ctx->fd_present) + ctx->file = fget(ctx->fd); + + if (IS_ENABLED(CONFIG_BLOCK) && ctx->is_bdev) { + err = get_tree_bdev(fsc, fuse_fill_super); + goto out; + } + /* + * While block dev mount can be initialized with a dummy device fd + * (found by device name), normal fuse mounts can't + */ + err = -EINVAL; + if (!ctx->file) + goto out; + + /* + * Allow creating a fuse mount with an already initialized fuse + * connection + */ + fud = READ_ONCE(ctx->file->private_data); + if (ctx->file->f_op == &fuse_dev_operations && fud) { + fsc->sget_key = fud->fc; + sb = sget_fc(fsc, fuse_test_super, fuse_set_no_super); + err = PTR_ERR_OR_ZERO(sb); + if (!IS_ERR(sb)) + fsc->root = dget(sb->s_root); + } else { + err = get_tree_nodev(fsc, fuse_fill_super); + } +out: + if (fsc->s_fs_info) + fuse_mount_destroy(fm); + if (ctx->file) + fput(ctx->file); + return err; } static const struct fs_context_operations fuse_context_ops = { - .free = fuse_free_fc, + .free = fuse_free_fsc, .parse_param = fuse_parse_param, + .reconfigure = fuse_reconfigure, .get_tree = fuse_get_tree, }; /* * Set up the filesystem mount context. 
*/ -static int fuse_init_fs_context(struct fs_context *fc) +static int fuse_init_fs_context(struct fs_context *fsc) { struct fuse_fs_context *ctx; @@ -1300,42 +1741,79 @@ static int fuse_init_fs_context(struct fs_context *fc) ctx->max_read = ~0; ctx->blksize = FUSE_DEFAULT_BLKSIZE; + ctx->legacy_opts_show = true; #ifdef CONFIG_BLOCK - if (fc->fs_type == &fuseblk_fs_type) { + if (fsc->fs_type == &fuseblk_fs_type) { ctx->is_bdev = true; ctx->destroy = true; } #endif - fc->fs_private = ctx; - fc->ops = &fuse_context_ops; + fsc->fs_private = ctx; + fsc->ops = &fuse_context_ops; return 0; } -static void fuse_sb_destroy(struct super_block *sb) +bool fuse_mount_remove(struct fuse_mount *fm) { - struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_conn *fc = fm->fc; + bool last = false; - if (fc) { - if (fc->destroy) - fuse_send_destroy(fc); + down_write(&fc->killsb); + list_del_init(&fm->fc_entry); + if (list_empty(&fc->mounts)) + last = true; + up_write(&fc->killsb); - fuse_abort_conn(fc); - fuse_wait_aborted(fc); + return last; +} +EXPORT_SYMBOL_GPL(fuse_mount_remove); - down_write(&fc->killsb); - fc->sb = NULL; - up_write(&fc->killsb); +void fuse_conn_destroy(struct fuse_mount *fm) +{ + struct fuse_conn *fc = fm->fc; + + if (fc->destroy) + fuse_send_destroy(fm); + + fuse_abort_conn(fc); + fuse_wait_aborted(fc); + + if (!list_empty(&fc->entry)) { + mutex_lock(&fuse_mutex); + list_del(&fc->entry); + fuse_ctl_remove_conn(fc); + mutex_unlock(&fuse_mutex); } } +EXPORT_SYMBOL_GPL(fuse_conn_destroy); + +static void fuse_sb_destroy(struct super_block *sb) +{ + struct fuse_mount *fm = get_fuse_mount_super(sb); + bool last; + + if (sb->s_root) { + last = fuse_mount_remove(fm); + if (last) + fuse_conn_destroy(fm); + } +} + +void fuse_mount_destroy(struct fuse_mount *fm) +{ + fuse_conn_put(fm->fc); + kfree(fm); +} +EXPORT_SYMBOL(fuse_mount_destroy); -void fuse_kill_sb_anon(struct super_block *sb) +static void fuse_kill_sb_anon(struct super_block *sb) { fuse_sb_destroy(sb); kill_anon_super(sb); + fuse_mount_destroy(get_fuse_mount_super(sb)); } -EXPORT_SYMBOL_GPL(fuse_kill_sb_anon); static struct file_system_type fuse_fs_type = { .owner = THIS_MODULE, @@ -1352,6 +1830,7 @@ static void fuse_kill_sb_blk(struct super_block *sb) { fuse_sb_destroy(sb); kill_block_super(sb); + fuse_mount_destroy(get_fuse_mount_super(sb)); } static struct file_system_type fuseblk_fs_type = { diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c new file mode 100644 index 000000000000..61d8afcb10a3 --- /dev/null +++ b/fs/fuse/ioctl.c @@ -0,0 +1,504 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017 Red Hat, Inc. + */ + +#include "fuse_i.h" + +#include <linux/uio.h> +#include <linux/compat.h> +#include <linux/fileattr.h> + +static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args) +{ + ssize_t ret = fuse_simple_request(fm, args); + + /* Translate ENOSYS, which shouldn't be returned from fs */ + if (ret == -ENOSYS) + ret = -ENOTTY; + + return ret; +} + +/* + * CUSE servers compiled on 32bit broke on 64bit kernels because the + * ABI was defined to be 'struct iovec' which is different on 32bit + * and 64bit. Fortunately we can determine which structure the server + * used from the size of the reply. 
+ */ +static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src, + size_t transferred, unsigned count, + bool is_compat) +{ +#ifdef CONFIG_COMPAT + if (count * sizeof(struct compat_iovec) == transferred) { + struct compat_iovec *ciov = src; + unsigned i; + + /* + * With this interface a 32bit server cannot support + * non-compat (i.e. ones coming from 64bit apps) ioctl + * requests + */ + if (!is_compat) + return -EINVAL; + + for (i = 0; i < count; i++) { + dst[i].iov_base = compat_ptr(ciov[i].iov_base); + dst[i].iov_len = ciov[i].iov_len; + } + return 0; + } +#endif + + if (count * sizeof(struct iovec) != transferred) + return -EIO; + + memcpy(dst, src, transferred); + return 0; +} + +/* Make sure iov_length() won't overflow */ +static int fuse_verify_ioctl_iov(struct fuse_conn *fc, struct iovec *iov, + size_t count) +{ + size_t n; + u32 max = fc->max_pages << PAGE_SHIFT; + + for (n = 0; n < count; n++, iov++) { + if (iov->iov_len > (size_t) max) + return -ENOMEM; + max -= iov->iov_len; + } + return 0; +} + +static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst, + void *src, size_t transferred, unsigned count, + bool is_compat) +{ + unsigned i; + struct fuse_ioctl_iovec *fiov = src; + + if (fc->minor < 16) { + return fuse_copy_ioctl_iovec_old(dst, src, transferred, + count, is_compat); + } + + if (count * sizeof(struct fuse_ioctl_iovec) != transferred) + return -EIO; + + for (i = 0; i < count; i++) { + /* Did the server supply an inappropriate value? */ + if (fiov[i].base != (unsigned long) fiov[i].base || + fiov[i].len != (unsigned long) fiov[i].len) + return -EIO; + + dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base; + dst[i].iov_len = (size_t) fiov[i].len; + +#ifdef CONFIG_COMPAT + if (is_compat && + (ptr_to_compat(dst[i].iov_base) != fiov[i].base || + (compat_size_t) dst[i].iov_len != fiov[i].len)) + return -EIO; +#endif + } + + return 0; +} + + +/* + * For ioctls, there is no generic way to determine how much memory + * needs to be read and/or written. Furthermore, ioctls are allowed + * to dereference the passed pointer, so the parameter requires deep + * copying but FUSE has no idea whatsoever about what to copy in or + * out. + * + * This is solved by allowing FUSE server to retry ioctl with + * necessary in/out iovecs. Let's assume the ioctl implementation + * needs to read in the following structure. + * + * struct a { + * char *buf; + * size_t buflen; + * } + * + * On the first callout to FUSE server, inarg->in_size and + * inarg->out_size will be NULL; then, the server completes the ioctl + * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and + * the actual iov array to + * + * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } } + * + * which tells FUSE to copy in the requested area and retry the ioctl. + * On the second round, the server has access to the structure and + * from that it can tell what to look for next, so on the invocation, + * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to + * + * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) }, + * { .iov_base = a.buf, .iov_len = a.buflen } } + * + * FUSE will copy both struct a and the pointed buffer from the + * process doing the ioctl and retry ioctl with both struct a and the + * buffer. + * + * This time, FUSE server has everything it needs and completes ioctl + * without FUSE_IOCTL_RETRY which finishes the ioctl call. + * + * Copying data out works the same way. 
+ * + * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel + * automatically initializes in and out iovs by decoding @cmd with + * _IOC_* macros and the server is not allowed to request RETRY. This + * limits ioctl data transfers to well-formed ioctls and is the forced + * behavior for all FUSE servers. + */ +long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, + unsigned int flags) +{ + struct fuse_file *ff = file->private_data; + struct fuse_mount *fm = ff->fm; + struct fuse_ioctl_in inarg = { + .fh = ff->fh, + .cmd = cmd, + .arg = arg, + .flags = flags + }; + struct fuse_ioctl_out outarg; + struct iovec *iov_page = NULL; + struct iovec *in_iov = NULL, *out_iov = NULL; + unsigned int in_iovs = 0, out_iovs = 0, max_pages; + size_t in_size, out_size, c; + ssize_t transferred; + int err, i; + struct iov_iter ii; + struct fuse_args_pages ap = {}; + +#if BITS_PER_LONG == 32 + inarg.flags |= FUSE_IOCTL_32BIT; +#else + if (flags & FUSE_IOCTL_COMPAT) { + inarg.flags |= FUSE_IOCTL_32BIT; +#ifdef CONFIG_X86_X32_ABI + if (in_x32_syscall()) + inarg.flags |= FUSE_IOCTL_COMPAT_X32; +#endif + } +#endif + + /* assume all the iovs returned by client always fits in a page */ + BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); + + err = -ENOMEM; + ap.pages = fuse_pages_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.descs); + iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); + if (!ap.pages || !iov_page) + goto out; + + fuse_page_descs_length_init(ap.descs, 0, fm->fc->max_pages); + + /* + * If restricted, initialize IO parameters as encoded in @cmd. + * RETRY from server is not allowed. + */ + if (!(flags & FUSE_IOCTL_UNRESTRICTED)) { + struct iovec *iov = iov_page; + + iov->iov_base = (void __user *)arg; + iov->iov_len = _IOC_SIZE(cmd); + + if (_IOC_DIR(cmd) & _IOC_WRITE) { + in_iov = iov; + in_iovs = 1; + } + + if (_IOC_DIR(cmd) & _IOC_READ) { + out_iov = iov; + out_iovs = 1; + } + } + + retry: + inarg.in_size = in_size = iov_length(in_iov, in_iovs); + inarg.out_size = out_size = iov_length(out_iov, out_iovs); + + /* + * Out data can be used either for actual out data or iovs, + * make sure there always is at least one page. 
+ */ + out_size = max_t(size_t, out_size, PAGE_SIZE); + max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE); + + /* make sure there are enough buffer pages and init request with them */ + err = -ENOMEM; + if (max_pages > fm->fc->max_pages) + goto out; + while (ap.num_pages < max_pages) { + ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (!ap.pages[ap.num_pages]) + goto out; + ap.num_pages++; + } + + + /* okay, let's send it to the client */ + ap.args.opcode = FUSE_IOCTL; + ap.args.nodeid = ff->nodeid; + ap.args.in_numargs = 1; + ap.args.in_args[0].size = sizeof(inarg); + ap.args.in_args[0].value = &inarg; + if (in_size) { + ap.args.in_numargs++; + ap.args.in_args[1].size = in_size; + ap.args.in_pages = true; + + err = -EFAULT; + iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { + c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii); + if (c != PAGE_SIZE && iov_iter_count(&ii)) + goto out; + } + } + + ap.args.out_numargs = 2; + ap.args.out_args[0].size = sizeof(outarg); + ap.args.out_args[0].value = &outarg; + ap.args.out_args[1].size = out_size; + ap.args.out_pages = true; + ap.args.out_argvar = true; + + transferred = fuse_send_ioctl(fm, &ap.args); + err = transferred; + if (transferred < 0) + goto out; + + /* did it ask for retry? */ + if (outarg.flags & FUSE_IOCTL_RETRY) { + void *vaddr; + + /* no retry if in restricted mode */ + err = -EIO; + if (!(flags & FUSE_IOCTL_UNRESTRICTED)) + goto out; + + in_iovs = outarg.in_iovs; + out_iovs = outarg.out_iovs; + + /* + * Make sure things are in boundary, separate checks + * are to protect against overflow. + */ + err = -ENOMEM; + if (in_iovs > FUSE_IOCTL_MAX_IOV || + out_iovs > FUSE_IOCTL_MAX_IOV || + in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) + goto out; + + vaddr = kmap_local_page(ap.pages[0]); + err = fuse_copy_ioctl_iovec(fm->fc, iov_page, vaddr, + transferred, in_iovs + out_iovs, + (flags & FUSE_IOCTL_COMPAT) != 0); + kunmap_local(vaddr); + if (err) + goto out; + + in_iov = iov_page; + out_iov = in_iov + in_iovs; + + err = fuse_verify_ioctl_iov(fm->fc, in_iov, in_iovs); + if (err) + goto out; + + err = fuse_verify_ioctl_iov(fm->fc, out_iov, out_iovs); + if (err) + goto out; + + goto retry; + } + + err = -EIO; + if (transferred > inarg.out_size) + goto out; + + err = -EFAULT; + iov_iter_init(&ii, READ, out_iov, out_iovs, transferred); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { + c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii); + if (c != PAGE_SIZE && iov_iter_count(&ii)) + goto out; + } + err = 0; + out: + free_page((unsigned long) iov_page); + while (ap.num_pages) + __free_page(ap.pages[--ap.num_pages]); + kfree(ap.pages); + + return err ? 
err : outarg.result; +} +EXPORT_SYMBOL_GPL(fuse_do_ioctl); + +long fuse_ioctl_common(struct file *file, unsigned int cmd, + unsigned long arg, unsigned int flags) +{ + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + + if (!fuse_allow_current_process(fc)) + return -EACCES; + + if (fuse_is_bad(inode)) + return -EIO; + + return fuse_do_ioctl(file, cmd, arg, flags); +} + +long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + return fuse_ioctl_common(file, cmd, arg, 0); +} + +long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT); +} + +static int fuse_priv_ioctl(struct inode *inode, struct fuse_file *ff, + unsigned int cmd, void *ptr, size_t size) +{ + struct fuse_mount *fm = ff->fm; + struct fuse_ioctl_in inarg; + struct fuse_ioctl_out outarg; + FUSE_ARGS(args); + int err; + + memset(&inarg, 0, sizeof(inarg)); + inarg.fh = ff->fh; + inarg.cmd = cmd; + +#if BITS_PER_LONG == 32 + inarg.flags |= FUSE_IOCTL_32BIT; +#endif + if (S_ISDIR(inode->i_mode)) + inarg.flags |= FUSE_IOCTL_DIR; + + if (_IOC_DIR(cmd) & _IOC_READ) + inarg.out_size = size; + if (_IOC_DIR(cmd) & _IOC_WRITE) + inarg.in_size = size; + + args.opcode = FUSE_IOCTL; + args.nodeid = ff->nodeid; + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = inarg.in_size; + args.in_args[1].value = ptr; + args.out_numargs = 2; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + args.out_args[1].size = inarg.out_size; + args.out_args[1].value = ptr; + + err = fuse_send_ioctl(fm, &args); + if (!err) { + if (outarg.result < 0) + err = outarg.result; + else if (outarg.flags & FUSE_IOCTL_RETRY) + err = -EIO; + } + return err; +} + +static struct fuse_file *fuse_priv_ioctl_prepare(struct inode *inode) +{ + struct fuse_mount *fm = get_fuse_mount(inode); + bool isdir = S_ISDIR(inode->i_mode); + + if (!S_ISREG(inode->i_mode) && !isdir) + return ERR_PTR(-ENOTTY); + + return fuse_file_open(fm, get_node_id(inode), O_RDONLY, isdir); +} + +static void fuse_priv_ioctl_cleanup(struct inode *inode, struct fuse_file *ff) +{ + fuse_file_release(inode, ff, O_RDONLY, NULL, S_ISDIR(inode->i_mode)); +} + +int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct fuse_file *ff; + unsigned int flags; + struct fsxattr xfa; + int err; + + ff = fuse_priv_ioctl_prepare(inode); + if (IS_ERR(ff)) + return PTR_ERR(ff); + + if (fa->flags_valid) { + err = fuse_priv_ioctl(inode, ff, FS_IOC_GETFLAGS, + &flags, sizeof(flags)); + if (err) + goto cleanup; + + fileattr_fill_flags(fa, flags); + } else { + err = fuse_priv_ioctl(inode, ff, FS_IOC_FSGETXATTR, + &xfa, sizeof(xfa)); + if (err) + goto cleanup; + + fileattr_fill_xflags(fa, xfa.fsx_xflags); + fa->fsx_extsize = xfa.fsx_extsize; + fa->fsx_nextents = xfa.fsx_nextents; + fa->fsx_projid = xfa.fsx_projid; + fa->fsx_cowextsize = xfa.fsx_cowextsize; + } +cleanup: + fuse_priv_ioctl_cleanup(inode, ff); + + return err; +} + +int fuse_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct fuse_file *ff; + unsigned int flags = fa->flags; + struct fsxattr xfa; + int err; + + ff = fuse_priv_ioctl_prepare(inode); + if (IS_ERR(ff)) + return PTR_ERR(ff); + + if (fa->flags_valid) { + err = fuse_priv_ioctl(inode, ff, FS_IOC_SETFLAGS, + 
&flags, sizeof(flags)); + if (err) + goto cleanup; + } else { + memset(&xfa, 0, sizeof(xfa)); + xfa.fsx_xflags = fa->fsx_xflags; + xfa.fsx_extsize = fa->fsx_extsize; + xfa.fsx_nextents = fa->fsx_nextents; + xfa.fsx_projid = fa->fsx_projid; + xfa.fsx_cowextsize = fa->fsx_cowextsize; + + err = fuse_priv_ioctl(inode, ff, FS_IOC_FSSETXATTR, + &xfa, sizeof(xfa)); + } + +cleanup: + fuse_priv_ioctl_cleanup(inode, ff); + + return err; +} diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 90e3f01bd796..e8deaacf1832 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -76,11 +76,13 @@ static void fuse_add_dirent_to_cache(struct file *file, WARN_ON(fi->rdc.pos != pos)) goto unlock; - addr = kmap_atomic(page); - if (!offset) + addr = kmap_local_page(page); + if (!offset) { clear_page(addr); + SetPageUptodate(page); + } memcpy(addr + offset, dirent, reclen); - kunmap_atomic(addr); + kunmap_local(addr); fi->rdc.size = (index << PAGE_SHIFT) + offset + reclen; fi->rdc.pos = dirent->off; unlock: @@ -200,14 +202,17 @@ retry: if (!d_in_lookup(dentry)) { struct fuse_inode *fi; inode = d_inode(dentry); + if (inode && get_node_id(inode) != o->nodeid) + inode = NULL; if (!inode || - get_node_id(inode) != o->nodeid || - ((o->attr.mode ^ inode->i_mode) & S_IFMT)) { + fuse_stale_inode(inode, o->generation, &o->attr)) { + if (inode) + fuse_make_bad(inode); d_invalidate(dentry); dput(dentry); goto retry; } - if (is_bad_inode(inode)) { + if (fuse_is_bad(inode)) { dput(dentry); return -EIO; } @@ -252,7 +257,7 @@ retry: static void fuse_force_forget(struct file *file, u64 nodeid) { struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_forget_in inarg; FUSE_ARGS(args); @@ -266,7 +271,7 @@ static void fuse_force_forget(struct file *file, u64 nodeid) args.force = true; args.noreply = true; - fuse_simple_request(fc, &args); + fuse_simple_request(fm, &args); /* ignore errors */ } @@ -320,7 +325,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) ssize_t res; struct page *page; struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_io_args ia = {}; struct fuse_args_pages *ap = &ia.ap; struct fuse_page_desc desc = { .length = PAGE_SIZE }; @@ -337,7 +342,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) ap->pages = &page; ap->descs = &desc; if (plus) { - attr_version = fuse_get_attr_version(fc); + attr_version = fuse_get_attr_version(fm->fc); fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, FUSE_READDIRPLUS); } else { @@ -345,7 +350,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) FUSE_READDIR); } locked = fuse_lock_inode(inode); - res = fuse_simple_request(fc, &ap->args); + res = fuse_simple_request(fm, &ap->args); fuse_unlock_inode(inode, locked); if (res >= 0) { if (!res) { @@ -451,7 +456,7 @@ static int fuse_readdir_cached(struct file *file, struct dir_context *ctx) * cache; both cases require an up-to-date mtime value. */ if (!ctx->pos && fc->auto_inval_data) { - int err = fuse_update_attributes(inode, file); + int err = fuse_update_attributes(inode, file, STATX_MTIME); if (err) return err; @@ -513,6 +518,12 @@ retry_locked: page = find_get_page_flags(file->f_mapping, index, FGP_ACCESSED | FGP_LOCK); + /* Page gone missing, then re-added to cache, but not initialized? 
*/ + if (page && !PageUptodate(page)) { + unlock_page(page); + put_page(page); + page = NULL; + } spin_lock(&fi->rdc.lock); if (!page) { /* @@ -568,7 +579,7 @@ int fuse_readdir(struct file *file, struct dir_context *ctx) struct inode *inode = file_inode(file); int err; - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) return -EIO; mutex_lock(&ff->readdir.lock); diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index bade74768903..4d8d4f16c727 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -5,14 +5,26 @@ */ #include <linux/fs.h> +#include <linux/dax.h> +#include <linux/pci.h> +#include <linux/pfn_t.h> +#include <linux/memremap.h> #include <linux/module.h> #include <linux/virtio.h> #include <linux/virtio_fs.h> #include <linux/delay.h> #include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/highmem.h> +#include <linux/uio.h> #include "fuse_i.h" +/* Used to help calculate the FUSE connection's max_pages limit for a request's + * size. Parts of the struct fuse_req are sliced into scattergather lists in + * addition to the pages used, so this can help account for that overhead. + */ +#define FUSE_HEADER_OVERHEAD 4 + /* List of virtio-fs device instances and a lock for the list. Also provides * mutual exclusion in device removal and mounting path */ @@ -24,6 +36,8 @@ enum { VQ_REQUEST }; +#define VQ_NAME_LEN 24 + /* Per-virtqueue state */ struct virtio_fs_vq { spinlock_t lock; @@ -36,7 +50,7 @@ struct virtio_fs_vq { bool connected; long in_flight; struct completion in_flight_zero; /* No inflight requests */ - char name[24]; + char name[VQ_NAME_LEN]; } ____cacheline_aligned_in_smp; /* A virtio-fs device instance */ @@ -47,6 +61,12 @@ struct virtio_fs { struct virtio_fs_vq *vqs; unsigned int nvqs; /* number of virtqueues */ unsigned int num_request_queues; /* number of request queues */ + struct dax_device *dax_dev; + + /* DAX memory window where file contents are mapped */ + void *window_kaddr; + phys_addr_t window_phys_addr; + size_t window_len; }; struct virtio_fs_forget_req { @@ -60,19 +80,70 @@ struct virtio_fs_forget { struct virtio_fs_forget_req req; }; +struct virtio_fs_req_work { + struct fuse_req *req; + struct virtio_fs_vq *fsvq; + struct work_struct done_work; +}; + static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, struct fuse_req *req, bool in_flight); -static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq) +static const struct constant_table dax_param_enums[] = { + {"always", FUSE_DAX_ALWAYS }, + {"never", FUSE_DAX_NEVER }, + {"inode", FUSE_DAX_INODE_USER }, + {} +}; + +enum { + OPT_DAX, + OPT_DAX_ENUM, +}; + +static const struct fs_parameter_spec virtio_fs_parameters[] = { + fsparam_flag("dax", OPT_DAX), + fsparam_enum("dax", OPT_DAX_ENUM, dax_param_enums), + {} +}; + +static int virtio_fs_parse_param(struct fs_context *fsc, + struct fs_parameter *param) { - struct virtio_fs *fs = vq->vdev->priv; + struct fs_parse_result result; + struct fuse_fs_context *ctx = fsc->fs_private; + int opt; + + opt = fs_parse(fsc, virtio_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case OPT_DAX: + ctx->dax_mode = FUSE_DAX_ALWAYS; + break; + case OPT_DAX_ENUM: + ctx->dax_mode = result.uint_32; + break; + default: + return -EINVAL; + } - return &fs->vqs[vq->index]; + return 0; } -static inline struct fuse_pqueue *vq_to_fpq(struct virtqueue *vq) +static void virtio_fs_free_fsc(struct fs_context *fsc) { - return &vq_to_fsvq(vq)->fud->pq; + struct fuse_fs_context *ctx = fsc->fs_private; + + kfree(ctx); +} + 
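As a rough, self-contained userspace sketch (not part of the patch; DAX_UNSET is a placeholder name and the numeric enum values are illustrative), the "dax" option handling added by dax_param_enums and virtio_fs_parse_param maps mount options to a fuse_dax_mode like this: a bare "dax" is treated as "dax=always", while "dax=never" and "dax=inode" select the other two modes.

#include <stdio.h>
#include <string.h>

enum fuse_dax_mode { DAX_UNSET, FUSE_DAX_ALWAYS, FUSE_DAX_NEVER, FUSE_DAX_INODE_USER };

static enum fuse_dax_mode parse_dax_option(const char *opt)
{
	if (!strcmp(opt, "dax") || !strcmp(opt, "dax=always"))
		return FUSE_DAX_ALWAYS;		/* OPT_DAX and OPT_DAX_ENUM("always") */
	if (!strcmp(opt, "dax=never"))
		return FUSE_DAX_NEVER;
	if (!strcmp(opt, "dax=inode"))
		return FUSE_DAX_INODE_USER;	/* per-inode policy, decided by the server */
	return DAX_UNSET;			/* option absent: keep the default behaviour */
}

int main(void)
{
	const char *opts[] = { "dax", "dax=always", "dax=never", "dax=inode", "noatime" };
	size_t i;

	for (i = 0; i < sizeof(opts) / sizeof(opts[0]); i++)
		printf("%-12s -> mode %d\n", opts[i], parse_dax_option(opts[i]));
	return 0;
}

The chosen mode is what fuse_show_options() reports back as ",dax=always", ",dax=never" or ",dax=inode" in the inode.c hunk earlier in this patch.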
+static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq) +{ + struct virtio_fs *fs = vq->vdev->priv; + + return &fs->vqs[vq->index]; } /* Should be called with fsvq->lock held. */ @@ -283,7 +354,6 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work) struct fuse_req *req; struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, dispatch_work.work); - struct fuse_conn *fc = fsvq->fud->fc; int ret; pr_debug("virtio-fs: worker %s called.\n", __func__); @@ -298,7 +368,7 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work) list_del_init(&req->list); spin_unlock(&fsvq->lock); - fuse_request_end(fc, req); + fuse_request_end(req); } /* Dispatch pending requests */ @@ -329,7 +399,7 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work) spin_unlock(&fsvq->lock); pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n", ret); - fuse_request_end(fc, req); + fuse_request_end(req); } } } @@ -485,19 +555,66 @@ static void copy_args_from_argbuf(struct fuse_args *args, struct fuse_req *req) } /* Work function for request completion */ +static void virtio_fs_request_complete(struct fuse_req *req, + struct virtio_fs_vq *fsvq) +{ + struct fuse_pqueue *fpq = &fsvq->fud->pq; + struct fuse_args *args; + struct fuse_args_pages *ap; + unsigned int len, i, thislen; + struct page *page; + + /* + * TODO verify that server properly follows FUSE protocol + * (oh.uniq, oh.len) + */ + args = req->args; + copy_args_from_argbuf(args, req); + + if (args->out_pages && args->page_zeroing) { + len = args->out_args[args->out_numargs - 1].size; + ap = container_of(args, typeof(*ap), args); + for (i = 0; i < ap->num_pages; i++) { + thislen = ap->descs[i].length; + if (len < thislen) { + WARN_ON(ap->descs[i].offset); + page = ap->pages[i]; + zero_user_segment(page, len, thislen); + len = 0; + } else { + len -= thislen; + } + } + } + + spin_lock(&fpq->lock); + clear_bit(FR_SENT, &req->flags); + spin_unlock(&fpq->lock); + + fuse_request_end(req); + spin_lock(&fsvq->lock); + dec_in_flight_req(fsvq); + spin_unlock(&fsvq->lock); +} + +static void virtio_fs_complete_req_work(struct work_struct *work) +{ + struct virtio_fs_req_work *w = + container_of(work, typeof(*w), done_work); + + virtio_fs_request_complete(w->req, w->fsvq); + kfree(w); +} + static void virtio_fs_requests_done_work(struct work_struct *work) { struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, done_work); struct fuse_pqueue *fpq = &fsvq->fud->pq; - struct fuse_conn *fc = fsvq->fud->fc; struct virtqueue *vq = fsvq->vq; struct fuse_req *req; - struct fuse_args_pages *ap; struct fuse_req *next; - struct fuse_args *args; - unsigned int len, i, thislen; - struct page *page; + unsigned int len; LIST_HEAD(reqs); /* Collect completed requests off the virtqueue */ @@ -515,38 +632,20 @@ static void virtio_fs_requests_done_work(struct work_struct *work) /* End requests */ list_for_each_entry_safe(req, next, &reqs, list) { - /* - * TODO verify that server properly follows FUSE protocol - * (oh.uniq, oh.len) - */ - args = req->args; - copy_args_from_argbuf(args, req); - - if (args->out_pages && args->page_zeroing) { - len = args->out_args[args->out_numargs - 1].size; - ap = container_of(args, typeof(*ap), args); - for (i = 0; i < ap->num_pages; i++) { - thislen = ap->descs[i].length; - if (len < thislen) { - WARN_ON(ap->descs[i].offset); - page = ap->pages[i]; - zero_user_segment(page, len, thislen); - len = 0; - } else { - len -= thislen; - } - } - } - - spin_lock(&fpq->lock); - 
clear_bit(FR_SENT, &req->flags); list_del_init(&req->list); - spin_unlock(&fpq->lock); - fuse_request_end(fc, req); - spin_lock(&fsvq->lock); - dec_in_flight_req(fsvq); - spin_unlock(&fsvq->lock); + /* blocking async request completes in a worker context */ + if (req->args->may_block) { + struct virtio_fs_req_work *w; + + w = kzalloc(sizeof(*w), GFP_NOFS | __GFP_NOFAIL); + INIT_WORK(&w->done_work, virtio_fs_complete_req_work); + w->fsvq = fsvq; + w->req = req; + schedule_work(&w->done_work); + } else { + virtio_fs_request_complete(req, fsvq); + } } } @@ -560,6 +659,26 @@ static void virtio_fs_vq_done(struct virtqueue *vq) schedule_work(&fsvq->done_work); } +static void virtio_fs_init_vq(struct virtio_fs_vq *fsvq, char *name, + int vq_type) +{ + strscpy(fsvq->name, name, VQ_NAME_LEN); + spin_lock_init(&fsvq->lock); + INIT_LIST_HEAD(&fsvq->queued_reqs); + INIT_LIST_HEAD(&fsvq->end_reqs); + init_completion(&fsvq->in_flight_zero); + + if (vq_type == VQ_REQUEST) { + INIT_WORK(&fsvq->done_work, virtio_fs_requests_done_work); + INIT_DELAYED_WORK(&fsvq->dispatch_work, + virtio_fs_request_dispatch_work); + } else { + INIT_WORK(&fsvq->done_work, virtio_fs_hiprio_done_work); + INIT_DELAYED_WORK(&fsvq->dispatch_work, + virtio_fs_hiprio_dispatch_work); + } +} + /* Initialize virtqueues */ static int virtio_fs_setup_vqs(struct virtio_device *vdev, struct virtio_fs *fs) @@ -570,12 +689,12 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev, unsigned int i; int ret = 0; - virtio_cread(vdev, struct virtio_fs_config, num_request_queues, - &fs->num_request_queues); + virtio_cread_le(vdev, struct virtio_fs_config, num_request_queues, + &fs->num_request_queues); if (fs->num_request_queues == 0) return -EINVAL; - fs->nvqs = 1 + fs->num_request_queues; + fs->nvqs = VQ_REQUEST + fs->num_request_queues; fs->vqs = kcalloc(fs->nvqs, sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL); if (!fs->vqs) return -ENOMEM; @@ -589,29 +708,17 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev, goto out; } + /* Initialize the hiprio/forget request virtqueue */ callbacks[VQ_HIPRIO] = virtio_fs_vq_done; - snprintf(fs->vqs[VQ_HIPRIO].name, sizeof(fs->vqs[VQ_HIPRIO].name), - "hiprio"); + virtio_fs_init_vq(&fs->vqs[VQ_HIPRIO], "hiprio", VQ_HIPRIO); names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name; - INIT_WORK(&fs->vqs[VQ_HIPRIO].done_work, virtio_fs_hiprio_done_work); - INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].queued_reqs); - INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].end_reqs); - INIT_DELAYED_WORK(&fs->vqs[VQ_HIPRIO].dispatch_work, - virtio_fs_hiprio_dispatch_work); - init_completion(&fs->vqs[VQ_HIPRIO].in_flight_zero); - spin_lock_init(&fs->vqs[VQ_HIPRIO].lock); /* Initialize the requests virtqueues */ for (i = VQ_REQUEST; i < fs->nvqs; i++) { - spin_lock_init(&fs->vqs[i].lock); - INIT_WORK(&fs->vqs[i].done_work, virtio_fs_requests_done_work); - INIT_DELAYED_WORK(&fs->vqs[i].dispatch_work, - virtio_fs_request_dispatch_work); - INIT_LIST_HEAD(&fs->vqs[i].queued_reqs); - INIT_LIST_HEAD(&fs->vqs[i].end_reqs); - init_completion(&fs->vqs[i].in_flight_zero); - snprintf(fs->vqs[i].name, sizeof(fs->vqs[i].name), - "requests.%u", i - VQ_REQUEST); + char vq_name[VQ_NAME_LEN]; + + snprintf(vq_name, VQ_NAME_LEN, "requests.%u", i - VQ_REQUEST); + virtio_fs_init_vq(&fs->vqs[i], vq_name, VQ_REQUEST); callbacks[i] = virtio_fs_vq_done; names[i] = fs->vqs[i].name; } @@ -634,12 +741,121 @@ out: } /* Free virtqueues (device must already be reset) */ -static void virtio_fs_cleanup_vqs(struct virtio_device *vdev, - struct virtio_fs *fs) +static void 
virtio_fs_cleanup_vqs(struct virtio_device *vdev) { vdev->config->del_vqs(vdev); } +/* Map a window offset to a page frame number. The window offset will have + * been produced by .iomap_begin(), which maps a file offset to a window + * offset. + */ +static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, + long nr_pages, enum dax_access_mode mode, + void **kaddr, pfn_t *pfn) +{ + struct virtio_fs *fs = dax_get_private(dax_dev); + phys_addr_t offset = PFN_PHYS(pgoff); + size_t max_nr_pages = fs->window_len / PAGE_SIZE - pgoff; + + if (kaddr) + *kaddr = fs->window_kaddr + offset; + if (pfn) + *pfn = phys_to_pfn_t(fs->window_phys_addr + offset, + PFN_DEV | PFN_MAP); + return nr_pages > max_nr_pages ? max_nr_pages : nr_pages; +} + +static int virtio_fs_zero_page_range(struct dax_device *dax_dev, + pgoff_t pgoff, size_t nr_pages) +{ + long rc; + void *kaddr; + + rc = dax_direct_access(dax_dev, pgoff, nr_pages, DAX_ACCESS, &kaddr, + NULL); + if (rc < 0) + return rc; + memset(kaddr, 0, nr_pages << PAGE_SHIFT); + dax_flush(dax_dev, kaddr, nr_pages << PAGE_SHIFT); + return 0; +} + +static const struct dax_operations virtio_fs_dax_ops = { + .direct_access = virtio_fs_direct_access, + .zero_page_range = virtio_fs_zero_page_range, +}; + +static void virtio_fs_cleanup_dax(void *data) +{ + struct dax_device *dax_dev = data; + + kill_dax(dax_dev); + put_dax(dax_dev); +} + +static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) +{ + struct virtio_shm_region cache_reg; + struct dev_pagemap *pgmap; + bool have_cache; + + if (!IS_ENABLED(CONFIG_FUSE_DAX)) + return 0; + + /* Get cache region */ + have_cache = virtio_get_shm_region(vdev, &cache_reg, + (u8)VIRTIO_FS_SHMCAP_ID_CACHE); + if (!have_cache) { + dev_notice(&vdev->dev, "%s: No cache capability\n", __func__); + return 0; + } + + if (!devm_request_mem_region(&vdev->dev, cache_reg.addr, cache_reg.len, + dev_name(&vdev->dev))) { + dev_warn(&vdev->dev, "could not reserve region addr=0x%llx len=0x%llx\n", + cache_reg.addr, cache_reg.len); + return -EBUSY; + } + + dev_notice(&vdev->dev, "Cache len: 0x%llx @ 0x%llx\n", cache_reg.len, + cache_reg.addr); + + pgmap = devm_kzalloc(&vdev->dev, sizeof(*pgmap), GFP_KERNEL); + if (!pgmap) + return -ENOMEM; + + pgmap->type = MEMORY_DEVICE_FS_DAX; + + /* Ideally we would directly use the PCI BAR resource but + * devm_memremap_pages() wants its own copy in pgmap. So + * initialize a struct resource from scratch (only the start + * and end fields will be used). 
+ */ + pgmap->range = (struct range) { + .start = (phys_addr_t) cache_reg.addr, + .end = (phys_addr_t) cache_reg.addr + cache_reg.len - 1, + }; + pgmap->nr_range = 1; + + fs->window_kaddr = devm_memremap_pages(&vdev->dev, pgmap); + if (IS_ERR(fs->window_kaddr)) + return PTR_ERR(fs->window_kaddr); + + fs->window_phys_addr = (phys_addr_t) cache_reg.addr; + fs->window_len = (phys_addr_t) cache_reg.len; + + dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n", + __func__, fs->window_kaddr, cache_reg.addr, cache_reg.len); + + fs->dax_dev = alloc_dax(fs, &virtio_fs_dax_ops); + if (IS_ERR(fs->dax_dev)) + return PTR_ERR(fs->dax_dev); + + return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax, + fs->dax_dev); +} + static int virtio_fs_probe(struct virtio_device *vdev) { struct virtio_fs *fs; @@ -661,6 +877,10 @@ static int virtio_fs_probe(struct virtio_device *vdev) /* TODO vq affinity */ + ret = virtio_fs_setup_dax(vdev, fs); + if (ret < 0) + goto out_vqs; + /* Bring the device online in case the filesystem is mounted and * requests need to be sent before we return. */ @@ -673,8 +893,9 @@ static int virtio_fs_probe(struct virtio_device *vdev) return 0; out_vqs: - vdev->config->reset(vdev); - virtio_fs_cleanup_vqs(vdev, fs); + virtio_reset_device(vdev); + virtio_fs_cleanup_vqs(vdev); + kfree(fs->vqs); out: vdev->priv = NULL; @@ -704,8 +925,8 @@ static void virtio_fs_remove(struct virtio_device *vdev) list_del_init(&fs->list); virtio_fs_stop_all_queues(fs); virtio_fs_drain_all_queues_locked(fs); - vdev->config->reset(vdev); - virtio_fs_cleanup_vqs(vdev, fs); + virtio_reset_device(vdev); + virtio_fs_cleanup_vqs(vdev); vdev->priv = NULL; /* Put device reference on virtio_fs object */ @@ -797,18 +1018,37 @@ __releases(fiq->lock) spin_unlock(&fiq->lock); } +/* Count number of scatter-gather elements required */ +static unsigned int sg_count_fuse_pages(struct fuse_page_desc *page_descs, + unsigned int num_pages, + unsigned int total_len) +{ + unsigned int i; + unsigned int this_len; + + for (i = 0; i < num_pages && total_len; i++) { + this_len = min(page_descs[i].length, total_len); + total_len -= this_len; + } + + return i; +} + /* Return the number of scatter-gather list elements required */ static unsigned int sg_count_fuse_req(struct fuse_req *req) { struct fuse_args *args = req->args; struct fuse_args_pages *ap = container_of(args, typeof(*ap), args); - unsigned int total_sgs = 1 /* fuse_in_header */; + unsigned int size, total_sgs = 1 /* fuse_in_header */; if (args->in_numargs - args->in_pages) total_sgs += 1; - if (args->in_pages) - total_sgs += ap->num_pages; + if (args->in_pages) { + size = args->in_args[args->in_numargs - 1].size; + total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages, + size); + } if (!test_bit(FR_ISREPLY, &req->flags)) return total_sgs; @@ -818,8 +1058,11 @@ static unsigned int sg_count_fuse_req(struct fuse_req *req) if (args->out_numargs - args->out_pages) total_sgs += 1; - if (args->out_pages) - total_sgs += ap->num_pages; + if (args->out_pages) { + size = args->out_args[args->out_numargs - 1].size; + total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages, + size); + } return total_sgs; } @@ -1035,24 +1278,28 @@ static const struct fuse_iqueue_ops virtio_fs_fiq_ops = { .release = virtio_fs_fiq_release, }; -static int virtio_fs_fill_super(struct super_block *sb) +static inline void virtio_fs_ctx_set_defaults(struct fuse_fs_context *ctx) +{ + ctx->rootmode = S_IFDIR; + ctx->default_permissions = 1; + ctx->allow_other = 1; + 
ctx->max_read = UINT_MAX; + ctx->blksize = 512; + ctx->destroy = true; + ctx->no_control = true; + ctx->no_force_umount = true; +} + +static int virtio_fs_fill_super(struct super_block *sb, struct fs_context *fsc) { - struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_mount *fm = get_fuse_mount_super(sb); + struct fuse_conn *fc = fm->fc; struct virtio_fs *fs = fc->iq.priv; + struct fuse_fs_context *ctx = fsc->fs_private; unsigned int i; int err; - struct fuse_fs_context ctx = { - .rootmode = S_IFDIR, - .default_permissions = 1, - .allow_other = 1, - .max_read = UINT_MAX, - .blksize = 512, - .destroy = true, - .no_control = true, - .no_force_umount = true, - .no_mount_options = true, - }; + virtio_fs_ctx_set_defaults(ctx); mutex_lock(&virtio_fs_mutex); /* After holding mutex, make sure virtiofs device is still there. @@ -1067,7 +1314,7 @@ static int virtio_fs_fill_super(struct super_block *sb) err = -ENOMEM; /* Allocate fuse_dev for hiprio and notification queues */ - for (i = 0; i < VQ_REQUEST; i++) { + for (i = 0; i < fs->nvqs; i++) { struct virtio_fs_vq *fsvq = &fs->vqs[i]; fsvq->fud = fuse_dev_alloc(); @@ -1075,24 +1322,30 @@ static int virtio_fs_fill_super(struct super_block *sb) goto err_free_fuse_devs; } - ctx.fudptr = (void **)&fs->vqs[VQ_REQUEST].fud; - err = fuse_fill_super_common(sb, &ctx); + /* virtiofs allocates and installs its own fuse devices */ + ctx->fudptr = NULL; + if (ctx->dax_mode != FUSE_DAX_NEVER) { + if (ctx->dax_mode == FUSE_DAX_ALWAYS && !fs->dax_dev) { + err = -EINVAL; + pr_err("virtio-fs: dax can't be enabled as filesystem" + " device does not support it.\n"); + goto err_free_fuse_devs; + } + ctx->dax_dev = fs->dax_dev; + } + err = fuse_fill_super_common(sb, ctx); if (err < 0) goto err_free_fuse_devs; - fc = fs->vqs[VQ_REQUEST].fud->fc; - for (i = 0; i < fs->nvqs; i++) { struct virtio_fs_vq *fsvq = &fs->vqs[i]; - if (i == VQ_REQUEST) - continue; /* already initialized */ fuse_dev_install(fsvq->fud, fc); } /* Previous unmount will stop all queues. Start these again */ virtio_fs_start_all_queues(fs); - fuse_send_init(fc); + fuse_send_init(fm); mutex_unlock(&virtio_fs_mutex); return 0; @@ -1103,18 +1356,17 @@ err: return err; } -static void virtio_kill_sb(struct super_block *sb) +static void virtio_fs_conn_destroy(struct fuse_mount *fm) { - struct fuse_conn *fc = get_fuse_conn_super(sb); - struct virtio_fs *vfs; - struct virtio_fs_vq *fsvq; + struct fuse_conn *fc = fm->fc; + struct virtio_fs *vfs = fc->iq.priv; + struct virtio_fs_vq *fsvq = &vfs->vqs[VQ_HIPRIO]; - /* If mount failed, we can still be called without any fc */ - if (!fc) - return fuse_kill_sb_anon(sb); - - vfs = fc->iq.priv; - fsvq = &vfs->vqs[VQ_HIPRIO]; + /* Stop dax worker. Soon evict_inodes() will be called which + * will free all memory ranges belonging to all inodes. + */ + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_cancel_work(fc); /* Stop forget queue. Soon destroy will be sent */ spin_lock(&fsvq->lock); @@ -1122,9 +1374,9 @@ static void virtio_kill_sb(struct super_block *sb) spin_unlock(&fsvq->lock); virtio_fs_drain_all_queues(vfs); - fuse_kill_sb_anon(sb); + fuse_conn_destroy(fm); - /* fuse_kill_sb_anon() must have sent destroy. Stop all queues + /* fuse_conn_destroy() must have sent destroy. Stop all queues * and drain one more time and free fuse devices. Freeing fuse * devices will drop their reference on fuse_conn and that in * turn will drop its reference on virtio_fs object. 
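The scatter-gather accounting added above (sg_count_fuse_pages()/sg_count_fuse_req()) reserves only as many elements as the final argument's length actually needs, instead of one element per attached page. A standalone sketch of that counting, with simplified types (not kernel code):

#include <stdio.h>

struct page_desc { unsigned int length; };

static unsigned int count_sg_pages(const struct page_desc *descs,
				   unsigned int num_pages,
				   unsigned int total_len)
{
	unsigned int i, this_len;

	/* Same loop shape as sg_count_fuse_pages(): stop once the argument
	 * length is covered, even if more pages are attached. */
	for (i = 0; i < num_pages && total_len; i++) {
		this_len = descs[i].length < total_len ? descs[i].length : total_len;
		total_len -= this_len;
	}
	return i;
}

int main(void)
{
	/* Four 4 KiB pages attached, but the last argument is only 6000 bytes:
	 * two scatter-gather elements suffice. */
	struct page_desc descs[4] = { {4096}, {4096}, {4096}, {4096} };

	printf("sg elements: %u\n", count_sg_pages(descs, 4, 6000)); /* prints 2 */
	return 0;
}

This pairs with the cap in virtio_fs_get_tree() below, where max_pages_limit is reduced to virtqueue_size - FUSE_HEADER_OVERHEAD so that a worst-case request still fits into a single virtqueue.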
@@ -1134,32 +1386,38 @@ static void virtio_kill_sb(struct super_block *sb) virtio_fs_free_devs(vfs); } -static int virtio_fs_test_super(struct super_block *sb, - struct fs_context *fsc) +static void virtio_kill_sb(struct super_block *sb) { - struct fuse_conn *fc = fsc->s_fs_info; + struct fuse_mount *fm = get_fuse_mount_super(sb); + bool last; - return fc->iq.priv == get_fuse_conn_super(sb)->iq.priv; + /* If mount failed, we can still be called without any fc */ + if (sb->s_root) { + last = fuse_mount_remove(fm); + if (last) + virtio_fs_conn_destroy(fm); + } + kill_anon_super(sb); + fuse_mount_destroy(fm); } -static int virtio_fs_set_super(struct super_block *sb, - struct fs_context *fsc) +static int virtio_fs_test_super(struct super_block *sb, + struct fs_context *fsc) { - int err; + struct fuse_mount *fsc_fm = fsc->s_fs_info; + struct fuse_mount *sb_fm = get_fuse_mount_super(sb); - err = get_anon_bdev(&sb->s_dev); - if (!err) - fuse_conn_get(fsc->s_fs_info); - - return err; + return fsc_fm->fc->iq.priv == sb_fm->fc->iq.priv; } static int virtio_fs_get_tree(struct fs_context *fsc) { struct virtio_fs *fs; struct super_block *sb; - struct fuse_conn *fc; - int err; + struct fuse_conn *fc = NULL; + struct fuse_mount *fm; + unsigned int virtqueue_size; + int err = -EIO; /* This gets a reference on virtio_fs object. This ptr gets installed * in fc->iq->priv. Once fuse_conn is going away, it calls ->put() @@ -1171,27 +1429,38 @@ static int virtio_fs_get_tree(struct fs_context *fsc) return -EINVAL; } + virtqueue_size = virtqueue_get_vring_size(fs->vqs[VQ_REQUEST].vq); + if (WARN_ON(virtqueue_size <= FUSE_HEADER_OVERHEAD)) + goto out_err; + + err = -ENOMEM; fc = kzalloc(sizeof(struct fuse_conn), GFP_KERNEL); - if (!fc) { - mutex_lock(&virtio_fs_mutex); - virtio_fs_put(fs); - mutex_unlock(&virtio_fs_mutex); - return -ENOMEM; - } + if (!fc) + goto out_err; - fuse_conn_init(fc, get_user_ns(current_user_ns()), &virtio_fs_fiq_ops, - fs); + fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL); + if (!fm) + goto out_err; + + fuse_conn_init(fc, fm, fsc->user_ns, &virtio_fs_fiq_ops, fs); fc->release = fuse_free_conn; fc->delete_stale = true; + fc->auto_submounts = true; + fc->sync_fs = true; + + /* Tell FUSE to split requests that exceed the virtqueue's size */ + fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit, + virtqueue_size - FUSE_HEADER_OVERHEAD); - fsc->s_fs_info = fc; - sb = sget_fc(fsc, virtio_fs_test_super, virtio_fs_set_super); - fuse_conn_put(fc); + fsc->s_fs_info = fm; + sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc); + if (fsc->s_fs_info) + fuse_mount_destroy(fm); if (IS_ERR(sb)) return PTR_ERR(sb); if (!sb->s_root) { - err = virtio_fs_fill_super(sb); + err = virtio_fs_fill_super(sb, fsc); if (err) { deactivate_locked_super(sb); return err; @@ -1203,14 +1472,32 @@ static int virtio_fs_get_tree(struct fs_context *fsc) WARN_ON(fsc->root); fsc->root = dget(sb->s_root); return 0; + +out_err: + kfree(fc); + mutex_lock(&virtio_fs_mutex); + virtio_fs_put(fs); + mutex_unlock(&virtio_fs_mutex); + return err; } static const struct fs_context_operations virtio_fs_context_ops = { + .free = virtio_fs_free_fsc, + .parse_param = virtio_fs_parse_param, .get_tree = virtio_fs_get_tree, }; static int virtio_fs_init_fs_context(struct fs_context *fsc) { + struct fuse_fs_context *ctx; + + if (fsc->purpose == FS_CONTEXT_FOR_SUBMOUNT) + return fuse_init_fs_context_submount(fsc); + + ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + fsc->fs_private = 
ctx; fsc->ops = &virtio_fs_context_ops; return 0; } diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 20d052e08b3b..0d3e7177fce0 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -12,50 +12,52 @@ #include <linux/posix_acl_xattr.h> int fuse_setxattr(struct inode *inode, const char *name, const void *value, - size_t size, int flags) + size_t size, int flags, unsigned int extra_flags) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_setxattr_in inarg; int err; - if (fc->no_setxattr) + if (fm->fc->no_setxattr) return -EOPNOTSUPP; memset(&inarg, 0, sizeof(inarg)); inarg.size = size; inarg.flags = flags; + inarg.setxattr_flags = extra_flags; + args.opcode = FUSE_SETXATTR; args.nodeid = get_node_id(inode); args.in_numargs = 3; - args.in_args[0].size = sizeof(inarg); + args.in_args[0].size = fm->fc->setxattr_ext ? + sizeof(inarg) : FUSE_COMPAT_SETXATTR_IN_SIZE; args.in_args[0].value = &inarg; args.in_args[1].size = strlen(name) + 1; args.in_args[1].value = name; args.in_args[2].size = size; args.in_args[2].value = value; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { - fc->no_setxattr = 1; + fm->fc->no_setxattr = 1; err = -EOPNOTSUPP; } - if (!err) { - fuse_invalidate_attr(inode); + if (!err) fuse_update_ctime(inode); - } + return err; } ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, size_t size) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_getxattr_in inarg; struct fuse_getxattr_out outarg; ssize_t ret; - if (fc->no_getxattr) + if (fm->fc->no_getxattr) return -EOPNOTSUPP; memset(&inarg, 0, sizeof(inarg)); @@ -77,11 +79,11 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; } - ret = fuse_simple_request(fc, &args); + ret = fuse_simple_request(fm, &args); if (!ret && !size) ret = min_t(ssize_t, outarg.size, XATTR_SIZE_MAX); if (ret == -ENOSYS) { - fc->no_getxattr = 1; + fm->fc->no_getxattr = 1; ret = -EOPNOTSUPP; } return ret; @@ -107,16 +109,19 @@ static int fuse_verify_xattr_list(char *list, size_t size) ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) { struct inode *inode = d_inode(entry); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_getxattr_in inarg; struct fuse_getxattr_out outarg; ssize_t ret; - if (!fuse_allow_current_process(fc)) + if (fuse_is_bad(inode)) + return -EIO; + + if (!fuse_allow_current_process(fm->fc)) return -EACCES; - if (fc->no_listxattr) + if (fm->fc->no_listxattr) return -EOPNOTSUPP; memset(&inarg, 0, sizeof(inarg)); @@ -136,13 +141,13 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; } - ret = fuse_simple_request(fc, &args); + ret = fuse_simple_request(fm, &args); if (!ret && !size) ret = min_t(ssize_t, outarg.size, XATTR_LIST_MAX); if (ret > 0 && size) ret = fuse_verify_xattr_list(list, ret); if (ret == -ENOSYS) { - fc->no_listxattr = 1; + fm->fc->no_listxattr = 1; ret = -EOPNOTSUPP; } return ret; @@ -150,11 +155,11 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) int fuse_removexattr(struct inode *inode, const char *name) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = 
get_fuse_mount(inode); FUSE_ARGS(args); int err; - if (fc->no_removexattr) + if (fm->fc->no_removexattr) return -EOPNOTSUPP; args.opcode = FUSE_REMOVEXATTR; @@ -162,15 +167,14 @@ int fuse_removexattr(struct inode *inode, const char *name) args.in_numargs = 1; args.in_args[0].size = strlen(name) + 1; args.in_args[0].value = name; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { - fc->no_removexattr = 1; + fm->fc->no_removexattr = 1; err = -EOPNOTSUPP; } - if (!err) { - fuse_invalidate_attr(inode); + if (!err) fuse_update_ctime(inode); - } + return err; } @@ -178,18 +182,25 @@ static int fuse_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) { + if (fuse_is_bad(inode)) + return -EIO; + return fuse_getxattr(inode, name, value, size); } static int fuse_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) { + if (fuse_is_bad(inode)) + return -EIO; + if (!value) return fuse_removexattr(inode, name); - return fuse_setxattr(inode, name, value, size, flags); + return fuse_setxattr(inode, name, value, size, flags, 0); } static bool no_xattr_list(struct dentry *dentry) @@ -205,6 +216,7 @@ static int no_xattr_get(const struct xattr_handler *handler, } static int no_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *nodee, const char *name, const void *value, size_t size, int flags) |
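One more illustration for the xattr changes: when the server negotiated FUSE_SETXATTR_EXT, fuse_setxattr() sends the full fuse_setxattr_in (which now carries setxattr_flags); otherwise it sends only the legacy FUSE_COMPAT_SETXATTR_IN_SIZE prefix so older servers keep working. A minimal sketch, assuming the struct layout and flag name from the FUSE uapi header (not part of the patch):

#include <stdint.h>
#include <stdio.h>

struct fuse_setxattr_in {
	uint32_t size;
	uint32_t flags;
	uint32_t setxattr_flags;	/* new field, e.g. FUSE_SETXATTR_ACL_KILL_SGID */
	uint32_t padding;
};

#define FUSE_COMPAT_SETXATTR_IN_SIZE 8	/* legacy header: size + flags only */

int main(void)
{
	int setxattr_ext = 0;	/* pretend the server did not set FUSE_SETXATTR_EXT */
	size_t hdr = setxattr_ext ? sizeof(struct fuse_setxattr_in)
				  : FUSE_COMPAT_SETXATTR_IN_SIZE;

	printf("setxattr header bytes sent: %zu\n", hdr);	/* 8 legacy vs 16 extended */
	return 0;
}

This matches the in_args[0].size selection in the fuse_setxattr() hunk above, where fm->fc->setxattr_ext decides which header size goes on the wire.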