aboutsummaryrefslogtreecommitdiffstats
path: root/fs/fuse
diff options
context:
space:
mode:
Diffstat (limited to 'fs/fuse')
-rw-r--r--fs/fuse/Kconfig18
-rw-r--r--fs/fuse/Makefile6
-rw-r--r--fs/fuse/acl.c21
-rw-r--r--fs/fuse/control.c29
-rw-r--r--fs/fuse/cuse.c35
-rw-r--r--fs/fuse/dax.c1390
-rw-r--r--fs/fuse/dev.c296
-rw-r--r--fs/fuse/dir.c569
-rw-r--r--fs/fuse/file.c1332
-rw-r--r--fs/fuse/fuse_i.h321
-rw-r--r--fs/fuse/inode.c885
-rw-r--r--fs/fuse/ioctl.c504
-rw-r--r--fs/fuse/readdir.c37
-rw-r--r--fs/fuse/virtio_fs.c557
-rw-r--r--fs/fuse/xattr.c64
15 files changed, 4541 insertions, 1523 deletions
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index eb2a585572dc..038ed0b9aaa5 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -8,11 +8,11 @@ config FUSE_FS
There's also a companion library: libfuse2. This library is available
from the FUSE homepage:
- <http://fuse.sourceforge.net/>
+ <https://github.com/libfuse/>
although chances are your distribution already has that library
installed if you've installed the "fuse" package itself.
- See <file:Documentation/filesystems/fuse.txt> for more information.
+ See <file:Documentation/filesystems/fuse.rst> for more information.
See <file:Documentation/Changes> for needed library/utility version.
If you want to develop a userspace FS, or if you want to use
@@ -38,3 +38,17 @@ config VIRTIO_FS
If you want to share files between guests or with the host, answer Y
or M.
+
+config FUSE_DAX
+ bool "Virtio Filesystem Direct Host Memory Access support"
+ default y
+ select INTERVAL_TREE
+ depends on VIRTIO_FS
+ depends on FS_DAX
+ depends on DAX
+ help
+ This allows bypassing guest page cache and allows mapping host page
+ cache directly in guest address space.
+
+ If you want to allow mounting a Virtio Filesystem with the "dax"
+ option, answer Y.
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 3e8cebfb59b7..0c48b35c058d 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -7,5 +7,7 @@ obj-$(CONFIG_FUSE_FS) += fuse.o
obj-$(CONFIG_CUSE) += cuse.o
obj-$(CONFIG_VIRTIO_FS) += virtiofs.o
-fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o
-virtiofs-y += virtio_fs.o
+fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o
+fuse-$(CONFIG_FUSE_DAX) += dax.o
+
+virtiofs-y := virtio_fs.o
diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c
index 5a48cee6d7d3..337cb29a8dd5 100644
--- a/fs/fuse/acl.c
+++ b/fs/fuse/acl.c
@@ -11,7 +11,7 @@
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
-struct posix_acl *fuse_get_acl(struct inode *inode, int type)
+struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu)
{
struct fuse_conn *fc = get_fuse_conn(inode);
int size;
@@ -19,6 +19,12 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type)
void *value = NULL;
struct posix_acl *acl;
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
+ if (fuse_is_bad(inode))
+ return ERR_PTR(-EIO);
+
if (!fc->posix_acl || fc->no_getxattr)
return NULL;
@@ -47,12 +53,16 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type)
return acl;
}
-int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ struct posix_acl *acl, int type)
{
struct fuse_conn *fc = get_fuse_conn(inode);
const char *name;
int ret;
+ if (fuse_is_bad(inode))
+ return -EIO;
+
if (!fc->posix_acl || fc->no_setxattr)
return -EOPNOTSUPP;
@@ -64,6 +74,7 @@ int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type)
return -EINVAL;
if (acl) {
+ unsigned int extra_flags = 0;
/*
* Fuse userspace is responsible for updating access
* permissions in the inode, if needed. fuse_setxattr
@@ -87,7 +98,11 @@ int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type)
return ret;
}
- ret = fuse_setxattr(inode, name, value, size, 0);
+ if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) &&
+ !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
+ extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID;
+
+ ret = fuse_setxattr(inode, name, value, size, 0, extra_flags);
kfree(value);
} else {
ret = fuse_removexattr(inode, name);
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index c23f6f243ad4..247ef4f76761 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -120,7 +120,7 @@ static ssize_t fuse_conn_max_background_write(struct file *file,
const char __user *buf,
size_t count, loff_t *ppos)
{
- unsigned uninitialized_var(val);
+ unsigned val;
ssize_t ret;
ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
@@ -162,7 +162,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
const char __user *buf,
size_t count, loff_t *ppos)
{
- unsigned uninitialized_var(val);
+ unsigned val;
struct fuse_conn *fc;
ssize_t ret;
@@ -174,18 +174,11 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
if (!fc)
goto out;
+ down_read(&fc->killsb);
spin_lock(&fc->bg_lock);
fc->congestion_threshold = val;
- if (fc->sb) {
- if (fc->num_background < fc->congestion_threshold) {
- clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
- clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
- } else {
- set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
- set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
- }
- }
spin_unlock(&fc->bg_lock);
+ up_read(&fc->killsb);
fuse_conn_put(fc);
out:
return ret;
@@ -265,7 +258,7 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
struct dentry *parent;
char name[32];
- if (!fuse_control_sb)
+ if (!fuse_control_sb || fc->no_control)
return 0;
parent = fuse_control_sb->s_root;
@@ -303,7 +296,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc)
{
int i;
- if (!fuse_control_sb)
+ if (!fuse_control_sb || fc->no_control)
return;
for (i = fc->ctl_ndents - 1; i >= 0; i--) {
@@ -318,7 +311,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc)
drop_nlink(d_inode(fuse_control_sb->s_root));
}
-static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fctx)
+static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fsc)
{
static const struct tree_descr empty_descr = {""};
struct fuse_conn *fc;
@@ -344,18 +337,18 @@ static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fctx)
return 0;
}
-static int fuse_ctl_get_tree(struct fs_context *fc)
+static int fuse_ctl_get_tree(struct fs_context *fsc)
{
- return get_tree_single(fc, fuse_ctl_fill_super);
+ return get_tree_single(fsc, fuse_ctl_fill_super);
}
static const struct fs_context_operations fuse_ctl_context_ops = {
.get_tree = fuse_ctl_get_tree,
};
-static int fuse_ctl_init_fs_context(struct fs_context *fc)
+static int fuse_ctl_init_fs_context(struct fs_context *fsc)
{
- fc->ops = &fuse_ctl_context_ops;
+ fsc->ops = &fuse_ctl_context_ops;
return 0;
}
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 030f094910c3..c7d882a9fe33 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -57,6 +57,7 @@
struct cuse_conn {
struct list_head list; /* linked on cuse_conntbl */
+ struct fuse_mount fm; /* Dummy mount referencing fc */
struct fuse_conn fc; /* fuse connection */
struct cdev *cdev; /* associated character device */
struct device *dev; /* device representing @cdev */
@@ -134,7 +135,7 @@ static int cuse_open(struct inode *inode, struct file *file)
* Generic permission check is already done against the chrdev
* file, proceed to open.
*/
- rc = fuse_do_open(&cc->fc, 0, file, 0);
+ rc = fuse_do_open(&cc->fm, 0, file, 0);
if (rc)
fuse_conn_put(&cc->fc);
return rc;
@@ -143,10 +144,10 @@ static int cuse_open(struct inode *inode, struct file *file)
static int cuse_release(struct inode *inode, struct file *file)
{
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
fuse_sync_release(NULL, ff, file->f_flags);
- fuse_conn_put(fc);
+ fuse_conn_put(fm->fc);
return 0;
}
@@ -155,7 +156,7 @@ static long cuse_file_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
struct fuse_file *ff = file->private_data;
- struct cuse_conn *cc = fc_to_cc(ff->fc);
+ struct cuse_conn *cc = fc_to_cc(ff->fm->fc);
unsigned int flags = 0;
if (cc->unrestricted_ioctl)
@@ -168,7 +169,7 @@ static long cuse_file_compat_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
struct fuse_file *ff = file->private_data;
- struct cuse_conn *cc = fc_to_cc(ff->fc);
+ struct cuse_conn *cc = fc_to_cc(ff->fm->fc);
unsigned int flags = FUSE_IOCTL_COMPAT;
if (cc->unrestricted_ioctl)
@@ -270,7 +271,7 @@ static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp)
static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo)
{
char *end = p + len;
- char *uninitialized_var(key), *uninitialized_var(val);
+ char *key, *val;
int rc;
while (true) {
@@ -313,9 +314,10 @@ struct cuse_init_args {
* required data structures for it. Please read the comment at the
* top of this file for high level overview.
*/
-static void cuse_process_init_reply(struct fuse_conn *fc,
+static void cuse_process_init_reply(struct fuse_mount *fm,
struct fuse_args *args, int error)
{
+ struct fuse_conn *fc = fm->fc;
struct cuse_init_args *ia = container_of(args, typeof(*ia), ap.args);
struct fuse_args_pages *ap = &ia->ap;
struct cuse_conn *cc = fc_to_cc(fc), *pos;
@@ -424,7 +426,7 @@ static int cuse_send_init(struct cuse_conn *cc)
{
int rc;
struct page *page;
- struct fuse_conn *fc = &cc->fc;
+ struct fuse_mount *fm = &cc->fm;
struct cuse_init_args *ia;
struct fuse_args_pages *ap;
@@ -460,7 +462,7 @@ static int cuse_send_init(struct cuse_conn *cc)
ia->desc.length = ap->args.out_args[1].size;
ap->args.end = cuse_process_init_reply;
- rc = fuse_simple_background(fc, &ap->args, GFP_KERNEL);
+ rc = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
if (rc) {
kfree(ia);
err_free_page:
@@ -506,22 +508,21 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
* Limit the cuse channel to requests that can
* be represented in file->f_cred->user_ns.
*/
- fuse_conn_init(&cc->fc, file->f_cred->user_ns, &fuse_dev_fiq_ops, NULL);
+ fuse_conn_init(&cc->fc, &cc->fm, file->f_cred->user_ns,
+ &fuse_dev_fiq_ops, NULL);
+ cc->fc.release = cuse_fc_release;
fud = fuse_dev_alloc_install(&cc->fc);
- if (!fud) {
- kfree(cc);
+ fuse_conn_put(&cc->fc);
+ if (!fud)
return -ENOMEM;
- }
INIT_LIST_HEAD(&cc->list);
- cc->fc.release = cuse_fc_release;
cc->fc.initialized = 1;
rc = cuse_send_init(cc);
if (rc) {
fuse_dev_free(fud);
- fuse_conn_put(&cc->fc);
return rc;
}
file->private_data = fud;
@@ -558,8 +559,6 @@ static int cuse_channel_release(struct inode *inode, struct file *file)
unregister_chrdev_region(cc->cdev->dev, 1);
cdev_del(cc->cdev);
}
- /* Base reference is now owned by "fud" */
- fuse_conn_put(&cc->fc);
rc = fuse_dev_release(inode, file); /* puts the base reference */
@@ -624,6 +623,8 @@ static int __init cuse_init(void)
cuse_channel_fops.owner = THIS_MODULE;
cuse_channel_fops.open = cuse_channel_open;
cuse_channel_fops.release = cuse_channel_release;
+ /* CUSE is not prepared for FUSE_DEV_IOC_CLONE */
+ cuse_channel_fops.unlocked_ioctl = NULL;
cuse_class = class_create(THIS_MODULE, "cuse");
if (IS_ERR(cuse_class))
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
new file mode 100644
index 000000000000..e23e802a8013
--- /dev/null
+++ b/fs/fuse/dax.c
@@ -0,0 +1,1390 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * dax: direct host memory access
+ * Copyright (C) 2020 Red Hat, Inc.
+ */
+
+#include "fuse_i.h"
+
+#include <linux/delay.h>
+#include <linux/dax.h>
+#include <linux/uio.h>
+#include <linux/pagemap.h>
+#include <linux/pfn_t.h>
+#include <linux/iomap.h>
+#include <linux/interval_tree.h>
+
+/*
+ * Default memory range size. A power of 2 so it agrees with common FUSE_INIT
+ * map_alignment values 4KB and 64KB.
+ */
+#define FUSE_DAX_SHIFT 21
+#define FUSE_DAX_SZ (1 << FUSE_DAX_SHIFT)
+#define FUSE_DAX_PAGES (FUSE_DAX_SZ / PAGE_SIZE)
+
+/* Number of ranges reclaimer will try to free in one invocation */
+#define FUSE_DAX_RECLAIM_CHUNK (10)
+
+/*
+ * Dax memory reclaim threshold in percetage of total ranges. When free
+ * number of free ranges drops below this threshold, reclaim can trigger
+ * Default is 20%
+ */
+#define FUSE_DAX_RECLAIM_THRESHOLD (20)
+
+/** Translation information for file offsets to DAX window offsets */
+struct fuse_dax_mapping {
+ /* Pointer to inode where this memory range is mapped */
+ struct inode *inode;
+
+ /* Will connect in fcd->free_ranges to keep track of free memory */
+ struct list_head list;
+
+ /* For interval tree in file/inode */
+ struct interval_tree_node itn;
+
+ /* Will connect in fc->busy_ranges to keep track busy memory */
+ struct list_head busy_list;
+
+ /** Position in DAX window */
+ u64 window_offset;
+
+ /** Length of mapping, in bytes */
+ loff_t length;
+
+ /* Is this mapping read-only or read-write */
+ bool writable;
+
+ /* reference count when the mapping is used by dax iomap. */
+ refcount_t refcnt;
+};
+
+/* Per-inode dax map */
+struct fuse_inode_dax {
+ /* Semaphore to protect modifications to the dmap tree */
+ struct rw_semaphore sem;
+
+ /* Sorted rb tree of struct fuse_dax_mapping elements */
+ struct rb_root_cached tree;
+ unsigned long nr;
+};
+
+struct fuse_conn_dax {
+ /* DAX device */
+ struct dax_device *dev;
+
+ /* Lock protecting accessess to members of this structure */
+ spinlock_t lock;
+
+ /* List of memory ranges which are busy */
+ unsigned long nr_busy_ranges;
+ struct list_head busy_ranges;
+
+ /* Worker to free up memory ranges */
+ struct delayed_work free_work;
+
+ /* Wait queue for a dax range to become free */
+ wait_queue_head_t range_waitq;
+
+ /* DAX Window Free Ranges */
+ long nr_free_ranges;
+ struct list_head free_ranges;
+
+ unsigned long nr_ranges;
+};
+
+static inline struct fuse_dax_mapping *
+node_to_dmap(struct interval_tree_node *node)
+{
+ if (!node)
+ return NULL;
+
+ return container_of(node, struct fuse_dax_mapping, itn);
+}
+
+static struct fuse_dax_mapping *
+alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode);
+
+static void
+__kick_dmap_free_worker(struct fuse_conn_dax *fcd, unsigned long delay_ms)
+{
+ unsigned long free_threshold;
+
+ /* If number of free ranges are below threshold, start reclaim */
+ free_threshold = max_t(unsigned long, fcd->nr_ranges * FUSE_DAX_RECLAIM_THRESHOLD / 100,
+ 1);
+ if (fcd->nr_free_ranges < free_threshold)
+ queue_delayed_work(system_long_wq, &fcd->free_work,
+ msecs_to_jiffies(delay_ms));
+}
+
+static void kick_dmap_free_worker(struct fuse_conn_dax *fcd,
+ unsigned long delay_ms)
+{
+ spin_lock(&fcd->lock);
+ __kick_dmap_free_worker(fcd, delay_ms);
+ spin_unlock(&fcd->lock);
+}
+
+static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn_dax *fcd)
+{
+ struct fuse_dax_mapping *dmap;
+
+ spin_lock(&fcd->lock);
+ dmap = list_first_entry_or_null(&fcd->free_ranges,
+ struct fuse_dax_mapping, list);
+ if (dmap) {
+ list_del_init(&dmap->list);
+ WARN_ON(fcd->nr_free_ranges <= 0);
+ fcd->nr_free_ranges--;
+ }
+ __kick_dmap_free_worker(fcd, 0);
+ spin_unlock(&fcd->lock);
+
+ return dmap;
+}
+
+/* This assumes fcd->lock is held */
+static void __dmap_remove_busy_list(struct fuse_conn_dax *fcd,
+ struct fuse_dax_mapping *dmap)
+{
+ list_del_init(&dmap->busy_list);
+ WARN_ON(fcd->nr_busy_ranges == 0);
+ fcd->nr_busy_ranges--;
+}
+
+static void dmap_remove_busy_list(struct fuse_conn_dax *fcd,
+ struct fuse_dax_mapping *dmap)
+{
+ spin_lock(&fcd->lock);
+ __dmap_remove_busy_list(fcd, dmap);
+ spin_unlock(&fcd->lock);
+}
+
+/* This assumes fcd->lock is held */
+static void __dmap_add_to_free_pool(struct fuse_conn_dax *fcd,
+ struct fuse_dax_mapping *dmap)
+{
+ list_add_tail(&dmap->list, &fcd->free_ranges);
+ fcd->nr_free_ranges++;
+ wake_up(&fcd->range_waitq);
+}
+
+static void dmap_add_to_free_pool(struct fuse_conn_dax *fcd,
+ struct fuse_dax_mapping *dmap)
+{
+ /* Return fuse_dax_mapping to free list */
+ spin_lock(&fcd->lock);
+ __dmap_add_to_free_pool(fcd, dmap);
+ spin_unlock(&fcd->lock);
+}
+
+static int fuse_setup_one_mapping(struct inode *inode, unsigned long start_idx,
+ struct fuse_dax_mapping *dmap, bool writable,
+ bool upgrade)
+{
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_conn_dax *fcd = fm->fc->dax;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_setupmapping_in inarg;
+ loff_t offset = start_idx << FUSE_DAX_SHIFT;
+ FUSE_ARGS(args);
+ ssize_t err;
+
+ WARN_ON(fcd->nr_free_ranges < 0);
+
+ /* Ask fuse daemon to setup mapping */
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.foffset = offset;
+ inarg.fh = -1;
+ inarg.moffset = dmap->window_offset;
+ inarg.len = FUSE_DAX_SZ;
+ inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ;
+ if (writable)
+ inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE;
+ args.opcode = FUSE_SETUPMAPPING;
+ args.nodeid = fi->nodeid;
+ args.in_numargs = 1;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ err = fuse_simple_request(fm, &args);
+ if (err < 0)
+ return err;
+ dmap->writable = writable;
+ if (!upgrade) {
+ /*
+ * We don't take a reference on inode. inode is valid right now
+ * and when inode is going away, cleanup logic should first
+ * cleanup dmap entries.
+ */
+ dmap->inode = inode;
+ dmap->itn.start = dmap->itn.last = start_idx;
+ /* Protected by fi->dax->sem */
+ interval_tree_insert(&dmap->itn, &fi->dax->tree);
+ fi->dax->nr++;
+ spin_lock(&fcd->lock);
+ list_add_tail(&dmap->busy_list, &fcd->busy_ranges);
+ fcd->nr_busy_ranges++;
+ spin_unlock(&fcd->lock);
+ }
+ return 0;
+}
+
+static int fuse_send_removemapping(struct inode *inode,
+ struct fuse_removemapping_in *inargp,
+ struct fuse_removemapping_one *remove_one)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ FUSE_ARGS(args);
+
+ args.opcode = FUSE_REMOVEMAPPING;
+ args.nodeid = fi->nodeid;
+ args.in_numargs = 2;
+ args.in_args[0].size = sizeof(*inargp);
+ args.in_args[0].value = inargp;
+ args.in_args[1].size = inargp->count * sizeof(*remove_one);
+ args.in_args[1].value = remove_one;
+ return fuse_simple_request(fm, &args);
+}
+
+static int dmap_removemapping_list(struct inode *inode, unsigned int num,
+ struct list_head *to_remove)
+{
+ struct fuse_removemapping_one *remove_one, *ptr;
+ struct fuse_removemapping_in inarg;
+ struct fuse_dax_mapping *dmap;
+ int ret, i = 0, nr_alloc;
+
+ nr_alloc = min_t(unsigned int, num, FUSE_REMOVEMAPPING_MAX_ENTRY);
+ remove_one = kmalloc_array(nr_alloc, sizeof(*remove_one), GFP_NOFS);
+ if (!remove_one)
+ return -ENOMEM;
+
+ ptr = remove_one;
+ list_for_each_entry(dmap, to_remove, list) {
+ ptr->moffset = dmap->window_offset;
+ ptr->len = dmap->length;
+ ptr++;
+ i++;
+ num--;
+ if (i >= nr_alloc || num == 0) {
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.count = i;
+ ret = fuse_send_removemapping(inode, &inarg,
+ remove_one);
+ if (ret)
+ goto out;
+ ptr = remove_one;
+ i = 0;
+ }
+ }
+out:
+ kfree(remove_one);
+ return ret;
+}
+
+/*
+ * Cleanup dmap entry and add back to free list. This should be called with
+ * fcd->lock held.
+ */
+static void dmap_reinit_add_to_free_pool(struct fuse_conn_dax *fcd,
+ struct fuse_dax_mapping *dmap)
+{
+ pr_debug("fuse: freeing memory range start_idx=0x%lx end_idx=0x%lx window_offset=0x%llx length=0x%llx\n",
+ dmap->itn.start, dmap->itn.last, dmap->window_offset,
+ dmap->length);
+ __dmap_remove_busy_list(fcd, dmap);
+ dmap->inode = NULL;
+ dmap->itn.start = dmap->itn.last = 0;
+ __dmap_add_to_free_pool(fcd, dmap);
+}
+
+/*
+ * Free inode dmap entries whose range falls inside [start, end].
+ * Does not take any locks. At this point of time it should only be
+ * called from evict_inode() path where we know all dmap entries can be
+ * reclaimed.
+ */
+static void inode_reclaim_dmap_range(struct fuse_conn_dax *fcd,
+ struct inode *inode,
+ loff_t start, loff_t end)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_dax_mapping *dmap, *n;
+ int err, num = 0;
+ LIST_HEAD(to_remove);
+ unsigned long start_idx = start >> FUSE_DAX_SHIFT;
+ unsigned long end_idx = end >> FUSE_DAX_SHIFT;
+ struct interval_tree_node *node;
+
+ while (1) {
+ node = interval_tree_iter_first(&fi->dax->tree, start_idx,
+ end_idx);
+ if (!node)
+ break;
+ dmap = node_to_dmap(node);
+ /* inode is going away. There should not be any users of dmap */
+ WARN_ON(refcount_read(&dmap->refcnt) > 1);
+ interval_tree_remove(&dmap->itn, &fi->dax->tree);
+ num++;
+ list_add(&dmap->list, &to_remove);
+ }
+
+ /* Nothing to remove */
+ if (list_empty(&to_remove))
+ return;
+
+ WARN_ON(fi->dax->nr < num);
+ fi->dax->nr -= num;
+ err = dmap_removemapping_list(inode, num, &to_remove);
+ if (err && err != -ENOTCONN) {
+ pr_warn("Failed to removemappings. start=0x%llx end=0x%llx\n",
+ start, end);
+ }
+ spin_lock(&fcd->lock);
+ list_for_each_entry_safe(dmap, n, &to_remove, list) {
+ list_del_init(&dmap->list);
+ dmap_reinit_add_to_free_pool(fcd, dmap);
+ }
+ spin_unlock(&fcd->lock);
+}
+
+static int dmap_removemapping_one(struct inode *inode,
+ struct fuse_dax_mapping *dmap)
+{
+ struct fuse_removemapping_one forget_one;
+ struct fuse_removemapping_in inarg;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.count = 1;
+ memset(&forget_one, 0, sizeof(forget_one));
+ forget_one.moffset = dmap->window_offset;
+ forget_one.len = dmap->length;
+
+ return fuse_send_removemapping(inode, &inarg, &forget_one);
+}
+
+/*
+ * It is called from evict_inode() and by that time inode is going away. So
+ * this function does not take any locks like fi->dax->sem for traversing
+ * that fuse inode interval tree. If that lock is taken then lock validator
+ * complains of deadlock situation w.r.t fs_reclaim lock.
+ */
+void fuse_dax_inode_cleanup(struct inode *inode)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ /*
+ * fuse_evict_inode() has already called truncate_inode_pages_final()
+ * before we arrive here. So we should not have to worry about any
+ * pages/exception entries still associated with inode.
+ */
+ inode_reclaim_dmap_range(fc->dax, inode, 0, -1);
+ WARN_ON(fi->dax->nr);
+}
+
+static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length)
+{
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->length = length;
+ iomap->type = IOMAP_HOLE;
+}
+
+static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length,
+ struct iomap *iomap, struct fuse_dax_mapping *dmap,
+ unsigned int flags)
+{
+ loff_t offset, len;
+ loff_t i_size = i_size_read(inode);
+
+ offset = pos - (dmap->itn.start << FUSE_DAX_SHIFT);
+ len = min(length, dmap->length - offset);
+
+ /* If length is beyond end of file, truncate further */
+ if (pos + len > i_size)
+ len = i_size - pos;
+
+ if (len > 0) {
+ iomap->addr = dmap->window_offset + offset;
+ iomap->length = len;
+ if (flags & IOMAP_FAULT)
+ iomap->length = ALIGN(len, PAGE_SIZE);
+ iomap->type = IOMAP_MAPPED;
+ /*
+ * increace refcnt so that reclaim code knows this dmap is in
+ * use. This assumes fi->dax->sem mutex is held either
+ * shared/exclusive.
+ */
+ refcount_inc(&dmap->refcnt);
+
+ /* iomap->private should be NULL */
+ WARN_ON_ONCE(iomap->private);
+ iomap->private = dmap;
+ } else {
+ /* Mapping beyond end of file is hole */
+ fuse_fill_iomap_hole(iomap, length);
+ }
+}
+
+static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_conn_dax *fcd = fc->dax;
+ struct fuse_dax_mapping *dmap, *alloc_dmap = NULL;
+ int ret;
+ bool writable = flags & IOMAP_WRITE;
+ unsigned long start_idx = pos >> FUSE_DAX_SHIFT;
+ struct interval_tree_node *node;
+
+ /*
+ * Can't do inline reclaim in fault path. We call
+ * dax_layout_busy_page() before we free a range. And
+ * fuse_wait_dax_page() drops mapping->invalidate_lock and requires it.
+ * In fault path we enter with mapping->invalidate_lock held and can't
+ * drop it. Also in fault path we hold mapping->invalidate_lock shared
+ * and not exclusive, so that creates further issues with
+ * fuse_wait_dax_page(). Hence return -EAGAIN and fuse_dax_fault()
+ * will wait for a memory range to become free and retry.
+ */
+ if (flags & IOMAP_FAULT) {
+ alloc_dmap = alloc_dax_mapping(fcd);
+ if (!alloc_dmap)
+ return -EAGAIN;
+ } else {
+ alloc_dmap = alloc_dax_mapping_reclaim(fcd, inode);
+ if (IS_ERR(alloc_dmap))
+ return PTR_ERR(alloc_dmap);
+ }
+
+ /* If we are here, we should have memory allocated */
+ if (WARN_ON(!alloc_dmap))
+ return -EIO;
+
+ /*
+ * Take write lock so that only one caller can try to setup mapping
+ * and other waits.
+ */
+ down_write(&fi->dax->sem);
+ /*
+ * We dropped lock. Check again if somebody else setup
+ * mapping already.
+ */
+ node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+ if (node) {
+ dmap = node_to_dmap(node);
+ fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+ dmap_add_to_free_pool(fcd, alloc_dmap);
+ up_write(&fi->dax->sem);
+ return 0;
+ }
+
+ /* Setup one mapping */
+ ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, alloc_dmap,
+ writable, false);
+ if (ret < 0) {
+ dmap_add_to_free_pool(fcd, alloc_dmap);
+ up_write(&fi->dax->sem);
+ return ret;
+ }
+ fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags);
+ up_write(&fi->dax->sem);
+ return 0;
+}
+
+static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_dax_mapping *dmap;
+ int ret;
+ unsigned long idx = pos >> FUSE_DAX_SHIFT;
+ struct interval_tree_node *node;
+
+ /*
+ * Take exclusive lock so that only one caller can try to setup
+ * mapping and others wait.
+ */
+ down_write(&fi->dax->sem);
+ node = interval_tree_iter_first(&fi->dax->tree, idx, idx);
+
+ /* We are holding either inode lock or invalidate_lock, and that should
+ * ensure that dmap can't be truncated. We are holding a reference
+ * on dmap and that should make sure it can't be reclaimed. So dmap
+ * should still be there in tree despite the fact we dropped and
+ * re-acquired the fi->dax->sem lock.
+ */
+ ret = -EIO;
+ if (WARN_ON(!node))
+ goto out_err;
+
+ dmap = node_to_dmap(node);
+
+ /* We took an extra reference on dmap to make sure its not reclaimd.
+ * Now we hold fi->dax->sem lock and that reference is not needed
+ * anymore. Drop it.
+ */
+ if (refcount_dec_and_test(&dmap->refcnt)) {
+ /* refcount should not hit 0. This object only goes
+ * away when fuse connection goes away
+ */
+ WARN_ON_ONCE(1);
+ }
+
+ /* Maybe another thread already upgraded mapping while we were not
+ * holding lock.
+ */
+ if (dmap->writable) {
+ ret = 0;
+ goto out_fill_iomap;
+ }
+
+ ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, dmap, true,
+ true);
+ if (ret < 0)
+ goto out_err;
+out_fill_iomap:
+ fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+out_err:
+ up_write(&fi->dax->sem);
+ return ret;
+}
+
+/* This is just for DAX and the mapping is ephemeral, do not use it for other
+ * purposes since there is no block device with a permanent mapping.
+ */
+static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
+ unsigned int flags, struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_dax_mapping *dmap;
+ bool writable = flags & IOMAP_WRITE;
+ unsigned long start_idx = pos >> FUSE_DAX_SHIFT;
+ struct interval_tree_node *node;
+
+ /* We don't support FIEMAP */
+ if (WARN_ON(flags & IOMAP_REPORT))
+ return -EIO;
+
+ iomap->offset = pos;
+ iomap->flags = 0;
+ iomap->bdev = NULL;
+ iomap->dax_dev = fc->dax->dev;
+
+ /*
+ * Both read/write and mmap path can race here. So we need something
+ * to make sure if we are setting up mapping, then other path waits
+ *
+ * For now, use a semaphore for this. It probably needs to be
+ * optimized later.
+ */
+ down_read(&fi->dax->sem);
+ node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+ if (node) {
+ dmap = node_to_dmap(node);
+ if (writable && !dmap->writable) {
+ /* Upgrade read-only mapping to read-write. This will
+ * require exclusive fi->dax->sem lock as we don't want
+ * two threads to be trying to this simultaneously
+ * for same dmap. So drop shared lock and acquire
+ * exclusive lock.
+ *
+ * Before dropping fi->dax->sem lock, take reference
+ * on dmap so that its not freed by range reclaim.
+ */
+ refcount_inc(&dmap->refcnt);
+ up_read(&fi->dax->sem);
+ pr_debug("%s: Upgrading mapping at offset 0x%llx length 0x%llx\n",
+ __func__, pos, length);
+ return fuse_upgrade_dax_mapping(inode, pos, length,
+ flags, iomap);
+ } else {
+ fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+ up_read(&fi->dax->sem);
+ return 0;
+ }
+ } else {
+ up_read(&fi->dax->sem);
+ pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n",
+ __func__, pos, length);
+ if (pos >= i_size_read(inode))
+ goto iomap_hole;
+
+ return fuse_setup_new_dax_mapping(inode, pos, length, flags,
+ iomap);
+ }
+
+ /*
+ * If read beyond end of file happens, fs code seems to return
+ * it as hole
+ */
+iomap_hole:
+ fuse_fill_iomap_hole(iomap, length);
+ pr_debug("%s returning hole mapping. pos=0x%llx length_asked=0x%llx length_returned=0x%llx\n",
+ __func__, pos, length, iomap->length);
+ return 0;
+}
+
+static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+ ssize_t written, unsigned int flags,
+ struct iomap *iomap)
+{
+ struct fuse_dax_mapping *dmap = iomap->private;
+
+ if (dmap) {
+ if (refcount_dec_and_test(&dmap->refcnt)) {
+ /* refcount should not hit 0. This object only goes
+ * away when fuse connection goes away
+ */
+ WARN_ON_ONCE(1);
+ }
+ }
+
+ /* DAX writes beyond end-of-file aren't handled using iomap, so the
+ * file size is unchanged and there is nothing to do here.
+ */
+ return 0;
+}
+
+static const struct iomap_ops fuse_iomap_ops = {
+ .iomap_begin = fuse_iomap_begin,
+ .iomap_end = fuse_iomap_end,
+};
+
+static void fuse_wait_dax_page(struct inode *inode)
+{
+ filemap_invalidate_unlock(inode->i_mapping);
+ schedule();
+ filemap_invalidate_lock(inode->i_mapping);
+}
+
+/* Should be called with mapping->invalidate_lock held exclusively */
+static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
+ loff_t start, loff_t end)
+{
+ struct page *page;
+
+ page = dax_layout_busy_page_range(inode->i_mapping, start, end);
+ if (!page)
+ return 0;
+
+ *retry = true;
+ return ___wait_var_event(&page->_refcount,
+ atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
+ 0, 0, fuse_wait_dax_page(inode));
+}
+
+/* dmap_end == 0 leads to unmapping of whole file */
+int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start,
+ u64 dmap_end)
+{
+ bool retry;
+ int ret;
+
+ do {
+ retry = false;
+ ret = __fuse_dax_break_layouts(inode, &retry, dmap_start,
+ dmap_end);
+ } while (ret == 0 && retry);
+
+ return ret;
+}
+
+ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t ret;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock_shared(inode))
+ return -EAGAIN;
+ } else {
+ inode_lock_shared(inode);
+ }
+
+ ret = dax_iomap_rw(iocb, to, &fuse_iomap_ops);
+ inode_unlock_shared(inode);
+
+ /* TODO file_accessed(iocb->f_filp) */
+ return ret;
+}
+
+static bool file_extending_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ return (iov_iter_rw(from) == WRITE &&
+ ((iocb->ki_pos) >= i_size_read(inode) ||
+ (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode))));
+}
+
+static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
+ ssize_t ret;
+
+ ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
+
+ fuse_write_update_attr(inode, iocb->ki_pos, ret);
+ return ret;
+}
+
+ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t ret;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock(inode))
+ return -EAGAIN;
+ } else {
+ inode_lock(inode);
+ }
+
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto out;
+
+ ret = file_remove_privs(iocb->ki_filp);
+ if (ret)
+ goto out;
+ /* TODO file_update_time() but we don't want metadata I/O */
+
+ /* Do not use dax for file extending writes as write and on
+ * disk i_size increase are not atomic otherwise.
+ */
+ if (file_extending_write(iocb, from))
+ ret = fuse_dax_direct_write(iocb, from);
+ else
+ ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops);
+
+out:
+ inode_unlock(inode);
+
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
+ return ret;
+}
+
+static int fuse_dax_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+
+ struct inode *inode = mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ return dax_writeback_mapping_range(mapping, fc->dax->dev, wbc);
+}
+
+static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf,
+ enum page_entry_size pe_size, bool write)
+{
+ vm_fault_t ret;
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ struct super_block *sb = inode->i_sb;
+ pfn_t pfn;
+ int error = 0;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_conn_dax *fcd = fc->dax;
+ bool retry = false;
+
+ if (write)
+ sb_start_pagefault(sb);
+retry:
+ if (retry && !(fcd->nr_free_ranges > 0))
+ wait_event(fcd->range_waitq, (fcd->nr_free_ranges > 0));
+
+ /*
+ * We need to serialize against not only truncate but also against
+ * fuse dax memory range reclaim. While a range is being reclaimed,
+ * we do not want any read/write/mmap to make progress and try
+ * to populate page cache or access memory we are trying to free.
+ */
+ filemap_invalidate_lock_shared(inode->i_mapping);
+ ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops);
+ if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) {
+ error = 0;
+ retry = true;
+ filemap_invalidate_unlock_shared(inode->i_mapping);
+ goto retry;
+ }
+
+ if (ret & VM_FAULT_NEEDDSYNC)
+ ret = dax_finish_sync_fault(vmf, pe_size, pfn);
+ filemap_invalidate_unlock_shared(inode->i_mapping);
+
+ if (write)
+ sb_end_pagefault(sb);
+
+ return ret;
+}
+
+static vm_fault_t fuse_dax_fault(struct vm_fault *vmf)
+{
+ return __fuse_dax_fault(vmf, PE_SIZE_PTE,
+ vmf->flags & FAULT_FLAG_WRITE);
+}
+
+static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf,
+ enum page_entry_size pe_size)
+{
+ return __fuse_dax_fault(vmf, pe_size, vmf->flags & FAULT_FLAG_WRITE);
+}
+
+static vm_fault_t fuse_dax_page_mkwrite(struct vm_fault *vmf)
+{
+ return __fuse_dax_fault(vmf, PE_SIZE_PTE, true);
+}
+
+static vm_fault_t fuse_dax_pfn_mkwrite(struct vm_fault *vmf)
+{
+ return __fuse_dax_fault(vmf, PE_SIZE_PTE, true);
+}
+
+static const struct vm_operations_struct fuse_dax_vm_ops = {
+ .fault = fuse_dax_fault,
+ .huge_fault = fuse_dax_huge_fault,
+ .page_mkwrite = fuse_dax_page_mkwrite,
+ .pfn_mkwrite = fuse_dax_pfn_mkwrite,
+};
+
+int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ file_accessed(file);
+ vma->vm_ops = &fuse_dax_vm_ops;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+ return 0;
+}
+
+static int dmap_writeback_invalidate(struct inode *inode,
+ struct fuse_dax_mapping *dmap)
+{
+ int ret;
+ loff_t start_pos = dmap->itn.start << FUSE_DAX_SHIFT;
+ loff_t end_pos = (start_pos + FUSE_DAX_SZ - 1);
+
+ ret = filemap_fdatawrite_range(inode->i_mapping, start_pos, end_pos);
+ if (ret) {
+ pr_debug("fuse: filemap_fdatawrite_range() failed. err=%d start_pos=0x%llx, end_pos=0x%llx\n",
+ ret, start_pos, end_pos);
+ return ret;
+ }
+
+ ret = invalidate_inode_pages2_range(inode->i_mapping,
+ start_pos >> PAGE_SHIFT,
+ end_pos >> PAGE_SHIFT);
+ if (ret)
+ pr_debug("fuse: invalidate_inode_pages2_range() failed err=%d\n",
+ ret);
+
+ return ret;
+}
+
+static int reclaim_one_dmap_locked(struct inode *inode,
+ struct fuse_dax_mapping *dmap)
+{
+ int ret;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ /*
+ * igrab() was done to make sure inode won't go under us, and this
+ * further avoids the race with evict().
+ */
+ ret = dmap_writeback_invalidate(inode, dmap);
+ if (ret)
+ return ret;
+
+ /* Remove dax mapping from inode interval tree now */
+ interval_tree_remove(&dmap->itn, &fi->dax->tree);
+ fi->dax->nr--;
+
+ /* It is possible that umount/shutdown has killed the fuse connection
+ * and worker thread is trying to reclaim memory in parallel. Don't
+ * warn in that case.
+ */
+ ret = dmap_removemapping_one(inode, dmap);
+ if (ret && ret != -ENOTCONN) {
+ pr_warn("Failed to remove mapping. offset=0x%llx len=0x%llx ret=%d\n",
+ dmap->window_offset, dmap->length, ret);
+ }
+ return 0;
+}
+
+/* Find first mapped dmap for an inode and return file offset. Caller needs
+ * to hold fi->dax->sem lock either shared or exclusive.
+ */
+static struct fuse_dax_mapping *inode_lookup_first_dmap(struct inode *inode)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_dax_mapping *dmap;
+ struct interval_tree_node *node;
+
+ for (node = interval_tree_iter_first(&fi->dax->tree, 0, -1); node;
+ node = interval_tree_iter_next(node, 0, -1)) {
+ dmap = node_to_dmap(node);
+ /* still in use. */
+ if (refcount_read(&dmap->refcnt) > 1)
+ continue;
+
+ return dmap;
+ }
+
+ return NULL;
+}
+
+/*
+ * Find first mapping in the tree and free it and return it. Do not add
+ * it back to free pool.
+ */
+static struct fuse_dax_mapping *
+inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode,
+ bool *retry)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_dax_mapping *dmap;
+ u64 dmap_start, dmap_end;
+ unsigned long start_idx;
+ int ret;
+ struct interval_tree_node *node;
+
+ filemap_invalidate_lock(inode->i_mapping);
+
+ /* Lookup a dmap and corresponding file offset to reclaim. */
+ down_read(&fi->dax->sem);
+ dmap = inode_lookup_first_dmap(inode);
+ if (dmap) {
+ start_idx = dmap->itn.start;
+ dmap_start = start_idx << FUSE_DAX_SHIFT;
+ dmap_end = dmap_start + FUSE_DAX_SZ - 1;
+ }
+ up_read(&fi->dax->sem);
+
+ if (!dmap)
+ goto out_mmap_sem;
+ /*
+ * Make sure there are no references to inode pages using
+ * get_user_pages()
+ */
+ ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end);
+ if (ret) {
+ pr_debug("fuse: fuse_dax_break_layouts() failed. err=%d\n",
+ ret);
+ dmap = ERR_PTR(ret);
+ goto out_mmap_sem;
+ }
+
+ down_write(&fi->dax->sem);
+ node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+ /* Range already got reclaimed by somebody else */
+ if (!node) {
+ if (retry)
+ *retry = true;
+ goto out_write_dmap_sem;
+ }
+
+ dmap = node_to_dmap(node);
+ /* still in use. */
+ if (refcount_read(&dmap->refcnt) > 1) {
+ dmap = NULL;
+ if (retry)
+ *retry = true;
+ goto out_write_dmap_sem;
+ }
+
+ ret = reclaim_one_dmap_locked(inode, dmap);
+ if (ret < 0) {
+ dmap = ERR_PTR(ret);
+ goto out_write_dmap_sem;
+ }
+
+ /* Clean up dmap. Do not add back to free list */
+ dmap_remove_busy_list(fcd, dmap);
+ dmap->inode = NULL;
+ dmap->itn.start = dmap->itn.last = 0;
+
+ pr_debug("fuse: %s: inline reclaimed memory range. inode=%p, window_offset=0x%llx, length=0x%llx\n",
+ __func__, inode, dmap->window_offset, dmap->length);
+
+out_write_dmap_sem:
+ up_write(&fi->dax->sem);
+out_mmap_sem:
+ filemap_invalidate_unlock(inode->i_mapping);
+ return dmap;
+}
+
+static struct fuse_dax_mapping *
+alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode)
+{
+ struct fuse_dax_mapping *dmap;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ while (1) {
+ bool retry = false;
+
+ dmap = alloc_dax_mapping(fcd);
+ if (dmap)
+ return dmap;
+
+ dmap = inode_inline_reclaim_one_dmap(fcd, inode, &retry);
+ /*
+ * Either we got a mapping or it is an error, return in both
+ * the cases.
+ */
+ if (dmap)
+ return dmap;
+
+ /* If we could not reclaim a mapping because it
+ * had a reference or some other temporary failure,
+ * Try again. We want to give up inline reclaim only
+ * if there is no range assigned to this node. Otherwise
+ * if a deadlock is possible if we sleep with
+ * mapping->invalidate_lock held and worker to free memory
+ * can't make progress due to unavailability of
+ * mapping->invalidate_lock. So sleep only if fi->dax->nr=0
+ */
+ if (retry)
+ continue;
+ /*
+ * There are no mappings which can be reclaimed. Wait for one.
+ * We are not holding fi->dax->sem. So it is possible
+ * that range gets added now. But as we are not holding
+ * mapping->invalidate_lock, worker should still be able to
+ * free up a range and wake us up.
+ */
+ if (!fi->dax->nr && !(fcd->nr_free_ranges > 0)) {
+ if (wait_event_killable_exclusive(fcd->range_waitq,
+ (fcd->nr_free_ranges > 0))) {
+ return ERR_PTR(-EINTR);
+ }
+ }
+ }
+}
+
+static int lookup_and_reclaim_dmap_locked(struct fuse_conn_dax *fcd,
+ struct inode *inode,
+ unsigned long start_idx)
+{
+ int ret;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_dax_mapping *dmap;
+ struct interval_tree_node *node;
+
+ /* Find fuse dax mapping at file offset inode. */
+ node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+
+ /* Range already got cleaned up by somebody else */
+ if (!node)
+ return 0;
+ dmap = node_to_dmap(node);
+
+ /* still in use. */
+ if (refcount_read(&dmap->refcnt) > 1)
+ return 0;
+
+ ret = reclaim_one_dmap_locked(inode, dmap);
+ if (ret < 0)
+ return ret;
+
+ /* Cleanup dmap entry and add back to free list */
+ spin_lock(&fcd->lock);
+ dmap_reinit_add_to_free_pool(fcd, dmap);
+ spin_unlock(&fcd->lock);
+ return ret;
+}
+
+/*
+ * Free a range of memory.
+ * Locking:
+ * 1. Take mapping->invalidate_lock to block dax faults.
+ * 2. Take fi->dax->sem to protect interval tree and also to make sure
+ * read/write can not reuse a dmap which we might be freeing.
+ */
+static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd,
+ struct inode *inode,
+ unsigned long start_idx,
+ unsigned long end_idx)
+{
+ int ret;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ loff_t dmap_start = start_idx << FUSE_DAX_SHIFT;
+ loff_t dmap_end = (dmap_start + FUSE_DAX_SZ) - 1;
+
+ filemap_invalidate_lock(inode->i_mapping);
+ ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end);
+ if (ret) {
+ pr_debug("virtio_fs: fuse_dax_break_layouts() failed. err=%d\n",
+ ret);
+ goto out_mmap_sem;
+ }
+
+ down_write(&fi->dax->sem);
+ ret = lookup_and_reclaim_dmap_locked(fcd, inode, start_idx);
+ up_write(&fi->dax->sem);
+out_mmap_sem:
+ filemap_invalidate_unlock(inode->i_mapping);
+ return ret;
+}
+
+static int try_to_free_dmap_chunks(struct fuse_conn_dax *fcd,
+ unsigned long nr_to_free)
+{
+ struct fuse_dax_mapping *dmap, *pos, *temp;
+ int ret, nr_freed = 0;
+ unsigned long start_idx = 0, end_idx = 0;
+ struct inode *inode = NULL;
+
+ /* Pick first busy range and free it for now*/
+ while (1) {
+ if (nr_freed >= nr_to_free)
+ break;
+
+ dmap = NULL;
+ spin_lock(&fcd->lock);
+
+ if (!fcd->nr_busy_ranges) {
+ spin_unlock(&fcd->lock);
+ return 0;
+ }
+
+ list_for_each_entry_safe(pos, temp, &fcd->busy_ranges,
+ busy_list) {
+ /* skip this range if it's in use. */
+ if (refcount_read(&pos->refcnt) > 1)
+ continue;
+
+ inode = igrab(pos->inode);
+ /*
+ * This inode is going away. That will free
+ * up all the ranges anyway, continue to
+ * next range.
+ */
+ if (!inode)
+ continue;
+ /*
+ * Take this element off list and add it tail. If
+ * this element can't be freed, it will help with
+ * selecting new element in next iteration of loop.
+ */
+ dmap = pos;
+ list_move_tail(&dmap->busy_list, &fcd->busy_ranges);
+ start_idx = end_idx = dmap->itn.start;
+ break;
+ }
+ spin_unlock(&fcd->lock);
+ if (!dmap)
+ return 0;
+
+ ret = lookup_and_reclaim_dmap(fcd, inode, start_idx, end_idx);
+ iput(inode);
+ if (ret)
+ return ret;
+ nr_freed++;
+ }
+ return 0;
+}
+
+static void fuse_dax_free_mem_worker(struct work_struct *work)
+{
+ int ret;
+ struct fuse_conn_dax *fcd = container_of(work, struct fuse_conn_dax,
+ free_work.work);
+ ret = try_to_free_dmap_chunks(fcd, FUSE_DAX_RECLAIM_CHUNK);
+ if (ret) {
+ pr_debug("fuse: try_to_free_dmap_chunks() failed with err=%d\n",
+ ret);
+ }
+
+ /* If number of free ranges are still below threshold, requeue */
+ kick_dmap_free_worker(fcd, 1);
+}
+
+static void fuse_free_dax_mem_ranges(struct list_head *mem_list)
+{
+ struct fuse_dax_mapping *range, *temp;
+
+ /* Free All allocated elements */
+ list_for_each_entry_safe(range, temp, mem_list, list) {
+ list_del(&range->list);
+ if (!list_empty(&range->busy_list))
+ list_del(&range->busy_list);
+ kfree(range);
+ }
+}
+
+void fuse_dax_conn_free(struct fuse_conn *fc)
+{
+ if (fc->dax) {
+ fuse_free_dax_mem_ranges(&fc->dax->free_ranges);
+ kfree(fc->dax);
+ }
+}
+
+static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd)
+{
+ long nr_pages, nr_ranges;
+ struct fuse_dax_mapping *range;
+ int ret, id;
+ size_t dax_size = -1;
+ unsigned long i;
+
+ init_waitqueue_head(&fcd->range_waitq);
+ INIT_LIST_HEAD(&fcd->free_ranges);
+ INIT_LIST_HEAD(&fcd->busy_ranges);
+ INIT_DELAYED_WORK(&fcd->free_work, fuse_dax_free_mem_worker);
+
+ id = dax_read_lock();
+ nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size),
+ DAX_ACCESS, NULL, NULL);
+ dax_read_unlock(id);
+ if (nr_pages < 0) {
+ pr_debug("dax_direct_access() returned %ld\n", nr_pages);
+ return nr_pages;
+ }
+
+ nr_ranges = nr_pages/FUSE_DAX_PAGES;
+ pr_debug("%s: dax mapped %ld pages. nr_ranges=%ld\n",
+ __func__, nr_pages, nr_ranges);
+
+ for (i = 0; i < nr_ranges; i++) {
+ range = kzalloc(sizeof(struct fuse_dax_mapping), GFP_KERNEL);
+ ret = -ENOMEM;
+ if (!range)
+ goto out_err;
+
+ /* TODO: This offset only works if virtio-fs driver is not
+ * having some memory hidden at the beginning. This needs
+ * better handling
+ */
+ range->window_offset = i * FUSE_DAX_SZ;
+ range->length = FUSE_DAX_SZ;
+ INIT_LIST_HEAD(&range->busy_list);
+ refcount_set(&range->refcnt, 1);
+ list_add_tail(&range->list, &fcd->free_ranges);
+ }
+
+ fcd->nr_free_ranges = nr_ranges;
+ fcd->nr_ranges = nr_ranges;
+ return 0;
+out_err:
+ /* Free All allocated elements */
+ fuse_free_dax_mem_ranges(&fcd->free_ranges);
+ return ret;
+}
+
+int fuse_dax_conn_alloc(struct fuse_conn *fc, enum fuse_dax_mode dax_mode,
+ struct dax_device *dax_dev)
+{
+ struct fuse_conn_dax *fcd;
+ int err;
+
+ fc->dax_mode = dax_mode;
+
+ if (!dax_dev)
+ return 0;
+
+ fcd = kzalloc(sizeof(*fcd), GFP_KERNEL);
+ if (!fcd)
+ return -ENOMEM;
+
+ spin_lock_init(&fcd->lock);
+ fcd->dev = dax_dev;
+ err = fuse_dax_mem_range_init(fcd);
+ if (err) {
+ kfree(fcd);
+ return err;
+ }
+
+ fc->dax = fcd;
+ return 0;
+}
+
+bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi)
+{
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
+
+ fi->dax = NULL;
+ if (fc->dax) {
+ fi->dax = kzalloc(sizeof(*fi->dax), GFP_KERNEL_ACCOUNT);
+ if (!fi->dax)
+ return false;
+
+ init_rwsem(&fi->dax->sem);
+ fi->dax->tree = RB_ROOT_CACHED;
+ }
+
+ return true;
+}
+
+static const struct address_space_operations fuse_dax_file_aops = {
+ .writepages = fuse_dax_writepages,
+ .direct_IO = noop_direct_IO,
+ .dirty_folio = noop_dirty_folio,
+};
+
+static bool fuse_should_enable_dax(struct inode *inode, unsigned int flags)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ enum fuse_dax_mode dax_mode = fc->dax_mode;
+
+ if (dax_mode == FUSE_DAX_NEVER)
+ return false;
+
+ /*
+ * fc->dax may be NULL in 'inode' mode when filesystem device doesn't
+ * support DAX, in which case it will silently fallback to 'never' mode.
+ */
+ if (!fc->dax)
+ return false;
+
+ if (dax_mode == FUSE_DAX_ALWAYS)
+ return true;
+
+ /* dax_mode is FUSE_DAX_INODE* */
+ return fc->inode_dax && (flags & FUSE_ATTR_DAX);
+}
+
+void fuse_dax_inode_init(struct inode *inode, unsigned int flags)
+{
+ if (!fuse_should_enable_dax(inode, flags))
+ return;
+
+ inode->i_flags |= S_DAX;
+ inode->i_data.a_ops = &fuse_dax_file_aops;
+}
+
+void fuse_dax_dontcache(struct inode *inode, unsigned int flags)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ if (fuse_is_inode_dax_mode(fc->dax_mode) &&
+ ((bool) IS_DAX(inode) != (bool) (flags & FUSE_ATTR_DAX)))
+ d_mark_dontcache(inode);
+}
+
+bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment)
+{
+ if (fc->dax && (map_alignment > FUSE_DAX_SHIFT)) {
+ pr_warn("FUSE: map_alignment %u incompatible with dax mem range size %u\n",
+ map_alignment, FUSE_DAX_SZ);
+ return false;
+ }
+ return true;
+}
+
+void fuse_dax_cancel_work(struct fuse_conn *fc)
+{
+ struct fuse_conn_dax *fcd = fc->dax;
+
+ if (fcd)
+ cancel_delayed_work_sync(&fcd->free_work);
+
+}
+EXPORT_SYMBOL_GPL(fuse_dax_cancel_work);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 97eec7522bf2..b4a6e0a1b945 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -40,20 +40,21 @@ static struct fuse_dev *fuse_get_dev(struct file *file)
return READ_ONCE(file->private_data);
}
-static void fuse_request_init(struct fuse_req *req)
+static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req)
{
INIT_LIST_HEAD(&req->list);
INIT_LIST_HEAD(&req->intr_entry);
init_waitqueue_head(&req->waitq);
refcount_set(&req->count, 1);
__set_bit(FR_PENDING, &req->flags);
+ req->fm = fm;
}
-static struct fuse_req *fuse_request_alloc(gfp_t flags)
+static struct fuse_req *fuse_request_alloc(struct fuse_mount *fm, gfp_t flags)
{
struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags);
if (req)
- fuse_request_init(req);
+ fuse_request_init(fm, req);
return req;
}
@@ -90,7 +91,7 @@ static void fuse_drop_waiting(struct fuse_conn *fc)
{
/*
* lockess check of fc->connected is okay, because atomic_dec_and_test()
- * provides a memory barrier mached with the one in fuse_wait_aborted()
+ * provides a memory barrier matched with the one in fuse_wait_aborted()
* to ensure no wake-up is missed.
*/
if (atomic_dec_and_test(&fc->num_waiting) &&
@@ -100,10 +101,11 @@ static void fuse_drop_waiting(struct fuse_conn *fc)
}
}
-static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
+static void fuse_put_request(struct fuse_req *req);
-static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background)
+static struct fuse_req *fuse_get_req(struct fuse_mount *fm, bool for_background)
{
+ struct fuse_conn *fc = fm->fc;
struct fuse_req *req;
int err;
atomic_inc(&fc->num_waiting);
@@ -125,7 +127,7 @@ static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background)
if (fc->conn_error)
goto out;
- req = fuse_request_alloc(GFP_KERNEL);
+ req = fuse_request_alloc(fm, GFP_KERNEL);
err = -ENOMEM;
if (!req) {
if (for_background)
@@ -143,7 +145,7 @@ static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background)
if (unlikely(req->in.h.uid == ((uid_t)-1) ||
req->in.h.gid == ((gid_t)-1))) {
- fuse_put_request(fc, req);
+ fuse_put_request(req);
return ERR_PTR(-EOVERFLOW);
}
return req;
@@ -153,8 +155,10 @@ static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background)
return ERR_PTR(err);
}
-static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_put_request(struct fuse_req *req)
{
+ struct fuse_conn *fc = req->fm->fc;
+
if (refcount_dec_and_test(&req->count)) {
if (test_bit(FR_BACKGROUND, &req->flags)) {
/*
@@ -273,8 +277,10 @@ static void flush_bg_queue(struct fuse_conn *fc)
* the 'end' callback is called if given, else the reference to the
* request is released
*/
-void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_end(struct fuse_req *req)
{
+ struct fuse_mount *fm = req->fm;
+ struct fuse_conn *fc = fm->fc;
struct fuse_iqueue *fiq = &fc->iq;
if (test_and_set_bit(FR_FINISHED, &req->flags))
@@ -282,10 +288,10 @@ void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req)
/*
* test_and_set_bit() implies smp_mb() between bit
- * changing and below intr_entry check. Pairs with
+ * changing and below FR_INTERRUPTED check. Pairs with
* smp_mb() from queue_interrupt().
*/
- if (!list_empty(&req->intr_entry)) {
+ if (test_bit(FR_INTERRUPTED, &req->flags)) {
spin_lock(&fiq->lock);
list_del_init(&req->intr_entry);
spin_unlock(&fiq->lock);
@@ -309,10 +315,6 @@ void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req)
wake_up(&fc->blocked_waitq);
}
- if (fc->num_background == fc->congestion_threshold && fc->sb) {
- clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
- clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
- }
fc->num_background--;
fc->active_background--;
flush_bg_queue(fc);
@@ -323,14 +325,16 @@ void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req)
}
if (test_bit(FR_ASYNC, &req->flags))
- req->args->end(fc, req->args, req->out.h.error);
+ req->args->end(fm, req->args, req->out.h.error);
put_request:
- fuse_put_request(fc, req);
+ fuse_put_request(req);
}
EXPORT_SYMBOL_GPL(fuse_request_end);
-static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
+static int queue_interrupt(struct fuse_req *req)
{
+ struct fuse_iqueue *fiq = &req->fm->fc->iq;
+
spin_lock(&fiq->lock);
/* Check for we've sent request to interrupt this req */
if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) {
@@ -342,7 +346,7 @@ static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
list_add_tail(&req->intr_entry, &fiq->interrupts);
/*
* Pairs with smp_mb() implied by test_and_set_bit()
- * from request_end().
+ * from fuse_request_end().
*/
smp_mb();
if (test_bit(FR_FINISHED, &req->flags)) {
@@ -357,8 +361,9 @@ static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
return 0;
}
-static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
+static void request_wait_answer(struct fuse_req *req)
{
+ struct fuse_conn *fc = req->fm->fc;
struct fuse_iqueue *fiq = &fc->iq;
int err;
@@ -373,7 +378,7 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
/* matches barrier in fuse_dev_do_read() */
smp_mb__after_atomic();
if (test_bit(FR_SENT, &req->flags))
- queue_interrupt(fiq, req);
+ queue_interrupt(req);
}
if (!test_bit(FR_FORCE, &req->flags)) {
@@ -402,9 +407,9 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags));
}
-static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
+static void __fuse_request_send(struct fuse_req *req)
{
- struct fuse_iqueue *fiq = &fc->iq;
+ struct fuse_iqueue *fiq = &req->fm->fc->iq;
BUG_ON(test_bit(FR_BACKGROUND, &req->flags));
spin_lock(&fiq->lock);
@@ -418,7 +423,7 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
__fuse_get_request(req);
queue_request_and_unlock(fiq, req);
- request_wait_answer(fc, req);
+ request_wait_answer(req);
/* Pairs with smp_wmb() in fuse_request_end() */
smp_rmb();
}
@@ -457,8 +462,10 @@ static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args)
}
}
-static void fuse_force_creds(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_force_creds(struct fuse_req *req)
{
+ struct fuse_conn *fc = req->fm->fc;
+
req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid());
req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid());
req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
@@ -473,23 +480,24 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args)
__set_bit(FR_ASYNC, &req->flags);
}
-ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
+ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args)
{
+ struct fuse_conn *fc = fm->fc;
struct fuse_req *req;
ssize_t ret;
if (args->force) {
atomic_inc(&fc->num_waiting);
- req = fuse_request_alloc(GFP_KERNEL | __GFP_NOFAIL);
+ req = fuse_request_alloc(fm, GFP_KERNEL | __GFP_NOFAIL);
if (!args->nocreds)
- fuse_force_creds(fc, req);
+ fuse_force_creds(req);
__set_bit(FR_WAITING, &req->flags);
__set_bit(FR_FORCE, &req->flags);
} else {
WARN_ON(args->nocreds);
- req = fuse_get_req(fc, false);
+ req = fuse_get_req(fm, false);
if (IS_ERR(req))
return PTR_ERR(req);
}
@@ -500,20 +508,21 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
if (!args->noreply)
__set_bit(FR_ISREPLY, &req->flags);
- __fuse_request_send(fc, req);
+ __fuse_request_send(req);
ret = req->out.h.error;
if (!ret && args->out_argvar) {
BUG_ON(args->out_numargs == 0);
ret = args->out_args[args->out_numargs - 1].size;
}
- fuse_put_request(fc, req);
+ fuse_put_request(req);
return ret;
}
-static bool fuse_request_queue_background(struct fuse_conn *fc,
- struct fuse_req *req)
+static bool fuse_request_queue_background(struct fuse_req *req)
{
+ struct fuse_mount *fm = req->fm;
+ struct fuse_conn *fc = fm->fc;
bool queued = false;
WARN_ON(!test_bit(FR_BACKGROUND, &req->flags));
@@ -527,10 +536,6 @@ static bool fuse_request_queue_background(struct fuse_conn *fc,
fc->num_background++;
if (fc->num_background == fc->max_background)
fc->blocked = 1;
- if (fc->num_background == fc->congestion_threshold && fc->sb) {
- set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
- set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
- }
list_add_tail(&req->list, &fc->bg_queue);
flush_bg_queue(fc);
queued = true;
@@ -540,28 +545,28 @@ static bool fuse_request_queue_background(struct fuse_conn *fc,
return queued;
}
-int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args,
+int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args,
gfp_t gfp_flags)
{
struct fuse_req *req;
if (args->force) {
WARN_ON(!args->nocreds);
- req = fuse_request_alloc(gfp_flags);
+ req = fuse_request_alloc(fm, gfp_flags);
if (!req)
return -ENOMEM;
__set_bit(FR_BACKGROUND, &req->flags);
} else {
WARN_ON(args->nocreds);
- req = fuse_get_req(fc, true);
+ req = fuse_get_req(fm, true);
if (IS_ERR(req))
return PTR_ERR(req);
}
fuse_args_to_req(req, args);
- if (!fuse_request_queue_background(fc, req)) {
- fuse_put_request(fc, req);
+ if (!fuse_request_queue_background(req)) {
+ fuse_put_request(req);
return -ENOTCONN;
}
@@ -569,14 +574,14 @@ int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args,
}
EXPORT_SYMBOL_GPL(fuse_simple_background);
-static int fuse_simple_notify_reply(struct fuse_conn *fc,
+static int fuse_simple_notify_reply(struct fuse_mount *fm,
struct fuse_args *args, u64 unique)
{
struct fuse_req *req;
- struct fuse_iqueue *fiq = &fc->iq;
+ struct fuse_iqueue *fiq = &fm->fc->iq;
int err = 0;
- req = fuse_get_req(fc, false);
+ req = fuse_get_req(fm, false);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -591,7 +596,7 @@ static int fuse_simple_notify_reply(struct fuse_conn *fc,
} else {
err = -ENODEV;
spin_unlock(&fiq->lock);
- fuse_put_request(fc, req);
+ fuse_put_request(req);
}
return err;
@@ -725,14 +730,13 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
}
} else {
size_t off;
- err = iov_iter_get_pages(cs->iter, &page, PAGE_SIZE, 1, &off);
+ err = iov_iter_get_pages2(cs->iter, &page, PAGE_SIZE, 1, &off);
if (err < 0)
return err;
BUG_ON(!err);
cs->len = err;
cs->offset = off;
cs->pg = page;
- iov_iter_advance(cs->iter, err);
}
return lock_request(cs->req);
@@ -743,7 +747,7 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
{
unsigned ncpy = min(*size, cs->len);
if (val) {
- void *pgaddr = kmap_atomic(cs->pg);
+ void *pgaddr = kmap_local_page(cs->pg);
void *buf = pgaddr + cs->offset;
if (cs->write)
@@ -751,7 +755,7 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
else
memcpy(*val, buf, ncpy);
- kunmap_atomic(pgaddr);
+ kunmap_local(pgaddr);
*val += ncpy;
}
*size -= ncpy;
@@ -764,16 +768,17 @@ static int fuse_check_page(struct page *page)
{
if (page_mapcount(page) ||
page->mapping != NULL ||
- page_count(page) != 1 ||
(page->flags & PAGE_FLAGS_CHECK_AT_PREP &
~(1 << PG_locked |
1 << PG_referenced |
1 << PG_uptodate |
1 << PG_lru |
1 << PG_active |
- 1 << PG_reclaim))) {
- pr_warn("trying to steal weird page\n");
- pr_warn(" page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping);
+ 1 << PG_workingset |
+ 1 << PG_reclaim |
+ 1 << PG_waiters |
+ LRU_GEN_MASK | LRU_REFS_MASK))) {
+ dump_page(page, "fuse: trying to steal weird page");
return 1;
}
return 0;
@@ -786,15 +791,16 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
struct page *newpage;
struct pipe_buffer *buf = cs->pipebufs;
+ get_page(oldpage);
err = unlock_request(cs->req);
if (err)
- return err;
+ goto out_put_old;
fuse_copy_finish(cs);
err = pipe_buf_confirm(cs->pipe, buf);
if (err)
- return err;
+ goto out_put_old;
BUG_ON(!cs->nr_segs);
cs->currbuf = buf;
@@ -805,7 +811,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
if (cs->len != PAGE_SIZE)
goto out_fallback;
- if (pipe_buf_steal(cs->pipe, buf) != 0)
+ if (!pipe_buf_try_steal(cs->pipe, buf))
goto out_fallback;
newpage = buf->page;
@@ -831,16 +837,18 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
if (WARN_ON(PageMlocked(oldpage)))
goto out_fallback_unlock;
- err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL);
- if (err) {
- unlock_page(newpage);
- return err;
- }
+ replace_page_cache_page(oldpage, newpage);
get_page(newpage);
if (!(buf->flags & PIPE_BUF_FLAG_LRU))
- lru_cache_add_file(newpage);
+ lru_cache_add(newpage);
+
+ /*
+ * Release while we have extra ref on stolen page. Otherwise
+ * anon_pipe_buf_release() might think the page can be reused.
+ */
+ pipe_buf_release(cs->pipe, buf);
err = 0;
spin_lock(&cs->req->waitq.lock);
@@ -853,14 +861,19 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
if (err) {
unlock_page(newpage);
put_page(newpage);
- return err;
+ goto out_put_old;
}
unlock_page(oldpage);
+ /* Drop ref for ap->pages[] array */
put_page(oldpage);
cs->len = 0;
- return 0;
+ err = 0;
+out_put_old:
+ /* Drop ref obtained in this function */
+ put_page(oldpage);
+ return err;
out_fallback_unlock:
unlock_page(newpage);
@@ -869,10 +882,10 @@ out_fallback:
cs->offset = buf->offset;
err = lock_request(cs->req);
- if (err)
- return err;
+ if (!err)
+ err = 1;
- return 1;
+ goto out_put_old;
}
static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
@@ -884,14 +897,16 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
if (cs->nr_segs >= cs->pipe->max_usage)
return -EIO;
+ get_page(page);
err = unlock_request(cs->req);
- if (err)
+ if (err) {
+ put_page(page);
return err;
+ }
fuse_copy_finish(cs);
buf = cs->pipebufs;
- get_page(page);
buf->page = page;
buf->offset = offset;
buf->len = count;
@@ -918,7 +933,17 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
while (count) {
if (cs->write && cs->pipebufs && page) {
- return fuse_ref_page(cs, page, offset, count);
+ /*
+ * Can't control lifetime of pipe buffers, so always
+ * copy user pages.
+ */
+ if (cs->req->args->user_pages) {
+ err = fuse_copy_fill(cs);
+ if (err)
+ return err;
+ } else {
+ return fuse_ref_page(cs, page, offset, count);
+ }
} else if (!cs->len) {
if (cs->move_pages && page &&
offset == 0 && count == PAGE_SIZE) {
@@ -932,10 +957,10 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
}
}
if (page) {
- void *mapaddr = kmap_atomic(page);
+ void *mapaddr = kmap_local_page(page);
void *buf = mapaddr + offset;
offset += fuse_copy_do(cs, &buf, &count);
- kunmap_atomic(mapaddr);
+ kunmap_local(mapaddr);
} else
offset += fuse_copy_do(cs, NULL, &count);
}
@@ -1251,10 +1276,19 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
/* SETXATTR is special, since it may contain too large data */
if (args->opcode == FUSE_SETXATTR)
req->out.h.error = -E2BIG;
- fuse_request_end(fc, req);
+ fuse_request_end(req);
goto restart;
}
spin_lock(&fpq->lock);
+ /*
+ * Must not put request on fpq->io queue after having been shut down by
+ * fuse_abort_conn()
+ */
+ if (!fpq->connected) {
+ req->out.h.error = err = -ECONNABORTED;
+ goto out_end;
+
+ }
list_add(&req->list, &fpq->io);
spin_unlock(&fpq->lock);
cs->req = req;
@@ -1285,8 +1319,8 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
/* matches barrier in request_wait_answer() */
smp_mb__after_atomic();
if (test_bit(FR_INTERRUPTED, &req->flags))
- queue_interrupt(fiq, req);
- fuse_put_request(fc, req);
+ queue_interrupt(req);
+ fuse_put_request(req);
return reqsize;
@@ -1294,7 +1328,7 @@ out_end:
if (!test_bit(FR_PRIVATE, &req->flags))
list_del_init(&req->list);
spin_unlock(&fpq->lock);
- fuse_request_end(fc, req);
+ fuse_request_end(req);
return err;
err_unlock:
@@ -1322,7 +1356,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to)
if (!fud)
return -EPERM;
- if (!iter_is_iovec(to))
+ if (!user_backed_iter(to))
return -EINVAL;
fuse_copy_init(&cs, 1, to);
@@ -1417,11 +1451,8 @@ static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
fuse_copy_finish(cs);
down_read(&fc->killsb);
- err = -ENOENT;
- if (fc->sb) {
- err = fuse_reverse_inval_inode(fc->sb, outarg.ino,
- outarg.off, outarg.len);
- }
+ err = fuse_reverse_inval_inode(fc, outarg.ino,
+ outarg.off, outarg.len);
up_read(&fc->killsb);
return err;
@@ -1467,9 +1498,7 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
buf[outarg.namelen] = 0;
down_read(&fc->killsb);
- err = -ENOENT;
- if (fc->sb)
- err = fuse_reverse_inval_entry(fc->sb, outarg.parent, 0, &name);
+ err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name);
up_read(&fc->killsb);
kfree(buf);
return err;
@@ -1517,10 +1546,7 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
buf[outarg.namelen] = 0;
down_read(&fc->killsb);
- err = -ENOENT;
- if (fc->sb)
- err = fuse_reverse_inval_entry(fc->sb, outarg.parent,
- outarg.child, &name);
+ err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name);
up_read(&fc->killsb);
kfree(buf);
return err;
@@ -1562,10 +1588,7 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
down_read(&fc->killsb);
err = -ENOENT;
- if (!fc->sb)
- goto out_up_killsb;
-
- inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
+ inode = fuse_ilookup(fc, nodeid, NULL);
if (!inode)
goto out_up_killsb;
@@ -1576,7 +1599,7 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
end = outarg.offset + outarg.size;
if (end > file_size) {
file_size = end;
- fuse_write_update_size(inode, file_size);
+ fuse_write_update_attr(inode, file_size, outarg.size);
}
num = outarg.size;
@@ -1622,7 +1645,7 @@ struct fuse_retrieve_args {
struct fuse_notify_retrieve_in inarg;
};
-static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_args *args,
+static void fuse_retrieve_end(struct fuse_mount *fm, struct fuse_args *args,
int error)
{
struct fuse_retrieve_args *ra =
@@ -1632,7 +1655,7 @@ static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_args *args,
kfree(ra);
}
-static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
+static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,
struct fuse_notify_retrieve_out *outarg)
{
int err;
@@ -1643,6 +1666,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
unsigned int offset;
size_t total_len = 0;
unsigned int num_pages;
+ struct fuse_conn *fc = fm->fc;
struct fuse_retrieve_args *ra;
size_t args_size = sizeof(*ra);
struct fuse_args_pages *ap;
@@ -1704,9 +1728,9 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
args->in_args[0].value = &ra->inarg;
args->in_args[1].size = total_len;
- err = fuse_simple_notify_reply(fc, args, outarg->notify_unique);
+ err = fuse_simple_notify_reply(fm, args, outarg->notify_unique);
if (err)
- fuse_retrieve_end(fc, args, err);
+ fuse_retrieve_end(fm, args, err);
return err;
}
@@ -1715,7 +1739,9 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
struct fuse_copy_state *cs)
{
struct fuse_notify_retrieve_out outarg;
+ struct fuse_mount *fm;
struct inode *inode;
+ u64 nodeid;
int err;
err = -EINVAL;
@@ -1730,14 +1756,12 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
down_read(&fc->killsb);
err = -ENOENT;
- if (fc->sb) {
- u64 nodeid = outarg.nodeid;
+ nodeid = outarg.nodeid;
- inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
- if (inode) {
- err = fuse_retrieve(fc, inode, &outarg);
- iput(inode);
- }
+ inode = fuse_ilookup(fc, nodeid, &fm);
+ if (inode) {
+ err = fuse_retrieve(fm, inode, &outarg);
+ iput(inode);
}
up_read(&fc->killsb);
@@ -1851,7 +1875,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
}
err = -EINVAL;
- if (oh.error <= -1000 || oh.error > 0)
+ if (oh.error <= -512 || oh.error > 0)
goto copy_finish;
spin_lock(&fpq->lock);
@@ -1876,9 +1900,9 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
else if (oh.error == -ENOSYS)
fc->no_interrupt = 1;
else if (oh.error == -EAGAIN)
- err = queue_interrupt(&fc->iq, req);
+ err = queue_interrupt(req);
- fuse_put_request(fc, req);
+ fuse_put_request(req);
goto copy_finish;
}
@@ -1908,7 +1932,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
list_del_init(&req->list);
spin_unlock(&fpq->lock);
- fuse_request_end(fc, req);
+ fuse_request_end(req);
out:
return err ? err : nbytes;
@@ -1925,7 +1949,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from)
if (!fud)
return -EPERM;
- if (!iter_is_iovec(from))
+ if (!user_backed_iter(from))
return -EINVAL;
fuse_copy_init(&cs, 0, from);
@@ -1977,8 +2001,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
struct pipe_buffer *ibuf;
struct pipe_buffer *obuf;
- BUG_ON(nbuf >= pipe->ring_size);
- BUG_ON(tail == head);
+ if (WARN_ON(nbuf >= count || tail == head))
+ goto out_free;
+
ibuf = &pipe->bufs[tail & mask];
obuf = &bufs[nbuf];
@@ -2014,8 +2039,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
pipe_lock(pipe);
out_free:
- for (idx = 0; idx < nbuf; idx++)
- pipe_buf_release(pipe, &bufs[idx]);
+ for (idx = 0; idx < nbuf; idx++) {
+ struct pipe_buffer *buf = &bufs[idx];
+
+ if (buf->ops)
+ pipe_buf_release(pipe, buf);
+ }
pipe_unlock(pipe);
kvfree(bufs);
@@ -2045,7 +2074,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait)
}
/* Abort all requests on the given list (pending or processing) */
-static void end_requests(struct fuse_conn *fc, struct list_head *head)
+static void end_requests(struct list_head *head)
{
while (!list_empty(head)) {
struct fuse_req *req;
@@ -2053,7 +2082,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head)
req->out.h.error = -ECONNABORTED;
clear_bit(FR_SENT, &req->flags);
list_del_init(&req->list);
- fuse_request_end(fc, req);
+ fuse_request_end(req);
}
}
@@ -2081,7 +2110,7 @@ static void end_polls(struct fuse_conn *fc)
* The same effect is usually achievable through killing the filesystem daemon
* and all users of the filesystem. The exception is the combination of an
* asynchronous request and the tricky deadlock (see
- * Documentation/filesystems/fuse.txt).
+ * Documentation/filesystems/fuse.rst).
*
* Aborting requests under I/O goes as follows: 1: Separate out unlocked
* requests, they should be finished off immediately. Locked requests will be
@@ -2148,7 +2177,7 @@ void fuse_abort_conn(struct fuse_conn *fc)
wake_up_all(&fc->blocked_waitq);
spin_unlock(&fc->lock);
- end_requests(fc, &to_end);
+ end_requests(&to_end);
} else {
spin_unlock(&fc->lock);
}
@@ -2178,7 +2207,7 @@ int fuse_dev_release(struct inode *inode, struct file *file)
list_splice_init(&fpq->processing[i], &to_end);
spin_unlock(&fpq->lock);
- end_requests(fc, &to_end);
+ end_requests(&to_end);
/* Are we the last open device? */
if (atomic_dec_and_test(&fc->dev_count)) {
@@ -2222,19 +2251,18 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new)
static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
- int err = -ENOTTY;
+ int res;
+ int oldfd;
+ struct fuse_dev *fud = NULL;
- if (cmd == FUSE_DEV_IOC_CLONE) {
- int oldfd;
-
- err = -EFAULT;
- if (!get_user(oldfd, (__u32 __user *) arg)) {
+ switch (cmd) {
+ case FUSE_DEV_IOC_CLONE:
+ res = -EFAULT;
+ if (!get_user(oldfd, (__u32 __user *)arg)) {
struct file *old = fget(oldfd);
- err = -EINVAL;
+ res = -EINVAL;
if (old) {
- struct fuse_dev *fud = NULL;
-
/*
* Check against file->f_op because CUSE
* uses the same ioctl handler.
@@ -2245,14 +2273,18 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
if (fud) {
mutex_lock(&fuse_mutex);
- err = fuse_device_clone(fud->fc, file);
+ res = fuse_device_clone(fud->fc, file);
mutex_unlock(&fuse_mutex);
}
fput(old);
}
}
+ break;
+ default:
+ res = -ENOTTY;
+ break;
}
- return err;
+ return res;
}
const struct file_operations fuse_dev_operations = {
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index de1e2fde60bd..bb97a384dc5d 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -10,12 +10,22 @@
#include <linux/pagemap.h>
#include <linux/file.h>
+#include <linux/fs_context.h>
+#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include <linux/iversion.h>
#include <linux/posix_acl.h>
+#include <linux/security.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+
+static bool __read_mostly allow_sys_admin_access;
+module_param(allow_sys_admin_access, bool, 0644);
+MODULE_PARM_DESC(allow_sys_admin_access,
+ "Allow users with CAP_SYS_ADMIN in initial userns to bypass allow_other access check");
static void fuse_advise_use_readdirplus(struct inode *dir)
{
@@ -115,7 +125,7 @@ u64 entry_attr_timeout(struct fuse_entry_out *o)
return time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
}
-static void fuse_invalidate_attr_mask(struct inode *inode, u32 mask)
+void fuse_invalidate_attr_mask(struct inode *inode, u32 mask)
{
set_mask_bits(&get_fuse_inode(inode)->inval_mask, 0, mask);
}
@@ -196,15 +206,15 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
{
struct inode *inode;
struct dentry *parent;
- struct fuse_conn *fc;
+ struct fuse_mount *fm;
struct fuse_inode *fi;
int ret;
inode = d_inode_rcu(entry);
- if (inode && is_bad_inode(inode))
+ if (inode && fuse_is_bad(inode))
goto invalid;
else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) ||
- (flags & LOOKUP_REVAL)) {
+ (flags & (LOOKUP_EXCL | LOOKUP_REVAL))) {
struct fuse_entry_out outarg;
FUSE_ARGS(args);
struct fuse_forget_link *forget;
@@ -218,27 +228,29 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
if (flags & LOOKUP_RCU)
goto out;
- fc = get_fuse_conn(inode);
+ fm = get_fuse_mount(inode);
forget = fuse_alloc_forget();
ret = -ENOMEM;
if (!forget)
goto out;
- attr_version = fuse_get_attr_version(fc);
+ attr_version = fuse_get_attr_version(fm->fc);
parent = dget_parent(entry);
- fuse_lookup_init(fc, &args, get_node_id(d_inode(parent)),
+ fuse_lookup_init(fm->fc, &args, get_node_id(d_inode(parent)),
&entry->d_name, &outarg);
- ret = fuse_simple_request(fc, &args);
+ ret = fuse_simple_request(fm, &args);
dput(parent);
/* Zero nodeid is same as -ENOENT */
if (!ret && !outarg.nodeid)
ret = -ENOENT;
if (!ret) {
fi = get_fuse_inode(inode);
- if (outarg.nodeid != get_node_id(inode)) {
- fuse_queue_forget(fc, forget, outarg.nodeid, 1);
+ if (outarg.nodeid != get_node_id(inode) ||
+ (bool) IS_AUTOMOUNT(inode) != (bool) (outarg.attr.flags & FUSE_ATTR_SUBMOUNT)) {
+ fuse_queue_forget(fm->fc, forget,
+ outarg.nodeid, 1);
goto invalid;
}
spin_lock(&fi->lock);
@@ -249,7 +261,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
if (ret == -ENOMEM)
goto out;
if (ret || fuse_invalid_attr(&outarg.attr) ||
- (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
+ fuse_stale_inode(inode, outarg.generation, &outarg.attr))
goto invalid;
forget_all_cached_acls(inode);
@@ -298,6 +310,33 @@ static int fuse_dentry_delete(const struct dentry *dentry)
return time_before64(fuse_dentry_time(dentry), get_jiffies_64());
}
+/*
+ * Create a fuse_mount object with a new superblock (with path->dentry
+ * as the root), and return that mount so it can be auto-mounted on
+ * @path.
+ */
+static struct vfsmount *fuse_dentry_automount(struct path *path)
+{
+ struct fs_context *fsc;
+ struct vfsmount *mnt;
+ struct fuse_inode *mp_fi = get_fuse_inode(d_inode(path->dentry));
+
+ fsc = fs_context_for_submount(path->mnt->mnt_sb->s_type, path->dentry);
+ if (IS_ERR(fsc))
+ return ERR_CAST(fsc);
+
+ /* Pass the FUSE inode of the mount for fuse_get_tree_submount() */
+ fsc->fs_private = mp_fi;
+
+ /* Create the submount */
+ mnt = fc_mount(fsc);
+ if (!IS_ERR(mnt))
+ mntget(mnt);
+
+ put_fs_context(fsc);
+ return mnt;
+}
+
const struct dentry_operations fuse_dentry_operations = {
.d_revalidate = fuse_dentry_revalidate,
.d_delete = fuse_dentry_delete,
@@ -305,6 +344,7 @@ const struct dentry_operations fuse_dentry_operations = {
.d_init = fuse_dentry_init,
.d_release = fuse_dentry_release,
#endif
+ .d_automount = fuse_dentry_automount,
};
const struct dentry_operations fuse_root_dentry_operations = {
@@ -329,7 +369,7 @@ bool fuse_invalid_attr(struct fuse_attr *attr)
int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name,
struct fuse_entry_out *outarg, struct inode **inode)
{
- struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
FUSE_ARGS(args);
struct fuse_forget_link *forget;
u64 attr_version;
@@ -346,10 +386,10 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
if (!forget)
goto out;
- attr_version = fuse_get_attr_version(fc);
+ attr_version = fuse_get_attr_version(fm->fc);
- fuse_lookup_init(fc, &args, nodeid, name, outarg);
- err = fuse_simple_request(fc, &args);
+ fuse_lookup_init(fm->fc, &args, nodeid, name, outarg);
+ err = fuse_simple_request(fm, &args);
/* Zero nodeid is same as -ENOENT, but with valid timeout */
if (err || !outarg->nodeid)
goto out_put_forget;
@@ -365,7 +405,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
attr_version);
err = -ENOMEM;
if (!*inode) {
- fuse_queue_forget(fc, forget, outarg->nodeid, 1);
+ fuse_queue_forget(fm->fc, forget, outarg->nodeid, 1);
goto out;
}
err = 0;
@@ -386,6 +426,9 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
bool outarg_valid = true;
bool locked;
+ if (fuse_is_bad(dir))
+ return ERR_PTR(-EIO);
+
locked = fuse_lock_inode(dir);
err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name,
&outarg, &inode);
@@ -422,6 +465,62 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
return ERR_PTR(err);
}
+static int get_security_context(struct dentry *entry, umode_t mode,
+ void **security_ctx, u32 *security_ctxlen)
+{
+ struct fuse_secctx *fctx;
+ struct fuse_secctx_header *header;
+ void *ctx = NULL, *ptr;
+ u32 ctxlen, total_len = sizeof(*header);
+ int err, nr_ctx = 0;
+ const char *name;
+ size_t namelen;
+
+ err = security_dentry_init_security(entry, mode, &entry->d_name,
+ &name, &ctx, &ctxlen);
+ if (err) {
+ if (err != -EOPNOTSUPP)
+ goto out_err;
+ /* No LSM is supporting this security hook. Ignore error */
+ ctxlen = 0;
+ ctx = NULL;
+ }
+
+ if (ctxlen) {
+ nr_ctx = 1;
+ namelen = strlen(name) + 1;
+ err = -EIO;
+ if (WARN_ON(namelen > XATTR_NAME_MAX + 1 || ctxlen > S32_MAX))
+ goto out_err;
+ total_len += FUSE_REC_ALIGN(sizeof(*fctx) + namelen + ctxlen);
+ }
+
+ err = -ENOMEM;
+ header = ptr = kzalloc(total_len, GFP_KERNEL);
+ if (!ptr)
+ goto out_err;
+
+ header->nr_secctx = nr_ctx;
+ header->size = total_len;
+ ptr += sizeof(*header);
+ if (nr_ctx) {
+ fctx = ptr;
+ fctx->size = ctxlen;
+ ptr += sizeof(*fctx);
+
+ strcpy(ptr, name);
+ ptr += namelen;
+
+ memcpy(ptr, ctx, ctxlen);
+ }
+ *security_ctxlen = total_len;
+ *security_ctx = header;
+ err = 0;
+out_err:
+ kfree(ctx);
+ return err;
+}
+
/*
* Atomic create+open operation
*
@@ -429,12 +528,12 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
* 'mknod' + 'open' requests.
*/
static int fuse_create_open(struct inode *dir, struct dentry *entry,
- struct file *file, unsigned flags,
- umode_t mode)
+ struct file *file, unsigned int flags,
+ umode_t mode, u32 opcode)
{
int err;
struct inode *inode;
- struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_mount *fm = get_fuse_mount(dir);
FUSE_ARGS(args);
struct fuse_forget_link *forget;
struct fuse_create_in inarg;
@@ -442,6 +541,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
struct fuse_entry_out outentry;
struct fuse_inode *fi;
struct fuse_file *ff;
+ void *security_ctx = NULL;
+ u32 security_ctxlen;
+ bool trunc = flags & O_TRUNC;
/* Userspace expects S_IFREG in create mode */
BUG_ON((mode & S_IFMT) != S_IFREG);
@@ -452,11 +554,11 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
goto out_err;
err = -ENOMEM;
- ff = fuse_file_alloc(fc);
+ ff = fuse_file_alloc(fm);
if (!ff)
goto out_put_forget_req;
- if (!fc->dont_mask)
+ if (!fm->fc->dont_mask)
mode &= ~current_umask();
flags &= ~O_NOCTTY;
@@ -465,7 +567,13 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
inarg.flags = flags;
inarg.mode = mode;
inarg.umask = current_umask();
- args.opcode = FUSE_CREATE;
+
+ if (fm->fc->handle_killpriv_v2 && trunc &&
+ !(flags & O_EXCL) && !capable(CAP_FSETID)) {
+ inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID;
+ }
+
+ args.opcode = opcode;
args.nodeid = get_node_id(dir);
args.in_numargs = 2;
args.in_args[0].size = sizeof(inarg);
@@ -477,7 +585,20 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
args.out_args[0].value = &outentry;
args.out_args[1].size = sizeof(outopen);
args.out_args[1].value = &outopen;
- err = fuse_simple_request(fc, &args);
+
+ if (fm->fc->init_security) {
+ err = get_security_context(entry, mode, &security_ctx,
+ &security_ctxlen);
+ if (err)
+ goto out_put_forget_req;
+
+ args.in_numargs = 3;
+ args.in_args[2].size = security_ctxlen;
+ args.in_args[2].value = security_ctx;
+ }
+
+ err = fuse_simple_request(fm, &args);
+ kfree(security_ctx);
if (err)
goto out_free_ff;
@@ -494,7 +615,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
if (!inode) {
flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
fuse_sync_release(NULL, ff, flags);
- fuse_queue_forget(fc, forget, outentry.nodeid, 1);
+ fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1);
err = -ENOMEM;
goto out_err;
}
@@ -509,6 +630,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
} else {
file->private_data = ff;
fuse_finish_open(inode, file);
+ if (fm->fc->atomic_o_trunc && trunc)
+ truncate_pagecache(inode, 0);
+ else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
+ invalidate_inode_pages2(inode->i_mapping);
}
return err;
@@ -520,7 +645,8 @@ out_err:
return err;
}
-static int fuse_mknod(struct inode *, struct dentry *, umode_t, dev_t);
+static int fuse_mknod(struct user_namespace *, struct inode *, struct dentry *,
+ umode_t, dev_t);
static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
struct file *file, unsigned flags,
umode_t mode)
@@ -529,6 +655,9 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
struct fuse_conn *fc = get_fuse_conn(dir);
struct dentry *res = NULL;
+ if (fuse_is_bad(dir))
+ return -EIO;
+
if (d_in_lookup(entry)) {
res = fuse_lookup(dir, entry, 0);
if (IS_ERR(res))
@@ -547,7 +676,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
if (fc->no_create)
goto mknod;
- err = fuse_create_open(dir, entry, file, flags, mode);
+ err = fuse_create_open(dir, entry, file, flags, mode, FUSE_CREATE);
if (err == -ENOSYS) {
fc->no_create = 1;
goto mknod;
@@ -557,7 +686,7 @@ out_dput:
return err;
mknod:
- err = fuse_mknod(dir, entry, mode, 0);
+ err = fuse_mknod(&init_user_ns, dir, entry, mode, 0);
if (err)
goto out_dput;
no_open:
@@ -567,7 +696,7 @@ no_open:
/*
* Code shared between mknod, mkdir, symlink and link
*/
-static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
+static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args,
struct inode *dir, struct dentry *entry,
umode_t mode)
{
@@ -576,6 +705,11 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
struct dentry *d;
int err;
struct fuse_forget_link *forget;
+ void *security_ctx = NULL;
+ u32 security_ctxlen;
+
+ if (fuse_is_bad(dir))
+ return -EIO;
forget = fuse_alloc_forget();
if (!forget)
@@ -586,7 +720,22 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
args->out_numargs = 1;
args->out_args[0].size = sizeof(outarg);
args->out_args[0].value = &outarg;
- err = fuse_simple_request(fc, args);
+
+ if (fm->fc->init_security && args->opcode != FUSE_LINK) {
+ err = get_security_context(entry, mode, &security_ctx,
+ &security_ctxlen);
+ if (err)
+ goto out_put_forget_req;
+
+ BUG_ON(args->in_numargs != 2);
+
+ args->in_numargs = 3;
+ args->in_args[2].size = security_ctxlen;
+ args->in_args[2].value = security_ctx;
+ }
+
+ err = fuse_simple_request(fm, args);
+ kfree(security_ctx);
if (err)
goto out_put_forget_req;
@@ -600,7 +749,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
&outarg.attr, entry_attr_timeout(&outarg), 0);
if (!inode) {
- fuse_queue_forget(fc, forget, outarg.nodeid, 1);
+ fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1);
return -ENOMEM;
}
kfree(forget);
@@ -624,14 +773,14 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
return err;
}
-static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode,
- dev_t rdev)
+static int fuse_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *entry, umode_t mode, dev_t rdev)
{
struct fuse_mknod_in inarg;
- struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_mount *fm = get_fuse_mount(dir);
FUSE_ARGS(args);
- if (!fc->dont_mask)
+ if (!fm->fc->dont_mask)
mode &= ~current_umask();
memset(&inarg, 0, sizeof(inarg));
@@ -644,22 +793,40 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode,
args.in_args[0].value = &inarg;
args.in_args[1].size = entry->d_name.len + 1;
args.in_args[1].value = entry->d_name.name;
- return create_new_entry(fc, &args, dir, entry, mode);
+ return create_new_entry(fm, &args, dir, entry, mode);
}
-static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode,
- bool excl)
+static int fuse_create(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *entry, umode_t mode, bool excl)
{
- return fuse_mknod(dir, entry, mode, 0);
+ return fuse_mknod(&init_user_ns, dir, entry, mode, 0);
}
-static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode)
+static int fuse_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+ struct file *file, umode_t mode)
{
- struct fuse_mkdir_in inarg;
struct fuse_conn *fc = get_fuse_conn(dir);
+ int err;
+
+ if (fc->no_tmpfile)
+ return -EOPNOTSUPP;
+
+ err = fuse_create_open(dir, file->f_path.dentry, file, file->f_flags, mode, FUSE_TMPFILE);
+ if (err == -ENOSYS) {
+ fc->no_tmpfile = 1;
+ err = -EOPNOTSUPP;
+ }
+ return err;
+}
+
+static int fuse_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *entry, umode_t mode)
+{
+ struct fuse_mkdir_in inarg;
+ struct fuse_mount *fm = get_fuse_mount(dir);
FUSE_ARGS(args);
- if (!fc->dont_mask)
+ if (!fm->fc->dont_mask)
mode &= ~current_umask();
memset(&inarg, 0, sizeof(inarg));
@@ -671,13 +838,13 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode)
args.in_args[0].value = &inarg;
args.in_args[1].size = entry->d_name.len + 1;
args.in_args[1].value = entry->d_name.name;
- return create_new_entry(fc, &args, dir, entry, S_IFDIR);
+ return create_new_entry(fm, &args, dir, entry, S_IFDIR);
}
-static int fuse_symlink(struct inode *dir, struct dentry *entry,
- const char *link)
+static int fuse_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *entry, const char *link)
{
- struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_mount *fm = get_fuse_mount(dir);
unsigned len = strlen(link) + 1;
FUSE_ARGS(args);
@@ -687,48 +854,72 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
args.in_args[0].value = entry->d_name.name;
args.in_args[1].size = len;
args.in_args[1].value = link;
- return create_new_entry(fc, &args, dir, entry, S_IFLNK);
+ return create_new_entry(fm, &args, dir, entry, S_IFLNK);
}
-void fuse_update_ctime(struct inode *inode)
+void fuse_flush_time_update(struct inode *inode)
+{
+ int err = sync_inode_metadata(inode, 1);
+
+ mapping_set_error(inode->i_mapping, err);
+}
+
+static void fuse_update_ctime_in_cache(struct inode *inode)
{
if (!IS_NOCMTIME(inode)) {
inode->i_ctime = current_time(inode);
mark_inode_dirty_sync(inode);
+ fuse_flush_time_update(inode);
}
}
+void fuse_update_ctime(struct inode *inode)
+{
+ fuse_invalidate_attr_mask(inode, STATX_CTIME);
+ fuse_update_ctime_in_cache(inode);
+}
+
+static void fuse_entry_unlinked(struct dentry *entry)
+{
+ struct inode *inode = d_inode(entry);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
+ /*
+ * If i_nlink == 0 then unlink doesn't make sense, yet this can
+ * happen if userspace filesystem is careless. It would be
+ * difficult to enforce correct nlink usage so just ignore this
+ * condition here
+ */
+ if (S_ISDIR(inode->i_mode))
+ clear_nlink(inode);
+ else if (inode->i_nlink > 0)
+ drop_nlink(inode);
+ spin_unlock(&fi->lock);
+ fuse_invalidate_entry_cache(entry);
+ fuse_update_ctime(inode);
+}
+
static int fuse_unlink(struct inode *dir, struct dentry *entry)
{
int err;
- struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_mount *fm = get_fuse_mount(dir);
FUSE_ARGS(args);
+ if (fuse_is_bad(dir))
+ return -EIO;
+
args.opcode = FUSE_UNLINK;
args.nodeid = get_node_id(dir);
args.in_numargs = 1;
args.in_args[0].size = entry->d_name.len + 1;
args.in_args[0].value = entry->d_name.name;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err) {
- struct inode *inode = d_inode(entry);
- struct fuse_inode *fi = get_fuse_inode(inode);
-
- spin_lock(&fi->lock);
- fi->attr_version = atomic64_inc_return(&fc->attr_version);
- /*
- * If i_nlink == 0 then unlink doesn't make sense, yet this can
- * happen if userspace filesystem is careless. It would be
- * difficult to enforce correct nlink usage so just ignore this
- * condition here
- */
- if (inode->i_nlink > 0)
- drop_nlink(inode);
- spin_unlock(&fi->lock);
- fuse_invalidate_attr(inode);
fuse_dir_changed(dir);
- fuse_invalidate_entry_cache(entry);
- fuse_update_ctime(inode);
+ fuse_entry_unlinked(entry);
} else if (err == -EINTR)
fuse_invalidate_entry(entry);
return err;
@@ -737,19 +928,21 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
static int fuse_rmdir(struct inode *dir, struct dentry *entry)
{
int err;
- struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_mount *fm = get_fuse_mount(dir);
FUSE_ARGS(args);
+ if (fuse_is_bad(dir))
+ return -EIO;
+
args.opcode = FUSE_RMDIR;
args.nodeid = get_node_id(dir);
args.in_numargs = 1;
args.in_args[0].size = entry->d_name.len + 1;
args.in_args[0].value = entry->d_name.name;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err) {
- clear_nlink(d_inode(entry));
fuse_dir_changed(dir);
- fuse_invalidate_entry_cache(entry);
+ fuse_entry_unlinked(entry);
} else if (err == -EINTR)
fuse_invalidate_entry(entry);
return err;
@@ -761,7 +954,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
{
int err;
struct fuse_rename2_in inarg;
- struct fuse_conn *fc = get_fuse_conn(olddir);
+ struct fuse_mount *fm = get_fuse_mount(olddir);
FUSE_ARGS(args);
memset(&inarg, 0, argsize);
@@ -776,27 +969,21 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
args.in_args[1].value = oldent->d_name.name;
args.in_args[2].size = newent->d_name.len + 1;
args.in_args[2].value = newent->d_name.name;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err) {
/* ctime changes */
- fuse_invalidate_attr(d_inode(oldent));
fuse_update_ctime(d_inode(oldent));
- if (flags & RENAME_EXCHANGE) {
- fuse_invalidate_attr(d_inode(newent));
+ if (flags & RENAME_EXCHANGE)
fuse_update_ctime(d_inode(newent));
- }
fuse_dir_changed(olddir);
if (olddir != newdir)
fuse_dir_changed(newdir);
/* newent will end up negative */
- if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) {
- fuse_invalidate_attr(d_inode(newent));
- fuse_invalidate_entry_cache(newent);
- fuse_update_ctime(d_inode(newent));
- }
+ if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent))
+ fuse_entry_unlinked(newent);
} else if (err == -EINTR) {
/* If request was interrupted, DEITY only knows if the
rename actually took place. If the invalidation
@@ -811,13 +998,16 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
return err;
}
-static int fuse_rename2(struct inode *olddir, struct dentry *oldent,
- struct inode *newdir, struct dentry *newent,
- unsigned int flags)
+static int fuse_rename2(struct user_namespace *mnt_userns, struct inode *olddir,
+ struct dentry *oldent, struct inode *newdir,
+ struct dentry *newent, unsigned int flags)
{
struct fuse_conn *fc = get_fuse_conn(olddir);
int err;
+ if (fuse_is_bad(olddir))
+ return -EIO;
+
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
@@ -847,7 +1037,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
int err;
struct fuse_link_in inarg;
struct inode *inode = d_inode(entry);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
memset(&inarg, 0, sizeof(inarg));
@@ -858,26 +1048,12 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
args.in_args[0].value = &inarg;
args.in_args[1].size = newent->d_name.len + 1;
args.in_args[1].value = newent->d_name.name;
- err = create_new_entry(fc, &args, newdir, newent, inode->i_mode);
- /* Contrary to "normal" filesystems it can happen that link
- makes two "logical" inodes point to the same "physical"
- inode. We invalidate the attributes of the old one, so it
- will reflect changes in the backing inode (link count,
- etc.)
- */
- if (!err) {
- struct fuse_inode *fi = get_fuse_inode(inode);
-
- spin_lock(&fi->lock);
- fi->attr_version = atomic64_inc_return(&fc->attr_version);
- if (likely(inode->i_nlink < UINT_MAX))
- inc_nlink(inode);
- spin_unlock(&fi->lock);
- fuse_invalidate_attr(inode);
- fuse_update_ctime(inode);
- } else if (err == -EINTR) {
+ err = create_new_entry(fm, &args, newdir, newent, inode->i_mode);
+ if (!err)
+ fuse_update_ctime_in_cache(inode);
+ else if (err == -EINTR)
fuse_invalidate_attr(inode);
- }
+
return err;
}
@@ -887,15 +1063,6 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
unsigned int blkbits;
struct fuse_conn *fc = get_fuse_conn(inode);
- /* see the comment in fuse_change_attributes() */
- if (fc->writeback_cache && S_ISREG(inode->i_mode)) {
- attr->size = i_size_read(inode);
- attr->mtime = inode->i_mtime.tv_sec;
- attr->mtimensec = inode->i_mtime.tv_nsec;
- attr->ctime = inode->i_ctime.tv_sec;
- attr->ctimensec = inode->i_ctime.tv_nsec;
- }
-
stat->dev = inode->i_sb->s_dev;
stat->ino = attr->ino;
stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
@@ -926,11 +1093,11 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
int err;
struct fuse_getattr_in inarg;
struct fuse_attr_out outarg;
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
u64 attr_version;
- attr_version = fuse_get_attr_version(fc);
+ attr_version = fuse_get_attr_version(fm->fc);
memset(&inarg, 0, sizeof(inarg));
memset(&outarg, 0, sizeof(outarg));
@@ -949,11 +1116,11 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err) {
if (fuse_invalid_attr(&outarg.attr) ||
- (inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
- make_bad_inode(inode);
+ inode_wrong_type(inode, outarg.attr.mode)) {
+ fuse_make_bad(inode);
err = -EIO;
} else {
fuse_change_attributes(inode, &outarg.attr,
@@ -973,12 +1140,14 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file,
struct fuse_inode *fi = get_fuse_inode(inode);
int err = 0;
bool sync;
+ u32 inval_mask = READ_ONCE(fi->inval_mask);
+ u32 cache_mask = fuse_get_cache_mask(inode);
if (flags & AT_STATX_FORCE_SYNC)
sync = true;
else if (flags & AT_STATX_DONT_SYNC)
sync = false;
- else if (request_mask & READ_ONCE(fi->inval_mask))
+ else if (request_mask & inval_mask & ~cache_mask)
sync = true;
else
sync = time_before64(fi->i_time, get_jiffies_64());
@@ -987,7 +1156,7 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file,
forget_all_cached_acls(inode);
err = fuse_do_getattr(inode, stat, file);
} else if (stat) {
- generic_fillattr(inode, stat);
+ generic_fillattr(&init_user_ns, inode, stat);
stat->mode = fi->orig_i_mode;
stat->ino = fi->orig_ino;
}
@@ -995,14 +1164,12 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file,
return err;
}
-int fuse_update_attributes(struct inode *inode, struct file *file)
+int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask)
{
- /* Do *not* need to get atime for internal purposes */
- return fuse_update_get_attr(inode, file, NULL,
- STATX_BASIC_STATS & ~STATX_ATIME, 0);
+ return fuse_update_get_attr(inode, file, NULL, mask, 0);
}
-int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
+int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
u64 child_nodeid, struct qstr *name)
{
int err = -ENOTDIR;
@@ -1010,11 +1177,11 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
struct dentry *dir;
struct dentry *entry;
- parent = ilookup5(sb, parent_nodeid, fuse_inode_eq, &parent_nodeid);
+ parent = fuse_ilookup(fc, parent_nodeid, NULL);
if (!parent)
return -ENOENT;
- inode_lock(parent);
+ inode_lock_nested(parent, I_MUTEX_PARENT);
if (!S_ISDIR(parent->i_mode))
goto unlock;
@@ -1085,6 +1252,9 @@ int fuse_allow_current_process(struct fuse_conn *fc)
{
const struct cred *cred;
+ if (allow_sys_admin_access && capable(CAP_SYS_ADMIN))
+ return 1;
+
if (fc->allow_other)
return current_in_userns(fc->user_ns);
@@ -1102,14 +1272,14 @@ int fuse_allow_current_process(struct fuse_conn *fc)
static int fuse_access(struct inode *inode, int mask)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_access_in inarg;
int err;
BUG_ON(mask & MAY_NOT_BLOCK);
- if (fc->no_access)
+ if (fm->fc->no_access)
return 0;
memset(&inarg, 0, sizeof(inarg));
@@ -1119,9 +1289,9 @@ static int fuse_access(struct inode *inode, int mask)
args.in_numargs = 1;
args.in_args[0].size = sizeof(inarg);
args.in_args[0].value = &inarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
- fc->no_access = 1;
+ fm->fc->no_access = 1;
err = 0;
}
return err;
@@ -1149,12 +1319,16 @@ static int fuse_perm_getattr(struct inode *inode, int mask)
* access request is sent. Execute permission is still checked
* locally based on file mode.
*/
-static int fuse_permission(struct inode *inode, int mask)
+static int fuse_permission(struct user_namespace *mnt_userns,
+ struct inode *inode, int mask)
{
struct fuse_conn *fc = get_fuse_conn(inode);
bool refreshed = false;
int err = 0;
+ if (fuse_is_bad(inode))
+ return -EIO;
+
if (!fuse_allow_current_process(fc))
return -EACCES;
@@ -1177,7 +1351,7 @@ static int fuse_permission(struct inode *inode, int mask)
}
if (fc->default_permissions) {
- err = generic_permission(inode, mask);
+ err = generic_permission(&init_user_ns, inode, mask);
/* If permission is denied, try to refresh file
attributes. This is also needed, because the root
@@ -1185,7 +1359,8 @@ static int fuse_permission(struct inode *inode, int mask)
if (err == -EACCES && !refreshed) {
err = fuse_perm_getattr(inode, mask);
if (!err)
- err = generic_permission(inode, mask);
+ err = generic_permission(&init_user_ns,
+ inode, mask);
}
/* Note: the opposite of the above test does not
@@ -1209,7 +1384,7 @@ static int fuse_permission(struct inode *inode, int mask)
static int fuse_readlink_page(struct inode *inode, struct page *page)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_page_desc desc = { .length = PAGE_SIZE - 1 };
struct fuse_args_pages ap = {
.num_pages = 1,
@@ -1226,7 +1401,7 @@ static int fuse_readlink_page(struct inode *inode, struct page *page)
ap.args.page_zeroing = true;
ap.args.out_numargs = 1;
ap.args.out_args[0].size = desc.length;
- res = fuse_simple_request(fc, &ap.args);
+ res = fuse_simple_request(fm, &ap.args);
fuse_invalidate_atime(inode);
@@ -1250,7 +1425,7 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode,
int err;
err = -EIO;
- if (is_bad_inode(inode))
+ if (fuse_is_bad(inode))
goto out_err;
if (fc->cache_symlinks)
@@ -1298,7 +1473,7 @@ static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end,
struct fuse_conn *fc = get_fuse_conn(inode);
int err;
- if (is_bad_inode(inode))
+ if (fuse_is_bad(inode))
return -EIO;
if (fc->no_fsyncdir)
@@ -1454,7 +1629,7 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args,
*/
int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_setattr_in inarg;
struct fuse_attr_out outarg;
@@ -1465,7 +1640,7 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
inarg.valid = FATTR_MTIME;
inarg.mtime = inode->i_mtime.tv_sec;
inarg.mtimensec = inode->i_mtime.tv_nsec;
- if (fc->minor >= 23) {
+ if (fm->fc->minor >= 23) {
inarg.valid |= FATTR_CTIME;
inarg.ctime = inode->i_ctime.tv_sec;
inarg.ctimensec = inode->i_ctime.tv_nsec;
@@ -1474,9 +1649,9 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
inarg.valid |= FATTR_FH;
inarg.fh = ff->fh;
}
- fuse_setattr_fill(fc, &args, inode, &inarg, &outarg);
+ fuse_setattr_fill(fm->fc, &args, inode, &inarg, &outarg);
- return fuse_simple_request(fc, &args);
+ return fuse_simple_request(fm, &args);
}
/*
@@ -1491,24 +1666,43 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
struct file *file)
{
struct inode *inode = d_inode(dentry);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_conn *fc = fm->fc;
struct fuse_inode *fi = get_fuse_inode(inode);
+ struct address_space *mapping = inode->i_mapping;
FUSE_ARGS(args);
struct fuse_setattr_in inarg;
struct fuse_attr_out outarg;
bool is_truncate = false;
- bool is_wb = fc->writeback_cache;
+ bool is_wb = fc->writeback_cache && S_ISREG(inode->i_mode);
loff_t oldsize;
int err;
- bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode);
+ bool trust_local_cmtime = is_wb;
+ bool fault_blocked = false;
if (!fc->default_permissions)
attr->ia_valid |= ATTR_FORCE;
- err = setattr_prepare(dentry, attr);
+ err = setattr_prepare(&init_user_ns, dentry, attr);
if (err)
return err;
+ if (attr->ia_valid & ATTR_SIZE) {
+ if (WARN_ON(!S_ISREG(inode->i_mode)))
+ return -EIO;
+ is_truncate = true;
+ }
+
+ if (FUSE_IS_DAX(inode) && is_truncate) {
+ filemap_invalidate_lock(mapping);
+ fault_blocked = true;
+ err = fuse_dax_break_layouts(inode, 0, 0);
+ if (err) {
+ filemap_invalidate_unlock(mapping);
+ return err;
+ }
+ }
+
if (attr->ia_valid & ATTR_OPEN) {
/* This is coming from open(..., ... | O_TRUNC); */
WARN_ON(!(attr->ia_valid & ATTR_SIZE));
@@ -1521,19 +1715,13 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
*/
i_size_write(inode, 0);
truncate_pagecache(inode, 0);
- return 0;
+ goto out;
}
file = NULL;
}
- if (attr->ia_valid & ATTR_SIZE) {
- if (WARN_ON(!S_ISREG(inode->i_mode)))
- return -EIO;
- is_truncate = true;
- }
-
/* Flush dirty data/metadata before non-truncate SETATTR */
- if (is_wb && S_ISREG(inode->i_mode) &&
+ if (is_wb &&
attr->ia_valid &
(ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_MTIME_SET |
ATTR_TIMES_SET)) {
@@ -1560,13 +1748,23 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
inarg.valid |= FATTR_FH;
inarg.fh = ff->fh;
}
+
+ /* Kill suid/sgid for non-directory chown unconditionally */
+ if (fc->handle_killpriv_v2 && !S_ISDIR(inode->i_mode) &&
+ attr->ia_valid & (ATTR_UID | ATTR_GID))
+ inarg.valid |= FATTR_KILL_SUIDGID;
+
if (attr->ia_valid & ATTR_SIZE) {
/* For mandatory locking in truncate */
inarg.valid |= FATTR_LOCKOWNER;
inarg.lock_owner = fuse_lock_owner_id(fc, current->files);
+
+ /* Kill suid/sgid for truncate only if no CAP_FSETID */
+ if (fc->handle_killpriv_v2 && !capable(CAP_FSETID))
+ inarg.valid |= FATTR_KILL_SUIDGID;
}
fuse_setattr_fill(fc, &args, inode, &inarg, &outarg);
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err) {
if (err == -EINTR)
fuse_invalidate_attr(inode);
@@ -1574,8 +1772,8 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
}
if (fuse_invalid_attr(&outarg.attr) ||
- (inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
- make_bad_inode(inode);
+ inode_wrong_type(inode, outarg.attr.mode)) {
+ fuse_make_bad(inode);
err = -EIO;
goto error;
}
@@ -1591,10 +1789,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
}
fuse_change_attributes_common(inode, &outarg.attr,
- attr_timeout(&outarg));
+ attr_timeout(&outarg),
+ fuse_get_cache_mask(inode));
oldsize = inode->i_size;
/* see the comment in fuse_change_attributes() */
- if (!is_wb || is_truncate || !S_ISREG(inode->i_mode))
+ if (!is_wb || is_truncate)
i_size_write(inode, outarg.attr.size);
if (is_truncate) {
@@ -1605,15 +1804,19 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
/*
* Only call invalidate_inode_pages2() after removing
- * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
+ * FUSE_NOWRITE, otherwise fuse_launder_folio() would deadlock.
*/
if ((is_truncate || !is_wb) &&
S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
truncate_pagecache(inode, outarg.attr.size);
- invalidate_inode_pages2(inode->i_mapping);
+ invalidate_inode_pages2(mapping);
}
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+out:
+ if (fault_blocked)
+ filemap_invalidate_unlock(mapping);
+
return 0;
error:
@@ -1621,16 +1824,23 @@ error:
fuse_release_nowrite(inode);
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+
+ if (fault_blocked)
+ filemap_invalidate_unlock(mapping);
return err;
}
-static int fuse_setattr(struct dentry *entry, struct iattr *attr)
+static int fuse_setattr(struct user_namespace *mnt_userns, struct dentry *entry,
+ struct iattr *attr)
{
struct inode *inode = d_inode(entry);
struct fuse_conn *fc = get_fuse_conn(inode);
struct file *file = (attr->ia_valid & ATTR_FILE) ? attr->ia_file : NULL;
int ret;
+ if (fuse_is_bad(inode))
+ return -EIO;
+
if (!fuse_allow_current_process(get_fuse_conn(inode)))
return -EACCES;
@@ -1644,7 +1854,7 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr)
*
* This should be done on write(), truncate() and chown().
*/
- if (!fc->handle_killpriv) {
+ if (!fc->handle_killpriv && !fc->handle_killpriv_v2) {
/*
* ia_mode calculation may have used stale i_mode.
* Refresh and recalculate.
@@ -1683,14 +1893,28 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr)
return ret;
}
-static int fuse_getattr(const struct path *path, struct kstat *stat,
+static int fuse_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags)
{
struct inode *inode = d_inode(path->dentry);
struct fuse_conn *fc = get_fuse_conn(inode);
- if (!fuse_allow_current_process(fc))
+ if (fuse_is_bad(inode))
+ return -EIO;
+
+ if (!fuse_allow_current_process(fc)) {
+ if (!request_mask) {
+ /*
+ * If user explicitly requested *nothing* then don't
+ * error out, but return st_dev only.
+ */
+ stat->result_mask = 0;
+ stat->dev = inode->i_sb->s_dev;
+ return 0;
+ }
return -EACCES;
+ }
return fuse_update_get_attr(inode, NULL, stat, request_mask, flags);
}
@@ -1706,12 +1930,15 @@ static const struct inode_operations fuse_dir_inode_operations = {
.setattr = fuse_setattr,
.create = fuse_create,
.atomic_open = fuse_atomic_open,
+ .tmpfile = fuse_tmpfile,
.mknod = fuse_mknod,
.permission = fuse_permission,
.getattr = fuse_getattr,
.listxattr = fuse_listxattr,
.get_acl = fuse_get_acl,
.set_acl = fuse_set_acl,
+ .fileattr_get = fuse_fileattr_get,
+ .fileattr_set = fuse_fileattr_set,
};
static const struct file_operations fuse_dir_operations = {
@@ -1732,6 +1959,8 @@ static const struct inode_operations fuse_common_inode_operations = {
.listxattr = fuse_listxattr,
.get_acl = fuse_get_acl,
.set_acl = fuse_set_acl,
+ .fileattr_get = fuse_fileattr_get,
+ .fileattr_set = fuse_fileattr_set,
};
static const struct inode_operations fuse_symlink_inode_operations = {
@@ -1760,20 +1989,20 @@ void fuse_init_dir(struct inode *inode)
fi->rdc.version = 0;
}
-static int fuse_symlink_readpage(struct file *null, struct page *page)
+static int fuse_symlink_read_folio(struct file *null, struct folio *folio)
{
- int err = fuse_readlink_page(page->mapping->host, page);
+ int err = fuse_readlink_page(folio->mapping->host, &folio->page);
if (!err)
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
- unlock_page(page);
+ folio_unlock(folio);
return err;
}
static const struct address_space_operations fuse_symlink_aops = {
- .readpage = fuse_symlink_readpage,
+ .read_folio = fuse_symlink_read_folio,
};
void fuse_init_symlink(struct inode *inode)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 9d67b830fb7a..71bfb663aac5 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -14,33 +14,28 @@
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/module.h>
-#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/falloc.h>
#include <linux/uio.h>
+#include <linux/fs.h>
-static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags,
- struct fuse_page_desc **desc)
-{
- struct page **pages;
-
- pages = kzalloc(npages * (sizeof(struct page *) +
- sizeof(struct fuse_page_desc)), flags);
- *desc = (void *) (pages + npages);
-
- return pages;
-}
-
-static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
- int opcode, struct fuse_open_out *outargp)
+static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
+ unsigned int open_flags, int opcode,
+ struct fuse_open_out *outargp)
{
struct fuse_open_in inarg;
FUSE_ARGS(args);
memset(&inarg, 0, sizeof(inarg));
- inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
- if (!fc->atomic_o_trunc)
+ inarg.flags = open_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
+ if (!fm->fc->atomic_o_trunc)
inarg.flags &= ~O_TRUNC;
+
+ if (fm->fc->handle_killpriv_v2 &&
+ (inarg.flags & O_TRUNC) && !capable(CAP_FSETID)) {
+ inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID;
+ }
+
args.opcode = opcode;
args.nodeid = nodeid;
args.in_numargs = 1;
@@ -50,7 +45,7 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
args.out_args[0].size = sizeof(*outargp);
args.out_args[0].value = outargp;
- return fuse_simple_request(fc, &args);
+ return fuse_simple_request(fm, &args);
}
struct fuse_release_args {
@@ -59,7 +54,7 @@ struct fuse_release_args {
struct inode *inode;
};
-struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
+struct fuse_file *fuse_file_alloc(struct fuse_mount *fm)
{
struct fuse_file *ff;
@@ -67,7 +62,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
if (unlikely(!ff))
return NULL;
- ff->fc = fc;
+ ff->fm = fm;
ff->release_args = kzalloc(sizeof(*ff->release_args),
GFP_KERNEL_ACCOUNT);
if (!ff->release_args) {
@@ -81,7 +76,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
RB_CLEAR_NODE(&ff->polled_node);
init_waitqueue_head(&ff->poll_wait);
- ff->kh = atomic64_inc_return(&fc->khctr);
+ ff->kh = atomic64_inc_return(&fm->fc->khctr);
return ff;
}
@@ -99,7 +94,7 @@ static struct fuse_file *fuse_file_get(struct fuse_file *ff)
return ff;
}
-static void fuse_release_end(struct fuse_conn *fc, struct fuse_args *args,
+static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args,
int error)
{
struct fuse_release_args *ra = container_of(args, typeof(*ra), args);
@@ -113,31 +108,32 @@ static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
if (refcount_dec_and_test(&ff->count)) {
struct fuse_args *args = &ff->release_args->args;
- if (isdir ? ff->fc->no_opendir : ff->fc->no_open) {
+ if (isdir ? ff->fm->fc->no_opendir : ff->fm->fc->no_open) {
/* Do nothing when client does not implement 'open' */
- fuse_release_end(ff->fc, args, 0);
+ fuse_release_end(ff->fm, args, 0);
} else if (sync) {
- fuse_simple_request(ff->fc, args);
- fuse_release_end(ff->fc, args, 0);
+ fuse_simple_request(ff->fm, args);
+ fuse_release_end(ff->fm, args, 0);
} else {
args->end = fuse_release_end;
- if (fuse_simple_background(ff->fc, args,
+ if (fuse_simple_background(ff->fm, args,
GFP_KERNEL | __GFP_NOFAIL))
- fuse_release_end(ff->fc, args, -ENOTCONN);
+ fuse_release_end(ff->fm, args, -ENOTCONN);
}
kfree(ff);
}
}
-int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
- bool isdir)
+struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
+ unsigned int open_flags, bool isdir)
{
+ struct fuse_conn *fc = fm->fc;
struct fuse_file *ff;
int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
- ff = fuse_file_alloc(fc);
+ ff = fuse_file_alloc(fm);
if (!ff)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
ff->fh = 0;
/* Default for no-open */
@@ -146,14 +142,14 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
struct fuse_open_out outarg;
int err;
- err = fuse_send_open(fc, nodeid, file, opcode, &outarg);
+ err = fuse_send_open(fm, nodeid, open_flags, opcode, &outarg);
if (!err) {
ff->fh = outarg.fh;
ff->open_flags = outarg.open_flags;
} else if (err != -ENOSYS) {
fuse_file_free(ff);
- return err;
+ return ERR_PTR(err);
} else {
if (isdir)
fc->no_opendir = 1;
@@ -166,9 +162,19 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
ff->open_flags &= ~FOPEN_DIRECT_IO;
ff->nodeid = nodeid;
- file->private_data = ff;
- return 0;
+ return ff;
+}
+
+int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
+ bool isdir)
+{
+ struct fuse_file *ff = fuse_file_open(fm, nodeid, file->f_flags, isdir);
+
+ if (!IS_ERR(ff))
+ file->private_data = ff;
+
+ return PTR_ERR_OR_ZERO(ff);
}
EXPORT_SYMBOL_GPL(fuse_do_open);
@@ -192,12 +198,11 @@ void fuse_finish_open(struct inode *inode, struct file *file)
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = get_fuse_conn(inode);
- if (!(ff->open_flags & FOPEN_KEEP_CACHE))
- invalidate_inode_pages2(inode->i_mapping);
if (ff->open_flags & FOPEN_STREAM)
stream_open(inode, file);
else if (ff->open_flags & FOPEN_NONSEEKABLE)
nonseekable_open(inode, file);
+
if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -205,9 +210,8 @@ void fuse_finish_open(struct inode *inode, struct file *file)
fi->attr_version = atomic64_inc_return(&fc->attr_version);
i_size_write(inode, 0);
spin_unlock(&fi->lock);
- fuse_invalidate_attr(inode);
- if (fc->writeback_cache)
- file_update_time(file);
+ file_update_time(file);
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
}
if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
fuse_link_write_file(file);
@@ -215,38 +219,62 @@ void fuse_finish_open(struct inode *inode, struct file *file)
int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_conn *fc = fm->fc;
int err;
bool is_wb_truncate = (file->f_flags & O_TRUNC) &&
fc->atomic_o_trunc &&
fc->writeback_cache;
+ bool dax_truncate = (file->f_flags & O_TRUNC) &&
+ fc->atomic_o_trunc && FUSE_IS_DAX(inode);
+
+ if (fuse_is_bad(inode))
+ return -EIO;
err = generic_file_open(inode, file);
if (err)
return err;
- if (is_wb_truncate) {
+ if (is_wb_truncate || dax_truncate)
inode_lock(inode);
- fuse_set_nowrite(inode);
+
+ if (dax_truncate) {
+ filemap_invalidate_lock(inode->i_mapping);
+ err = fuse_dax_break_layouts(inode, 0, 0);
+ if (err)
+ goto out_inode_unlock;
}
- err = fuse_do_open(fc, get_node_id(inode), file, isdir);
+ if (is_wb_truncate || dax_truncate)
+ fuse_set_nowrite(inode);
+ err = fuse_do_open(fm, get_node_id(inode), file, isdir);
if (!err)
fuse_finish_open(inode, file);
- if (is_wb_truncate) {
+ if (is_wb_truncate || dax_truncate)
fuse_release_nowrite(inode);
- inode_unlock(inode);
+ if (!err) {
+ struct fuse_file *ff = file->private_data;
+
+ if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC))
+ truncate_pagecache(inode, 0);
+ else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
+ invalidate_inode_pages2(inode->i_mapping);
}
+ if (dax_truncate)
+ filemap_invalidate_unlock(inode->i_mapping);
+out_inode_unlock:
+ if (is_wb_truncate || dax_truncate)
+ inode_unlock(inode);
return err;
}
static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
- int flags, int opcode)
+ unsigned int flags, int opcode)
{
- struct fuse_conn *fc = ff->fc;
+ struct fuse_conn *fc = ff->fm->fc;
struct fuse_release_args *ra = ff->release_args;
/* Inode is NULL on error path of fuse_create_open() */
@@ -273,22 +301,21 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
ra->args.nocreds = true;
}
-void fuse_release_common(struct file *file, bool isdir)
+void fuse_file_release(struct inode *inode, struct fuse_file *ff,
+ unsigned int open_flags, fl_owner_t id, bool isdir)
{
- struct fuse_inode *fi = get_fuse_inode(file_inode(file));
- struct fuse_file *ff = file->private_data;
+ struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_release_args *ra = ff->release_args;
int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
- fuse_prepare_release(fi, ff, file->f_flags, opcode);
+ fuse_prepare_release(fi, ff, open_flags, opcode);
if (ff->flock) {
ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
- ra->inarg.lock_owner = fuse_lock_owner_id(ff->fc,
- (fl_owner_t) file);
+ ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id);
}
/* Hold inode until release is finished */
- ra->inode = igrab(file_inode(file));
+ ra->inode = igrab(inode);
/*
* Normally this will send the RELEASE request, however if
@@ -299,7 +326,13 @@ void fuse_release_common(struct file *file, bool isdir)
* synchronous RELEASE is allowed (and desirable) in this case
* because the server can be trusted not to screw up.
*/
- fuse_file_put(ff, ff->fc->destroy, isdir);
+ fuse_file_put(ff, ff->fm->fc->destroy, isdir);
+}
+
+void fuse_release_common(struct file *file, bool isdir)
+{
+ fuse_file_release(file_inode(file), file->private_data, file->f_flags,
+ (fl_owner_t) file, isdir);
}
static int fuse_open(struct inode *inode, struct file *file)
@@ -311,7 +344,10 @@ static int fuse_release(struct inode *inode, struct file *file)
{
struct fuse_conn *fc = get_fuse_conn(inode);
- /* see fuse_vma_close() for !writeback_cache case */
+ /*
+ * Dirty pages might remain despite write_inode_now() call from
+ * fuse_flush() due to writes racing with the close.
+ */
if (fc->writeback_cache)
write_inode_now(inode, 1);
@@ -321,7 +357,8 @@ static int fuse_release(struct inode *inode, struct file *file)
return 0;
}
-void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags)
+void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff,
+ unsigned int flags)
{
WARN_ON(refcount_read(&ff->count) > 1);
fuse_prepare_release(fi, ff, flags, FUSE_RELEASE);
@@ -357,26 +394,33 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
struct fuse_writepage_args {
struct fuse_io_args ia;
- struct list_head writepages_entry;
+ struct rb_node writepages_entry;
struct list_head queue_entry;
struct fuse_writepage_args *next;
struct inode *inode;
+ struct fuse_sync_bucket *bucket;
};
static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
pgoff_t idx_from, pgoff_t idx_to)
{
- struct fuse_writepage_args *wpa;
+ struct rb_node *n;
+
+ n = fi->writepages.rb_node;
- list_for_each_entry(wpa, &fi->writepages, writepages_entry) {
+ while (n) {
+ struct fuse_writepage_args *wpa;
pgoff_t curr_index;
+ wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry);
WARN_ON(get_fuse_inode(wpa->inode) != fi);
curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT;
- if (idx_from < curr_index + wpa->ia.ap.num_pages &&
- curr_index <= idx_to) {
+ if (idx_from >= curr_index + wpa->ia.ap.num_pages)
+ n = n->rb_right;
+ else if (idx_to < curr_index)
+ n = n->rb_left;
+ else
return wpa;
- }
}
return NULL;
}
@@ -436,16 +480,16 @@ static void fuse_sync_writes(struct inode *inode)
static int fuse_flush(struct file *file, fl_owner_t id)
{
struct inode *inode = file_inode(file);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_file *ff = file->private_data;
struct fuse_flush_in inarg;
FUSE_ARGS(args);
int err;
- if (is_bad_inode(inode))
+ if (fuse_is_bad(inode))
return -EIO;
- if (fc->no_flush)
+ if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache)
return 0;
err = write_inode_now(inode, 1);
@@ -460,9 +504,13 @@ static int fuse_flush(struct file *file, fl_owner_t id)
if (err)
return err;
+ err = 0;
+ if (fm->fc->no_flush)
+ goto inval_attr_out;
+
memset(&inarg, 0, sizeof(inarg));
inarg.fh = ff->fh;
- inarg.lock_owner = fuse_lock_owner_id(fc, id);
+ inarg.lock_owner = fuse_lock_owner_id(fm->fc, id);
args.opcode = FUSE_FLUSH;
args.nodeid = get_node_id(inode);
args.in_numargs = 1;
@@ -470,11 +518,19 @@ static int fuse_flush(struct file *file, fl_owner_t id)
args.in_args[0].value = &inarg;
args.force = true;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
- fc->no_flush = 1;
+ fm->fc->no_flush = 1;
err = 0;
}
+
+inval_attr_out:
+ /*
+ * In memory i_blocks is not maintained by fuse, if writeback cache is
+ * enabled, i_blocks from cached attr may not be accurate.
+ */
+ if (!err && fm->fc->writeback_cache)
+ fuse_invalidate_attr_mask(inode, STATX_BLOCKS);
return err;
}
@@ -482,7 +538,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
int datasync, int opcode)
{
struct inode *inode = file->f_mapping->host;
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_file *ff = file->private_data;
FUSE_ARGS(args);
struct fuse_fsync_in inarg;
@@ -495,7 +551,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
args.in_numargs = 1;
args.in_args[0].size = sizeof(inarg);
args.in_args[0].value = &inarg;
- return fuse_simple_request(fc, &args);
+ return fuse_simple_request(fm, &args);
}
static int fuse_fsync(struct file *file, loff_t start, loff_t end,
@@ -505,7 +561,7 @@ static int fuse_fsync(struct file *file, loff_t start, loff_t end,
struct fuse_conn *fc = get_fuse_conn(inode);
int err;
- if (is_bad_inode(inode))
+ if (fuse_is_bad(inode))
return -EIO;
inode_lock(inode);
@@ -602,7 +658,7 @@ static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
* == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1.
*
* An example:
- * User requested DIO read of 64K. It was splitted into two 32K fuse requests,
+ * User requested DIO read of 64K. It was split into two 32K fuse requests,
* both submitted asynchronously. The first of them was ACKed by userspace as
* fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The
* second request was ACKed as short, e.g. only 1K was read, resulting in
@@ -640,7 +696,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
spin_unlock(&fi->lock);
}
- io->iocb->ki_complete(io->iocb, res, 0);
+ io->iocb->ki_complete(io->iocb, res);
}
kref_put(&io->refcnt, fuse_io_release);
@@ -670,7 +726,7 @@ static void fuse_io_free(struct fuse_io_args *ia)
kfree(ia);
}
-static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args,
+static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args,
int err)
{
struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
@@ -699,7 +755,7 @@ static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args,
fuse_io_free(ia);
}
-static ssize_t fuse_async_req_send(struct fuse_conn *fc,
+static ssize_t fuse_async_req_send(struct fuse_mount *fm,
struct fuse_io_args *ia, size_t num_bytes)
{
ssize_t err;
@@ -712,9 +768,10 @@ static ssize_t fuse_async_req_send(struct fuse_conn *fc,
spin_unlock(&io->lock);
ia->ap.args.end = fuse_aio_complete_req;
- err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL);
+ ia->ap.args.may_block = io->should_dirty;
+ err = fuse_simple_background(fm, &ia->ap.args, GFP_KERNEL);
if (err)
- fuse_aio_complete_req(fc, &ia->ap.args, err);
+ fuse_aio_complete_req(fm, &ia->ap.args, err);
return num_bytes;
}
@@ -724,18 +781,18 @@ static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count,
{
struct file *file = ia->io->iocb->ki_filp;
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
if (owner != NULL) {
ia->read.in.read_flags |= FUSE_READ_LOCKOWNER;
- ia->read.in.lock_owner = fuse_lock_owner_id(fc, owner);
+ ia->read.in.lock_owner = fuse_lock_owner_id(fm->fc, owner);
}
if (ia->io->async)
- return fuse_async_req_send(fc, ia, count);
+ return fuse_async_req_send(fm, ia, count);
- return fuse_simple_request(fc, &ia->ap.args);
+ return fuse_simple_request(fm, &ia->ap.args);
}
static void fuse_read_update_size(struct inode *inode, loff_t size,
@@ -745,7 +802,7 @@ static void fuse_read_update_size(struct inode *inode, loff_t size,
struct fuse_inode *fi = get_fuse_inode(inode);
spin_lock(&fi->lock);
- if (attr_ver == fi->attr_version && size < inode->i_size &&
+ if (attr_ver >= fi->attr_version && size < inode->i_size &&
!test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
fi->attr_version = atomic64_inc_return(&fc->attr_version);
i_size_write(inode, size);
@@ -758,21 +815,12 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
{
struct fuse_conn *fc = get_fuse_conn(inode);
- if (fc->writeback_cache) {
- /*
- * A hole in a file. Some data after the hole are in page cache,
- * but have not reached the client fs yet. So, the hole is not
- * present there.
- */
- int i;
- int start_idx = num_read >> PAGE_SHIFT;
- size_t off = num_read & (PAGE_SIZE - 1);
-
- for (i = start_idx; i < ap->num_pages; i++) {
- zero_user_segment(ap->pages[i], off, PAGE_SIZE);
- off = 0;
- }
- } else {
+ /*
+ * If writeback_cache is enabled, a short read means there's a hole in
+ * the file. Some data after the hole is in page cache, but has not
+ * reached the client fs yet. So the hole is not present there.
+ */
+ if (!fc->writeback_cache) {
loff_t pos = page_offset(ap->pages[0]) + num_read;
fuse_read_update_size(inode, pos, attr_ver);
}
@@ -781,7 +829,7 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
static int fuse_do_readpage(struct file *file, struct page *page)
{
struct inode *inode = page->mapping->host;
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
loff_t pos = page_offset(page);
struct fuse_page_desc desc = { .length = PAGE_SIZE };
struct fuse_io_args ia = {
@@ -801,14 +849,14 @@ static int fuse_do_readpage(struct file *file, struct page *page)
*/
fuse_wait_on_page_writeback(inode, page->index);
- attr_ver = fuse_get_attr_version(fc);
+ attr_ver = fuse_get_attr_version(fm->fc);
/* Don't overflow end offset */
if (pos + (desc.length - 1) == LLONG_MAX)
desc.length--;
fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ);
- res = fuse_simple_request(fc, &ia.ap.args);
+ res = fuse_simple_request(fm, &ia.ap.args);
if (res < 0)
return res;
/*
@@ -822,13 +870,14 @@ static int fuse_do_readpage(struct file *file, struct page *page)
return 0;
}
-static int fuse_readpage(struct file *file, struct page *page)
+static int fuse_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = page->mapping->host;
int err;
err = -EIO;
- if (is_bad_inode(inode))
+ if (fuse_is_bad(inode))
goto out;
err = fuse_do_readpage(file, page);
@@ -838,7 +887,7 @@ static int fuse_readpage(struct file *file, struct page *page)
return err;
}
-static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args,
+static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
int err)
{
int i;
@@ -882,7 +931,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args,
static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
{
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
struct fuse_args_pages *ap = &ia->ap;
loff_t pos = page_offset(ap->pages[0]);
size_t count = ap->num_pages << PAGE_SHIFT;
@@ -901,98 +950,62 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
WARN_ON((loff_t) (pos + count) < 0);
fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
- ia->read.attr_ver = fuse_get_attr_version(fc);
- if (fc->async_read) {
+ ia->read.attr_ver = fuse_get_attr_version(fm->fc);
+ if (fm->fc->async_read) {
ia->ff = fuse_file_get(ff);
ap->args.end = fuse_readpages_end;
- err = fuse_simple_background(fc, &ap->args, GFP_KERNEL);
+ err = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
if (!err)
return;
} else {
- res = fuse_simple_request(fc, &ap->args);
+ res = fuse_simple_request(fm, &ap->args);
err = res < 0 ? res : 0;
}
- fuse_readpages_end(fc, &ap->args, err);
+ fuse_readpages_end(fm, &ap->args, err);
}
-struct fuse_fill_data {
- struct fuse_io_args *ia;
- struct file *file;
- struct inode *inode;
- unsigned int nr_pages;
- unsigned int max_pages;
-};
-
-static int fuse_readpages_fill(void *_data, struct page *page)
+static void fuse_readahead(struct readahead_control *rac)
{
- struct fuse_fill_data *data = _data;
- struct fuse_io_args *ia = data->ia;
- struct fuse_args_pages *ap = &ia->ap;
- struct inode *inode = data->inode;
+ struct inode *inode = rac->mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
+ unsigned int i, max_pages, nr_pages = 0;
- fuse_wait_on_page_writeback(inode, page->index);
+ if (fuse_is_bad(inode))
+ return;
- if (ap->num_pages &&
- (ap->num_pages == fc->max_pages ||
- (ap->num_pages + 1) * PAGE_SIZE > fc->max_read ||
- ap->pages[ap->num_pages - 1]->index + 1 != page->index)) {
- data->max_pages = min_t(unsigned int, data->nr_pages,
- fc->max_pages);
- fuse_send_readpages(ia, data->file);
- data->ia = ia = fuse_io_alloc(NULL, data->max_pages);
- if (!ia) {
- unlock_page(page);
- return -ENOMEM;
- }
- ap = &ia->ap;
- }
+ max_pages = min_t(unsigned int, fc->max_pages,
+ fc->max_read / PAGE_SIZE);
- if (WARN_ON(ap->num_pages >= data->max_pages)) {
- unlock_page(page);
- fuse_io_free(ia);
- return -EIO;
- }
+ for (;;) {
+ struct fuse_io_args *ia;
+ struct fuse_args_pages *ap;
- get_page(page);
- ap->pages[ap->num_pages] = page;
- ap->descs[ap->num_pages].length = PAGE_SIZE;
- ap->num_pages++;
- data->nr_pages--;
- return 0;
-}
-
-static int fuse_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
-{
- struct inode *inode = mapping->host;
- struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_fill_data data;
- int err;
-
- err = -EIO;
- if (is_bad_inode(inode))
- goto out;
-
- data.file = file;
- data.inode = inode;
- data.nr_pages = nr_pages;
- data.max_pages = min_t(unsigned int, nr_pages, fc->max_pages);
-;
- data.ia = fuse_io_alloc(NULL, data.max_pages);
- err = -ENOMEM;
- if (!data.ia)
- goto out;
+ if (fc->num_background >= fc->congestion_threshold &&
+ rac->ra->async_size >= readahead_count(rac))
+ /*
+ * Congested and only async pages left, so skip the
+ * rest.
+ */
+ break;
- err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
- if (!err) {
- if (data.ia->ap.num_pages)
- fuse_send_readpages(data.ia, file);
- else
- fuse_io_free(data.ia);
+ nr_pages = readahead_count(rac) - nr_pages;
+ if (nr_pages > max_pages)
+ nr_pages = max_pages;
+ if (nr_pages == 0)
+ break;
+ ia = fuse_io_alloc(NULL, nr_pages);
+ if (!ia)
+ return;
+ ap = &ia->ap;
+ nr_pages = __readahead_batch(rac, ap->pages, nr_pages);
+ for (i = 0; i < nr_pages; i++) {
+ fuse_wait_on_page_writeback(inode,
+ readahead_index(rac) + i);
+ ap->descs[i].length = PAGE_SIZE;
+ }
+ ap->num_pages = nr_pages;
+ fuse_send_readpages(ia, rac->file);
}
-out:
- return err;
}
static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -1008,7 +1021,7 @@ static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (fc->auto_inval_data ||
(iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {
int err;
- err = fuse_update_attributes(inode, iocb->ki_filp);
+ err = fuse_update_attributes(inode, iocb->ki_filp, STATX_SIZE);
if (err)
return err;
}
@@ -1027,7 +1040,7 @@ static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff,
args->opcode = FUSE_WRITE;
args->nodeid = ff->nodeid;
args->in_numargs = 2;
- if (ff->fc->minor < 9)
+ if (ff->fm->fc->minor < 9)
args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
else
args->in_args[0].size = sizeof(ia->write.in);
@@ -1042,7 +1055,7 @@ static unsigned int fuse_write_flags(struct kiocb *iocb)
{
unsigned int flags = iocb->ki_filp->f_flags;
- if (iocb->ki_flags & IOCB_DSYNC)
+ if (iocb_is_dsync(iocb))
flags |= O_DSYNC;
if (iocb->ki_flags & IOCB_SYNC)
flags |= O_SYNC;
@@ -1056,7 +1069,7 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos,
struct kiocb *iocb = ia->io->iocb;
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
struct fuse_write_in *inarg = &ia->write.in;
ssize_t err;
@@ -1064,20 +1077,20 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos,
inarg->flags = fuse_write_flags(iocb);
if (owner != NULL) {
inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
- inarg->lock_owner = fuse_lock_owner_id(fc, owner);
+ inarg->lock_owner = fuse_lock_owner_id(fm->fc, owner);
}
if (ia->io->async)
- return fuse_async_req_send(fc, ia, count);
+ return fuse_async_req_send(fm, ia, count);
- err = fuse_simple_request(fc, &ia->ap.args);
+ err = fuse_simple_request(fm, &ia->ap.args);
if (!err && ia->write.out.size > count)
err = -EIO;
return err ?: ia->write.out.size;
}
-bool fuse_write_update_size(struct inode *inode, loff_t pos)
+bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -1085,12 +1098,14 @@ bool fuse_write_update_size(struct inode *inode, loff_t pos)
spin_lock(&fi->lock);
fi->attr_version = atomic64_inc_return(&fc->attr_version);
- if (pos > inode->i_size) {
+ if (written > 0 && pos > inode->i_size) {
i_size_write(inode, pos);
ret = true;
}
spin_unlock(&fi->lock);
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
+
return ret;
}
@@ -1101,8 +1116,9 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
struct fuse_args_pages *ap = &ia->ap;
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
unsigned int offset, i;
+ bool short_write;
int err;
for (i = 0; i < ap->num_pages; i++)
@@ -1110,37 +1126,45 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
fuse_write_args_fill(ia, ff, pos, count);
ia->write.in.flags = fuse_write_flags(iocb);
+ if (fm->fc->handle_killpriv_v2 && !capable(CAP_FSETID))
+ ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID;
- err = fuse_simple_request(fc, &ap->args);
+ err = fuse_simple_request(fm, &ap->args);
if (!err && ia->write.out.size > count)
err = -EIO;
+ short_write = ia->write.out.size < count;
offset = ap->descs[0].offset;
count = ia->write.out.size;
for (i = 0; i < ap->num_pages; i++) {
struct page *page = ap->pages[i];
- if (!err && !offset && count >= PAGE_SIZE)
- SetPageUptodate(page);
-
- if (count > PAGE_SIZE - offset)
- count -= PAGE_SIZE - offset;
- else
- count = 0;
- offset = 0;
-
- unlock_page(page);
+ if (err) {
+ ClearPageUptodate(page);
+ } else {
+ if (count >= PAGE_SIZE - offset)
+ count -= PAGE_SIZE - offset;
+ else {
+ if (short_write)
+ ClearPageUptodate(page);
+ count = 0;
+ }
+ offset = 0;
+ }
+ if (ia->write.page_locked && (i == ap->num_pages - 1))
+ unlock_page(page);
put_page(page);
}
return err;
}
-static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap,
+static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
struct address_space *mapping,
struct iov_iter *ii, loff_t pos,
unsigned int max_pages)
{
+ struct fuse_args_pages *ap = &ia->ap;
struct fuse_conn *fc = get_fuse_conn(mapping->host);
unsigned offset = pos & (PAGE_SIZE - 1);
size_t count = 0;
@@ -1160,25 +1184,23 @@ static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap,
again:
err = -EFAULT;
- if (iov_iter_fault_in_readable(ii, bytes))
+ if (fault_in_iov_iter_readable(ii, bytes))
break;
err = -ENOMEM;
- page = grab_cache_page_write_begin(mapping, index, 0);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
break;
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
- tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
+ tmp = copy_page_from_iter_atomic(page, offset, bytes, ii);
flush_dcache_page(page);
- iov_iter_advance(ii, tmp);
if (!tmp) {
unlock_page(page);
put_page(page);
- bytes = min(bytes, iov_iter_single_seg_count(ii));
goto again;
}
@@ -1193,6 +1215,16 @@ static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap,
if (offset == PAGE_SIZE)
offset = 0;
+ /* If we copied full page, mark it uptodate */
+ if (tmp == PAGE_SIZE)
+ SetPageUptodate(page);
+
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ } else {
+ ia->write.page_locked = true;
+ break;
+ }
if (!fc->big_writes)
break;
} while (iov_iter_count(ii) && count < fc->max_write &&
@@ -1236,7 +1268,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb,
break;
}
- count = fuse_fill_write_pages(ap, mapping, ii, pos, nr_pages);
+ count = fuse_fill_write_pages(&ia, mapping, ii, pos, nr_pages);
if (count <= 0) {
err = count;
} else {
@@ -1256,11 +1288,8 @@ static ssize_t fuse_perform_write(struct kiocb *iocb,
kfree(ap->pages);
} while (!err && iov_iter_count(ii));
- if (res > 0)
- fuse_write_update_size(inode, pos);
-
+ fuse_write_update_attr(inode, pos, res);
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
- fuse_invalidate_attr(inode);
return res > 0 ? res : err;
}
@@ -1273,17 +1302,25 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
ssize_t written_buffered = 0;
struct inode *inode = mapping->host;
ssize_t err;
+ struct fuse_conn *fc = get_fuse_conn(inode);
loff_t endbyte = 0;
- if (get_fuse_conn(inode)->writeback_cache) {
+ if (fc->writeback_cache) {
/* Update size (EOF optimization) and mode (SUID clearing) */
- err = fuse_update_attributes(mapping->host, file);
+ err = fuse_update_attributes(mapping->host, file,
+ STATX_SIZE | STATX_MODE);
if (err)
return err;
+ if (fc->handle_killpriv_v2 &&
+ should_remove_suid(file_dentry(file))) {
+ goto writethrough;
+ }
+
return generic_file_write_iter(iocb, from);
}
+writethrough:
inode_lock(inode);
/* We can write back this queue in page reclaim */
@@ -1341,16 +1378,6 @@ out:
return written ? written : err;
}
-static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs,
- unsigned int index,
- unsigned int nr_pages)
-{
- int i;
-
- for (i = index; i < index + nr_pages; i++)
- descs[i].length = PAGE_SIZE - descs[i].offset;
-}
-
static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
{
return (unsigned long)ii->iov->iov_base + ii->iov_offset;
@@ -1387,18 +1414,17 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
while (nbytes < *nbytesp && ap->num_pages < max_pages) {
unsigned npages;
size_t start;
- ret = iov_iter_get_pages(ii, &ap->pages[ap->num_pages],
+ ret = iov_iter_get_pages2(ii, &ap->pages[ap->num_pages],
*nbytesp - nbytes,
max_pages - ap->num_pages,
&start);
if (ret < 0)
break;
- iov_iter_advance(ii, ret);
nbytes += ret;
ret += start;
- npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
+ npages = DIV_ROUND_UP(ret, PAGE_SIZE);
ap->descs[ap->num_pages].offset = start;
fuse_page_descs_length_init(ap->descs, ap->num_pages, npages);
@@ -1408,6 +1434,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
(PAGE_SIZE - ret) & (PAGE_SIZE - 1);
}
+ ap->args.user_pages = true;
if (write)
ap->args.in_pages = true;
else
@@ -1426,7 +1453,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
struct file *file = io->iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_conn *fc = ff->fm->fc;
size_t nmax = write ? fc->max_write : fc->max_read;
loff_t pos = *ppos;
size_t count = iov_iter_count(iter);
@@ -1442,7 +1469,6 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
if (!ia)
return -ENOMEM;
- ia->io = io;
if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
if (!write)
inode_lock(inode);
@@ -1451,7 +1477,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
inode_unlock(inode);
}
- io->should_dirty = !write && iter_is_iovec(iter);
+ io->should_dirty = !write && user_backed_iter(iter);
while (count) {
ssize_t nres;
fl_owner_t owner = current->files;
@@ -1464,7 +1490,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
if (write) {
if (!capable(CAP_FSETID))
- ia->write.in.write_flags |= FUSE_WRITE_KILL_PRIV;
+ ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID;
nres = fuse_send_write(ia, pos, nbytes, owner);
} else {
@@ -1552,11 +1578,9 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
} else {
res = fuse_direct_io(&io, from, &iocb->ki_pos,
FUSE_DIO_WRITE);
+ fuse_write_update_attr(inode, iocb->ki_pos, res);
}
}
- fuse_invalidate_attr(inode);
- if (res > 0)
- fuse_write_update_size(inode, iocb->ki_pos);
inode_unlock(inode);
return res;
@@ -1566,10 +1590,14 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
+ struct inode *inode = file_inode(file);
- if (is_bad_inode(file_inode(file)))
+ if (fuse_is_bad(inode))
return -EIO;
+ if (FUSE_IS_DAX(inode))
+ return fuse_dax_read_iter(iocb, to);
+
if (!(ff->open_flags & FOPEN_DIRECT_IO))
return fuse_cache_read_iter(iocb, to);
else
@@ -1580,10 +1608,14 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
+ struct inode *inode = file_inode(file);
- if (is_bad_inode(file_inode(file)))
+ if (fuse_is_bad(inode))
return -EIO;
+ if (FUSE_IS_DAX(inode))
+ return fuse_dax_write_iter(iocb, from);
+
if (!(ff->open_flags & FOPEN_DIRECT_IO))
return fuse_cache_write_iter(iocb, from);
else
@@ -1595,6 +1627,9 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa)
struct fuse_args_pages *ap = &wpa->ia.ap;
int i;
+ if (wpa->bucket)
+ fuse_sync_bucket_dec(wpa->bucket);
+
for (i = 0; i < ap->num_pages; i++)
__free_page(ap->pages[i]);
@@ -1605,7 +1640,7 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa)
kfree(wpa);
}
-static void fuse_writepage_finish(struct fuse_conn *fc,
+static void fuse_writepage_finish(struct fuse_mount *fm,
struct fuse_writepage_args *wpa)
{
struct fuse_args_pages *ap = &wpa->ia.ap;
@@ -1614,7 +1649,6 @@ static void fuse_writepage_finish(struct fuse_conn *fc,
struct backing_dev_info *bdi = inode_to_bdi(inode);
int i;
- list_del(&wpa->writepages_entry);
for (i = 0; i < ap->num_pages; i++) {
dec_wb_stat(&bdi->wb, WB_WRITEBACK);
dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP);
@@ -1624,7 +1658,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc,
}
/* Called under fi->lock, may release and reacquire it */
-static void fuse_send_writepage(struct fuse_conn *fc,
+static void fuse_send_writepage(struct fuse_mount *fm,
struct fuse_writepage_args *wpa, loff_t size)
__releases(fi->lock)
__acquires(fi->lock)
@@ -1650,10 +1684,10 @@ __acquires(fi->lock)
args->force = true;
args->nocreds = true;
- err = fuse_simple_background(fc, args, GFP_ATOMIC);
+ err = fuse_simple_background(fm, args, GFP_ATOMIC);
if (err == -ENOMEM) {
spin_unlock(&fi->lock);
- err = fuse_simple_background(fc, args, GFP_NOFS | __GFP_NOFAIL);
+ err = fuse_simple_background(fm, args, GFP_NOFS | __GFP_NOFAIL);
spin_lock(&fi->lock);
}
@@ -1665,7 +1699,8 @@ __acquires(fi->lock)
out_free:
fi->writectr--;
- fuse_writepage_finish(fc, wpa);
+ rb_erase(&wpa->writepages_entry, &fi->writepages);
+ fuse_writepage_finish(fm, wpa);
spin_unlock(&fi->lock);
/* After fuse_writepage_finish() aux request list is private */
@@ -1689,7 +1724,7 @@ void fuse_flush_writepages(struct inode *inode)
__releases(fi->lock)
__acquires(fi->lock)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
loff_t crop = i_size_read(inode);
struct fuse_writepage_args *wpa;
@@ -1698,29 +1733,76 @@ __acquires(fi->lock)
wpa = list_entry(fi->queued_writes.next,
struct fuse_writepage_args, queue_entry);
list_del_init(&wpa->queue_entry);
- fuse_send_writepage(fc, wpa, crop);
+ fuse_send_writepage(fm, wpa, crop);
}
}
-static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
+static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root,
+ struct fuse_writepage_args *wpa)
+{
+ pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT;
+ pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1;
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+
+ WARN_ON(!wpa->ia.ap.num_pages);
+ while (*p) {
+ struct fuse_writepage_args *curr;
+ pgoff_t curr_index;
+
+ parent = *p;
+ curr = rb_entry(parent, struct fuse_writepage_args,
+ writepages_entry);
+ WARN_ON(curr->inode != wpa->inode);
+ curr_index = curr->ia.write.in.offset >> PAGE_SHIFT;
+
+ if (idx_from >= curr_index + curr->ia.ap.num_pages)
+ p = &(*p)->rb_right;
+ else if (idx_to < curr_index)
+ p = &(*p)->rb_left;
+ else
+ return curr;
+ }
+
+ rb_link_node(&wpa->writepages_entry, parent, p);
+ rb_insert_color(&wpa->writepages_entry, root);
+ return NULL;
+}
+
+static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa)
+{
+ WARN_ON(fuse_insert_writeback(root, wpa));
+}
+
+static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
int error)
{
struct fuse_writepage_args *wpa =
container_of(args, typeof(*wpa), ia.ap.args);
struct inode *inode = wpa->inode;
struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_conn *fc = get_fuse_conn(inode);
mapping_set_error(inode->i_mapping, error);
+ /*
+ * A writeback finished and this might have updated mtime/ctime on
+ * server making local mtime/ctime stale. Hence invalidate attrs.
+ * Do this only if writeback_cache is not enabled. If writeback_cache
+ * is enabled, we trust local ctime/mtime.
+ */
+ if (!fc->writeback_cache)
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY);
spin_lock(&fi->lock);
+ rb_erase(&wpa->writepages_entry, &fi->writepages);
while (wpa->next) {
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_write_in *inarg = &wpa->ia.write.in;
struct fuse_writepage_args *next = wpa->next;
wpa->next = next->next;
next->next = NULL;
next->ia.ff = fuse_file_get(wpa->ia.ff);
- list_add(&next->writepages_entry, &fi->writepages);
+ tree_insert(&fi->writepages, next);
/*
* Skip fuse_flush_writepages() to make it easy to crop requests
@@ -1745,46 +1827,53 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
* no invocations of fuse_writepage_end() while we're in
* fuse_set_nowrite..fuse_release_nowrite section.
*/
- fuse_send_writepage(fc, next, inarg->offset + inarg->size);
+ fuse_send_writepage(fm, next, inarg->offset + inarg->size);
}
fi->writectr--;
- fuse_writepage_finish(fc, wpa);
+ fuse_writepage_finish(fm, wpa);
spin_unlock(&fi->lock);
fuse_writepage_free(wpa);
}
-static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc,
- struct fuse_inode *fi)
+static struct fuse_file *__fuse_write_file_get(struct fuse_inode *fi)
{
- struct fuse_file *ff = NULL;
+ struct fuse_file *ff;
spin_lock(&fi->lock);
- if (!list_empty(&fi->write_files)) {
- ff = list_entry(fi->write_files.next, struct fuse_file,
- write_entry);
+ ff = list_first_entry_or_null(&fi->write_files, struct fuse_file,
+ write_entry);
+ if (ff)
fuse_file_get(ff);
- }
spin_unlock(&fi->lock);
return ff;
}
-static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc,
- struct fuse_inode *fi)
+static struct fuse_file *fuse_write_file_get(struct fuse_inode *fi)
{
- struct fuse_file *ff = __fuse_write_file_get(fc, fi);
+ struct fuse_file *ff = __fuse_write_file_get(fi);
WARN_ON(!ff);
return ff;
}
int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_file *ff;
int err;
- ff = __fuse_write_file_get(fc, fi);
+ /*
+ * Inode is always written before the last reference is dropped and
+ * hence this should not be reached from reclaim.
+ *
+ * Writing back the inode from reclaim can deadlock if the request
+ * processing itself needs an allocation. Allocations triggering
+ * reclaim while serving a request can't be prevented, because it can
+ * involve any number of unrelated userspace processes.
+ */
+ WARN_ON(wbc->for_reclaim);
+
+ ff = __fuse_write_file_get(fi);
err = fuse_flush_times(inode, ff);
if (ff)
fuse_file_put(ff, false, false);
@@ -1811,6 +1900,20 @@ static struct fuse_writepage_args *fuse_writepage_args_alloc(void)
}
+static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
+ struct fuse_writepage_args *wpa)
+{
+ if (!fc->sync_fs)
+ return;
+
+ rcu_read_lock();
+ /* Prevent resurrection of dead bucket in unlikely race with syncfs */
+ do {
+ wpa->bucket = rcu_dereference(fc->curr_bucket);
+ } while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count)));
+ rcu_read_unlock();
+}
+
static int fuse_writepage_locked(struct page *page)
{
struct address_space *mapping = page->mapping;
@@ -1834,10 +1937,11 @@ static int fuse_writepage_locked(struct page *page)
goto err_free;
error = -EIO;
- wpa->ia.ff = fuse_write_file_get(fc, fi);
+ wpa->ia.ff = fuse_write_file_get(fi);
if (!wpa->ia.ff)
goto err_nofile;
+ fuse_writepage_add_to_bucket(fc, wpa);
fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0);
copy_highpage(tmp_page, page);
@@ -1855,7 +1959,7 @@ static int fuse_writepage_locked(struct page *page)
inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
spin_lock(&fi->lock);
- list_add(&wpa->writepages_entry, &fi->writepages);
+ tree_insert(&fi->writepages, wpa);
list_add_tail(&wpa->queue_entry, &fi->queued_writes);
fuse_flush_writepages(inode);
spin_unlock(&fi->lock);
@@ -1876,6 +1980,7 @@ err:
static int fuse_writepage(struct page *page, struct writeback_control *wbc)
{
+ struct fuse_conn *fc = get_fuse_conn(page->mapping->host);
int err;
if (fuse_page_is_writeback(page->mapping->host, page->index)) {
@@ -1891,6 +1996,10 @@ static int fuse_writepage(struct page *page, struct writeback_control *wbc)
return 0;
}
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ fc->num_background >= fc->congestion_threshold)
+ return AOP_WRITEPAGE_ACTIVATE;
+
err = fuse_writepage_locked(page);
unlock_page(page);
@@ -1950,14 +2059,14 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data)
}
/*
- * First recheck under fi->lock if the offending offset is still under
- * writeback. If yes, then iterate auxiliary write requests, to see if there's
+ * Check under fi->lock if the page is under writeback, and insert it onto the
+ * rb_tree if not. Otherwise iterate auxiliary write requests, to see if there's
* one already added for a page at this offset. If there's none, then insert
* this new request onto the auxiliary list, otherwise reuse the existing one by
- * copying the new page contents over to the old temporary page.
+ * swapping the new temp page with the old one.
*/
-static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa,
- struct page *page)
+static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa,
+ struct page *page)
{
struct fuse_inode *fi = get_fuse_inode(new_wpa->inode);
struct fuse_writepage_args *tmp;
@@ -1965,17 +2074,15 @@ static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa,
struct fuse_args_pages *new_ap = &new_wpa->ia.ap;
WARN_ON(new_ap->num_pages != 0);
+ new_ap->num_pages = 1;
spin_lock(&fi->lock);
- list_del(&new_wpa->writepages_entry);
- old_wpa = fuse_find_writeback(fi, page->index, page->index);
+ old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa);
if (!old_wpa) {
- list_add(&new_wpa->writepages_entry, &fi->writepages);
spin_unlock(&fi->lock);
- return false;
+ return true;
}
- new_ap->num_pages = 1;
for (tmp = old_wpa->next; tmp; tmp = tmp->next) {
pgoff_t curr_index;
@@ -2004,7 +2111,41 @@ static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa,
fuse_writepage_free(new_wpa);
}
- return true;
+ return false;
+}
+
+static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page,
+ struct fuse_args_pages *ap,
+ struct fuse_fill_wb_data *data)
+{
+ WARN_ON(!ap->num_pages);
+
+ /*
+ * Being under writeback is unlikely but possible. For example direct
+ * read to an mmaped fuse file will set the page dirty twice; once when
+ * the pages are faulted with get_user_pages(), and then after the read
+ * completed.
+ */
+ if (fuse_page_is_writeback(data->inode, page->index))
+ return true;
+
+ /* Reached max pages */
+ if (ap->num_pages == fc->max_pages)
+ return true;
+
+ /* Reached max write bytes */
+ if ((ap->num_pages + 1) * PAGE_SIZE > fc->max_write)
+ return true;
+
+ /* Discontinuity */
+ if (data->orig_pages[ap->num_pages - 1]->index + 1 != page->index)
+ return true;
+
+ /* Need to grow the pages array? If so, did the expansion fail? */
+ if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data))
+ return true;
+
+ return false;
}
static int fuse_writepages_fill(struct page *page,
@@ -2017,35 +2158,18 @@ static int fuse_writepages_fill(struct page *page,
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_conn *fc = get_fuse_conn(inode);
struct page *tmp_page;
- bool is_writeback;
int err;
if (!data->ff) {
err = -EIO;
- data->ff = fuse_write_file_get(fc, fi);
+ data->ff = fuse_write_file_get(fi);
if (!data->ff)
goto out_unlock;
}
- /*
- * Being under writeback is unlikely but possible. For example direct
- * read to an mmaped fuse file will set the page dirty twice; once when
- * the pages are faulted with get_user_pages(), and then after the read
- * completed.
- */
- is_writeback = fuse_page_is_writeback(inode, page->index);
-
- if (wpa && ap->num_pages &&
- (is_writeback || ap->num_pages == fc->max_pages ||
- (ap->num_pages + 1) * PAGE_SIZE > fc->max_write ||
- data->orig_pages[ap->num_pages - 1]->index + 1 != page->index)) {
+ if (wpa && fuse_writepage_need_send(fc, page, ap, data)) {
fuse_writepages_send(data);
data->wpa = NULL;
- } else if (wpa && ap->num_pages == data->max_pages) {
- if (!fuse_pages_realloc(data)) {
- fuse_writepages_send(data);
- data->wpa = NULL;
- }
}
err = -ENOMEM;
@@ -2073,6 +2197,8 @@ static int fuse_writepages_fill(struct page *page,
__free_page(tmp_page);
goto out_unlock;
}
+ fuse_writepage_add_to_bucket(fc, wpa);
+
data->max_pages = 1;
ap = &wpa->ia.ap;
@@ -2083,12 +2209,6 @@ static int fuse_writepages_fill(struct page *page,
ap->args.end = fuse_writepage_end;
ap->num_pages = 0;
wpa->inode = inode;
-
- spin_lock(&fi->lock);
- list_add(&wpa->writepages_entry, &fi->writepages);
- spin_unlock(&fi->lock);
-
- data->wpa = wpa;
}
set_page_writeback(page);
@@ -2096,26 +2216,25 @@ static int fuse_writepages_fill(struct page *page,
ap->pages[ap->num_pages] = tmp_page;
ap->descs[ap->num_pages].offset = 0;
ap->descs[ap->num_pages].length = PAGE_SIZE;
+ data->orig_pages[ap->num_pages] = page;
inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
err = 0;
- if (is_writeback && fuse_writepage_in_flight(wpa, page)) {
+ if (data->wpa) {
+ /*
+ * Protected by fi->lock against concurrent access by
+ * fuse_page_is_writeback().
+ */
+ spin_lock(&fi->lock);
+ ap->num_pages++;
+ spin_unlock(&fi->lock);
+ } else if (fuse_writepage_add(wpa, page)) {
+ data->wpa = wpa;
+ } else {
end_page_writeback(page);
- data->wpa = NULL;
- goto out_unlock;
}
- data->orig_pages[ap->num_pages] = page;
-
- /*
- * Protected by fi->lock against concurrent access by
- * fuse_page_is_writeback().
- */
- spin_lock(&fi->lock);
- ap->num_pages++;
- spin_unlock(&fi->lock);
-
out_unlock:
unlock_page(page);
@@ -2131,9 +2250,13 @@ static int fuse_writepages(struct address_space *mapping,
int err;
err = -EIO;
- if (is_bad_inode(inode))
+ if (fuse_is_bad(inode))
goto out;
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ fc->num_background >= fc->congestion_threshold)
+ return 0;
+
data.inode = inode;
data.wpa = NULL;
data.ff = NULL;
@@ -2147,10 +2270,8 @@ static int fuse_writepages(struct address_space *mapping,
err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
if (data.wpa) {
- /* Ignore errors if we can write at least one page */
WARN_ON(!data.wpa->ia.ap.num_pages);
fuse_writepages_send(&data);
- err = 0;
}
if (data.ff)
fuse_file_put(data.ff, false, false);
@@ -2165,8 +2286,7 @@ out:
* but how to implement it without killing performance need more thinking.
*/
static int fuse_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
+ loff_t pos, unsigned len, struct page **pagep, void **fsdata)
{
pgoff_t index = pos >> PAGE_SHIFT;
struct fuse_conn *fc = get_fuse_conn(file_inode(file));
@@ -2176,7 +2296,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
WARN_ON(!fc->writeback_cache);
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
goto error;
@@ -2219,15 +2339,18 @@ static int fuse_write_end(struct file *file, struct address_space *mapping,
if (!copied)
goto unlock;
+ pos += copied;
if (!PageUptodate(page)) {
/* Zero any unwritten bytes at the end of the page */
- size_t endoff = (pos + copied) & ~PAGE_MASK;
+ size_t endoff = pos & ~PAGE_MASK;
if (endoff)
zero_user_segment(page, endoff, PAGE_SIZE);
SetPageUptodate(page);
}
- fuse_write_update_size(inode, pos + copied);
+ if (pos > inode->i_size)
+ i_size_write(inode, pos);
+
set_page_dirty(page);
unlock:
@@ -2237,25 +2360,31 @@ unlock:
return copied;
}
-static int fuse_launder_page(struct page *page)
+static int fuse_launder_folio(struct folio *folio)
{
int err = 0;
- if (clear_page_dirty_for_io(page)) {
- struct inode *inode = page->mapping->host;
- err = fuse_writepage_locked(page);
+ if (folio_clear_dirty_for_io(folio)) {
+ struct inode *inode = folio->mapping->host;
+
+ /* Serialize with pending writeback for the same page */
+ fuse_wait_on_page_writeback(inode, folio->index);
+ err = fuse_writepage_locked(&folio->page);
if (!err)
- fuse_wait_on_page_writeback(inode, page->index);
+ fuse_wait_on_page_writeback(inode, folio->index);
}
return err;
}
/*
- * Write back dirty pages now, because there may not be any suitable
- * open files later
+ * Write back dirty data/metadata now (there may not be any suitable
+ * open files later for data)
*/
static void fuse_vma_close(struct vm_area_struct *vma)
{
- filemap_write_and_wait(vma->vm_file->f_mapping);
+ int err;
+
+ err = write_inode_now(vma->vm_file->f_mapping->host, 1);
+ mapping_set_error(vma->vm_file->f_mapping, err);
}
/*
@@ -2300,6 +2429,10 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct fuse_file *ff = file->private_data;
+ /* DAX mmap is superior to direct_io mmap */
+ if (FUSE_IS_DAX(file_inode(file)))
+ return fuse_dax_mmap(file, vma);
+
if (ff->open_flags & FOPEN_DIRECT_IO) {
/* Can't provide the coherency needed for MAP_SHARED */
if (vma->vm_flags & VM_MAYSHARE)
@@ -2378,7 +2511,7 @@ static void fuse_lk_fill(struct fuse_args *args, struct file *file,
static int fuse_getlk(struct file *file, struct file_lock *fl)
{
struct inode *inode = file_inode(file);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_lk_in inarg;
struct fuse_lk_out outarg;
@@ -2388,9 +2521,9 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err)
- err = convert_fuse_file_lock(fc, &outarg.lk, fl);
+ err = convert_fuse_file_lock(fm->fc, &outarg.lk, fl);
return err;
}
@@ -2398,12 +2531,12 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)
static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
{
struct inode *inode = file_inode(file);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_lk_in inarg;
int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL;
- pid_t pid_nr = pid_nr_ns(pid, fc->pid_ns);
+ pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns);
int err;
if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
@@ -2416,7 +2549,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
return 0;
fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
/* locking is restartable */
if (err == -EINTR)
@@ -2470,13 +2603,13 @@ static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
{
struct inode *inode = mapping->host;
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_bmap_in inarg;
struct fuse_bmap_out outarg;
int err;
- if (!inode->i_sb->s_bdev || fc->no_bmap)
+ if (!inode->i_sb->s_bdev || fm->fc->no_bmap)
return 0;
memset(&inarg, 0, sizeof(inarg));
@@ -2490,9 +2623,9 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS)
- fc->no_bmap = 1;
+ fm->fc->no_bmap = 1;
return err ? 0 : outarg.block;
}
@@ -2500,7 +2633,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_file *ff = file->private_data;
FUSE_ARGS(args);
struct fuse_lseek_in inarg = {
@@ -2511,7 +2644,7 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
struct fuse_lseek_out outarg;
int err;
- if (fc->no_lseek)
+ if (fm->fc->no_lseek)
goto fallback;
args.opcode = FUSE_LSEEK;
@@ -2522,10 +2655,10 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err) {
if (err == -ENOSYS) {
- fc->no_lseek = 1;
+ fm->fc->no_lseek = 1;
goto fallback;
}
return err;
@@ -2534,7 +2667,7 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes);
fallback:
- err = fuse_update_attributes(inode, file);
+ err = fuse_update_attributes(inode, file, STATX_SIZE);
if (!err)
return generic_file_llseek(file, offset, whence);
else
@@ -2554,7 +2687,7 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
break;
case SEEK_END:
inode_lock(inode);
- retval = fuse_update_attributes(inode, file);
+ retval = fuse_update_attributes(inode, file, STATX_SIZE);
if (!retval)
retval = generic_file_llseek(file, offset, whence);
inode_unlock(inode);
@@ -2573,354 +2706,6 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
}
/*
- * CUSE servers compiled on 32bit broke on 64bit kernels because the
- * ABI was defined to be 'struct iovec' which is different on 32bit
- * and 64bit. Fortunately we can determine which structure the server
- * used from the size of the reply.
- */
-static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src,
- size_t transferred, unsigned count,
- bool is_compat)
-{
-#ifdef CONFIG_COMPAT
- if (count * sizeof(struct compat_iovec) == transferred) {
- struct compat_iovec *ciov = src;
- unsigned i;
-
- /*
- * With this interface a 32bit server cannot support
- * non-compat (i.e. ones coming from 64bit apps) ioctl
- * requests
- */
- if (!is_compat)
- return -EINVAL;
-
- for (i = 0; i < count; i++) {
- dst[i].iov_base = compat_ptr(ciov[i].iov_base);
- dst[i].iov_len = ciov[i].iov_len;
- }
- return 0;
- }
-#endif
-
- if (count * sizeof(struct iovec) != transferred)
- return -EIO;
-
- memcpy(dst, src, transferred);
- return 0;
-}
-
-/* Make sure iov_length() won't overflow */
-static int fuse_verify_ioctl_iov(struct fuse_conn *fc, struct iovec *iov,
- size_t count)
-{
- size_t n;
- u32 max = fc->max_pages << PAGE_SHIFT;
-
- for (n = 0; n < count; n++, iov++) {
- if (iov->iov_len > (size_t) max)
- return -ENOMEM;
- max -= iov->iov_len;
- }
- return 0;
-}
-
-static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
- void *src, size_t transferred, unsigned count,
- bool is_compat)
-{
- unsigned i;
- struct fuse_ioctl_iovec *fiov = src;
-
- if (fc->minor < 16) {
- return fuse_copy_ioctl_iovec_old(dst, src, transferred,
- count, is_compat);
- }
-
- if (count * sizeof(struct fuse_ioctl_iovec) != transferred)
- return -EIO;
-
- for (i = 0; i < count; i++) {
- /* Did the server supply an inappropriate value? */
- if (fiov[i].base != (unsigned long) fiov[i].base ||
- fiov[i].len != (unsigned long) fiov[i].len)
- return -EIO;
-
- dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base;
- dst[i].iov_len = (size_t) fiov[i].len;
-
-#ifdef CONFIG_COMPAT
- if (is_compat &&
- (ptr_to_compat(dst[i].iov_base) != fiov[i].base ||
- (compat_size_t) dst[i].iov_len != fiov[i].len))
- return -EIO;
-#endif
- }
-
- return 0;
-}
-
-
-/*
- * For ioctls, there is no generic way to determine how much memory
- * needs to be read and/or written. Furthermore, ioctls are allowed
- * to dereference the passed pointer, so the parameter requires deep
- * copying but FUSE has no idea whatsoever about what to copy in or
- * out.
- *
- * This is solved by allowing FUSE server to retry ioctl with
- * necessary in/out iovecs. Let's assume the ioctl implementation
- * needs to read in the following structure.
- *
- * struct a {
- * char *buf;
- * size_t buflen;
- * }
- *
- * On the first callout to FUSE server, inarg->in_size and
- * inarg->out_size will be NULL; then, the server completes the ioctl
- * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and
- * the actual iov array to
- *
- * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } }
- *
- * which tells FUSE to copy in the requested area and retry the ioctl.
- * On the second round, the server has access to the structure and
- * from that it can tell what to look for next, so on the invocation,
- * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to
- *
- * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) },
- * { .iov_base = a.buf, .iov_len = a.buflen } }
- *
- * FUSE will copy both struct a and the pointed buffer from the
- * process doing the ioctl and retry ioctl with both struct a and the
- * buffer.
- *
- * This time, FUSE server has everything it needs and completes ioctl
- * without FUSE_IOCTL_RETRY which finishes the ioctl call.
- *
- * Copying data out works the same way.
- *
- * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel
- * automatically initializes in and out iovs by decoding @cmd with
- * _IOC_* macros and the server is not allowed to request RETRY. This
- * limits ioctl data transfers to well-formed ioctls and is the forced
- * behavior for all FUSE servers.
- */
-long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
- unsigned int flags)
-{
- struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
- struct fuse_ioctl_in inarg = {
- .fh = ff->fh,
- .cmd = cmd,
- .arg = arg,
- .flags = flags
- };
- struct fuse_ioctl_out outarg;
- struct iovec *iov_page = NULL;
- struct iovec *in_iov = NULL, *out_iov = NULL;
- unsigned int in_iovs = 0, out_iovs = 0, max_pages;
- size_t in_size, out_size, c;
- ssize_t transferred;
- int err, i;
- struct iov_iter ii;
- struct fuse_args_pages ap = {};
-
-#if BITS_PER_LONG == 32
- inarg.flags |= FUSE_IOCTL_32BIT;
-#else
- if (flags & FUSE_IOCTL_COMPAT) {
- inarg.flags |= FUSE_IOCTL_32BIT;
-#ifdef CONFIG_X86_X32
- if (in_x32_syscall())
- inarg.flags |= FUSE_IOCTL_COMPAT_X32;
-#endif
- }
-#endif
-
- /* assume all the iovs returned by client always fits in a page */
- BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
-
- err = -ENOMEM;
- ap.pages = fuse_pages_alloc(fc->max_pages, GFP_KERNEL, &ap.descs);
- iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
- if (!ap.pages || !iov_page)
- goto out;
-
- fuse_page_descs_length_init(ap.descs, 0, fc->max_pages);
-
- /*
- * If restricted, initialize IO parameters as encoded in @cmd.
- * RETRY from server is not allowed.
- */
- if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
- struct iovec *iov = iov_page;
-
- iov->iov_base = (void __user *)arg;
- iov->iov_len = _IOC_SIZE(cmd);
-
- if (_IOC_DIR(cmd) & _IOC_WRITE) {
- in_iov = iov;
- in_iovs = 1;
- }
-
- if (_IOC_DIR(cmd) & _IOC_READ) {
- out_iov = iov;
- out_iovs = 1;
- }
- }
-
- retry:
- inarg.in_size = in_size = iov_length(in_iov, in_iovs);
- inarg.out_size = out_size = iov_length(out_iov, out_iovs);
-
- /*
- * Out data can be used either for actual out data or iovs,
- * make sure there always is at least one page.
- */
- out_size = max_t(size_t, out_size, PAGE_SIZE);
- max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE);
-
- /* make sure there are enough buffer pages and init request with them */
- err = -ENOMEM;
- if (max_pages > fc->max_pages)
- goto out;
- while (ap.num_pages < max_pages) {
- ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
- if (!ap.pages[ap.num_pages])
- goto out;
- ap.num_pages++;
- }
-
-
- /* okay, let's send it to the client */
- ap.args.opcode = FUSE_IOCTL;
- ap.args.nodeid = ff->nodeid;
- ap.args.in_numargs = 1;
- ap.args.in_args[0].size = sizeof(inarg);
- ap.args.in_args[0].value = &inarg;
- if (in_size) {
- ap.args.in_numargs++;
- ap.args.in_args[1].size = in_size;
- ap.args.in_pages = true;
-
- err = -EFAULT;
- iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size);
- for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
- c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
- if (c != PAGE_SIZE && iov_iter_count(&ii))
- goto out;
- }
- }
-
- ap.args.out_numargs = 2;
- ap.args.out_args[0].size = sizeof(outarg);
- ap.args.out_args[0].value = &outarg;
- ap.args.out_args[1].size = out_size;
- ap.args.out_pages = true;
- ap.args.out_argvar = true;
-
- transferred = fuse_simple_request(fc, &ap.args);
- err = transferred;
- if (transferred < 0)
- goto out;
-
- /* did it ask for retry? */
- if (outarg.flags & FUSE_IOCTL_RETRY) {
- void *vaddr;
-
- /* no retry if in restricted mode */
- err = -EIO;
- if (!(flags & FUSE_IOCTL_UNRESTRICTED))
- goto out;
-
- in_iovs = outarg.in_iovs;
- out_iovs = outarg.out_iovs;
-
- /*
- * Make sure things are in boundary, separate checks
- * are to protect against overflow.
- */
- err = -ENOMEM;
- if (in_iovs > FUSE_IOCTL_MAX_IOV ||
- out_iovs > FUSE_IOCTL_MAX_IOV ||
- in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
- goto out;
-
- vaddr = kmap_atomic(ap.pages[0]);
- err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,
- transferred, in_iovs + out_iovs,
- (flags & FUSE_IOCTL_COMPAT) != 0);
- kunmap_atomic(vaddr);
- if (err)
- goto out;
-
- in_iov = iov_page;
- out_iov = in_iov + in_iovs;
-
- err = fuse_verify_ioctl_iov(fc, in_iov, in_iovs);
- if (err)
- goto out;
-
- err = fuse_verify_ioctl_iov(fc, out_iov, out_iovs);
- if (err)
- goto out;
-
- goto retry;
- }
-
- err = -EIO;
- if (transferred > inarg.out_size)
- goto out;
-
- err = -EFAULT;
- iov_iter_init(&ii, READ, out_iov, out_iovs, transferred);
- for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
- c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
- if (c != PAGE_SIZE && iov_iter_count(&ii))
- goto out;
- }
- err = 0;
- out:
- free_page((unsigned long) iov_page);
- while (ap.num_pages)
- __free_page(ap.pages[--ap.num_pages]);
- kfree(ap.pages);
-
- return err ? err : outarg.result;
-}
-EXPORT_SYMBOL_GPL(fuse_do_ioctl);
-
-long fuse_ioctl_common(struct file *file, unsigned int cmd,
- unsigned long arg, unsigned int flags)
-{
- struct inode *inode = file_inode(file);
- struct fuse_conn *fc = get_fuse_conn(inode);
-
- if (!fuse_allow_current_process(fc))
- return -EACCES;
-
- if (is_bad_inode(inode))
- return -EIO;
-
- return fuse_do_ioctl(file, cmd, arg, flags);
-}
-
-static long fuse_file_ioctl(struct file *file, unsigned int cmd,
- unsigned long arg)
-{
- return fuse_ioctl_common(file, cmd, arg, 0);
-}
-
-static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
- unsigned long arg)
-{
- return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
-}
-
-/*
* All files which have been polled are linked to RB tree
* fuse_conn->polled_files which is indexed by kh. Walk the tree and
* find the matching one.
@@ -2961,7 +2746,7 @@ static void fuse_register_polled_file(struct fuse_conn *fc,
{
spin_lock(&fc->lock);
if (RB_EMPTY_NODE(&ff->polled_node)) {
- struct rb_node **link, *uninitialized_var(parent);
+ struct rb_node **link, *parent;
link = fuse_find_polled_node(fc, ff->kh, &parent);
BUG_ON(*link);
@@ -2974,13 +2759,13 @@ static void fuse_register_polled_file(struct fuse_conn *fc,
__poll_t fuse_file_poll(struct file *file, poll_table *wait)
{
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
struct fuse_poll_out outarg;
FUSE_ARGS(args);
int err;
- if (fc->no_poll)
+ if (fm->fc->no_poll)
return DEFAULT_POLLMASK;
poll_wait(file, &ff->poll_wait, wait);
@@ -2992,7 +2777,7 @@ __poll_t fuse_file_poll(struct file *file, poll_table *wait)
*/
if (waitqueue_active(&ff->poll_wait)) {
inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
- fuse_register_polled_file(fc, ff);
+ fuse_register_polled_file(fm->fc, ff);
}
args.opcode = FUSE_POLL;
@@ -3003,12 +2788,12 @@ __poll_t fuse_file_poll(struct file *file, poll_table *wait)
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err)
return demangle_poll(outarg.revents);
if (err == -ENOSYS) {
- fc->no_poll = 1;
+ fm->fc->no_poll = 1;
return DEFAULT_POLLMASK;
}
return EPOLLERR;
@@ -3065,11 +2850,10 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
ssize_t ret = 0;
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
- bool async_dio = ff->fc->async_dio;
loff_t pos = 0;
struct inode *inode;
loff_t i_size;
- size_t count = iov_iter_count(iter);
+ size_t count = iov_iter_count(iter), shortened = 0;
loff_t offset = iocb->ki_pos;
struct fuse_io_priv *io;
@@ -3077,17 +2861,9 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
inode = file->f_mapping->host;
i_size = i_size_read(inode);
- if ((iov_iter_rw(iter) == READ) && (offset > i_size))
+ if ((iov_iter_rw(iter) == READ) && (offset >= i_size))
return 0;
- /* optimization for short read */
- if (async_dio && iov_iter_rw(iter) != WRITE && offset + count > i_size) {
- if (offset >= i_size)
- return 0;
- iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset));
- count = iov_iter_count(iter);
- }
-
io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
if (!io)
return -ENOMEM;
@@ -3103,15 +2879,22 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
* By default, we want to optimize all I/Os with async request
* submission to the client filesystem if supported.
*/
- io->async = async_dio;
+ io->async = ff->fm->fc->async_dio;
io->iocb = iocb;
io->blocking = is_sync_kiocb(iocb);
+ /* optimization for short read */
+ if (io->async && !io->write && offset + count > i_size) {
+ iov_iter_truncate(iter, fuse_round_up(ff->fm->fc, i_size - offset));
+ shortened = count - iov_iter_count(iter);
+ count -= shortened;
+ }
+
/*
* We cannot asynchronously extend the size of a file.
* In such case the aio will behave exactly like sync io.
*/
- if ((offset + count > i_size) && iov_iter_rw(iter) == WRITE)
+ if ((offset + count > i_size) && io->write)
io->blocking = true;
if (io->async && io->blocking) {
@@ -3125,10 +2908,11 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (iov_iter_rw(iter) == WRITE) {
ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
- fuse_invalidate_attr(inode);
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
} else {
ret = __fuse_direct_read(io, iter, &pos);
}
+ iov_iter_reexpand(iter, iov_iter_count(iter) + shortened);
if (io->async) {
bool blocking = io->blocking;
@@ -3146,9 +2930,8 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
kref_put(&io->refcnt, fuse_io_release);
if (iov_iter_rw(iter) == WRITE) {
- if (ret > 0)
- fuse_write_update_size(inode, pos);
- else if (ret < 0 && offset + count > i_size)
+ fuse_write_update_attr(inode, pos, ret);
+ if (ret < 0 && offset + count > i_size)
fuse_do_truncate(file);
}
@@ -3157,7 +2940,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end)
{
- int err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ int err = filemap_write_and_wait_range(inode->i_mapping, start, LLONG_MAX);
if (!err)
fuse_sync_writes(inode);
@@ -3171,7 +2954,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
struct fuse_file *ff = file->private_data;
struct inode *inode = file_inode(file);
struct fuse_inode *fi = get_fuse_inode(inode);
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
FUSE_ARGS(args);
struct fuse_fallocate_in inarg = {
.fh = ff->fh,
@@ -3181,17 +2964,28 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
};
int err;
bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
- (mode & FALLOC_FL_PUNCH_HOLE);
+ (mode & (FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_ZERO_RANGE));
+
+ bool block_faults = FUSE_IS_DAX(inode) && lock_inode;
- if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_ZERO_RANGE))
return -EOPNOTSUPP;
- if (fc->no_fallocate)
+ if (fm->fc->no_fallocate)
return -EOPNOTSUPP;
if (lock_inode) {
inode_lock(inode);
- if (mode & FALLOC_FL_PUNCH_HOLE) {
+ if (block_faults) {
+ filemap_invalidate_lock(inode->i_mapping);
+ err = fuse_dax_break_layouts(inode, 0, 0);
+ if (err)
+ goto out;
+ }
+
+ if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) {
loff_t endbyte = offset + length - 1;
err = fuse_writeback_range(inode, offset, endbyte);
@@ -3207,6 +3001,10 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
goto out;
}
+ err = file_modified(file);
+ if (err)
+ goto out;
+
if (!(mode & FALLOC_FL_KEEP_SIZE))
set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
@@ -3215,9 +3013,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
args.in_numargs = 1;
args.in_args[0].size = sizeof(inarg);
args.in_args[0].value = &inarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
- fc->no_fallocate = 1;
+ fm->fc->no_fallocate = 1;
err = -EOPNOTSUPP;
}
if (err)
@@ -3225,24 +3023,27 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
/* we could have extended the file */
if (!(mode & FALLOC_FL_KEEP_SIZE)) {
- bool changed = fuse_write_update_size(inode, offset + length);
-
- if (changed && fc->writeback_cache)
+ if (fuse_write_update_attr(inode, offset + length, length))
file_update_time(file);
}
- if (mode & FALLOC_FL_PUNCH_HOLE)
+ if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
truncate_pagecache_range(inode, offset, offset + length - 1);
- fuse_invalidate_attr(inode);
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
out:
if (!(mode & FALLOC_FL_KEEP_SIZE))
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+ if (block_faults)
+ filemap_invalidate_unlock(inode->i_mapping);
+
if (lock_inode)
inode_unlock(inode);
+ fuse_flush_time_update(inode);
+
return err;
}
@@ -3255,7 +3056,8 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
struct fuse_inode *fi_out = get_fuse_inode(inode_out);
- struct fuse_conn *fc = ff_in->fc;
+ struct fuse_mount *fm = ff_in->fm;
+ struct fuse_conn *fc = fm->fc;
FUSE_ARGS(args);
struct fuse_copy_file_range_in inarg = {
.fh_in = ff_in->fh,
@@ -3279,13 +3081,11 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
return -EXDEV;
- if (fc->writeback_cache) {
- inode_lock(inode_in);
- err = fuse_writeback_range(inode_in, pos_in, pos_in + len);
- inode_unlock(inode_in);
- if (err)
- return err;
- }
+ inode_lock(inode_in);
+ err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1);
+ inode_unlock(inode_in);
+ if (err)
+ return err;
inode_lock(inode_out);
@@ -3293,11 +3093,27 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
if (err)
goto out;
- if (fc->writeback_cache) {
- err = fuse_writeback_range(inode_out, pos_out, pos_out + len);
- if (err)
- goto out;
- }
+ /*
+ * Write out dirty pages in the destination file before sending the COPY
+ * request to userspace. After the request is completed, truncate off
+ * pages (including partial ones) from the cache that have been copied,
+ * since these contain stale data at that point.
+ *
+ * This should be mostly correct, but if the COPY writes to partial
+ * pages (at the start or end) and the parts not covered by the COPY are
+ * written through a memory map after calling fuse_writeback_range(),
+ * then these partial page modifications will be lost on truncation.
+ *
+ * It is unlikely that someone would rely on such mixed style
+ * modifications. Yet this does give less guarantees than if the
+ * copying was performed with write(2).
+ *
+ * To fix this a mapping->invalidate_lock could be used to prevent new
+ * faults while the copy is ongoing.
+ */
+ err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1);
+ if (err)
+ goto out;
if (is_unstable)
set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
@@ -3310,7 +3126,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
fc->no_copy_file_range = 1;
err = -EOPNOTSUPP;
@@ -3318,12 +3134,12 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
if (err)
goto out;
- if (fc->writeback_cache) {
- fuse_write_update_size(inode_out, pos_out + outarg.size);
- file_update_time(file_out);
- }
+ truncate_inode_pages_range(inode_out->i_mapping,
+ ALIGN_DOWN(pos_out, PAGE_SIZE),
+ ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1);
- fuse_invalidate_attr(inode_out);
+ file_update_time(file_out);
+ fuse_write_update_attr(inode_out, pos_out + outarg.size, outarg.size);
err = outarg.size;
out:
@@ -3333,6 +3149,8 @@ out:
inode_unlock(inode_out);
file_accessed(file_in);
+ fuse_flush_time_update(inode_out);
+
return err;
}
@@ -3361,6 +3179,7 @@ static const struct file_operations fuse_file_operations = {
.release = fuse_release,
.fsync = fuse_fsync,
.lock = fuse_file_lock,
+ .get_unmapped_area = thp_get_unmapped_area,
.flock = fuse_file_flock,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
@@ -3372,19 +3191,19 @@ static const struct file_operations fuse_file_operations = {
};
static const struct address_space_operations fuse_file_aops = {
- .readpage = fuse_readpage,
+ .read_folio = fuse_read_folio,
+ .readahead = fuse_readahead,
.writepage = fuse_writepage,
.writepages = fuse_writepages,
- .launder_page = fuse_launder_page,
- .readpages = fuse_readpages,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .launder_folio = fuse_launder_folio,
+ .dirty_folio = filemap_dirty_folio,
.bmap = fuse_bmap,
.direct_IO = fuse_direct_IO,
.write_begin = fuse_write_begin,
.write_end = fuse_write_end,
};
-void fuse_init_file_inode(struct inode *inode)
+void fuse_init_file_inode(struct inode *inode, unsigned int flags)
{
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -3395,5 +3214,8 @@ void fuse_init_file_inode(struct inode *inode)
INIT_LIST_HEAD(&fi->queued_writes);
fi->writectr = 0;
init_waitqueue_head(&fi->page_waitq);
- INIT_LIST_HEAD(&fi->writepages);
+ fi->writepages = RB_ROOT;
+
+ if (IS_ENABLED(CONFIG_FUSE_DAX))
+ fuse_dax_inode_init(inode, flags);
}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index ca344bf71404..98a9cf531873 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -111,7 +111,7 @@ struct fuse_inode {
wait_queue_head_t page_waitq;
/* List of writepage requestst (pending or sent) */
- struct list_head writepages;
+ struct rb_root writepages;
};
/* readdir cache (directory only) */
@@ -148,6 +148,13 @@ struct fuse_inode {
/** Lock to protect write related fields */
spinlock_t lock;
+
+#ifdef CONFIG_FUSE_DAX
+ /*
+ * Dax specific inode data
+ */
+ struct fuse_inode_dax *dax;
+#endif
};
/** FUSE inode state bits */
@@ -158,15 +165,18 @@ enum {
FUSE_I_INIT_RDPLUS,
/** An operation changing file size is in progress */
FUSE_I_SIZE_UNSTABLE,
+ /* Bad inode */
+ FUSE_I_BAD,
};
struct fuse_conn;
+struct fuse_mount;
struct fuse_release_args;
/** FUSE specific file data */
struct fuse_file {
/** Fuse connection for this file */
- struct fuse_conn *fc;
+ struct fuse_mount *fm;
/* Argument space reserved for release */
struct fuse_release_args *release_args;
@@ -246,12 +256,14 @@ struct fuse_args {
bool nocreds:1;
bool in_pages:1;
bool out_pages:1;
+ bool user_pages:1;
bool out_argvar:1;
bool page_zeroing:1;
bool page_replace:1;
+ bool may_block:1;
struct fuse_in_arg in_args[3];
struct fuse_arg out_args[2];
- void (*end)(struct fuse_conn *fc, struct fuse_args *args, int error);
+ void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error);
};
struct fuse_args_pages {
@@ -359,6 +371,9 @@ struct fuse_req {
/** virtio-fs's physically contiguous buffer for in and out args */
void *argbuf;
#endif
+
+ /** fuse_mount this request belongs to */
+ struct fuse_mount *fm;
};
struct fuse_iqueue;
@@ -466,8 +481,21 @@ struct fuse_dev {
struct list_head entry;
};
+enum fuse_dax_mode {
+ FUSE_DAX_INODE_DEFAULT, /* default */
+ FUSE_DAX_ALWAYS, /* "-o dax=always" */
+ FUSE_DAX_NEVER, /* "-o dax=never" */
+ FUSE_DAX_INODE_USER, /* "-o dax=inode" */
+};
+
+static inline bool fuse_is_inode_dax_mode(enum fuse_dax_mode mode)
+{
+ return mode == FUSE_DAX_INODE_DEFAULT || mode == FUSE_DAX_INODE_USER;
+}
+
struct fuse_fs_context {
int fd;
+ struct file *file;
unsigned int rootmode;
kuid_t user_id;
kgid_t group_id;
@@ -481,21 +509,32 @@ struct fuse_fs_context {
bool destroy:1;
bool no_control:1;
bool no_force_umount:1;
- bool no_mount_options:1;
+ bool legacy_opts_show:1;
+ enum fuse_dax_mode dax_mode;
unsigned int max_read;
unsigned int blksize;
const char *subtype;
+ /* DAX device, may be NULL */
+ struct dax_device *dax_dev;
+
/* fuse_dev pointer to fill in, should contain NULL on entry */
void **fudptr;
};
+struct fuse_sync_bucket {
+ /* count is a possible scalability bottleneck */
+ atomic_t count;
+ wait_queue_head_t waitq;
+ struct rcu_head rcu;
+};
+
/**
* A Fuse connection.
*
- * This structure is created, when the filesystem is mounted, and is
- * destroyed, when the client device is closed and the filesystem is
- * unmounted.
+ * This structure is created, when the root filesystem is mounted, and
+ * is destroyed, when the client device is closed and the last
+ * fuse_mount is destroyed.
*/
struct fuse_conn {
/** Lock protecting accessess to members of this structure */
@@ -527,9 +566,12 @@ struct fuse_conn {
/** Maximum write size */
unsigned max_write;
- /** Maxmum number of pages that can be used in a single request */
+ /** Maximum number of pages that can be used in a single request */
unsigned int max_pages;
+ /** Constrain ->max_pages to this value during feature negotiation */
+ unsigned int max_pages_limit;
+
/** Input queue */
struct fuse_iqueue iq;
@@ -585,7 +627,7 @@ struct fuse_conn {
/** Connection successful. Only set in INIT */
unsigned conn_init:1;
- /** Do readpages asynchronously? Only set in INIT */
+ /** Do readahead asynchronously? Only set in INIT */
unsigned async_read:1;
/** Return an unique read error after abort. Only set in INIT */
@@ -609,6 +651,17 @@ struct fuse_conn {
/** cache READLINK responses in page cache */
unsigned cache_symlinks:1;
+ /* show legacy mount options */
+ unsigned int legacy_opts_show:1;
+
+ /*
+ * fs kills suid/sgid/cap on write/chown/trunc. suid is killed on
+ * write/trunc only if caller did not have CAP_FSETID. sgid is killed
+ * on write/truncate only if caller did not have CAP_FSETID as well as
+ * file has group execute permission.
+ */
+ unsigned handle_killpriv_v2:1;
+
/*
* The following bitfields are only for optimization purposes
* and hence races in setting them will not cause malfunction
@@ -632,6 +685,9 @@ struct fuse_conn {
/** Is setxattr not implemented by fs? */
unsigned no_setxattr:1;
+ /** Does file server support extended setxattr */
+ unsigned setxattr_ext:1;
+
/** Is getxattr not implemented by fs? */
unsigned no_getxattr:1;
@@ -677,7 +733,7 @@ struct fuse_conn {
/** Use enhanced/automatic page cache invalidation. */
unsigned auto_inval_data:1;
- /** Filesystem is fully reponsible for page cache invalidation. */
+ /** Filesystem is fully responsible for page cache invalidation. */
unsigned explicit_inval_data:1;
/** Does the filesystem support readdirplus? */
@@ -716,8 +772,20 @@ struct fuse_conn {
/** Do not allow MNT_FORCE umount */
unsigned int no_force_umount:1;
- /* Do not show mount options */
- unsigned int no_mount_options:1;
+ /* Auto-mount submounts announced by the server */
+ unsigned int auto_submounts:1;
+
+ /* Propagate syncfs() to server */
+ unsigned int sync_fs:1;
+
+ /* Initialize security xattrs when creating a new inode */
+ unsigned int init_security:1;
+
+ /* Does the filesystem support per inode DAX? */
+ unsigned int inode_dax:1;
+
+ /* Is tmpfile not implemented by fs? */
+ unsigned int no_tmpfile:1;
/** The number of requests waiting for completion */
atomic_t num_waiting;
@@ -725,10 +793,10 @@ struct fuse_conn {
/** Negotiated minor version */
unsigned minor;
- /** Entry on the fuse_conn_list */
+ /** Entry on the fuse_mount_list */
struct list_head entry;
- /** Device ID from super block */
+ /** Device ID from the root super block */
dev_t dev;
/** Dentries in the control filesystem */
@@ -746,24 +814,69 @@ struct fuse_conn {
/** Called on final put */
void (*release)(struct fuse_conn *);
- /** Super block for this connection. */
- struct super_block *sb;
-
- /** Read/write semaphore to hold when accessing sb. */
+ /**
+ * Read/write semaphore to hold when accessing the sb of any
+ * fuse_mount belonging to this connection
+ */
struct rw_semaphore killsb;
/** List of device instances belonging to this connection */
struct list_head devices;
+
+#ifdef CONFIG_FUSE_DAX
+ /* Dax mode */
+ enum fuse_dax_mode dax_mode;
+
+ /* Dax specific conn data, non-NULL if DAX is enabled */
+ struct fuse_conn_dax *dax;
+#endif
+
+ /** List of filesystems using this connection */
+ struct list_head mounts;
+
+ /* New writepages go into this bucket */
+ struct fuse_sync_bucket __rcu *curr_bucket;
};
-static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
+/*
+ * Represents a mounted filesystem, potentially a submount.
+ *
+ * This object allows sharing a fuse_conn between separate mounts to
+ * allow submounts with dedicated superblocks and thus separate device
+ * IDs.
+ */
+struct fuse_mount {
+ /* Underlying (potentially shared) connection to the FUSE server */
+ struct fuse_conn *fc;
+
+ /*
+ * Super block for this connection (fc->killsb must be held when
+ * accessing this).
+ */
+ struct super_block *sb;
+
+ /* Entry on fc->mounts */
+ struct list_head fc_entry;
+};
+
+static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb)
{
return sb->s_fs_info;
}
+static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
+{
+ return get_fuse_mount_super(sb)->fc;
+}
+
+static inline struct fuse_mount *get_fuse_mount(struct inode *inode)
+{
+ return get_fuse_mount_super(inode->i_sb);
+}
+
static inline struct fuse_conn *get_fuse_conn(struct inode *inode)
{
- return get_fuse_conn_super(inode->i_sb);
+ return get_fuse_mount_super(inode->i_sb)->fc;
}
static inline struct fuse_inode *get_fuse_inode(struct inode *inode)
@@ -786,6 +899,55 @@ static inline u64 fuse_get_attr_version(struct fuse_conn *fc)
return atomic64_read(&fc->attr_version);
}
+static inline bool fuse_stale_inode(const struct inode *inode, int generation,
+ struct fuse_attr *attr)
+{
+ return inode->i_generation != generation ||
+ inode_wrong_type(inode, attr->mode);
+}
+
+static inline void fuse_make_bad(struct inode *inode)
+{
+ remove_inode_hash(inode);
+ set_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state);
+}
+
+static inline bool fuse_is_bad(struct inode *inode)
+{
+ return unlikely(test_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state));
+}
+
+static inline struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags,
+ struct fuse_page_desc **desc)
+{
+ struct page **pages;
+
+ pages = kzalloc(npages * (sizeof(struct page *) +
+ sizeof(struct fuse_page_desc)), flags);
+ *desc = (void *) (pages + npages);
+
+ return pages;
+}
+
+static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs,
+ unsigned int index,
+ unsigned int nr_pages)
+{
+ int i;
+
+ for (i = index; i < index + nr_pages; i++)
+ descs[i].length = PAGE_SIZE - descs[i].offset;
+}
+
+static inline void fuse_sync_bucket_dec(struct fuse_sync_bucket *bucket)
+{
+ /* Need RCU protection to prevent use after free after the decrement */
+ rcu_read_lock();
+ if (atomic_dec_and_test(&bucket->count))
+ wake_up(&bucket->waitq);
+ rcu_read_unlock();
+}
+
/** Device operations */
extern const struct file_operations fuse_dev_operations;
@@ -793,11 +955,6 @@ extern const struct dentry_operations fuse_dentry_operations;
extern const struct dentry_operations fuse_root_dentry_operations;
/**
- * Inode to nodeid comparison.
- */
-int fuse_inode_eq(struct inode *inode, void *_nodeidp);
-
-/**
* Get a filled in inode
*/
struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
@@ -831,6 +988,7 @@ struct fuse_io_args {
struct {
struct fuse_write_in in;
struct fuse_write_out out;
+ bool page_locked;
} write;
};
struct fuse_args_pages ap;
@@ -847,11 +1005,12 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
*/
int fuse_open_common(struct inode *inode, struct file *file, bool isdir);
-struct fuse_file *fuse_file_alloc(struct fuse_conn *fc);
+struct fuse_file *fuse_file_alloc(struct fuse_mount *fm);
void fuse_file_free(struct fuse_file *ff);
void fuse_finish_open(struct inode *inode, struct file *file);
-void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags);
+void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff,
+ unsigned int flags);
/**
* Send RELEASE or RELEASEDIR request
@@ -873,7 +1032,7 @@ int fuse_notify_poll_wakeup(struct fuse_conn *fc,
/**
* Initialize file operations on a regular file
*/
-void fuse_init_file_inode(struct inode *inode);
+void fuse_init_file_inode(struct inode *inode, unsigned int flags);
/**
* Initialize inode operations on regular files and special files
@@ -897,7 +1056,9 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
u64 attr_valid, u64 attr_version);
void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
- u64 attr_valid);
+ u64 attr_valid, u32 cache_mask);
+
+u32 fuse_get_cache_mask(struct inode *inode);
/**
* Initialize the client device
@@ -915,14 +1076,14 @@ void __exit fuse_ctl_cleanup(void);
/**
* Simple request sending that does request allocation and freeing
*/
-ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args);
-int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args,
+ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args);
+int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args,
gfp_t gfp_flags);
/**
* End a finished request
*/
-void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_request_end(struct fuse_req *req);
/* Abort all requests */
void fuse_abort_conn(struct fuse_conn *fc);
@@ -931,7 +1092,15 @@ void fuse_wait_aborted(struct fuse_conn *fc);
/**
* Invalidate inode attributes
*/
+
+/* Attributes possibly changed on data modification */
+#define FUSE_STATX_MODIFY (STATX_MTIME | STATX_CTIME | STATX_BLOCKS)
+
+/* Attributes possibly changed on data and/or size modification */
+#define FUSE_STATX_MODSIZE (FUSE_STATX_MODIFY | STATX_SIZE)
+
void fuse_invalidate_attr(struct inode *inode);
+void fuse_invalidate_attr_mask(struct inode *inode, u32 mask);
void fuse_invalidate_entry_cache(struct dentry *entry);
@@ -948,7 +1117,8 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
/**
* Initialize fuse_conn
*/
-void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
+void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
+ struct user_namespace *user_ns,
const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv);
/**
@@ -960,7 +1130,7 @@ struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc);
struct fuse_dev *fuse_dev_alloc(void);
void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc);
void fuse_dev_free(struct fuse_dev *fud);
-void fuse_send_init(struct fuse_conn *fc);
+void fuse_send_init(struct fuse_mount *fm);
/**
* Fill in superblock and initialize fuse connection
@@ -969,12 +1139,25 @@ void fuse_send_init(struct fuse_conn *fc);
*/
int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx);
-/**
- * Disassociate fuse connection from superblock and kill the superblock
+/*
+ * Remove the mount from the connection
*
- * Calls kill_anon_super(), do not use with bdev mounts.
+ * Returns whether this was the last mount
+ */
+bool fuse_mount_remove(struct fuse_mount *fm);
+
+/*
+ * Setup context ops for submounts
+ */
+int fuse_init_fs_context_submount(struct fs_context *fsc);
+
+/*
+ * Shut down the connection (possibly sending DESTROY request).
*/
-void fuse_kill_sb_anon(struct super_block *sb);
+void fuse_conn_destroy(struct fuse_mount *fm);
+
+/* Drop the connection and free the fuse mount */
+void fuse_mount_destroy(struct fuse_mount *fm);
/**
* Add connection to control filesystem
@@ -1000,9 +1183,10 @@ int fuse_allow_current_process(struct fuse_conn *fc);
u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id);
+void fuse_flush_time_update(struct inode *inode);
void fuse_update_ctime(struct inode *inode);
-int fuse_update_attributes(struct inode *inode, struct file *file);
+int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask);
void fuse_flush_writepages(struct inode *inode);
@@ -1010,9 +1194,19 @@ void fuse_set_nowrite(struct inode *inode);
void fuse_release_nowrite(struct inode *inode);
/**
+ * Scan all fuse_mounts belonging to fc to find the first where
+ * ilookup5() returns a result. Return that result and the
+ * respective fuse_mount in *fm (unless fm is NULL).
+ *
+ * The caller must hold fc->killsb.
+ */
+struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid,
+ struct fuse_mount **fm);
+
+/**
* File-system tells the kernel to invalidate cache for the given node id.
*/
-int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
+int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
loff_t offset, loff_t len);
/**
@@ -1025,10 +1219,10 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
* - is a file or oan empty directory
* then the dentry is unhashed (d_delete()).
*/
-int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
+int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
u64 child_nodeid, struct qstr *name);
-int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
+int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
bool isdir);
/**
@@ -1050,7 +1244,7 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd,
__poll_t fuse_file_poll(struct file *file, poll_table *wait);
int fuse_dev_release(struct inode *inode, struct file *file);
-bool fuse_write_update_size(struct inode *inode, loff_t pos);
+bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written);
int fuse_flush_times(struct inode *inode, struct fuse_file *ff);
int fuse_write_inode(struct inode *inode, struct writeback_control *wbc);
@@ -1064,7 +1258,7 @@ void fuse_unlock_inode(struct inode *inode, bool locked);
bool fuse_lock_inode(struct inode *inode);
int fuse_setxattr(struct inode *inode, const char *name, const void *value,
- size_t size, int flags);
+ size_t size, int flags, unsigned int extra_flags);
ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value,
size_t size);
ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size);
@@ -1074,9 +1268,9 @@ extern const struct xattr_handler *fuse_acl_xattr_handlers[];
extern const struct xattr_handler *fuse_no_acl_xattr_handlers[];
struct posix_acl;
-struct posix_acl *fuse_get_acl(struct inode *inode, int type);
-int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-
+struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu);
+int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ struct posix_acl *acl, int type);
/* readdir.c */
int fuse_readdir(struct file *file, struct dir_context *ctx);
@@ -1092,4 +1286,37 @@ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args);
u64 fuse_get_unique(struct fuse_iqueue *fiq);
void fuse_free_conn(struct fuse_conn *fc);
+/* dax.c */
+
+#define FUSE_IS_DAX(inode) (IS_ENABLED(CONFIG_FUSE_DAX) && IS_DAX(inode))
+
+ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to);
+ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from);
+int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma);
+int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, u64 dmap_end);
+int fuse_dax_conn_alloc(struct fuse_conn *fc, enum fuse_dax_mode mode,
+ struct dax_device *dax_dev);
+void fuse_dax_conn_free(struct fuse_conn *fc);
+bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi);
+void fuse_dax_inode_init(struct inode *inode, unsigned int flags);
+void fuse_dax_inode_cleanup(struct inode *inode);
+void fuse_dax_dontcache(struct inode *inode, unsigned int flags);
+bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment);
+void fuse_dax_cancel_work(struct fuse_conn *fc);
+
+/* ioctl.c */
+long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg);
+int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int fuse_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa);
+
+/* file.c */
+
+struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
+ unsigned int open_flags, bool isdir);
+void fuse_file_release(struct inode *inode, struct fuse_file *ff,
+ unsigned int open_flags, fl_owner_t id, bool isdir);
+
#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 95d712d44ca1..6b3beda16c1b 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -23,6 +23,7 @@
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
#include <linux/pid_namespace.h>
+#include <uapi/linux/magic.h>
MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
MODULE_DESCRIPTION("Filesystem in Userspace");
@@ -50,8 +51,6 @@ MODULE_PARM_DESC(max_user_congthresh,
"Global limit for the maximum congestion threshold an "
"unprivileged user can set");
-#define FUSE_SUPER_MAGIC 0x65735546
-
#define FUSE_DEFAULT_BLKSIZE 512
/** Maximum number of outstanding background requests */
@@ -73,7 +72,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
{
struct fuse_inode *fi;
- fi = kmem_cache_alloc(fuse_inode_cachep, GFP_KERNEL);
+ fi = alloc_inode_sb(sb, fuse_inode_cachep, GFP_KERNEL);
if (!fi)
return NULL;
@@ -87,12 +86,19 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
mutex_init(&fi->mutex);
spin_lock_init(&fi->lock);
fi->forget = fuse_alloc_forget();
- if (!fi->forget) {
- kmem_cache_free(fuse_inode_cachep, fi);
- return NULL;
- }
+ if (!fi->forget)
+ goto out_free;
+
+ if (IS_ENABLED(CONFIG_FUSE_DAX) && !fuse_dax_inode_alloc(sb, fi))
+ goto out_free_forget;
return &fi->inode;
+
+out_free_forget:
+ kfree(fi->forget);
+out_free:
+ kmem_cache_free(fuse_inode_cachep, fi);
+ return NULL;
}
static void fuse_free_inode(struct inode *inode)
@@ -101,6 +107,9 @@ static void fuse_free_inode(struct inode *inode)
mutex_destroy(&fi->mutex);
kfree(fi->forget);
+#ifdef CONFIG_FUSE_DAX
+ kfree(fi->dax);
+#endif
kmem_cache_free(fuse_inode_cachep, fi);
}
@@ -108,23 +117,34 @@ static void fuse_evict_inode(struct inode *inode)
{
struct fuse_inode *fi = get_fuse_inode(inode);
+ /* Will write inode on close/munmap and in all other dirtiers */
+ WARN_ON(inode->i_state & I_DIRTY_INODE);
+
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
if (inode->i_sb->s_flags & SB_ACTIVE) {
struct fuse_conn *fc = get_fuse_conn(inode);
- fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup);
- fi->forget = NULL;
+
+ if (FUSE_IS_DAX(inode))
+ fuse_dax_inode_cleanup(inode);
+ if (fi->nlookup) {
+ fuse_queue_forget(fc, fi->forget, fi->nodeid,
+ fi->nlookup);
+ fi->forget = NULL;
+ }
}
- if (S_ISREG(inode->i_mode) && !is_bad_inode(inode)) {
+ if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) {
WARN_ON(!list_empty(&fi->write_files));
WARN_ON(!list_empty(&fi->queued_writes));
}
}
-static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
+static int fuse_reconfigure(struct fs_context *fsc)
{
+ struct super_block *sb = fsc->root->d_sb;
+
sync_filesystem(sb);
- if (*flags & SB_MANDLOCK)
+ if (fsc->sb_flags & SB_MANDLOCK)
return -EINVAL;
return 0;
@@ -143,7 +163,7 @@ static ino_t fuse_squash_ino(u64 ino64)
}
void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
- u64 attr_valid)
+ u64 attr_valid, u32 cache_mask)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -160,12 +180,20 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
inode->i_uid = make_kuid(fc->user_ns, attr->uid);
inode->i_gid = make_kgid(fc->user_ns, attr->gid);
inode->i_blocks = attr->blocks;
+
+ /* Sanitize nsecs */
+ attr->atimensec = min_t(u32, attr->atimensec, NSEC_PER_SEC - 1);
+ attr->mtimensec = min_t(u32, attr->mtimensec, NSEC_PER_SEC - 1);
+ attr->ctimensec = min_t(u32, attr->ctimensec, NSEC_PER_SEC - 1);
+
inode->i_atime.tv_sec = attr->atime;
inode->i_atime.tv_nsec = attr->atimensec;
/* mtime from server may be stale due to local buffered write */
- if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) {
+ if (!(cache_mask & STATX_MTIME)) {
inode->i_mtime.tv_sec = attr->mtime;
inode->i_mtime.tv_nsec = attr->mtimensec;
+ }
+ if (!(cache_mask & STATX_CTIME)) {
inode->i_ctime.tv_sec = attr->ctime;
inode->i_ctime.tv_nsec = attr->ctimensec;
}
@@ -185,6 +213,26 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
inode->i_mode &= ~S_ISVTX;
fi->orig_ino = attr->ino;
+
+ /*
+ * We are refreshing inode data and it is possible that another
+ * client set suid/sgid or security.capability xattr. So clear
+ * S_NOSEC. Ideally, we could have cleared it only if suid/sgid
+ * was set or if security.capability xattr was set. But we don't
+ * know if security.capability has been set or not. So clear it
+ * anyway. Its less efficient but should be safe.
+ */
+ inode->i_flags &= ~S_NOSEC;
+}
+
+u32 fuse_get_cache_mask(struct inode *inode)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ if (!fc->writeback_cache || !S_ISREG(inode->i_mode))
+ return 0;
+
+ return STATX_MTIME | STATX_CTIME | STATX_SIZE;
}
void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
@@ -192,11 +240,29 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
- bool is_wb = fc->writeback_cache;
+ u32 cache_mask;
loff_t oldsize;
struct timespec64 old_mtime;
spin_lock(&fi->lock);
+ /*
+ * In case of writeback_cache enabled, writes update mtime, ctime and
+ * may update i_size. In these cases trust the cached value in the
+ * inode.
+ */
+ cache_mask = fuse_get_cache_mask(inode);
+ if (cache_mask & STATX_SIZE)
+ attr->size = i_size_read(inode);
+
+ if (cache_mask & STATX_MTIME) {
+ attr->mtime = inode->i_mtime.tv_sec;
+ attr->mtimensec = inode->i_mtime.tv_nsec;
+ }
+ if (cache_mask & STATX_CTIME) {
+ attr->ctime = inode->i_ctime.tv_sec;
+ attr->ctimensec = inode->i_ctime.tv_nsec;
+ }
+
if ((attr_version != 0 && fi->attr_version > attr_version) ||
test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
spin_unlock(&fi->lock);
@@ -204,7 +270,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
}
old_mtime = inode->i_mtime;
- fuse_change_attributes_common(inode, attr, attr_valid);
+ fuse_change_attributes_common(inode, attr, attr_valid, cache_mask);
oldsize = inode->i_size;
/*
@@ -212,11 +278,11 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
* extend local i_size without keeping userspace server in sync. So,
* attr->size coming from server can be stale. We cannot trust it.
*/
- if (!is_wb || !S_ISREG(inode->i_mode))
+ if (!(cache_mask & STATX_SIZE))
i_size_write(inode, attr->size);
spin_unlock(&fi->lock);
- if (!is_wb && S_ISREG(inode->i_mode)) {
+ if (!cache_mask && S_ISREG(inode->i_mode)) {
bool inval = false;
if (oldsize != attr->size) {
@@ -240,6 +306,9 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
if (inval)
invalidate_inode_pages2(inode->i_mapping);
}
+
+ if (IS_ENABLED(CONFIG_FUSE_DAX))
+ fuse_dax_dontcache(inode, attr->flags);
}
static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
@@ -252,7 +321,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
inode->i_ctime.tv_nsec = attr->ctimensec;
if (S_ISREG(inode->i_mode)) {
fuse_init_common(inode);
- fuse_init_file_inode(inode);
+ fuse_init_file_inode(inode, attr->flags);
} else if (S_ISDIR(inode->i_mode))
fuse_init_dir(inode);
else if (S_ISLNK(inode->i_mode))
@@ -266,7 +335,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
BUG();
}
-int fuse_inode_eq(struct inode *inode, void *_nodeidp)
+static int fuse_inode_eq(struct inode *inode, void *_nodeidp)
{
u64 nodeid = *(u64 *) _nodeidp;
if (get_node_id(inode) == nodeid)
@@ -290,7 +359,26 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
struct fuse_inode *fi;
struct fuse_conn *fc = get_fuse_conn_super(sb);
- retry:
+ /*
+ * Auto mount points get their node id from the submount root, which is
+ * not a unique identifier within this filesystem.
+ *
+ * To avoid conflicts, do not place submount points into the inode hash
+ * table.
+ */
+ if (fc->auto_submounts && (attr->flags & FUSE_ATTR_SUBMOUNT) &&
+ S_ISDIR(attr->mode)) {
+ inode = new_inode(sb);
+ if (!inode)
+ return NULL;
+
+ fuse_init_inode(inode, attr);
+ get_fuse_inode(inode)->nodeid = nodeid;
+ inode->i_flags |= S_AUTOMOUNT;
+ goto done;
+ }
+
+retry:
inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid);
if (!inode)
return NULL;
@@ -302,13 +390,13 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
inode->i_generation = generation;
fuse_init_inode(inode, attr);
unlock_new_inode(inode);
- } else if ((inode->i_mode ^ attr->mode) & S_IFMT) {
- /* Inode has changed type, any I/O on the old should fail */
- make_bad_inode(inode);
+ } else if (fuse_stale_inode(inode, generation, attr)) {
+ /* nodeid was reused, any I/O on the old inode should fail */
+ fuse_make_bad(inode);
iput(inode);
goto retry;
}
-
+done:
fi = get_fuse_inode(inode);
spin_lock(&fi->lock);
fi->nlookup++;
@@ -318,17 +406,45 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
return inode;
}
-int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
+struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid,
+ struct fuse_mount **fm)
+{
+ struct fuse_mount *fm_iter;
+ struct inode *inode;
+
+ WARN_ON(!rwsem_is_locked(&fc->killsb));
+ list_for_each_entry(fm_iter, &fc->mounts, fc_entry) {
+ if (!fm_iter->sb)
+ continue;
+
+ inode = ilookup5(fm_iter->sb, nodeid, fuse_inode_eq, &nodeid);
+ if (inode) {
+ if (fm)
+ *fm = fm_iter;
+ return inode;
+ }
+ }
+
+ return NULL;
+}
+
+int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
loff_t offset, loff_t len)
{
+ struct fuse_inode *fi;
struct inode *inode;
pgoff_t pg_start;
pgoff_t pg_end;
- inode = ilookup5(sb, nodeid, fuse_inode_eq, &nodeid);
+ inode = fuse_ilookup(fc, nodeid, NULL);
if (!inode)
return -ENOENT;
+ fi = get_fuse_inode(inode);
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
+ spin_unlock(&fi->lock);
+
fuse_invalidate_attr(inode);
forget_all_cached_acls(inode);
if (offset >= 0) {
@@ -366,34 +482,28 @@ static void fuse_umount_begin(struct super_block *sb)
{
struct fuse_conn *fc = get_fuse_conn_super(sb);
- if (!fc->no_force_umount)
- fuse_abort_conn(fc);
+ if (fc->no_force_umount)
+ return;
+
+ fuse_abort_conn(fc);
+
+ // Only retire block-device-based superblocks.
+ if (sb->s_bdev != NULL)
+ retire_super(sb);
}
-static void fuse_send_destroy(struct fuse_conn *fc)
+static void fuse_send_destroy(struct fuse_mount *fm)
{
- if (fc->conn_init) {
+ if (fm->fc->conn_init) {
FUSE_ARGS(args);
args.opcode = FUSE_DESTROY;
args.force = true;
args.nocreds = true;
- fuse_simple_request(fc, &args);
+ fuse_simple_request(fm, &args);
}
}
-static void fuse_put_super(struct super_block *sb)
-{
- struct fuse_conn *fc = get_fuse_conn_super(sb);
-
- mutex_lock(&fuse_mutex);
- list_del(&fc->entry);
- fuse_ctl_remove_conn(fc);
- mutex_unlock(&fuse_mutex);
-
- fuse_conn_put(fc);
-}
-
static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr)
{
stbuf->f_type = FUSE_SUPER_MAGIC;
@@ -411,12 +521,12 @@ static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr
static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
- struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
FUSE_ARGS(args);
struct fuse_statfs_out outarg;
int err;
- if (!fuse_allow_current_process(fc)) {
+ if (!fuse_allow_current_process(fm->fc)) {
buf->f_type = FUSE_SUPER_MAGIC;
return 0;
}
@@ -428,12 +538,104 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err)
convert_fuse_statfs(buf, &outarg.st);
return err;
}
+static struct fuse_sync_bucket *fuse_sync_bucket_alloc(void)
+{
+ struct fuse_sync_bucket *bucket;
+
+ bucket = kzalloc(sizeof(*bucket), GFP_KERNEL | __GFP_NOFAIL);
+ if (bucket) {
+ init_waitqueue_head(&bucket->waitq);
+ /* Initial active count */
+ atomic_set(&bucket->count, 1);
+ }
+ return bucket;
+}
+
+static void fuse_sync_fs_writes(struct fuse_conn *fc)
+{
+ struct fuse_sync_bucket *bucket, *new_bucket;
+ int count;
+
+ new_bucket = fuse_sync_bucket_alloc();
+ spin_lock(&fc->lock);
+ bucket = rcu_dereference_protected(fc->curr_bucket, 1);
+ count = atomic_read(&bucket->count);
+ WARN_ON(count < 1);
+ /* No outstanding writes? */
+ if (count == 1) {
+ spin_unlock(&fc->lock);
+ kfree(new_bucket);
+ return;
+ }
+
+ /*
+ * Completion of new bucket depends on completion of this bucket, so add
+ * one more count.
+ */
+ atomic_inc(&new_bucket->count);
+ rcu_assign_pointer(fc->curr_bucket, new_bucket);
+ spin_unlock(&fc->lock);
+ /*
+ * Drop initial active count. At this point if all writes in this and
+ * ancestor buckets complete, the count will go to zero and this task
+ * will be woken up.
+ */
+ atomic_dec(&bucket->count);
+
+ wait_event(bucket->waitq, atomic_read(&bucket->count) == 0);
+
+ /* Drop temp count on descendant bucket */
+ fuse_sync_bucket_dec(new_bucket);
+ kfree_rcu(bucket, rcu);
+}
+
+static int fuse_sync_fs(struct super_block *sb, int wait)
+{
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ struct fuse_conn *fc = fm->fc;
+ struct fuse_syncfs_in inarg;
+ FUSE_ARGS(args);
+ int err;
+
+ /*
+ * Userspace cannot handle the wait == 0 case. Avoid a
+ * gratuitous roundtrip.
+ */
+ if (!wait)
+ return 0;
+
+ /* The filesystem is being unmounted. Nothing to do. */
+ if (!sb->s_root)
+ return 0;
+
+ if (!fc->sync_fs)
+ return 0;
+
+ fuse_sync_fs_writes(fc);
+
+ memset(&inarg, 0, sizeof(inarg));
+ args.in_numargs = 1;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ args.opcode = FUSE_SYNCFS;
+ args.nodeid = get_node_id(sb->s_root->d_inode);
+ args.out_numargs = 0;
+
+ err = fuse_simple_request(fm, &args);
+ if (err == -ENOSYS) {
+ fc->sync_fs = 0;
+ err = 0;
+ }
+
+ return err;
+}
+
enum {
OPT_SOURCE,
OPT_SUBTYPE,
@@ -462,27 +664,38 @@ static const struct fs_parameter_spec fuse_fs_parameters[] = {
{}
};
-static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param)
+static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param)
{
struct fs_parse_result result;
- struct fuse_fs_context *ctx = fc->fs_private;
+ struct fuse_fs_context *ctx = fsc->fs_private;
int opt;
- opt = fs_parse(fc, fuse_fs_parameters, param, &result);
+ if (fsc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ /*
+ * Ignore options coming from mount(MS_REMOUNT) for backward
+ * compatibility.
+ */
+ if (fsc->oldapi)
+ return 0;
+
+ return invalfc(fsc, "No changes allowed in reconfigure");
+ }
+
+ opt = fs_parse(fsc, fuse_fs_parameters, param, &result);
if (opt < 0)
return opt;
switch (opt) {
case OPT_SOURCE:
- if (fc->source)
- return invalfc(fc, "Multiple sources specified");
- fc->source = param->string;
+ if (fsc->source)
+ return invalfc(fsc, "Multiple sources specified");
+ fsc->source = param->string;
param->string = NULL;
break;
case OPT_SUBTYPE:
if (ctx->subtype)
- return invalfc(fc, "Multiple subtypes specified");
+ return invalfc(fsc, "Multiple subtypes specified");
ctx->subtype = param->string;
param->string = NULL;
return 0;
@@ -494,22 +707,22 @@ static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param)
case OPT_ROOTMODE:
if (!fuse_valid_type(result.uint_32))
- return invalfc(fc, "Invalid rootmode");
+ return invalfc(fsc, "Invalid rootmode");
ctx->rootmode = result.uint_32;
ctx->rootmode_present = true;
break;
case OPT_USER_ID:
- ctx->user_id = make_kuid(fc->user_ns, result.uint_32);
+ ctx->user_id = make_kuid(fsc->user_ns, result.uint_32);
if (!uid_valid(ctx->user_id))
- return invalfc(fc, "Invalid user_id");
+ return invalfc(fsc, "Invalid user_id");
ctx->user_id_present = true;
break;
case OPT_GROUP_ID:
- ctx->group_id = make_kgid(fc->user_ns, result.uint_32);
+ ctx->group_id = make_kgid(fsc->user_ns, result.uint_32);
if (!gid_valid(ctx->group_id))
- return invalfc(fc, "Invalid group_id");
+ return invalfc(fsc, "Invalid group_id");
ctx->group_id_present = true;
break;
@@ -527,7 +740,7 @@ static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param)
case OPT_BLKSIZE:
if (!ctx->is_bdev)
- return invalfc(fc, "blksize only supported for fuseblk");
+ return invalfc(fsc, "blksize only supported for fuseblk");
ctx->blksize = result.uint_32;
break;
@@ -538,9 +751,9 @@ static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param)
return 0;
}
-static void fuse_free_fc(struct fs_context *fc)
+static void fuse_free_fsc(struct fs_context *fsc)
{
- struct fuse_fs_context *ctx = fc->fs_private;
+ struct fuse_fs_context *ctx = fsc->fs_private;
if (ctx) {
kfree(ctx->subtype);
@@ -553,19 +766,29 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
struct super_block *sb = root->d_sb;
struct fuse_conn *fc = get_fuse_conn_super(sb);
- if (fc->no_mount_options)
- return 0;
+ if (fc->legacy_opts_show) {
+ seq_printf(m, ",user_id=%u",
+ from_kuid_munged(fc->user_ns, fc->user_id));
+ seq_printf(m, ",group_id=%u",
+ from_kgid_munged(fc->user_ns, fc->group_id));
+ if (fc->default_permissions)
+ seq_puts(m, ",default_permissions");
+ if (fc->allow_other)
+ seq_puts(m, ",allow_other");
+ if (fc->max_read != ~0)
+ seq_printf(m, ",max_read=%u", fc->max_read);
+ if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE)
+ seq_printf(m, ",blksize=%lu", sb->s_blocksize);
+ }
+#ifdef CONFIG_FUSE_DAX
+ if (fc->dax_mode == FUSE_DAX_ALWAYS)
+ seq_puts(m, ",dax=always");
+ else if (fc->dax_mode == FUSE_DAX_NEVER)
+ seq_puts(m, ",dax=never");
+ else if (fc->dax_mode == FUSE_DAX_INODE_USER)
+ seq_puts(m, ",dax=inode");
+#endif
- seq_printf(m, ",user_id=%u", from_kuid_munged(fc->user_ns, fc->user_id));
- seq_printf(m, ",group_id=%u", from_kgid_munged(fc->user_ns, fc->group_id));
- if (fc->default_permissions)
- seq_puts(m, ",default_permissions");
- if (fc->allow_other)
- seq_puts(m, ",allow_other");
- if (fc->max_read != ~0)
- seq_printf(m, ",max_read=%u", fc->max_read);
- if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE)
- seq_printf(m, ",blksize=%lu", sb->s_blocksize);
return 0;
}
@@ -595,7 +818,8 @@ static void fuse_pqueue_init(struct fuse_pqueue *fpq)
fpq->connected = 1;
}
-void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
+void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
+ struct user_namespace *user_ns,
const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv)
{
memset(fc, 0, sizeof(*fc));
@@ -622,6 +846,11 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
fc->pid_ns = get_pid_ns(task_active_pid_ns(current));
fc->user_ns = get_user_ns(user_ns);
fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ;
+ fc->max_pages_limit = FUSE_MAX_MAX_PAGES;
+
+ INIT_LIST_HEAD(&fc->mounts);
+ list_add(&fm->fc_entry, &fc->mounts);
+ fm->fc = fc;
}
EXPORT_SYMBOL_GPL(fuse_conn_init);
@@ -629,11 +858,19 @@ void fuse_conn_put(struct fuse_conn *fc)
{
if (refcount_dec_and_test(&fc->count)) {
struct fuse_iqueue *fiq = &fc->iq;
+ struct fuse_sync_bucket *bucket;
+ if (IS_ENABLED(CONFIG_FUSE_DAX))
+ fuse_dax_conn_free(fc);
if (fiq->ops->release)
fiq->ops->release(fiq);
put_pid_ns(fc->pid_ns);
put_user_ns(fc->user_ns);
+ bucket = rcu_dereference_protected(fc->curr_bucket, 1);
+ if (bucket) {
+ WARN_ON(atomic_read(&bucket->count) != 1);
+ kfree(bucket);
+ }
fc->release(fc);
}
}
@@ -776,14 +1013,13 @@ static struct dentry *fuse_get_parent(struct dentry *child)
struct inode *inode;
struct dentry *parent;
struct fuse_entry_out outarg;
- const struct qstr name = QSTR_INIT("..", 2);
int err;
if (!fc->export_support)
return ERR_PTR(-ESTALE);
err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode),
- &name, &outarg, &inode);
+ &dotdot_name, &outarg, &inode);
if (err) {
if (err == -ENOENT)
return ERR_PTR(-ESTALE);
@@ -810,10 +1046,9 @@ static const struct super_operations fuse_super_operations = {
.evict_inode = fuse_evict_inode,
.write_inode = fuse_write_inode,
.drop_inode = generic_delete_inode,
- .remount_fs = fuse_remount_fs,
- .put_super = fuse_put_super,
.umount_begin = fuse_umount_begin,
.statfs = fuse_statfs,
+ .sync_fs = fuse_sync_fs,
.show_options = fuse_show_options,
};
@@ -876,84 +1111,104 @@ struct fuse_init_args {
struct fuse_init_out out;
};
-static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args,
+static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
int error)
{
+ struct fuse_conn *fc = fm->fc;
struct fuse_init_args *ia = container_of(args, typeof(*ia), args);
struct fuse_init_out *arg = &ia->out;
+ bool ok = true;
if (error || arg->major != FUSE_KERNEL_VERSION)
- fc->conn_error = 1;
+ ok = false;
else {
unsigned long ra_pages;
process_init_limits(fc, arg);
if (arg->minor >= 6) {
+ u64 flags = arg->flags | (u64) arg->flags2 << 32;
+
ra_pages = arg->max_readahead / PAGE_SIZE;
- if (arg->flags & FUSE_ASYNC_READ)
+ if (flags & FUSE_ASYNC_READ)
fc->async_read = 1;
- if (!(arg->flags & FUSE_POSIX_LOCKS))
+ if (!(flags & FUSE_POSIX_LOCKS))
fc->no_lock = 1;
if (arg->minor >= 17) {
- if (!(arg->flags & FUSE_FLOCK_LOCKS))
+ if (!(flags & FUSE_FLOCK_LOCKS))
fc->no_flock = 1;
} else {
- if (!(arg->flags & FUSE_POSIX_LOCKS))
+ if (!(flags & FUSE_POSIX_LOCKS))
fc->no_flock = 1;
}
- if (arg->flags & FUSE_ATOMIC_O_TRUNC)
+ if (flags & FUSE_ATOMIC_O_TRUNC)
fc->atomic_o_trunc = 1;
if (arg->minor >= 9) {
/* LOOKUP has dependency on proto version */
- if (arg->flags & FUSE_EXPORT_SUPPORT)
+ if (flags & FUSE_EXPORT_SUPPORT)
fc->export_support = 1;
}
- if (arg->flags & FUSE_BIG_WRITES)
+ if (flags & FUSE_BIG_WRITES)
fc->big_writes = 1;
- if (arg->flags & FUSE_DONT_MASK)
+ if (flags & FUSE_DONT_MASK)
fc->dont_mask = 1;
- if (arg->flags & FUSE_AUTO_INVAL_DATA)
+ if (flags & FUSE_AUTO_INVAL_DATA)
fc->auto_inval_data = 1;
- else if (arg->flags & FUSE_EXPLICIT_INVAL_DATA)
+ else if (flags & FUSE_EXPLICIT_INVAL_DATA)
fc->explicit_inval_data = 1;
- if (arg->flags & FUSE_DO_READDIRPLUS) {
+ if (flags & FUSE_DO_READDIRPLUS) {
fc->do_readdirplus = 1;
- if (arg->flags & FUSE_READDIRPLUS_AUTO)
+ if (flags & FUSE_READDIRPLUS_AUTO)
fc->readdirplus_auto = 1;
}
- if (arg->flags & FUSE_ASYNC_DIO)
+ if (flags & FUSE_ASYNC_DIO)
fc->async_dio = 1;
- if (arg->flags & FUSE_WRITEBACK_CACHE)
+ if (flags & FUSE_WRITEBACK_CACHE)
fc->writeback_cache = 1;
- if (arg->flags & FUSE_PARALLEL_DIROPS)
+ if (flags & FUSE_PARALLEL_DIROPS)
fc->parallel_dirops = 1;
- if (arg->flags & FUSE_HANDLE_KILLPRIV)
+ if (flags & FUSE_HANDLE_KILLPRIV)
fc->handle_killpriv = 1;
if (arg->time_gran && arg->time_gran <= 1000000000)
- fc->sb->s_time_gran = arg->time_gran;
- if ((arg->flags & FUSE_POSIX_ACL)) {
+ fm->sb->s_time_gran = arg->time_gran;
+ if ((flags & FUSE_POSIX_ACL)) {
fc->default_permissions = 1;
fc->posix_acl = 1;
- fc->sb->s_xattr = fuse_acl_xattr_handlers;
+ fm->sb->s_xattr = fuse_acl_xattr_handlers;
}
- if (arg->flags & FUSE_CACHE_SYMLINKS)
+ if (flags & FUSE_CACHE_SYMLINKS)
fc->cache_symlinks = 1;
- if (arg->flags & FUSE_ABORT_ERROR)
+ if (flags & FUSE_ABORT_ERROR)
fc->abort_err = 1;
- if (arg->flags & FUSE_MAX_PAGES) {
+ if (flags & FUSE_MAX_PAGES) {
fc->max_pages =
- min_t(unsigned int, FUSE_MAX_MAX_PAGES,
+ min_t(unsigned int, fc->max_pages_limit,
max_t(unsigned int, arg->max_pages, 1));
}
+ if (IS_ENABLED(CONFIG_FUSE_DAX)) {
+ if (flags & FUSE_MAP_ALIGNMENT &&
+ !fuse_dax_check_alignment(fc, arg->map_alignment)) {
+ ok = false;
+ }
+ if (flags & FUSE_HAS_INODE_DAX)
+ fc->inode_dax = 1;
+ }
+ if (flags & FUSE_HANDLE_KILLPRIV_V2) {
+ fc->handle_killpriv_v2 = 1;
+ fm->sb->s_flags |= SB_NOSEC;
+ }
+ if (flags & FUSE_SETXATTR_EXT)
+ fc->setxattr_ext = 1;
+ if (flags & FUSE_SECURITY_CTX)
+ fc->init_security = 1;
} else {
ra_pages = fc->max_read / PAGE_SIZE;
fc->no_lock = 1;
fc->no_flock = 1;
}
- fc->sb->s_bdi->ra_pages =
- min(fc->sb->s_bdi->ra_pages, ra_pages);
+ fm->sb->s_bdi->ra_pages =
+ min(fm->sb->s_bdi->ra_pages, ra_pages);
fc->minor = arg->minor;
fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
fc->max_write = max_t(unsigned, 4096, fc->max_write);
@@ -961,20 +1216,26 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args,
}
kfree(ia);
+ if (!ok) {
+ fc->conn_init = 0;
+ fc->conn_error = 1;
+ }
+
fuse_set_initialized(fc);
wake_up_all(&fc->blocked_waitq);
}
-void fuse_send_init(struct fuse_conn *fc)
+void fuse_send_init(struct fuse_mount *fm)
{
struct fuse_init_args *ia;
+ u64 flags;
ia = kzalloc(sizeof(*ia), GFP_KERNEL | __GFP_NOFAIL);
ia->in.major = FUSE_KERNEL_VERSION;
ia->in.minor = FUSE_KERNEL_MINOR_VERSION;
- ia->in.max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE;
- ia->in.flags |=
+ ia->in.max_readahead = fm->sb->s_bdi->ra_pages * PAGE_SIZE;
+ flags =
FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
@@ -983,7 +1244,21 @@ void fuse_send_init(struct fuse_conn *fc)
FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT |
FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL |
FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS |
- FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA;
+ FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA |
+ FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT |
+ FUSE_SECURITY_CTX;
+#ifdef CONFIG_FUSE_DAX
+ if (fm->fc->dax)
+ flags |= FUSE_MAP_ALIGNMENT;
+ if (fuse_is_inode_dax_mode(fm->fc->dax_mode))
+ flags |= FUSE_HAS_INODE_DAX;
+#endif
+ if (fm->fc->auto_submounts)
+ flags |= FUSE_SUBMOUNTS;
+
+ ia->in.flags = flags;
+ ia->in.flags2 = flags >> 32;
+
ia->args.opcode = FUSE_INIT;
ia->args.in_numargs = 1;
ia->args.in_args[0].size = sizeof(ia->in);
@@ -999,8 +1274,8 @@ void fuse_send_init(struct fuse_conn *fc)
ia->args.nocreds = true;
ia->args.end = process_init_reply;
- if (fuse_simple_background(fc, &ia->args, GFP_KERNEL) != 0)
- process_init_reply(fc, &ia->args, -ENOTCONN);
+ if (fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0)
+ process_init_reply(fm, &ia->args, -ENOTCONN);
}
EXPORT_SYMBOL_GPL(fuse_send_init);
@@ -1030,9 +1305,9 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
if (err)
return err;
- sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
/* fuse does it's own writeback accounting */
- sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
+ sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT;
+ sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT;
/*
* For a single fuse filesystem use max 1% of dirty +
@@ -1111,10 +1386,141 @@ void fuse_dev_free(struct fuse_dev *fud)
}
EXPORT_SYMBOL_GPL(fuse_dev_free);
+static void fuse_fill_attr_from_inode(struct fuse_attr *attr,
+ const struct fuse_inode *fi)
+{
+ *attr = (struct fuse_attr){
+ .ino = fi->inode.i_ino,
+ .size = fi->inode.i_size,
+ .blocks = fi->inode.i_blocks,
+ .atime = fi->inode.i_atime.tv_sec,
+ .mtime = fi->inode.i_mtime.tv_sec,
+ .ctime = fi->inode.i_ctime.tv_sec,
+ .atimensec = fi->inode.i_atime.tv_nsec,
+ .mtimensec = fi->inode.i_mtime.tv_nsec,
+ .ctimensec = fi->inode.i_ctime.tv_nsec,
+ .mode = fi->inode.i_mode,
+ .nlink = fi->inode.i_nlink,
+ .uid = fi->inode.i_uid.val,
+ .gid = fi->inode.i_gid.val,
+ .rdev = fi->inode.i_rdev,
+ .blksize = 1u << fi->inode.i_blkbits,
+ };
+}
+
+static void fuse_sb_defaults(struct super_block *sb)
+{
+ sb->s_magic = FUSE_SUPER_MAGIC;
+ sb->s_op = &fuse_super_operations;
+ sb->s_xattr = fuse_xattr_handlers;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_time_gran = 1;
+ sb->s_export_op = &fuse_export_operations;
+ sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE;
+ if (sb->s_user_ns != &init_user_ns)
+ sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER;
+ sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION);
+
+ /*
+ * If we are not in the initial user namespace posix
+ * acls must be translated.
+ */
+ if (sb->s_user_ns != &init_user_ns)
+ sb->s_xattr = fuse_no_acl_xattr_handlers;
+}
+
+static int fuse_fill_super_submount(struct super_block *sb,
+ struct fuse_inode *parent_fi)
+{
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ struct super_block *parent_sb = parent_fi->inode.i_sb;
+ struct fuse_attr root_attr;
+ struct inode *root;
+
+ fuse_sb_defaults(sb);
+ fm->sb = sb;
+
+ WARN_ON(sb->s_bdi != &noop_backing_dev_info);
+ sb->s_bdi = bdi_get(parent_sb->s_bdi);
+
+ sb->s_xattr = parent_sb->s_xattr;
+ sb->s_time_gran = parent_sb->s_time_gran;
+ sb->s_blocksize = parent_sb->s_blocksize;
+ sb->s_blocksize_bits = parent_sb->s_blocksize_bits;
+ sb->s_subtype = kstrdup(parent_sb->s_subtype, GFP_KERNEL);
+ if (parent_sb->s_subtype && !sb->s_subtype)
+ return -ENOMEM;
+
+ fuse_fill_attr_from_inode(&root_attr, parent_fi);
+ root = fuse_iget(sb, parent_fi->nodeid, 0, &root_attr, 0, 0);
+ /*
+ * This inode is just a duplicate, so it is not looked up and
+ * its nlookup should not be incremented. fuse_iget() does
+ * that, though, so undo it here.
+ */
+ get_fuse_inode(root)->nlookup--;
+ sb->s_d_op = &fuse_dentry_operations;
+ sb->s_root = d_make_root(root);
+ if (!sb->s_root)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/* Filesystem context private data holds the FUSE inode of the mount point */
+static int fuse_get_tree_submount(struct fs_context *fsc)
+{
+ struct fuse_mount *fm;
+ struct fuse_inode *mp_fi = fsc->fs_private;
+ struct fuse_conn *fc = get_fuse_conn(&mp_fi->inode);
+ struct super_block *sb;
+ int err;
+
+ fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL);
+ if (!fm)
+ return -ENOMEM;
+
+ fm->fc = fuse_conn_get(fc);
+ fsc->s_fs_info = fm;
+ sb = sget_fc(fsc, NULL, set_anon_super_fc);
+ if (fsc->s_fs_info)
+ fuse_mount_destroy(fm);
+ if (IS_ERR(sb))
+ return PTR_ERR(sb);
+
+ /* Initialize superblock, making @mp_fi its root */
+ err = fuse_fill_super_submount(sb, mp_fi);
+ if (err) {
+ deactivate_locked_super(sb);
+ return err;
+ }
+
+ down_write(&fc->killsb);
+ list_add_tail(&fm->fc_entry, &fc->mounts);
+ up_write(&fc->killsb);
+
+ sb->s_flags |= SB_ACTIVE;
+ fsc->root = dget(sb->s_root);
+
+ return 0;
+}
+
+static const struct fs_context_operations fuse_context_submount_ops = {
+ .get_tree = fuse_get_tree_submount,
+};
+
+int fuse_init_fs_context_submount(struct fs_context *fsc)
+{
+ fsc->ops = &fuse_context_submount_ops;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fuse_init_fs_context_submount);
+
int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
{
- struct fuse_dev *fud;
- struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct fuse_dev *fud = NULL;
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ struct fuse_conn *fc = fm->fc;
struct inode *root;
struct dentry *root_dentry;
int err;
@@ -1123,7 +1529,8 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
if (sb->s_flags & SB_MANDLOCK)
goto err;
- sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION);
+ rcu_assign_pointer(fc->curr_bucket, fuse_sync_bucket_alloc());
+ fuse_sb_defaults(sb);
if (ctx->is_bdev) {
#ifdef CONFIG_BLOCK
@@ -1138,29 +1545,21 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
sb->s_subtype = ctx->subtype;
ctx->subtype = NULL;
- sb->s_magic = FUSE_SUPER_MAGIC;
- sb->s_op = &fuse_super_operations;
- sb->s_xattr = fuse_xattr_handlers;
- sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_time_gran = 1;
- sb->s_export_op = &fuse_export_operations;
- sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE;
- if (sb->s_user_ns != &init_user_ns)
- sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER;
-
- /*
- * If we are not in the initial user namespace posix
- * acls must be translated.
- */
- if (sb->s_user_ns != &init_user_ns)
- sb->s_xattr = fuse_no_acl_xattr_handlers;
+ if (IS_ENABLED(CONFIG_FUSE_DAX)) {
+ err = fuse_dax_conn_alloc(fc, ctx->dax_mode, ctx->dax_dev);
+ if (err)
+ goto err;
+ }
- fud = fuse_dev_alloc_install(fc);
- if (!fud)
- goto err;
+ if (ctx->fudptr) {
+ err = -ENOMEM;
+ fud = fuse_dev_alloc_install(fc);
+ if (!fud)
+ goto err_free_dax;
+ }
fc->dev = sb->s_dev;
- fc->sb = sb;
+ fm->sb = sb;
err = fuse_bdi_init(fc, sb);
if (err)
goto err_dev_free;
@@ -1174,11 +1573,11 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
fc->allow_other = ctx->allow_other;
fc->user_id = ctx->user_id;
fc->group_id = ctx->group_id;
- fc->max_read = max_t(unsigned, 4096, ctx->max_read);
+ fc->legacy_opts_show = ctx->legacy_opts_show;
+ fc->max_read = max_t(unsigned int, 4096, ctx->max_read);
fc->destroy = ctx->destroy;
fc->no_control = ctx->no_control;
fc->no_force_umount = ctx->no_force_umount;
- fc->no_mount_options = ctx->no_mount_options;
err = -ENOMEM;
root = fuse_get_root_inode(sb, ctx->rootmode);
@@ -1191,7 +1590,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
mutex_lock(&fuse_mutex);
err = -EINVAL;
- if (*ctx->fudptr)
+ if (ctx->fudptr && *ctx->fudptr)
goto err_unlock;
err = fuse_ctl_add_conn(fc);
@@ -1200,7 +1599,8 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
list_add_tail(&fc->entry, &fuse_conn_list);
sb->s_root = root_dentry;
- *ctx->fudptr = fud;
+ if (ctx->fudptr)
+ *ctx->fudptr = fud;
mutex_unlock(&fuse_mutex);
return 0;
@@ -1208,7 +1608,11 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
mutex_unlock(&fuse_mutex);
dput(root_dentry);
err_dev_free:
- fuse_dev_free(fud);
+ if (fud)
+ fuse_dev_free(fud);
+ err_free_dax:
+ if (IS_ENABLED(CONFIG_FUSE_DAX))
+ fuse_dax_conn_free(fc);
err:
return err;
}
@@ -1217,80 +1621,117 @@ EXPORT_SYMBOL_GPL(fuse_fill_super_common);
static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
{
struct fuse_fs_context *ctx = fsc->fs_private;
- struct file *file;
int err;
- struct fuse_conn *fc;
- err = -EINVAL;
- file = fget(ctx->fd);
- if (!file)
- goto err;
+ if (!ctx->file || !ctx->rootmode_present ||
+ !ctx->user_id_present || !ctx->group_id_present)
+ return -EINVAL;
/*
* Require mount to happen from the same user namespace which
* opened /dev/fuse to prevent potential attacks.
*/
- if ((file->f_op != &fuse_dev_operations) ||
- (file->f_cred->user_ns != sb->s_user_ns))
- goto err_fput;
- ctx->fudptr = &file->private_data;
-
- fc = kmalloc(sizeof(*fc), GFP_KERNEL);
- err = -ENOMEM;
- if (!fc)
- goto err_fput;
-
- fuse_conn_init(fc, sb->s_user_ns, &fuse_dev_fiq_ops, NULL);
- fc->release = fuse_free_conn;
- sb->s_fs_info = fc;
+ if ((ctx->file->f_op != &fuse_dev_operations) ||
+ (ctx->file->f_cred->user_ns != sb->s_user_ns))
+ return -EINVAL;
+ ctx->fudptr = &ctx->file->private_data;
err = fuse_fill_super_common(sb, ctx);
if (err)
- goto err_put_conn;
- /*
- * atomic_dec_and_test() in fput() provides the necessary
- * memory barrier for file->private_data to be visible on all
- * CPUs after this
- */
- fput(file);
- fuse_send_init(get_fuse_conn_super(sb));
+ return err;
+ /* file->private_data shall be visible on all CPUs after this */
+ smp_mb();
+ fuse_send_init(get_fuse_mount_super(sb));
return 0;
+}
- err_put_conn:
- fuse_conn_put(fc);
- sb->s_fs_info = NULL;
- err_fput:
- fput(file);
- err:
- return err;
+/*
+ * This is the path where user supplied an already initialized fuse dev. In
+ * this case never create a new super if the old one is gone.
+ */
+static int fuse_set_no_super(struct super_block *sb, struct fs_context *fsc)
+{
+ return -ENOTCONN;
}
-static int fuse_get_tree(struct fs_context *fc)
+static int fuse_test_super(struct super_block *sb, struct fs_context *fsc)
{
- struct fuse_fs_context *ctx = fc->fs_private;
- if (!ctx->fd_present || !ctx->rootmode_present ||
- !ctx->user_id_present || !ctx->group_id_present)
- return -EINVAL;
+ return fsc->sget_key == get_fuse_conn_super(sb);
+}
-#ifdef CONFIG_BLOCK
- if (ctx->is_bdev)
- return get_tree_bdev(fc, fuse_fill_super);
-#endif
+static int fuse_get_tree(struct fs_context *fsc)
+{
+ struct fuse_fs_context *ctx = fsc->fs_private;
+ struct fuse_dev *fud;
+ struct fuse_conn *fc;
+ struct fuse_mount *fm;
+ struct super_block *sb;
+ int err;
- return get_tree_nodev(fc, fuse_fill_super);
+ fc = kmalloc(sizeof(*fc), GFP_KERNEL);
+ if (!fc)
+ return -ENOMEM;
+
+ fm = kzalloc(sizeof(*fm), GFP_KERNEL);
+ if (!fm) {
+ kfree(fc);
+ return -ENOMEM;
+ }
+
+ fuse_conn_init(fc, fm, fsc->user_ns, &fuse_dev_fiq_ops, NULL);
+ fc->release = fuse_free_conn;
+
+ fsc->s_fs_info = fm;
+
+ if (ctx->fd_present)
+ ctx->file = fget(ctx->fd);
+
+ if (IS_ENABLED(CONFIG_BLOCK) && ctx->is_bdev) {
+ err = get_tree_bdev(fsc, fuse_fill_super);
+ goto out;
+ }
+ /*
+ * While block dev mount can be initialized with a dummy device fd
+ * (found by device name), normal fuse mounts can't
+ */
+ err = -EINVAL;
+ if (!ctx->file)
+ goto out;
+
+ /*
+ * Allow creating a fuse mount with an already initialized fuse
+ * connection
+ */
+ fud = READ_ONCE(ctx->file->private_data);
+ if (ctx->file->f_op == &fuse_dev_operations && fud) {
+ fsc->sget_key = fud->fc;
+ sb = sget_fc(fsc, fuse_test_super, fuse_set_no_super);
+ err = PTR_ERR_OR_ZERO(sb);
+ if (!IS_ERR(sb))
+ fsc->root = dget(sb->s_root);
+ } else {
+ err = get_tree_nodev(fsc, fuse_fill_super);
+ }
+out:
+ if (fsc->s_fs_info)
+ fuse_mount_destroy(fm);
+ if (ctx->file)
+ fput(ctx->file);
+ return err;
}
static const struct fs_context_operations fuse_context_ops = {
- .free = fuse_free_fc,
+ .free = fuse_free_fsc,
.parse_param = fuse_parse_param,
+ .reconfigure = fuse_reconfigure,
.get_tree = fuse_get_tree,
};
/*
* Set up the filesystem mount context.
*/
-static int fuse_init_fs_context(struct fs_context *fc)
+static int fuse_init_fs_context(struct fs_context *fsc)
{
struct fuse_fs_context *ctx;
@@ -1300,42 +1741,79 @@ static int fuse_init_fs_context(struct fs_context *fc)
ctx->max_read = ~0;
ctx->blksize = FUSE_DEFAULT_BLKSIZE;
+ ctx->legacy_opts_show = true;
#ifdef CONFIG_BLOCK
- if (fc->fs_type == &fuseblk_fs_type) {
+ if (fsc->fs_type == &fuseblk_fs_type) {
ctx->is_bdev = true;
ctx->destroy = true;
}
#endif
- fc->fs_private = ctx;
- fc->ops = &fuse_context_ops;
+ fsc->fs_private = ctx;
+ fsc->ops = &fuse_context_ops;
return 0;
}
-static void fuse_sb_destroy(struct super_block *sb)
+bool fuse_mount_remove(struct fuse_mount *fm)
{
- struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct fuse_conn *fc = fm->fc;
+ bool last = false;
- if (fc) {
- if (fc->destroy)
- fuse_send_destroy(fc);
+ down_write(&fc->killsb);
+ list_del_init(&fm->fc_entry);
+ if (list_empty(&fc->mounts))
+ last = true;
+ up_write(&fc->killsb);
- fuse_abort_conn(fc);
- fuse_wait_aborted(fc);
+ return last;
+}
+EXPORT_SYMBOL_GPL(fuse_mount_remove);
- down_write(&fc->killsb);
- fc->sb = NULL;
- up_write(&fc->killsb);
+void fuse_conn_destroy(struct fuse_mount *fm)
+{
+ struct fuse_conn *fc = fm->fc;
+
+ if (fc->destroy)
+ fuse_send_destroy(fm);
+
+ fuse_abort_conn(fc);
+ fuse_wait_aborted(fc);
+
+ if (!list_empty(&fc->entry)) {
+ mutex_lock(&fuse_mutex);
+ list_del(&fc->entry);
+ fuse_ctl_remove_conn(fc);
+ mutex_unlock(&fuse_mutex);
}
}
+EXPORT_SYMBOL_GPL(fuse_conn_destroy);
+
+static void fuse_sb_destroy(struct super_block *sb)
+{
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ bool last;
+
+ if (sb->s_root) {
+ last = fuse_mount_remove(fm);
+ if (last)
+ fuse_conn_destroy(fm);
+ }
+}
+
+void fuse_mount_destroy(struct fuse_mount *fm)
+{
+ fuse_conn_put(fm->fc);
+ kfree(fm);
+}
+EXPORT_SYMBOL(fuse_mount_destroy);
-void fuse_kill_sb_anon(struct super_block *sb)
+static void fuse_kill_sb_anon(struct super_block *sb)
{
fuse_sb_destroy(sb);
kill_anon_super(sb);
+ fuse_mount_destroy(get_fuse_mount_super(sb));
}
-EXPORT_SYMBOL_GPL(fuse_kill_sb_anon);
static struct file_system_type fuse_fs_type = {
.owner = THIS_MODULE,
@@ -1352,6 +1830,7 @@ static void fuse_kill_sb_blk(struct super_block *sb)
{
fuse_sb_destroy(sb);
kill_block_super(sb);
+ fuse_mount_destroy(get_fuse_mount_super(sb));
}
static struct file_system_type fuseblk_fs_type = {
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
new file mode 100644
index 000000000000..61d8afcb10a3
--- /dev/null
+++ b/fs/fuse/ioctl.c
@@ -0,0 +1,504 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017 Red Hat, Inc.
+ */
+
+#include "fuse_i.h"
+
+#include <linux/uio.h>
+#include <linux/compat.h>
+#include <linux/fileattr.h>
+
+static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args)
+{
+ ssize_t ret = fuse_simple_request(fm, args);
+
+ /* Translate ENOSYS, which shouldn't be returned from fs */
+ if (ret == -ENOSYS)
+ ret = -ENOTTY;
+
+ return ret;
+}
+
+/*
+ * CUSE servers compiled on 32bit broke on 64bit kernels because the
+ * ABI was defined to be 'struct iovec' which is different on 32bit
+ * and 64bit. Fortunately we can determine which structure the server
+ * used from the size of the reply.
+ */
+static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src,
+ size_t transferred, unsigned count,
+ bool is_compat)
+{
+#ifdef CONFIG_COMPAT
+ if (count * sizeof(struct compat_iovec) == transferred) {
+ struct compat_iovec *ciov = src;
+ unsigned i;
+
+ /*
+ * With this interface a 32bit server cannot support
+ * non-compat (i.e. ones coming from 64bit apps) ioctl
+ * requests
+ */
+ if (!is_compat)
+ return -EINVAL;
+
+ for (i = 0; i < count; i++) {
+ dst[i].iov_base = compat_ptr(ciov[i].iov_base);
+ dst[i].iov_len = ciov[i].iov_len;
+ }
+ return 0;
+ }
+#endif
+
+ if (count * sizeof(struct iovec) != transferred)
+ return -EIO;
+
+ memcpy(dst, src, transferred);
+ return 0;
+}
+
+/* Make sure iov_length() won't overflow */
+static int fuse_verify_ioctl_iov(struct fuse_conn *fc, struct iovec *iov,
+ size_t count)
+{
+ size_t n;
+ u32 max = fc->max_pages << PAGE_SHIFT;
+
+ for (n = 0; n < count; n++, iov++) {
+ if (iov->iov_len > (size_t) max)
+ return -ENOMEM;
+ max -= iov->iov_len;
+ }
+ return 0;
+}
+
+static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
+ void *src, size_t transferred, unsigned count,
+ bool is_compat)
+{
+ unsigned i;
+ struct fuse_ioctl_iovec *fiov = src;
+
+ if (fc->minor < 16) {
+ return fuse_copy_ioctl_iovec_old(dst, src, transferred,
+ count, is_compat);
+ }
+
+ if (count * sizeof(struct fuse_ioctl_iovec) != transferred)
+ return -EIO;
+
+ for (i = 0; i < count; i++) {
+ /* Did the server supply an inappropriate value? */
+ if (fiov[i].base != (unsigned long) fiov[i].base ||
+ fiov[i].len != (unsigned long) fiov[i].len)
+ return -EIO;
+
+ dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base;
+ dst[i].iov_len = (size_t) fiov[i].len;
+
+#ifdef CONFIG_COMPAT
+ if (is_compat &&
+ (ptr_to_compat(dst[i].iov_base) != fiov[i].base ||
+ (compat_size_t) dst[i].iov_len != fiov[i].len))
+ return -EIO;
+#endif
+ }
+
+ return 0;
+}
+
+
+/*
+ * For ioctls, there is no generic way to determine how much memory
+ * needs to be read and/or written. Furthermore, ioctls are allowed
+ * to dereference the passed pointer, so the parameter requires deep
+ * copying but FUSE has no idea whatsoever about what to copy in or
+ * out.
+ *
+ * This is solved by allowing FUSE server to retry ioctl with
+ * necessary in/out iovecs. Let's assume the ioctl implementation
+ * needs to read in the following structure.
+ *
+ * struct a {
+ * char *buf;
+ * size_t buflen;
+ * }
+ *
+ * On the first callout to FUSE server, inarg->in_size and
+ * inarg->out_size will be NULL; then, the server completes the ioctl
+ * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and
+ * the actual iov array to
+ *
+ * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } }
+ *
+ * which tells FUSE to copy in the requested area and retry the ioctl.
+ * On the second round, the server has access to the structure and
+ * from that it can tell what to look for next, so on the invocation,
+ * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to
+ *
+ * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) },
+ * { .iov_base = a.buf, .iov_len = a.buflen } }
+ *
+ * FUSE will copy both struct a and the pointed buffer from the
+ * process doing the ioctl and retry ioctl with both struct a and the
+ * buffer.
+ *
+ * This time, FUSE server has everything it needs and completes ioctl
+ * without FUSE_IOCTL_RETRY which finishes the ioctl call.
+ *
+ * Copying data out works the same way.
+ *
+ * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel
+ * automatically initializes in and out iovs by decoding @cmd with
+ * _IOC_* macros and the server is not allowed to request RETRY. This
+ * limits ioctl data transfers to well-formed ioctls and is the forced
+ * behavior for all FUSE servers.
+ */
+long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
+ unsigned int flags)
+{
+ struct fuse_file *ff = file->private_data;
+ struct fuse_mount *fm = ff->fm;
+ struct fuse_ioctl_in inarg = {
+ .fh = ff->fh,
+ .cmd = cmd,
+ .arg = arg,
+ .flags = flags
+ };
+ struct fuse_ioctl_out outarg;
+ struct iovec *iov_page = NULL;
+ struct iovec *in_iov = NULL, *out_iov = NULL;
+ unsigned int in_iovs = 0, out_iovs = 0, max_pages;
+ size_t in_size, out_size, c;
+ ssize_t transferred;
+ int err, i;
+ struct iov_iter ii;
+ struct fuse_args_pages ap = {};
+
+#if BITS_PER_LONG == 32
+ inarg.flags |= FUSE_IOCTL_32BIT;
+#else
+ if (flags & FUSE_IOCTL_COMPAT) {
+ inarg.flags |= FUSE_IOCTL_32BIT;
+#ifdef CONFIG_X86_X32_ABI
+ if (in_x32_syscall())
+ inarg.flags |= FUSE_IOCTL_COMPAT_X32;
+#endif
+ }
+#endif
+
+ /* assume all the iovs returned by client always fits in a page */
+ BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
+
+ err = -ENOMEM;
+ ap.pages = fuse_pages_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.descs);
+ iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
+ if (!ap.pages || !iov_page)
+ goto out;
+
+ fuse_page_descs_length_init(ap.descs, 0, fm->fc->max_pages);
+
+ /*
+ * If restricted, initialize IO parameters as encoded in @cmd.
+ * RETRY from server is not allowed.
+ */
+ if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
+ struct iovec *iov = iov_page;
+
+ iov->iov_base = (void __user *)arg;
+ iov->iov_len = _IOC_SIZE(cmd);
+
+ if (_IOC_DIR(cmd) & _IOC_WRITE) {
+ in_iov = iov;
+ in_iovs = 1;
+ }
+
+ if (_IOC_DIR(cmd) & _IOC_READ) {
+ out_iov = iov;
+ out_iovs = 1;
+ }
+ }
+
+ retry:
+ inarg.in_size = in_size = iov_length(in_iov, in_iovs);
+ inarg.out_size = out_size = iov_length(out_iov, out_iovs);
+
+ /*
+ * Out data can be used either for actual out data or iovs,
+ * make sure there always is at least one page.
+ */
+ out_size = max_t(size_t, out_size, PAGE_SIZE);
+ max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE);
+
+ /* make sure there are enough buffer pages and init request with them */
+ err = -ENOMEM;
+ if (max_pages > fm->fc->max_pages)
+ goto out;
+ while (ap.num_pages < max_pages) {
+ ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+ if (!ap.pages[ap.num_pages])
+ goto out;
+ ap.num_pages++;
+ }
+
+
+ /* okay, let's send it to the client */
+ ap.args.opcode = FUSE_IOCTL;
+ ap.args.nodeid = ff->nodeid;
+ ap.args.in_numargs = 1;
+ ap.args.in_args[0].size = sizeof(inarg);
+ ap.args.in_args[0].value = &inarg;
+ if (in_size) {
+ ap.args.in_numargs++;
+ ap.args.in_args[1].size = in_size;
+ ap.args.in_pages = true;
+
+ err = -EFAULT;
+ iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size);
+ for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
+ c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
+ if (c != PAGE_SIZE && iov_iter_count(&ii))
+ goto out;
+ }
+ }
+
+ ap.args.out_numargs = 2;
+ ap.args.out_args[0].size = sizeof(outarg);
+ ap.args.out_args[0].value = &outarg;
+ ap.args.out_args[1].size = out_size;
+ ap.args.out_pages = true;
+ ap.args.out_argvar = true;
+
+ transferred = fuse_send_ioctl(fm, &ap.args);
+ err = transferred;
+ if (transferred < 0)
+ goto out;
+
+ /* did it ask for retry? */
+ if (outarg.flags & FUSE_IOCTL_RETRY) {
+ void *vaddr;
+
+ /* no retry if in restricted mode */
+ err = -EIO;
+ if (!(flags & FUSE_IOCTL_UNRESTRICTED))
+ goto out;
+
+ in_iovs = outarg.in_iovs;
+ out_iovs = outarg.out_iovs;
+
+ /*
+ * Make sure things are in boundary, separate checks
+ * are to protect against overflow.
+ */
+ err = -ENOMEM;
+ if (in_iovs > FUSE_IOCTL_MAX_IOV ||
+ out_iovs > FUSE_IOCTL_MAX_IOV ||
+ in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
+ goto out;
+
+ vaddr = kmap_local_page(ap.pages[0]);
+ err = fuse_copy_ioctl_iovec(fm->fc, iov_page, vaddr,
+ transferred, in_iovs + out_iovs,
+ (flags & FUSE_IOCTL_COMPAT) != 0);
+ kunmap_local(vaddr);
+ if (err)
+ goto out;
+
+ in_iov = iov_page;
+ out_iov = in_iov + in_iovs;
+
+ err = fuse_verify_ioctl_iov(fm->fc, in_iov, in_iovs);
+ if (err)
+ goto out;
+
+ err = fuse_verify_ioctl_iov(fm->fc, out_iov, out_iovs);
+ if (err)
+ goto out;
+
+ goto retry;
+ }
+
+ err = -EIO;
+ if (transferred > inarg.out_size)
+ goto out;
+
+ err = -EFAULT;
+ iov_iter_init(&ii, READ, out_iov, out_iovs, transferred);
+ for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
+ c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
+ if (c != PAGE_SIZE && iov_iter_count(&ii))
+ goto out;
+ }
+ err = 0;
+ out:
+ free_page((unsigned long) iov_page);
+ while (ap.num_pages)
+ __free_page(ap.pages[--ap.num_pages]);
+ kfree(ap.pages);
+
+ return err ? err : outarg.result;
+}
+EXPORT_SYMBOL_GPL(fuse_do_ioctl);
+
+long fuse_ioctl_common(struct file *file, unsigned int cmd,
+ unsigned long arg, unsigned int flags)
+{
+ struct inode *inode = file_inode(file);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ if (!fuse_allow_current_process(fc))
+ return -EACCES;
+
+ if (fuse_is_bad(inode))
+ return -EIO;
+
+ return fuse_do_ioctl(file, cmd, arg, flags);
+}
+
+long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ return fuse_ioctl_common(file, cmd, arg, 0);
+}
+
+long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
+}
+
+static int fuse_priv_ioctl(struct inode *inode, struct fuse_file *ff,
+ unsigned int cmd, void *ptr, size_t size)
+{
+ struct fuse_mount *fm = ff->fm;
+ struct fuse_ioctl_in inarg;
+ struct fuse_ioctl_out outarg;
+ FUSE_ARGS(args);
+ int err;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.fh = ff->fh;
+ inarg.cmd = cmd;
+
+#if BITS_PER_LONG == 32
+ inarg.flags |= FUSE_IOCTL_32BIT;
+#endif
+ if (S_ISDIR(inode->i_mode))
+ inarg.flags |= FUSE_IOCTL_DIR;
+
+ if (_IOC_DIR(cmd) & _IOC_READ)
+ inarg.out_size = size;
+ if (_IOC_DIR(cmd) & _IOC_WRITE)
+ inarg.in_size = size;
+
+ args.opcode = FUSE_IOCTL;
+ args.nodeid = ff->nodeid;
+ args.in_numargs = 2;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ args.in_args[1].size = inarg.in_size;
+ args.in_args[1].value = ptr;
+ args.out_numargs = 2;
+ args.out_args[0].size = sizeof(outarg);
+ args.out_args[0].value = &outarg;
+ args.out_args[1].size = inarg.out_size;
+ args.out_args[1].value = ptr;
+
+ err = fuse_send_ioctl(fm, &args);
+ if (!err) {
+ if (outarg.result < 0)
+ err = outarg.result;
+ else if (outarg.flags & FUSE_IOCTL_RETRY)
+ err = -EIO;
+ }
+ return err;
+}
+
+static struct fuse_file *fuse_priv_ioctl_prepare(struct inode *inode)
+{
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ bool isdir = S_ISDIR(inode->i_mode);
+
+ if (!S_ISREG(inode->i_mode) && !isdir)
+ return ERR_PTR(-ENOTTY);
+
+ return fuse_file_open(fm, get_node_id(inode), O_RDONLY, isdir);
+}
+
+static void fuse_priv_ioctl_cleanup(struct inode *inode, struct fuse_file *ff)
+{
+ fuse_file_release(inode, ff, O_RDONLY, NULL, S_ISDIR(inode->i_mode));
+}
+
+int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ struct fuse_file *ff;
+ unsigned int flags;
+ struct fsxattr xfa;
+ int err;
+
+ ff = fuse_priv_ioctl_prepare(inode);
+ if (IS_ERR(ff))
+ return PTR_ERR(ff);
+
+ if (fa->flags_valid) {
+ err = fuse_priv_ioctl(inode, ff, FS_IOC_GETFLAGS,
+ &flags, sizeof(flags));
+ if (err)
+ goto cleanup;
+
+ fileattr_fill_flags(fa, flags);
+ } else {
+ err = fuse_priv_ioctl(inode, ff, FS_IOC_FSGETXATTR,
+ &xfa, sizeof(xfa));
+ if (err)
+ goto cleanup;
+
+ fileattr_fill_xflags(fa, xfa.fsx_xflags);
+ fa->fsx_extsize = xfa.fsx_extsize;
+ fa->fsx_nextents = xfa.fsx_nextents;
+ fa->fsx_projid = xfa.fsx_projid;
+ fa->fsx_cowextsize = xfa.fsx_cowextsize;
+ }
+cleanup:
+ fuse_priv_ioctl_cleanup(inode, ff);
+
+ return err;
+}
+
+int fuse_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ struct fuse_file *ff;
+ unsigned int flags = fa->flags;
+ struct fsxattr xfa;
+ int err;
+
+ ff = fuse_priv_ioctl_prepare(inode);
+ if (IS_ERR(ff))
+ return PTR_ERR(ff);
+
+ if (fa->flags_valid) {
+ err = fuse_priv_ioctl(inode, ff, FS_IOC_SETFLAGS,
+ &flags, sizeof(flags));
+ if (err)
+ goto cleanup;
+ } else {
+ memset(&xfa, 0, sizeof(xfa));
+ xfa.fsx_xflags = fa->fsx_xflags;
+ xfa.fsx_extsize = fa->fsx_extsize;
+ xfa.fsx_nextents = fa->fsx_nextents;
+ xfa.fsx_projid = fa->fsx_projid;
+ xfa.fsx_cowextsize = fa->fsx_cowextsize;
+
+ err = fuse_priv_ioctl(inode, ff, FS_IOC_FSSETXATTR,
+ &xfa, sizeof(xfa));
+ }
+
+cleanup:
+ fuse_priv_ioctl_cleanup(inode, ff);
+
+ return err;
+}
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index 90e3f01bd796..e8deaacf1832 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -76,11 +76,13 @@ static void fuse_add_dirent_to_cache(struct file *file,
WARN_ON(fi->rdc.pos != pos))
goto unlock;
- addr = kmap_atomic(page);
- if (!offset)
+ addr = kmap_local_page(page);
+ if (!offset) {
clear_page(addr);
+ SetPageUptodate(page);
+ }
memcpy(addr + offset, dirent, reclen);
- kunmap_atomic(addr);
+ kunmap_local(addr);
fi->rdc.size = (index << PAGE_SHIFT) + offset + reclen;
fi->rdc.pos = dirent->off;
unlock:
@@ -200,14 +202,17 @@ retry:
if (!d_in_lookup(dentry)) {
struct fuse_inode *fi;
inode = d_inode(dentry);
+ if (inode && get_node_id(inode) != o->nodeid)
+ inode = NULL;
if (!inode ||
- get_node_id(inode) != o->nodeid ||
- ((o->attr.mode ^ inode->i_mode) & S_IFMT)) {
+ fuse_stale_inode(inode, o->generation, &o->attr)) {
+ if (inode)
+ fuse_make_bad(inode);
d_invalidate(dentry);
dput(dentry);
goto retry;
}
- if (is_bad_inode(inode)) {
+ if (fuse_is_bad(inode)) {
dput(dentry);
return -EIO;
}
@@ -252,7 +257,7 @@ retry:
static void fuse_force_forget(struct file *file, u64 nodeid)
{
struct inode *inode = file_inode(file);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_forget_in inarg;
FUSE_ARGS(args);
@@ -266,7 +271,7 @@ static void fuse_force_forget(struct file *file, u64 nodeid)
args.force = true;
args.noreply = true;
- fuse_simple_request(fc, &args);
+ fuse_simple_request(fm, &args);
/* ignore errors */
}
@@ -320,7 +325,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
ssize_t res;
struct page *page;
struct inode *inode = file_inode(file);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_io_args ia = {};
struct fuse_args_pages *ap = &ia.ap;
struct fuse_page_desc desc = { .length = PAGE_SIZE };
@@ -337,7 +342,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
ap->pages = &page;
ap->descs = &desc;
if (plus) {
- attr_version = fuse_get_attr_version(fc);
+ attr_version = fuse_get_attr_version(fm->fc);
fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
FUSE_READDIRPLUS);
} else {
@@ -345,7 +350,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
FUSE_READDIR);
}
locked = fuse_lock_inode(inode);
- res = fuse_simple_request(fc, &ap->args);
+ res = fuse_simple_request(fm, &ap->args);
fuse_unlock_inode(inode, locked);
if (res >= 0) {
if (!res) {
@@ -451,7 +456,7 @@ static int fuse_readdir_cached(struct file *file, struct dir_context *ctx)
* cache; both cases require an up-to-date mtime value.
*/
if (!ctx->pos && fc->auto_inval_data) {
- int err = fuse_update_attributes(inode, file);
+ int err = fuse_update_attributes(inode, file, STATX_MTIME);
if (err)
return err;
@@ -513,6 +518,12 @@ retry_locked:
page = find_get_page_flags(file->f_mapping, index,
FGP_ACCESSED | FGP_LOCK);
+ /* Page gone missing, then re-added to cache, but not initialized? */
+ if (page && !PageUptodate(page)) {
+ unlock_page(page);
+ put_page(page);
+ page = NULL;
+ }
spin_lock(&fi->rdc.lock);
if (!page) {
/*
@@ -568,7 +579,7 @@ int fuse_readdir(struct file *file, struct dir_context *ctx)
struct inode *inode = file_inode(file);
int err;
- if (is_bad_inode(inode))
+ if (fuse_is_bad(inode))
return -EIO;
mutex_lock(&ff->readdir.lock);
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index bade74768903..4d8d4f16c727 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -5,14 +5,26 @@
*/
#include <linux/fs.h>
+#include <linux/dax.h>
+#include <linux/pci.h>
+#include <linux/pfn_t.h>
+#include <linux/memremap.h>
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_fs.h>
#include <linux/delay.h>
#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/highmem.h>
+#include <linux/uio.h>
#include "fuse_i.h"
+/* Used to help calculate the FUSE connection's max_pages limit for a request's
+ * size. Parts of the struct fuse_req are sliced into scattergather lists in
+ * addition to the pages used, so this can help account for that overhead.
+ */
+#define FUSE_HEADER_OVERHEAD 4
+
/* List of virtio-fs device instances and a lock for the list. Also provides
* mutual exclusion in device removal and mounting path
*/
@@ -24,6 +36,8 @@ enum {
VQ_REQUEST
};
+#define VQ_NAME_LEN 24
+
/* Per-virtqueue state */
struct virtio_fs_vq {
spinlock_t lock;
@@ -36,7 +50,7 @@ struct virtio_fs_vq {
bool connected;
long in_flight;
struct completion in_flight_zero; /* No inflight requests */
- char name[24];
+ char name[VQ_NAME_LEN];
} ____cacheline_aligned_in_smp;
/* A virtio-fs device instance */
@@ -47,6 +61,12 @@ struct virtio_fs {
struct virtio_fs_vq *vqs;
unsigned int nvqs; /* number of virtqueues */
unsigned int num_request_queues; /* number of request queues */
+ struct dax_device *dax_dev;
+
+ /* DAX memory window where file contents are mapped */
+ void *window_kaddr;
+ phys_addr_t window_phys_addr;
+ size_t window_len;
};
struct virtio_fs_forget_req {
@@ -60,19 +80,70 @@ struct virtio_fs_forget {
struct virtio_fs_forget_req req;
};
+struct virtio_fs_req_work {
+ struct fuse_req *req;
+ struct virtio_fs_vq *fsvq;
+ struct work_struct done_work;
+};
+
static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
struct fuse_req *req, bool in_flight);
-static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq)
+static const struct constant_table dax_param_enums[] = {
+ {"always", FUSE_DAX_ALWAYS },
+ {"never", FUSE_DAX_NEVER },
+ {"inode", FUSE_DAX_INODE_USER },
+ {}
+};
+
+enum {
+ OPT_DAX,
+ OPT_DAX_ENUM,
+};
+
+static const struct fs_parameter_spec virtio_fs_parameters[] = {
+ fsparam_flag("dax", OPT_DAX),
+ fsparam_enum("dax", OPT_DAX_ENUM, dax_param_enums),
+ {}
+};
+
+static int virtio_fs_parse_param(struct fs_context *fsc,
+ struct fs_parameter *param)
{
- struct virtio_fs *fs = vq->vdev->priv;
+ struct fs_parse_result result;
+ struct fuse_fs_context *ctx = fsc->fs_private;
+ int opt;
+
+ opt = fs_parse(fsc, virtio_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case OPT_DAX:
+ ctx->dax_mode = FUSE_DAX_ALWAYS;
+ break;
+ case OPT_DAX_ENUM:
+ ctx->dax_mode = result.uint_32;
+ break;
+ default:
+ return -EINVAL;
+ }
- return &fs->vqs[vq->index];
+ return 0;
}
-static inline struct fuse_pqueue *vq_to_fpq(struct virtqueue *vq)
+static void virtio_fs_free_fsc(struct fs_context *fsc)
{
- return &vq_to_fsvq(vq)->fud->pq;
+ struct fuse_fs_context *ctx = fsc->fs_private;
+
+ kfree(ctx);
+}
+
+static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq)
+{
+ struct virtio_fs *fs = vq->vdev->priv;
+
+ return &fs->vqs[vq->index];
}
/* Should be called with fsvq->lock held. */
@@ -283,7 +354,6 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work)
struct fuse_req *req;
struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
dispatch_work.work);
- struct fuse_conn *fc = fsvq->fud->fc;
int ret;
pr_debug("virtio-fs: worker %s called.\n", __func__);
@@ -298,7 +368,7 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work)
list_del_init(&req->list);
spin_unlock(&fsvq->lock);
- fuse_request_end(fc, req);
+ fuse_request_end(req);
}
/* Dispatch pending requests */
@@ -329,7 +399,7 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work)
spin_unlock(&fsvq->lock);
pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n",
ret);
- fuse_request_end(fc, req);
+ fuse_request_end(req);
}
}
}
@@ -485,19 +555,66 @@ static void copy_args_from_argbuf(struct fuse_args *args, struct fuse_req *req)
}
/* Work function for request completion */
+static void virtio_fs_request_complete(struct fuse_req *req,
+ struct virtio_fs_vq *fsvq)
+{
+ struct fuse_pqueue *fpq = &fsvq->fud->pq;
+ struct fuse_args *args;
+ struct fuse_args_pages *ap;
+ unsigned int len, i, thislen;
+ struct page *page;
+
+ /*
+ * TODO verify that server properly follows FUSE protocol
+ * (oh.uniq, oh.len)
+ */
+ args = req->args;
+ copy_args_from_argbuf(args, req);
+
+ if (args->out_pages && args->page_zeroing) {
+ len = args->out_args[args->out_numargs - 1].size;
+ ap = container_of(args, typeof(*ap), args);
+ for (i = 0; i < ap->num_pages; i++) {
+ thislen = ap->descs[i].length;
+ if (len < thislen) {
+ WARN_ON(ap->descs[i].offset);
+ page = ap->pages[i];
+ zero_user_segment(page, len, thislen);
+ len = 0;
+ } else {
+ len -= thislen;
+ }
+ }
+ }
+
+ spin_lock(&fpq->lock);
+ clear_bit(FR_SENT, &req->flags);
+ spin_unlock(&fpq->lock);
+
+ fuse_request_end(req);
+ spin_lock(&fsvq->lock);
+ dec_in_flight_req(fsvq);
+ spin_unlock(&fsvq->lock);
+}
+
+static void virtio_fs_complete_req_work(struct work_struct *work)
+{
+ struct virtio_fs_req_work *w =
+ container_of(work, typeof(*w), done_work);
+
+ virtio_fs_request_complete(w->req, w->fsvq);
+ kfree(w);
+}
+
static void virtio_fs_requests_done_work(struct work_struct *work)
{
struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
done_work);
struct fuse_pqueue *fpq = &fsvq->fud->pq;
- struct fuse_conn *fc = fsvq->fud->fc;
struct virtqueue *vq = fsvq->vq;
struct fuse_req *req;
- struct fuse_args_pages *ap;
struct fuse_req *next;
- struct fuse_args *args;
- unsigned int len, i, thislen;
- struct page *page;
+ unsigned int len;
LIST_HEAD(reqs);
/* Collect completed requests off the virtqueue */
@@ -515,38 +632,20 @@ static void virtio_fs_requests_done_work(struct work_struct *work)
/* End requests */
list_for_each_entry_safe(req, next, &reqs, list) {
- /*
- * TODO verify that server properly follows FUSE protocol
- * (oh.uniq, oh.len)
- */
- args = req->args;
- copy_args_from_argbuf(args, req);
-
- if (args->out_pages && args->page_zeroing) {
- len = args->out_args[args->out_numargs - 1].size;
- ap = container_of(args, typeof(*ap), args);
- for (i = 0; i < ap->num_pages; i++) {
- thislen = ap->descs[i].length;
- if (len < thislen) {
- WARN_ON(ap->descs[i].offset);
- page = ap->pages[i];
- zero_user_segment(page, len, thislen);
- len = 0;
- } else {
- len -= thislen;
- }
- }
- }
-
- spin_lock(&fpq->lock);
- clear_bit(FR_SENT, &req->flags);
list_del_init(&req->list);
- spin_unlock(&fpq->lock);
- fuse_request_end(fc, req);
- spin_lock(&fsvq->lock);
- dec_in_flight_req(fsvq);
- spin_unlock(&fsvq->lock);
+ /* blocking async request completes in a worker context */
+ if (req->args->may_block) {
+ struct virtio_fs_req_work *w;
+
+ w = kzalloc(sizeof(*w), GFP_NOFS | __GFP_NOFAIL);
+ INIT_WORK(&w->done_work, virtio_fs_complete_req_work);
+ w->fsvq = fsvq;
+ w->req = req;
+ schedule_work(&w->done_work);
+ } else {
+ virtio_fs_request_complete(req, fsvq);
+ }
}
}
@@ -560,6 +659,26 @@ static void virtio_fs_vq_done(struct virtqueue *vq)
schedule_work(&fsvq->done_work);
}
+static void virtio_fs_init_vq(struct virtio_fs_vq *fsvq, char *name,
+ int vq_type)
+{
+ strscpy(fsvq->name, name, VQ_NAME_LEN);
+ spin_lock_init(&fsvq->lock);
+ INIT_LIST_HEAD(&fsvq->queued_reqs);
+ INIT_LIST_HEAD(&fsvq->end_reqs);
+ init_completion(&fsvq->in_flight_zero);
+
+ if (vq_type == VQ_REQUEST) {
+ INIT_WORK(&fsvq->done_work, virtio_fs_requests_done_work);
+ INIT_DELAYED_WORK(&fsvq->dispatch_work,
+ virtio_fs_request_dispatch_work);
+ } else {
+ INIT_WORK(&fsvq->done_work, virtio_fs_hiprio_done_work);
+ INIT_DELAYED_WORK(&fsvq->dispatch_work,
+ virtio_fs_hiprio_dispatch_work);
+ }
+}
+
/* Initialize virtqueues */
static int virtio_fs_setup_vqs(struct virtio_device *vdev,
struct virtio_fs *fs)
@@ -570,12 +689,12 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
unsigned int i;
int ret = 0;
- virtio_cread(vdev, struct virtio_fs_config, num_request_queues,
- &fs->num_request_queues);
+ virtio_cread_le(vdev, struct virtio_fs_config, num_request_queues,
+ &fs->num_request_queues);
if (fs->num_request_queues == 0)
return -EINVAL;
- fs->nvqs = 1 + fs->num_request_queues;
+ fs->nvqs = VQ_REQUEST + fs->num_request_queues;
fs->vqs = kcalloc(fs->nvqs, sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL);
if (!fs->vqs)
return -ENOMEM;
@@ -589,29 +708,17 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
goto out;
}
+ /* Initialize the hiprio/forget request virtqueue */
callbacks[VQ_HIPRIO] = virtio_fs_vq_done;
- snprintf(fs->vqs[VQ_HIPRIO].name, sizeof(fs->vqs[VQ_HIPRIO].name),
- "hiprio");
+ virtio_fs_init_vq(&fs->vqs[VQ_HIPRIO], "hiprio", VQ_HIPRIO);
names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name;
- INIT_WORK(&fs->vqs[VQ_HIPRIO].done_work, virtio_fs_hiprio_done_work);
- INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].queued_reqs);
- INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].end_reqs);
- INIT_DELAYED_WORK(&fs->vqs[VQ_HIPRIO].dispatch_work,
- virtio_fs_hiprio_dispatch_work);
- init_completion(&fs->vqs[VQ_HIPRIO].in_flight_zero);
- spin_lock_init(&fs->vqs[VQ_HIPRIO].lock);
/* Initialize the requests virtqueues */
for (i = VQ_REQUEST; i < fs->nvqs; i++) {
- spin_lock_init(&fs->vqs[i].lock);
- INIT_WORK(&fs->vqs[i].done_work, virtio_fs_requests_done_work);
- INIT_DELAYED_WORK(&fs->vqs[i].dispatch_work,
- virtio_fs_request_dispatch_work);
- INIT_LIST_HEAD(&fs->vqs[i].queued_reqs);
- INIT_LIST_HEAD(&fs->vqs[i].end_reqs);
- init_completion(&fs->vqs[i].in_flight_zero);
- snprintf(fs->vqs[i].name, sizeof(fs->vqs[i].name),
- "requests.%u", i - VQ_REQUEST);
+ char vq_name[VQ_NAME_LEN];
+
+ snprintf(vq_name, VQ_NAME_LEN, "requests.%u", i - VQ_REQUEST);
+ virtio_fs_init_vq(&fs->vqs[i], vq_name, VQ_REQUEST);
callbacks[i] = virtio_fs_vq_done;
names[i] = fs->vqs[i].name;
}
@@ -634,12 +741,121 @@ out:
}
/* Free virtqueues (device must already be reset) */
-static void virtio_fs_cleanup_vqs(struct virtio_device *vdev,
- struct virtio_fs *fs)
+static void virtio_fs_cleanup_vqs(struct virtio_device *vdev)
{
vdev->config->del_vqs(vdev);
}
+/* Map a window offset to a page frame number. The window offset will have
+ * been produced by .iomap_begin(), which maps a file offset to a window
+ * offset.
+ */
+static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
+ long nr_pages, enum dax_access_mode mode,
+ void **kaddr, pfn_t *pfn)
+{
+ struct virtio_fs *fs = dax_get_private(dax_dev);
+ phys_addr_t offset = PFN_PHYS(pgoff);
+ size_t max_nr_pages = fs->window_len / PAGE_SIZE - pgoff;
+
+ if (kaddr)
+ *kaddr = fs->window_kaddr + offset;
+ if (pfn)
+ *pfn = phys_to_pfn_t(fs->window_phys_addr + offset,
+ PFN_DEV | PFN_MAP);
+ return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
+}
+
+static int virtio_fs_zero_page_range(struct dax_device *dax_dev,
+ pgoff_t pgoff, size_t nr_pages)
+{
+ long rc;
+ void *kaddr;
+
+ rc = dax_direct_access(dax_dev, pgoff, nr_pages, DAX_ACCESS, &kaddr,
+ NULL);
+ if (rc < 0)
+ return rc;
+ memset(kaddr, 0, nr_pages << PAGE_SHIFT);
+ dax_flush(dax_dev, kaddr, nr_pages << PAGE_SHIFT);
+ return 0;
+}
+
+static const struct dax_operations virtio_fs_dax_ops = {
+ .direct_access = virtio_fs_direct_access,
+ .zero_page_range = virtio_fs_zero_page_range,
+};
+
+static void virtio_fs_cleanup_dax(void *data)
+{
+ struct dax_device *dax_dev = data;
+
+ kill_dax(dax_dev);
+ put_dax(dax_dev);
+}
+
+static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
+{
+ struct virtio_shm_region cache_reg;
+ struct dev_pagemap *pgmap;
+ bool have_cache;
+
+ if (!IS_ENABLED(CONFIG_FUSE_DAX))
+ return 0;
+
+ /* Get cache region */
+ have_cache = virtio_get_shm_region(vdev, &cache_reg,
+ (u8)VIRTIO_FS_SHMCAP_ID_CACHE);
+ if (!have_cache) {
+ dev_notice(&vdev->dev, "%s: No cache capability\n", __func__);
+ return 0;
+ }
+
+ if (!devm_request_mem_region(&vdev->dev, cache_reg.addr, cache_reg.len,
+ dev_name(&vdev->dev))) {
+ dev_warn(&vdev->dev, "could not reserve region addr=0x%llx len=0x%llx\n",
+ cache_reg.addr, cache_reg.len);
+ return -EBUSY;
+ }
+
+ dev_notice(&vdev->dev, "Cache len: 0x%llx @ 0x%llx\n", cache_reg.len,
+ cache_reg.addr);
+
+ pgmap = devm_kzalloc(&vdev->dev, sizeof(*pgmap), GFP_KERNEL);
+ if (!pgmap)
+ return -ENOMEM;
+
+ pgmap->type = MEMORY_DEVICE_FS_DAX;
+
+ /* Ideally we would directly use the PCI BAR resource but
+ * devm_memremap_pages() wants its own copy in pgmap. So
+ * initialize a struct resource from scratch (only the start
+ * and end fields will be used).
+ */
+ pgmap->range = (struct range) {
+ .start = (phys_addr_t) cache_reg.addr,
+ .end = (phys_addr_t) cache_reg.addr + cache_reg.len - 1,
+ };
+ pgmap->nr_range = 1;
+
+ fs->window_kaddr = devm_memremap_pages(&vdev->dev, pgmap);
+ if (IS_ERR(fs->window_kaddr))
+ return PTR_ERR(fs->window_kaddr);
+
+ fs->window_phys_addr = (phys_addr_t) cache_reg.addr;
+ fs->window_len = (phys_addr_t) cache_reg.len;
+
+ dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n",
+ __func__, fs->window_kaddr, cache_reg.addr, cache_reg.len);
+
+ fs->dax_dev = alloc_dax(fs, &virtio_fs_dax_ops);
+ if (IS_ERR(fs->dax_dev))
+ return PTR_ERR(fs->dax_dev);
+
+ return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax,
+ fs->dax_dev);
+}
+
static int virtio_fs_probe(struct virtio_device *vdev)
{
struct virtio_fs *fs;
@@ -661,6 +877,10 @@ static int virtio_fs_probe(struct virtio_device *vdev)
/* TODO vq affinity */
+ ret = virtio_fs_setup_dax(vdev, fs);
+ if (ret < 0)
+ goto out_vqs;
+
/* Bring the device online in case the filesystem is mounted and
* requests need to be sent before we return.
*/
@@ -673,8 +893,9 @@ static int virtio_fs_probe(struct virtio_device *vdev)
return 0;
out_vqs:
- vdev->config->reset(vdev);
- virtio_fs_cleanup_vqs(vdev, fs);
+ virtio_reset_device(vdev);
+ virtio_fs_cleanup_vqs(vdev);
+ kfree(fs->vqs);
out:
vdev->priv = NULL;
@@ -704,8 +925,8 @@ static void virtio_fs_remove(struct virtio_device *vdev)
list_del_init(&fs->list);
virtio_fs_stop_all_queues(fs);
virtio_fs_drain_all_queues_locked(fs);
- vdev->config->reset(vdev);
- virtio_fs_cleanup_vqs(vdev, fs);
+ virtio_reset_device(vdev);
+ virtio_fs_cleanup_vqs(vdev);
vdev->priv = NULL;
/* Put device reference on virtio_fs object */
@@ -797,18 +1018,37 @@ __releases(fiq->lock)
spin_unlock(&fiq->lock);
}
+/* Count number of scatter-gather elements required */
+static unsigned int sg_count_fuse_pages(struct fuse_page_desc *page_descs,
+ unsigned int num_pages,
+ unsigned int total_len)
+{
+ unsigned int i;
+ unsigned int this_len;
+
+ for (i = 0; i < num_pages && total_len; i++) {
+ this_len = min(page_descs[i].length, total_len);
+ total_len -= this_len;
+ }
+
+ return i;
+}
+
/* Return the number of scatter-gather list elements required */
static unsigned int sg_count_fuse_req(struct fuse_req *req)
{
struct fuse_args *args = req->args;
struct fuse_args_pages *ap = container_of(args, typeof(*ap), args);
- unsigned int total_sgs = 1 /* fuse_in_header */;
+ unsigned int size, total_sgs = 1 /* fuse_in_header */;
if (args->in_numargs - args->in_pages)
total_sgs += 1;
- if (args->in_pages)
- total_sgs += ap->num_pages;
+ if (args->in_pages) {
+ size = args->in_args[args->in_numargs - 1].size;
+ total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages,
+ size);
+ }
if (!test_bit(FR_ISREPLY, &req->flags))
return total_sgs;
@@ -818,8 +1058,11 @@ static unsigned int sg_count_fuse_req(struct fuse_req *req)
if (args->out_numargs - args->out_pages)
total_sgs += 1;
- if (args->out_pages)
- total_sgs += ap->num_pages;
+ if (args->out_pages) {
+ size = args->out_args[args->out_numargs - 1].size;
+ total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages,
+ size);
+ }
return total_sgs;
}
@@ -1035,24 +1278,28 @@ static const struct fuse_iqueue_ops virtio_fs_fiq_ops = {
.release = virtio_fs_fiq_release,
};
-static int virtio_fs_fill_super(struct super_block *sb)
+static inline void virtio_fs_ctx_set_defaults(struct fuse_fs_context *ctx)
+{
+ ctx->rootmode = S_IFDIR;
+ ctx->default_permissions = 1;
+ ctx->allow_other = 1;
+ ctx->max_read = UINT_MAX;
+ ctx->blksize = 512;
+ ctx->destroy = true;
+ ctx->no_control = true;
+ ctx->no_force_umount = true;
+}
+
+static int virtio_fs_fill_super(struct super_block *sb, struct fs_context *fsc)
{
- struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ struct fuse_conn *fc = fm->fc;
struct virtio_fs *fs = fc->iq.priv;
+ struct fuse_fs_context *ctx = fsc->fs_private;
unsigned int i;
int err;
- struct fuse_fs_context ctx = {
- .rootmode = S_IFDIR,
- .default_permissions = 1,
- .allow_other = 1,
- .max_read = UINT_MAX,
- .blksize = 512,
- .destroy = true,
- .no_control = true,
- .no_force_umount = true,
- .no_mount_options = true,
- };
+ virtio_fs_ctx_set_defaults(ctx);
mutex_lock(&virtio_fs_mutex);
/* After holding mutex, make sure virtiofs device is still there.
@@ -1067,7 +1314,7 @@ static int virtio_fs_fill_super(struct super_block *sb)
err = -ENOMEM;
/* Allocate fuse_dev for hiprio and notification queues */
- for (i = 0; i < VQ_REQUEST; i++) {
+ for (i = 0; i < fs->nvqs; i++) {
struct virtio_fs_vq *fsvq = &fs->vqs[i];
fsvq->fud = fuse_dev_alloc();
@@ -1075,24 +1322,30 @@ static int virtio_fs_fill_super(struct super_block *sb)
goto err_free_fuse_devs;
}
- ctx.fudptr = (void **)&fs->vqs[VQ_REQUEST].fud;
- err = fuse_fill_super_common(sb, &ctx);
+ /* virtiofs allocates and installs its own fuse devices */
+ ctx->fudptr = NULL;
+ if (ctx->dax_mode != FUSE_DAX_NEVER) {
+ if (ctx->dax_mode == FUSE_DAX_ALWAYS && !fs->dax_dev) {
+ err = -EINVAL;
+ pr_err("virtio-fs: dax can't be enabled as filesystem"
+ " device does not support it.\n");
+ goto err_free_fuse_devs;
+ }
+ ctx->dax_dev = fs->dax_dev;
+ }
+ err = fuse_fill_super_common(sb, ctx);
if (err < 0)
goto err_free_fuse_devs;
- fc = fs->vqs[VQ_REQUEST].fud->fc;
-
for (i = 0; i < fs->nvqs; i++) {
struct virtio_fs_vq *fsvq = &fs->vqs[i];
- if (i == VQ_REQUEST)
- continue; /* already initialized */
fuse_dev_install(fsvq->fud, fc);
}
/* Previous unmount will stop all queues. Start these again */
virtio_fs_start_all_queues(fs);
- fuse_send_init(fc);
+ fuse_send_init(fm);
mutex_unlock(&virtio_fs_mutex);
return 0;
@@ -1103,18 +1356,17 @@ err:
return err;
}
-static void virtio_kill_sb(struct super_block *sb)
+static void virtio_fs_conn_destroy(struct fuse_mount *fm)
{
- struct fuse_conn *fc = get_fuse_conn_super(sb);
- struct virtio_fs *vfs;
- struct virtio_fs_vq *fsvq;
+ struct fuse_conn *fc = fm->fc;
+ struct virtio_fs *vfs = fc->iq.priv;
+ struct virtio_fs_vq *fsvq = &vfs->vqs[VQ_HIPRIO];
- /* If mount failed, we can still be called without any fc */
- if (!fc)
- return fuse_kill_sb_anon(sb);
-
- vfs = fc->iq.priv;
- fsvq = &vfs->vqs[VQ_HIPRIO];
+ /* Stop dax worker. Soon evict_inodes() will be called which
+ * will free all memory ranges belonging to all inodes.
+ */
+ if (IS_ENABLED(CONFIG_FUSE_DAX))
+ fuse_dax_cancel_work(fc);
/* Stop forget queue. Soon destroy will be sent */
spin_lock(&fsvq->lock);
@@ -1122,9 +1374,9 @@ static void virtio_kill_sb(struct super_block *sb)
spin_unlock(&fsvq->lock);
virtio_fs_drain_all_queues(vfs);
- fuse_kill_sb_anon(sb);
+ fuse_conn_destroy(fm);
- /* fuse_kill_sb_anon() must have sent destroy. Stop all queues
+ /* fuse_conn_destroy() must have sent destroy. Stop all queues
* and drain one more time and free fuse devices. Freeing fuse
* devices will drop their reference on fuse_conn and that in
* turn will drop its reference on virtio_fs object.
@@ -1134,32 +1386,38 @@ static void virtio_kill_sb(struct super_block *sb)
virtio_fs_free_devs(vfs);
}
-static int virtio_fs_test_super(struct super_block *sb,
- struct fs_context *fsc)
+static void virtio_kill_sb(struct super_block *sb)
{
- struct fuse_conn *fc = fsc->s_fs_info;
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ bool last;
- return fc->iq.priv == get_fuse_conn_super(sb)->iq.priv;
+ /* If mount failed, we can still be called without any fc */
+ if (sb->s_root) {
+ last = fuse_mount_remove(fm);
+ if (last)
+ virtio_fs_conn_destroy(fm);
+ }
+ kill_anon_super(sb);
+ fuse_mount_destroy(fm);
}
-static int virtio_fs_set_super(struct super_block *sb,
- struct fs_context *fsc)
+static int virtio_fs_test_super(struct super_block *sb,
+ struct fs_context *fsc)
{
- int err;
+ struct fuse_mount *fsc_fm = fsc->s_fs_info;
+ struct fuse_mount *sb_fm = get_fuse_mount_super(sb);
- err = get_anon_bdev(&sb->s_dev);
- if (!err)
- fuse_conn_get(fsc->s_fs_info);
-
- return err;
+ return fsc_fm->fc->iq.priv == sb_fm->fc->iq.priv;
}
static int virtio_fs_get_tree(struct fs_context *fsc)
{
struct virtio_fs *fs;
struct super_block *sb;
- struct fuse_conn *fc;
- int err;
+ struct fuse_conn *fc = NULL;
+ struct fuse_mount *fm;
+ unsigned int virtqueue_size;
+ int err = -EIO;
/* This gets a reference on virtio_fs object. This ptr gets installed
* in fc->iq->priv. Once fuse_conn is going away, it calls ->put()
@@ -1171,27 +1429,38 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
return -EINVAL;
}
+ virtqueue_size = virtqueue_get_vring_size(fs->vqs[VQ_REQUEST].vq);
+ if (WARN_ON(virtqueue_size <= FUSE_HEADER_OVERHEAD))
+ goto out_err;
+
+ err = -ENOMEM;
fc = kzalloc(sizeof(struct fuse_conn), GFP_KERNEL);
- if (!fc) {
- mutex_lock(&virtio_fs_mutex);
- virtio_fs_put(fs);
- mutex_unlock(&virtio_fs_mutex);
- return -ENOMEM;
- }
+ if (!fc)
+ goto out_err;
- fuse_conn_init(fc, get_user_ns(current_user_ns()), &virtio_fs_fiq_ops,
- fs);
+ fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL);
+ if (!fm)
+ goto out_err;
+
+ fuse_conn_init(fc, fm, fsc->user_ns, &virtio_fs_fiq_ops, fs);
fc->release = fuse_free_conn;
fc->delete_stale = true;
+ fc->auto_submounts = true;
+ fc->sync_fs = true;
+
+ /* Tell FUSE to split requests that exceed the virtqueue's size */
+ fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit,
+ virtqueue_size - FUSE_HEADER_OVERHEAD);
- fsc->s_fs_info = fc;
- sb = sget_fc(fsc, virtio_fs_test_super, virtio_fs_set_super);
- fuse_conn_put(fc);
+ fsc->s_fs_info = fm;
+ sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc);
+ if (fsc->s_fs_info)
+ fuse_mount_destroy(fm);
if (IS_ERR(sb))
return PTR_ERR(sb);
if (!sb->s_root) {
- err = virtio_fs_fill_super(sb);
+ err = virtio_fs_fill_super(sb, fsc);
if (err) {
deactivate_locked_super(sb);
return err;
@@ -1203,14 +1472,32 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
WARN_ON(fsc->root);
fsc->root = dget(sb->s_root);
return 0;
+
+out_err:
+ kfree(fc);
+ mutex_lock(&virtio_fs_mutex);
+ virtio_fs_put(fs);
+ mutex_unlock(&virtio_fs_mutex);
+ return err;
}
static const struct fs_context_operations virtio_fs_context_ops = {
+ .free = virtio_fs_free_fsc,
+ .parse_param = virtio_fs_parse_param,
.get_tree = virtio_fs_get_tree,
};
static int virtio_fs_init_fs_context(struct fs_context *fsc)
{
+ struct fuse_fs_context *ctx;
+
+ if (fsc->purpose == FS_CONTEXT_FOR_SUBMOUNT)
+ return fuse_init_fs_context_submount(fsc);
+
+ ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+ fsc->fs_private = ctx;
fsc->ops = &virtio_fs_context_ops;
return 0;
}
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c
index 20d052e08b3b..0d3e7177fce0 100644
--- a/fs/fuse/xattr.c
+++ b/fs/fuse/xattr.c
@@ -12,50 +12,52 @@
#include <linux/posix_acl_xattr.h>
int fuse_setxattr(struct inode *inode, const char *name, const void *value,
- size_t size, int flags)
+ size_t size, int flags, unsigned int extra_flags)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_setxattr_in inarg;
int err;
- if (fc->no_setxattr)
+ if (fm->fc->no_setxattr)
return -EOPNOTSUPP;
memset(&inarg, 0, sizeof(inarg));
inarg.size = size;
inarg.flags = flags;
+ inarg.setxattr_flags = extra_flags;
+
args.opcode = FUSE_SETXATTR;
args.nodeid = get_node_id(inode);
args.in_numargs = 3;
- args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].size = fm->fc->setxattr_ext ?
+ sizeof(inarg) : FUSE_COMPAT_SETXATTR_IN_SIZE;
args.in_args[0].value = &inarg;
args.in_args[1].size = strlen(name) + 1;
args.in_args[1].value = name;
args.in_args[2].size = size;
args.in_args[2].value = value;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
- fc->no_setxattr = 1;
+ fm->fc->no_setxattr = 1;
err = -EOPNOTSUPP;
}
- if (!err) {
- fuse_invalidate_attr(inode);
+ if (!err)
fuse_update_ctime(inode);
- }
+
return err;
}
ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value,
size_t size)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_getxattr_in inarg;
struct fuse_getxattr_out outarg;
ssize_t ret;
- if (fc->no_getxattr)
+ if (fm->fc->no_getxattr)
return -EOPNOTSUPP;
memset(&inarg, 0, sizeof(inarg));
@@ -77,11 +79,11 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value,
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
}
- ret = fuse_simple_request(fc, &args);
+ ret = fuse_simple_request(fm, &args);
if (!ret && !size)
ret = min_t(ssize_t, outarg.size, XATTR_SIZE_MAX);
if (ret == -ENOSYS) {
- fc->no_getxattr = 1;
+ fm->fc->no_getxattr = 1;
ret = -EOPNOTSUPP;
}
return ret;
@@ -107,16 +109,19 @@ static int fuse_verify_xattr_list(char *list, size_t size)
ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
{
struct inode *inode = d_inode(entry);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_getxattr_in inarg;
struct fuse_getxattr_out outarg;
ssize_t ret;
- if (!fuse_allow_current_process(fc))
+ if (fuse_is_bad(inode))
+ return -EIO;
+
+ if (!fuse_allow_current_process(fm->fc))
return -EACCES;
- if (fc->no_listxattr)
+ if (fm->fc->no_listxattr)
return -EOPNOTSUPP;
memset(&inarg, 0, sizeof(inarg));
@@ -136,13 +141,13 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
}
- ret = fuse_simple_request(fc, &args);
+ ret = fuse_simple_request(fm, &args);
if (!ret && !size)
ret = min_t(ssize_t, outarg.size, XATTR_LIST_MAX);
if (ret > 0 && size)
ret = fuse_verify_xattr_list(list, ret);
if (ret == -ENOSYS) {
- fc->no_listxattr = 1;
+ fm->fc->no_listxattr = 1;
ret = -EOPNOTSUPP;
}
return ret;
@@ -150,11 +155,11 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
int fuse_removexattr(struct inode *inode, const char *name)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
int err;
- if (fc->no_removexattr)
+ if (fm->fc->no_removexattr)
return -EOPNOTSUPP;
args.opcode = FUSE_REMOVEXATTR;
@@ -162,15 +167,14 @@ int fuse_removexattr(struct inode *inode, const char *name)
args.in_numargs = 1;
args.in_args[0].size = strlen(name) + 1;
args.in_args[0].value = name;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
- fc->no_removexattr = 1;
+ fm->fc->no_removexattr = 1;
err = -EOPNOTSUPP;
}
- if (!err) {
- fuse_invalidate_attr(inode);
+ if (!err)
fuse_update_ctime(inode);
- }
+
return err;
}
@@ -178,18 +182,25 @@ static int fuse_xattr_get(const struct xattr_handler *handler,
struct dentry *dentry, struct inode *inode,
const char *name, void *value, size_t size)
{
+ if (fuse_is_bad(inode))
+ return -EIO;
+
return fuse_getxattr(inode, name, value, size);
}
static int fuse_xattr_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
struct dentry *dentry, struct inode *inode,
const char *name, const void *value, size_t size,
int flags)
{
+ if (fuse_is_bad(inode))
+ return -EIO;
+
if (!value)
return fuse_removexattr(inode, name);
- return fuse_setxattr(inode, name, value, size, flags);
+ return fuse_setxattr(inode, name, value, size, flags, 0);
}
static bool no_xattr_list(struct dentry *dentry)
@@ -205,6 +216,7 @@ static int no_xattr_get(const struct xattr_handler *handler,
}
static int no_xattr_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
struct dentry *dentry, struct inode *nodee,
const char *name, const void *value,
size_t size, int flags)