aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl7
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl6
-rw-r--r--fs/Makefile2
-rw-r--r--fs/file_table.c9
-rw-r--r--fs/fs_context.c160
-rw-r--r--fs/fsopen.c477
-rw-r--r--fs/internal.h4
-rw-r--r--fs/namespace.c477
-rw-r--r--include/linux/fs.h7
-rw-r--r--include/linux/fs_context.h38
-rw-r--r--include/linux/lsm_hooks.h6
-rw-r--r--include/linux/module.h6
-rw-r--r--include/linux/security.h7
-rw-r--r--include/linux/syscalls.h9
-rw-r--r--include/uapi/linux/fcntl.h2
-rw-r--r--include/uapi/linux/mount.h62
-rw-r--r--samples/Kconfig9
-rw-r--r--samples/Makefile2
-rw-r--r--samples/vfs/Makefile (renamed from samples/statx/Makefile)5
-rw-r--r--samples/vfs/test-fsmount.c133
-rw-r--r--samples/vfs/test-statx.c (renamed from samples/statx/test-statx.c)11
-rw-r--r--security/security.c5
22 files changed, 1353 insertions, 91 deletions
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 1f9607ed087c..4cd5f982b1e5 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -398,7 +398,12 @@
384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl
385 i386 io_pgetevents sys_io_pgetevents_time32 __ia32_compat_sys_io_pgetevents
386 i386 rseq sys_rseq __ia32_sys_rseq
-# don't use numbers 387 through 392, add new calls at the end
+387 i386 open_tree sys_open_tree __ia32_sys_open_tree
+388 i386 move_mount sys_move_mount __ia32_sys_move_mount
+389 i386 fsopen sys_fsopen __ia32_sys_fsopen
+390 i386 fsconfig sys_fsconfig __ia32_sys_fsconfig
+391 i386 fsmount sys_fsmount __ia32_sys_fsmount
+392 i386 fspick sys_fspick __ia32_sys_fspick
393 i386 semget sys_semget __ia32_sys_semget
394 i386 semctl sys_semctl __ia32_compat_sys_semctl
395 i386 shmget sys_shmget __ia32_sys_shmget
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 92ee0b4378d4..64ca0d06259a 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -343,6 +343,12 @@
332 common statx __x64_sys_statx
333 common io_pgetevents __x64_sys_io_pgetevents
334 common rseq __x64_sys_rseq
+335 common open_tree __x64_sys_open_tree
+336 common move_mount __x64_sys_move_mount
+337 common fsopen __x64_sys_fsopen
+338 common fsconfig __x64_sys_fsconfig
+339 common fsmount __x64_sys_fsmount
+340 common fspick __x64_sys_fspick
# don't use numbers 387 through 423, add new calls after the last
# 'common' entry
424 common pidfd_send_signal __x64_sys_pidfd_send_signal
diff --git a/fs/Makefile b/fs/Makefile
index 35945f8139e6..5a51bc2489ba 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -13,7 +13,7 @@ obj-y := open.o read_write.o file_table.o super.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o splice.o sync.o utimes.o d_path.o \
stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
- fs_types.o fs_context.o fs_parser.o
+ fs_types.o fs_context.o fs_parser.o fsopen.o
ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o block_dev.o direct-io.o mpage.o
diff --git a/fs/file_table.c b/fs/file_table.c
index 155d7514a094..3f9c1b452c1d 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -255,6 +255,7 @@ static void __fput(struct file *file)
struct dentry *dentry = file->f_path.dentry;
struct vfsmount *mnt = file->f_path.mnt;
struct inode *inode = file->f_inode;
+ fmode_t mode = file->f_mode;
if (unlikely(!(file->f_mode & FMODE_OPENED)))
goto out;
@@ -277,18 +278,20 @@ static void __fput(struct file *file)
if (file->f_op->release)
file->f_op->release(inode, file);
if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
- !(file->f_mode & FMODE_PATH))) {
+ !(mode & FMODE_PATH))) {
cdev_put(inode->i_cdev);
}
fops_put(file->f_op);
put_pid(file->f_owner.pid);
- if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
+ if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
i_readcount_dec(inode);
- if (file->f_mode & FMODE_WRITER) {
+ if (mode & FMODE_WRITER) {
put_write_access(inode);
__mnt_drop_write(mnt);
}
dput(dentry);
+ if (unlikely(mode & FMODE_NEED_UNMOUNT))
+ dissolve_on_fput(mnt);
mntput(mnt);
out:
file_free(file);
diff --git a/fs/fs_context.c b/fs/fs_context.c
index 87e3546b9a52..a47ccd5a4a78 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -11,6 +11,7 @@
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/fs.h>
@@ -23,6 +24,7 @@
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <net/net_namespace.h>
+#include <asm/sections.h>
#include "mount.h"
#include "internal.h"
@@ -271,6 +273,8 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type,
fc->cred = get_current_cred();
fc->net_ns = get_net(current->nsproxy->net_ns);
+ mutex_init(&fc->uapi_mutex);
+
switch (purpose) {
case FS_CONTEXT_FOR_MOUNT:
fc->user_ns = get_user_ns(fc->cred->user_ns);
@@ -353,6 +357,8 @@ struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc)
if (!fc)
return ERR_PTR(-ENOMEM);
+ mutex_init(&fc->uapi_mutex);
+
fc->fs_private = NULL;
fc->s_fs_info = NULL;
fc->source = NULL;
@@ -361,6 +367,8 @@ struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc)
get_net(fc->net_ns);
get_user_ns(fc->user_ns);
get_cred(fc->cred);
+ if (fc->log)
+ refcount_inc(&fc->log->usage);
/* Can't call put until we've called ->dup */
ret = fc->ops->dup(fc, src_fc);
@@ -378,7 +386,6 @@ err_fc:
}
EXPORT_SYMBOL(vfs_dup_fs_context);
-#ifdef CONFIG_PRINTK
/**
* logfc - Log a message to a filesystem context
* @fc: The filesystem context to log to.
@@ -386,27 +393,100 @@ EXPORT_SYMBOL(vfs_dup_fs_context);
*/
void logfc(struct fs_context *fc, const char *fmt, ...)
{
+ static const char store_failure[] = "OOM: Can't store error string";
+ struct fc_log *log = fc ? fc->log : NULL;
+ const char *p;
va_list va;
+ char *q;
+ u8 freeable;
va_start(va, fmt);
-
- switch (fmt[0]) {
- case 'w':
- vprintk_emit(0, LOGLEVEL_WARNING, NULL, 0, fmt, va);
- break;
- case 'e':
- vprintk_emit(0, LOGLEVEL_ERR, NULL, 0, fmt, va);
- break;
- default:
- vprintk_emit(0, LOGLEVEL_NOTICE, NULL, 0, fmt, va);
- break;
+ if (!strchr(fmt, '%')) {
+ p = fmt;
+ goto unformatted_string;
+ }
+ if (strcmp(fmt, "%s") == 0) {
+ p = va_arg(va, const char *);
+ goto unformatted_string;
}
- pr_cont("\n");
+ q = kvasprintf(GFP_KERNEL, fmt, va);
+copied_string:
+ if (!q)
+ goto store_failure;
+ freeable = 1;
+ goto store_string;
+
+unformatted_string:
+ if ((unsigned long)p >= (unsigned long)__start_rodata &&
+ (unsigned long)p < (unsigned long)__end_rodata)
+ goto const_string;
+ if (log && within_module_core((unsigned long)p, log->owner))
+ goto const_string;
+ q = kstrdup(p, GFP_KERNEL);
+ goto copied_string;
+
+store_failure:
+ p = store_failure;
+const_string:
+ q = (char *)p;
+ freeable = 0;
+store_string:
+ if (!log) {
+ switch (fmt[0]) {
+ case 'w':
+ printk(KERN_WARNING "%s\n", q + 2);
+ break;
+ case 'e':
+ printk(KERN_ERR "%s\n", q + 2);
+ break;
+ default:
+ printk(KERN_NOTICE "%s\n", q + 2);
+ break;
+ }
+ if (freeable)
+ kfree(q);
+ } else {
+ unsigned int logsize = ARRAY_SIZE(log->buffer);
+ u8 index;
+
+ index = log->head & (logsize - 1);
+ BUILD_BUG_ON(sizeof(log->head) != sizeof(u8) ||
+ sizeof(log->tail) != sizeof(u8));
+ if ((u8)(log->head - log->tail) == logsize) {
+ /* The buffer is full, discard the oldest message */
+ if (log->need_free & (1 << index))
+ kfree(log->buffer[index]);
+ log->tail++;
+ }
+
+ log->buffer[index] = q;
+ log->need_free &= ~(1 << index);
+ log->need_free |= freeable << index;
+ log->head++;
+ }
va_end(va);
}
EXPORT_SYMBOL(logfc);
-#endif
+
+/*
+ * Free a logging structure.
+ */
+static void put_fc_log(struct fs_context *fc)
+{
+ struct fc_log *log = fc->log;
+ int i;
+
+ if (log) {
+ if (refcount_dec_and_test(&log->usage)) {
+ fc->log = NULL;
+ for (i = 0; i <= 7; i++)
+ if (log->need_free & (1 << i))
+ kfree(log->buffer[i]);
+ kfree(log);
+ }
+ }
+}
/**
* put_fs_context - Dispose of a superblock configuration context.
@@ -431,6 +511,7 @@ void put_fs_context(struct fs_context *fc)
put_user_ns(fc->user_ns);
put_cred(fc->cred);
kfree(fc->subtype);
+ put_fc_log(fc);
put_filesystem(fc->fs_type);
kfree(fc->source);
kfree(fc);
@@ -640,3 +721,54 @@ int parse_monolithic_mount_data(struct fs_context *fc, void *data)
return monolithic_mount_data(fc, data);
}
+
+/*
+ * Clean up a context after performing an action on it and put it into a state
+ * from where it can be used to reconfigure a superblock.
+ *
+ * Note that here we do only the parts that can't fail; the rest is in
+ * finish_clean_context() below and in between those fs_context is marked
+ * FS_CONTEXT_AWAITING_RECONF. The reason for splitup is that after
+ * successful mount or remount we need to report success to userland.
+ * Trying to do full reinit (for the sake of possible subsequent remount)
+ * and failing to allocate memory would've put us into a nasty situation.
+ * So here we only discard the old state and reinitialization is left
+ * until we actually try to reconfigure.
+ */
+void vfs_clean_context(struct fs_context *fc)
+{
+ if (fc->need_free && fc->ops && fc->ops->free)
+ fc->ops->free(fc);
+ fc->need_free = false;
+ fc->fs_private = NULL;
+ fc->s_fs_info = NULL;
+ fc->sb_flags = 0;
+ security_free_mnt_opts(&fc->security);
+ kfree(fc->subtype);
+ fc->subtype = NULL;
+ kfree(fc->source);
+ fc->source = NULL;
+
+ fc->purpose = FS_CONTEXT_FOR_RECONFIGURE;
+ fc->phase = FS_CONTEXT_AWAITING_RECONF;
+}
+
+int finish_clean_context(struct fs_context *fc)
+{
+ int error;
+
+ if (fc->phase != FS_CONTEXT_AWAITING_RECONF)
+ return 0;
+
+ if (fc->fs_type->init_fs_context)
+ error = fc->fs_type->init_fs_context(fc);
+ else
+ error = legacy_init_fs_context(fc);
+ if (unlikely(error)) {
+ fc->phase = FS_CONTEXT_FAILED;
+ return error;
+ }
+ fc->need_free = true;
+ fc->phase = FS_CONTEXT_RECONF_PARAMS;
+ return 0;
+}
diff --git a/fs/fsopen.c b/fs/fsopen.c
new file mode 100644
index 000000000000..3bb9c0c8cbcc
--- /dev/null
+++ b/fs/fsopen.c
@@ -0,0 +1,477 @@
+/* Filesystem access-by-fd.
+ *
+ * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/security.h>
+#include <linux/anon_inodes.h>
+#include <linux/namei.h>
+#include <linux/file.h>
+#include <uapi/linux/mount.h>
+#include "internal.h"
+#include "mount.h"
+
+/*
+ * Allow the user to read back any error, warning or informational messages.
+ */
+static ssize_t fscontext_read(struct file *file,
+ char __user *_buf, size_t len, loff_t *pos)
+{
+ struct fs_context *fc = file->private_data;
+ struct fc_log *log = fc->log;
+ unsigned int logsize = ARRAY_SIZE(log->buffer);
+ ssize_t ret;
+ char *p;
+ bool need_free;
+ int index, n;
+
+ ret = mutex_lock_interruptible(&fc->uapi_mutex);
+ if (ret < 0)
+ return ret;
+
+ if (log->head == log->tail) {
+ mutex_unlock(&fc->uapi_mutex);
+ return -ENODATA;
+ }
+
+ index = log->tail & (logsize - 1);
+ p = log->buffer[index];
+ need_free = log->need_free & (1 << index);
+ log->buffer[index] = NULL;
+ log->need_free &= ~(1 << index);
+ log->tail++;
+ mutex_unlock(&fc->uapi_mutex);
+
+ ret = -EMSGSIZE;
+ n = strlen(p);
+ if (n > len)
+ goto err_free;
+ ret = -EFAULT;
+ if (copy_to_user(_buf, p, n) != 0)
+ goto err_free;
+ ret = n;
+
+err_free:
+ if (need_free)
+ kfree(p);
+ return ret;
+}
+
+static int fscontext_release(struct inode *inode, struct file *file)
+{
+ struct fs_context *fc = file->private_data;
+
+ if (fc) {
+ file->private_data = NULL;
+ put_fs_context(fc);
+ }
+ return 0;
+}
+
+const struct file_operations fscontext_fops = {
+ .read = fscontext_read,
+ .release = fscontext_release,
+ .llseek = no_llseek,
+};
+
+/*
+ * Attach a filesystem context to a file and an fd.
+ */
+static int fscontext_create_fd(struct fs_context *fc, unsigned int o_flags)
+{
+ int fd;
+
+ fd = anon_inode_getfd("fscontext", &fscontext_fops, fc,
+ O_RDWR | o_flags);
+ if (fd < 0)
+ put_fs_context(fc);
+ return fd;
+}
+
+static int fscontext_alloc_log(struct fs_context *fc)
+{
+ fc->log = kzalloc(sizeof(*fc->log), GFP_KERNEL);
+ if (!fc->log)
+ return -ENOMEM;
+ refcount_set(&fc->log->usage, 1);
+ fc->log->owner = fc->fs_type->owner;
+ return 0;
+}
+
+/*
+ * Open a filesystem by name so that it can be configured for mounting.
+ *
+ * We are allowed to specify a container in which the filesystem will be
+ * opened, thereby indicating which namespaces will be used (notably, which
+ * network namespace will be used for network filesystems).
+ */
+SYSCALL_DEFINE2(fsopen, const char __user *, _fs_name, unsigned int, flags)
+{
+ struct file_system_type *fs_type;
+ struct fs_context *fc;
+ const char *fs_name;
+ int ret;
+
+ if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (flags & ~FSOPEN_CLOEXEC)
+ return -EINVAL;
+
+ fs_name = strndup_user(_fs_name, PAGE_SIZE);
+ if (IS_ERR(fs_name))
+ return PTR_ERR(fs_name);
+
+ fs_type = get_fs_type(fs_name);
+ kfree(fs_name);
+ if (!fs_type)
+ return -ENODEV;
+
+ fc = fs_context_for_mount(fs_type, 0);
+ put_filesystem(fs_type);
+ if (IS_ERR(fc))
+ return PTR_ERR(fc);
+
+ fc->phase = FS_CONTEXT_CREATE_PARAMS;
+
+ ret = fscontext_alloc_log(fc);
+ if (ret < 0)
+ goto err_fc;
+
+ return fscontext_create_fd(fc, flags & FSOPEN_CLOEXEC ? O_CLOEXEC : 0);
+
+err_fc:
+ put_fs_context(fc);
+ return ret;
+}
+
+/*
+ * Pick a superblock into a context for reconfiguration.
+ */
+SYSCALL_DEFINE3(fspick, int, dfd, const char __user *, path, unsigned int, flags)
+{
+ struct fs_context *fc;
+ struct path target;
+ unsigned int lookup_flags;
+ int ret;
+
+ if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if ((flags & ~(FSPICK_CLOEXEC |
+ FSPICK_SYMLINK_NOFOLLOW |
+ FSPICK_NO_AUTOMOUNT |
+ FSPICK_EMPTY_PATH)) != 0)
+ return -EINVAL;
+
+ lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
+ if (flags & FSPICK_SYMLINK_NOFOLLOW)
+ lookup_flags &= ~LOOKUP_FOLLOW;
+ if (flags & FSPICK_NO_AUTOMOUNT)
+ lookup_flags &= ~LOOKUP_AUTOMOUNT;
+ if (flags & FSPICK_EMPTY_PATH)
+ lookup_flags |= LOOKUP_EMPTY;
+ ret = user_path_at(dfd, path, lookup_flags, &target);
+ if (ret < 0)
+ goto err;
+
+ ret = -EINVAL;
+ if (target.mnt->mnt_root != target.dentry)
+ goto err_path;
+
+ fc = fs_context_for_reconfigure(target.dentry, 0, 0);
+ if (IS_ERR(fc)) {
+ ret = PTR_ERR(fc);
+ goto err_path;
+ }
+
+ fc->phase = FS_CONTEXT_RECONF_PARAMS;
+
+ ret = fscontext_alloc_log(fc);
+ if (ret < 0)
+ goto err_fc;
+
+ path_put(&target);
+ return fscontext_create_fd(fc, flags & FSPICK_CLOEXEC ? O_CLOEXEC : 0);
+
+err_fc:
+ put_fs_context(fc);
+err_path:
+ path_put(&target);
+err:
+ return ret;
+}
+
+/*
+ * Check the state and apply the configuration. Note that this function is
+ * allowed to 'steal' the value by setting param->xxx to NULL before returning.
+ */
+static int vfs_fsconfig_locked(struct fs_context *fc, int cmd,
+ struct fs_parameter *param)
+{
+ struct super_block *sb;
+ int ret;
+
+ ret = finish_clean_context(fc);
+ if (ret)
+ return ret;
+ switch (cmd) {
+ case FSCONFIG_CMD_CREATE:
+ if (fc->phase != FS_CONTEXT_CREATE_PARAMS)
+ return -EBUSY;
+ fc->phase = FS_CONTEXT_CREATING;
+ ret = vfs_get_tree(fc);
+ if (ret)
+ break;
+ sb = fc->root->d_sb;
+ ret = security_sb_kern_mount(sb);
+ if (unlikely(ret)) {
+ fc_drop_locked(fc);
+ break;
+ }
+ up_write(&sb->s_umount);
+ fc->phase = FS_CONTEXT_AWAITING_MOUNT;
+ return 0;
+ case FSCONFIG_CMD_RECONFIGURE:
+ if (fc->phase != FS_CONTEXT_RECONF_PARAMS)
+ return -EBUSY;
+ fc->phase = FS_CONTEXT_RECONFIGURING;
+ sb = fc->root->d_sb;
+ if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ break;
+ }
+ down_write(&sb->s_umount);
+ ret = reconfigure_super(fc);
+ up_write(&sb->s_umount);
+ if (ret)
+ break;
+ vfs_clean_context(fc);
+ return 0;
+ default:
+ if (fc->phase != FS_CONTEXT_CREATE_PARAMS &&
+ fc->phase != FS_CONTEXT_RECONF_PARAMS)
+ return -EBUSY;
+
+ return vfs_parse_fs_param(fc, param);
+ }
+ fc->phase = FS_CONTEXT_FAILED;
+ return ret;
+}
+
+/**
+ * sys_fsconfig - Set parameters and trigger actions on a context
+ * @fd: The filesystem context to act upon
+ * @cmd: The action to take
+ * @_key: Where appropriate, the parameter key to set
+ * @_value: Where appropriate, the parameter value to set
+ * @aux: Additional information for the value
+ *
+ * This system call is used to set parameters on a context, including
+ * superblock settings, data source and security labelling.
+ *
+ * Actions include triggering the creation of a superblock and the
+ * reconfiguration of the superblock attached to the specified context.
+ *
+ * When setting a parameter, @cmd indicates the type of value being proposed
+ * and @_key indicates the parameter to be altered.
+ *
+ * @_value and @aux are used to specify the value, should a value be required:
+ *
+ * (*) fsconfig_set_flag: No value is specified. The parameter must be boolean
+ * in nature. The key may be prefixed with "no" to invert the
+ * setting. @_value must be NULL and @aux must be 0.
+ *
+ * (*) fsconfig_set_string: A string value is specified. The parameter can be
+ * expecting boolean, integer, string or take a path. A conversion to an
+ * appropriate type will be attempted (which may include looking up as a
+ * path). @_value points to a NUL-terminated string and @aux must be 0.
+ *
+ * (*) fsconfig_set_binary: A binary blob is specified. @_value points to the
+ * blob and @aux indicates its size. The parameter must be expecting a
+ * blob.
+ *
+ * (*) fsconfig_set_path: A non-empty path is specified. The parameter must be
+ * expecting a path object. @_value points to a NUL-terminated string that
+ * is the path and @aux is a file descriptor at which to start a relative
+ * lookup or AT_FDCWD.
+ *
+ * (*) fsconfig_set_path_empty: As fsconfig_set_path, but with AT_EMPTY_PATH
+ * implied.
+ *
+ * (*) fsconfig_set_fd: An open file descriptor is specified. @_value must be
+ * NULL and @aux indicates the file descriptor.
+ */
+SYSCALL_DEFINE5(fsconfig,
+ int, fd,
+ unsigned int, cmd,
+ const char __user *, _key,
+ const void __user *, _value,
+ int, aux)
+{
+ struct fs_context *fc;
+ struct fd f;
+ int ret;
+
+ struct fs_parameter param = {
+ .type = fs_value_is_undefined,
+ };
+
+ if (fd < 0)
+ return -EINVAL;
+
+ switch (cmd) {
+ case FSCONFIG_SET_FLAG:
+ if (!_key || _value || aux)
+ return -EINVAL;
+ break;
+ case FSCONFIG_SET_STRING:
+ if (!_key || !_value || aux)
+ return -EINVAL;
+ break;
+ case FSCONFIG_SET_BINARY:
+ if (!_key || !_value || aux <= 0 || aux > 1024 * 1024)
+ return -EINVAL;
+ break;
+ case FSCONFIG_SET_PATH:
+ case FSCONFIG_SET_PATH_EMPTY:
+ if (!_key || !_value || (aux != AT_FDCWD && aux < 0))
+ return -EINVAL;
+ break;
+ case FSCONFIG_SET_FD:
+ if (!_key || _value || aux < 0)
+ return -EINVAL;
+ break;
+ case FSCONFIG_CMD_CREATE:
+ case FSCONFIG_CMD_RECONFIGURE:
+ if (_key || _value || aux)
+ return -EINVAL;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
+ ret = -EINVAL;
+ if (f.file->f_op != &fscontext_fops)
+ goto out_f;
+
+ fc = f.file->private_data;
+ if (fc->ops == &legacy_fs_context_ops) {
+ switch (cmd) {
+ case FSCONFIG_SET_BINARY:
+ case FSCONFIG_SET_PATH:
+ case FSCONFIG_SET_PATH_EMPTY:
+ case FSCONFIG_SET_FD:
+ ret = -EOPNOTSUPP;
+ goto out_f;
+ }
+ }
+
+ if (_key) {
+ param.key = strndup_user(_key, 256);
+ if (IS_ERR(param.key)) {
+ ret = PTR_ERR(param.key);
+ goto out_f;
+ }
+ }
+
+ switch (cmd) {
+ case FSCONFIG_SET_FLAG:
+ param.type = fs_value_is_flag;
+ break;
+ case FSCONFIG_SET_STRING:
+ param.type = fs_value_is_string;
+ param.string = strndup_user(_value, 256);
+ if (IS_ERR(param.string)) {
+ ret = PTR_ERR(param.string);
+ goto out_key;
+ }
+ param.size = strlen(param.string);
+ break;
+ case FSCONFIG_SET_BINARY:
+ param.type = fs_value_is_blob;
+ param.size = aux;
+ param.blob = memdup_user_nul(_value, aux);
+ if (IS_ERR(param.blob)) {
+ ret = PTR_ERR(param.blob);
+ goto out_key;
+ }
+ break;
+ case FSCONFIG_SET_PATH:
+ param.type = fs_value_is_filename;
+ param.name = getname_flags(_value, 0, NULL);
+ if (IS_ERR(param.name)) {
+ ret = PTR_ERR(param.name);
+ goto out_key;
+ }
+ param.dirfd = aux;
+ param.size = strlen(param.name->name);
+ break;
+ case FSCONFIG_SET_PATH_EMPTY:
+ param.type = fs_value_is_filename_empty;
+ param.name = getname_flags(_value, LOOKUP_EMPTY, NULL);
+ if (IS_ERR(param.name)) {
+ ret = PTR_ERR(param.name);
+ goto out_key;
+ }
+ param.dirfd = aux;
+ param.size = strlen(param.name->name);
+ break;
+ case FSCONFIG_SET_FD:
+ param.type = fs_value_is_file;
+ ret = -EBADF;
+ param.file = fget(aux);
+ if (!param.file)
+ goto out_key;
+ break;
+ default:
+ break;
+ }
+
+ ret = mutex_lock_interruptible(&fc->uapi_mutex);
+ if (ret == 0) {
+ ret = vfs_fsconfig_locked(fc, cmd, &param);
+ mutex_unlock(&fc->uapi_mutex);
+ }
+
+ /* Clean up the our record of any value that we obtained from
+ * userspace. Note that the value may have been stolen by the LSM or
+ * filesystem, in which case the value pointer will have been cleared.
+ */
+ switch (cmd) {
+ case FSCONFIG_SET_STRING:
+ case FSCONFIG_SET_BINARY:
+ kfree(param.string);
+ break;
+ case FSCONFIG_SET_PATH:
+ case FSCONFIG_SET_PATH_EMPTY:
+ if (param.name)
+ putname(param.name);
+ break;
+ case FSCONFIG_SET_FD:
+ if (param.file)
+ fput(param.file);
+ break;
+ default:
+ break;
+ }
+out_key:
+ kfree(param.key);
+out_f:
+ fdput(f);
+ return ret;
+}
diff --git a/fs/internal.h b/fs/internal.h
index 17a8ae967493..0010889f2e85 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -55,8 +55,11 @@ extern void __init chrdev_init(void);
/*
* fs_context.c
*/
+extern const struct fs_context_operations legacy_fs_context_ops;
extern int parse_monolithic_mount_data(struct fs_context *, void *);
extern void fc_drop_locked(struct fs_context *);
+extern void vfs_clean_context(struct fs_context *fc);
+extern int finish_clean_context(struct fs_context *fc);
/*
* namei.c
@@ -92,6 +95,7 @@ extern void __init mnt_init(void);
extern int __mnt_want_write_file(struct file *);
extern void __mnt_drop_write_file(struct file *);
+extern void dissolve_on_fput(struct vfsmount *);
/*
* fs_struct.c
*/
diff --git a/fs/namespace.c b/fs/namespace.c
index c9cab307fa77..3357c3d65475 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -20,6 +20,7 @@
#include <linux/init.h> /* init_rootfs */
#include <linux/fs_struct.h> /* get_fs_root et.al. */
#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
+#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
@@ -1832,6 +1833,27 @@ struct vfsmount *collect_mounts(const struct path *path)
return &tree->mnt;
}
+static void free_mnt_ns(struct mnt_namespace *);
+static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);
+
+void dissolve_on_fput(struct vfsmount *mnt)
+{
+ struct mnt_namespace *ns;
+ namespace_lock();
+ lock_mount_hash();
+ ns = real_mount(mnt)->mnt_ns;
+ if (ns) {
+ if (is_anon_ns(ns))
+ umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
+ else
+ ns = NULL;
+ }
+ unlock_mount_hash();
+ namespace_unlock();
+ if (ns)
+ free_mnt_ns(ns);
+}
+
void drop_collected_mounts(struct vfsmount *mnt)
{
namespace_lock();
@@ -2065,6 +2087,10 @@ static int attach_recursive_mnt(struct mount *source_mnt,
attach_mnt(source_mnt, dest_mnt, dest_mp);
touch_mnt_namespace(source_mnt->mnt_ns);
} else {
+ if (source_mnt->mnt_ns) {
+ /* move from anon - the caller will destroy */
+ list_del_init(&source_mnt->mnt_ns->list);
+ }
mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
commit_tree(source_mnt);
}
@@ -2222,6 +2248,30 @@ static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
return false;
}
+static struct mount *__do_loopback(struct path *old_path, int recurse)
+{
+ struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
+
+ if (IS_MNT_UNBINDABLE(old))
+ return mnt;
+
+ if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
+ return mnt;
+
+ if (!recurse && has_locked_children(old, old_path->dentry))
+ return mnt;
+
+ if (recurse)
+ mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
+ else
+ mnt = clone_mnt(old, old_path->dentry, 0);
+
+ if (!IS_ERR(mnt))
+ mnt->mnt.mnt_flags &= ~MNT_LOCKED;
+
+ return mnt;
+}
+
/*
* do loopback mount.
*/
@@ -2229,7 +2279,7 @@ static int do_loopback(struct path *path, const char *old_name,
int recurse)
{
struct path old_path;
- struct mount *mnt = NULL, *old, *parent;
+ struct mount *mnt = NULL, *parent;
struct mountpoint *mp;
int err;
if (!old_name || !*old_name)
@@ -2243,38 +2293,21 @@ static int do_loopback(struct path *path, const char *old_name,
goto out;
mp = lock_mount(path);
- err = PTR_ERR(mp);
- if (IS_ERR(mp))
+ if (IS_ERR(mp)) {
+ err = PTR_ERR(mp);
goto out;
+ }
- old = real_mount(old_path.mnt);
parent = real_mount(path->mnt);
-
- err = -EINVAL;
- if (IS_MNT_UNBINDABLE(old))
- goto out2;
-
if (!check_mnt(parent))
goto out2;
- if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations)
- goto out2;
-
- if (!recurse && has_locked_children(old, old_path.dentry))
- goto out2;
-
- if (recurse)
- mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
- else
- mnt = clone_mnt(old, old_path.dentry, 0);
-
+ mnt = __do_loopback(&old_path, recurse);
if (IS_ERR(mnt)) {
err = PTR_ERR(mnt);
goto out2;
}
- mnt->mnt.mnt_flags &= ~MNT_LOCKED;
-
err = graft_tree(mnt, parent, mp);
if (err) {
lock_mount_hash();
@@ -2288,6 +2321,96 @@ out:
return err;
}
+static struct file *open_detached_copy(struct path *path, bool recursive)
+{
+ struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
+ struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true);
+ struct mount *mnt, *p;
+ struct file *file;
+
+ if (IS_ERR(ns))
+ return ERR_CAST(ns);
+
+ namespace_lock();
+ mnt = __do_loopback(path, recursive);
+ if (IS_ERR(mnt)) {
+ namespace_unlock();
+ free_mnt_ns(ns);
+ return ERR_CAST(mnt);
+ }
+
+ lock_mount_hash();
+ for (p = mnt; p; p = next_mnt(p, mnt)) {
+ p->mnt_ns = ns;
+ ns->mounts++;
+ }
+ ns->root = mnt;
+ list_add_tail(&ns->list, &mnt->mnt_list);
+ mntget(&mnt->mnt);
+ unlock_mount_hash();
+ namespace_unlock();
+
+ mntput(path->mnt);
+ path->mnt = &mnt->mnt;
+ file = dentry_open(path, O_PATH, current_cred());
+ if (IS_ERR(file))
+ dissolve_on_fput(path->mnt);
+ else
+ file->f_mode |= FMODE_NEED_UNMOUNT;
+ return file;
+}
+
+SYSCALL_DEFINE3(open_tree, int, dfd, const char *, filename, unsigned, flags)
+{
+ struct file *file;
+ struct path path;
+ int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
+ bool detached = flags & OPEN_TREE_CLONE;
+ int error;
+ int fd;
+
+ BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
+
+ if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
+ AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
+ OPEN_TREE_CLOEXEC))
+ return -EINVAL;
+
+ if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
+ return -EINVAL;
+
+ if (flags & AT_NO_AUTOMOUNT)
+ lookup_flags &= ~LOOKUP_AUTOMOUNT;
+ if (flags & AT_SYMLINK_NOFOLLOW)
+ lookup_flags &= ~LOOKUP_FOLLOW;
+ if (flags & AT_EMPTY_PATH)
+ lookup_flags |= LOOKUP_EMPTY;
+
+ if (detached && !may_mount())
+ return -EPERM;
+
+ fd = get_unused_fd_flags(flags & O_CLOEXEC);
+ if (fd < 0)
+ return fd;
+
+ error = user_path_at(dfd, filename, lookup_flags, &path);
+ if (unlikely(error)) {
+ file = ERR_PTR(error);
+ } else {
+ if (detached)
+ file = open_detached_copy(&path, flags & AT_RECURSIVE);
+ else
+ file = dentry_open(&path, O_PATH, current_cred());
+ path_put(&path);
+ }
+ if (IS_ERR(file)) {
+ put_unused_fd(fd);
+ return PTR_ERR(file);
+ }
+ fd_install(fd, file);
+ return fd;
+}
+
/*
* Don't allow locked mount flags to be cleared.
*
@@ -2426,72 +2549,117 @@ static inline int tree_contains_unbindable(struct mount *mnt)
return 0;
}
-static int do_move_mount(struct path *path, const char *old_name)
+/*
+ * Check that there aren't references to earlier/same mount namespaces in the
+ * specified subtree. Such references can act as pins for mount namespaces
+ * that aren't checked by the mount-cycle checking code, thereby allowing
+ * cycles to be made.
+ */
+static bool check_for_nsfs_mounts(struct mount *subtree)
{
- struct path old_path, parent_path;
+ struct mount *p;
+ bool ret = false;
+
+ lock_mount_hash();
+ for (p = subtree; p; p = next_mnt(p, subtree))
+ if (mnt_ns_loop(p->mnt.mnt_root))
+ goto out;
+
+ ret = true;
+out:
+ unlock_mount_hash();
+ return ret;
+}
+
+static int do_move_mount(struct path *old_path, struct path *new_path)
+{
+ struct path parent_path = {.mnt = NULL, .dentry = NULL};
+ struct mnt_namespace *ns;
struct mount *p;
struct mount *old;
struct mountpoint *mp;
int err;
- if (!old_name || !*old_name)
- return -EINVAL;
- err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
- if (err)
- return err;
+ bool attached;
- mp = lock_mount(path);
- err = PTR_ERR(mp);
+ mp = lock_mount(new_path);
if (IS_ERR(mp))
- goto out;
+ return PTR_ERR(mp);
- old = real_mount(old_path.mnt);
- p = real_mount(path->mnt);
+ old = real_mount(old_path->mnt);
+ p = real_mount(new_path->mnt);
+ attached = mnt_has_parent(old);
+ ns = old->mnt_ns;
err = -EINVAL;
- if (!check_mnt(p) || !check_mnt(old))
- goto out1;
+ /* The mountpoint must be in our namespace. */
+ if (!check_mnt(p))
+ goto out;
- if (old->mnt.mnt_flags & MNT_LOCKED)
- goto out1;
+ /* The thing moved should be either ours or completely unattached. */
+ if (attached && !check_mnt(old))
+ goto out;
- err = -EINVAL;
- if (old_path.dentry != old_path.mnt->mnt_root)
- goto out1;
+ if (!attached && !is_anon_ns(ns))
+ goto out;
- if (!mnt_has_parent(old))
- goto out1;
+ if (old->mnt.mnt_flags & MNT_LOCKED)
+ goto out;
- if (d_is_dir(path->dentry) !=
- d_is_dir(old_path.dentry))
- goto out1;
+ if (old_path->dentry != old_path->mnt->mnt_root)
+ goto out;
+
+ if (d_is_dir(new_path->dentry) !=
+ d_is_dir(old_path->dentry))
+ goto out;
/*
* Don't move a mount residing in a shared parent.
*/
- if (IS_MNT_SHARED(old->mnt_parent))
- goto out1;
+ if (attached && IS_MNT_SHARED(old->mnt_parent))
+ goto out;
/*
* Don't move a mount tree containing unbindable mounts to a destination
* mount which is shared.
*/
if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
- goto out1;
+ goto out;
err = -ELOOP;
+ if (!check_for_nsfs_mounts(old))
+ goto out;
for (; mnt_has_parent(p); p = p->mnt_parent)
if (p == old)
- goto out1;
+ goto out;
- err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path);
+ err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp,
+ attached ? &parent_path : NULL);
if (err)
- goto out1;
+ goto out;
/* if the mount is moved, it should no longer be expire
* automatically */
list_del_init(&old->mnt_expire);
-out1:
- unlock_mount(mp);
out:
- if (!err)
+ unlock_mount(mp);
+ if (!err) {
path_put(&parent_path);
+ if (!attached)
+ free_mnt_ns(ns);
+ }
+ return err;
+}
+
+static int do_move_mount_old(struct path *path, const char *old_name)
+{
+ struct path old_path;
+ int err;
+
+ if (!old_name || !*old_name)
+ return -EINVAL;
+
+ err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
+ if (err)
+ return err;
+
+ err = do_move_mount(&old_path, path);
path_put(&old_path);
return err;
}
@@ -2937,7 +3105,7 @@ long do_mount(const char *dev_name, const char __user *dir_name,
else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
retval = do_change_type(&path, flags);
else if (flags & MS_MOVE)
- retval = do_move_mount(&path, dev_name);
+ retval = do_move_mount_old(&path, dev_name);
else
retval = do_new_mount(&path, type_page, sb_flags, mnt_flags,
dev_name, data_page);
@@ -3166,6 +3334,203 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
}
/*
+ * Create a kernel mount representation for a new, prepared superblock
+ * (specified by fs_fd) and attach to an open_tree-like file descriptor.
+ */
+SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
+ unsigned int, attr_flags)
+{
+ struct mnt_namespace *ns;
+ struct fs_context *fc;
+ struct file *file;
+ struct path newmount;
+ struct mount *mnt;
+ struct fd f;
+ unsigned int mnt_flags = 0;
+ long ret;
+
+ if (!may_mount())
+ return -EPERM;
+
+ if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
+ return -EINVAL;
+
+ if (attr_flags & ~(MOUNT_ATTR_RDONLY |
+ MOUNT_ATTR_NOSUID |
+ MOUNT_ATTR_NODEV |
+ MOUNT_ATTR_NOEXEC |
+ MOUNT_ATTR__ATIME |
+ MOUNT_ATTR_NODIRATIME))
+ return -EINVAL;
+
+ if (attr_flags & MOUNT_ATTR_RDONLY)
+ mnt_flags |= MNT_READONLY;
+ if (attr_flags & MOUNT_ATTR_NOSUID)
+ mnt_flags |= MNT_NOSUID;
+ if (attr_flags & MOUNT_ATTR_NODEV)
+ mnt_flags |= MNT_NODEV;
+ if (attr_flags & MOUNT_ATTR_NOEXEC)
+ mnt_flags |= MNT_NOEXEC;
+ if (attr_flags & MOUNT_ATTR_NODIRATIME)
+ mnt_flags |= MNT_NODIRATIME;
+
+ switch (attr_flags & MOUNT_ATTR__ATIME) {
+ case MOUNT_ATTR_STRICTATIME:
+ break;
+ case MOUNT_ATTR_NOATIME:
+ mnt_flags |= MNT_NOATIME;
+ break;
+ case MOUNT_ATTR_RELATIME:
+ mnt_flags |= MNT_RELATIME;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ f = fdget(fs_fd);
+ if (!f.file)
+ return -EBADF;
+
+ ret = -EINVAL;
+ if (f.file->f_op != &fscontext_fops)
+ goto err_fsfd;
+
+ fc = f.file->private_data;
+
+ ret = mutex_lock_interruptible(&fc->uapi_mutex);
+ if (ret < 0)
+ goto err_fsfd;
+
+ /* There must be a valid superblock or we can't mount it */
+ ret = -EINVAL;
+ if (!fc->root)
+ goto err_unlock;
+
+ ret = -EPERM;
+ if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) {
+ pr_warn("VFS: Mount too revealing\n");
+ goto err_unlock;
+ }
+
+ ret = -EBUSY;
+ if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
+ goto err_unlock;
+
+ ret = -EPERM;
+ if ((fc->sb_flags & SB_MANDLOCK) && !may_mandlock())
+ goto err_unlock;
+
+ newmount.mnt = vfs_create_mount(fc);
+ if (IS_ERR(newmount.mnt)) {
+ ret = PTR_ERR(newmount.mnt);
+ goto err_unlock;
+ }
+ newmount.dentry = dget(fc->root);
+ newmount.mnt->mnt_flags = mnt_flags;
+
+ /* We've done the mount bit - now move the file context into more or
+ * less the same state as if we'd done an fspick(). We don't want to
+ * do any memory allocation or anything like that at this point as we
+ * don't want to have to handle any errors incurred.
+ */
+ vfs_clean_context(fc);
+
+ ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
+ if (IS_ERR(ns)) {
+ ret = PTR_ERR(ns);
+ goto err_path;
+ }
+ mnt = real_mount(newmount.mnt);
+ mnt->mnt_ns = ns;
+ ns->root = mnt;
+ ns->mounts = 1;
+ list_add(&mnt->mnt_list, &ns->list);
+
+ /* Attach to an apparent O_PATH fd with a note that we need to unmount
+ * it, not just simply put it.
+ */
+ file = dentry_open(&newmount, O_PATH, fc->cred);
+ if (IS_ERR(file)) {
+ dissolve_on_fput(newmount.mnt);
+ ret = PTR_ERR(file);
+ goto err_path;
+ }
+ file->f_mode |= FMODE_NEED_UNMOUNT;
+
+ ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0);
+ if (ret >= 0)
+ fd_install(ret, file);
+ else
+ fput(file);
+
+err_path:
+ path_put(&newmount);
+err_unlock:
+ mutex_unlock(&fc->uapi_mutex);
+err_fsfd:
+ fdput(f);
+ return ret;
+}
+
+/*
+ * Move a mount from one place to another. In combination with
+ * fsopen()/fsmount() this is used to install a new mount and in combination
+ * with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy
+ * a mount subtree.
+ *
+ * Note the flags value is a combination of MOVE_MOUNT_* flags.
+ */
+SYSCALL_DEFINE5(move_mount,
+ int, from_dfd, const char *, from_pathname,
+ int, to_dfd, const char *, to_pathname,
+ unsigned int, flags)
+{
+ struct path from_path, to_path;
+ unsigned int lflags;
+ int ret = 0;
+
+ if (!may_mount())
+ return -EPERM;
+
+ if (flags & ~MOVE_MOUNT__MASK)
+ return -EINVAL;
+
+ /* If someone gives a pathname, they aren't permitted to move
+ * from an fd that requires unmount as we can't get at the flag
+ * to clear it afterwards.
+ */
+ lflags = 0;
+ if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW;
+ if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT;
+ if (flags & MOVE_MOUNT_F_EMPTY_PATH) lflags |= LOOKUP_EMPTY;
+
+ ret = user_path_at(from_dfd, from_pathname, lflags, &from_path);
+ if (ret < 0)
+ return ret;
+
+ lflags = 0;
+ if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW;
+ if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT;
+ if (flags & MOVE_MOUNT_T_EMPTY_PATH) lflags |= LOOKUP_EMPTY;
+
+ ret = user_path_at(to_dfd, to_pathname, lflags, &to_path);
+ if (ret < 0)
+ goto out_from;
+
+ ret = security_move_mount(&from_path, &to_path);
+ if (ret < 0)
+ goto out_to;
+
+ ret = do_move_mount(&from_path, &to_path);
+
+out_to:
+ path_put(&to_path);
+out_from:
+ path_put(&from_path);
+ return ret;
+}
+
+/*
* Return true if path is reachable from root
*
* namespace_sem or mount_lock is held
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5174405e40d5..ec07f4c5630d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -165,10 +165,13 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
#define FMODE_NONOTIFY ((__force fmode_t)0x4000000)
/* File is capable of returning -EAGAIN if I/O will block */
-#define FMODE_NOWAIT ((__force fmode_t)0x8000000)
+#define FMODE_NOWAIT ((__force fmode_t)0x8000000)
+
+/* File represents mount that needs unmounting */
+#define FMODE_NEED_UNMOUNT ((__force fmode_t)0x10000000)
/* File does not contribute to nr_files count */
-#define FMODE_NOACCOUNT ((__force fmode_t)0x20000000)
+#define FMODE_NOACCOUNT ((__force fmode_t)0x20000000)
/*
* Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index eaca452088fa..1f966670c8dc 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -13,8 +13,10 @@
#define _LINUX_FS_CONTEXT_H
#include <linux/kernel.h>
+#include <linux/refcount.h>
#include <linux/errno.h>
#include <linux/security.h>
+#include <linux/mutex.h>
struct cred;
struct dentry;
@@ -35,6 +37,19 @@ enum fs_context_purpose {
};
/*
+ * Userspace usage phase for fsopen/fspick.
+ */
+enum fs_context_phase {
+ FS_CONTEXT_CREATE_PARAMS, /* Loading params for sb creation */
+ FS_CONTEXT_CREATING, /* A superblock is being created */
+ FS_CONTEXT_AWAITING_MOUNT, /* Superblock created, awaiting fsmount() */
+ FS_CONTEXT_AWAITING_RECONF, /* Awaiting initialisation for reconfiguration */
+ FS_CONTEXT_RECONF_PARAMS, /* Loading params for reconfiguration */
+ FS_CONTEXT_RECONFIGURING, /* Reconfiguring the superblock */
+ FS_CONTEXT_FAILED, /* Failed to correctly transition a context */
+};
+
+/*
* Type of parameter value.
*/
enum fs_value_type {
@@ -74,12 +89,14 @@ struct fs_parameter {
*/
struct fs_context {
const struct fs_context_operations *ops;
+ struct mutex uapi_mutex; /* Userspace access mutex */
struct file_system_type *fs_type;
void *fs_private; /* The filesystem's context */
struct dentry *root; /* The root and superblock */
struct user_namespace *user_ns; /* The user namespace for this mount */
struct net *net_ns; /* The network namespace for this mount */
const struct cred *cred; /* The mounter's credentials */
+ struct fc_log *log; /* Logging buffer */
const char *source; /* The source name (eg. dev path) */
const char *subtype; /* The subtype to set on the superblock */
void *security; /* Linux S&M options */
@@ -88,6 +105,7 @@ struct fs_context {
unsigned int sb_flags_mask; /* Superblock flags that were changed */
unsigned int lsm_flags; /* Information flags from the fs to the LSM */
enum fs_context_purpose purpose:8;
+ enum fs_context_phase phase:8; /* The phase the context is in */
bool need_free:1; /* Need to call ops->free() */
bool global:1; /* Goes into &init_user_ns */
};
@@ -135,15 +153,21 @@ extern int vfs_get_super(struct fs_context *fc,
extern const struct file_operations fscontext_fops;
-#ifdef CONFIG_PRINTK
+/*
+ * Mount error, warning and informational message logging. This structure is
+ * shareable between a mount and a subordinate mount.
+ */
+struct fc_log {
+ refcount_t usage;
+ u8 head; /* Insertion index in buffer[] */
+ u8 tail; /* Removal index in buffer[] */
+ u8 need_free; /* Mask of kfree'able items in buffer[] */
+ struct module *owner; /* Owner module for strings that don't then need freeing */
+ char *buffer[8];
+};
+
extern __attribute__((format(printf, 2, 3)))
void logfc(struct fs_context *fc, const char *fmt, ...);
-#else
-static inline __attribute__((format(printf, 2, 3)))
-void logfc(struct fs_context *fc, const char *fmt, ...)
-{
-}
-#endif
/**
* infof - Store supplementary informational message
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index f7e55d0d2672..47f58cfb6a19 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -159,6 +159,10 @@
* Parse a string of security data filling in the opts structure
* @options string containing all mount options known by the LSM
* @opts binary data structure usable by the LSM
+ * @move_mount:
+ * Check permission before a mount is moved.
+ * @from_path indicates the mount that is going to be moved.
+ * @to_path indicates the mountpoint that will be mounted upon.
* @dentry_init_security:
* Compute a context for a dentry as the inode is not yet available
* since NFSv4 has no label backed by an EA anyway.
@@ -1502,6 +1506,7 @@ union security_list_options {
unsigned long *set_kern_flags);
int (*sb_add_mnt_opt)(const char *option, const char *val, int len,
void **mnt_opts);
+ int (*move_mount)(const struct path *from_path, const struct path *to_path);
int (*dentry_init_security)(struct dentry *dentry, int mode,
const struct qstr *name, void **ctx,
u32 *ctxlen);
@@ -1839,6 +1844,7 @@ struct security_hook_heads {
struct hlist_head sb_set_mnt_opts;
struct hlist_head sb_clone_mnt_opts;
struct hlist_head sb_add_mnt_opt;
+ struct hlist_head move_mount;
struct hlist_head dentry_init_security;
struct hlist_head dentry_create_files_as;
#ifdef CONFIG_SECURITY_PATH
diff --git a/include/linux/module.h b/include/linux/module.h
index 5bf5dcd91009..7dc4dc79b634 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -709,6 +709,12 @@ static inline bool is_module_text_address(unsigned long addr)
return false;
}
+static inline bool within_module_core(unsigned long addr,
+ const struct module *mod)
+{
+ return false;
+}
+
/* Get/put a kernel symbol (calls should be symmetric) */
#define symbol_get(x) ({ extern typeof(x) x __attribute__((weak)); &(x); })
#define symbol_put(x) do { } while (0)
diff --git a/include/linux/security.h b/include/linux/security.h
index d543293216b9..659071c2e57c 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -251,6 +251,7 @@ int security_sb_clone_mnt_opts(const struct super_block *oldsb,
unsigned long *set_kern_flags);
int security_add_mnt_opt(const char *option, const char *val,
int len, void **mnt_opts);
+int security_move_mount(const struct path *from_path, const struct path *to_path);
int security_dentry_init_security(struct dentry *dentry, int mode,
const struct qstr *name, void **ctx,
u32 *ctxlen);
@@ -614,6 +615,12 @@ static inline int security_add_mnt_opt(const char *option, const char *val,
return 0;
}
+static inline int security_move_mount(const struct path *from_path,
+ const struct path *to_path)
+{
+ return 0;
+}
+
static inline int security_inode_alloc(struct inode *inode)
{
return 0;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e446806a561f..e2870fe1be5b 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -985,6 +985,15 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
unsigned mask, struct statx __user *buffer);
asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
int flags, uint32_t sig);
+asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags);
+asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path,
+ int to_dfd, const char __user *to_path,
+ unsigned int ms_flags);
+asmlinkage long sys_fsopen(const char __user *fs_name, unsigned int flags);
+asmlinkage long sys_fsconfig(int fs_fd, unsigned int cmd, const char __user *key,
+ const void __user *value, int aux);
+asmlinkage long sys_fsmount(int fs_fd, unsigned int flags, unsigned int ms_flags);
+asmlinkage long sys_fspick(int dfd, const char __user *path, unsigned int flags);
asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
siginfo_t __user *info,
unsigned int flags);
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index a2f8658f1c55..1d338357df8a 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -91,5 +91,7 @@
#define AT_STATX_FORCE_SYNC 0x2000 /* - Force the attributes to be sync'd with the server */
#define AT_STATX_DONT_SYNC 0x4000 /* - Don't sync attributes with the server */
+#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */
+
#endif /* _UAPI_LINUX_FCNTL_H */
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index 3f9ec42510b0..96a0240f23fe 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -55,4 +55,66 @@
#define MS_MGC_VAL 0xC0ED0000
#define MS_MGC_MSK 0xffff0000
+/*
+ * open_tree() flags.
+ */
+#define OPEN_TREE_CLONE 1 /* Clone the target tree and attach the clone */
+#define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */
+
+/*
+ * move_mount() flags.
+ */
+#define MOVE_MOUNT_F_SYMLINKS 0x00000001 /* Follow symlinks on from path */
+#define MOVE_MOUNT_F_AUTOMOUNTS 0x00000002 /* Follow automounts on from path */
+#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */
+#define MOVE_MOUNT_T_SYMLINKS 0x00000010 /* Follow symlinks on to path */
+#define MOVE_MOUNT_T_AUTOMOUNTS 0x00000020 /* Follow automounts on to path */
+#define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */
+#define MOVE_MOUNT__MASK 0x00000077
+
+/*
+ * fsopen() flags.
+ */
+#define FSOPEN_CLOEXEC 0x00000001
+
+/*
+ * fspick() flags.
+ */
+#define FSPICK_CLOEXEC 0x00000001
+#define FSPICK_SYMLINK_NOFOLLOW 0x00000002
+#define FSPICK_NO_AUTOMOUNT 0x00000004
+#define FSPICK_EMPTY_PATH 0x00000008
+
+/*
+ * The type of fsconfig() call made.
+ */
+enum fsconfig_command {
+ FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */
+ FSCONFIG_SET_STRING = 1, /* Set parameter, supplying a string value */
+ FSCONFIG_SET_BINARY = 2, /* Set parameter, supplying a binary blob value */
+ FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */
+ FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */
+ FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */
+ FSCONFIG_CMD_CREATE = 6, /* Invoke superblock creation */
+ FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke superblock reconfiguration */
+};
+
+/*
+ * fsmount() flags.
+ */
+#define FSMOUNT_CLOEXEC 0x00000001
+
+/*
+ * Mount attributes.
+ */
+#define MOUNT_ATTR_RDONLY 0x00000001 /* Mount read-only */
+#define MOUNT_ATTR_NOSUID 0x00000002 /* Ignore suid and sgid bits */
+#define MOUNT_ATTR_NODEV 0x00000004 /* Disallow access to device special files */
+#define MOUNT_ATTR_NOEXEC 0x00000008 /* Disallow program execution */
+#define MOUNT_ATTR__ATIME 0x00000070 /* Setting on how atime should be updated */
+#define MOUNT_ATTR_RELATIME 0x00000000 /* - Update atime relative to mtime/ctime. */
+#define MOUNT_ATTR_NOATIME 0x00000010 /* - Do not update access times. */
+#define MOUNT_ATTR_STRICTATIME 0x00000020 /* - Always perform atime updates */
+#define MOUNT_ATTR_NODIRATIME 0x00000080 /* Do not update directory access times */
+
#endif /* _UAPI_LINUX_MOUNT_H */
diff --git a/samples/Kconfig b/samples/Kconfig
index d19754ccad08..30a89425009c 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -154,10 +154,11 @@ config SAMPLE_ANDROID_BINDERFS
Builds a sample program to illustrate the use of the Android binderfs
filesystem.
-config SAMPLE_STATX
- bool "Build example extended-stat using code"
- depends on BROKEN
+config SAMPLE_VFS
+ bool "Build example programs that use new VFS system calls"
help
- Build example userspace program to use the new extended-stat syscall.
+ Build example userspace programs that use new VFS system calls such
+ as mount API and statx(). Note that this is restricted to the x86
+ arch whilst it accesses system calls that aren't yet in all arches.
endif # SAMPLES
diff --git a/samples/Makefile b/samples/Makefile
index fadadb1c3b05..2484cc262d3e 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -3,4 +3,4 @@
obj-$(CONFIG_SAMPLES) += kobject/ kprobes/ trace_events/ livepatch/ \
hw_breakpoint/ kfifo/ kdb/ hidraw/ rpmsg/ seccomp/ \
configfs/ connector/ v4l/ trace_printk/ \
- vfio-mdev/ statx/ qmi/ binderfs/ pidfd/
+ vfio-mdev/ vfs/ qmi/ binderfs/ pidfd/
diff --git a/samples/statx/Makefile b/samples/vfs/Makefile
index 59df7c25a9d1..4ac9690fb3c4 100644
--- a/samples/statx/Makefile
+++ b/samples/vfs/Makefile
@@ -1,7 +1,10 @@
# List of programs to build
-hostprogs-$(CONFIG_SAMPLE_STATX) := test-statx
+hostprogs-$(CONFIG_SAMPLE_VFS) := \
+ test-fsmount \
+ test-statx
# Tell kbuild to always build the programs
always := $(hostprogs-y)
+HOSTCFLAGS_test-fsmount.o += -I$(objtree)/usr/include
HOSTCFLAGS_test-statx.o += -I$(objtree)/usr/include
diff --git a/samples/vfs/test-fsmount.c b/samples/vfs/test-fsmount.c
new file mode 100644
index 000000000000..266d72b3dce4
--- /dev/null
+++ b/samples/vfs/test-fsmount.c
@@ -0,0 +1,133 @@
+/* fd-based mount test.
+ *
+ * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <linux/mount.h>
+#include <linux/unistd.h>
+
+#define E(x) do { if ((x) == -1) { perror(#x); exit(1); } } while(0)
+
+static void check_messages(int fd)
+{
+ char buf[4096];
+ int err, n;
+
+ err = errno;
+
+ for (;;) {
+ n = read(fd, buf, sizeof(buf));
+ if (n < 0)
+ break;
+ n -= 2;
+
+ switch (buf[0]) {
+ case 'e':
+ fprintf(stderr, "Error: %*.*s\n", n, n, buf + 2);
+ break;
+ case 'w':
+ fprintf(stderr, "Warning: %*.*s\n", n, n, buf + 2);
+ break;
+ case 'i':
+ fprintf(stderr, "Info: %*.*s\n", n, n, buf + 2);
+ break;
+ }
+ }
+
+ errno = err;
+}
+
+static __attribute__((noreturn))
+void mount_error(int fd, const char *s)
+{
+ check_messages(fd);
+ fprintf(stderr, "%s: %m\n", s);
+ exit(1);
+}
+
+/* Hope -1 isn't a syscall */
+#ifndef __NR_fsopen
+#define __NR_fsopen -1
+#endif
+#ifndef __NR_fsmount
+#define __NR_fsmount -1
+#endif
+#ifndef __NR_fsconfig
+#define __NR_fsconfig -1
+#endif
+#ifndef __NR_move_mount
+#define __NR_move_mount -1
+#endif
+
+
+static inline int fsopen(const char *fs_name, unsigned int flags)
+{
+ return syscall(__NR_fsopen, fs_name, flags);
+}
+
+static inline int fsmount(int fsfd, unsigned int flags, unsigned int ms_flags)
+{
+ return syscall(__NR_fsmount, fsfd, flags, ms_flags);
+}
+
+static inline int fsconfig(int fsfd, unsigned int cmd,
+ const char *key, const void *val, int aux)
+{
+ return syscall(__NR_fsconfig, fsfd, cmd, key, val, aux);
+}
+
+static inline int move_mount(int from_dfd, const char *from_pathname,
+ int to_dfd, const char *to_pathname,
+ unsigned int flags)
+{
+ return syscall(__NR_move_mount,
+ from_dfd, from_pathname,
+ to_dfd, to_pathname, flags);
+}
+
+#define E_fsconfig(fd, cmd, key, val, aux) \
+ do { \
+ if (fsconfig(fd, cmd, key, val, aux) == -1) \
+ mount_error(fd, key ?: "create"); \
+ } while (0)
+
+int main(int argc, char *argv[])
+{
+ int fsfd, mfd;
+
+ /* Mount a publically available AFS filesystem */
+ fsfd = fsopen("afs", 0);
+ if (fsfd == -1) {
+ perror("fsopen");
+ exit(1);
+ }
+
+ E_fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "#grand.central.org:root.cell.", 0);
+ E_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
+
+ mfd = fsmount(fsfd, 0, MOUNT_ATTR_RDONLY);
+ if (mfd < 0)
+ mount_error(fsfd, "fsmount");
+ E(close(fsfd));
+
+ if (move_mount(mfd, "", AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH) < 0) {
+ perror("move_mount");
+ exit(1);
+ }
+
+ E(close(mfd));
+ exit(0);
+}
diff --git a/samples/statx/test-statx.c b/samples/vfs/test-statx.c
index d4d77b09412c..e91f918e84c4 100644
--- a/samples/statx/test-statx.c
+++ b/samples/vfs/test-statx.c
@@ -25,13 +25,21 @@
#include <sys/types.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
+#define statx foo
+#define statx_timestamp foo_timestamp
#include <sys/stat.h>
+#undef statx
+#undef statx_timestamp
#define AT_STATX_SYNC_TYPE 0x6000
#define AT_STATX_SYNC_AS_STAT 0x0000
#define AT_STATX_FORCE_SYNC 0x2000
#define AT_STATX_DONT_SYNC 0x4000
+#ifndef __NR_statx
+#define __NR_statx -1
+#endif
+
static __attribute__((unused))
ssize_t statx(int dfd, const char *filename, unsigned flags,
unsigned int mask, struct statx *buffer)
@@ -157,7 +165,8 @@ static void dump_statx(struct statx *stx)
"?dai?c??" /* 7- 0 0x00000000-000000ff */
;
- printf("Attributes: %016llx (", stx->stx_attributes);
+ printf("Attributes: %016llx (",
+ (unsigned long long)stx->stx_attributes);
for (byte = 64 - 8; byte >= 0; byte -= 8) {
bits = stx->stx_attributes >> byte;
mbits = stx->stx_attributes_mask >> byte;
diff --git a/security/security.c b/security/security.c
index 8d6ef9da94eb..613a5c00e602 100644
--- a/security/security.c
+++ b/security/security.c
@@ -866,6 +866,11 @@ int security_add_mnt_opt(const char *option, const char *val, int len,
}
EXPORT_SYMBOL(security_add_mnt_opt);
+int security_move_mount(const struct path *from_path, const struct path *to_path)
+{
+ return call_int_hook(move_mount, 0, from_path, to_path);
+}
+
int security_inode_alloc(struct inode *inode)
{
int rc = lsm_inode_alloc(inode);