diff options
Diffstat (limited to 'fs/namespace.c')
-rw-r--r-- | fs/namespace.c | 1276 |
1 files changed, 1010 insertions, 266 deletions
diff --git a/fs/namespace.c b/fs/namespace.c index 85b5f7bea82e..df137ba19d37 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -25,17 +25,19 @@ #include <linux/proc_ns.h> #include <linux/magic.h> #include <linux/memblock.h> +#include <linux/proc_fs.h> #include <linux/task_work.h> #include <linux/sched/task.h> #include <uapi/linux/mount.h> #include <linux/fs_context.h> #include <linux/shmem_fs.h> +#include <linux/mnt_idmapping.h> #include "pnode.h" #include "internal.h" /* Maximum number of mounts in a mount namespace */ -unsigned int sysctl_mount_max __read_mostly = 100000; +static unsigned int sysctl_mount_max __read_mostly = 100000; static unsigned int m_hash_mask __read_mostly; static unsigned int m_hash_shift __read_mostly; @@ -73,6 +75,15 @@ static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ +struct mount_kattr { + unsigned int attr_set; + unsigned int attr_clr; + unsigned int propagation; + unsigned int lookup_flags; + bool recurse; + struct user_namespace *mnt_userns; +}; + /* /sys/fs */ struct kobject *fs_kobj; EXPORT_SYMBOL_GPL(fs_kobj); @@ -87,6 +98,16 @@ EXPORT_SYMBOL_GPL(fs_kobj); */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); +static inline void lock_mount_hash(void) +{ + write_seqlock(&mount_lock); +} + +static inline void unlock_mount_hash(void) +{ + write_sequnlock(&mount_lock); +} + static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry) { unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); @@ -156,10 +177,10 @@ static inline void mnt_add_count(struct mount *mnt, int n) /* * vfsmount lock must be held for write */ -unsigned int mnt_get_count(struct mount *mnt) +int mnt_get_count(struct mount *mnt) { #ifdef CONFIG_SMP - unsigned int count = 0; + int count = 0; int cpu; for_each_possible_cpu(cpu) { @@ -183,7 +204,8 @@ static struct mount *alloc_vfsmnt(const char *name) goto 
out_free_cache; if (name) { - mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL); + mnt->mnt_devname = kstrdup_const(name, + GFP_KERNEL_ACCOUNT); if (!mnt->mnt_devname) goto out_free_id; } @@ -210,6 +232,7 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_HLIST_NODE(&mnt->mnt_mp_list); INIT_LIST_HEAD(&mnt->mnt_umounting); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); + mnt->mnt.mnt_userns = &init_user_ns; } return mnt; @@ -321,8 +344,24 @@ int __mnt_want_write(struct vfsmount *m) * incremented count after it has set MNT_WRITE_HOLD. */ smp_mb(); - while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) - cpu_relax(); + might_lock(&mount_lock.lock); + while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) { + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + cpu_relax(); + } else { + /* + * This prevents priority inversion, if the task + * setting MNT_WRITE_HOLD got preempted on a remote + * CPU, and it prevents life lock if the task setting + * MNT_WRITE_HOLD has a lower priority and is bound to + * the same CPU as the task that is spinning here. + */ + preempt_enable(); + lock_mount_hash(); + unlock_mount_hash(); + preempt_disable(); + } + } /* * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will * be set to match its requirements. So we must not load that until @@ -360,50 +399,36 @@ int mnt_want_write(struct vfsmount *m) EXPORT_SYMBOL_GPL(mnt_want_write); /** - * mnt_clone_write - get write access to a mount - * @mnt: the mount on which to take a write - * - * This is effectively like mnt_want_write, except - * it must only be used to take an extra write reference - * on a mountpoint that we already know has a write reference - * on it. This allows some optimisation. - * - * After finished, mnt_drop_write must be called as usual to - * drop the reference. 
- */ -int mnt_clone_write(struct vfsmount *mnt) -{ - /* superblock may be r/o */ - if (__mnt_is_readonly(mnt)) - return -EROFS; - preempt_disable(); - mnt_inc_writers(real_mount(mnt)); - preempt_enable(); - return 0; -} -EXPORT_SYMBOL_GPL(mnt_clone_write); - -/** * __mnt_want_write_file - get write access to a file's mount * @file: the file who's mount on which to take a write * - * This is like __mnt_want_write, but it takes a file and can - * do some optimisations if the file is open for write already + * This is like __mnt_want_write, but if the file is already open for writing it + * skips incrementing mnt_writers (since the open file already has a reference) + * and instead only does the check for emergency r/o remounts. This must be + * paired with __mnt_drop_write_file. */ int __mnt_want_write_file(struct file *file) { - if (!(file->f_mode & FMODE_WRITER)) - return __mnt_want_write(file->f_path.mnt); - else - return mnt_clone_write(file->f_path.mnt); + if (file->f_mode & FMODE_WRITER) { + /* + * Superblock may have become readonly while there are still + * writable fd's, e.g. due to a fs error with errors=remount-ro + */ + if (__mnt_is_readonly(file->f_path.mnt)) + return -EROFS; + return 0; + } + return __mnt_want_write(file->f_path.mnt); } /** * mnt_want_write_file - get write access to a file's mount * @file: the file who's mount on which to take a write * - * This is like mnt_want_write, but it takes a file and can - * do some optimisations if the file is open for write already + * This is like mnt_want_write, but if the file is already open for writing it + * skips incrementing mnt_writers (since the open file already has a reference) + * and instead only does the freeze protection and the check for emergency r/o + * remounts. This must be paired with mnt_drop_write_file. 
*/ int mnt_want_write_file(struct file *file) { @@ -449,7 +474,8 @@ EXPORT_SYMBOL_GPL(mnt_drop_write); void __mnt_drop_write_file(struct file *file) { - __mnt_drop_write(file->f_path.mnt); + if (!(file->f_mode & FMODE_WRITER)) + __mnt_drop_write(file->f_path.mnt); } void mnt_drop_write_file(struct file *file) @@ -459,11 +485,26 @@ void mnt_drop_write_file(struct file *file) } EXPORT_SYMBOL(mnt_drop_write_file); -static int mnt_make_readonly(struct mount *mnt) +/** + * mnt_hold_writers - prevent write access to the given mount + * @mnt: mnt to prevent write access to + * + * Prevents write access to @mnt if there are no active writers for @mnt. + * This function needs to be called and return successfully before changing + * properties of @mnt that need to remain stable for callers with write access + * to @mnt. + * + * After this functions has been called successfully callers must pair it with + * a call to mnt_unhold_writers() in order to stop preventing write access to + * @mnt. + * + * Context: This function expects lock_mount_hash() to be held serializing + * setting MNT_WRITE_HOLD. + * Return: On success 0 is returned. + * On error, -EBUSY is returned. + */ +static inline int mnt_hold_writers(struct mount *mnt) { - int ret = 0; - - lock_mount_hash(); mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; /* * After storing MNT_WRITE_HOLD, we'll read the counters. This store @@ -488,25 +529,42 @@ static int mnt_make_readonly(struct mount *mnt) * we're counting up here. */ if (mnt_get_writers(mnt) > 0) - ret = -EBUSY; - else - mnt->mnt.mnt_flags |= MNT_READONLY; + return -EBUSY; + + return 0; +} + +/** + * mnt_unhold_writers - stop preventing write access to the given mount + * @mnt: mnt to stop preventing write access to + * + * Stop preventing write access to @mnt allowing callers to gain write access + * to @mnt again. + * + * This function can only be called after a successful call to + * mnt_hold_writers(). + * + * Context: This function expects lock_mount_hash() to be held. 
+ */ +static inline void mnt_unhold_writers(struct mount *mnt) +{ /* * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers * that become unheld will see MNT_READONLY. */ smp_wmb(); mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; - unlock_mount_hash(); - return ret; } -static int __mnt_unmake_readonly(struct mount *mnt) +static int mnt_make_readonly(struct mount *mnt) { - lock_mount_hash(); - mnt->mnt.mnt_flags &= ~MNT_READONLY; - unlock_mount_hash(); - return 0; + int ret; + + ret = mnt_hold_writers(mnt); + if (!ret) + mnt->mnt.mnt_flags |= MNT_READONLY; + mnt_unhold_writers(mnt); + return ret; } int sb_prepare_remount_readonly(struct super_block *sb) @@ -521,12 +579,9 @@ int sb_prepare_remount_readonly(struct super_block *sb) lock_mount_hash(); list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { if (!(mnt->mnt.mnt_flags & MNT_READONLY)) { - mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; - smp_mb(); - if (mnt_get_writers(mnt) > 0) { - err = -EBUSY; + err = mnt_hold_writers(mnt); + if (err) break; - } } } if (!err && atomic_long_read(&sb->s_remove_count)) @@ -547,6 +602,11 @@ int sb_prepare_remount_readonly(struct super_block *sb) static void free_vfsmnt(struct mount *mnt) { + struct user_namespace *mnt_userns; + + mnt_userns = mnt_user_ns(&mnt->mnt); + if (!initial_idmapping(mnt_userns)) + put_user_ns(mnt_userns); kfree_const(mnt->mnt_devname); #ifdef CONFIG_SMP free_percpu(mnt->mnt_pcp); @@ -588,7 +648,7 @@ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq) } /* call under rcu_read_lock */ -bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) +static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) { int res = __legitimize_mnt(bastard, seq); if (likely(!res)) @@ -648,6 +708,21 @@ struct vfsmount *lookup_mnt(const struct path *path) return m; } +static inline void lock_ns_list(struct mnt_namespace *ns) +{ + spin_lock(&ns->ns_lock); +} + +static inline void unlock_ns_list(struct mnt_namespace *ns) +{ + spin_unlock(&ns->ns_lock); +} + 
+static inline bool mnt_is_cursor(struct mount *mnt) +{ + return mnt->mnt.mnt_flags & MNT_CURSOR; +} + /* * __is_local_mountpoint - Test to see if dentry is a mountpoint in the * current mount namespace. @@ -669,17 +744,18 @@ bool __is_local_mountpoint(struct dentry *dentry) struct mount *mnt; bool is_covered = false; - if (!d_mountpoint(dentry)) - goto out; - down_read(&namespace_sem); + lock_ns_list(ns); list_for_each_entry(mnt, &ns->list, mnt_list) { + if (mnt_is_cursor(mnt)) + continue; is_covered = (mnt->mnt_mountpoint == dentry); if (is_covered) break; } + unlock_ns_list(ns); up_read(&namespace_sem); -out: + return is_covered; } @@ -933,6 +1009,7 @@ static struct mount *skip_mnt_tree(struct mount *p) struct vfsmount *vfs_create_mount(struct fs_context *fc) { struct mount *mnt; + struct user_namespace *fs_userns; if (!fc->root) return ERR_PTR(-EINVAL); @@ -950,6 +1027,10 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc) mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; + fs_userns = mnt->mnt.mnt_sb->s_user_ns; + if (!initial_idmapping(fs_userns)) + mnt->mnt.mnt_userns = get_user_ns(fs_userns); + lock_mount_hash(); list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts); unlock_mount_hash(); @@ -1039,6 +1120,9 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); atomic_inc(&sb->s_active); + mnt->mnt.mnt_userns = mnt_user_ns(&old->mnt); + if (!initial_idmapping(mnt->mnt.mnt_userns)) + mnt->mnt.mnt_userns = get_user_ns(mnt->mnt.mnt_userns); mnt->mnt.mnt_sb = sb; mnt->mnt.mnt_root = dget(root); mnt->mnt_mountpoint = mnt->mnt.mnt_root; @@ -1123,6 +1207,7 @@ static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); static void mntput_no_expire(struct mount *mnt) { LIST_HEAD(list); + int count; rcu_read_lock(); if (likely(READ_ONCE(mnt->mnt_ns))) { @@ -1146,7 +1231,9 @@ static void mntput_no_expire(struct mount *mnt) */ smp_mb(); 
mnt_add_count(mnt, -1); - if (mnt_get_count(mnt)) { + count = mnt_get_count(mnt); + if (count != 0) { + WARN_ON(count < 0); rcu_read_unlock(); unlock_mount_hash(); return; @@ -1175,7 +1262,7 @@ static void mntput_no_expire(struct mount *mnt) struct task_struct *task = current; if (likely(!(task->flags & PF_KTHREAD))) { init_task_work(&mnt->mnt_rcu, __cleanup_mnt); - if (!task_work_add(task, &mnt->mnt_rcu, true)) + if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME)) return; } if (llist_add(&mnt->mnt_llist, &delayed_mntput_list)) @@ -1205,8 +1292,9 @@ struct vfsmount *mntget(struct vfsmount *mnt) } EXPORT_SYMBOL(mntget); -/* path_is_mountpoint() - Check if path is a mount in the current - * namespace. +/** + * path_is_mountpoint() - Check if path is a mount in the current namespace. + * @path: path to check * * d_mountpoint() can only be used reliably to establish if a dentry is * not mounted in any namespace and that common case is handled inline. @@ -1245,46 +1333,71 @@ struct vfsmount *mnt_clone_internal(const struct path *path) } #ifdef CONFIG_PROC_FS +static struct mount *mnt_list_next(struct mnt_namespace *ns, + struct list_head *p) +{ + struct mount *mnt, *ret = NULL; + + lock_ns_list(ns); + list_for_each_continue(p, &ns->list) { + mnt = list_entry(p, typeof(*mnt), mnt_list); + if (!mnt_is_cursor(mnt)) { + ret = mnt; + break; + } + } + unlock_ns_list(ns); + + return ret; +} + /* iterator; we want it to have access to namespace_sem, thus here... */ static void *m_start(struct seq_file *m, loff_t *pos) { struct proc_mounts *p = m->private; + struct list_head *prev; down_read(&namespace_sem); - if (p->cached_event == p->ns->event) { - void *v = p->cached_mount; - if (*pos == p->cached_index) - return v; - if (*pos == p->cached_index + 1) { - v = seq_list_next(v, &p->ns->list, &p->cached_index); - return p->cached_mount = v; - } + if (!*pos) { + prev = &p->ns->list; + } else { + prev = &p->cursor.mnt_list; + + /* Read after we'd reached the end? 
*/ + if (list_empty(prev)) + return NULL; } - p->cached_event = p->ns->event; - p->cached_mount = seq_list_start(&p->ns->list, *pos); - p->cached_index = *pos; - return p->cached_mount; + return mnt_list_next(p->ns, prev); } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { struct proc_mounts *p = m->private; + struct mount *mnt = v; - p->cached_mount = seq_list_next(v, &p->ns->list, pos); - p->cached_index = *pos; - return p->cached_mount; + ++*pos; + return mnt_list_next(p->ns, &mnt->mnt_list); } static void m_stop(struct seq_file *m, void *v) { + struct proc_mounts *p = m->private; + struct mount *mnt = v; + + lock_ns_list(p->ns); + if (mnt) + list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list); + else + list_del_init(&p->cursor.mnt_list); + unlock_ns_list(p->ns); up_read(&namespace_sem); } static int m_show(struct seq_file *m, void *v) { struct proc_mounts *p = m->private; - struct mount *r = list_entry(v, struct mount, mnt_list); + struct mount *r = v; return p->show(m, &r->mnt); } @@ -1294,11 +1407,20 @@ const struct seq_operations mounts_op = { .stop = m_stop, .show = m_show, }; + +void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor) +{ + down_read(&namespace_sem); + lock_ns_list(ns); + list_del(&cursor->mnt_list); + unlock_ns_list(ns); + up_read(&namespace_sem); +} #endif /* CONFIG_PROC_FS */ /** * may_umount_tree - check if a mount tree is busy - * @mnt: root of mount tree + * @m: root of mount tree * * This is called to check if a tree of mounts has any * open files, pwds, chroots or sub mounts that are @@ -1638,70 +1760,69 @@ out_unlock: /* * Is the caller allowed to modify his namespace? 
*/ -static inline bool may_mount(void) +bool may_mount(void) { return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); } -#ifdef CONFIG_MANDATORY_FILE_LOCKING -static inline bool may_mandlock(void) +static void warn_mandlock(void) { - return capable(CAP_SYS_ADMIN); + pr_warn_once("=======================================================\n" + "WARNING: The mand mount option has been deprecated and\n" + " and is ignored by this kernel. Remove the mand\n" + " option from the mount to silence this warning.\n" + "=======================================================\n"); } -#else -static inline bool may_mandlock(void) + +static int can_umount(const struct path *path, int flags) { - pr_warn("VFS: \"mand\" mount option not supported"); - return false; + struct mount *mnt = real_mount(path->mnt); + + if (!may_mount()) + return -EPERM; + if (path->dentry != path->mnt->mnt_root) + return -EINVAL; + if (!check_mnt(mnt)) + return -EINVAL; + if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */ + return -EINVAL; + if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN)) + return -EPERM; + return 0; } -#endif -/* - * Now umount can handle mount points as well as block devices. - * This is important for filesystems which use unnamed block devices. - * - * We now support a flag for forced unmount like the other 'big iron' - * unixes. 
Our API is identical to OSF/1 to avoid making a mess of AMD - */ +// caller is responsible for flags being sane +int path_umount(struct path *path, int flags) +{ + struct mount *mnt = real_mount(path->mnt); + int ret; + + ret = can_umount(path, flags); + if (!ret) + ret = do_umount(mnt, flags); -int ksys_umount(char __user *name, int flags) + /* we mustn't call path_put() as that would clear mnt_expiry_mark */ + dput(path->dentry); + mntput_no_expire(mnt); + return ret; +} + +static int ksys_umount(char __user *name, int flags) { + int lookup_flags = LOOKUP_MOUNTPOINT; struct path path; - struct mount *mnt; - int retval; - int lookup_flags = 0; + int ret; + // basic validity checks done first if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) return -EINVAL; - if (!may_mount()) - return -EPERM; - if (!(flags & UMOUNT_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; - - retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path); - if (retval) - goto out; - mnt = real_mount(path.mnt); - retval = -EINVAL; - if (path.dentry != path.mnt->mnt_root) - goto dput_and_out; - if (!check_mnt(mnt)) - goto dput_and_out; - if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */ - goto dput_and_out; - retval = -EPERM; - if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN)) - goto dput_and_out; - - retval = do_umount(mnt, flags); -dput_and_out: - /* we mustn't call path_put() as that would clear mnt_expiry_mark */ - dput(path.dentry); - mntput_no_expire(mnt); -out: - return retval; + ret = user_path_at(AT_FDCWD, name, lookup_flags, &path); + if (ret) + return ret; + return path_umount(&path, flags); } SYSCALL_DEFINE2(umount, char __user *, name, int, flags) @@ -1733,6 +1854,11 @@ static struct mnt_namespace *to_mnt_ns(struct ns_common *ns) return container_of(ns, struct mnt_namespace, ns); } +struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) +{ + return &mnt->ns; +} + static bool mnt_ns_loop(struct dentry *dentry) { /* Could bind mounting the mount 
namespace inode cause a @@ -1858,12 +1984,27 @@ void drop_collected_mounts(struct vfsmount *mnt) namespace_unlock(); } +static bool has_locked_children(struct mount *mnt, struct dentry *dentry) +{ + struct mount *child; + + list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { + if (!is_subdir(child->mnt_mountpoint, dentry)) + continue; + + if (child->mnt.mnt_flags & MNT_LOCKED) + return true; + } + return false; +} + /** * clone_private_mount - create a private clone of a path + * @path: path to clone * - * This creates a new vfsmount, which will be the clone of @path. The new will - * not be attached anywhere in the namespace and will be private (i.e. changes - * to the originating mount won't be propagated into this). + * This creates a new vfsmount, which will be the clone of @path. The new mount + * will not be attached anywhere in the namespace and will be private (i.e. + * changes to the originating mount won't be propagated into this). * * Release with mntput(). */ @@ -1872,14 +2013,30 @@ struct vfsmount *clone_private_mount(const struct path *path) struct mount *old_mnt = real_mount(path->mnt); struct mount *new_mnt; + down_read(&namespace_sem); if (IS_MNT_UNBINDABLE(old_mnt)) - return ERR_PTR(-EINVAL); + goto invalid; + + if (!check_mnt(old_mnt)) + goto invalid; + + if (has_locked_children(old_mnt, path->dentry)) + goto invalid; new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); + up_read(&namespace_sem); + if (IS_ERR(new_mnt)) return ERR_CAST(new_mnt); + /* Longterm mount to be removed by kern_unmount*() */ + new_mnt->mnt_ns = MNT_NS_INTERNAL; + return &new_mnt->mnt; + +invalid: + up_read(&namespace_sem); + return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(clone_private_mount); @@ -1955,22 +2112,23 @@ static int invent_group_ids(struct mount *mnt, bool recurse) int count_mounts(struct mnt_namespace *ns, struct mount *mnt) { unsigned int max = READ_ONCE(sysctl_mount_max); - unsigned int mounts = 0, old, pending, sum; + unsigned int mounts = 0; struct 
mount *p; + if (ns->mounts >= max) + return -ENOSPC; + max -= ns->mounts; + if (ns->pending_mounts >= max) + return -ENOSPC; + max -= ns->pending_mounts; + for (p = mnt; p; p = next_mnt(p, mnt)) mounts++; - old = ns->mounts; - pending = ns->pending_mounts; - sum = old + pending; - if ((old > sum) || - (pending > sum) || - (max < sum) || - (mounts > (max - sum))) + if (mounts > max) return -ENOSPC; - ns->pending_mounts = pending + mounts; + ns->pending_mounts += mounts; return 0; } @@ -2231,19 +2389,6 @@ static int do_change_type(struct path *path, int ms_flags) return err; } -static bool has_locked_children(struct mount *mnt, struct dentry *dentry) -{ - struct mount *child; - list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { - if (!is_subdir(child->mnt_mountpoint, dentry)) - continue; - - if (child->mnt.mnt_flags & MNT_LOCKED) - return true; - } - return false; -} - static struct mount *__do_loopback(struct path *old_path, int recurse) { struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt); @@ -2450,20 +2595,15 @@ static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags) if (readonly_request) return mnt_make_readonly(mnt); - return __mnt_unmake_readonly(mnt); + mnt->mnt.mnt_flags &= ~MNT_READONLY; + return 0; } -/* - * Update the user-settable attributes on a mount. The caller must hold - * sb->s_umount for writing. 
- */ static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags) { - lock_mount_hash(); mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; mnt->mnt.mnt_flags = mnt_flags; touch_mnt_namespace(mnt->mnt_ns); - unlock_mount_hash(); } static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt) @@ -2471,6 +2611,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * struct super_block *sb = mnt->mnt_sb; if (!__mnt_is_readonly(mnt) && + (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) && (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) { char *buf = (char *)__get_free_page(GFP_KERNEL); char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM); @@ -2485,6 +2626,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * tm.tm_year+1900, (unsigned long long)sb->s_time_max); free_page((unsigned long)buf); + sb->s_iflags |= SB_I_TS_EXPIRY_WARNED; } } @@ -2508,11 +2650,17 @@ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags) if (!can_change_locked_flags(mnt, mnt_flags)) return -EPERM; - down_write(&sb->s_umount); + /* + * We're only checking whether the superblock is read-only not + * changing it, so only take down_read(&sb->s_umount). 
+ */ + down_read(&sb->s_umount); + lock_mount_hash(); ret = change_mount_ro_state(mnt, mnt_flags); if (ret == 0) set_mount_attributes(mnt, mnt_flags); - up_write(&sb->s_umount); + unlock_mount_hash(); + up_read(&sb->s_umount); mnt_warn_timestamp_expiry(path, &mnt->mnt); @@ -2545,14 +2693,18 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags, if (IS_ERR(fc)) return PTR_ERR(fc); + fc->oldapi = true; err = parse_monolithic_mount_data(fc, data); if (!err) { down_write(&sb->s_umount); err = -EPERM; if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { err = reconfigure_super(fc); - if (!err) + if (!err) { + lock_mount_hash(); set_mount_attributes(mnt, mnt_flags); + unlock_mount_hash(); + } } up_write(&sb->s_umount); } @@ -2595,6 +2747,78 @@ out: return ret; } +static int do_set_group(struct path *from_path, struct path *to_path) +{ + struct mount *from, *to; + int err; + + from = real_mount(from_path->mnt); + to = real_mount(to_path->mnt); + + namespace_lock(); + + err = -EINVAL; + /* To and From must be mounted */ + if (!is_mounted(&from->mnt)) + goto out; + if (!is_mounted(&to->mnt)) + goto out; + + err = -EPERM; + /* We should be allowed to modify mount namespaces of both mounts */ + if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN)) + goto out; + if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN)) + goto out; + + err = -EINVAL; + /* To and From paths should be mount roots */ + if (from_path->dentry != from_path->mnt->mnt_root) + goto out; + if (to_path->dentry != to_path->mnt->mnt_root) + goto out; + + /* Setting sharing groups is only allowed across same superblock */ + if (from->mnt.mnt_sb != to->mnt.mnt_sb) + goto out; + + /* From mount root should be wider than To mount root */ + if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root)) + goto out; + + /* From mount should not have locked children in place of To's root */ + if (has_locked_children(from, to->mnt.mnt_root)) + goto out; + + /* Setting sharing groups is only allowed on private mounts 
*/ + if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to)) + goto out; + + /* From should not be private */ + if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from)) + goto out; + + if (IS_MNT_SLAVE(from)) { + struct mount *m = from->mnt_master; + + list_add(&to->mnt_slave, &m->mnt_slave_list); + to->mnt_master = m; + } + + if (IS_MNT_SHARED(from)) { + to->mnt_group_id = from->mnt_group_id; + list_add(&to->mnt_share, &from->mnt_share); + lock_mount_hash(); + set_mnt_shared(to); + unlock_mount_hash(); + } + + err = 0; +out: + namespace_unlock(); + return err; +} + static int do_move_mount(struct path *old_path, struct path *new_path) { struct mnt_namespace *ns; @@ -2697,45 +2921,32 @@ static int do_move_mount_old(struct path *path, const char *old_name) /* * add a mount into a namespace's mount tree */ -static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) +static int do_add_mount(struct mount *newmnt, struct mountpoint *mp, + const struct path *path, int mnt_flags) { - struct mountpoint *mp; - struct mount *parent; - int err; + struct mount *parent = real_mount(path->mnt); mnt_flags &= ~MNT_INTERNAL_FLAGS; - mp = lock_mount(path); - if (IS_ERR(mp)) - return PTR_ERR(mp); - - parent = real_mount(path->mnt); - err = -EINVAL; if (unlikely(!check_mnt(parent))) { /* that's acceptable only for automounts done in private ns */ if (!(mnt_flags & MNT_SHRINKABLE)) - goto unlock; + return -EINVAL; /* ... 
and for those we'd better have mountpoint still alive */ if (!parent->mnt_ns) - goto unlock; + return -EINVAL; } /* Refuse the same filesystem on the same mount point */ - err = -EBUSY; if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path->mnt->mnt_root == path->dentry) - goto unlock; + return -EBUSY; - err = -EINVAL; if (d_is_symlink(newmnt->mnt.mnt_root)) - goto unlock; + return -EINVAL; newmnt->mnt.mnt_flags = mnt_flags; - err = graft_tree(newmnt, parent, mp); - -unlock: - unlock_mount(mp); - return err; + return graft_tree(newmnt, parent, mp); } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags); @@ -2748,6 +2959,7 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, unsigned int mnt_flags) { struct vfsmount *mnt; + struct mountpoint *mp; struct super_block *sb = fc->root->d_sb; int error; @@ -2768,7 +2980,13 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, mnt_warn_timestamp_expiry(mountpoint, mnt); - error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags); + mp = lock_mount(mountpoint); + if (IS_ERR(mp)) { + mntput(mnt); + return PTR_ERR(mp); + } + error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags); + unlock_mount(mp); if (error < 0) mntput(mnt); return error; @@ -2827,25 +3045,65 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, return err; } -int finish_automount(struct vfsmount *m, struct path *path) +int finish_automount(struct vfsmount *m, const struct path *path) { - struct mount *mnt = real_mount(m); + struct dentry *dentry = path->dentry; + struct mountpoint *mp; + struct mount *mnt; int err; + + if (!m) + return 0; + if (IS_ERR(m)) + return PTR_ERR(m); + + mnt = real_mount(m); /* The new mount record should have at least 2 refs to prevent it being * expired before we get a chance to add it */ BUG_ON(mnt_get_count(mnt) < 2); if (m->mnt_sb == path->mnt->mnt_sb && - m->mnt_root == path->dentry) { + m->mnt_root == dentry) { 
err = -ELOOP; - goto fail; + goto discard; } - err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE); - if (!err) - return 0; -fail: + /* + * we don't want to use lock_mount() - in this case finding something + * that overmounts our mountpoint to be means "quitely drop what we've + * got", not "try to mount it on top". + */ + inode_lock(dentry->d_inode); + namespace_lock(); + if (unlikely(cant_mount(dentry))) { + err = -ENOENT; + goto discard_locked; + } + rcu_read_lock(); + if (unlikely(__lookup_mnt(path->mnt, dentry))) { + rcu_read_unlock(); + err = 0; + goto discard_locked; + } + rcu_read_unlock(); + mp = get_mountpoint(dentry); + if (IS_ERR(mp)) { + err = PTR_ERR(mp); + goto discard_locked; + } + + err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE); + unlock_mount(mp); + if (unlikely(err)) + goto discard; + mntput(m); + return 0; + +discard_locked: + namespace_unlock(); + inode_unlock(dentry->d_inode); +discard: /* remove m from any expiration list it may be on */ if (!list_empty(&mnt->mnt_expire)) { namespace_lock(); @@ -2979,10 +3237,10 @@ static void shrink_submounts(struct mount *mnt) } } -void *copy_mount_options(const void __user * data) +static void *copy_mount_options(const void __user * data) { char *copy; - unsigned size; + unsigned left, offset; if (!data) return NULL; @@ -2991,20 +3249,31 @@ void *copy_mount_options(const void __user * data) if (!copy) return ERR_PTR(-ENOMEM); - size = PAGE_SIZE - offset_in_page(data); + left = copy_from_user(copy, data, PAGE_SIZE); - if (copy_from_user(copy, data, size)) { + /* + * Not all architectures have an exact copy_from_user(). Resort to + * byte at a time. 
+ */ + offset = PAGE_SIZE - left; + while (left) { + char c; + if (get_user(c, (const char __user *)data + offset)) + break; + copy[offset] = c; + left--; + offset++; + } + + if (left == PAGE_SIZE) { kfree(copy); return ERR_PTR(-EFAULT); } - if (size != PAGE_SIZE) { - if (copy_from_user(copy + size, data + size, PAGE_SIZE - size)) - memset(copy + size, 0, PAGE_SIZE - size); - } + return copy; } -char *copy_mount_string(const void __user *data) +static char *copy_mount_string(const void __user *data) { return data ? strndup_user(data, PATH_MAX) : NULL; } @@ -3023,12 +3292,11 @@ char *copy_mount_string(const void __user *data) * Therefore, if this magic number is present, it carries no information * and must be discarded. */ -long do_mount(const char *dev_name, const char __user *dir_name, +int path_mount(const char *dev_name, struct path *path, const char *type_page, unsigned long flags, void *data_page) { - struct path path; unsigned int mnt_flags = 0, sb_flags; - int retval = 0; + int ret; /* Discard magic */ if ((flags & MS_MGC_MSK) == MS_MGC_VAL) @@ -3041,19 +3309,13 @@ long do_mount(const char *dev_name, const char __user *dir_name, if (flags & MS_NOUSER) return -EINVAL; - /* ... 
and get the mountpoint */ - retval = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path); - if (retval) - return retval; - - retval = security_sb_mount(dev_name, &path, - type_page, flags, data_page); - if (!retval && !may_mount()) - retval = -EPERM; - if (!retval && (flags & SB_MANDLOCK) && !may_mandlock()) - retval = -EPERM; - if (retval) - goto dput_out; + ret = security_sb_mount(dev_name, path, type_page, flags, data_page); + if (ret) + return ret; + if (!may_mount()) + return -EPERM; + if (flags & SB_MANDLOCK) + warn_mandlock(); /* Default to relatime unless overriden */ if (!(flags & MS_NOATIME)) @@ -3074,13 +3336,15 @@ long do_mount(const char *dev_name, const char __user *dir_name, mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; + if (flags & MS_NOSYMFOLLOW) + mnt_flags |= MNT_NOSYMFOLLOW; /* The default atime for remount is preservation */ if ((flags & MS_REMOUNT) && ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME | MS_STRICTATIME)) == 0)) { mnt_flags &= ~MNT_ATIME_MASK; - mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK; + mnt_flags |= path->mnt->mnt_flags & MNT_ATIME_MASK; } sb_flags = flags & (SB_RDONLY | @@ -3093,22 +3357,32 @@ long do_mount(const char *dev_name, const char __user *dir_name, SB_I_VERSION); if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND)) - retval = do_reconfigure_mnt(&path, mnt_flags); - else if (flags & MS_REMOUNT) - retval = do_remount(&path, flags, sb_flags, mnt_flags, - data_page); - else if (flags & MS_BIND) - retval = do_loopback(&path, dev_name, flags & MS_REC); - else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) - retval = do_change_type(&path, flags); - else if (flags & MS_MOVE) - retval = do_move_mount_old(&path, dev_name); - else - retval = do_new_mount(&path, type_page, sb_flags, mnt_flags, - dev_name, data_page); -dput_out: + return do_reconfigure_mnt(path, mnt_flags); + if (flags & MS_REMOUNT) + return do_remount(path, flags, 
sb_flags, mnt_flags, data_page); + if (flags & MS_BIND) + return do_loopback(path, dev_name, flags & MS_REC); + if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) + return do_change_type(path, flags); + if (flags & MS_MOVE) + return do_move_mount_old(path, dev_name); + + return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name, + data_page); +} + +long do_mount(const char *dev_name, const char __user *dir_name, + const char *type_page, unsigned long flags, void *data_page) +{ + struct path path; + int ret; + + ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path); + if (ret) + return ret; + ret = path_mount(dev_name, &path, type_page, flags, data_page); path_put(&path); - return retval; + return ret; } static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns) @@ -3149,7 +3423,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a if (!ucounts) return ERR_PTR(-ENOSPC); - new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL); + new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) { dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); @@ -3165,9 +3439,10 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a new_ns->ns.ops = &mntns_operations; if (!anon) new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); - atomic_set(&new_ns->count, 1); + refcount_set(&new_ns->ns.count, 1); INIT_LIST_HEAD(&new_ns->list); init_waitqueue_head(&new_ns->poll); + spin_lock_init(&new_ns->ns_lock); new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; return new_ns; @@ -3324,6 +3599,36 @@ out_type: return ret; } +#define FSMOUNT_VALID_FLAGS \ + (MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \ + MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME | \ + MOUNT_ATTR_NOSYMFOLLOW) + +#define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP) + +#define MOUNT_SETATTR_PROPAGATION_FLAGS \ + (MS_UNBINDABLE | 
MS_PRIVATE | MS_SLAVE | MS_SHARED) + +static unsigned int attr_flags_to_mnt_flags(u64 attr_flags) +{ + unsigned int mnt_flags = 0; + + if (attr_flags & MOUNT_ATTR_RDONLY) + mnt_flags |= MNT_READONLY; + if (attr_flags & MOUNT_ATTR_NOSUID) + mnt_flags |= MNT_NOSUID; + if (attr_flags & MOUNT_ATTR_NODEV) + mnt_flags |= MNT_NODEV; + if (attr_flags & MOUNT_ATTR_NOEXEC) + mnt_flags |= MNT_NOEXEC; + if (attr_flags & MOUNT_ATTR_NODIRATIME) + mnt_flags |= MNT_NODIRATIME; + if (attr_flags & MOUNT_ATTR_NOSYMFOLLOW) + mnt_flags |= MNT_NOSYMFOLLOW; + + return mnt_flags; +} + /* * Create a kernel mount representation for a new, prepared superblock * (specified by fs_fd) and attach to an open_tree-like file descriptor. @@ -3346,24 +3651,10 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, if ((flags & ~(FSMOUNT_CLOEXEC)) != 0) return -EINVAL; - if (attr_flags & ~(MOUNT_ATTR_RDONLY | - MOUNT_ATTR_NOSUID | - MOUNT_ATTR_NODEV | - MOUNT_ATTR_NOEXEC | - MOUNT_ATTR__ATIME | - MOUNT_ATTR_NODIRATIME)) + if (attr_flags & ~FSMOUNT_VALID_FLAGS) return -EINVAL; - if (attr_flags & MOUNT_ATTR_RDONLY) - mnt_flags |= MNT_READONLY; - if (attr_flags & MOUNT_ATTR_NOSUID) - mnt_flags |= MNT_NOSUID; - if (attr_flags & MOUNT_ATTR_NODEV) - mnt_flags |= MNT_NODEV; - if (attr_flags & MOUNT_ATTR_NOEXEC) - mnt_flags |= MNT_NOEXEC; - if (attr_flags & MOUNT_ATTR_NODIRATIME) - mnt_flags |= MNT_NODIRATIME; + mnt_flags = attr_flags_to_mnt_flags(attr_flags); switch (attr_flags & MOUNT_ATTR__ATIME) { case MOUNT_ATTR_STRICTATIME: @@ -3407,9 +3698,8 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, if (fc->phase != FS_CONTEXT_AWAITING_MOUNT) goto err_unlock; - ret = -EPERM; - if ((fc->sb_flags & SB_MANDLOCK) && !may_mandlock()) - goto err_unlock; + if (fc->sb_flags & SB_MANDLOCK) + warn_mandlock(); newmount.mnt = vfs_create_mount(fc); if (IS_ERR(newmount.mnt)) { @@ -3513,7 +3803,10 @@ SYSCALL_DEFINE5(move_mount, if (ret < 0) goto out_to; - ret = do_move_mount(&from_path, &to_path); + if (flags 
& MOVE_MOUNT_SET_GROUP) + ret = do_set_group(&from_path, &to_path); + else + ret = do_move_mount(&from_path, &to_path); out_to: path_put(&to_path); @@ -3561,7 +3854,7 @@ EXPORT_SYMBOL(path_is_under); * file system may be mounted on put_old. After all, new_root is a mountpoint. * * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. - * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives + * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives * in this situation. * * Notes: @@ -3671,6 +3964,415 @@ out0: return error; } +static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt) +{ + unsigned int flags = mnt->mnt.mnt_flags; + + /* flags to clear */ + flags &= ~kattr->attr_clr; + /* flags to raise */ + flags |= kattr->attr_set; + + return flags; +} + +static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) +{ + struct vfsmount *m = &mnt->mnt; + struct user_namespace *fs_userns = m->mnt_sb->s_user_ns; + + if (!kattr->mnt_userns) + return 0; + + /* + * Creating an idmapped mount with the filesystem wide idmapping + * doesn't make sense so block that. We don't allow mushy semantics. + */ + if (kattr->mnt_userns == fs_userns) + return -EINVAL; + + /* + * Once a mount has been idmapped we don't allow it to change its + * mapping. It makes things simpler and callers can just create + * another bind-mount they can idmap if they want to. + */ + if (is_idmapped_mnt(m)) + return -EPERM; + + /* The underlying filesystem doesn't support idmapped mounts yet. */ + if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP)) + return -EINVAL; + + /* We're not controlling the superblock. */ + if (!ns_capable(fs_userns, CAP_SYS_ADMIN)) + return -EPERM; + + /* Mount has already been visible in the filesystem hierarchy. 
*/ + if (!is_anon_ns(mnt->mnt_ns)) + return -EINVAL; + + return 0; +} + +/** + * mnt_allow_writers() - check whether the attribute change allows writers + * @kattr: the new mount attributes + * @mnt: the mount to which @kattr will be applied + * + * Check whether the new mount attributes in @kattr allow concurrent writers. + * + * Return: true if writers are allowed and need not be held, false if writers need to be held + */ +static inline bool mnt_allow_writers(const struct mount_kattr *kattr, + const struct mount *mnt) +{ + return (!(kattr->attr_set & MNT_READONLY) || + (mnt->mnt.mnt_flags & MNT_READONLY)) && + !kattr->mnt_userns; +} + +static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) +{ + struct mount *m; + int err; + + for (m = mnt; m; m = next_mnt(m, mnt)) { + if (!can_change_locked_flags(m, recalc_flags(kattr, m))) { + err = -EPERM; + break; + } + + err = can_idmap_mount(kattr, m); + if (err) + break; + + if (!mnt_allow_writers(kattr, m)) { + err = mnt_hold_writers(m); + if (err) + break; + } + + if (!kattr->recurse) + return 0; + } + + if (err) { + struct mount *p; + + /* + * If we had to call mnt_hold_writers() MNT_WRITE_HOLD will + * be set in @mnt_flags. The loop unsets MNT_WRITE_HOLD for all + * mounts and needs to take care to include the first mount. + */ + for (p = mnt; p; p = next_mnt(p, mnt)) { + /* If we had to hold writers unblock them. */ + if (p->mnt.mnt_flags & MNT_WRITE_HOLD) + mnt_unhold_writers(p); + + /* + * We're done once the first mount we changed got + * MNT_WRITE_HOLD unset. + */ + if (p == m) + break; + } + } + return err; +} + +static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) +{ + struct user_namespace *mnt_userns, *old_mnt_userns; + + if (!kattr->mnt_userns) + return; + + /* + * We're the only ones able to change the mount's idmapping. So + * mnt->mnt.mnt_userns is stable and we can retrieve it directly. 
+ */ + old_mnt_userns = mnt->mnt.mnt_userns; + + mnt_userns = get_user_ns(kattr->mnt_userns); + /* Pairs with smp_load_acquire() in mnt_user_ns(). */ + smp_store_release(&mnt->mnt.mnt_userns, mnt_userns); + + /* + * If this is an idmapped filesystem drop the reference we've taken + * in vfs_create_mount() before. + */ + if (!initial_idmapping(old_mnt_userns)) + put_user_ns(old_mnt_userns); +} + +static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) +{ + struct mount *m; + + for (m = mnt; m; m = next_mnt(m, mnt)) { + unsigned int flags; + + do_idmap_mount(kattr, m); + flags = recalc_flags(kattr, m); + WRITE_ONCE(m->mnt.mnt_flags, flags); + + /* If we had to hold writers unblock them. */ + if (m->mnt.mnt_flags & MNT_WRITE_HOLD) + mnt_unhold_writers(m); + + if (kattr->propagation) + change_mnt_propagation(m, kattr->propagation); + if (!kattr->recurse) + break; + } + touch_mnt_namespace(mnt->mnt_ns); +} + +static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) +{ + struct mount *mnt = real_mount(path->mnt); + int err = 0; + + if (path->dentry != mnt->mnt.mnt_root) + return -EINVAL; + + if (kattr->propagation) { + /* + * Only take namespace_lock() if we're actually changing + * propagation. + */ + namespace_lock(); + if (kattr->propagation == MS_SHARED) { + err = invent_group_ids(mnt, kattr->recurse); + if (err) { + namespace_unlock(); + return err; + } + } + } + + err = -EINVAL; + lock_mount_hash(); + + /* Ensure that this isn't anything purely vfs internal. */ + if (!is_mounted(&mnt->mnt)) + goto out; + + /* + * If this is an attached mount make sure it's located in the callers + * mount namespace. If it's not don't let the caller interact with it. + * If this is a detached mount make sure it has an anonymous mount + * namespace attached to it, i.e. we've created it via OPEN_TREE_CLONE. + */ + if (!(mnt_has_parent(mnt) ? 
check_mnt(mnt) : is_anon_ns(mnt->mnt_ns))) + goto out; + + /* + * First, we get the mount tree in a shape where we can change mount + * properties without failure. If we succeeded to do so we commit all + * changes and if we failed we clean up. + */ + err = mount_setattr_prepare(kattr, mnt); + if (!err) + mount_setattr_commit(kattr, mnt); + +out: + unlock_mount_hash(); + + if (kattr->propagation) { + namespace_unlock(); + if (err) + cleanup_group_ids(mnt, NULL); + } + + return err; +} + +static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, + struct mount_kattr *kattr, unsigned int flags) +{ + int err = 0; + struct ns_common *ns; + struct user_namespace *mnt_userns; + struct file *file; + + if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP)) + return 0; + + /* + * We currently do not support clearing an idmapped mount. If this ever + * is a use-case we can revisit this but for now let's keep it simple + * and not allow it. + */ + if (attr->attr_clr & MOUNT_ATTR_IDMAP) + return -EINVAL; + + if (attr->userns_fd > INT_MAX) + return -EINVAL; + + file = fget(attr->userns_fd); + if (!file) + return -EBADF; + + if (!proc_ns_file(file)) { + err = -EINVAL; + goto out_fput; + } + + ns = get_proc_ns(file_inode(file)); + if (ns->ops->type != CLONE_NEWUSER) { + err = -EINVAL; + goto out_fput; + } + + /* + * The initial idmapping cannot be used to create an idmapped + * mount. We use the initial idmapping as an indicator of a mount + * that is not idmapped. It can simply be passed into helpers that + * are aware of idmapped mounts as a convenient shortcut. A user + * can just create a dedicated identity mapping to achieve the same + * result. + */ + mnt_userns = container_of(ns, struct user_namespace, ns); + if (initial_idmapping(mnt_userns)) { + err = -EPERM; + goto out_fput; + } + + /* We're not controlling the target namespace. 
*/ + if (!ns_capable(mnt_userns, CAP_SYS_ADMIN)) { + err = -EPERM; + goto out_fput; + } + + kattr->mnt_userns = get_user_ns(mnt_userns); + +out_fput: + fput(file); + return err; +} + +static int build_mount_kattr(const struct mount_attr *attr, size_t usize, + struct mount_kattr *kattr, unsigned int flags) +{ + unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; + + if (flags & AT_NO_AUTOMOUNT) + lookup_flags &= ~LOOKUP_AUTOMOUNT; + if (flags & AT_SYMLINK_NOFOLLOW) + lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + + *kattr = (struct mount_kattr) { + .lookup_flags = lookup_flags, + .recurse = !!(flags & AT_RECURSIVE), + }; + + if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS) + return -EINVAL; + if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1) + return -EINVAL; + kattr->propagation = attr->propagation; + + if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS) + return -EINVAL; + + kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set); + kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr); + + /* + * Since the MOUNT_ATTR_<atime> values are an enum, not a bitmap, + * users wanting to transition to a different atime setting cannot + * simply specify the atime setting in @attr_set, but must also + * specify MOUNT_ATTR__ATIME in the @attr_clr field. + * So ensure that MOUNT_ATTR__ATIME can't be partially set in + * @attr_clr and that @attr_set can't have any atime bits set if + * MOUNT_ATTR__ATIME isn't set in @attr_clr. + */ + if (attr->attr_clr & MOUNT_ATTR__ATIME) { + if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME) + return -EINVAL; + + /* + * Clear all previous time settings as they are mutually + * exclusive. 
+ */ + kattr->attr_clr |= MNT_RELATIME | MNT_NOATIME; + switch (attr->attr_set & MOUNT_ATTR__ATIME) { + case MOUNT_ATTR_RELATIME: + kattr->attr_set |= MNT_RELATIME; + break; + case MOUNT_ATTR_NOATIME: + kattr->attr_set |= MNT_NOATIME; + break; + case MOUNT_ATTR_STRICTATIME: + break; + default: + return -EINVAL; + } + } else { + if (attr->attr_set & MOUNT_ATTR__ATIME) + return -EINVAL; + } + + return build_mount_idmapped(attr, usize, kattr, flags); +} + +static void finish_mount_kattr(struct mount_kattr *kattr) +{ + put_user_ns(kattr->mnt_userns); + kattr->mnt_userns = NULL; +} + +SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, + unsigned int, flags, struct mount_attr __user *, uattr, + size_t, usize) +{ + int err; + struct path target; + struct mount_attr attr; + struct mount_kattr kattr; + + BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0); + + if (flags & ~(AT_EMPTY_PATH | + AT_RECURSIVE | + AT_SYMLINK_NOFOLLOW | + AT_NO_AUTOMOUNT)) + return -EINVAL; + + if (unlikely(usize > PAGE_SIZE)) + return -E2BIG; + if (unlikely(usize < MOUNT_ATTR_SIZE_VER0)) + return -EINVAL; + + if (!may_mount()) + return -EPERM; + + err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize); + if (err) + return err; + + /* Don't bother walking through the mounts if this is a nop. 
*/ + if (attr.attr_set == 0 && + attr.attr_clr == 0 && + attr.propagation == 0) + return 0; + + err = build_mount_kattr(&attr, usize, &kattr, flags); + if (err) + return err; + + err = user_path_at(dfd, path, kattr.lookup_flags, &target); + if (!err) { + err = do_mount_setattr(&target, &kattr); + path_put(&target); + } + finish_mount_kattr(&kattr); + return err; +} + static void __init init_mount_tree(void) { struct vfsmount *mnt; @@ -3706,7 +4408,7 @@ void __init mnt_init(void) int err; mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); mount_hashtable = alloc_large_system_hash("Mount-cache", sizeof(struct hlist_head), @@ -3738,7 +4440,7 @@ void __init mnt_init(void) void put_mnt_ns(struct mnt_namespace *ns) { - if (!atomic_dec_and_test(&ns->count)) + if (!refcount_dec_and_test(&ns->ns.count)) return; drop_collected_mounts(&ns->root->mnt); free_mnt_ns(ns); @@ -3770,6 +4472,19 @@ void kern_unmount(struct vfsmount *mnt) } EXPORT_SYMBOL(kern_unmount); +void kern_unmount_array(struct vfsmount *mnt[], unsigned int num) +{ + unsigned int i; + + for (i = 0; i < num; i++) + if (mnt[i]) + real_mount(mnt[i])->mnt_ns = NULL; + synchronize_rcu_expedited(); + for (i = 0; i < num; i++) + mntput(mnt[i]); +} +EXPORT_SYMBOL(kern_unmount_array); + bool our_mnt(struct vfsmount *mnt) { return check_mnt(real_mount(mnt)); @@ -3808,10 +4523,14 @@ static bool mnt_already_visible(struct mnt_namespace *ns, bool visible = false; down_read(&namespace_sem); + lock_ns_list(ns); list_for_each_entry(mnt, &ns->list, mnt_list) { struct mount *child; int mnt_flags; + if (mnt_is_cursor(mnt)) + continue; + if (mnt->mnt.mnt_sb->s_type != sb->s_type) continue; @@ -3859,6 +4578,7 @@ static bool mnt_already_visible(struct mnt_namespace *ns, next: ; } found: + unlock_ns_list(ns); up_read(&namespace_sem); return visible; } @@ -3920,16 +4640,18 @@ static void mntns_put(struct ns_common *ns) 
put_mnt_ns(to_mnt_ns(ns)); } -static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) +static int mntns_install(struct nsset *nsset, struct ns_common *ns) { - struct fs_struct *fs = current->fs; + struct nsproxy *nsproxy = nsset->nsproxy; + struct fs_struct *fs = nsset->fs; struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns; + struct user_namespace *user_ns = nsset->cred->user_ns; struct path root; int err; if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || - !ns_capable(current_user_ns(), CAP_SYS_CHROOT) || - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + !ns_capable(user_ns, CAP_SYS_CHROOT) || + !ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; if (is_anon_ns(mnt_ns)) @@ -3975,3 +4697,25 @@ const struct proc_ns_operations mntns_operations = { .install = mntns_install, .owner = mntns_owner, }; + +#ifdef CONFIG_SYSCTL +static struct ctl_table fs_namespace_sysctls[] = { + { + .procname = "mount-max", + .data = &sysctl_mount_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, + { } +}; + +static int __init init_fs_namespace_sysctls(void) +{ + register_sysctl_init("fs", fs_namespace_sysctls); + return 0; +} +fs_initcall(init_fs_namespace_sysctls); + +#endif /* CONFIG_SYSCTL */ |