aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2020-12-15 19:29:43 -0800
committer Linus Torvalds <torvalds@linux-foundation.org> 2020-12-15 19:29:43 -0800
commit faf145d6f3f3d6f2c066f65602ba9d0a03106915 (patch)
tree 17685a9855b048998f734f9114c9cbf94aa61569
parent Merge branch 'signal-for-v5.11' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace (diff)
parent exec: Move io_uring_task_cancel after the point of no return (diff)
download linux-dev-faf145d6f3f3d6f2c066f65602ba9d0a03106915.tar.xz
linux-dev-faf145d6f3f3d6f2c066f65602ba9d0a03106915.zip
Merge branch 'exec-for-v5.11' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace
Pull execve updates from Eric Biederman:

 "This set of changes ultimately fixes the interaction of posix file
  lock and exec. Fundamentally most of the change is just moving where
  unshare_files is called during exec, and tweaking the users of
  files_struct so that the count of files_struct is not unnecessarily
  played with.

  Along the way fcheck and related helpers were renamed to more
  accurately reflect what they do.

  There were also many other small changes that fell out, as this is
  the first time in a long time much of this code has been touched.

  Benchmarks haven't turned up any practical issues but Al Viro has
  observed a possibility for a lot of pounding on task_lock. So I have
  some changes in progress to convert put_files_struct to always rcu
  free files_struct. That wasn't ready for the merge window so that
  will have to wait until next time"

* 'exec-for-v5.11' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: (27 commits)
  exec: Move io_uring_task_cancel after the point of no return
  coredump: Document coredump code exclusively used by cell spufs
  file: Remove get_files_struct
  file: Rename __close_fd_get_file close_fd_get_file
  file: Replace ksys_close with close_fd
  file: Rename __close_fd to close_fd and remove the files parameter
  file: Merge __alloc_fd into alloc_fd
  file: In f_dupfd read RLIMIT_NOFILE once.
  file: Merge __fd_install into fd_install
  proc/fd: In fdinfo seq_show don't use get_files_struct
  bpf/task_iter: In task_file_seq_get_next use task_lookup_next_fd_rcu
  proc/fd: In proc_readfd_common use task_lookup_next_fd_rcu
  file: Implement task_lookup_next_fd_rcu
  kcmp: In get_file_raw_ptr use task_lookup_fd_rcu
  proc/fd: In tid_fd_mode use task_lookup_fd_rcu
  file: Implement task_lookup_fd_rcu
  file: Rename fcheck lookup_fd_rcu
  file: Replace fcheck_files with files_lookup_fd_rcu
  file: Factor files_lookup_fd_locked out of fcheck_files
  file: Rename __fcheck_files to files_lookup_fd_raw
  ...
-rw-r--r-- Documentation/filesystems/files.rst 8
-rw-r--r-- arch/powerpc/platforms/cell/spufs/coredump.c 2
-rw-r--r-- drivers/android/binder.c 2
-rw-r--r-- fs/autofs/dev-ioctl.c 5
-rw-r--r-- fs/binfmt_elf.c 2
-rw-r--r-- fs/coredump.c 6
-rw-r--r-- fs/exec.c 39
-rw-r--r-- fs/file.c 124
-rw-r--r-- fs/io_uring.c 2
-rw-r--r-- fs/locks.c 14
-rw-r--r-- fs/notify/dnotify/dnotify.c 2
-rw-r--r-- fs/open.c 2
-rw-r--r-- fs/proc/fd.c 48
-rw-r--r-- include/linux/fdtable.h 40
-rw-r--r-- include/linux/syscalls.h 12
-rw-r--r-- kernel/bpf/syscall.c 20
-rw-r--r-- kernel/bpf/task_iter.c 33
-rw-r--r-- kernel/fork.c 12
-rw-r--r-- kernel/kcmp.c 29
19 files changed, 158 insertions, 244 deletions
diff --git a/Documentation/filesystems/files.rst b/Documentation/filesystems/files.rst
index cbf8e57376bf..bcf84459917f 100644
--- a/Documentation/filesystems/files.rst
+++ b/Documentation/filesystems/files.rst
@@ -62,7 +62,7 @@ the fdtable structure -
be held.
4. To look up the file structure given an fd, a reader
- must use either fcheck() or fcheck_files() APIs. These
+ must use either lookup_fd_rcu() or files_lookup_fd_rcu() APIs. These
take care of barrier requirements due to lock-free lookup.
An example::
@@ -70,7 +70,7 @@ the fdtable structure -
struct file *file;
rcu_read_lock();
- file = fcheck(fd);
+ file = lookup_fd_rcu(fd);
if (file) {
...
}
@@ -84,7 +84,7 @@ the fdtable structure -
on ->f_count::
rcu_read_lock();
- file = fcheck_files(files, fd);
+ file = files_lookup_fd_rcu(files, fd);
if (file) {
if (atomic_long_inc_not_zero(&file->f_count))
*fput_needed = 1;
@@ -104,7 +104,7 @@ the fdtable structure -
lock-free, they must be installed using rcu_assign_pointer()
API. If they are looked up lock-free, rcu_dereference()
must be used. However it is advisable to use files_fdtable()
- and fcheck()/fcheck_files() which take care of these issues.
+ and lookup_fd_rcu()/files_lookup_fd_rcu() which take care of these issues.
7. While updating, the fdtable pointer must be looked up while
holding files->file_lock. If ->file_lock is dropped, then
diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c
index 026c181a98c5..60b5583e9eaf 100644
--- a/arch/powerpc/platforms/cell/spufs/coredump.c
+++ b/arch/powerpc/platforms/cell/spufs/coredump.c
@@ -74,7 +74,7 @@ static struct spu_context *coredump_next_context(int *fd)
*fd = n - 1;
rcu_read_lock();
- file = fcheck(*fd);
+ file = lookup_fd_rcu(*fd);
ctx = SPUFS_I(file_inode(file))->i_ctx;
get_spu_context(ctx);
rcu_read_unlock();
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 1338209f9f86..c119736ca56a 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -1836,7 +1836,7 @@ static void binder_deferred_fd_close(int fd)
if (!twcb)
return;
init_task_work(&twcb->twork, binder_do_fd_close);
- __close_fd_get_file(fd, &twcb->file);
+ close_fd_get_file(fd, &twcb->file);
if (twcb->file) {
filp_close(twcb->file, current->files);
task_work_add(current, &twcb->twork, TWA_RESUME);
diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index 322b7dfb4ea0..5bf781ea6d67 100644
--- a/fs/autofs/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -4,9 +4,10 @@
* Copyright 2008 Ian Kent <raven@themaw.net>
*/
+#include <linux/module.h>
#include <linux/miscdevice.h>
#include <linux/compat.h>
-#include <linux/syscalls.h>
+#include <linux/fdtable.h>
#include <linux/magic.h>
#include <linux/nospec.h>
@@ -289,7 +290,7 @@ static int autofs_dev_ioctl_closemount(struct file *fp,
struct autofs_sb_info *sbi,
struct autofs_dev_ioctl *param)
{
- return ksys_close(param->ioctlfd);
+ return close_fd(param->ioctlfd);
}
/*
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index ac0b5fc30ea6..950bc177238a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -2198,6 +2198,7 @@ static int elf_core_dump(struct coredump_params *cprm)
{
size_t sz = get_note_info_size(&info);
+ /* For cell spufs */
sz += elf_coredump_extra_notes_size();
phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL);
@@ -2261,6 +2262,7 @@ static int elf_core_dump(struct coredump_params *cprm)
if (!write_note_info(&info, cprm))
goto end_coredump;
+ /* For cell spufs */
if (elf_coredump_extra_notes_write(cprm))
goto end_coredump;
diff --git a/fs/coredump.c b/fs/coredump.c
index c6acfc694f65..a2f6ecc8e345 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -586,7 +586,6 @@ void do_coredump(const kernel_siginfo_t *siginfo)
int ispipe;
size_t *argv = NULL;
int argc = 0;
- struct files_struct *displaced;
/* require nonrelative corefile path and be extra careful */
bool need_suid_safe = false;
bool core_dumped = false;
@@ -792,11 +791,10 @@ void do_coredump(const kernel_siginfo_t *siginfo)
}
/* get us an unshared descriptor table; almost always a no-op */
- retval = unshare_files(&displaced);
+ /* The cell spufs coredump code reads the file descriptor tables */
+ retval = unshare_files();
if (retval)
goto close_fail;
- if (displaced)
- put_files_struct(displaced);
if (!dump_interrupted()) {
/*
* umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would
diff --git a/fs/exec.c b/fs/exec.c
index aee36e5733ce..81b85f70e9f3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1259,6 +1259,16 @@ int begin_new_exec(struct linux_binprm * bprm)
goto out;
/*
+ * Cancel any io_uring activity across execve
+ */
+ io_uring_task_cancel();
+
+ /* Ensure the files table is not shared. */
+ retval = unshare_files();
+ if (retval)
+ goto out;
+
+ /*
* Must be called _before_ exec_mmap() as bprm->mm is
* not visibile until then. This also enables the update
* to be lockless.
@@ -1779,21 +1789,11 @@ static int bprm_execve(struct linux_binprm *bprm,
int fd, struct filename *filename, int flags)
{
struct file *file;
- struct files_struct *displaced;
int retval;
- /*
- * Cancel any io_uring activity across execve
- */
- io_uring_task_cancel();
-
- retval = unshare_files(&displaced);
- if (retval)
- return retval;
-
retval = prepare_bprm_creds(bprm);
if (retval)
- goto out_files;
+ return retval;
check_unsafe_exec(bprm);
current->in_execve = 1;
@@ -1808,11 +1808,14 @@ static int bprm_execve(struct linux_binprm *bprm,
bprm->file = file;
/*
* Record that a name derived from an O_CLOEXEC fd will be
- * inaccessible after exec. Relies on having exclusive access to
- * current->files (due to unshare_files above).
+ * inaccessible after exec. This allows the code in exec to
+ * choose to fail when the executable is not mmaped into the
+ * interpreter and an open file descriptor is not passed to
+ * the interpreter. This makes for a better user experience
+ * than having the interpreter start and then immediately fail
+ * when it finds the executable is inaccessible.
*/
- if (bprm->fdpath &&
- close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
+ if (bprm->fdpath && get_close_on_exec(fd))
bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
/* Set the unchanging part of bprm->cred */
@@ -1830,8 +1833,6 @@ static int bprm_execve(struct linux_binprm *bprm,
rseq_execve(current);
acct_update_integrals(current);
task_numa_free(current, false);
- if (displaced)
- put_files_struct(displaced);
return retval;
out:
@@ -1848,10 +1849,6 @@ out_unmark:
current->fs->in_exec = 0;
current->in_execve = 0;
-out_files:
- if (displaced)
- reset_files_struct(displaced);
-
return retval;
}
diff --git a/fs/file.c b/fs/file.c
index e08e4daccac3..8434e0afecc7 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -158,7 +158,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr)
spin_unlock(&files->file_lock);
new_fdt = alloc_fdtable(nr);
- /* make sure all __fd_install() have seen resize_in_progress
+ /* make sure all fd_install() have seen resize_in_progress
* or have finished their rcu_read_lock_sched() section.
*/
if (atomic_read(&files->count) > 1)
@@ -181,7 +181,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr)
rcu_assign_pointer(files->fdt, new_fdt);
if (cur_fdt != &files->fdtab)
call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
- /* coupled with smp_rmb() in __fd_install() */
+ /* coupled with smp_rmb() in fd_install() */
smp_wmb();
return 1;
}
@@ -411,19 +411,6 @@ static struct fdtable *close_files(struct files_struct * files)
return fdt;
}
-struct files_struct *get_files_struct(struct task_struct *task)
-{
- struct files_struct *files;
-
- task_lock(task);
- files = task->files;
- if (files)
- atomic_inc(&files->count);
- task_unlock(task);
-
- return files;
-}
-
void put_files_struct(struct files_struct *files)
{
if (atomic_dec_and_test(&files->count)) {
@@ -436,18 +423,6 @@ void put_files_struct(struct files_struct *files)
}
}
-void reset_files_struct(struct files_struct *files)
-{
- struct task_struct *tsk = current;
- struct files_struct *old;
-
- old = tsk->files;
- task_lock(tsk);
- tsk->files = files;
- task_unlock(tsk);
- put_files_struct(old);
-}
-
void exit_files(struct task_struct *tsk)
{
struct files_struct * files = tsk->files;
@@ -492,9 +467,9 @@ static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
/*
* allocate a file descriptor, mark it busy.
*/
-int __alloc_fd(struct files_struct *files,
- unsigned start, unsigned end, unsigned flags)
+static int alloc_fd(unsigned start, unsigned end, unsigned flags)
{
+ struct files_struct *files = current->files;
unsigned int fd;
int error;
struct fdtable *fdt;
@@ -550,14 +525,9 @@ out:
return error;
}
-static int alloc_fd(unsigned start, unsigned flags)
-{
- return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags);
-}
-
int __get_unused_fd_flags(unsigned flags, unsigned long nofile)
{
- return __alloc_fd(current->files, 0, nofile, flags);
+ return alloc_fd(0, nofile, flags);
}
int get_unused_fd_flags(unsigned flags)
@@ -596,17 +566,13 @@ EXPORT_SYMBOL(put_unused_fd);
* It should never happen - if we allow dup2() do it, _really_ bad things
* will follow.
*
- * NOTE: __fd_install() variant is really, really low-level; don't
- * use it unless you are forced to by truly lousy API shoved down
- * your throat. 'files' *MUST* be either current->files or obtained
- * by get_files_struct(current) done by whoever had given it to you,
- * or really bad things will happen. Normally you want to use
- * fd_install() instead.
+ * This consumes the "file" refcount, so callers should treat it
+ * as if they had called fput(file).
*/
-void __fd_install(struct files_struct *files, unsigned int fd,
- struct file *file)
+void fd_install(unsigned int fd, struct file *file)
{
+ struct files_struct *files = current->files;
struct fdtable *fdt;
rcu_read_lock_sched();
@@ -628,15 +594,6 @@ void __fd_install(struct files_struct *files, unsigned int fd,
rcu_read_unlock_sched();
}
-/*
- * This consumes the "file" refcount, so callers should treat it
- * as if they had called fput(file).
- */
-void fd_install(unsigned int fd, struct file *file)
-{
- __fd_install(current->files, fd, file);
-}
-
EXPORT_SYMBOL(fd_install);
static struct file *pick_file(struct files_struct *files, unsigned fd)
@@ -659,11 +616,9 @@ out_unlock:
return file;
}
-/*
- * The same warnings as for __alloc_fd()/__fd_install() apply here...
- */
-int __close_fd(struct files_struct *files, unsigned fd)
+int close_fd(unsigned fd)
{
+ struct files_struct *files = current->files;
struct file *file;
file = pick_file(files, fd);
@@ -672,7 +627,7 @@ int __close_fd(struct files_struct *files, unsigned fd)
return filp_close(file, files);
}
-EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
+EXPORT_SYMBOL(close_fd); /* for ksys_close() */
static inline void __range_cloexec(struct files_struct *cur_fds,
unsigned int fd, unsigned int max_fd)
@@ -777,11 +732,11 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
}
/*
- * variant of __close_fd that gets a ref on the file for later fput.
+ * variant of close_fd that gets a ref on the file for later fput.
* The caller must ensure that filp_close() called on the file, and then
* an fput().
*/
-int __close_fd_get_file(unsigned int fd, struct file **res)
+int close_fd_get_file(unsigned int fd, struct file **res)
{
struct files_struct *files = current->files;
struct file *file;
@@ -850,7 +805,7 @@ static struct file *__fget_files(struct files_struct *files, unsigned int fd,
rcu_read_lock();
loop:
- file = fcheck_files(files, fd);
+ file = files_lookup_fd_rcu(files, fd);
if (file) {
/* File object ref couldn't be taken.
* dup2() atomicity guarantee is the reason
@@ -901,6 +856,42 @@ struct file *fget_task(struct task_struct *task, unsigned int fd)
return file;
}
+struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
+{
+ /* Must be called with rcu_read_lock held */
+ struct files_struct *files;
+ struct file *file = NULL;
+
+ task_lock(task);
+ files = task->files;
+ if (files)
+ file = files_lookup_fd_rcu(files, fd);
+ task_unlock(task);
+
+ return file;
+}
+
+struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd)
+{
+ /* Must be called with rcu_read_lock held */
+ struct files_struct *files;
+ unsigned int fd = *ret_fd;
+ struct file *file = NULL;
+
+ task_lock(task);
+ files = task->files;
+ if (files) {
+ for (; fd < files_fdtable(files)->max_fds; fd++) {
+ file = files_lookup_fd_rcu(files, fd);
+ if (file)
+ break;
+ }
+ }
+ task_unlock(task);
+ *ret_fd = fd;
+ return file;
+}
+
/*
* Lightweight file lookup - no refcnt increment if fd table isn't shared.
*
@@ -923,7 +914,7 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask)
struct file *file;
if (atomic_read(&files->count) == 1) {
- file = __fcheck_files(files, fd);
+ file = files_lookup_fd_raw(files, fd);
if (!file || unlikely(file->f_mode & mask))
return 0;
return (unsigned long)file;
@@ -1045,7 +1036,7 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags)
struct files_struct *files = current->files;
if (!file)
- return __close_fd(files, fd);
+ return close_fd(fd);
if (fd >= rlimit(RLIMIT_NOFILE))
return -EBADF;
@@ -1134,7 +1125,7 @@ static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
spin_lock(&files->file_lock);
err = expand_files(files, newfd);
- file = fcheck(oldfd);
+ file = files_lookup_fd_locked(files, oldfd);
if (unlikely(!file))
goto Ebadf;
if (unlikely(err < 0)) {
@@ -1163,7 +1154,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
int retval = oldfd;
rcu_read_lock();
- if (!fcheck_files(files, oldfd))
+ if (!files_lookup_fd_rcu(files, oldfd))
retval = -EBADF;
rcu_read_unlock();
return retval;
@@ -1188,10 +1179,11 @@ SYSCALL_DEFINE1(dup, unsigned int, fildes)
int f_dupfd(unsigned int from, struct file *file, unsigned flags)
{
+ unsigned long nofile = rlimit(RLIMIT_NOFILE);
int err;
- if (from >= rlimit(RLIMIT_NOFILE))
+ if (from >= nofile)
return -EINVAL;
- err = alloc_fd(from, flags);
+ err = alloc_fd(from, nofile, flags);
if (err >= 0) {
get_file(file);
fd_install(err, file);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index b3544ecfe305..2b588bd5494c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4236,7 +4236,7 @@ static int io_close(struct io_kiocb *req, bool force_nonblock,
/* might be already done during nonblock submission */
if (!close->put_file) {
- ret = __close_fd_get_file(close->fd, &close->put_file);
+ ret = close_fd_get_file(close->fd, &close->put_file);
if (ret < 0)
return (ret == -ENOENT) ? -EBADF : ret;
}
diff --git a/fs/locks.c b/fs/locks.c
index ed95cd813e10..99ca97e81b7a 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2539,14 +2539,15 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
*/
if (!error && file_lock->fl_type != F_UNLCK &&
!(file_lock->fl_flags & FL_OFDLCK)) {
+ struct files_struct *files = current->files;
/*
* We need that spin_lock here - it prevents reordering between
* update of i_flctx->flc_posix and check for it done in
* close(). rcu_read_lock() wouldn't do.
*/
- spin_lock(&current->files->file_lock);
- f = fcheck(fd);
- spin_unlock(&current->files->file_lock);
+ spin_lock(&files->file_lock);
+ f = files_lookup_fd_locked(files, fd);
+ spin_unlock(&files->file_lock);
if (f != filp) {
file_lock->fl_type = F_UNLCK;
error = do_lock_file_wait(filp, cmd, file_lock);
@@ -2670,14 +2671,15 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
*/
if (!error && file_lock->fl_type != F_UNLCK &&
!(file_lock->fl_flags & FL_OFDLCK)) {
+ struct files_struct *files = current->files;
/*
* We need that spin_lock here - it prevents reordering between
* update of i_flctx->flc_posix and check for it done in
* close(). rcu_read_lock() wouldn't do.
*/
- spin_lock(&current->files->file_lock);
- f = fcheck(fd);
- spin_unlock(&current->files->file_lock);
+ spin_lock(&files->file_lock);
+ f = files_lookup_fd_locked(files, fd);
+ spin_unlock(&files->file_lock);
if (f != filp) {
file_lock->fl_type = F_UNLCK;
error = do_lock_file_wait(filp, cmd, file_lock);
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 5dcda8f20c04..5486aaca60b0 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -327,7 +327,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
}
rcu_read_lock();
- f = fcheck(fd);
+ f = lookup_fd_rcu(fd);
rcu_read_unlock();
/* if (f != filp) means that we lost a race and another task/thread
diff --git a/fs/open.c b/fs/open.c
index 4d7537ae59df..1e06e443a565 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1296,7 +1296,7 @@ EXPORT_SYMBOL(filp_close);
*/
SYSCALL_DEFINE1(close, unsigned int, fd)
{
- int retval = __close_fd(current->files, fd);
+ int retval = close_fd(fd);
/* can't restart close syscall because file table entry was cleared */
if (unlikely(retval == -ERESTARTSYS ||
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 81882a13212d..cb51763ed554 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -28,14 +28,13 @@ static int seq_show(struct seq_file *m, void *v)
if (!task)
return -ENOENT;
- files = get_files_struct(task);
- put_task_struct(task);
-
+ task_lock(task);
+ files = task->files;
if (files) {
unsigned int fd = proc_fd(m->private);
spin_lock(&files->file_lock);
- file = fcheck_files(files, fd);
+ file = files_lookup_fd_locked(files, fd);
if (file) {
struct fdtable *fdt = files_fdtable(files);
@@ -47,8 +46,9 @@ static int seq_show(struct seq_file *m, void *v)
ret = 0;
}
spin_unlock(&files->file_lock);
- put_files_struct(files);
}
+ task_unlock(task);
+ put_task_struct(task);
if (ret)
return ret;
@@ -57,6 +57,7 @@ static int seq_show(struct seq_file *m, void *v)
(long long)file->f_pos, f_flags,
real_mount(file->f_path.mnt)->mnt_id);
+ /* show_fd_locks() never dereferences files so a stale value is safe */
show_fd_locks(m, file, files);
if (seq_has_overflowed(m))
goto out;
@@ -83,18 +84,13 @@ static const struct file_operations proc_fdinfo_file_operations = {
static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode)
{
- struct files_struct *files = get_files_struct(task);
struct file *file;
- if (!files)
- return false;
-
rcu_read_lock();
- file = fcheck_files(files, fd);
+ file = task_lookup_fd_rcu(task, fd);
if (file)
*mode = file->f_mode;
rcu_read_unlock();
- put_files_struct(files);
return !!file;
}
@@ -146,29 +142,22 @@ static const struct dentry_operations tid_fd_dentry_operations = {
static int proc_fd_link(struct dentry *dentry, struct path *path)
{
- struct files_struct *files = NULL;
struct task_struct *task;
int ret = -ENOENT;
task = get_proc_task(d_inode(dentry));
if (task) {
- files = get_files_struct(task);
- put_task_struct(task);
- }
-
- if (files) {
unsigned int fd = proc_fd(d_inode(dentry));
struct file *fd_file;
- spin_lock(&files->file_lock);
- fd_file = fcheck_files(files, fd);
+ fd_file = fget_task(task, fd);
if (fd_file) {
*path = fd_file->f_path;
path_get(&fd_file->f_path);
ret = 0;
+ fput(fd_file);
}
- spin_unlock(&files->file_lock);
- put_files_struct(files);
+ put_task_struct(task);
}
return ret;
@@ -229,7 +218,6 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
instantiate_t instantiate)
{
struct task_struct *p = get_proc_task(file_inode(file));
- struct files_struct *files;
unsigned int fd;
if (!p)
@@ -237,22 +225,18 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
if (!dir_emit_dots(file, ctx))
goto out;
- files = get_files_struct(p);
- if (!files)
- goto out;
rcu_read_lock();
- for (fd = ctx->pos - 2;
- fd < files_fdtable(files)->max_fds;
- fd++, ctx->pos++) {
+ for (fd = ctx->pos - 2;; fd++) {
struct file *f;
struct fd_data data;
char name[10 + 1];
unsigned int len;
- f = fcheck_files(files, fd);
+ f = task_lookup_next_fd_rcu(p, &fd);
+ ctx->pos = fd + 2LL;
if (!f)
- continue;
+ break;
data.mode = f->f_mode;
rcu_read_unlock();
data.fd = fd;
@@ -261,13 +245,11 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
if (!proc_fill_cache(file, ctx,
name, len, instantiate, p,
&data))
- goto out_fd_loop;
+ goto out;
cond_resched();
rcu_read_lock();
}
rcu_read_unlock();
-out_fd_loop:
- put_files_struct(files);
out:
put_task_struct(p);
return 0;
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index a32bf47c593e..d0e78174874a 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -80,7 +80,7 @@ struct dentry;
/*
* The caller must ensure that fd table isn't shared or hold rcu or file lock
*/
-static inline struct file *__fcheck_files(struct files_struct *files, unsigned int fd)
+static inline struct file *files_lookup_fd_raw(struct files_struct *files, unsigned int fd)
{
struct fdtable *fdt = rcu_dereference_raw(files->fdt);
@@ -91,39 +91,41 @@ static inline struct file *__fcheck_files(struct files_struct *files, unsigned i
return NULL;
}
-static inline struct file *fcheck_files(struct files_struct *files, unsigned int fd)
+static inline struct file *files_lookup_fd_locked(struct files_struct *files, unsigned int fd)
{
- RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&
- !lockdep_is_held(&files->file_lock),
+ RCU_LOCKDEP_WARN(!lockdep_is_held(&files->file_lock),
"suspicious rcu_dereference_check() usage");
- return __fcheck_files(files, fd);
+ return files_lookup_fd_raw(files, fd);
}
-/*
- * Check whether the specified fd has an open file.
- */
-#define fcheck(fd) fcheck_files(current->files, fd)
+static inline struct file *files_lookup_fd_rcu(struct files_struct *files, unsigned int fd)
+{
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
+ "suspicious rcu_dereference_check() usage");
+ return files_lookup_fd_raw(files, fd);
+}
+
+static inline struct file *lookup_fd_rcu(unsigned int fd)
+{
+ return files_lookup_fd_rcu(current->files, fd);
+}
+
+struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd);
+struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *fd);
struct task_struct;
-struct files_struct *get_files_struct(struct task_struct *);
void put_files_struct(struct files_struct *fs);
-void reset_files_struct(struct files_struct *);
-int unshare_files(struct files_struct **);
+int unshare_files(void);
struct files_struct *dup_fd(struct files_struct *, unsigned, int *) __latent_entropy;
void do_close_on_exec(struct files_struct *);
int iterate_fd(struct files_struct *, unsigned,
int (*)(const void *, struct file *, unsigned),
const void *);
-extern int __alloc_fd(struct files_struct *files,
- unsigned start, unsigned end, unsigned flags);
-extern void __fd_install(struct files_struct *files,
- unsigned int fd, struct file *file);
-extern int __close_fd(struct files_struct *files,
- unsigned int fd);
+extern int close_fd(unsigned int fd);
extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags);
-extern int __close_fd_get_file(unsigned int fd, struct file **res);
+extern int close_fd_get_file(unsigned int fd, struct file **res);
extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
struct files_struct **new_fdp);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 37bea07c12f2..0f72f380db72 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1295,18 +1295,6 @@ static inline long ksys_ftruncate(unsigned int fd, loff_t length)
return do_sys_ftruncate(fd, length, 1);
}
-extern int __close_fd(struct files_struct *files, unsigned int fd);
-
-/*
- * In contrast to sys_close(), this stub does not check whether the syscall
- * should or should not be restarted, but returns the raw error codes from
- * __close_fd().
- */
-static inline int ksys_close(unsigned int fd)
-{
- return __close_fd(current->files, fd);
-}
-
extern long do_sys_truncate(const char __user *pathname, loff_t length);
static inline long ksys_truncate(const char __user *pathname, loff_t length)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 287be337d5f6..4caf06fe4152 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3874,7 +3874,6 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
pid_t pid = attr->task_fd_query.pid;
u32 fd = attr->task_fd_query.fd;
const struct perf_event *event;
- struct files_struct *files;
struct task_struct *task;
struct file *file;
int err;
@@ -3892,23 +3891,11 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
if (!task)
return -ENOENT;
- files = get_files_struct(task);
- put_task_struct(task);
- if (!files)
- return -ENOENT;
-
err = 0;
- spin_lock(&files->file_lock);
- file = fcheck_files(files, fd);
+ file = fget_task(task, fd);
+ put_task_struct(task);
if (!file)
- err = -EBADF;
- else
- get_file(file);
- spin_unlock(&files->file_lock);
- put_files_struct(files);
-
- if (err)
- goto out;
+ return -EBADF;
if (file->f_op == &bpf_link_fops) {
struct bpf_link *link = file->private_data;
@@ -3948,7 +3935,6 @@ out_not_supp:
err = -ENOTSUPP;
put_file:
fput(file);
-out:
return err;
}
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 0458a40edf10..e73c07593024 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -130,7 +130,6 @@ struct bpf_iter_seq_task_file_info {
*/
struct bpf_iter_seq_task_common common;
struct task_struct *task;
- struct files_struct *files;
u32 tid;
u32 fd;
};
@@ -139,37 +138,26 @@ static struct file *
task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
{
struct pid_namespace *ns = info->common.ns;
- u32 curr_tid = info->tid, max_fds;
- struct files_struct *curr_files;
+ u32 curr_tid = info->tid;
struct task_struct *curr_task;
- int curr_fd = info->fd;
+ unsigned int curr_fd = info->fd;
/* If this function returns a non-NULL file object,
- * it held a reference to the task/files_struct/file.
+ * it held a reference to the task/file.
* Otherwise, it does not hold any reference.
*/
again:
if (info->task) {
curr_task = info->task;
- curr_files = info->files;
curr_fd = info->fd;
} else {
curr_task = task_seq_get_next(ns, &curr_tid, true);
if (!curr_task) {
info->task = NULL;
- info->files = NULL;
return NULL;
}
- curr_files = get_files_struct(curr_task);
- if (!curr_files) {
- put_task_struct(curr_task);
- curr_tid = ++(info->tid);
- info->fd = 0;
- goto again;
- }
-
- info->files = curr_files;
+ /* set info->task and info->tid */
info->task = curr_task;
if (curr_tid == info->tid) {
curr_fd = info->fd;
@@ -180,13 +168,11 @@ again:
}
rcu_read_lock();
- max_fds = files_fdtable(curr_files)->max_fds;
- for (; curr_fd < max_fds; curr_fd++) {
+ for (;; curr_fd++) {
struct file *f;
-
- f = fcheck_files(curr_files, curr_fd);
+ f = task_lookup_next_fd_rcu(curr_task, &curr_fd);
if (!f)
- continue;
+ break;
if (!get_file_rcu(f))
continue;
@@ -198,10 +184,8 @@ again:
/* the current task is done, go to the next task */
rcu_read_unlock();
- put_files_struct(curr_files);
put_task_struct(curr_task);
info->task = NULL;
- info->files = NULL;
info->fd = 0;
curr_tid = ++(info->tid);
goto again;
@@ -213,7 +197,6 @@ static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
struct file *file;
info->task = NULL;
- info->files = NULL;
file = task_file_seq_get_next(info);
if (file && *pos == 0)
++*pos;
@@ -275,9 +258,7 @@ static void task_file_seq_stop(struct seq_file *seq, void *v)
(void)__task_file_seq_show(seq, v, true);
} else {
fput((struct file *)v);
- put_files_struct(info->files);
put_task_struct(info->task);
- info->files = NULL;
info->task = NULL;
}
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 7425b3224891..4f44d87b82ef 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -3031,21 +3031,21 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
* the exec layer of the kernel.
*/
-int unshare_files(struct files_struct **displaced)
+int unshare_files(void)
{
struct task_struct *task = current;
- struct files_struct *copy = NULL;
+ struct files_struct *old, *copy = NULL;
int error;
error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy);
- if (error || !copy) {
- *displaced = NULL;
+ if (error || !copy)
return error;
- }
- *displaced = task->files;
+
+ old = task->files;
task_lock(task);
task->files = copy;
task_unlock(task);
+ put_files_struct(old);
return 0;
}
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index b3ff9288c6cc..36e58eb5a11d 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -61,16 +61,11 @@ static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type)
static struct file *
get_file_raw_ptr(struct task_struct *task, unsigned int idx)
{
- struct file *file = NULL;
+ struct file *file;
- task_lock(task);
rcu_read_lock();
-
- if (task->files)
- file = fcheck_files(task->files, idx);
-
+ file = task_lookup_fd_rcu(task, idx);
rcu_read_unlock();
- task_unlock(task);
return file;
}
@@ -107,7 +102,6 @@ static int kcmp_epoll_target(struct task_struct *task1,
{
struct file *filp, *filp_epoll, *filp_tgt;
struct kcmp_epoll_slot slot;
- struct files_struct *files;
if (copy_from_user(&slot, uslot, sizeof(slot)))
return -EFAULT;
@@ -116,23 +110,12 @@ static int kcmp_epoll_target(struct task_struct *task1,
if (!filp)
return -EBADF;
- files = get_files_struct(task2);
- if (!files)
+ filp_epoll = fget_task(task2, slot.efd);
+ if (!filp_epoll)
return -EBADF;
- spin_lock(&files->file_lock);
- filp_epoll = fcheck_files(files, slot.efd);
- if (filp_epoll)
- get_file(filp_epoll);
- else
- filp_tgt = ERR_PTR(-EBADF);
- spin_unlock(&files->file_lock);
- put_files_struct(files);
-
- if (filp_epoll) {
- filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
- fput(filp_epoll);
- }
+ filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
+ fput(filp_epoll);
if (IS_ERR(filp_tgt))
return PTR_ERR(filp_tgt);