aboutsummaryrefslogtreecommitdiffstats
path: root/fs/file.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/file.c')
-rw-r--r--fs/file.c516
1 files changed, 392 insertions, 124 deletions
diff --git a/fs/file.c b/fs/file.c
index c8a4e4c86e55..5f9c802a5d8d 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -10,6 +10,7 @@
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/fs.h>
+#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
@@ -18,6 +19,10 @@
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
+#include <linux/close_range.h>
+#include <net/sock.h>
+
+#include "internal.h"
unsigned int sysctl_nr_open __read_mostly = 1024*1024;
unsigned int sysctl_nr_open_min = BITS_PER_LONG;
@@ -70,7 +75,7 @@ static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
*/
static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
{
- unsigned int cpy, set;
+ size_t cpy, set;
BUG_ON(nfdt->max_fds < ofdt->max_fds);
@@ -82,6 +87,21 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
}
+/*
+ * Note how the fdtable bitmap allocations very much have to be a multiple of
+ * BITS_PER_LONG. This is not only because we walk those things in chunks of
+ * 'unsigned long' in some places, but simply because that is how the Linux
+ * kernel bitmaps are defined to work: they are not "bits in an array of bytes",
+ * they are very much "bits in an array of unsigned long".
+ *
+ * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied
+ * by that "1024/sizeof(ptr)" before, we already know there are sufficient
+ * clear low bits. Clang seems to realize that, gcc ends up being confused.
+ *
+ * On a 128-bit machine, the ALIGN() would actually matter. In the meantime,
+ * let's consider it documentation (and maybe a test-case for gcc to improve
+ * its code generation ;)
+ */
static struct fdtable * alloc_fdtable(unsigned int nr)
{
struct fdtable *fdt;
@@ -97,6 +117,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
nr /= (1024 / sizeof(struct file *));
nr = roundup_pow_of_two(nr + 1);
nr *= (1024 / sizeof(struct file *));
+ nr = ALIGN(nr, BITS_PER_LONG);
/*
* Note that this can drive nr *below* what we had passed if sysctl_nr_open
* had been set lower between the check in expand_files() and here. Deal
@@ -154,7 +175,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr)
spin_unlock(&files->file_lock);
new_fdt = alloc_fdtable(nr);
- /* make sure all __fd_install() have seen resize_in_progress
+ /* make sure all fd_install() have seen resize_in_progress
* or have finished their rcu_read_lock_sched() section.
*/
if (atomic_read(&files->count) > 1)
@@ -177,7 +198,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr)
rcu_assign_pointer(files->fdt, new_fdt);
if (cur_fdt != &files->fdtab)
call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
- /* coupled with smp_rmb() in __fd_install() */
+ /* coupled with smp_rmb() in fd_install() */
smp_wmb();
return 1;
}
@@ -265,11 +286,34 @@ static unsigned int count_open_files(struct fdtable *fdt)
}
/*
+ * Note that a sane fdtable size always has to be a multiple of
+ * BITS_PER_LONG, since we have bitmaps that are sized by this.
+ *
+ * 'max_fds' will normally already be properly aligned, but it
+ * turns out that in the close_range() -> __close_range() ->
+ * unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end
+ * up having a 'max_fds' value that isn't already aligned.
+ *
+ * Rather than make close_range() have to worry about this,
+ * just make that BITS_PER_LONG alignment be part of a sane
+ * fdtable size. Becuase that's really what it is.
+ */
+static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
+{
+ unsigned int count;
+
+ count = count_open_files(fdt);
+ if (max_fds < NR_OPEN_DEFAULT)
+ max_fds = NR_OPEN_DEFAULT;
+ return ALIGN(min(count, max_fds), BITS_PER_LONG);
+}
+
+/*
* Allocate a new files structure and copy contents from the
* passed in files structure.
* errorp will be valid only when the returned files_struct is NULL.
*/
-struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
+struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp)
{
struct files_struct *newf;
struct file **old_fds, **new_fds;
@@ -296,7 +340,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
spin_lock(&oldf->file_lock);
old_fdt = files_fdtable(oldf);
- open_files = count_open_files(old_fdt);
+ open_files = sane_fdtable_size(old_fdt, max_fds);
/*
* Check whether we need to allocate a larger fd array and fd set.
@@ -327,7 +371,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
*/
spin_lock(&oldf->file_lock);
old_fdt = files_fdtable(oldf);
- open_files = count_open_files(old_fdt);
+ open_files = sane_fdtable_size(old_fdt, max_fds);
}
copy_fd_bitmaps(new_fdt, old_fdt, open_files);
@@ -397,19 +441,6 @@ static struct fdtable *close_files(struct files_struct * files)
return fdt;
}
-struct files_struct *get_files_struct(struct task_struct *task)
-{
- struct files_struct *files;
-
- task_lock(task);
- files = task->files;
- if (files)
- atomic_inc(&files->count);
- task_unlock(task);
-
- return files;
-}
-
void put_files_struct(struct files_struct *files)
{
if (atomic_dec_and_test(&files->count)) {
@@ -422,18 +453,6 @@ void put_files_struct(struct files_struct *files)
}
}
-void reset_files_struct(struct files_struct *files)
-{
- struct task_struct *tsk = current;
- struct files_struct *old;
-
- old = tsk->files;
- task_lock(tsk);
- tsk->files = files;
- task_unlock(tsk);
- put_files_struct(old);
-}
-
void exit_files(struct task_struct *tsk)
{
struct files_struct * files = tsk->files;
@@ -477,9 +496,9 @@ static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
/*
* allocate a file descriptor, mark it busy.
*/
-int __alloc_fd(struct files_struct *files,
- unsigned start, unsigned end, unsigned flags)
+static int alloc_fd(unsigned start, unsigned end, unsigned flags)
{
+ struct files_struct *files = current->files;
unsigned int fd;
int error;
struct fdtable *fdt;
@@ -535,14 +554,9 @@ out:
return error;
}
-static int alloc_fd(unsigned start, unsigned flags)
-{
- return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags);
-}
-
int __get_unused_fd_flags(unsigned flags, unsigned long nofile)
{
- return __alloc_fd(current->files, 0, nofile, flags);
+ return alloc_fd(0, nofile, flags);
}
int get_unused_fd_flags(unsigned flags)
@@ -581,17 +595,13 @@ EXPORT_SYMBOL(put_unused_fd);
* It should never happen - if we allow dup2() do it, _really_ bad things
* will follow.
*
- * NOTE: __fd_install() variant is really, really low-level; don't
- * use it unless you are forced to by truly lousy API shoved down
- * your throat. 'files' *MUST* be either current->files or obtained
- * by get_files_struct(current) done by whoever had given it to you,
- * or really bad things will happen. Normally you want to use
- * fd_install() instead.
+ * This consumes the "file" refcount, so callers should treat it
+ * as if they had called fput(file).
*/
-void __fd_install(struct files_struct *files, unsigned int fd,
- struct file *file)
+void fd_install(unsigned int fd, struct file *file)
{
+ struct files_struct *files = current->files;
struct fdtable *fdt;
rcu_read_lock_sched();
@@ -613,68 +623,195 @@ void __fd_install(struct files_struct *files, unsigned int fd,
rcu_read_unlock_sched();
}
-void fd_install(unsigned int fd, struct file *file)
-{
- __fd_install(current->files, fd, file);
-}
-
EXPORT_SYMBOL(fd_install);
-/*
- * The same warnings as for __alloc_fd()/__fd_install() apply here...
+/**
+ * pick_file - return file associatd with fd
+ * @files: file struct to retrieve file from
+ * @fd: file descriptor to retrieve file for
+ *
+ * Context: files_lock must be held.
+ *
+ * Returns: The file associated with @fd (NULL if @fd is not open)
*/
-int __close_fd(struct files_struct *files, unsigned fd)
+static struct file *pick_file(struct files_struct *files, unsigned fd)
{
+ struct fdtable *fdt = files_fdtable(files);
struct file *file;
- struct fdtable *fdt;
- spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
if (fd >= fdt->max_fds)
- goto out_unlock;
+ return NULL;
+
file = fdt->fd[fd];
- if (!file)
- goto out_unlock;
- rcu_assign_pointer(fdt->fd[fd], NULL);
- __put_unused_fd(files, fd);
+ if (file) {
+ rcu_assign_pointer(fdt->fd[fd], NULL);
+ __put_unused_fd(files, fd);
+ }
+ return file;
+}
+
+int close_fd(unsigned fd)
+{
+ struct files_struct *files = current->files;
+ struct file *file;
+
+ spin_lock(&files->file_lock);
+ file = pick_file(files, fd);
spin_unlock(&files->file_lock);
+ if (!file)
+ return -EBADF;
+
return filp_close(file, files);
+}
+EXPORT_SYMBOL(close_fd); /* for ksys_close() */
-out_unlock:
- spin_unlock(&files->file_lock);
- return -EBADF;
+/**
+ * last_fd - return last valid index into fd table
+ * @cur_fds: files struct
+ *
+ * Context: Either rcu read lock or files_lock must be held.
+ *
+ * Returns: Last valid index into fdtable.
+ */
+static inline unsigned last_fd(struct fdtable *fdt)
+{
+ return fdt->max_fds - 1;
+}
+
+static inline void __range_cloexec(struct files_struct *cur_fds,
+ unsigned int fd, unsigned int max_fd)
+{
+ struct fdtable *fdt;
+
+ /* make sure we're using the correct maximum value */
+ spin_lock(&cur_fds->file_lock);
+ fdt = files_fdtable(cur_fds);
+ max_fd = min(last_fd(fdt), max_fd);
+ if (fd <= max_fd)
+ bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1);
+ spin_unlock(&cur_fds->file_lock);
+}
+
+static inline void __range_close(struct files_struct *cur_fds, unsigned int fd,
+ unsigned int max_fd)
+{
+ unsigned n;
+
+ rcu_read_lock();
+ n = last_fd(files_fdtable(cur_fds));
+ rcu_read_unlock();
+ max_fd = min(max_fd, n);
+
+ while (fd <= max_fd) {
+ struct file *file;
+
+ spin_lock(&cur_fds->file_lock);
+ file = pick_file(cur_fds, fd++);
+ spin_unlock(&cur_fds->file_lock);
+
+ if (file) {
+ /* found a valid file to close */
+ filp_close(file, cur_fds);
+ cond_resched();
+ }
+ }
+}
+
+/**
+ * __close_range() - Close all file descriptors in a given range.
+ *
+ * @fd: starting file descriptor to close
+ * @max_fd: last file descriptor to close
+ *
+ * This closes a range of file descriptors. All file descriptors
+ * from @fd up to and including @max_fd are closed.
+ */
+int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
+{
+ struct task_struct *me = current;
+ struct files_struct *cur_fds = me->files, *fds = NULL;
+
+ if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
+ return -EINVAL;
+
+ if (fd > max_fd)
+ return -EINVAL;
+
+ if (flags & CLOSE_RANGE_UNSHARE) {
+ int ret;
+ unsigned int max_unshare_fds = NR_OPEN_MAX;
+
+ /*
+ * If the caller requested all fds to be made cloexec we always
+ * copy all of the file descriptors since they still want to
+ * use them.
+ */
+ if (!(flags & CLOSE_RANGE_CLOEXEC)) {
+ /*
+ * If the requested range is greater than the current
+ * maximum, we're closing everything so only copy all
+ * file descriptors beneath the lowest file descriptor.
+ */
+ rcu_read_lock();
+ if (max_fd >= last_fd(files_fdtable(cur_fds)))
+ max_unshare_fds = fd;
+ rcu_read_unlock();
+ }
+
+ ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds);
+ if (ret)
+ return ret;
+
+ /*
+ * We used to share our file descriptor table, and have now
+ * created a private one, make sure we're using it below.
+ */
+ if (fds)
+ swap(cur_fds, fds);
+ }
+
+ if (flags & CLOSE_RANGE_CLOEXEC)
+ __range_cloexec(cur_fds, fd, max_fd);
+ else
+ __range_close(cur_fds, fd, max_fd);
+
+ if (fds) {
+ /*
+ * We're done closing the files we were supposed to. Time to install
+ * the new file descriptor table and drop the old one.
+ */
+ task_lock(me);
+ me->files = cur_fds;
+ task_unlock(me);
+ put_files_struct(fds);
+ }
+
+ return 0;
}
-EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
/*
- * variant of __close_fd that gets a ref on the file for later fput.
- * The caller must ensure that filp_close() called on the file, and then
- * an fput().
+ * See close_fd_get_file() below, this variant assumes current->files->file_lock
+ * is held.
*/
-int __close_fd_get_file(unsigned int fd, struct file **res)
+struct file *__close_fd_get_file(unsigned int fd)
+{
+ return pick_file(current->files, fd);
+}
+
+/*
+ * variant of close_fd that gets a ref on the file for later fput.
+ * The caller must ensure that filp_close() called on the file.
+ */
+struct file *close_fd_get_file(unsigned int fd)
{
struct files_struct *files = current->files;
struct file *file;
- struct fdtable *fdt;
spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
- if (fd >= fdt->max_fds)
- goto out_unlock;
- file = fdt->fd[fd];
- if (!file)
- goto out_unlock;
- rcu_assign_pointer(fdt->fd[fd], NULL);
- __put_unused_fd(files, fd);
+ file = pick_file(files, fd);
spin_unlock(&files->file_lock);
- get_file(file);
- *res = file;
- return 0;
-out_unlock:
- spin_unlock(&files->file_lock);
- *res = NULL;
- return -ENOENT;
+ return file;
}
void do_close_on_exec(struct files_struct *files)
@@ -713,49 +850,86 @@ void do_close_on_exec(struct files_struct *files)
spin_unlock(&files->file_lock);
}
+static inline struct file *__fget_files_rcu(struct files_struct *files,
+ unsigned int fd, fmode_t mask)
+{
+ for (;;) {
+ struct file *file;
+ struct fdtable *fdt = rcu_dereference_raw(files->fdt);
+ struct file __rcu **fdentry;
+
+ if (unlikely(fd >= fdt->max_fds))
+ return NULL;
+
+ fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds);
+ file = rcu_dereference_raw(*fdentry);
+ if (unlikely(!file))
+ return NULL;
+
+ if (unlikely(file->f_mode & mask))
+ return NULL;
+
+ /*
+ * Ok, we have a file pointer. However, because we do
+ * this all locklessly under RCU, we may be racing with
+ * that file being closed.
+ *
+ * Such a race can take two forms:
+ *
+ * (a) the file ref already went down to zero,
+ * and get_file_rcu() fails. Just try again:
+ */
+ if (unlikely(!get_file_rcu(file)))
+ continue;
+
+ /*
+ * (b) the file table entry has changed under us.
+ * Note that we don't need to re-check the 'fdt->fd'
+ * pointer having changed, because it always goes
+ * hand-in-hand with 'fdt'.
+ *
+ * If so, we need to put our ref and try again.
+ */
+ if (unlikely(rcu_dereference_raw(files->fdt) != fdt) ||
+ unlikely(rcu_dereference_raw(*fdentry) != file)) {
+ fput(file);
+ continue;
+ }
+
+ /*
+ * Ok, we have a ref to the file, and checked that it
+ * still exists.
+ */
+ return file;
+ }
+}
+
static struct file *__fget_files(struct files_struct *files, unsigned int fd,
- fmode_t mask, unsigned int refs)
+ fmode_t mask)
{
struct file *file;
rcu_read_lock();
-loop:
- file = fcheck_files(files, fd);
- if (file) {
- /* File object ref couldn't be taken.
- * dup2() atomicity guarantee is the reason
- * we loop to catch the new file (or NULL pointer)
- */
- if (file->f_mode & mask)
- file = NULL;
- else if (!get_file_rcu_many(file, refs))
- goto loop;
- }
+ file = __fget_files_rcu(files, fd, mask);
rcu_read_unlock();
return file;
}
-static inline struct file *__fget(unsigned int fd, fmode_t mask,
- unsigned int refs)
-{
- return __fget_files(current->files, fd, mask, refs);
-}
-
-struct file *fget_many(unsigned int fd, unsigned int refs)
+static inline struct file *__fget(unsigned int fd, fmode_t mask)
{
- return __fget(fd, FMODE_PATH, refs);
+ return __fget_files(current->files, fd, mask);
}
struct file *fget(unsigned int fd)
{
- return __fget(fd, FMODE_PATH, 1);
+ return __fget(fd, FMODE_PATH);
}
EXPORT_SYMBOL(fget);
struct file *fget_raw(unsigned int fd)
{
- return __fget(fd, 0, 1);
+ return __fget(fd, 0);
}
EXPORT_SYMBOL(fget_raw);
@@ -765,12 +939,49 @@ struct file *fget_task(struct task_struct *task, unsigned int fd)
task_lock(task);
if (task->files)
- file = __fget_files(task->files, fd, 0, 1);
+ file = __fget_files(task->files, fd, 0);
task_unlock(task);
return file;
}
+struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
+{
+ /* Must be called with rcu_read_lock held */
+ struct files_struct *files;
+ struct file *file = NULL;
+
+ task_lock(task);
+ files = task->files;
+ if (files)
+ file = files_lookup_fd_rcu(files, fd);
+ task_unlock(task);
+
+ return file;
+}
+
+struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd)
+{
+ /* Must be called with rcu_read_lock held */
+ struct files_struct *files;
+ unsigned int fd = *ret_fd;
+ struct file *file = NULL;
+
+ task_lock(task);
+ files = task->files;
+ if (files) {
+ for (; fd < files_fdtable(files)->max_fds; fd++) {
+ file = files_lookup_fd_rcu(files, fd);
+ if (file)
+ break;
+ }
+ }
+ task_unlock(task);
+ *ret_fd = fd;
+ return file;
+}
+EXPORT_SYMBOL(task_lookup_next_fd_rcu);
+
/*
* Lightweight file lookup - no refcnt increment if fd table isn't shared.
*
@@ -793,12 +1004,12 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask)
struct file *file;
if (atomic_read(&files->count) == 1) {
- file = __fcheck_files(files, fd);
+ file = files_lookup_fd_raw(files, fd);
if (!file || unlikely(file->f_mode & mask))
return 0;
return (unsigned long)file;
} else {
- file = __fget(fd, mask, 1);
+ file = __fget(fd, mask);
if (!file)
return 0;
return FDPUT_FPUT | (unsigned long)file;
@@ -915,7 +1126,7 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags)
struct files_struct *files = current->files;
if (!file)
- return __close_fd(files, fd);
+ return close_fd(fd);
if (fd >= rlimit(RLIMIT_NOFILE))
return -EBADF;
@@ -931,6 +1142,67 @@ out_unlock:
return err;
}
+/**
+ * __receive_fd() - Install received file into file descriptor table
+ * @file: struct file that was received from another process
+ * @ufd: __user pointer to write new fd number to
+ * @o_flags: the O_* flags to apply to the new fd entry
+ *
+ * Installs a received file into the file descriptor table, with appropriate
+ * checks and count updates. Optionally writes the fd number to userspace, if
+ * @ufd is non-NULL.
+ *
+ * This helper handles its own reference counting of the incoming
+ * struct file.
+ *
+ * Returns newly install fd or -ve on error.
+ */
+int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
+{
+ int new_fd;
+ int error;
+
+ error = security_file_receive(file);
+ if (error)
+ return error;
+
+ new_fd = get_unused_fd_flags(o_flags);
+ if (new_fd < 0)
+ return new_fd;
+
+ if (ufd) {
+ error = put_user(new_fd, ufd);
+ if (error) {
+ put_unused_fd(new_fd);
+ return error;
+ }
+ }
+
+ fd_install(new_fd, get_file(file));
+ __receive_sock(file);
+ return new_fd;
+}
+
+int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
+{
+ int error;
+
+ error = security_file_receive(file);
+ if (error)
+ return error;
+ error = replace_fd(new_fd, file, o_flags);
+ if (error)
+ return error;
+ __receive_sock(file);
+ return new_fd;
+}
+
+int receive_fd(struct file *file, unsigned int o_flags)
+{
+ return __receive_fd(file, NULL, o_flags);
+}
+EXPORT_SYMBOL_GPL(receive_fd);
+
static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
{
int err = -EBADF;
@@ -948,7 +1220,7 @@ static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
spin_lock(&files->file_lock);
err = expand_files(files, newfd);
- file = fcheck(oldfd);
+ file = files_lookup_fd_locked(files, oldfd);
if (unlikely(!file))
goto Ebadf;
if (unlikely(err < 0)) {
@@ -977,7 +1249,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
int retval = oldfd;
rcu_read_lock();
- if (!fcheck_files(files, oldfd))
+ if (!files_lookup_fd_rcu(files, oldfd))
retval = -EBADF;
rcu_read_unlock();
return retval;
@@ -985,7 +1257,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
return ksys_dup3(oldfd, newfd, 0);
}
-int ksys_dup(unsigned int fildes)
+SYSCALL_DEFINE1(dup, unsigned int, fildes)
{
int ret = -EBADF;
struct file *file = fget_raw(fildes);
@@ -1000,17 +1272,13 @@ int ksys_dup(unsigned int fildes)
return ret;
}
-SYSCALL_DEFINE1(dup, unsigned int, fildes)
-{
- return ksys_dup(fildes);
-}
-
int f_dupfd(unsigned int from, struct file *file, unsigned flags)
{
+ unsigned long nofile = rlimit(RLIMIT_NOFILE);
int err;
- if (from >= rlimit(RLIMIT_NOFILE))
+ if (from >= nofile)
return -EINVAL;
- err = alloc_fd(from, flags);
+ err = alloc_fd(from, nofile, flags);
if (err >= 0) {
get_file(file);
fd_install(err, file);