aboutsummaryrefslogtreecommitdiffstats
path: root/fs/proc
diff options
context:
space:
mode:
Diffstat (limited to 'fs/proc')
-rw-r--r--fs/proc/Kconfig1
-rw-r--r--fs/proc/array.c17
-rw-r--r--fs/proc/base.c105
-rw-r--r--fs/proc/bootconfig.c2
-rw-r--r--fs/proc/cpuinfo.c6
-rw-r--r--fs/proc/devices.c6
-rw-r--r--fs/proc/fd.c23
-rw-r--r--fs/proc/generic.c9
-rw-r--r--fs/proc/inode.c25
-rw-r--r--fs/proc/internal.h12
-rw-r--r--fs/proc/kcore.c14
-rw-r--r--fs/proc/kmsg.c3
-rw-r--r--fs/proc/loadavg.c6
-rw-r--r--fs/proc/meminfo.c14
-rw-r--r--fs/proc/nommu.c1
-rw-r--r--fs/proc/page.c4
-rw-r--r--fs/proc/proc_net.c39
-rw-r--r--fs/proc/proc_sysctl.c164
-rw-r--r--fs/proc/proc_tty.c2
-rw-r--r--fs/proc/root.c8
-rw-r--r--fs/proc/softirqs.c6
-rw-r--r--fs/proc/task_mmu.c167
-rw-r--r--fs/proc/task_nommu.c45
-rw-r--r--fs/proc/uptime.c6
-rw-r--r--fs/proc/version.c6
-rw-r--r--fs/proc/vmcore.c184
26 files changed, 568 insertions, 307 deletions
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index c930001056f9..32b1116ae137 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -92,6 +92,7 @@ config PROC_PAGE_MONITOR
config PROC_CHILDREN
bool "Include /proc/<pid>/task/<tid>/children file"
+ depends on PROC_FS
default n
help
Provides a fast way to retrieve first level children pids of a task. See
diff --git a/fs/proc/array.c b/fs/proc/array.c
index ff869a66b34e..49283b8103c7 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -69,7 +69,6 @@
#include <linux/sched/cputime.h>
#include <linux/proc_fs.h>
#include <linux/ioport.h>
-#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
@@ -88,10 +87,10 @@
#include <linux/pid_namespace.h>
#include <linux/prctl.h>
#include <linux/ptrace.h>
-#include <linux/tracehook.h>
#include <linux/string_helpers.h>
#include <linux/user_namespace.h>
#include <linux/fs_struct.h>
+#include <linux/kthread.h>
#include <asm/processor.h>
#include "internal.h"
@@ -100,8 +99,14 @@ void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape)
{
char tcomm[64];
+ /*
+ * Test before PF_KTHREAD because all workqueue worker threads are
+ * kernel threads.
+ */
if (p->flags & PF_WQ_WORKER)
wq_worker_comm(tcomm, sizeof(tcomm), p);
+ else if (p->flags & PF_KTHREAD)
+ get_kthread_comm(tcomm, sizeof(tcomm), p);
else
__get_task_comm(tcomm, sizeof(tcomm), p);
@@ -274,7 +279,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
collect_sigign_sigcatch(p, &ignored, &caught);
num_threads = get_nr_threads(p);
rcu_read_lock(); /* FIXME: is this correct? */
- qsize = get_ucounts_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING);
+ qsize = get_rlimit_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING);
rcu_read_unlock();
qlim = task_rlimit(p, RLIMIT_SIGPENDING);
unlock_task_sighand(p, &flags);
@@ -468,6 +473,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
u64 cgtime, gtime;
unsigned long rsslim = 0;
unsigned long flags;
+ int exit_code = task->exit_code;
state = *get_task_state(task);
vsize = eip = esp = 0;
@@ -531,6 +537,9 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
maj_flt += sig->maj_flt;
thread_group_cputime_adjusted(task, &utime, &stime);
gtime += sig->gtime;
+
+ if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED))
+ exit_code = sig->group_exit_code;
}
sid = task_session_nr_ns(task, ns);
@@ -630,7 +639,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
seq_puts(m, " 0 0 0 0 0 0 0");
if (permitted)
- seq_put_decimal_ll(m, " ", task->exit_code);
+ seq_put_decimal_ll(m, " ", exit_code);
else
seq_puts(m, " 0");
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 13eda8de2998..9e479d7d202b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -74,7 +74,6 @@
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/ptrace.h>
-#include <linux/tracehook.h>
#include <linux/printk.h>
#include <linux/cache.h>
#include <linux/cgroup.h>
@@ -670,10 +669,10 @@ static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
/************************************************************************/
/* permission checks */
-static int proc_fd_access_allowed(struct inode *inode)
+static bool proc_fd_access_allowed(struct inode *inode)
{
struct task_struct *task;
- int allowed = 0;
+ bool allowed = false;
/* Allow access to a task's file descriptors if it is us or we
* may use ptrace attach to the process and find out that
* information.
@@ -1762,27 +1761,27 @@ out:
return ERR_PTR(error);
}
-static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
+static int do_proc_readlink(const struct path *path, char __user *buffer, int buflen)
{
- char *tmp = (char *)__get_free_page(GFP_KERNEL);
+ char *tmp = kmalloc(PATH_MAX, GFP_KERNEL);
char *pathname;
int len;
if (!tmp)
return -ENOMEM;
- pathname = d_path(path, tmp, PAGE_SIZE);
+ pathname = d_path(path, tmp, PATH_MAX);
len = PTR_ERR(pathname);
if (IS_ERR(pathname))
goto out;
- len = tmp + PAGE_SIZE - 1 - pathname;
+ len = tmp + PATH_MAX - 1 - pathname;
if (len > buflen)
len = buflen;
if (copy_to_user(buffer, pathname, len))
len = -EFAULT;
out:
- free_page((unsigned long)tmp);
+ kfree(tmp);
return len;
}
@@ -1886,7 +1885,7 @@ void proc_pid_evict_inode(struct proc_inode *ei)
put_pid(pid);
}
-struct inode *proc_pid_make_inode(struct super_block * sb,
+struct inode *proc_pid_make_inode(struct super_block *sb,
struct task_struct *task, umode_t mode)
{
struct inode * inode;
@@ -1915,11 +1914,6 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
/* Let the pid remember us for quick removal */
ei->pid = pid;
- if (S_ISDIR(mode)) {
- spin_lock(&pid->lock);
- hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
- spin_unlock(&pid->lock);
- }
task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
security_task_to_inode(task, inode);
@@ -1932,6 +1926,39 @@ out_unlock:
return NULL;
}
+/*
+ * Generating an inode and adding it into @pid->inodes, so that task will
+ * invalidate inode's dentry before being released.
+ *
+ * This helper is used for creating dir-type entries under '/proc' and
+ * '/proc/<tgid>/task'. Other entries(eg. fd, stat) under '/proc/<tgid>'
+ * can be released by invalidating '/proc/<tgid>' dentry.
+ * In theory, dentries under '/proc/<tgid>/task' can also be released by
+ * invalidating '/proc/<tgid>' dentry, we reserve it to handle single
+ * thread exiting situation: Any one of threads should invalidate its
+ * '/proc/<tgid>/task/<pid>' dentry before released.
+ */
+static struct inode *proc_pid_make_base_inode(struct super_block *sb,
+ struct task_struct *task, umode_t mode)
+{
+ struct inode *inode;
+ struct proc_inode *ei;
+ struct pid *pid;
+
+ inode = proc_pid_make_inode(sb, task, mode);
+ if (!inode)
+ return NULL;
+
+ /* Let proc_flush_pid find this directory inode */
+ ei = PROC_I(inode);
+ pid = ei->pid;
+ spin_lock(&pid->lock);
+ hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
+ spin_unlock(&pid->lock);
+
+ return inode;
+}
+
int pid_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct kstat *stat, u32 request_mask, unsigned int query_flags)
{
@@ -2323,6 +2350,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
GENRADIX(struct map_files_info) fa;
struct map_files_info *p;
int ret;
+ struct vma_iterator vmi;
genradix_init(&fa);
@@ -2361,7 +2389,9 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
* routine might require mmap_lock taken in might_fault().
*/
- for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+ pos = 2;
+ vma_iter_init(&vmi, mm, 0);
+ for_each_vma(vmi, vma) {
if (!vma->vm_file)
continue;
if (++pos <= ctx->pos)
@@ -2701,7 +2731,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
return -ESRCH;
length = security_getprocattr(task, PROC_I(inode)->op.lsm,
- (char*)file->f_path.dentry->d_name.name,
+ file->f_path.dentry->d_name.name,
&p);
put_task_struct(task);
if (length > 0)
@@ -3155,6 +3185,35 @@ static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
}
#endif /* CONFIG_LIVEPATCH */
+#ifdef CONFIG_KSM
+static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ struct mm_struct *mm;
+
+ mm = get_task_mm(task);
+ if (mm) {
+ seq_printf(m, "%lu\n", mm->ksm_merging_pages);
+ mmput(mm);
+ }
+
+ return 0;
+}
+static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ struct mm_struct *mm;
+
+ mm = get_task_mm(task);
+ if (mm) {
+ seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items);
+ mmput(mm);
+ }
+
+ return 0;
+}
+#endif /* CONFIG_KSM */
+
#ifdef CONFIG_STACKLEAK_METRICS
static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
@@ -3286,6 +3345,10 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
#endif
+#ifdef CONFIG_KSM
+ ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages),
+ ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat),
+#endif
};
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3351,7 +3414,8 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry,
{
struct inode *inode;
- inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
+ inode = proc_pid_make_base_inode(dentry->d_sb, task,
+ S_IFDIR | S_IRUGO | S_IXUGO);
if (!inode)
return ERR_PTR(-ENOENT);
@@ -3619,6 +3683,10 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
#endif
+#ifdef CONFIG_KSM
+ ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages),
+ ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat),
+#endif
};
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3650,7 +3718,8 @@ static struct dentry *proc_task_instantiate(struct dentry *dentry,
struct task_struct *task, const void *ptr)
{
struct inode *inode;
- inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
+ inode = proc_pid_make_base_inode(dentry->d_sb, task,
+ S_IFDIR | S_IRUGO | S_IXUGO);
if (!inode)
return ERR_PTR(-ENOENT);
diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c
index 6d8d4bf20837..2e244ada1f97 100644
--- a/fs/proc/bootconfig.c
+++ b/fs/proc/bootconfig.c
@@ -32,6 +32,8 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size)
int ret = 0;
key = kzalloc(XBC_KEYLEN_MAX, GFP_KERNEL);
+ if (!key)
+ return -ENOMEM;
xbc_for_each_key_value(leaf, val) {
ret = xbc_node_compose_key(leaf, key, XBC_KEYLEN_MAX);
diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c
index 419760fd77bd..f38bda5b83ec 100644
--- a/fs/proc/cpuinfo.c
+++ b/fs/proc/cpuinfo.c
@@ -5,14 +5,10 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-__weak void arch_freq_prepare_all(void)
-{
-}
-
extern const struct seq_operations cpuinfo_op;
+
static int cpuinfo_open(struct inode *inode, struct file *file)
{
- arch_freq_prepare_all();
return seq_open(file, &cpuinfo_op);
}
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index 837971e74109..fe7bfcb7d049 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -4,6 +4,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/blkdev.h>
+#include "internal.h"
static int devinfo_show(struct seq_file *f, void *v)
{
@@ -54,7 +55,10 @@ static const struct seq_operations devinfo_ops = {
static int __init proc_devices_init(void)
{
- proc_create_seq("devices", 0, NULL, &devinfo_ops);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_seq("devices", 0, NULL, &devinfo_ops);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_devices_init);
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 172c86270b31..913bef0d2a36 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -72,7 +72,7 @@ out:
return 0;
}
-static int seq_fdinfo_open(struct inode *inode, struct file *file)
+static int proc_fdinfo_access_allowed(struct inode *inode)
{
bool allowed = false;
struct task_struct *task = get_proc_task(inode);
@@ -86,6 +86,16 @@ static int seq_fdinfo_open(struct inode *inode, struct file *file)
if (!allowed)
return -EACCES;
+ return 0;
+}
+
+static int seq_fdinfo_open(struct inode *inode, struct file *file)
+{
+ int ret = proc_fdinfo_access_allowed(inode);
+
+ if (ret)
+ return ret;
+
return single_open(file, seq_show, inode);
}
@@ -348,12 +358,23 @@ static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
proc_fdinfo_instantiate);
}
+static int proc_open_fdinfo(struct inode *inode, struct file *file)
+{
+ int ret = proc_fdinfo_access_allowed(inode);
+
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
const struct inode_operations proc_fdinfo_inode_operations = {
.lookup = proc_lookupfdinfo,
.setattr = proc_setattr,
};
const struct file_operations proc_fdinfo_operations = {
+ .open = proc_open_fdinfo,
.read = generic_read_dir,
.iterate_shared = proc_readfdinfo,
.llseek = generic_file_llseek,
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 5b78739e60e4..587b91d9d998 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -448,6 +448,9 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
proc_set_user(ent, (*parent)->uid, (*parent)->gid);
ent->proc_dops = &proc_misc_dentry_ops;
+ /* Revalidate everything under /proc/${pid}/net */
+ if ((*parent)->proc_dops == &proc_net_dentry_ops)
+ pde_force_lookup(ent);
out:
return ent;
@@ -791,12 +794,6 @@ void proc_remove(struct proc_dir_entry *de)
}
EXPORT_SYMBOL(proc_remove);
-void *PDE_DATA(const struct inode *inode)
-{
- return __PDE_DATA(inode);
-}
-EXPORT_SYMBOL(PDE_DATA);
-
/*
* Pull a user buffer into memory and pass it to the file's write handler if
* one is supplied. The ->write() method is permitted to modify the
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 599eb724ff2d..f495fdb39151 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -26,8 +26,6 @@
#include <linux/mount.h>
#include <linux/bug.h>
-#include <linux/uaccess.h>
-
#include "internal.h"
static void proc_evict_inode(struct inode *inode)
@@ -66,7 +64,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
{
struct proc_inode *ei;
- ei = kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, proc_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
ei->pid = NULL;
@@ -214,7 +212,15 @@ static void unuse_pde(struct proc_dir_entry *pde)
complete(pde->pde_unload_completion);
}
-/* pde is locked on entry, unlocked on exit */
+/*
+ * At most 2 contexts can enter this function: the one doing the last
+ * close on the descriptor and whoever is deleting PDE itself.
+ *
+ * First to enter calls ->proc_release hook and signals its completion
+ * to the second one which waits and then does nothing.
+ *
+ * PDE is locked on entry, unlocked on exit.
+ */
static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
__releases(&pde->pde_unload_lock)
{
@@ -224,9 +230,6 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
*
* rmmod (remove_proc_entry() et al) can't delete an entry and proceed:
* "struct file" needs to be available at the right moment.
- *
- * Therefore, first process to enter this function does ->release() and
- * signals its completion to the other process which does nothing.
*/
if (pdeo->closing) {
/* somebody else is doing that, just wait */
@@ -240,10 +243,12 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
pdeo->closing = true;
spin_unlock(&pde->pde_unload_lock);
+
file = pdeo->file;
pde->proc_ops->proc_release(file_inode(file), file);
+
spin_lock(&pde->pde_unload_lock);
- /* After ->release. */
+ /* Strictly after ->proc_release, see above. */
list_del(&pdeo->lh);
c = pdeo->c;
spin_unlock(&pde->pde_unload_lock);
@@ -489,6 +494,9 @@ static int proc_reg_open(struct inode *inode, struct file *file)
typeof_member(struct proc_ops, proc_release) release;
struct pde_opener *pdeo;
+ if (!pde->proc_ops->proc_lseek)
+ file->f_mode &= ~FMODE_LSEEK;
+
if (pde_is_permanent(pde)) {
open = pde->proc_ops->proc_open;
if (open)
@@ -650,6 +658,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
return NULL;
}
+ inode->i_private = de->data;
inode->i_ino = de->low_ino;
inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
PROC_I(inode)->pde = de;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 03415f3fb3a8..b701d0207edf 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -79,6 +79,11 @@ static inline bool pde_is_permanent(const struct proc_dir_entry *pde)
return pde->flags & PROC_ENTRY_PERMANENT;
}
+static inline void pde_make_permanent(struct proc_dir_entry *pde)
+{
+ pde->flags |= PROC_ENTRY_PERMANENT;
+}
+
extern struct kmem_cache *proc_dir_entry_cache;
void pde_free(struct proc_dir_entry *pde);
@@ -115,11 +120,6 @@ static inline struct proc_dir_entry *PDE(const struct inode *inode)
return PROC_I(inode)->pde;
}
-static inline void *__PDE_DATA(const struct inode *inode)
-{
- return PDE(inode)->data;
-}
-
static inline struct pid *proc_pid(const struct inode *inode)
{
return PROC_I(inode)->pid;
@@ -290,7 +290,7 @@ struct proc_maps_private {
struct task_struct *task;
struct mm_struct *mm;
#ifdef CONFIG_MMU
- struct vm_area_struct *tail_vma;
+ struct vma_iterator iter;
#endif
#ifdef CONFIG_NUMA
struct mempolicy *task_mempolicy;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 982e694aae77..dff921f7ca33 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -479,10 +479,15 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
* the previous entry, search for a matching entry.
*/
if (!m || start < m->addr || start >= m->addr + m->size) {
- list_for_each_entry(m, &kclist_head, list) {
- if (start >= m->addr &&
- start < m->addr + m->size)
+ struct kcore_list *iter;
+
+ m = NULL;
+ list_for_each_entry(iter, &kclist_head, list) {
+ if (start >= iter->addr &&
+ start < iter->addr + iter->size) {
+ m = iter;
break;
+ }
}
}
@@ -492,12 +497,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
page_offline_freeze();
}
- if (&m->list == &kclist_head) {
+ if (!m) {
if (clear_user(buffer, tsz)) {
ret = -EFAULT;
goto out;
}
- m = NULL; /* skip the list anchor */
goto skip;
}
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index b38ad552887f..2fc92a13f9f8 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -15,11 +15,8 @@
#include <linux/fs.h>
#include <linux/syslog.h>
-#include <linux/uaccess.h>
#include <asm/io.h>
-extern wait_queue_head_t log_wait;
-
static int kmsg_open(struct inode * inode, struct file * file)
{
return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC);
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index f32878d9a39f..817981e57223 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -9,6 +9,7 @@
#include <linux/seq_file.h>
#include <linux/seqlock.h>
#include <linux/time.h>
+#include "internal.h"
static int loadavg_proc_show(struct seq_file *m, void *v)
{
@@ -27,7 +28,10 @@ static int loadavg_proc_show(struct seq_file *m, void *v)
static int __init proc_loadavg_init(void)
{
- proc_create_single("loadavg", 0, NULL, loadavg_proc_show);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("loadavg", 0, NULL, loadavg_proc_show);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_loadavg_init);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 6fa761c9cc78..5101131e6047 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -86,6 +86,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "SwapTotal: ", i.totalswap);
show_val_kb(m, "SwapFree: ", i.freeswap);
+#ifdef CONFIG_ZSWAP
+ seq_printf(m, "Zswap: %8lu kB\n",
+ (unsigned long)(zswap_pool_total_size >> 10));
+ seq_printf(m, "Zswapped: %8lu kB\n",
+ (unsigned long)atomic_read(&zswap_stored_pages) <<
+ (PAGE_SHIFT - 10));
+#endif
show_val_kb(m, "Dirty: ",
global_node_page_state(NR_FILE_DIRTY));
show_val_kb(m, "Writeback: ",
@@ -108,6 +115,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#endif
show_val_kb(m, "PageTables: ",
global_node_page_state(NR_PAGETABLE));
+ show_val_kb(m, "SecPageTables: ",
+ global_node_page_state(NR_SECONDARY_PAGETABLE));
show_val_kb(m, "NFS_Unstable: ", 0);
show_val_kb(m, "Bounce: ",
@@ -155,7 +164,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
static int __init proc_meminfo_init(void)
{
- proc_create_single("meminfo", 0, NULL, meminfo_proc_show);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("meminfo", 0, NULL, meminfo_proc_show);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_meminfo_init);
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 13452b32e2bd..4d3493579458 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -21,7 +21,6 @@
#include <linux/seq_file.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
-#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/div64.h>
#include "internal.h"
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 9f1077d94cde..f2273b164535 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -10,6 +10,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/hugetlb.h>
+#include <linux/memremap.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
@@ -90,6 +91,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
}
static const struct proc_ops kpagecount_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_lseek = mem_lseek,
.proc_read = kpagecount_read,
};
@@ -267,6 +269,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
}
static const struct proc_ops kpageflags_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_lseek = mem_lseek,
.proc_read = kpageflags_read,
};
@@ -321,6 +324,7 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
}
static const struct proc_ops kpagecgroup_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_lseek = mem_lseek,
.proc_read = kpagecgroup_read,
};
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 15c2e55d2ed2..856839b8ae8b 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -8,9 +8,6 @@
*
* proc net directory handling functions
*/
-
-#include <linux/uaccess.h>
-
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
@@ -61,15 +58,27 @@ static int seq_open_net(struct inode *inode, struct file *file)
}
#ifdef CONFIG_NET_NS
p->net = net;
+ netns_tracker_alloc(net, &p->ns_tracker, GFP_KERNEL);
#endif
return 0;
}
+static void seq_file_net_put_net(struct seq_file *seq)
+{
+#ifdef CONFIG_NET_NS
+ struct seq_net_private *priv = seq->private;
+
+ put_net_track(priv->net, &priv->ns_tracker);
+#else
+ put_net(&init_net);
+#endif
+}
+
static int seq_release_net(struct inode *ino, struct file *f)
{
struct seq_file *seq = f->private_data;
- put_net(seq_file_net(seq));
+ seq_file_net_put_net(seq);
seq_release_private(ino, f);
return 0;
}
@@ -87,7 +96,8 @@ int bpf_iter_init_seq_net(void *priv_data, struct bpf_iter_aux_info *aux)
#ifdef CONFIG_NET_NS
struct seq_net_private *p = priv_data;
- p->net = get_net(current->nsproxy->net_ns);
+ p->net = get_net_track(current->nsproxy->net_ns, &p->ns_tracker,
+ GFP_KERNEL);
#endif
return 0;
}
@@ -97,7 +107,7 @@ void bpf_iter_fini_seq_net(void *priv_data)
#ifdef CONFIG_NET_NS
struct seq_net_private *p = priv_data;
- put_net(p->net);
+ put_net_track(p->net, &p->ns_tracker);
#endif
}
@@ -125,7 +135,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_data);
* @parent: The parent directory in which to create.
* @ops: The seq_file ops with which to read the file.
* @write: The write method with which to 'modify' the file.
- * @data: Data for retrieval by PDE_DATA().
+ * @data: Data for retrieval by pde_data().
*
* Create a network namespaced proc file in the @parent directory with the
* specified @name and @mode that allows reading of a file that displays a
@@ -140,7 +150,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_data);
* modified by the @write function. @write should return 0 on success.
*
* The @data value is accessible from the @show and @write functions by calling
- * PDE_DATA() on the file inode. The network namespace must be accessed by
+ * pde_data() on the file inode. The network namespace must be accessed by
* calling seq_file_net() on the seq_file struct.
*/
struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode,
@@ -217,7 +227,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_single);
* @parent: The parent directory in which to create.
* @show: The seqfile show method with which to read the file.
* @write: The write method with which to 'modify' the file.
- * @data: Data for retrieval by PDE_DATA().
+ * @data: Data for retrieval by pde_data().
*
* Create a network-namespaced proc file in the @parent directory with the
* specified @name and @mode that allows reading of a file that displays a
@@ -232,7 +242,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_single);
* modified by the @write function. @write should return 0 on success.
*
* The @data value is accessible from the @show and @write functions by calling
- * PDE_DATA() on the file inode. The network namespace must be accessed by
+ * pde_data() on the file inode. The network namespace must be accessed by
* calling seq_file_single_net() on the seq_file struct.
*/
struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mode,
@@ -340,6 +350,12 @@ static __net_init int proc_net_ns_init(struct net *net)
kgid_t gid;
int err;
+ /*
+ * This PDE acts only as an anchor for /proc/${pid}/net hierarchy.
+ * Corresponding inode (PDE(inode) == net->proc_net) is never
+ * instantiated therefore blanket zeroing is fine.
+ * net->proc_net_stat inode is instantiated normally.
+ */
err = -ENOMEM;
netd = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL);
if (!netd)
@@ -363,6 +379,9 @@ static __net_init int proc_net_ns_init(struct net *net)
proc_set_user(netd, uid, gid);
+ /* Seed dentry revalidation for /proc/${pid}/net */
+ pde_force_lookup(netd);
+
err = -EEXIST;
net_statd = proc_net_mkdir(net, "stat", netd);
if (!net_statd)
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 5d66faecd4ef..48f2d60bd78a 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -16,24 +16,38 @@
#include <linux/module.h>
#include <linux/bpf-cgroup.h>
#include <linux/mount.h>
+#include <linux/kmemleak.h>
#include "internal.h"
+#define list_for_each_table_entry(entry, table) \
+ for ((entry) = (table); (entry)->procname; (entry)++)
+
static const struct dentry_operations proc_sys_dentry_operations;
static const struct file_operations proc_sys_file_operations;
static const struct inode_operations proc_sys_inode_operations;
static const struct file_operations proc_sys_dir_file_operations;
static const struct inode_operations proc_sys_dir_operations;
-/* shared constants to be used in various sysctls */
-const int sysctl_vals[] = { 0, 1, INT_MAX };
-EXPORT_SYMBOL(sysctl_vals);
-
/* Support for permanently empty directories */
struct ctl_table sysctl_mount_point[] = {
{ }
};
+/**
+ * register_sysctl_mount_point() - registers a sysctl mount point
+ * @path: path for the mount point
+ *
+ * Used to create a permanently empty directory to serve as mount point.
+ * There are some subtle but important permission checks this allows in the
+ * case of unprivileged mounts.
+ */
+struct ctl_table_header *register_sysctl_mount_point(const char *path)
+{
+ return register_sysctl(path, sysctl_mount_point);
+}
+EXPORT_SYMBOL(register_sysctl_mount_point);
+
static bool is_empty_dir(struct ctl_table_header *head)
{
return head->ctl_table[0].child == sysctl_mount_point;
@@ -163,7 +177,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
else {
pr_err("sysctl duplicate entry: ");
sysctl_print_dir(head->parent);
- pr_cont("/%s\n", entry->procname);
+ pr_cont("%s\n", entry->procname);
return -EEXIST;
}
}
@@ -197,15 +211,19 @@ static void init_header(struct ctl_table_header *head,
INIT_HLIST_HEAD(&head->inodes);
if (node) {
struct ctl_table *entry;
- for (entry = table; entry->procname; entry++, node++)
+
+ list_for_each_table_entry(entry, table) {
node->header = head;
+ node++;
+ }
}
}
static void erase_header(struct ctl_table_header *head)
{
struct ctl_table *entry;
- for (entry = head->ctl_table; entry->procname; entry++)
+
+ list_for_each_table_entry(entry, head->ctl_table)
erase_entry(head, entry);
}
@@ -230,7 +248,7 @@ static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
err = insert_links(header);
if (err)
goto fail_links;
- for (entry = header->ctl_table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, header->ctl_table) {
err = insert_entry(header, entry);
if (err)
goto fail;
@@ -960,7 +978,6 @@ static struct ctl_dir *new_dir(struct ctl_table_set *set,
table = (struct ctl_table *)(node + 1);
new_name = (char *)(table + 2);
memcpy(new_name, name, namelen);
- new_name[namelen] = '\0';
table[0].procname = new_name;
table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO;
init_header(&new->header, set->dir.header.root, set, node, table);
@@ -1020,8 +1037,8 @@ failed:
if (IS_ERR(subdir)) {
pr_err("sysctl could not get directory: ");
sysctl_print_dir(dir);
- pr_cont("/%*.*s %ld\n",
- namelen, namelen, name, PTR_ERR(subdir));
+ pr_cont("%*.*s %ld\n", namelen, namelen, name,
+ PTR_ERR(subdir));
}
drop_sysctl_table(&dir->header);
if (new)
@@ -1053,7 +1070,6 @@ static int sysctl_follow_link(struct ctl_table_header **phead,
struct ctl_dir *dir;
int ret;
- ret = 0;
spin_lock(&sysctl_lock);
root = (*pentry)->data;
set = lookup_header_set(root);
@@ -1113,35 +1129,36 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table)
static int sysctl_check_table(const char *path, struct ctl_table *table)
{
+ struct ctl_table *entry;
int err = 0;
- for (; table->procname; table++) {
- if (table->child)
- err |= sysctl_err(path, table, "Not a file");
-
- if ((table->proc_handler == proc_dostring) ||
- (table->proc_handler == proc_dointvec) ||
- (table->proc_handler == proc_douintvec) ||
- (table->proc_handler == proc_douintvec_minmax) ||
- (table->proc_handler == proc_dointvec_minmax) ||
- (table->proc_handler == proc_dou8vec_minmax) ||
- (table->proc_handler == proc_dointvec_jiffies) ||
- (table->proc_handler == proc_dointvec_userhz_jiffies) ||
- (table->proc_handler == proc_dointvec_ms_jiffies) ||
- (table->proc_handler == proc_doulongvec_minmax) ||
- (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
- if (!table->data)
- err |= sysctl_err(path, table, "No data");
- if (!table->maxlen)
- err |= sysctl_err(path, table, "No maxlen");
+ list_for_each_table_entry(entry, table) {
+ if (entry->child)
+ err |= sysctl_err(path, entry, "Not a file");
+
+ if ((entry->proc_handler == proc_dostring) ||
+ (entry->proc_handler == proc_dointvec) ||
+ (entry->proc_handler == proc_douintvec) ||
+ (entry->proc_handler == proc_douintvec_minmax) ||
+ (entry->proc_handler == proc_dointvec_minmax) ||
+ (entry->proc_handler == proc_dou8vec_minmax) ||
+ (entry->proc_handler == proc_dointvec_jiffies) ||
+ (entry->proc_handler == proc_dointvec_userhz_jiffies) ||
+ (entry->proc_handler == proc_dointvec_ms_jiffies) ||
+ (entry->proc_handler == proc_doulongvec_minmax) ||
+ (entry->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
+ if (!entry->data)
+ err |= sysctl_err(path, entry, "No data");
+ if (!entry->maxlen)
+ err |= sysctl_err(path, entry, "No maxlen");
else
- err |= sysctl_check_table_array(path, table);
+ err |= sysctl_check_table_array(path, entry);
}
- if (!table->proc_handler)
- err |= sysctl_err(path, table, "No proc_handler");
+ if (!entry->proc_handler)
+ err |= sysctl_err(path, entry, "No proc_handler");
- if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode)
- err |= sysctl_err(path, table, "bogus .mode 0%o",
- table->mode);
+ if ((entry->mode & (S_IRUGO|S_IWUGO)) != entry->mode)
+ err |= sysctl_err(path, entry, "bogus .mode 0%o",
+ entry->mode);
}
return err;
}
@@ -1157,7 +1174,7 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table
name_bytes = 0;
nr_entries = 0;
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, table) {
nr_entries++;
name_bytes += strlen(entry->procname) + 1;
}
@@ -1174,14 +1191,16 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table
node = (struct ctl_node *)(links + 1);
link_table = (struct ctl_table *)(node + nr_entries);
link_name = (char *)&link_table[nr_entries + 1];
+ link = link_table;
- for (link = link_table, entry = table; entry->procname; link++, entry++) {
+ list_for_each_table_entry(entry, table) {
int len = strlen(entry->procname) + 1;
memcpy(link_name, entry->procname, len);
link->procname = link_name;
link->mode = S_IFLNK|S_IRWXUGO;
link->data = link_root;
link_name += len;
+ link++;
}
init_header(links, dir->header.root, dir->header.set, node, link_table);
links->nreg = nr_entries;
@@ -1196,7 +1215,7 @@ static bool get_links(struct ctl_dir *dir,
struct ctl_table *entry, *link;
/* Are there links available for every entry in table? */
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, table) {
const char *procname = entry->procname;
link = find_entry(&head, dir, procname, strlen(procname));
if (!link)
@@ -1209,7 +1228,7 @@ static bool get_links(struct ctl_dir *dir,
}
/* The checks passed. Increase the registration count on the links */
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, table) {
const char *procname = entry->procname;
link = find_entry(&head, dir, procname, strlen(procname));
head->nreg++;
@@ -1220,7 +1239,7 @@ static bool get_links(struct ctl_dir *dir,
static int insert_links(struct ctl_table_header *head)
{
struct ctl_table_set *root_set = &sysctl_table_root.default_set;
- struct ctl_dir *core_parent = NULL;
+ struct ctl_dir *core_parent;
struct ctl_table_header *links;
int err;
@@ -1312,11 +1331,11 @@ struct ctl_table_header *__register_sysctl_table(
struct ctl_node *node;
int nr_entries = 0;
- for (entry = table; entry->procname; entry++)
+ list_for_each_table_entry(entry, table)
nr_entries++;
header = kzalloc(sizeof(struct ctl_table_header) +
- sizeof(struct ctl_node)*nr_entries, GFP_KERNEL);
+ sizeof(struct ctl_node)*nr_entries, GFP_KERNEL_ACCOUNT);
if (!header)
return NULL;
@@ -1384,6 +1403,38 @@ struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *tab
}
EXPORT_SYMBOL(register_sysctl);
+/**
+ * __register_sysctl_init() - register sysctl table to path
+ * @path: path name for sysctl base
+ * @table: This is the sysctl table that needs to be registered to the path
+ * @table_name: The name of sysctl table, only used for log printing when
+ * registration fails
+ *
+ * The sysctl interface is used by userspace to query or modify at runtime
+ * a predefined value set on a variable. These variables however have default
+ * values pre-set. Code which depends on these variables will always work even
+ * if register_sysctl() fails. If register_sysctl() fails you'd just loose the
+ * ability to query or modify the sysctls dynamically at run time. Chances of
+ * register_sysctl() failing on init are extremely low, and so for both reasons
+ * this function does not return any error as it is used by initialization code.
+ *
+ * Context: Can only be called after your respective sysctl base path has been
+ * registered. So for instance, most base directories are registered early on
+ * init before init levels are processed through proc_sys_init() and
+ * sysctl_init_bases().
+ */
+void __init __register_sysctl_init(const char *path, struct ctl_table *table,
+ const char *table_name)
+{
+ struct ctl_table_header *hdr = register_sysctl(path, table);
+
+ if (unlikely(!hdr)) {
+ pr_err("failed when register_sysctl %s to %s\n", table_name, path);
+ return;
+ }
+ kmemleak_not_leak(hdr);
+}
+
static char *append_path(const char *path, char *pos, const char *name)
{
int namelen;
@@ -1407,7 +1458,7 @@ static int count_subheaders(struct ctl_table *table)
if (!table || !table->procname)
return 1;
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, table) {
if (entry->child)
nr_subheaders += count_subheaders(entry->child);
else
@@ -1426,7 +1477,7 @@ static int register_leaf_sysctl_tables(const char *path, char *pos,
int nr_dirs = 0;
int err = -ENOMEM;
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, table) {
if (entry->child)
nr_dirs++;
else
@@ -1443,7 +1494,9 @@ static int register_leaf_sysctl_tables(const char *path, char *pos,
goto out;
ctl_table_arg = files;
- for (new = files, entry = table; entry->procname; entry++) {
+ new = files;
+
+ list_for_each_table_entry(entry, table) {
if (entry->child)
continue;
*new = *entry;
@@ -1467,7 +1520,7 @@ static int register_leaf_sysctl_tables(const char *path, char *pos,
}
/* Recurse into the subdirectories. */
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, table) {
char *child_pos;
if (!entry->child)
@@ -1597,6 +1650,15 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
}
EXPORT_SYMBOL(register_sysctl_table);
+int __register_sysctl_base(struct ctl_table *base_table)
+{
+ struct ctl_table_header *hdr;
+
+ hdr = register_sysctl_table(base_table);
+ kmemleak_not_leak(hdr);
+ return 0;
+}
+
static void put_links(struct ctl_table_header *header)
{
struct ctl_table_set *root_set = &sysctl_table_root.default_set;
@@ -1612,7 +1674,7 @@ static void put_links(struct ctl_table_header *header)
if (IS_ERR(core_parent))
return;
- for (entry = header->ctl_table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, header->ctl_table) {
struct ctl_table_header *link_head;
struct ctl_table *link;
const char *name = entry->procname;
@@ -1626,7 +1688,7 @@ static void put_links(struct ctl_table_header *header)
else {
pr_err("sysctl link missing during unregister: ");
sysctl_print_dir(parent);
- pr_cont("/%s\n", name);
+ pr_cont("%s\n", name);
}
}
}
@@ -1710,7 +1772,7 @@ int __init proc_sys_init(void)
proc_sys_root->proc_dir_ops = &proc_sys_dir_file_operations;
proc_sys_root->nlink = 0;
- return sysctl_init();
+ return sysctl_init_bases();
}
struct sysctl_alias {
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index c69ff191e5d8..5c6a5ceab2f1 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -4,8 +4,6 @@
*
* Copyright 1997, Theodore Ts'o
*/
-
-#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/errno.h>
diff --git a/fs/proc/root.c b/fs/proc/root.c
index c7e3b1350ef8..3c2ee3eb1138 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -6,9 +6,6 @@
*
* proc root directory handling functions
*/
-
-#include <linux/uaccess.h>
-
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
@@ -305,6 +302,11 @@ void __init proc_root_init(void)
proc_mkdir("bus", NULL);
proc_sys_init();
+ /*
+ * Last things last. It is not like userspace processes eager
+ * to open /proc files exist at this point but register last
+ * anyway.
+ */
register_filesystem(&proc_fs_type);
}
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index 12901dcf57e2..f4616083faef 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -3,6 +3,7 @@
#include <linux/kernel_stat.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include "internal.h"
/*
* /proc/softirqs ... display the number of softirqs
@@ -27,7 +28,10 @@ static int show_softirqs(struct seq_file *p, void *v)
static int __init proc_softirqs_init(void)
{
- proc_create_single("softirqs", 0, NULL, show_softirqs);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("softirqs", 0, NULL, show_softirqs);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_softirqs_init);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ad667dbc96f5..8a74cdcc9af0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
-#include <linux/vmacache.h>
+#include <linux/mm_inline.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/mount.h>
@@ -123,12 +123,26 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
}
#endif
+static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
+ loff_t *ppos)
+{
+ struct vm_area_struct *vma = vma_next(&priv->iter);
+
+ if (vma) {
+ *ppos = vma->vm_start;
+ } else {
+ *ppos = -2UL;
+ vma = get_gate_vma(priv->mm);
+ }
+
+ return vma;
+}
+
static void *m_start(struct seq_file *m, loff_t *ppos)
{
struct proc_maps_private *priv = m->private;
unsigned long last_addr = *ppos;
struct mm_struct *mm;
- struct vm_area_struct *vma;
/* See m_next(). Zero at the start or after lseek. */
if (last_addr == -1UL)
@@ -152,31 +166,21 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
return ERR_PTR(-EINTR);
}
+ vma_iter_init(&priv->iter, mm, last_addr);
hold_task_mempolicy(priv);
- priv->tail_vma = get_gate_vma(mm);
-
- vma = find_vma(mm, last_addr);
- if (vma)
- return vma;
+ if (last_addr == -2UL)
+ return get_gate_vma(mm);
- return priv->tail_vma;
+ return proc_get_vma(priv, ppos);
}
static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
{
- struct proc_maps_private *priv = m->private;
- struct vm_area_struct *next, *vma = v;
-
- if (vma == priv->tail_vma)
- next = NULL;
- else if (vma->vm_next)
- next = vma->vm_next;
- else
- next = priv->tail_vma;
-
- *ppos = next ? next->vm_start : -1UL;
-
- return next;
+ if (*ppos == -2UL) {
+ *ppos = -1UL;
+ return NULL;
+ }
+ return proc_get_vma(m->private, ppos);
}
static void m_stop(struct seq_file *m, void *v)
@@ -308,6 +312,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
name = arch_vma_name(vma);
if (!name) {
+ struct anon_vma_name *anon_name;
+
if (!mm) {
name = "[vdso]";
goto done;
@@ -319,8 +325,16 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
goto done;
}
- if (is_stack(vma))
+ if (is_stack(vma)) {
name = "[stack]";
+ goto done;
+ }
+
+ anon_name = anon_vma_name(vma);
+ if (anon_name) {
+ seq_pad(m, ' ');
+ seq_printf(m, "[anon:%s]", anon_name->name);
+ }
}
done:
@@ -395,6 +409,7 @@ struct mem_size_stats {
u64 pss_anon;
u64 pss_file;
u64 pss_shmem;
+ u64 pss_dirty;
u64 pss_locked;
u64 swap_pss;
};
@@ -416,6 +431,7 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
mss->pss_locked += pss;
if (dirty || PageDirty(page)) {
+ mss->pss_dirty += pss;
if (private)
mss->private_dirty += size;
else
@@ -429,7 +445,8 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
}
static void smaps_account(struct mem_size_stats *mss, struct page *page,
- bool compound, bool young, bool dirty, bool locked)
+ bool compound, bool young, bool dirty, bool locked,
+ bool migration)
{
int i, nr = compound ? compound_nr(page) : 1;
unsigned long size = nr * PAGE_SIZE;
@@ -456,8 +473,15 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
* page_count(page) == 1 guarantees the page is mapped exactly once.
* If any subpage of the compound page mapped with PTE it would elevate
* page_count().
+ *
+ * The page_mapcount() is called to get a snapshot of the mapcount.
+ * Without holding the page lock this snapshot can be slightly wrong as
+ * we cannot always read the mapcount atomically. It is not safe to
+ * call page_mapcount() even with PTL held if the page is not mapped,
+ * especially for migration entries. Treat regular migration entries
+ * as mapcount == 1.
*/
- if (page_count(page) == 1) {
+ if ((page_count(page) == 1) || migration) {
smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
locked, true);
return;
@@ -506,9 +530,12 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page = NULL;
+ bool migration = false, young = false, dirty = false;
if (pte_present(*pte)) {
page = vm_normal_page(vma, addr, *pte);
+ young = pte_young(*pte);
+ dirty = pte_dirty(*pte);
} else if (is_swap_pte(*pte)) {
swp_entry_t swpent = pte_to_swp_entry(*pte);
@@ -525,8 +552,11 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
} else {
mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
}
- } else if (is_pfn_swap_entry(swpent))
+ } else if (is_pfn_swap_entry(swpent)) {
+ if (is_migration_entry(swpent))
+ migration = true;
page = pfn_swap_entry_to_page(swpent);
+ }
} else {
smaps_pte_hole_lookup(addr, walk);
return;
@@ -535,7 +565,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
if (!page)
return;
- smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked);
+ smaps_account(mss, page, false, young, dirty, locked, migration);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -546,6 +576,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page = NULL;
+ bool migration = false;
if (pmd_present(*pmd)) {
/* FOLL_DUMP will return -EFAULT on huge zero page */
@@ -553,8 +584,10 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
} else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
swp_entry_t entry = pmd_to_swp_entry(*pmd);
- if (is_migration_entry(entry))
+ if (is_migration_entry(entry)) {
+ migration = true;
page = pfn_swap_entry_to_page(entry);
+ }
}
if (IS_ERR_OR_NULL(page))
return;
@@ -566,7 +599,9 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
/* pass */;
else
mss->file_thp += HPAGE_PMD_SIZE;
- smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked);
+
+ smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
+ locked, migration);
}
#else
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -779,6 +814,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
{
SEQ_PUT_DEC("Rss: ", mss->resident);
SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT);
+ SEQ_PUT_DEC(" kB\nPss_Dirty: ", mss->pss_dirty >> PSS_SHIFT);
if (rollup_mode) {
/*
* These are meaningful only for smaps_rollup, otherwise two of
@@ -831,7 +867,7 @@ static int show_smap(struct seq_file *m, void *v)
__show_smap(m, &mss, false);
seq_printf(m, "THPeligible: %d\n",
- transparent_hugepage_active(vma));
+ hugepage_vma_check(vma, vma->vm_flags, true, false, true));
if (arch_pkeys_enabled())
seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
@@ -844,16 +880,16 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
struct mem_size_stats mss;
- struct mm_struct *mm;
+ struct mm_struct *mm = priv->mm;
struct vm_area_struct *vma;
- unsigned long last_vma_end = 0;
+ unsigned long vma_start = 0, last_vma_end = 0;
int ret = 0;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
priv->task = get_proc_task(priv->inode);
if (!priv->task)
return -ESRCH;
- mm = priv->mm;
if (!mm || !mmget_not_zero(mm)) {
ret = -ESRCH;
goto out_put_task;
@@ -866,8 +902,13 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
goto out_put_mm;
hold_task_mempolicy(priv);
+ vma = mas_find(&mas, ULONG_MAX);
- for (vma = priv->mm->mmap; vma;) {
+ if (unlikely(!vma))
+ goto empty_set;
+
+ vma_start = vma->vm_start;
+ do {
smap_gather_stats(vma, &mss, 0);
last_vma_end = vma->vm_end;
@@ -876,6 +917,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
* access it for write request.
*/
if (mmap_lock_is_contended(mm)) {
+ mas_pause(&mas);
mmap_read_unlock(mm);
ret = mmap_read_lock_killable(mm);
if (ret) {
@@ -919,7 +961,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
* contains last_vma_end.
* Iterate VMA' from last_vma_end.
*/
- vma = find_vma(mm, last_vma_end - 1);
+ vma = mas_find(&mas, ULONG_MAX);
/* Case 3 above */
if (!vma)
break;
@@ -933,11 +975,10 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
smap_gather_stats(vma, &mss, last_vma_end);
}
/* Case 2 above */
- vma = vma->vm_next;
- }
+ } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL);
- show_vma_header_prefix(m, priv->mm->mmap->vm_start,
- last_vma_end, 0, 0, 0, 0);
+empty_set:
+ show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0);
seq_pad(m, ' ');
seq_puts(m, "[rollup]\n");
@@ -1230,6 +1271,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
return -ESRCH;
mm = get_task_mm(task);
if (mm) {
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
struct mmu_notifier_range range;
struct clear_refs_private cp = {
.type = type,
@@ -1249,7 +1291,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
}
if (type == CLEAR_REFS_SOFT_DIRTY) {
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mas_for_each(&mas, vma, ULONG_MAX) {
if (!(vma->vm_flags & VM_SOFTDIRTY))
continue;
vma->vm_flags &= ~VM_SOFTDIRTY;
@@ -1261,8 +1303,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
0, NULL, mm, 0, -1UL);
mmu_notifier_invalidate_range_start(&range);
}
- walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops,
- &cp);
+ walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp);
if (type == CLEAR_REFS_SOFT_DIRTY) {
mmu_notifier_invalidate_range_end(&range);
flush_tlb_mm(mm);
@@ -1367,6 +1408,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
{
u64 frame = 0, flags = 0;
struct page *page = NULL;
+ bool migration = false;
if (pte_present(pte)) {
if (pm->show_pfn)
@@ -1384,17 +1426,30 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
if (pte_swp_uffd_wp(pte))
flags |= PM_UFFD_WP;
entry = pte_to_swp_entry(pte);
- if (pm->show_pfn)
+ if (pm->show_pfn) {
+ pgoff_t offset;
+ /*
+ * For PFN swap offsets, keeping the offset field
+ * to be PFN only to be compatible with old smaps.
+ */
+ if (is_pfn_swap_entry(entry))
+ offset = swp_offset_pfn(entry);
+ else
+ offset = swp_offset(entry);
frame = swp_type(entry) |
- (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
+ (offset << MAX_SWAPFILES_SHIFT);
+ }
flags |= PM_SWAP;
+ migration = is_migration_entry(entry);
if (is_pfn_swap_entry(entry))
page = pfn_swap_entry_to_page(entry);
+ if (pte_marker_entry_uffd_wp(entry))
+ flags |= PM_UFFD_WP;
}
if (page && !PageAnon(page))
flags |= PM_FILE;
- if (page && page_mapcount(page) == 1)
+ if (page && !migration && page_mapcount(page) == 1)
flags |= PM_MMAP_EXCLUSIVE;
if (vma->vm_flags & VM_SOFTDIRTY)
flags |= PM_SOFT_DIRTY;
@@ -1410,8 +1465,9 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
spinlock_t *ptl;
pte_t *pte, *orig_pte;
int err = 0;
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ bool migration = false;
+
ptl = pmd_trans_huge_lock(pmdp, vma);
if (ptl) {
u64 flags = 0, frame = 0;
@@ -1439,7 +1495,11 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
unsigned long offset;
if (pm->show_pfn) {
- offset = swp_offset(entry) +
+ if (is_pfn_swap_entry(entry))
+ offset = swp_offset_pfn(entry);
+ else
+ offset = swp_offset(entry);
+ offset = offset +
((addr & ~PMD_MASK) >> PAGE_SHIFT);
frame = swp_type(entry) |
(offset << MAX_SWAPFILES_SHIFT);
@@ -1450,11 +1510,12 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
if (pmd_swp_uffd_wp(pmd))
flags |= PM_UFFD_WP;
VM_BUG_ON(!is_pmd_migration_entry(pmd));
+ migration = is_migration_entry(entry);
page = pfn_swap_entry_to_page(entry);
}
#endif
- if (page && page_mapcount(page) == 1)
+ if (page && !migration && page_mapcount(page) == 1)
flags |= PM_MMAP_EXCLUSIVE;
for (; addr != end; addr += PAGE_SIZE) {
@@ -1523,10 +1584,15 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
if (page_mapcount(page) == 1)
flags |= PM_MMAP_EXCLUSIVE;
+ if (huge_pte_uffd_wp(pte))
+ flags |= PM_UFFD_WP;
+
flags |= PM_PRESENT;
if (pm->show_pfn)
frame = pte_pfn(pte) +
((addr & ~hmask) >> PAGE_SHIFT);
+ } else if (pte_swp_uffd_wp_any(pte)) {
+ flags |= PM_UFFD_WP;
}
for (; addr != end; addr += PAGE_SIZE) {
@@ -1564,7 +1630,8 @@ static const struct mm_walk_ops pagemap_ops = {
* Bits 5-54 swap offset if swapped
* Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst)
* Bit 56 page exclusively mapped
- * Bits 57-60 zero
+ * Bit 57 pte is uffd-wp write-protected
+ * Bits 58-60 zero
* Bit 61 page is file-page or shared-anon
* Bit 62 page swapped
* Bit 63 page present
@@ -1751,7 +1818,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
return NULL;
page = vm_normal_page(vma, addr, pte);
- if (!page)
+ if (!page || is_zone_device_page(page))
return NULL;
if (PageReserved(page))
@@ -1839,8 +1906,6 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
return 0;
page = pte_page(huge_pte);
- if (!page)
- return 0;
md = walk->private;
gather_stats(page, md, pte_dirty(huge_pte), 1);
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index a6d21fc0033c..2fd06f52b6a4 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -20,15 +20,13 @@
*/
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
struct vm_region *region;
- struct rb_node *p;
unsigned long bytes = 0, sbytes = 0, slack = 0, size;
-
- mmap_read_lock(mm);
- for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
- vma = rb_entry(p, struct vm_area_struct, vm_rb);
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma) {
bytes += kobjsize(vma);
region = vma->vm_region;
@@ -82,15 +80,13 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
unsigned long task_vsize(struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
- struct rb_node *p;
unsigned long vsize = 0;
mmap_read_lock(mm);
- for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
- vma = rb_entry(p, struct vm_area_struct, vm_rb);
+ for_each_vma(vmi, vma)
vsize += vma->vm_end - vma->vm_start;
- }
mmap_read_unlock(mm);
return vsize;
}
@@ -99,14 +95,13 @@ unsigned long task_statm(struct mm_struct *mm,
unsigned long *shared, unsigned long *text,
unsigned long *data, unsigned long *resident)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
struct vm_region *region;
- struct rb_node *p;
unsigned long size = kobjsize(mm);
mmap_read_lock(mm);
- for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
- vma = rb_entry(p, struct vm_area_struct, vm_rb);
+ for_each_vma(vmi, vma) {
size += kobjsize(vma);
region = vma->vm_region;
if (region) {
@@ -190,17 +185,19 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
*/
static int show_map(struct seq_file *m, void *_p)
{
- struct rb_node *p = _p;
-
- return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb));
+ return nommu_vma_show(m, _p);
}
static void *m_start(struct seq_file *m, loff_t *pos)
{
struct proc_maps_private *priv = m->private;
struct mm_struct *mm;
- struct rb_node *p;
- loff_t n = *pos;
+ struct vm_area_struct *vma;
+ unsigned long addr = *pos;
+
+ /* See m_next(). Zero at the start or after lseek. */
+ if (addr == -1UL)
+ return NULL;
/* pin the task and mm whilst we play with them */
priv->task = get_proc_task(priv->inode);
@@ -216,10 +213,10 @@ static void *m_start(struct seq_file *m, loff_t *pos)
return ERR_PTR(-EINTR);
}
- /* start from the Nth VMA */
- for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
- if (n-- == 0)
- return p;
+ /* start the next element from addr */
+ vma = find_vma(mm, addr);
+ if (vma)
+ return vma;
mmap_read_unlock(mm);
mmput(mm);
@@ -242,10 +239,10 @@ static void m_stop(struct seq_file *m, void *_vml)
static void *m_next(struct seq_file *m, void *_p, loff_t *pos)
{
- struct rb_node *p = _p;
+ struct vm_area_struct *vma = _p;
- (*pos)++;
- return p ? rb_next(p) : NULL;
+ *pos = vma->vm_end;
+ return find_vma(vma->vm_mm, vma->vm_end);
}
static const struct seq_operations proc_pid_maps_ops = {
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index deb99bc9b7e6..b5343d209381 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -7,6 +7,7 @@
#include <linux/time.h>
#include <linux/time_namespace.h>
#include <linux/kernel_stat.h>
+#include "internal.h"
static int uptime_proc_show(struct seq_file *m, void *v)
{
@@ -39,7 +40,10 @@ static int uptime_proc_show(struct seq_file *m, void *v)
static int __init proc_uptime_init(void)
{
- proc_create_single("uptime", 0, NULL, uptime_proc_show);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("uptime", 0, NULL, uptime_proc_show);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_uptime_init);
diff --git a/fs/proc/version.c b/fs/proc/version.c
index b449f186577f..02e3c3cd4a9a 100644
--- a/fs/proc/version.c
+++ b/fs/proc/version.c
@@ -5,6 +5,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/utsname.h>
+#include "internal.h"
static int version_proc_show(struct seq_file *m, void *v)
{
@@ -17,7 +18,10 @@ static int version_proc_show(struct seq_file *m, void *v)
static int __init proc_version_init(void)
{
- proc_create_single("version", 0, NULL, version_proc_show);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("version", 0, NULL, version_proc_show);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_version_init);
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 509f85148fee..f2aa86c421f2 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -25,7 +25,7 @@
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
-#include <linux/uaccess.h>
+#include <linux/uio.h>
#include <linux/cc_platform.h>
#include <asm/io.h>
#include "internal.h"
@@ -62,18 +62,17 @@ core_param(novmcoredd, vmcoredd_disabled, bool, 0);
/* Device Dump Size */
static size_t vmcoredd_orig_sz;
-static DECLARE_RWSEM(vmcore_cb_rwsem);
+static DEFINE_SPINLOCK(vmcore_cb_lock);
+DEFINE_STATIC_SRCU(vmcore_cb_srcu);
/* List of registered vmcore callbacks. */
static LIST_HEAD(vmcore_cb_list);
-/* Whether we had a surprise unregistration of a callback. */
-static bool vmcore_cb_unstable;
/* Whether the vmcore has been opened once. */
static bool vmcore_opened;
void register_vmcore_cb(struct vmcore_cb *cb)
{
- down_write(&vmcore_cb_rwsem);
INIT_LIST_HEAD(&cb->next);
+ spin_lock(&vmcore_cb_lock);
list_add_tail(&cb->next, &vmcore_cb_list);
/*
* Registering a vmcore callback after the vmcore was opened is
@@ -81,24 +80,24 @@ void register_vmcore_cb(struct vmcore_cb *cb)
*/
if (vmcore_opened)
pr_warn_once("Unexpected vmcore callback registration\n");
- up_write(&vmcore_cb_rwsem);
+ spin_unlock(&vmcore_cb_lock);
}
EXPORT_SYMBOL_GPL(register_vmcore_cb);
void unregister_vmcore_cb(struct vmcore_cb *cb)
{
- down_write(&vmcore_cb_rwsem);
- list_del(&cb->next);
+ spin_lock(&vmcore_cb_lock);
+ list_del_rcu(&cb->next);
/*
* Unregistering a vmcore callback after the vmcore was opened is
* very unusual (e.g., forced driver removal), but we cannot stop
* unregistering.
*/
- if (vmcore_opened) {
+ if (vmcore_opened)
pr_warn_once("Unexpected vmcore callback unregistration\n");
- vmcore_cb_unstable = true;
- }
- up_write(&vmcore_cb_rwsem);
+ spin_unlock(&vmcore_cb_lock);
+
+ synchronize_srcu(&vmcore_cb_srcu);
}
EXPORT_SYMBOL_GPL(unregister_vmcore_cb);
@@ -107,11 +106,8 @@ static bool pfn_is_ram(unsigned long pfn)
struct vmcore_cb *cb;
bool ret = true;
- lockdep_assert_held_read(&vmcore_cb_rwsem);
- if (unlikely(vmcore_cb_unstable))
- return false;
-
- list_for_each_entry(cb, &vmcore_cb_list, next) {
+ list_for_each_entry_srcu(cb, &vmcore_cb_list, next,
+ srcu_read_lock_held(&vmcore_cb_srcu)) {
if (unlikely(!cb->pfn_is_ram))
continue;
ret = cb->pfn_is_ram(cb, pfn);
@@ -124,21 +120,21 @@ static bool pfn_is_ram(unsigned long pfn)
static int open_vmcore(struct inode *inode, struct file *file)
{
- down_read(&vmcore_cb_rwsem);
+ spin_lock(&vmcore_cb_lock);
vmcore_opened = true;
- up_read(&vmcore_cb_rwsem);
+ spin_unlock(&vmcore_cb_lock);
return 0;
}
/* Reads a page from the oldmem device from given offset. */
-ssize_t read_from_oldmem(char *buf, size_t count,
- u64 *ppos, int userbuf,
- bool encrypted)
+ssize_t read_from_oldmem(struct iov_iter *iter, size_t count,
+ u64 *ppos, bool encrypted)
{
unsigned long pfn, offset;
size_t nr_bytes;
ssize_t read = 0, tmp;
+ int idx;
if (!count)
return 0;
@@ -146,7 +142,7 @@ ssize_t read_from_oldmem(char *buf, size_t count,
offset = (unsigned long)(*ppos % PAGE_SIZE);
pfn = (unsigned long)(*ppos / PAGE_SIZE);
- down_read(&vmcore_cb_rwsem);
+ idx = srcu_read_lock(&vmcore_cb_srcu);
do {
if (count > (PAGE_SIZE - offset))
nr_bytes = PAGE_SIZE - offset;
@@ -155,35 +151,29 @@ ssize_t read_from_oldmem(char *buf, size_t count,
/* If pfn is not ram, return zeros for sparse dump files */
if (!pfn_is_ram(pfn)) {
- tmp = 0;
- if (!userbuf)
- memset(buf, 0, nr_bytes);
- else if (clear_user(buf, nr_bytes))
- tmp = -EFAULT;
+ tmp = iov_iter_zero(nr_bytes, iter);
} else {
if (encrypted)
- tmp = copy_oldmem_page_encrypted(pfn, buf,
+ tmp = copy_oldmem_page_encrypted(iter, pfn,
nr_bytes,
- offset,
- userbuf);
+ offset);
else
- tmp = copy_oldmem_page(pfn, buf, nr_bytes,
- offset, userbuf);
+ tmp = copy_oldmem_page(iter, pfn, nr_bytes,
+ offset);
}
- if (tmp < 0) {
- up_read(&vmcore_cb_rwsem);
- return tmp;
+ if (tmp < nr_bytes) {
+ srcu_read_unlock(&vmcore_cb_srcu, idx);
+ return -EFAULT;
}
*ppos += nr_bytes;
count -= nr_bytes;
- buf += nr_bytes;
read += nr_bytes;
++pfn;
offset = 0;
} while (count);
+ srcu_read_unlock(&vmcore_cb_srcu, idx);
- up_read(&vmcore_cb_rwsem);
return read;
}
@@ -206,7 +196,12 @@ void __weak elfcorehdr_free(unsigned long long addr)
*/
ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
{
- return read_from_oldmem(buf, count, ppos, 0, false);
+ struct kvec kvec = { .iov_base = buf, .iov_len = count };
+ struct iov_iter iter;
+
+ iov_iter_kvec(&iter, READ, &kvec, 1, count);
+
+ return read_from_oldmem(&iter, count, ppos, false);
}
/*
@@ -214,7 +209,13 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
*/
ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
{
- return read_from_oldmem(buf, count, ppos, 0, cc_platform_has(CC_ATTR_MEM_ENCRYPT));
+ struct kvec kvec = { .iov_base = buf, .iov_len = count };
+ struct iov_iter iter;
+
+ iov_iter_kvec(&iter, READ, &kvec, 1, count);
+
+ return read_from_oldmem(&iter, count, ppos,
+ cc_platform_has(CC_ATTR_MEM_ENCRYPT));
}
/*
@@ -231,29 +232,14 @@ int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma,
/*
* Architectures which support memory encryption override this.
*/
-ssize_t __weak
-copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
- unsigned long offset, int userbuf)
-{
- return copy_oldmem_page(pfn, buf, csize, offset, userbuf);
-}
-
-/*
- * Copy to either kernel or user space
- */
-static int copy_to(void *target, void *src, size_t size, int userbuf)
+ssize_t __weak copy_oldmem_page_encrypted(struct iov_iter *iter,
+ unsigned long pfn, size_t csize, unsigned long offset)
{
- if (userbuf) {
- if (copy_to_user((char __user *) target, src, size))
- return -EFAULT;
- } else {
- memcpy(target, src, size);
- }
- return 0;
+ return copy_oldmem_page(iter, pfn, csize, offset);
}
#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
-static int vmcoredd_copy_dumps(void *dst, u64 start, size_t size, int userbuf)
+static int vmcoredd_copy_dumps(struct iov_iter *iter, u64 start, size_t size)
{
struct vmcoredd_node *dump;
u64 offset = 0;
@@ -266,14 +252,13 @@ static int vmcoredd_copy_dumps(void *dst, u64 start, size_t size, int userbuf)
if (start < offset + dump->size) {
tsz = min(offset + (u64)dump->size - start, (u64)size);
buf = dump->buf + start - offset;
- if (copy_to(dst, buf, tsz, userbuf)) {
+ if (copy_to_iter(buf, tsz, iter) < tsz) {
ret = -EFAULT;
goto out_unlock;
}
size -= tsz;
start += tsz;
- dst += tsz;
/* Leave now if buffer filled already */
if (!size)
@@ -329,33 +314,28 @@ out_unlock:
/* Read from the ELF header and then the crash dump. On error, negative value is
* returned otherwise number of bytes read are returned.
*/
-static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
- int userbuf)
+static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos)
{
ssize_t acc = 0, tmp;
size_t tsz;
u64 start;
struct vmcore *m = NULL;
- if (buflen == 0 || *fpos >= vmcore_size)
+ if (!iov_iter_count(iter) || *fpos >= vmcore_size)
return 0;
- /* trim buflen to not go beyond EOF */
- if (buflen > vmcore_size - *fpos)
- buflen = vmcore_size - *fpos;
+ iov_iter_truncate(iter, vmcore_size - *fpos);
/* Read ELF core header */
if (*fpos < elfcorebuf_sz) {
- tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen);
- if (copy_to(buffer, elfcorebuf + *fpos, tsz, userbuf))
+ tsz = min(elfcorebuf_sz - (size_t)*fpos, iov_iter_count(iter));
+ if (copy_to_iter(elfcorebuf + *fpos, tsz, iter) < tsz)
return -EFAULT;
- buflen -= tsz;
*fpos += tsz;
- buffer += tsz;
acc += tsz;
/* leave now if filled buffer already */
- if (buflen == 0)
+ if (!iov_iter_count(iter))
return acc;
}
@@ -376,35 +356,32 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
/* Read device dumps */
if (*fpos < elfcorebuf_sz + vmcoredd_orig_sz) {
tsz = min(elfcorebuf_sz + vmcoredd_orig_sz -
- (size_t)*fpos, buflen);
+ (size_t)*fpos, iov_iter_count(iter));
start = *fpos - elfcorebuf_sz;
- if (vmcoredd_copy_dumps(buffer, start, tsz, userbuf))
+ if (vmcoredd_copy_dumps(iter, start, tsz))
return -EFAULT;
- buflen -= tsz;
*fpos += tsz;
- buffer += tsz;
acc += tsz;
/* leave now if filled buffer already */
- if (!buflen)
+ if (!iov_iter_count(iter))
return acc;
}
#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
/* Read remaining elf notes */
- tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen);
+ tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos,
+ iov_iter_count(iter));
kaddr = elfnotes_buf + *fpos - elfcorebuf_sz - vmcoredd_orig_sz;
- if (copy_to(buffer, kaddr, tsz, userbuf))
+ if (copy_to_iter(kaddr, tsz, iter) < tsz)
return -EFAULT;
- buflen -= tsz;
*fpos += tsz;
- buffer += tsz;
acc += tsz;
/* leave now if filled buffer already */
- if (buflen == 0)
+ if (!iov_iter_count(iter))
return acc;
}
@@ -412,19 +389,17 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
if (*fpos < m->offset + m->size) {
tsz = (size_t)min_t(unsigned long long,
m->offset + m->size - *fpos,
- buflen);
+ iov_iter_count(iter));
start = m->paddr + *fpos - m->offset;
- tmp = read_from_oldmem(buffer, tsz, &start,
- userbuf, cc_platform_has(CC_ATTR_MEM_ENCRYPT));
+ tmp = read_from_oldmem(iter, tsz, &start,
+ cc_platform_has(CC_ATTR_MEM_ENCRYPT));
if (tmp < 0)
return tmp;
- buflen -= tsz;
*fpos += tsz;
- buffer += tsz;
acc += tsz;
/* leave now if filled buffer already */
- if (buflen == 0)
+ if (!iov_iter_count(iter))
return acc;
}
}
@@ -432,15 +407,14 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
return acc;
}
-static ssize_t read_vmcore(struct file *file, char __user *buffer,
- size_t buflen, loff_t *fpos)
+static ssize_t read_vmcore(struct kiocb *iocb, struct iov_iter *iter)
{
- return __read_vmcore((__force char *) buffer, buflen, fpos, 1);
+ return __read_vmcore(iter, &iocb->ki_pos);
}
/*
* The vmcore fault handler uses the page cache and fills data using the
- * standard __vmcore_read() function.
+ * standard __read_vmcore() function.
*
* On s390 the fault handler is used for memory regions that can't be mapped
* directly with remap_pfn_range().
@@ -450,9 +424,10 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf)
#ifdef CONFIG_S390
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
pgoff_t index = vmf->pgoff;
+ struct iov_iter iter;
+ struct kvec kvec;
struct page *page;
loff_t offset;
- char *buf;
int rc;
page = find_or_create_page(mapping, index, GFP_KERNEL);
@@ -460,8 +435,11 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf)
return VM_FAULT_OOM;
if (!PageUptodate(page)) {
offset = (loff_t) index << PAGE_SHIFT;
- buf = __va((page_to_pfn(page) << PAGE_SHIFT));
- rc = __read_vmcore(buf, PAGE_SIZE, &offset, 0);
+ kvec.iov_base = page_address(page);
+ kvec.iov_len = PAGE_SIZE;
+ iov_iter_kvec(&iter, READ, &kvec, 1, PAGE_SIZE);
+
+ rc = __read_vmcore(&iter, &offset);
if (rc < 0) {
unlock_page(page);
put_page(page);
@@ -483,7 +461,7 @@ static const struct vm_operations_struct vmcore_mmap_ops = {
/**
* vmcore_alloc_buf - allocate buffer in vmalloc memory
- * @sizez: size of buffer
+ * @size: size of buffer
*
* If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
* the buffer to user-space by means of remap_vmalloc_range().
@@ -574,18 +552,18 @@ static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma,
unsigned long from, unsigned long pfn,
unsigned long size, pgprot_t prot)
{
- int ret;
+ int ret, idx;
/*
- * Check if oldmem_pfn_is_ram was registered to avoid
- * looping over all pages without a reason.
+ * Check if a callback was registered to avoid looping over all
+ * pages without a reason.
*/
- down_read(&vmcore_cb_rwsem);
- if (!list_empty(&vmcore_cb_list) || vmcore_cb_unstable)
+ idx = srcu_read_lock(&vmcore_cb_srcu);
+ if (!list_empty(&vmcore_cb_list))
ret = remap_oldmem_pfn_checked(vma, from, pfn, size, prot);
else
ret = remap_oldmem_pfn_range(vma, from, pfn, size, prot);
- up_read(&vmcore_cb_rwsem);
+ srcu_read_unlock(&vmcore_cb_srcu, idx);
return ret;
}
@@ -711,7 +689,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
static const struct proc_ops vmcore_proc_ops = {
.proc_open = open_vmcore,
- .proc_read = read_vmcore,
+ .proc_read_iter = read_vmcore,
.proc_lseek = default_llseek,
.proc_mmap = mmap_vmcore,
};