aboutsummaryrefslogtreecommitdiffstats
path: root/fs/proc
diff options
context:
space:
mode:
Diffstat (limited to 'fs/proc')
-rw-r--r--fs/proc/array.c91
-rw-r--r--fs/proc/base.c60
-rw-r--r--fs/proc/fd.c3
-rw-r--r--fs/proc/generic.c184
-rw-r--r--fs/proc/inode.c12
-rw-r--r--fs/proc/internal.h13
-rw-r--r--fs/proc/meminfo.c15
-rw-r--r--fs/proc/namespaces.c153
-rw-r--r--fs/proc/page.c16
-rw-r--r--fs/proc/proc_net.c1
-rw-r--r--fs/proc/root.c1
-rw-r--r--fs/proc/stat.c2
-rw-r--r--fs/proc/task_mmu.c343
-rw-r--r--fs/proc/vmcore.c8
14 files changed, 449 insertions, 453 deletions
diff --git a/fs/proc/array.c b/fs/proc/array.c
index cd3653e4f35c..1295a00ca316 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -81,6 +81,7 @@
#include <linux/pid_namespace.h>
#include <linux/ptrace.h>
#include <linux/tracehook.h>
+#include <linux/string_helpers.h>
#include <linux/user_namespace.h>
#include <asm/pgtable.h>
@@ -89,39 +90,18 @@
static inline void task_name(struct seq_file *m, struct task_struct *p)
{
- int i;
- char *buf, *end;
- char *name;
+ char *buf;
char tcomm[sizeof(p->comm)];
get_task_comm(tcomm, p);
seq_puts(m, "Name:\t");
- end = m->buf + m->size;
buf = m->buf + m->count;
- name = tcomm;
- i = sizeof(tcomm);
- while (i && (buf < end)) {
- unsigned char c = *name;
- name++;
- i--;
- *buf = c;
- if (!c)
- break;
- if (c == '\\') {
- buf++;
- if (buf < end)
- *buf++ = c;
- continue;
- }
- if (c == '\n') {
- *buf++ = '\\';
- if (buf < end)
- *buf++ = 'n';
- continue;
- }
- buf++;
- }
+
+ /* Ignore error for now */
+ string_escape_str(tcomm, &buf, m->size - m->count,
+ ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+
m->count = buf - m->buf;
seq_putc(m, '\n');
}
@@ -157,20 +137,29 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
struct user_namespace *user_ns = seq_user_ns(m);
struct group_info *group_info;
int g;
- struct fdtable *fdt = NULL;
+ struct task_struct *tracer;
const struct cred *cred;
- pid_t ppid, tpid;
+ pid_t ppid, tpid = 0, tgid, ngid;
+ unsigned int max_fds = 0;
rcu_read_lock();
ppid = pid_alive(p) ?
task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
- tpid = 0;
- if (pid_alive(p)) {
- struct task_struct *tracer = ptrace_parent(p);
- if (tracer)
- tpid = task_pid_nr_ns(tracer, ns);
- }
+
+ tracer = ptrace_parent(p);
+ if (tracer)
+ tpid = task_pid_nr_ns(tracer, ns);
+
+ tgid = task_tgid_nr_ns(p, ns);
+ ngid = task_numa_group_id(p);
cred = get_task_cred(p);
+
+ task_lock(p);
+ if (p->files)
+ max_fds = files_fdtable(p->files)->max_fds;
+ task_unlock(p);
+ rcu_read_unlock();
+
seq_printf(m,
"State:\t%s\n"
"Tgid:\t%d\n"
@@ -179,12 +168,10 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
"PPid:\t%d\n"
"TracerPid:\t%d\n"
"Uid:\t%d\t%d\t%d\t%d\n"
- "Gid:\t%d\t%d\t%d\t%d\n",
+ "Gid:\t%d\t%d\t%d\t%d\n"
+ "FDSize:\t%d\nGroups:\t",
get_task_state(p),
- task_tgid_nr_ns(p, ns),
- task_numa_group_id(p),
- pid_nr_ns(pid, ns),
- ppid, tpid,
+ tgid, ngid, pid_nr_ns(pid, ns), ppid, tpid,
from_kuid_munged(user_ns, cred->uid),
from_kuid_munged(user_ns, cred->euid),
from_kuid_munged(user_ns, cred->suid),
@@ -192,20 +179,10 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
from_kgid_munged(user_ns, cred->gid),
from_kgid_munged(user_ns, cred->egid),
from_kgid_munged(user_ns, cred->sgid),
- from_kgid_munged(user_ns, cred->fsgid));
-
- task_lock(p);
- if (p->files)
- fdt = files_fdtable(p->files);
- seq_printf(m,
- "FDSize:\t%d\n"
- "Groups:\t",
- fdt ? fdt->max_fds : 0);
- rcu_read_unlock();
+ from_kgid_munged(user_ns, cred->fsgid),
+ max_fds);
group_info = cred->group_info;
- task_unlock(p);
-
for (g = 0; g < group_info->ngroups; g++)
seq_printf(m, "%d ",
from_kgid_munged(user_ns, GROUP_AT(group_info, g)));
@@ -339,12 +316,10 @@ static inline void task_context_switch_counts(struct seq_file *m,
static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
{
- seq_puts(m, "Cpus_allowed:\t");
- seq_cpumask(m, &task->cpus_allowed);
- seq_putc(m, '\n');
- seq_puts(m, "Cpus_allowed_list:\t");
- seq_cpumask_list(m, &task->cpus_allowed);
- seq_putc(m, '\n');
+ seq_printf(m, "Cpus_allowed:\t%*pb\n",
+ cpumask_pr_args(&task->cpus_allowed));
+ seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
+ cpumask_pr_args(&task->cpus_allowed));
}
int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 772efa45a452..3f3d7aeb0712 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2464,6 +2464,57 @@ static const struct file_operations proc_projid_map_operations = {
.llseek = seq_lseek,
.release = proc_id_map_release,
};
+
+static int proc_setgroups_open(struct inode *inode, struct file *file)
+{
+ struct user_namespace *ns = NULL;
+ struct task_struct *task;
+ int ret;
+
+ ret = -ESRCH;
+ task = get_proc_task(inode);
+ if (task) {
+ rcu_read_lock();
+ ns = get_user_ns(task_cred_xxx(task, user_ns));
+ rcu_read_unlock();
+ put_task_struct(task);
+ }
+ if (!ns)
+ goto err;
+
+ if (file->f_mode & FMODE_WRITE) {
+ ret = -EACCES;
+ if (!ns_capable(ns, CAP_SYS_ADMIN))
+ goto err_put_ns;
+ }
+
+ ret = single_open(file, &proc_setgroups_show, ns);
+ if (ret)
+ goto err_put_ns;
+
+ return 0;
+err_put_ns:
+ put_user_ns(ns);
+err:
+ return ret;
+}
+
+static int proc_setgroups_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct user_namespace *ns = seq->private;
+ int ret = single_release(inode, file);
+ put_user_ns(ns);
+ return ret;
+}
+
+static const struct file_operations proc_setgroups_operations = {
+ .open = proc_setgroups_open,
+ .write = proc_setgroups_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = proc_setgroups_release,
+};
#endif /* CONFIG_USER_NS */
static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
@@ -2572,6 +2623,7 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
+ REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations),
#endif
#ifdef CONFIG_CHECKPOINT_RESTORE
REG("timers", S_IRUGO, proc_timers_operations),
@@ -2618,6 +2670,9 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
dput(dentry);
}
+ if (pid == tgid)
+ return;
+
name.name = buf;
name.len = snprintf(buf, sizeof(buf), "%d", tgid);
leader = d_hash_and_lookup(mnt->mnt_root, &name);
@@ -2789,7 +2844,7 @@ retry:
int proc_pid_readdir(struct file *file, struct dir_context *ctx)
{
struct tgid_iter iter;
- struct pid_namespace *ns = file->f_dentry->d_sb->s_fs_info;
+ struct pid_namespace *ns = file_inode(file)->i_sb->s_fs_info;
loff_t pos = ctx->pos;
if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
@@ -2913,6 +2968,7 @@ static const struct pid_entry tid_base_stuff[] = {
REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
+ REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations),
#endif
};
@@ -3095,7 +3151,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
/* f_version caches the tgid value that the last readdir call couldn't
* return. lseek aka telldir automagically resets f_version to 0.
*/
- ns = file->f_dentry->d_sb->s_fs_info;
+ ns = inode->i_sb->s_fs_info;
tid = (int)file->f_version;
file->f_version = 0;
for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index e11d7c590bb0..8e5ad83b629a 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -53,7 +53,8 @@ static int seq_show(struct seq_file *m, void *v)
(long long)file->f_pos, f_flags,
real_mount(file->f_path.mnt)->mnt_id);
if (file->f_op->show_fdinfo)
- ret = file->f_op->show_fdinfo(m, file);
+ file->f_op->show_fdinfo(m, file);
+ ret = seq_has_overflowed(m);
fput(file);
}
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 317b72641ebf..3309f59d421b 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -31,9 +31,73 @@ static DEFINE_SPINLOCK(proc_subdir_lock);
static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de)
{
- if (de->namelen != len)
- return 0;
- return !memcmp(name, de->name, len);
+ if (len < de->namelen)
+ return -1;
+ if (len > de->namelen)
+ return 1;
+
+ return memcmp(name, de->name, len);
+}
+
+static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir)
+{
+ return rb_entry_safe(rb_first(&dir->subdir), struct proc_dir_entry,
+ subdir_node);
+}
+
+static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir)
+{
+ return rb_entry_safe(rb_next(&dir->subdir_node), struct proc_dir_entry,
+ subdir_node);
+}
+
+static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir,
+ const char *name,
+ unsigned int len)
+{
+ struct rb_node *node = dir->subdir.rb_node;
+
+ while (node) {
+ struct proc_dir_entry *de = container_of(node,
+ struct proc_dir_entry,
+ subdir_node);
+ int result = proc_match(len, name, de);
+
+ if (result < 0)
+ node = node->rb_left;
+ else if (result > 0)
+ node = node->rb_right;
+ else
+ return de;
+ }
+ return NULL;
+}
+
+static bool pde_subdir_insert(struct proc_dir_entry *dir,
+ struct proc_dir_entry *de)
+{
+ struct rb_root *root = &dir->subdir;
+ struct rb_node **new = &root->rb_node, *parent = NULL;
+
+ /* Figure out where to put new node */
+ while (*new) {
+ struct proc_dir_entry *this =
+ container_of(*new, struct proc_dir_entry, subdir_node);
+ int result = proc_match(de->namelen, de->name, this);
+
+ parent = *new;
+ if (result < 0)
+ new = &(*new)->rb_left;
+ else if (result > 0)
+ new = &(*new)->rb_right;
+ else
+ return false;
+ }
+
+ /* Add new node and rebalance tree. */
+ rb_link_node(&de->subdir_node, parent, new);
+ rb_insert_color(&de->subdir_node, root);
+ return true;
}
static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
@@ -58,7 +122,7 @@ static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
- struct proc_dir_entry *de = PROC_I(inode)->pde;
+ struct proc_dir_entry *de = PDE(inode);
if (de && de->nlink)
set_nlink(inode, de->nlink);
@@ -92,10 +156,7 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
break;
len = next - cp;
- for (de = de->subdir; de ; de = de->next) {
- if (proc_match(len, cp, de))
- break;
- }
+ de = pde_subdir_find(de, cp, len);
if (!de) {
WARN(1, "name '%s'\n", name);
return -ENOENT;
@@ -183,19 +244,16 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
struct inode *inode;
spin_lock(&proc_subdir_lock);
- for (de = de->subdir; de ; de = de->next) {
- if (de->namelen != dentry->d_name.len)
- continue;
- if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
- pde_get(de);
- spin_unlock(&proc_subdir_lock);
- inode = proc_get_inode(dir->i_sb, de);
- if (!inode)
- return ERR_PTR(-ENOMEM);
- d_set_d_op(dentry, &simple_dentry_operations);
- d_add(dentry, inode);
- return NULL;
- }
+ de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len);
+ if (de) {
+ pde_get(de);
+ spin_unlock(&proc_subdir_lock);
+ inode = proc_get_inode(dir->i_sb, de);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ d_set_d_op(dentry, &simple_dentry_operations);
+ d_add(dentry, inode);
+ return NULL;
}
spin_unlock(&proc_subdir_lock);
return ERR_PTR(-ENOENT);
@@ -225,7 +283,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
return 0;
spin_lock(&proc_subdir_lock);
- de = de->subdir;
+ de = pde_subdir_first(de);
i = ctx->pos - 2;
for (;;) {
if (!de) {
@@ -234,7 +292,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
}
if (!i)
break;
- de = de->next;
+ de = pde_subdir_next(de);
i--;
}
@@ -249,7 +307,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
}
spin_lock(&proc_subdir_lock);
ctx->pos++;
- next = de->next;
+ next = pde_subdir_next(de);
pde_put(de);
de = next;
} while (de);
@@ -286,39 +344,21 @@ static const struct inode_operations proc_dir_inode_operations = {
static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
{
- struct proc_dir_entry *tmp;
int ret;
-
+
ret = proc_alloc_inum(&dp->low_ino);
if (ret)
return ret;
- if (S_ISDIR(dp->mode)) {
- dp->proc_fops = &proc_dir_operations;
- dp->proc_iops = &proc_dir_inode_operations;
- dir->nlink++;
- } else if (S_ISLNK(dp->mode)) {
- dp->proc_iops = &proc_link_inode_operations;
- } else if (S_ISREG(dp->mode)) {
- BUG_ON(dp->proc_fops == NULL);
- dp->proc_iops = &proc_file_inode_operations;
- } else {
- WARN_ON(1);
- return -EINVAL;
- }
-
spin_lock(&proc_subdir_lock);
-
- for (tmp = dir->subdir; tmp; tmp = tmp->next)
- if (strcmp(tmp->name, dp->name) == 0) {
- WARN(1, "proc_dir_entry '%s/%s' already registered\n",
- dir->name, dp->name);
- break;
- }
-
- dp->next = dir->subdir;
dp->parent = dir;
- dir->subdir = dp;
+ if (pde_subdir_insert(dir, dp) == false) {
+ WARN(1, "proc_dir_entry '%s/%s' already registered\n",
+ dir->name, dp->name);
+ spin_unlock(&proc_subdir_lock);
+ proc_free_inum(dp->low_ino);
+ return -EEXIST;
+ }
spin_unlock(&proc_subdir_lock);
return 0;
@@ -354,6 +394,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
ent->namelen = qstr.len;
ent->mode = mode;
ent->nlink = nlink;
+ ent->subdir = RB_ROOT;
atomic_set(&ent->count, 1);
spin_lock_init(&ent->pde_unload_lock);
INIT_LIST_HEAD(&ent->pde_openers);
@@ -373,6 +414,7 @@ struct proc_dir_entry *proc_symlink(const char *name,
ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL);
if (ent->data) {
strcpy((char*)ent->data,dest);
+ ent->proc_iops = &proc_link_inode_operations;
if (proc_register(parent, ent) < 0) {
kfree(ent->data);
kfree(ent);
@@ -398,8 +440,12 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
ent = __proc_create(&parent, name, S_IFDIR | mode, 2);
if (ent) {
ent->data = data;
+ ent->proc_fops = &proc_dir_operations;
+ ent->proc_iops = &proc_dir_inode_operations;
+ parent->nlink++;
if (proc_register(parent, ent) < 0) {
kfree(ent);
+ parent->nlink--;
ent = NULL;
}
}
@@ -435,6 +481,8 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
return NULL;
}
+ BUG_ON(proc_fops == NULL);
+
if ((mode & S_IALLUGO) == 0)
mode |= S_IRUGO;
pde = __proc_create(&parent, name, mode, 1);
@@ -442,6 +490,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
goto out;
pde->proc_fops = proc_fops;
pde->data = data;
+ pde->proc_iops = &proc_file_inode_operations;
if (proc_register(parent, pde) < 0)
goto out_free;
return pde;
@@ -485,7 +534,6 @@ void pde_put(struct proc_dir_entry *pde)
*/
void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
{
- struct proc_dir_entry **p;
struct proc_dir_entry *de = NULL;
const char *fn = name;
unsigned int len;
@@ -497,14 +545,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
}
len = strlen(fn);
- for (p = &parent->subdir; *p; p=&(*p)->next ) {
- if (proc_match(len, fn, *p)) {
- de = *p;
- *p = de->next;
- de->next = NULL;
- break;
- }
- }
+ de = pde_subdir_find(parent, fn, len);
+ if (de)
+ rb_erase(&de->subdir_node, &parent->subdir);
spin_unlock(&proc_subdir_lock);
if (!de) {
WARN(1, "name '%s'\n", name);
@@ -516,16 +559,15 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
if (S_ISDIR(de->mode))
parent->nlink--;
de->nlink = 0;
- WARN(de->subdir, "%s: removing non-empty directory "
- "'%s/%s', leaking at least '%s'\n", __func__,
- de->parent->name, de->name, de->subdir->name);
+ WARN(pde_subdir_first(de),
+ "%s: removing non-empty directory '%s/%s', leaking at least '%s'\n",
+ __func__, de->parent->name, de->name, pde_subdir_first(de)->name);
pde_put(de);
}
EXPORT_SYMBOL(remove_proc_entry);
int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
{
- struct proc_dir_entry **p;
struct proc_dir_entry *root = NULL, *de, *next;
const char *fn = name;
unsigned int len;
@@ -537,24 +579,18 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
}
len = strlen(fn);
- for (p = &parent->subdir; *p; p=&(*p)->next ) {
- if (proc_match(len, fn, *p)) {
- root = *p;
- *p = root->next;
- root->next = NULL;
- break;
- }
- }
+ root = pde_subdir_find(parent, fn, len);
if (!root) {
spin_unlock(&proc_subdir_lock);
return -ENOENT;
}
+ rb_erase(&root->subdir_node, &parent->subdir);
+
de = root;
while (1) {
- next = de->subdir;
+ next = pde_subdir_first(de);
if (next) {
- de->subdir = next->next;
- next->next = NULL;
+ rb_erase(&next->subdir_node, &de->subdir);
de = next;
continue;
}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 333080d7a671..13a50a32652d 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -32,8 +32,6 @@ static void proc_evict_inode(struct inode *inode)
{
struct proc_dir_entry *de;
struct ctl_table_header *head;
- const struct proc_ns_operations *ns_ops;
- void *ns;
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
@@ -42,7 +40,7 @@ static void proc_evict_inode(struct inode *inode)
put_pid(PROC_I(inode)->pid);
/* Let go of any associated proc directory entry */
- de = PROC_I(inode)->pde;
+ de = PDE(inode);
if (de)
pde_put(de);
head = PROC_I(inode)->sysctl;
@@ -50,11 +48,6 @@ static void proc_evict_inode(struct inode *inode)
RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
sysctl_head_put(head);
}
- /* Release any associated namespace */
- ns_ops = PROC_I(inode)->ns.ns_ops;
- ns = PROC_I(inode)->ns.ns;
- if (ns_ops && ns)
- ns_ops->put(ns);
}
static struct kmem_cache * proc_inode_cachep;
@@ -73,8 +66,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
ei->pde = NULL;
ei->sysctl = NULL;
ei->sysctl_entry = NULL;
- ei->ns.ns = NULL;
- ei->ns.ns_ops = NULL;
+ ei->ns_ops = NULL;
inode = &ei->vfs_inode;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
return inode;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index aa7a0ee182e1..6fcdba573e0f 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -24,10 +24,9 @@ struct mempolicy;
* tree) of these proc_dir_entries, so that we can dynamically
* add new files to /proc.
*
- * The "next" pointer creates a linked list of one /proc directory,
- * while parent/subdir create the directory structure (every
- * /proc file has a parent, but "subdir" is NULL for all
- * non-directory entries).
+ * parent/subdir are used for the directory structure (every /proc file has a
+ * parent, but "subdir" is empty for all non-directory entries).
+ * subdir_node is used to build the rb tree "subdir" of the parent.
*/
struct proc_dir_entry {
unsigned int low_ino;
@@ -38,7 +37,9 @@ struct proc_dir_entry {
loff_t size;
const struct inode_operations *proc_iops;
const struct file_operations *proc_fops;
- struct proc_dir_entry *next, *parent, *subdir;
+ struct proc_dir_entry *parent;
+ struct rb_root subdir;
+ struct rb_node subdir_node;
void *data;
atomic_t count; /* use count */
atomic_t in_use; /* number of callers into module in progress; */
@@ -64,7 +65,7 @@ struct proc_inode {
struct proc_dir_entry *pde;
struct ctl_table_header *sysctl;
struct ctl_table *sysctl_entry;
- struct proc_ns ns;
+ const struct proc_ns_operations *ns_ops;
struct inode vfs_inode;
};
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index aa1eee06420f..d3ebf2e61853 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -12,6 +12,9 @@
#include <linux/vmstat.h>
#include <linux/atomic.h>
#include <linux/vmalloc.h>
+#ifdef CONFIG_CMA
+#include <linux/cma.h>
+#endif
#include <asm/page.h>
#include <asm/pgtable.h>
#include "internal.h"
@@ -138,6 +141,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
"AnonHugePages: %8lu kB\n"
#endif
+#ifdef CONFIG_CMA
+ "CmaTotal: %8lu kB\n"
+ "CmaFree: %8lu kB\n"
+#endif
,
K(i.totalram),
K(i.freeram),
@@ -187,12 +194,16 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
vmi.used >> 10,
vmi.largest_chunk >> 10
#ifdef CONFIG_MEMORY_FAILURE
- ,atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
+ , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
+ , K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
HPAGE_PMD_NR)
#endif
+#ifdef CONFIG_CMA
+ , K(totalcma_pages)
+ , K(global_page_state(NR_FREE_CMA_PAGES))
+#endif
);
hugetlb_report_meminfo(m);
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 89026095f2b5..c9eac4563fa8 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -1,10 +1,6 @@
#include <linux/proc_fs.h>
#include <linux/nsproxy.h>
-#include <linux/sched.h>
#include <linux/ptrace.h>
-#include <linux/fs_struct.h>
-#include <linux/mount.h>
-#include <linux/path.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/utsname.h>
@@ -34,138 +30,45 @@ static const struct proc_ns_operations *ns_entries[] = {
&mntns_operations,
};
-static const struct file_operations ns_file_operations = {
- .llseek = no_llseek,
-};
-
-static const struct inode_operations ns_inode_operations = {
- .setattr = proc_setattr,
-};
-
-static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
-{
- struct inode *inode = dentry->d_inode;
- const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns.ns_ops;
-
- return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]",
- ns_ops->name, inode->i_ino);
-}
-
-const struct dentry_operations ns_dentry_operations =
-{
- .d_delete = always_delete_dentry,
- .d_dname = ns_dname,
-};
-
-static struct dentry *proc_ns_get_dentry(struct super_block *sb,
- struct task_struct *task, const struct proc_ns_operations *ns_ops)
-{
- struct dentry *dentry, *result;
- struct inode *inode;
- struct proc_inode *ei;
- struct qstr qname = { .name = "", };
- void *ns;
-
- ns = ns_ops->get(task);
- if (!ns)
- return ERR_PTR(-ENOENT);
-
- dentry = d_alloc_pseudo(sb, &qname);
- if (!dentry) {
- ns_ops->put(ns);
- return ERR_PTR(-ENOMEM);
- }
-
- inode = iget_locked(sb, ns_ops->inum(ns));
- if (!inode) {
- dput(dentry);
- ns_ops->put(ns);
- return ERR_PTR(-ENOMEM);
- }
-
- ei = PROC_I(inode);
- if (inode->i_state & I_NEW) {
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
- inode->i_op = &ns_inode_operations;
- inode->i_mode = S_IFREG | S_IRUGO;
- inode->i_fop = &ns_file_operations;
- ei->ns.ns_ops = ns_ops;
- ei->ns.ns = ns;
- unlock_new_inode(inode);
- } else {
- ns_ops->put(ns);
- }
-
- d_set_d_op(dentry, &ns_dentry_operations);
- result = d_instantiate_unique(dentry, inode);
- if (result) {
- dput(dentry);
- dentry = result;
- }
-
- return dentry;
-}
-
static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)
{
struct inode *inode = dentry->d_inode;
- struct super_block *sb = inode->i_sb;
- struct proc_inode *ei = PROC_I(inode);
+ const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
struct task_struct *task;
struct path ns_path;
void *error = ERR_PTR(-EACCES);
task = get_proc_task(inode);
if (!task)
- goto out;
+ return error;
- if (!ptrace_may_access(task, PTRACE_MODE_READ))
- goto out_put_task;
-
- ns_path.dentry = proc_ns_get_dentry(sb, task, ei->ns.ns_ops);
- if (IS_ERR(ns_path.dentry)) {
- error = ERR_CAST(ns_path.dentry);
- goto out_put_task;
+ if (ptrace_may_access(task, PTRACE_MODE_READ)) {
+ error = ns_get_path(&ns_path, task, ns_ops);
+ if (!error)
+ nd_jump_link(nd, &ns_path);
}
-
- ns_path.mnt = mntget(nd->path.mnt);
- nd_jump_link(nd, &ns_path);
- error = NULL;
-
-out_put_task:
put_task_struct(task);
-out:
return error;
}
static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
struct inode *inode = dentry->d_inode;
- struct proc_inode *ei = PROC_I(inode);
- const struct proc_ns_operations *ns_ops = ei->ns.ns_ops;
+ const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
struct task_struct *task;
- void *ns;
char name[50];
int res = -EACCES;
task = get_proc_task(inode);
if (!task)
- goto out;
-
- if (!ptrace_may_access(task, PTRACE_MODE_READ))
- goto out_put_task;
+ return res;
- res = -ENOENT;
- ns = ns_ops->get(task);
- if (!ns)
- goto out_put_task;
-
- snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns));
- res = readlink_copy(buffer, buflen, name);
- ns_ops->put(ns);
-out_put_task:
+ if (ptrace_may_access(task, PTRACE_MODE_READ)) {
+ res = ns_get_name(name, sizeof(name), task, ns_ops);
+ if (res >= 0)
+ res = readlink_copy(buffer, buflen, name);
+ }
put_task_struct(task);
-out:
return res;
}
@@ -189,7 +92,7 @@ static int proc_ns_instantiate(struct inode *dir,
ei = PROC_I(inode);
inode->i_mode = S_IFLNK|S_IRWXUGO;
inode->i_op = &proc_ns_link_inode_operations;
- ei->ns.ns_ops = ns_ops;
+ ei->ns_ops = ns_ops;
d_set_d_op(dentry, &pid_dentry_operations);
d_add(dentry, inode);
@@ -267,31 +170,3 @@ const struct inode_operations proc_ns_dir_inode_operations = {
.getattr = pid_getattr,
.setattr = proc_setattr,
};
-
-struct file *proc_ns_fget(int fd)
-{
- struct file *file;
-
- file = fget(fd);
- if (!file)
- return ERR_PTR(-EBADF);
-
- if (file->f_op != &ns_file_operations)
- goto out_invalid;
-
- return file;
-
-out_invalid:
- fput(file);
- return ERR_PTR(-EINVAL);
-}
-
-struct proc_ns *get_proc_ns(struct inode *inode)
-{
- return &PROC_I(inode)->ns;
-}
-
-bool proc_ns_inode(struct inode *inode)
-{
- return inode->i_fop == &ns_file_operations;
-}
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 1e3187da1fed..7eee2d8b97d9 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -5,6 +5,7 @@
#include <linux/ksm.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
+#include <linux/huge_mm.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/hugetlb.h>
@@ -121,9 +122,18 @@ u64 stable_page_flags(struct page *page)
* just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon
* to make sure a given page is a thp, not a non-huge compound page.
*/
- else if (PageTransCompound(page) && (PageLRU(compound_head(page)) ||
- PageAnon(compound_head(page))))
- u |= 1 << KPF_THP;
+ else if (PageTransCompound(page)) {
+ struct page *head = compound_head(page);
+
+ if (PageLRU(head) || PageAnon(head))
+ u |= 1 << KPF_THP;
+ else if (is_huge_zero_page(head)) {
+ u |= 1 << KPF_ZERO_PAGE;
+ u |= 1 << KPF_THP;
+ }
+ } else if (is_zero_pfn(page_to_pfn(page)))
+ u |= 1 << KPF_ZERO_PAGE;
+
/*
* Caveats on high order pages: page->_count will only be set
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index a63af3e0a612..1bde894bc624 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -192,6 +192,7 @@ static __net_init int proc_net_ns_init(struct net *net)
if (!netd)
goto out;
+ netd->subdir = RB_ROOT;
netd->data = net;
netd->nlink = 2;
netd->namelen = 3;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 094e44d4a6be..e74ac9f1a2c0 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -251,6 +251,7 @@ struct proc_dir_entry proc_root = {
.proc_iops = &proc_root_inode_operations,
.proc_fops = &proc_root_operations,
.parent = &proc_root,
+ .subdir = RB_ROOT,
.name = "/proc",
};
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index bf2d03f8fd3e..510413eb25b8 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -159,7 +159,7 @@ static int show_stat(struct seq_file *p, void *v)
/* sum again ? it could be updated? */
for_each_irq_nr(j)
- seq_put_decimal_ull(p, ' ', kstat_irqs(j));
+ seq_put_decimal_ull(p, ' ', kstat_irqs_usr(j));
seq_printf(p,
"\nctxt %llu\n"
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 4e0388cffe3d..956b75d61809 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -21,7 +21,7 @@
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
- unsigned long data, text, lib, swap;
+ unsigned long data, text, lib, swap, ptes, pmds;
unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
/*
@@ -42,6 +42,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
swap = get_mm_counter(mm, MM_SWAPENTS);
+ ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes);
+ pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
seq_printf(m,
"VmPeak:\t%8lu kB\n"
"VmSize:\t%8lu kB\n"
@@ -54,6 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
"VmExe:\t%8lu kB\n"
"VmLib:\t%8lu kB\n"
"VmPTE:\t%8lu kB\n"
+ "VmPMD:\t%8lu kB\n"
"VmSwap:\t%8lu kB\n",
hiwater_vm << (PAGE_SHIFT-10),
total_vm << (PAGE_SHIFT-10),
@@ -63,8 +66,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
total_rss << (PAGE_SHIFT-10),
data << (PAGE_SHIFT-10),
mm->stack_vm << (PAGE_SHIFT-10), text, lib,
- (PTRS_PER_PTE * sizeof(pte_t) *
- atomic_long_read(&mm->nr_ptes)) >> 10,
+ ptes >> 10,
+ pmds >> 10,
swap << (PAGE_SHIFT-10));
}
@@ -433,7 +436,6 @@ const struct file_operations proc_tid_maps_operations = {
#ifdef CONFIG_PROC_PAGE_MONITOR
struct mem_size_stats {
- struct vm_area_struct *vma;
unsigned long resident;
unsigned long shared_clean;
unsigned long shared_dirty;
@@ -443,75 +445,97 @@ struct mem_size_stats {
unsigned long anonymous;
unsigned long anonymous_thp;
unsigned long swap;
- unsigned long nonlinear;
u64 pss;
};
+static void smaps_account(struct mem_size_stats *mss, struct page *page,
+ unsigned long size, bool young, bool dirty)
+{
+ int mapcount;
+
+ if (PageAnon(page))
+ mss->anonymous += size;
+
+ mss->resident += size;
+ /* Accumulate the size in pages that have been accessed. */
+ if (young || PageReferenced(page))
+ mss->referenced += size;
+ mapcount = page_mapcount(page);
+ if (mapcount >= 2) {
+ u64 pss_delta;
+
+ if (dirty || PageDirty(page))
+ mss->shared_dirty += size;
+ else
+ mss->shared_clean += size;
+ pss_delta = (u64)size << PSS_SHIFT;
+ do_div(pss_delta, mapcount);
+ mss->pss += pss_delta;
+ } else {
+ if (dirty || PageDirty(page))
+ mss->private_dirty += size;
+ else
+ mss->private_clean += size;
+ mss->pss += (u64)size << PSS_SHIFT;
+ }
+}
-static void smaps_pte_entry(pte_t ptent, unsigned long addr,
- unsigned long ptent_size, struct mm_walk *walk)
+static void smaps_pte_entry(pte_t *pte, unsigned long addr,
+ struct mm_walk *walk)
{
struct mem_size_stats *mss = walk->private;
- struct vm_area_struct *vma = mss->vma;
- pgoff_t pgoff = linear_page_index(vma, addr);
+ struct vm_area_struct *vma = walk->vma;
struct page *page = NULL;
- int mapcount;
- if (pte_present(ptent)) {
- page = vm_normal_page(vma, addr, ptent);
- } else if (is_swap_pte(ptent)) {
- swp_entry_t swpent = pte_to_swp_entry(ptent);
+ if (pte_present(*pte)) {
+ page = vm_normal_page(vma, addr, *pte);
+ } else if (is_swap_pte(*pte)) {
+ swp_entry_t swpent = pte_to_swp_entry(*pte);
if (!non_swap_entry(swpent))
- mss->swap += ptent_size;
+ mss->swap += PAGE_SIZE;
else if (is_migration_entry(swpent))
page = migration_entry_to_page(swpent);
- } else if (pte_file(ptent)) {
- if (pte_to_pgoff(ptent) != pgoff)
- mss->nonlinear += ptent_size;
}
if (!page)
return;
+ smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte));
+}
- if (PageAnon(page))
- mss->anonymous += ptent_size;
-
- if (page->index != pgoff)
- mss->nonlinear += ptent_size;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
+ struct mm_walk *walk)
+{
+ struct mem_size_stats *mss = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ struct page *page;
- mss->resident += ptent_size;
- /* Accumulate the size in pages that have been accessed. */
- if (pte_young(ptent) || PageReferenced(page))
- mss->referenced += ptent_size;
- mapcount = page_mapcount(page);
- if (mapcount >= 2) {
- if (pte_dirty(ptent) || PageDirty(page))
- mss->shared_dirty += ptent_size;
- else
- mss->shared_clean += ptent_size;
- mss->pss += (ptent_size << PSS_SHIFT) / mapcount;
- } else {
- if (pte_dirty(ptent) || PageDirty(page))
- mss->private_dirty += ptent_size;
- else
- mss->private_clean += ptent_size;
- mss->pss += (ptent_size << PSS_SHIFT);
- }
+ /* FOLL_DUMP will return -EFAULT on huge zero page */
+ page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
+ if (IS_ERR_OR_NULL(page))
+ return;
+ mss->anonymous_thp += HPAGE_PMD_SIZE;
+ smaps_account(mss, page, HPAGE_PMD_SIZE,
+ pmd_young(*pmd), pmd_dirty(*pmd));
+}
+#else
+static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
+ struct mm_walk *walk)
+{
}
+#endif
static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
- struct mem_size_stats *mss = walk->private;
- struct vm_area_struct *vma = mss->vma;
+ struct vm_area_struct *vma = walk->vma;
pte_t *pte;
spinlock_t *ptl;
if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
- smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);
+ smaps_pmd_entry(pmd, addr, walk);
spin_unlock(ptl);
- mss->anonymous_thp += HPAGE_PMD_SIZE;
return 0;
}
@@ -524,7 +548,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
*/
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE)
- smaps_pte_entry(*pte, addr, PAGE_SIZE, walk);
+ smaps_pte_entry(pte, addr, walk);
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
return 0;
@@ -552,6 +576,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_GROWSDOWN)] = "gd",
[ilog2(VM_PFNMAP)] = "pf",
[ilog2(VM_DENYWRITE)] = "dw",
+#ifdef CONFIG_X86_INTEL_MPX
+ [ilog2(VM_MPX)] = "mp",
+#endif
[ilog2(VM_LOCKED)] = "lo",
[ilog2(VM_IO)] = "io",
[ilog2(VM_SEQ_READ)] = "sr",
@@ -561,7 +588,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_ACCOUNT)] = "ac",
[ilog2(VM_NORESERVE)] = "nr",
[ilog2(VM_HUGETLB)] = "ht",
- [ilog2(VM_NONLINEAR)] = "nl",
[ilog2(VM_ARCH_1)] = "ar",
[ilog2(VM_DONTDUMP)] = "dd",
#ifdef CONFIG_MEM_SOFT_DIRTY
@@ -595,10 +621,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
};
memset(&mss, 0, sizeof mss);
- mss.vma = vma;
/* mmap_sem is held in m_start */
- if (vma->vm_mm && !is_vm_hugetlb_page(vma))
- walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
+ walk_page_vma(vma, &smaps_walk);
show_map_vma(m, vma, is_pid);
@@ -633,10 +657,6 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
(vma->vm_flags & VM_LOCKED) ?
(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
- if (vma->vm_flags & VM_NONLINEAR)
- seq_printf(m, "Nonlinear: %8lu kB\n",
- mss.nonlinear >> 10);
-
show_smap_vma_flags(m, vma);
m_cache_vma(m, vma);
return 0;
@@ -712,18 +732,18 @@ enum clear_refs_types {
CLEAR_REFS_ANON,
CLEAR_REFS_MAPPED,
CLEAR_REFS_SOFT_DIRTY,
+ CLEAR_REFS_MM_HIWATER_RSS,
CLEAR_REFS_LAST,
};
struct clear_refs_private {
- struct vm_area_struct *vma;
enum clear_refs_types type;
};
+#ifdef CONFIG_MEM_SOFT_DIRTY
static inline void clear_soft_dirty(struct vm_area_struct *vma,
unsigned long addr, pte_t *pte)
{
-#ifdef CONFIG_MEM_SOFT_DIRTY
/*
* The soft-dirty tracker uses #PF-s to catch writes
* to pages, so write-protect the pte as well. See the
@@ -737,24 +757,63 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
} else if (is_swap_pte(ptent)) {
ptent = pte_swp_clear_soft_dirty(ptent);
- } else if (pte_file(ptent)) {
- ptent = pte_file_clear_soft_dirty(ptent);
}
set_pte_at(vma->vm_mm, addr, pte, ptent);
-#endif
}
+static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t pmd = *pmdp;
+
+ pmd = pmd_wrprotect(pmd);
+ pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
+
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ vma->vm_flags &= ~VM_SOFTDIRTY;
+
+ set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+}
+
+#else
+
+static inline void clear_soft_dirty(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *pte)
+{
+}
+
+static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+}
+#endif
+
static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
struct clear_refs_private *cp = walk->private;
- struct vm_area_struct *vma = cp->vma;
+ struct vm_area_struct *vma = walk->vma;
pte_t *pte, ptent;
spinlock_t *ptl;
struct page *page;
- split_huge_page_pmd(vma, addr, pmd);
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
+ clear_soft_dirty_pmd(vma, addr, pmd);
+ goto out;
+ }
+
+ page = pmd_page(*pmd);
+
+ /* Clear accessed and referenced bits. */
+ pmdp_test_and_clear_young(vma, addr, pmd);
+ ClearPageReferenced(page);
+out:
+ spin_unlock(ptl);
+ return 0;
+ }
+
if (pmd_trans_unstable(pmd))
return 0;
@@ -783,6 +842,28 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
}
+static int clear_refs_test_walk(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct clear_refs_private *cp = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+
+ if (vma->vm_flags & VM_PFNMAP)
+ return 1;
+
+ /*
+ * Writing 1 to /proc/pid/clear_refs affects all pages.
+ * Writing 2 to /proc/pid/clear_refs only affects anonymous pages.
+ * Writing 3 to /proc/pid/clear_refs only affects file mapped pages.
+ * Writing 4 to /proc/pid/clear_refs affects all pages.
+ */
+ if (cp->type == CLEAR_REFS_ANON && vma->vm_file)
+ return 1;
+ if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file)
+ return 1;
+ return 0;
+}
+
static ssize_t clear_refs_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
@@ -823,9 +904,22 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
};
struct mm_walk clear_refs_walk = {
.pmd_entry = clear_refs_pte_range,
+ .test_walk = clear_refs_test_walk,
.mm = mm,
.private = &cp,
};
+
+ if (type == CLEAR_REFS_MM_HIWATER_RSS) {
+ /*
+ * Writing 5 to /proc/pid/clear_refs resets the peak
+ * resident set size to this mm's current rss value.
+ */
+ down_write(&mm->mmap_sem);
+ reset_mm_hiwater_rss(mm);
+ up_write(&mm->mmap_sem);
+ goto out_mm;
+ }
+
down_read(&mm->mmap_sem);
if (type == CLEAR_REFS_SOFT_DIRTY) {
for (vma = mm->mmap; vma; vma = vma->vm_next) {
@@ -842,32 +936,12 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
}
mmu_notifier_invalidate_range_start(mm, 0, -1);
}
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- cp.vma = vma;
- if (is_vm_hugetlb_page(vma))
- continue;
- /*
- * Writing 1 to /proc/pid/clear_refs affects all pages.
- *
- * Writing 2 to /proc/pid/clear_refs only affects
- * Anonymous pages.
- *
- * Writing 3 to /proc/pid/clear_refs only affects file
- * mapped pages.
- *
- * Writing 4 to /proc/pid/clear_refs affects all pages.
- */
- if (type == CLEAR_REFS_ANON && vma->vm_file)
- continue;
- if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
- continue;
- walk_page_range(vma->vm_start, vma->vm_end,
- &clear_refs_walk);
- }
+ walk_page_range(0, ~0UL, &clear_refs_walk);
if (type == CLEAR_REFS_SOFT_DIRTY)
mmu_notifier_invalidate_range_end(mm, 0, -1);
flush_tlb_mm(mm);
up_read(&mm->mmap_sem);
+out_mm:
mmput(mm);
}
put_task_struct(task);
@@ -1031,15 +1105,13 @@ static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemap
static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma = walk->vma;
struct pagemapread *pm = walk->private;
spinlock_t *ptl;
- pte_t *pte;
+ pte_t *pte, *orig_pte;
int err = 0;
- /* find the first VMA at or above 'addr' */
- vma = find_vma(walk->mm, addr);
- if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
int pmd_flags2;
if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
@@ -1065,51 +1137,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (pmd_trans_unstable(pmd))
return 0;
- while (1) {
- /* End of address space hole, which we mark as non-present. */
- unsigned long hole_end;
-
- if (vma)
- hole_end = min(end, vma->vm_start);
- else
- hole_end = end;
-
- for (; addr < hole_end; addr += PAGE_SIZE) {
- pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
-
- err = add_to_pagemap(addr, &pme, pm);
- if (err)
- return err;
- }
-
- if (!vma || vma->vm_start >= end)
- break;
- /*
- * We can't possibly be in a hugetlb VMA. In general,
- * for a mm_walk with a pmd_entry and a hugetlb_entry,
- * the pmd_entry can only be called on addresses in a
- * hugetlb if the walk starts in a non-hugetlb VMA and
- * spans a hugepage VMA. Since pagemap_read walks are
- * PMD-sized and PMD-aligned, this will never be true.
- */
- BUG_ON(is_vm_hugetlb_page(vma));
-
- /* Addresses in the VMA. */
- for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
- pagemap_entry_t pme;
- pte = pte_offset_map(pmd, addr);
- pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
- pte_unmap(pte);
- err = add_to_pagemap(addr, &pme, pm);
- if (err)
- return err;
- }
+ /*
+ * We can assume that @vma always points to a valid one and @end never
+ * goes beyond vma->vm_end.
+ */
+ orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ for (; addr < end; pte++, addr += PAGE_SIZE) {
+ pagemap_entry_t pme;
- if (addr == end)
+ pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
+ err = add_to_pagemap(addr, &pme, pm);
+ if (err)
break;
-
- vma = find_vma(walk->mm, addr);
}
+ pte_unmap_unlock(orig_pte, ptl);
cond_resched();
@@ -1135,15 +1176,12 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
struct mm_walk *walk)
{
struct pagemapread *pm = walk->private;
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma = walk->vma;
int err = 0;
int flags2;
pagemap_entry_t pme;
- vma = find_vma(walk->mm, addr);
- WARN_ON_ONCE(!vma);
-
- if (vma && (vma->vm_flags & VM_SOFTDIRTY))
+ if (vma->vm_flags & VM_SOFTDIRTY)
flags2 = __PM_SOFT_DIRTY;
else
flags2 = 0;
@@ -1303,7 +1341,6 @@ const struct file_operations proc_pagemap_operations = {
#ifdef CONFIG_NUMA
struct numa_maps {
- struct vm_area_struct *vma;
unsigned long pages;
unsigned long anon;
unsigned long active;
@@ -1372,18 +1409,17 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
- struct numa_maps *md;
+ struct numa_maps *md = walk->private;
+ struct vm_area_struct *vma = walk->vma;
spinlock_t *ptl;
pte_t *orig_pte;
pte_t *pte;
- md = walk->private;
-
- if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
pte_t huge_pte = *(pte_t *)pmd;
struct page *page;
- page = can_gather_numa_stats(huge_pte, md->vma, addr);
+ page = can_gather_numa_stats(huge_pte, vma, addr);
if (page)
gather_stats(page, md, pte_dirty(huge_pte),
HPAGE_PMD_SIZE/PAGE_SIZE);
@@ -1395,7 +1431,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
return 0;
orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
do {
- struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
+ struct page *page = can_gather_numa_stats(*pte, vma, addr);
if (!page)
continue;
gather_stats(page, md, pte_dirty(*pte), 1);
@@ -1405,7 +1441,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
return 0;
}
#ifdef CONFIG_HUGETLB_PAGE
-static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
+static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end, struct mm_walk *walk)
{
struct numa_maps *md;
@@ -1424,7 +1460,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
}
#else
-static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
+static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end, struct mm_walk *walk)
{
return 0;
@@ -1442,7 +1478,12 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
struct numa_maps *md = &numa_priv->md;
struct file *file = vma->vm_file;
struct mm_struct *mm = vma->vm_mm;
- struct mm_walk walk = {};
+ struct mm_walk walk = {
+ .hugetlb_entry = gather_hugetlb_stats,
+ .pmd_entry = gather_pte_stats,
+ .private = md,
+ .mm = mm,
+ };
struct mempolicy *pol;
char buffer[64];
int nid;
@@ -1453,13 +1494,6 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
/* Ensure we start with an empty set of numa_maps statistics. */
memset(md, 0, sizeof(*md));
- md->vma = vma;
-
- walk.hugetlb_entry = gather_hugetbl_stats;
- walk.pmd_entry = gather_pte_stats;
- walk.private = md;
- walk.mm = mm;
-
pol = __get_vma_policy(vma, vma->vm_start);
if (pol) {
mpol_to_str(buffer, sizeof(buffer), pol);
@@ -1493,7 +1527,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
if (is_vm_hugetlb_page(vma))
seq_puts(m, " huge");
- walk_page_range(vma->vm_start, vma->vm_end, &walk);
+ /* mmap_sem is held by m_start */
+ walk_page_vma(vma, &walk);
if (!md->pages)
goto out;
@@ -1522,6 +1557,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
for_each_node_state(nid, N_MEMORY)
if (md->node[nid])
seq_printf(m, " N%d=%lu", nid, md->node[nid]);
+
+ seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
out:
seq_putc(m, '\n');
m_cache_vma(m, vma);
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index a90d6d354199..4e61388ec03d 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -546,8 +546,8 @@ static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
nhdr_ptr = notes_section;
while (nhdr_ptr->n_namesz != 0) {
sz = sizeof(Elf64_Nhdr) +
- ((nhdr_ptr->n_namesz + 3) & ~3) +
- ((nhdr_ptr->n_descsz + 3) & ~3);
+ (((u64)nhdr_ptr->n_namesz + 3) & ~3) +
+ (((u64)nhdr_ptr->n_descsz + 3) & ~3);
if ((real_sz + sz) > max_sz) {
pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n",
nhdr_ptr->n_namesz, nhdr_ptr->n_descsz);
@@ -732,8 +732,8 @@ static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
nhdr_ptr = notes_section;
while (nhdr_ptr->n_namesz != 0) {
sz = sizeof(Elf32_Nhdr) +
- ((nhdr_ptr->n_namesz + 3) & ~3) +
- ((nhdr_ptr->n_descsz + 3) & ~3);
+ (((u64)nhdr_ptr->n_namesz + 3) & ~3) +
+ (((u64)nhdr_ptr->n_descsz + 3) & ~3);
if ((real_sz + sz) > max_sz) {
pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n",
nhdr_ptr->n_namesz, nhdr_ptr->n_descsz);