From c6c405336bd3b0ebd1d76aaf9ea88b35dba77e61 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 16 Jul 2019 16:26:39 -0700 Subject: vmcore: add a kernel parameter novmcoredd Since commit 2724273e8fd0 ("vmcore: add API to collect hardware dump in second kernel"), drivers are allowed to add device related dump data to vmcore as they want by using the device dump API. This has a potential issue, the data is stored in memory, drivers may append too much data and use too much memory. The vmcore is typically used in a kdump kernel which runs in a pre-reserved small chunk of memory. So as a result it will make kdump unusable at all due to OOM issues. So introduce new 'novmcoredd' command line option. User can disable device dump to reduce memory usage. This is helpful if device dump is using too much memory, disabling device dump could make sure a regular vmcore without device dump data is still available. [akpm@linux-foundation.org: tweak documentation] [akpm@linux-foundation.org: vmcore.c needs moduleparam.h] Link: http://lkml.kernel.org/r/20190528111856.7276-1-kasong@redhat.com Signed-off-by: Kairui Song Acked-by: Dave Young Reviewed-by: Bhupesh Sharma Cc: Rahul Lakkireddy Cc: "David S . Miller" Cc: Eric Biederman Cc: Alexey Dobriyan Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/Kconfig | 3 ++- fs/proc/vmcore.c | 9 +++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 4c3dcb718961..cba429db95d9 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -58,7 +58,8 @@ config PROC_VMCORE_DEVICE_DUMP snapshot. If you say Y here, the collected device dumps will be added - as ELF notes to /proc/vmcore. + as ELF notes to /proc/vmcore. You can still disable device + dump using the kernel command line option 'novmcoredd'. config PROC_SYSCTL bool "Sysctl support (/proc/sys)" if EXPERT diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 57957c91c6df..7bcc92add72c 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -54,6 +55,9 @@ static struct proc_dir_entry *proc_vmcore; /* Device Dump list and mutex to synchronize access to list */ static LIST_HEAD(vmcoredd_list); static DEFINE_MUTEX(vmcoredd_mutex); + +static bool vmcoredd_disabled; +core_param(novmcoredd, vmcoredd_disabled, bool, 0); #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ /* Device Dump Size */ @@ -1452,6 +1456,11 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) size_t data_size; int ret; + if (vmcoredd_disabled) { + pr_err_once("Device dump is disabled\n"); + return -EINVAL; + } + if (!data || !strlen(data->dump_name) || !data->vmcoredd_callback || !data->size) return -EINVAL; -- cgit v1.2.3-59-g8ed1b From 9af27b28b1da1020e427b626c4967d0206b55100 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 16 Jul 2019 16:26:45 -0700 Subject: fs/proc/inode.c: use typeof_member() macro Don't repeat function signatures twice. This is a kind-of-precursor for "struct proc_ops". Note: typeof(pde->proc_fops->...) ...; can't be used because ->proc_fops is "const struct file_operations *". "const" prevents assignment down the code and it can't be deleted in the type system. Link: http://lkml.kernel.org/r/20190529191110.GB5703@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 5f8d215b3fd0..dbe43a50caf2 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -200,7 +200,8 @@ static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) struct proc_dir_entry *pde = PDE(file_inode(file)); loff_t rv = -EINVAL; if (use_pde(pde)) { - loff_t (*llseek)(struct file *, loff_t, int); + typeof_member(struct file_operations, llseek) llseek; + llseek = pde->proc_fops->llseek; if (!llseek) llseek = default_llseek; @@ -212,10 +213,11 @@ static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; if (use_pde(pde)) { + typeof_member(struct file_operations, read) read; + read = pde->proc_fops->read; if (read) rv = read(file, buf, count, ppos); @@ -226,10 +228,11 @@ static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; if (use_pde(pde)) { + typeof_member(struct file_operations, write) write; + write = pde->proc_fops->write; if (write) rv = write(file, buf, count, ppos); @@ -242,8 +245,9 @@ static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts) { struct proc_dir_entry *pde = PDE(file_inode(file)); __poll_t rv = DEFAULT_POLLMASK; - __poll_t (*poll)(struct file *, struct poll_table_struct *); if (use_pde(pde)) { + typeof_member(struct file_operations, poll) poll; + poll = pde->proc_fops->poll; if (poll) rv = poll(file, pts); @@ -256,8 +260,9 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne { struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; - long (*ioctl)(struct file *, unsigned int, unsigned long); if (use_pde(pde)) { + typeof_member(struct file_operations, unlocked_ioctl) ioctl; + ioctl = pde->proc_fops->unlocked_ioctl; if (ioctl) rv = ioctl(file, cmd, arg); @@ -271,8 +276,9 @@ static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned { struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; - long (*compat_ioctl)(struct file *, unsigned int, unsigned long); if (use_pde(pde)) { + typeof_member(struct file_operations, compat_ioctl) compat_ioctl; + compat_ioctl = pde->proc_fops->compat_ioctl; if (compat_ioctl) rv = compat_ioctl(file, cmd, arg); @@ -286,8 +292,9 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) { struct proc_dir_entry *pde = PDE(file_inode(file)); int rv = -EIO; - int (*mmap)(struct file *, struct vm_area_struct *); if (use_pde(pde)) { + typeof_member(struct file_operations, mmap) mmap; + mmap = pde->proc_fops->mmap; if (mmap) rv = mmap(file, vma); @@ -305,7 +312,7 @@ proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long rv = -EIO; if (use_pde(pde)) { - typeof(proc_reg_get_unmapped_area) *get_area; + typeof_member(struct file_operations, get_unmapped_area) get_area; get_area = pde->proc_fops->get_unmapped_area; #ifdef CONFIG_MMU @@ -326,8 +333,8 @@ static int proc_reg_open(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); int rv = 0; - int (*open)(struct inode *, struct file *); - int (*release)(struct inode *, struct file *); + typeof_member(struct file_operations, open) open; + typeof_member(struct file_operations, release) release; struct pde_opener *pdeo; /* -- cgit v1.2.3-59-g8ed1b From 5ec27ec735ba0477d48c80561cc5e856f0c5dfaf Mon Sep 17 00:00:00 2001 From: Radoslaw Burny Date: Tue, 16 Jul 2019 16:26:51 -0700 Subject: fs/proc/proc_sysctl.c: fix the default values of i_uid/i_gid on /proc/sys inodes. Normally, the inode's i_uid/i_gid are translated relative to s_user_ns, but this is not a correct behavior for proc. Since sysctl permission check in test_perm is done against GLOBAL_ROOT_[UG]ID, it makes more sense to use these values in u_[ug]id of proc inodes. In other words: although uid/gid in the inode is not read during test_perm, the inode logically belongs to the root of the namespace. I have confirmed this with Eric Biederman at LPC and in this thread: https://lore.kernel.org/lkml/87k1kzjdff.fsf@xmission.com Consequences ============ Since the i_[ug]id values of proc nodes are not used for permissions checks, this change usually makes no functional difference. However, it causes an issue in a setup where: * a namespace container is created without root user in container - hence the i_[ug]id of proc nodes are set to INVALID_[UG]ID * container creator tries to configure it by writing /proc/sys files, e.g. writing /proc/sys/kernel/shmmax to configure shared memory limit Kernel does not allow to open an inode for writing if its i_[ug]id are invalid, making it impossible to write shmmax and thus - configure the container. Using a container with no root mapping is apparently rare, but we do use this configuration at Google. Also, we use a generic tool to configure the container limits, and the inability to write any of them causes a failure. History ======= The invalid uids/gids in inodes first appeared due to 81754357770e (fs: Update i_[ug]id_(read|write) to translate relative to s_user_ns). However, AFAIK, this did not immediately cause any issues. The inability to write to these "invalid" inodes was only caused by a later commit 0bd23d09b874 (vfs: Don't modify inodes with a uid or gid unknown to the vfs). Tested: Used a repro program that creates a user namespace without any mapping and stat'ed /proc/$PID/root/proc/sys/kernel/shmmax from outside. Before the change, it shows the overflow uid, with the change it's 0. The overflow uid indicates that the uid in the inode is not correct and thus it is not possible to open the file for writing. Link: http://lkml.kernel.org/r/20190708115130.250149-1-rburny@google.com Fixes: 0bd23d09b874 ("vfs: Don't modify inodes with a uid or gid unknown to the vfs") Signed-off-by: Radoslaw Burny Acked-by: Luis Chamberlain Cc: Kees Cook Cc: "Eric W . Biederman" Cc: Seth Forshee Cc: John Sperbeck Cc: Alexey Dobriyan Cc: [4.8+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index c74570736b24..36ad1b0d6259 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -499,6 +499,10 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, if (root->set_ownership) root->set_ownership(head, table, &inode->i_uid, &inode->i_gid); + else { + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + } return inode; } -- cgit v1.2.3-59-g8ed1b