From 2a5eb9995528441447d33838727f6ec1caf08139 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 16 Feb 2024 22:25:44 -0800 Subject: binfmt_elf: Leave a gap between .bss and brk Currently the brk starts its randomization immediately after .bss, which means there is a chance that when the random offset is 0, linear overflows from .bss can reach into the brk area. Leave at least a single page gap between .bss and brk (when it has not already been explicitly relocated into the mmap range). Reported-by: Closes: https://lore.kernel.org/linux-hardening/CA+2EKTVLvc8hDZc+2Yhwmus=dzOUG5E4gV7ayCbu0MPJTZzWkw@mail.gmail.com/ Link: https://lore.kernel.org/r/20240217062545.1631668-2-keescook@chromium.org Signed-off-by: Kees Cook --- fs/binfmt_elf.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/binfmt_elf.c') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 5397b552fbeb..7862962f7a85 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1262,6 +1262,9 @@ out_free_interp: if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && elf_ex->e_type == ET_DYN && !interpreter) { mm->brk = mm->start_brk = ELF_ET_DYN_BASE; + } else { + /* Otherwise leave a gap between .bss and brk. */ + mm->brk = mm->start_brk = mm->brk + PAGE_SIZE; } mm->brk = mm->start_brk = arch_randomize_brk(mm); -- cgit v1.2.3-59-g8ed1b From 6b839b3b76cf17296ebd4a893841f32cae08229c Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Mon, 5 Feb 2024 09:26:30 -0800 Subject: regset: use kvzalloc() for regset_get_alloc() While browsing through ChromeOS crash reports, I found one with an allocation failure that looked like this: chrome: page allocation failure: order:7, mode:0x40dc0(GFP_KERNEL|__GFP_COMP|__GFP_ZERO), nodemask=(null),cpuset=urgent,mems_allowed=0 CPU: 7 PID: 3295 Comm: chrome Not tainted 5.15.133-20574-g8044615ac35c #1 (HASH:1162 1) Hardware name: Google Lazor (rev3 - 8) with KB Backlight (DT) Call trace: ... warn_alloc+0x104/0x174 __alloc_pages+0x5f0/0x6e4 kmalloc_order+0x44/0x98 kmalloc_order_trace+0x34/0x124 __kmalloc+0x228/0x36c __regset_get+0x68/0xcc regset_get_alloc+0x1c/0x28 elf_core_dump+0x3d8/0xd8c do_coredump+0xeb8/0x1378 get_signal+0x14c/0x804 ... An order 7 allocation is (1 << 7) contiguous pages, or 512K. It's not a surprise that this allocation failed on a system that's been running for a while. More digging showed that it was fairly easy to see the order 7 allocation by just sending a SIGQUIT to chrome (or other processes) to generate a core dump. The actual amount being allocated was 279,584 bytes and it was for "core_note_type" NT_ARM_SVE. There was quite a bit of discussion [1] on the mailing lists in response to my v1 patch attempting to switch to vmalloc. The overall conclusion was that we could likely reduce the 279,584 byte allocation by quite a bit and Mark Brown has sent a patch to that effect [2]. However even with the 279,584 byte allocation gone there are still 65,552 byte allocations. These are just barely more than the 65,536 bytes and thus would require an order 5 allocation. An order 5 allocation is still something to avoid unless necessary and nothing needs the memory here to be contiguous. Change the allocation to kvzalloc() which should still be efficient for small allocations but doesn't force the memory subsystem to work hard (and maybe fail) at getting a large contiguous chunk. [1] https://lore.kernel.org/r/20240201171159.1.Id9ad163b60d21c9e56c2d686b0cc9083a8ba7924@changeid [2] https://lore.kernel.org/r/20240203-arm64-sve-ptrace-regset-size-v1-1-2c3ba1386b9e@kernel.org Link: https://lkml.kernel.org/r/20240205092626.v2.1.Id9ad163b60d21c9e56c2d686b0cc9083a8ba7924@changeid Signed-off-by: Douglas Anderson Reviewed-by: Catalin Marinas Cc: Al Viro Cc: Christian Brauner Cc: Dave Martin Cc: Eric Biederman Cc: Jan Kara Cc: Kees Cook Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Oleg Nesterov Cc: Will Deacon Signed-off-by: Andrew Morton --- fs/binfmt_elf.c | 2 +- kernel/regset.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/binfmt_elf.c') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 5397b552fbeb..ac178ad38823 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1928,7 +1928,7 @@ static void free_note_info(struct elf_note_info *info) threads = t->next; WARN_ON(t->notes[0].data && t->notes[0].data != &t->prstatus); for (i = 1; i < info->thread_notes; ++i) - kfree(t->notes[i].data); + kvfree(t->notes[i].data); kfree(t); } kfree(info->psinfo.data); diff --git a/kernel/regset.c b/kernel/regset.c index 586823786f39..b2871fa68b2a 100644 --- a/kernel/regset.c +++ b/kernel/regset.c @@ -16,14 +16,14 @@ static int __regset_get(struct task_struct *target, if (size > regset->n * regset->size) size = regset->n * regset->size; if (!p) { - to_free = p = kzalloc(size, GFP_KERNEL); + to_free = p = kvzalloc(size, GFP_KERNEL); if (!p) return -ENOMEM; } res = regset->regset_get(target, regset, (struct membuf){.p = p, .left = size}); if (res < 0) { - kfree(to_free); + kvfree(to_free); return res; } *data = p; @@ -71,6 +71,6 @@ int copy_regset_to_user(struct task_struct *target, ret = regset_get_alloc(target, regset, size, &buf); if (ret > 0) ret = copy_to_user(data, buf, ret) ? -EFAULT : 0; - kfree(buf); + kvfree(buf); return ret; } -- cgit v1.2.3-59-g8ed1b From 4bbf9c3b53e637eb3a14ee27b996300ce88e752a Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Mon, 6 May 2024 19:37:00 +0000 Subject: fs/coredump: Enable dynamic configuration of max file note size Introduce the capability to dynamically configure the maximum file note size for ELF core dumps via sysctl. Why is this being done? We have observed that during a crash when there are more than 65k mmaps in memory, the existing fixed limit on the size of the ELF notes section becomes a bottleneck. The notes section quickly reaches its capacity, leading to incomplete memory segment information in the resulting coredump. This truncation compromises the utility of the coredumps, as crucial information about the memory state at the time of the crash might be omitted. This enhancement removes the previous static limit of 4MB, allowing system administrators to adjust the size based on system-specific requirements or constraints. Eg: $ sysctl -a | grep core_file_note_size_limit kernel.core_file_note_size_limit = 4194304 $ sysctl -n kernel.core_file_note_size_limit 4194304 $echo 519304 > /proc/sys/kernel/core_file_note_size_limit $sysctl -n kernel.core_file_note_size_limit 519304 Attempting to write beyond the ceiling value of 16MB $echo 17194304 > /proc/sys/kernel/core_file_note_size_limit bash: echo: write error: Invalid argument Signed-off-by: Vijay Nag Signed-off-by: Allen Pais Link: https://lore.kernel.org/r/20240506193700.7884-1-apais@linux.microsoft.com Signed-off-by: Kees Cook --- fs/binfmt_elf.c | 7 +++++-- fs/coredump.c | 17 +++++++++++++++++ include/linux/coredump.h | 2 ++ 3 files changed, 24 insertions(+), 2 deletions(-) (limited to 'fs/binfmt_elf.c') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 7862962f7a85..b5a25ee49eea 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1567,7 +1567,6 @@ static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata); } -#define MAX_FILE_NOTE_SIZE (4*1024*1024) /* * Format of NT_FILE note: * @@ -1595,8 +1594,12 @@ static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm names_ofs = (2 + 3 * count) * sizeof(data[0]); alloc: - if (size >= MAX_FILE_NOTE_SIZE) /* paranoia check */ + /* paranoia check */ + if (size >= core_file_note_size_limit) { + pr_warn_once("coredump Note size too large: %u (does kernel.core_file_note_size_limit sysctl need adjustment?\n", + size); return -EINVAL; + } size = round_up(size, PAGE_SIZE); /* * "size" can be 0 here legitimately. diff --git a/fs/coredump.c b/fs/coredump.c index be6403b4b14b..317065e3eb9b 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -56,10 +56,15 @@ static bool dump_vma_snapshot(struct coredump_params *cprm); static void free_vma_snapshot(struct coredump_params *cprm); +#define CORE_FILE_NOTE_SIZE_DEFAULT (4*1024*1024) +/* Define a reasonable max cap */ +#define CORE_FILE_NOTE_SIZE_MAX (16*1024*1024) + static int core_uses_pid; static unsigned int core_pipe_limit; static char core_pattern[CORENAME_MAX_SIZE] = "core"; static int core_name_size = CORENAME_MAX_SIZE; +unsigned int core_file_note_size_limit = CORE_FILE_NOTE_SIZE_DEFAULT; struct core_name { char *corename; @@ -998,6 +1003,9 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, return error; } +static const unsigned int core_file_note_size_min = CORE_FILE_NOTE_SIZE_DEFAULT; +static const unsigned int core_file_note_size_max = CORE_FILE_NOTE_SIZE_MAX; + static struct ctl_table coredump_sysctls[] = { { .procname = "core_uses_pid", @@ -1020,6 +1028,15 @@ static struct ctl_table coredump_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "core_file_note_size_limit", + .data = &core_file_note_size_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = (unsigned int *)&core_file_note_size_min, + .extra2 = (unsigned int *)&core_file_note_size_max, + }, }; static int __init init_fs_coredump_sysctls(void) diff --git a/include/linux/coredump.h b/include/linux/coredump.h index d3eba4360150..0904ba010341 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -30,6 +30,8 @@ struct coredump_params { struct core_vma_metadata *vma_meta; }; +extern unsigned int core_file_note_size_limit; + /* * These are the only things you should do on a core-file: use only these * functions to write out all the necessary info. -- cgit v1.2.3-59-g8ed1b