From 1d1585ca0f48fe7ed95c3571f3e4a82b2b5045dc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Nov 2019 00:17:56 +0100 Subject: uaccess: Add non-pagefault user-space write function Commit 3d7081822f7f ("uaccess: Add non-pagefault user-space read functions") missed to add probe write function, therefore factor out a probe_write_common() helper with most logic of probe_kernel_write() except setting KERNEL_DS, and add a new probe_user_write() helper so it can be used from BPF side. Again, on some archs, the user address space and kernel address space can co-exist and be overlapping, so in such case, setting KERNEL_DS would mean that the given address is treated as being in kernel address space. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Cc: Masami Hiramatsu Link: https://lore.kernel.org/bpf/9df2542e68141bfa3addde631441ee45503856a8.1572649915.git.daniel@iogearbox.net --- mm/maccess.c | 45 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/maccess.c b/mm/maccess.c index d065736f6b87..2d3c3d01064c 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -18,6 +18,18 @@ probe_read_common(void *dst, const void __user *src, size_t size) return ret ? -EFAULT : 0; } +static __always_inline long +probe_write_common(void __user *dst, const void *src, size_t size) +{ + long ret; + + pagefault_disable(); + ret = __copy_to_user_inatomic(dst, src, size); + pagefault_enable(); + + return ret ? -EFAULT : 0; +} + /** * probe_kernel_read(): safely attempt to read from a kernel-space location * @dst: pointer to the buffer that shall take the data @@ -85,6 +97,7 @@ EXPORT_SYMBOL_GPL(probe_user_read); * Safely write to address @dst from the buffer at @src. If a kernel fault * happens, handle that and return -EFAULT. */ + long __weak probe_kernel_write(void *dst, const void *src, size_t size) __attribute__((alias("__probe_kernel_write"))); @@ -94,15 +107,39 @@ long __probe_kernel_write(void *dst, const void *src, size_t size) mm_segment_t old_fs = get_fs(); set_fs(KERNEL_DS); - pagefault_disable(); - ret = __copy_to_user_inatomic((__force void __user *)dst, src, size); - pagefault_enable(); + ret = probe_write_common((__force void __user *)dst, src, size); set_fs(old_fs); - return ret ? -EFAULT : 0; + return ret; } EXPORT_SYMBOL_GPL(probe_kernel_write); +/** + * probe_user_write(): safely attempt to write to a user-space location + * @dst: address to write to + * @src: pointer to the data that shall be written + * @size: size of the data chunk + * + * Safely write to address @dst from the buffer at @src. If a kernel fault + * happens, handle that and return -EFAULT. + */ + +long __weak probe_user_write(void __user *dst, const void *src, size_t size) + __attribute__((alias("__probe_user_write"))); + +long __probe_user_write(void __user *dst, const void *src, size_t size) +{ + long ret = -EFAULT; + mm_segment_t old_fs = get_fs(); + + set_fs(USER_DS); + if (access_ok(dst, size)) + ret = probe_write_common(dst, src, size); + set_fs(old_fs); + + return ret; +} +EXPORT_SYMBOL_GPL(probe_user_write); /** * strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address. -- cgit v1.2.3-59-g8ed1b From 75a1a607bb7e6d918be3aca11ec2214a275392f4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Nov 2019 00:17:57 +0100 Subject: uaccess: Add strict non-pagefault kernel-space read function Add two new probe_kernel_read_strict() and strncpy_from_unsafe_strict() helpers which by default alias to the __probe_kernel_read() and the __strncpy_from_unsafe(), respectively, but can be overridden by archs which have non-overlapping address ranges for kernel space and user space in order to bail out with -EFAULT when attempting to probe user memory including non-canonical user access addresses [0]: 4-level page tables: user-space mem: 0x0000000000000000 - 0x00007fffffffffff non-canonical: 0x0000800000000000 - 0xffff7fffffffffff 5-level page tables: user-space mem: 0x0000000000000000 - 0x00ffffffffffffff non-canonical: 0x0100000000000000 - 0xfeffffffffffffff The idea is that these helpers are complementary to the probe_user_read() and strncpy_from_unsafe_user() which probe user-only memory. Both added helpers here do the same, but for kernel-only addresses. Both set of helpers are going to be used for BPF tracing. They also explicitly avoid throwing the splat for non-canonical user addresses from 00c42373d397 ("x86-64: add warning for non-canonical user access address dereferences"). For compat, the current probe_kernel_read() and strncpy_from_unsafe() are left as-is. [0] Documentation/x86/x86_64/mm.txt Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: x86@kernel.org Link: https://lore.kernel.org/bpf/eefeefd769aa5a013531f491a71f0936779e916b.1572649915.git.daniel@iogearbox.net --- arch/x86/mm/Makefile | 2 +- arch/x86/mm/maccess.c | 43 +++++++++++++++++++++++++++++++++++++++++++ include/linux/uaccess.h | 4 ++++ mm/maccess.c | 25 ++++++++++++++++++++++++- 4 files changed, 72 insertions(+), 2 deletions(-) create mode 100644 arch/x86/mm/maccess.c (limited to 'mm') diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 84373dc9b341..bbc68a54795e 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -13,7 +13,7 @@ CFLAGS_REMOVE_mem_encrypt_identity.o = -pg endif obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o + pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o maccess.o # Make sure __phys_addr has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) diff --git a/arch/x86/mm/maccess.c b/arch/x86/mm/maccess.c new file mode 100644 index 000000000000..f5b85bdc0535 --- /dev/null +++ b/arch/x86/mm/maccess.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include + +#ifdef CONFIG_X86_64 +static __always_inline u64 canonical_address(u64 vaddr, u8 vaddr_bits) +{ + return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits); +} + +static __always_inline bool invalid_probe_range(u64 vaddr) +{ + /* + * Range covering the highest possible canonical userspace address + * as well as non-canonical address range. For the canonical range + * we also need to include the userspace guard page. + */ + return vaddr < TASK_SIZE_MAX + PAGE_SIZE || + canonical_address(vaddr, boot_cpu_data.x86_virt_bits) != vaddr; +} +#else +static __always_inline bool invalid_probe_range(u64 vaddr) +{ + return vaddr < TASK_SIZE_MAX; +} +#endif + +long probe_kernel_read_strict(void *dst, const void *src, size_t size) +{ + if (unlikely(invalid_probe_range((unsigned long)src))) + return -EFAULT; + + return __probe_kernel_read(dst, src, size); +} + +long strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr, long count) +{ + if (unlikely(invalid_probe_range((unsigned long)unsafe_addr))) + return -EFAULT; + + return __strncpy_from_unsafe(dst, unsafe_addr, count); +} diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 38555435a64a..67f016010aad 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -311,6 +311,7 @@ copy_struct_from_user(void *dst, size_t ksize, const void __user *src, * happens, handle that and return -EFAULT. */ extern long probe_kernel_read(void *dst, const void *src, size_t size); +extern long probe_kernel_read_strict(void *dst, const void *src, size_t size); extern long __probe_kernel_read(void *dst, const void *src, size_t size); /* @@ -350,6 +351,9 @@ extern long notrace probe_user_write(void __user *dst, const void *src, size_t s extern long notrace __probe_user_write(void __user *dst, const void *src, size_t size); extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); +extern long strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr, + long count); +extern long __strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); extern long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr, long count); extern long strnlen_unsafe_user(const void __user *unsafe_addr, long count); diff --git a/mm/maccess.c b/mm/maccess.c index 2d3c3d01064c..3ca8d97e5010 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -43,11 +43,20 @@ probe_write_common(void __user *dst, const void *src, size_t size) * do_page_fault() doesn't attempt to take mmap_sem. This makes * probe_kernel_read() suitable for use within regions where the caller * already holds mmap_sem, or other locks which nest inside mmap_sem. + * + * probe_kernel_read_strict() is the same as probe_kernel_read() except for + * the case where architectures have non-overlapping user and kernel address + * ranges: probe_kernel_read_strict() will additionally return -EFAULT for + * probing memory on a user address range where probe_user_read() is supposed + * to be used instead. */ long __weak probe_kernel_read(void *dst, const void *src, size_t size) __attribute__((alias("__probe_kernel_read"))); +long __weak probe_kernel_read_strict(void *dst, const void *src, size_t size) + __attribute__((alias("__probe_kernel_read"))); + long __probe_kernel_read(void *dst, const void *src, size_t size) { long ret; @@ -157,8 +166,22 @@ EXPORT_SYMBOL_GPL(probe_user_write); * * If @count is smaller than the length of the string, copies @count-1 bytes, * sets the last byte of @dst buffer to NUL and returns @count. + * + * strncpy_from_unsafe_strict() is the same as strncpy_from_unsafe() except + * for the case where architectures have non-overlapping user and kernel address + * ranges: strncpy_from_unsafe_strict() will additionally return -EFAULT for + * probing memory on a user address range where strncpy_from_unsafe_user() is + * supposed to be used instead. */ -long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) + +long __weak strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) + __attribute__((alias("__strncpy_from_unsafe"))); + +long __weak strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr, + long count) + __attribute__((alias("__strncpy_from_unsafe"))); + +long __strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) { mm_segment_t old_fs = get_fs(); const void *src = unsafe_addr; -- cgit v1.2.3-59-g8ed1b From fc9702273e2edb90400a34b3be76f7b08fa3344b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 17 Nov 2019 09:28:04 -0800 Subject: bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY Add ability to memory-map contents of BPF array map. This is extremely useful for working with BPF global data from userspace programs. It allows to avoid typical bpf_map_{lookup,update}_elem operations, improving both performance and usability. There had to be special considerations for map freezing, to avoid having writable memory view into a frozen map. To solve this issue, map freezing and mmap-ing is happening under mutex now: - if map is already frozen, no writable mapping is allowed; - if map has writable memory mappings active (accounted in map->writecnt), map freezing will keep failing with -EBUSY; - once number of writable memory mappings drops to zero, map freezing can be performed again. Only non-per-CPU plain arrays are supported right now. Maps with spinlocks can't be memory mapped either. For BPF_F_MMAPABLE array, memory allocation has to be done through vmalloc() to be mmap()'able. We also need to make sure that array data memory is page-sized and page-aligned, so we over-allocate memory in such a way that struct bpf_array is at the end of a single page of memory with array->value being aligned with the start of the second page. On deallocation we need to accomodate this memory arrangement to free vmalloc()'ed memory correctly. One important consideration regarding how memory-mapping subsystem functions. Memory-mapping subsystem provides few optional callbacks, among them open() and close(). close() is called for each memory region that is unmapped, so that users can decrease their reference counters and free up resources, if necessary. open() is *almost* symmetrical: it's called for each memory region that is being mapped, **except** the very first one. So bpf_map_mmap does initial refcnt bump, while open() will do any extra ones after that. Thus number of close() calls is equal to number of open() calls plus one more. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: John Fastabend Acked-by: Johannes Weiner Link: https://lore.kernel.org/bpf/20191117172806.2195367-4-andriin@fb.com --- include/linux/bpf.h | 11 +++-- include/linux/vmalloc.h | 1 + include/uapi/linux/bpf.h | 3 ++ kernel/bpf/arraymap.c | 58 ++++++++++++++++++++++--- kernel/bpf/syscall.c | 99 ++++++++++++++++++++++++++++++++++++++++-- mm/vmalloc.c | 20 +++++++++ tools/include/uapi/linux/bpf.h | 3 ++ 7 files changed, 183 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fb606dc61a3a..e913dd5946ae 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -68,6 +69,7 @@ struct bpf_map_ops { u64 *imm, u32 off); int (*map_direct_value_meta)(const struct bpf_map *map, u64 imm, u32 *off); + int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma); }; struct bpf_map_memory { @@ -96,9 +98,10 @@ struct bpf_map { u32 btf_value_type_id; struct btf *btf; struct bpf_map_memory memory; + char name[BPF_OBJ_NAME_LEN]; bool unpriv_array; - bool frozen; /* write-once */ - /* 48 bytes hole */ + bool frozen; /* write-once; write-protected by freeze_mutex */ + /* 22 bytes hole */ /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. @@ -106,7 +109,8 @@ struct bpf_map { atomic64_t refcnt ____cacheline_aligned; atomic64_t usercnt; struct work_struct work; - char name[BPF_OBJ_NAME_LEN]; + struct mutex freeze_mutex; + u64 writecnt; /* writable mmap cnt; protected by freeze_mutex */ }; static inline bool map_value_has_spin_lock(const struct bpf_map *map) @@ -795,6 +799,7 @@ void bpf_map_charge_finish(struct bpf_map_memory *mem); void bpf_map_charge_move(struct bpf_map_memory *dst, struct bpf_map_memory *src); void *bpf_map_area_alloc(size_t size, int numa_node); +void *bpf_map_area_mmapable_alloc(size_t size, int numa_node); void bpf_map_area_free(void *base); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 4e7809408073..b4c58a191eb1 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -93,6 +93,7 @@ extern void *vzalloc(unsigned long size); extern void *vmalloc_user(unsigned long size); extern void *vmalloc_node(unsigned long size, int node); extern void *vzalloc_node(unsigned long size, int node); +extern void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags); extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long size); extern void *vmalloc_32_user(unsigned long size); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4842a134b202..dbbcf0b02970 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -348,6 +348,9 @@ enum bpf_attach_type { /* Clone map from listener for newly accepted socket */ #define BPF_F_CLONE (1U << 9) +/* Enable memory-mapping BPF map */ +#define BPF_F_MMAPABLE (1U << 10) + /* flags for BPF_PROG_QUERY */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 1c65ce0098a9..a42097c36b0c 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -14,7 +14,7 @@ #include "map_in_map.h" #define ARRAY_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) + (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK) static void bpf_array_free_percpu(struct bpf_array *array) { @@ -59,6 +59,10 @@ int array_map_alloc_check(union bpf_attr *attr) (percpu && numa_node != NUMA_NO_NODE)) return -EINVAL; + if (attr->map_type != BPF_MAP_TYPE_ARRAY && + attr->map_flags & BPF_F_MMAPABLE) + return -EINVAL; + if (attr->value_size > KMALLOC_MAX_SIZE) /* if value_size is bigger, the user space won't be able to * access the elements. @@ -102,10 +106,19 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) } array_size = sizeof(*array); - if (percpu) + if (percpu) { array_size += (u64) max_entries * sizeof(void *); - else - array_size += (u64) max_entries * elem_size; + } else { + /* rely on vmalloc() to return page-aligned memory and + * ensure array->value is exactly page-aligned + */ + if (attr->map_flags & BPF_F_MMAPABLE) { + array_size = PAGE_ALIGN(array_size); + array_size += PAGE_ALIGN((u64) max_entries * elem_size); + } else { + array_size += (u64) max_entries * elem_size; + } + } /* make sure there is no u32 overflow later in round_up() */ cost = array_size; @@ -117,7 +130,20 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) return ERR_PTR(ret); /* allocate all map elements and zero-initialize them */ - array = bpf_map_area_alloc(array_size, numa_node); + if (attr->map_flags & BPF_F_MMAPABLE) { + void *data; + + /* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */ + data = bpf_map_area_mmapable_alloc(array_size, numa_node); + if (!data) { + bpf_map_charge_finish(&mem); + return ERR_PTR(-ENOMEM); + } + array = data + PAGE_ALIGN(sizeof(struct bpf_array)) + - offsetof(struct bpf_array, value); + } else { + array = bpf_map_area_alloc(array_size, numa_node); + } if (!array) { bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); @@ -350,6 +376,11 @@ static int array_map_delete_elem(struct bpf_map *map, void *key) return -EINVAL; } +static void *array_map_vmalloc_addr(struct bpf_array *array) +{ + return (void *)round_down((unsigned long)array, PAGE_SIZE); +} + /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void array_map_free(struct bpf_map *map) { @@ -365,7 +396,10 @@ static void array_map_free(struct bpf_map *map) if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) bpf_array_free_percpu(array); - bpf_map_area_free(array); + if (array->map.map_flags & BPF_F_MMAPABLE) + bpf_map_area_free(array_map_vmalloc_addr(array)); + else + bpf_map_area_free(array); } static void array_map_seq_show_elem(struct bpf_map *map, void *key, @@ -444,6 +478,17 @@ static int array_map_check_btf(const struct bpf_map *map, return 0; } +int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> PAGE_SHIFT; + + if (!(map->map_flags & BPF_F_MMAPABLE)) + return -EINVAL; + + return remap_vmalloc_range(vma, array_map_vmalloc_addr(array), pgoff); +} + const struct bpf_map_ops array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, @@ -455,6 +500,7 @@ const struct bpf_map_ops array_map_ops = { .map_gen_lookup = array_map_gen_lookup, .map_direct_value_addr = array_map_direct_value_addr, .map_direct_value_meta = array_map_direct_value_meta, + .map_mmap = array_map_mmap, .map_seq_show_elem = array_map_seq_show_elem, .map_check_btf = array_map_check_btf, }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 52fe4bacb330..bac3becf9f90 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -127,7 +127,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) return map; } -void *bpf_map_area_alloc(size_t size, int numa_node) +static void *__bpf_map_area_alloc(size_t size, int numa_node, bool mmapable) { /* We really just want to fail instead of triggering OOM killer * under memory pressure, therefore we set __GFP_NORETRY to kmalloc, @@ -142,18 +142,33 @@ void *bpf_map_area_alloc(size_t size, int numa_node) const gfp_t flags = __GFP_NOWARN | __GFP_ZERO; void *area; - if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { + /* kmalloc()'ed memory can't be mmap()'ed */ + if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags, numa_node); if (area != NULL) return area; } - + if (mmapable) { + BUG_ON(!PAGE_ALIGNED(size)); + return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL | + __GFP_RETRY_MAYFAIL | flags); + } return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | __GFP_RETRY_MAYFAIL | flags, __builtin_return_address(0)); } +void *bpf_map_area_alloc(size_t size, int numa_node) +{ + return __bpf_map_area_alloc(size, numa_node, false); +} + +void *bpf_map_area_mmapable_alloc(size_t size, int numa_node) +{ + return __bpf_map_area_alloc(size, numa_node, true); +} + void bpf_map_area_free(void *area) { kvfree(area); @@ -425,6 +440,74 @@ static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf, return -EINVAL; } +/* called for any extra memory-mapped regions (except initial) */ +static void bpf_map_mmap_open(struct vm_area_struct *vma) +{ + struct bpf_map *map = vma->vm_file->private_data; + + bpf_map_inc_with_uref(map); + + if (vma->vm_flags & VM_WRITE) { + mutex_lock(&map->freeze_mutex); + map->writecnt++; + mutex_unlock(&map->freeze_mutex); + } +} + +/* called for all unmapped memory region (including initial) */ +static void bpf_map_mmap_close(struct vm_area_struct *vma) +{ + struct bpf_map *map = vma->vm_file->private_data; + + if (vma->vm_flags & VM_WRITE) { + mutex_lock(&map->freeze_mutex); + map->writecnt--; + mutex_unlock(&map->freeze_mutex); + } + + bpf_map_put_with_uref(map); +} + +static const struct vm_operations_struct bpf_map_default_vmops = { + .open = bpf_map_mmap_open, + .close = bpf_map_mmap_close, +}; + +static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct bpf_map *map = filp->private_data; + int err; + + if (!map->ops->map_mmap || map_value_has_spin_lock(map)) + return -ENOTSUPP; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + mutex_lock(&map->freeze_mutex); + + if ((vma->vm_flags & VM_WRITE) && map->frozen) { + err = -EPERM; + goto out; + } + + /* set default open/close callbacks */ + vma->vm_ops = &bpf_map_default_vmops; + vma->vm_private_data = map; + + err = map->ops->map_mmap(map, vma); + if (err) + goto out; + + bpf_map_inc_with_uref(map); + + if (vma->vm_flags & VM_WRITE) + map->writecnt++; +out: + mutex_unlock(&map->freeze_mutex); + return err; +} + const struct file_operations bpf_map_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_map_show_fdinfo, @@ -432,6 +515,7 @@ const struct file_operations bpf_map_fops = { .release = bpf_map_release, .read = bpf_dummy_read, .write = bpf_dummy_write, + .mmap = bpf_map_mmap, }; int bpf_map_new_fd(struct bpf_map *map, int flags) @@ -577,6 +661,7 @@ static int map_create(union bpf_attr *attr) atomic64_set(&map->refcnt, 1); atomic64_set(&map->usercnt, 1); + mutex_init(&map->freeze_mutex); if (attr->btf_key_type_id || attr->btf_value_type_id) { struct btf *btf; @@ -1163,6 +1248,13 @@ static int map_freeze(const union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); + + mutex_lock(&map->freeze_mutex); + + if (map->writecnt) { + err = -EBUSY; + goto err_put; + } if (READ_ONCE(map->frozen)) { err = -EBUSY; goto err_put; @@ -1174,6 +1266,7 @@ static int map_freeze(const union bpf_attr *attr) WRITE_ONCE(map->frozen, true); err_put: + mutex_unlock(&map->freeze_mutex); fdput(f); return err; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a3c70e275f4e..4a7d7459c4f9 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2671,6 +2671,26 @@ void *vzalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vzalloc_node); +/** + * vmalloc_user_node_flags - allocate memory for userspace on a specific node + * @size: allocation size + * @node: numa node + * @flags: flags for the page level allocator + * + * The resulting memory area is zeroed so it can be mapped to userspace + * without leaking data. + * + * Return: pointer to the allocated memory or %NULL on error + */ +void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags) +{ + return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, + flags | __GFP_ZERO, PAGE_KERNEL, + VM_USERMAP, node, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(vmalloc_user_node_flags); + /** * vmalloc_exec - allocate virtually contiguous, executable memory * @size: allocation size diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4842a134b202..dbbcf0b02970 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -348,6 +348,9 @@ enum bpf_attach_type { /* Clone map from listener for newly accepted socket */ #define BPF_F_CLONE (1U << 9) +/* Enable memory-mapping BPF map */ +#define BPF_F_MMAPABLE (1U << 10) + /* flags for BPF_PROG_QUERY */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) -- cgit v1.2.3-59-g8ed1b From ed81745a4c96841937f1da35c0eb66ac312e1480 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 23 Nov 2019 14:08:35 -0800 Subject: mm: Implement no-MMU variant of vmalloc_user_node_flags To fix build with !CONFIG_MMU, implement it for no-MMU configurations as well. Fixes: fc9702273e2e ("bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY") Reported-by: kbuild test robot Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Johannes Weiner Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20191123220835.1237773-1-andriin@fb.com --- mm/nommu.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 99b7ec318824..7de592058ab4 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -155,11 +155,11 @@ void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags) return __vmalloc(size, flags, PAGE_KERNEL); } -void *vmalloc_user(unsigned long size) +static void *__vmalloc_user_flags(unsigned long size, gfp_t flags) { void *ret; - ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); + ret = __vmalloc(size, flags, PAGE_KERNEL); if (ret) { struct vm_area_struct *vma; @@ -172,8 +172,19 @@ void *vmalloc_user(unsigned long size) return ret; } + +void *vmalloc_user(unsigned long size) +{ + return __vmalloc_user_flags(size, GFP_KERNEL | __GFP_ZERO); +} EXPORT_SYMBOL(vmalloc_user); +void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags) +{ + return __vmalloc_user_flags(size, flags | __GFP_ZERO); +} +EXPORT_SYMBOL(vmalloc_user_node_flags); + struct page *vmalloc_to_page(const void *addr) { return virt_to_page(addr); -- cgit v1.2.3-59-g8ed1b