diff options
| author | 2017-03-22 15:45:46 -0700 | |
|---|---|---|
| committer | 2017-03-22 15:45:46 -0700 | |
| commit | 9bdf64d511c5c23e39b7a58ff4056d839060c026 (patch) | |
| tree | a546eefbd355792aa544c0f220b38bab4493b1aa /kernel | |
| parent | net: stmmac: fix dma operation mode config for older versions (diff) | |
| parent | bpf: Add tests for map-in-map (diff) | |
Merge branch 'bpf-map-in-map'
Martin KaFai Lau says:
====================
bpf: Add map-in-map support
This patchset adds map-in-map support (map->map).
One use case is the (vips -> webservers) in the L4 load balancer so
that different vips can be backed by different set of webservers.
Please refer to the individual commit log for details.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/bpf/Makefile | 2 | ||||
| -rw-r--r-- | kernel/bpf/arraymap.c | 74 | ||||
| -rw-r--r-- | kernel/bpf/hashtab.c | 121 | ||||
| -rw-r--r-- | kernel/bpf/map_in_map.c | 97 | ||||
| -rw-r--r-- | kernel/bpf/map_in_map.h | 23 | ||||
| -rw-r--r-- | kernel/bpf/syscall.c | 13 | ||||
| -rw-r--r-- | kernel/bpf/verifier.c | 57 |
7 files changed, 366 insertions, 21 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e1ce4f4fd7fd..e1e5e658f2db 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,7 +1,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index bcf9955fac95..bc9da93db403 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -17,6 +17,8 @@ #include <linux/filter.h> #include <linux/perf_event.h> +#include "map_in_map.h" + static void bpf_array_free_percpu(struct bpf_array *array) { int i; @@ -117,20 +119,17 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { - struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; - u32 elem_size = array->elem_size; + u32 elem_size = round_up(map->value_size, 8); const int ret = BPF_REG_0; const int map_ptr = BPF_REG_1; const int index = BPF_REG_2; *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, array->map.max_entries, - elem_size == 1 ? 2 : 3); - if (elem_size == 1) { - /* nop */ - } else if (is_power_of_2(elem_size)) { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); + + if (is_power_of_2(elem_size)) { *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); } else { *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size); @@ -605,3 +604,64 @@ static int __init register_cgroup_array_map(void) } late_initcall(register_cgroup_array_map); #endif + +static struct bpf_map *array_of_map_alloc(union bpf_attr *attr) +{ + struct bpf_map *map, *inner_map_meta; + + inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); + if (IS_ERR(inner_map_meta)) + return inner_map_meta; + + map = fd_array_map_alloc(attr); + if (IS_ERR(map)) { + bpf_map_meta_free(inner_map_meta); + return map; + } + + map->inner_map_meta = inner_map_meta; + + return map; +} + +static void array_of_map_free(struct bpf_map *map) +{ + /* map->inner_map_meta is only accessed by syscall which + * is protected by fdget/fdput. + */ + bpf_map_meta_free(map->inner_map_meta); + bpf_fd_array_map_clear(map); + fd_array_map_free(map); +} + +static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_map **inner_map = array_map_lookup_elem(map, key); + + if (!inner_map) + return NULL; + + return READ_ONCE(*inner_map); +} + +static const struct bpf_map_ops array_of_map_ops = { + .map_alloc = array_of_map_alloc, + .map_free = array_of_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = array_of_map_lookup_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = bpf_map_fd_get_ptr, + .map_fd_put_ptr = bpf_map_fd_put_ptr, +}; + +static struct bpf_map_type_list array_of_map_type __ro_after_init = { + .ops = &array_of_map_ops, + .type = BPF_MAP_TYPE_ARRAY_OF_MAPS, +}; + +static int __init register_array_of_map(void) +{ + bpf_register_map_type(&array_of_map_type); + return 0; +} +late_initcall(register_array_of_map); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 000153acb6d5..343fb5394c95 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -16,6 +16,7 @@ #include <linux/rculist_nulls.h> #include "percpu_freelist.h" #include "bpf_lru_list.h" +#include "map_in_map.h" struct bucket { struct hlist_nulls_head head; @@ -88,6 +89,11 @@ static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size return *(void __percpu **)(l->key + key_size); } +static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) +{ + return *(void **)(l->key + roundup(map->key_size, 8)); +} + static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) { return (struct htab_elem *) (htab->elems + i * htab->elem_size); @@ -603,6 +609,14 @@ static void htab_elem_free_rcu(struct rcu_head *head) static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { + struct bpf_map *map = &htab->map; + + if (map->ops->map_fd_put_ptr) { + void *ptr = fd_htab_map_get_ptr(map, l); + + map->ops->map_fd_put_ptr(ptr); + } + if (l->state == HTAB_EXTRA_ELEM_USED) { l->state = HTAB_EXTRA_ELEM_FREE; return; @@ -1057,6 +1071,7 @@ static void delete_all_elements(struct bpf_htab *htab) } } } + /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void htab_map_free(struct bpf_map *map) { @@ -1213,12 +1228,118 @@ static struct bpf_map_type_list htab_lru_percpu_type __ro_after_init = { .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, }; +static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr) +{ + struct bpf_map *map; + + if (attr->value_size != sizeof(u32)) + return ERR_PTR(-EINVAL); + + /* pointer is stored internally */ + attr->value_size = sizeof(void *); + map = htab_map_alloc(attr); + attr->value_size = sizeof(u32); + + return map; +} + +static void fd_htab_map_free(struct bpf_map *map) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_nulls_node *n; + struct hlist_nulls_head *head; + struct htab_elem *l; + int i; + + for (i = 0; i < htab->n_buckets; i++) { + head = select_bucket(htab, i); + + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { + void *ptr = fd_htab_map_get_ptr(map, l); + + map->ops->map_fd_put_ptr(ptr); + } + } + + htab_map_free(map); +} + +/* only called from syscall */ +int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, + void *key, void *value, u64 map_flags) +{ + void *ptr; + int ret; + u32 ufd = *(u32 *)value; + + ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + ret = htab_map_update_elem(map, key, &ptr, map_flags); + if (ret) + map->ops->map_fd_put_ptr(ptr); + + return ret; +} + +static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) +{ + struct bpf_map *map, *inner_map_meta; + + inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); + if (IS_ERR(inner_map_meta)) + return inner_map_meta; + + map = fd_htab_map_alloc(attr); + if (IS_ERR(map)) { + bpf_map_meta_free(inner_map_meta); + return map; + } + + map->inner_map_meta = inner_map_meta; + + return map; +} + +static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_map **inner_map = htab_map_lookup_elem(map, key); + + if (!inner_map) + return NULL; + + return READ_ONCE(*inner_map); +} + +static void htab_of_map_free(struct bpf_map *map) +{ + bpf_map_meta_free(map->inner_map_meta); + fd_htab_map_free(map); +} + +static const struct bpf_map_ops htab_of_map_ops = { + .map_alloc = htab_of_map_alloc, + .map_free = htab_of_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_of_map_lookup_elem, + .map_delete_elem = htab_map_delete_elem, + .map_fd_get_ptr = bpf_map_fd_get_ptr, + .map_fd_put_ptr = bpf_map_fd_put_ptr, +}; + +static struct bpf_map_type_list htab_of_map_type __ro_after_init = { + .ops = &htab_of_map_ops, + .type = BPF_MAP_TYPE_HASH_OF_MAPS, +}; + static int __init register_htab_map(void) { bpf_register_map_type(&htab_type); bpf_register_map_type(&htab_percpu_type); bpf_register_map_type(&htab_lru_type); bpf_register_map_type(&htab_lru_percpu_type); + bpf_register_map_type(&htab_of_map_type); return 0; } late_initcall(register_htab_map); diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c new file mode 100644 index 000000000000..59bcdf821ae4 --- /dev/null +++ b/kernel/bpf/map_in_map.c @@ -0,0 +1,97 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/slab.h> +#include <linux/bpf.h> + +#include "map_in_map.h" + +struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) +{ + struct bpf_map *inner_map, *inner_map_meta; + struct fd f; + + f = fdget(inner_map_ufd); + inner_map = __bpf_map_get(f); + if (IS_ERR(inner_map)) + return inner_map; + + /* prog_array->owner_prog_type and owner_jited + * is a runtime binding. Doing static check alone + * in the verifier is not enough. + */ + if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { + fdput(f); + return ERR_PTR(-ENOTSUPP); + } + + /* Does not support >1 level map-in-map */ + if (inner_map->inner_map_meta) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + inner_map_meta = kzalloc(sizeof(*inner_map_meta), GFP_USER); + if (!inner_map_meta) { + fdput(f); + return ERR_PTR(-ENOMEM); + } + + inner_map_meta->map_type = inner_map->map_type; + inner_map_meta->key_size = inner_map->key_size; + inner_map_meta->value_size = inner_map->value_size; + inner_map_meta->map_flags = inner_map->map_flags; + inner_map_meta->ops = inner_map->ops; + inner_map_meta->max_entries = inner_map->max_entries; + + fdput(f); + return inner_map_meta; +} + +void bpf_map_meta_free(struct bpf_map *map_meta) +{ + kfree(map_meta); +} + +bool bpf_map_meta_equal(const struct bpf_map *meta0, + const struct bpf_map *meta1) +{ + /* No need to compare ops because it is covered by map_type */ + return meta0->map_type == meta1->map_type && + meta0->key_size == meta1->key_size && + meta0->value_size == meta1->value_size && + meta0->map_flags == meta1->map_flags && + meta0->max_entries == meta1->max_entries; +} + +void *bpf_map_fd_get_ptr(struct bpf_map *map, + struct file *map_file /* not used */, + int ufd) +{ + struct bpf_map *inner_map; + struct fd f; + + f = fdget(ufd); + inner_map = __bpf_map_get(f); + if (IS_ERR(inner_map)) + return inner_map; + + if (bpf_map_meta_equal(map->inner_map_meta, inner_map)) + inner_map = bpf_map_inc(inner_map, false); + else + inner_map = ERR_PTR(-EINVAL); + + fdput(f); + return inner_map; +} + +void bpf_map_fd_put_ptr(void *ptr) +{ + /* ptr->ops->map_free() has to go through one + * rcu grace period by itself. + */ + bpf_map_put(ptr); +} diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h new file mode 100644 index 000000000000..177fadb689dc --- /dev/null +++ b/kernel/bpf/map_in_map.h @@ -0,0 +1,23 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef __MAP_IN_MAP_H__ +#define __MAP_IN_MAP_H__ + +#include <linux/types.h> + +struct file; +struct bpf_map; + +struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd); +void bpf_map_meta_free(struct bpf_map *map_meta); +bool bpf_map_meta_equal(const struct bpf_map *meta0, + const struct bpf_map *meta1); +void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, + int ufd); +void bpf_map_fd_put_ptr(void *ptr); + +#endif diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 48c914b983bd..c35ebfe6d84d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -215,7 +215,7 @@ int bpf_map_new_fd(struct bpf_map *map) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -#define BPF_MAP_CREATE_LAST_FIELD map_flags +#define BPF_MAP_CREATE_LAST_FIELD inner_map_fd /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -352,6 +352,9 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_percpu_array_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS || + map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + err = -ENOTSUPP; } else { rcu_read_lock(); ptr = map->ops->map_lookup_elem(map, key); @@ -438,11 +441,17 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_percpu_array_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || map->map_type == BPF_MAP_TYPE_PROG_ARRAY || - map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) { + map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || + map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) { rcu_read_lock(); err = bpf_fd_array_map_update_elem(map, f.file, key, value, attr->flags); rcu_read_unlock(); + } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + rcu_read_lock(); + err = bpf_fd_htab_map_update_elem(map, f.file, key, value, + attr->flags); + rcu_read_unlock(); } else { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, attr->flags); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 90bf46787603..09923cc5c7c7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -143,6 +143,8 @@ struct bpf_verifier_stack_elem { #define BPF_COMPLEXITY_LIMIT_INSNS 65536 #define BPF_COMPLEXITY_LIMIT_STACK 1024 +#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA) + struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; @@ -1197,6 +1199,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) func_id != BPF_FUNC_current_task_under_cgroup) goto error; break; + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH_OF_MAPS: + if (func_id != BPF_FUNC_map_lookup_elem) + goto error; default: break; } @@ -1357,6 +1363,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { + struct bpf_insn_aux_data *insn_aux; + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0; /* remember map_ptr, so that check_map_access() @@ -1369,7 +1377,11 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) } regs[BPF_REG_0].map_ptr = meta.map_ptr; regs[BPF_REG_0].id = ++env->id_gen; - env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr; + insn_aux = &env->insn_aux_data[insn_idx]; + if (!insn_aux->map_ptr) + insn_aux->map_ptr = meta.map_ptr; + else if (insn_aux->map_ptr != meta.map_ptr) + insn_aux->map_ptr = BPF_MAP_PTR_POISON; } else { verbose("unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -2093,14 +2105,19 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, struct bpf_reg_state *reg = ®s[regno]; if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { - reg->type = type; + if (type == UNKNOWN_VALUE) { + __mark_reg_unknown_value(regs, regno); + } else if (reg->map_ptr->inner_map_meta) { + reg->type = CONST_PTR_TO_MAP; + reg->map_ptr = reg->map_ptr->inner_map_meta; + } else { + reg->type = type; + } /* We don't need id from this point onwards anymore, thus we * should better reset it, so that state pruning has chances * to take effect. */ reg->id = 0; - if (type == UNKNOWN_VALUE) - __mark_reg_unknown_value(regs, regno); } } @@ -3025,16 +3042,33 @@ process_bpf_exit: return 0; } +static int check_map_prealloc(struct bpf_map *map) +{ + return (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_PERCPU_HASH && + map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) || + !(map->map_flags & BPF_F_NO_PREALLOC); +} + static int check_map_prog_compatibility(struct bpf_map *map, struct bpf_prog *prog) { - if (prog->type == BPF_PROG_TYPE_PERF_EVENT && - (map->map_type == BPF_MAP_TYPE_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_HASH) && - (map->map_flags & BPF_F_NO_PREALLOC)) { - verbose("perf_event programs can only use preallocated hash map\n"); - return -EINVAL; + /* Make sure that BPF_PROG_TYPE_PERF_EVENT programs only use + * preallocated hash maps, since doing memory allocation + * in overflow_handler can crash depending on where nmi got + * triggered. + */ + if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { + if (!check_map_prealloc(map)) { + verbose("perf_event programs can only use preallocated hash map\n"); + return -EINVAL; + } + if (map->inner_map_meta && + !check_map_prealloc(map->inner_map_meta)) { + verbose("perf_event programs can only use preallocated inner hash map\n"); + return -EINVAL; + } } return 0; } @@ -3307,7 +3341,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) if (ebpf_jit_enabled() && insn->imm == BPF_FUNC_map_lookup_elem) { map_ptr = env->insn_aux_data[i + delta].map_ptr; - if (!map_ptr->ops->map_gen_lookup) + if (map_ptr == BPF_MAP_PTR_POISON || + !map_ptr->ops->map_gen_lookup) goto patch_call_imm; cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf); |
