diff options
| author | 2014-09-26 00:16:57 -0700 | |
|---|---|---|
| committer | 2014-09-26 15:05:14 -0400 | |
| commit | 99c55f7d47c0dc6fc64729f37bf435abf43f4c60 (patch) | |
| tree | 12f09f26bee9813ae33cfc195582c41e94b2e4e9 /kernel/bpf/syscall.c | |
| parent | net: sched: use pinned timers (diff) | |
| download | wireguard-linux-99c55f7d47c0dc6fc64729f37bf435abf43f4c60.tar.xz wireguard-linux-99c55f7d47c0dc6fc64729f37bf435abf43f4c60.zip | |
bpf: introduce BPF syscall and maps
BPF syscall is a multiplexor for a range of different operations on eBPF.
This patch introduces the syscall with a single command to create a map.
The next patch adds commands to access maps.
'maps' are generic storage of different types for sharing data between the kernel
and userspace.
Userspace example:
#include <string.h>

/* This syscall wrapper creates a map with the given type and attributes
 * and returns map_fd on success.
 * Use close(map_fd) to delete the map.
 *
 * attr is cleared with memset() instead of a designated initializer:
 * initializing one member of a union does not guarantee that the bytes
 * outside that member are zero, and the kernel rejects BPF_MAP_CREATE
 * with -EINVAL when unused bytes of 'union bpf_attr' are non-zero.
 */
int bpf_create_map(enum bpf_map_type map_type, int key_size,
                   int value_size, int max_entries)
{
    union bpf_attr attr;

    memset(&attr, 0, sizeof(attr));
    attr.map_type = map_type;
    attr.key_size = key_size;
    attr.value_size = value_size;
    attr.max_entries = max_entries;

    return bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}
'union bpf_attr' is backwards compatible with future extensions.
More details are in Documentation/networking/filter.txt and in the manpage.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'kernel/bpf/syscall.c')
| -rw-r--r-- | kernel/bpf/syscall.c | 169 | 
1 files changed, 169 insertions, 0 deletions
| diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c new file mode 100644 index 000000000000..428a0e23adc0 --- /dev/null +++ b/kernel/bpf/syscall.c @@ -0,0 +1,169 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/bpf.h> +#include <linux/syscalls.h> +#include <linux/slab.h> +#include <linux/anon_inodes.h> + +static LIST_HEAD(bpf_map_types); +/* find the registered map type matching attr->map_type and allocate a map + * through its ops->map_alloc(); returns the new map, the ERR_PTR handed back + * by map_alloc(), or ERR_PTR(-EINVAL) when no registered type matches + */ +static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) +{ +	struct bpf_map_type_list *tl; +	struct bpf_map *map; + +	list_for_each_entry(tl, &bpf_map_types, list_node) { +		if (tl->type == attr->map_type) { +			map = tl->ops->map_alloc(attr); +			if (IS_ERR(map)) +				return map; +			map->ops = tl->ops; +			map->map_type = attr->map_type; +			return map; +		} +	} +	return ERR_PTR(-EINVAL); +} + +/* boot time registration of different map implementations: + * adds tl to the bpf_map_types list searched by find_and_alloc_map() + */ +void bpf_register_map_type(struct bpf_map_type_list *tl) +{ +	list_add(&tl->list_node, &bpf_map_types); +} + +/* called from workqueue */ +static void bpf_map_free_deferred(struct work_struct *work) +{ +	struct bpf_map *map = container_of(work, struct bpf_map, work); + +	/* implementation dependent freeing */ +	map->ops->map_free(map); +} + +/* decrement map refcnt and, when it drops to zero, schedule the map for + * freeing via workqueue + * (underlying map implementation ops->map_free() might sleep) + */ +void bpf_map_put(struct bpf_map *map) +{ +	if (atomic_dec_and_test(&map->refcnt)) { +		INIT_WORK(&map->work, bpf_map_free_deferred); +		schedule_work(&map->work); +	} +} + +static int bpf_map_release(struct inode 
*inode, struct file *filp) +{ +	struct bpf_map *map = filp->private_data; + +	bpf_map_put(map); +	return 0; +} + +static const struct file_operations bpf_map_fops = { +	.release = bpf_map_release, +}; + +/* helper macro to check that unused fields of 'union bpf_attr' are zero: + * evaluates to true when any byte after CMD##_LAST_FIELD is non-zero. + * Expects a local variable named 'attr' pointing to the union. + */ +#define CHECK_ATTR(CMD) \ +	memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ +		   sizeof(attr->CMD##_LAST_FIELD), 0, \ +		   sizeof(*attr) - \ +		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ +		   sizeof(attr->CMD##_LAST_FIELD)) != NULL + +#define BPF_MAP_CREATE_LAST_FIELD max_entries +/* called via syscall: returns -EINVAL if bytes after max_entries are non-zero, + * otherwise allocates the map, sets its refcnt to 1 and returns the fd from + * anon_inode_getfd() or a negative error + */ +static int map_create(union bpf_attr *attr) +{ +	struct bpf_map *map; +	int err; + +	err = CHECK_ATTR(BPF_MAP_CREATE); +	if (err) +		return -EINVAL; + +	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */ +	map = find_and_alloc_map(attr); +	if (IS_ERR(map)) +		return PTR_ERR(map); + +	atomic_set(&map->refcnt, 1); + +	err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC); + +	if (err < 0) +		/* failed to allocate fd */ +		goto free_map; + +	return err; + +free_map: +	map->ops->map_free(map); +	return err; +} +/* bpf syscall entry point: copies at most sizeof(union bpf_attr) bytes of + * attributes from user space and dispatches on cmd; unknown commands + * return -EINVAL + */ +SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) +{ +	union bpf_attr attr = {}; +	int err; + +	/* the syscall is limited to root temporarily. This restriction will be +	 * lifted when security audit is clean. Note that eBPF+tracing must have +	 * this restriction, since it may pass kernel data to user space +	 */ +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	if (!access_ok(VERIFY_READ, uattr, 1)) +		return -EFAULT; + +	if (size > PAGE_SIZE)	/* silly large */ +		return -E2BIG; + +	/* If we're handed a bigger struct than we know of, +	 * ensure all the unknown bits are 0 - i.e. new +	 * user-space does not rely on any kernel feature +	 * extensions we don't know about yet. 
+	 */ +	if (size > sizeof(attr)) { +		unsigned char __user *addr; +		unsigned char __user *end; +		unsigned char val; + +		addr = (void __user *)uattr + sizeof(attr); +		end  = (void __user *)uattr + size; + +		for (; addr < end; addr++) { +			err = get_user(val, addr); +			if (err) +				return err; +			if (val) +				return -E2BIG; +		} +		size = sizeof(attr); +	} + +	/* copy attributes from user space, may be less than sizeof(bpf_attr); +	 * attr was zero-initialized above, so bytes not copied stay zero +	 */ +	if (copy_from_user(&attr, uattr, size) != 0) +		return -EFAULT; + +	switch (cmd) { +	case BPF_MAP_CREATE: +		err = map_create(&attr); +		break; +	default: +		err = -EINVAL; +		break; +	} + +	return err; +} | 
