From 546ac1ffb70d25b56c1126940e5ec639c4dd7413 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jul 2017 09:28:56 -0700 Subject: bpf: add devmap, a map for storing net device references Device map (devmap) is a BPF map, primarily useful for networking applications, that uses a key to lookup a reference to a netdevice. The map provides a clean way for BPF programs to build virtual port to physical port maps. Additionally, it provides a scoping function for the redirect action itself allowing multiple optimizations. Future patches will leverage the map to provide batching at the XDP layer. Another optimization/feature, that is not yet implemented, would be to support multiple netdevices per key to support efficient multicast and broadcast support. Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- kernel/bpf/devmap.c | 264 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 264 insertions(+) create mode 100644 kernel/bpf/devmap.c (limited to 'kernel/bpf/devmap.c') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c new file mode 100644 index 000000000000..1a878356bd37 --- /dev/null +++ b/kernel/bpf/devmap.c @@ -0,0 +1,264 @@ +/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +/* Devmaps primary use is as a backend map for XDP BPF helper call + * bpf_redirect_map(). Because XDP is mostly concerned with performance we + * spent some effort to ensure the datapath with redirect maps does not use + * any locking. This is a quick note on the details. + * + * We have three possible paths to get into the devmap control plane bpf + * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall + * will invoke an update, delete, or lookup operation. To ensure updates and + * deletes appear atomic from the datapath side xchg() is used to modify the + * netdev_map array. Then because the datapath does a lookup into the netdev_map + * array (read-only) from an RCU critical section we use call_rcu() to wait for + * an rcu grace period before free'ing the old data structures. This ensures the + * datapath always has a valid copy. However, the datapath does a "flush" + * operation that pushes any pending packets in the driver outside the RCU + * critical section. Each bpf_dtab_netdev tracks these pending operations using + * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed + * until all bits are cleared indicating outstanding flush operations have + * completed. + * + * BPF syscalls may race with BPF program calls on any of the update, delete + * or lookup operations. As noted above the xchg() operation also keep the + * netdev_map consistent in this case. From the devmap side BPF programs + * calling into these operations are the same as multiple user space threads + * making system calls. 
+ */ +#include +#include +#include +#include +#include "percpu_freelist.h" +#include "bpf_lru_list.h" +#include "map_in_map.h" + +struct bpf_dtab_netdev { + struct net_device *dev; + int key; + struct rcu_head rcu; + struct bpf_dtab *dtab; +}; + +struct bpf_dtab { + struct bpf_map map; + struct bpf_dtab_netdev **netdev_map; +}; + +static struct bpf_map *dev_map_alloc(union bpf_attr *attr) +{ + struct bpf_dtab *dtab; + u64 cost; + int err; + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != 4 || attr->map_flags) + return ERR_PTR(-EINVAL); + + /* if value_size is bigger, the user space won't be able to + * access the elements. + */ + if (attr->value_size > KMALLOC_MAX_SIZE) + return ERR_PTR(-E2BIG); + + dtab = kzalloc(sizeof(*dtab), GFP_USER); + if (!dtab) + return ERR_PTR(-ENOMEM); + + /* mandatory map attributes */ + dtab->map.map_type = attr->map_type; + dtab->map.key_size = attr->key_size; + dtab->map.value_size = attr->value_size; + dtab->map.max_entries = attr->max_entries; + dtab->map.map_flags = attr->map_flags; + + err = -ENOMEM; + + /* make sure page count doesn't overflow */ + cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); + if (cost >= U32_MAX - PAGE_SIZE) + goto free_dtab; + + dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + /* if map size is larger than memlock limit, reject it early */ + err = bpf_map_precharge_memlock(dtab->map.pages); + if (err) + goto free_dtab; + + dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * + sizeof(struct bpf_dtab_netdev *)); + if (!dtab->netdev_map) + goto free_dtab; + + return &dtab->map; + +free_dtab: + kfree(dtab); + return ERR_PTR(err); +} + +static void dev_map_free(struct bpf_map *map) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + int i; + + /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. Wait for outstanding critical sections in + * these programs to complete. The rcu critical section only guarantees + * no further reads against netdev_map. It does __not__ ensure pending + * flush operations (if any) are complete. + */ + synchronize_rcu(); + + for (i = 0; i < dtab->map.max_entries; i++) { + struct bpf_dtab_netdev *dev; + + dev = dtab->netdev_map[i]; + if (!dev) + continue; + + dev_put(dev->dev); + kfree(dev); + } + + /* At this point bpf program is detached and all pending operations + * _must_ be complete + */ + bpf_map_area_free(dtab->netdev_map); + kfree(dtab); +} + +static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + u32 index = key ? *(u32 *)key : U32_MAX; + u32 *next = (u32 *)next_key; + + if (index >= dtab->map.max_entries) { + *next = 0; + return 0; + } + + if (index == dtab->map.max_entries - 1) + return -ENOENT; + + *next = index + 1; + return 0; +} + +/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or + * update happens in parallel here a dev_put wont happen until after reading the + * ifindex. + */ +static void *dev_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_dtab_netdev *dev; + u32 i = *(u32 *)key; + + if (i >= map->max_entries) + return NULL; + + dev = READ_ONCE(dtab->netdev_map[i]); + return dev ? 
&dev->dev->ifindex : NULL; +} + +static void __dev_map_entry_free(struct rcu_head *rcu) +{ + struct bpf_dtab_netdev *old_dev; + + old_dev = container_of(rcu, struct bpf_dtab_netdev, rcu); + dev_put(old_dev->dev); + kfree(old_dev); +} + +static int dev_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_dtab_netdev *old_dev; + int k = *(u32 *)key; + + if (k >= map->max_entries) + return -EINVAL; + + /* Use synchronize_rcu() here to ensure any rcu critical sections + * have completed, but this does not guarantee a flush has happened + * yet. Because driver side rcu_read_lock/unlock only protects the + * running XDP program. However, for pending flush operations the + * dev and ctx are stored in another per cpu map. And additionally, + * the driver tear down ensures all soft irqs are complete before + * removing the net device in the case of dev_put equals zero. + */ + old_dev = xchg(&dtab->netdev_map[k], NULL); + if (old_dev) + call_rcu(&old_dev->rcu, __dev_map_entry_free); + return 0; +} + +static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct net *net = current->nsproxy->net_ns; + struct bpf_dtab_netdev *dev, *old_dev; + u32 i = *(u32 *)key; + u32 ifindex = *(u32 *)value; + + if (unlikely(map_flags > BPF_EXIST)) + return -EINVAL; + + if (unlikely(i >= dtab->map.max_entries)) + return -E2BIG; + + if (unlikely(map_flags == BPF_NOEXIST)) + return -EEXIST; + + if (!ifindex) { + dev = NULL; + } else { + dev = kmalloc(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN); + if (!dev) + return -ENOMEM; + + dev->dev = dev_get_by_index(net, ifindex); + if (!dev->dev) { + kfree(dev); + return -EINVAL; + } + + dev->key = i; + dev->dtab = dtab; + } + + /* Use call_rcu() here to ensure rcu critical sections have completed + * Remembering the driver side flush operation will happen before the + * net device is removed. + */ + old_dev = xchg(&dtab->netdev_map[i], dev); + if (old_dev) + call_rcu(&old_dev->rcu, __dev_map_entry_free); + + return 0; +} + +const struct bpf_map_ops dev_map_ops = { + .map_alloc = dev_map_alloc, + .map_free = dev_map_free, + .map_get_next_key = dev_map_get_next_key, + .map_lookup_elem = dev_map_lookup_elem, + .map_update_elem = dev_map_update_elem, + .map_delete_elem = dev_map_delete_elem, +}; -- cgit v1.3-6-gb490 From 97f91a7cf04ff605845c20948b8a80e54cbd3376 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jul 2017 09:29:18 -0700 Subject: bpf: add bpf_redirect_map helper routine BPF programs can use the devmap with a bpf_redirect_map() helper routine to forward packets to netdevice in map. Signed-off-by: John Fastabend Signed-off-by: Jesper Dangaard Brouer Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 3 +++ include/uapi/linux/bpf.h | 8 +++++++- kernel/bpf/devmap.c | 12 +++++++++++ kernel/bpf/verifier.c | 4 ++++ net/core/filter.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 78 insertions(+), 1 deletion(-) (limited to 'kernel/bpf/devmap.c') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b69e7a5869ff..d0d3281ac678 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -379,4 +379,7 @@ extern const struct bpf_func_proto bpf_get_stackid_proto; void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +/* Map specifics */ +struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); + #endif /* _LINUX_BPF_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ecbb0e7e15bc..1106a8c4cd36 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -348,6 +348,11 @@ union bpf_attr { * @flags: bit 0 - if set, redirect to ingress instead of egress * other bits - reserved * Return: TC_ACT_REDIRECT + * int bpf_redirect_map(key, map, flags) + * redirect to endpoint in map + * @key: index in map to lookup + * @map: fd of map to do lookup in + * @flags: -- * * u32 bpf_get_route_realm(skb) * retrieve a dst's tclassid @@ -592,7 +597,8 @@ union bpf_attr { FN(get_socket_uid), \ FN(set_hash), \ FN(setsockopt), \ - FN(skb_adjust_room), + FN(skb_adjust_room), \ + FN(redirect_map), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 1a878356bd37..36dc13deb2e1 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -159,6 +159,18 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } +struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_dtab_netdev *dev; + + if (key >= map->max_entries) + return NULL; + + dev = READ_ONCE(dtab->netdev_map[key]); + return dev ? dev->dev : NULL; +} + /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or * update happens in parallel here a dev_put wont happen until after reading the * ifindex. 
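For a sense of how a program consumes this map, a minimal, hedged XDP program sketch follows; the map name, section names and the bpf_redirect_map() wrapper are assumed to follow the samples/bpf bpf_helpers.h conventions and are not part of this patch.

/* Hypothetical XDP program: redirect every frame to the net device stored
 * in slot 0 of a DEVMAP. Values in the map are ifindex numbers.
 */
#include <linux/bpf.h>
#include "bpf_helpers.h"              /* assumed: SEC(), bpf_map_def, helper stubs */

struct bpf_map_def SEC("maps") tx_port = {
        .type        = BPF_MAP_TYPE_DEVMAP,
        .key_size    = sizeof(__u32),
        .value_size  = sizeof(__u32), /* ifindex of the egress device */
        .max_entries = 64,
};

SEC("xdp")
int xdp_redirect_map_prog(struct xdp_md *ctx)
{
        __u32 vport = 0;  /* virtual port picked by the program */

        /* returns XDP_REDIRECT on success, XDP_ABORTED if flags are set */
        return bpf_redirect_map(&tx_port, vport, 0);
}

char _license[] SEC("license") = "GPL";

User space would populate tx_port with ifindex values, one per virtual port, through the normal bpf(2) map update path before attaching the program.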
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4016774d5cca..df05d65f0c87 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1312,6 +1312,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) goto error; break; + case BPF_FUNC_redirect_map: + if (map->map_type != BPF_MAP_TYPE_DEVMAP) + goto error; + break; default: break; } diff --git a/net/core/filter.c b/net/core/filter.c index e30d38b27f21..e93a558324b5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1779,6 +1779,7 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = { struct redirect_info { u32 ifindex; u32 flags; + struct bpf_map *map; }; static DEFINE_PER_CPU(struct redirect_info, redirect_info); @@ -1792,6 +1793,7 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) ri->ifindex = ifindex; ri->flags = flags; + ri->map = NULL; return TC_ACT_REDIRECT; } @@ -1819,6 +1821,29 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + + if (unlikely(flags)) + return XDP_ABORTED; + + ri->ifindex = ifindex; + ri->flags = flags; + ri->map = map; + + return XDP_REDIRECT; +} + +static const struct bpf_func_proto bpf_redirect_map_proto = { + .func = bpf_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -2423,14 +2448,39 @@ static int __bpf_tx_xdp(struct net_device *dev, struct xdp_buff *xdp) return -EOPNOTSUPP; } +int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_map *map = ri->map; + struct net_device *fwd; + int err = -EINVAL; + + ri->ifindex = 0; + ri->map = NULL; + + fwd = __dev_map_lookup_elem(map, ri->ifindex); + if (!fwd) + goto out; + + trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT); + err = __bpf_tx_xdp(fwd, xdp); +out: + return err; +} + int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); struct net_device *fwd; + if (ri->map) + return xdp_do_redirect_map(dev, xdp, xdp_prog); + fwd = dev_get_by_index_rcu(dev_net(dev), ri->ifindex); ri->ifindex = 0; + ri->map = NULL; if (unlikely(!fwd)) { bpf_warn_invalid_xdp_redirect(ri->ifindex); return -EINVAL; @@ -3089,6 +3139,8 @@ xdp_func_proto(enum bpf_func_id func_id) return &bpf_xdp_adjust_head_proto; case BPF_FUNC_redirect: return &bpf_xdp_redirect_proto; + case BPF_FUNC_redirect_map: + return &bpf_redirect_map_proto; default: return bpf_base_func_proto(func_id); } -- cgit v1.3-6-gb490 From 11393cc9b9be2a1f61559e6fb9c27bc8fa20b1ff Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jul 2017 09:29:40 -0700 Subject: xdp: Add batching support to redirect map For performance reasons we want to avoid updating the tail pointer in the driver tx ring as much as possible. To accomplish this we add batching support to the redirect path in XDP. This adds another ndo op "xdp_flush" that is used to inform the driver that it should bump the tail pointer on the TX ring. 
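To make the intended calling pattern concrete, a hedged driver-side sketch follows; the example_rx_ring structure and example_rx_next_frame() helper are invented for illustration, and only bpf_prog_run_xdp(), xdp_do_redirect() and xdp_do_flush_map() are interfaces used or added by this series.

#include <linux/netdevice.h>
#include <linux/filter.h>

struct example_rx_ring {                 /* hypothetical driver state */
        struct napi_struct napi;
        struct net_device *netdev;
        struct bpf_prog *xdp_prog;
};

/* hypothetical: fill an xdp_buff for the next received frame, false if none */
bool example_rx_next_frame(struct example_rx_ring *rx, struct xdp_buff *xdp);

static int example_napi_poll(struct napi_struct *napi, int budget)
{
        struct example_rx_ring *rx = container_of(napi, struct example_rx_ring, napi);
        int done = 0;

        while (done < budget) {
                struct xdp_buff xdp;

                if (!example_rx_next_frame(rx, &xdp))
                        break;

                switch (bpf_prog_run_xdp(rx->xdp_prog, &xdp)) {
                case XDP_REDIRECT:
                        /* queues the frame via ndo_xdp_xmit(); no tail bump yet */
                        xdp_do_redirect(rx->netdev, &xdp, rx->xdp_prog);
                        break;
                default:
                        /* XDP_PASS/XDP_DROP/XDP_TX handling elided */
                        break;
                }
                done++;
        }

        /* one flush per poll: devmap invokes ndo_xdp_flush() for every device
         * that received redirected frames, writing each TX tail register once
         */
        xdp_do_flush_map();

        return done;
}

As the filter.h comment added below spells out, the xdp_do_redirect()/xdp_do_flush_map() pair must run on the same CPU, which the NAPI poll context guarantees.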
Signed-off-by: John Fastabend Signed-off-by: Jesper Dangaard Brouer Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 28 ++++++++- include/linux/bpf.h | 2 + include/linux/filter.h | 7 +++ include/linux/netdevice.h | 5 +- kernel/bpf/devmap.c | 84 ++++++++++++++++++++++++++- net/core/filter.c | 55 ++++++++++++++---- 6 files changed, 166 insertions(+), 15 deletions(-) (limited to 'kernel/bpf/devmap.c') diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 38f7ff97d636..0f867dcda65f 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2415,6 +2415,8 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, */ wmb(); writel(ring->next_to_use, ring->tail); + + xdp_do_flush_map(); } u64_stats_update_begin(&rx_ring->syncp); @@ -5817,6 +5819,9 @@ void ixgbe_down(struct ixgbe_adapter *adapter) usleep_range(10000, 20000); + /* synchronize_sched() needed for pending XDP buffers to drain */ + if (adapter->xdp_ring[0]) + synchronize_sched(); netif_tx_stop_all_queues(netdev); /* call carrier off first to avoid false dev_watchdog timeouts */ @@ -9850,15 +9855,31 @@ static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) if (err != IXGBE_XDP_TX) return -ENOMEM; + return 0; +} + +static void ixgbe_xdp_flush(struct net_device *dev) +{ + struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_ring *ring; + + /* Its possible the device went down between xdp xmit and flush so + * we need to ensure device is still up. + */ + if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state))) + return; + + ring = adapter->xdp_prog ? adapter->xdp_ring[smp_processor_id()] : NULL; + if (unlikely(!ring)) + return; + /* Force memory writes to complete before letting h/w know there * are new descriptors to fetch. */ wmb(); - - ring = adapter->xdp_ring[smp_processor_id()]; writel(ring->next_to_use, ring->tail); - return 0; + return; } static const struct net_device_ops ixgbe_netdev_ops = { @@ -9908,6 +9929,7 @@ static const struct net_device_ops ixgbe_netdev_ops = { .ndo_features_check = ixgbe_features_check, .ndo_xdp = ixgbe_xdp, .ndo_xdp_xmit = ixgbe_xdp_xmit, + .ndo_xdp_flush = ixgbe_xdp_flush, }; /** diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d0d3281ac678..6850a760dc94 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -381,5 +381,7 @@ u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); /* Map specifics */ struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); +void __dev_map_insert_ctx(struct bpf_map *map, u32 index); +void __dev_map_flush(struct bpf_map *map); #endif /* _LINUX_BPF_H */ diff --git a/include/linux/filter.h b/include/linux/filter.h index ce8211fa91c7..3323ee91c172 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -712,10 +712,17 @@ bool bpf_helper_changes_pkt_data(void *func); struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); +/* The pair of xdp_do_redirect and xdp_do_flush_map MUST be called in the + * same cpu context. Further for best results no more than a single map + * for the do_redirect/do_flush pair should be used. This limitation is + * because we only track one map and force a flush when the map changes. + * This does not appear to be a real limiation for existing software. 
+ */ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb); int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *prog); +void xdp_do_flush_map(void); void bpf_warn_invalid_xdp_action(u32 act); void bpf_warn_invalid_xdp_redirect(u32 ifindex); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 77f5376005e6..03b104908235 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1142,7 +1142,9 @@ struct xfrmdev_ops { * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp); * This function is used to submit a XDP packet for transmit on a * netdevice. - * + * void (*ndo_xdp_flush)(struct net_device *dev); + * This function is used to inform the driver to flush a paticular + * xpd tx queue. Must be called on same CPU as xdp_xmit. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1329,6 +1331,7 @@ struct net_device_ops { struct netdev_xdp *xdp); int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp); + void (*ndo_xdp_flush)(struct net_device *dev); }; /** diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 36dc13deb2e1..b2ef04a1c86a 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -53,6 +53,7 @@ struct bpf_dtab_netdev { struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev **netdev_map; + unsigned long int __percpu *flush_needed; }; static struct bpf_map *dev_map_alloc(union bpf_attr *attr) @@ -87,6 +88,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); + cost += BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); if (cost >= U32_MAX - PAGE_SIZE) goto free_dtab; @@ -97,6 +99,14 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (err) goto free_dtab; + /* A per cpu bitfield with a bit per possible net device */ + dtab->flush_needed = __alloc_percpu( + BITS_TO_LONGS(attr->max_entries) * + sizeof(unsigned long), + __alignof__(unsigned long)); + if (!dtab->flush_needed) + goto free_dtab; + dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *)); if (!dtab->netdev_map) @@ -105,6 +115,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) return &dtab->map; free_dtab: + free_percpu(dtab->flush_needed); kfree(dtab); return ERR_PTR(err); } @@ -112,7 +123,7 @@ free_dtab: static void dev_map_free(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - int i; + int i, cpu; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the programs (can be more than one that used this map) were @@ -123,6 +134,18 @@ static void dev_map_free(struct bpf_map *map) */ synchronize_rcu(); + /* To ensure all pending flush operations have completed wait for flush + * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. + * Because the above synchronize_rcu() ensures the map is disconnected + * from the program we can assume no new bits will be set. 
+ */ + for_each_online_cpu(cpu) { + unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu); + + while (!bitmap_empty(bitmap, dtab->map.max_entries)) + cpu_relax(); + } + for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev; @@ -137,6 +160,7 @@ static void dev_map_free(struct bpf_map *map) /* At this point bpf program is detached and all pending operations * _must_ be complete */ + free_percpu(dtab->flush_needed); bpf_map_area_free(dtab->netdev_map); kfree(dtab); } @@ -159,6 +183,14 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } +void __dev_map_insert_ctx(struct bpf_map *map, u32 key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); + + __set_bit(key, bitmap); +} + struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); @@ -171,6 +203,39 @@ struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key) return dev ? dev->dev : NULL; } +/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled + * from the driver before returning from its napi->poll() routine. The poll() + * routine is called either from busy_poll context or net_rx_action signaled + * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the + * net device can be torn down. On devmap tear down we ensure the ctx bitmap + * is zeroed before completing to ensure all flush operations have completed. + */ +void __dev_map_flush(struct bpf_map *map) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); + u32 bit; + + for_each_set_bit(bit, bitmap, map->max_entries) { + struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); + struct net_device *netdev; + + /* This is possible if the dev entry is removed by user space + * between xdp redirect and flush op. + */ + if (unlikely(!dev)) + continue; + + netdev = dev->dev; + + __clear_bit(bit, bitmap); + if (unlikely(!netdev || !netdev->netdev_ops->ndo_xdp_flush)) + continue; + + netdev->netdev_ops->ndo_xdp_flush(netdev); + } +} + /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or * update happens in parallel here a dev_put wont happen until after reading the * ifindex. @@ -188,11 +253,28 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key) return dev ? 
&dev->dev->ifindex : NULL; } +static void dev_map_flush_old(struct bpf_dtab_netdev *old_dev) +{ + if (old_dev->dev->netdev_ops->ndo_xdp_flush) { + struct net_device *fl = old_dev->dev; + unsigned long *bitmap; + int cpu; + + for_each_online_cpu(cpu) { + bitmap = per_cpu_ptr(old_dev->dtab->flush_needed, cpu); + __clear_bit(old_dev->key, bitmap); + + fl->netdev_ops->ndo_xdp_flush(old_dev->dev); + } + } +} + static void __dev_map_entry_free(struct rcu_head *rcu) { struct bpf_dtab_netdev *old_dev; old_dev = container_of(rcu, struct bpf_dtab_netdev, rcu); + dev_map_flush_old(old_dev); dev_put(old_dev->dev); kfree(old_dev); } diff --git a/net/core/filter.c b/net/core/filter.c index e93a558324b5..e23aa6fa1119 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1780,6 +1780,7 @@ struct redirect_info { u32 ifindex; u32 flags; struct bpf_map *map; + struct bpf_map *map_to_flush; }; static DEFINE_PER_CPU(struct redirect_info, redirect_info); @@ -2438,34 +2439,68 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .arg2_type = ARG_ANYTHING, }; -static int __bpf_tx_xdp(struct net_device *dev, struct xdp_buff *xdp) +static int __bpf_tx_xdp(struct net_device *dev, + struct bpf_map *map, + struct xdp_buff *xdp, + u32 index) { - if (dev->netdev_ops->ndo_xdp_xmit) { - dev->netdev_ops->ndo_xdp_xmit(dev, xdp); - return 0; + int err; + + if (!dev->netdev_ops->ndo_xdp_xmit) { + bpf_warn_invalid_xdp_redirect(dev->ifindex); + return -EOPNOTSUPP; } - bpf_warn_invalid_xdp_redirect(dev->ifindex); - return -EOPNOTSUPP; + + err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); + if (err) + return err; + + if (map) + __dev_map_insert_ctx(map, index); + else + dev->netdev_ops->ndo_xdp_flush(dev); + + return err; } +void xdp_do_flush_map(void) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_map *map = ri->map_to_flush; + + ri->map = NULL; + ri->map_to_flush = NULL; + + if (map) + __dev_map_flush(map); +} +EXPORT_SYMBOL_GPL(xdp_do_flush_map); + int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); struct bpf_map *map = ri->map; + u32 index = ri->ifindex; struct net_device *fwd; int err = -EINVAL; ri->ifindex = 0; ri->map = NULL; - fwd = __dev_map_lookup_elem(map, ri->ifindex); + fwd = __dev_map_lookup_elem(map, index); if (!fwd) goto out; - trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT); - err = __bpf_tx_xdp(fwd, xdp); + if (ri->map_to_flush && (ri->map_to_flush != map)) + xdp_do_flush_map(); + + err = __bpf_tx_xdp(fwd, map, xdp, index); + if (likely(!err)) + ri->map_to_flush = map; + out: + trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT); return err; } @@ -2488,7 +2523,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT); - return __bpf_tx_xdp(fwd, xdp); + return __bpf_tx_xdp(fwd, NULL, xdp, 0); } EXPORT_SYMBOL_GPL(xdp_do_redirect); -- cgit v1.3-6-gb490 From 2ddf71e23cc246e95af72a6deed67b4a50a7b81c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jul 2017 09:30:02 -0700 Subject: net: add notifier hooks for devmap bpf map The BPF map devmap holds a refcnt on the net_device structure when it is in the map. We need to do this to ensure on driver unload we don't lose a dev reference. However, its not very convenient to have to manually unload the map when destroying a net device so add notifier handlers to do the cleanup automatically. 
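To show what that manual unload looks like from user space, here is a hedged sketch using the tools/lib/bpf wrappers; the map fd and key values are hypothetical.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <linux/types.h>
#include "bpf.h"                /* tools/lib/bpf wrappers, assumed available */

/* Clear the devmap slot that still holds the ifindex of the device being
 * removed; on the kernel side this drops the dev reference via call_rcu().
 */
static void purge_devmap_slot(int devmap_fd, __u32 key)
{
        if (bpf_map_delete_elem(devmap_fd, &key))
                fprintf(stderr, "devmap delete: %s\n", strerror(errno));
}

The NETDEV_UNREGISTER handler added in this patch performs the equivalent sweep automatically.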
But this creates a race between update/destroy BPF syscall and programs and the unregister netdev hook. Unfortunately, the best I could come up with is either to live with requiring manual removal of net devices from the map before removing the net device OR to add a mutex in devmap to ensure the map is not modified while we are removing a device. The fallout also requires that BPF programs no longer update/delete the map from the BPF program side because the mutex may sleep and this can not be done from inside an rcu critical section. This is not a real problem though because I have not come up with any use cases where this is actually useful in practice. If/when we come up with a compelling user for this we may need to revisit this. Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/filter.h | 2 +- kernel/bpf/devmap.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 2 +- 3 files changed, 75 insertions(+), 2 deletions(-) (limited to 'kernel/bpf/devmap.c') diff --git a/include/linux/filter.h b/include/linux/filter.h index 3323ee91c172..d19ed3c15e1e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -716,7 +716,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, * same cpu context. Further for best results no more than a single map * for the do_redirect/do_flush pair should be used. This limitation is * because we only track one map and force a flush when the map changes. - * This does not appear to be a real limiation for existing software. + * This does not appear to be a real limitation for existing software. */ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb); int xdp_do_redirect(struct net_device *dev, diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index b2ef04a1c86a..899364d097f5 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -34,6 +34,17 @@ * netdev_map consistent in this case. From the devmap side BPF programs * calling into these operations are the same as multiple user space threads * making system calls. + * + * Finally, any of the above may race with a netdev_unregister notifier. The + * unregister notifier must search for net devices in the map structure that + * contain a reference to the net device and remove them. This is a two step + * process (a) dereference the bpf_dtab_netdev object in netdev_map and (b) + * check to see if the ifindex is the same as the net_device being removed. + * Unfortunately, the xchg() operations do not protect against this. To avoid + * potentially removing incorrect objects the dev_map_list_mutex protects + * conflicting netdev unregister and BPF syscall operations. Updates and + * deletes from a BPF program (done in rcu critical section) are blocked + * because of this mutex. 
*/ #include #include @@ -54,8 +65,12 @@ struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev **netdev_map; unsigned long int __percpu *flush_needed; + struct list_head list; }; +static DEFINE_MUTEX(dev_map_list_mutex); +static LIST_HEAD(dev_map_list); + static struct bpf_map *dev_map_alloc(union bpf_attr *attr) { struct bpf_dtab *dtab; @@ -112,6 +127,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (!dtab->netdev_map) goto free_dtab; + mutex_lock(&dev_map_list_mutex); + list_add_tail(&dtab->list, &dev_map_list); + mutex_unlock(&dev_map_list_mutex); return &dtab->map; free_dtab: @@ -146,6 +164,11 @@ static void dev_map_free(struct bpf_map *map) cpu_relax(); } + /* Although we should no longer have datapath or bpf syscall operations + * at this point we we can still race with netdev notifier, hence the + * lock. + */ + mutex_lock(&dev_map_list_mutex); for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev; @@ -160,6 +183,8 @@ static void dev_map_free(struct bpf_map *map) /* At this point bpf program is detached and all pending operations * _must_ be complete */ + list_del(&dtab->list); + mutex_unlock(&dev_map_list_mutex); free_percpu(dtab->flush_needed); bpf_map_area_free(dtab->netdev_map); kfree(dtab); @@ -296,9 +321,11 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) * the driver tear down ensures all soft irqs are complete before * removing the net device in the case of dev_put equals zero. */ + mutex_lock(&dev_map_list_mutex); old_dev = xchg(&dtab->netdev_map[k], NULL); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); + mutex_unlock(&dev_map_list_mutex); return 0; } @@ -341,9 +368,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, * Remembering the driver side flush operation will happen before the * net device is removed. */ + mutex_lock(&dev_map_list_mutex); old_dev = xchg(&dtab->netdev_map[i], dev); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); + mutex_unlock(&dev_map_list_mutex); return 0; } @@ -356,3 +385,47 @@ const struct bpf_map_ops dev_map_ops = { .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, }; + +static int dev_map_notification(struct notifier_block *notifier, + ulong event, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + struct bpf_dtab *dtab; + int i; + + switch (event) { + case NETDEV_UNREGISTER: + mutex_lock(&dev_map_list_mutex); + list_for_each_entry(dtab, &dev_map_list, list) { + for (i = 0; i < dtab->map.max_entries; i++) { + struct bpf_dtab_netdev *dev; + + dev = dtab->netdev_map[i]; + if (!dev || + dev->dev->ifindex != netdev->ifindex) + continue; + dev = xchg(&dtab->netdev_map[i], NULL); + if (dev) + call_rcu(&dev->rcu, + __dev_map_entry_free); + } + } + mutex_unlock(&dev_map_list_mutex); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block dev_map_notifier = { + .notifier_call = dev_map_notification, +}; + +static int __init dev_map_init(void) +{ + register_netdevice_notifier(&dev_map_notifier); + return 0; +} + +subsys_initcall(dev_map_init); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index df05d65f0c87..ebe9b38ff522 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1281,7 +1281,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) * for now. 
*/ case BPF_MAP_TYPE_DEVMAP: - if (func_id == BPF_FUNC_map_lookup_elem) + if (func_id != BPF_FUNC_redirect_map) goto error; break; case BPF_MAP_TYPE_ARRAY_OF_MAPS: -- cgit v1.3-6-gb490 From 241a974ba2c0d98e2104012cb80ed4494c0e66a7 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 22 Jul 2017 10:40:04 +0300 Subject: bpf: dev_map_alloc() shouldn't return NULL We forgot to set the error code on two error paths which means that we return ERR_PTR(0) which is NULL. The caller, find_and_alloc_map(), is not expecting that and will have a NULL dereference. Fixes: 546ac1ffb70d ("bpf: add devmap, a map for storing net device references") Signed-off-by: Dan Carpenter Acked-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: David S. Miller --- kernel/bpf/devmap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/bpf/devmap.c') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 899364d097f5..d439ee0eadb1 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -114,6 +114,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (err) goto free_dtab; + err = -ENOMEM; /* A per cpu bitfield with a bit per possible net device */ dtab->flush_needed = __alloc_percpu( BITS_TO_LONGS(attr->max_entries) * -- cgit v1.3-6-gb490 From 4cc7b9544b9a904add353406ed1bacbf56f75c52 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 4 Aug 2017 22:02:19 -0700 Subject: bpf: devmap fix mutex in rcu critical section Originally we used a mutex to protect concurrent devmap update and delete operations from racing with netdev unregister notifier callbacks. The notifier hook is needed because we increment the netdev ref count when a dev is added to the devmap. This ensures the netdev reference is valid in the datapath. However, we don't want to block unregister events, hence the initial mutex and notifier handler. The concern was in the notifier hook we search the map for dev entries that hold a refcnt on the net device being torn down. But, in order to do this we require two steps, (i) dereference the netdev: dev = rcu_dereference(map[i]) (ii) test ifindex: dev->ifindex == removing_ifindex and then finally we can swap in the NULL dev in the map via an xchg operation, xchg(map[i], NULL) The danger here is a concurrent update could run a different xchg op concurrently leading us to replace the new dev with a NULL dev incorrectly. CPU 1 CPU 2 notifier hook bpf devmap update dev = rcu_dereference(map[i]) dev = rcu_dereference(map[i]) xchg(map[i]), new_dev); rcu_call(dev,...) xchg(map[i], NULL) The above flow would create the incorrect state with the dev reference in the update path being lost. To resolve this the original code used a mutex around the above block. However, updates, deletes, and lookups occur inside rcu critical sections so we can't use a mutex in this context safely. Fortunately, by writing slightly better code we can avoid the mutex altogether. If CPU 1 in the above example uses a cmpxchg and _only_ replaces the dev reference in the map when it is in fact the expected dev the race is removed completely. The two cases being illustrated here, first the race condition, CPU 1 CPU 2 notifier hook bpf devmap update dev = rcu_dereference(map[i]) dev = rcu_dereference(map[i]) xchg(map[i]), new_dev); rcu_call(dev,...) odev = cmpxchg(map[i], dev, NULL) Now we can test the cmpxchg return value, detect odev != dev and abort. Or in the good case, CPU 1 CPU 2 notifier hook bpf devmap update dev = rcu_dereference(map[i]) odev = cmpxchg(map[i], dev, NULL) [...] 
Now 'odev == dev' and we can do proper cleanup. And viola the original race we tried to solve with a mutex is corrected and the trace noted by Sasha below is resolved due to removal of the mutex. Note: When walking the devmap and removing dev references as needed we depend on the core to fail any calls to dev_get_by_index() using the ifindex of the device being removed. This way we do not race with the user while searching the devmap. Additionally, the mutex was also protecting list add/del/read on the list of maps in-use. This patch converts this to an RCU list and spinlock implementation. This protects the list from concurrent alloc/free operations. The notifier hook walks this list so it uses RCU read semantics. BUG: sleeping function called from invalid context at kernel/locking/mutex.c:747 in_atomic(): 1, irqs_disabled(): 0, pid: 16315, name: syz-executor1 1 lock held by syz-executor1/16315: #0: (rcu_read_lock){......}, at: [] map_delete_elem kernel/bpf/syscall.c:577 [inline] #0: (rcu_read_lock){......}, at: [] SYSC_bpf kernel/bpf/syscall.c:1427 [inline] #0: (rcu_read_lock){......}, at: [] SyS_bpf+0x1d32/0x4ba0 kernel/bpf/syscall.c:1388 Fixes: 2ddf71e23cc2 ("net: add notifier hooks for devmap bpf map") Reported-by: Sasha Levin Signed-off-by: Daniel Borkmann Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- kernel/bpf/devmap.c | 48 +++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) (limited to 'kernel/bpf/devmap.c') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index d439ee0eadb1..7192fb67d4de 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -40,11 +40,12 @@ * contain a reference to the net device and remove them. This is a two step * process (a) dereference the bpf_dtab_netdev object in netdev_map and (b) * check to see if the ifindex is the same as the net_device being removed. - * Unfortunately, the xchg() operations do not protect against this. To avoid - * potentially removing incorrect objects the dev_map_list_mutex protects - * conflicting netdev unregister and BPF syscall operations. Updates and - * deletes from a BPF program (done in rcu critical section) are blocked - * because of this mutex. + * When removing the dev a cmpxchg() is used to ensure the correct dev is + * removed, in the case of a concurrent update or delete operation it is + * possible that the initially referenced dev is no longer in the map. As the + * notifier hook walks the map we know that new dev references can not be + * added by the user because core infrastructure ensures dev_get_by_index() + * calls will fail at this point. */ #include #include @@ -68,7 +69,7 @@ struct bpf_dtab { struct list_head list; }; -static DEFINE_MUTEX(dev_map_list_mutex); +static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); static struct bpf_map *dev_map_alloc(union bpf_attr *attr) @@ -128,9 +129,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (!dtab->netdev_map) goto free_dtab; - mutex_lock(&dev_map_list_mutex); - list_add_tail(&dtab->list, &dev_map_list); - mutex_unlock(&dev_map_list_mutex); + spin_lock(&dev_map_lock); + list_add_tail_rcu(&dtab->list, &dev_map_list); + spin_unlock(&dev_map_lock); return &dtab->map; free_dtab: @@ -169,7 +170,6 @@ static void dev_map_free(struct bpf_map *map) * at this point we we can still race with netdev notifier, hence the * lock. 
*/ - mutex_lock(&dev_map_list_mutex); for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev; @@ -184,8 +184,9 @@ static void dev_map_free(struct bpf_map *map) /* At this point bpf program is detached and all pending operations * _must_ be complete */ - list_del(&dtab->list); - mutex_unlock(&dev_map_list_mutex); + spin_lock(&dev_map_lock); + list_del_rcu(&dtab->list); + spin_unlock(&dev_map_lock); free_percpu(dtab->flush_needed); bpf_map_area_free(dtab->netdev_map); kfree(dtab); @@ -322,11 +323,9 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) * the driver tear down ensures all soft irqs are complete before * removing the net device in the case of dev_put equals zero. */ - mutex_lock(&dev_map_list_mutex); old_dev = xchg(&dtab->netdev_map[k], NULL); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); - mutex_unlock(&dev_map_list_mutex); return 0; } @@ -369,11 +368,9 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, * Remembering the driver side flush operation will happen before the * net device is removed. */ - mutex_lock(&dev_map_list_mutex); old_dev = xchg(&dtab->netdev_map[i], dev); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); - mutex_unlock(&dev_map_list_mutex); return 0; } @@ -396,22 +393,27 @@ static int dev_map_notification(struct notifier_block *notifier, switch (event) { case NETDEV_UNREGISTER: - mutex_lock(&dev_map_list_mutex); - list_for_each_entry(dtab, &dev_map_list, list) { + /* This rcu_read_lock/unlock pair is needed because + * dev_map_list is an RCU list AND to ensure a delete + * operation does not free a netdev_map entry while we + * are comparing it against the netdev being unregistered. + */ + rcu_read_lock(); + list_for_each_entry_rcu(dtab, &dev_map_list, list) { for (i = 0; i < dtab->map.max_entries; i++) { - struct bpf_dtab_netdev *dev; + struct bpf_dtab_netdev *dev, *odev; - dev = dtab->netdev_map[i]; + dev = READ_ONCE(dtab->netdev_map[i]); if (!dev || dev->dev->ifindex != netdev->ifindex) continue; - dev = xchg(&dtab->netdev_map[i], NULL); - if (dev) + odev = cmpxchg(&dtab->netdev_map[i], dev, NULL); + if (dev == odev) call_rcu(&dev->rcu, __dev_map_entry_free); } } - mutex_unlock(&dev_map_list_mutex); + rcu_read_unlock(); break; default: break; -- cgit v1.3-6-gb490 From cf9d01405925e3f8144c99d7bf7b184449794066 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 15 Aug 2017 23:35:12 -0700 Subject: bpf: devmap: remove unnecessary value size check In the devmap alloc map logic we check to ensure that the sizeof the values are not greater than KMALLOC_MAX_SIZE. But, in the dev map case we ensure the value size is 4bytes earlier in the function because all values should be netdev ifindex values. The second check is harmless but is not needed so remove it. Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/devmap.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel/bpf/devmap.c') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 7192fb67d4de..18a72a8add43 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -83,12 +83,6 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) attr->value_size != 4 || attr->map_flags) return ERR_PTR(-EINVAL); - /* if value_size is bigger, the user space won't be able to - * access the elements. 
- */ - if (attr->value_size > KMALLOC_MAX_SIZE) - return ERR_PTR(-E2BIG); - dtab = kzalloc(sizeof(*dtab), GFP_USER); if (!dtab) return ERR_PTR(-ENOMEM); -- cgit v1.3-6-gb490 From 96eabe7a40aa17e613cf3db2c742ee8b1fc764d0 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 18 Aug 2017 11:28:00 -0700 Subject: bpf: Allow selecting numa node during map creation The current map creation API does not allow to provide the numa-node preference. The memory usually comes from where the map-creation-process is running. The performance is not ideal if the bpf_prog is known to always run in a numa node different from the map-creation-process. One of the use case is sharding on CPU to different LRU maps (i.e. an array of LRU maps). Here is the test result of map_perf_test on the INNER_LRU_HASH_PREALLOC test if we force the lru map used by CPU0 to be allocated from a remote numa node: [ The machine has 20 cores. CPU0-9 at node 0. CPU10-19 at node 1 ] ># taskset -c 10 ./map_perf_test 512 8 1260000 8000000 5:inner_lru_hash_map_perf pre-alloc 1628380 events per sec 4:inner_lru_hash_map_perf pre-alloc 1626396 events per sec 3:inner_lru_hash_map_perf pre-alloc 1626144 events per sec 6:inner_lru_hash_map_perf pre-alloc 1621657 events per sec 2:inner_lru_hash_map_perf pre-alloc 1621534 events per sec 1:inner_lru_hash_map_perf pre-alloc 1620292 events per sec 7:inner_lru_hash_map_perf pre-alloc 1613305 events per sec 0:inner_lru_hash_map_perf pre-alloc 1239150 events per sec #<<< After specifying numa node: ># taskset -c 10 ./map_perf_test 512 8 1260000 8000000 5:inner_lru_hash_map_perf pre-alloc 1629627 events per sec 3:inner_lru_hash_map_perf pre-alloc 1628057 events per sec 1:inner_lru_hash_map_perf pre-alloc 1623054 events per sec 6:inner_lru_hash_map_perf pre-alloc 1616033 events per sec 2:inner_lru_hash_map_perf pre-alloc 1614630 events per sec 4:inner_lru_hash_map_perf pre-alloc 1612651 events per sec 7:inner_lru_hash_map_perf pre-alloc 1609337 events per sec 0:inner_lru_hash_map_perf pre-alloc 1619340 events per sec #<<< This patch adds one field, numa_node, to the bpf_attr. Since numa node 0 is a valid node, a new flag BPF_F_NUMA_NODE is also added. The numa_node field is honored if and only if the BPF_F_NUMA_NODE flag is set. Numa node selection is not supported for percpu map. This patch does not change all the kmalloc. F.e. 'htab = kzalloc()' is not changed since the object is small enough to stay in the cache. Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 10 +++++++++- include/uapi/linux/bpf.h | 10 +++++++++- kernel/bpf/arraymap.c | 7 +++++-- kernel/bpf/devmap.c | 9 ++++++--- kernel/bpf/hashtab.c | 19 +++++++++++++++---- kernel/bpf/lpm_trie.c | 9 +++++++-- kernel/bpf/sockmap.c | 10 +++++++--- kernel/bpf/stackmap.c | 8 +++++--- kernel/bpf/syscall.c | 14 ++++++++++---- 9 files changed, 73 insertions(+), 23 deletions(-) (limited to 'kernel/bpf/devmap.c') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1cc6c5ff61ec..55b88e329804 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -51,6 +51,7 @@ struct bpf_map { u32 map_flags; u32 pages; u32 id; + int numa_node; struct user_struct *user; const struct bpf_map_ops *ops; struct work_struct work; @@ -264,7 +265,7 @@ struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); int bpf_map_precharge_memlock(u32 pages); -void *bpf_map_area_alloc(size_t size); +void *bpf_map_area_alloc(size_t size, int numa_node); void bpf_map_area_free(void *base); extern int sysctl_unprivileged_bpf_disabled; @@ -316,6 +317,13 @@ struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); void __dev_map_insert_ctx(struct bpf_map *map, u32 index); void __dev_map_flush(struct bpf_map *map); +/* Return map's numa specified by userspace */ +static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) +{ + return (attr->map_flags & BPF_F_NUMA_NODE) ? + attr->numa_node : NUMA_NO_NODE; +} + #else static inline struct bpf_prog *bpf_prog_get(u32 ufd) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5ecbe812a2cc..843818dff96d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -165,6 +165,7 @@ enum bpf_attach_type { #define BPF_NOEXIST 1 /* create new element if it didn't exist */ #define BPF_EXIST 2 /* update existing element */ +/* flags for BPF_MAP_CREATE command */ #define BPF_F_NO_PREALLOC (1U << 0) /* Instead of having one common LRU list in the * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list @@ -173,6 +174,8 @@ enum bpf_attach_type { * across different LRU lists. */ #define BPF_F_NO_COMMON_LRU (1U << 1) +/* Specify numa node during map creation */ +#define BPF_F_NUMA_NODE (1U << 2) union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ @@ -180,8 +183,13 @@ union bpf_attr { __u32 key_size; /* size of key in bytes */ __u32 value_size; /* size of value in bytes */ __u32 max_entries; /* max number of entries in a map */ - __u32 map_flags; /* prealloc or not */ + __u32 map_flags; /* BPF_MAP_CREATE related + * flags defined above. + */ __u32 inner_map_fd; /* fd pointing to the inner map */ + __u32 numa_node; /* numa node (effective only if + * BPF_F_NUMA_NODE is set). 
+					 */
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index d771a3872500..96e9c5c1dfc9 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -49,13 +49,15 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
 static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 {
 	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+	int numa_node = bpf_map_attr_numa_node(attr);
 	struct bpf_array *array;
 	u64 array_size;
 	u32 elem_size;
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size == 0 || attr->map_flags)
+	    attr->value_size == 0 || attr->map_flags & ~BPF_F_NUMA_NODE ||
+	    (percpu && numa_node != NUMA_NO_NODE))
 		return ERR_PTR(-EINVAL);
 
 	if (attr->value_size > KMALLOC_MAX_SIZE)
@@ -77,7 +79,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 		return ERR_PTR(-ENOMEM);
 
 	/* allocate all map elements and zero-initialize them */
-	array = bpf_map_area_alloc(array_size);
+	array = bpf_map_area_alloc(array_size, numa_node);
 	if (!array)
 		return ERR_PTR(-ENOMEM);
 
@@ -87,6 +89,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	array->map.value_size = attr->value_size;
 	array->map.max_entries = attr->max_entries;
 	array->map.map_flags = attr->map_flags;
+	array->map.numa_node = numa_node;
 	array->elem_size = elem_size;
 
 	if (!percpu)
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 18a72a8add43..67f4f00ce33a 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -80,7 +80,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size != 4 || attr->map_flags)
+	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
 		return ERR_PTR(-EINVAL);
 
 	dtab = kzalloc(sizeof(*dtab), GFP_USER);
@@ -93,6 +93,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	dtab->map.value_size = attr->value_size;
 	dtab->map.max_entries = attr->max_entries;
 	dtab->map.map_flags = attr->map_flags;
+	dtab->map.numa_node = bpf_map_attr_numa_node(attr);
 
 	err = -ENOMEM;
 
@@ -119,7 +120,8 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 		goto free_dtab;
 
 	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
-					      sizeof(struct bpf_dtab_netdev *));
+					      sizeof(struct bpf_dtab_netdev *),
+					      dtab->map.numa_node);
 	if (!dtab->netdev_map)
 		goto free_dtab;
 
@@ -344,7 +346,8 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (!ifindex) {
 		dev = NULL;
 	} else {
-		dev = kmalloc(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN);
+		dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
+				   map->numa_node);
 		if (!dev)
 			return -ENOMEM;
 
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 4fb463172aa8..47ae748c3a49 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -18,6 +18,9 @@
 #include "bpf_lru_list.h"
 #include "map_in_map.h"
 
+#define HTAB_CREATE_FLAG_MASK \
+	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE)
+
 struct bucket {
 	struct hlist_nulls_head head;
 	raw_spinlock_t lock;
@@ -138,7 +141,8 @@ static int prealloc_init(struct bpf_htab *htab)
 	if (!htab_is_percpu(htab) && !htab_is_lru(htab))
 		num_entries += num_possible_cpus();
 
-	htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries);
+	htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries,
+					 htab->map.numa_node);
 	if (!htab->elems)
 		return -ENOMEM;
 
@@ -233,6 +237,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	 */
 	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
 	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
+	int numa_node = bpf_map_attr_numa_node(attr);
 	struct bpf_htab *htab;
 	int err, i;
 	u64 cost;
@@ -248,7 +253,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		 */
 		return ERR_PTR(-EPERM);
 
-	if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU))
+	if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK)
 		/* reserved bits should not be used */
 		return ERR_PTR(-EINVAL);
 
@@ -258,6 +263,9 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	if (lru && !prealloc)
 		return ERR_PTR(-ENOTSUPP);
 
+	if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
+		return ERR_PTR(-EINVAL);
+
 	htab = kzalloc(sizeof(*htab), GFP_USER);
 	if (!htab)
 		return ERR_PTR(-ENOMEM);
@@ -268,6 +276,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	htab->map.value_size = attr->value_size;
 	htab->map.max_entries = attr->max_entries;
 	htab->map.map_flags = attr->map_flags;
+	htab->map.numa_node = numa_node;
 
 	/* check sanity of attributes.
 	 * value_size == 0 may be allowed in the future to use map as a set
@@ -346,7 +355,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 
 	err = -ENOMEM;
 	htab->buckets = bpf_map_area_alloc(htab->n_buckets *
-					   sizeof(struct bucket));
+					   sizeof(struct bucket),
+					   htab->map.numa_node);
 	if (!htab->buckets)
 		goto free_htab;
 
@@ -689,7 +699,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 			atomic_dec(&htab->count);
 			return ERR_PTR(-E2BIG);
 		}
-		l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+		l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
+				     htab->map.numa_node);
 		if (!l_new)
 			return ERR_PTR(-ENOMEM);
 	}
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index b09185f0f17d..1b767844a76f 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -244,7 +244,8 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
 	if (value)
 		size += trie->map.value_size;
 
-	node = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
+	node = kmalloc_node(size, GFP_ATOMIC | __GFP_NOWARN,
+			    trie->map.numa_node);
 	if (!node)
 		return NULL;
 
@@ -405,6 +406,8 @@ static int trie_delete_elem(struct bpf_map *map, void *key)
 #define LPM_KEY_SIZE_MAX	LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
 #define LPM_KEY_SIZE_MIN	LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
 
+#define LPM_CREATE_FLAG_MASK	(BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE)
+
 static struct bpf_map *trie_alloc(union bpf_attr *attr)
 {
 	struct lpm_trie *trie;
@@ -416,7 +419,8 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 ||
-	    attr->map_flags != BPF_F_NO_PREALLOC ||
+	    !(attr->map_flags & BPF_F_NO_PREALLOC) ||
+	    attr->map_flags & ~LPM_CREATE_FLAG_MASK ||
 	    attr->key_size < LPM_KEY_SIZE_MIN ||
 	    attr->key_size > LPM_KEY_SIZE_MAX ||
 	    attr->value_size < LPM_VAL_SIZE_MIN ||
@@ -433,6 +437,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
 	trie->map.value_size = attr->value_size;
 	trie->map.max_entries = attr->max_entries;
 	trie->map.map_flags = attr->map_flags;
+	trie->map.numa_node = bpf_map_attr_numa_node(attr);
 	trie->data_size = attr->key_size -
 			  offsetof(struct bpf_lpm_trie_key, data);
 	trie->max_prefixlen = trie->data_size * 8;
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 39de541fbcdc..78b2bb9370ac 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -443,7 +443,9 @@ static struct smap_psock *smap_init_psock(struct sock *sock,
 {
 	struct smap_psock *psock;
 
-	psock = kzalloc(sizeof(struct smap_psock), GFP_ATOMIC | __GFP_NOWARN);
+	psock = kzalloc_node(sizeof(struct smap_psock),
+			     GFP_ATOMIC | __GFP_NOWARN,
+			     stab->map.numa_node);
 	if (!psock)
 		return ERR_PTR(-ENOMEM);
 
@@ -465,7 +467,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size != 4 || attr->map_flags)
+	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
 		return ERR_PTR(-EINVAL);
 
 	if (attr->value_size > KMALLOC_MAX_SIZE)
@@ -481,6 +483,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 	stab->map.value_size = attr->value_size;
 	stab->map.max_entries = attr->max_entries;
 	stab->map.map_flags = attr->map_flags;
+	stab->map.numa_node = bpf_map_attr_numa_node(attr);
 
 	/* make sure page count doesn't overflow */
 	cost = (u64) stab->map.max_entries * sizeof(struct sock *);
@@ -495,7 +498,8 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 		goto free_stab;
 
 	stab->sock_map = bpf_map_area_alloc(stab->map.max_entries *
-					    sizeof(struct sock *));
+					    sizeof(struct sock *),
+					    stab->map.numa_node);
 	if (!stab->sock_map)
 		goto free_stab;
 
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 31147d730abf..135be433e9a0 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -31,7 +31,8 @@ static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
 	u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
 	int err;
 
-	smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries);
+	smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
+					 smap->map.numa_node);
 	if (!smap->elems)
 		return -ENOMEM;
 
@@ -59,7 +60,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	if (!capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 
-	if (attr->map_flags)
+	if (attr->map_flags & ~BPF_F_NUMA_NODE)
 		return ERR_PTR(-EINVAL);
 
 	/* check sanity of attributes */
@@ -75,7 +76,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	if (cost >= U32_MAX - PAGE_SIZE)
 		return ERR_PTR(-E2BIG);
 
-	smap = bpf_map_area_alloc(cost);
+	smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
 	if (!smap)
 		return ERR_PTR(-ENOMEM);
 
@@ -91,6 +92,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	smap->map.map_flags = attr->map_flags;
 	smap->n_buckets = n_buckets;
 	smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+	smap->map.numa_node = bpf_map_attr_numa_node(attr);
 
 	err = bpf_map_precharge_memlock(smap->map.pages);
 	if (err)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b8cb1b3c9bfb..9378f3ba2cbf 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -105,7 +105,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
 	return map;
 }
 
-void *bpf_map_area_alloc(size_t size)
+void *bpf_map_area_alloc(size_t size, int numa_node)
 {
 	/* We definitely need __GFP_NORETRY, so OOM killer doesn't
 	 * trigger under memory pressure as we really just want to
@@ -115,12 +115,13 @@ void *bpf_map_area_alloc(size_t size)
 	void *area;
 
 	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
-		area = kmalloc(size, GFP_USER | flags);
+		area = kmalloc_node(size, GFP_USER | flags, numa_node);
 		if (area != NULL)
 			return area;
 	}
 
-	return __vmalloc(size, GFP_KERNEL | flags, PAGE_KERNEL);
+	return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags,
+					   __builtin_return_address(0));
 }
 
 void bpf_map_area_free(void *area)
@@ -309,10 +310,11 @@ int bpf_map_new_fd(struct bpf_map *map)
 		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
 		   sizeof(attr->CMD##_LAST_FIELD)) != NULL
 
-#define BPF_MAP_CREATE_LAST_FIELD inner_map_fd
+#define BPF_MAP_CREATE_LAST_FIELD numa_node
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
+	int numa_node = bpf_map_attr_numa_node(attr);
 	struct bpf_map *map;
 	int err;
 
@@ -320,6 +322,10 @@ static int map_create(union bpf_attr *attr)
 	if (err)
 		return -EINVAL;
 
+	if (numa_node != NUMA_NO_NODE &&
+	    (numa_node >= nr_node_ids || !node_online(numa_node)))
+		return -EINVAL;
+
 	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
 	map = find_and_alloc_map(attr);
 	if (IS_ERR(map))
-- cgit v1.3-6-gb490
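The new attribute is consumed from user space through the BPF_MAP_CREATE command of the bpf(2) syscall. A minimal, hypothetical user-space sketch (assuming a uapi header with this series applied; error handling omitted) might look like this:

#include <unistd.h>
#include <string.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* Create a hash map whose backing memory is allocated on the given NUMA
 * node.  Illustrative only; requires a kernel with BPF_F_NUMA_NODE support.
 */
static int create_map_on_node(int node)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_HASH;
	attr.key_size    = 4;
	attr.value_size  = 8;
	attr.max_entries = 1024;
	attr.map_flags   = BPF_F_NUMA_NODE;	/* numa_node is honoured only with this flag */
	attr.numa_node   = node;		/* must be online, otherwise -EINVAL */

	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}

As the syscall.c hunk above shows, map_create() rejects nodes that are out of range or offline, and per-cpu map types refuse a node selection altogether.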
From 274043c6c95636e62f5b2514e78fdba82eb47601 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Mon, 21 Aug 2017 01:48:12 +0200
Subject: bpf: fix double free from dev_map_notification()

In the current code, dev_map_free() can still race with
dev_map_notification(). In dev_map_free(), we remove the dtab from the
list of dtabs after we purged all entries from it. However, we don't do
an xchg() with NULL or the like, so the entries at that point still
point to their devices. If an unregister notification comes in at the
same time, we therefore risk a double free, since the pointer is still
present in the map and is then pushed again to __dev_map_entry_free().

All this is completely unnecessary. Just remove the dtab from the list
right before the synchronize_rcu(), so all outstanding readers from the
notifier list have finished by then; thus we don't need to deal with
this corner case anymore and also wouldn't need to nullify dev entries.
This is fine because we iterate over the map releasing all entries and
therefore dev references anyway.

Fixes: 4cc7b9544b9a ("bpf: devmap fix mutex in rcu critical section")
Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 kernel/bpf/devmap.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'kernel/bpf/devmap.c')

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 67f4f00ce33a..fa08181d1c3d 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -148,6 +148,11 @@ static void dev_map_free(struct bpf_map *map)
 	 * no further reads against netdev_map. It does __not__ ensure pending
 	 * flush operations (if any) are complete.
 	 */
+
+	spin_lock(&dev_map_lock);
+	list_del_rcu(&dtab->list);
+	spin_unlock(&dev_map_lock);
+
 	synchronize_rcu();
 
 	/* To ensure all pending flush operations have completed wait for flush
@@ -162,10 +167,6 @@ static void dev_map_free(struct bpf_map *map)
 			cpu_relax();
 	}
 
-	/* Although we should no longer have datapath or bpf syscall operations
-	 * at this point we we can still race with netdev notifier, hence the
-	 * lock.
-	 */
 	for (i = 0; i < dtab->map.max_entries; i++) {
 		struct bpf_dtab_netdev *dev;
 
@@ -180,9 +181,6 @@ static void dev_map_free(struct bpf_map *map)
 	/* At this point bpf program is detached and all pending operations
 	 * _must_ be complete
 	 */
-	spin_lock(&dev_map_lock);
-	list_del_rcu(&dtab->list);
-	spin_unlock(&dev_map_lock);
 	free_percpu(dtab->flush_needed);
 	bpf_map_area_free(dtab->netdev_map);
 	kfree(dtab);
-- cgit v1.3-6-gb490
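The ordering the fix relies on is the standard RCU teardown discipline: unpublish the object first, wait for a grace period, then free. A generic sketch of that pattern, with hypothetical names rather than the devmap code itself:

/* Illustrative only: unpublish/wait/free ordering under RCU. */
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct foo {
	struct list_head list;
};

static DEFINE_SPINLOCK(foo_lock);
static LIST_HEAD(foo_list);

static void foo_destroy(struct foo *f)
{
	/* 1) Unpublish: RCU readers walking foo_list (e.g. a notifier)
	 *    can no longer find f once this returns.
	 */
	spin_lock(&foo_lock);
	list_del_rcu(&f->list);
	spin_unlock(&foo_lock);

	/* 2) Wait for every reader that might still hold a pointer
	 *    obtained before the unpublish.
	 */
	synchronize_rcu();

	/* 3) Nothing can reach f any more, so it is freed exactly once. */
	kfree(f);
}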
From af4d045ceeca04946d89453206269aea6c338a8e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Wed, 23 Aug 2017 01:47:54 +0200
Subject: bpf: minor cleanups for dev_map

Some minor code cleanups; while going over the code I also noticed that
we currently account the flush bitmap for only one CPU, so fix that up
as well.

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 kernel/bpf/devmap.c | 100 +++++++++++++++++++++-------------------------------
 1 file changed, 41 insertions(+), 59 deletions(-)

(limited to 'kernel/bpf/devmap.c')

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index fa08181d1c3d..bfecabfd4974 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -48,30 +48,30 @@
  * calls will fail at this point.
  */
 #include
-#include
 #include
-#include
-#include "percpu_freelist.h"
-#include "bpf_lru_list.h"
-#include "map_in_map.h"
 
 struct bpf_dtab_netdev {
 	struct net_device *dev;
-	int key;
-	struct rcu_head rcu;
 	struct bpf_dtab *dtab;
+	unsigned int bit;
+	struct rcu_head rcu;
 };
 
 struct bpf_dtab {
 	struct bpf_map map;
 	struct bpf_dtab_netdev **netdev_map;
-	unsigned long int __percpu *flush_needed;
+	unsigned long __percpu *flush_needed;
 	struct list_head list;
 };
 
 static DEFINE_SPINLOCK(dev_map_lock);
 static LIST_HEAD(dev_map_list);
 
+static u64 dev_map_bitmap_size(const union bpf_attr *attr)
+{
+	return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
+}
+
 static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 {
 	struct bpf_dtab *dtab;
@@ -95,11 +95,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	dtab->map.map_flags = attr->map_flags;
 	dtab->map.numa_node = bpf_map_attr_numa_node(attr);
 
-	err = -ENOMEM;
-
 	/* make sure page count doesn't overflow */
 	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
-	cost += BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
+	cost += dev_map_bitmap_size(attr) * num_possible_cpus();
 	if (cost >= U32_MAX - PAGE_SIZE)
 		goto free_dtab;
 
@@ -110,12 +108,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	if (err)
 		goto free_dtab;
 
-	err = -ENOMEM;
 	/* A per cpu bitfield with a bit per possible net device */
-	dtab->flush_needed = __alloc_percpu(
-				BITS_TO_LONGS(attr->max_entries) *
-				sizeof(unsigned long),
-				__alignof__(unsigned long));
+	dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr),
+					    __alignof__(unsigned long));
 	if (!dtab->flush_needed)
 		goto free_dtab;
 
@@ -128,12 +123,12 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	spin_lock(&dev_map_lock);
 	list_add_tail_rcu(&dtab->list, &dev_map_list);
 	spin_unlock(&dev_map_lock);
-	return &dtab->map;
 
+	return &dtab->map;
 free_dtab:
 	free_percpu(dtab->flush_needed);
 	kfree(dtab);
-	return ERR_PTR(err);
+	return ERR_PTR(-ENOMEM);
 }
 
 static void dev_map_free(struct bpf_map *map)
@@ -178,9 +173,6 @@ static void dev_map_free(struct bpf_map *map)
 		kfree(dev);
 	}
 
-	/* At this point bpf program is detached and all pending operations
-	 * _must_ be complete
-	 */
 	free_percpu(dtab->flush_needed);
 	bpf_map_area_free(dtab->netdev_map);
 	kfree(dtab);
@@ -190,7 +182,7 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	u32 index = key ? *(u32 *)key : U32_MAX;
-	u32 *next = (u32 *)next_key;
+	u32 *next = next_key;
 
 	if (index >= dtab->map.max_entries) {
 		*next = 0;
@@ -199,29 +191,16 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 
 	if (index == dtab->map.max_entries - 1)
 		return -ENOENT;
-
 	*next = index + 1;
 	return 0;
 }
 
-void __dev_map_insert_ctx(struct bpf_map *map, u32 key)
+void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
 
-	__set_bit(key, bitmap);
-}
-
-struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
-{
-	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-	struct bpf_dtab_netdev *dev;
-
-	if (key >= map->max_entries)
-		return NULL;
-
-	dev = READ_ONCE(dtab->netdev_map[key]);
-	return dev ? dev->dev : NULL;
+	__set_bit(bit, bitmap);
 }
 
 /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
@@ -248,7 +227,6 @@ void __dev_map_flush(struct bpf_map *map)
 			continue;
 
 		netdev = dev->dev;
-
 		__clear_bit(bit, bitmap);
 		if (unlikely(!netdev || !netdev->netdev_ops->ndo_xdp_flush))
 			continue;
 
 		netdev->netdev_ops->ndo_xdp_flush(netdev);
@@ -261,43 +239,49 @@ void __dev_map_flush(struct bpf_map *map)
  * update happens in parallel here a dev_put wont happen until after reading the
  * ifindex.
  */
-static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
+struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct bpf_dtab_netdev *dev;
-	u32 i = *(u32 *)key;
 
-	if (i >= map->max_entries)
+	if (key >= map->max_entries)
 		return NULL;
 
-	dev = READ_ONCE(dtab->netdev_map[i]);
-	return dev ? &dev->dev->ifindex : NULL;
+	dev = READ_ONCE(dtab->netdev_map[key]);
+	return dev ? dev->dev : NULL;
 }
 
-static void dev_map_flush_old(struct bpf_dtab_netdev *old_dev)
+static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key);
+
+	return dev ? &dev->ifindex : NULL;
+}
+
+static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
 {
-	if (old_dev->dev->netdev_ops->ndo_xdp_flush) {
-		struct net_device *fl = old_dev->dev;
+	if (dev->dev->netdev_ops->ndo_xdp_flush) {
+		struct net_device *fl = dev->dev;
 		unsigned long *bitmap;
 		int cpu;
 
 		for_each_online_cpu(cpu) {
-			bitmap = per_cpu_ptr(old_dev->dtab->flush_needed, cpu);
-			__clear_bit(old_dev->key, bitmap);
+			bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
+			__clear_bit(dev->bit, bitmap);
 
-			fl->netdev_ops->ndo_xdp_flush(old_dev->dev);
+			fl->netdev_ops->ndo_xdp_flush(dev->dev);
 		}
 	}
 }
 
 static void __dev_map_entry_free(struct rcu_head *rcu)
 {
-	struct bpf_dtab_netdev *old_dev;
+	struct bpf_dtab_netdev *dev;
 
-	old_dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
-	dev_map_flush_old(old_dev);
-	dev_put(old_dev->dev);
-	kfree(old_dev);
+	dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
+	dev_map_flush_old(dev);
+	dev_put(dev->dev);
+	kfree(dev);
 }
 
 static int dev_map_delete_elem(struct bpf_map *map, void *key)
@@ -309,8 +293,8 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
 	if (k >= map->max_entries)
 		return -EINVAL;
 
-	/* Use synchronize_rcu() here to ensure any rcu critical sections
-	 * have completed, but this does not guarantee a flush has happened
+	/* Use call_rcu() here to ensure any rcu critical sections have
+	 * completed, but this does not guarantee a flush has happened
 	 * yet. Because driver side rcu_read_lock/unlock only protects the
 	 * running XDP program. However, for pending flush operations the
 	 * dev and ctx are stored in another per cpu map. And additionally,
@@ -334,10 +318,8 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 
 	if (unlikely(map_flags > BPF_EXIST))
 		return -EINVAL;
-
 	if (unlikely(i >= dtab->map.max_entries))
 		return -E2BIG;
-
 	if (unlikely(map_flags == BPF_NOEXIST))
 		return -EEXIST;
 
@@ -355,7 +337,7 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 			return -EINVAL;
 	}
 
-	dev->key = i;
+	dev->bit = i;
 	dev->dtab = dtab;
 
 	}
-- cgit v1.3-6-gb490
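The accounting part of the cleanup is easiest to see as arithmetic: one flush bitmap exists per possible CPU, so the memlock cost has to be multiplied accordingly. A rough, self-contained illustration with hypothetical helpers, not the kernel code itself:

#include <stdint.h>

#define MY_BITS_PER_LONG	(8 * sizeof(long))
#define MY_BITS_TO_LONGS(n)	(((n) + MY_BITS_PER_LONG - 1) / MY_BITS_PER_LONG)

/* Bytes needed for one CPU's flush bitmap covering max_entries devices. */
static uint64_t bitmap_bytes(uint32_t max_entries)
{
	return MY_BITS_TO_LONGS(max_entries) * sizeof(unsigned long);
}

/* Total charged cost: the netdev_map pointer array plus one bitmap per
 * possible CPU, which is what the cleanup patch now accounts for.
 */
static uint64_t devmap_cost(uint32_t max_entries, unsigned int nr_possible_cpus)
{
	return (uint64_t)max_entries * sizeof(void *) +
	       bitmap_bytes(max_entries) * nr_possible_cpus;
}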
From a5e2da6e9787187ff104c34aa048419703c1f9cb Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Thu, 24 Aug 2017 03:20:11 +0200
Subject: bpf: netdev is never null in __dev_map_flush

No need to test for it in the fast path: every dev in bpf_dtab_netdev
is guaranteed to be non-NULL, otherwise dev_map_update_elem() would have
failed in the first place.

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Acked-by: John Fastabend
Signed-off-by: David S. Miller
---
 kernel/bpf/devmap.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'kernel/bpf/devmap.c')

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index bfecabfd4974..ecf9f99ecc57 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -226,12 +226,10 @@ void __dev_map_flush(struct bpf_map *map)
 		if (unlikely(!dev))
 			continue;
 
-		netdev = dev->dev;
 		__clear_bit(bit, bitmap);
-		if (unlikely(!netdev || !netdev->netdev_ops->ndo_xdp_flush))
-			continue;
-
-		netdev->netdev_ops->ndo_xdp_flush(netdev);
+
+		netdev = dev->dev;
+		if (likely(netdev->netdev_ops->ndo_xdp_flush))
+			netdev->netdev_ops->ndo_xdp_flush(netdev);
 	}
 }
-- cgit v1.3-6-gb490

From 374fb014fc5b15e420faa00af036868a635eadd3 Mon Sep 17 00:00:00 2001
From: John Fastabend
Date: Fri, 8 Sep 2017 14:01:10 -0700
Subject: bpf: devmap, use cond_resched instead of cpu_relax

Be a bit more friendly about waiting for flush bits to complete.
Replace the cpu_relax() with a cond_resched().

Suggested-by: Daniel Borkmann
Acked-by: Daniel Borkmann
Signed-off-by: John Fastabend
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 kernel/bpf/devmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/bpf/devmap.c')

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index ecf9f99ecc57..959c9a07f318 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -159,7 +159,7 @@ static void dev_map_free(struct bpf_map *map)
 		unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu);
 
 		while (!bitmap_empty(bitmap, dtab->map.max_entries))
-			cpu_relax();
+			cond_resched();
 	}
 
 	for (i = 0; i < dtab->map.max_entries; i++) {
-- cgit v1.3-6-gb490
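The practical difference between the two primitives is busy-waiting versus yielding: cpu_relax() only hints the CPU inside a spin loop, while cond_resched() lets the scheduler run other work during a potentially long wait. A generic sketch of the waiting pattern, with illustrative names rather than the devmap code:

/* Illustrative only: waiting on a flag from process context where
 * sleeping is allowed.
 */
#include <linux/compiler.h>
#include <linux/sched.h>

static void wait_for_done(bool *done)
{
	while (!READ_ONCE(*done))
		cond_resched();		/* previously: cpu_relax() */
}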
From 582db7e0c4c2fc5bb4f932f268035883385e3692 Mon Sep 17 00:00:00 2001
From: Tobias Klauser
Date: Mon, 18 Sep 2017 15:03:46 +0200
Subject: bpf: devmap: pass on return value of bpf_map_precharge_memlock

If bpf_map_precharge_memlock() fails in dev_map_alloc(), -ENOMEM is
returned regardless of the actual error produced by
bpf_map_precharge_memlock(). Fix it by passing on the error returned by
bpf_map_precharge_memlock(). Also return -EINVAL instead of -ENOMEM if
the page count overflow check fails.

This makes dev_map_alloc() match the behavior of the other bpf maps'
alloc functions wrt. return values.

Signed-off-by: Tobias Klauser
Acked-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 kernel/bpf/devmap.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel/bpf/devmap.c')

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 959c9a07f318..e093d9a2c4dd 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -75,8 +75,8 @@ static u64 dev_map_bitmap_size(const union bpf_attr *attr)
 static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 {
 	struct bpf_dtab *dtab;
+	int err = -EINVAL;
 	u64 cost;
-	int err;
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
@@ -108,6 +108,8 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	if (err)
 		goto free_dtab;
 
+	err = -ENOMEM;
+
 	/* A per cpu bitfield with a bit per possible net device */
 	dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr),
 					    __alignof__(unsigned long));
@@ -128,7 +130,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 free_dtab:
 	free_percpu(dtab->flush_needed);
 	kfree(dtab);
-	return ERR_PTR(-ENOMEM);
+	return ERR_PTR(err);
 }
 
 static void dev_map_free(struct bpf_map *map)
-- cgit v1.3-6-gb490
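The idiom the patch restores is the usual kernel error-propagation pattern: keep the most recent failure reason in err and only pre-set it ahead of steps that do not return their own code. A condensed, hypothetical sketch (precharge() stands in for a helper like bpf_map_precharge_memlock(); not the actual function):

/* Illustrative only: propagate the callee's error instead of collapsing
 * every failure to -ENOMEM.
 */
#include <linux/err.h>
#include <linux/slab.h>

struct thing {
	unsigned int entries;
};

static struct thing *alloc_thing(unsigned int entries,
				 int (*precharge)(unsigned int))
{
	struct thing *t;
	int err = -EINVAL;		/* covers our own sanity checks */

	if (entries == 0)
		return ERR_PTR(err);

	err = precharge(entries);	/* may fail with e.g. -EPERM */
	if (err)
		return ERR_PTR(err);	/* caller sees the real reason */

	err = -ENOMEM;			/* from here on, only allocations fail */
	t = kzalloc(sizeof(*t), GFP_USER);
	if (!t)
		return ERR_PTR(err);

	t->entries = entries;
	return t;
}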