diff options
Diffstat (limited to 'kernel/bpf/devmap.c')
-rw-r--r-- | kernel/bpf/devmap.c | 597 |
1 files changed, 473 insertions, 124 deletions
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 58bdca5d978a..f9a87dcc5535 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -48,16 +48,17 @@ #include <net/xdp.h> #include <linux/filter.h> #include <trace/events/xdp.h> +#include <linux/btf_ids.h> #define DEV_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) -#define DEV_MAP_BULK_SIZE 16 struct xdp_dev_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; struct list_head flush_node; struct net_device *dev; struct net_device *dev_rx; + struct bpf_prog *xdp_prog; unsigned int count; }; @@ -65,13 +66,15 @@ struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; struct bpf_dtab *dtab; + struct bpf_prog *xdp_prog; struct rcu_head rcu; unsigned int idx; + struct bpf_devmap_val val; }; struct bpf_dtab { struct bpf_map map; - struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */ + struct bpf_dtab_netdev __rcu **netdev_map; /* DEVMAP type only */ struct list_head list; /* these are only used for DEVMAP_HASH type maps */ @@ -85,12 +88,13 @@ static DEFINE_PER_CPU(struct list_head, dev_flush_list); static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); -static struct hlist_head *dev_map_create_hash(unsigned int entries) +static struct hlist_head *dev_map_create_hash(unsigned int entries, + int numa_node) { int i; struct hlist_head *hash; - hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL); + hash = bpf_map_area_alloc((u64) entries * sizeof(*hash), numa_node); if (hash != NULL) for (i = 0; i < entries; i++) INIT_HLIST_HEAD(&hash[i]); @@ -106,12 +110,16 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { - u64 cost = 0; - int err; + u32 valsize = attr->value_size; - /* check sanity of attributes */ + /* check sanity of attributes. 2 value sizes supported: + * 4 bytes: ifindex + * 8 bytes: ifindex + prog fd + */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) + (valsize != offsetofend(struct bpf_devmap_val, ifindex) && + valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) || + attr->map_flags & ~DEV_CREATE_FLAG_MASK) return -EINVAL; /* Lookup returns a pointer straight to dev->ifindex, so make sure the @@ -127,35 +135,24 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) if (!dtab->n_buckets) /* Overflow check */ return -EINVAL; - cost += (u64) sizeof(struct hlist_head) * dtab->n_buckets; - } else { - cost += (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); } - /* if map size is larger than memlock limit, reject it */ - err = bpf_map_charge_init(&dtab->map.memory, cost); - if (err) - return -EINVAL; - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { - dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets); + dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets, + dtab->map.numa_node); if (!dtab->dev_index_head) - goto free_charge; + return -ENOMEM; spin_lock_init(&dtab->index_lock); } else { - dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * + dtab->netdev_map = bpf_map_area_alloc((u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) - goto free_charge; + return -ENOMEM; } return 0; - -free_charge: - bpf_map_charge_finish(&dtab->map.memory); - return -ENOMEM; } static struct bpf_map *dev_map_alloc(union bpf_attr *attr) @@ -166,13 +163,13 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); - dtab = kzalloc(sizeof(*dtab), GFP_USER); + dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE); if (!dtab) return ERR_PTR(-ENOMEM); err = dev_map_init_map(dtab, attr); if (err) { - kfree(dtab); + bpf_map_area_free(dtab); return ERR_PTR(err); } @@ -218,20 +215,24 @@ static void dev_map_free(struct bpf_map *map) hlist_for_each_entry_safe(dev, next, head, index_hlist) { hlist_del_rcu(&dev->index_hlist); + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } } - kfree(dtab->dev_index_head); + bpf_map_area_free(dtab->dev_index_head); } else { for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev; - dev = dtab->netdev_map[i]; + dev = rcu_dereference_raw(dtab->netdev_map[i]); if (!dev) continue; + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -239,7 +240,7 @@ static void dev_map_free(struct bpf_map *map) bpf_map_area_free(dtab->netdev_map); } - kfree(dtab); + bpf_map_area_free(dtab); } static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) @@ -259,7 +260,11 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) +/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or + * by local_bh_disable() (from XDP calls inside NAPI). The + * rcu_read_lock_bh_held() below makes lockdep accept both. + */ +static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct hlist_head *head = dev_map_index_hash(dtab, key); @@ -318,72 +323,110 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, return -ENOENT; } -static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) +static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, + struct xdp_frame **frames, int n, + struct net_device *dev) +{ + struct xdp_txq_info txq = { .dev = dev }; + struct xdp_buff xdp; + int i, nframes = 0; + + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; + u32 act; + int err; + + xdp_convert_frame_to_buff(xdpf, &xdp); + xdp.txq = &txq; + + act = bpf_prog_run_xdp(xdp_prog, &xdp); + switch (act) { + case XDP_PASS: + err = xdp_update_frame_from_buff(&xdp, xdpf); + if (unlikely(err < 0)) + xdp_return_frame_rx_napi(xdpf); + else + frames[nframes++] = xdpf; + break; + default: + bpf_warn_invalid_xdp_action(NULL, xdp_prog, act); + fallthrough; + case XDP_ABORTED: + trace_xdp_exception(dev, xdp_prog, act); + fallthrough; + case XDP_DROP: + xdp_return_frame_rx_napi(xdpf); + break; + } + } + return nframes; /* sent frames count */ +} + +static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { struct net_device *dev = bq->dev; - int sent = 0, drops = 0, err = 0; + unsigned int cnt = bq->count; + int sent = 0, err = 0; + int to_send = cnt; int i; - if (unlikely(!bq->count)) - return 0; + if (unlikely(!cnt)) + return; - for (i = 0; i < bq->count; i++) { + for (i = 0; i < cnt; i++) { struct xdp_frame *xdpf = bq->q[i]; prefetch(xdpf); } - sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags); + if (bq->xdp_prog) { + to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev); + if (!to_send) + goto out; + } + + sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags); if (sent < 0) { + /* If ndo_xdp_xmit fails with an errno, no frames have + * been xmit'ed. + */ err = sent; sent = 0; - goto error; } - drops = bq->count - sent; -out: - bq->count = 0; - trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err); - bq->dev_rx = NULL; - __list_del_clearprev(&bq->flush_node); - return 0; -error: - /* If ndo_xdp_xmit fails with an errno, no frames have been - * xmit'ed and it's our responsibility to them free all. + /* If not all frames have been transmitted, it is our + * responsibility to free them */ - for (i = 0; i < bq->count; i++) { - struct xdp_frame *xdpf = bq->q[i]; + for (i = sent; unlikely(i < to_send); i++) + xdp_return_frame_rx_napi(bq->q[i]); - xdp_return_frame_rx_napi(xdpf); - drops++; - } - goto out; -} - -/* __dev_flush is called from xdp_do_flush() which _must_ be signaled - * from the driver before returning from its napi->poll() routine. The poll() - * routine is called either from busy_poll context or net_rx_action signaled - * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the - * net device can be torn down. On devmap tear down we ensure the flush list - * is empty before completing to ensure all flush operations have completed. - * When drivers update the bpf program they may need to ensure any flush ops - * are also complete. Using synchronize_rcu or call_rcu will suffice for this - * because both wait for napi context to exit. +out: + bq->count = 0; + trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err); +} + +/* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the + * driver before returning from its napi->poll() routine. See the comment above + * xdp_do_flush() in filter.c. */ void __dev_flush(void) { struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); struct xdp_dev_bulk_queue *bq, *tmp; - list_for_each_entry_safe(bq, tmp, flush_list, flush_node) + list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { bq_xmit_all(bq, XDP_XMIT_FLUSH); + bq->dev_rx = NULL; + bq->xdp_prog = NULL; + __list_del_clearprev(&bq->flush_node); + } } -/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or - * update happens in parallel here a dev_put wont happen until after reading the - * ifindex. +/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or + * by local_bh_disable() (from XDP calls inside NAPI). The + * rcu_read_lock_bh_held() below makes lockdep accept both. */ -struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) +static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *obj; @@ -391,15 +434,17 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) if (key >= map->max_entries) return NULL; - obj = READ_ONCE(dtab->netdev_map[key]); + obj = rcu_dereference_check(dtab->netdev_map[key], + rcu_read_lock_bh_held()); return obj; } -/* Runs under RCU-read-side, plus in softirq under NAPI protection. - * Thus, safe percpu variable access. +/* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu + * variable access, and map elements stick around. See comment above + * xdp_do_flush() in filter.c. */ -static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, - struct net_device *dev_rx) +static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, + struct net_device *dev_rx, struct bpf_prog *xdp_prog) { struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); @@ -410,50 +455,205 @@ static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed * from net_device drivers NAPI func end. + * + * Do the same with xdp_prog and flush_list since these fields + * are only ever modified together. */ - if (!bq->dev_rx) + if (!bq->dev_rx) { bq->dev_rx = dev_rx; - - bq->q[bq->count++] = xdpf; - - if (!bq->flush_node.prev) + bq->xdp_prog = xdp_prog; list_add(&bq->flush_node, flush_list); + } - return 0; + bq->q[bq->count++] = xdpf; } -static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, - struct net_device *dev_rx) +static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, + struct net_device *dev_rx, + struct bpf_prog *xdp_prog) { - struct xdp_frame *xdpf; int err; if (!dev->netdev_ops->ndo_xdp_xmit) return -EOPNOTSUPP; - err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); + err = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf)); if (unlikely(err)) return err; - xdpf = convert_to_xdp_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; + bq_enqueue(dev, xdpf, dev_rx, xdp_prog); + return 0; +} + +static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev *dst) +{ + struct xdp_txq_info txq = { .dev = dst->dev }; + struct xdp_buff xdp; + u32 act; + + if (!dst->xdp_prog) + return XDP_PASS; + + __skb_pull(skb, skb->mac_len); + xdp.txq = &txq; + + act = bpf_prog_run_generic_xdp(skb, &xdp, dst->xdp_prog); + switch (act) { + case XDP_PASS: + __skb_push(skb, skb->mac_len); + break; + default: + bpf_warn_invalid_xdp_action(NULL, dst->xdp_prog, act); + fallthrough; + case XDP_ABORTED: + trace_xdp_exception(dst->dev, dst->xdp_prog, act); + fallthrough; + case XDP_DROP: + kfree_skb(skb); + break; + } - return bq_enqueue(dev, xdpf, dev_rx); + return act; } -int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, +int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) { - return __xdp_enqueue(dev, xdp, dev_rx); + return __xdp_enqueue(dev, xdpf, dev_rx, NULL); } -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx) { struct net_device *dev = dst->dev; - return __xdp_enqueue(dev, xdp, dev_rx); + return __xdp_enqueue(dev, xdpf, dev_rx, dst->xdp_prog); +} + +static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf) +{ + if (!obj || + !obj->dev->netdev_ops->ndo_xdp_xmit) + return false; + + if (xdp_ok_fwd_dev(obj->dev, xdp_get_frame_len(xdpf))) + return false; + + return true; +} + +static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj, + struct net_device *dev_rx, + struct xdp_frame *xdpf) +{ + struct xdp_frame *nxdpf; + + nxdpf = xdpf_clone(xdpf); + if (!nxdpf) + return -ENOMEM; + + bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog); + + return 0; +} + +static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifindex) +{ + while (num_excluded--) { + if (ifindex == excluded[num_excluded]) + return true; + } + return false; +} + +/* Get ifindex of each upper device. 'indexes' must be able to hold at + * least MAX_NEST_DEV elements. + * Returns the number of ifindexes added. + */ +static int get_upper_ifindexes(struct net_device *dev, int *indexes) +{ + struct net_device *upper; + struct list_head *iter; + int n = 0; + + netdev_for_each_upper_dev_rcu(dev, upper, iter) { + indexes[n++] = upper->ifindex; + } + return n; +} + +int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, + struct bpf_map *map, bool exclude_ingress) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_dtab_netdev *dst, *last_dst = NULL; + int excluded_devices[1+MAX_NEST_DEV]; + struct hlist_head *head; + int num_excluded = 0; + unsigned int i; + int err; + + if (exclude_ingress) { + num_excluded = get_upper_ifindexes(dev_rx, excluded_devices); + excluded_devices[num_excluded++] = dev_rx->ifindex; + } + + if (map->map_type == BPF_MAP_TYPE_DEVMAP) { + for (i = 0; i < map->max_entries; i++) { + dst = rcu_dereference_check(dtab->netdev_map[i], + rcu_read_lock_bh_held()); + if (!is_valid_dst(dst, xdpf)) + continue; + + if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) + continue; + + /* we only need n-1 clones; last_dst enqueued below */ + if (!last_dst) { + last_dst = dst; + continue; + } + + err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf); + if (err) + return err; + + last_dst = dst; + } + } else { /* BPF_MAP_TYPE_DEVMAP_HASH */ + for (i = 0; i < dtab->n_buckets; i++) { + head = dev_map_index_hash(dtab, i); + hlist_for_each_entry_rcu(dst, head, index_hlist, + lockdep_is_held(&dtab->index_lock)) { + if (!is_valid_dst(dst, xdpf)) + continue; + + if (is_ifindex_excluded(excluded_devices, num_excluded, + dst->dev->ifindex)) + continue; + + /* we only need n-1 clones; last_dst enqueued below */ + if (!last_dst) { + last_dst = dst; + continue; + } + + err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf); + if (err) + return err; + + last_dst = dst; + } + } + } + + /* consume the last copy of the frame */ + if (last_dst) + bq_enqueue(last_dst->dev, xdpf, dev_rx, last_dst->xdp_prog); + else + xdp_return_frame_rx_napi(xdpf); /* dtab is empty */ + + return 0; } int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, @@ -464,27 +664,128 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, err = xdp_ok_fwd_dev(dst->dev, skb->len); if (unlikely(err)) return err; + + /* Redirect has already succeeded semantically at this point, so we just + * return 0 even if packet is dropped. Helper below takes care of + * freeing skb. + */ + if (dev_map_bpf_prog_run_skb(skb, dst) != XDP_PASS) + return 0; + skb->dev = dst->dev; generic_xdp_tx(skb, xdp_prog); return 0; } +static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst, + struct sk_buff *skb, + struct bpf_prog *xdp_prog) +{ + struct sk_buff *nskb; + int err; + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return -ENOMEM; + + err = dev_map_generic_redirect(dst, nskb, xdp_prog); + if (unlikely(err)) { + consume_skb(nskb); + return err; + } + + return 0; +} + +int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, + struct bpf_prog *xdp_prog, struct bpf_map *map, + bool exclude_ingress) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_dtab_netdev *dst, *last_dst = NULL; + int excluded_devices[1+MAX_NEST_DEV]; + struct hlist_head *head; + struct hlist_node *next; + int num_excluded = 0; + unsigned int i; + int err; + + if (exclude_ingress) { + num_excluded = get_upper_ifindexes(dev, excluded_devices); + excluded_devices[num_excluded++] = dev->ifindex; + } + + if (map->map_type == BPF_MAP_TYPE_DEVMAP) { + for (i = 0; i < map->max_entries; i++) { + dst = rcu_dereference_check(dtab->netdev_map[i], + rcu_read_lock_bh_held()); + if (!dst) + continue; + + if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) + continue; + + /* we only need n-1 clones; last_dst enqueued below */ + if (!last_dst) { + last_dst = dst; + continue; + } + + err = dev_map_redirect_clone(last_dst, skb, xdp_prog); + if (err) + return err; + + last_dst = dst; + + } + } else { /* BPF_MAP_TYPE_DEVMAP_HASH */ + for (i = 0; i < dtab->n_buckets; i++) { + head = dev_map_index_hash(dtab, i); + hlist_for_each_entry_safe(dst, next, head, index_hlist) { + if (!dst) + continue; + + if (is_ifindex_excluded(excluded_devices, num_excluded, + dst->dev->ifindex)) + continue; + + /* we only need n-1 clones; last_dst enqueued below */ + if (!last_dst) { + last_dst = dst; + continue; + } + + err = dev_map_redirect_clone(last_dst, skb, xdp_prog); + if (err) + return err; + + last_dst = dst; + } + } + } + + /* consume the first skb and return */ + if (last_dst) + return dev_map_generic_redirect(last_dst, skb, xdp_prog); + + /* dtab is empty */ + consume_skb(skb); + return 0; +} + static void *dev_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); - struct net_device *dev = obj ? obj->dev : NULL; - return dev ? &dev->ifindex : NULL; + return obj ? &obj->val : NULL; } static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map, *(u32 *)key); - struct net_device *dev = obj ? obj->dev : NULL; - - return dev ? &dev->ifindex : NULL; + return obj ? &obj->val : NULL; } static void __dev_map_entry_free(struct rcu_head *rcu) @@ -492,6 +793,8 @@ static void __dev_map_entry_free(struct rcu_head *rcu) struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -505,14 +808,7 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) if (k >= map->max_entries) return -EINVAL; - /* Use call_rcu() here to ensure any rcu critical sections have - * completed as well as any flush operations because call_rcu - * will wait for preempt-disable region to complete, NAPI in this - * context. And additionally, the driver tear down ensures all - * soft irqs are complete before removing the net device in the - * case of dev_put equals zero. - */ - old_dev = xchg(&dtab->netdev_map[k], NULL); + old_dev = unrcu_pointer(xchg(&dtab->netdev_map[k], NULL)); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); return 0; @@ -542,26 +838,51 @@ static int dev_map_hash_delete_elem(struct bpf_map *map, void *key) static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_dtab *dtab, - u32 ifindex, + struct bpf_devmap_val *val, unsigned int idx) { + struct bpf_prog *prog = NULL; struct bpf_dtab_netdev *dev; - dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, - dtab->map.numa_node); + dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev), + GFP_NOWAIT | __GFP_NOWARN, + dtab->map.numa_node); if (!dev) return ERR_PTR(-ENOMEM); - dev->dev = dev_get_by_index(net, ifindex); - if (!dev->dev) { - kfree(dev); - return ERR_PTR(-EINVAL); + dev->dev = dev_get_by_index(net, val->ifindex); + if (!dev->dev) + goto err_out; + + if (val->bpf_prog.fd > 0) { + prog = bpf_prog_get_type_dev(val->bpf_prog.fd, + BPF_PROG_TYPE_XDP, false); + if (IS_ERR(prog)) + goto err_put_dev; + if (prog->expected_attach_type != BPF_XDP_DEVMAP || + !bpf_prog_map_compatible(&dtab->map, prog)) + goto err_put_prog; } dev->idx = idx; dev->dtab = dtab; + if (prog) { + dev->xdp_prog = prog; + dev->val.bpf_prog.id = prog->aux->id; + } else { + dev->xdp_prog = NULL; + dev->val.bpf_prog.id = 0; + } + dev->val.ifindex = val->ifindex; return dev; +err_put_prog: + bpf_prog_put(prog); +err_put_dev: + dev_put(dev->dev); +err_out: + kfree(dev); + return ERR_PTR(-EINVAL); } static int __dev_map_update_elem(struct net *net, struct bpf_map *map, @@ -569,7 +890,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; - u32 ifindex = *(u32 *)value; + struct bpf_devmap_val val = {}; u32 i = *(u32 *)key; if (unlikely(map_flags > BPF_EXIST)) @@ -579,10 +900,16 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, if (unlikely(map_flags == BPF_NOEXIST)) return -EEXIST; - if (!ifindex) { + /* already verified value_size <= sizeof val */ + memcpy(&val, value, map->value_size); + + if (!val.ifindex) { dev = NULL; + /* can not specify fd if ifindex is 0 */ + if (val.bpf_prog.fd > 0) + return -EINVAL; } else { - dev = __dev_map_alloc_node(net, dtab, ifindex, i); + dev = __dev_map_alloc_node(net, dtab, &val, i); if (IS_ERR(dev)) return PTR_ERR(dev); } @@ -591,7 +918,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, * Remembering the driver side flush operation will happen before the * net device is removed. */ - old_dev = xchg(&dtab->netdev_map[i], dev); + old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev))); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); @@ -610,12 +937,15 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; - u32 ifindex = *(u32 *)value; + struct bpf_devmap_val val = {}; u32 idx = *(u32 *)key; unsigned long flags; int err = -EEXIST; - if (unlikely(map_flags > BPF_EXIST || !ifindex)) + /* already verified value_size <= sizeof val */ + memcpy(&val, value, map->value_size); + + if (unlikely(map_flags > BPF_EXIST || !val.ifindex)) return -EINVAL; spin_lock_irqsave(&dtab->index_lock, flags); @@ -624,7 +954,7 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, if (old_dev && (map_flags & BPF_NOEXIST)) goto out_err; - dev = __dev_map_alloc_node(net, dtab, ifindex, idx); + dev = __dev_map_alloc_node(net, dtab, &val, idx); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out_err; @@ -662,7 +992,23 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, map, key, value, map_flags); } +static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +{ + return __bpf_xdp_redirect_map(map, ifindex, flags, + BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, + __dev_map_lookup_elem); +} + +static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +{ + return __bpf_xdp_redirect_map(map, ifindex, flags, + BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, + __dev_map_hash_lookup_elem); +} + +BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab) const struct bpf_map_ops dev_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc = dev_map_alloc, .map_free = dev_map_free, .map_get_next_key = dev_map_get_next_key, @@ -670,9 +1016,12 @@ const struct bpf_map_ops dev_map_ops = { .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, .map_check_btf = map_check_no_btf, + .map_btf_id = &dev_map_btf_ids[0], + .map_redirect = dev_map_redirect, }; const struct bpf_map_ops dev_map_hash_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc = dev_map_alloc, .map_free = dev_map_free, .map_get_next_key = dev_map_hash_get_next_key, @@ -680,6 +1029,8 @@ const struct bpf_map_ops dev_map_hash_ops = { .map_update_elem = dev_map_hash_update_elem, .map_delete_elem = dev_map_hash_delete_elem, .map_check_btf = map_check_no_btf, + .map_btf_id = &dev_map_btf_ids[0], + .map_redirect = dev_hash_map_redirect, }; static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab, @@ -721,9 +1072,7 @@ static int dev_map_notification(struct notifier_block *notifier, break; /* will be freed in free_netdev() */ - netdev->xdp_bulkq = - __alloc_percpu_gfp(sizeof(struct xdp_dev_bulk_queue), - sizeof(void *), GFP_ATOMIC); + netdev->xdp_bulkq = alloc_percpu(struct xdp_dev_bulk_queue); if (!netdev->xdp_bulkq) return NOTIFY_BAD; @@ -746,10 +1095,10 @@ static int dev_map_notification(struct notifier_block *notifier, for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev, *odev; - dev = READ_ONCE(dtab->netdev_map[i]); + dev = rcu_dereference(dtab->netdev_map[i]); if (!dev || netdev != dev->dev) continue; - odev = cmpxchg(&dtab->netdev_map[i], dev, NULL); + odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev), NULL)); if (dev == odev) call_rcu(&dev->rcu, __dev_map_entry_free); |