aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--drivers/net/ethernet/intel/ixgbe/ixgbe_main.c28
-rw-r--r--include/linux/bpf.h2
-rw-r--r--include/linux/filter.h7
-rw-r--r--include/linux/netdevice.h5
-rw-r--r--kernel/bpf/devmap.c84
-rw-r--r--net/core/filter.c55
6 files changed, 166 insertions, 15 deletions
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 38f7ff97d636..0f867dcda65f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -2415,6 +2415,8 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
*/
wmb();
writel(ring->next_to_use, ring->tail);
+
+ xdp_do_flush_map();
}
u64_stats_update_begin(&rx_ring->syncp);
@@ -5817,6 +5819,9 @@ void ixgbe_down(struct ixgbe_adapter *adapter)
usleep_range(10000, 20000);
+ /* synchronize_sched() needed for pending XDP buffers to drain */
+ if (adapter->xdp_ring[0])
+ synchronize_sched();
netif_tx_stop_all_queues(netdev);
/* call carrier off first to avoid false dev_watchdog timeouts */
@@ -9850,15 +9855,31 @@ static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
if (err != IXGBE_XDP_TX)
return -ENOMEM;
+ return 0;
+}
+
+static void ixgbe_xdp_flush(struct net_device *dev)
+{
+ struct ixgbe_adapter *adapter = netdev_priv(dev);
+ struct ixgbe_ring *ring;
+
+ /* Its possible the device went down between xdp xmit and flush so
+ * we need to ensure device is still up.
+ */
+ if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
+ return;
+
+ ring = adapter->xdp_prog ? adapter->xdp_ring[smp_processor_id()] : NULL;
+ if (unlikely(!ring))
+ return;
+
/* Force memory writes to complete before letting h/w know there
* are new descriptors to fetch.
*/
wmb();
-
- ring = adapter->xdp_ring[smp_processor_id()];
writel(ring->next_to_use, ring->tail);
- return 0;
+ return;
}
static const struct net_device_ops ixgbe_netdev_ops = {
@@ -9908,6 +9929,7 @@ static const struct net_device_ops ixgbe_netdev_ops = {
.ndo_features_check = ixgbe_features_check,
.ndo_xdp = ixgbe_xdp,
.ndo_xdp_xmit = ixgbe_xdp_xmit,
+ .ndo_xdp_flush = ixgbe_xdp_flush,
};
/**
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d0d3281ac678..6850a760dc94 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -381,5 +381,7 @@ u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
/* Map specifics */
struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
+void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
+void __dev_map_flush(struct bpf_map *map);
#endif /* _LINUX_BPF_H */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index ce8211fa91c7..3323ee91c172 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -712,10 +712,17 @@ bool bpf_helper_changes_pkt_data(void *func);
struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
const struct bpf_insn *patch, u32 len);
+/* The pair of xdp_do_redirect and xdp_do_flush_map MUST be called in the
+ * same cpu context. Further for best results no more than a single map
+ * for the do_redirect/do_flush pair should be used. This limitation is
+ * because we only track one map and force a flush when the map changes.
+ * This does not appear to be a real limiation for existing software.
+ */
int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb);
int xdp_do_redirect(struct net_device *dev,
struct xdp_buff *xdp,
struct bpf_prog *prog);
+void xdp_do_flush_map(void);
void bpf_warn_invalid_xdp_action(u32 act);
void bpf_warn_invalid_xdp_redirect(u32 ifindex);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 77f5376005e6..03b104908235 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1142,7 +1142,9 @@ struct xfrmdev_ops {
* int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp);
* This function is used to submit a XDP packet for transmit on a
* netdevice.
- *
+ * void (*ndo_xdp_flush)(struct net_device *dev);
+ * This function is used to inform the driver to flush a paticular
+ * xpd tx queue. Must be called on same CPU as xdp_xmit.
*/
struct net_device_ops {
int (*ndo_init)(struct net_device *dev);
@@ -1329,6 +1331,7 @@ struct net_device_ops {
struct netdev_xdp *xdp);
int (*ndo_xdp_xmit)(struct net_device *dev,
struct xdp_buff *xdp);
+ void (*ndo_xdp_flush)(struct net_device *dev);
};
/**
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 36dc13deb2e1..b2ef04a1c86a 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -53,6 +53,7 @@ struct bpf_dtab_netdev {
struct bpf_dtab {
struct bpf_map map;
struct bpf_dtab_netdev **netdev_map;
+ unsigned long int __percpu *flush_needed;
};
static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
@@ -87,6 +88,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
/* make sure page count doesn't overflow */
cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
+ cost += BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
if (cost >= U32_MAX - PAGE_SIZE)
goto free_dtab;
@@ -97,6 +99,14 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
if (err)
goto free_dtab;
+ /* A per cpu bitfield with a bit per possible net device */
+ dtab->flush_needed = __alloc_percpu(
+ BITS_TO_LONGS(attr->max_entries) *
+ sizeof(unsigned long),
+ __alignof__(unsigned long));
+ if (!dtab->flush_needed)
+ goto free_dtab;
+
dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
sizeof(struct bpf_dtab_netdev *));
if (!dtab->netdev_map)
@@ -105,6 +115,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
return &dtab->map;
free_dtab:
+ free_percpu(dtab->flush_needed);
kfree(dtab);
return ERR_PTR(err);
}
@@ -112,7 +123,7 @@ free_dtab:
static void dev_map_free(struct bpf_map *map)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- int i;
+ int i, cpu;
/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
* so the programs (can be more than one that used this map) were
@@ -123,6 +134,18 @@ static void dev_map_free(struct bpf_map *map)
*/
synchronize_rcu();
+ /* To ensure all pending flush operations have completed wait for flush
+ * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
+ * Because the above synchronize_rcu() ensures the map is disconnected
+ * from the program we can assume no new bits will be set.
+ */
+ for_each_online_cpu(cpu) {
+ unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu);
+
+ while (!bitmap_empty(bitmap, dtab->map.max_entries))
+ cpu_relax();
+ }
+
for (i = 0; i < dtab->map.max_entries; i++) {
struct bpf_dtab_netdev *dev;
@@ -137,6 +160,7 @@ static void dev_map_free(struct bpf_map *map)
/* At this point bpf program is detached and all pending operations
* _must_ be complete
*/
+ free_percpu(dtab->flush_needed);
bpf_map_area_free(dtab->netdev_map);
kfree(dtab);
}
@@ -159,6 +183,14 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
+void __dev_map_insert_ctx(struct bpf_map *map, u32 key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
+
+ __set_bit(key, bitmap);
+}
+
struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
@@ -171,6 +203,39 @@ struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
return dev ? dev->dev : NULL;
}
+/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
+ * from the driver before returning from its napi->poll() routine. The poll()
+ * routine is called either from busy_poll context or net_rx_action signaled
+ * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
+ * net device can be torn down. On devmap tear down we ensure the ctx bitmap
+ * is zeroed before completing to ensure all flush operations have completed.
+ */
+void __dev_map_flush(struct bpf_map *map)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
+ u32 bit;
+
+ for_each_set_bit(bit, bitmap, map->max_entries) {
+ struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
+ struct net_device *netdev;
+
+ /* This is possible if the dev entry is removed by user space
+ * between xdp redirect and flush op.
+ */
+ if (unlikely(!dev))
+ continue;
+
+ netdev = dev->dev;
+
+ __clear_bit(bit, bitmap);
+ if (unlikely(!netdev || !netdev->netdev_ops->ndo_xdp_flush))
+ continue;
+
+ netdev->netdev_ops->ndo_xdp_flush(netdev);
+ }
+}
+
/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
* update happens in parallel here a dev_put wont happen until after reading the
* ifindex.
@@ -188,11 +253,28 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
return dev ? &dev->dev->ifindex : NULL;
}
+static void dev_map_flush_old(struct bpf_dtab_netdev *old_dev)
+{
+ if (old_dev->dev->netdev_ops->ndo_xdp_flush) {
+ struct net_device *fl = old_dev->dev;
+ unsigned long *bitmap;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ bitmap = per_cpu_ptr(old_dev->dtab->flush_needed, cpu);
+ __clear_bit(old_dev->key, bitmap);
+
+ fl->netdev_ops->ndo_xdp_flush(old_dev->dev);
+ }
+ }
+}
+
static void __dev_map_entry_free(struct rcu_head *rcu)
{
struct bpf_dtab_netdev *old_dev;
old_dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
+ dev_map_flush_old(old_dev);
dev_put(old_dev->dev);
kfree(old_dev);
}
diff --git a/net/core/filter.c b/net/core/filter.c
index e93a558324b5..e23aa6fa1119 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1780,6 +1780,7 @@ struct redirect_info {
u32 ifindex;
u32 flags;
struct bpf_map *map;
+ struct bpf_map *map_to_flush;
};
static DEFINE_PER_CPU(struct redirect_info, redirect_info);
@@ -2438,34 +2439,68 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
.arg2_type = ARG_ANYTHING,
};
-static int __bpf_tx_xdp(struct net_device *dev, struct xdp_buff *xdp)
+static int __bpf_tx_xdp(struct net_device *dev,
+ struct bpf_map *map,
+ struct xdp_buff *xdp,
+ u32 index)
{
- if (dev->netdev_ops->ndo_xdp_xmit) {
- dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
- return 0;
+ int err;
+
+ if (!dev->netdev_ops->ndo_xdp_xmit) {
+ bpf_warn_invalid_xdp_redirect(dev->ifindex);
+ return -EOPNOTSUPP;
}
- bpf_warn_invalid_xdp_redirect(dev->ifindex);
- return -EOPNOTSUPP;
+
+ err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
+ if (err)
+ return err;
+
+ if (map)
+ __dev_map_insert_ctx(map, index);
+ else
+ dev->netdev_ops->ndo_xdp_flush(dev);
+
+ return err;
}
+void xdp_do_flush_map(void)
+{
+ struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+ struct bpf_map *map = ri->map_to_flush;
+
+ ri->map = NULL;
+ ri->map_to_flush = NULL;
+
+ if (map)
+ __dev_map_flush(map);
+}
+EXPORT_SYMBOL_GPL(xdp_do_flush_map);
+
int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
struct redirect_info *ri = this_cpu_ptr(&redirect_info);
struct bpf_map *map = ri->map;
+ u32 index = ri->ifindex;
struct net_device *fwd;
int err = -EINVAL;
ri->ifindex = 0;
ri->map = NULL;
- fwd = __dev_map_lookup_elem(map, ri->ifindex);
+ fwd = __dev_map_lookup_elem(map, index);
if (!fwd)
goto out;
- trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT);
- err = __bpf_tx_xdp(fwd, xdp);
+ if (ri->map_to_flush && (ri->map_to_flush != map))
+ xdp_do_flush_map();
+
+ err = __bpf_tx_xdp(fwd, map, xdp, index);
+ if (likely(!err))
+ ri->map_to_flush = map;
+
out:
+ trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT);
return err;
}
@@ -2488,7 +2523,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT);
- return __bpf_tx_xdp(fwd, xdp);
+ return __bpf_tx_xdp(fwd, NULL, xdp, 0);
}
EXPORT_SYMBOL_GPL(xdp_do_redirect);