aboutsummaryrefslogtreecommitdiffstatshomepage
diff options
context:
space:
mode:
authorJakub Kicinski <kuba@kernel.org>2025-04-28 14:07:00 -0700
committerJakub Kicinski <kuba@kernel.org>2025-04-28 14:07:00 -0700
commitc0b0a360ed4ca077cf72a9c3cd15ac89f6093f3a (patch)
tree50330018e048d9c73241561508401c58209dad44
parentmdio: fix CONFIG_MDIO_DEVRES selects (diff)
parentveth: apply qdisc backpressure on full ptr_ring to reduce TX drops (diff)
downloadwireguard-linux-c0b0a360ed4ca077cf72a9c3cd15ac89f6093f3a.tar.xz
wireguard-linux-c0b0a360ed4ca077cf72a9c3cd15ac89f6093f3a.zip
Merge branch 'veth-qdisc-backpressure-and-qdisc-check-refactor'
Jesper Dangaard Brouer says: ==================== veth: qdisc backpressure and qdisc check refactor This patch series addresses TX drops seen on veth devices under load, particularly when using threaded NAPI, which is our setup in production. The root cause is that the NAPI consumer often runs on a different CPU than the producer. Combined with scheduling delays or simply slower consumption, this increases the chance that the ptr_ring fills up before packets are drained, resulting in drops from veth_xmit() (ndo_start_xmit()). To make this easier to reproduce, we’ve created a script that sets up a test scenario using network namespaces. The script inserts 1000 iptables rules in the consumer namespace to slow down packet processing and amplify the issue. Reproducer script: https://github.com/xdp-project/xdp-project/blob/main/areas/core/veth_setup01_NAPI_TX_drops.sh This series first introduces a helper to detect no-queue qdiscs and then uses it in the veth driver to conditionally apply qdisc-level backpressure when a real qdisc is attached. The behavior is off by default and opt-in, ensuring minimal impact and easy activation. v6: https://lore.kernel.org/174549933665.608169.392044991754158047.stgit@firesoul v5: https://lore.kernel.org/174489803410.355490.13216831426556849084.stgit@firesoul v4 https://lore.kernel.org/174472463778.274639.12670590457453196991.stgit@firesoul v3: https://lore.kernel.org/174464549885.20396.6987653753122223942.stgit@firesoul v2: https://lore.kernel.org/174412623473.3702169.4235683143719614624.stgit@firesoul RFC-v1: https://lore.kernel.org/174377814192.3376479.16481605648460889310.stgit@firesoul ==================== Link: https://patch.msgid.link/174559288731.827981.8748257839971869213.stgit@firesoul Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r--drivers/net/veth.c57
-rw-r--r--drivers/net/vrf.c4
-rw-r--r--include/net/sch_generic.h8
3 files changed, 56 insertions, 13 deletions
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 7bb53961c0ea..e58a0f1b5c5b 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -307,12 +307,10 @@ static void __veth_xdp_flush(struct veth_rq *rq)
static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb)
{
- if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) {
- dev_kfree_skb_any(skb);
- return NET_RX_DROP;
- }
+ if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb)))
+ return NETDEV_TX_BUSY; /* signal qdisc layer */
- return NET_RX_SUCCESS;
+ return NET_RX_SUCCESS; /* same as NETDEV_TX_OK */
}
static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
@@ -346,11 +344,11 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
struct veth_rq *rq = NULL;
- int ret = NETDEV_TX_OK;
+ struct netdev_queue *txq;
struct net_device *rcv;
int length = skb->len;
bool use_napi = false;
- int rxq;
+ int ret, rxq;
rcu_read_lock();
rcv = rcu_dereference(priv->peer);
@@ -373,17 +371,45 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
}
skb_tx_timestamp(skb);
- if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) {
+
+ ret = veth_forward_skb(rcv, skb, rq, use_napi);
+ switch (ret) {
+ case NET_RX_SUCCESS: /* same as NETDEV_TX_OK */
if (!use_napi)
dev_sw_netstats_tx_add(dev, 1, length);
else
__veth_xdp_flush(rq);
- } else {
+ break;
+ case NETDEV_TX_BUSY:
+ /* If a qdisc is attached to our virtual device, returning
+ * NETDEV_TX_BUSY is allowed.
+ */
+ txq = netdev_get_tx_queue(dev, rxq);
+
+ if (qdisc_txq_has_no_queue(txq)) {
+ dev_kfree_skb_any(skb);
+ goto drop;
+ }
+ /* Restore Eth hdr pulled by dev_forward_skb/eth_type_trans */
+ __skb_push(skb, ETH_HLEN);
+ /* Depend on prior success packets started NAPI consumer via
+ * __veth_xdp_flush(). Cancel TXQ stop if consumer stopped,
+ * paired with empty check in veth_poll().
+ */
+ netif_tx_stop_queue(txq);
+ smp_mb__after_atomic();
+ if (unlikely(__ptr_ring_empty(&rq->xdp_ring)))
+ netif_tx_wake_queue(txq);
+ break;
+ case NET_RX_DROP: /* same as NET_XMIT_DROP */
drop:
atomic64_inc(&priv->dropped);
ret = NET_XMIT_DROP;
+ break;
+ default:
+ net_crit_ratelimited("%s(%s): Invalid return code(%d)",
+ __func__, dev->name, ret);
}
-
rcu_read_unlock();
return ret;
@@ -874,9 +900,17 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
struct veth_xdp_tx_bq *bq,
struct veth_stats *stats)
{
+ struct veth_priv *priv = netdev_priv(rq->dev);
+ int queue_idx = rq->xdp_rxq.queue_index;
+ struct netdev_queue *peer_txq;
+ struct net_device *peer_dev;
int i, done = 0, n_xdpf = 0;
void *xdpf[VETH_XDP_BATCH];
+ /* NAPI functions as RCU section */
+ peer_dev = rcu_dereference_check(priv->peer, rcu_read_lock_bh_held());
+ peer_txq = netdev_get_tx_queue(peer_dev, queue_idx);
+
for (i = 0; i < budget; i++) {
void *ptr = __ptr_ring_consume(&rq->xdp_ring);
@@ -925,6 +959,9 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
rq->stats.vs.xdp_packets += done;
u64_stats_update_end(&rq->stats.syncp);
+ if (unlikely(netif_tx_queue_stopped(peer_txq)))
+ netif_tx_wake_queue(peer_txq);
+
return done;
}
diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 7168b33adadb..9a4beea6ee0c 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -343,15 +343,13 @@ unlock:
static bool qdisc_tx_is_default(const struct net_device *dev)
{
struct netdev_queue *txq;
- struct Qdisc *qdisc;
if (dev->num_tx_queues > 1)
return false;
txq = netdev_get_tx_queue(dev, 0);
- qdisc = rcu_access_pointer(txq->qdisc);
- return !qdisc->enqueue;
+ return qdisc_txq_has_no_queue(txq);
}
/* Local traffic destined to local address. Reinsert the packet to rx
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index d48c657191cd..b6c177f7141c 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -803,6 +803,14 @@ static inline bool qdisc_tx_changing(const struct net_device *dev)
return false;
}
+/* "noqueue" qdisc identified by not having any enqueue, see noqueue_init() */
+static inline bool qdisc_txq_has_no_queue(const struct netdev_queue *txq)
+{
+ struct Qdisc *qdisc = rcu_access_pointer(txq->qdisc);
+
+ return qdisc->enqueue == NULL;
+}
+
/* Is the device using the noop qdisc on all queues? */
static inline bool qdisc_tx_is_noop(const struct net_device *dev)
{