From 6f8b12d661d09b488b9ac879b8eafbd2cc4a1450 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Apr 2020 09:13:27 -0700 Subject: net: napi: add hard irqs deferral feature Back in commit 3b47d30396ba ("net: gro: add a per device gro flush timer") we added the ability to arm one high resolution timer, that we used to keep not-complete packets in GRO engine a bit longer, hoping that further frames might be added to them. Since then, we added the napi_complete_done() interface, and commit 364b6055738b ("net: busy-poll: return busypolling status to drivers") allowed drivers to avoid re-arming NIC interrupts if we made a promise that their NAPI poll() handler would be called in the near future. This infrastructure can be leveraged, thanks to a new device parameter, which allows to arm the napi hrtimer, instead of re-arming the device hard IRQ. We have noticed that on some servers with 32 RX queues or more, the chit-chat between the NIC and the host caused by IRQ delivery and re-arming could hurt throughput by ~20% on 100Gbit NIC. In contrast, hrtimers are using local (percpu) resources and might have lower cost. The new tunable, named napi_defer_hard_irqs, is placed in the same hierarchy than gro_flush_timeout (/sys/class/net/ethX/) By default, both gro_flush_timeout and napi_defer_hard_irqs are zero. This patch does not change the prior behavior of gro_flush_timeout if used alone : NIC hard irqs should be rearmed as before. One concrete usage can be : echo 20000 >/sys/class/net/eth1/gro_flush_timeout echo 10 >/sys/class/net/eth1/napi_defer_hard_irqs If at least one packet is retired, then we will reset napi counter to 10 (napi_defer_hard_irqs), ensuring at least 10 periodic scans of the queue. On busy queues, this should avoid NIC hard IRQ, while before this patch IRQ avoidance was only possible if napi->poll() was exhausting its budget and not call napi_complete_done(). This feature also can be used to work around some non-optimal NIC irq coalescing strategies. Having the ability to insert XX usec delays between each napi->poll() can increase cache efficiency, since we increase batch sizes. It also keeps serving cpus not idle too long, reducing tail latencies. Co-developed-by: Luigi Rizzo Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index fb61522b1ce1..67585484ad32 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6227,7 +6227,8 @@ EXPORT_SYMBOL(__napi_schedule_irqoff); bool napi_complete_done(struct napi_struct *n, int work_done) { - unsigned long flags, val, new; + unsigned long flags, val, new, timeout = 0; + bool ret = true; /* * 1) Don't let napi dequeue from the cpu poll list @@ -6239,20 +6240,23 @@ bool napi_complete_done(struct napi_struct *n, int work_done) NAPIF_STATE_IN_BUSY_POLL))) return false; - if (n->gro_bitmask) { - unsigned long timeout = 0; - - if (work_done) + if (work_done) { + if (n->gro_bitmask) timeout = n->dev->gro_flush_timeout; - + n->defer_hard_irqs_count = n->dev->napi_defer_hard_irqs; + } + if (n->defer_hard_irqs_count > 0) { + n->defer_hard_irqs_count--; + timeout = n->dev->gro_flush_timeout; + if (timeout) + ret = false; + } + if (n->gro_bitmask) { /* When the NAPI instance uses a timeout and keeps postponing * it, we need to bound somehow the time packets are kept in * the GRO layer */ napi_gro_flush(n, !!timeout); - if (timeout) - hrtimer_start(&n->timer, ns_to_ktime(timeout), - HRTIMER_MODE_REL_PINNED); } gro_normal_list(n); @@ -6284,7 +6288,10 @@ bool napi_complete_done(struct napi_struct *n, int work_done) return false; } - return true; + if (timeout) + hrtimer_start(&n->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); + return ret; } EXPORT_SYMBOL(napi_complete_done); @@ -6464,7 +6471,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) /* Note : we use a relaxed variant of napi_schedule_prep() not setting * NAPI_STATE_MISSED, since we do not react to a device IRQ. */ - if (napi->gro_bitmask && !napi_disable_pending(napi) && + if (!napi_disable_pending(napi) && !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) __napi_schedule_irqoff(napi); -- cgit v1.2.3-59-g8ed1b