From a74f0fa082b76c6a76cba5672f36218518bfdc09 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 4 Dec 2018 07:58:17 -0800 Subject: tcp: reduce POLLOUT events caused by TCP_NOTSENT_LOWAT TCP_NOTSENT_LOWAT socket option or sysctl was added in linux-3.12 as a step to enable bigger tcp sndbuf limits. It works reasonably well, but the following happens : Once the limit is reached, TCP stack generates an [E]POLLOUT event for every incoming ACK packet. This causes a high number of context switches. This patch implements the strategy David Miller added in sock_def_write_space() : - If TCP socket has a notsent_lowat constraint of X bytes, allow sendmsg() to fill up to X bytes, but send [E]POLLOUT only if number of notsent bytes is below X/2 This considerably reduces TCP_NOTSENT_LOWAT overhead, while allowing to keep the pipe full. Tested: 100 ms RTT netem testbed between A and B, 100 concurrent TCP_STREAM A:/# cat /proc/sys/net/ipv4/tcp_wmem 4096 262144 64000000 A:/# super_netperf 100 -H B -l 1000 -- -K bbr & A:/# grep TCP /proc/net/sockstat TCP: inuse 203 orphan 0 tw 19 alloc 414 mem 1364904 # This is about 54 MB of memory per flow :/ A:/# vmstat 5 5 procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu----- r b swpd free buff cache si so bi bo in cs us sy id wa st 0 0 0 256220672 13532 694976 0 0 10 0 28 14 0 1 99 0 0 2 0 0 256320016 13532 698480 0 0 512 0 715901 5927 0 10 90 0 0 0 0 0 256197232 13532 700992 0 0 735 13 771161 5849 0 11 89 0 0 1 0 0 256233824 13532 703320 0 0 512 23 719650 6635 0 11 89 0 0 2 0 0 256226880 13532 705780 0 0 642 4 775650 6009 0 12 88 0 0 A:/# echo 2097152 >/proc/sys/net/ipv4/tcp_notsent_lowat A:/# grep TCP /proc/net/sockstat TCP: inuse 203 orphan 0 tw 19 alloc 414 mem 86411 # 3.5 MB per flow A:/# vmstat 5 5 # check that context switches have not inflated too much. procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu----- r b swpd free buff cache si so bi bo in cs us sy id wa st 2 0 0 260386512 13592 662148 0 0 10 0 17 14 0 1 99 0 0 0 0 0 260519680 13592 604184 0 0 512 13 726843 12424 0 10 90 0 0 1 1 0 260435424 13592 598360 0 0 512 25 764645 12925 0 10 90 0 0 1 0 0 260855392 13592 578380 0 0 512 7 722943 13624 0 11 88 0 0 1 0 0 260445008 13592 601176 0 0 614 34 772288 14317 0 10 90 0 0 Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/net/tcp.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index 0681afc62354..e0a65c067662 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1870,12 +1870,16 @@ static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) return tp->notsent_lowat ?: net->ipv4.sysctl_tcp_notsent_lowat; } -static inline bool tcp_stream_memory_free(const struct sock *sk) +/* @wake is one when sk_stream_write_space() calls us. + * This sends EPOLLOUT only if notsent_bytes is half the limit. + * This mimics the strategy used in sock_def_write_space(). + */ +static inline bool tcp_stream_memory_free(const struct sock *sk, int wake) { const struct tcp_sock *tp = tcp_sk(sk); u32 notsent_bytes = tp->write_seq - tp->snd_nxt; - return notsent_bytes < tcp_notsent_lowat(tp); + return (notsent_bytes << wake) < tcp_notsent_lowat(tp); } #ifdef CONFIG_PROC_FS -- cgit v1.2.3-59-g8ed1b