From 3b24d854cb35383c30642116e5992fd619bdc9bc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 Apr 2016 08:52:17 -0700 Subject: tcp/dccp: do not touch listener sk_refcnt under synflood When a SYNFLOOD targets a non SO_REUSEPORT listener, multiple cpus contend on sk->sk_refcnt and sk->sk_wmem_alloc changes. By letting listeners use SOCK_RCU_FREE infrastructure, we can relax TCP_LISTEN lookup rules and avoid touching sk_refcnt Note that we still use SLAB_DESTROY_BY_RCU rules for other sockets, only listeners are impacted by this change. Peak performance under SYNFLOOD is increased by ~33% : On my test machine, I could process 3.2 Mpps instead of 2.4 Mpps Most consuming functions are now skb_set_owner_w() and sock_wfree() contending on sk->sk_wmem_alloc when cooking SYNACK and freeing them. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 3 +- net/ipv4/inet_hashtables.c | 73 ++++++++++++++++------------------------------ net/ipv4/tcp_ipv4.c | 66 ++++++++++++++++++++--------------------- 3 files changed, 59 insertions(+), 83 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index ea8df527b279..bd591eb81ec9 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -775,13 +775,12 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, for (i = s_i; i < INET_LHTABLE_SIZE; i++) { struct inet_listen_hashbucket *ilb; - struct hlist_nulls_node *node; struct sock *sk; num = 0; ilb = &hashinfo->listening_hash[i]; spin_lock_bh(&ilb->lock); - sk_nulls_for_each(sk, node, &ilb->head) { + sk_for_each(sk, &ilb->head) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 387338d71dcd..98ba03b6f87d 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -198,13 +198,13 @@ static inline int compute_score(struct sock *sk, struct net *net, } /* - * Don't inline this cruft. Here are some nice properties to exploit here. The - * BSD API does not allow a listening sock to specify the remote port nor the + * Here are some nice properties to exploit here. The BSD API + * does not allow a listening sock to specify the remote port nor the * remote address for the connection. So always assume those are both * wildcarded during the search since they can never be otherwise. */ - +/* called with rcu_read_lock() : No refcount taken on the socket */ struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, @@ -212,37 +212,27 @@ struct sock *__inet_lookup_listener(struct net *net, const __be32 daddr, const unsigned short hnum, const int dif) { - struct sock *sk, *result; - struct hlist_nulls_node *node; unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; - int score, hiscore, matches = 0, reuseport = 0; - bool select_ok = true; + int score, hiscore = 0, matches = 0, reuseport = 0; + struct sock *sk, *result = NULL; u32 phash = 0; -begin: - result = NULL; - hiscore = 0; - sk_nulls_for_each_rcu(sk, node, &ilb->head) { + sk_for_each_rcu(sk, &ilb->head) { score = compute_score(sk, net, hnum, daddr, dif); if (score > hiscore) { - result = sk; - hiscore = score; reuseport = sk->sk_reuseport; if (reuseport) { phash = inet_ehashfn(net, daddr, hnum, saddr, sport); - if (select_ok) { - struct sock *sk2; - sk2 = reuseport_select_sock(sk, phash, - skb, doff); - if (sk2) { - result = sk2; - goto found; - } - } + result = reuseport_select_sock(sk, phash, + skb, doff); + if (result) + return result; matches = 1; } + result = sk; + hiscore = score; } else if (score == hiscore && reuseport) { matches++; if (reciprocal_scale(phash, matches) == 0) @@ -250,24 +240,6 @@ begin: phash = next_pseudo_random32(phash); } } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. - */ - if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE) - goto begin; - if (result) { -found: - if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) - result = NULL; - else if (unlikely(compute_score(result, net, hnum, daddr, - dif) < hiscore)) { - sock_put(result); - select_ok = false; - goto begin; - } - } return result; } EXPORT_SYMBOL_GPL(__inet_lookup_listener); @@ -508,7 +480,8 @@ int __inet_hash(struct sock *sk, struct sock *osk, if (err) goto unlock; } - __sk_nulls_add_node_rcu(sk, &ilb->head); + hlist_add_head_rcu(&sk->sk_node, &ilb->head); + sock_set_flag(sk, SOCK_RCU_FREE); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); unlock: spin_unlock(&ilb->lock); @@ -535,20 +508,25 @@ void inet_unhash(struct sock *sk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; spinlock_t *lock; + bool listener = false; int done; if (sk_unhashed(sk)) return; - if (sk->sk_state == TCP_LISTEN) + if (sk->sk_state == TCP_LISTEN) { lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock; - else + listener = true; + } else { lock = inet_ehash_lockp(hashinfo, sk->sk_hash); - + } spin_lock_bh(lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); - done = __sk_nulls_del_node_init_rcu(sk); + if (listener) + done = __sk_del_node_init(sk); + else + done = __sk_nulls_del_node_init_rcu(sk); if (done) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock_bh(lock); @@ -684,9 +662,8 @@ void inet_hashinfo_init(struct inet_hashinfo *h) for (i = 0; i < INET_LHTABLE_SIZE; i++) { spin_lock_init(&h->listening_hash[i].lock); - INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head, - i + LISTENING_NULLS_BASE); - } + INIT_HLIST_HEAD(&h->listening_hash[i].head); + } } EXPORT_SYMBOL_GPL(inet_hashinfo_init); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ad450509029b..e5f924b29946 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -628,6 +628,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); #ifdef CONFIG_TCP_MD5SIG + rcu_read_lock(); hash_location = tcp_parse_md5sig_option(th); if (sk && sk_fullsock(sk)) { key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *) @@ -646,16 +647,18 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) ntohs(th->source), inet_iif(skb)); /* don't send rst if it can't find key */ if (!sk1) - return; - rcu_read_lock(); + goto out; + key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *) &ip_hdr(skb)->saddr, AF_INET); if (!key) - goto release_sk1; + goto out; + genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) - goto release_sk1; + goto out; + } if (key) { @@ -698,11 +701,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); #ifdef CONFIG_TCP_MD5SIG -release_sk1: - if (sk1) { - rcu_read_unlock(); - sock_put(sk1); - } +out: + rcu_read_unlock(); #endif } @@ -1538,11 +1538,12 @@ EXPORT_SYMBOL(tcp_prequeue); int tcp_v4_rcv(struct sk_buff *skb) { + struct net *net = dev_net(skb->dev); const struct iphdr *iph; const struct tcphdr *th; + bool refcounted; struct sock *sk; int ret; - struct net *net = dev_net(skb->dev); if (skb->pkt_type != PACKET_HOST) goto discard_it; @@ -1588,7 +1589,7 @@ int tcp_v4_rcv(struct sk_buff *skb) lookup: sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, - th->dest); + th->dest, &refcounted); if (!sk) goto no_tcp_socket; @@ -1609,7 +1610,11 @@ process: inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } + /* We own a reference on the listener, increase it again + * as we might lose it too soon. + */ sock_hold(sk); + refcounted = true; nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); @@ -1665,7 +1670,8 @@ process: bh_unlock_sock(sk); put_and_return: - sock_put(sk); + if (refcounted) + sock_put(sk); return ret; @@ -1688,7 +1694,8 @@ discard_it: return 0; discard_and_relse: - sock_put(sk); + if (refcounted) + sock_put(sk); goto discard_it; do_time_wait: @@ -1712,6 +1719,7 @@ do_time_wait: if (sk2) { inet_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; + refcounted = false; goto process; } /* Fall through to ACK */ @@ -1845,17 +1853,17 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); */ static void *listening_get_next(struct seq_file *seq, void *cur) { - struct inet_connection_sock *icsk; - struct hlist_nulls_node *node; - struct sock *sk = cur; - struct inet_listen_hashbucket *ilb; struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); + struct inet_listen_hashbucket *ilb; + struct inet_connection_sock *icsk; + struct sock *sk = cur; if (!sk) { +get_head: ilb = &tcp_hashinfo.listening_hash[st->bucket]; spin_lock_bh(&ilb->lock); - sk = sk_nulls_head(&ilb->head); + sk = sk_head(&ilb->head); st->offset = 0; goto get_sk; } @@ -1863,28 +1871,20 @@ static void *listening_get_next(struct seq_file *seq, void *cur) ++st->num; ++st->offset; - sk = sk_nulls_next(sk); + sk = sk_next(sk); get_sk: - sk_nulls_for_each_from(sk, node) { + sk_for_each_from(sk) { if (!net_eq(sock_net(sk), net)) continue; - if (sk->sk_family == st->family) { - cur = sk; - goto out; - } + if (sk->sk_family == st->family) + return sk; icsk = inet_csk(sk); } spin_unlock_bh(&ilb->lock); st->offset = 0; - if (++st->bucket < INET_LHTABLE_SIZE) { - ilb = &tcp_hashinfo.listening_hash[st->bucket]; - spin_lock_bh(&ilb->lock); - sk = sk_nulls_head(&ilb->head); - goto get_sk; - } - cur = NULL; -out: - return cur; + if (++st->bucket < INET_LHTABLE_SIZE) + goto get_head; + return NULL; } static void *listening_get_idx(struct seq_file *seq, loff_t *pos) -- cgit v1.2.3-59-g8ed1b