From 61b7c691c7317529375f90f0a81a331990b1ec1b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 1 Dec 2017 12:52:31 -0800 Subject: inet: Add a 2nd listener hashtable (port+addr) The current listener hashtable is hashed by port only. When a process is listening at many IP addresses with the same port (e.g. [IP1]:443, [IP2]:443... [IPN]:443), the inet[6]_lookup_listener() performance is degraded to a link list. It is prone to syn attack. UDP had a similar issue and a second hashtable was added to resolve it. This patch adds a second hashtable for the listener's sockets. The second hashtable is hashed by port and address. It cannot reuse the existing skc_portaddr_node which is shared with skc_bind_node. TCP listener needs to use skc_bind_node. Instead, this patch adds a hlist_node 'icsk_listen_portaddr_node' to the inet_connection_sock which the listener (like TCP) also belongs to. The new portaddr hashtable may need two lookup (First by IP:PORT. Second by INADDR_ANY:PORT if the IP:PORT is a not found). Hence, it implements a similar cut off as UDP such that it will only consult the new portaddr hashtable if the current port-only hashtable has >10 sk in the link-list. lhash2 and lhash2_mask are added to 'struct inet_hashinfo'. I take this chance to plug a 4 bytes hole. It is done by first moving the existing bind_bucket_cachep up and then add the new (int lhash2_mask, *lhash2) after the existing bhash_size. Signed-off-by: Martin KaFai Lau Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_hashtables.c | 168 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 159 insertions(+), 9 deletions(-) (limited to 'net/ipv4/inet_hashtables.c') diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 80cfd3fa21ca..f6f58108b4c5 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -168,6 +169,60 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) } EXPORT_SYMBOL_GPL(__inet_inherit_port); +static struct inet_listen_hashbucket * +inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk) +{ + u32 hash; + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + hash = ipv6_portaddr_hash(sock_net(sk), + &sk->sk_v6_rcv_saddr, + inet_sk(sk)->inet_num); + else +#endif + hash = ipv4_portaddr_hash(sock_net(sk), + inet_sk(sk)->inet_rcv_saddr, + inet_sk(sk)->inet_num); + return inet_lhash2_bucket(h, hash); +} + +static void inet_hash2(struct inet_hashinfo *h, struct sock *sk) +{ + struct inet_listen_hashbucket *ilb2; + + if (!h->lhash2) + return; + + ilb2 = inet_lhash2_bucket_sk(h, sk); + + spin_lock(&ilb2->lock); + if (sk->sk_reuseport && sk->sk_family == AF_INET6) + hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, + &ilb2->head); + else + hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, + &ilb2->head); + ilb2->count++; + spin_unlock(&ilb2->lock); +} + +static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk) +{ + struct inet_listen_hashbucket *ilb2; + + if (!h->lhash2 || + WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node))) + return; + + ilb2 = inet_lhash2_bucket_sk(h, sk); + + spin_lock(&ilb2->lock); + hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node); + ilb2->count--; + spin_unlock(&ilb2->lock); +} + static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const __be32 daddr, const int dif, const int sdif, bool exact_dif) @@ -207,6 +262,40 @@ static inline int compute_score(struct sock *sk, struct net *net, */ /* called with rcu_read_lock() : No refcount taken on the socket */ +static struct sock *inet_lhash2_lookup(struct net *net, + struct inet_listen_hashbucket *ilb2, + struct sk_buff *skb, int doff, + const __be32 saddr, __be16 sport, + const __be32 daddr, const unsigned short hnum, + const int dif, const int sdif) +{ + bool exact_dif = inet_exact_dif_match(net, skb); + struct inet_connection_sock *icsk; + struct sock *sk, *result = NULL; + int score, hiscore = 0; + u32 phash = 0; + + inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { + sk = (struct sock *)icsk; + score = compute_score(sk, net, hnum, daddr, + dif, sdif, exact_dif); + if (score > hiscore) { + if (sk->sk_reuseport) { + phash = inet_ehashfn(net, daddr, hnum, + saddr, sport); + result = reuseport_select_sock(sk, phash, + skb, doff); + if (result) + return result; + } + result = sk; + hiscore = score; + } + } + + return result; +} + struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, @@ -217,10 +306,42 @@ struct sock *__inet_lookup_listener(struct net *net, unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; bool exact_dif = inet_exact_dif_match(net, skb); + struct inet_listen_hashbucket *ilb2; struct sock *sk, *result = NULL; int score, hiscore = 0; + unsigned int hash2; u32 phash = 0; + if (ilb->count <= 10 || !hashinfo->lhash2) + goto port_lookup; + + /* Too many sk in the ilb bucket (which is hashed by port alone). + * Try lhash2 (which is hashed by port and addr) instead. + */ + + hash2 = ipv4_portaddr_hash(net, daddr, hnum); + ilb2 = inet_lhash2_bucket(hashinfo, hash2); + if (ilb2->count > ilb->count) + goto port_lookup; + + result = inet_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + if (result) + return result; + + /* Lookup lhash2 with INADDR_ANY */ + + hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); + ilb2 = inet_lhash2_bucket(hashinfo, hash2); + if (ilb2->count > ilb->count) + goto port_lookup; + + return inet_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + +port_lookup: sk_for_each_rcu(sk, &ilb->head) { score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif); @@ -476,6 +597,7 @@ int __inet_hash(struct sock *sk, struct sock *osk) hlist_add_tail_rcu(&sk->sk_node, &ilb->head); else hlist_add_head_rcu(&sk->sk_node, &ilb->head); + inet_hash2(hashinfo, sk); ilb->count++; sock_set_flag(sk, SOCK_RCU_FREE); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); @@ -506,7 +628,6 @@ void inet_unhash(struct sock *sk) struct inet_listen_hashbucket *ilb; spinlock_t *lock; bool listener = false; - int done; if (sk_unhashed(sk)) return; @@ -519,17 +640,20 @@ void inet_unhash(struct sock *sk) lock = inet_ehash_lockp(hashinfo, sk->sk_hash); } spin_lock_bh(lock); + if (sk_unhashed(sk)) + goto unlock; + if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); - if (listener) - done = __sk_del_node_init(sk); - else - done = __sk_nulls_del_node_init_rcu(sk); - if (done) { - if (listener) - ilb->count--; - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + if (listener) { + inet_unhash2(hashinfo, sk); + __sk_del_node_init(sk); + ilb->count--; + } else { + __sk_nulls_del_node_init_rcu(sk); } + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); +unlock: spin_unlock_bh(lock); } EXPORT_SYMBOL_GPL(inet_unhash); @@ -666,9 +790,35 @@ void inet_hashinfo_init(struct inet_hashinfo *h) INIT_HLIST_HEAD(&h->listening_hash[i].head); h->listening_hash[i].count = 0; } + + h->lhash2 = NULL; } EXPORT_SYMBOL_GPL(inet_hashinfo_init); +void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, + unsigned long numentries, int scale, + unsigned long low_limit, + unsigned long high_limit) +{ + unsigned int i; + + h->lhash2 = alloc_large_system_hash(name, + sizeof(*h->lhash2), + numentries, + scale, + 0, + NULL, + &h->lhash2_mask, + low_limit, + high_limit); + + for (i = 0; i <= h->lhash2_mask; i++) { + spin_lock_init(&h->lhash2[i].lock); + INIT_HLIST_HEAD(&h->lhash2[i].head); + h->lhash2[i].count = 0; + } +} + int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) { unsigned int locksz = sizeof(spinlock_t); -- cgit v1.2.3-59-g8ed1b