diff options
Diffstat (limited to 'net/ipv4/inet_hashtables.c')
-rw-r--r-- | net/ipv4/inet_hashtables.c | 673 |
1 files changed, 525 insertions, 148 deletions
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 2bbaaf0c7176..d3dc28156622 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -20,6 +20,9 @@ #include <net/addrconf.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <net/inet6_hashtables.h> +#endif #include <net/secure_seq.h> #include <net/ip.h> #include <net/tcp.h> @@ -89,12 +92,79 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket } } +bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, + unsigned short port, int l3mdev) +{ + return net_eq(ib_net(tb), net) && tb->port == port && + tb->l3mdev == l3mdev; +} + +static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb, + struct net *net, + struct inet_bind_hashbucket *head, + unsigned short port, int l3mdev, + const struct sock *sk) +{ + write_pnet(&tb->ib_net, net); + tb->l3mdev = l3mdev; + tb->port = port; +#if IS_ENABLED(CONFIG_IPV6) + tb->family = sk->sk_family; + if (sk->sk_family == AF_INET6) + tb->v6_rcv_saddr = sk->sk_v6_rcv_saddr; + else +#endif + tb->rcv_saddr = sk->sk_rcv_saddr; + INIT_HLIST_HEAD(&tb->owners); + hlist_add_head(&tb->node, &head->chain); +} + +struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, + struct net *net, + struct inet_bind_hashbucket *head, + unsigned short port, + int l3mdev, + const struct sock *sk) +{ + struct inet_bind2_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); + + if (tb) + inet_bind2_bucket_init(tb, net, head, port, l3mdev, sk); + + return tb; +} + +/* Caller must hold hashbucket lock for this tb with local BH disabled */ +void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) +{ + if (hlist_empty(&tb->owners)) { + __hlist_del(&tb->node); + kmem_cache_free(cachep, tb); + } +} + +static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, + const struct sock *sk) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family != tb2->family) + return false; + + if (sk->sk_family == AF_INET6) + return ipv6_addr_equal(&tb2->v6_rcv_saddr, + &sk->sk_v6_rcv_saddr); +#endif + return tb2->rcv_saddr == sk->sk_rcv_saddr; +} + void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, - const unsigned short snum) + struct inet_bind2_bucket *tb2, unsigned short port) { - inet_sk(sk)->inet_num = snum; + inet_sk(sk)->inet_num = port; sk_add_bind_node(sk, &tb->owners); inet_csk(sk)->icsk_bind_hash = tb; + sk_add_bind2_node(sk, &tb2->owners); + inet_csk(sk)->icsk_bind2_hash = tb2; } /* @@ -102,11 +172,15 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, */ static void __inet_put_port(struct sock *sk) { - struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; - const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num, - hashinfo->bhash_size); - struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; + struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_bind_hashbucket *head, *head2; + struct net *net = sock_net(sk); struct inet_bind_bucket *tb; + int bhash; + + bhash = inet_bhashfn(net, inet_sk(sk)->inet_num, hashinfo->bhash_size); + head = &hashinfo->bhash[bhash]; + head2 = inet_bhashfn_portaddr(hashinfo, sk, net, inet_sk(sk)->inet_num); spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; @@ -114,6 +188,17 @@ static void __inet_put_port(struct sock *sk) inet_csk(sk)->icsk_bind_hash = NULL; inet_sk(sk)->inet_num = 0; inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + + spin_lock(&head2->lock); + if (inet_csk(sk)->icsk_bind2_hash) { + struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash; + + __sk_del_bind2_node(sk); + inet_csk(sk)->icsk_bind2_hash = NULL; + inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); + } + spin_unlock(&head2->lock); + spin_unlock(&head->lock); } @@ -127,17 +212,26 @@ EXPORT_SYMBOL(inet_put_port); int __inet_inherit_port(const struct sock *sk, struct sock *child) { - struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; + struct inet_hashinfo *table = tcp_or_dccp_get_hashinfo(sk); unsigned short port = inet_sk(child)->inet_num; - const int bhash = inet_bhashfn(sock_net(sk), port, - table->bhash_size); - struct inet_bind_hashbucket *head = &table->bhash[bhash]; + struct inet_bind_hashbucket *head, *head2; + bool created_inet_bind_bucket = false; + struct net *net = sock_net(sk); + bool update_fastreuse = false; + struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; - int l3mdev; + int bhash, l3mdev; + + bhash = inet_bhashfn(net, port, table->bhash_size); + head = &table->bhash[bhash]; + head2 = inet_bhashfn_portaddr(table, child, net, port); spin_lock(&head->lock); + spin_lock(&head2->lock); tb = inet_csk(sk)->icsk_bind_hash; - if (unlikely(!tb)) { + tb2 = inet_csk(sk)->icsk_bind2_hash; + if (unlikely(!tb || !tb2)) { + spin_unlock(&head2->lock); spin_unlock(&head->lock); return -ENOENT; } @@ -150,24 +244,49 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) * as that of the child socket. We have to look up or * create a new bind bucket for the child here. */ inet_bind_bucket_for_each(tb, &head->chain) { - if (net_eq(ib_net(tb), sock_net(sk)) && - tb->l3mdev == l3mdev && tb->port == port) + if (inet_bind_bucket_match(tb, net, port, l3mdev)) break; } if (!tb) { tb = inet_bind_bucket_create(table->bind_bucket_cachep, - sock_net(sk), head, port, - l3mdev); + net, head, port, l3mdev); if (!tb) { + spin_unlock(&head2->lock); spin_unlock(&head->lock); return -ENOMEM; } + created_inet_bind_bucket = true; + } + update_fastreuse = true; + + goto bhash2_find; + } else if (!inet_bind2_bucket_addr_match(tb2, child)) { + l3mdev = inet_sk_bound_l3mdev(sk); + +bhash2_find: + tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child); + if (!tb2) { + tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep, + net, head2, port, + l3mdev, child); + if (!tb2) + goto error; } } - inet_bind_hash(child, tb, port); + if (update_fastreuse) + inet_csk_update_fastreuse(tb, child); + inet_bind_hash(child, tb, tb2, port); + spin_unlock(&head2->lock); spin_unlock(&head->lock); return 0; + +error: + if (created_inet_bind_bucket) + inet_bind_bucket_destroy(table->bind_bucket_cachep, tb); + spin_unlock(&head2->lock); + spin_unlock(&head->lock); + return -ENOMEM; } EXPORT_SYMBOL_GPL(__inet_inherit_port); @@ -189,45 +308,9 @@ inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk) return inet_lhash2_bucket(h, hash); } -static void inet_hash2(struct inet_hashinfo *h, struct sock *sk) -{ - struct inet_listen_hashbucket *ilb2; - - if (!h->lhash2) - return; - - ilb2 = inet_lhash2_bucket_sk(h, sk); - - spin_lock(&ilb2->lock); - if (sk->sk_reuseport && sk->sk_family == AF_INET6) - hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, - &ilb2->head); - else - hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, - &ilb2->head); - ilb2->count++; - spin_unlock(&ilb2->lock); -} - -static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk) -{ - struct inet_listen_hashbucket *ilb2; - - if (!h->lhash2 || - WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node))) - return; - - ilb2 = inet_lhash2_bucket_sk(h, sk); - - spin_lock(&ilb2->lock); - hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node); - ilb2->count--; - spin_unlock(&ilb2->lock); -} - static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const __be32 daddr, - const int dif, const int sdif, bool exact_dif) + const int dif, const int sdif) { int score = -1; @@ -238,14 +321,31 @@ static inline int compute_score(struct sock *sk, struct net *net, if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return -1; + score = sk->sk_bound_dev_if ? 2 : 1; - score = sk->sk_family == PF_INET ? 2 : 1; + if (sk->sk_family == PF_INET) + score++; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; } return score; } +static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk, + struct sk_buff *skb, int doff, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned short hnum) +{ + struct sock *reuse_sk = NULL; + u32 phash; + + if (sk->sk_reuseport) { + phash = inet_ehashfn(net, daddr, hnum, saddr, sport); + reuse_sk = reuseport_select_sock(sk, phash, skb, doff); + } + return reuse_sk; +} + /* * Here are some nice properties to exploit here. The BSD API * does not allow a listening sock to specify the remote port nor the @@ -261,25 +361,18 @@ static struct sock *inet_lhash2_lookup(struct net *net, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif) { - bool exact_dif = inet_exact_dif_match(net, skb); - struct inet_connection_sock *icsk; struct sock *sk, *result = NULL; + struct hlist_nulls_node *node; int score, hiscore = 0; - u32 phash = 0; - inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { - sk = (struct sock *)icsk; - score = compute_score(sk, net, hnum, daddr, - dif, sdif, exact_dif); + sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { + score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { - if (sk->sk_reuseport) { - phash = inet_ehashfn(net, daddr, hnum, - saddr, sport); - result = reuseport_select_sock(sk, phash, - skb, doff); - if (result) - return result; - } + result = lookup_reuseport(net, sk, skb, doff, + saddr, sport, daddr, hnum); + if (result) + return result; + result = sk; hiscore = score; } @@ -288,6 +381,29 @@ static struct sock *inet_lhash2_lookup(struct net *net, return result; } +static inline struct sock *inet_lookup_run_bpf(struct net *net, + struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, + __be32 saddr, __be16 sport, + __be32 daddr, u16 hnum, const int dif) +{ + struct sock *sk, *reuse_sk; + bool no_reuseport; + + if (hashinfo != net->ipv4.tcp_death_row.hashinfo) + return NULL; /* only TCP is supported */ + + no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP, saddr, sport, + daddr, hnum, dif, &sk); + if (no_reuseport || IS_ERR_OR_NULL(sk)) + return sk; + + reuse_sk = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); + if (reuse_sk) + sk = reuse_sk; + return sk; +} + struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, @@ -299,6 +415,14 @@ struct sock *__inet_lookup_listener(struct net *net, struct sock *result = NULL; unsigned int hash2; + /* Lookup redirect from BPF */ + if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { + result = inet_lookup_run_bpf(net, hashinfo, skb, doff, + saddr, sport, daddr, hnum, dif); + if (result) + goto done; + } + hash2 = ipv4_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); @@ -364,13 +488,11 @@ begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) continue; - if (likely(INET_MATCH(sk, net, acookie, - saddr, daddr, ports, dif, sdif))) { + if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) { if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) goto out; - if (unlikely(!INET_MATCH(sk, net, acookie, - saddr, daddr, ports, - dif, sdif))) { + if (unlikely(!inet_match(net, sk, acookie, + ports, dif, sdif))) { sock_gen_put(sk); goto begin; } @@ -419,8 +541,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, if (sk2->sk_hash != hash) continue; - if (likely(INET_MATCH(sk2, net, acookie, - saddr, daddr, ports, dif, sdif))) { + if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); if (twsk_unique(sk, sk2, twp)) @@ -458,7 +579,7 @@ not_unique: return -EADDRNOTAVAIL; } -static u32 inet_sk_port_offset(const struct sock *sk) +static u64 inet_sk_port_offset(const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); @@ -467,14 +588,54 @@ static u32 inet_sk_port_offset(const struct sock *sk) inet->inet_dport); } -/* insert a socket into ehash, and eventually remove another one - * (The another one can be a SYN_RECV or TIMEWAIT +/* Searches for an exsiting socket in the ehash bucket list. + * Returns true if found, false otherwise. */ -bool inet_ehash_insert(struct sock *sk, struct sock *osk) +static bool inet_ehash_lookup_by_sk(struct sock *sk, + struct hlist_nulls_head *list) { - struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; - struct hlist_nulls_head *list; + const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num); + const int sdif = sk->sk_bound_dev_if; + const int dif = sk->sk_bound_dev_if; + const struct hlist_nulls_node *node; + struct net *net = sock_net(sk); + struct sock *esk; + + INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr); + + sk_nulls_for_each_rcu(esk, node, list) { + if (esk->sk_hash != sk->sk_hash) + continue; + if (sk->sk_family == AF_INET) { + if (unlikely(inet_match(net, esk, acookie, + ports, dif, sdif))) { + return true; + } + } +#if IS_ENABLED(CONFIG_IPV6) + else if (sk->sk_family == AF_INET6) { + if (unlikely(inet6_match(net, esk, + &sk->sk_v6_daddr, + &sk->sk_v6_rcv_saddr, + ports, dif, sdif))) { + return true; + } + } +#endif + } + return false; +} + +/* Insert a socket into ehash, and eventually remove another one + * (The another one can be a SYN_RECV or TIMEWAIT) + * If an existing socket already exists, socket sk is not inserted, + * and sets found_dup_sk parameter to true. + */ +bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) +{ + struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); struct inet_ehash_bucket *head; + struct hlist_nulls_head *list; spinlock_t *lock; bool ret = true; @@ -489,21 +650,28 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk) if (osk) { WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); ret = sk_nulls_del_node_init_rcu(osk); + } else if (found_dup_sk) { + *found_dup_sk = inet_ehash_lookup_by_sk(sk, list); + if (*found_dup_sk) + ret = false; } + if (ret) __sk_nulls_add_node_rcu(sk, list); + spin_unlock(lock); + return ret; } -bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) +bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) { - bool ok = inet_ehash_insert(sk, osk); + bool ok = inet_ehash_insert(sk, osk, found_dup_sk); if (ok) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } else { - percpu_counter_inc(sk->sk_prot->orphan_count); + this_cpu_inc(*sk->sk_prot->orphan_count); inet_sk_set_state(sk, TCP_CLOSE); sock_set_flag(sk, SOCK_DEAD); inet_csk_destroy_sock(sk); @@ -537,34 +705,34 @@ static int inet_reuseport_add_sock(struct sock *sk, int __inet_hash(struct sock *sk, struct sock *osk) { - struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; - struct inet_listen_hashbucket *ilb; + struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_listen_hashbucket *ilb2; int err = 0; if (sk->sk_state != TCP_LISTEN) { - inet_ehash_nolisten(sk, osk); + local_bh_disable(); + inet_ehash_nolisten(sk, osk, NULL); + local_bh_enable(); return 0; } WARN_ON(!sk_unhashed(sk)); - ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; + ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); - spin_lock(&ilb->lock); + spin_lock(&ilb2->lock); if (sk->sk_reuseport) { - err = inet_reuseport_add_sock(sk, ilb); + err = inet_reuseport_add_sock(sk, ilb2); if (err) goto unlock; } if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && sk->sk_family == AF_INET6) - __sk_nulls_add_node_tail_rcu(sk, &ilb->nulls_head); + __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head); else - __sk_nulls_add_node_rcu(sk, &ilb->nulls_head); - inet_hash2(hashinfo, sk); - ilb->count++; + __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head); sock_set_flag(sk, SOCK_RCU_FREE); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); unlock: - spin_unlock(&ilb->lock); + spin_unlock(&ilb2->lock); return err; } @@ -574,11 +742,8 @@ int inet_hash(struct sock *sk) { int err = 0; - if (sk->sk_state != TCP_CLOSE) { - local_bh_disable(); + if (sk->sk_state != TCP_CLOSE) err = __inet_hash(sk, NULL); - local_bh_enable(); - } return err; } @@ -586,51 +751,187 @@ EXPORT_SYMBOL_GPL(inet_hash); void inet_unhash(struct sock *sk) { - struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; - struct inet_listen_hashbucket *ilb = NULL; - spinlock_t *lock; + struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); if (sk_unhashed(sk)) return; if (sk->sk_state == TCP_LISTEN) { - ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; - lock = &ilb->lock; + struct inet_listen_hashbucket *ilb2; + + ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); + /* Don't disable bottom halves while acquiring the lock to + * avoid circular locking dependency on PREEMPT_RT. + */ + spin_lock(&ilb2->lock); + if (sk_unhashed(sk)) { + spin_unlock(&ilb2->lock); + return; + } + + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_stop_listen_sock(sk); + + __sk_nulls_del_node_init_rcu(sk); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + spin_unlock(&ilb2->lock); } else { - lock = inet_ehash_lockp(hashinfo, sk->sk_hash); - } - spin_lock_bh(lock); - if (sk_unhashed(sk)) - goto unlock; + spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); - if (rcu_access_pointer(sk->sk_reuseport_cb)) - reuseport_detach_sock(sk); - if (ilb) { - inet_unhash2(hashinfo, sk); - ilb->count--; + spin_lock_bh(lock); + if (sk_unhashed(sk)) { + spin_unlock_bh(lock); + return; + } + __sk_nulls_del_node_init_rcu(sk); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + spin_unlock_bh(lock); } - __sk_nulls_del_node_init_rcu(sk); - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); -unlock: - spin_unlock_bh(lock); } EXPORT_SYMBOL_GPL(inet_unhash); +static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb, + const struct net *net, unsigned short port, + int l3mdev, const struct sock *sk) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family != tb->family) + return false; + + if (sk->sk_family == AF_INET6) + return net_eq(ib2_net(tb), net) && tb->port == port && + tb->l3mdev == l3mdev && + ipv6_addr_equal(&tb->v6_rcv_saddr, &sk->sk_v6_rcv_saddr); + else +#endif + return net_eq(ib2_net(tb), net) && tb->port == port && + tb->l3mdev == l3mdev && tb->rcv_saddr == sk->sk_rcv_saddr; +} + +bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net, + unsigned short port, int l3mdev, const struct sock *sk) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct in6_addr addr_any = {}; + + if (sk->sk_family != tb->family) + return false; + + if (sk->sk_family == AF_INET6) + return net_eq(ib2_net(tb), net) && tb->port == port && + tb->l3mdev == l3mdev && + ipv6_addr_equal(&tb->v6_rcv_saddr, &addr_any); + else +#endif + return net_eq(ib2_net(tb), net) && tb->port == port && + tb->l3mdev == l3mdev && tb->rcv_saddr == 0; +} + +/* The socket's bhash2 hashbucket spinlock must be held when this is called */ +struct inet_bind2_bucket * +inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net, + unsigned short port, int l3mdev, const struct sock *sk) +{ + struct inet_bind2_bucket *bhash2 = NULL; + + inet_bind_bucket_for_each(bhash2, &head->chain) + if (inet_bind2_bucket_match(bhash2, net, port, l3mdev, sk)) + break; + + return bhash2; +} + +struct inet_bind_hashbucket * +inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port) +{ + struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); + u32 hash; +#if IS_ENABLED(CONFIG_IPV6) + struct in6_addr addr_any = {}; + + if (sk->sk_family == AF_INET6) + hash = ipv6_portaddr_hash(net, &addr_any, port); + else +#endif + hash = ipv4_portaddr_hash(net, 0, port); + + return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; +} + +int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct sock *sk) +{ + struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_bind2_bucket *tb2, *new_tb2; + int l3mdev = inet_sk_bound_l3mdev(sk); + struct inet_bind_hashbucket *head2; + int port = inet_sk(sk)->inet_num; + struct net *net = sock_net(sk); + + /* Allocate a bind2 bucket ahead of time to avoid permanently putting + * the bhash2 table in an inconsistent state if a new tb2 bucket + * allocation fails. + */ + new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC); + if (!new_tb2) + return -ENOMEM; + + head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); + + if (prev_saddr) { + spin_lock_bh(&prev_saddr->lock); + __sk_del_bind2_node(sk); + inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, + inet_csk(sk)->icsk_bind2_hash); + spin_unlock_bh(&prev_saddr->lock); + } + + spin_lock_bh(&head2->lock); + tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); + if (!tb2) { + tb2 = new_tb2; + inet_bind2_bucket_init(tb2, net, head2, port, l3mdev, sk); + } + sk_add_bind2_node(sk, &tb2->owners); + inet_csk(sk)->icsk_bind2_hash = tb2; + spin_unlock_bh(&head2->lock); + + if (tb2 != new_tb2) + kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2); + + return 0; +} +EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr); + +/* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm + * Note that we use 32bit integers (vs RFC 'short integers') + * because 2^16 is not a multiple of num_ephemeral and this + * property might be used by clever attacker. + * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though + * attacks were since demonstrated, thus we use 65536 instead to really + * give more isolation and privacy, at the expense of 256kB of kernel + * memory. + */ +#define INET_TABLE_PERTURB_SHIFT 16 +#define INET_TABLE_PERTURB_SIZE (1 << INET_TABLE_PERTURB_SHIFT) +static u32 *table_perturb; + int __inet_hash_connect(struct inet_timewait_death_row *death_row, - struct sock *sk, u32 port_offset, + struct sock *sk, u64 port_offset, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, struct inet_timewait_sock **)) { struct inet_hashinfo *hinfo = death_row->hashinfo; + struct inet_bind_hashbucket *head, *head2; struct inet_timewait_sock *tw = NULL; - struct inet_bind_hashbucket *head; int port = inet_sk(sk)->inet_num; struct net *net = sock_net(sk); + struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; + bool tb_created = false; u32 remaining, offset; int ret, i, low, high; - static u32 hint; int l3mdev; + u32 index; if (port) { head = &hinfo->bhash[inet_bhashfn(net, port, @@ -638,7 +939,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - inet_ehash_nolisten(sk, NULL); + inet_ehash_nolisten(sk, NULL, NULL); spin_unlock_bh(&head->lock); return 0; } @@ -657,7 +958,13 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, if (likely(remaining > 1)) remaining &= ~1U; - offset = (hint + port_offset) % remaining; + get_random_sleepable_once(table_perturb, + INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb)); + index = port_offset & (INET_TABLE_PERTURB_SIZE - 1); + + offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32); + offset %= remaining; + /* In first pass we try ports of @low parity. * inet_csk_get_port() does the opposite choice. */ @@ -677,8 +984,7 @@ other_parity_scan: * the established check is already unique enough. */ inet_bind_bucket_for_each(tb, &head->chain) { - if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && - tb->port == port) { + if (inet_bind_bucket_match(tb, net, port, l3mdev)) { if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) goto next_port; @@ -696,6 +1002,7 @@ other_parity_scan: spin_unlock_bh(&head->lock); return -ENOMEM; } + tb_created = true; tb->fastreuse = -1; tb->fastreuseport = -1; goto ok; @@ -711,13 +1018,36 @@ next_port: return -EADDRNOTAVAIL; ok: - hint += i + 2; + /* Find the corresponding tb2 bucket since we need to + * add the socket to the bhash2 table as well + */ + head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); + spin_lock(&head2->lock); + + tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); + if (!tb2) { + tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net, + head2, port, l3mdev, sk); + if (!tb2) + goto error; + } + + /* Here we want to add a little bit of randomness to the next source + * port that will be chosen. We use a max() with a random here so that + * on low contention the randomness is maximal and on high contention + * it may be inexistent. + */ + i = max_t(int, i, prandom_u32_max(8) * 2); + WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2); /* Head lock still held and bh's disabled */ - inet_bind_hash(sk, tb, port); + inet_bind_hash(sk, tb, tb2, port); + + spin_unlock(&head2->lock); + if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); - inet_ehash_nolisten(sk, (struct sock *)tw); + inet_ehash_nolisten(sk, (struct sock *)tw, NULL); } if (tw) inet_twsk_bind_unhash(tw, hinfo); @@ -726,6 +1056,13 @@ ok: inet_twsk_deschedule_put(tw); local_bh_enable(); return 0; + +error: + spin_unlock(&head2->lock); + if (tb_created) + inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); + spin_unlock_bh(&head->lock); + return -ENOMEM; } /* @@ -734,7 +1071,7 @@ ok: int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { - u32 port_offset = 0; + u64 port_offset = 0; if (!inet_sk(sk)->inet_num) port_offset = inet_sk_port_offset(sk); @@ -743,29 +1080,14 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row, } EXPORT_SYMBOL_GPL(inet_hash_connect); -void inet_hashinfo_init(struct inet_hashinfo *h) -{ - int i; - - for (i = 0; i < INET_LHTABLE_SIZE; i++) { - spin_lock_init(&h->listening_hash[i].lock); - INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].nulls_head, - i + LISTENING_NULLS_BASE); - h->listening_hash[i].count = 0; - } - - h->lhash2 = NULL; -} -EXPORT_SYMBOL_GPL(inet_hashinfo_init); - static void init_hashinfo_lhash2(struct inet_hashinfo *h) { int i; for (i = 0; i <= h->lhash2_mask; i++) { spin_lock_init(&h->lhash2[i].lock); - INIT_HLIST_HEAD(&h->lhash2[i].head); - h->lhash2[i].count = 0; + INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head, + i + LISTENING_NULLS_BASE); } } @@ -784,6 +1106,14 @@ void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, low_limit, high_limit); init_hashinfo_lhash2(h); + + /* this one is used for source ports of outgoing connections */ + table_perturb = alloc_large_system_hash("Table-perturb", + sizeof(*table_perturb), + INET_TABLE_PERTURB_SIZE, + 0, 0, NULL, NULL, + INET_TABLE_PERTURB_SIZE, + INET_TABLE_PERTURB_SIZE); } int inet_hashinfo2_init_mod(struct inet_hashinfo *h) @@ -825,3 +1155,50 @@ int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) return 0; } EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc); + +struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, + unsigned int ehash_entries) +{ + struct inet_hashinfo *new_hashinfo; + int i; + + new_hashinfo = kmemdup(hashinfo, sizeof(*hashinfo), GFP_KERNEL); + if (!new_hashinfo) + goto err; + + new_hashinfo->ehash = vmalloc_huge(ehash_entries * sizeof(struct inet_ehash_bucket), + GFP_KERNEL_ACCOUNT); + if (!new_hashinfo->ehash) + goto free_hashinfo; + + new_hashinfo->ehash_mask = ehash_entries - 1; + + if (inet_ehash_locks_alloc(new_hashinfo)) + goto free_ehash; + + for (i = 0; i < ehash_entries; i++) + INIT_HLIST_NULLS_HEAD(&new_hashinfo->ehash[i].chain, i); + + new_hashinfo->pernet = true; + + return new_hashinfo; + +free_ehash: + vfree(new_hashinfo->ehash); +free_hashinfo: + kfree(new_hashinfo); +err: + return NULL; +} +EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_alloc); + +void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo) +{ + if (!hashinfo->pernet) + return; + + inet_ehash_locks_free(hashinfo); + vfree(hashinfo->ehash); + kfree(hashinfo); +} +EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_free); |