aboutsummaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorJoanne Koong <joannelkoong@gmail.com>2022-05-19 17:18:33 -0700
committerJakub Kicinski <kuba@kernel.org>2022-05-20 18:16:24 -0700
commitd5a42de8bdbe25081f07b801d8b35f4d75a791f4 (patch)
tree7bc53e45ea3628a75c25931f9f1d5c7f64d63cd5 /include
parentwwan: iosm: use a flexible array rather than allocate short objects (diff)
downloadlinux-dev-d5a42de8bdbe25081f07b801d8b35f4d75a791f4.tar.xz
linux-dev-d5a42de8bdbe25081f07b801d8b35f4d75a791f4.zip
net: Add a second bind table hashed by port and address
We currently have one tcp bind table (bhash) which hashes by port number only. In the socket bind path, we check for bind conflicts by traversing the specified port's inet_bind2_bucket while holding the bucket's spinlock (see inet_csk_get_port() and inet_csk_bind_conflict()). In instances where there are tons of sockets hashed to the same port at different addresses, checking for a bind conflict is time-intensive and can cause softirq cpu lockups, as well as stops new tcp connections since __inet_inherit_port() also contests for the spinlock. This patch proposes adding a second bind table, bhash2, that hashes by port and ip address. Searching the bhash2 table leads to significantly faster conflict resolution and less time holding the spinlock. Signed-off-by: Joanne Koong <joannelkoong@gmail.com> Reviewed-by: Eric Dumazet <edumazet@google.com> Acked-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'include')
-rw-r--r--include/net/inet_connection_sock.h3
-rw-r--r--include/net/inet_hashtables.h68
-rw-r--r--include/net/sock.h14
3 files changed, 84 insertions, 1 deletions
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 85cd695e7fd1..077cd730ce2f 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -25,6 +25,7 @@
#undef INET_CSK_CLEAR_TIMERS
struct inet_bind_bucket;
+struct inet_bind2_bucket;
struct tcp_congestion_ops;
/*
@@ -57,6 +58,7 @@ struct inet_connection_sock_af_ops {
*
* @icsk_accept_queue: FIFO of established children
* @icsk_bind_hash: Bind node
+ * @icsk_bind2_hash: Bind node in the bhash2 table
* @icsk_timeout: Timeout
* @icsk_retransmit_timer: Resend (no ack)
* @icsk_rto: Retransmit timeout
@@ -83,6 +85,7 @@ struct inet_connection_sock {
struct inet_sock icsk_inet;
struct request_sock_queue icsk_accept_queue;
struct inet_bind_bucket *icsk_bind_hash;
+ struct inet_bind2_bucket *icsk_bind2_hash;
unsigned long icsk_timeout;
struct timer_list icsk_retransmit_timer;
struct timer_list icsk_delack_timer;
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index ebfa3df6f8dc..a0887b70967b 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -90,11 +90,32 @@ struct inet_bind_bucket {
struct hlist_head owners;
};
+struct inet_bind2_bucket {
+ possible_net_t ib_net;
+ int l3mdev;
+ unsigned short port;
+ union {
+#if IS_ENABLED(CONFIG_IPV6)
+ struct in6_addr v6_rcv_saddr;
+#endif
+ __be32 rcv_saddr;
+ };
+ /* Node in the inet2_bind_hashbucket chain */
+ struct hlist_node node;
+ /* List of sockets hashed to this bucket */
+ struct hlist_head owners;
+};
+
static inline struct net *ib_net(struct inet_bind_bucket *ib)
{
return read_pnet(&ib->ib_net);
}
+static inline struct net *ib2_net(struct inet_bind2_bucket *ib)
+{
+ return read_pnet(&ib->ib_net);
+}
+
#define inet_bind_bucket_for_each(tb, head) \
hlist_for_each_entry(tb, head, node)
@@ -103,6 +124,15 @@ struct inet_bind_hashbucket {
struct hlist_head chain;
};
+/* This is synchronized using the inet_bind_hashbucket's spinlock.
+ * Instead of having separate spinlocks, the inet_bind2_hashbucket can share
+ * the inet_bind_hashbucket's given that in every case where the bhash2 table
+ * is useful, a lookup in the bhash table also occurs.
+ */
+struct inet_bind2_hashbucket {
+ struct hlist_head chain;
+};
+
/* Sockets can be hashed in established or listening table.
* We must use different 'nulls' end-of-chain value for all hash buckets :
* A socket might transition from ESTABLISH to LISTEN state without
@@ -134,6 +164,12 @@ struct inet_hashinfo {
*/
struct kmem_cache *bind_bucket_cachep;
struct inet_bind_hashbucket *bhash;
+ /* The 2nd binding table hashed by port and address.
+ * This is used primarily for expediting the resolution of bind
+ * conflicts.
+ */
+ struct kmem_cache *bind2_bucket_cachep;
+ struct inet_bind2_hashbucket *bhash2;
unsigned int bhash_size;
/* The 2nd listener table hashed by local port and address */
@@ -193,6 +229,36 @@ inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
void inet_bind_bucket_destroy(struct kmem_cache *cachep,
struct inet_bind_bucket *tb);
+static inline bool check_bind_bucket_match(struct inet_bind_bucket *tb,
+ struct net *net,
+ const unsigned short port,
+ int l3mdev)
+{
+ return net_eq(ib_net(tb), net) && tb->port == port &&
+ tb->l3mdev == l3mdev;
+}
+
+struct inet_bind2_bucket *
+inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net,
+ struct inet_bind2_hashbucket *head,
+ const unsigned short port, int l3mdev,
+ const struct sock *sk);
+
+void inet_bind2_bucket_destroy(struct kmem_cache *cachep,
+ struct inet_bind2_bucket *tb);
+
+struct inet_bind2_bucket *
+inet_bind2_bucket_find(struct inet_hashinfo *hinfo, struct net *net,
+ const unsigned short port, int l3mdev,
+ struct sock *sk,
+ struct inet_bind2_hashbucket **head);
+
+bool check_bind2_bucket_match_nulladdr(struct inet_bind2_bucket *tb,
+ struct net *net,
+ const unsigned short port,
+ int l3mdev,
+ const struct sock *sk);
+
static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
const u32 bhash_size)
{
@@ -200,7 +266,7 @@ static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
}
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
- const unsigned short snum);
+ struct inet_bind2_bucket *tb2, const unsigned short snum);
/* Caller must disable local BH processing. */
int __inet_inherit_port(const struct sock *sk, struct sock *child);
diff --git a/include/net/sock.h b/include/net/sock.h
index 72ca97ccb460..c585ef6565d9 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -348,6 +348,7 @@ struct sk_filter;
* @sk_txtime_report_errors: set report errors mode for SO_TXTIME
* @sk_txtime_unused: unused txtime flags
* @ns_tracker: tracker for netns reference
+ * @sk_bind2_node: bind node in the bhash2 table
*/
struct sock {
/*
@@ -537,6 +538,7 @@ struct sock {
#endif
struct rcu_head sk_rcu;
netns_tracker ns_tracker;
+ struct hlist_node sk_bind2_node;
};
enum sk_pacing {
@@ -817,6 +819,16 @@ static inline void sk_add_bind_node(struct sock *sk,
hlist_add_head(&sk->sk_bind_node, list);
}
+static inline void __sk_del_bind2_node(struct sock *sk)
+{
+ __hlist_del(&sk->sk_bind2_node);
+}
+
+static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list)
+{
+ hlist_add_head(&sk->sk_bind2_node, list);
+}
+
#define sk_for_each(__sk, list) \
hlist_for_each_entry(__sk, list, sk_node)
#define sk_for_each_rcu(__sk, list) \
@@ -834,6 +846,8 @@ static inline void sk_add_bind_node(struct sock *sk,
hlist_for_each_entry_safe(__sk, tmp, list, sk_node)
#define sk_for_each_bound(__sk, list) \
hlist_for_each_entry(__sk, list, sk_bind_node)
+#define sk_for_each_bound_bhash2(__sk, list) \
+ hlist_for_each_entry(__sk, list, sk_bind2_node)
/**
* sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset