aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2015-10-05 03:34:59 -0700
committerDavid S. Miller <davem@davemloft.net>2015-10-05 03:35:29 -0700
commit7e2832f17f9bec41643d420d55efd96b09d1fb45 (patch)
treefeccf26a3c9fa86e110b4b7a6b35cd9413c4d5df
parentMerge branch 'xgbe-next' (diff)
parentRDS-TCP: Set up MSG_MORE and MSG_SENDPAGE_NOTLAST as appropriate in rds_tcp_xmit (diff)
downloadlinux-dev-7e2832f17f9bec41643d420d55efd96b09d1fb45.tar.xz
linux-dev-7e2832f17f9bec41643d420d55efd96b09d1fb45.zip
Merge branch 'rds-perf'
Sowmini Varadhan says: ==================== RDS: RDS-TCP perf enhancements A 3-part patchset that (a) improves current RDS-TCP perf by 2X-3X and (b) refactors earlier robustness code for better observability/scaling. Patch 1 is an enhancement of earlier robustness fixes that had used separate sockets for client and server endpoints to resolve race conditions. It is possible to have an equivalent solution that does not use 2 sockets. The benefit of a single socket solution is that it results in more predictable and observable behavior for the underlying TCP pipe of an RDS connection. Patches 2 and 3 are simple, straightforward perf bug fixes that align the RDS TCP socket with other parts of the kernel stack. v2: fix kbuild-test-robot warnings, comments from Sergei Shtylov and Santosh Shilimkar. ==================== Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--net/rds/connection.c22
-rw-r--r--net/rds/rds.h4
-rw-r--r--net/rds/tcp.c16
-rw-r--r--net/rds/tcp_listen.c22
-rw-r--r--net/rds/tcp_send.c8
5 files changed, 29 insertions, 43 deletions
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 49adeef8090c..d4564036a339 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -128,10 +128,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
struct rds_transport *loop_trans;
unsigned long flags;
int ret;
- struct rds_transport *otrans = trans;
- if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
- goto new_conn;
rcu_read_lock();
conn = rds_conn_lookup(net, head, laddr, faddr, trans);
if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
@@ -147,7 +144,6 @@ static struct rds_connection *__rds_conn_create(struct net *net,
if (conn)
goto out;
-new_conn:
conn = kmem_cache_zalloc(rds_conn_slab, gfp);
if (!conn) {
conn = ERR_PTR(-ENOMEM);
@@ -207,6 +203,7 @@ new_conn:
atomic_set(&conn->c_state, RDS_CONN_DOWN);
conn->c_send_gen = 0;
+ conn->c_outgoing = (is_outgoing ? 1 : 0);
conn->c_reconnect_jiffies = 0;
INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker);
INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker);
@@ -243,22 +240,13 @@ new_conn:
/* Creating normal conn */
struct rds_connection *found;
- if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
- found = NULL;
- else
- found = rds_conn_lookup(net, head, laddr, faddr, trans);
+ found = rds_conn_lookup(net, head, laddr, faddr, trans);
if (found) {
trans->conn_free(conn->c_transport_data);
kmem_cache_free(rds_conn_slab, conn);
conn = found;
} else {
- if ((is_outgoing && otrans->t_type == RDS_TRANS_TCP) ||
- (otrans->t_type != RDS_TRANS_TCP)) {
- /* Only the active side should be added to
- * reconnect list for TCP.
- */
- hlist_add_head_rcu(&conn->c_hash_node, head);
- }
+ hlist_add_head_rcu(&conn->c_hash_node, head);
rds_cong_add_conn(conn);
rds_conn_count++;
}
@@ -337,7 +325,9 @@ void rds_conn_shutdown(struct rds_connection *conn)
rcu_read_lock();
if (!hlist_unhashed(&conn->c_hash_node)) {
rcu_read_unlock();
- rds_queue_reconnect(conn);
+ if (conn->c_trans->t_type != RDS_TRANS_TCP ||
+ conn->c_outgoing == 1)
+ rds_queue_reconnect(conn);
} else {
rcu_read_unlock();
}
diff --git a/net/rds/rds.h b/net/rds/rds.h
index afb4048d0cfd..b4c7ac021d5b 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -86,7 +86,9 @@ struct rds_connection {
struct hlist_node c_hash_node;
__be32 c_laddr;
__be32 c_faddr;
- unsigned int c_loopback:1;
+ unsigned int c_loopback:1,
+ c_outgoing:1,
+ c_pad_to_32:30;
struct rds_connection *c_passive;
struct rds_cong_map *c_lcong;
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index c42b60bf4c68..9d6ddbacd875 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -67,21 +67,13 @@ void rds_tcp_nonagle(struct socket *sock)
set_fs(oldfs);
}
+/* All module specific customizations to the RDS-TCP socket should be done in
+ * rds_tcp_tune() and applied after socket creation. In general these
+ * customizations should be tunable via module_param()
+ */
void rds_tcp_tune(struct socket *sock)
{
- struct sock *sk = sock->sk;
-
rds_tcp_nonagle(sock);
-
- /*
- * We're trying to saturate gigabit with the default,
- * see svc_sock_setbufsize().
- */
- lock_sock(sk);
- sk->sk_sndbuf = RDS_TCP_DEFAULT_BUFSIZE;
- sk->sk_rcvbuf = RDS_TCP_DEFAULT_BUFSIZE;
- sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK;
- release_sock(sk);
}
u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc)
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 444d78d0bd77..1d90240e5d82 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -110,28 +110,24 @@ int rds_tcp_accept_one(struct socket *sock)
goto out;
}
/* An incoming SYN request came in, and TCP just accepted it.
- * We always create a new conn for listen side of TCP, and do not
- * add it to the c_hash_list.
*
* If the client reboots, this conn will need to be cleaned up.
* rds_tcp_state_change() will do that cleanup
*/
rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data;
- WARN_ON(!rs_tcp || rs_tcp->t_sock);
-
- /*
- * see the comment above rds_queue_delayed_reconnect()
- */
- if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
- if (rds_conn_state(conn) == RDS_CONN_UP)
- rds_tcp_stats_inc(s_tcp_listen_closed_stale);
- else
- rds_tcp_stats_inc(s_tcp_connect_raced);
- rds_conn_drop(conn);
+ if (rs_tcp->t_sock &&
+ ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr)) {
+ struct sock *nsk = new_sock->sk;
+
+ nsk->sk_user_data = NULL;
+ nsk->sk_prot->disconnect(nsk, 0);
+ tcp_done(nsk);
+ new_sock = NULL;
ret = 0;
goto out;
}
+ rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING);
rds_tcp_set_callbacks(new_sock, conn);
rds_connect_complete(conn);
new_sock = NULL;
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 53b17ca0dff5..2894e6095e3b 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -83,6 +83,7 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
struct rds_tcp_connection *tc = conn->c_transport_data;
int done = 0;
int ret = 0;
+ int more;
if (hdr_off == 0) {
/*
@@ -116,12 +117,15 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
goto out;
}
+ more = rm->data.op_nents > 1 ? (MSG_MORE | MSG_SENDPAGE_NOTLAST) : 0;
while (sg < rm->data.op_nents) {
+ int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more;
+
ret = tc->t_sock->ops->sendpage(tc->t_sock,
sg_page(&rm->data.op_sg[sg]),
rm->data.op_sg[sg].offset + off,
rm->data.op_sg[sg].length - off,
- MSG_DONTWAIT|MSG_NOSIGNAL);
+ flags);
rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]),
rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off,
ret);
@@ -134,6 +138,8 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
off = 0;
sg++;
}
+ if (sg == rm->data.op_nents - 1)
+ more = 0;
}
out: