From 9e2effba2c16fc3bd47da605116485afe01e0be0 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Fri, 12 Mar 2010 16:22:32 -0800
Subject: RDS: Fix BUG_ONs to not fire when in a tasklet

in_interrupt() is true in softirqs. The BUG_ONs are supposed
to check for if irqs are disabled, so we should use
BUG_ON(irqs_disabled()) instead, duh.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/ib_rdma.c | 2 +-
 net/rds/rdma.c    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index a54cd63f9e35..a92aebcb7fcf 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -441,7 +441,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
 
 			/* FIXME we need a way to tell a r/w MR
 			 * from a r/o MR */
-			BUG_ON(in_interrupt());
+			BUG_ON(irqs_disabled());
 			set_page_dirty(page);
 			put_page(page);
 		}
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 75fd13bb631b..3b442d4d64cf 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -440,7 +440,7 @@ void rds_rdma_free_op(struct rds_rdma_op *ro)
 		 * is the case for a RDMA_READ which copies from remote
 		 * to local memory */
 		if (!ro->r_write) {
-			BUG_ON(in_interrupt());
+			BUG_ON(irqs_disabled());
 			set_page_dirty(page);
 		}
 		put_page(page);
-- 
cgit v1.2.3-59-g8ed1b


From 35b52c70534cb7193b218ec12efe6bc595312097 Mon Sep 17 00:00:00 2001
From: Tina Yang <tina.yang@oracle.com>
Date: Thu, 1 Apr 2010 14:09:00 -0700
Subject: RDS: Fix corrupted rds_mrs

On second look at this bug (OFED #2002), it seems that the
collision is not with the retransmission queue (packet acked
by the peer), but with the local send completion.  A theoretical
sequence of events (from time t0 to t3) is thought to be as
follows,

Thread #1
t0:
    sock_release
    rds_release
    rds_send_drop_to /* wait on send completion */
t2:
    rds_rdma_drop_keys()   /* destroy & free all mrs */

Thread #2
t1:
    rds_ib_send_cq_comp_handler
    rds_ib_send_unmap_rm
    rds_message_unmapped   /* wake up #1 @ t0 */
t3:
    rds_message_put
    rds_message_purge
    rds_mr_put   /* memory corruption detected */

The problem with the rds_rdma_drop_keys() is it could
remove a mr's refcount more than its due (i.e. repeatedly
as long as it still remains in the tree (mr->r_refcount > 0)).
Theoretically it should remove only one reference - reference
by the tree.

        /* Release any MRs associated with this socket */
        while ((node = rb_first(&rs->rs_rdma_keys))) {
                mr = container_of(node, struct rds_mr, r_rb_node);
                if (mr->r_trans == rs->rs_transport)
                        mr->r_invalidate = 0;
                rds_mr_put(mr);
        }

I think the correct way of doing it is to remove the mr from
the tree and rds_destroy_mr it first, then a rds_mr_put()
to decrement its reference count by one.  Whichever thread
holds the last reference will free the mr via rds_mr_put().

Signed-off-by: Tina Yang <tina.yang@oracle.com>
Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/rdma.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'net')

diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 3b442d4d64cf..463b458ff27e 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -130,14 +130,22 @@ void rds_rdma_drop_keys(struct rds_sock *rs)
 {
 	struct rds_mr *mr;
 	struct rb_node *node;
+	unsigned long flags;
 
 	/* Release any MRs associated with this socket */
+	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 	while ((node = rb_first(&rs->rs_rdma_keys))) {
 		mr = container_of(node, struct rds_mr, r_rb_node);
 		if (mr->r_trans == rs->rs_transport)
 			mr->r_invalidate = 0;
+		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+		RB_CLEAR_NODE(&mr->r_rb_node);
+		spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+		rds_destroy_mr(mr);
 		rds_mr_put(mr);
+		spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 	}
+	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
 
 	if (rs->rs_transport && rs->rs_transport->flush_mrs)
 		rs->rs_transport->flush_mrs();
-- 
cgit v1.2.3-59-g8ed1b


From 7c82eaf00ec7d460932be9314b29997006b799b6 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Fri, 19 Feb 2010 18:01:41 -0800
Subject: RDS: Rewrite rds_send_drop_to() for clarity

This function has been the source of numerous bugs; it's just
too complicated. Simplified to nest spinlocks cleanly within
the second loop body, and kick out early if there are no
rms to drop.

This will be a little slower because conn lock is grabbed for
each entry instead of "caching" the lock across rms, but this
should be entirely irrelevant to fastpath performance.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/send.c | 64 ++++++++++++++++++++++++++--------------------------------
 1 file changed, 29 insertions(+), 35 deletions(-)

(limited to 'net')

diff --git a/net/rds/send.c b/net/rds/send.c
index 9c1c6bcaa6c9..aee58f931b69 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -619,9 +619,8 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
 {
 	struct rds_message *rm, *tmp;
 	struct rds_connection *conn;
-	unsigned long flags, flags2;
+	unsigned long flags;
 	LIST_HEAD(list);
-	int wake = 0;
 
 	/* get all the messages we're dropping under the rs lock */
 	spin_lock_irqsave(&rs->rs_lock, flags);
@@ -631,59 +630,54 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
 			     dest->sin_port != rm->m_inc.i_hdr.h_dport))
 			continue;
 
-		wake = 1;
 		list_move(&rm->m_sock_item, &list);
 		rds_send_sndbuf_remove(rs, rm);
 		clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
 	}
 
 	/* order flag updates with the rs lock */
-	if (wake)
-		smp_mb__after_clear_bit();
+	smp_mb__after_clear_bit();
 
 	spin_unlock_irqrestore(&rs->rs_lock, flags);
 
-	conn = NULL;
+	if (list_empty(&list))
+		return;
 
-	/* now remove the messages from the conn list as needed */
+	/* Remove the messages from the conn */
 	list_for_each_entry(rm, &list, m_sock_item) {
-		/* We do this here rather than in the loop above, so that
-		 * we don't have to nest m_rs_lock under rs->rs_lock */
-		spin_lock_irqsave(&rm->m_rs_lock, flags2);
-		/* If this is a RDMA operation, notify the app. */
-		spin_lock(&rs->rs_lock);
-		__rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED);
-		spin_unlock(&rs->rs_lock);
-		rm->m_rs = NULL;
-		spin_unlock_irqrestore(&rm->m_rs_lock, flags2);
+
+		conn = rm->m_inc.i_conn;
+		spin_lock_irqsave(&conn->c_lock, flags);
 
 		/*
-		 * If we see this flag cleared then we're *sure* that someone
-		 * else beat us to removing it from the conn.  If we race
-		 * with their flag update we'll get the lock and then really
-		 * see that the flag has been cleared.
+		 * Maybe someone else beat us to removing rm from the conn.
+		 * If we race with their flag update we'll get the lock and
+		 * then really see that the flag has been cleared.
 		 */
-		if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags))
+		if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
+			spin_unlock_irqrestore(&conn->c_lock, flags);
 			continue;
-
-		if (conn != rm->m_inc.i_conn) {
-			if (conn)
-				spin_unlock_irqrestore(&conn->c_lock, flags);
-			conn = rm->m_inc.i_conn;
-			spin_lock_irqsave(&conn->c_lock, flags);
 		}
 
-		if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
-			list_del_init(&rm->m_conn_item);
-			rds_message_put(rm);
-		}
-	}
+		/*
+		 * Couldn't grab m_rs_lock in top loop (lock ordering),
+		 * but we can now.
+		 */
+		spin_lock(&rm->m_rs_lock);
 
-	if (conn)
+		spin_lock(&rs->rs_lock);
+		__rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED);
+		spin_unlock(&rs->rs_lock);
+
+		rm->m_rs = NULL;
+		spin_unlock(&rm->m_rs_lock);
+
+		list_del_init(&rm->m_conn_item);
+		rds_message_put(rm);
 		spin_unlock_irqrestore(&conn->c_lock, flags);
+	}
 
-	if (wake)
-		rds_wake_sk_sleep(rs);
+	rds_wake_sk_sleep(rs);
 
 	while (!list_empty(&list)) {
 		rm = list_entry(list.next, struct rds_message, m_sock_item);
-- 
cgit v1.2.3-59-g8ed1b


From 9de0864cf55927a7383b5ba6e48834ff3ef053de Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Mon, 29 Mar 2010 16:50:54 -0700
Subject: RDS: Fix locking in send on m_rs_lock

Do not nest m_rs_lock under c_lock

Disable interrupts in {rdma,atomic}_send_complete

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/send.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/rds/send.c b/net/rds/send.c
index aee58f931b69..725fb0419797 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -415,8 +415,9 @@ void rds_rdma_send_complete(struct rds_message *rm, int status)
 	struct rds_sock *rs = NULL;
 	struct rds_rdma_op *ro;
 	struct rds_notifier *notifier;
+	unsigned long flags;
 
-	spin_lock(&rm->m_rs_lock);
+	spin_lock_irqsave(&rm->m_rs_lock, flags);
 
 	ro = rm->m_rdma_op;
 	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
@@ -433,7 +434,7 @@ void rds_rdma_send_complete(struct rds_message *rm, int status)
 		ro->r_notifier = NULL;
 	}
 
-	spin_unlock(&rm->m_rs_lock);
+	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
 
 	if (rs) {
 		rds_wake_sk_sleep(rs);
@@ -647,8 +648,8 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
 	list_for_each_entry(rm, &list, m_sock_item) {
 
 		conn = rm->m_inc.i_conn;
-		spin_lock_irqsave(&conn->c_lock, flags);
 
+		spin_lock_irqsave(&conn->c_lock, flags);
 		/*
 		 * Maybe someone else beat us to removing rm from the conn.
 		 * If we race with their flag update we'll get the lock and
@@ -658,23 +659,23 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
 			spin_unlock_irqrestore(&conn->c_lock, flags);
 			continue;
 		}
+		list_del_init(&rm->m_conn_item);
+		spin_unlock_irqrestore(&conn->c_lock, flags);
 
 		/*
 		 * Couldn't grab m_rs_lock in top loop (lock ordering),
 		 * but we can now.
 		 */
-		spin_lock(&rm->m_rs_lock);
+		spin_lock_irqsave(&rm->m_rs_lock, flags);
 
 		spin_lock(&rs->rs_lock);
 		__rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED);
 		spin_unlock(&rs->rs_lock);
 
 		rm->m_rs = NULL;
-		spin_unlock(&rm->m_rs_lock);
+		spin_unlock_irqrestore(&rm->m_rs_lock, flags);
 
-		list_del_init(&rm->m_conn_item);
 		rds_message_put(rm);
-		spin_unlock_irqrestore(&conn->c_lock, flags);
 	}
 
 	rds_wake_sk_sleep(rs);
-- 
cgit v1.2.3-59-g8ed1b


From 2dc393573430f853e56e25bf4b41c34ba2aa8fd6 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Fri, 11 Jun 2010 13:49:13 -0700
Subject: RDS: move rds_shutdown_worker impl. to rds_conn_shutdown

This fits better in connection.c, rather than threads.c.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/connection.c | 53 +++++++++++++++++++++++++++++++++++++++++++++
 net/rds/rds.h        |  2 ++
 net/rds/threads.c    | 61 +++++++---------------------------------------------
 3 files changed, 63 insertions(+), 53 deletions(-)

(limited to 'net')

diff --git a/net/rds/connection.c b/net/rds/connection.c
index 7619b671ca28..895e39cdc6a6 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -263,6 +263,59 @@ struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
 }
 EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
 
+void rds_conn_shutdown(struct rds_connection *conn)
+{
+	/* shut it down unless it's down already */
+	if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
+		/*
+		 * Quiesce the connection mgmt handlers before we start tearing
+		 * things down. We don't hold the mutex for the entire
+		 * duration of the shutdown operation, else we may be
+		 * deadlocking with the CM handler. Instead, the CM event
+		 * handler is supposed to check for state DISCONNECTING
+		 */
+		mutex_lock(&conn->c_cm_lock);
+		if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
+		 && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
+			rds_conn_error(conn, "shutdown called in state %d\n",
+					atomic_read(&conn->c_state));
+			mutex_unlock(&conn->c_cm_lock);
+			return;
+		}
+		mutex_unlock(&conn->c_cm_lock);
+
+		mutex_lock(&conn->c_send_lock);
+		conn->c_trans->conn_shutdown(conn);
+		rds_conn_reset(conn);
+		mutex_unlock(&conn->c_send_lock);
+
+		if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
+			/* This can happen - eg when we're in the middle of tearing
+			 * down the connection, and someone unloads the rds module.
+			 * Quite reproduceable with loopback connections.
+			 * Mostly harmless.
+			 */
+			rds_conn_error(conn,
+				"%s: failed to transition to state DOWN, "
+				"current state is %d\n",
+				__func__,
+				atomic_read(&conn->c_state));
+			return;
+		}
+	}
+
+	/* Then reconnect if it's still live.
+	 * The passive side of an IB loopback connection is never added
+	 * to the conn hash, so we never trigger a reconnect on this
+	 * conn - the reconnect is always triggered by the active peer. */
+	cancel_delayed_work_sync(&conn->c_conn_w);
+	if (!hlist_unhashed(&conn->c_hash_node))
+		rds_queue_reconnect(conn);
+}
+
+/*
+ * Stop and free a connection.
+ */
 void rds_conn_destroy(struct rds_connection *conn)
 {
 	struct rds_message *rm, *rtmp;
diff --git a/net/rds/rds.h b/net/rds/rds.h
index c224b5bb3ba9..1d3eef67137f 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -527,6 +527,7 @@ struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
 				       struct rds_transport *trans, gfp_t gfp);
 struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
 			       struct rds_transport *trans, gfp_t gfp);
+void rds_conn_shutdown(struct rds_connection *conn);
 void rds_conn_destroy(struct rds_connection *conn);
 void rds_conn_reset(struct rds_connection *conn);
 void rds_conn_drop(struct rds_connection *conn);
@@ -681,6 +682,7 @@ extern unsigned int  rds_sysctl_trace_level;
 int __init rds_threads_init(void);
 void rds_threads_exit(void);
 extern struct workqueue_struct *rds_wq;
+void rds_queue_reconnect(struct rds_connection *conn);
 void rds_connect_worker(struct work_struct *);
 void rds_shutdown_worker(struct work_struct *);
 void rds_send_worker(struct work_struct *);
diff --git a/net/rds/threads.c b/net/rds/threads.c
index 786c20eaaf5e..6e2e43d5f576 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -110,7 +110,7 @@ EXPORT_SYMBOL_GPL(rds_connect_complete);
  * We should *always* start with a random backoff; otherwise a broken connection
  * will always take several iterations to be re-established.
  */
-static void rds_queue_reconnect(struct rds_connection *conn)
+void rds_queue_reconnect(struct rds_connection *conn)
 {
 	unsigned long rand;
 
@@ -156,58 +156,6 @@ void rds_connect_worker(struct work_struct *work)
 	}
 }
 
-void rds_shutdown_worker(struct work_struct *work)
-{
-	struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
-
-	/* shut it down unless it's down already */
-	if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
-		/*
-		 * Quiesce the connection mgmt handlers before we start tearing
-		 * things down. We don't hold the mutex for the entire
-		 * duration of the shutdown operation, else we may be
-		 * deadlocking with the CM handler. Instead, the CM event
-		 * handler is supposed to check for state DISCONNECTING
-		 */
-		mutex_lock(&conn->c_cm_lock);
-		if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING) &&
-		    !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
-			rds_conn_error(conn, "shutdown called in state %d\n",
-					atomic_read(&conn->c_state));
-			mutex_unlock(&conn->c_cm_lock);
-			return;
-		}
-		mutex_unlock(&conn->c_cm_lock);
-
-		mutex_lock(&conn->c_send_lock);
-		conn->c_trans->conn_shutdown(conn);
-		rds_conn_reset(conn);
-		mutex_unlock(&conn->c_send_lock);
-
-		if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
-			/* This can happen - eg when we're in the middle of tearing
-			 * down the connection, and someone unloads the rds module.
-			 * Quite reproduceable with loopback connections.
-			 * Mostly harmless.
-			 */
-			rds_conn_error(conn,
-				"%s: failed to transition to state DOWN, "
-				"current state is %d\n",
-				__func__,
-				atomic_read(&conn->c_state));
-			return;
-		}
-	}
-
-	/* Then reconnect if it's still live.
-	 * The passive side of an IB loopback connection is never added
-	 * to the conn hash, so we never trigger a reconnect on this
-	 * conn - the reconnect is always triggered by the active peer. */
-	cancel_delayed_work(&conn->c_conn_w);
-	if (!hlist_unhashed(&conn->c_hash_node))
-		rds_queue_reconnect(conn);
-}
-
 void rds_send_worker(struct work_struct *work)
 {
 	struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work);
@@ -252,6 +200,13 @@ void rds_recv_worker(struct work_struct *work)
 	}
 }
 
+void rds_shutdown_worker(struct work_struct *work)
+{
+	struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
+
+	rds_conn_shutdown(conn);
+}
+
 void rds_threads_exit(void)
 {
 	destroy_workqueue(rds_wq);
-- 
cgit v1.2.3-59-g8ed1b


From 8690bfa17aea4c42da1bcf90a7af93d161eca624 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Tue, 12 Jan 2010 11:56:44 -0800
Subject: RDS: cleanup: remove "== NULL"s and "!= NULL"s in ptr comparisons

Favor "if (foo)" style over "if (foo != NULL)".

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/af_rds.c      |  2 +-
 net/rds/bind.c        |  4 ++--
 net/rds/cong.c        |  6 +++---
 net/rds/connection.c  |  4 ++--
 net/rds/ib_cm.c       | 14 +++++++-------
 net/rds/ib_recv.c     | 20 ++++++++++----------
 net/rds/ib_send.c     |  4 ++--
 net/rds/ib_sysctl.c   |  2 +-
 net/rds/info.c        | 12 ++++++------
 net/rds/iw_cm.c       | 14 +++++++-------
 net/rds/iw_recv.c     | 20 ++++++++++----------
 net/rds/iw_send.c     |  4 ++--
 net/rds/iw_sysctl.c   |  2 +-
 net/rds/loop.c        |  2 +-
 net/rds/message.c     |  4 ++--
 net/rds/page.c        |  4 ++--
 net/rds/rdma.c        | 18 +++++++++---------
 net/rds/recv.c        |  4 ++--
 net/rds/send.c        | 14 +++++++-------
 net/rds/sysctl.c      |  2 +-
 net/rds/tcp.c         |  4 ++--
 net/rds/tcp_connect.c |  2 +-
 net/rds/tcp_listen.c  |  4 ++--
 net/rds/tcp_recv.c    | 10 +++++-----
 net/rds/tcp_send.c    |  2 +-
 net/rds/threads.c     |  2 +-
 26 files changed, 90 insertions(+), 90 deletions(-)

(limited to 'net')

diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index aebfecbdb841..63474e1f4dd8 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -62,7 +62,7 @@ static int rds_release(struct socket *sock)
 	struct rds_sock *rs;
 	unsigned long flags;
 
-	if (sk == NULL)
+	if (!sk)
 		goto out;
 
 	rs = rds_sk_to_rs(sk);
diff --git a/net/rds/bind.c b/net/rds/bind.c
index 5d95fc007f1a..65de5cbdb576 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -121,7 +121,7 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
 	do {
 		if (rover == 0)
 			rover++;
-		if (rds_bind_tree_walk(addr, cpu_to_be16(rover), rs) == NULL) {
+		if (!rds_bind_tree_walk(addr, cpu_to_be16(rover), rs)) {
 			*port = cpu_to_be16(rover);
 			ret = 0;
 			break;
@@ -184,7 +184,7 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		goto out;
 
 	trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
-	if (trans == NULL) {
+	if (!trans) {
 		ret = -EADDRNOTAVAIL;
 		rds_remove_bound(rs);
 		if (printk_ratelimit())
diff --git a/net/rds/cong.c b/net/rds/cong.c
index 0871a29f0780..c741e906d49f 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -141,7 +141,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
 	unsigned long flags;
 
 	map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL);
-	if (map == NULL)
+	if (!map)
 		return NULL;
 
 	map->m_addr = addr;
@@ -159,7 +159,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
 	ret = rds_cong_tree_walk(addr, map);
 	spin_unlock_irqrestore(&rds_cong_lock, flags);
 
-	if (ret == NULL) {
+	if (!ret) {
 		ret = map;
 		map = NULL;
 	}
@@ -205,7 +205,7 @@ int rds_cong_get_maps(struct rds_connection *conn)
 	conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
 	conn->c_fcong = rds_cong_from_addr(conn->c_faddr);
 
-	if (conn->c_lcong == NULL || conn->c_fcong == NULL)
+	if (!(conn->c_lcong && conn->c_fcong))
 		return -ENOMEM;
 
 	return 0;
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 895e39cdc6a6..9c9afb58a143 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -148,7 +148,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 		goto out;
 
 	conn = kmem_cache_zalloc(rds_conn_slab, gfp);
-	if (conn == NULL) {
+	if (!conn) {
 		conn = ERR_PTR(-ENOMEM);
 		goto out;
 	}
@@ -502,7 +502,7 @@ int __init rds_conn_init(void)
 	rds_conn_slab = kmem_cache_create("rds_connection",
 					  sizeof(struct rds_connection),
 					  0, 0, NULL);
-	if (rds_conn_slab == NULL)
+	if (!rds_conn_slab)
 		return -ENOMEM;
 
 	rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info);
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index f68832798db2..b46bc2f22ab6 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -230,7 +230,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	 * the rds_ibdev at all.
 	 */
 	rds_ibdev = ib_get_client_data(dev, &rds_ib_client);
-	if (rds_ibdev == NULL) {
+	if (!rds_ibdev) {
 		if (printk_ratelimit())
 			printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n",
 					dev->name);
@@ -306,7 +306,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 					   ic->i_send_ring.w_nr *
 						sizeof(struct rds_header),
 					   &ic->i_send_hdrs_dma, GFP_KERNEL);
-	if (ic->i_send_hdrs == NULL) {
+	if (!ic->i_send_hdrs) {
 		ret = -ENOMEM;
 		rdsdebug("ib_dma_alloc_coherent send failed\n");
 		goto out;
@@ -316,7 +316,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 					   ic->i_recv_ring.w_nr *
 						sizeof(struct rds_header),
 					   &ic->i_recv_hdrs_dma, GFP_KERNEL);
-	if (ic->i_recv_hdrs == NULL) {
+	if (!ic->i_recv_hdrs) {
 		ret = -ENOMEM;
 		rdsdebug("ib_dma_alloc_coherent recv failed\n");
 		goto out;
@@ -324,14 +324,14 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 
 	ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
 				       &ic->i_ack_dma, GFP_KERNEL);
-	if (ic->i_ack == NULL) {
+	if (!ic->i_ack) {
 		ret = -ENOMEM;
 		rdsdebug("ib_dma_alloc_coherent ack failed\n");
 		goto out;
 	}
 
 	ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
-	if (ic->i_sends == NULL) {
+	if (!ic->i_sends) {
 		ret = -ENOMEM;
 		rdsdebug("send allocation failed\n");
 		goto out;
@@ -339,7 +339,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
 
 	ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work));
-	if (ic->i_recvs == NULL) {
+	if (!ic->i_recvs) {
 		ret = -ENOMEM;
 		rdsdebug("recv allocation failed\n");
 		goto out;
@@ -693,7 +693,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 
 	/* XXX too lazy? */
 	ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL);
-	if (ic == NULL)
+	if (!ic)
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&ic->ib_node);
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index c74e9904a6b2..d0ee9c114c6c 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -53,7 +53,7 @@ static void rds_ib_frag_drop_page(struct rds_page_frag *frag)
 static void rds_ib_frag_free(struct rds_page_frag *frag)
 {
 	rdsdebug("frag %p page %p\n", frag, frag->f_page);
-	BUG_ON(frag->f_page != NULL);
+	BUG_ON(frag->f_page);
 	kmem_cache_free(rds_ib_frag_slab, frag);
 }
 
@@ -143,14 +143,14 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn,
 	struct ib_sge *sge;
 	int ret = -ENOMEM;
 
-	if (recv->r_ibinc == NULL) {
+	if (!recv->r_ibinc) {
 		if (!atomic_add_unless(&rds_ib_allocation, 1, rds_ib_sysctl_max_recv_allocation)) {
 			rds_ib_stats_inc(s_ib_rx_alloc_limit);
 			goto out;
 		}
 		recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab,
 						 kptr_gfp);
-		if (recv->r_ibinc == NULL) {
+		if (!recv->r_ibinc) {
 			atomic_dec(&rds_ib_allocation);
 			goto out;
 		}
@@ -158,17 +158,17 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn,
 		rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
 	}
 
-	if (recv->r_frag == NULL) {
+	if (!recv->r_frag) {
 		recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp);
-		if (recv->r_frag == NULL)
+		if (!recv->r_frag)
 			goto out;
 		INIT_LIST_HEAD(&recv->r_frag->f_item);
 		recv->r_frag->f_page = NULL;
 	}
 
-	if (ic->i_frag.f_page == NULL) {
+	if (!ic->i_frag.f_page) {
 		ic->i_frag.f_page = alloc_page(page_gfp);
-		if (ic->i_frag.f_page == NULL)
+		if (!ic->i_frag.f_page)
 			goto out;
 		ic->i_frag.f_offset = 0;
 	}
@@ -757,7 +757,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
 	 * into the inc and save the inc so we can hang upcoming fragments
 	 * off its list.
 	 */
-	if (ibinc == NULL) {
+	if (!ibinc) {
 		ibinc = recv->r_ibinc;
 		recv->r_ibinc = NULL;
 		ic->i_ibinc = ibinc;
@@ -940,13 +940,13 @@ int __init rds_ib_recv_init(void)
 	rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
 					sizeof(struct rds_ib_incoming),
 					0, 0, NULL);
-	if (rds_ib_incoming_slab == NULL)
+	if (!rds_ib_incoming_slab)
 		goto out;
 
 	rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
 					sizeof(struct rds_page_frag),
 					0, 0, NULL);
-	if (rds_ib_frag_slab == NULL)
+	if (!rds_ib_frag_slab)
 		kmem_cache_destroy(rds_ib_incoming_slab);
 	else
 		ret = 0;
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 17fa80803ab0..0b0090d2ee01 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -86,7 +86,7 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
 		     rm->m_sg, rm->m_nents,
 		     DMA_TO_DEVICE);
 
-	if (rm->m_rdma_op != NULL) {
+	if (rm->m_rdma_op) {
 		rds_ib_send_unmap_rdma(ic, rm->m_rdma_op);
 
 		/* If the user asked for a completion notification on this
@@ -525,7 +525,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 	}
 
 	/* map the message the first time we see it */
-	if (ic->i_rm == NULL) {
+	if (!ic->i_rm) {
 		/*
 		printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n",
 				be16_to_cpu(rm->m_inc.i_hdr.h_dport),
diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c
index 03f01cb4e0fe..c070524c4d95 100644
--- a/net/rds/ib_sysctl.c
+++ b/net/rds/ib_sysctl.c
@@ -135,7 +135,7 @@ void rds_ib_sysctl_exit(void)
 int __init rds_ib_sysctl_init(void)
 {
 	rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table);
-	if (rds_ib_sysctl_hdr == NULL)
+	if (!rds_ib_sysctl_hdr)
 		return -ENOMEM;
 	return 0;
 }
diff --git a/net/rds/info.c b/net/rds/info.c
index c45c4173a44d..4fdf1b6e84ff 100644
--- a/net/rds/info.c
+++ b/net/rds/info.c
@@ -76,7 +76,7 @@ void rds_info_register_func(int optname, rds_info_func func)
 	BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
 
 	spin_lock(&rds_info_lock);
-	BUG_ON(rds_info_funcs[offset] != NULL);
+	BUG_ON(rds_info_funcs[offset]);
 	rds_info_funcs[offset] = func;
 	spin_unlock(&rds_info_lock);
 }
@@ -102,7 +102,7 @@ EXPORT_SYMBOL_GPL(rds_info_deregister_func);
  */
 void rds_info_iter_unmap(struct rds_info_iterator *iter)
 {
-	if (iter->addr != NULL) {
+	if (iter->addr) {
 		kunmap_atomic(iter->addr, KM_USER0);
 		iter->addr = NULL;
 	}
@@ -117,7 +117,7 @@ void rds_info_copy(struct rds_info_iterator *iter, void *data,
 	unsigned long this;
 
 	while (bytes) {
-		if (iter->addr == NULL)
+		if (!iter->addr)
 			iter->addr = kmap_atomic(*iter->pages, KM_USER0);
 
 		this = min(bytes, PAGE_SIZE - iter->offset);
@@ -188,7 +188,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
 			>> PAGE_SHIFT;
 
 	pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
-	if (pages == NULL) {
+	if (!pages) {
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -206,7 +206,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
 
 call_func:
 	func = rds_info_funcs[optname - RDS_INFO_FIRST];
-	if (func == NULL) {
+	if (!func) {
 		ret = -ENOPROTOOPT;
 		goto out;
 	}
@@ -234,7 +234,7 @@ call_func:
 		ret = -EFAULT;
 
 out:
-	for (i = 0; pages != NULL && i < nr_pages; i++)
+	for (i = 0; pages && i < nr_pages; i++)
 		put_page(pages[i]);
 	kfree(pages);
 
diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c
index b5dd6ac39be8..712cf2d1f28e 100644
--- a/net/rds/iw_cm.c
+++ b/net/rds/iw_cm.c
@@ -257,7 +257,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
 	 * the rds_iwdev at all.
 	 */
 	rds_iwdev = ib_get_client_data(dev, &rds_iw_client);
-	if (rds_iwdev == NULL) {
+	if (!rds_iwdev) {
 		if (printk_ratelimit())
 			printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n",
 					dev->name);
@@ -292,7 +292,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
 					   ic->i_send_ring.w_nr *
 						sizeof(struct rds_header),
 					   &ic->i_send_hdrs_dma, GFP_KERNEL);
-	if (ic->i_send_hdrs == NULL) {
+	if (!ic->i_send_hdrs) {
 		ret = -ENOMEM;
 		rdsdebug("ib_dma_alloc_coherent send failed\n");
 		goto out;
@@ -302,7 +302,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
 					   ic->i_recv_ring.w_nr *
 						sizeof(struct rds_header),
 					   &ic->i_recv_hdrs_dma, GFP_KERNEL);
-	if (ic->i_recv_hdrs == NULL) {
+	if (!ic->i_recv_hdrs) {
 		ret = -ENOMEM;
 		rdsdebug("ib_dma_alloc_coherent recv failed\n");
 		goto out;
@@ -310,14 +310,14 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
 
 	ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
 				       &ic->i_ack_dma, GFP_KERNEL);
-	if (ic->i_ack == NULL) {
+	if (!ic->i_ack) {
 		ret = -ENOMEM;
 		rdsdebug("ib_dma_alloc_coherent ack failed\n");
 		goto out;
 	}
 
 	ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work));
-	if (ic->i_sends == NULL) {
+	if (!ic->i_sends) {
 		ret = -ENOMEM;
 		rdsdebug("send allocation failed\n");
 		goto out;
@@ -325,7 +325,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
 	rds_iw_send_init_ring(ic);
 
 	ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work));
-	if (ic->i_recvs == NULL) {
+	if (!ic->i_recvs) {
 		ret = -ENOMEM;
 		rdsdebug("recv allocation failed\n");
 		goto out;
@@ -696,7 +696,7 @@ int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 
 	/* XXX too lazy? */
 	ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL);
-	if (ic == NULL)
+	if (!ic)
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&ic->iw_node);
diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c
index 3d479067d54d..48bcf4f2bf3c 100644
--- a/net/rds/iw_recv.c
+++ b/net/rds/iw_recv.c
@@ -53,7 +53,7 @@ static void rds_iw_frag_drop_page(struct rds_page_frag *frag)
 static void rds_iw_frag_free(struct rds_page_frag *frag)
 {
 	rdsdebug("frag %p page %p\n", frag, frag->f_page);
-	BUG_ON(frag->f_page != NULL);
+	BUG_ON(frag->f_page);
 	kmem_cache_free(rds_iw_frag_slab, frag);
 }
 
@@ -143,14 +143,14 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn,
 	struct ib_sge *sge;
 	int ret = -ENOMEM;
 
-	if (recv->r_iwinc == NULL) {
+	if (!recv->r_iwinc) {
 		if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) {
 			rds_iw_stats_inc(s_iw_rx_alloc_limit);
 			goto out;
 		}
 		recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab,
 						 kptr_gfp);
-		if (recv->r_iwinc == NULL) {
+		if (!recv->r_iwinc) {
 			atomic_dec(&rds_iw_allocation);
 			goto out;
 		}
@@ -158,17 +158,17 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn,
 		rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr);
 	}
 
-	if (recv->r_frag == NULL) {
+	if (!recv->r_frag) {
 		recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp);
-		if (recv->r_frag == NULL)
+		if (!recv->r_frag)
 			goto out;
 		INIT_LIST_HEAD(&recv->r_frag->f_item);
 		recv->r_frag->f_page = NULL;
 	}
 
-	if (ic->i_frag.f_page == NULL) {
+	if (!ic->i_frag.f_page) {
 		ic->i_frag.f_page = alloc_page(page_gfp);
-		if (ic->i_frag.f_page == NULL)
+		if (!ic->i_frag.f_page)
 			goto out;
 		ic->i_frag.f_offset = 0;
 	}
@@ -716,7 +716,7 @@ static void rds_iw_process_recv(struct rds_connection *conn,
 	 * into the inc and save the inc so we can hang upcoming fragments
 	 * off its list.
 	 */
-	if (iwinc == NULL) {
+	if (!iwinc) {
 		iwinc = recv->r_iwinc;
 		recv->r_iwinc = NULL;
 		ic->i_iwinc = iwinc;
@@ -899,13 +899,13 @@ int __init rds_iw_recv_init(void)
 	rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming",
 					sizeof(struct rds_iw_incoming),
 					0, 0, NULL);
-	if (rds_iw_incoming_slab == NULL)
+	if (!rds_iw_incoming_slab)
 		goto out;
 
 	rds_iw_frag_slab = kmem_cache_create("rds_iw_frag",
 					sizeof(struct rds_page_frag),
 					0, 0, NULL);
-	if (rds_iw_frag_slab == NULL)
+	if (!rds_iw_frag_slab)
 		kmem_cache_destroy(rds_iw_incoming_slab);
 	else
 		ret = 0;
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
index 52182ff7519e..dced532f9cfb 100644
--- a/net/rds/iw_send.c
+++ b/net/rds/iw_send.c
@@ -86,7 +86,7 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
 		     rm->m_sg, rm->m_nents,
 		     DMA_TO_DEVICE);
 
-	if (rm->m_rdma_op != NULL) {
+	if (rm->m_rdma_op) {
 		rds_iw_send_unmap_rdma(ic, rm->m_rdma_op);
 
 		/* If the user asked for a completion notification on this
@@ -556,7 +556,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 	}
 
 	/* map the message the first time we see it */
-	if (ic->i_rm == NULL) {
+	if (!ic->i_rm) {
 		/*
 		printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n",
 				be16_to_cpu(rm->m_inc.i_hdr.h_dport),
diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c
index 1c4428a61a02..3cb0587d6f50 100644
--- a/net/rds/iw_sysctl.c
+++ b/net/rds/iw_sysctl.c
@@ -125,7 +125,7 @@ void rds_iw_sysctl_exit(void)
 int __init rds_iw_sysctl_init(void)
 {
 	rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table);
-	if (rds_iw_sysctl_hdr == NULL)
+	if (!rds_iw_sysctl_hdr)
 		return -ENOMEM;
 	return 0;
 }
diff --git a/net/rds/loop.c b/net/rds/loop.c
index dd9879379457..a74b469a844a 100644
--- a/net/rds/loop.c
+++ b/net/rds/loop.c
@@ -112,7 +112,7 @@ static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 	unsigned long flags;
 
 	lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL);
-	if (lc == NULL)
+	if (!lc)
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&lc->loop_node);
diff --git a/net/rds/message.c b/net/rds/message.c
index 9a1d67e001ba..809656c2b25c 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -240,7 +240,7 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
 	unsigned int i;
 
 	rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
-	if (rm == NULL)
+	if (!rm)
 		return ERR_PTR(-ENOMEM);
 
 	set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
@@ -284,7 +284,7 @@ struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
 	sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
 
 	while (total_len) {
-		if (sg_page(sg) == NULL) {
+		if (!sg_page(sg)) {
 			ret = rds_page_remainder_alloc(sg, total_len,
 						       GFP_HIGHUSER);
 			if (ret)
diff --git a/net/rds/page.c b/net/rds/page.c
index 595a952d4b17..e5b2527ae257 100644
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -116,7 +116,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
 	/* jump straight to allocation if we're trying for a huge page */
 	if (bytes >= PAGE_SIZE) {
 		page = alloc_page(gfp);
-		if (page == NULL) {
+		if (!page) {
 			ret = -ENOMEM;
 		} else {
 			sg_set_page(scat, page, PAGE_SIZE, 0);
@@ -162,7 +162,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
 		rem = &per_cpu(rds_page_remainders, get_cpu());
 		local_irq_save(flags);
 
-		if (page == NULL) {
+		if (!page) {
 			ret = -ENOMEM;
 			break;
 		}
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 463b458ff27e..dee698b979af 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -189,7 +189,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
 		goto out;
 	}
 
-	if (rs->rs_transport->get_mr == NULL) {
+	if (!rs->rs_transport->get_mr) {
 		ret = -EOPNOTSUPP;
 		goto out;
 	}
@@ -205,13 +205,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
 
 	/* XXX clamp nr_pages to limit the size of this alloc? */
 	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
-	if (pages == NULL) {
+	if (!pages) {
 		ret = -ENOMEM;
 		goto out;
 	}
 
 	mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL);
-	if (mr == NULL) {
+	if (!mr) {
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -244,7 +244,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
 
 	nents = ret;
 	sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
-	if (sg == NULL) {
+	if (!sg) {
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -425,7 +425,7 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
 	/* May have to issue a dma_sync on this memory region.
 	 * Note we could avoid this if the operation was a RDMA READ,
 	 * but at this point we can't tell. */
-	if (mr != NULL) {
+	if (mr) {
 		if (mr->r_trans->sync_mr)
 			mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
 
@@ -511,13 +511,13 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
 	}
 
 	pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL);
-	if (pages == NULL) {
+	if (!pages) {
 		ret = -ENOMEM;
 		goto out;
 	}
 
 	op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL);
-	if (op == NULL) {
+	if (!op) {
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -643,7 +643,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 	struct rds_rdma_op *op;
 
 	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) ||
-	    rm->m_rdma_op != NULL)
+	    rm->m_rdma_op)
 		return -EINVAL;
 
 	op = rds_rdma_prepare(rs, CMSG_DATA(cmsg));
@@ -681,7 +681,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
 
 	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
-	if (mr == NULL)
+	if (!mr)
 		err = -EINVAL;	/* invalid r_key */
 	else
 		atomic_inc(&mr->r_refcount);
diff --git a/net/rds/recv.c b/net/rds/recv.c
index c93588c2d553..88f1f5aecfa6 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -210,7 +210,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
 	}
 
 	rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
-	if (rs == NULL) {
+	if (!rs) {
 		rds_stats_inc(s_recv_drop_no_sock);
 		goto out;
 	}
@@ -251,7 +251,7 @@ static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
 {
 	unsigned long flags;
 
-	if (*inc == NULL) {
+	if (!*inc) {
 		read_lock_irqsave(&rs->rs_recv_lock, flags);
 		if (!list_empty(&rs->rs_recv_queue)) {
 			*inc = list_entry(rs->rs_recv_queue.next,
diff --git a/net/rds/send.c b/net/rds/send.c
index 725fb0419797..817997daf785 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -164,7 +164,7 @@ int rds_send_xmit(struct rds_connection *conn)
 		 * offset and S/G temporaries.
 		 */
 		rm = conn->c_xmit_rm;
-		if (rm != NULL &&
+		if (rm &&
 		    conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
 		    conn->c_xmit_sg == rm->m_nents) {
 			conn->c_xmit_rm = NULL;
@@ -180,8 +180,8 @@ int rds_send_xmit(struct rds_connection *conn)
 
 		/* If we're asked to send a cong map update, do so.
 		 */
-		if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) {
-			if (conn->c_trans->xmit_cong_map != NULL) {
+		if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) {
+			if (conn->c_trans->xmit_cong_map) {
 				conn->c_map_offset = 0;
 				conn->c_map_bytes = sizeof(struct rds_header) +
 					RDS_CONG_MAP_BYTES;
@@ -204,7 +204,7 @@ int rds_send_xmit(struct rds_connection *conn)
 		 * the connction.  We can use this ref while holding the
 		 * send_sem.. rds_send_reset() is serialized with it.
 		 */
-		if (rm == NULL) {
+		if (!rm) {
 			unsigned int len;
 
 			spin_lock_irqsave(&conn->c_lock, flags);
@@ -224,7 +224,7 @@ int rds_send_xmit(struct rds_connection *conn)
 
 			spin_unlock_irqrestore(&conn->c_lock, flags);
 
-			if (rm == NULL) {
+			if (!rm) {
 				was_empty = 1;
 				break;
 			}
@@ -875,7 +875,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 		goto out;
 
 	if ((rm->m_rdma_cookie || rm->m_rdma_op) &&
-	    conn->c_trans->xmit_rdma == NULL) {
+	    !conn->c_trans->xmit_rdma) {
 		if (printk_ratelimit())
 			printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
 				rm->m_rdma_op, conn->c_trans->xmit_rdma);
@@ -961,7 +961,7 @@ rds_send_pong(struct rds_connection *conn, __be16 dport)
 	int ret = 0;
 
 	rm = rds_message_alloc(0, GFP_ATOMIC);
-	if (rm == NULL) {
+	if (!rm) {
 		ret = -ENOMEM;
 		goto out;
 	}
diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c
index 7829a20325d3..5a6dd81de188 100644
--- a/net/rds/sysctl.c
+++ b/net/rds/sysctl.c
@@ -111,7 +111,7 @@ int __init rds_sysctl_init(void)
 	rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min;
 
 	rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table);
-	if (rds_sysctl_reg_table == NULL)
+	if (!rds_sysctl_reg_table)
 		return -ENOMEM;
 	return 0;
 }
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index babf4577ff7d..aebe10314fdb 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -200,7 +200,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 	struct rds_tcp_connection *tc;
 
 	tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
-	if (tc == NULL)
+	if (!tc)
 		return -ENOMEM;
 
 	tc->t_sock = NULL;
@@ -283,7 +283,7 @@ int __init rds_tcp_init(void)
 	rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection",
 					      sizeof(struct rds_tcp_connection),
 					      0, 0, NULL);
-	if (rds_tcp_conn_slab == NULL) {
+	if (!rds_tcp_conn_slab) {
 		ret = -ENOMEM;
 		goto out;
 	}
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index c397524c039c..a65ee78db0c5 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -45,7 +45,7 @@ void rds_tcp_state_change(struct sock *sk)
 
 	read_lock(&sk->sk_callback_lock);
 	conn = sk->sk_user_data;
-	if (conn == NULL) {
+	if (!conn) {
 		state_change = sk->sk_state_change;
 		goto out;
 	}
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 975183fe6950..50b649eb6933 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -116,7 +116,7 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes)
 
 	read_lock(&sk->sk_callback_lock);
 	ready = sk->sk_user_data;
-	if (ready == NULL) { /* check for teardown race */
+	if (!ready) { /* check for teardown race */
 		ready = sk->sk_data_ready;
 		goto out;
 	}
@@ -178,7 +178,7 @@ void rds_tcp_listen_stop(void)
 	struct socket *sock = rds_tcp_listen_sock;
 	struct sock *sk;
 
-	if (sock == NULL)
+	if (!sock)
 		return;
 
 	sk = sock->sk;
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index 1aba6878fa5d..ea7382908aa5 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -190,10 +190,10 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
 	 * processing.
 	 */
 	while (left) {
-		if (tinc == NULL) {
+		if (!tinc) {
 			tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
 					        arg->gfp);
-			if (tinc == NULL) {
+			if (!tinc) {
 				desc->error = -ENOMEM;
 				goto out;
 			}
@@ -229,7 +229,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
 
 		if (left && tc->t_tinc_data_rem) {
 			clone = skb_clone(skb, arg->gfp);
-			if (clone == NULL) {
+			if (!clone) {
 				desc->error = -ENOMEM;
 				goto out;
 			}
@@ -326,7 +326,7 @@ void rds_tcp_data_ready(struct sock *sk, int bytes)
 
 	read_lock(&sk->sk_callback_lock);
 	conn = sk->sk_user_data;
-	if (conn == NULL) { /* check for teardown race */
+	if (!conn) { /* check for teardown race */
 		ready = sk->sk_data_ready;
 		goto out;
 	}
@@ -347,7 +347,7 @@ int __init rds_tcp_recv_init(void)
 	rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming",
 					sizeof(struct rds_tcp_incoming),
 					0, 0, NULL);
-	if (rds_tcp_incoming_slab == NULL)
+	if (!rds_tcp_incoming_slab)
 		return -ENOMEM;
 	return 0;
 }
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index a28b895ff0d1..e5f6ccef79ef 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -226,7 +226,7 @@ void rds_tcp_write_space(struct sock *sk)
 
 	read_lock(&sk->sk_callback_lock);
 	conn = sk->sk_user_data;
-	if (conn == NULL) {
+	if (!conn) {
 		write_space = sk->sk_write_space;
 		goto out;
 	}
diff --git a/net/rds/threads.c b/net/rds/threads.c
index 6e2e43d5f576..7a8ca7a1d983 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -215,7 +215,7 @@ void rds_threads_exit(void)
 int __init rds_threads_init(void)
 {
 	rds_wq = create_workqueue("krdsd");
-	if (rds_wq == NULL)
+	if (!rds_wq)
 		return -ENOMEM;
 
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From e779137aa76d38d5c33a98ed887092ae4e4f016f Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Tue, 12 Jan 2010 12:15:02 -0800
Subject: RDS: break out rdma and data ops into nested structs in rds_message

Clearly separate rdma-related variables in rm from data-related ones.
This is in anticipation of adding atomic support.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/ib_send.c  | 44 +++++++++++++++++++++++---------------------
 net/rds/iw_send.c  | 38 ++++++++++++++++++++------------------
 net/rds/message.c  | 30 +++++++++++++++---------------
 net/rds/rdma.c     |  9 +++++----
 net/rds/rds.h      | 16 +++++++++++-----
 net/rds/send.c     | 30 +++++++++++++++---------------
 net/rds/tcp_send.c | 14 +++++++-------
 7 files changed, 96 insertions(+), 85 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 0b0090d2ee01..53750203c9e5 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -83,11 +83,11 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
 	rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
 
 	ib_dma_unmap_sg(ic->i_cm_id->device,
-		     rm->m_sg, rm->m_nents,
-		     DMA_TO_DEVICE);
+			rm->data.m_sg, rm->data.m_nents,
+			DMA_TO_DEVICE);
 
-	if (rm->m_rdma_op) {
-		rds_ib_send_unmap_rdma(ic, rm->m_rdma_op);
+	if (rm->rdma.m_rdma_op) {
+		rds_ib_send_unmap_rdma(ic, rm->rdma.m_rdma_op);
 
 		/* If the user asked for a completion notification on this
 		 * message, we can implement three different semantics:
@@ -111,10 +111,10 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
 		 */
 		rds_ib_send_rdma_complete(rm, wc_status);
 
-		if (rm->m_rdma_op->r_write)
-			rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
+		if (rm->rdma.m_rdma_op->r_write)
+			rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op->r_bytes);
 		else
-			rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes);
+			rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op->r_bytes);
 	}
 
 	/* If anyone waited for this message to get flushed out, wake
@@ -244,8 +244,8 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
 
 				rm = rds_send_get_message(conn, send->s_op);
 				if (rm) {
-					if (rm->m_rdma_op)
-						rds_ib_send_unmap_rdma(ic, rm->m_rdma_op);
+					if (rm->rdma.m_rdma_op)
+						rds_ib_send_unmap_rdma(ic, rm->rdma.m_rdma_op);
 					rds_ib_send_rdma_complete(rm, wc.status);
 					rds_message_put(rm);
 				}
@@ -532,18 +532,20 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 				rm->m_inc.i_hdr.h_flags,
 				be32_to_cpu(rm->m_inc.i_hdr.h_len));
 		   */
-		if (rm->m_nents) {
-			rm->m_count = ib_dma_map_sg(dev,
-					 rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
-			rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
-			if (rm->m_count == 0) {
+		if (rm->data.m_nents) {
+			rm->data.m_count = ib_dma_map_sg(dev,
+							    rm->data.m_sg,
+							    rm->data.m_nents,
+							    DMA_TO_DEVICE);
+			rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.m_count);
+			if (rm->data.m_count == 0) {
 				rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
 				rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
 				ret = -ENOMEM; /* XXX ? */
 				goto out;
 			}
 		} else {
-			rm->m_count = 0;
+			rm->data.m_count = 0;
 		}
 
 		ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
@@ -559,10 +561,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 
 		/* If it has a RDMA op, tell the peer we did it. This is
 		 * used by the peer to release use-once RDMA MRs. */
-		if (rm->m_rdma_op) {
+		if (rm->rdma.m_rdma_op) {
 			struct rds_ext_header_rdma ext_hdr;
 
-			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key);
+			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op->r_key);
 			rds_message_add_extension(&rm->m_inc.i_hdr,
 					RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
 		}
@@ -590,7 +592,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 	send = &ic->i_sends[pos];
 	first = send;
 	prev = NULL;
-	scat = &rm->m_sg[sg];
+	scat = &rm->data.m_sg[sg];
 	sent = 0;
 	i = 0;
 
@@ -600,7 +602,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 	 * or when requested by the user. Right now, we let
 	 * the application choose.
 	 */
-	if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
+	if (rm->rdma.m_rdma_op && rm->rdma.m_rdma_op->r_fence)
 		send_flags = IB_SEND_FENCE;
 
 	/*
@@ -619,7 +621,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 	}
 
 	/* if there's data reference it with a chain of work reqs */
-	for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
+	for (; i < work_alloc && scat != &rm->data.m_sg[rm->data.m_count]; i++) {
 		unsigned int len;
 
 		send = &ic->i_sends[pos];
@@ -697,7 +699,7 @@ add_header:
 		sent += sizeof(struct rds_header);
 
 	/* if we finished the message then send completion owns it */
-	if (scat == &rm->m_sg[rm->m_count]) {
+	if (scat == &rm->data.m_sg[rm->data.m_count]) {
 		prev->s_rm = ic->i_rm;
 		prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
 		ic->i_rm = NULL;
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
index dced532f9cfb..c187e8fdeab1 100644
--- a/net/rds/iw_send.c
+++ b/net/rds/iw_send.c
@@ -83,11 +83,11 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
 	rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
 
 	ib_dma_unmap_sg(ic->i_cm_id->device,
-		     rm->m_sg, rm->m_nents,
+		     rm->data.m_sg, rm->data.m_nents,
 		     DMA_TO_DEVICE);
 
-	if (rm->m_rdma_op) {
-		rds_iw_send_unmap_rdma(ic, rm->m_rdma_op);
+	if (rm->rdma.m_rdma_op) {
+		rds_iw_send_unmap_rdma(ic, rm->rdma.m_rdma_op);
 
 		/* If the user asked for a completion notification on this
 		 * message, we can implement three different semantics:
@@ -111,10 +111,10 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
 		 */
 		rds_iw_send_rdma_complete(rm, wc_status);
 
-		if (rm->m_rdma_op->r_write)
-			rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
+		if (rm->rdma.m_rdma_op->r_write)
+			rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op->r_bytes);
 		else
-			rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes);
+			rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op->r_bytes);
 	}
 
 	/* If anyone waited for this message to get flushed out, wake
@@ -563,18 +563,20 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 				rm->m_inc.i_hdr.h_flags,
 				be32_to_cpu(rm->m_inc.i_hdr.h_len));
 		   */
-		if (rm->m_nents) {
-			rm->m_count = ib_dma_map_sg(dev,
-					 rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
-			rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
-			if (rm->m_count == 0) {
+		if (rm->data.m_nents) {
+			rm->data.m_count = ib_dma_map_sg(dev,
+						    rm->data.m_sg,
+						    rm->data.m_nents,
+						    DMA_TO_DEVICE);
+			rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.m_count);
+			if (rm->data.m_count == 0) {
 				rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
 				rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
 				ret = -ENOMEM; /* XXX ? */
 				goto out;
 			}
 		} else {
-			rm->m_count = 0;
+			rm->data.m_count = 0;
 		}
 
 		ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
@@ -590,10 +592,10 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 
 		/* If it has a RDMA op, tell the peer we did it. This is
 		 * used by the peer to release use-once RDMA MRs. */
-		if (rm->m_rdma_op) {
+		if (rm->rdma.m_rdma_op) {
 			struct rds_ext_header_rdma ext_hdr;
 
-			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key);
+			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op->r_key);
 			rds_message_add_extension(&rm->m_inc.i_hdr,
 					RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
 		}
@@ -621,7 +623,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 	send = &ic->i_sends[pos];
 	first = send;
 	prev = NULL;
-	scat = &rm->m_sg[sg];
+	scat = &rm->data.m_sg[sg];
 	sent = 0;
 	i = 0;
 
@@ -631,7 +633,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 	 * or when requested by the user. Right now, we let
 	 * the application choose.
 	 */
-	if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
+	if (rm->rdma.m_rdma_op && rm->rdma.m_rdma_op->r_fence)
 		send_flags = IB_SEND_FENCE;
 
 	/*
@@ -650,7 +652,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 	}
 
 	/* if there's data reference it with a chain of work reqs */
-	for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
+	for (; i < work_alloc && scat != &rm->data.m_sg[rm->data.m_count]; i++) {
 		unsigned int len;
 
 		send = &ic->i_sends[pos];
@@ -728,7 +730,7 @@ add_header:
 		sent += sizeof(struct rds_header);
 
 	/* if we finished the message then send completion owns it */
-	if (scat == &rm->m_sg[rm->m_count]) {
+	if (scat == &rm->data.m_sg[rm->data.m_count]) {
 		prev->s_rm = ic->i_rm;
 		prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
 		ic->i_rm = NULL;
diff --git a/net/rds/message.c b/net/rds/message.c
index 809656c2b25c..4421d160b1a4 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -63,17 +63,17 @@ static void rds_message_purge(struct rds_message *rm)
 	if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
 		return;
 
-	for (i = 0; i < rm->m_nents; i++) {
-		rdsdebug("putting data page %p\n", (void *)sg_page(&rm->m_sg[i]));
+	for (i = 0; i < rm->data.m_nents; i++) {
+		rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.m_sg[i]));
 		/* XXX will have to put_page for page refs */
-		__free_page(sg_page(&rm->m_sg[i]));
+		__free_page(sg_page(&rm->data.m_sg[i]));
 	}
-	rm->m_nents = 0;
+	rm->data.m_nents = 0;
 
-	if (rm->m_rdma_op)
-		rds_rdma_free_op(rm->m_rdma_op);
-	if (rm->m_rdma_mr)
-		rds_mr_put(rm->m_rdma_mr);
+	if (rm->rdma.m_rdma_op)
+		rds_rdma_free_op(rm->rdma.m_rdma_op);
+	if (rm->rdma.m_rdma_mr)
+		rds_mr_put(rm->rdma.m_rdma_mr);
 }
 
 void rds_message_inc_purge(struct rds_incoming *inc)
@@ -224,7 +224,7 @@ struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp)
 		goto out;
 
 	if (nents)
-		sg_init_table(rm->m_sg, nents);
+		sg_init_table(rm->data.m_sg, nents);
 	atomic_set(&rm->m_refcount, 1);
 	INIT_LIST_HEAD(&rm->m_sock_item);
 	INIT_LIST_HEAD(&rm->m_conn_item);
@@ -245,10 +245,10 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
 
 	set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
 	rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
-	rm->m_nents = ceil(total_len, PAGE_SIZE);
+	rm->data.m_nents = ceil(total_len, PAGE_SIZE);
 
-	for (i = 0; i < rm->m_nents; ++i) {
-		sg_set_page(&rm->m_sg[i],
+	for (i = 0; i < rm->data.m_nents; ++i) {
+		sg_set_page(&rm->data.m_sg[i],
 				virt_to_page(page_addrs[i]),
 				PAGE_SIZE, 0);
 	}
@@ -278,7 +278,7 @@ struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
 	/*
 	 * now allocate and copy in the data payload.
 	 */
-	sg = rm->m_sg;
+	sg = rm->data.m_sg;
 	iov = first_iov;
 	iov_off = 0;
 	sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
@@ -289,7 +289,7 @@ struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
 						       GFP_HIGHUSER);
 			if (ret)
 				goto out;
-			rm->m_nents++;
+			rm->data.m_nents++;
 			sg_off = 0;
 		}
 
@@ -348,7 +348,7 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc,
 
 	iov = first_iov;
 	iov_off = 0;
-	sg = rm->m_sg;
+	sg = rm->data.m_sg;
 	vec_off = 0;
 	copied = 0;
 
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index dee698b979af..24274bb9e329 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -643,14 +643,14 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 	struct rds_rdma_op *op;
 
 	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) ||
-	    rm->m_rdma_op)
+	    rm->rdma.m_rdma_op)
 		return -EINVAL;
 
 	op = rds_rdma_prepare(rs, CMSG_DATA(cmsg));
 	if (IS_ERR(op))
 		return PTR_ERR(op);
 	rds_stats_inc(s_send_rdma);
-	rm->m_rdma_op = op;
+	rm->rdma.m_rdma_op = op;
 	return 0;
 }
 
@@ -679,6 +679,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
 	 */
 	r_key = rds_rdma_cookie_key(rm->m_rdma_cookie);
 
+
 	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
 	if (!mr)
@@ -689,7 +690,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
 
 	if (mr) {
 		mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
-		rm->m_rdma_mr = mr;
+		rm->rdma.m_rdma_mr = mr;
 	}
 	return err;
 }
@@ -707,5 +708,5 @@ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
 	    rm->m_rdma_cookie != 0)
 		return -EINVAL;
 
-	return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->m_rdma_mr);
+	return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.m_rdma_mr);
 }
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 1d3eef67137f..07a750b3fb31 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -259,12 +259,18 @@ struct rds_message {
 	 */
 	spinlock_t		m_rs_lock;
 	struct rds_sock		*m_rs;
-	struct rds_rdma_op	*m_rdma_op;
 	rds_rdma_cookie_t	m_rdma_cookie;
-	struct rds_mr		*m_rdma_mr;
-	unsigned int		m_nents;
-	unsigned int		m_count;
-	struct scatterlist	m_sg[0];
+	struct {
+		struct {
+			struct rds_rdma_op	*m_rdma_op;
+			struct rds_mr		*m_rdma_mr;
+		} rdma;
+		struct {
+			unsigned int		m_nents;
+			unsigned int		m_count;
+			struct scatterlist	m_sg[0];
+		} data;
+	};
 };
 
 /*
diff --git a/net/rds/send.c b/net/rds/send.c
index 817997daf785..19dfd025498e 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -166,7 +166,7 @@ int rds_send_xmit(struct rds_connection *conn)
 		rm = conn->c_xmit_rm;
 		if (rm &&
 		    conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
-		    conn->c_xmit_sg == rm->m_nents) {
+		    conn->c_xmit_sg == rm->data.m_nents) {
 			conn->c_xmit_rm = NULL;
 			conn->c_xmit_sg = 0;
 			conn->c_xmit_hdr_off = 0;
@@ -236,7 +236,7 @@ int rds_send_xmit(struct rds_connection *conn)
 			 * connection.
 			 * Therefore, we never retransmit messages with RDMA ops.
 			 */
-			if (rm->m_rdma_op &&
+			if (rm->rdma.m_rdma_op &&
 			    test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
 				spin_lock_irqsave(&conn->c_lock, flags);
 				if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
@@ -268,8 +268,8 @@ int rds_send_xmit(struct rds_connection *conn)
 		 * keep this simple and require that the transport either
 		 * send the whole rdma or none of it.
 		 */
-		if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) {
-			ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op);
+		if (rm->rdma.m_rdma_op && !conn->c_xmit_rdma_sent) {
+			ret = conn->c_trans->xmit_rdma(conn, rm->rdma.m_rdma_op);
 			if (ret)
 				break;
 			conn->c_xmit_rdma_sent = 1;
@@ -279,7 +279,7 @@ int rds_send_xmit(struct rds_connection *conn)
 		}
 
 		if (conn->c_xmit_hdr_off < sizeof(struct rds_header) ||
-		    conn->c_xmit_sg < rm->m_nents) {
+		    conn->c_xmit_sg < rm->data.m_nents) {
 			ret = conn->c_trans->xmit(conn, rm,
 						  conn->c_xmit_hdr_off,
 						  conn->c_xmit_sg,
@@ -295,7 +295,7 @@ int rds_send_xmit(struct rds_connection *conn)
 				ret -= tmp;
 			}
 
-			sg = &rm->m_sg[conn->c_xmit_sg];
+			sg = &rm->data.m_sg[conn->c_xmit_sg];
 			while (ret) {
 				tmp = min_t(int, ret, sg->length -
 						      conn->c_xmit_data_off);
@@ -306,7 +306,7 @@ int rds_send_xmit(struct rds_connection *conn)
 					sg++;
 					conn->c_xmit_sg++;
 					BUG_ON(ret != 0 &&
-					       conn->c_xmit_sg == rm->m_nents);
+					       conn->c_xmit_sg == rm->data.m_nents);
 				}
 			}
 		}
@@ -419,7 +419,7 @@ void rds_rdma_send_complete(struct rds_message *rm, int status)
 
 	spin_lock_irqsave(&rm->m_rs_lock, flags);
 
-	ro = rm->m_rdma_op;
+	ro = rm->rdma.m_rdma_op;
 	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
 	    ro && ro->r_notify && ro->r_notifier) {
 		notifier = ro->r_notifier;
@@ -453,7 +453,7 @@ __rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status
 {
 	struct rds_rdma_op *ro;
 
-	ro = rm->m_rdma_op;
+	ro = rm->rdma.m_rdma_op;
 	if (ro && ro->r_notify && ro->r_notifier) {
 		ro->r_notifier->n_status = status;
 		list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue);
@@ -477,7 +477,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn,
 	spin_lock_irqsave(&conn->c_lock, flags);
 
 	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
-		if (rm->m_rdma_op == op) {
+		if (rm->rdma.m_rdma_op == op) {
 			atomic_inc(&rm->m_refcount);
 			found = rm;
 			goto out;
@@ -485,7 +485,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn,
 	}
 
 	list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
-		if (rm->m_rdma_op == op) {
+		if (rm->rdma.m_rdma_op == op) {
 			atomic_inc(&rm->m_refcount);
 			found = rm;
 			break;
@@ -545,7 +545,7 @@ void rds_send_remove_from_sock(struct list_head *messages, int status)
 		spin_lock(&rs->rs_lock);
 
 		if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
-			struct rds_rdma_op *ro = rm->m_rdma_op;
+			struct rds_rdma_op *ro = rm->rdma.m_rdma_op;
 			struct rds_notifier *notifier;
 
 			list_del_init(&rm->m_sock_item);
@@ -557,7 +557,7 @@ void rds_send_remove_from_sock(struct list_head *messages, int status)
 						&rs->rs_notify_queue);
 				if (!notifier->n_status)
 					notifier->n_status = status;
-				rm->m_rdma_op->r_notifier = NULL;
+				rm->rdma.m_rdma_op->r_notifier = NULL;
 			}
 			was_on_sock = 1;
 			rm->m_rs = NULL;
@@ -874,11 +874,11 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 	if (ret)
 		goto out;
 
-	if ((rm->m_rdma_cookie || rm->m_rdma_op) &&
+	if ((rm->m_rdma_cookie || rm->rdma.m_rdma_op) &&
 	    !conn->c_trans->xmit_rdma) {
 		if (printk_ratelimit())
 			printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
-				rm->m_rdma_op, conn->c_trans->xmit_rdma);
+				rm->rdma.m_rdma_op, conn->c_trans->xmit_rdma);
 		ret = -EOPNOTSUPP;
 		goto out;
 	}
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index e5f6ccef79ef..d63aa35ac673 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -166,21 +166,21 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
 			goto out;
 	}
 
-	while (sg < rm->m_nents) {
+	while (sg < rm->data.m_nents) {
 		ret = tc->t_sock->ops->sendpage(tc->t_sock,
-						sg_page(&rm->m_sg[sg]),
-						rm->m_sg[sg].offset + off,
-						rm->m_sg[sg].length - off,
+						sg_page(&rm->data.m_sg[sg]),
+						rm->data.m_sg[sg].offset + off,
+						rm->data.m_sg[sg].length - off,
 						MSG_DONTWAIT|MSG_NOSIGNAL);
-		rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->m_sg[sg]),
-			 rm->m_sg[sg].offset + off, rm->m_sg[sg].length - off,
+		rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.m_sg[sg]),
+			 rm->data.m_sg[sg].offset + off, rm->data.m_sg[sg].length - off,
 			 ret);
 		if (ret <= 0)
 			break;
 
 		off += ret;
 		done += ret;
-		if (off == rm->m_sg[sg].length) {
+		if (off == rm->data.m_sg[sg].length) {
 			off = 0;
 			sg++;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From 3ef13f3c22aaea28aff383cb0883481d24885456 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Tue, 12 Jan 2010 12:37:17 -0800
Subject: RDS: cleanup/fix rds_rdma_unuse

First, it looks to me like the atomic_inc is wrong.
We should be decrementing refcount only once here, no? It's
already being done by the mr_put() at the end.

Second, simplify the logic a bit by bailing early (with a warning)
if !mr.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/rdma.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 24274bb9e329..5011efa62a92 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -414,27 +414,30 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
 
 	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
-	if (mr && (mr->r_use_once || force)) {
+	if (!mr) {
+		printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key);
+		spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+		return;
+	}
+
+	if (mr->r_use_once || force) {
 		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
 		RB_CLEAR_NODE(&mr->r_rb_node);
 		zot_me = 1;
-	} else if (mr)
-		atomic_inc(&mr->r_refcount);
+	}
 	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
 
 	/* May have to issue a dma_sync on this memory region.
 	 * Note we could avoid this if the operation was a RDMA READ,
 	 * but at this point we can't tell. */
-	if (mr) {
-		if (mr->r_trans->sync_mr)
-			mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
+	if (mr->r_trans->sync_mr)
+		mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
 
-		/* If the MR was marked as invalidate, this will
-		 * trigger an async flush. */
-		if (zot_me)
-			rds_destroy_mr(mr);
-		rds_mr_put(mr);
-	}
+	/* If the MR was marked as invalidate, this will
+	 * trigger an async flush. */
+	if (zot_me)
+		rds_destroy_mr(mr);
+	rds_mr_put(mr);
 }
 
 void rds_rdma_free_op(struct rds_rdma_op *ro)
-- 
cgit v1.2.3-59-g8ed1b


From fc445084f185cdd877bec323bfe724a361e2292a Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Tue, 12 Jan 2010 12:56:06 -0800
Subject: RDS: Explicitly allocate rm in sendmsg()

r_m_copy_from_user used to allocate the rm as well as kernel
buffers for the data, and then copy the data in. Now, sendmsg()
allocates the rm, although the data buffer alloc still happens
in r_m_copy_from_user.

SGs are still allocated with rm, but now r_m_alloc_sgs() is
used to reserve them. This allows multiple SG lists to be
allocated from the one rm -- this is important once we also
want to alloc our rdma sgl from this pool.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/message.c | 51 ++++++++++++++++++++++++++++++---------------------
 net/rds/rds.h     |  7 +++++--
 net/rds/send.c    | 31 +++++++++++++++++++++++++++----
 3 files changed, 62 insertions(+), 27 deletions(-)

(limited to 'net')

diff --git a/net/rds/message.c b/net/rds/message.c
index 4421d160b1a4..3498cbcc7542 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -214,17 +214,22 @@ int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 o
 }
 EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension);
 
-struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp)
+/*
+ * Each rds_message is allocated with extra space for the scatterlist entries
+ * rds ops will need. This is to minimize memory allocation count. Then, each rds op
+ * can grab SGs when initializing its part of the rds_message.
+ */
+struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp)
 {
 	struct rds_message *rm;
 
-	rm = kzalloc(sizeof(struct rds_message) +
-		     (nents * sizeof(struct scatterlist)), gfp);
+	rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp);
 	if (!rm)
 		goto out;
 
-	if (nents)
-		sg_init_table(rm->data.m_sg, nents);
+	rm->m_used_sgs = 0;
+	rm->m_total_sgs = extra_len / sizeof(struct scatterlist);
+
 	atomic_set(&rm->m_refcount, 1);
 	INIT_LIST_HEAD(&rm->m_sock_item);
 	INIT_LIST_HEAD(&rm->m_conn_item);
@@ -234,6 +239,23 @@ out:
 	return rm;
 }
 
+/*
+ * RDS ops use this to grab SG entries from the rm's sg pool.
+ */
+struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents)
+{
+	struct scatterlist *sg_first = (struct scatterlist *) &rm[1];
+	struct scatterlist *sg_ret;
+
+	WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs);
+
+	sg_ret = &sg_first[rm->m_used_sgs];
+
+	rm->m_used_sgs += nents;
+
+	return sg_ret;
+}
+
 struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len)
 {
 	struct rds_message *rm;
@@ -256,22 +278,15 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
 	return rm;
 }
 
-struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
+int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
 					       size_t total_len)
 {
 	unsigned long to_copy;
 	unsigned long iov_off;
 	unsigned long sg_off;
-	struct rds_message *rm;
 	struct iovec *iov;
 	struct scatterlist *sg;
-	int ret;
-
-	rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
-	if (rm == NULL) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	int ret = 0;
 
 	rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
 
@@ -320,14 +335,8 @@ struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
 			sg++;
 	}
 
-	ret = 0;
 out:
-	if (ret) {
-		if (rm)
-			rds_message_put(rm);
-		rm = ERR_PTR(ret);
-	}
-	return rm;
+	return ret;
 }
 
 int rds_message_inc_copy_to_user(struct rds_incoming *inc,
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 07a750b3fb31..d29c71aabbd4 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -268,9 +268,11 @@ struct rds_message {
 		struct {
 			unsigned int		m_nents;
 			unsigned int		m_count;
-			struct scatterlist	m_sg[0];
+			struct scatterlist	*m_sg;
 		} data;
 	};
+	unsigned int		m_used_sgs;
+	unsigned int		m_total_sgs;
 };
 
 /*
@@ -573,7 +575,8 @@ rds_conn_connecting(struct rds_connection *conn)
 
 /* message.c */
 struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
-struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
+struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents);
+int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
 					       size_t total_len);
 struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
 void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
diff --git a/net/rds/send.c b/net/rds/send.c
index 19dfd025498e..28d09447207b 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -758,6 +758,19 @@ out:
 	return *queued;
 }
 
+/*
+ * rds_message is getting to be quite complicated, and we'd like to allocate
+ * it all in one go. This figures out how big it needs to be up front.
+ */
+static int rds_rm_size(struct msghdr *msg, int data_len)
+{
+	int size = 0;
+
+	size +=	ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);
+
+	return size;
+}
+
 static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
 			 struct msghdr *msg, int *allocated_mr)
 {
@@ -845,13 +858,23 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 		goto out;
 	}
 
-	rm = rds_message_copy_from_user(msg->msg_iov, payload_len);
-	if (IS_ERR(rm)) {
-		ret = PTR_ERR(rm);
-		rm = NULL;
+	/* size of rm including all sgs */
+	ret = rds_rm_size(msg, payload_len);
+	if (ret < 0)
+		goto out;
+
+	rm = rds_message_alloc(ret, GFP_KERNEL);
+	if (!rm) {
+		ret = -ENOMEM;
 		goto out;
 	}
 
+	rm->data.m_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE));
+	/* XXX fix this to not allocate memory */
+	ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len);
+	if (ret)
+		goto out;
+
 	rm->m_daddr = daddr;
 
 	/* rds_conn_create has a spinlock that runs with IRQ off.
-- 
cgit v1.2.3-59-g8ed1b


From 21f79afa5fda2820671a8f64c3d0e43bb118053b Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Tue, 12 Jan 2010 12:57:27 -0800
Subject: RDS: fold rdma.h into rds.h

RDMA is now an intrinsic part of RDS, so it's easier to just have
a single header.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/af_rds.c     |  1 -
 net/rds/connection.c |  1 -
 net/rds/ib_rdma.c    |  1 -
 net/rds/ib_send.c    |  1 -
 net/rds/iw_rdma.c    |  1 -
 net/rds/iw_send.c    |  1 -
 net/rds/message.c    |  1 -
 net/rds/rdma.c       |  2 +-
 net/rds/rdma.h       | 85 ----------------------------------------------------
 net/rds/rds.h        | 76 ++++++++++++++++++++++++++++++++++++++++++++++
 net/rds/recv.c       |  1 -
 net/rds/send.c       |  1 -
 12 files changed, 77 insertions(+), 95 deletions(-)
 delete mode 100644 net/rds/rdma.h

(limited to 'net')

diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 63474e1f4dd8..ef09340cf7a9 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -39,7 +39,6 @@
 #include <net/sock.h>
 
 #include "rds.h"
-#include "rdma.h"
 
 /* this is just used for stats gathering :/ */
 static DEFINE_SPINLOCK(rds_sock_lock);
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 9c9afb58a143..88bcaf3f3e16 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -37,7 +37,6 @@
 
 #include "rds.h"
 #include "loop.h"
-#include "rdma.h"
 
 #define RDS_CONNECTION_HASH_BITS 12
 #define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS)
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index a92aebcb7fcf..0f3b5a2f3fe0 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -34,7 +34,6 @@
 #include <linux/slab.h>
 
 #include "rds.h"
-#include "rdma.h"
 #include "ib.h"
 
 
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 53750203c9e5..575fce463c65 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -36,7 +36,6 @@
 #include <linux/dmapool.h>
 
 #include "rds.h"
-#include "rdma.h"
 #include "ib.h"
 
 static void rds_ib_send_rdma_complete(struct rds_message *rm,
diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
index 13dc1862d862..4e152e2daa3d 100644
--- a/net/rds/iw_rdma.c
+++ b/net/rds/iw_rdma.c
@@ -34,7 +34,6 @@
 #include <linux/slab.h>
 
 #include "rds.h"
-#include "rdma.h"
 #include "iw.h"
 
 
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
index c187e8fdeab1..62234b804d93 100644
--- a/net/rds/iw_send.c
+++ b/net/rds/iw_send.c
@@ -36,7 +36,6 @@
 #include <linux/dmapool.h>
 
 #include "rds.h"
-#include "rdma.h"
 #include "iw.h"
 
 static void rds_iw_send_rdma_complete(struct rds_message *rm,
diff --git a/net/rds/message.c b/net/rds/message.c
index 3498cbcc7542..fb382fbb5b6f 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -34,7 +34,6 @@
 #include <linux/slab.h>
 
 #include "rds.h"
-#include "rdma.h"
 
 static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq);
 
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 5011efa62a92..a21edad33950 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -35,7 +35,7 @@
 #include <linux/rbtree.h>
 #include <linux/dma-mapping.h> /* for DMA_*_DEVICE */
 
-#include "rdma.h"
+#include "rds.h"
 
 /*
  * XXX
diff --git a/net/rds/rdma.h b/net/rds/rdma.h
deleted file mode 100644
index 909c39835a5d..000000000000
--- a/net/rds/rdma.h
+++ /dev/null
@@ -1,85 +0,0 @@
-#ifndef _RDS_RDMA_H
-#define _RDS_RDMA_H
-
-#include <linux/rbtree.h>
-#include <linux/spinlock.h>
-#include <linux/scatterlist.h>
-
-#include "rds.h"
-
-struct rds_mr {
-	struct rb_node		r_rb_node;
-	atomic_t		r_refcount;
-	u32			r_key;
-
-	/* A copy of the creation flags */
-	unsigned int		r_use_once:1;
-	unsigned int		r_invalidate:1;
-	unsigned int		r_write:1;
-
-	/* This is for RDS_MR_DEAD.
-	 * It would be nice & consistent to make this part of the above
-	 * bit field here, but we need to use test_and_set_bit.
-	 */
-	unsigned long		r_state;
-	struct rds_sock		*r_sock; /* back pointer to the socket that owns us */
-	struct rds_transport	*r_trans;
-	void			*r_trans_private;
-};
-
-/* Flags for mr->r_state */
-#define RDS_MR_DEAD		0
-
-struct rds_rdma_op {
-	u32			r_key;
-	u64			r_remote_addr;
-	unsigned int		r_write:1;
-	unsigned int		r_fence:1;
-	unsigned int		r_notify:1;
-	unsigned int		r_recverr:1;
-	unsigned int		r_mapped:1;
-	struct rds_notifier	*r_notifier;
-	unsigned int		r_bytes;
-	unsigned int		r_nents;
-	unsigned int		r_count;
-	struct scatterlist	r_sg[0];
-};
-
-static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
-{
-	return r_key | (((u64) offset) << 32);
-}
-
-static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
-{
-	return cookie;
-}
-
-static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
-{
-	return cookie >> 32;
-}
-
-int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
-int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
-int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
-void rds_rdma_drop_keys(struct rds_sock *rs);
-int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
-			  struct cmsghdr *cmsg);
-int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
-			  struct cmsghdr *cmsg);
-int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
-			  struct cmsghdr *cmsg);
-int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
-			  struct cmsghdr *cmsg);
-void rds_rdma_free_op(struct rds_rdma_op *ro);
-void rds_rdma_send_complete(struct rds_message *rm, int);
-
-extern void __rds_put_mr_final(struct rds_mr *mr);
-static inline void rds_mr_put(struct rds_mr *mr)
-{
-	if (atomic_dec_and_test(&mr->r_refcount))
-		__rds_put_mr_final(mr);
-}
-
-#endif
diff --git a/net/rds/rds.h b/net/rds/rds.h
index d29c71aabbd4..7c4adbe8c284 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -206,6 +206,60 @@ struct rds_incoming {
 	rds_rdma_cookie_t	i_rdma_cookie;
 };
 
+struct rds_mr {
+	struct rb_node		r_rb_node;
+	atomic_t		r_refcount;
+	u32			r_key;
+
+	/* A copy of the creation flags */
+	unsigned int		r_use_once:1;
+	unsigned int		r_invalidate:1;
+	unsigned int		r_write:1;
+
+	/* This is for RDS_MR_DEAD.
+	 * It would be nice & consistent to make this part of the above
+	 * bit field here, but we need to use test_and_set_bit.
+	 */
+	unsigned long		r_state;
+	struct rds_sock		*r_sock; /* back pointer to the socket that owns us */
+	struct rds_transport	*r_trans;
+	void			*r_trans_private;
+};
+
+/* Flags for mr->r_state */
+#define RDS_MR_DEAD		0
+
+struct rds_rdma_op {
+	u32			r_key;
+	u64			r_remote_addr;
+	unsigned int		r_write:1;
+	unsigned int		r_fence:1;
+	unsigned int		r_notify:1;
+	unsigned int		r_recverr:1;
+	unsigned int		r_mapped:1;
+	unsigned int		r_active:1;
+	struct rds_notifier	*r_notifier;
+	unsigned int		r_bytes;
+	unsigned int		r_nents;
+	unsigned int		r_count;
+	struct scatterlist	*r_sg;
+};
+
+static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
+{
+	return r_key | (((u64) offset) << 32);
+}
+
+static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
+{
+	return cookie;
+}
+
+static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
+{
+	return cookie >> 32;
+}
+
 /*
  * m_sock_item and m_conn_item are on lists that are serialized under
  * conn->c_lock.  m_sock_item has additional meaning in that once it is empty
@@ -654,6 +708,28 @@ struct rds_message *rds_send_get_message(struct rds_connection *,
 
 /* rdma.c */
 void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
+int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
+int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
+int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
+void rds_rdma_drop_keys(struct rds_sock *rs);
+int rds_rdma_extra_size(struct rds_rdma_args *args);
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg);
+int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg);
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg);
+int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg);
+void rds_rdma_free_op(struct rds_rdma_op *ro);
+void rds_rdma_send_complete(struct rds_message *rm, int);
+
+extern void __rds_put_mr_final(struct rds_mr *mr);
+static inline void rds_mr_put(struct rds_mr *mr)
+{
+	if (atomic_dec_and_test(&mr->r_refcount))
+		__rds_put_mr_final(mr);
+}
 
 /* stats.c */
 DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 88f1f5aecfa6..5188763b609b 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -36,7 +36,6 @@
 #include <linux/in.h>
 
 #include "rds.h"
-#include "rdma.h"
 
 void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
 		  __be32 saddr)
diff --git a/net/rds/send.c b/net/rds/send.c
index 28d09447207b..89e26ffdc812 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -37,7 +37,6 @@
 #include <linux/list.h>
 
 #include "rds.h"
-#include "rdma.h"
 
 /* When transmitting messages in rds_send_xmit, we need to emerge from
  * time to time and briefly release the CPU. Otherwise the softlock watchdog
-- 
cgit v1.2.3-59-g8ed1b


From ff87e97a9d70c9ae133d3d3d7792b26ab85f4297 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Tue, 12 Jan 2010 14:13:15 -0800
Subject: RDS: make m_rdma_op a member of rds_message

This eliminates a separate memory alloc, although
it is now necessary to add an "r_active" flag, since
it is no longer to use the m_rdma_op pointer as an
indicator of if an rdma op is present.

rdma SGs allocated from rm sg pool.

rds_rm_size also gets bigger. It's a little inefficient to
run through CMSGs twice, but it makes later steps a lot smoother.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/ib_send.c |  20 +++++-----
 net/rds/iw_send.c |  16 ++++----
 net/rds/message.c |   9 +++--
 net/rds/rdma.c    | 113 +++++++++++++++++++++++++++++-------------------------
 net/rds/rds.h     |   2 +-
 net/rds/send.c    |  59 ++++++++++++++++++++--------
 6 files changed, 129 insertions(+), 90 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 575fce463c65..f0edfdb2866c 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -85,8 +85,8 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
 			rm->data.m_sg, rm->data.m_nents,
 			DMA_TO_DEVICE);
 
-	if (rm->rdma.m_rdma_op) {
-		rds_ib_send_unmap_rdma(ic, rm->rdma.m_rdma_op);
+	if (rm->rdma.m_rdma_op.r_active) {
+		rds_ib_send_unmap_rdma(ic, &rm->rdma.m_rdma_op);
 
 		/* If the user asked for a completion notification on this
 		 * message, we can implement three different semantics:
@@ -110,10 +110,10 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
 		 */
 		rds_ib_send_rdma_complete(rm, wc_status);
 
-		if (rm->rdma.m_rdma_op->r_write)
-			rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op->r_bytes);
+		if (rm->rdma.m_rdma_op.r_write)
+			rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op.r_bytes);
 		else
-			rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op->r_bytes);
+			rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op.r_bytes);
 	}
 
 	/* If anyone waited for this message to get flushed out, wake
@@ -243,8 +243,8 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
 
 				rm = rds_send_get_message(conn, send->s_op);
 				if (rm) {
-					if (rm->rdma.m_rdma_op)
-						rds_ib_send_unmap_rdma(ic, rm->rdma.m_rdma_op);
+					if (rm->rdma.m_rdma_op.r_active)
+						rds_ib_send_unmap_rdma(ic, &rm->rdma.m_rdma_op);
 					rds_ib_send_rdma_complete(rm, wc.status);
 					rds_message_put(rm);
 				}
@@ -560,10 +560,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 
 		/* If it has a RDMA op, tell the peer we did it. This is
 		 * used by the peer to release use-once RDMA MRs. */
-		if (rm->rdma.m_rdma_op) {
+		if (rm->rdma.m_rdma_op.r_active) {
 			struct rds_ext_header_rdma ext_hdr;
 
-			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op->r_key);
+			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op.r_key);
 			rds_message_add_extension(&rm->m_inc.i_hdr,
 					RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
 		}
@@ -601,7 +601,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 	 * or when requested by the user. Right now, we let
 	 * the application choose.
 	 */
-	if (rm->rdma.m_rdma_op && rm->rdma.m_rdma_op->r_fence)
+	if (rm->rdma.m_rdma_op.r_active && rm->rdma.m_rdma_op.r_fence)
 		send_flags = IB_SEND_FENCE;
 
 	/*
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
index 62234b804d93..9b79a1b10445 100644
--- a/net/rds/iw_send.c
+++ b/net/rds/iw_send.c
@@ -85,8 +85,8 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
 		     rm->data.m_sg, rm->data.m_nents,
 		     DMA_TO_DEVICE);
 
-	if (rm->rdma.m_rdma_op) {
-		rds_iw_send_unmap_rdma(ic, rm->rdma.m_rdma_op);
+	if (rm->rdma.m_rdma_op.r_active) {
+		rds_iw_send_unmap_rdma(ic, &rm->rdma.m_rdma_op);
 
 		/* If the user asked for a completion notification on this
 		 * message, we can implement three different semantics:
@@ -110,10 +110,10 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
 		 */
 		rds_iw_send_rdma_complete(rm, wc_status);
 
-		if (rm->rdma.m_rdma_op->r_write)
-			rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op->r_bytes);
+		if (rm->rdma.m_rdma_op.r_write)
+			rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op.r_bytes);
 		else
-			rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op->r_bytes);
+			rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op.r_bytes);
 	}
 
 	/* If anyone waited for this message to get flushed out, wake
@@ -591,10 +591,10 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 
 		/* If it has a RDMA op, tell the peer we did it. This is
 		 * used by the peer to release use-once RDMA MRs. */
-		if (rm->rdma.m_rdma_op) {
+		if (rm->rdma.m_rdma_op.r_active) {
 			struct rds_ext_header_rdma ext_hdr;
 
-			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op->r_key);
+			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op.r_key);
 			rds_message_add_extension(&rm->m_inc.i_hdr,
 					RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
 		}
@@ -632,7 +632,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 	 * or when requested by the user. Right now, we let
 	 * the application choose.
 	 */
-	if (rm->rdma.m_rdma_op && rm->rdma.m_rdma_op->r_fence)
+	if (rm->rdma.m_rdma_op.r_active && rm->rdma.m_rdma_op.r_fence)
 		send_flags = IB_SEND_FENCE;
 
 	/*
diff --git a/net/rds/message.c b/net/rds/message.c
index fb382fbb5b6f..4352ce79b376 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -69,8 +69,8 @@ static void rds_message_purge(struct rds_message *rm)
 	}
 	rm->data.m_nents = 0;
 
-	if (rm->rdma.m_rdma_op)
-		rds_rdma_free_op(rm->rdma.m_rdma_op);
+	if (rm->rdma.m_rdma_op.r_active)
+		rds_rdma_free_op(&rm->rdma.m_rdma_op);
 	if (rm->rdma.m_rdma_mr)
 		rds_mr_put(rm->rdma.m_rdma_mr);
 }
@@ -259,14 +259,17 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
 {
 	struct rds_message *rm;
 	unsigned int i;
+	int num_sgs = ceil(total_len, PAGE_SIZE);
+	int extra_bytes = num_sgs * sizeof(struct scatterlist);
 
-	rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
+	rm = rds_message_alloc(extra_bytes, GFP_KERNEL);
 	if (!rm)
 		return ERR_PTR(-ENOMEM);
 
 	set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
 	rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
 	rm->data.m_nents = ceil(total_len, PAGE_SIZE);
+	rm->data.m_sg = rds_message_alloc_sgs(rm, num_sgs);
 
 	for (i = 0; i < rm->data.m_nents; ++i) {
 		sg_set_page(&rm->data.m_sg[i],
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index a21edad33950..7ff3379bab14 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -458,26 +458,60 @@ void rds_rdma_free_op(struct rds_rdma_op *ro)
 	}
 
 	kfree(ro->r_notifier);
-	kfree(ro);
+	ro->r_notifier = NULL;
+	ro->r_active = 0;
+}
+
+/*
+ * Count the number of pages needed to describe an incoming iovec.
+ */
+static int rds_rdma_pages(struct rds_rdma_args *args)
+{
+	struct rds_iovec vec;
+	struct rds_iovec __user *local_vec;
+	unsigned int tot_pages = 0;
+	unsigned int nr_pages;
+	unsigned int i;
+
+	local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
+
+	/* figure out the number of pages in the vector */
+	for (i = 0; i < args->nr_local; i++) {
+		if (copy_from_user(&vec, &local_vec[i],
+				   sizeof(struct rds_iovec)))
+			return -EFAULT;
+
+		nr_pages = rds_pages_in_vec(&vec);
+		if (nr_pages == 0)
+			return -EINVAL;
+
+		tot_pages += nr_pages;
+	}
+
+	return tot_pages;
+}
+
+int rds_rdma_extra_size(struct rds_rdma_args *args)
+{
+	return rds_rdma_pages(args) * sizeof(struct scatterlist);
 }
 
 /*
  * args is a pointer to an in-kernel copy in the sendmsg cmsg.
  */
-static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
-					    struct rds_rdma_args *args)
+static int rds_rdma_prepare(struct rds_message *rm,
+			    struct rds_sock *rs,
+			    struct rds_rdma_args *args)
 {
 	struct rds_iovec vec;
-	struct rds_rdma_op *op = NULL;
+	struct rds_rdma_op *op = &rm->rdma.m_rdma_op;
 	unsigned int nr_pages;
-	unsigned int max_pages;
 	unsigned int nr_bytes;
 	struct page **pages = NULL;
 	struct rds_iovec __user *local_vec;
-	struct scatterlist *sg;
 	unsigned int nr;
 	unsigned int i, j;
-	int ret;
+	int ret = 0;
 
 
 	if (rs->rs_bound_addr == 0) {
@@ -490,44 +524,21 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
 		goto out;
 	}
 
-	nr_pages = 0;
-	max_pages = 0;
-
-	local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
-
-	/* figure out the number of pages in the vector */
-	for (i = 0; i < args->nr_local; i++) {
-		if (copy_from_user(&vec, &local_vec[i],
-				   sizeof(struct rds_iovec))) {
-			ret = -EFAULT;
-			goto out;
-		}
-
-		nr = rds_pages_in_vec(&vec);
-		if (nr == 0) {
-			ret = -EINVAL;
-			goto out;
-		}
-
-		max_pages = max(nr, max_pages);
-		nr_pages += nr;
-	}
-
-	pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL);
-	if (!pages) {
-		ret = -ENOMEM;
+	nr_pages = rds_rdma_pages(args);
+	if (nr_pages < 0)
 		goto out;
-	}
 
-	op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL);
-	if (!op) {
+	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!pages) {
 		ret = -ENOMEM;
 		goto out;
 	}
 
+	op->r_sg = rds_message_alloc_sgs(rm, nr_pages);
 	op->r_write = !!(args->flags & RDS_RDMA_READWRITE);
 	op->r_fence = !!(args->flags & RDS_RDMA_FENCE);
 	op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+	op->r_active = 1;
 	op->r_recverr = rs->rs_recverr;
 	WARN_ON(!nr_pages);
 	sg_init_table(op->r_sg, nr_pages);
@@ -564,6 +575,8 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
 	       (unsigned long long)args->remote_vec.addr,
 	       op->r_key);
 
+	local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
+
 	for (i = 0; i < args->nr_local; i++) {
 		if (copy_from_user(&vec, &local_vec[i],
 				   sizeof(struct rds_iovec))) {
@@ -580,11 +593,6 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
 		rs->rs_user_addr = vec.addr;
 		rs->rs_user_bytes = vec.bytes;
 
-		/* did the user change the vec under us? */
-		if (nr > max_pages || op->r_nents + nr > nr_pages) {
-			ret = -EINVAL;
-			goto out;
-		}
 		/* If it's a WRITE operation, we want to pin the pages for reading.
 		 * If it's a READ operation, we need to pin the pages for writing.
 		 */
@@ -599,6 +607,7 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
 
 		for (j = 0; j < nr; j++) {
 			unsigned int offset = vec.addr & ~PAGE_MASK;
+			struct scatterlist *sg;
 
 			sg = &op->r_sg[op->r_nents + j];
 			sg_set_page(sg, pages[j],
@@ -628,12 +637,10 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
 	ret = 0;
 out:
 	kfree(pages);
-	if (ret) {
-		if (op)
-			rds_rdma_free_op(op);
-		op = ERR_PTR(ret);
-	}
-	return op;
+	if (ret)
+		rds_rdma_free_op(op);
+
+	return ret;
 }
 
 /*
@@ -643,17 +650,17 @@ out:
 int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 			  struct cmsghdr *cmsg)
 {
-	struct rds_rdma_op *op;
+	int ret;
 
 	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) ||
-	    rm->rdma.m_rdma_op)
+	    rm->rdma.m_rdma_op.r_active)
 		return -EINVAL;
 
-	op = rds_rdma_prepare(rs, CMSG_DATA(cmsg));
-	if (IS_ERR(op))
-		return PTR_ERR(op);
+	ret = rds_rdma_prepare(rm, rs, CMSG_DATA(cmsg));
+	if (ret)
+		return ret;
+
 	rds_stats_inc(s_send_rdma);
-	rm->rdma.m_rdma_op = op;
 	return 0;
 }
 
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 7c4adbe8c284..0bb4957e0cfc 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -316,7 +316,7 @@ struct rds_message {
 	rds_rdma_cookie_t	m_rdma_cookie;
 	struct {
 		struct {
-			struct rds_rdma_op	*m_rdma_op;
+			struct rds_rdma_op	m_rdma_op;
 			struct rds_mr		*m_rdma_mr;
 		} rdma;
 		struct {
diff --git a/net/rds/send.c b/net/rds/send.c
index 89e26ffdc812..72dbe7fc4f54 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -235,7 +235,7 @@ int rds_send_xmit(struct rds_connection *conn)
 			 * connection.
 			 * Therefore, we never retransmit messages with RDMA ops.
 			 */
-			if (rm->rdma.m_rdma_op &&
+			if (rm->rdma.m_rdma_op.r_active &&
 			    test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
 				spin_lock_irqsave(&conn->c_lock, flags);
 				if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
@@ -267,8 +267,8 @@ int rds_send_xmit(struct rds_connection *conn)
 		 * keep this simple and require that the transport either
 		 * send the whole rdma or none of it.
 		 */
-		if (rm->rdma.m_rdma_op && !conn->c_xmit_rdma_sent) {
-			ret = conn->c_trans->xmit_rdma(conn, rm->rdma.m_rdma_op);
+		if (rm->rdma.m_rdma_op.r_active && !conn->c_xmit_rdma_sent) {
+			ret = conn->c_trans->xmit_rdma(conn, &rm->rdma.m_rdma_op);
 			if (ret)
 				break;
 			conn->c_xmit_rdma_sent = 1;
@@ -418,9 +418,9 @@ void rds_rdma_send_complete(struct rds_message *rm, int status)
 
 	spin_lock_irqsave(&rm->m_rs_lock, flags);
 
-	ro = rm->rdma.m_rdma_op;
+	ro = &rm->rdma.m_rdma_op;
 	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
-	    ro && ro->r_notify && ro->r_notifier) {
+	    ro->r_active && ro->r_notify && ro->r_notifier) {
 		notifier = ro->r_notifier;
 		rs = rm->m_rs;
 		sock_hold(rds_rs_to_sk(rs));
@@ -452,8 +452,8 @@ __rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status
 {
 	struct rds_rdma_op *ro;
 
-	ro = rm->rdma.m_rdma_op;
-	if (ro && ro->r_notify && ro->r_notifier) {
+	ro = &rm->rdma.m_rdma_op;
+	if (ro->r_active && ro->r_notify && ro->r_notifier) {
 		ro->r_notifier->n_status = status;
 		list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue);
 		ro->r_notifier = NULL;
@@ -476,7 +476,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn,
 	spin_lock_irqsave(&conn->c_lock, flags);
 
 	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
-		if (rm->rdma.m_rdma_op == op) {
+		if (&rm->rdma.m_rdma_op == op) {
 			atomic_inc(&rm->m_refcount);
 			found = rm;
 			goto out;
@@ -484,7 +484,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn,
 	}
 
 	list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
-		if (rm->rdma.m_rdma_op == op) {
+		if (&rm->rdma.m_rdma_op == op) {
 			atomic_inc(&rm->m_refcount);
 			found = rm;
 			break;
@@ -544,19 +544,20 @@ void rds_send_remove_from_sock(struct list_head *messages, int status)
 		spin_lock(&rs->rs_lock);
 
 		if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
-			struct rds_rdma_op *ro = rm->rdma.m_rdma_op;
+			struct rds_rdma_op *ro = &rm->rdma.m_rdma_op;
 			struct rds_notifier *notifier;
 
 			list_del_init(&rm->m_sock_item);
 			rds_send_sndbuf_remove(rs, rm);
 
-			if (ro && ro->r_notifier && (status || ro->r_notify)) {
+			if (ro->r_active && ro->r_notifier &&
+			    (status || ro->r_notify)) {
 				notifier = ro->r_notifier;
 				list_add_tail(&notifier->n_list,
 						&rs->rs_notify_queue);
 				if (!notifier->n_status)
 					notifier->n_status = status;
-				rm->rdma.m_rdma_op->r_notifier = NULL;
+				rm->rdma.m_rdma_op.r_notifier = NULL;
 			}
 			was_on_sock = 1;
 			rm->m_rs = NULL;
@@ -763,9 +764,37 @@ out:
  */
 static int rds_rm_size(struct msghdr *msg, int data_len)
 {
+	struct cmsghdr *cmsg;
 	int size = 0;
+	int retval;
+
+	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+		if (!CMSG_OK(msg, cmsg))
+			return -EINVAL;
+
+		if (cmsg->cmsg_level != SOL_RDS)
+			continue;
+
+		switch (cmsg->cmsg_type) {
+		case RDS_CMSG_RDMA_ARGS:
+			retval = rds_rdma_extra_size(CMSG_DATA(cmsg));
+			if (retval < 0)
+				return retval;
+			size += retval;
+			break;
+
+		case RDS_CMSG_RDMA_DEST:
+		case RDS_CMSG_RDMA_MAP:
+			/* these are valid but do no add any size */
+			break;
+
+		default:
+			return -EINVAL;
+		}
+
+	}
 
-	size +=	ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);
+	size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);
 
 	return size;
 }
@@ -896,11 +925,11 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 	if (ret)
 		goto out;
 
-	if ((rm->m_rdma_cookie || rm->rdma.m_rdma_op) &&
+	if ((rm->m_rdma_cookie || rm->rdma.m_rdma_op.r_active) &&
 	    !conn->c_trans->xmit_rdma) {
 		if (printk_ratelimit())
 			printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
-				rm->rdma.m_rdma_op, conn->c_trans->xmit_rdma);
+				&rm->rdma.m_rdma_op, conn->c_trans->xmit_rdma);
 		ret = -EOPNOTSUPP;
 		goto out;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From f4dd96f7b27743e568cec519eff0f951c56833c6 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Tue, 12 Jan 2010 14:17:31 -0800
Subject: RDS: make sure all sgs alloced are initialized

rds_message_alloc_sgs() now returns correctly-initialized
sg lists, so calleds need not do this themselves.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/message.c | 2 +-
 net/rds/rdma.c    | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/rds/message.c b/net/rds/message.c
index 4352ce79b376..7d678e217820 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -249,7 +249,7 @@ struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents)
 	WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs);
 
 	sg_ret = &sg_first[rm->m_used_sgs];
-
+	sg_init_table(sg_ret, nents);
 	rm->m_used_sgs += nents;
 
 	return sg_ret;
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 7ff3379bab14..4fda33045598 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -534,14 +534,13 @@ static int rds_rdma_prepare(struct rds_message *rm,
 		goto out;
 	}
 
-	op->r_sg = rds_message_alloc_sgs(rm, nr_pages);
 	op->r_write = !!(args->flags & RDS_RDMA_READWRITE);
 	op->r_fence = !!(args->flags & RDS_RDMA_FENCE);
 	op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
 	op->r_active = 1;
 	op->r_recverr = rs->rs_recverr;
 	WARN_ON(!nr_pages);
-	sg_init_table(op->r_sg, nr_pages);
+	op->r_sg = rds_message_alloc_sgs(rm, nr_pages);
 
 	if (op->r_notify || op->r_recverr) {
 		/* We allocate an uninitialized notifier here, because
-- 
cgit v1.2.3-59-g8ed1b


From a63273d4992603979ddb181b6a8f07082839b39f Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Tue, 12 Jan 2010 14:19:32 -0800
Subject: RDS: Clear up some confusing code in send_remove_from_sock

The previous code was correct, but made the assumption that
if r_notifier was non-NULL then either r_recverr or r_notify
was true. Valid, but fragile. Changed to explicitly check
r_recverr (shows up in greps for recverr now, too.)

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/send.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rds/send.c b/net/rds/send.c
index 72dbe7fc4f54..b751a8e77c41 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -551,7 +551,7 @@ void rds_send_remove_from_sock(struct list_head *messages, int status)
 			rds_send_sndbuf_remove(rs, rm);
 
 			if (ro->r_active && ro->r_notifier &&
-			    (status || ro->r_notify)) {
+			    (ro->r_notify || (ro->r_recverr && status))) {
 				notifier = ro->r_notifier;
 				list_add_tail(&notifier->n_list,
 						&rs->rs_notify_queue);
-- 
cgit v1.2.3-59-g8ed1b


From 15133f6e67d8d646d0744336b4daa3135452cb0d Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Tue, 12 Jan 2010 14:33:38 -0800
Subject: RDS: Implement atomic operations

Implement a CMSG-based interface to do FADD and CSWP ops.

Alter send routines to handle atomic ops.

Add atomic counters to stats.

Add xmit_atomic() to struct rds_transport

Inline rds_ib_send_unmap_rdma into unmap_rm

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 include/linux/rds.h |  19 +++++++
 net/rds/ib.c        |   1 +
 net/rds/ib.h        |   1 +
 net/rds/ib_rdma.c   |   4 +-
 net/rds/ib_send.c   | 140 ++++++++++++++++++++++++++++++++++++++++++++++------
 net/rds/rdma.c      |  73 +++++++++++++++++++++++++++
 net/rds/rds.h       |  33 +++++++++++--
 net/rds/send.c      |  71 ++++++++++++++++++++++++--
 net/rds/stats.c     |   2 +
 9 files changed, 321 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/include/linux/rds.h b/include/linux/rds.h
index 7f3971d9fc5c..9239152abf7a 100644
--- a/include/linux/rds.h
+++ b/include/linux/rds.h
@@ -73,6 +73,8 @@
 #define RDS_CMSG_RDMA_MAP		3
 #define RDS_CMSG_RDMA_STATUS		4
 #define RDS_CMSG_CONG_UPDATE		5
+#define RDS_CMSG_ATOMIC_FADD		6
+#define RDS_CMSG_ATOMIC_CSWP		7
 
 #define RDS_INFO_FIRST			10000
 #define RDS_INFO_COUNTERS		10000
@@ -237,6 +239,23 @@ struct rds_rdma_args {
 	u_int64_t	user_token;
 };
 
+struct rds_atomic_args {
+	rds_rdma_cookie_t cookie;
+	uint64_t 	local_addr;
+	uint64_t 	remote_addr;
+	union {
+		struct {
+			uint64_t	compare;
+			uint64_t	swap;
+		} cswp;
+		struct {
+			uint64_t	add;
+		} fadd;
+	};
+	uint64_t	flags;
+	uint64_t	user_token;
+};
+
 struct rds_rdma_notify {
 	u_int64_t	user_token;
 	int32_t		status;
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 8f2d6dd7700a..f0d29656baff 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -264,6 +264,7 @@ struct rds_transport rds_ib_transport = {
 	.xmit			= rds_ib_xmit,
 	.xmit_cong_map		= NULL,
 	.xmit_rdma		= rds_ib_xmit_rdma,
+	.xmit_atomic		= rds_ib_xmit_atomic,
 	.recv			= rds_ib_recv,
 	.conn_alloc		= rds_ib_conn_alloc,
 	.conn_free		= rds_ib_conn_free,
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 64df4e79b29f..d2fd0aa4fde7 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -336,6 +336,7 @@ void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
 void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
 int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
 			     u32 *adv_credits, int need_posted, int max_posted);
+int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op);
 
 /* ib_stats.c */
 DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 0f3b5a2f3fe0..242231f09464 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -298,7 +298,9 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
 	ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
 			(IB_ACCESS_LOCAL_WRITE |
 			 IB_ACCESS_REMOTE_READ |
-			 IB_ACCESS_REMOTE_WRITE),
+			 IB_ACCESS_REMOTE_WRITE|
+			 IB_ACCESS_REMOTE_ATOMIC),
+
 			&pool->fmr_attr);
 	if (IS_ERR(ibmr->fmr)) {
 		err = PTR_ERR(ibmr->fmr);
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index f0edfdb2866c..b2bd164434ad 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -62,15 +62,17 @@ static void rds_ib_send_rdma_complete(struct rds_message *rm,
 	rds_rdma_send_complete(rm, notify_status);
 }
 
-static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
-				   struct rds_rdma_op *op)
+static void rds_ib_send_atomic_complete(struct rds_message *rm,
+				      int wc_status)
 {
-	if (op->r_mapped) {
-		ib_dma_unmap_sg(ic->i_cm_id->device,
-			op->r_sg, op->r_nents,
-			op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-		op->r_mapped = 0;
-	}
+	int notify_status;
+
+	if (wc_status != IB_WC_SUCCESS)
+		notify_status = RDS_RDMA_OTHER_ERROR;
+	else
+		notify_status = RDS_RDMA_SUCCESS;
+
+	rds_atomic_send_complete(rm, notify_status);
 }
 
 static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
@@ -86,7 +88,14 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
 			DMA_TO_DEVICE);
 
 	if (rm->rdma.m_rdma_op.r_active) {
-		rds_ib_send_unmap_rdma(ic, &rm->rdma.m_rdma_op);
+		struct rds_rdma_op *op = &rm->rdma.m_rdma_op;
+
+		if (op->r_mapped) {
+			ib_dma_unmap_sg(ic->i_cm_id->device,
+					op->r_sg, op->r_nents,
+					op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+			op->r_mapped = 0;
+		}
 
 		/* If the user asked for a completion notification on this
 		 * message, we can implement three different semantics:
@@ -116,6 +125,24 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
 			rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op.r_bytes);
 	}
 
+	if (rm->atomic.op_active) {
+		struct rm_atomic_op *op = &rm->atomic;
+
+		/* unmap atomic recvbuf */
+		if (op->op_mapped) {
+			ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1,
+					DMA_FROM_DEVICE);
+			op->op_mapped = 0;
+		}
+
+		rds_ib_send_atomic_complete(rm, wc_status);
+
+		if (rm->atomic.op_type == RDS_ATOMIC_TYPE_CSWP)
+			rds_stats_inc(s_atomic_cswp);
+		else
+			rds_stats_inc(s_atomic_fadd);
+	}
+
 	/* If anyone waited for this message to get flushed out, wake
 	 * them up now */
 	rds_message_unmapped(rm);
@@ -158,12 +185,9 @@ void rds_ib_send_clear_ring(struct rds_ib_connection *ic)
 	u32 i;
 
 	for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
-		if (send->s_wr.opcode == 0xdead)
+		if (!send->s_rm || send->s_wr.opcode == 0xdead)
 			continue;
-		if (send->s_rm)
-			rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
-		if (send->s_op)
-			rds_ib_send_unmap_rdma(ic, send->s_op);
+		rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
 	}
 }
 
@@ -218,6 +242,8 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
 				break;
 			case IB_WR_RDMA_WRITE:
 			case IB_WR_RDMA_READ:
+			case IB_WR_ATOMIC_FETCH_AND_ADD:
+			case IB_WR_ATOMIC_CMP_AND_SWP:
 				/* Nothing to be done - the SG list will be unmapped
 				 * when the SEND completes. */
 				break;
@@ -243,8 +269,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
 
 				rm = rds_send_get_message(conn, send->s_op);
 				if (rm) {
-					if (rm->rdma.m_rdma_op.r_active)
-						rds_ib_send_unmap_rdma(ic, &rm->rdma.m_rdma_op);
+					rds_ib_send_unmap_rm(ic, send, wc.status);
 					rds_ib_send_rdma_complete(rm, wc.status);
 					rds_message_put(rm);
 				}
@@ -736,6 +761,89 @@ out:
 	return ret;
 }
 
+/*
+ * Issue atomic operation.
+ * A simplified version of the rdma case, we always map 1 SG, and
+ * only 8 bytes, for the return value from the atomic operation.
+ */
+int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct rds_ib_send_work *send = NULL;
+	struct ib_send_wr *failed_wr;
+	struct rds_ib_device *rds_ibdev;
+	u32 pos;
+	u32 work_alloc;
+	int ret;
+
+	rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+
+	work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
+	if (work_alloc != 1) {
+		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+		rds_ib_stats_inc(s_ib_tx_ring_full);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* address of send request in ring */
+	send = &ic->i_sends[pos];
+	send->s_queued = jiffies;
+
+	if (op->op_type == RDS_ATOMIC_TYPE_CSWP) {
+		send->s_wr.opcode = IB_WR_ATOMIC_CMP_AND_SWP;
+		send->s_wr.wr.atomic.compare_add = op->op_compare;
+		send->s_wr.wr.atomic.swap = op->op_swap_add;
+	} else { /* FADD */
+		send->s_wr.opcode = IB_WR_ATOMIC_FETCH_AND_ADD;
+		send->s_wr.wr.atomic.compare_add = op->op_swap_add;
+		send->s_wr.wr.atomic.swap = 0;
+	}
+	send->s_wr.send_flags = IB_SEND_SIGNALED;
+	send->s_wr.num_sge = 1;
+	send->s_wr.next = NULL;
+	send->s_wr.wr.atomic.remote_addr = op->op_remote_addr;
+	send->s_wr.wr.atomic.rkey = op->op_rkey;
+
+	/* map 8 byte retval buffer to the device */
+	ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE);
+	rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret);
+	if (ret != 1) {
+		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+		rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
+		ret = -ENOMEM; /* XXX ? */
+		goto out;
+	}
+
+	/* Convert our struct scatterlist to struct ib_sge */
+	send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg);
+	send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg);
+	send->s_sge[0].lkey = ic->i_mr->lkey;
+
+	rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr,
+		 send->s_sge[0].addr, send->s_sge[0].length);
+
+	failed_wr = &send->s_wr;
+	ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr);
+	rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic,
+		 send, &send->s_wr, ret, failed_wr);
+	BUG_ON(failed_wr != &send->s_wr);
+	if (ret) {
+		printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 "
+		       "returned %d\n", &conn->c_faddr, ret);
+		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+		goto out;
+	}
+
+	if (unlikely(failed_wr != &send->s_wr)) {
+		printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
+		BUG_ON(failed_wr != &send->s_wr);
+	}
+
+out:
+	return ret;
+}
+
 int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 {
 	struct rds_ib_connection *ic = conn->c_transport_data;
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 4fda33045598..a7019df38c70 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -719,3 +719,76 @@ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
 
 	return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.m_rdma_mr);
 }
+
+/*
+ * Fill in rds_message for an atomic request.
+ */
+int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
+		    struct cmsghdr *cmsg)
+{
+	struct page *page = NULL;
+	struct rds_atomic_args *args;
+	int ret = 0;
+
+	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args))
+	 || rm->atomic.op_active)
+		return -EINVAL;
+
+	args = CMSG_DATA(cmsg);
+
+	if (cmsg->cmsg_type == RDS_CMSG_ATOMIC_CSWP) {
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
+		rm->atomic.op_swap_add = args->cswp.swap;
+		rm->atomic.op_compare = args->cswp.compare;
+	} else {
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
+		rm->atomic.op_swap_add = args->fadd.add;
+	}
+
+	rm->m_rdma_cookie = args->cookie;
+	rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+	rm->atomic.op_recverr = rs->rs_recverr;
+	rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1);
+
+	/* verify 8 byte-aligned */
+	if (args->local_addr & 0x7) {
+		ret = -EFAULT;
+		goto err;
+	}
+
+	ret = rds_pin_pages(args->local_addr, 1, &page, 1);
+	if (ret != 1)
+		goto err;
+	ret = 0;
+
+	sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr));
+
+	if (rm->atomic.op_notify || rm->atomic.op_recverr) {
+		/* We allocate an uninitialized notifier here, because
+		 * we don't want to do that in the completion handler. We
+		 * would have to use GFP_ATOMIC there, and don't want to deal
+		 * with failed allocations.
+		 */
+		rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL);
+		if (!rm->atomic.op_notifier) {
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		rm->atomic.op_notifier->n_user_token = args->user_token;
+		rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS;
+	}
+
+	rm->atomic.op_rkey = rds_rdma_cookie_key(rm->m_rdma_cookie);
+	rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie);
+
+	rm->atomic.op_active = 1;
+
+	return ret;
+err:
+	if (page)
+		put_page(page);
+	kfree(rm->atomic.op_notifier);
+
+	return ret;
+}
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 0bb4957e0cfc..830e2bbb3332 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -97,6 +97,7 @@ struct rds_connection {
 	unsigned int		c_xmit_hdr_off;
 	unsigned int		c_xmit_data_off;
 	unsigned int		c_xmit_rdma_sent;
+	unsigned int		c_xmit_atomic_sent;
 
 	spinlock_t		c_lock;		/* protect msg queues */
 	u64			c_next_tx_seq;
@@ -260,6 +261,10 @@ static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
 	return cookie >> 32;
 }
 
+/* atomic operation types */
+#define RDS_ATOMIC_TYPE_CSWP		0
+#define RDS_ATOMIC_TYPE_FADD		1
+
 /*
  * m_sock_item and m_conn_item are on lists that are serialized under
  * conn->c_lock.  m_sock_item has additional meaning in that once it is empty
@@ -315,11 +320,27 @@ struct rds_message {
 	struct rds_sock		*m_rs;
 	rds_rdma_cookie_t	m_rdma_cookie;
 	struct {
-		struct {
+		struct rm_atomic_op {
+			int			op_type;
+			uint64_t		op_swap_add;
+			uint64_t		op_compare;
+
+			u32			op_rkey;
+			u64			op_remote_addr;
+			unsigned int		op_notify:1;
+			unsigned int		op_recverr:1;
+			unsigned int		op_mapped:1;
+			unsigned int		op_active:1;
+			struct rds_notifier	*op_notifier;
+			struct scatterlist	*op_sg;
+
+			struct rds_mr		*op_rdma_mr;
+		} atomic;
+		struct rm_rdma_op {
 			struct rds_rdma_op	m_rdma_op;
 			struct rds_mr		*m_rdma_mr;
 		} rdma;
-		struct {
+		struct rm_data_op {
 			unsigned int		m_nents;
 			unsigned int		m_count;
 			struct scatterlist	*m_sg;
@@ -397,6 +418,7 @@ struct rds_transport {
 	int (*xmit_cong_map)(struct rds_connection *conn,
 			     struct rds_cong_map *map, unsigned long offset);
 	int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op);
+	int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op);
 	int (*recv)(struct rds_connection *conn);
 	int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov,
 				size_t size);
@@ -546,6 +568,8 @@ struct rds_statistics {
 	uint64_t	s_cong_update_received;
 	uint64_t	s_cong_send_error;
 	uint64_t	s_cong_send_blocked;
+	uint64_t	s_atomic_cswp;
+	uint64_t	s_atomic_fadd;
 };
 
 /* af_rds.c */
@@ -722,7 +746,10 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
 			  struct cmsghdr *cmsg);
 void rds_rdma_free_op(struct rds_rdma_op *ro);
-void rds_rdma_send_complete(struct rds_message *rm, int);
+void rds_rdma_send_complete(struct rds_message *rm, int wc_status);
+void rds_atomic_send_complete(struct rds_message *rm, int wc_status);
+int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
+		    struct cmsghdr *cmsg);
 
 extern void __rds_put_mr_final(struct rds_mr *mr);
 static inline void rds_mr_put(struct rds_mr *mr)
diff --git a/net/rds/send.c b/net/rds/send.c
index b751a8e77c41..f3f4e79274bf 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -73,6 +73,7 @@ void rds_send_reset(struct rds_connection *conn)
 	conn->c_xmit_hdr_off = 0;
 	conn->c_xmit_data_off = 0;
 	conn->c_xmit_rdma_sent = 0;
+	conn->c_xmit_atomic_sent = 0;
 
 	conn->c_map_queued = 0;
 
@@ -171,6 +172,7 @@ int rds_send_xmit(struct rds_connection *conn)
 			conn->c_xmit_hdr_off = 0;
 			conn->c_xmit_data_off = 0;
 			conn->c_xmit_rdma_sent = 0;
+			conn->c_xmit_atomic_sent = 0;
 
 			/* Release the reference to the previous message. */
 			rds_message_put(rm);
@@ -262,6 +264,17 @@ int rds_send_xmit(struct rds_connection *conn)
 			conn->c_xmit_rm = rm;
 		}
 
+
+		if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
+			ret = conn->c_trans->xmit_atomic(conn, &rm->atomic);
+			if (ret)
+				break;
+			conn->c_xmit_atomic_sent = 1;
+			/* The transport owns the mapped memory for now.
+			 * You can't unmap it while it's on the send queue */
+			set_bit(RDS_MSG_MAPPED, &rm->m_flags);
+		}
+
 		/*
 		 * Try and send an rdma message.  Let's see if we can
 		 * keep this simple and require that the transport either
@@ -442,6 +455,41 @@ void rds_rdma_send_complete(struct rds_message *rm, int status)
 }
 EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
 
+/*
+ * Just like above, except looks at atomic op
+ */
+void rds_atomic_send_complete(struct rds_message *rm, int status)
+{
+	struct rds_sock *rs = NULL;
+	struct rm_atomic_op *ao;
+	struct rds_notifier *notifier;
+
+	spin_lock(&rm->m_rs_lock);
+
+	ao = &rm->atomic;
+	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
+	    && ao->op_active && ao->op_notify && ao->op_notifier) {
+		notifier = ao->op_notifier;
+		rs = rm->m_rs;
+		sock_hold(rds_rs_to_sk(rs));
+
+		notifier->n_status = status;
+		spin_lock(&rs->rs_lock);
+		list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
+		spin_unlock(&rs->rs_lock);
+
+		ao->op_notifier = NULL;
+	}
+
+	spin_unlock(&rm->m_rs_lock);
+
+	if (rs) {
+		rds_wake_sk_sleep(rs);
+		sock_put(rds_rs_to_sk(rs));
+	}
+}
+EXPORT_SYMBOL_GPL(rds_atomic_send_complete);
+
 /*
  * This is the same as rds_rdma_send_complete except we
  * don't do any locking - we have all the ingredients (message,
@@ -788,6 +836,11 @@ static int rds_rm_size(struct msghdr *msg, int data_len)
 			/* these are valid but do no add any size */
 			break;
 
+		case RDS_CMSG_ATOMIC_CSWP:
+		case RDS_CMSG_ATOMIC_FADD:
+			size += sizeof(struct scatterlist);
+			break;
+
 		default:
 			return -EINVAL;
 		}
@@ -813,7 +866,7 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
 			continue;
 
 		/* As a side effect, RDMA_DEST and RDMA_MAP will set
-		 * rm->m_rdma_cookie and rm->m_rdma_mr.
+		 * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr.
 		 */
 		switch (cmsg->cmsg_type) {
 		case RDS_CMSG_RDMA_ARGS:
@@ -829,6 +882,10 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
 			if (!ret)
 				*allocated_mr = 1;
 			break;
+		case RDS_CMSG_ATOMIC_CSWP:
+		case RDS_CMSG_ATOMIC_FADD:
+			ret = rds_cmsg_atomic(rs, rm, cmsg);
+			break;
 
 		default:
 			return -EINVAL;
@@ -926,10 +983,18 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 		goto out;
 
 	if ((rm->m_rdma_cookie || rm->rdma.m_rdma_op.r_active) &&
-	    !conn->c_trans->xmit_rdma) {
+	       !conn->c_trans->xmit_rdma) {
 		if (printk_ratelimit())
 			printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
-				&rm->rdma.m_rdma_op, conn->c_trans->xmit_rdma);
+			       &rm->rdma.m_rdma_op, conn->c_trans->xmit_rdma);
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) {
+		if (printk_ratelimit())
+			printk(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n",
+			       &rm->atomic, conn->c_trans->xmit_atomic);
 		ret = -EOPNOTSUPP;
 		goto out;
 	}
diff --git a/net/rds/stats.c b/net/rds/stats.c
index 7598eb07cfb1..c66d95d9c262 100644
--- a/net/rds/stats.c
+++ b/net/rds/stats.c
@@ -75,6 +75,8 @@ static const char *const rds_stat_names[] = {
 	"cong_update_received",
 	"cong_send_error",
 	"cong_send_blocked",
+	"s_atomic_cswp",
+	"s_atomic_fadd",
 };
 
 void rds_stats_info_copy(struct rds_info_iterator *iter,
-- 
cgit v1.2.3-59-g8ed1b


From 40589e74f7ba855f3a887c9d4abe9d100c5b039c Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Tue, 12 Jan 2010 10:50:48 -0800
Subject: RDS: Base init_depth and responder_resources on hw values

Instead of using a constant for initiator_depth and
responder_resources, read the per-QP values when the
device is enumerated, and then use these values when creating
the connection.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/ib.c    |  3 +++
 net/rds/ib.h    |  2 ++
 net/rds/ib_cm.c | 27 ++++++++++++++++++---------
 net/rds/rdma.c  |  3 +--
 4 files changed, 24 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.c b/net/rds/ib.c
index f0d29656baff..72a5116c11de 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -91,6 +91,9 @@ void rds_ib_add_one(struct ib_device *device)
 			min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
 			fmr_pool_size;
 
+	rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
+	rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
+
 	rds_ibdev->dev = device;
 	rds_ibdev->pd = ib_alloc_pd(device);
 	if (IS_ERR(rds_ibdev->pd))
diff --git a/net/rds/ib.h b/net/rds/ib.h
index d2fd0aa4fde7..a303f13111c2 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -164,6 +164,8 @@ struct rds_ib_device {
 	unsigned int		max_fmrs;
 	int			max_sge;
 	unsigned int		max_wrs;
+	unsigned int		max_initiator_depth;
+	unsigned int		max_responder_resources;
 	spinlock_t		spinlock;	/* protect the above */
 };
 
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index b46bc2f22ab6..3134336ca17d 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -153,18 +153,25 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
 			struct rdma_conn_param *conn_param,
 			struct rds_ib_connect_private *dp,
-			u32 protocol_version)
+			u32 protocol_version,
+			u32 max_responder_resources,
+			u32 max_initiator_depth)
 {
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct rds_ib_device *rds_ibdev;
+
 	memset(conn_param, 0, sizeof(struct rdma_conn_param));
-	/* XXX tune these? */
-	conn_param->responder_resources = 1;
-	conn_param->initiator_depth = 1;
+
+	rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+
+	conn_param->responder_resources =
+		min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources);
+	conn_param->initiator_depth =
+		min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth);
 	conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
 	conn_param->rnr_retry_count = 7;
 
 	if (dp) {
-		struct rds_ib_connection *ic = conn->c_transport_data;
-
 		memset(dp, 0, sizeof(*dp));
 		dp->dp_saddr = conn->c_laddr;
 		dp->dp_daddr = conn->c_faddr;
@@ -479,7 +486,9 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 		goto out;
 	}
 
-	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
+	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
+		event->param.conn.responder_resources,
+		event->param.conn.initiator_depth);
 
 	/* rdma_accept() calls rdma_reject() internally if it fails */
 	err = rdma_accept(cm_id, &conn_param);
@@ -516,8 +525,8 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
 		goto out;
 	}
 
-	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
-
+	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION,
+		UINT_MAX, UINT_MAX);
 	ret = rdma_connect(cm_id, &conn_param);
 	if (ret)
 		rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index a7019df38c70..abbc2979e7e5 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -745,7 +745,6 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
 		rm->atomic.op_swap_add = args->fadd.add;
 	}
 
-	rm->m_rdma_cookie = args->cookie;
 	rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
 	rm->atomic.op_recverr = rs->rs_recverr;
 	rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1);
@@ -779,7 +778,7 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
 		rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS;
 	}
 
-	rm->atomic.op_rkey = rds_rdma_cookie_key(rm->m_rdma_cookie);
+	rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie);
 	rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie);
 
 	rm->atomic.op_active = 1;
-- 
cgit v1.2.3-59-g8ed1b


From d22faec22c2ab2364fd8fc3c8159b0b5b28b0fd1 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Tue, 12 Jan 2010 10:52:28 -0800
Subject: RDS: Do not mask address when pinning pages

This does not appear to be necessary.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/rdma.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index abbc2979e7e5..1929cb8f17d7 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -238,7 +238,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
 	 * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to
 	 * the zero page.
 	 */
-	ret = rds_pin_pages(args->vec.addr & PAGE_MASK, nr_pages, pages, 1);
+	ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1);
 	if (ret < 0)
 		goto out;
 
@@ -595,7 +595,7 @@ static int rds_rdma_prepare(struct rds_message *rm,
 		/* If it's a WRITE operation, we want to pin the pages for reading.
 		 * If it's a READ operation, we need to pin the pages for writing.
 		 */
-		ret = rds_pin_pages(vec.addr & PAGE_MASK, nr, pages, !op->r_write);
+		ret = rds_pin_pages(vec.addr, nr, pages, !op->r_write);
 		if (ret < 0)
 			goto out;
 
-- 
cgit v1.2.3-59-g8ed1b


From 6200ed7799d9225f363f157ab61f1566cfd80e19 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Tue, 12 Jan 2010 10:53:05 -0800
Subject: RDS: Whitespace

Tidy up some whitespace issues.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/rdma.c | 1 -
 net/rds/recv.c | 4 ++--
 net/rds/send.c | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 1929cb8f17d7..4df545a6eab8 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -688,7 +688,6 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
 	 */
 	r_key = rds_rdma_cookie_key(rm->m_rdma_cookie);
 
-
 	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
 	if (!mr)
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 5188763b609b..68800f02aa30 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -333,10 +333,10 @@ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
 
 		if (msghdr) {
 			cmsg.user_token = notifier->n_user_token;
-			cmsg.status  = notifier->n_status;
+			cmsg.status = notifier->n_status;
 
 			err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
-					sizeof(cmsg), &cmsg);
+				       sizeof(cmsg), &cmsg);
 			if (err)
 				break;
 		}
diff --git a/net/rds/send.c b/net/rds/send.c
index f3f4e79274bf..5bc35d2f40ea 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -150,7 +150,7 @@ int rds_send_xmit(struct rds_connection *conn)
 		 */
 		if (conn->c_map_bytes) {
 			ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
-						conn->c_map_offset);
+							   conn->c_map_offset);
 			if (ret <= 0)
 				break;
 
-- 
cgit v1.2.3-59-g8ed1b


From 809fa148a29467954280fe8b7f97c92403f6293c Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Tue, 12 Jan 2010 14:41:46 -0800
Subject: RDS: inc_purge() transport function unused - remove it

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/ib.c       | 1 -
 net/rds/ib.h       | 1 -
 net/rds/ib_recv.c  | 2 +-
 net/rds/iw.c       | 1 -
 net/rds/iw.h       | 1 -
 net/rds/iw_recv.c  | 2 +-
 net/rds/loop.c     | 1 -
 net/rds/message.c  | 6 ------
 net/rds/rds.h      | 2 --
 net/rds/tcp.c      | 1 -
 net/rds/tcp.h      | 1 -
 net/rds/tcp_recv.c | 2 +-
 12 files changed, 3 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.c b/net/rds/ib.c
index 72a5116c11de..932dacbdbea1 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -274,7 +274,6 @@ struct rds_transport rds_ib_transport = {
 	.conn_connect		= rds_ib_conn_connect,
 	.conn_shutdown		= rds_ib_conn_shutdown,
 	.inc_copy_to_user	= rds_ib_inc_copy_to_user,
-	.inc_purge		= rds_ib_inc_purge,
 	.inc_free		= rds_ib_inc_free,
 	.cm_initiate_connect	= rds_ib_cm_initiate_connect,
 	.cm_handle_connect	= rds_ib_cm_handle_connect,
diff --git a/net/rds/ib.h b/net/rds/ib.h
index a303f13111c2..426035ac54a1 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -301,7 +301,6 @@ void rds_ib_recv_exit(void);
 int rds_ib_recv(struct rds_connection *conn);
 int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
 		       gfp_t page_gfp, int prefill);
-void rds_ib_inc_purge(struct rds_incoming *inc);
 void rds_ib_inc_free(struct rds_incoming *inc);
 int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
 			     size_t size);
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index d0ee9c114c6c..e294d00abc80 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -273,7 +273,7 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
 	return ret;
 }
 
-void rds_ib_inc_purge(struct rds_incoming *inc)
+static void rds_ib_inc_purge(struct rds_incoming *inc)
 {
 	struct rds_ib_incoming *ibinc;
 	struct rds_page_frag *frag;
diff --git a/net/rds/iw.c b/net/rds/iw.c
index c8f3d3525cb9..e766aecd46c9 100644
--- a/net/rds/iw.c
+++ b/net/rds/iw.c
@@ -272,7 +272,6 @@ struct rds_transport rds_iw_transport = {
 	.conn_connect		= rds_iw_conn_connect,
 	.conn_shutdown		= rds_iw_conn_shutdown,
 	.inc_copy_to_user	= rds_iw_inc_copy_to_user,
-	.inc_purge		= rds_iw_inc_purge,
 	.inc_free		= rds_iw_inc_free,
 	.cm_initiate_connect	= rds_iw_cm_initiate_connect,
 	.cm_handle_connect	= rds_iw_cm_handle_connect,
diff --git a/net/rds/iw.h b/net/rds/iw.h
index eef2f0c28476..6f08300851ad 100644
--- a/net/rds/iw.h
+++ b/net/rds/iw.h
@@ -326,7 +326,6 @@ void rds_iw_recv_exit(void);
 int rds_iw_recv(struct rds_connection *conn);
 int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
 		       gfp_t page_gfp, int prefill);
-void rds_iw_inc_purge(struct rds_incoming *inc);
 void rds_iw_inc_free(struct rds_incoming *inc);
 int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
 			     size_t size);
diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c
index 48bcf4f2bf3c..7e929f701a0c 100644
--- a/net/rds/iw_recv.c
+++ b/net/rds/iw_recv.c
@@ -273,7 +273,7 @@ int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
 	return ret;
 }
 
-void rds_iw_inc_purge(struct rds_incoming *inc)
+static void rds_iw_inc_purge(struct rds_incoming *inc)
 {
 	struct rds_iw_incoming *iwinc;
 	struct rds_page_frag *frag;
diff --git a/net/rds/loop.c b/net/rds/loop.c
index a74b469a844a..9364de6dcd8b 100644
--- a/net/rds/loop.c
+++ b/net/rds/loop.c
@@ -176,7 +176,6 @@ struct rds_transport rds_loop_transport = {
 	.conn_connect		= rds_loop_conn_connect,
 	.conn_shutdown		= rds_loop_conn_shutdown,
 	.inc_copy_to_user	= rds_message_inc_copy_to_user,
-	.inc_purge		= rds_message_inc_purge,
 	.inc_free		= rds_message_inc_free,
 	.t_name			= "loopback",
 };
diff --git a/net/rds/message.c b/net/rds/message.c
index 7d678e217820..f681690b2bdc 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -75,12 +75,6 @@ static void rds_message_purge(struct rds_message *rm)
 		rds_mr_put(rm->rdma.m_rdma_mr);
 }
 
-void rds_message_inc_purge(struct rds_incoming *inc)
-{
-	struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
-	rds_message_purge(rm);
-}
-
 void rds_message_put(struct rds_message *rm)
 {
 	rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 830e2bbb3332..0c610a102c20 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -422,7 +422,6 @@ struct rds_transport {
 	int (*recv)(struct rds_connection *conn);
 	int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov,
 				size_t size);
-	void (*inc_purge)(struct rds_incoming *inc);
 	void (*inc_free)(struct rds_incoming *inc);
 
 	int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
@@ -668,7 +667,6 @@ int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *vers
 int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
 int rds_message_inc_copy_to_user(struct rds_incoming *inc,
 				 struct iovec *first_iov, size_t size);
-void rds_message_inc_purge(struct rds_incoming *inc);
 void rds_message_inc_free(struct rds_incoming *inc);
 void rds_message_addref(struct rds_message *rm);
 void rds_message_put(struct rds_message *rm);
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index aebe10314fdb..831881640e56 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -266,7 +266,6 @@ struct rds_transport rds_tcp_transport = {
 	.conn_connect		= rds_tcp_conn_connect,
 	.conn_shutdown		= rds_tcp_conn_shutdown,
 	.inc_copy_to_user	= rds_tcp_inc_copy_to_user,
-	.inc_purge		= rds_tcp_inc_purge,
 	.inc_free		= rds_tcp_inc_free,
 	.stats_info_copy	= rds_tcp_stats_info_copy,
 	.exit			= rds_tcp_exit,
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 844fa6b9cf5a..c639872bf701 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -70,7 +70,6 @@ int __init rds_tcp_recv_init(void);
 void rds_tcp_recv_exit(void);
 void rds_tcp_data_ready(struct sock *sk, int bytes);
 int rds_tcp_recv(struct rds_connection *conn);
-void rds_tcp_inc_purge(struct rds_incoming *inc);
 void rds_tcp_inc_free(struct rds_incoming *inc);
 int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
 			     size_t size);
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index ea7382908aa5..49c966010788 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -39,7 +39,7 @@
 
 static struct kmem_cache *rds_tcp_incoming_slab;
 
-void rds_tcp_inc_purge(struct rds_incoming *inc)
+static void rds_tcp_inc_purge(struct rds_incoming *inc)
 {
 	struct rds_tcp_incoming *tinc;
 	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
-- 
cgit v1.2.3-59-g8ed1b


From 9c030391e8741695ff6114703e4edccccb634479 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Tue, 12 Jan 2010 14:43:06 -0800
Subject: RDS/IB: eliminate duplicate code

both atomics and rdmas need to convert ib-specific completion codes
into RDS status codes. Rename rds_ib_rdma_send_complete to
rds_ib_send_complete, and have it take a pointer to the function to
call with the new error code.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/ib_send.c | 30 +++++++++++-------------------
 1 file changed, 11 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index b2bd164434ad..d2516d3f87e2 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -38,8 +38,13 @@
 #include "rds.h"
 #include "ib.h"
 
-static void rds_ib_send_rdma_complete(struct rds_message *rm,
-				      int wc_status)
+/*
+ * Convert IB-specific error message to RDS error message and call core
+ * completion handler.
+ */
+static void rds_ib_send_complete(struct rds_message *rm,
+				 int wc_status,
+				 void (*complete)(struct rds_message *rm, int status))
 {
 	int notify_status;
 
@@ -59,20 +64,7 @@ static void rds_ib_send_rdma_complete(struct rds_message *rm,
 		notify_status = RDS_RDMA_OTHER_ERROR;
 		break;
 	}
-	rds_rdma_send_complete(rm, notify_status);
-}
-
-static void rds_ib_send_atomic_complete(struct rds_message *rm,
-				      int wc_status)
-{
-	int notify_status;
-
-	if (wc_status != IB_WC_SUCCESS)
-		notify_status = RDS_RDMA_OTHER_ERROR;
-	else
-		notify_status = RDS_RDMA_SUCCESS;
-
-	rds_atomic_send_complete(rm, notify_status);
+	complete(rm, notify_status);
 }
 
 static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
@@ -117,7 +109,7 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
 		 * operation itself unmapped the RDMA buffers, which takes care
 		 * of synching.
 		 */
-		rds_ib_send_rdma_complete(rm, wc_status);
+		rds_ib_send_complete(rm, wc_status, rds_rdma_send_complete);
 
 		if (rm->rdma.m_rdma_op.r_write)
 			rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op.r_bytes);
@@ -135,7 +127,7 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
 			op->op_mapped = 0;
 		}
 
-		rds_ib_send_atomic_complete(rm, wc_status);
+		rds_ib_send_complete(rm, wc_status, rds_atomic_send_complete);
 
 		if (rm->atomic.op_type == RDS_ATOMIC_TYPE_CSWP)
 			rds_stats_inc(s_atomic_cswp);
@@ -270,7 +262,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
 				rm = rds_send_get_message(conn, send->s_op);
 				if (rm) {
 					rds_ib_send_unmap_rm(ic, send, wc.status);
-					rds_ib_send_rdma_complete(rm, wc.status);
+					rds_ib_send_complete(rm, wc.status, rds_rdma_send_complete);
 					rds_message_put(rm);
 				}
 			}
-- 
cgit v1.2.3-59-g8ed1b


From f147dd9ecabf23fd63d2562ffe64252a0453ecde Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Wed, 13 Jan 2010 15:50:09 -0800
Subject: RDS/IB: Disallow connections less than RDS 3.1

RDS 3.0 connections (in OFED 1.3 and earlier) put the
header at the end. 3.1 connections put it at the head.
The code has significant added complexity in order to
handle both configurations. In OFED 1.6 we can
drop this and simplify the code by only supporting
"header-first" configuration.

This patch checks the protocol version, and if prior
to 3.1, does not complete the connection.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/ib_cm.c   | 20 +++++++++++++++-----
 net/rds/ib_recv.c | 43 +------------------------------------------
 2 files changed, 16 insertions(+), 47 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 3134336ca17d..8b0c743c0900 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -111,11 +111,21 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 		}
 	}
 
-	printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
-			&conn->c_faddr,
-			RDS_PROTOCOL_MAJOR(conn->c_version),
-			RDS_PROTOCOL_MINOR(conn->c_version),
-			ic->i_flowctl ? ", flow control" : "");
+	if (conn->c_version < RDS_PROTOCOL(3,1)) {
+		printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed,"
+		       " no longer supported\n",
+		       &conn->c_faddr,
+		       RDS_PROTOCOL_MAJOR(conn->c_version),
+		       RDS_PROTOCOL_MINOR(conn->c_version));
+		rds_conn_destroy(conn);
+		return;
+	} else {
+		printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
+		       &conn->c_faddr,
+		       RDS_PROTOCOL_MAJOR(conn->c_version),
+		       RDS_PROTOCOL_MINOR(conn->c_version),
+		       ic->i_flowctl ? ", flow control" : "");
+	}
 
 	/*
 	 * Init rings and fill recv. this needs to wait until protocol negotiation
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index e294d00abc80..a68a3a721c6c 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -557,47 +557,6 @@ u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
 	return rds_ib_get_ack(ic);
 }
 
-static struct rds_header *rds_ib_get_header(struct rds_connection *conn,
-					    struct rds_ib_recv_work *recv,
-					    u32 data_len)
-{
-	struct rds_ib_connection *ic = conn->c_transport_data;
-	void *hdr_buff = &ic->i_recv_hdrs[recv - ic->i_recvs];
-	void *addr;
-	u32 misplaced_hdr_bytes;
-
-	/*
-	 * Support header at the front (RDS 3.1+) as well as header-at-end.
-	 *
-	 * Cases:
-	 * 1) header all in header buff (great!)
-	 * 2) header all in data page (copy all to header buff)
-	 * 3) header split across hdr buf + data page
-	 *    (move bit in hdr buff to end before copying other bit from data page)
-	 */
-	if (conn->c_version > RDS_PROTOCOL_3_0 || data_len == RDS_FRAG_SIZE)
-	        return hdr_buff;
-
-	if (data_len <= (RDS_FRAG_SIZE - sizeof(struct rds_header))) {
-		addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
-		memcpy(hdr_buff,
-		       addr + recv->r_frag->f_offset + data_len,
-		       sizeof(struct rds_header));
-		kunmap_atomic(addr, KM_SOFTIRQ0);
-		return hdr_buff;
-	}
-
-	misplaced_hdr_bytes = (sizeof(struct rds_header) - (RDS_FRAG_SIZE - data_len));
-
-	memmove(hdr_buff + misplaced_hdr_bytes, hdr_buff, misplaced_hdr_bytes);
-
-	addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
-	memcpy(hdr_buff, addr + recv->r_frag->f_offset + data_len,
-	       sizeof(struct rds_header) - misplaced_hdr_bytes);
-	kunmap_atomic(addr, KM_SOFTIRQ0);
-	return hdr_buff;
-}
-
 /*
  * It's kind of lame that we're copying from the posted receive pages into
  * long-lived bitmaps.  We could have posted the bitmaps and rdma written into
@@ -710,7 +669,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
 	}
 	data_len -= sizeof(struct rds_header);
 
-	ihdr = rds_ib_get_header(conn, recv, data_len);
+	ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
 
 	/* Validate the checksum. */
 	if (!rds_message_verify_checksum(ihdr)) {
-- 
cgit v1.2.3-59-g8ed1b


From 6f3d05db0da0b874afd2dd229bed715133532f8d Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Wed, 13 Jan 2010 16:29:37 -0800
Subject: RDS/IB: Remove dead code

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/ib_send.c | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index d2516d3f87e2..e86908497fcf 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -542,12 +542,6 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 
 	/* map the message the first time we see it */
 	if (!ic->i_rm) {
-		/*
-		printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n",
-				be16_to_cpu(rm->m_inc.i_hdr.h_dport),
-				rm->m_inc.i_hdr.h_flags,
-				be32_to_cpu(rm->m_inc.i_hdr.h_len));
-		   */
 		if (rm->data.m_nents) {
 			rm->data.m_count = ib_dma_map_sg(dev,
 							    rm->data.m_sg,
@@ -684,14 +678,6 @@ add_header:
 		 * have been set up to point to the right header buffer. */
 		memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
 
-		if (0) {
-			struct rds_header *hdr = &ic->i_send_hdrs[pos];
-
-			printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
-				be16_to_cpu(hdr->h_dport),
-				hdr->h_flags,
-				be32_to_cpu(hdr->h_len));
-		}
 		if (adv_credits) {
 			struct rds_header *hdr = &ic->i_send_hdrs[pos];
 
-- 
cgit v1.2.3-59-g8ed1b


From 919ced4ce7d6ac62dd5be62d8993fe22a527d53a Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Wed, 13 Jan 2010 16:32:24 -0800
Subject: RDS/IB: Remove ib_[header/data]_sge() functions

These functions were to cope with differently ordered
sg entries depending on RDS 3.0 or 3.1+. Now that
we've dropped 3.0 compatibility we no longer need them.

Also, modify usage sites for these to refer to sge[0] or [1]
directly. Reorder code to initialize header sgs first.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/ib.h      | 24 ------------------------
 net/rds/ib_recv.c | 22 +++++++++++-----------
 net/rds/ib_send.c | 32 ++++++++++++--------------------
 3 files changed, 23 insertions(+), 55 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.h b/net/rds/ib.h
index 426035ac54a1..dbf8d6e31493 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -356,28 +356,4 @@ extern unsigned long rds_ib_sysctl_max_recv_allocation;
 extern unsigned int rds_ib_sysctl_flow_control;
 extern ctl_table rds_ib_sysctl_table[];
 
-/*
- * Helper functions for getting/setting the header and data SGEs in
- * RDS packets (not RDMA)
- *
- * From version 3.1 onwards, header is in front of data in the sge.
- */
-static inline struct ib_sge *
-rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
-{
-	if (ic->conn->c_version > RDS_PROTOCOL_3_0)
-		return &sge[0];
-	else
-		return &sge[1];
-}
-
-static inline struct ib_sge *
-rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
-{
-	if (ic->conn->c_version > RDS_PROTOCOL_3_0)
-		return &sge[1];
-	else
-		return &sge[0];
-}
-
 #endif
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index a68a3a721c6c..e9fe08a86c40 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -95,15 +95,15 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
 		recv->r_wr.sg_list = recv->r_sge;
 		recv->r_wr.num_sge = RDS_IB_RECV_SGE;
 
-		sge = rds_ib_data_sge(ic, recv->r_sge);
-		sge->addr = 0;
-		sge->length = RDS_FRAG_SIZE;
-		sge->lkey = ic->i_mr->lkey;
-
-		sge = rds_ib_header_sge(ic, recv->r_sge);
+		sge = &recv->r_sge[0];
 		sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
 		sge->length = sizeof(struct rds_header);
 		sge->lkey = ic->i_mr->lkey;
+
+		sge = &recv->r_sge[1];
+		sge->addr = 0;
+		sge->length = RDS_FRAG_SIZE;
+		sge->lkey = ic->i_mr->lkey;
 	}
 }
 
@@ -190,14 +190,14 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn,
 	recv->r_frag->f_offset = ic->i_frag.f_offset;
 	recv->r_frag->f_mapped = dma_addr;
 
-	sge = rds_ib_data_sge(ic, recv->r_sge);
-	sge->addr = dma_addr;
-	sge->length = RDS_FRAG_SIZE;
-
-	sge = rds_ib_header_sge(ic, recv->r_sge);
+	sge = &recv->r_sge[0];
 	sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
 	sge->length = sizeof(struct rds_header);
 
+	sge = &recv->r_sge[1];
+	sge->addr = dma_addr;
+	sge->length = RDS_FRAG_SIZE;
+
 	get_page(recv->r_frag->f_page);
 
 	if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index e86908497fcf..46026d9091f1 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -156,18 +156,14 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic)
 
 		send->s_wr.wr_id = i;
 		send->s_wr.sg_list = send->s_sge;
-		send->s_wr.num_sge = 1;
-		send->s_wr.opcode = IB_WR_SEND;
-		send->s_wr.send_flags = 0;
 		send->s_wr.ex.imm_data = 0;
 
-		sge = rds_ib_data_sge(ic, send->s_sge);
-		sge->lkey = ic->i_mr->lkey;
-
-		sge = rds_ib_header_sge(ic, send->s_sge);
+		sge = &send->s_sge[0];
 		sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
 		sge->length = sizeof(struct rds_header);
 		sge->lkey = ic->i_mr->lkey;
+
+		send->s_sge[1].lkey = ic->i_mr->lkey;
 	}
 }
 
@@ -441,28 +437,24 @@ rds_ib_xmit_populate_wr(struct rds_ib_connection *ic,
 
 	send->s_wr.send_flags = send_flags;
 	send->s_wr.opcode = IB_WR_SEND;
-	send->s_wr.num_sge = 2;
+	send->s_wr.num_sge = 1;
 	send->s_wr.next = NULL;
 	send->s_queued = jiffies;
 	send->s_op = NULL;
 
+	sge = &send->s_sge[0];
+	sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
+	sge->length = sizeof(struct rds_header);
+	sge->lkey = ic->i_mr->lkey;
+
 	if (length != 0) {
-		sge = rds_ib_data_sge(ic, send->s_sge);
+		send->s_wr.num_sge = 2;
+
+		sge = &send->s_sge[1];
 		sge->addr = buffer;
 		sge->length = length;
 		sge->lkey = ic->i_mr->lkey;
-
-		sge = rds_ib_header_sge(ic, send->s_sge);
-	} else {
-		/* We're sending a packet with no payload. There is only
-		 * one SGE */
-		send->s_wr.num_sge = 1;
-		sge = &send->s_sge[0];
 	}
-
-	sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
-	sge->length = sizeof(struct rds_header);
-	sge->lkey = ic->i_mr->lkey;
 }
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From da5a06cef5724737af4315715632f0a07dd5e116 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Thu, 14 Jan 2010 12:18:11 -0800
Subject: RDS: rewrite rds_ib_xmit

Now that the header always goes first, it is possible to
simplify rds_ib_xmit. Instead of having a path to handle 0-byte
dgrams and another path to handle >0, these can both be handled
in one path. This lets us eliminate xmit_populate_wr().

Rename sent to bytes_sent, to differentiate better from other
variable named "send".

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/ib_send.c | 123 ++++++++++++++++++++----------------------------------
 1 file changed, 45 insertions(+), 78 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 46026d9091f1..06c1d7e032d2 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -425,38 +425,6 @@ void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted)
 		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
 }
 
-static inline void
-rds_ib_xmit_populate_wr(struct rds_ib_connection *ic,
-		struct rds_ib_send_work *send, unsigned int pos,
-		unsigned long buffer, unsigned int length,
-		int send_flags)
-{
-	struct ib_sge *sge;
-
-	WARN_ON(pos != send - ic->i_sends);
-
-	send->s_wr.send_flags = send_flags;
-	send->s_wr.opcode = IB_WR_SEND;
-	send->s_wr.num_sge = 1;
-	send->s_wr.next = NULL;
-	send->s_queued = jiffies;
-	send->s_op = NULL;
-
-	sge = &send->s_sge[0];
-	sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
-	sge->length = sizeof(struct rds_header);
-	sge->lkey = ic->i_mr->lkey;
-
-	if (length != 0) {
-		send->s_wr.num_sge = 2;
-
-		sge = &send->s_sge[1];
-		sge->addr = buffer;
-		sge->length = length;
-		sge->lkey = ic->i_mr->lkey;
-	}
-}
-
 /*
  * This can be called multiple times for a given message.  The first time
  * we see a message we map its scatterlist into the IB device so that
@@ -483,11 +451,11 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 	u32 pos;
 	u32 i;
 	u32 work_alloc;
-	u32 credit_alloc;
+	u32 credit_alloc = 0;
 	u32 posted;
 	u32 adv_credits = 0;
 	int send_flags = 0;
-	int sent;
+	int bytes_sent = 0;
 	int ret;
 	int flow_controlled = 0;
 
@@ -515,7 +483,6 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 		goto out;
 	}
 
-	credit_alloc = work_alloc;
 	if (ic->i_flowctl) {
 		credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
 		adv_credits += posted;
@@ -591,13 +558,6 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 		BUG_ON(adv_credits > 255);
 	}
 
-	send = &ic->i_sends[pos];
-	first = send;
-	prev = NULL;
-	scat = &rm->data.m_sg[sg];
-	sent = 0;
-	i = 0;
-
 	/* Sometimes you want to put a fence between an RDMA
 	 * READ and the following SEND.
 	 * We could either do this all the time
@@ -607,31 +567,45 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 	if (rm->rdma.m_rdma_op.r_active && rm->rdma.m_rdma_op.r_fence)
 		send_flags = IB_SEND_FENCE;
 
-	/*
-	 * We could be copying the header into the unused tail of the page.
-	 * That would need to be changed in the future when those pages might
-	 * be mapped userspace pages or page cache pages.  So instead we always
-	 * use a second sge and our long-lived ring of mapped headers.  We send
-	 * the header after the data so that the data payload can be aligned on
-	 * the receiver.
-	 */
+	/* Each frag gets a header. Msgs may be 0 bytes */
+	send = &ic->i_sends[pos];
+	first = send;
+	prev = NULL;
+	scat = &rm->data.m_sg[sg];
+	i = 0;
+	do {
+		unsigned int len = 0;
 
-	/* handle a 0-len message */
-	if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) {
-		rds_ib_xmit_populate_wr(ic, send, pos, 0, 0, send_flags);
-		goto add_header;
-	}
+		/* Set up the header */
+		send->s_wr.send_flags = send_flags;
+		send->s_wr.opcode = IB_WR_SEND;
+		send->s_wr.num_sge = 1;
+		send->s_wr.next = NULL;
+		send->s_queued = jiffies;
+		send->s_op = NULL;
 
-	/* if there's data reference it with a chain of work reqs */
-	for (; i < work_alloc && scat != &rm->data.m_sg[rm->data.m_count]; i++) {
-		unsigned int len;
+		send->s_sge[0].addr = ic->i_send_hdrs_dma
+			+ (pos * sizeof(struct rds_header));
+		send->s_sge[0].length = sizeof(struct rds_header);
 
-		send = &ic->i_sends[pos];
+		memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
+
+		/* Set up the data, if present */
+		if (i < work_alloc
+		    && scat != &rm->data.m_sg[rm->data.m_count]) {
+			len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
+			send->s_wr.num_sge = 2;
 
-		len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
-		rds_ib_xmit_populate_wr(ic, send, pos,
-				ib_sg_dma_address(dev, scat) + off, len,
-				send_flags);
+			send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off;
+			send->s_sge[1].length = len;
+
+			bytes_sent += len;
+			off += len;
+			if (off == ib_sg_dma_len(dev, scat)) {
+				scat++;
+				off = 0;
+			}
+		}
 
 		/*
 		 * We want to delay signaling completions just enough to get
@@ -658,18 +632,6 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 		rdsdebug("send %p wr %p num_sge %u next %p\n", send,
 			 &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
 
-		sent += len;
-		off += len;
-		if (off == ib_sg_dma_len(dev, scat)) {
-			scat++;
-			off = 0;
-		}
-
-add_header:
-		/* Tack on the header after the data. The header SGE should already
-		 * have been set up to point to the right header buffer. */
-		memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
-
 		if (adv_credits) {
 			struct rds_header *hdr = &ic->i_send_hdrs[pos];
 
@@ -685,12 +647,16 @@ add_header:
 		prev = send;
 
 		pos = (pos + 1) % ic->i_send_ring.w_nr;
-	}
+		send = &ic->i_sends[pos];
+		i++;
+
+	} while (i < work_alloc
+		 && scat != &rm->data.m_sg[rm->data.m_count]);
 
 	/* Account the RDS header in the number of bytes we sent, but just once.
 	 * The caller has no concept of fragmentation. */
 	if (hdr_off == 0)
-		sent += sizeof(struct rds_header);
+		bytes_sent += sizeof(struct rds_header);
 
 	/* if we finished the message then send completion owns it */
 	if (scat == &rm->data.m_sg[rm->data.m_count]) {
@@ -699,6 +665,7 @@ add_header:
 		ic->i_rm = NULL;
 	}
 
+	/* Put back wrs & credits we didn't use */
 	if (i < work_alloc) {
 		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
 		work_alloc = i;
@@ -725,7 +692,7 @@ add_header:
 		goto out;
 	}
 
-	ret = sent;
+	ret = bytes_sent;
 out:
 	BUG_ON(adv_credits);
 	return ret;
-- 
cgit v1.2.3-59-g8ed1b


From 1d34f175712b59ad292ecbbaa8fc05402a1fd8ed Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Thu, 14 Jan 2010 15:08:33 -0800
Subject: RDS: Remove unsignaled_bytes sysctl

Removed unsignaled_bytes sysctl and code to signal
based on it. I believe unsignaled_wrs is more than
sufficient for our purposes.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/ib.h        |  1 -
 net/rds/ib_send.c   |  7 -------
 net/rds/ib_sysctl.c | 13 -------------
 3 files changed, 21 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.h b/net/rds/ib.h
index dbf8d6e31493..148818174a04 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -138,7 +138,6 @@ struct rds_ib_connection {
 
 	/* Batched completions */
 	unsigned int		i_unsignaled_wrs;
-	long			i_unsignaled_bytes;
 };
 
 /* This assumes that atomic_t is at least 32 bits */
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 06c1d7e032d2..48724b71f26c 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -518,7 +518,6 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 		}
 
 		ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
-		ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
 		rds_message_addref(rm);
 		ic->i_rm = rm;
 
@@ -617,12 +616,6 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 			send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
 		}
 
-		ic->i_unsignaled_bytes -= len;
-		if (ic->i_unsignaled_bytes <= 0) {
-			ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
-			send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
-		}
-
 		/*
 		 * Always signal the last one if we're stopping due to flow control.
 		 */
diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c
index c070524c4d95..b556c5c2a902 100644
--- a/net/rds/ib_sysctl.c
+++ b/net/rds/ib_sysctl.c
@@ -49,10 +49,6 @@ unsigned long rds_ib_sysctl_max_unsig_wrs = 16;
 static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1;
 static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64;
 
-unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20);
-static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1;
-static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL;
-
 /*
  * This sysctl does nothing.
  *
@@ -93,15 +89,6 @@ ctl_table rds_ib_sysctl_table[] = {
 		.extra1		= &rds_ib_sysctl_max_unsig_wr_min,
 		.extra2		= &rds_ib_sysctl_max_unsig_wr_max,
 	},
-	{
-		.procname       = "max_unsignaled_bytes",
-		.data		= &rds_ib_sysctl_max_unsig_bytes,
-		.maxlen         = sizeof(unsigned long),
-		.mode           = 0644,
-		.proc_handler   = proc_doulongvec_minmax,
-		.extra1		= &rds_ib_sysctl_max_unsig_bytes_min,
-		.extra2		= &rds_ib_sysctl_max_unsig_bytes_max,
-	},
 	{
 		.procname       = "max_recv_allocation",
 		.data		= &rds_ib_sysctl_max_recv_allocation,
-- 
cgit v1.2.3-59-g8ed1b


From c8de3f1005e8359ea07083e37f3f993646e1adba Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Fri, 15 Jan 2010 15:55:26 -0800
Subject: RDS/IB: Make all flow control code conditional on i_flowctl

Maybe things worked fine with the flow control code running
even in the non-flow-control case, but making it explicitly
conditional helps the non-fc case be easier to read.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/ib_send.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 48724b71f26c..d839b403d46b 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -489,7 +489,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 		if (credit_alloc < work_alloc) {
 			rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
 			work_alloc = credit_alloc;
-			flow_controlled++;
+			flow_controlled = 1;
 		}
 		if (work_alloc == 0) {
 			set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
@@ -552,9 +552,11 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 		/*
 		 * Update adv_credits since we reset the ACK_REQUIRED bit.
 		 */
-		rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
-		adv_credits += posted;
-		BUG_ON(adv_credits > 255);
+		if (ic->i_flowctl) {
+			rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
+			adv_credits += posted;
+			BUG_ON(adv_credits > 255);
+		}
 	}
 
 	/* Sometimes you want to put a fence between an RDMA
@@ -619,13 +621,13 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 		/*
 		 * Always signal the last one if we're stopping due to flow control.
 		 */
-		if (flow_controlled && i == (work_alloc-1))
+		if (ic->i_flowctl && flow_controlled && i == (work_alloc-1))
 			send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
 
 		rdsdebug("send %p wr %p num_sge %u next %p\n", send,
 			 &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
 
-		if (adv_credits) {
+		if (ic->i_flowctl && adv_credits) {
 			struct rds_header *hdr = &ic->i_send_hdrs[pos];
 
 			/* add credit and redo the header checksum */
-- 
cgit v1.2.3-59-g8ed1b


From d37c9359056f4f07b37e59810f0ece1031e280b2 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Tue, 19 Jan 2010 18:14:56 -0800
Subject: RDS: Move loop-only function to loop.c

Also, try to better-document the locking around the
rm and its m_inc in loop.c.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/loop.c    | 15 +++++++++++++--
 net/rds/message.c |  6 ------
 2 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/rds/loop.c b/net/rds/loop.c
index 9364de6dcd8b..4a3dd49315b4 100644
--- a/net/rds/loop.c
+++ b/net/rds/loop.c
@@ -64,7 +64,8 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
 	BUG_ON(hdr_off || sg || off);
 
 	rds_inc_init(&rm->m_inc, conn, conn->c_laddr);
-	rds_message_addref(rm); /* for the inc */
+	/* For the embedded inc. Matching put is in loop_inc_free() */
+	rds_message_addref(rm);
 
 	rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc,
 			  GFP_KERNEL, KM_USER0);
@@ -77,6 +78,16 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
 	return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len);
 }
 
+/*
+ * See rds_loop_xmit(). Since our inc is embedded in the rm, we
+ * make sure the rm lives at least until the inc is done.
+ */
+static void rds_loop_inc_free(struct rds_incoming *inc)
+{
+        struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
+        rds_message_put(rm);
+}
+
 static int rds_loop_xmit_cong_map(struct rds_connection *conn,
 				  struct rds_cong_map *map,
 				  unsigned long offset)
@@ -176,6 +187,6 @@ struct rds_transport rds_loop_transport = {
 	.conn_connect		= rds_loop_conn_connect,
 	.conn_shutdown		= rds_loop_conn_shutdown,
 	.inc_copy_to_user	= rds_message_inc_copy_to_user,
-	.inc_free		= rds_message_inc_free,
+	.inc_free		= rds_loop_inc_free,
 	.t_name			= "loopback",
 };
diff --git a/net/rds/message.c b/net/rds/message.c
index f681690b2bdc..3ea05c864cd4 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -89,12 +89,6 @@ void rds_message_put(struct rds_message *rm)
 }
 EXPORT_SYMBOL_GPL(rds_message_put);
 
-void rds_message_inc_free(struct rds_incoming *inc)
-{
-	struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
-	rds_message_put(rm);
-}
-
 void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
 				 __be16 dport, u64 seq)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 241eef3e2f51fe4ad50abacd7f79c4e2d468197e Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Tue, 19 Jan 2010 21:25:26 -0800
Subject: RDS: Implement silent atomics

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/ib.h      |  2 +-
 net/rds/ib_send.c | 62 ++++++++++++++++++++++++++++---------------------------
 net/rds/message.c |  2 ++
 net/rds/rds.h     |  3 ++-
 net/rds/send.c    | 11 +++++++---
 5 files changed, 45 insertions(+), 35 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.h b/net/rds/ib.h
index 148818174a04..96769b86a536 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -336,7 +336,7 @@ void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
 void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
 int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
 			     u32 *adv_credits, int need_posted, int max_posted);
-int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op);
+int rds_ib_xmit_atomic(struct rds_connection *conn, struct rds_message *rm);
 
 /* ib_stats.c */
 DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index d839b403d46b..e6745d827c3a 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -225,15 +225,12 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
 			/* In the error case, wc.opcode sometimes contains garbage */
 			switch (send->s_wr.opcode) {
 			case IB_WR_SEND:
-				if (send->s_rm)
-					rds_ib_send_unmap_rm(ic, send, wc.status);
-				break;
 			case IB_WR_RDMA_WRITE:
 			case IB_WR_RDMA_READ:
 			case IB_WR_ATOMIC_FETCH_AND_ADD:
 			case IB_WR_ATOMIC_CMP_AND_SWP:
-				/* Nothing to be done - the SG list will be unmapped
-				 * when the SEND completes. */
+				if (send->s_rm)
+					rds_ib_send_unmap_rm(ic, send, wc.status);
 				break;
 			default:
 				if (printk_ratelimit())
@@ -425,6 +422,21 @@ void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted)
 		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
 }
 
+static inline void rds_ib_set_wr_signal_state(struct rds_ib_connection *ic,
+					      struct rds_ib_send_work *send,
+					      bool notify)
+{
+	/*
+	 * We want to delay signaling completions just enough to get
+	 * the batching benefits but not so much that we create dead time
+	 * on the wire.
+	 */
+	if (ic->i_unsignaled_wrs-- == 0 || notify) {
+		ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
+		send->s_wr.send_flags |= IB_SEND_SIGNALED;
+	}
+}
+
 /*
  * This can be called multiple times for a given message.  The first time
  * we see a message we map its scatterlist into the IB device so that
@@ -517,7 +529,6 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 			rm->data.m_count = 0;
 		}
 
-		ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
 		rds_message_addref(rm);
 		ic->i_rm = rm;
 
@@ -608,15 +619,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 			}
 		}
 
-		/*
-		 * We want to delay signaling completions just enough to get
-		 * the batching benefits but not so much that we create dead time
-		 * on the wire.
-		 */
-		if (ic->i_unsignaled_wrs-- == 0) {
-			ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
-			send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
-		}
+		rds_ib_set_wr_signal_state(ic, send, 0);
 
 		/*
 		 * Always signal the last one if we're stopping due to flow control.
@@ -656,7 +659,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 	/* if we finished the message then send completion owns it */
 	if (scat == &rm->data.m_sg[rm->data.m_count]) {
 		prev->s_rm = ic->i_rm;
-		prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+		prev->s_wr.send_flags |= IB_SEND_SOLICITED;
 		ic->i_rm = NULL;
 	}
 
@@ -698,9 +701,10 @@ out:
  * A simplified version of the rdma case, we always map 1 SG, and
  * only 8 bytes, for the return value from the atomic operation.
  */
-int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
+int rds_ib_xmit_atomic(struct rds_connection *conn, struct rds_message *rm)
 {
 	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct rm_atomic_op *op = &rm->atomic;
 	struct rds_ib_send_work *send = NULL;
 	struct ib_send_wr *failed_wr;
 	struct rds_ib_device *rds_ibdev;
@@ -731,12 +735,20 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
 		send->s_wr.wr.atomic.compare_add = op->op_swap_add;
 		send->s_wr.wr.atomic.swap = 0;
 	}
-	send->s_wr.send_flags = IB_SEND_SIGNALED;
+	rds_ib_set_wr_signal_state(ic, send, op->op_notify);
 	send->s_wr.num_sge = 1;
 	send->s_wr.next = NULL;
 	send->s_wr.wr.atomic.remote_addr = op->op_remote_addr;
 	send->s_wr.wr.atomic.rkey = op->op_rkey;
 
+	/*
+	 * If there is no data or rdma ops in the message, then
+	 * we must fill in s_rm ourselves, so we properly clean up
+	 * on completion.
+	 */
+	if (!rm->rdma.m_rdma_op.r_active && !rm->data.op_active)
+		send->s_rm = rm;
+
 	/* map 8 byte retval buffer to the device */
 	ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE);
 	rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret);
@@ -836,14 +848,8 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 	for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) {
 		send->s_wr.send_flags = 0;
 		send->s_queued = jiffies;
-		/*
-		 * We want to delay signaling completions just enough to get
-		 * the batching benefits but not so much that we create dead time on the wire.
-		 */
-		if (ic->i_unsignaled_wrs-- == 0) {
-			ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
-			send->s_wr.send_flags = IB_SEND_SIGNALED;
-		}
+
+		rds_ib_set_wr_signal_state(ic, send, op->r_notify);
 
 		send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
 		send->s_wr.wr.rdma.remote_addr = remote_addr;
@@ -884,10 +890,6 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 			send = ic->i_sends;
 	}
 
-	/* if we finished the message then send completion owns it */
-	if (scat == &op->r_sg[op->r_count])
-		prev->s_wr.send_flags = IB_SEND_SIGNALED;
-
 	if (i < work_alloc) {
 		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
 		work_alloc = i;
diff --git a/net/rds/message.c b/net/rds/message.c
index 3ea05c864cd4..a27e493a63a2 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -325,6 +325,8 @@ int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
 			sg++;
 	}
 
+	rm->data.op_active = 1;
+
 out:
 	return ret;
 }
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 0c610a102c20..bf2349da4db7 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -341,6 +341,7 @@ struct rds_message {
 			struct rds_mr		*m_rdma_mr;
 		} rdma;
 		struct rm_data_op {
+			unsigned int		op_active:1;
 			unsigned int		m_nents;
 			unsigned int		m_count;
 			struct scatterlist	*m_sg;
@@ -418,7 +419,7 @@ struct rds_transport {
 	int (*xmit_cong_map)(struct rds_connection *conn,
 			     struct rds_cong_map *map, unsigned long offset);
 	int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op);
-	int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op);
+	int (*xmit_atomic)(struct rds_connection *conn, struct rds_message *rm);
 	int (*recv)(struct rds_connection *conn);
 	int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov,
 				size_t size);
diff --git a/net/rds/send.c b/net/rds/send.c
index 5bc35d2f40ea..42fb934293be 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -266,7 +266,7 @@ int rds_send_xmit(struct rds_connection *conn)
 
 
 		if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
-			ret = conn->c_trans->xmit_atomic(conn, &rm->atomic);
+			ret = conn->c_trans->xmit_atomic(conn, rm);
 			if (ret)
 				break;
 			conn->c_xmit_atomic_sent = 1;
@@ -285,13 +285,18 @@ int rds_send_xmit(struct rds_connection *conn)
 			if (ret)
 				break;
 			conn->c_xmit_rdma_sent = 1;
+
+			/* rdmas need data sent, even if just the header */
+			rm->data.op_active = 1;
+
 			/* The transport owns the mapped memory for now.
 			 * You can't unmap it while it's on the send queue */
 			set_bit(RDS_MSG_MAPPED, &rm->m_flags);
 		}
 
-		if (conn->c_xmit_hdr_off < sizeof(struct rds_header) ||
-		    conn->c_xmit_sg < rm->data.m_nents) {
+		if (rm->data.op_active
+		    && (conn->c_xmit_hdr_off < sizeof(struct rds_header) ||
+			conn->c_xmit_sg < rm->data.m_nents)) {
 			ret = conn->c_trans->xmit(conn, rm,
 						  conn->c_xmit_hdr_off,
 						  conn->c_xmit_sg,
-- 
cgit v1.2.3-59-g8ed1b


From 4324879df06ba4db01a0b455af2d003f117e6aa3 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Wed, 27 Jan 2010 16:07:30 -0800
Subject: RDS: Inline rdma_prepare into cmsg_rdma_args

cmsg_rdma_args just calls rdma_prepare and does a little
arg checking -- not quite enough to justify its existence.
Plus, it is the only caller of rdma_prepare().

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/rdma.c | 38 ++++++++++++--------------------------
 1 file changed, 12 insertions(+), 26 deletions(-)

(limited to 'net')

diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 4df545a6eab8..91967c8bc572 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -497,12 +497,13 @@ int rds_rdma_extra_size(struct rds_rdma_args *args)
 }
 
 /*
- * args is a pointer to an in-kernel copy in the sendmsg cmsg.
+ * The application asks for a RDMA transfer.
+ * Extract all arguments and set up the rdma_op
  */
-static int rds_rdma_prepare(struct rds_message *rm,
-			    struct rds_sock *rs,
-			    struct rds_rdma_args *args)
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg)
 {
+	struct rds_rdma_args *args;
 	struct rds_iovec vec;
 	struct rds_rdma_op *op = &rm->rdma.m_rdma_op;
 	unsigned int nr_pages;
@@ -513,6 +514,11 @@ static int rds_rdma_prepare(struct rds_message *rm,
 	unsigned int i, j;
 	int ret = 0;
 
+	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
+	    || rm->rdma.m_rdma_op.r_active)
+		return -EINVAL;
+
+	args = CMSG_DATA(cmsg);
 
 	if (rs->rs_bound_addr == 0) {
 		ret = -ENOTCONN; /* XXX not a great errno */
@@ -623,7 +629,6 @@ static int rds_rdma_prepare(struct rds_message *rm,
 		op->r_nents += nr;
 	}
 
-
 	if (nr_bytes > args->remote_vec.bytes) {
 		rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n",
 				nr_bytes,
@@ -639,28 +644,9 @@ out:
 	if (ret)
 		rds_rdma_free_op(op);
 
-	return ret;
-}
-
-/*
- * The application asks for a RDMA transfer.
- * Extract all arguments and set up the rdma_op
- */
-int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
-			  struct cmsghdr *cmsg)
-{
-	int ret;
-
-	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) ||
-	    rm->rdma.m_rdma_op.r_active)
-		return -EINVAL;
-
-	ret = rds_rdma_prepare(rm, rs, CMSG_DATA(cmsg));
-	if (ret)
-		return ret;
-
 	rds_stats_inc(s_send_rdma);
-	return 0;
+
+	return ret;
 }
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From d0ab25a83c4a08cd98b73a37d3f4c069f7b4f50b Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Wed, 27 Jan 2010 16:15:48 -0800
Subject: RDS: purge atomic resources too in rds_message_purge()

Add atomic_free_op function, analogous to rdma_free_op,
and call it in rds_message_purge().

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/message.c |  5 +++++
 net/rds/rdma.c    | 16 ++++++++++++++++
 net/rds/rds.h     |  1 +
 3 files changed, 22 insertions(+)

(limited to 'net')

diff --git a/net/rds/message.c b/net/rds/message.c
index a27e493a63a2..b53306c3e656 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -73,6 +73,11 @@ static void rds_message_purge(struct rds_message *rm)
 		rds_rdma_free_op(&rm->rdma.m_rdma_op);
 	if (rm->rdma.m_rdma_mr)
 		rds_mr_put(rm->rdma.m_rdma_mr);
+
+	if (rm->atomic.op_active)
+		rds_atomic_free_op(&rm->atomic);
+	if (rm->atomic.op_rdma_mr)
+		rds_mr_put(rm->atomic.op_rdma_mr);
 }
 
 void rds_message_put(struct rds_message *rm)
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 91967c8bc572..0df86a382e2e 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -462,6 +462,22 @@ void rds_rdma_free_op(struct rds_rdma_op *ro)
 	ro->r_active = 0;
 }
 
+void rds_atomic_free_op(struct rm_atomic_op *ao)
+{
+	struct page *page = sg_page(ao->op_sg);
+
+	/* Mark page dirty if it was possibly modified, which
+	 * is the case for a RDMA_READ which copies from remote
+	 * to local memory */
+	set_page_dirty(page);
+	put_page(page);
+
+	kfree(ao->op_notifier);
+	ao->op_notifier = NULL;
+	ao->op_active = 0;
+}
+
+
 /*
  * Count the number of pages needed to describe an incoming iovec.
  */
diff --git a/net/rds/rds.h b/net/rds/rds.h
index bf2349da4db7..32b3d46aea36 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -745,6 +745,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
 			  struct cmsghdr *cmsg);
 void rds_rdma_free_op(struct rds_rdma_op *ro);
+void rds_atomic_free_op(struct rm_atomic_op *ao);
 void rds_rdma_send_complete(struct rds_message *rm, int wc_status);
 void rds_atomic_send_complete(struct rds_message *rm, int wc_status);
 int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
-- 
cgit v1.2.3-59-g8ed1b


From f8b3aaf2ba8ca9e27b47f8bfdff07c8b968f2c05 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Mon, 1 Mar 2010 14:11:53 -0800
Subject: RDS: Remove struct rds_rdma_op

A big changeset, but it's all pretty dumb.

struct rds_rdma_op was already embedded in struct rm_rdma_op.
Remove rds_rdma_op and put its members in rm_rdma_op. Rename
members with "op_" prefix instead of "r_", for consistency.

Of course this breaks a lot, so fixup the code accordingly.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/ib.h      |  4 ++--
 net/rds/ib_send.c | 60 ++++++++++++++++++++++++------------------------
 net/rds/iw.h      |  4 ++--
 net/rds/iw_send.c | 68 +++++++++++++++++++++++++++----------------------------
 net/rds/message.c |  8 +++----
 net/rds/rdma.c    | 58 +++++++++++++++++++++++------------------------
 net/rds/rds.h     | 41 ++++++++++++++++-----------------
 net/rds/send.c    | 50 ++++++++++++++++++++--------------------
 8 files changed, 145 insertions(+), 148 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.h b/net/rds/ib.h
index 96769b86a536..d64b5087eefe 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -54,7 +54,7 @@ struct rds_ib_connect_private {
 
 struct rds_ib_send_work {
 	struct rds_message	*s_rm;
-	struct rds_rdma_op	*s_op;
+	struct rm_rdma_op	*s_op;
 	struct ib_send_wr	s_wr;
 	struct ib_sge		s_sge[RDS_IB_MAX_SGE];
 	unsigned long		s_queued;
@@ -331,7 +331,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context);
 void rds_ib_send_init_ring(struct rds_ib_connection *ic);
 void rds_ib_send_clear_ring(struct rds_ib_connection *ic);
-int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
 void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
 void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
 int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index e6745d827c3a..63981cd1827a 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -79,14 +79,14 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
 			rm->data.m_sg, rm->data.m_nents,
 			DMA_TO_DEVICE);
 
-	if (rm->rdma.m_rdma_op.r_active) {
-		struct rds_rdma_op *op = &rm->rdma.m_rdma_op;
+	if (rm->rdma.op_active) {
+		struct rm_rdma_op *op = &rm->rdma;
 
-		if (op->r_mapped) {
+		if (op->op_mapped) {
 			ib_dma_unmap_sg(ic->i_cm_id->device,
-					op->r_sg, op->r_nents,
-					op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-			op->r_mapped = 0;
+					op->op_sg, op->op_nents,
+					op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+			op->op_mapped = 0;
 		}
 
 		/* If the user asked for a completion notification on this
@@ -111,10 +111,10 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
 		 */
 		rds_ib_send_complete(rm, wc_status, rds_rdma_send_complete);
 
-		if (rm->rdma.m_rdma_op.r_write)
-			rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op.r_bytes);
+		if (rm->rdma.op_write)
+			rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes);
 		else
-			rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op.r_bytes);
+			rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes);
 	}
 
 	if (rm->atomic.op_active) {
@@ -540,10 +540,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 
 		/* If it has a RDMA op, tell the peer we did it. This is
 		 * used by the peer to release use-once RDMA MRs. */
-		if (rm->rdma.m_rdma_op.r_active) {
+		if (rm->rdma.op_active) {
 			struct rds_ext_header_rdma ext_hdr;
 
-			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op.r_key);
+			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
 			rds_message_add_extension(&rm->m_inc.i_hdr,
 					RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
 		}
@@ -576,7 +576,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 	 * or when requested by the user. Right now, we let
 	 * the application choose.
 	 */
-	if (rm->rdma.m_rdma_op.r_active && rm->rdma.m_rdma_op.r_fence)
+	if (rm->rdma.op_active && rm->rdma.op_fence)
 		send_flags = IB_SEND_FENCE;
 
 	/* Each frag gets a header. Msgs may be 0 bytes */
@@ -746,7 +746,7 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rds_message *rm)
 	 * we must fill in s_rm ourselves, so we properly clean up
 	 * on completion.
 	 */
-	if (!rm->rdma.m_rdma_op.r_active && !rm->data.op_active)
+	if (!rm->rdma.op_active && !rm->data.op_active)
 		send->s_rm = rm;
 
 	/* map 8 byte retval buffer to the device */
@@ -788,7 +788,7 @@ out:
 	return ret;
 }
 
-int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
 {
 	struct rds_ib_connection *ic = conn->c_transport_data;
 	struct rds_ib_send_work *send = NULL;
@@ -798,7 +798,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 	struct rds_ib_device *rds_ibdev;
 	struct scatterlist *scat;
 	unsigned long len;
-	u64 remote_addr = op->r_remote_addr;
+	u64 remote_addr = op->op_remote_addr;
 	u32 pos;
 	u32 work_alloc;
 	u32 i;
@@ -810,25 +810,25 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 	rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
 
 	/* map the message the first time we see it */
-	if (!op->r_mapped) {
-		op->r_count = ib_dma_map_sg(ic->i_cm_id->device,
-					op->r_sg, op->r_nents, (op->r_write) ?
-					DMA_TO_DEVICE : DMA_FROM_DEVICE);
-		rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count);
-		if (op->r_count == 0) {
+	if (!op->op_mapped) {
+		op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
+					     op->op_sg, op->op_nents, (op->op_write) ?
+					     DMA_TO_DEVICE : DMA_FROM_DEVICE);
+		rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
+		if (op->op_count == 0) {
 			rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
 			ret = -ENOMEM; /* XXX ? */
 			goto out;
 		}
 
-		op->r_mapped = 1;
+		op->op_mapped = 1;
 	}
 
 	/*
 	 * Instead of knowing how to return a partial rdma read/write we insist that there
 	 * be enough work requests to send the entire message.
 	 */
-	i = ceil(op->r_count, rds_ibdev->max_sge);
+	i = ceil(op->op_count, rds_ibdev->max_sge);
 
 	work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
 	if (work_alloc != i) {
@@ -841,19 +841,19 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 	send = &ic->i_sends[pos];
 	first = send;
 	prev = NULL;
-	scat = &op->r_sg[0];
+	scat = &op->op_sg[0];
 	sent = 0;
-	num_sge = op->r_count;
+	num_sge = op->op_count;
 
-	for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) {
+	for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
 		send->s_wr.send_flags = 0;
 		send->s_queued = jiffies;
 
-		rds_ib_set_wr_signal_state(ic, send, op->r_notify);
+		rds_ib_set_wr_signal_state(ic, send, op->op_notify);
 
-		send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
+		send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
 		send->s_wr.wr.rdma.remote_addr = remote_addr;
-		send->s_wr.wr.rdma.rkey = op->r_key;
+		send->s_wr.wr.rdma.rkey = op->op_rkey;
 		send->s_op = op;
 
 		if (num_sge > rds_ibdev->max_sge) {
@@ -868,7 +868,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 		if (prev)
 			prev->s_wr.next = &send->s_wr;
 
-		for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) {
+		for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
 			len = ib_sg_dma_len(ic->i_cm_id->device, scat);
 			send->s_sge[j].addr =
 				 ib_sg_dma_address(ic->i_cm_id->device, scat);
diff --git a/net/rds/iw.h b/net/rds/iw.h
index 6f08300851ad..f112105faced 100644
--- a/net/rds/iw.h
+++ b/net/rds/iw.h
@@ -70,7 +70,7 @@ struct rds_iw_send_work {
 	struct rds_message	*s_rm;
 
 	/* We should really put these into a union: */
-	struct rds_rdma_op	*s_op;
+	struct rm_rdma_op	*s_op;
 	struct rds_iw_mapping	*s_mapping;
 	struct ib_mr		*s_mr;
 	struct ib_fast_reg_page_list *s_page_list;
@@ -357,7 +357,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context);
 void rds_iw_send_init_ring(struct rds_iw_connection *ic);
 void rds_iw_send_clear_ring(struct rds_iw_connection *ic);
-int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
+int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
 void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits);
 void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted);
 int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted,
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
index 9b79a1b10445..05ebf16ecad7 100644
--- a/net/rds/iw_send.c
+++ b/net/rds/iw_send.c
@@ -63,13 +63,13 @@ static void rds_iw_send_rdma_complete(struct rds_message *rm,
 }
 
 static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic,
-				   struct rds_rdma_op *op)
+				   struct rm_rdma_op *op)
 {
-	if (op->r_mapped) {
+	if (op->op_mapped) {
 		ib_dma_unmap_sg(ic->i_cm_id->device,
-			op->r_sg, op->r_nents,
-			op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-		op->r_mapped = 0;
+			op->op_sg, op->op_nents,
+			op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+		op->op_mapped = 0;
 	}
 }
 
@@ -85,8 +85,8 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
 		     rm->data.m_sg, rm->data.m_nents,
 		     DMA_TO_DEVICE);
 
-	if (rm->rdma.m_rdma_op.r_active) {
-		rds_iw_send_unmap_rdma(ic, &rm->rdma.m_rdma_op);
+	if (rm->rdma.op_active) {
+		rds_iw_send_unmap_rdma(ic, &rm->rdma);
 
 		/* If the user asked for a completion notification on this
 		 * message, we can implement three different semantics:
@@ -110,10 +110,10 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
 		 */
 		rds_iw_send_rdma_complete(rm, wc_status);
 
-		if (rm->rdma.m_rdma_op.r_write)
-			rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op.r_bytes);
+		if (rm->rdma.op_write)
+			rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes);
 		else
-			rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op.r_bytes);
+			rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes);
 	}
 
 	/* If anyone waited for this message to get flushed out, wake
@@ -591,10 +591,10 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 
 		/* If it has a RDMA op, tell the peer we did it. This is
 		 * used by the peer to release use-once RDMA MRs. */
-		if (rm->rdma.m_rdma_op.r_active) {
+		if (rm->rdma.op_active) {
 			struct rds_ext_header_rdma ext_hdr;
 
-			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op.r_key);
+			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
 			rds_message_add_extension(&rm->m_inc.i_hdr,
 					RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
 		}
@@ -632,7 +632,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 	 * or when requested by the user. Right now, we let
 	 * the application choose.
 	 */
-	if (rm->rdma.m_rdma_op.r_active && rm->rdma.m_rdma_op.r_fence)
+	if (rm->rdma.op_active && rm->rdma.op_fence)
 		send_flags = IB_SEND_FENCE;
 
 	/*
@@ -785,7 +785,7 @@ static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rd
 	ib_update_fast_reg_key(send->s_mr, send->s_remap_count++);
 }
 
-int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
+int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
 {
 	struct rds_iw_connection *ic = conn->c_transport_data;
 	struct rds_iw_send_work *send = NULL;
@@ -795,7 +795,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 	struct rds_iw_device *rds_iwdev;
 	struct scatterlist *scat;
 	unsigned long len;
-	u64 remote_addr = op->r_remote_addr;
+	u64 remote_addr = op->op_remote_addr;
 	u32 pos, fr_pos;
 	u32 work_alloc;
 	u32 i;
@@ -807,21 +807,21 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 	rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
 
 	/* map the message the first time we see it */
-	if (!op->r_mapped) {
-		op->r_count = ib_dma_map_sg(ic->i_cm_id->device,
-					op->r_sg, op->r_nents, (op->r_write) ?
-					DMA_TO_DEVICE : DMA_FROM_DEVICE);
-		rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count);
-		if (op->r_count == 0) {
+	if (!op->op_mapped) {
+		op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
+					     op->op_sg, op->op_nents, (op->op_write) ?
+					     DMA_TO_DEVICE : DMA_FROM_DEVICE);
+		rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
+		if (op->op_count == 0) {
 			rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
 			ret = -ENOMEM; /* XXX ? */
 			goto out;
 		}
 
-		op->r_mapped = 1;
+		op->op_mapped = 1;
 	}
 
-	if (!op->r_write) {
+	if (!op->op_write) {
 		/* Alloc space on the send queue for the fastreg */
 		work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos);
 		if (work_alloc != 1) {
@@ -836,7 +836,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 	 * Instead of knowing how to return a partial rdma read/write we insist that there
 	 * be enough work requests to send the entire message.
 	 */
-	i = ceil(op->r_count, rds_iwdev->max_sge);
+	i = ceil(op->op_count, rds_iwdev->max_sge);
 
 	work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
 	if (work_alloc != i) {
@@ -847,17 +847,17 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 	}
 
 	send = &ic->i_sends[pos];
-	if (!op->r_write) {
+	if (!op->op_write) {
 		first = prev = &ic->i_sends[fr_pos];
 	} else {
 		first = send;
 		prev = NULL;
 	}
-	scat = &op->r_sg[0];
+	scat = &op->op_sg[0];
 	sent = 0;
-	num_sge = op->r_count;
+	num_sge = op->op_count;
 
-	for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) {
+	for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
 		send->s_wr.send_flags = 0;
 		send->s_queued = jiffies;
 
@@ -874,13 +874,13 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 		 * for local access after RDS is finished with it, using
 		 * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed.
 		 */
-		if (op->r_write)
+		if (op->op_write)
 			send->s_wr.opcode = IB_WR_RDMA_WRITE;
 		else
 			send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
 
 		send->s_wr.wr.rdma.remote_addr = remote_addr;
-		send->s_wr.wr.rdma.rkey = op->r_key;
+		send->s_wr.wr.rdma.rkey = op->op_rkey;
 		send->s_op = op;
 
 		if (num_sge > rds_iwdev->max_sge) {
@@ -894,7 +894,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 		if (prev)
 			prev->s_wr.next = &send->s_wr;
 
-		for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) {
+		for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
 			len = ib_sg_dma_len(ic->i_cm_id->device, scat);
 
 			if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV)
@@ -928,7 +928,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 	}
 
 	/* if we finished the message then send completion owns it */
-	if (scat == &op->r_sg[op->r_count])
+	if (scat == &op->op_sg[op->op_count])
 		first->s_wr.send_flags = IB_SEND_SIGNALED;
 
 	if (i < work_alloc) {
@@ -942,9 +942,9 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 	 * adapters do not allow using the lkey for this at all.  To bypass this use a
 	 * fastreg_mr (or possibly a dma_mr)
 	 */
-	if (!op->r_write) {
+	if (!op->op_write) {
 		rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos],
-			op->r_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
+			op->op_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
 		work_alloc++;
 	}
 
diff --git a/net/rds/message.c b/net/rds/message.c
index b53306c3e656..bca7eda6dde9 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -69,10 +69,10 @@ static void rds_message_purge(struct rds_message *rm)
 	}
 	rm->data.m_nents = 0;
 
-	if (rm->rdma.m_rdma_op.r_active)
-		rds_rdma_free_op(&rm->rdma.m_rdma_op);
-	if (rm->rdma.m_rdma_mr)
-		rds_mr_put(rm->rdma.m_rdma_mr);
+	if (rm->rdma.op_active)
+		rds_rdma_free_op(&rm->rdma);
+	if (rm->rdma.op_rdma_mr)
+		rds_mr_put(rm->rdma.op_rdma_mr);
 
 	if (rm->atomic.op_active)
 		rds_atomic_free_op(&rm->atomic);
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 0df86a382e2e..8d22999b0471 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -440,26 +440,26 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
 	rds_mr_put(mr);
 }
 
-void rds_rdma_free_op(struct rds_rdma_op *ro)
+void rds_rdma_free_op(struct rm_rdma_op *ro)
 {
 	unsigned int i;
 
-	for (i = 0; i < ro->r_nents; i++) {
-		struct page *page = sg_page(&ro->r_sg[i]);
+	for (i = 0; i < ro->op_nents; i++) {
+		struct page *page = sg_page(&ro->op_sg[i]);
 
 		/* Mark page dirty if it was possibly modified, which
 		 * is the case for a RDMA_READ which copies from remote
 		 * to local memory */
-		if (!ro->r_write) {
+		if (!ro->op_write) {
 			BUG_ON(irqs_disabled());
 			set_page_dirty(page);
 		}
 		put_page(page);
 	}
 
-	kfree(ro->r_notifier);
-	ro->r_notifier = NULL;
-	ro->r_active = 0;
+	kfree(ro->op_notifier);
+	ro->op_notifier = NULL;
+	ro->op_active = 0;
 }
 
 void rds_atomic_free_op(struct rm_atomic_op *ao)
@@ -521,7 +521,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 {
 	struct rds_rdma_args *args;
 	struct rds_iovec vec;
-	struct rds_rdma_op *op = &rm->rdma.m_rdma_op;
+	struct rm_rdma_op *op = &rm->rdma;
 	unsigned int nr_pages;
 	unsigned int nr_bytes;
 	struct page **pages = NULL;
@@ -531,7 +531,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 	int ret = 0;
 
 	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
-	    || rm->rdma.m_rdma_op.r_active)
+	    || rm->rdma.op_active)
 		return -EINVAL;
 
 	args = CMSG_DATA(cmsg);
@@ -556,27 +556,27 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 		goto out;
 	}
 
-	op->r_write = !!(args->flags & RDS_RDMA_READWRITE);
-	op->r_fence = !!(args->flags & RDS_RDMA_FENCE);
-	op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
-	op->r_active = 1;
-	op->r_recverr = rs->rs_recverr;
+	op->op_write = !!(args->flags & RDS_RDMA_READWRITE);
+	op->op_fence = !!(args->flags & RDS_RDMA_FENCE);
+	op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+	op->op_active = 1;
+	op->op_recverr = rs->rs_recverr;
 	WARN_ON(!nr_pages);
-	op->r_sg = rds_message_alloc_sgs(rm, nr_pages);
+	op->op_sg = rds_message_alloc_sgs(rm, nr_pages);
 
-	if (op->r_notify || op->r_recverr) {
+	if (op->op_notify || op->op_recverr) {
 		/* We allocate an uninitialized notifier here, because
 		 * we don't want to do that in the completion handler. We
 		 * would have to use GFP_ATOMIC there, and don't want to deal
 		 * with failed allocations.
 		 */
-		op->r_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
-		if (!op->r_notifier) {
+		op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
+		if (!op->op_notifier) {
 			ret = -ENOMEM;
 			goto out;
 		}
-		op->r_notifier->n_user_token = args->user_token;
-		op->r_notifier->n_status = RDS_RDMA_SUCCESS;
+		op->op_notifier->n_user_token = args->user_token;
+		op->op_notifier->n_status = RDS_RDMA_SUCCESS;
 	}
 
 	/* The cookie contains the R_Key of the remote memory region, and
@@ -586,15 +586,15 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 	 * destination address (which is really an offset into the MR)
 	 * FIXME: We may want to move this into ib_rdma.c
 	 */
-	op->r_key = rds_rdma_cookie_key(args->cookie);
-	op->r_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
+	op->op_rkey = rds_rdma_cookie_key(args->cookie);
+	op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
 
 	nr_bytes = 0;
 
 	rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n",
 	       (unsigned long long)args->nr_local,
 	       (unsigned long long)args->remote_vec.addr,
-	       op->r_key);
+	       op->op_rkey);
 
 	local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
 
@@ -617,7 +617,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 		/* If it's a WRITE operation, we want to pin the pages for reading.
 		 * If it's a READ operation, we need to pin the pages for writing.
 		 */
-		ret = rds_pin_pages(vec.addr, nr, pages, !op->r_write);
+		ret = rds_pin_pages(vec.addr, nr, pages, !op->op_write);
 		if (ret < 0)
 			goto out;
 
@@ -630,7 +630,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 			unsigned int offset = vec.addr & ~PAGE_MASK;
 			struct scatterlist *sg;
 
-			sg = &op->r_sg[op->r_nents + j];
+			sg = &op->op_sg[op->op_nents + j];
 			sg_set_page(sg, pages[j],
 					min_t(unsigned int, vec.bytes, PAGE_SIZE - offset),
 					offset);
@@ -642,7 +642,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 			vec.bytes -= sg->length;
 		}
 
-		op->r_nents += nr;
+		op->op_nents += nr;
 	}
 
 	if (nr_bytes > args->remote_vec.bytes) {
@@ -652,7 +652,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 		ret = -EINVAL;
 		goto out;
 	}
-	op->r_bytes = nr_bytes;
+	op->op_bytes = nr_bytes;
 
 	ret = 0;
 out:
@@ -700,7 +700,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
 
 	if (mr) {
 		mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
-		rm->rdma.m_rdma_mr = mr;
+		rm->rdma.op_rdma_mr = mr;
 	}
 	return err;
 }
@@ -718,7 +718,7 @@ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
 	    rm->m_rdma_cookie != 0)
 		return -EINVAL;
 
-	return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.m_rdma_mr);
+	return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr);
 }
 
 /*
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 32b3d46aea36..76eeb5988b5f 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -230,22 +230,6 @@ struct rds_mr {
 /* Flags for mr->r_state */
 #define RDS_MR_DEAD		0
 
-struct rds_rdma_op {
-	u32			r_key;
-	u64			r_remote_addr;
-	unsigned int		r_write:1;
-	unsigned int		r_fence:1;
-	unsigned int		r_notify:1;
-	unsigned int		r_recverr:1;
-	unsigned int		r_mapped:1;
-	unsigned int		r_active:1;
-	struct rds_notifier	*r_notifier;
-	unsigned int		r_bytes;
-	unsigned int		r_nents;
-	unsigned int		r_count;
-	struct scatterlist	*r_sg;
-};
-
 static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
 {
 	return r_key | (((u64) offset) << 32);
@@ -331,14 +315,27 @@ struct rds_message {
 			unsigned int		op_recverr:1;
 			unsigned int		op_mapped:1;
 			unsigned int		op_active:1;
-			struct rds_notifier	*op_notifier;
 			struct scatterlist	*op_sg;
+			struct rds_notifier	*op_notifier;
 
 			struct rds_mr		*op_rdma_mr;
 		} atomic;
 		struct rm_rdma_op {
-			struct rds_rdma_op	m_rdma_op;
-			struct rds_mr		*m_rdma_mr;
+			u32			op_rkey;
+			u64			op_remote_addr;
+			unsigned int		op_write:1;
+			unsigned int		op_fence:1;
+			unsigned int		op_notify:1;
+			unsigned int		op_recverr:1;
+			unsigned int		op_mapped:1;
+			unsigned int		op_active:1;
+			unsigned int		op_bytes;
+			unsigned int		op_nents;
+			unsigned int		op_count;
+			struct scatterlist	*op_sg;
+			struct rds_notifier	*op_notifier;
+
+			struct rds_mr		*op_rdma_mr;
 		} rdma;
 		struct rm_data_op {
 			unsigned int		op_active:1;
@@ -418,7 +415,7 @@ struct rds_transport {
 		    unsigned int hdr_off, unsigned int sg, unsigned int off);
 	int (*xmit_cong_map)(struct rds_connection *conn,
 			     struct rds_cong_map *map, unsigned long offset);
-	int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op);
+	int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op);
 	int (*xmit_atomic)(struct rds_connection *conn, struct rds_message *rm);
 	int (*recv)(struct rds_connection *conn);
 	int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov,
@@ -727,7 +724,7 @@ int rds_send_acked_before(struct rds_connection *conn, u64 seq);
 void rds_send_remove_from_sock(struct list_head *messages, int status);
 int rds_send_pong(struct rds_connection *conn, __be16 dport);
 struct rds_message *rds_send_get_message(struct rds_connection *,
-					 struct rds_rdma_op *);
+					 struct rm_rdma_op *);
 
 /* rdma.c */
 void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
@@ -744,7 +741,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 			  struct cmsghdr *cmsg);
 int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
 			  struct cmsghdr *cmsg);
-void rds_rdma_free_op(struct rds_rdma_op *ro);
+void rds_rdma_free_op(struct rm_rdma_op *ro);
 void rds_atomic_free_op(struct rm_atomic_op *ao);
 void rds_rdma_send_complete(struct rds_message *rm, int wc_status);
 void rds_atomic_send_complete(struct rds_message *rm, int wc_status);
diff --git a/net/rds/send.c b/net/rds/send.c
index 42fb934293be..08df279ced2a 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -237,7 +237,7 @@ int rds_send_xmit(struct rds_connection *conn)
 			 * connection.
 			 * Therefore, we never retransmit messages with RDMA ops.
 			 */
-			if (rm->rdma.m_rdma_op.r_active &&
+			if (rm->rdma.op_active &&
 			    test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
 				spin_lock_irqsave(&conn->c_lock, flags);
 				if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
@@ -280,8 +280,8 @@ int rds_send_xmit(struct rds_connection *conn)
 		 * keep this simple and require that the transport either
 		 * send the whole rdma or none of it.
 		 */
-		if (rm->rdma.m_rdma_op.r_active && !conn->c_xmit_rdma_sent) {
-			ret = conn->c_trans->xmit_rdma(conn, &rm->rdma.m_rdma_op);
+		if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) {
+			ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
 			if (ret)
 				break;
 			conn->c_xmit_rdma_sent = 1;
@@ -430,16 +430,16 @@ int rds_send_acked_before(struct rds_connection *conn, u64 seq)
 void rds_rdma_send_complete(struct rds_message *rm, int status)
 {
 	struct rds_sock *rs = NULL;
-	struct rds_rdma_op *ro;
+	struct rm_rdma_op *ro;
 	struct rds_notifier *notifier;
 	unsigned long flags;
 
 	spin_lock_irqsave(&rm->m_rs_lock, flags);
 
-	ro = &rm->rdma.m_rdma_op;
+	ro = &rm->rdma;
 	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
-	    ro->r_active && ro->r_notify && ro->r_notifier) {
-		notifier = ro->r_notifier;
+	    ro->op_active && ro->op_notify && ro->op_notifier) {
+		notifier = ro->op_notifier;
 		rs = rm->m_rs;
 		sock_hold(rds_rs_to_sk(rs));
 
@@ -448,7 +448,7 @@ void rds_rdma_send_complete(struct rds_message *rm, int status)
 		list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
 		spin_unlock(&rs->rs_lock);
 
-		ro->r_notifier = NULL;
+		ro->op_notifier = NULL;
 	}
 
 	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
@@ -503,13 +503,13 @@ EXPORT_SYMBOL_GPL(rds_atomic_send_complete);
 static inline void
 __rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
 {
-	struct rds_rdma_op *ro;
+	struct rm_rdma_op *ro;
 
-	ro = &rm->rdma.m_rdma_op;
-	if (ro->r_active && ro->r_notify && ro->r_notifier) {
-		ro->r_notifier->n_status = status;
-		list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue);
-		ro->r_notifier = NULL;
+	ro = &rm->rdma;
+	if (ro->op_active && ro->op_notify && ro->op_notifier) {
+		ro->op_notifier->n_status = status;
+		list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue);
+		ro->op_notifier = NULL;
 	}
 
 	/* No need to wake the app - caller does this */
@@ -521,7 +521,7 @@ __rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status
  * So speed is not an issue here.
  */
 struct rds_message *rds_send_get_message(struct rds_connection *conn,
-					 struct rds_rdma_op *op)
+					 struct rm_rdma_op *op)
 {
 	struct rds_message *rm, *tmp, *found = NULL;
 	unsigned long flags;
@@ -529,7 +529,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn,
 	spin_lock_irqsave(&conn->c_lock, flags);
 
 	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
-		if (&rm->rdma.m_rdma_op == op) {
+		if (&rm->rdma == op) {
 			atomic_inc(&rm->m_refcount);
 			found = rm;
 			goto out;
@@ -537,7 +537,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn,
 	}
 
 	list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
-		if (&rm->rdma.m_rdma_op == op) {
+		if (&rm->rdma == op) {
 			atomic_inc(&rm->m_refcount);
 			found = rm;
 			break;
@@ -597,20 +597,20 @@ void rds_send_remove_from_sock(struct list_head *messages, int status)
 		spin_lock(&rs->rs_lock);
 
 		if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
-			struct rds_rdma_op *ro = &rm->rdma.m_rdma_op;
+			struct rm_rdma_op *ro = &rm->rdma;
 			struct rds_notifier *notifier;
 
 			list_del_init(&rm->m_sock_item);
 			rds_send_sndbuf_remove(rs, rm);
 
-			if (ro->r_active && ro->r_notifier &&
-			    (ro->r_notify || (ro->r_recverr && status))) {
-				notifier = ro->r_notifier;
+			if (ro->op_active && ro->op_notifier &&
+			       (ro->op_notify || (ro->op_recverr && status))) {
+				notifier = ro->op_notifier;
 				list_add_tail(&notifier->n_list,
 						&rs->rs_notify_queue);
 				if (!notifier->n_status)
 					notifier->n_status = status;
-				rm->rdma.m_rdma_op.r_notifier = NULL;
+				rm->rdma.op_notifier = NULL;
 			}
 			was_on_sock = 1;
 			rm->m_rs = NULL;
@@ -987,11 +987,11 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 	if (ret)
 		goto out;
 
-	if ((rm->m_rdma_cookie || rm->rdma.m_rdma_op.r_active) &&
-	       !conn->c_trans->xmit_rdma) {
+	if ((rm->m_rdma_cookie || rm->rdma.op_active) &&
+	    !conn->c_trans->xmit_rdma) {
 		if (printk_ratelimit())
 			printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
-			       &rm->rdma.m_rdma_op, conn->c_trans->xmit_rdma);
+			       &rm->rdma, conn->c_trans->xmit_rdma);
 		ret = -EOPNOTSUPP;
 		goto out;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 6c7cc6e4694dc464ae884332f2a322973497e3cf Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Wed, 27 Jan 2010 18:04:18 -0800
Subject: RDS: Rename data op members prefix from m_ to op_

For consistency.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/ib_send.c  | 26 +++++++++++++-------------
 net/rds/iw_send.c  | 24 ++++++++++++------------
 net/rds/message.c  | 22 +++++++++++-----------
 net/rds/rds.h      |  6 +++---
 net/rds/send.c     | 10 +++++-----
 net/rds/tcp_send.c | 14 +++++++-------
 6 files changed, 51 insertions(+), 51 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 63981cd1827a..95f15247acd7 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -76,7 +76,7 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
 	rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
 
 	ib_dma_unmap_sg(ic->i_cm_id->device,
-			rm->data.m_sg, rm->data.m_nents,
+			rm->data.op_sg, rm->data.op_nents,
 			DMA_TO_DEVICE);
 
 	if (rm->rdma.op_active) {
@@ -513,20 +513,20 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 
 	/* map the message the first time we see it */
 	if (!ic->i_rm) {
-		if (rm->data.m_nents) {
-			rm->data.m_count = ib_dma_map_sg(dev,
-							    rm->data.m_sg,
-							    rm->data.m_nents,
-							    DMA_TO_DEVICE);
-			rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.m_count);
-			if (rm->data.m_count == 0) {
+		if (rm->data.op_nents) {
+			rm->data.op_count = ib_dma_map_sg(dev,
+							  rm->data.op_sg,
+							  rm->data.op_nents,
+							  DMA_TO_DEVICE);
+			rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
+			if (rm->data.op_count == 0) {
 				rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
 				rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
 				ret = -ENOMEM; /* XXX ? */
 				goto out;
 			}
 		} else {
-			rm->data.m_count = 0;
+			rm->data.op_count = 0;
 		}
 
 		rds_message_addref(rm);
@@ -583,7 +583,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 	send = &ic->i_sends[pos];
 	first = send;
 	prev = NULL;
-	scat = &rm->data.m_sg[sg];
+	scat = &rm->data.op_sg[sg];
 	i = 0;
 	do {
 		unsigned int len = 0;
@@ -604,7 +604,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 
 		/* Set up the data, if present */
 		if (i < work_alloc
-		    && scat != &rm->data.m_sg[rm->data.m_count]) {
+		    && scat != &rm->data.op_sg[rm->data.op_count]) {
 			len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
 			send->s_wr.num_sge = 2;
 
@@ -649,7 +649,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 		i++;
 
 	} while (i < work_alloc
-		 && scat != &rm->data.m_sg[rm->data.m_count]);
+		 && scat != &rm->data.op_sg[rm->data.op_count]);
 
 	/* Account the RDS header in the number of bytes we sent, but just once.
 	 * The caller has no concept of fragmentation. */
@@ -657,7 +657,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 		bytes_sent += sizeof(struct rds_header);
 
 	/* if we finished the message then send completion owns it */
-	if (scat == &rm->data.m_sg[rm->data.m_count]) {
+	if (scat == &rm->data.op_sg[rm->data.op_count]) {
 		prev->s_rm = ic->i_rm;
 		prev->s_wr.send_flags |= IB_SEND_SOLICITED;
 		ic->i_rm = NULL;
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
index 05ebf16ecad7..6280ea020d4e 100644
--- a/net/rds/iw_send.c
+++ b/net/rds/iw_send.c
@@ -82,7 +82,7 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
 	rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
 
 	ib_dma_unmap_sg(ic->i_cm_id->device,
-		     rm->data.m_sg, rm->data.m_nents,
+		     rm->data.op_sg, rm->data.op_nents,
 		     DMA_TO_DEVICE);
 
 	if (rm->rdma.op_active) {
@@ -562,20 +562,20 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 				rm->m_inc.i_hdr.h_flags,
 				be32_to_cpu(rm->m_inc.i_hdr.h_len));
 		   */
-		if (rm->data.m_nents) {
-			rm->data.m_count = ib_dma_map_sg(dev,
-						    rm->data.m_sg,
-						    rm->data.m_nents,
-						    DMA_TO_DEVICE);
-			rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.m_count);
-			if (rm->data.m_count == 0) {
+		if (rm->data.op_nents) {
+			rm->data.op_count = ib_dma_map_sg(dev,
+							  rm->data.op_sg,
+							  rm->data.op_nents,
+							  DMA_TO_DEVICE);
+			rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
+			if (rm->data.op_count == 0) {
 				rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
 				rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
 				ret = -ENOMEM; /* XXX ? */
 				goto out;
 			}
 		} else {
-			rm->data.m_count = 0;
+			rm->data.op_count = 0;
 		}
 
 		ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
@@ -622,7 +622,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 	send = &ic->i_sends[pos];
 	first = send;
 	prev = NULL;
-	scat = &rm->data.m_sg[sg];
+	scat = &rm->data.op_sg[sg];
 	sent = 0;
 	i = 0;
 
@@ -651,7 +651,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 	}
 
 	/* if there's data reference it with a chain of work reqs */
-	for (; i < work_alloc && scat != &rm->data.m_sg[rm->data.m_count]; i++) {
+	for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) {
 		unsigned int len;
 
 		send = &ic->i_sends[pos];
@@ -729,7 +729,7 @@ add_header:
 		sent += sizeof(struct rds_header);
 
 	/* if we finished the message then send completion owns it */
-	if (scat == &rm->data.m_sg[rm->data.m_count]) {
+	if (scat == &rm->data.op_sg[rm->data.op_count]) {
 		prev->s_rm = ic->i_rm;
 		prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
 		ic->i_rm = NULL;
diff --git a/net/rds/message.c b/net/rds/message.c
index bca7eda6dde9..4bd9504ca048 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -62,12 +62,12 @@ static void rds_message_purge(struct rds_message *rm)
 	if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
 		return;
 
-	for (i = 0; i < rm->data.m_nents; i++) {
-		rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.m_sg[i]));
+	for (i = 0; i < rm->data.op_nents; i++) {
+		rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i]));
 		/* XXX will have to put_page for page refs */
-		__free_page(sg_page(&rm->data.m_sg[i]));
+		__free_page(sg_page(&rm->data.op_sg[i]));
 	}
-	rm->data.m_nents = 0;
+	rm->data.op_nents = 0;
 
 	if (rm->rdma.op_active)
 		rds_rdma_free_op(&rm->rdma);
@@ -261,11 +261,11 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
 
 	set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
 	rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
-	rm->data.m_nents = ceil(total_len, PAGE_SIZE);
-	rm->data.m_sg = rds_message_alloc_sgs(rm, num_sgs);
+	rm->data.op_nents = ceil(total_len, PAGE_SIZE);
+	rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
 
-	for (i = 0; i < rm->data.m_nents; ++i) {
-		sg_set_page(&rm->data.m_sg[i],
+	for (i = 0; i < rm->data.op_nents; ++i) {
+		sg_set_page(&rm->data.op_sg[i],
 				virt_to_page(page_addrs[i]),
 				PAGE_SIZE, 0);
 	}
@@ -288,7 +288,7 @@ int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
 	/*
 	 * now allocate and copy in the data payload.
 	 */
-	sg = rm->data.m_sg;
+	sg = rm->data.op_sg;
 	iov = first_iov;
 	iov_off = 0;
 	sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
@@ -299,7 +299,7 @@ int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
 						       GFP_HIGHUSER);
 			if (ret)
 				goto out;
-			rm->data.m_nents++;
+			rm->data.op_nents++;
 			sg_off = 0;
 		}
 
@@ -354,7 +354,7 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc,
 
 	iov = first_iov;
 	iov_off = 0;
-	sg = rm->data.m_sg;
+	sg = rm->data.op_sg;
 	vec_off = 0;
 	copied = 0;
 
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 76eeb5988b5f..d70284989124 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -339,9 +339,9 @@ struct rds_message {
 		} rdma;
 		struct rm_data_op {
 			unsigned int		op_active:1;
-			unsigned int		m_nents;
-			unsigned int		m_count;
-			struct scatterlist	*m_sg;
+			unsigned int		op_nents;
+			unsigned int		op_count;
+			struct scatterlist	*op_sg;
 		} data;
 	};
 	unsigned int		m_used_sgs;
diff --git a/net/rds/send.c b/net/rds/send.c
index 08df279ced2a..d60d31309032 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -166,7 +166,7 @@ int rds_send_xmit(struct rds_connection *conn)
 		rm = conn->c_xmit_rm;
 		if (rm &&
 		    conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
-		    conn->c_xmit_sg == rm->data.m_nents) {
+		    conn->c_xmit_sg == rm->data.op_nents) {
 			conn->c_xmit_rm = NULL;
 			conn->c_xmit_sg = 0;
 			conn->c_xmit_hdr_off = 0;
@@ -296,7 +296,7 @@ int rds_send_xmit(struct rds_connection *conn)
 
 		if (rm->data.op_active
 		    && (conn->c_xmit_hdr_off < sizeof(struct rds_header) ||
-			conn->c_xmit_sg < rm->data.m_nents)) {
+			conn->c_xmit_sg < rm->data.op_nents)) {
 			ret = conn->c_trans->xmit(conn, rm,
 						  conn->c_xmit_hdr_off,
 						  conn->c_xmit_sg,
@@ -312,7 +312,7 @@ int rds_send_xmit(struct rds_connection *conn)
 				ret -= tmp;
 			}
 
-			sg = &rm->data.m_sg[conn->c_xmit_sg];
+			sg = &rm->data.op_sg[conn->c_xmit_sg];
 			while (ret) {
 				tmp = min_t(int, ret, sg->length -
 						      conn->c_xmit_data_off);
@@ -323,7 +323,7 @@ int rds_send_xmit(struct rds_connection *conn)
 					sg++;
 					conn->c_xmit_sg++;
 					BUG_ON(ret != 0 &&
-					       conn->c_xmit_sg == rm->data.m_nents);
+					       conn->c_xmit_sg == rm->data.op_nents);
 				}
 			}
 		}
@@ -959,7 +959,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 		goto out;
 	}
 
-	rm->data.m_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE));
+	rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE));
 	/* XXX fix this to not allocate memory */
 	ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len);
 	if (ret)
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index d63aa35ac673..53c1de55f471 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -166,21 +166,21 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
 			goto out;
 	}
 
-	while (sg < rm->data.m_nents) {
+	while (sg < rm->data.op_nents) {
 		ret = tc->t_sock->ops->sendpage(tc->t_sock,
-						sg_page(&rm->data.m_sg[sg]),
-						rm->data.m_sg[sg].offset + off,
-						rm->data.m_sg[sg].length - off,
+						sg_page(&rm->data.op_sg[sg]),
+						rm->data.op_sg[sg].offset + off,
+						rm->data.op_sg[sg].length - off,
 						MSG_DONTWAIT|MSG_NOSIGNAL);
-		rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.m_sg[sg]),
-			 rm->data.m_sg[sg].offset + off, rm->data.m_sg[sg].length - off,
+		rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]),
+			 rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off,
 			 ret);
 		if (ret <= 0)
 			break;
 
 		off += ret;
 		done += ret;
-		if (off == rm->data.m_sg[sg].length) {
+		if (off == rm->data.op_sg[sg].length) {
 			off = 0;
 			sg++;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From 5b2366bd2835919e2e6a836e837eab4a9274bd46 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Wed, 3 Feb 2010 19:36:44 -0800
Subject: RDS: Rewrite rds_send_xmit

Simplify rds_send_xmit().

Send a congestion map (via xmit_cong_map) without
decrementing send_quota.

Move resetting of conn xmit variables to end of loop.

Update comments.

Implement a special case to turn off sending an rds header
when there is an atomic op and no other data.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/rds.h  |   5 +--
 net/rds/send.c | 131 +++++++++++++++++++++++++++++++--------------------------
 2 files changed, 73 insertions(+), 63 deletions(-)

(limited to 'net')

diff --git a/net/rds/rds.h b/net/rds/rds.h
index d70284989124..13ed30ac424d 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -96,8 +96,9 @@ struct rds_connection {
 	unsigned long		c_xmit_sg;
 	unsigned int		c_xmit_hdr_off;
 	unsigned int		c_xmit_data_off;
-	unsigned int		c_xmit_rdma_sent;
 	unsigned int		c_xmit_atomic_sent;
+	unsigned int		c_xmit_rdma_sent;
+	unsigned int		c_xmit_data_sent;
 
 	spinlock_t		c_lock;		/* protect msg queues */
 	u64			c_next_tx_seq;
@@ -120,8 +121,6 @@ struct rds_connection {
 
 	struct list_head	c_map_item;
 	unsigned long		c_map_queued;
-	unsigned long		c_map_offset;
-	unsigned long		c_map_bytes;
 
 	unsigned int		c_unacked_packets;
 	unsigned int		c_unacked_bytes;
diff --git a/net/rds/send.c b/net/rds/send.c
index d60d31309032..66dc6b045261 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -72,8 +72,9 @@ void rds_send_reset(struct rds_connection *conn)
 	conn->c_xmit_sg = 0;
 	conn->c_xmit_hdr_off = 0;
 	conn->c_xmit_data_off = 0;
-	conn->c_xmit_rdma_sent = 0;
 	conn->c_xmit_atomic_sent = 0;
+	conn->c_xmit_rdma_sent = 0;
+	conn->c_xmit_data_sent = 0;
 
 	conn->c_map_queued = 0;
 
@@ -137,69 +138,54 @@ int rds_send_xmit(struct rds_connection *conn)
 
 	/*
 	 * spin trying to push headers and data down the connection until
-	 * the connection doens't make forward progress.
+	 * the connection doesn't make forward progress.
 	 */
 	while (--send_quota) {
-		/*
-		 * See if need to send a congestion map update if we're
-		 * between sending messages.  The send_sem protects our sole
-		 * use of c_map_offset and _bytes.
-		 * Note this is used only by transports that define a special
-		 * xmit_cong_map function. For all others, we create allocate
-		 * a cong_map message and treat it just like any other send.
-		 */
-		if (conn->c_map_bytes) {
-			ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
-							   conn->c_map_offset);
-			if (ret <= 0)
-				break;
 
-			conn->c_map_offset += ret;
-			conn->c_map_bytes -= ret;
-			if (conn->c_map_bytes)
-				continue;
-		}
-
-		/* If we're done sending the current message, clear the
-		 * offset and S/G temporaries.
-		 */
 		rm = conn->c_xmit_rm;
-		if (rm &&
-		    conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
-		    conn->c_xmit_sg == rm->data.op_nents) {
-			conn->c_xmit_rm = NULL;
-			conn->c_xmit_sg = 0;
-			conn->c_xmit_hdr_off = 0;
-			conn->c_xmit_data_off = 0;
-			conn->c_xmit_rdma_sent = 0;
-			conn->c_xmit_atomic_sent = 0;
-
-			/* Release the reference to the previous message. */
-			rds_message_put(rm);
-			rm = NULL;
-		}
 
-		/* If we're asked to send a cong map update, do so.
+		/*
+		 * If between sending messages, we can send a pending congestion
+		 * map update.
+		 *
+		 * Transports either define a special xmit_cong_map function,
+		 * or we allocate a cong_map message and treat it just like any
+		 * other send.
 		 */
 		if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) {
 			if (conn->c_trans->xmit_cong_map) {
-				conn->c_map_offset = 0;
-				conn->c_map_bytes = sizeof(struct rds_header) +
+				unsigned long map_offset = 0;
+				unsigned long map_bytes = sizeof(struct rds_header) +
 					RDS_CONG_MAP_BYTES;
-				continue;
-			}
 
-			rm = rds_cong_update_alloc(conn);
-			if (IS_ERR(rm)) {
-				ret = PTR_ERR(rm);
-				break;
-			}
+				while (map_bytes) {
+					ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
+									   map_offset);
+					if (ret <= 0) {
+						/* too far down the rabbithole! */
+						mutex_unlock(&conn->c_send_lock);
+						rds_conn_error(conn, "Cong map xmit failed\n");
+						goto out;
+					}
+
+					map_offset += ret;
+					map_bytes -= ret;
+				}
+			} else {
+				/* send cong update like a normal rm */
+				rm = rds_cong_update_alloc(conn);
+				if (IS_ERR(rm)) {
+					ret = PTR_ERR(rm);
+					break;
+				}
+				rm->data.op_active = 1;
 
-			conn->c_xmit_rm = rm;
+				conn->c_xmit_rm = rm;
+			}
 		}
 
 		/*
-		 * Grab the next message from the send queue, if there is one.
+		 * If not already working on one, grab the next message.
 		 *
 		 * c_xmit_rm holds a ref while we're sending this message down
 		 * the connction.  We can use this ref while holding the
@@ -264,7 +250,6 @@ int rds_send_xmit(struct rds_connection *conn)
 			conn->c_xmit_rm = rm;
 		}
 
-
 		if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
 			ret = conn->c_trans->xmit_atomic(conn, rm);
 			if (ret)
@@ -273,13 +258,20 @@ int rds_send_xmit(struct rds_connection *conn)
 			/* The transport owns the mapped memory for now.
 			 * You can't unmap it while it's on the send queue */
 			set_bit(RDS_MSG_MAPPED, &rm->m_flags);
+
+			/*
+			 * This is evil, muahaha.
+			 * We permit 0-byte sends. (rds-ping depends on this.)
+			 * BUT if there is an atomic op and no sent data,
+			 * we turn off sending the header, to achieve
+			 * "silent" atomics.
+			 * But see below; RDMA op might toggle this back on!
+			 */
+			if (rm->data.op_nents == 0)
+				rm->data.op_active = 0;
 		}
 
-		/*
-		 * Try and send an rdma message.  Let's see if we can
-		 * keep this simple and require that the transport either
-		 * send the whole rdma or none of it.
-		 */
+		/* The transport either sends the whole rdma or none of it */
 		if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) {
 			ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
 			if (ret)
@@ -294,9 +286,7 @@ int rds_send_xmit(struct rds_connection *conn)
 			set_bit(RDS_MSG_MAPPED, &rm->m_flags);
 		}
 
-		if (rm->data.op_active
-		    && (conn->c_xmit_hdr_off < sizeof(struct rds_header) ||
-			conn->c_xmit_sg < rm->data.op_nents)) {
+		if (rm->data.op_active && !conn->c_xmit_data_sent) {
 			ret = conn->c_trans->xmit(conn, rm,
 						  conn->c_xmit_hdr_off,
 						  conn->c_xmit_sg,
@@ -326,6 +316,27 @@ int rds_send_xmit(struct rds_connection *conn)
 					       conn->c_xmit_sg == rm->data.op_nents);
 				}
 			}
+
+			if (conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
+			    (conn->c_xmit_sg == rm->data.op_nents))
+				conn->c_xmit_data_sent = 1;
+		}
+
+		/*
+		 * A rm will only take multiple times through this loop
+		 * if there is a data op. Thus, if the data is sent (or there was
+		 * none), then we're done with the rm.
+		 */
+		if (!rm->data.op_active || conn->c_xmit_data_sent) {
+			conn->c_xmit_rm = NULL;
+			conn->c_xmit_sg = 0;
+			conn->c_xmit_hdr_off = 0;
+			conn->c_xmit_data_off = 0;
+			conn->c_xmit_rdma_sent = 0;
+			conn->c_xmit_atomic_sent = 0;
+			conn->c_xmit_data_sent = 0;
+
+			rds_message_put(rm);
 		}
 	}
 
@@ -350,7 +361,7 @@ int rds_send_xmit(struct rds_connection *conn)
 	 */
 	mutex_unlock(&conn->c_send_lock);
 
-	if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) {
+	if (send_quota == 0 && !was_empty) {
 		/* We exhausted the send quota, but there's work left to
 		 * do. Return and (re-)schedule the send worker.
 		 */
-- 
cgit v1.2.3-59-g8ed1b


From 372cd7dedfd1ea93a9ae8d9c282e910dc1b76773 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Wed, 3 Feb 2010 19:40:32 -0800
Subject: RDS: Do not set op_active in r_m_copy_from_user().

Do not allocate sgs for data for 0-length datagrams

Set data.op_active in rds_sendmsg() instead of
rds_message_copy_from_user().

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/message.c |  2 --
 net/rds/send.c    | 13 ++++++++-----
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/rds/message.c b/net/rds/message.c
index 4bd9504ca048..f4c3be0878e9 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -330,8 +330,6 @@ int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
 			sg++;
 	}
 
-	rm->data.op_active = 1;
-
 out:
 	return ret;
 }
diff --git a/net/rds/send.c b/net/rds/send.c
index 66dc6b045261..ad89a63c4304 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -970,11 +970,14 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 		goto out;
 	}
 
-	rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE));
-	/* XXX fix this to not allocate memory */
-	ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len);
-	if (ret)
-		goto out;
+	/* Attach data to the rm */
+	if (payload_len) {
+		rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE));
+		ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len);
+		if (ret)
+			goto out;
+	}
+	rm->data.op_active = 1;
 
 	rm->m_daddr = daddr;
 
-- 
cgit v1.2.3-59-g8ed1b


From ee4c7b47e46a9dea789aadb8279c8505f755b3ee Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Wed, 3 Feb 2010 19:41:52 -0800
Subject: RDS: Add a warning if trying to allocate 0 sgs

rds_message_alloc_sgs() only works when nents is nonzero.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/message.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/rds/message.c b/net/rds/message.c
index f4c3be0878e9..1f73a7358a8e 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -240,6 +240,7 @@ struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents)
 	struct scatterlist *sg_ret;
 
 	WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs);
+	WARN_ON(!nents);
 
 	sg_ret = &sg_first[rm->m_used_sgs];
 	sg_init_table(sg_ret, nents);
-- 
cgit v1.2.3-59-g8ed1b


From 940786eb0a0faf3f30898a1cc7c1540d54c1aff6 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Fri, 19 Feb 2010 18:04:58 -0800
Subject: RDS: queue failure notifications for dropped atomic ops

When dropping ops in the send queue, we notify the client
of failed rdma ops they asked for notifications on, but not
atomic ops. It should be for both.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/send.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rds/send.c b/net/rds/send.c
index ad89a63c4304..cdca9747fcbc 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -512,9 +512,10 @@ EXPORT_SYMBOL_GPL(rds_atomic_send_complete);
  * socket, socket lock) and can just move the notifier.
  */
 static inline void
-__rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
+__rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
 {
 	struct rm_rdma_op *ro;
+	struct rm_atomic_op *ao;
 
 	ro = &rm->rdma;
 	if (ro->op_active && ro->op_notify && ro->op_notifier) {
@@ -523,6 +524,13 @@ __rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status
 		ro->op_notifier = NULL;
 	}
 
+	ao = &rm->atomic;
+	if (ao->op_active && ao->op_notify && ao->op_notifier) {
+		ao->op_notifier->n_status = status;
+		list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue);
+		ao->op_notifier = NULL;
+	}
+
 	/* No need to wake the app - caller does this */
 }
 
@@ -733,7 +741,7 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
 		spin_lock_irqsave(&rm->m_rs_lock, flags);
 
 		spin_lock(&rs->rs_lock);
-		__rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED);
+		__rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
 		spin_unlock(&rs->rs_lock);
 
 		rm->m_rs = NULL;
-- 
cgit v1.2.3-59-g8ed1b


From 7e3bd65ebfd5d6cd76b8b979920c632d6e6b4b2a Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Mon, 1 Mar 2010 16:04:59 -0800
Subject: RDS: Move some variables around for consistency

Also, add a comment.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/rdma.c | 3 +--
 net/rds/rds.h  | 8 ++++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 8d22999b0471..5ba514684431 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -747,6 +747,7 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
 	}
 
 	rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+	rm->atomic.op_active = 1;
 	rm->atomic.op_recverr = rs->rs_recverr;
 	rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1);
 
@@ -782,8 +783,6 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
 	rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie);
 	rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie);
 
-	rm->atomic.op_active = 1;
-
 	return ret;
 err:
 	if (page)
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 13ed30ac424d..46d190d08549 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -301,7 +301,13 @@ struct rds_message {
 	 */
 	spinlock_t		m_rs_lock;
 	struct rds_sock		*m_rs;
+
+	/* cookie to send to remote, in rds header */
 	rds_rdma_cookie_t	m_rdma_cookie;
+
+	unsigned int		m_used_sgs;
+	unsigned int		m_total_sgs;
+
 	struct {
 		struct rm_atomic_op {
 			int			op_type;
@@ -343,8 +349,6 @@ struct rds_message {
 			struct scatterlist	*op_sg;
 		} data;
 	};
-	unsigned int		m_used_sgs;
-	unsigned int		m_total_sgs;
 };
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 2c3a5f9abb1dc5efdab8ba9a568b1661c65fd1e3 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Mon, 1 Mar 2010 16:10:40 -0800
Subject: RDS: Add flag for silent ops. Do atomic op before RDMA

Add a flag to the API so users can indicate they want
silent operations. This is needed because silent ops
cannot be used with USE_ONCE MRs, so we can't just
assume silent.

Also, change send_xmit to do atomic op before rdma op if
both are present, and centralize the hairy logic to determine if
we want to attempt silent, or not.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 include/linux/rds.h |  1 +
 net/rds/rdma.c      |  2 ++
 net/rds/rds.h       |  2 ++
 net/rds/send.c      | 55 ++++++++++++++++++++++++++++++-----------------------
 4 files changed, 36 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/include/linux/rds.h b/include/linux/rds.h
index 9239152abf7a..109f1d343318 100644
--- a/include/linux/rds.h
+++ b/include/linux/rds.h
@@ -276,5 +276,6 @@ struct rds_rdma_notify {
 #define RDS_RDMA_USE_ONCE	0x0008	/* free MR after use */
 #define RDS_RDMA_DONTWAIT	0x0010	/* Don't wait in SET_BARRIER */
 #define RDS_RDMA_NOTIFY_ME	0x0020	/* Notify when operation completes */
+#define RDS_RDMA_SILENT		0x0040	/* Do not interrupt remote */
 
 #endif /* IB_RDS_H */
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 5ba514684431..48781fe4431c 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -559,6 +559,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 	op->op_write = !!(args->flags & RDS_RDMA_READWRITE);
 	op->op_fence = !!(args->flags & RDS_RDMA_FENCE);
 	op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+	op->op_silent = !!(args->flags & RDS_RDMA_SILENT);
 	op->op_active = 1;
 	op->op_recverr = rs->rs_recverr;
 	WARN_ON(!nr_pages);
@@ -747,6 +748,7 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
 	}
 
 	rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+	rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT);
 	rm->atomic.op_active = 1;
 	rm->atomic.op_recverr = rs->rs_recverr;
 	rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1);
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 46d190d08549..23b921000e74 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -319,6 +319,7 @@ struct rds_message {
 			unsigned int		op_notify:1;
 			unsigned int		op_recverr:1;
 			unsigned int		op_mapped:1;
+			unsigned int		op_silent:1;
 			unsigned int		op_active:1;
 			struct scatterlist	*op_sg;
 			struct rds_notifier	*op_notifier;
@@ -333,6 +334,7 @@ struct rds_message {
 			unsigned int		op_notify:1;
 			unsigned int		op_recverr:1;
 			unsigned int		op_mapped:1;
+			unsigned int		op_silent:1;
 			unsigned int		op_active:1;
 			unsigned int		op_bytes;
 			unsigned int		op_nents;
diff --git a/net/rds/send.c b/net/rds/send.c
index cdca9747fcbc..38567f3ee7e8 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -250,42 +250,50 @@ int rds_send_xmit(struct rds_connection *conn)
 			conn->c_xmit_rm = rm;
 		}
 
-		if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
-			ret = conn->c_trans->xmit_atomic(conn, rm);
+		/* The transport either sends the whole rdma or none of it */
+		if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) {
+			ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
 			if (ret)
 				break;
-			conn->c_xmit_atomic_sent = 1;
+			conn->c_xmit_rdma_sent = 1;
+
 			/* The transport owns the mapped memory for now.
 			 * You can't unmap it while it's on the send queue */
 			set_bit(RDS_MSG_MAPPED, &rm->m_flags);
-
-			/*
-			 * This is evil, muahaha.
-			 * We permit 0-byte sends. (rds-ping depends on this.)
-			 * BUT if there is an atomic op and no sent data,
-			 * we turn off sending the header, to achieve
-			 * "silent" atomics.
-			 * But see below; RDMA op might toggle this back on!
-			 */
-			if (rm->data.op_nents == 0)
-				rm->data.op_active = 0;
 		}
 
-		/* The transport either sends the whole rdma or none of it */
-		if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) {
-			ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
+		if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
+			ret = conn->c_trans->xmit_atomic(conn, rm);
 			if (ret)
 				break;
-			conn->c_xmit_rdma_sent = 1;
-
-			/* rdmas need data sent, even if just the header */
-			rm->data.op_active = 1;
-
+			conn->c_xmit_atomic_sent = 1;
 			/* The transport owns the mapped memory for now.
 			 * You can't unmap it while it's on the send queue */
 			set_bit(RDS_MSG_MAPPED, &rm->m_flags);
 		}
 
+		/*
+		 * A number of cases require an RDS header to be sent
+		 * even if there is no data.
+		 * We permit 0-byte sends; rds-ping depends on this.
+		 * However, if there are exclusively attached silent ops,
+		 * we skip the hdr/data send, to enable silent operation.
+		 */
+		if (rm->data.op_nents == 0) {
+			int ops_present;
+			int all_ops_are_silent = 1;
+
+			ops_present = (rm->atomic.op_active || rm->rdma.op_active);
+			if (rm->atomic.op_active && !rm->atomic.op_silent)
+				all_ops_are_silent = 0;
+			if (rm->rdma.op_active && !rm->rdma.op_silent)
+				all_ops_are_silent = 0;
+
+			if (ops_present && all_ops_are_silent
+			    && !rm->m_rdma_cookie)
+				rm->data.op_active = 0;
+		}
+
 		if (rm->data.op_active && !conn->c_xmit_data_sent) {
 			ret = conn->c_trans->xmit(conn, rm,
 						  conn->c_xmit_hdr_off,
@@ -1009,8 +1017,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 	if (ret)
 		goto out;
 
-	if ((rm->m_rdma_cookie || rm->rdma.op_active) &&
-	    !conn->c_trans->xmit_rdma) {
+	if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) {
 		if (printk_ratelimit())
 			printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
 			       &rm->rdma, conn->c_trans->xmit_rdma);
-- 
cgit v1.2.3-59-g8ed1b


From aa0a4ef4ac3a3c5ffa35e32520bfbc0922ef3630 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Tue, 13 Apr 2010 12:00:35 -0700
Subject: RDS: Make sure cmsgs aren't used in improper ways

It hasn't cropped up in the field, but this code ensures it is
impossible to issue operations that pass an rdma cookie (DEST, MAP)
in the same sendmsg call that's actually initiating rdma or atomic
ops.

Disallowing this perverse-but-technically-allowed usage makes silent
RDMA heuristics slightly easier.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/send.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'net')

diff --git a/net/rds/send.c b/net/rds/send.c
index 38567f3ee7e8..69ab1040d02d 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -846,6 +846,7 @@ static int rds_rm_size(struct msghdr *msg, int data_len)
 {
 	struct cmsghdr *cmsg;
 	int size = 0;
+	int cmsg_groups = 0;
 	int retval;
 
 	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
@@ -857,19 +858,23 @@ static int rds_rm_size(struct msghdr *msg, int data_len)
 
 		switch (cmsg->cmsg_type) {
 		case RDS_CMSG_RDMA_ARGS:
+			cmsg_groups |= 1;
 			retval = rds_rdma_extra_size(CMSG_DATA(cmsg));
 			if (retval < 0)
 				return retval;
 			size += retval;
+
 			break;
 
 		case RDS_CMSG_RDMA_DEST:
 		case RDS_CMSG_RDMA_MAP:
+			cmsg_groups |= 2;
 			/* these are valid but do no add any size */
 			break;
 
 		case RDS_CMSG_ATOMIC_CSWP:
 		case RDS_CMSG_ATOMIC_FADD:
+			cmsg_groups |= 1;
 			size += sizeof(struct scatterlist);
 			break;
 
@@ -881,6 +886,10 @@ static int rds_rm_size(struct msghdr *msg, int data_len)
 
 	size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);
 
+	/* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */
+	if (cmsg_groups == 3)
+		return -EINVAL;
+
 	return size;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From ff3d7d36134ef7138803734fdbf91cc986ea7976 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Mon, 1 Mar 2010 14:03:09 -0800
Subject: RDS: Perform unmapping ops in stages

Previously, RDS would wait until the final send WR had completed
and then handle cleanup. With silent ops, we do not know
if an atomic, rdma, or data op will be last. This patch
handles any of these cases by keeping a pointer to the last
op in the message in m_last_op.

When the TX completion event fires, rds dispatches to per-op-type
cleanup functions, and then does whole-message cleanup, if the
last op equalled m_last_op.

This patch also moves towards having op-specific functions take
the op struct, instead of the overall rm struct.

rds_ib_connection has a pointer to keep track of a a partially-
completed data send operation. This patch changes it from an
rds_message pointer to the narrower rm_data_op pointer, and
modifies places that use this pointer as needed.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/ib.h      |   7 +-
 net/rds/ib_cm.c   |   9 +-
 net/rds/ib_send.c | 242 ++++++++++++++++++++++++++++--------------------------
 net/rds/rds.h     |   4 +-
 net/rds/send.c    |   6 +-
 5 files changed, 142 insertions(+), 126 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.h b/net/rds/ib.h
index d64b5087eefe..202140a84f0c 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -53,8 +53,7 @@ struct rds_ib_connect_private {
 };
 
 struct rds_ib_send_work {
-	struct rds_message	*s_rm;
-	struct rm_rdma_op	*s_op;
+	void			*s_op;
 	struct ib_send_wr	s_wr;
 	struct ib_sge		s_sge[RDS_IB_MAX_SGE];
 	unsigned long		s_queued;
@@ -92,7 +91,7 @@ struct rds_ib_connection {
 
 	/* tx */
 	struct rds_ib_work_ring	i_send_ring;
-	struct rds_message	*i_rm;
+	struct rm_data_op	*i_data_op;
 	struct rds_header	*i_send_hdrs;
 	u64			i_send_hdrs_dma;
 	struct rds_ib_send_work *i_sends;
@@ -336,7 +335,7 @@ void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
 void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
 int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
 			     u32 *adv_credits, int need_posted, int max_posted);
-int rds_ib_xmit_atomic(struct rds_connection *conn, struct rds_message *rm);
+int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op);
 
 /* ib_stats.c */
 DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 8b0c743c0900..b5b5ebbc0bb6 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -673,9 +673,12 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
 	BUG_ON(ic->rds_ibdev);
 
 	/* Clear pending transmit */
-	if (ic->i_rm) {
-		rds_message_put(ic->i_rm);
-		ic->i_rm = NULL;
+	if (ic->i_data_op) {
+		struct rds_message *rm;
+
+		rm = container_of(ic->i_data_op, struct rds_message, data);
+		rds_message_put(rm);
+		ic->i_data_op = NULL;
 	}
 
 	/* Clear the ACK state */
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 95f15247acd7..6461a152bd5b 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -67,80 +67,122 @@ static void rds_ib_send_complete(struct rds_message *rm,
 	complete(rm, notify_status);
 }
 
-static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
-			  struct rds_ib_send_work *send,
-			  int wc_status)
+static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
+				   struct rm_data_op *op,
+				   int wc_status)
 {
-	struct rds_message *rm = send->s_rm;
-
-	rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
-
-	ib_dma_unmap_sg(ic->i_cm_id->device,
-			rm->data.op_sg, rm->data.op_nents,
-			DMA_TO_DEVICE);
+	if (op->op_nents)
+		ib_dma_unmap_sg(ic->i_cm_id->device,
+				op->op_sg, op->op_nents,
+				DMA_TO_DEVICE);
+}
 
-	if (rm->rdma.op_active) {
-		struct rm_rdma_op *op = &rm->rdma;
+static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
+				   struct rm_rdma_op *op,
+				   int wc_status)
+{
+	if (op->op_mapped) {
+		ib_dma_unmap_sg(ic->i_cm_id->device,
+				op->op_sg, op->op_nents,
+				op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+		op->op_mapped = 0;
+	}
 
-		if (op->op_mapped) {
-			ib_dma_unmap_sg(ic->i_cm_id->device,
-					op->op_sg, op->op_nents,
-					op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-			op->op_mapped = 0;
-		}
+	/* If the user asked for a completion notification on this
+	 * message, we can implement three different semantics:
+	 *  1.	Notify when we received the ACK on the RDS message
+	 *	that was queued with the RDMA. This provides reliable
+	 *	notification of RDMA status at the expense of a one-way
+	 *	packet delay.
+	 *  2.	Notify when the IB stack gives us the completion event for
+	 *	the RDMA operation.
+	 *  3.	Notify when the IB stack gives us the completion event for
+	 *	the accompanying RDS messages.
+	 * Here, we implement approach #3. To implement approach #2,
+	 * we would need to take an event for the rdma WR. To implement #1,
+	 * don't call rds_rdma_send_complete at all, and fall back to the notify
+	 * handling in the ACK processing code.
+	 *
+	 * Note: There's no need to explicitly sync any RDMA buffers using
+	 * ib_dma_sync_sg_for_cpu - the completion for the RDMA
+	 * operation itself unmapped the RDMA buffers, which takes care
+	 * of synching.
+	 */
+	rds_ib_send_complete(container_of(op, struct rds_message, rdma),
+			     wc_status, rds_rdma_send_complete);
 
-		/* If the user asked for a completion notification on this
-		 * message, we can implement three different semantics:
-		 *  1.	Notify when we received the ACK on the RDS message
-		 *	that was queued with the RDMA. This provides reliable
-		 *	notification of RDMA status at the expense of a one-way
-		 *	packet delay.
-		 *  2.	Notify when the IB stack gives us the completion event for
-		 *	the RDMA operation.
-		 *  3.	Notify when the IB stack gives us the completion event for
-		 *	the accompanying RDS messages.
-		 * Here, we implement approach #3. To implement approach #2,
-		 * call rds_rdma_send_complete from the cq_handler. To implement #1,
-		 * don't call rds_rdma_send_complete at all, and fall back to the notify
-		 * handling in the ACK processing code.
-		 *
-		 * Note: There's no need to explicitly sync any RDMA buffers using
-		 * ib_dma_sync_sg_for_cpu - the completion for the RDMA
-		 * operation itself unmapped the RDMA buffers, which takes care
-		 * of synching.
-		 */
-		rds_ib_send_complete(rm, wc_status, rds_rdma_send_complete);
+	if (op->op_write)
+		rds_stats_add(s_send_rdma_bytes, op->op_bytes);
+	else
+		rds_stats_add(s_recv_rdma_bytes, op->op_bytes);
+}
 
-		if (rm->rdma.op_write)
-			rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes);
-		else
-			rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes);
+static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic,
+				     struct rm_atomic_op *op,
+				     int wc_status)
+{
+	/* unmap atomic recvbuf */
+	if (op->op_mapped) {
+		ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1,
+				DMA_FROM_DEVICE);
+		op->op_mapped = 0;
 	}
 
-	if (rm->atomic.op_active) {
-		struct rm_atomic_op *op = &rm->atomic;
-
-		/* unmap atomic recvbuf */
-		if (op->op_mapped) {
-			ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1,
-					DMA_FROM_DEVICE);
-			op->op_mapped = 0;
-		}
+	rds_ib_send_complete(container_of(op, struct rds_message, atomic),
+			     wc_status, rds_atomic_send_complete);
 
-		rds_ib_send_complete(rm, wc_status, rds_atomic_send_complete);
+	if (op->op_type == RDS_ATOMIC_TYPE_CSWP)
+		rds_stats_inc(s_atomic_cswp);
+	else
+		rds_stats_inc(s_atomic_fadd);
+}
 
-		if (rm->atomic.op_type == RDS_ATOMIC_TYPE_CSWP)
-			rds_stats_inc(s_atomic_cswp);
-		else
-			rds_stats_inc(s_atomic_fadd);
+/*
+ * Unmap the resources associated with a struct send_work.
+ *
+ * Returns the rm for no good reason other than it is unobtainable
+ * other than by switching on wr.opcode, currently, and the caller,
+ * the event handler, needs it.
+ */
+static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic,
+						struct rds_ib_send_work *send,
+						int wc_status)
+{
+	struct rds_message *rm = NULL;
+
+	/* In the error case, wc.opcode sometimes contains garbage */
+	switch (send->s_wr.opcode) {
+	case IB_WR_SEND:
+		if (send->s_op) {
+			rm = container_of(send->s_op, struct rds_message, data);
+			rds_ib_send_unmap_data(ic, send->s_op, wc_status);
+		}
+		break;
+	case IB_WR_RDMA_WRITE:
+	case IB_WR_RDMA_READ:
+		if (send->s_op) {
+			rm = container_of(send->s_op, struct rds_message, rdma);
+			rds_ib_send_unmap_rdma(ic, send->s_op, wc_status);
+		}
+		break;
+	case IB_WR_ATOMIC_FETCH_AND_ADD:
+	case IB_WR_ATOMIC_CMP_AND_SWP:
+		if (send->s_op) {
+			rm = container_of(send->s_op, struct rds_message, atomic);
+			rds_ib_send_unmap_atomic(ic, send->s_op, wc_status);
+		}
+		break;
+	default:
+		if (printk_ratelimit())
+			printk(KERN_NOTICE
+			       "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
+			       __func__, send->s_wr.opcode);
+		break;
 	}
 
-	/* If anyone waited for this message to get flushed out, wake
-	 * them up now */
-	rds_message_unmapped(rm);
+	send->s_wr.opcode = 0xdead;
 
-	rds_message_put(rm);
-	send->s_rm = NULL;
+	return rm;
 }
 
 void rds_ib_send_init_ring(struct rds_ib_connection *ic)
@@ -151,7 +193,6 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic)
 	for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
 		struct ib_sge *sge;
 
-		send->s_rm = NULL;
 		send->s_op = NULL;
 
 		send->s_wr.wr_id = i;
@@ -173,9 +214,8 @@ void rds_ib_send_clear_ring(struct rds_ib_connection *ic)
 	u32 i;
 
 	for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
-		if (!send->s_rm || send->s_wr.opcode == 0xdead)
-			continue;
-		rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
+		if (send->s_op && send->s_wr.opcode != 0xdead)
+			rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR);
 	}
 }
 
@@ -189,6 +229,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
 {
 	struct rds_connection *conn = context;
 	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct rds_message *rm = NULL;
 	struct ib_wc wc;
 	struct rds_ib_send_work *send;
 	u32 completed;
@@ -222,42 +263,18 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
 		for (i = 0; i < completed; i++) {
 			send = &ic->i_sends[oldest];
 
-			/* In the error case, wc.opcode sometimes contains garbage */
-			switch (send->s_wr.opcode) {
-			case IB_WR_SEND:
-			case IB_WR_RDMA_WRITE:
-			case IB_WR_RDMA_READ:
-			case IB_WR_ATOMIC_FETCH_AND_ADD:
-			case IB_WR_ATOMIC_CMP_AND_SWP:
-				if (send->s_rm)
-					rds_ib_send_unmap_rm(ic, send, wc.status);
-				break;
-			default:
-				if (printk_ratelimit())
-					printk(KERN_NOTICE
-						"RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
-						__func__, send->s_wr.opcode);
-				break;
-			}
+			rm = rds_ib_send_unmap_op(ic, send, wc.status);
 
-			send->s_wr.opcode = 0xdead;
-			send->s_wr.num_sge = 1;
 			if (send->s_queued + HZ/2 < jiffies)
 				rds_ib_stats_inc(s_ib_tx_stalled);
 
-			/* If a RDMA operation produced an error, signal this right
-			 * away. If we don't, the subsequent SEND that goes with this
-			 * RDMA will be canceled with ERR_WFLUSH, and the application
-			 * never learn that the RDMA failed. */
-			if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) {
-				struct rds_message *rm;
-
-				rm = rds_send_get_message(conn, send->s_op);
-				if (rm) {
-					rds_ib_send_unmap_rm(ic, send, wc.status);
-					rds_ib_send_complete(rm, wc.status, rds_rdma_send_complete);
-					rds_message_put(rm);
-				}
+			if (&send->s_op == &rm->m_final_op) {
+				/* If anyone waited for this message to get flushed out, wake
+				 * them up now */
+				rds_message_unmapped(rm);
+
+				rds_message_put(rm);
+				send->s_op = NULL;
 			}
 
 			oldest = (oldest + 1) % ic->i_send_ring.w_nr;
@@ -512,7 +529,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 	}
 
 	/* map the message the first time we see it */
-	if (!ic->i_rm) {
+	if (!ic->i_data_op) {
 		if (rm->data.op_nents) {
 			rm->data.op_count = ib_dma_map_sg(dev,
 							  rm->data.op_sg,
@@ -530,7 +547,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 		}
 
 		rds_message_addref(rm);
-		ic->i_rm = rm;
+		ic->i_data_op = &rm->data;
 
 		/* Finalize the header */
 		if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
@@ -583,7 +600,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 	send = &ic->i_sends[pos];
 	first = send;
 	prev = NULL;
-	scat = &rm->data.op_sg[sg];
+	scat = &ic->i_data_op->op_sg[sg];
 	i = 0;
 	do {
 		unsigned int len = 0;
@@ -658,9 +675,9 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 
 	/* if we finished the message then send completion owns it */
 	if (scat == &rm->data.op_sg[rm->data.op_count]) {
-		prev->s_rm = ic->i_rm;
+		prev->s_op = ic->i_data_op;
 		prev->s_wr.send_flags |= IB_SEND_SOLICITED;
-		ic->i_rm = NULL;
+		ic->i_data_op = NULL;
 	}
 
 	/* Put back wrs & credits we didn't use */
@@ -681,9 +698,9 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 		printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 "
 		       "returned %d\n", &conn->c_faddr, ret);
 		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
-		if (prev->s_rm) {
-			ic->i_rm = prev->s_rm;
-			prev->s_rm = NULL;
+		if (prev->s_op) {
+			ic->i_data_op = prev->s_op;
+			prev->s_op = NULL;
 		}
 
 		rds_ib_conn_error(ic->conn, "ib_post_send failed\n");
@@ -701,10 +718,9 @@ out:
  * A simplified version of the rdma case, we always map 1 SG, and
  * only 8 bytes, for the return value from the atomic operation.
  */
-int rds_ib_xmit_atomic(struct rds_connection *conn, struct rds_message *rm)
+int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
 {
 	struct rds_ib_connection *ic = conn->c_transport_data;
-	struct rm_atomic_op *op = &rm->atomic;
 	struct rds_ib_send_work *send = NULL;
 	struct ib_send_wr *failed_wr;
 	struct rds_ib_device *rds_ibdev;
@@ -741,14 +757,6 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rds_message *rm)
 	send->s_wr.wr.atomic.remote_addr = op->op_remote_addr;
 	send->s_wr.wr.atomic.rkey = op->op_rkey;
 
-	/*
-	 * If there is no data or rdma ops in the message, then
-	 * we must fill in s_rm ourselves, so we properly clean up
-	 * on completion.
-	 */
-	if (!rm->rdma.op_active && !rm->data.op_active)
-		send->s_rm = rm;
-
 	/* map 8 byte retval buffer to the device */
 	ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE);
 	rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret);
@@ -809,7 +817,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
 
 	rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
 
-	/* map the message the first time we see it */
+	/* map the op the first time we see it */
 	if (!op->op_mapped) {
 		op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
 					     op->op_sg, op->op_nents, (op->op_write) ?
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 23b921000e74..7291f006f364 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -308,6 +308,8 @@ struct rds_message {
 	unsigned int		m_used_sgs;
 	unsigned int		m_total_sgs;
 
+	void			*m_final_op;
+
 	struct {
 		struct rm_atomic_op {
 			int			op_type;
@@ -421,7 +423,7 @@ struct rds_transport {
 	int (*xmit_cong_map)(struct rds_connection *conn,
 			     struct rds_cong_map *map, unsigned long offset);
 	int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op);
-	int (*xmit_atomic)(struct rds_connection *conn, struct rds_message *rm);
+	int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op);
 	int (*recv)(struct rds_connection *conn);
 	int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov,
 				size_t size);
diff --git a/net/rds/send.c b/net/rds/send.c
index 69ab1040d02d..d1f364e44e36 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -252,6 +252,7 @@ int rds_send_xmit(struct rds_connection *conn)
 
 		/* The transport either sends the whole rdma or none of it */
 		if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) {
+			rm->m_final_op = &rm->rdma;
 			ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
 			if (ret)
 				break;
@@ -263,10 +264,12 @@ int rds_send_xmit(struct rds_connection *conn)
 		}
 
 		if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
-			ret = conn->c_trans->xmit_atomic(conn, rm);
+			rm->m_final_op = &rm->atomic;
+			ret = conn->c_trans->xmit_atomic(conn, &rm->atomic);
 			if (ret)
 				break;
 			conn->c_xmit_atomic_sent = 1;
+
 			/* The transport owns the mapped memory for now.
 			 * You can't unmap it while it's on the send queue */
 			set_bit(RDS_MSG_MAPPED, &rm->m_flags);
@@ -295,6 +298,7 @@ int rds_send_xmit(struct rds_connection *conn)
 		}
 
 		if (rm->data.op_active && !conn->c_xmit_data_sent) {
+			rm->m_final_op = &rm->data;
 			ret = conn->c_trans->xmit(conn, rm,
 						  conn->c_xmit_hdr_off,
 						  conn->c_xmit_sg,
-- 
cgit v1.2.3-59-g8ed1b


From e32b4a70495aac6a612e13f4c21db09dd756ff2c Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Wed, 3 Mar 2010 19:25:21 -0800
Subject: RDS/IB: Do not wait for send ring to be empty on conn shutdown

Now that we are signaling send completions much less, we are likely
to have dirty entries in the send queue when the connection is
shut down (on rmmod, for example.) These are cleaned up a little
further down in conn_shutdown, but if we wait on the ring_empty_wait
for them, it'll never happen, and we hand on unload.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/ib_cm.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index b5b5ebbc0bb6..1a91af75f4c7 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -620,8 +620,11 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
 				ic->i_cm_id, err);
 		}
 
+		/*
+		 * Don't wait for the send ring to be empty -- there may be completed
+		 * non-signaled entries sitting on there. We unmap these below.
+		 */
 		wait_event(rds_ib_ring_empty_wait,
-			rds_ib_ring_empty(&ic->i_send_ring) &&
 			rds_ib_ring_empty(&ic->i_recv_ring));
 
 		if (ic->i_send_hdrs)
-- 
cgit v1.2.3-59-g8ed1b


From 77dd550e5547846604ff6f90c4dc6bba4414e485 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Mon, 22 Mar 2010 15:22:04 -0700
Subject: RDS: Stop supporting old cong map sending method

We now ask the transport to give us a rm for the congestion
map, and then we handle it normally. Previously, the
transport defined a function that we would call to send
a congestion map.

Convert TCP and loop transports to new cong map method.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/ib.c       |  1 -
 net/rds/iw.c       |  1 -
 net/rds/loop.c     | 19 ++++++-------------
 net/rds/rds.h      |  6 ------
 net/rds/send.c     | 39 +++++++--------------------------------
 net/rds/tcp.c      |  1 -
 net/rds/tcp.h      |  2 --
 net/rds/tcp_send.c | 50 --------------------------------------------------
 8 files changed, 13 insertions(+), 106 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.c b/net/rds/ib.c
index 932dacbdbea1..927c481b5245 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -265,7 +265,6 @@ struct rds_transport rds_ib_transport = {
 	.laddr_check		= rds_ib_laddr_check,
 	.xmit_complete		= rds_ib_xmit_complete,
 	.xmit			= rds_ib_xmit,
-	.xmit_cong_map		= NULL,
 	.xmit_rdma		= rds_ib_xmit_rdma,
 	.xmit_atomic		= rds_ib_xmit_atomic,
 	.recv			= rds_ib_recv,
diff --git a/net/rds/iw.c b/net/rds/iw.c
index e766aecd46c9..467790da1316 100644
--- a/net/rds/iw.c
+++ b/net/rds/iw.c
@@ -264,7 +264,6 @@ struct rds_transport rds_iw_transport = {
 	.laddr_check		= rds_iw_laddr_check,
 	.xmit_complete		= rds_iw_xmit_complete,
 	.xmit			= rds_iw_xmit,
-	.xmit_cong_map		= NULL,
 	.xmit_rdma		= rds_iw_xmit_rdma,
 	.recv			= rds_iw_recv,
 	.conn_alloc		= rds_iw_conn_alloc,
diff --git a/net/rds/loop.c b/net/rds/loop.c
index 4a3dd49315b4..c390156b426f 100644
--- a/net/rds/loop.c
+++ b/net/rds/loop.c
@@ -61,6 +61,12 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
 			 unsigned int hdr_off, unsigned int sg,
 			 unsigned int off)
 {
+	/* Do not send cong updates to loopback */
+	if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
+		rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
+		return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
+	}
+
 	BUG_ON(hdr_off || sg || off);
 
 	rds_inc_init(&rm->m_inc, conn, conn->c_laddr);
@@ -88,18 +94,6 @@ static void rds_loop_inc_free(struct rds_incoming *inc)
         rds_message_put(rm);
 }
 
-static int rds_loop_xmit_cong_map(struct rds_connection *conn,
-				  struct rds_cong_map *map,
-				  unsigned long offset)
-{
-	BUG_ON(offset);
-	BUG_ON(map != conn->c_lcong);
-
-	rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
-
-	return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
-}
-
 /* we need to at least give the thread something to succeed */
 static int rds_loop_recv(struct rds_connection *conn)
 {
@@ -180,7 +174,6 @@ void rds_loop_exit(void)
  */
 struct rds_transport rds_loop_transport = {
 	.xmit			= rds_loop_xmit,
-	.xmit_cong_map		= rds_loop_xmit_cong_map,
 	.recv			= rds_loop_recv,
 	.conn_alloc		= rds_loop_conn_alloc,
 	.conn_free		= rds_loop_conn_free,
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 7291f006f364..e81d7e478474 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -393,10 +393,6 @@ struct rds_notifier {
  *                 transport is responsible for other serialization, including
  *                 rds_recv_incoming().  This is called in process context but
  *                 should try hard not to block.
- *
- * @xmit_cong_map: This asks the transport to send the local bitmap down the
- * 		   given connection.  XXX get a better story about the bitmap
- * 		   flag and header.
  */
 
 #define RDS_TRANS_IB	0
@@ -420,8 +416,6 @@ struct rds_transport {
 	void (*xmit_complete)(struct rds_connection *conn);
 	int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
 		    unsigned int hdr_off, unsigned int sg, unsigned int off);
-	int (*xmit_cong_map)(struct rds_connection *conn,
-			     struct rds_cong_map *map, unsigned long offset);
 	int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op);
 	int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op);
 	int (*recv)(struct rds_connection *conn);
diff --git a/net/rds/send.c b/net/rds/send.c
index d1f364e44e36..8a0647af5d95 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -147,41 +147,16 @@ int rds_send_xmit(struct rds_connection *conn)
 		/*
 		 * If between sending messages, we can send a pending congestion
 		 * map update.
-		 *
-		 * Transports either define a special xmit_cong_map function,
-		 * or we allocate a cong_map message and treat it just like any
-		 * other send.
 		 */
 		if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) {
-			if (conn->c_trans->xmit_cong_map) {
-				unsigned long map_offset = 0;
-				unsigned long map_bytes = sizeof(struct rds_header) +
-					RDS_CONG_MAP_BYTES;
-
-				while (map_bytes) {
-					ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
-									   map_offset);
-					if (ret <= 0) {
-						/* too far down the rabbithole! */
-						mutex_unlock(&conn->c_send_lock);
-						rds_conn_error(conn, "Cong map xmit failed\n");
-						goto out;
-					}
-
-					map_offset += ret;
-					map_bytes -= ret;
-				}
-			} else {
-				/* send cong update like a normal rm */
-				rm = rds_cong_update_alloc(conn);
-				if (IS_ERR(rm)) {
-					ret = PTR_ERR(rm);
-					break;
-				}
-				rm->data.op_active = 1;
-
-				conn->c_xmit_rm = rm;
+			rm = rds_cong_update_alloc(conn);
+			if (IS_ERR(rm)) {
+				ret = PTR_ERR(rm);
+				break;
 			}
+			rm->data.op_active = 1;
+
+			conn->c_xmit_rm = rm;
 		}
 
 		/*
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 831881640e56..3262992f5f2b 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -258,7 +258,6 @@ struct rds_transport rds_tcp_transport = {
 	.laddr_check		= rds_tcp_laddr_check,
 	.xmit_prepare		= rds_tcp_xmit_prepare,
 	.xmit_complete		= rds_tcp_xmit_complete,
-	.xmit_cong_map		= rds_tcp_xmit_cong_map,
 	.xmit			= rds_tcp_xmit,
 	.recv			= rds_tcp_recv,
 	.conn_alloc		= rds_tcp_conn_alloc,
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index c639872bf701..16b166379ea9 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -80,8 +80,6 @@ void rds_tcp_xmit_complete(struct rds_connection *conn);
 int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
 	         unsigned int hdr_off, unsigned int sg, unsigned int off);
 void rds_tcp_write_space(struct sock *sk);
-int rds_tcp_xmit_cong_map(struct rds_connection *conn,
-			  struct rds_cong_map *map, unsigned long offset);
 
 /* tcp_stats.c */
 DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats);
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 53c1de55f471..2979fb4a4b9a 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -76,56 +76,6 @@ int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
 	return kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len);
 }
 
-/* the core send_sem serializes this with other xmit and shutdown */
-int rds_tcp_xmit_cong_map(struct rds_connection *conn,
-			  struct rds_cong_map *map, unsigned long offset)
-{
-	static struct rds_header rds_tcp_map_header = {
-		.h_flags = RDS_FLAG_CONG_BITMAP,
-	};
-	struct rds_tcp_connection *tc = conn->c_transport_data;
-	unsigned long i;
-	int ret;
-	int copied = 0;
-
-	/* Some problem claims cpu_to_be32(constant) isn't a constant. */
-	rds_tcp_map_header.h_len = cpu_to_be32(RDS_CONG_MAP_BYTES);
-
-	if (offset < sizeof(struct rds_header)) {
-		ret = rds_tcp_sendmsg(tc->t_sock,
-				      (void *)&rds_tcp_map_header + offset,
-				      sizeof(struct rds_header) - offset);
-		if (ret <= 0)
-			return ret;
-		offset += ret;
-		copied = ret;
-		if (offset < sizeof(struct rds_header))
-			return ret;
-	}
-
-	offset -= sizeof(struct rds_header);
-	i = offset / PAGE_SIZE;
-	offset = offset % PAGE_SIZE;
-	BUG_ON(i >= RDS_CONG_MAP_PAGES);
-
-	do {
-		ret = tc->t_sock->ops->sendpage(tc->t_sock,
-					virt_to_page(map->m_page_addrs[i]),
-					offset, PAGE_SIZE - offset,
-					MSG_DONTWAIT);
-		if (ret <= 0)
-			break;
-		copied += ret;
-		offset += ret;
-		if (offset == PAGE_SIZE) {
-			offset = 0;
-			i++;
-		}
-	} while (i < RDS_CONG_MAP_PAGES);
-
-        return copied ? copied : ret;
-}
-
 /* the core send_sem serializes this with other xmit and shutdown */
 int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
 	         unsigned int hdr_off, unsigned int sg, unsigned int off)
-- 
cgit v1.2.3-59-g8ed1b


From f17a1a55fb672d7f64be7f2e940ef5669e5efa0a Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Thu, 18 Mar 2010 17:19:52 -0700
Subject: RDS: Refill recv ring directly from tasklet

Performance is better if we use allocations that don't block
to refill the receive ring. Since the whole reason we were
kicking out to the worker thread was so we could do blocking
allocs, we no longer need to do this.

Remove gfp params from rds_ib_recv_refill(); we always use
GFP_NOWAIT.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/ib.h      |  3 +--
 net/rds/ib_cm.c   |  2 +-
 net/rds/ib_recv.c | 22 ++++++++--------------
 3 files changed, 10 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.h b/net/rds/ib.h
index 202140a84f0c..4c2ee2401979 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -297,8 +297,7 @@ void rds_ib_flush_mrs(void);
 int __init rds_ib_recv_init(void);
 void rds_ib_recv_exit(void);
 int rds_ib_recv(struct rds_connection *conn);
-int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
-		       gfp_t page_gfp, int prefill);
+int rds_ib_recv_refill(struct rds_connection *conn, int prefill);
 void rds_ib_inc_free(struct rds_incoming *inc);
 int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
 			     size_t size);
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 1a91af75f4c7..75eda9c82135 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -135,7 +135,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 	rds_ib_recv_init_ring(ic);
 	/* Post receive buffers - as a side effect, this will update
 	 * the posted credit count. */
-	rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
+	rds_ib_recv_refill(conn, 1);
 
 	/* Tune RNR behavior */
 	rds_ib_tune_rnr(ic, &qp_attr);
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index e9fe08a86c40..8f041f7954a2 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -135,8 +135,7 @@ void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
 }
 
 static int rds_ib_recv_refill_one(struct rds_connection *conn,
-				  struct rds_ib_recv_work *recv,
-				  gfp_t kptr_gfp, gfp_t page_gfp)
+				  struct rds_ib_recv_work *recv)
 {
 	struct rds_ib_connection *ic = conn->c_transport_data;
 	dma_addr_t dma_addr;
@@ -148,8 +147,7 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn,
 			rds_ib_stats_inc(s_ib_rx_alloc_limit);
 			goto out;
 		}
-		recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab,
-						 kptr_gfp);
+		recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab, GFP_NOWAIT);
 		if (!recv->r_ibinc) {
 			atomic_dec(&rds_ib_allocation);
 			goto out;
@@ -159,7 +157,7 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn,
 	}
 
 	if (!recv->r_frag) {
-		recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp);
+		recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, GFP_NOWAIT);
 		if (!recv->r_frag)
 			goto out;
 		INIT_LIST_HEAD(&recv->r_frag->f_item);
@@ -167,7 +165,7 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn,
 	}
 
 	if (!ic->i_frag.f_page) {
-		ic->i_frag.f_page = alloc_page(page_gfp);
+		ic->i_frag.f_page = alloc_page(GFP_NOWAIT);
 		if (!ic->i_frag.f_page)
 			goto out;
 		ic->i_frag.f_offset = 0;
@@ -221,8 +219,7 @@ out:
  *
  * -1 is returned if posting fails due to temporary resource exhaustion.
  */
-int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
-		       gfp_t page_gfp, int prefill)
+int rds_ib_recv_refill(struct rds_connection *conn, int prefill)
 {
 	struct rds_ib_connection *ic = conn->c_transport_data;
 	struct rds_ib_recv_work *recv;
@@ -241,7 +238,7 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
 		}
 
 		recv = &ic->i_recvs[pos];
-		ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
+		ret = rds_ib_recv_refill_one(conn, recv);
 		if (ret) {
 			ret = -1;
 			break;
@@ -856,11 +853,8 @@ void rds_ib_recv_tasklet_fn(unsigned long data)
 	if (rds_ib_ring_empty(&ic->i_recv_ring))
 		rds_ib_stats_inc(s_ib_rx_ring_empty);
 
-	/*
-	 * If the ring is running low, then schedule the thread to refill.
-	 */
 	if (rds_ib_ring_low(&ic->i_recv_ring))
-		queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+		rds_ib_recv_refill(conn, 0);
 }
 
 int rds_ib_recv(struct rds_connection *conn)
@@ -875,7 +869,7 @@ int rds_ib_recv(struct rds_connection *conn)
 	 * we're really low and we want the caller to back off for a bit.
 	 */
 	mutex_lock(&ic->i_recv_mutex);
-	if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
+	if (rds_ib_recv_refill(conn, 0))
 		ret = -ENOMEM;
 	else
 		rds_ib_stats_inc(s_ib_rx_refill_from_thread);
-- 
cgit v1.2.3-59-g8ed1b


From 049ee3f500954176a87f22e6ee3e98aecb1b8958 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Tue, 23 Mar 2010 17:39:07 -0700
Subject: RDS: Change send lock from a mutex to a spinlock

This change allows us to call rds_send_xmit() from a tasklet,
which is crucial to our new operating model.

* Change c_send_lock to a spinlock
* Update stats fields "sem_" to "_lock"
* Remove unneeded rds_conn_is_sending()

About locking between shutdown and send -- send checks if the
connection is up. Shutdown puts the connection into
DISCONNECTING. After this, all threads entering send will exit
immediately. However, a thread could be *in* send_xmit(), so
shutdown acquires the c_send_lock to ensure everyone is out
before proceeding with connection shutdown.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/connection.c | 22 ++++++----------------
 net/rds/rds.h        |  6 +++---
 net/rds/send.c       | 15 +++++++--------
 net/rds/stats.c      |  4 ++--
 4 files changed, 18 insertions(+), 29 deletions(-)

(limited to 'net')

diff --git a/net/rds/connection.c b/net/rds/connection.c
index 88bcaf3f3e16..56aebe444ad3 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -62,18 +62,6 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
 		var |= RDS_INFO_CONNECTION_FLAG_##suffix;	\
 } while (0)
 
-static inline int rds_conn_is_sending(struct rds_connection *conn)
-{
-	int ret = 0;
-
-	if (!mutex_trylock(&conn->c_send_lock))
-		ret = 1;
-	else
-		mutex_unlock(&conn->c_send_lock);
-
-	return ret;
-}
-
 static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
 					      __be32 laddr, __be32 faddr,
 					      struct rds_transport *trans)
@@ -158,7 +146,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 	spin_lock_init(&conn->c_lock);
 	conn->c_next_tx_seq = 1;
 
-	mutex_init(&conn->c_send_lock);
+	spin_lock_init(&conn->c_send_lock);
 	INIT_LIST_HEAD(&conn->c_send_queue);
 	INIT_LIST_HEAD(&conn->c_retrans);
 
@@ -283,10 +271,12 @@ void rds_conn_shutdown(struct rds_connection *conn)
 		}
 		mutex_unlock(&conn->c_cm_lock);
 
-		mutex_lock(&conn->c_send_lock);
+		/* verify everybody's out of rds_send_xmit() */
+		spin_lock_irq(&conn->c_send_lock);
+		spin_unlock_irq(&conn->c_send_lock);
+
 		conn->c_trans->conn_shutdown(conn);
 		rds_conn_reset(conn);
-		mutex_unlock(&conn->c_send_lock);
 
 		if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
 			/* This can happen - eg when we're in the middle of tearing
@@ -476,7 +466,7 @@ static int rds_conn_info_visitor(struct rds_connection *conn,
 	cinfo->flags = 0;
 
 	rds_conn_info_set(cinfo->flags,
-			  rds_conn_is_sending(conn), SENDING);
+			  spin_is_locked(&conn->c_send_lock), SENDING);
 	/* XXX Future: return the state rather than these funky bits */
 	rds_conn_info_set(cinfo->flags,
 			  atomic_read(&conn->c_state) == RDS_CONN_CONNECTING,
diff --git a/net/rds/rds.h b/net/rds/rds.h
index e81d7e478474..c3a668b9cc14 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -91,7 +91,7 @@ struct rds_connection {
 	struct rds_cong_map	*c_lcong;
 	struct rds_cong_map	*c_fcong;
 
-	struct mutex		c_send_lock;	/* protect send ring */
+	spinlock_t		c_send_lock;	/* protect send ring */
 	struct rds_message	*c_xmit_rm;
 	unsigned long		c_xmit_sg;
 	unsigned int		c_xmit_hdr_off;
@@ -548,8 +548,8 @@ struct rds_statistics {
 	uint64_t	s_recv_ping;
 	uint64_t	s_send_queue_empty;
 	uint64_t	s_send_queue_full;
-	uint64_t	s_send_sem_contention;
-	uint64_t	s_send_sem_queue_raced;
+	uint64_t	s_send_lock_contention;
+	uint64_t	s_send_lock_queue_raced;
 	uint64_t	s_send_immediate_retry;
 	uint64_t	s_send_delayed_retry;
 	uint64_t	s_send_drop_acked;
diff --git a/net/rds/send.c b/net/rds/send.c
index 8a0647af5d95..d4feec6ad09c 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -116,19 +116,18 @@ int rds_send_xmit(struct rds_connection *conn)
 	int was_empty = 0;
 	LIST_HEAD(to_be_dropped);
 
+	if (!rds_conn_up(conn))
+		goto out;
+
 	/*
 	 * sendmsg calls here after having queued its message on the send
 	 * queue.  We only have one task feeding the connection at a time.  If
 	 * another thread is already feeding the queue then we back off.  This
 	 * avoids blocking the caller and trading per-connection data between
 	 * caches per message.
-	 *
-	 * The sem holder will issue a retry if they notice that someone queued
-	 * a message after they stopped walking the send queue but before they
-	 * dropped the sem.
 	 */
-	if (!mutex_trylock(&conn->c_send_lock)) {
-		rds_stats_inc(s_send_sem_contention);
+	if (!spin_trylock_irqsave(&conn->c_send_lock, flags)) {
+		rds_stats_inc(s_send_lock_contention);
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -346,7 +345,7 @@ int rds_send_xmit(struct rds_connection *conn)
 	 * stop processing the loop when the transport hasn't taken
 	 * responsibility for forward progress.
 	 */
-	mutex_unlock(&conn->c_send_lock);
+	spin_unlock_irqrestore(&conn->c_send_lock, flags);
 
 	if (send_quota == 0 && !was_empty) {
 		/* We exhausted the send quota, but there's work left to
@@ -360,7 +359,7 @@ int rds_send_xmit(struct rds_connection *conn)
 		 * spin lock */
 		spin_lock_irqsave(&conn->c_lock, flags);
 		if (!list_empty(&conn->c_send_queue)) {
-			rds_stats_inc(s_send_sem_queue_raced);
+			rds_stats_inc(s_send_lock_queue_raced);
 			ret = -EAGAIN;
 		}
 		spin_unlock_irqrestore(&conn->c_lock, flags);
diff --git a/net/rds/stats.c b/net/rds/stats.c
index c66d95d9c262..b77be8be33ba 100644
--- a/net/rds/stats.c
+++ b/net/rds/stats.c
@@ -57,8 +57,8 @@ static const char *const rds_stat_names[] = {
 	"recv_ping",
 	"send_queue_empty",
 	"send_queue_full",
-	"send_sem_contention",
-	"send_sem_queue_raced",
+	"send_lock_contention",
+	"send_lock_queue_raced",
 	"send_immediate_retry",
 	"send_delayed_retry",
 	"send_drop_acked",
-- 
cgit v1.2.3-59-g8ed1b


From 2ad8099b58f274dc23bc866ca259d7e5db87fa1a Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Tue, 23 Mar 2010 17:48:04 -0700
Subject: RDS: rds_send_xmit() locking/irq fixes

rds_message_put() cannot be called with irqs off, so move it after
irqs are re-enabled.

Spinlocks throughout the function do not to use _irqsave because
the lock of c_send_lock at top already disabled irqs.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/send.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/rds/send.c b/net/rds/send.c
index d4feec6ad09c..624a3dc7f060 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -168,7 +168,7 @@ int rds_send_xmit(struct rds_connection *conn)
 		if (!rm) {
 			unsigned int len;
 
-			spin_lock_irqsave(&conn->c_lock, flags);
+			spin_lock(&conn->c_lock);
 
 			if (!list_empty(&conn->c_send_queue)) {
 				rm = list_entry(conn->c_send_queue.next,
@@ -183,7 +183,7 @@ int rds_send_xmit(struct rds_connection *conn)
 				list_move_tail(&rm->m_conn_item, &conn->c_retrans);
 			}
 
-			spin_unlock_irqrestore(&conn->c_lock, flags);
+			spin_unlock(&conn->c_lock);
 
 			if (!rm) {
 				was_empty = 1;
@@ -199,11 +199,10 @@ int rds_send_xmit(struct rds_connection *conn)
 			 */
 			if (rm->rdma.op_active &&
 			    test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
-				spin_lock_irqsave(&conn->c_lock, flags);
+				spin_lock(&conn->c_lock);
 				if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
 					list_move(&rm->m_conn_item, &to_be_dropped);
-				spin_unlock_irqrestore(&conn->c_lock, flags);
-				rds_message_put(rm);
+				spin_unlock(&conn->c_lock);
 				continue;
 			}
 
@@ -326,10 +325,6 @@ int rds_send_xmit(struct rds_connection *conn)
 		}
 	}
 
-	/* Nuke any messages we decided not to retransmit. */
-	if (!list_empty(&to_be_dropped))
-		rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
-
 	if (conn->c_trans->xmit_complete)
 		conn->c_trans->xmit_complete(conn);
 
@@ -347,6 +342,14 @@ int rds_send_xmit(struct rds_connection *conn)
 	 */
 	spin_unlock_irqrestore(&conn->c_send_lock, flags);
 
+	/* Nuke any messages we decided not to retransmit. */
+	if (!list_empty(&to_be_dropped)) {
+		/* irqs on here, so we can put(), unlike above */
+		list_for_each_entry(rm, &to_be_dropped, m_conn_item)
+			rds_message_put(rm);
+		rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
+	}
+
 	if (send_quota == 0 && !was_empty) {
 		/* We exhausted the send quota, but there's work left to
 		 * do. Return and (re-)schedule the send worker.
-- 
cgit v1.2.3-59-g8ed1b


From a7d3a281483684f77e350b045af7f80a149fc4c7 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Mon, 29 Mar 2010 16:20:18 -0700
Subject: RDS: Call rds_send_xmit() directly from sendmsg()

rds_sendmsg() is calling the send worker function to
send the just-queued datagrams, presumably because it wants
the behavior where anything not sent will re-call the send
worker. We now ensure all queued datagrams are sent by retrying
from the send completion handler, so this isn't needed any more.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/send.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rds/send.c b/net/rds/send.c
index 624a3dc7f060..15b715a85fd5 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1073,7 +1073,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 	rds_stats_inc(s_send_queued);
 
 	if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
-		rds_send_worker(&conn->c_send_w.work);
+		rds_send_xmit(conn);
 
 	rds_message_put(rm);
 	return payload_len;
-- 
cgit v1.2.3-59-g8ed1b


From 2fa57129df61bf3fb7d90c5486fe15df94091f61 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Mon, 29 Mar 2010 16:45:40 -0700
Subject: RDS: Bypass workqueue when queueing cong updates

Now that rds_send_xmit() does not block, we can call it directly
instead of going through the helper thread.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/cong.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rds/cong.c b/net/rds/cong.c
index c741e906d49f..75ea686f27d5 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -221,7 +221,7 @@ void rds_cong_queue_updates(struct rds_cong_map *map)
 	list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
 		if (!test_and_set_bit(0, &conn->c_map_queued)) {
 			rds_stats_inc(s_cong_update_queued);
-			queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+			rds_send_xmit(conn);
 		}
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From f2ec76f288118fb18449402d75383212cbcb6762 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Mon, 29 Mar 2010 16:46:46 -0700
Subject: RDS: Use NOWAIT in message_map_pages()

Can no longer block, so use NOWAIT.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/message.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rds/message.c b/net/rds/message.c
index 1f73a7358a8e..dd915e3e492d 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -256,7 +256,7 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
 	int num_sgs = ceil(total_len, PAGE_SIZE);
 	int extra_bytes = num_sgs * sizeof(struct scatterlist);
 
-	rm = rds_message_alloc(extra_bytes, GFP_KERNEL);
+	rm = rds_message_alloc(extra_bytes, GFP_NOWAIT);
 	if (!rm)
 		return ERR_PTR(-ENOMEM);
 
-- 
cgit v1.2.3-59-g8ed1b


From cf4b7389ee812817deeb11da1422004e01b50646 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Mon, 29 Mar 2010 16:50:54 -0700
Subject: RDS: Fix locking in send on m_rs_lock

Do not nest m_rs_lock under c_lock

Disable interrupts in {rdma,atomic}_send_complete

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/send.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rds/send.c b/net/rds/send.c
index 15b715a85fd5..ecda3e6c432c 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -468,8 +468,9 @@ void rds_atomic_send_complete(struct rds_message *rm, int status)
 	struct rds_sock *rs = NULL;
 	struct rm_atomic_op *ao;
 	struct rds_notifier *notifier;
+	unsigned long flags;
 
-	spin_lock(&rm->m_rs_lock);
+	spin_lock_irqsave(&rm->m_rs_lock, flags);
 
 	ao = &rm->atomic;
 	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
@@ -486,7 +487,7 @@ void rds_atomic_send_complete(struct rds_message *rm, int status)
 		ao->op_notifier = NULL;
 	}
 
-	spin_unlock(&rm->m_rs_lock);
+	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
 
 	if (rs) {
 		rds_wake_sk_sleep(rs);
-- 
cgit v1.2.3-59-g8ed1b


From ab1a6926f589c51e7a57ce7544d85272c4acc854 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Mon, 29 Mar 2010 16:52:12 -0700
Subject: RDS: rds_message_unmapped() doesn't need to check if queue active

If the queue has nobody on it, then wake_up does nothing.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/message.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rds/message.c b/net/rds/message.c
index dd915e3e492d..9122b5392111 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -406,8 +406,7 @@ void rds_message_wait(struct rds_message *rm)
 void rds_message_unmapped(struct rds_message *rm)
 {
 	clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
-	if (waitqueue_active(&rds_message_flush_waitq))
-		wake_up(&rds_message_flush_waitq);
+	wake_up(&rds_message_flush_waitq);
 }
 EXPORT_SYMBOL_GPL(rds_message_unmapped);
 
-- 
cgit v1.2.3-59-g8ed1b


From 51e2cba8b5936c13b40f0fa11aa4e84683dbc751 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Mon, 29 Mar 2010 17:47:30 -0700
Subject: RDS: Move atomic stats from general to ib-specific area

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/ib.h       | 2 ++
 net/rds/ib_send.c  | 4 ++--
 net/rds/ib_stats.c | 2 ++
 net/rds/rds.h      | 2 --
 net/rds/stats.c    | 2 --
 5 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.h b/net/rds/ib.h
index 4c2ee2401979..c506604325d5 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -202,6 +202,8 @@ struct rds_ib_statistics {
 	uint64_t	s_ib_rdma_mr_pool_flush;
 	uint64_t	s_ib_rdma_mr_pool_wait;
 	uint64_t	s_ib_rdma_mr_pool_depleted;
+	uint64_t	s_ib_atomic_cswp;
+	uint64_t	s_ib_atomic_fadd;
 };
 
 extern struct workqueue_struct *rds_ib_wq;
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 6461a152bd5b..657037d96cbf 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -132,9 +132,9 @@ static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic,
 			     wc_status, rds_atomic_send_complete);
 
 	if (op->op_type == RDS_ATOMIC_TYPE_CSWP)
-		rds_stats_inc(s_atomic_cswp);
+		rds_ib_stats_inc(s_ib_atomic_cswp);
 	else
-		rds_stats_inc(s_atomic_fadd);
+		rds_ib_stats_inc(s_ib_atomic_fadd);
 }
 
 /*
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
index d2c904dd6fbc..2d5965d6e97c 100644
--- a/net/rds/ib_stats.c
+++ b/net/rds/ib_stats.c
@@ -67,6 +67,8 @@ static const char *const rds_ib_stat_names[] = {
 	"ib_rdma_mr_pool_flush",
 	"ib_rdma_mr_pool_wait",
 	"ib_rdma_mr_pool_depleted",
+	"ib_atomic_cswp",
+	"ib_atomic_fadd",
 };
 
 unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
diff --git a/net/rds/rds.h b/net/rds/rds.h
index c3a668b9cc14..2f19d49fac9c 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -566,8 +566,6 @@ struct rds_statistics {
 	uint64_t	s_cong_update_received;
 	uint64_t	s_cong_send_error;
 	uint64_t	s_cong_send_blocked;
-	uint64_t	s_atomic_cswp;
-	uint64_t	s_atomic_fadd;
 };
 
 /* af_rds.c */
diff --git a/net/rds/stats.c b/net/rds/stats.c
index b77be8be33ba..344929a663e5 100644
--- a/net/rds/stats.c
+++ b/net/rds/stats.c
@@ -75,8 +75,6 @@ static const char *const rds_stat_names[] = {
 	"cong_update_received",
 	"cong_send_error",
 	"cong_send_blocked",
-	"s_atomic_cswp",
-	"s_atomic_fadd",
 };
 
 void rds_stats_info_copy(struct rds_info_iterator *iter,
-- 
cgit v1.2.3-59-g8ed1b


From fcc5450c6386526034edc437e4cb2c67a6fdd7e9 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Mon, 29 Mar 2010 17:08:49 -0700
Subject: RDS: Remove send_quota from send_xmit()

The purpose of the send quota was really to give fairness
when different connections were all using the same
workq thread to send backlogged msgs -- they could only send
so many before another connection could make progress.

Now that each connection is pushing the backlog from its
completion handler, they are all guaranteed to make progress
and the quota isn't needed any longer.

A thread *will* have to send all previously queued data, as well
as any further msgs placed on the queue while while c_send_lock
was held. In a pathological case a single process can get
roped into doing this for long periods while other threads
get off free. But, since it can only do this until the transport
reports full, this is a bounded scenario.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/send.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/rds/send.c b/net/rds/send.c
index ecda3e6c432c..656c5c1b32bc 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -110,12 +110,11 @@ int rds_send_xmit(struct rds_connection *conn)
 	struct rds_message *rm;
 	unsigned long flags;
 	unsigned int tmp;
-	unsigned int send_quota = send_batch_count;
 	struct scatterlist *sg;
 	int ret = 0;
-	int was_empty = 0;
 	LIST_HEAD(to_be_dropped);
 
+restart:
 	if (!rds_conn_up(conn))
 		goto out;
 
@@ -139,7 +138,7 @@ int rds_send_xmit(struct rds_connection *conn)
 	 * spin trying to push headers and data down the connection until
 	 * the connection doesn't make forward progress.
 	 */
-	while (--send_quota) {
+	while (1) {
 
 		rm = conn->c_xmit_rm;
 
@@ -185,10 +184,8 @@ int rds_send_xmit(struct rds_connection *conn)
 
 			spin_unlock(&conn->c_lock);
 
-			if (!rm) {
-				was_empty = 1;
+			if (!rm)
 				break;
-			}
 
 			/* Unfortunately, the way Infiniband deals with
 			 * RDMA to a bad MR key is by moving the entire
@@ -350,20 +347,23 @@ int rds_send_xmit(struct rds_connection *conn)
 		rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
 	}
 
-	if (send_quota == 0 && !was_empty) {
-		/* We exhausted the send quota, but there's work left to
-		 * do. Return and (re-)schedule the send worker.
-		 */
-		ret = -EAGAIN;
-	}
-
-	if (ret == 0 && was_empty) {
+	/*
+	 * Other senders will see we have c_send_lock and exit. We
+	 * need to recheck the send queue and race again for c_send_lock
+	 * to make sure messages don't just sit on the send queue.
+	 *
+	 * If the transport cannot continue (i.e ret != 0), then it must
+	 * call us when more room is available, such as from the tx
+	 * completion handler.
+	 */
+	if (ret == 0) {
 		/* A simple bit test would be way faster than taking the
 		 * spin lock */
 		spin_lock_irqsave(&conn->c_lock, flags);
 		if (!list_empty(&conn->c_send_queue)) {
 			rds_stats_inc(s_send_lock_queue_raced);
-			ret = -EAGAIN;
+			spin_unlock_irqrestore(&conn->c_lock, flags);
+			goto restart;
 		}
 		spin_unlock_irqrestore(&conn->c_lock, flags);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From a40aa9233aa22d69212d02f92e5b607bd4d658f4 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Mon, 29 Mar 2010 17:10:01 -0700
Subject: RDS: Do wait_event_interruptible instead of wait_event

Can't see a reason not to allow signals to interrupt the wait.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/message.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rds/message.c b/net/rds/message.c
index 9122b5392111..4cb1ed704153 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -399,14 +399,14 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc,
  */
 void rds_message_wait(struct rds_message *rm)
 {
-	wait_event(rds_message_flush_waitq,
+	wait_event_interruptible(rds_message_flush_waitq,
 			!test_bit(RDS_MSG_MAPPED, &rm->m_flags));
 }
 
 void rds_message_unmapped(struct rds_message *rm)
 {
 	clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
-	wake_up(&rds_message_flush_waitq);
+	wake_up_interruptible(&rds_message_flush_waitq);
 }
 EXPORT_SYMBOL_GPL(rds_message_unmapped);
 
-- 
cgit v1.2.3-59-g8ed1b


From acfcd4d4ec4ed8cb504f96d4fabb7a94029b362b Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Wed, 31 Mar 2010 18:56:25 -0700
Subject: RDS: Get pong working again

Call send_xmit() directly from pong()

Set pongs as op_active

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/send.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rds/send.c b/net/rds/send.c
index 656c5c1b32bc..de5693cdcefb 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1108,6 +1108,7 @@ rds_send_pong(struct rds_connection *conn, __be16 dport)
 	}
 
 	rm->m_daddr = conn->c_faddr;
+	rm->data.op_active = 1;
 
 	/* If the connection is down, trigger a connect. We may
 	 * have scheduled a delayed reconnect however - in this case
@@ -1135,7 +1136,9 @@ rds_send_pong(struct rds_connection *conn, __be16 dport)
 	rds_stats_inc(s_send_queued);
 	rds_stats_inc(s_send_pong);
 
-	queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+	if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
+		rds_send_xmit(conn);
+
 	rds_message_put(rm);
 	return 0;
 
-- 
cgit v1.2.3-59-g8ed1b


From 9e29db0e3645cafa980e68a9c717a761448389e1 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 15 Apr 2010 16:38:14 -0400
Subject: RDS: Use a generation counter to avoid rds_send_xmit loop

rds_send_xmit is required to loop around after it releases the lock
because someone else could done a trylock, found someone working on the
list and backed off.

But, once we drop our lock, it is possible that someone else does come
in and make progress on the list.  We should detect this and not loop
around if another process is actually working on the list.

This patch adds a generation counter that is bumped every time we
get the lock and do some send work.  If the retry notices someone else
has bumped the generation counter, it does not need to loop around and
continue working.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/connection.c |  1 +
 net/rds/rds.h        |  1 +
 net/rds/send.c       | 11 +++++++----
 3 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/rds/connection.c b/net/rds/connection.c
index 56aebe444ad3..7e4e9dfdbc0b 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -147,6 +147,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 	conn->c_next_tx_seq = 1;
 
 	spin_lock_init(&conn->c_send_lock);
+	atomic_set(&conn->c_send_generation, 1);
 	INIT_LIST_HEAD(&conn->c_send_queue);
 	INIT_LIST_HEAD(&conn->c_retrans);
 
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 2f19d49fac9c..b57cb50c1f22 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -92,6 +92,7 @@ struct rds_connection {
 	struct rds_cong_map	*c_fcong;
 
 	spinlock_t		c_send_lock;	/* protect send ring */
+	atomic_t		c_send_generation;
 	struct rds_message	*c_xmit_rm;
 	unsigned long		c_xmit_sg;
 	unsigned int		c_xmit_hdr_off;
diff --git a/net/rds/send.c b/net/rds/send.c
index de5693cdcefb..663fd60b40cf 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -112,6 +112,7 @@ int rds_send_xmit(struct rds_connection *conn)
 	unsigned int tmp;
 	struct scatterlist *sg;
 	int ret = 0;
+	int gen = 0;
 	LIST_HEAD(to_be_dropped);
 
 restart:
@@ -134,6 +135,8 @@ restart:
 	if (conn->c_trans->xmit_prepare)
 		conn->c_trans->xmit_prepare(conn);
 
+	gen = atomic_inc_return(&conn->c_send_generation);
+
 	/*
 	 * spin trying to push headers and data down the connection until
 	 * the connection doesn't make forward progress.
@@ -359,13 +362,13 @@ restart:
 	if (ret == 0) {
 		/* A simple bit test would be way faster than taking the
 		 * spin lock */
-		spin_lock_irqsave(&conn->c_lock, flags);
+		smp_mb();
 		if (!list_empty(&conn->c_send_queue)) {
 			rds_stats_inc(s_send_lock_queue_raced);
-			spin_unlock_irqrestore(&conn->c_lock, flags);
-			goto restart;
+			if (gen == atomic_read(&conn->c_send_generation)) {
+				goto restart;
+			}
 		}
-		spin_unlock_irqrestore(&conn->c_lock, flags);
 	}
 out:
 	return ret;
-- 
cgit v1.2.3-59-g8ed1b


From ce47f52f42e69d48d1b63fa618fce9cd7ffa9417 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Thu, 15 Apr 2010 17:19:29 -0700
Subject: RDS: Update comments in rds_send_xmit()

Update comments to reflect changes in previous commit.

Keeping as separate commits due to different authorship.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/send.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/rds/send.c b/net/rds/send.c
index 663fd60b40cf..8e3fd9981c2e 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -353,15 +353,14 @@ restart:
 	/*
 	 * Other senders will see we have c_send_lock and exit. We
 	 * need to recheck the send queue and race again for c_send_lock
-	 * to make sure messages don't just sit on the send queue.
+	 * to make sure messages don't just sit on the send queue, if
+	 * somebody hasn't already beat us into the loop.
 	 *
 	 * If the transport cannot continue (i.e ret != 0), then it must
 	 * call us when more room is available, such as from the tx
 	 * completion handler.
 	 */
 	if (ret == 0) {
-		/* A simple bit test would be way faster than taking the
-		 * spin lock */
 		smp_mb();
 		if (!list_empty(&conn->c_send_queue)) {
 			rds_stats_inc(s_send_lock_queue_raced);
-- 
cgit v1.2.3-59-g8ed1b


From 976673ee1b92d939168c8c1fbad3e16c45caa545 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 21 Apr 2010 13:04:43 -0700
Subject: rds: switch to rwlock on bind_lock

The bind_lock is almost entirely readonly, but it gets
hammered during normal operations and is a major bottleneck.

This commit changes it to an rwlock, which takes it from 80%
of the system time on a big numa machine down to much lower
numbers.

A better fix would involve RCU, which is done in a later commit

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 net/rds/bind.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/rds/bind.c b/net/rds/bind.c
index 65de5cbdb576..a65afff6f4b5 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -43,7 +43,7 @@
  * This is now called for every incoming frame so we arguably care much more
  * about it than we used to.
  */
-static DEFINE_SPINLOCK(rds_bind_lock);
+static DEFINE_RWLOCK(rds_bind_lock);
 static struct rb_root rds_bind_tree = RB_ROOT;
 
 static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port,
@@ -88,13 +88,13 @@ struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
 	struct rds_sock *rs;
 	unsigned long flags;
 
-	spin_lock_irqsave(&rds_bind_lock, flags);
+	read_lock_irqsave(&rds_bind_lock, flags);
 	rs = rds_bind_tree_walk(addr, port, NULL);
 	if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
 		rds_sock_addref(rs);
 	else
 		rs = NULL;
-	spin_unlock_irqrestore(&rds_bind_lock, flags);
+	read_unlock_irqrestore(&rds_bind_lock, flags);
 
 	rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
 		ntohs(port));
@@ -116,7 +116,7 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
 		last = rover - 1;
 	}
 
-	spin_lock_irqsave(&rds_bind_lock, flags);
+	write_lock_irqsave(&rds_bind_lock, flags);
 
 	do {
 		if (rover == 0)
@@ -137,7 +137,7 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
 		  rs, &addr, (int)ntohs(*port));
 	}
 
-	spin_unlock_irqrestore(&rds_bind_lock, flags);
+	write_unlock_irqrestore(&rds_bind_lock, flags);
 
 	return ret;
 }
@@ -146,7 +146,7 @@ void rds_remove_bound(struct rds_sock *rs)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&rds_bind_lock, flags);
+	write_lock_irqsave(&rds_bind_lock, flags);
 
 	if (rs->rs_bound_addr) {
 		rdsdebug("rs %p unbinding from %pI4:%d\n",
@@ -158,7 +158,7 @@ void rds_remove_bound(struct rds_sock *rs)
 		rs->rs_bound_addr = 0;
 	}
 
-	spin_unlock_irqrestore(&rds_bind_lock, flags);
+	write_unlock_irqrestore(&rds_bind_lock, flags);
 }
 
 int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
-- 
cgit v1.2.3-59-g8ed1b


From c83188dcd76b1f0c17c31b4bbd8de57c634b19f8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 21 Apr 2010 13:09:28 -0700
Subject: rds: per-rm flush_wait waitq

This removes a global waitqueue used to wait for rds messages
and replaces it with a waitqueue inside the rds_message struct.

The global waitqueue turns into a global lock and significantly
bottlenecks operations on large machines.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 net/rds/message.c | 7 +++----
 net/rds/rds.h     | 2 ++
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/rds/message.c b/net/rds/message.c
index 4cb1ed704153..96e2bf7dc77e 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -35,8 +35,6 @@
 
 #include "rds.h"
 
-static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq);
-
 static unsigned int	rds_exthdr_size[__RDS_EXTHDR_MAX] = {
 [RDS_EXTHDR_NONE]	= 0,
 [RDS_EXTHDR_VERSION]	= sizeof(struct rds_ext_header_version),
@@ -226,6 +224,7 @@ struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp)
 	INIT_LIST_HEAD(&rm->m_sock_item);
 	INIT_LIST_HEAD(&rm->m_conn_item);
 	spin_lock_init(&rm->m_rs_lock);
+	init_waitqueue_head(&rm->m_flush_wait);
 
 out:
 	return rm;
@@ -399,14 +398,14 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc,
  */
 void rds_message_wait(struct rds_message *rm)
 {
-	wait_event_interruptible(rds_message_flush_waitq,
+	wait_event_interruptible(rm->m_flush_wait,
 			!test_bit(RDS_MSG_MAPPED, &rm->m_flags));
 }
 
 void rds_message_unmapped(struct rds_message *rm)
 {
 	clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
-	wake_up_interruptible(&rds_message_flush_waitq);
+	wake_up_interruptible(&rm->m_flush_wait);
 }
 EXPORT_SYMBOL_GPL(rds_message_unmapped);
 
diff --git a/net/rds/rds.h b/net/rds/rds.h
index b57cb50c1f22..c22bd7b49460 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -301,6 +301,8 @@ struct rds_message {
 	 *   -> rs->rs_lock
 	 */
 	spinlock_t		m_rs_lock;
+	wait_queue_head_t	m_flush_wait;
+
 	struct rds_sock		*m_rs;
 
 	/* cookie to send to remote, in rds header */
-- 
cgit v1.2.3-59-g8ed1b


From 764f2dd92f5cd308d1c4372b33fea2b265c093f5 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 22 Apr 2010 21:59:15 -0400
Subject: rds: rcu-ize rds_ib_get_device()

rds_ib_get_device is called very often as we turn an
ip address into a corresponding device structure.  It currently
take a global spinlock as it walks different lists to find active
devices.

This commit changes the lists over to RCU, which isn't very complex
because they are not updated very often at all.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 net/rds/ib.c      |  1 +
 net/rds/ib_rdma.c | 24 ++++++++++++++++--------
 2 files changed, 17 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.c b/net/rds/ib.c
index 927c481b5245..7a2131d37dfb 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -137,6 +137,7 @@ void rds_ib_remove_one(struct ib_device *device)
 	if (!rds_ibdev)
 		return;
 
+	synchronize_rcu();
 	list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
 		list_del(&i_ipaddr->list);
 		kfree(i_ipaddr);
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 242231f09464..7240e583ee58 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -32,6 +32,7 @@
  */
 #include <linux/kernel.h>
 #include <linux/slab.h>
+#include <linux/rculist.h>
 
 #include "rds.h"
 #include "ib.h"
@@ -83,14 +84,14 @@ static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
 	struct rds_ib_ipaddr *i_ipaddr;
 
 	list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
-		spin_lock_irq(&rds_ibdev->spinlock);
-		list_for_each_entry(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
+		rcu_read_lock();
+		list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
 			if (i_ipaddr->ipaddr == ipaddr) {
-				spin_unlock_irq(&rds_ibdev->spinlock);
+				rcu_read_unlock();
 				return rds_ibdev;
 			}
 		}
-		spin_unlock_irq(&rds_ibdev->spinlock);
+		rcu_read_unlock();
 	}
 
 	return NULL;
@@ -107,7 +108,7 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
 	i_ipaddr->ipaddr = ipaddr;
 
 	spin_lock_irq(&rds_ibdev->spinlock);
-	list_add_tail(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
+	list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
 	spin_unlock_irq(&rds_ibdev->spinlock);
 
 	return 0;
@@ -116,16 +117,23 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
 static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
 {
 	struct rds_ib_ipaddr *i_ipaddr, *next;
+	struct rds_ib_ipaddr *to_free = NULL;
+
 
 	spin_lock_irq(&rds_ibdev->spinlock);
-	list_for_each_entry_safe(i_ipaddr, next, &rds_ibdev->ipaddr_list, list) {
+	list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
 		if (i_ipaddr->ipaddr == ipaddr) {
-			list_del(&i_ipaddr->list);
-			kfree(i_ipaddr);
+			list_del_rcu(&i_ipaddr->list);
+			to_free = i_ipaddr;
 			break;
 		}
 	}
 	spin_unlock_irq(&rds_ibdev->spinlock);
+
+	if (to_free) {
+		synchronize_rcu();
+		kfree(to_free);
+	}
 }
 
 int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
-- 
cgit v1.2.3-59-g8ed1b


From 4a81802b5e5e0b059627d7173c917711cf35e668 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Fri, 23 Apr 2010 11:04:21 -0700
Subject: RDS/IB: Remove unused variable in ib_remove_addr()

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/ib_rdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 7240e583ee58..7315fffd3bc8 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -116,7 +116,7 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
 
 static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
 {
-	struct rds_ib_ipaddr *i_ipaddr, *next;
+	struct rds_ib_ipaddr *i_ipaddr;
 	struct rds_ib_ipaddr *to_free = NULL;
 
 
-- 
cgit v1.2.3-59-g8ed1b


From e4c52c98e04937ea87b0979a81354d0040d284f9 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Fri, 23 Apr 2010 10:49:53 -0700
Subject: RDS/IB: add _to_node() macros for numa and use {k,v}malloc_node()

Allocate send/recv rings in memory that is node-local to the HCA.
This significantly helps performance.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/ib.c      | 2 +-
 net/rds/ib.h      | 6 ++++++
 net/rds/ib_cm.c   | 6 ++++--
 net/rds/ib_rdma.c | 5 +++--
 4 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.c b/net/rds/ib.c
index 7a2131d37dfb..7d289d7985fe 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -77,7 +77,7 @@ void rds_ib_add_one(struct ib_device *device)
 		goto free_attr;
 	}
 
-	rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL);
+	rds_ibdev = kmalloc_node(sizeof *rds_ibdev, GFP_KERNEL, ibdev_to_node(device));
 	if (!rds_ibdev)
 		goto free_attr;
 
diff --git a/net/rds/ib.h b/net/rds/ib.h
index c506604325d5..4bc3e2fba25a 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -3,6 +3,8 @@
 
 #include <rdma/ib_verbs.h>
 #include <rdma/rdma_cm.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
 #include "rds.h"
 #include "rdma_transport.h"
 
@@ -167,6 +169,10 @@ struct rds_ib_device {
 	spinlock_t		spinlock;	/* protect the above */
 };
 
+#define pcidev_to_node(pcidev) pcibus_to_node(pcidev->bus)
+#define ibdev_to_node(ibdev) pcidev_to_node(to_pci_dev(ibdev->dma_device))
+#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev)
+
 /* bits for i_ack_flags */
 #define IB_ACK_IN_FLIGHT	0
 #define IB_ACK_REQUESTED	1
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 75eda9c82135..b5d0b60a26bc 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -347,7 +347,8 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 		goto out;
 	}
 
-	ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
+	ic->i_sends = vmalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work),
+				   ibdev_to_node(dev));
 	if (!ic->i_sends) {
 		ret = -ENOMEM;
 		rdsdebug("send allocation failed\n");
@@ -355,7 +356,8 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	}
 	memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
 
-	ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work));
+	ic->i_recvs = vmalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work),
+				   ibdev_to_node(dev));
 	if (!ic->i_recvs) {
 		ret = -ENOMEM;
 		rdsdebug("recv allocation failed\n");
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 7315fffd3bc8..cc341cd70c87 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -297,7 +297,7 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
 		rds_ib_flush_mr_pool(pool, 0);
 	}
 
-	ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
+	ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev));
 	if (!ibmr) {
 		err = -ENOMEM;
 		goto out_no_cigar;
@@ -376,7 +376,8 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm
 	if (page_cnt > fmr_message_size)
 		return -EINVAL;
 
-	dma_pages = kmalloc(sizeof(u64) * page_cnt, GFP_ATOMIC);
+	dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
+				 rdsibdev_to_node(rds_ibdev));
 	if (!dma_pages)
 		return -ENOMEM;
 
-- 
cgit v1.2.3-59-g8ed1b


From 38a4e5e61344490f18241333d7b1b368a3a38748 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 11 May 2010 15:09:45 -0700
Subject: rds: Use RCU for the bind lookup searches

The RDS bind lookups are somewhat expensive in terms of CPU
time and locking overhead.  This commit changes them into a
faster RCU based hash tree instead of the rbtrees they were using
before.

On large NUMA systems it is a significant improvement.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 net/rds/af_rds.c  |  8 +++++
 net/rds/bind.c    | 90 ++++++++++++++++++++++++++++---------------------------
 net/rds/ib_rdma.c |  2 ++
 net/rds/rds.h     |  2 +-
 4 files changed, 57 insertions(+), 45 deletions(-)

(limited to 'net')

diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index ef09340cf7a9..f16d2a92cb89 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -72,7 +72,15 @@ static int rds_release(struct socket *sock)
 	 * with the socket. */
 	rds_clear_recv_queue(rs);
 	rds_cong_remove_socket(rs);
+
+	/*
+	 * the binding lookup hash uses rcu, we need to
+	 * make sure we sychronize_rcu before we free our
+	 * entry
+	 */
 	rds_remove_bound(rs);
+	synchronize_rcu();
+
 	rds_send_drop_to(rs, NULL);
 	rds_rdma_drop_keys(rs);
 	rds_notify_queue_get(rs, NULL);
diff --git a/net/rds/bind.c b/net/rds/bind.c
index a65afff6f4b5..2f6b3fcc79f8 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -34,45 +34,52 @@
 #include <net/sock.h>
 #include <linux/in.h>
 #include <linux/if_arp.h>
+#include <linux/jhash.h>
 #include "rds.h"
 
-/*
- * XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't
- * particularly zippy.
- *
- * This is now called for every incoming frame so we arguably care much more
- * about it than we used to.
- */
-static DEFINE_RWLOCK(rds_bind_lock);
-static struct rb_root rds_bind_tree = RB_ROOT;
+#define BIND_HASH_SIZE 1024
+static struct hlist_head bind_hash_table[BIND_HASH_SIZE];
+static DEFINE_SPINLOCK(rds_bind_lock);
+
+static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port)
+{
+	return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) &
+				  (BIND_HASH_SIZE - 1));
+}
 
-static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port,
-					   struct rds_sock *insert)
+static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port,
+					struct rds_sock *insert)
 {
-	struct rb_node **p = &rds_bind_tree.rb_node;
-	struct rb_node *parent = NULL;
 	struct rds_sock *rs;
+	struct hlist_node *node;
+	struct hlist_head *head = hash_to_bucket(addr, port);
 	u64 cmp;
 	u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port);
 
-	while (*p) {
-		parent = *p;
-		rs = rb_entry(parent, struct rds_sock, rs_bound_node);
-
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(rs, node, head, rs_bound_node) {
 		cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) |
 		      be16_to_cpu(rs->rs_bound_port);
 
-		if (needle < cmp)
-			p = &(*p)->rb_left;
-		else if (needle > cmp)
-			p = &(*p)->rb_right;
-		else
+		if (cmp == needle) {
+			rcu_read_unlock();
 			return rs;
+		}
 	}
+	rcu_read_unlock();
 
 	if (insert) {
-		rb_link_node(&insert->rs_bound_node, parent, p);
-		rb_insert_color(&insert->rs_bound_node, &rds_bind_tree);
+		/*
+		 * make sure our addr and port are set before
+		 * we are added to the list, other people
+		 * in rcu will find us as soon as the
+		 * hlist_add_head_rcu is done
+		 */
+		insert->rs_bound_addr = addr;
+		insert->rs_bound_port = port;
+		rds_sock_addref(insert);
+
+		hlist_add_head_rcu(&insert->rs_bound_node, head);
 	}
 	return NULL;
 }
@@ -86,15 +93,13 @@ static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port,
 struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
 {
 	struct rds_sock *rs;
-	unsigned long flags;
 
-	read_lock_irqsave(&rds_bind_lock, flags);
-	rs = rds_bind_tree_walk(addr, port, NULL);
+	rs = rds_bind_lookup(addr, port, NULL);
+
 	if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
 		rds_sock_addref(rs);
 	else
 		rs = NULL;
-	read_unlock_irqrestore(&rds_bind_lock, flags);
 
 	rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
 		ntohs(port));
@@ -116,28 +121,21 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
 		last = rover - 1;
 	}
 
-	write_lock_irqsave(&rds_bind_lock, flags);
+	spin_lock_irqsave(&rds_bind_lock, flags);
 
 	do {
 		if (rover == 0)
 			rover++;
-		if (!rds_bind_tree_walk(addr, cpu_to_be16(rover), rs)) {
-			*port = cpu_to_be16(rover);
+		if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) {
+			*port = rs->rs_bound_port;
 			ret = 0;
+			rdsdebug("rs %p binding to %pI4:%d\n",
+			  rs, &addr, (int)ntohs(*port));
 			break;
 		}
 	} while (rover++ != last);
 
-	if (ret == 0)  {
-		rs->rs_bound_addr = addr;
-		rs->rs_bound_port = *port;
-		rds_sock_addref(rs);
-
-		rdsdebug("rs %p binding to %pI4:%d\n",
-		  rs, &addr, (int)ntohs(*port));
-	}
-
-	write_unlock_irqrestore(&rds_bind_lock, flags);
+	spin_unlock_irqrestore(&rds_bind_lock, flags);
 
 	return ret;
 }
@@ -146,19 +144,19 @@ void rds_remove_bound(struct rds_sock *rs)
 {
 	unsigned long flags;
 
-	write_lock_irqsave(&rds_bind_lock, flags);
+	spin_lock_irqsave(&rds_bind_lock, flags);
 
 	if (rs->rs_bound_addr) {
 		rdsdebug("rs %p unbinding from %pI4:%d\n",
 		  rs, &rs->rs_bound_addr,
 		  ntohs(rs->rs_bound_port));
 
-		rb_erase(&rs->rs_bound_node, &rds_bind_tree);
+		hlist_del_init_rcu(&rs->rs_bound_node);
 		rds_sock_put(rs);
 		rs->rs_bound_addr = 0;
 	}
 
-	write_unlock_irqrestore(&rds_bind_lock, flags);
+	spin_unlock_irqrestore(&rds_bind_lock, flags);
 }
 
 int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
@@ -198,5 +196,9 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 
 out:
 	release_sock(sk);
+
+	/* we might have called rds_remove_bound on error */
+	if (ret)
+		synchronize_rcu();
 	return ret;
 }
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index cc341cd70c87..4ba01b9ffd44 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -303,6 +303,8 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
 		goto out_no_cigar;
 	}
 
+	memset(ibmr, 0, sizeof(*ibmr));
+
 	ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
 			(IB_ACCESS_LOCAL_WRITE |
 			 IB_ACCESS_REMOTE_READ |
diff --git a/net/rds/rds.h b/net/rds/rds.h
index c22bd7b49460..241a0859d16e 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -452,7 +452,7 @@ struct rds_sock {
 	 * bound_addr used for both incoming and outgoing, no INADDR_ANY
 	 * support.
 	 */
-	struct rb_node		rs_bound_node;
+	struct hlist_node	rs_bound_node;
 	__be32			rs_bound_addr;
 	__be32			rs_conn_addr;
 	__be16			rs_bound_port;
-- 
cgit v1.2.3-59-g8ed1b


From 7e3f2952eeb1a0fe2aa9882fd1705a88f9d89b35 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 11 May 2010 15:11:11 -0700
Subject: rds: don't let RDS shutdown a connection while senders are present

This is the first in a long line of patches that tries to fix races
between RDS connection shutdown and RDS traffic.

Here we are maintaining a count of active senders to make sure
the connection doesn't go away while they are using it.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 net/rds/connection.c |  7 +++++++
 net/rds/ib_recv.c    | 12 ------------
 net/rds/message.c    |  5 ++++-
 net/rds/rds.h        |  1 +
 net/rds/send.c       | 17 ++++++++++++++---
 5 files changed, 26 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/net/rds/connection.c b/net/rds/connection.c
index 7e4e9dfdbc0b..9c249f394f29 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -148,6 +148,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 
 	spin_lock_init(&conn->c_send_lock);
 	atomic_set(&conn->c_send_generation, 1);
+	atomic_set(&conn->c_senders, 0);
 	INIT_LIST_HEAD(&conn->c_send_queue);
 	INIT_LIST_HEAD(&conn->c_retrans);
 
@@ -276,6 +277,12 @@ void rds_conn_shutdown(struct rds_connection *conn)
 		spin_lock_irq(&conn->c_send_lock);
 		spin_unlock_irq(&conn->c_send_lock);
 
+		while(atomic_read(&conn->c_senders)) {
+			schedule_timeout(1);
+			spin_lock_irq(&conn->c_send_lock);
+			spin_unlock_irq(&conn->c_send_lock);
+		}
+
 		conn->c_trans->conn_shutdown(conn);
 		rds_conn_reset(conn);
 
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 8f041f7954a2..24d14615f41a 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -863,18 +863,6 @@ int rds_ib_recv(struct rds_connection *conn)
 	int ret = 0;
 
 	rdsdebug("conn %p\n", conn);
-
-	/*
-	 * If we get a temporary posting failure in this context then
-	 * we're really low and we want the caller to back off for a bit.
-	 */
-	mutex_lock(&ic->i_recv_mutex);
-	if (rds_ib_recv_refill(conn, 0))
-		ret = -ENOMEM;
-	else
-		rds_ib_stats_inc(s_ib_rx_refill_from_thread);
-	mutex_unlock(&ic->i_recv_mutex);
-
 	if (rds_conn_up(conn))
 		rds_ib_attempt_ack(ic);
 
diff --git a/net/rds/message.c b/net/rds/message.c
index 96e2bf7dc77e..84f937f11d47 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -81,7 +81,10 @@ static void rds_message_purge(struct rds_message *rm)
 void rds_message_put(struct rds_message *rm)
 {
 	rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
-
+	if (atomic_read(&rm->m_refcount) == 0) {
+printk(KERN_CRIT "danger refcount zero on %p\n", rm);
+WARN_ON(1);
+	}
 	if (atomic_dec_and_test(&rm->m_refcount)) {
 		BUG_ON(!list_empty(&rm->m_sock_item));
 		BUG_ON(!list_empty(&rm->m_conn_item));
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 241a0859d16e..4ab3d1aa0237 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -93,6 +93,7 @@ struct rds_connection {
 
 	spinlock_t		c_send_lock;	/* protect send ring */
 	atomic_t		c_send_generation;
+	atomic_t		c_senders;
 	struct rds_message	*c_xmit_rm;
 	unsigned long		c_xmit_sg;
 	unsigned int		c_xmit_hdr_off;
diff --git a/net/rds/send.c b/net/rds/send.c
index 8e3fd9981c2e..d35c43ff792e 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -60,15 +60,23 @@ void rds_send_reset(struct rds_connection *conn)
 	struct rds_message *rm, *tmp;
 	unsigned long flags;
 
+	spin_lock_irqsave(&conn->c_send_lock, flags);
 	if (conn->c_xmit_rm) {
+		rm = conn->c_xmit_rm;
+		conn->c_xmit_rm = NULL;
 		/* Tell the user the RDMA op is no longer mapped by the
 		 * transport. This isn't entirely true (it's flushed out
 		 * independently) but as the connection is down, there's
 		 * no ongoing RDMA to/from that memory */
-		rds_message_unmapped(conn->c_xmit_rm);
-		rds_message_put(conn->c_xmit_rm);
-		conn->c_xmit_rm = NULL;
+printk(KERN_CRIT "send reset unmapping %p\n", rm);
+		rds_message_unmapped(rm);
+		spin_unlock_irqrestore(&conn->c_send_lock, flags);
+
+		rds_message_put(rm);
+	} else {
+		spin_unlock_irqrestore(&conn->c_send_lock, flags);
 	}
+
 	conn->c_xmit_sg = 0;
 	conn->c_xmit_hdr_off = 0;
 	conn->c_xmit_data_off = 0;
@@ -131,6 +139,7 @@ restart:
 		ret = -ENOMEM;
 		goto out;
 	}
+	atomic_inc(&conn->c_senders);
 
 	if (conn->c_trans->xmit_prepare)
 		conn->c_trans->xmit_prepare(conn);
@@ -350,6 +359,8 @@ restart:
 		rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
 	}
 
+	atomic_dec(&conn->c_senders);
+
 	/*
 	 * Other senders will see we have c_send_lock and exit. We
 	 * need to recheck the send queue and race again for c_send_lock
-- 
cgit v1.2.3-59-g8ed1b


From c9e65383a20d9a656db70efbf67e57f8115ad776 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 11 May 2010 15:14:16 -0700
Subject: rds: Fix RDMA message reference counting

The RDS send_xmit code was trying to get fancy with message
counting and was dropping the final reference on the RDMA messages
too early.  This resulted in memory corruption and oopsen.

The fix here is to always add a ref as the parts of the message passes
through rds_send_xmit, and always drop a ref as the parts of the message
go through completion handling.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 net/rds/ib_send.c | 11 ++++++-----
 net/rds/send.c    | 11 ++++++++---
 2 files changed, 14 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 657037d96cbf..82459e52c771 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -268,11 +268,12 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
 			if (send->s_queued + HZ/2 < jiffies)
 				rds_ib_stats_inc(s_ib_tx_stalled);
 
-			if (&send->s_op == &rm->m_final_op) {
-				/* If anyone waited for this message to get flushed out, wake
-				 * them up now */
-				rds_message_unmapped(rm);
-
+			if (send->s_op) {
+				if (send->s_op == rm->m_final_op) {
+					/* If anyone waited for this message to get flushed out, wake
+					 * them up now */
+					rds_message_unmapped(rm);
+				}
 				rds_message_put(rm);
 				send->s_op = NULL;
 			}
diff --git a/net/rds/send.c b/net/rds/send.c
index d35c43ff792e..5c6d4a0be0d7 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -68,7 +68,6 @@ void rds_send_reset(struct rds_connection *conn)
 		 * transport. This isn't entirely true (it's flushed out
 		 * independently) but as the connection is down, there's
 		 * no ongoing RDMA to/from that memory */
-printk(KERN_CRIT "send reset unmapping %p\n", rm);
 		rds_message_unmapped(rm);
 		spin_unlock_irqrestore(&conn->c_send_lock, flags);
 
@@ -234,10 +233,13 @@ restart:
 
 		/* The transport either sends the whole rdma or none of it */
 		if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) {
+			rds_message_addref(rm);
 			rm->m_final_op = &rm->rdma;
 			ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
-			if (ret)
+			if (ret) {
+				rds_message_put(rm);
 				break;
+			}
 			conn->c_xmit_rdma_sent = 1;
 
 			/* The transport owns the mapped memory for now.
@@ -246,10 +248,13 @@ restart:
 		}
 
 		if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
+			rds_message_addref(rm);
 			rm->m_final_op = &rm->atomic;
 			ret = conn->c_trans->xmit_atomic(conn, &rm->atomic);
-			if (ret)
+			if (ret) {
+				rds_message_put(rm);
 				break;
+			}
 			conn->c_xmit_atomic_sent = 1;
 
 			/* The transport owns the mapped memory for now.
-- 
cgit v1.2.3-59-g8ed1b


From abf454398c2ebafc629ebb8b149f5a752c79e919 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 11 May 2010 15:14:52 -0700
Subject: RDS: use locking on the connection hash list

rds_conn_destroy really needs locking while it changes the
connection hash.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 net/rds/connection.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/rds/connection.c b/net/rds/connection.c
index 9c249f394f29..87df15b9f8e4 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -321,7 +321,10 @@ void rds_conn_destroy(struct rds_connection *conn)
 		 "%pI4\n", conn, &conn->c_laddr,
 		 &conn->c_faddr);
 
+	/* Ensure conn will not be scheduled for reconnect */
+	spin_lock_irq(&rds_conn_lock);
 	hlist_del_init(&conn->c_hash_node);
+	spin_unlock_irq(&rds_conn_lock);
 
 	/* wait for the rds thread to shut it down */
 	atomic_set(&conn->c_state, RDS_CONN_ERROR);
-- 
cgit v1.2.3-59-g8ed1b


From bcf50ef2ce3c5d8f2fe995259da16677898cb300 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 11 May 2010 15:15:15 -0700
Subject: rds: use RCU to protect the connection hash

The connection hash was almost entirely RCU ready, this
just makes the final couple of changes to use RCU instead
of spinlocks for everything.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 net/rds/connection.c | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

(limited to 'net')

diff --git a/net/rds/connection.c b/net/rds/connection.c
index 87df15b9f8e4..180b83ab2607 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -62,6 +62,7 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
 		var |= RDS_INFO_CONNECTION_FLAG_##suffix;	\
 } while (0)
 
+/* rcu read lock must be held or the connection spinlock */
 static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
 					      __be32 laddr, __be32 faddr,
 					      struct rds_transport *trans)
@@ -69,7 +70,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
 	struct rds_connection *conn, *ret = NULL;
 	struct hlist_node *pos;
 
-	hlist_for_each_entry(conn, pos, head, c_hash_node) {
+	hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) {
 		if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
 				conn->c_trans == trans) {
 			ret = conn;
@@ -119,7 +120,8 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 	unsigned long flags;
 	int ret;
 
-	spin_lock_irqsave(&rds_conn_lock, flags);
+
+	rcu_read_lock();
 	conn = rds_conn_lookup(head, laddr, faddr, trans);
 	if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
 	    !is_outgoing) {
@@ -130,7 +132,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 		parent = conn;
 		conn = parent->c_passive;
 	}
-	spin_unlock_irqrestore(&rds_conn_lock, flags);
+	rcu_read_unlock();
 	if (conn)
 		goto out;
 
@@ -227,7 +229,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 			kmem_cache_free(rds_conn_slab, conn);
 			conn = found;
 		} else {
-			hlist_add_head(&conn->c_hash_node, head);
+			hlist_add_head_rcu(&conn->c_hash_node, head);
 			rds_cong_add_conn(conn);
 			rds_conn_count++;
 		}
@@ -306,8 +308,13 @@ void rds_conn_shutdown(struct rds_connection *conn)
 	 * to the conn hash, so we never trigger a reconnect on this
 	 * conn - the reconnect is always triggered by the active peer. */
 	cancel_delayed_work_sync(&conn->c_conn_w);
-	if (!hlist_unhashed(&conn->c_hash_node))
+	rcu_read_lock();
+	if (!hlist_unhashed(&conn->c_hash_node)) {
+		rcu_read_unlock();
 		rds_queue_reconnect(conn);
+	} else {
+		rcu_read_unlock();
+	}
 }
 
 /*
@@ -323,14 +330,12 @@ void rds_conn_destroy(struct rds_connection *conn)
 
 	/* Ensure conn will not be scheduled for reconnect */
 	spin_lock_irq(&rds_conn_lock);
-	hlist_del_init(&conn->c_hash_node);
+	hlist_del_init_rcu(&conn->c_hash_node);
 	spin_unlock_irq(&rds_conn_lock);
 
-	/* wait for the rds thread to shut it down */
-	atomic_set(&conn->c_state, RDS_CONN_ERROR);
-	cancel_delayed_work(&conn->c_conn_w);
-	queue_work(rds_wq, &conn->c_down_w);
-	flush_workqueue(rds_wq);
+	synchronize_rcu();
+
+	rds_conn_shutdown(conn);
 
 	/* tear down queued messages */
 	list_for_each_entry_safe(rm, rtmp,
@@ -369,17 +374,16 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
 	struct list_head *list;
 	struct rds_connection *conn;
 	struct rds_message *rm;
-	unsigned long flags;
 	unsigned int total = 0;
 	size_t i;
 
 	len /= sizeof(struct rds_info_message);
 
-	spin_lock_irqsave(&rds_conn_lock, flags);
+	rcu_read_lock();
 
 	for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
 	     i++, head++) {
-		hlist_for_each_entry(conn, pos, head, c_hash_node) {
+		hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) {
 			if (want_send)
 				list = &conn->c_send_queue;
 			else
@@ -399,8 +403,7 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
 			spin_unlock(&conn->c_lock);
 		}
 	}
-
-	spin_unlock_irqrestore(&rds_conn_lock, flags);
+	rcu_read_unlock();
 
 	lens->nr = total;
 	lens->each = sizeof(struct rds_info_message);
@@ -430,19 +433,17 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
 	uint64_t buffer[(item_len + 7) / 8];
 	struct hlist_head *head;
 	struct hlist_node *pos;
-	struct hlist_node *tmp;
 	struct rds_connection *conn;
-	unsigned long flags;
 	size_t i;
 
-	spin_lock_irqsave(&rds_conn_lock, flags);
+	rcu_read_lock();
 
 	lens->nr = 0;
 	lens->each = item_len;
 
 	for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
 	     i++, head++) {
-		hlist_for_each_entry_safe(conn, pos, tmp, head, c_hash_node) {
+		hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) {
 
 			/* XXX no c_lock usage.. */
 			if (!visitor(conn, buffer))
@@ -458,8 +459,7 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
 			lens->nr++;
 		}
 	}
-
-	spin_unlock_irqrestore(&rds_conn_lock, flags);
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(rds_for_each_conn_info);
 
-- 
cgit v1.2.3-59-g8ed1b


From 1cc2228c599f173d77000a250bf0541294e1a7be Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 11 May 2010 16:15:35 -0700
Subject: rds: Fix reference counting on the for xmit_atomic and xmit_rdma

This makes sure we have the proper number of references in
rds_ib_xmit_atomic and rds_ib_xmit_rdma.  We also consistently
drop references the same way for all message types as the IOs end.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 net/rds/ib_send.c | 10 +++++++++-
 net/rds/send.c    | 10 ++--------
 2 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 82459e52c771..209dbc6d159d 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -757,6 +757,8 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
 	send->s_wr.next = NULL;
 	send->s_wr.wr.atomic.remote_addr = op->op_remote_addr;
 	send->s_wr.wr.atomic.rkey = op->op_rkey;
+	send->s_op = op;
+	rds_message_addref(container_of(send->s_op, struct rds_message, atomic));
 
 	/* map 8 byte retval buffer to the device */
 	ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE);
@@ -857,13 +859,13 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
 	for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
 		send->s_wr.send_flags = 0;
 		send->s_queued = jiffies;
+		send->s_op = NULL;
 
 		rds_ib_set_wr_signal_state(ic, send, op->op_notify);
 
 		send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
 		send->s_wr.wr.rdma.remote_addr = remote_addr;
 		send->s_wr.wr.rdma.rkey = op->op_rkey;
-		send->s_op = op;
 
 		if (num_sge > rds_ibdev->max_sge) {
 			send->s_wr.num_sge = rds_ibdev->max_sge;
@@ -899,6 +901,12 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
 			send = ic->i_sends;
 	}
 
+	/* give a reference to the last op */
+	if (scat == &op->op_sg[op->op_count]) {
+		prev->s_op = op;
+		rds_message_addref(container_of(op, struct rds_message, rdma));
+	}
+
 	if (i < work_alloc) {
 		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
 		work_alloc = i;
diff --git a/net/rds/send.c b/net/rds/send.c
index 5c6d4a0be0d7..437f1e72609c 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -233,13 +233,10 @@ restart:
 
 		/* The transport either sends the whole rdma or none of it */
 		if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) {
-			rds_message_addref(rm);
 			rm->m_final_op = &rm->rdma;
 			ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
-			if (ret) {
-				rds_message_put(rm);
+			if (ret)
 				break;
-			}
 			conn->c_xmit_rdma_sent = 1;
 
 			/* The transport owns the mapped memory for now.
@@ -248,13 +245,10 @@ restart:
 		}
 
 		if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
-			rds_message_addref(rm);
 			rm->m_final_op = &rm->atomic;
 			ret = conn->c_trans->xmit_atomic(conn, &rm->atomic);
-			if (ret) {
-				rds_message_put(rm);
+			if (ret)
 				break;
-			}
 			conn->c_xmit_atomic_sent = 1;
 
 			/* The transport owns the mapped memory for now.
-- 
cgit v1.2.3-59-g8ed1b


From a46ca94e7fb2c93a59e08b42fd77d8c478fda5fc Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Mon, 24 May 2010 13:14:59 -0700
Subject: RDS/IB: rds_ib_cm_handle_connect() forgot to unlock c_cm_lock

rds_ib_cm_handle_connect() could return without unlocking the c_conn_lock if
rds_setup_qp() failed.  Rather than adding another imbalanced mutex_unlock() to
this error path we only unlock the mutex once as we exit the function, reducing
the likelyhood of making this same mistake in the future.  We remove the
previous mulitple return sites, leaving one unambigious return path.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---
 net/rds/ib_cm.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index b5d0b60a26bc..73253f7c1fa3 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -428,7 +428,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 	struct rds_ib_connection *ic = NULL;
 	struct rdma_conn_param conn_param;
 	u32 version;
-	int err, destroy = 1;
+	int err = 1, destroy = 1;
 
 	/* Check whether the remote protocol version matches ours. */
 	version = rds_ib_protocol_compatible(event);
@@ -467,7 +467,6 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 			/* Wait and see - our connect may still be succeeding */
 			rds_ib_stats_inc(s_ib_connect_raced);
 		}
-		mutex_unlock(&conn->c_cm_lock);
 		goto out;
 	}
 
@@ -504,16 +503,14 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 
 	/* rdma_accept() calls rdma_reject() internally if it fails */
 	err = rdma_accept(cm_id, &conn_param);
-	mutex_unlock(&conn->c_cm_lock);
-	if (err) {
+	if (err)
 		rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err);
-		goto out;
-	}
-
-	return 0;
 
 out:
-	rdma_reject(cm_id, NULL, 0);
+	if (conn)
+		mutex_unlock(&conn->c_cm_lock);
+	if (err)
+		rdma_reject(cm_id, NULL, 0);
 	return destroy;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 89bf9d4158b5a1b6bd00960eb2e47601ec8cc138 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Tue, 18 May 2010 15:44:50 -0700
Subject: RDS/IB: get the xmit max_sge from the RDS IB device on the connection

rds_ib_xmit_rdma() was calling ib_get_client_data() to get at the rds_ibdevice
just to get the max_sge for the transmit.  This patch instead has it get it
directly off the rds_ibdev which is stored on the connection.

The current code won't free the rds_ibdev until all the IB connections that use
it are freed.  So it's safe to reference the rds_ibdev this way.  In the future
it also makes it easier to support proper reference counting of the rds_ibdev
struct.

As an additional bonus, this gets rid of the performance hit of calling in to
the IB stack to look up the rds_ibdev.  The current implementation in the IB
stack acquires an interrupt blocking spinlock to protect the registration of
client callback data.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---
 net/rds/ib_send.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 209dbc6d159d..3f91e794eae9 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -806,10 +806,10 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
 	struct rds_ib_send_work *first;
 	struct rds_ib_send_work *prev;
 	struct ib_send_wr *failed_wr;
-	struct rds_ib_device *rds_ibdev;
 	struct scatterlist *scat;
 	unsigned long len;
 	u64 remote_addr = op->op_remote_addr;
+	u32 max_sge = ic->rds_ibdev->max_sge;
 	u32 pos;
 	u32 work_alloc;
 	u32 i;
@@ -818,8 +818,6 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
 	int ret;
 	int num_sge;
 
-	rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
-
 	/* map the op the first time we see it */
 	if (!op->op_mapped) {
 		op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
@@ -839,7 +837,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
 	 * Instead of knowing how to return a partial rdma read/write we insist that there
 	 * be enough work requests to send the entire message.
 	 */
-	i = ceil(op->op_count, rds_ibdev->max_sge);
+	i = ceil(op->op_count, max_sge);
 
 	work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
 	if (work_alloc != i) {
@@ -867,9 +865,9 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
 		send->s_wr.wr.rdma.remote_addr = remote_addr;
 		send->s_wr.wr.rdma.rkey = op->op_rkey;
 
-		if (num_sge > rds_ibdev->max_sge) {
-			send->s_wr.num_sge = rds_ibdev->max_sge;
-			num_sge -= rds_ibdev->max_sge;
+		if (num_sge > max_sge) {
+			send->s_wr.num_sge = max_sge;
+			num_sge -= max_sge;
 		} else {
 			send->s_wr.num_sge = num_sge;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From 3e0249f9c05cb77b66f7f09644ca9ca208d991a9 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Tue, 18 May 2010 15:48:51 -0700
Subject: RDS/IB: add refcount tracking to struct rds_ib_device

The RDS IB client .remove callback used to free the rds_ibdev for the given
device unconditionally.  This could race other users of the struct.  This patch
adds refcounting so that we only free the rds_ibdev once all of its users are
done.

Many rds_ibdev users are tied to connections.  We give the connection a
reference and change these users to reference the device in the connection
instead of looking it up in the IB client data.  The only user of the IB client
data remaining is the first lookup of the device as connections are built up.

Incrementing the reference count of a device found in the IB client data could
race with final freeing so we use an RCU grace period to make sure that freeing
won't happen until those lookups are done.

MRs need the rds_ibdev to get at the pool that they're freed in to.  They exist
outside a connection and many MRs can reference different devices from one
socket, so it was natural to have each MR hold a reference.  MR refs can be
dropped from interrupt handlers and final device teardown can block so we push
it off to a work struct.  Pool teardown had to be fixed to cancel its pending
work instead of deadlocking waiting for all queued work, including itself, to
finish.

MRs get their reference from the global device list, which gets a reference.
It is left unprotected by locks and remains racy.  A simple global lock would
be a significant bottleneck.  More scalable (complicated) locking should be
done carefully in a later patch.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---
 net/rds/ib.c      | 129 ++++++++++++++++++++++++++++++++++++++++--------------
 net/rds/ib.h      |   4 ++
 net/rds/ib_cm.c   |  33 ++++++--------
 net/rds/ib_rdma.c |  14 +++++-
 4 files changed, 125 insertions(+), 55 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.c b/net/rds/ib.c
index 7d289d7985fe..1732f8effb59 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -59,6 +59,38 @@ struct list_head rds_ib_devices;
 DEFINE_SPINLOCK(ib_nodev_conns_lock);
 LIST_HEAD(ib_nodev_conns);
 
+/*
+ * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references
+ * from interrupt context so we push freing off into a work struct in krdsd.
+ */
+static void rds_ib_dev_free(struct work_struct *work)
+{
+	struct rds_ib_ipaddr *i_ipaddr, *i_next;
+	struct rds_ib_device *rds_ibdev = container_of(work,
+					struct rds_ib_device, free_work);
+
+	if (rds_ibdev->mr_pool)
+		rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
+	if (rds_ibdev->mr)
+		ib_dereg_mr(rds_ibdev->mr);
+	if (rds_ibdev->pd)
+		ib_dealloc_pd(rds_ibdev->pd);
+
+	list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
+		list_del(&i_ipaddr->list);
+		kfree(i_ipaddr);
+	}
+
+	kfree(rds_ibdev);
+}
+
+void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
+{
+	BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
+	if (atomic_dec_and_test(&rds_ibdev->refcount))
+		queue_work(rds_wq, &rds_ibdev->free_work);
+}
+
 void rds_ib_add_one(struct ib_device *device)
 {
 	struct rds_ib_device *rds_ibdev;
@@ -77,11 +109,14 @@ void rds_ib_add_one(struct ib_device *device)
 		goto free_attr;
 	}
 
-	rds_ibdev = kmalloc_node(sizeof *rds_ibdev, GFP_KERNEL, ibdev_to_node(device));
+	rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
+				 ibdev_to_node(device));
 	if (!rds_ibdev)
 		goto free_attr;
 
 	spin_lock_init(&rds_ibdev->spinlock);
+	atomic_set(&rds_ibdev->refcount, 1);
+	INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
 
 	rds_ibdev->max_wrs = dev_attr->max_qp_wr;
 	rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
@@ -96,67 +131,93 @@ void rds_ib_add_one(struct ib_device *device)
 
 	rds_ibdev->dev = device;
 	rds_ibdev->pd = ib_alloc_pd(device);
-	if (IS_ERR(rds_ibdev->pd))
-		goto free_dev;
+	if (IS_ERR(rds_ibdev->pd)) {
+		rds_ibdev->pd = NULL;
+		goto put_dev;
+	}
 
-	rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd,
-				      IB_ACCESS_LOCAL_WRITE);
-	if (IS_ERR(rds_ibdev->mr))
-		goto err_pd;
+	rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(rds_ibdev->mr)) {
+		rds_ibdev->mr = NULL;
+		goto put_dev;
+	}
 
 	rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
 	if (IS_ERR(rds_ibdev->mr_pool)) {
 		rds_ibdev->mr_pool = NULL;
-		goto err_mr;
+		goto put_dev;
 	}
 
 	INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
 	INIT_LIST_HEAD(&rds_ibdev->conn_list);
 	list_add_tail(&rds_ibdev->list, &rds_ib_devices);
+	atomic_inc(&rds_ibdev->refcount);
 
 	ib_set_client_data(device, &rds_ib_client, rds_ibdev);
+	atomic_inc(&rds_ibdev->refcount);
 
-	goto free_attr;
-
-err_mr:
-	ib_dereg_mr(rds_ibdev->mr);
-err_pd:
-	ib_dealloc_pd(rds_ibdev->pd);
-free_dev:
-	kfree(rds_ibdev);
+put_dev:
+	rds_ib_dev_put(rds_ibdev);
 free_attr:
 	kfree(dev_attr);
 }
 
+/*
+ * New connections use this to find the device to associate with the
+ * connection.  It's not in the fast path so we're not concerned about the
+ * performance of the IB call.  (As of this writing, it uses an interrupt
+ * blocking spinlock to serialize walking a per-device list of all registered
+ * clients.)
+ *
+ * RCU is used to handle incoming connections racing with device teardown.
+ * Rather than use a lock to serialize removal from the client_data and
+ * getting a new reference, we use an RCU grace period.  The destruction
+ * path removes the device from client_data and then waits for all RCU
+ * readers to finish.
+ *
+ * A new connection can get NULL from this if its arriving on a
+ * device that is in the process of being removed.
+ */
+struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device)
+{
+	struct rds_ib_device *rds_ibdev;
+
+	rcu_read_lock();
+	rds_ibdev = ib_get_client_data(device, &rds_ib_client);
+	if (rds_ibdev)
+		atomic_inc(&rds_ibdev->refcount);
+	rcu_read_unlock();
+	return rds_ibdev;
+}
+
+/*
+ * The IB stack is letting us know that a device is going away.  This can
+ * happen if the underlying HCA driver is removed or if PCI hotplug is removing
+ * the pci function, for example.
+ *
+ * This can be called at any time and can be racing with any other RDS path.
+ */
 void rds_ib_remove_one(struct ib_device *device)
 {
 	struct rds_ib_device *rds_ibdev;
-	struct rds_ib_ipaddr *i_ipaddr, *i_next;
 
 	rds_ibdev = ib_get_client_data(device, &rds_ib_client);
 	if (!rds_ibdev)
 		return;
 
-	synchronize_rcu();
-	list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
-		list_del(&i_ipaddr->list);
-		kfree(i_ipaddr);
-	}
-
 	rds_ib_destroy_conns(rds_ibdev);
 
-	if (rds_ibdev->mr_pool)
-		rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
-
-	ib_dereg_mr(rds_ibdev->mr);
-
-	while (ib_dealloc_pd(rds_ibdev->pd)) {
-		rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd);
-		msleep(1);
-	}
+	/*
+	 * prevent future connection attempts from getting a reference to this
+	 * device and wait for currently racing connection attempts to finish
+	 * getting their reference
+	 */
+	ib_set_client_data(device, &rds_ib_client, NULL);
+	synchronize_rcu();
+	rds_ib_dev_put(rds_ibdev);
 
 	list_del(&rds_ibdev->list);
-	kfree(rds_ibdev);
+	rds_ib_dev_put(rds_ibdev);
 }
 
 struct ib_client rds_ib_client = {
@@ -190,7 +251,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
 		rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
 		rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
 
-		rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+		rds_ibdev = ic->rds_ibdev;
 		iinfo->max_send_wr = ic->i_send_ring.w_nr;
 		iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
 		iinfo->max_send_sge = rds_ibdev->max_sge;
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 4bc3e2fba25a..282ec69fe282 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -167,6 +167,8 @@ struct rds_ib_device {
 	unsigned int		max_initiator_depth;
 	unsigned int		max_responder_resources;
 	spinlock_t		spinlock;	/* protect the above */
+	atomic_t		refcount;
+	struct work_struct	free_work;
 };
 
 #define pcidev_to_node(pcidev) pcibus_to_node(pcidev->bus)
@@ -251,6 +253,8 @@ static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev,
 extern struct rds_transport rds_ib_transport;
 extern void rds_ib_add_one(struct ib_device *device);
 extern void rds_ib_remove_one(struct ib_device *device);
+struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device);
+void rds_ib_dev_put(struct rds_ib_device *rds_ibdev);
 extern struct ib_client rds_ib_client;
 
 extern unsigned int fmr_pool_size;
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 73253f7c1fa3..a9fb917c00bb 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -95,7 +95,6 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 {
 	const struct rds_ib_connect_private *dp = NULL;
 	struct rds_ib_connection *ic = conn->c_transport_data;
-	struct rds_ib_device *rds_ibdev;
 	struct ib_qp_attr qp_attr;
 	int err;
 
@@ -145,12 +144,11 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 	if (err)
 		printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
 
-	/* update ib_device with this local ipaddr & conn */
-	rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
-	err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
+	/* update ib_device with this local ipaddr */
+	err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr);
 	if (err)
-		printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err);
-	rds_ib_add_conn(rds_ibdev, conn);
+		printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
+			err);
 
 	/* If the peer gave us the last packet it saw, process this as if
 	 * we had received a regular ACK. */
@@ -168,12 +166,10 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
 			u32 max_initiator_depth)
 {
 	struct rds_ib_connection *ic = conn->c_transport_data;
-	struct rds_ib_device *rds_ibdev;
+	struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
 
 	memset(conn_param, 0, sizeof(struct rdma_conn_param));
 
-	rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
-
 	conn_param->responder_resources =
 		min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources);
 	conn_param->initiator_depth =
@@ -241,18 +237,16 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	struct rds_ib_device *rds_ibdev;
 	int ret;
 
-	/* rds_ib_add_one creates a rds_ib_device object per IB device,
-	 * and allocates a protection domain, memory range and FMR pool
-	 * for each.  If that fails for any reason, it will not register
-	 * the rds_ibdev at all.
+	/*
+	 * It's normal to see a null device if an incoming connection races
+	 * with device removal, so we don't print a warning.
 	 */
-	rds_ibdev = ib_get_client_data(dev, &rds_ib_client);
-	if (!rds_ibdev) {
-		if (printk_ratelimit())
-			printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n",
-					dev->name);
+	rds_ibdev = rds_ib_get_client_data(dev);
+	if (!rds_ibdev)
 		return -EOPNOTSUPP;
-	}
+
+	/* add the conn now so that connection establishment has the dev */
+	rds_ib_add_conn(rds_ibdev, conn);
 
 	if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
 		rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
@@ -371,6 +365,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 		 ic->i_send_cq, ic->i_recv_cq);
 
 out:
+	rds_ib_dev_put(rds_ibdev);
 	return ret;
 }
 
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 4ba01b9ffd44..64b5ede037c8 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -87,6 +87,7 @@ static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
 		rcu_read_lock();
 		list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
 			if (i_ipaddr->ipaddr == ipaddr) {
+				atomic_inc(&rds_ibdev->refcount);
 				rcu_read_unlock();
 				return rds_ibdev;
 			}
@@ -141,8 +142,10 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
 	struct rds_ib_device *rds_ibdev_old;
 
 	rds_ibdev_old = rds_ib_get_device(ipaddr);
-	if (rds_ibdev_old)
+	if (rds_ibdev_old) {
 		rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
+		rds_ib_dev_put(rds_ibdev_old);
+	}
 
 	return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
 }
@@ -163,6 +166,7 @@ void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *con
 	spin_unlock_irq(&ib_nodev_conns_lock);
 
 	ic->rds_ibdev = rds_ibdev;
+	atomic_inc(&rds_ibdev->refcount);
 }
 
 void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
@@ -182,6 +186,7 @@ void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *
 	spin_unlock(&ib_nodev_conns_lock);
 
 	ic->rds_ibdev = NULL;
+	rds_ib_dev_put(rds_ibdev);
 }
 
 void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock)
@@ -240,7 +245,7 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co
 
 void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
 {
-	flush_workqueue(rds_wq);
+	cancel_work_sync(&pool->flush_worker);
 	rds_ib_flush_mr_pool(pool, 1);
 	WARN_ON(atomic_read(&pool->item_count));
 	WARN_ON(atomic_read(&pool->free_pinned));
@@ -597,6 +602,8 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
 			queue_work(rds_wq, &pool->flush_worker);
 		}
 	}
+
+	rds_ib_dev_put(rds_ibdev);
 }
 
 void rds_ib_flush_mrs(void)
@@ -640,6 +647,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
 		printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);
 
 	ibmr->device = rds_ibdev;
+	rds_ibdev = NULL;
 
  out:
 	if (ret) {
@@ -647,5 +655,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
 			rds_ib_free_mr(ibmr, 0);
 		ibmr = ERR_PTR(ret);
 	}
+	if (rds_ibdev)
+		rds_ib_dev_put(rds_ibdev);
 	return ibmr;
 }
-- 
cgit v1.2.3-59-g8ed1b


From f3c6808d3d8513db2b0543538fc35c25a60fe7a7 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Mon, 24 May 2010 13:14:36 -0700
Subject: RDS: introduce rds_conn_connect_if_down()

A few paths had the same block of code to queue a connection's connect work if
it was in the right state.  Let's move this in to a helper function.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---
 net/rds/connection.c | 12 ++++++++++++
 net/rds/rds.h        |  1 +
 net/rds/send.c       | 16 ++--------------
 3 files changed, 15 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/rds/connection.c b/net/rds/connection.c
index 180b83ab2607..5bd96d538fb9 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -539,6 +539,18 @@ void rds_conn_drop(struct rds_connection *conn)
 }
 EXPORT_SYMBOL_GPL(rds_conn_drop);
 
+/*
+ * If the connection is down, trigger a connect. We may have scheduled a
+ * delayed reconnect however - in this case we should not interfere.
+ */
+void rds_conn_connect_if_down(struct rds_connection *conn)
+{
+	if (rds_conn_state(conn) == RDS_CONN_DOWN &&
+	    !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
+		queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+}
+EXPORT_SYMBOL_GPL(rds_conn_connect_if_down);
+
 /*
  * An error occurred on the connection
  */
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 4ab3d1aa0237..cba5f8bb4780 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -617,6 +617,7 @@ void rds_conn_shutdown(struct rds_connection *conn);
 void rds_conn_destroy(struct rds_connection *conn);
 void rds_conn_reset(struct rds_connection *conn);
 void rds_conn_drop(struct rds_connection *conn);
+void rds_conn_connect_if_down(struct rds_connection *conn);
 void rds_for_each_conn_info(struct socket *sock, unsigned int len,
 			  struct rds_info_iterator *iter,
 			  struct rds_info_lengths *lens,
diff --git a/net/rds/send.c b/net/rds/send.c
index 437f1e72609c..a6295993e3e1 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1036,13 +1036,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 		goto out;
 	}
 
-	/* If the connection is down, trigger a connect. We may
-	 * have scheduled a delayed reconnect however - in this case
-	 * we should not interfere.
-	 */
-	if (rds_conn_state(conn) == RDS_CONN_DOWN &&
-	    !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
-		queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+	rds_conn_connect_if_down(conn);
 
 	ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
 	if (ret) {
@@ -1122,13 +1116,7 @@ rds_send_pong(struct rds_connection *conn, __be16 dport)
 	rm->m_daddr = conn->c_faddr;
 	rm->data.op_active = 1;
 
-	/* If the connection is down, trigger a connect. We may
-	 * have scheduled a delayed reconnect however - in this case
-	 * we should not interfere.
-	 */
-	if (rds_conn_state(conn) == RDS_CONN_DOWN &&
-	    !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
-		queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+	rds_conn_connect_if_down(conn);
 
 	ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
 	if (ret)
-- 
cgit v1.2.3-59-g8ed1b


From fc19de38be924728fea76026c0d1a6c4b6156084 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Mon, 24 May 2010 13:16:57 -0700
Subject: RDS/IB: disconnect when IB devices are removed

Currently IB device removal destroys connections which are associated with the
device.  This prevents connections from being re-established when replacement
devices are added.

Instead we'll queue shutdown work on the connections as their devices are
removed.  When we see that devices are added we triger connection attempts on
all connections that don't currently have a device.

The result is that RDS sockets can resume device-independent work (bcopy, not
RDMA) across IB device removal and restoration.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---
 net/rds/ib.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rds/ib.c b/net/rds/ib.c
index 1732f8effb59..b21e24fd060c 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -59,6 +59,27 @@ struct list_head rds_ib_devices;
 DEFINE_SPINLOCK(ib_nodev_conns_lock);
 LIST_HEAD(ib_nodev_conns);
 
+void rds_ib_nodev_connect(void)
+{
+	struct rds_ib_connection *ic;
+
+	spin_lock(&ib_nodev_conns_lock);
+	list_for_each_entry(ic, &ib_nodev_conns, ib_node)
+		rds_conn_connect_if_down(ic->conn);
+	spin_unlock(&ib_nodev_conns_lock);
+}
+
+void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev)
+{
+	struct rds_ib_connection *ic;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rds_ibdev->spinlock, flags);
+	list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node)
+		rds_conn_drop(ic->conn);
+	spin_unlock_irqrestore(&rds_ibdev->spinlock, flags);
+}
+
 /*
  * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references
  * from interrupt context so we push freing off into a work struct in krdsd.
@@ -156,6 +177,8 @@ void rds_ib_add_one(struct ib_device *device)
 	ib_set_client_data(device, &rds_ib_client, rds_ibdev);
 	atomic_inc(&rds_ibdev->refcount);
 
+	rds_ib_nodev_connect();
+
 put_dev:
 	rds_ib_dev_put(rds_ibdev);
 free_attr:
@@ -205,7 +228,7 @@ void rds_ib_remove_one(struct ib_device *device)
 	if (!rds_ibdev)
 		return;
 
-	rds_ib_destroy_conns(rds_ibdev);
+	rds_ib_dev_shutdown(rds_ibdev);
 
 	/*
 	 * prevent future connection attempts from getting a reference to this
-- 
cgit v1.2.3-59-g8ed1b


From 0b088e003ccf316a76c51be5dec2d70b93be3be8 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Mon, 24 May 2010 20:12:41 -0700
Subject: RDS: Use page_remainder_alloc() for recv bufs

Instead of splitting up a page into RDS_FRAG_SIZE chunks
ourselves, ask rds_page_remainder_alloc() to do it. While it
is possible PAGE_SIZE > FRAG_SIZE, on x86en it isn't, so having
duplicate "carve up a page into buffers" code seems excessive.

The other modification this spawns is the use of a single
struct scatterlist in rds_page_frag instead of a bare page ptr.
This causes verbosity to increase in some places, and decrease
in others.

Finally, I decided to unify the lifetimes and alloc/free of
rds_page_frag and its page. This is a nice simplification in itself,
but will be extra-nice once we come to adding cmason's recycling
patch.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/ib.h      |  7 +----
 net/rds/ib_recv.c | 94 ++++++++++++++++---------------------------------------
 net/rds/page.c    |  1 +
 3 files changed, 29 insertions(+), 73 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.h b/net/rds/ib.h
index 282ec69fe282..9bb7a7412a44 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -28,13 +28,9 @@ extern struct list_head rds_ib_devices;
  * try and minimize the amount of memory tied up both the device and
  * socket receive queues.
  */
-/* page offset of the final full frag that fits in the page */
-#define RDS_PAGE_LAST_OFF (((PAGE_SIZE  / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
 struct rds_page_frag {
 	struct list_head	f_item;
-	struct page		*f_page;
-	unsigned long		f_offset;
-	dma_addr_t 		f_mapped;
+	struct scatterlist	f_sg;
 };
 
 struct rds_ib_incoming {
@@ -107,7 +103,6 @@ struct rds_ib_connection {
 	struct rds_header	*i_recv_hdrs;
 	u64			i_recv_hdrs_dma;
 	struct rds_ib_recv_work *i_recvs;
-	struct rds_page_frag	i_frag;
 	u64			i_ack_recv;	/* last ACK received */
 
 	/* sending acks */
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 24d14615f41a..f6dbf16e0741 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -43,17 +43,11 @@ static struct kmem_cache *rds_ib_incoming_slab;
 static struct kmem_cache *rds_ib_frag_slab;
 static atomic_t	rds_ib_allocation = ATOMIC_INIT(0);
 
-static void rds_ib_frag_drop_page(struct rds_page_frag *frag)
-{
-	rdsdebug("frag %p page %p\n", frag, frag->f_page);
-	__free_page(frag->f_page);
-	frag->f_page = NULL;
-}
-
+/* Free frag and attached recv buffer f_sg */
 static void rds_ib_frag_free(struct rds_page_frag *frag)
 {
-	rdsdebug("frag %p page %p\n", frag, frag->f_page);
-	BUG_ON(frag->f_page);
+	rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
+	__free_page(sg_page(&frag->f_sg));
 	kmem_cache_free(rds_ib_frag_slab, frag);
 }
 
@@ -71,12 +65,8 @@ static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic,
 {
 	struct rds_page_frag *frag = recv->r_frag;
 
-	rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
-	if (frag->f_mapped)
-		ib_dma_unmap_page(ic->i_cm_id->device,
-			       frag->f_mapped,
-			       RDS_FRAG_SIZE, DMA_FROM_DEVICE);
-	frag->f_mapped = 0;
+	rdsdebug("recv %p frag %p page %p\n", recv, frag, sg_page(&frag->f_sg));
+	ib_dma_unmap_sg(ic->i_cm_id->device, &frag->f_sg, 1, DMA_FROM_DEVICE);
 }
 
 void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
@@ -116,8 +106,6 @@ static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
 	}
 	if (recv->r_frag) {
 		rds_ib_recv_unmap_page(ic, recv);
-		if (recv->r_frag->f_page)
-			rds_ib_frag_drop_page(recv->r_frag);
 		rds_ib_frag_free(recv->r_frag);
 		recv->r_frag = NULL;
 	}
@@ -129,16 +117,12 @@ void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
 
 	for (i = 0; i < ic->i_recv_ring.w_nr; i++)
 		rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
-
-	if (ic->i_frag.f_page)
-		rds_ib_frag_drop_page(&ic->i_frag);
 }
 
 static int rds_ib_recv_refill_one(struct rds_connection *conn,
 				  struct rds_ib_recv_work *recv)
 {
 	struct rds_ib_connection *ic = conn->c_transport_data;
-	dma_addr_t dma_addr;
 	struct ib_sge *sge;
 	int ret = -ENOMEM;
 
@@ -161,50 +145,27 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn,
 		if (!recv->r_frag)
 			goto out;
 		INIT_LIST_HEAD(&recv->r_frag->f_item);
-		recv->r_frag->f_page = NULL;
-	}
-
-	if (!ic->i_frag.f_page) {
-		ic->i_frag.f_page = alloc_page(GFP_NOWAIT);
-		if (!ic->i_frag.f_page)
+		sg_init_table(&recv->r_frag->f_sg, 1);
+		ret = rds_page_remainder_alloc(&recv->r_frag->f_sg,
+					       RDS_FRAG_SIZE, GFP_NOWAIT);
+		if (ret) {
+			kmem_cache_free(rds_ib_frag_slab, recv->r_frag);
+			recv->r_frag = NULL;
 			goto out;
-		ic->i_frag.f_offset = 0;
+		}
 	}
 
-	dma_addr = ib_dma_map_page(ic->i_cm_id->device,
-				  ic->i_frag.f_page,
-				  ic->i_frag.f_offset,
-				  RDS_FRAG_SIZE,
-				  DMA_FROM_DEVICE);
-	if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
-		goto out;
-
-	/*
-	 * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_frag_unmap()
-	 * must be called on this recv.  This happens as completions hit
-	 * in order or on connection shutdown.
-	 */
-	recv->r_frag->f_page = ic->i_frag.f_page;
-	recv->r_frag->f_offset = ic->i_frag.f_offset;
-	recv->r_frag->f_mapped = dma_addr;
+	ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg,
+			    1, DMA_FROM_DEVICE);
+	WARN_ON(ret != 1);
 
 	sge = &recv->r_sge[0];
 	sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
 	sge->length = sizeof(struct rds_header);
 
 	sge = &recv->r_sge[1];
-	sge->addr = dma_addr;
-	sge->length = RDS_FRAG_SIZE;
-
-	get_page(recv->r_frag->f_page);
-
-	if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
-		ic->i_frag.f_offset += RDS_FRAG_SIZE;
-	} else {
-		put_page(ic->i_frag.f_page);
-		ic->i_frag.f_page = NULL;
-		ic->i_frag.f_offset = 0;
-	}
+	sge->addr = sg_dma_address(&recv->r_frag->f_sg);
+	sge->length = sg_dma_len(&recv->r_frag->f_sg);
 
 	ret = 0;
 out:
@@ -247,8 +208,8 @@ int rds_ib_recv_refill(struct rds_connection *conn, int prefill)
 		/* XXX when can this fail? */
 		ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
 		rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
-			 recv->r_ibinc, recv->r_frag->f_page,
-			 (long) recv->r_frag->f_mapped, ret);
+			 recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
+			 (long) sg_dma_address(&recv->r_frag->f_sg), ret);
 		if (ret) {
 			rds_ib_conn_error(conn, "recv post on "
 			       "%pI4 returned %d, disconnecting and "
@@ -281,7 +242,6 @@ static void rds_ib_inc_purge(struct rds_incoming *inc)
 
 	list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
 		list_del_init(&frag->f_item);
-		rds_ib_frag_drop_page(frag);
 		rds_ib_frag_free(frag);
 	}
 }
@@ -333,13 +293,13 @@ int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
 		to_copy = min_t(unsigned long, to_copy, len - copied);
 
 		rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
-			 "[%p, %lu] + %lu\n",
+			 "[%p, %u] + %lu\n",
 			 to_copy, iov->iov_base, iov->iov_len, iov_off,
-			 frag->f_page, frag->f_offset, frag_off);
+			 sg_page(&frag->f_sg), frag->f_sg.offset, frag_off);
 
 		/* XXX needs + offset for multiple recvs per page */
-		ret = rds_page_copy_to_user(frag->f_page,
-					    frag->f_offset + frag_off,
+		ret = rds_page_copy_to_user(sg_page(&frag->f_sg),
+					    frag->f_sg.offset + frag_off,
 					    iov->iov_base + iov_off,
 					    to_copy);
 		if (ret) {
@@ -595,7 +555,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn,
 		to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
 		BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
 
-		addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
+		addr = kmap_atomic(sg_page(&frag->f_sg), KM_SOFTIRQ0);
 
 		src = addr + frag_off;
 		dst = (void *)map->m_page_addrs[map_page] + map_off;
@@ -698,12 +658,12 @@ static void rds_ib_process_recv(struct rds_connection *conn,
 		 * the inc is freed.  We don't go that route, so we have to drop the
 		 * page ref ourselves.  We can't just leave the page on the recv
 		 * because that confuses the dma mapping of pages and each recv's use
-		 * of a partial page.  We can leave the frag, though, it will be
-		 * reused.
+		 * of a partial page.
 		 *
 		 * FIXME: Fold this into the code path below.
 		 */
-		rds_ib_frag_drop_page(recv->r_frag);
+		rds_ib_frag_free(recv->r_frag);
+		recv->r_frag = NULL;
 		return;
 	}
 
diff --git a/net/rds/page.c b/net/rds/page.c
index e5b2527ae257..5e44f5ae7898 100644
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -186,6 +186,7 @@ out:
 		 ret ? 0 : scat->length);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(rds_page_remainder_alloc);
 
 static int rds_page_remainder_cpu_notify(struct notifier_block *self,
 					 unsigned long action, void *hcpu)
-- 
cgit v1.2.3-59-g8ed1b


From 3427e854e1a0e76be8b3d75fc0fa878f59b43693 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Mon, 24 May 2010 20:28:49 -0700
Subject: RDS: Assume recv->r_frag is always NULL in refill_one()

refill_one() should never be called on a recv struct that
doesn't need a new r_frag allocated. Add a WARN and remove
conditional around r_frag alloc code.

Also, add a comment to explain why r_ibinc may or may not
need refilling.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/ib_recv.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index f6dbf16e0741..5c7e6acf7c60 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -126,6 +126,10 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn,
 	struct ib_sge *sge;
 	int ret = -ENOMEM;
 
+	/*
+	 * ibinc was taken from recv if recv contained the start of a message.
+	 * recvs that were continuations will still have this allocated.
+	 */
 	if (!recv->r_ibinc) {
 		if (!atomic_add_unless(&rds_ib_allocation, 1, rds_ib_sysctl_max_recv_allocation)) {
 			rds_ib_stats_inc(s_ib_rx_alloc_limit);
@@ -140,19 +144,18 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn,
 		rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
 	}
 
-	if (!recv->r_frag) {
-		recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, GFP_NOWAIT);
-		if (!recv->r_frag)
-			goto out;
-		INIT_LIST_HEAD(&recv->r_frag->f_item);
-		sg_init_table(&recv->r_frag->f_sg, 1);
-		ret = rds_page_remainder_alloc(&recv->r_frag->f_sg,
-					       RDS_FRAG_SIZE, GFP_NOWAIT);
-		if (ret) {
-			kmem_cache_free(rds_ib_frag_slab, recv->r_frag);
-			recv->r_frag = NULL;
-			goto out;
-		}
+	WARN_ON(recv->r_frag); /* leak! */
+	recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, GFP_NOWAIT);
+	if (!recv->r_frag)
+		goto out;
+	INIT_LIST_HEAD(&recv->r_frag->f_item);
+	sg_init_table(&recv->r_frag->f_sg, 1);
+	ret = rds_page_remainder_alloc(&recv->r_frag->f_sg,
+				       RDS_FRAG_SIZE, GFP_NOWAIT);
+	if (ret) {
+		kmem_cache_free(rds_ib_frag_slab, recv->r_frag);
+		recv->r_frag = NULL;
+		goto out;
 	}
 
 	ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg,
-- 
cgit v1.2.3-59-g8ed1b


From fc24f78085e8771670af42f2b8929b16a0c98a22 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Tue, 25 May 2010 11:20:09 -0700
Subject: RDS/IB: Remove ib_recv_unmap_page()

All it does is call unmap_sg(), so just call that directly.

The comment above unmap_page also may be incorrect, so we
shouldn't hold on to it, either.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/ib_recv.c | 22 ++--------------------
 1 file changed, 2 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 5c7e6acf7c60..48add108ddee 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -51,24 +51,6 @@ static void rds_ib_frag_free(struct rds_page_frag *frag)
 	kmem_cache_free(rds_ib_frag_slab, frag);
 }
 
-/*
- * We map a page at a time.  Its fragments are posted in order.  This
- * is called in fragment order as the fragments get send completion events.
- * Only the last frag in the page performs the unmapping.
- *
- * It's OK for ring cleanup to call this in whatever order it likes because
- * DMA is not in flight and so we can unmap while other ring entries still
- * hold page references in their frags.
- */
-static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic,
-				   struct rds_ib_recv_work *recv)
-{
-	struct rds_page_frag *frag = recv->r_frag;
-
-	rdsdebug("recv %p frag %p page %p\n", recv, frag, sg_page(&frag->f_sg));
-	ib_dma_unmap_sg(ic->i_cm_id->device, &frag->f_sg, 1, DMA_FROM_DEVICE);
-}
-
 void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
 {
 	struct rds_ib_recv_work *recv;
@@ -105,7 +87,7 @@ static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
 		recv->r_ibinc = NULL;
 	}
 	if (recv->r_frag) {
-		rds_ib_recv_unmap_page(ic, recv);
+		ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
 		rds_ib_frag_free(recv->r_frag);
 		recv->r_frag = NULL;
 	}
@@ -768,7 +750,7 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic,
 
 		recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
 
-		rds_ib_recv_unmap_page(ic, recv);
+		ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
 
 		/*
 		 * Also process recvs in connecting state because it is possible
-- 
cgit v1.2.3-59-g8ed1b


From 33244125871734ebc0d8d147680a0d7e99385e0b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 26 May 2010 22:05:37 -0700
Subject: RDS/IB: Add caching of frags and incs

This patch is based heavily on an initial patch by Chris Mason.
Instead of freeing slab memory and pages, it keeps them, and
funnels them back to be reused.

The lock minimization strategy uses xchg and cmpxchg atomic ops
for manipulation of pointers to list heads. We anchor the lists with a
pointer to a list_head struct instead of a static list_head struct.
We just have to carefully use the existing primitives with
the difference between a pointer and a static head struct.

For example, 'list_empty()' means that our anchor pointer points to a list with
a single item instead of meaning that our static head element doesn't point to
any list items.

Original patch by Chris, with significant mods and fixes by Andy and Zach.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Andy Grover <andy.grover@oracle.com>
Signed-off-by: Zach Brown <zach.brown@oracle.com>
---
 net/rds/ib.h      |  19 ++++
 net/rds/ib_cm.c   |   9 ++
 net/rds/ib_recv.c | 321 +++++++++++++++++++++++++++++++++++++++++++++---------
 3 files changed, 297 insertions(+), 52 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.h b/net/rds/ib.h
index 9bb7a7412a44..2efd9d11e7d4 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -21,6 +21,8 @@
 
 #define RDS_IB_SUPPORTED_PROTOCOLS	0x00000003	/* minor versions supported */
 
+#define RDS_IB_RECYCLE_BATCH_COUNT	32
+
 extern struct list_head rds_ib_devices;
 
 /*
@@ -30,14 +32,27 @@ extern struct list_head rds_ib_devices;
  */
 struct rds_page_frag {
 	struct list_head	f_item;
+	struct list_head	f_cache_entry;
 	struct scatterlist	f_sg;
 };
 
 struct rds_ib_incoming {
 	struct list_head	ii_frags;
+	struct list_head	ii_cache_entry;
 	struct rds_incoming	ii_inc;
 };
 
+struct rds_ib_cache_head {
+	struct list_head *first;
+	unsigned long count;
+};
+
+struct rds_ib_refill_cache {
+	struct rds_ib_cache_head *percpu;
+	struct list_head	 *xfer;
+	struct list_head	 *ready;
+};
+
 struct rds_ib_connect_private {
 	/* Add new fields at the end, and don't permute existing fields. */
 	__be32			dp_saddr;
@@ -104,6 +119,8 @@ struct rds_ib_connection {
 	u64			i_recv_hdrs_dma;
 	struct rds_ib_recv_work *i_recvs;
 	u64			i_ack_recv;	/* last ACK received */
+	struct rds_ib_refill_cache i_cache_incs;
+	struct rds_ib_refill_cache i_cache_frags;
 
 	/* sending acks */
 	unsigned long		i_ack_flags;
@@ -304,6 +321,8 @@ void rds_ib_flush_mrs(void);
 int __init rds_ib_recv_init(void);
 void rds_ib_recv_exit(void);
 int rds_ib_recv(struct rds_connection *conn);
+int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic);
+void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
 int rds_ib_recv_refill(struct rds_connection *conn, int prefill);
 void rds_ib_inc_free(struct rds_incoming *inc);
 int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index a9fb917c00bb..10f6a8815cd0 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -709,12 +709,19 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 {
 	struct rds_ib_connection *ic;
 	unsigned long flags;
+	int ret;
 
 	/* XXX too lazy? */
 	ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL);
 	if (!ic)
 		return -ENOMEM;
 
+	ret = rds_ib_recv_alloc_caches(ic);
+	if (ret) {
+		kfree(ic);
+		return ret;
+	}
+
 	INIT_LIST_HEAD(&ic->ib_node);
 	tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn,
 		     (unsigned long) ic);
@@ -763,6 +770,8 @@ void rds_ib_conn_free(void *arg)
 	list_del(&ic->ib_node);
 	spin_unlock_irq(lock_ptr);
 
+	rds_ib_recv_free_caches(ic);
+
 	kfree(ic);
 }
 
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 48add108ddee..5b429b7fd81c 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -43,14 +43,6 @@ static struct kmem_cache *rds_ib_incoming_slab;
 static struct kmem_cache *rds_ib_frag_slab;
 static atomic_t	rds_ib_allocation = ATOMIC_INIT(0);
 
-/* Free frag and attached recv buffer f_sg */
-static void rds_ib_frag_free(struct rds_page_frag *frag)
-{
-	rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
-	__free_page(sg_page(&frag->f_sg));
-	kmem_cache_free(rds_ib_frag_slab, frag);
-}
-
 void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
 {
 	struct rds_ib_recv_work *recv;
@@ -79,6 +71,151 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
 	}
 }
 
+/*
+ * The entire 'from' list, including the from element itself, is put on
+ * to the tail of the 'to' list.
+ */
+static void list_splice_entire_tail(struct list_head *from,
+				    struct list_head *to)
+{
+	struct list_head *from_last = from->prev;
+
+	list_splice_tail(from_last, to);
+	list_add_tail(from_last, to);
+}
+
+static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache)
+{
+	struct list_head *tmp;
+
+	tmp = xchg(&cache->xfer, NULL);
+	if (tmp) {
+		if (cache->ready)
+			list_splice_entire_tail(tmp, cache->ready);
+		else
+			cache->ready = tmp;
+	}
+}
+
+static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache)
+{
+	struct rds_ib_cache_head *head;
+	int cpu;
+
+	cache->percpu = alloc_percpu(struct rds_ib_cache_head);
+	if (!cache->percpu)
+	       return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		head = per_cpu_ptr(cache->percpu, cpu);
+		head->first = NULL;
+		head->count = 0;
+	}
+	cache->xfer = NULL;
+	cache->ready = NULL;
+
+	return 0;
+}
+
+int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic)
+{
+	int ret;
+
+	ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs);
+	if (!ret) {
+		ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags);
+		if (ret)
+			free_percpu(ic->i_cache_incs.percpu);
+	}
+
+	return ret;
+}
+
+static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache,
+					  struct list_head *caller_list)
+{
+	struct rds_ib_cache_head *head;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		head = per_cpu_ptr(cache->percpu, cpu);
+		if (head->first) {
+			list_splice_entire_tail(head->first, caller_list);
+			head->first = NULL;
+		}
+	}
+
+	if (cache->ready) {
+		list_splice_entire_tail(cache->ready, caller_list);
+		cache->ready = NULL;
+	}
+}
+
+void rds_ib_recv_free_caches(struct rds_ib_connection *ic)
+{
+	struct rds_ib_incoming *inc;
+	struct rds_ib_incoming *inc_tmp;
+	struct rds_page_frag *frag;
+	struct rds_page_frag *frag_tmp;
+	LIST_HEAD(list);
+
+	rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
+	rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list);
+	free_percpu(ic->i_cache_incs.percpu);
+
+	list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) {
+		list_del(&inc->ii_cache_entry);
+		WARN_ON(!list_empty(&inc->ii_frags));
+		kmem_cache_free(rds_ib_incoming_slab, inc);
+	}
+
+	rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
+	rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list);
+	free_percpu(ic->i_cache_frags.percpu);
+
+	list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) {
+		list_del(&frag->f_cache_entry);
+		WARN_ON(!list_empty(&frag->f_item));
+		kmem_cache_free(rds_ib_frag_slab, frag);
+	}
+}
+
+/* fwd decl */
+static void rds_ib_recv_cache_put(struct list_head *new_item,
+				  struct rds_ib_refill_cache *cache);
+static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache);
+
+
+/* Recycle frag and attached recv buffer f_sg */
+static void rds_ib_frag_free(struct rds_ib_connection *ic,
+			     struct rds_page_frag *frag)
+{
+	rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
+
+	rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
+}
+
+/* Recycle inc after freeing attached frags */
+void rds_ib_inc_free(struct rds_incoming *inc)
+{
+	struct rds_ib_incoming *ibinc;
+	struct rds_page_frag *frag;
+	struct rds_page_frag *pos;
+	struct rds_ib_connection *ic = inc->i_conn->c_transport_data;
+
+	ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+
+	/* Free attached frags */
+	list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
+		list_del_init(&frag->f_item);
+		rds_ib_frag_free(ic, frag);
+	}
+	BUG_ON(!list_empty(&ibinc->ii_frags));
+
+	rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
+	rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs);
+}
+
 static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
 				  struct rds_ib_recv_work *recv)
 {
@@ -88,7 +225,7 @@ static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
 	}
 	if (recv->r_frag) {
 		ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
-		rds_ib_frag_free(recv->r_frag);
+		rds_ib_frag_free(ic, recv->r_frag);
 		recv->r_frag = NULL;
 	}
 }
@@ -101,6 +238,61 @@ void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
 		rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
 }
 
+static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic)
+{
+	struct rds_ib_incoming *ibinc;
+	struct list_head *cache_item;
+	int avail_allocs;
+
+	cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs);
+	if (cache_item) {
+		ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry);
+	} else {
+		avail_allocs = atomic_add_unless(&rds_ib_allocation,
+						 1, rds_ib_sysctl_max_recv_allocation);
+		if (!avail_allocs) {
+			rds_ib_stats_inc(s_ib_rx_alloc_limit);
+			return NULL;
+		}
+		ibinc = kmem_cache_alloc(rds_ib_incoming_slab, GFP_NOWAIT);
+		if (!ibinc) {
+			atomic_dec(&rds_ib_allocation);
+			return NULL;
+		}
+	}
+	INIT_LIST_HEAD(&ibinc->ii_frags);
+	rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
+
+	return ibinc;
+}
+
+static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic)
+{
+	struct rds_page_frag *frag;
+	struct list_head *cache_item;
+	int ret;
+
+	cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
+	if (cache_item) {
+		frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
+	} else {
+		frag = kmem_cache_alloc(rds_ib_frag_slab, GFP_NOWAIT);
+		if (!frag)
+			return NULL;
+
+		ret = rds_page_remainder_alloc(&frag->f_sg,
+					       RDS_FRAG_SIZE, GFP_NOWAIT);
+		if (ret) {
+			kmem_cache_free(rds_ib_frag_slab, frag);
+			return NULL;
+		}
+	}
+
+	INIT_LIST_HEAD(&frag->f_item);
+
+	return frag;
+}
+
 static int rds_ib_recv_refill_one(struct rds_connection *conn,
 				  struct rds_ib_recv_work *recv)
 {
@@ -108,37 +300,25 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn,
 	struct ib_sge *sge;
 	int ret = -ENOMEM;
 
+	if (!ic->i_cache_incs.ready)
+		rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
+	if (!ic->i_cache_frags.ready)
+		rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
+
 	/*
 	 * ibinc was taken from recv if recv contained the start of a message.
 	 * recvs that were continuations will still have this allocated.
 	 */
 	if (!recv->r_ibinc) {
-		if (!atomic_add_unless(&rds_ib_allocation, 1, rds_ib_sysctl_max_recv_allocation)) {
-			rds_ib_stats_inc(s_ib_rx_alloc_limit);
-			goto out;
-		}
-		recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab, GFP_NOWAIT);
-		if (!recv->r_ibinc) {
-			atomic_dec(&rds_ib_allocation);
+		recv->r_ibinc = rds_ib_refill_one_inc(ic);
+		if (!recv->r_ibinc)
 			goto out;
-		}
-		INIT_LIST_HEAD(&recv->r_ibinc->ii_frags);
-		rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
 	}
 
 	WARN_ON(recv->r_frag); /* leak! */
-	recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, GFP_NOWAIT);
+	recv->r_frag = rds_ib_refill_one_frag(ic);
 	if (!recv->r_frag)
 		goto out;
-	INIT_LIST_HEAD(&recv->r_frag->f_item);
-	sg_init_table(&recv->r_frag->f_sg, 1);
-	ret = rds_page_remainder_alloc(&recv->r_frag->f_sg,
-				       RDS_FRAG_SIZE, GFP_NOWAIT);
-	if (ret) {
-		kmem_cache_free(rds_ib_frag_slab, recv->r_frag);
-		recv->r_frag = NULL;
-		goto out;
-	}
 
 	ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg,
 			    1, DMA_FROM_DEVICE);
@@ -160,8 +340,7 @@ out:
 /*
  * This tries to allocate and post unused work requests after making sure that
  * they have all the allocations they need to queue received fragments into
- * sockets.  The i_recv_mutex is held here so that ring_alloc and _unalloc
- * pairs don't go unmatched.
+ * sockets.
  *
  * -1 is returned if posting fails due to temporary resource exhaustion.
  */
@@ -216,33 +395,71 @@ int rds_ib_recv_refill(struct rds_connection *conn, int prefill)
 	return ret;
 }
 
-static void rds_ib_inc_purge(struct rds_incoming *inc)
+/*
+ * We want to recycle several types of recv allocations, like incs and frags.
+ * To use this, the *_free() function passes in the ptr to a list_head within
+ * the recyclee, as well as the cache to put it on.
+ *
+ * First, we put the memory on a percpu list. When this reaches a certain size,
+ * We move it to an intermediate non-percpu list in a lockless manner, with some
+ * xchg/compxchg wizardry.
+ *
+ * N.B. Instead of a list_head as the anchor, we use a single pointer, which can
+ * be NULL and xchg'd. The list is actually empty when the pointer is NULL, and
+ * list_empty() will return true with one element is actually present.
+ */
+static void rds_ib_recv_cache_put(struct list_head *new_item,
+				 struct rds_ib_refill_cache *cache)
 {
-	struct rds_ib_incoming *ibinc;
-	struct rds_page_frag *frag;
-	struct rds_page_frag *pos;
+	unsigned long flags;
+	struct rds_ib_cache_head *chp;
+	struct list_head *old;
 
-	ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
-	rdsdebug("purging ibinc %p inc %p\n", ibinc, inc);
+	local_irq_save(flags);
 
-	list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
-		list_del_init(&frag->f_item);
-		rds_ib_frag_free(frag);
-	}
+	chp = per_cpu_ptr(cache->percpu, smp_processor_id());
+	if (!chp->first)
+		INIT_LIST_HEAD(new_item);
+	else /* put on front */
+		list_add_tail(new_item, chp->first);
+	chp->first = new_item;
+	chp->count++;
+
+	if (chp->count < RDS_IB_RECYCLE_BATCH_COUNT)
+		goto end;
+
+	/*
+	 * Return our per-cpu first list to the cache's xfer by atomically
+	 * grabbing the current xfer list, appending it to our per-cpu list,
+	 * and then atomically returning that entire list back to the
+	 * cache's xfer list as long as it's still empty.
+	 */
+	do {
+		old = xchg(&cache->xfer, NULL);
+		if (old)
+			list_splice_entire_tail(old, chp->first);
+		old = cmpxchg(&cache->xfer, NULL, chp->first);
+	} while (old);
+
+	chp->first = NULL;
+	chp->count = 0;
+end:
+	local_irq_restore(flags);
 }
 
-void rds_ib_inc_free(struct rds_incoming *inc)
+static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache)
 {
-	struct rds_ib_incoming *ibinc;
-
-	ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+	struct list_head *head = cache->ready;
+
+	if (head) {
+		if (!list_empty(head)) {
+			cache->ready = head->next;
+			list_del_init(head);
+		} else
+			cache->ready = NULL;
+	}
 
-	rds_ib_inc_purge(inc);
-	rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
-	BUG_ON(!list_empty(&ibinc->ii_frags));
-	kmem_cache_free(rds_ib_incoming_slab, ibinc);
-	atomic_dec(&rds_ib_allocation);
-	BUG_ON(atomic_read(&rds_ib_allocation) < 0);
+	return head;
 }
 
 int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
@@ -647,7 +864,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
 		 *
 		 * FIXME: Fold this into the code path below.
 		 */
-		rds_ib_frag_free(recv->r_frag);
+		rds_ib_frag_free(ic, recv->r_frag);
 		recv->r_frag = NULL;
 		return;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 037f18a3074753991656189a091a5fa371999107 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 26 May 2010 21:45:06 -0700
Subject: RDS: use friendly gfp masks for prefill

When prefilling the rds frags, we end up doing a lot of allocations.
We're not in atomic context here, and so there's no reason to dip into
atomic reserves.  This changes the prefills to use masks that allow
waiting.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 net/rds/ib_recv.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 5b429b7fd81c..1add097fe198 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -238,7 +238,8 @@ void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
 		rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
 }
 
-static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic)
+static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic,
+						     gfp_t slab_mask)
 {
 	struct rds_ib_incoming *ibinc;
 	struct list_head *cache_item;
@@ -254,7 +255,7 @@ static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *i
 			rds_ib_stats_inc(s_ib_rx_alloc_limit);
 			return NULL;
 		}
-		ibinc = kmem_cache_alloc(rds_ib_incoming_slab, GFP_NOWAIT);
+		ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask);
 		if (!ibinc) {
 			atomic_dec(&rds_ib_allocation);
 			return NULL;
@@ -266,7 +267,8 @@ static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *i
 	return ibinc;
 }
 
-static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic)
+static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic,
+						    gfp_t slab_mask, gfp_t page_mask)
 {
 	struct rds_page_frag *frag;
 	struct list_head *cache_item;
@@ -276,12 +278,12 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic
 	if (cache_item) {
 		frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
 	} else {
-		frag = kmem_cache_alloc(rds_ib_frag_slab, GFP_NOWAIT);
+		frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
 		if (!frag)
 			return NULL;
 
 		ret = rds_page_remainder_alloc(&frag->f_sg,
-					       RDS_FRAG_SIZE, GFP_NOWAIT);
+					       RDS_FRAG_SIZE, page_mask);
 		if (ret) {
 			kmem_cache_free(rds_ib_frag_slab, frag);
 			return NULL;
@@ -294,11 +296,18 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic
 }
 
 static int rds_ib_recv_refill_one(struct rds_connection *conn,
-				  struct rds_ib_recv_work *recv)
+				  struct rds_ib_recv_work *recv, int prefill)
 {
 	struct rds_ib_connection *ic = conn->c_transport_data;
 	struct ib_sge *sge;
 	int ret = -ENOMEM;
+	gfp_t slab_mask = GFP_NOWAIT;
+	gfp_t page_mask = GFP_NOWAIT;
+
+	if (prefill) {
+		slab_mask = GFP_KERNEL;
+		page_mask = GFP_HIGHUSER;
+	}
 
 	if (!ic->i_cache_incs.ready)
 		rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
@@ -310,13 +319,13 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn,
 	 * recvs that were continuations will still have this allocated.
 	 */
 	if (!recv->r_ibinc) {
-		recv->r_ibinc = rds_ib_refill_one_inc(ic);
+		recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask);
 		if (!recv->r_ibinc)
 			goto out;
 	}
 
 	WARN_ON(recv->r_frag); /* leak! */
-	recv->r_frag = rds_ib_refill_one_frag(ic);
+	recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask);
 	if (!recv->r_frag)
 		goto out;
 
@@ -363,7 +372,7 @@ int rds_ib_recv_refill(struct rds_connection *conn, int prefill)
 		}
 
 		recv = &ic->i_recvs[pos];
-		ret = rds_ib_recv_refill_one(conn, recv);
+		ret = rds_ib_recv_refill_one(conn, recv, prefill);
 		if (ret) {
 			ret = -1;
 			break;
-- 
cgit v1.2.3-59-g8ed1b


From 671202f3491cccdb267f88ad59ba0635aeb2a22e Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Fri, 4 Jun 2010 14:26:32 -0700
Subject: rds: remove unused rds_send_acked_before()

rds_send_acked_before() wasn't blocking interrupts when acquiring c_lock from
user context but nothing calls it.  Rather than fix its use of c_lock we just
remove the function.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---
 net/rds/rds.h  |  1 -
 net/rds/send.c | 29 -----------------------------
 2 files changed, 30 deletions(-)

(limited to 'net')

diff --git a/net/rds/rds.h b/net/rds/rds.h
index cba5f8bb4780..270ded76fd53 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -724,7 +724,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
 typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
 void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
 			 is_acked_func is_acked);
-int rds_send_acked_before(struct rds_connection *conn, u64 seq);
 void rds_send_remove_from_sock(struct list_head *messages, int status);
 int rds_send_pong(struct rds_connection *conn, __be16 dport);
 struct rds_message *rds_send_get_message(struct rds_connection *,
diff --git a/net/rds/send.c b/net/rds/send.c
index a6295993e3e1..b9e41afef323 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -404,35 +404,6 @@ static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
 	return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack;
 }
 
-/*
- * Returns true if there are no messages on the send and retransmit queues
- * which have a sequence number greater than or equal to the given sequence
- * number.
- */
-int rds_send_acked_before(struct rds_connection *conn, u64 seq)
-{
-	struct rds_message *rm, *tmp;
-	int ret = 1;
-
-	spin_lock(&conn->c_lock);
-
-	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
-		if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
-			ret = 0;
-		break;
-	}
-
-	list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
-		if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
-			ret = 0;
-		break;
-	}
-
-	spin_unlock(&conn->c_lock);
-
-	return ret;
-}
-
 /*
  * This is pretty similar to what happens below in the ACK
  * handling code - except that we call here as soon as we get
-- 
cgit v1.2.3-59-g8ed1b


From 501dcccdb7a2335cde07d4acb56e636182d62944 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Fri, 4 Jun 2010 14:25:27 -0700
Subject: rds: block ints when acquiring c_lock in rds_conn_message_info()

conn->c_lock is acquired in interrupt context.  rds_conn_message_info() is
called from user context and was acquiring c_lock without blocking interrupts,
leading to possible deadlocks.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---
 net/rds/connection.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rds/connection.c b/net/rds/connection.c
index 5bd96d538fb9..5bb0eec5ada3 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -375,6 +375,7 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
 	struct rds_connection *conn;
 	struct rds_message *rm;
 	unsigned int total = 0;
+	unsigned long flags;
 	size_t i;
 
 	len /= sizeof(struct rds_info_message);
@@ -389,7 +390,7 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
 			else
 				list = &conn->c_retrans;
 
-			spin_lock(&conn->c_lock);
+			spin_lock_irqsave(&conn->c_lock, flags);
 
 			/* XXX too lazy to maintain counts.. */
 			list_for_each_entry(rm, list, m_conn_item) {
@@ -400,7 +401,7 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
 							  conn->c_faddr, 0);
 			}
 
-			spin_unlock(&conn->c_lock);
+			spin_unlock_irqrestore(&conn->c_lock, flags);
 		}
 	}
 	rcu_read_unlock();
-- 
cgit v1.2.3-59-g8ed1b


From 0f4b1c7e89e699f588807a914ec6e6396c851a72 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Fri, 4 Jun 2010 14:41:41 -0700
Subject: rds: fix rds_send_xmit() serialization

rds_send_xmit() was changed to hold an interrupt masking spinlock instead of a
mutex so that it could be called from the IB receive tasklet path.  This broke
the TCP transport because its xmit method can block and masks and unmasks
interrupts.

This patch serializes callers to rds_send_xmit() with a simple bit instead of
the current spinlock or previous mutex.  This enables rds_send_xmit() to be
called from any context and to call functions which block.  Getting rid of the
c_send_lock exposes the bare c_lock acquisitions which are changed to block
interrupts.

A waitqueue is added so that rds_conn_shutdown() can wait for callers to leave
rds_send_xmit() before tearing down partial send state.  This lets us get rid
of c_senders.

rds_send_xmit() is changed to check the conn state after acquiring the
RDS_IN_XMIT bit to resolve races with the shutdown path.  Previously both
worked with the conn state and then the lock in the same order, allowing them
to race and execute the paths concurrently.

rds_send_reset() isn't racing with rds_send_xmit() now that rds_conn_shutdown()
properly ensures that rds_send_xmit() can't start once the conn state has been
changed.  We can remove its previous use of the spinlock.

Finally, c_send_generation is redundant.  Callers can race to test the c_flags
bit by simply retrying instead of racing to test the c_send_generation atomic.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---
 net/rds/connection.c | 19 ++++--------
 net/rds/ib_send.c    |  2 +-
 net/rds/rds.h        |  5 ++--
 net/rds/send.c       | 82 +++++++++++++++++++++++++++-------------------------
 net/rds/threads.c    |  2 +-
 5 files changed, 52 insertions(+), 58 deletions(-)

(limited to 'net')

diff --git a/net/rds/connection.c b/net/rds/connection.c
index 5bb0eec5ada3..89871db77f8f 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -148,9 +148,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 	spin_lock_init(&conn->c_lock);
 	conn->c_next_tx_seq = 1;
 
-	spin_lock_init(&conn->c_send_lock);
-	atomic_set(&conn->c_send_generation, 1);
-	atomic_set(&conn->c_senders, 0);
+	init_waitqueue_head(&conn->c_waitq);
 	INIT_LIST_HEAD(&conn->c_send_queue);
 	INIT_LIST_HEAD(&conn->c_retrans);
 
@@ -275,15 +273,8 @@ void rds_conn_shutdown(struct rds_connection *conn)
 		}
 		mutex_unlock(&conn->c_cm_lock);
 
-		/* verify everybody's out of rds_send_xmit() */
-		spin_lock_irq(&conn->c_send_lock);
-		spin_unlock_irq(&conn->c_send_lock);
-
-		while(atomic_read(&conn->c_senders)) {
-			schedule_timeout(1);
-			spin_lock_irq(&conn->c_send_lock);
-			spin_unlock_irq(&conn->c_send_lock);
-		}
+		wait_event(conn->c_waitq,
+			   !test_bit(RDS_IN_XMIT, &conn->c_flags));
 
 		conn->c_trans->conn_shutdown(conn);
 		rds_conn_reset(conn);
@@ -477,8 +468,8 @@ static int rds_conn_info_visitor(struct rds_connection *conn,
 		sizeof(cinfo->transport));
 	cinfo->flags = 0;
 
-	rds_conn_info_set(cinfo->flags,
-			  spin_is_locked(&conn->c_send_lock), SENDING);
+	rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &conn->c_flags),
+			  SENDING);
 	/* XXX Future: return the state rather than these funky bits */
 	rds_conn_info_set(cinfo->flags,
 			  atomic_read(&conn->c_state) == RDS_CONN_CONNECTING,
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 3f91e794eae9..e88cb4af009b 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -321,7 +321,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
  * credits (see rds_ib_send_add_credits below).
  *
  * The RDS send code is essentially single-threaded; rds_send_xmit
- * grabs c_send_lock to ensure exclusive access to the send ring.
+ * sets RDS_IN_XMIT to ensure exclusive access to the send ring.
  * However, the ACK sending code is independent and can race with
  * message SENDs.
  *
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 270ded76fd53..4510344ce8ca 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -80,6 +80,7 @@ enum {
 /* Bits for c_flags */
 #define RDS_LL_SEND_FULL	0
 #define RDS_RECONNECT_PENDING	1
+#define RDS_IN_XMIT		2
 
 struct rds_connection {
 	struct hlist_node	c_hash_node;
@@ -91,9 +92,6 @@ struct rds_connection {
 	struct rds_cong_map	*c_lcong;
 	struct rds_cong_map	*c_fcong;
 
-	spinlock_t		c_send_lock;	/* protect send ring */
-	atomic_t		c_send_generation;
-	atomic_t		c_senders;
 	struct rds_message	*c_xmit_rm;
 	unsigned long		c_xmit_sg;
 	unsigned int		c_xmit_hdr_off;
@@ -120,6 +118,7 @@ struct rds_connection {
 	struct delayed_work	c_conn_w;
 	struct work_struct	c_down_w;
 	struct mutex		c_cm_lock;	/* protect conn state & cm */
+	wait_queue_head_t	c_waitq;
 
 	struct list_head	c_map_item;
 	unsigned long		c_map_queued;
diff --git a/net/rds/send.c b/net/rds/send.c
index b9e41afef323..81471b25373b 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -53,14 +53,14 @@ module_param(send_batch_count, int, 0444);
 MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
 
 /*
- * Reset the send state. Caller must hold c_send_lock when calling here.
+ * Reset the send state.  Callers must ensure that this doesn't race with
+ * rds_send_xmit().
  */
 void rds_send_reset(struct rds_connection *conn)
 {
 	struct rds_message *rm, *tmp;
 	unsigned long flags;
 
-	spin_lock_irqsave(&conn->c_send_lock, flags);
 	if (conn->c_xmit_rm) {
 		rm = conn->c_xmit_rm;
 		conn->c_xmit_rm = NULL;
@@ -69,11 +69,7 @@ void rds_send_reset(struct rds_connection *conn)
 		 * independently) but as the connection is down, there's
 		 * no ongoing RDMA to/from that memory */
 		rds_message_unmapped(rm);
-		spin_unlock_irqrestore(&conn->c_send_lock, flags);
-
 		rds_message_put(rm);
-	} else {
-		spin_unlock_irqrestore(&conn->c_send_lock, flags);
 	}
 
 	conn->c_xmit_sg = 0;
@@ -98,6 +94,25 @@ void rds_send_reset(struct rds_connection *conn)
 	spin_unlock_irqrestore(&conn->c_lock, flags);
 }
 
+static int acquire_in_xmit(struct rds_connection *conn)
+{
+	return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0;
+}
+
+static void release_in_xmit(struct rds_connection *conn)
+{
+	clear_bit(RDS_IN_XMIT, &conn->c_flags);
+	smp_mb__after_clear_bit();
+	/*
+	 * We don't use wait_on_bit()/wake_up_bit() because our waking is in a
+	 * hot path and finding waiters is very rare.  We don't want to walk
+	 * the system-wide hashed waitqueue buckets in the fast path only to
+	 * almost never find waiters.
+	 */
+	if (waitqueue_active(&conn->c_waitq))
+		wake_up_all(&conn->c_waitq);
+}
+
 /*
  * We're making the concious trade-off here to only send one message
  * down the connection at a time.
@@ -119,12 +134,9 @@ int rds_send_xmit(struct rds_connection *conn)
 	unsigned int tmp;
 	struct scatterlist *sg;
 	int ret = 0;
-	int gen = 0;
 	LIST_HEAD(to_be_dropped);
 
 restart:
-	if (!rds_conn_up(conn))
-		goto out;
 
 	/*
 	 * sendmsg calls here after having queued its message on the send
@@ -133,18 +145,25 @@ restart:
 	 * avoids blocking the caller and trading per-connection data between
 	 * caches per message.
 	 */
-	if (!spin_trylock_irqsave(&conn->c_send_lock, flags)) {
+	if (!acquire_in_xmit(conn)) {
 		rds_stats_inc(s_send_lock_contention);
 		ret = -ENOMEM;
 		goto out;
 	}
-	atomic_inc(&conn->c_senders);
+
+	/*
+	 * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT,
+	 * we do the opposite to avoid races.
+	 */
+	if (!rds_conn_up(conn)) {
+		release_in_xmit(conn);
+		ret = 0;
+		goto out;
+	}
 
 	if (conn->c_trans->xmit_prepare)
 		conn->c_trans->xmit_prepare(conn);
 
-	gen = atomic_inc_return(&conn->c_send_generation);
-
 	/*
 	 * spin trying to push headers and data down the connection until
 	 * the connection doesn't make forward progress.
@@ -178,7 +197,7 @@ restart:
 		if (!rm) {
 			unsigned int len;
 
-			spin_lock(&conn->c_lock);
+			spin_lock_irqsave(&conn->c_lock, flags);
 
 			if (!list_empty(&conn->c_send_queue)) {
 				rm = list_entry(conn->c_send_queue.next,
@@ -193,7 +212,7 @@ restart:
 				list_move_tail(&rm->m_conn_item, &conn->c_retrans);
 			}
 
-			spin_unlock(&conn->c_lock);
+			spin_unlock_irqrestore(&conn->c_lock, flags);
 
 			if (!rm)
 				break;
@@ -207,10 +226,10 @@ restart:
 			 */
 			if (rm->rdma.op_active &&
 			    test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
-				spin_lock(&conn->c_lock);
+				spin_lock_irqsave(&conn->c_lock, flags);
 				if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
 					list_move(&rm->m_conn_item, &to_be_dropped);
-				spin_unlock(&conn->c_lock);
+				spin_unlock_irqrestore(&conn->c_lock, flags);
 				continue;
 			}
 
@@ -336,19 +355,7 @@ restart:
 	if (conn->c_trans->xmit_complete)
 		conn->c_trans->xmit_complete(conn);
 
-	/*
-	 * We might be racing with another sender who queued a message but
-	 * backed off on noticing that we held the c_send_lock.  If we check
-	 * for queued messages after dropping the sem then either we'll
-	 * see the queued message or the queuer will get the sem.  If we
-	 * notice the queued message then we trigger an immediate retry.
-	 *
-	 * We need to be careful only to do this when we stopped processing
-	 * the send queue because it was empty.  It's the only way we
-	 * stop processing the loop when the transport hasn't taken
-	 * responsibility for forward progress.
-	 */
-	spin_unlock_irqrestore(&conn->c_send_lock, flags);
+	release_in_xmit(conn);
 
 	/* Nuke any messages we decided not to retransmit. */
 	if (!list_empty(&to_be_dropped)) {
@@ -358,13 +365,12 @@ restart:
 		rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
 	}
 
-	atomic_dec(&conn->c_senders);
-
 	/*
-	 * Other senders will see we have c_send_lock and exit. We
-	 * need to recheck the send queue and race again for c_send_lock
-	 * to make sure messages don't just sit on the send queue, if
-	 * somebody hasn't already beat us into the loop.
+	 * Other senders can queue a message after we last test the send queue
+	 * but before we clear RDS_IN_XMIT.  In that case they'd back off and
+	 * not try and send their newly queued message.  We need to check the
+	 * send queue after having cleared RDS_IN_XMIT so that their message
+	 * doesn't get stuck on the send queue.
 	 *
 	 * If the transport cannot continue (i.e ret != 0), then it must
 	 * call us when more room is available, such as from the tx
@@ -374,9 +380,7 @@ restart:
 		smp_mb();
 		if (!list_empty(&conn->c_send_queue)) {
 			rds_stats_inc(s_send_lock_queue_raced);
-			if (gen == atomic_read(&conn->c_send_generation)) {
-				goto restart;
-			}
+			goto restart;
 		}
 	}
 out:
diff --git a/net/rds/threads.c b/net/rds/threads.c
index 7a8ca7a1d983..2bab9bf07b91 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -61,7 +61,7 @@
  *
  * Transition to state DISCONNECTING/DOWN:
  *  -	Inside the shutdown worker; synchronizes with xmit path
- *	through c_send_lock, and with connection management callbacks
+ *	through RDS_IN_XMIT, and with connection management callbacks
  *	via c_cm_lock.
  *
  *	For receive callbacks, we rely on the underlying transport
-- 
cgit v1.2.3-59-g8ed1b


From 6fa70da6081bbcf948801fd5ee0be4d222298a43 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 11 Jun 2010 11:17:59 -0700
Subject: rds: recycle FMRs through lockless lists

FRM allocation and recycling is performance critical and fairly lock
intensive.  The current code has a per connection lock that all
processes bang on and it becomes a major bottleneck on large systems.

This changes things to use a number of cmpxchg based lists instead,
allowing us to go through the whole FMR lifecycle without locking inside
RDS.

Zach Brown pointed out that our usage of cmpxchg for xlist removal is
racey if someone manages to remove and add back an FMR struct into the list
while another CPU can see the FMR's address at the head of the list.

The second CPU might assume the list hasn't changed when in fact any
number of operations might have happened in between the deletion and
reinsertion.

This commit maintains a per cpu count of CPUs that are currently
in xlist removal, and establishes a grace period to make sure that
nobody can see an entry we have just removed from the list.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 net/rds/ib_rdma.c | 214 +++++++++++++++++++++++++++++++++++++++++++-----------
 net/rds/xlist.h   | 110 ++++++++++++++++++++++++++++
 2 files changed, 282 insertions(+), 42 deletions(-)
 create mode 100644 net/rds/xlist.h

(limited to 'net')

diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 64b5ede037c8..8c40391de5a2 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -36,7 +36,10 @@
 
 #include "rds.h"
 #include "ib.h"
+#include "xlist.h"
 
+static DEFINE_PER_CPU(unsigned long, clean_list_grace);
+#define CLEAN_LIST_BUSY_BIT 0
 
 /*
  * This is stored as mr->r_trans_private.
@@ -45,7 +48,11 @@ struct rds_ib_mr {
 	struct rds_ib_device	*device;
 	struct rds_ib_mr_pool	*pool;
 	struct ib_fmr		*fmr;
-	struct list_head	list;
+
+	struct xlist_head	xlist;
+
+	/* unmap_list is for freeing */
+	struct list_head	unmap_list;
 	unsigned int		remap_count;
 
 	struct scatterlist	*sg;
@@ -61,12 +68,14 @@ struct rds_ib_mr_pool {
 	struct mutex		flush_lock;		/* serialize fmr invalidate */
 	struct work_struct	flush_worker;		/* flush worker */
 
-	spinlock_t		list_lock;		/* protect variables below */
 	atomic_t		item_count;		/* total # of MRs */
 	atomic_t		dirty_count;		/* # dirty of MRs */
-	struct list_head	drop_list;		/* MRs that have reached their max_maps limit */
-	struct list_head	free_list;		/* unused MRs */
-	struct list_head	clean_list;		/* unused & unamapped MRs */
+
+	struct xlist_head	drop_list;		/* MRs that have reached their max_maps limit */
+	struct xlist_head	free_list;		/* unused MRs */
+	struct xlist_head	clean_list;		/* global unused & unamapped MRs */
+	wait_queue_head_t	flush_wait;
+
 	atomic_t		free_pinned;		/* memory pinned by free MRs */
 	unsigned long		max_items;
 	unsigned long		max_items_soft;
@@ -74,7 +83,7 @@ struct rds_ib_mr_pool {
 	struct ib_fmr_attr	fmr_attr;
 };
 
-static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all);
+static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **);
 static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
 static void rds_ib_mr_pool_flush_worker(struct work_struct *work);
 
@@ -212,11 +221,11 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
 	if (!pool)
 		return ERR_PTR(-ENOMEM);
 
-	INIT_LIST_HEAD(&pool->free_list);
-	INIT_LIST_HEAD(&pool->drop_list);
-	INIT_LIST_HEAD(&pool->clean_list);
+	INIT_XLIST_HEAD(&pool->free_list);
+	INIT_XLIST_HEAD(&pool->drop_list);
+	INIT_XLIST_HEAD(&pool->clean_list);
 	mutex_init(&pool->flush_lock);
-	spin_lock_init(&pool->list_lock);
+	init_waitqueue_head(&pool->flush_wait);
 	INIT_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
 
 	pool->fmr_attr.max_pages = fmr_message_size;
@@ -246,27 +255,50 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co
 void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
 {
 	cancel_work_sync(&pool->flush_worker);
-	rds_ib_flush_mr_pool(pool, 1);
+	rds_ib_flush_mr_pool(pool, 1, NULL);
 	WARN_ON(atomic_read(&pool->item_count));
 	WARN_ON(atomic_read(&pool->free_pinned));
 	kfree(pool);
 }
 
+static void refill_local(struct rds_ib_mr_pool *pool, struct xlist_head *xl,
+			 struct rds_ib_mr **ibmr_ret)
+{
+	struct xlist_head *ibmr_xl;
+	ibmr_xl = xlist_del_head_fast(xl);
+	*ibmr_ret = list_entry(ibmr_xl, struct rds_ib_mr, xlist);
+}
+
 static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
 {
 	struct rds_ib_mr *ibmr = NULL;
-	unsigned long flags;
+	struct xlist_head *ret;
+	unsigned long *flag;
 
-	spin_lock_irqsave(&pool->list_lock, flags);
-	if (!list_empty(&pool->clean_list)) {
-		ibmr = list_entry(pool->clean_list.next, struct rds_ib_mr, list);
-		list_del_init(&ibmr->list);
-	}
-	spin_unlock_irqrestore(&pool->list_lock, flags);
+	preempt_disable();
+	flag = &__get_cpu_var(clean_list_grace);
+	set_bit(CLEAN_LIST_BUSY_BIT, flag);
+	ret = xlist_del_head(&pool->clean_list);
+	if (ret)
+		ibmr = list_entry(ret, struct rds_ib_mr, xlist);
 
+	clear_bit(CLEAN_LIST_BUSY_BIT, flag);
+	preempt_enable();
 	return ibmr;
 }
 
+static inline void wait_clean_list_grace(void)
+{
+	int cpu;
+	unsigned long *flag;
+
+	for_each_online_cpu(cpu) {
+		flag = &per_cpu(clean_list_grace, cpu);
+		while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
+			cpu_relax();
+	}
+}
+
 static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
 {
 	struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
@@ -299,7 +331,9 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
 
 		/* We do have some empty MRs. Flush them out. */
 		rds_ib_stats_inc(s_ib_rdma_mr_pool_wait);
-		rds_ib_flush_mr_pool(pool, 0);
+		rds_ib_flush_mr_pool(pool, 0, &ibmr);
+		if (ibmr)
+			return ibmr;
 	}
 
 	ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev));
@@ -493,34 +527,110 @@ static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int fr
 	return 0;
 }
 
+/*
+ * given an xlist of mrs, put them all into the list_head for more processing
+ */
+static void xlist_append_to_list(struct xlist_head *xlist, struct list_head *list)
+{
+	struct rds_ib_mr *ibmr;
+	struct xlist_head splice;
+	struct xlist_head *cur;
+	struct xlist_head *next;
+
+	splice.next = NULL;
+	xlist_splice(xlist, &splice);
+	cur = splice.next;
+	while (cur) {
+		next = cur->next;
+		ibmr = list_entry(cur, struct rds_ib_mr, xlist);
+		list_add_tail(&ibmr->unmap_list, list);
+		cur = next;
+	}
+}
+
+/*
+ * this takes a list head of mrs and turns it into an xlist of clusters.
+ * each cluster has an xlist of MR_CLUSTER_SIZE mrs that are ready for
+ * reuse.
+ */
+static void list_append_to_xlist(struct rds_ib_mr_pool *pool,
+				struct list_head *list, struct xlist_head *xlist,
+				struct xlist_head **tail_ret)
+{
+	struct rds_ib_mr *ibmr;
+	struct xlist_head *cur_mr = xlist;
+	struct xlist_head *tail_mr = NULL;
+
+	list_for_each_entry(ibmr, list, unmap_list) {
+		tail_mr = &ibmr->xlist;
+		tail_mr->next = NULL;
+		cur_mr->next = tail_mr;
+		cur_mr = tail_mr;
+	}
+	*tail_ret = tail_mr;
+}
+
 /*
  * Flush our pool of MRs.
  * At a minimum, all currently unused MRs are unmapped.
  * If the number of MRs allocated exceeds the limit, we also try
  * to free as many MRs as needed to get back to this limit.
  */
-static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
+static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
+			        int free_all, struct rds_ib_mr **ibmr_ret)
 {
 	struct rds_ib_mr *ibmr, *next;
+	struct xlist_head clean_xlist;
+	struct xlist_head *clean_tail;
 	LIST_HEAD(unmap_list);
 	LIST_HEAD(fmr_list);
 	unsigned long unpinned = 0;
-	unsigned long flags;
 	unsigned int nfreed = 0, ncleaned = 0, free_goal;
 	int ret = 0;
 
 	rds_ib_stats_inc(s_ib_rdma_mr_pool_flush);
 
-	mutex_lock(&pool->flush_lock);
+	if (ibmr_ret) {
+		DEFINE_WAIT(wait);
+		while(!mutex_trylock(&pool->flush_lock)) {
+			ibmr = rds_ib_reuse_fmr(pool);
+			if (ibmr) {
+				*ibmr_ret = ibmr;
+				finish_wait(&pool->flush_wait, &wait);
+				goto out_nolock;
+			}
+
+			prepare_to_wait(&pool->flush_wait, &wait,
+					TASK_UNINTERRUPTIBLE);
+			if (xlist_empty(&pool->clean_list))
+				schedule();
+
+			ibmr = rds_ib_reuse_fmr(pool);
+			if (ibmr) {
+				*ibmr_ret = ibmr;
+				finish_wait(&pool->flush_wait, &wait);
+				goto out_nolock;
+			}
+		}
+		finish_wait(&pool->flush_wait, &wait);
+	} else
+		mutex_lock(&pool->flush_lock);
+
+	if (ibmr_ret) {
+		ibmr = rds_ib_reuse_fmr(pool);
+		if (ibmr) {
+			*ibmr_ret = ibmr;
+			goto out;
+		}
+	}
 
-	spin_lock_irqsave(&pool->list_lock, flags);
 	/* Get the list of all MRs to be dropped. Ordering matters -
-	 * we want to put drop_list ahead of free_list. */
-	list_splice_init(&pool->free_list, &unmap_list);
-	list_splice_init(&pool->drop_list, &unmap_list);
+	 * we want to put drop_list ahead of free_list.
+	 */
+	xlist_append_to_list(&pool->drop_list, &unmap_list);
+	xlist_append_to_list(&pool->free_list, &unmap_list);
 	if (free_all)
-		list_splice_init(&pool->clean_list, &unmap_list);
-	spin_unlock_irqrestore(&pool->list_lock, flags);
+		xlist_append_to_list(&pool->clean_list, &unmap_list);
 
 	free_goal = rds_ib_flush_goal(pool, free_all);
 
@@ -528,19 +638,20 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
 		goto out;
 
 	/* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
-	list_for_each_entry(ibmr, &unmap_list, list)
+	list_for_each_entry(ibmr, &unmap_list, unmap_list)
 		list_add(&ibmr->fmr->list, &fmr_list);
+
 	ret = ib_unmap_fmr(&fmr_list);
 	if (ret)
 		printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret);
 
 	/* Now we can destroy the DMA mapping and unpin any pages */
-	list_for_each_entry_safe(ibmr, next, &unmap_list, list) {
+	list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) {
 		unpinned += ibmr->sg_len;
 		__rds_ib_teardown_mr(ibmr);
 		if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) {
 			rds_ib_stats_inc(s_ib_rdma_mr_free);
-			list_del(&ibmr->list);
+			list_del(&ibmr->unmap_list);
 			ib_dealloc_fmr(ibmr->fmr);
 			kfree(ibmr);
 			nfreed++;
@@ -548,9 +659,27 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
 		ncleaned++;
 	}
 
-	spin_lock_irqsave(&pool->list_lock, flags);
-	list_splice(&unmap_list, &pool->clean_list);
-	spin_unlock_irqrestore(&pool->list_lock, flags);
+	if (!list_empty(&unmap_list)) {
+		/* we have to make sure that none of the things we're about
+		 * to put on the clean list would race with other cpus trying
+		 * to pull items off.  The xlist would explode if we managed to
+		 * remove something from the clean list and then add it back again
+		 * while another CPU was spinning on that same item in xlist_del_head.
+		 *
+		 * This is pretty unlikely, but just in case  wait for an xlist grace period
+		 * here before adding anything back into the clean list.
+		 */
+		wait_clean_list_grace();
+
+		list_append_to_xlist(pool, &unmap_list, &clean_xlist, &clean_tail);
+		if (ibmr_ret)
+			refill_local(pool, &clean_xlist, ibmr_ret);
+
+		/* refill_local may have emptied our list */
+		if (!xlist_empty(&clean_xlist))
+			xlist_add(clean_xlist.next, clean_tail, &pool->clean_list);
+
+	}
 
 	atomic_sub(unpinned, &pool->free_pinned);
 	atomic_sub(ncleaned, &pool->dirty_count);
@@ -558,6 +687,9 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
 
 out:
 	mutex_unlock(&pool->flush_lock);
+	if (waitqueue_active(&pool->flush_wait))
+		wake_up(&pool->flush_wait);
+out_nolock:
 	return ret;
 }
 
@@ -565,7 +697,7 @@ static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
 {
 	struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker);
 
-	rds_ib_flush_mr_pool(pool, 0);
+	rds_ib_flush_mr_pool(pool, 0, NULL);
 }
 
 void rds_ib_free_mr(void *trans_private, int invalidate)
@@ -573,20 +705,17 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
 	struct rds_ib_mr *ibmr = trans_private;
 	struct rds_ib_device *rds_ibdev = ibmr->device;
 	struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
-	unsigned long flags;
 
 	rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
 
 	/* Return it to the pool's free list */
-	spin_lock_irqsave(&pool->list_lock, flags);
 	if (ibmr->remap_count >= pool->fmr_attr.max_maps)
-		list_add(&ibmr->list, &pool->drop_list);
+		xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->drop_list);
 	else
-		list_add(&ibmr->list, &pool->free_list);
+		xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->free_list);
 
 	atomic_add(ibmr->sg_len, &pool->free_pinned);
 	atomic_inc(&pool->dirty_count);
-	spin_unlock_irqrestore(&pool->list_lock, flags);
 
 	/* If we've pinned too many pages, request a flush */
 	if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
@@ -595,7 +724,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
 
 	if (invalidate) {
 		if (likely(!in_interrupt())) {
-			rds_ib_flush_mr_pool(pool, 0);
+			rds_ib_flush_mr_pool(pool, 0, NULL);
 		} else {
 			/* We get here if the user created a MR marked
 			 * as use_once and invalidate at the same time. */
@@ -614,7 +743,7 @@ void rds_ib_flush_mrs(void)
 		struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
 
 		if (pool)
-			rds_ib_flush_mr_pool(pool, 0);
+			rds_ib_flush_mr_pool(pool, 0, NULL);
 	}
 }
 
@@ -659,3 +788,4 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
 		rds_ib_dev_put(rds_ibdev);
 	return ibmr;
 }
+
diff --git a/net/rds/xlist.h b/net/rds/xlist.h
new file mode 100644
index 000000000000..8c21aca49d50
--- /dev/null
+++ b/net/rds/xlist.h
@@ -0,0 +1,110 @@
+#ifndef _LINUX_XLIST_H
+#define _LINUX_XLIST_H
+
+#include <linux/stddef.h>
+#include <linux/poison.h>
+#include <linux/prefetch.h>
+#include <asm/system.h>
+
+struct xlist_head {
+	struct xlist_head *next;
+};
+
+/*
+ * XLIST_PTR_TAIL can be used to prevent double insertion.  See
+ * xlist_protect()
+ */
+#define XLIST_PTR_TAIL ((struct xlist_head *)0x1)
+
+static inline void xlist_add(struct xlist_head *new, struct xlist_head *tail, struct xlist_head *head)
+{
+	struct xlist_head *cur;
+	struct xlist_head *check;
+
+	while (1) {
+		cur = head->next;
+		tail->next = cur;
+		check = cmpxchg(&head->next, cur, new);
+		if (check == cur)
+			break;
+	}
+}
+
+/*
+ * To avoid duplicate insertion by two CPUs of the same xlist item
+ * you can call xlist_protect.  It will stuff XLIST_PTR_TAIL
+ * into the entry->next pointer with xchg, and only return 1
+ * if there was a NULL there before.
+ *
+ * if xlist_protect returns zero, someone else is busy working
+ * on this entry.  Getting a NULL into the entry in a race
+ * free manner is the caller's job.
+ */
+static inline int xlist_protect(struct xlist_head *entry)
+{
+	struct xlist_head *val;
+
+	val = xchg(&entry->next, XLIST_PTR_TAIL);
+	if (val == NULL)
+		return 1;
+	return 0;
+}
+
+static inline struct xlist_head *xlist_del_head(struct xlist_head *head)
+{
+	struct xlist_head *cur;
+	struct xlist_head *check;
+	struct xlist_head *next;
+
+	while (1) {
+		cur = head->next;
+		if (!cur)
+			goto out;
+
+		if (cur == XLIST_PTR_TAIL) {
+			cur = NULL;
+			goto out;
+		}
+
+		next = cur->next;
+		check = cmpxchg(&head->next, cur, next);
+		if (check == cur)
+			goto out;
+	}
+out:
+	return cur;
+}
+
+static inline struct xlist_head *xlist_del_head_fast(struct xlist_head *head)
+{
+	struct xlist_head *cur;
+
+	cur = head->next;
+	if (!cur || cur == XLIST_PTR_TAIL)
+		return NULL;
+
+	head->next = cur->next;
+	return cur;
+}
+
+static inline void xlist_splice(struct xlist_head *list,
+				struct xlist_head *head)
+{
+	struct xlist_head *cur;
+
+	WARN_ON(head->next);
+	cur = xchg(&list->next, NULL);
+	head->next = cur;
+}
+
+static inline void INIT_XLIST_HEAD(struct xlist_head *list)
+{
+	list->next = NULL;
+}
+
+static inline int xlist_empty(struct xlist_head *head)
+{
+	return head->next == NULL || head->next == XLIST_PTR_TAIL;
+}
+
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From eabb732279f1a41ac9d066aeb56973ae505c4cbc Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 11 Jun 2010 11:18:57 -0700
Subject: rds: more FMRs are faster

When we add more FMRs, we flush them less often and so we go faster.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 net/rds/ib.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rds/ib.h b/net/rds/ib.h
index 2efd9d11e7d4..d2dd23d5c1a0 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -9,7 +9,7 @@
 #include "rdma_transport.h"
 
 #define RDS_FMR_SIZE			256
-#define RDS_FMR_POOL_SIZE		4096
+#define RDS_FMR_POOL_SIZE		8192
 
 #define RDS_IB_MAX_SGE			8
 #define RDS_IB_RECV_SGE 		2
-- 
cgit v1.2.3-59-g8ed1b


From 7a0ff5dbdd0b4cb7ea8764da9d78f4bb2eebaf31 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 11 Jun 2010 11:26:02 -0700
Subject: RDS: use delayed work for the FMR flushes

Using a delayed work queue helps us make sure a healthy number of FMRs
have queued up over the limit.  It makes for a large improvement in RDMA
iops.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 net/rds/ib_rdma.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 8c40391de5a2..3a275af9d52f 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -66,7 +66,7 @@ struct rds_ib_mr {
  */
 struct rds_ib_mr_pool {
 	struct mutex		flush_lock;		/* serialize fmr invalidate */
-	struct work_struct	flush_worker;		/* flush worker */
+	struct delayed_work	flush_worker;		/* flush worker */
 
 	atomic_t		item_count;		/* total # of MRs */
 	atomic_t		dirty_count;		/* # dirty of MRs */
@@ -226,7 +226,7 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
 	INIT_XLIST_HEAD(&pool->clean_list);
 	mutex_init(&pool->flush_lock);
 	init_waitqueue_head(&pool->flush_wait);
-	INIT_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
+	INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
 
 	pool->fmr_attr.max_pages = fmr_message_size;
 	pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
@@ -254,7 +254,7 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co
 
 void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
 {
-	cancel_work_sync(&pool->flush_worker);
+	cancel_delayed_work_sync(&pool->flush_worker);
 	rds_ib_flush_mr_pool(pool, 1, NULL);
 	WARN_ON(atomic_read(&pool->item_count));
 	WARN_ON(atomic_read(&pool->free_pinned));
@@ -695,7 +695,7 @@ out_nolock:
 
 static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
 {
-	struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker);
+	struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work);
 
 	rds_ib_flush_mr_pool(pool, 0, NULL);
 }
@@ -720,7 +720,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
 	/* If we've pinned too many pages, request a flush */
 	if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
 	    atomic_read(&pool->dirty_count) >= pool->max_items / 10)
-		queue_work(rds_wq, &pool->flush_worker);
+		queue_delayed_work(rds_wq, &pool->flush_worker, 10);
 
 	if (invalidate) {
 		if (likely(!in_interrupt())) {
@@ -728,7 +728,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
 		} else {
 			/* We get here if the user created a MR marked
 			 * as use_once and invalidate at the same time. */
-			queue_work(rds_wq, &pool->flush_worker);
+			queue_delayed_work(rds_wq, &pool->flush_worker, 10);
 		}
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From c9455d9996ba84af1f534c7e3944ea6f35d2fc54 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Fri, 11 Jun 2010 15:18:51 -0700
Subject: RDS: whitespace

---
 net/rds/connection.c | 1 -
 net/rds/ib_rdma.c    | 1 -
 2 files changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/rds/connection.c b/net/rds/connection.c
index 89871db77f8f..7670b45680e1 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -120,7 +120,6 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 	unsigned long flags;
 	int ret;
 
-
 	rcu_read_lock();
 	conn = rds_conn_lookup(head, laddr, faddr, trans);
 	if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 3a275af9d52f..2a4ec1182ba6 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -349,7 +349,6 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
 			 IB_ACCESS_REMOTE_READ |
 			 IB_ACCESS_REMOTE_WRITE|
 			 IB_ACCESS_REMOTE_ATOMIC),
-
 			&pool->fmr_attr);
 	if (IS_ERR(ibmr->fmr)) {
 		err = PTR_ERR(ibmr->fmr);
-- 
cgit v1.2.3-59-g8ed1b


From fbf4d7e3d03587a983ee4e536251ea6c1c848ec2 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Fri, 11 Jun 2010 16:24:42 -0700
Subject: RDS: Remove unused XLIST_PTR_TAIL and xlist_protect()

Not used.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/xlist.h | 54 ++++++++++++------------------------------------------
 1 file changed, 12 insertions(+), 42 deletions(-)

(limited to 'net')

diff --git a/net/rds/xlist.h b/net/rds/xlist.h
index 8c21aca49d50..e6b5190daddd 100644
--- a/net/rds/xlist.h
+++ b/net/rds/xlist.h
@@ -10,13 +10,18 @@ struct xlist_head {
 	struct xlist_head *next;
 };
 
-/*
- * XLIST_PTR_TAIL can be used to prevent double insertion.  See
- * xlist_protect()
- */
-#define XLIST_PTR_TAIL ((struct xlist_head *)0x1)
+static inline void INIT_XLIST_HEAD(struct xlist_head *list)
+{
+	list->next = NULL;
+}
+
+static inline int xlist_empty(struct xlist_head *head)
+{
+	return head->next == NULL;
+}
 
-static inline void xlist_add(struct xlist_head *new, struct xlist_head *tail, struct xlist_head *head)
+static inline void xlist_add(struct xlist_head *new, struct xlist_head *tail,
+			     struct xlist_head *head)
 {
 	struct xlist_head *cur;
 	struct xlist_head *check;
@@ -30,26 +35,6 @@ static inline void xlist_add(struct xlist_head *new, struct xlist_head *tail, st
 	}
 }
 
-/*
- * To avoid duplicate insertion by two CPUs of the same xlist item
- * you can call xlist_protect.  It will stuff XLIST_PTR_TAIL
- * into the entry->next pointer with xchg, and only return 1
- * if there was a NULL there before.
- *
- * if xlist_protect returns zero, someone else is busy working
- * on this entry.  Getting a NULL into the entry in a race
- * free manner is the caller's job.
- */
-static inline int xlist_protect(struct xlist_head *entry)
-{
-	struct xlist_head *val;
-
-	val = xchg(&entry->next, XLIST_PTR_TAIL);
-	if (val == NULL)
-		return 1;
-	return 0;
-}
-
 static inline struct xlist_head *xlist_del_head(struct xlist_head *head)
 {
 	struct xlist_head *cur;
@@ -61,11 +46,6 @@ static inline struct xlist_head *xlist_del_head(struct xlist_head *head)
 		if (!cur)
 			goto out;
 
-		if (cur == XLIST_PTR_TAIL) {
-			cur = NULL;
-			goto out;
-		}
-
 		next = cur->next;
 		check = cmpxchg(&head->next, cur, next);
 		if (check == cur)
@@ -80,7 +60,7 @@ static inline struct xlist_head *xlist_del_head_fast(struct xlist_head *head)
 	struct xlist_head *cur;
 
 	cur = head->next;
-	if (!cur || cur == XLIST_PTR_TAIL)
+	if (!cur)
 		return NULL;
 
 	head->next = cur->next;
@@ -97,14 +77,4 @@ static inline void xlist_splice(struct xlist_head *list,
 	head->next = cur;
 }
 
-static inline void INIT_XLIST_HEAD(struct xlist_head *list)
-{
-	list->next = NULL;
-}
-
-static inline int xlist_empty(struct xlist_head *head)
-{
-	return head->next == NULL || head->next == XLIST_PTR_TAIL;
-}
-
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From b6fb0df12db6c8b6bbcc7b5c9459b3bbf5f0cee6 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Wed, 23 Jun 2010 18:06:30 -0700
Subject: RDS/IB: Make ib_recv_refill return void

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/ib.h      | 2 +-
 net/rds/ib_recv.c | 6 +-----
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.h b/net/rds/ib.h
index d2dd23d5c1a0..6265ada08959 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -323,7 +323,7 @@ void rds_ib_recv_exit(void);
 int rds_ib_recv(struct rds_connection *conn);
 int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic);
 void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
-int rds_ib_recv_refill(struct rds_connection *conn, int prefill);
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill);
 void rds_ib_inc_free(struct rds_incoming *inc);
 int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
 			     size_t size);
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 1add097fe198..c8c60981cf2e 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -353,7 +353,7 @@ out:
  *
  * -1 is returned if posting fails due to temporary resource exhaustion.
  */
-int rds_ib_recv_refill(struct rds_connection *conn, int prefill)
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
 {
 	struct rds_ib_connection *ic = conn->c_transport_data;
 	struct rds_ib_recv_work *recv;
@@ -367,14 +367,12 @@ int rds_ib_recv_refill(struct rds_connection *conn, int prefill)
 		if (pos >= ic->i_recv_ring.w_nr) {
 			printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
 					pos);
-			ret = -EINVAL;
 			break;
 		}
 
 		recv = &ic->i_recvs[pos];
 		ret = rds_ib_recv_refill_one(conn, recv, prefill);
 		if (ret) {
-			ret = -1;
 			break;
 		}
 
@@ -388,7 +386,6 @@ int rds_ib_recv_refill(struct rds_connection *conn, int prefill)
 			       "%pI4 returned %d, disconnecting and "
 			       "reconnecting\n", &conn->c_faddr,
 			       ret);
-			ret = -1;
 			break;
 		}
 
@@ -401,7 +398,6 @@ int rds_ib_recv_refill(struct rds_connection *conn, int prefill)
 
 	if (ret)
 		rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
-	return ret;
 }
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 24fa163a4bae74b3378d30e1bc776568cfca8121 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Fri, 25 Jun 2010 14:59:49 -0700
Subject: RDS/IB: wait for IB dev freeing work to finish during rmmod

The RDS IB client removal callback can queue work to drop the final reference
to an IB device.  We have to make sure that this function has returned before
we complete rmmod or the work threads can try to execute freed code.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---
 net/rds/ib.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.c b/net/rds/ib.c
index b21e24fd060c..fc14f637d645 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -336,11 +336,18 @@ static int rds_ib_laddr_check(__be32 addr)
 	return ret;
 }
 
+static void rds_ib_unregister_client(void)
+{
+	ib_unregister_client(&rds_ib_client);
+	/* wait for rds_ib_dev_free() to complete */
+	flush_workqueue(rds_wq);
+}
+
 void rds_ib_exit(void)
 {
 	rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
 	rds_ib_destroy_nodev_conns();
-	ib_unregister_client(&rds_ib_client);
+	rds_ib_unregister_client();
 	rds_ib_sysctl_exit();
 	rds_ib_recv_exit();
 	rds_trans_unregister(&rds_ib_transport);
@@ -404,7 +411,7 @@ out_recv:
 out_sysctl:
 	rds_ib_sysctl_exit();
 out_ibreg:
-	ib_unregister_client(&rds_ib_client);
+	rds_ib_unregister_client();
 out:
 	return ret;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 8aeb1ba6630ffd44001ae9833842794df0107676 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Fri, 25 Jun 2010 14:58:16 -0700
Subject: RDS/IB: destroy connections on rmmod

IB connections were not being destroyed during rmmod.

First, recently IB device removal callback was changed to disconnect
connections that used the removing device rather than destroying them.  So
connections with devices during rmmod were not being destroyed.

Second, rds_ib_destroy_nodev_conns() was being called before connections are
disassociated with devices.  It would almost never find connections in the
nodev list.

We first get rid of rds_ib_destroy_conns(), which is no longer called, and
refactor the existing caller into the main body of the function and get rid of
the list and lock wrappers.

Then we call rds_ib_destroy_nodev_conns() *after* ib_unregister_client() has
removed the IB device from all the conns and put the conns on the nodev list.

The result is that IB connections are destroyed by rmmod.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---
 net/rds/ib.c      |  2 +-
 net/rds/ib.h      | 10 +---------
 net/rds/ib_rdma.c |  9 ++++-----
 3 files changed, 6 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.c b/net/rds/ib.c
index fc14f637d645..af1ef18b6ff0 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -346,8 +346,8 @@ static void rds_ib_unregister_client(void)
 void rds_ib_exit(void)
 {
 	rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
-	rds_ib_destroy_nodev_conns();
 	rds_ib_unregister_client();
+	rds_ib_destroy_nodev_conns();
 	rds_ib_sysctl_exit();
 	rds_ib_recv_exit();
 	rds_trans_unregister(&rds_ib_transport);
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 6265ada08959..e9f9ddf440ca 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -299,15 +299,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn,
 int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
 void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
 void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
-void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock);
-static inline void rds_ib_destroy_nodev_conns(void)
-{
-	__rds_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock);
-}
-static inline void rds_ib_destroy_conns(struct rds_ib_device *rds_ibdev)
-{
-	__rds_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock);
-}
+void rds_ib_destroy_nodev_conns(void);
 struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *);
 void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
 void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 2a4ec1182ba6..00f3995351c8 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -198,16 +198,15 @@ void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *
 	rds_ib_dev_put(rds_ibdev);
 }
 
-void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock)
+void rds_ib_destroy_nodev_conns(void)
 {
 	struct rds_ib_connection *ic, *_ic;
 	LIST_HEAD(tmp_list);
 
 	/* avoid calling conn_destroy with irqs off */
-	spin_lock_irq(list_lock);
-	list_splice(list, &tmp_list);
-	INIT_LIST_HEAD(list);
-	spin_unlock_irq(list_lock);
+	spin_lock_irq(&ib_nodev_conns_lock);
+	list_splice(&ib_nodev_conns, &tmp_list);
+	spin_unlock_irq(&ib_nodev_conns_lock);
 
 	list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node)
 		rds_conn_destroy(ic->conn);
-- 
cgit v1.2.3-59-g8ed1b


From 515e079dab19cf774d1eec6e5f4ed65509e31ef1 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Tue, 6 Jul 2010 15:09:56 -0700
Subject: RDS/IB: create a work queue for FMR flushing

This patch moves the FMR flushing work in to its own mult-threaded work queue.
This is to maintain performance in preparation for returning the main krdsd
work queue back to a single threaded work queue to avoid deep-rooted
concurrency bugs.

This is also good because it further separates FMRs, which might be removed
some day, from the rest of the code base.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---
 net/rds/ib.c      |  9 ++++++++-
 net/rds/ib.h      |  2 ++
 net/rds/ib_rdma.c | 25 +++++++++++++++++++++++--
 3 files changed, 33 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.c b/net/rds/ib.c
index af1ef18b6ff0..d2007b931616 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -351,6 +351,7 @@ void rds_ib_exit(void)
 	rds_ib_sysctl_exit();
 	rds_ib_recv_exit();
 	rds_trans_unregister(&rds_ib_transport);
+	rds_ib_fmr_exit();
 }
 
 struct rds_transport rds_ib_transport = {
@@ -386,10 +387,14 @@ int __init rds_ib_init(void)
 
 	INIT_LIST_HEAD(&rds_ib_devices);
 
-	ret = ib_register_client(&rds_ib_client);
+	ret = rds_ib_fmr_init();
 	if (ret)
 		goto out;
 
+	ret = ib_register_client(&rds_ib_client);
+	if (ret)
+		goto out_fmr_exit;
+
 	ret = rds_ib_sysctl_init();
 	if (ret)
 		goto out_ibreg;
@@ -412,6 +417,8 @@ out_sysctl:
 	rds_ib_sysctl_exit();
 out_ibreg:
 	rds_ib_unregister_client();
+out_fmr_exit:
+	rds_ib_fmr_exit();
 out:
 	return ret;
 }
diff --git a/net/rds/ib.h b/net/rds/ib.h
index e9f9ddf440ca..fd4ea69d2443 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -308,6 +308,8 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
 void rds_ib_sync_mr(void *trans_private, int dir);
 void rds_ib_free_mr(void *trans_private, int invalidate);
 void rds_ib_flush_mrs(void);
+int __init rds_ib_fmr_init(void);
+void __exit rds_ib_fmr_exit(void);
 
 /* ib_recv.c */
 int __init rds_ib_recv_init(void);
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 00f3995351c8..0eb597670c5b 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -691,6 +691,26 @@ out_nolock:
 	return ret;
 }
 
+struct workqueue_struct *rds_ib_fmr_wq;
+
+int __init rds_ib_fmr_init(void)
+{
+	rds_ib_fmr_wq = create_workqueue("rds_fmr_flushd");
+	if (!rds_ib_fmr_wq)
+		return -ENOMEM;
+	return 0;
+}
+
+/*
+ * By the time this is called all the IB devices should have been torn down and
+ * had their pools freed.  As each pool is freed its work struct is waited on,
+ * so the pool flushing work queue should be idle by the time we get here.
+ */
+void __exit rds_ib_fmr_exit(void)
+{
+	destroy_workqueue(rds_ib_fmr_wq);
+}
+
 static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
 {
 	struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work);
@@ -718,7 +738,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
 	/* If we've pinned too many pages, request a flush */
 	if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
 	    atomic_read(&pool->dirty_count) >= pool->max_items / 10)
-		queue_delayed_work(rds_wq, &pool->flush_worker, 10);
+		queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
 
 	if (invalidate) {
 		if (likely(!in_interrupt())) {
@@ -726,7 +746,8 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
 		} else {
 			/* We get here if the user created a MR marked
 			 * as use_once and invalidate at the same time. */
-			queue_delayed_work(rds_wq, &pool->flush_worker, 10);
+			queue_delayed_work(rds_ib_fmr_wq,
+					   &pool->flush_worker, 10);
 		}
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 80c51be56ffa257d3177f0d750d90be65d30c22f Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Tue, 6 Jul 2010 15:08:48 -0700
Subject: RDS: return to a single-threaded krdsd

We were seeing very nasty bugs due to fundamental assumption the current code
makes about concurrent work struct processing.  The code simpy isn't able to
handle concurrent connection shutdown work function execution today, for
example, which is very much possible once a multi-threaded krdsd was
introduced.  The problem compounds as additional work structs are added to the
mix.

krdsd is no longer perforance critical now that send and receive posting and
FMR flushing are done elsewhere, so the safest fix is to move back to the
single threaded krdsd that the current code was built around.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---
 net/rds/threads.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rds/threads.c b/net/rds/threads.c
index 2bab9bf07b91..c08c220efac5 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -214,7 +214,7 @@ void rds_threads_exit(void)
 
 int __init rds_threads_init(void)
 {
-	rds_wq = create_workqueue("krdsd");
+	rds_wq = create_singlethread_workqueue("krdsd");
 	if (!rds_wq)
 		return -ENOMEM;
 
-- 
cgit v1.2.3-59-g8ed1b


From d455ab64096b9a86849c7315c53e595330842db6 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Tue, 6 Jul 2010 15:04:34 -0700
Subject: RDS/IB: always process recv completions

The recv refill path was leaking fragments because the recv event handler had
marked a ring element as free without freeing its frag.  This was happening
because it wasn't processing receives when the conn wasn't marked up or
connecting, as can be the case if it races with rmmod.

Two observations support always processing receives in the callback.

First, buildup should only post receives, thus triggering recv event handler
calls, once it has built up all the state to handle them.  Teardown should
destroy the CQ and drain the ring before tearing down the state needed to
process recvs.  Both appear to be true today.

Second, this test was fundamentally racy.  There is nothing to stop rmmod and
connection destruction from swooping in the moment after the conn state was
sampled but before real receive procesing starts.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---
 net/rds/ib_recv.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index c8c60981cf2e..9c4208f6b451 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -979,18 +979,22 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic,
 		 * to get a recv completion _before_ the rdmacm ESTABLISHED
 		 * event is processed.
 		 */
-		if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
+		if (wc.status == IB_WC_SUCCESS) {
+			rds_ib_process_recv(conn, recv, wc.byte_len, state);
+		} else {
 			/* We expect errors as the qp is drained during shutdown */
-			if (wc.status == IB_WC_SUCCESS) {
-				rds_ib_process_recv(conn, recv, wc.byte_len, state);
-			} else {
+			if (rds_conn_up(conn) || rds_conn_connecting(conn))
 				rds_ib_conn_error(conn, "recv completion on "
-				       "%pI4 had status %u, disconnecting and "
-				       "reconnecting\n", &conn->c_faddr,
-				       wc.status);
-			}
+						  "%pI4 had status %u, disconnecting and "
+						  "reconnecting\n", &conn->c_faddr,
+						  wc.status);
 		}
 
+		/*
+		 * It's very important that we only free this ring entry if we've truly
+		 * freed the resources allocated to the entry.  The refilling path can
+		 * leak if we don't.
+		 */
 		rds_ib_ring_free(&ic->i_recv_ring, 1);
 	}
 }
-- 
cgit v1.2.3-59-g8ed1b


From c20f5b9633bb0953bd2422f0f1430a2028cdbd0a Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Wed, 7 Jul 2010 16:46:26 -0700
Subject: RDS/IB: Use SLAB_HWCACHE_ALIGN flag for kmem_cache_create()

We are *definitely* counting cycles as closely as DaveM, so
ensure hwcache alignment for our recv ring control structs.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 net/rds/ib_recv.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 9c4208f6b451..37dab2898ad0 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -1051,13 +1051,13 @@ int __init rds_ib_recv_init(void)
 
 	rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
 					sizeof(struct rds_ib_incoming),
-					0, 0, NULL);
+					0, SLAB_HWCACHE_ALIGN, NULL);
 	if (!rds_ib_incoming_slab)
 		goto out;
 
 	rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
 					sizeof(struct rds_page_frag),
-					0, 0, NULL);
+					0, SLAB_HWCACHE_ALIGN, NULL);
 	if (!rds_ib_frag_slab)
 		kmem_cache_destroy(rds_ib_incoming_slab);
 	else
-- 
cgit v1.2.3-59-g8ed1b


From ef87b7ea39a91906218a262686bcb8bad8b6b46e Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Fri, 9 Jul 2010 12:26:20 -0700
Subject: RDS: remove __init and __exit annotation

The trivial amount of memory saved isn't worth the cost of dealing with section
mismatches.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---
 net/rds/af_rds.c         |  4 ++--
 net/rds/connection.c     |  2 +-
 net/rds/ib.c             |  2 +-
 net/rds/ib.h             | 10 +++++-----
 net/rds/ib_rdma.c        |  4 ++--
 net/rds/ib_recv.c        |  2 +-
 net/rds/ib_sysctl.c      |  2 +-
 net/rds/iw.c             |  2 +-
 net/rds/iw.h             |  6 +++---
 net/rds/iw_recv.c        |  2 +-
 net/rds/iw_sysctl.c      |  2 +-
 net/rds/rdma_transport.c |  4 ++--
 net/rds/rds.h            | 10 +++++-----
 net/rds/stats.c          |  2 +-
 net/rds/sysctl.c         |  2 +-
 net/rds/tcp.c            |  2 +-
 net/rds/tcp.h            |  6 +++---
 net/rds/tcp_listen.c     |  2 +-
 net/rds/tcp_recv.c       |  2 +-
 net/rds/threads.c        |  2 +-
 20 files changed, 35 insertions(+), 35 deletions(-)

(limited to 'net')

diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index f16d2a92cb89..57ef0ec4f03d 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -521,7 +521,7 @@ out:
 	spin_unlock_irqrestore(&rds_sock_lock, flags);
 }
 
-static void __exit rds_exit(void)
+static void rds_exit(void)
 {
 	sock_unregister(rds_family_ops.family);
 	proto_unregister(&rds_proto);
@@ -536,7 +536,7 @@ static void __exit rds_exit(void)
 }
 module_exit(rds_exit);
 
-static int __init rds_init(void)
+static int rds_init(void)
 {
 	int ret;
 
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 7670b45680e1..0de40d9563ca 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -488,7 +488,7 @@ static void rds_conn_info(struct socket *sock, unsigned int len,
 				sizeof(struct rds_info_connection));
 }
 
-int __init rds_conn_init(void)
+int rds_conn_init(void)
 {
 	rds_conn_slab = kmem_cache_create("rds_connection",
 					  sizeof(struct rds_connection),
diff --git a/net/rds/ib.c b/net/rds/ib.c
index d2007b931616..3eb5617649c6 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -381,7 +381,7 @@ struct rds_transport rds_ib_transport = {
 	.t_type			= RDS_TRANS_IB
 };
 
-int __init rds_ib_init(void)
+int rds_ib_init(void)
 {
 	int ret;
 
diff --git a/net/rds/ib.h b/net/rds/ib.h
index fd4ea69d2443..acda2dbc6576 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -282,7 +282,7 @@ void rds_ib_conn_free(void *arg);
 int rds_ib_conn_connect(struct rds_connection *conn);
 void rds_ib_conn_shutdown(struct rds_connection *conn);
 void rds_ib_state_change(struct sock *sk);
-int __init rds_ib_listen_init(void);
+int rds_ib_listen_init(void);
 void rds_ib_listen_stop(void);
 void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
 int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
@@ -308,11 +308,11 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
 void rds_ib_sync_mr(void *trans_private, int dir);
 void rds_ib_free_mr(void *trans_private, int invalidate);
 void rds_ib_flush_mrs(void);
-int __init rds_ib_fmr_init(void);
-void __exit rds_ib_fmr_exit(void);
+int rds_ib_fmr_init(void);
+void rds_ib_fmr_exit(void);
 
 /* ib_recv.c */
-int __init rds_ib_recv_init(void);
+int rds_ib_recv_init(void);
 void rds_ib_recv_exit(void);
 int rds_ib_recv(struct rds_connection *conn);
 int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic);
@@ -363,7 +363,7 @@ unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
 				    unsigned int avail);
 
 /* ib_sysctl.c */
-int __init rds_ib_sysctl_init(void);
+int rds_ib_sysctl_init(void);
 void rds_ib_sysctl_exit(void);
 extern unsigned long rds_ib_sysctl_max_send_wr;
 extern unsigned long rds_ib_sysctl_max_recv_wr;
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 0eb597670c5b..3efdddc39d49 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -693,7 +693,7 @@ out_nolock:
 
 struct workqueue_struct *rds_ib_fmr_wq;
 
-int __init rds_ib_fmr_init(void)
+int rds_ib_fmr_init(void)
 {
 	rds_ib_fmr_wq = create_workqueue("rds_fmr_flushd");
 	if (!rds_ib_fmr_wq)
@@ -706,7 +706,7 @@ int __init rds_ib_fmr_init(void)
  * had their pools freed.  As each pool is freed its work struct is waited on,
  * so the pool flushing work queue should be idle by the time we get here.
  */
-void __exit rds_ib_fmr_exit(void)
+void rds_ib_fmr_exit(void)
 {
 	destroy_workqueue(rds_ib_fmr_wq);
 }
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 37dab2898ad0..f25c4837d2f6 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -1040,7 +1040,7 @@ int rds_ib_recv(struct rds_connection *conn)
 	return ret;
 }
 
-int __init rds_ib_recv_init(void)
+int rds_ib_recv_init(void)
 {
 	struct sysinfo si;
 	int ret = -ENOMEM;
diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c
index b556c5c2a902..fc3da37220fd 100644
--- a/net/rds/ib_sysctl.c
+++ b/net/rds/ib_sysctl.c
@@ -119,7 +119,7 @@ void rds_ib_sysctl_exit(void)
 		unregister_sysctl_table(rds_ib_sysctl_hdr);
 }
 
-int __init rds_ib_sysctl_init(void)
+int rds_ib_sysctl_init(void)
 {
 	rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table);
 	if (!rds_ib_sysctl_hdr)
diff --git a/net/rds/iw.c b/net/rds/iw.c
index 467790da1316..56808cac0fc7 100644
--- a/net/rds/iw.c
+++ b/net/rds/iw.c
@@ -287,7 +287,7 @@ struct rds_transport rds_iw_transport = {
 	.t_prefer_loopback	= 1,
 };
 
-int __init rds_iw_init(void)
+int rds_iw_init(void)
 {
 	int ret;
 
diff --git a/net/rds/iw.h b/net/rds/iw.h
index f112105faced..543e665fafe3 100644
--- a/net/rds/iw.h
+++ b/net/rds/iw.h
@@ -284,7 +284,7 @@ void rds_iw_conn_free(void *arg);
 int rds_iw_conn_connect(struct rds_connection *conn);
 void rds_iw_conn_shutdown(struct rds_connection *conn);
 void rds_iw_state_change(struct sock *sk);
-int __init rds_iw_listen_init(void);
+int rds_iw_listen_init(void);
 void rds_iw_listen_stop(void);
 void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...);
 int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
@@ -321,7 +321,7 @@ void rds_iw_flush_mrs(void);
 void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
 
 /* ib_recv.c */
-int __init rds_iw_recv_init(void);
+int rds_iw_recv_init(void);
 void rds_iw_recv_exit(void);
 int rds_iw_recv(struct rds_connection *conn);
 int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
@@ -370,7 +370,7 @@ unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
 				    unsigned int avail);
 
 /* ib_sysctl.c */
-int __init rds_iw_sysctl_init(void);
+int rds_iw_sysctl_init(void);
 void rds_iw_sysctl_exit(void);
 extern unsigned long rds_iw_sysctl_max_send_wr;
 extern unsigned long rds_iw_sysctl_max_recv_wr;
diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c
index 7e929f701a0c..5e57347f49ff 100644
--- a/net/rds/iw_recv.c
+++ b/net/rds/iw_recv.c
@@ -887,7 +887,7 @@ int rds_iw_recv(struct rds_connection *conn)
 	return ret;
 }
 
-int __init rds_iw_recv_init(void)
+int rds_iw_recv_init(void)
 {
 	struct sysinfo si;
 	int ret = -ENOMEM;
diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c
index 3cb0587d6f50..23e3a9a26aaf 100644
--- a/net/rds/iw_sysctl.c
+++ b/net/rds/iw_sysctl.c
@@ -122,7 +122,7 @@ void rds_iw_sysctl_exit(void)
 		unregister_sysctl_table(rds_iw_sysctl_hdr);
 }
 
-int __init rds_iw_sysctl_init(void)
+int rds_iw_sysctl_init(void)
 {
 	rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table);
 	if (!rds_iw_sysctl_hdr)
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index e599ba2f950d..550d34837fe7 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -122,7 +122,7 @@ out:
 	return ret;
 }
 
-static int __init rds_rdma_listen_init(void)
+static int rds_rdma_listen_init(void)
 {
 	struct sockaddr_in sin;
 	struct rdma_cm_id *cm_id;
@@ -177,7 +177,7 @@ static void rds_rdma_listen_stop(void)
 	}
 }
 
-int __init rds_rdma_init(void)
+int rds_rdma_init(void)
 {
 	int ret;
 
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 4510344ce8ca..8a8a4822d02a 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -606,7 +606,7 @@ void rds_cong_exit(void);
 struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
 
 /* conn.c */
-int __init rds_conn_init(void);
+int rds_conn_init(void);
 void rds_conn_exit(void);
 struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
 				       struct rds_transport *trans, gfp_t gfp);
@@ -769,14 +769,14 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
 	put_cpu();					\
 } while (0)
 #define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count)
-int __init rds_stats_init(void);
+int rds_stats_init(void);
 void rds_stats_exit(void);
 void rds_stats_info_copy(struct rds_info_iterator *iter,
 			 uint64_t *values, const char *const *names,
 			 size_t nr);
 
 /* sysctl.c */
-int __init rds_sysctl_init(void);
+int rds_sysctl_init(void);
 void rds_sysctl_exit(void);
 extern unsigned long rds_sysctl_sndbuf_min;
 extern unsigned long rds_sysctl_sndbuf_default;
@@ -790,7 +790,7 @@ extern unsigned long rds_sysctl_trace_flags;
 extern unsigned int  rds_sysctl_trace_level;
 
 /* threads.c */
-int __init rds_threads_init(void);
+int rds_threads_init(void);
 void rds_threads_exit(void);
 extern struct workqueue_struct *rds_wq;
 void rds_queue_reconnect(struct rds_connection *conn);
@@ -806,7 +806,7 @@ void rds_trans_unregister(struct rds_transport *trans);
 struct rds_transport *rds_trans_get_preferred(__be32 addr);
 unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
 				       unsigned int avail);
-int __init rds_trans_init(void);
+int rds_trans_init(void);
 void rds_trans_exit(void);
 
 #endif
diff --git a/net/rds/stats.c b/net/rds/stats.c
index 344929a663e5..10c759ccac0c 100644
--- a/net/rds/stats.c
+++ b/net/rds/stats.c
@@ -143,7 +143,7 @@ void rds_stats_exit(void)
 	rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info);
 }
 
-int __init rds_stats_init(void)
+int rds_stats_init(void)
 {
 	rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info);
 	return 0;
diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c
index 5a6dd81de188..25ad0c77a26c 100644
--- a/net/rds/sysctl.c
+++ b/net/rds/sysctl.c
@@ -105,7 +105,7 @@ void rds_sysctl_exit(void)
 		unregister_sysctl_table(rds_sysctl_reg_table);
 }
 
-int __init rds_sysctl_init(void)
+int rds_sysctl_init(void)
 {
 	rds_sysctl_reconnect_min = msecs_to_jiffies(1);
 	rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min;
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 3262992f5f2b..eeb08e6ab96b 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -274,7 +274,7 @@ struct rds_transport rds_tcp_transport = {
 	.t_prefer_loopback	= 1,
 };
 
-int __init rds_tcp_init(void)
+int rds_tcp_init(void)
 {
 	int ret;
 
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 16b166379ea9..f5e6f7bebb50 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -43,7 +43,7 @@ struct rds_tcp_statistics {
 };
 
 /* tcp.c */
-int __init rds_tcp_init(void);
+int rds_tcp_init(void);
 void rds_tcp_exit(void);
 void rds_tcp_tune(struct socket *sock);
 void rds_tcp_nonagle(struct socket *sock);
@@ -61,12 +61,12 @@ void rds_tcp_conn_shutdown(struct rds_connection *conn);
 void rds_tcp_state_change(struct sock *sk);
 
 /* tcp_listen.c */
-int __init rds_tcp_listen_init(void);
+int rds_tcp_listen_init(void);
 void rds_tcp_listen_stop(void);
 void rds_tcp_listen_data_ready(struct sock *sk, int bytes);
 
 /* tcp_recv.c */
-int __init rds_tcp_recv_init(void);
+int rds_tcp_recv_init(void);
 void rds_tcp_recv_exit(void);
 void rds_tcp_data_ready(struct sock *sk, int bytes);
 int rds_tcp_recv(struct rds_connection *conn);
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 50b649eb6933..ae27869dfc21 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -135,7 +135,7 @@ out:
 	ready(sk, bytes);
 }
 
-int __init rds_tcp_listen_init(void)
+int rds_tcp_listen_init(void)
 {
 	struct sockaddr_in sin;
 	struct socket *sock = NULL;
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index 49c966010788..7017f3af80b6 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -342,7 +342,7 @@ out:
 	ready(sk, bytes);
 }
 
-int __init rds_tcp_recv_init(void)
+int rds_tcp_recv_init(void)
 {
 	rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming",
 					sizeof(struct rds_tcp_incoming),
diff --git a/net/rds/threads.c b/net/rds/threads.c
index c08c220efac5..0fd90f8c5f59 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -212,7 +212,7 @@ void rds_threads_exit(void)
 	destroy_workqueue(rds_wq);
 }
 
-int __init rds_threads_init(void)
+int rds_threads_init(void)
 {
 	rds_wq = create_singlethread_workqueue("krdsd");
 	if (!rds_wq)
-- 
cgit v1.2.3-59-g8ed1b


From f046011cd73c372267befd10242988eb744649fe Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Wed, 14 Jul 2010 13:55:35 -0700
Subject: RDS/IB: track signaled sends

We're seeing bugs today where IB connection shutdown clears the send
ring while the tasklet is processing completed sends.  Implementation
details cause this to dereference a null pointer.  Shutdown needs to
wait for send completion to stop before tearing down the connection.  We
can't simply wait for the ring to empty because it may contain
unsignaled sends that will never be processed.

This patch tracks the number of signaled sends that we've posted and
waits for them to complete.  It also makes sure that the tasklet has
finished executing.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---
 net/rds/ib.h      |  1 +
 net/rds/ib_cm.c   | 14 +++++++++++---
 net/rds/ib_send.c | 47 ++++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 54 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.h b/net/rds/ib.h
index acda2dbc6576..a13ced504145 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -108,6 +108,7 @@ struct rds_ib_connection {
 	struct rds_header	*i_send_hdrs;
 	u64			i_send_hdrs_dma;
 	struct rds_ib_send_work *i_sends;
+	atomic_t		i_signaled_sends;
 
 	/* rx */
 	struct tasklet_struct	i_recv_tasklet;
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 10f6a8815cd0..123c7d33b54e 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -615,11 +615,18 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
 		}
 
 		/*
-		 * Don't wait for the send ring to be empty -- there may be completed
-		 * non-signaled entries sitting on there. We unmap these below.
+		 * We want to wait for tx and rx completion to finish
+		 * before we tear down the connection, but we have to be
+		 * careful not to get stuck waiting on a send ring that
+		 * only has unsignaled sends in it.  We've shutdown new
+		 * sends before getting here so by waiting for signaled
+		 * sends to complete we're ensured that there will be no
+		 * more tx processing.
 		 */
 		wait_event(rds_ib_ring_empty_wait,
-			rds_ib_ring_empty(&ic->i_recv_ring));
+			   rds_ib_ring_empty(&ic->i_recv_ring) &&
+			   (atomic_read(&ic->i_signaled_sends) == 0));
+		tasklet_kill(&ic->i_recv_tasklet);
 
 		if (ic->i_send_hdrs)
 			ib_dma_free_coherent(dev,
@@ -729,6 +736,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 #ifndef KERNEL_HAS_ATOMIC64
 	spin_lock_init(&ic->i_ack_lock);
 #endif
+	atomic_set(&ic->i_signaled_sends, 0);
 
 	/*
 	 * rds_ib_conn_shutdown() waits for these to be emptied so they
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index e88cb4af009b..15f75692574c 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -219,6 +219,18 @@ void rds_ib_send_clear_ring(struct rds_ib_connection *ic)
 	}
 }
 
+/*
+ * The only fast path caller always has a non-zero nr, so we don't
+ * bother testing nr before performing the atomic sub.
+ */
+static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr)
+{
+	if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) &&
+	    waitqueue_active(&rds_ib_ring_empty_wait))
+		wake_up(&rds_ib_ring_empty_wait);
+	BUG_ON(atomic_read(&ic->i_signaled_sends) < 0);
+}
+
 /*
  * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
  * operations performed in the send path.  As the sender allocs and potentially
@@ -236,6 +248,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
 	u32 oldest;
 	u32 i = 0;
 	int ret;
+	int nr_sig = 0;
 
 	rdsdebug("cq %p conn %p\n", cq, conn);
 	rds_ib_stats_inc(s_ib_tx_cq_call);
@@ -262,6 +275,8 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
 
 		for (i = 0; i < completed; i++) {
 			send = &ic->i_sends[oldest];
+			if (send->s_wr.send_flags & IB_SEND_SIGNALED)
+				nr_sig++;
 
 			rm = rds_ib_send_unmap_op(ic, send, wc.status);
 
@@ -282,6 +297,8 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
 		}
 
 		rds_ib_ring_free(&ic->i_send_ring, completed);
+		rds_ib_sub_signaled(ic, nr_sig);
+		nr_sig = 0;
 
 		if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
 		    test_bit(0, &conn->c_map_queued))
@@ -440,9 +457,9 @@ void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted)
 		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
 }
 
-static inline void rds_ib_set_wr_signal_state(struct rds_ib_connection *ic,
-					      struct rds_ib_send_work *send,
-					      bool notify)
+static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic,
+					     struct rds_ib_send_work *send,
+					     bool notify)
 {
 	/*
 	 * We want to delay signaling completions just enough to get
@@ -452,7 +469,9 @@ static inline void rds_ib_set_wr_signal_state(struct rds_ib_connection *ic,
 	if (ic->i_unsignaled_wrs-- == 0 || notify) {
 		ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
 		send->s_wr.send_flags |= IB_SEND_SIGNALED;
+		return 1;
 	}
+	return 0;
 }
 
 /*
@@ -488,6 +507,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 	int bytes_sent = 0;
 	int ret;
 	int flow_controlled = 0;
+	int nr_sig = 0;
 
 	BUG_ON(off % RDS_FRAG_SIZE);
 	BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
@@ -645,6 +665,9 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 		if (ic->i_flowctl && flow_controlled && i == (work_alloc-1))
 			send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
 
+		if (send->s_wr.send_flags & IB_SEND_SIGNALED)
+			nr_sig++;
+
 		rdsdebug("send %p wr %p num_sge %u next %p\n", send,
 			 &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
 
@@ -689,6 +712,9 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 	if (ic->i_flowctl && i < credit_alloc)
 		rds_ib_send_add_credits(conn, credit_alloc - i);
 
+	if (nr_sig)
+		atomic_add(nr_sig, &ic->i_signaled_sends);
+
 	/* XXX need to worry about failed_wr and partial sends. */
 	failed_wr = &first->s_wr;
 	ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
@@ -699,6 +725,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 		printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 "
 		       "returned %d\n", &conn->c_faddr, ret);
 		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+		rds_ib_sub_signaled(ic, nr_sig);
 		if (prev->s_op) {
 			ic->i_data_op = prev->s_op;
 			prev->s_op = NULL;
@@ -728,6 +755,7 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
 	u32 pos;
 	u32 work_alloc;
 	int ret;
+	int nr_sig = 0;
 
 	rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
 
@@ -752,7 +780,7 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
 		send->s_wr.wr.atomic.compare_add = op->op_swap_add;
 		send->s_wr.wr.atomic.swap = 0;
 	}
-	rds_ib_set_wr_signal_state(ic, send, op->op_notify);
+	nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify);
 	send->s_wr.num_sge = 1;
 	send->s_wr.next = NULL;
 	send->s_wr.wr.atomic.remote_addr = op->op_remote_addr;
@@ -778,6 +806,9 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
 	rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr,
 		 send->s_sge[0].addr, send->s_sge[0].length);
 
+	if (nr_sig)
+		atomic_add(nr_sig, &ic->i_signaled_sends);
+
 	failed_wr = &send->s_wr;
 	ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr);
 	rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic,
@@ -787,6 +818,7 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
 		printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 "
 		       "returned %d\n", &conn->c_faddr, ret);
 		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+		rds_ib_sub_signaled(ic, nr_sig);
 		goto out;
 	}
 
@@ -817,6 +849,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
 	int sent;
 	int ret;
 	int num_sge;
+	int nr_sig = 0;
 
 	/* map the op the first time we see it */
 	if (!op->op_mapped) {
@@ -859,7 +892,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
 		send->s_queued = jiffies;
 		send->s_op = NULL;
 
-		rds_ib_set_wr_signal_state(ic, send, op->op_notify);
+		nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify);
 
 		send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
 		send->s_wr.wr.rdma.remote_addr = remote_addr;
@@ -910,6 +943,9 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
 		work_alloc = i;
 	}
 
+	if (nr_sig)
+		atomic_add(nr_sig, &ic->i_signaled_sends);
+
 	failed_wr = &first->s_wr;
 	ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
 	rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
@@ -919,6 +955,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
 		printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 "
 		       "returned %d\n", &conn->c_faddr, ret);
 		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+		rds_ib_sub_signaled(ic, nr_sig);
 		goto out;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From b4e1da3c9a0ac9b01f45a8578b7347e3a31f9fb8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 19 Jul 2010 17:02:41 -0700
Subject: RDS: properly use sg_init_table

This is only needed to keep debugging code from bugging.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 net/rds/ib_recv.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index f25c4837d2f6..a2f5f6faf35c 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -282,6 +282,7 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic
 		if (!frag)
 			return NULL;
 
+		sg_init_table(&frag->f_sg, 1);
 		ret = rds_page_remainder_alloc(&frag->f_sg,
 					       RDS_FRAG_SIZE, page_mask);
 		if (ret) {
-- 
cgit v1.2.3-59-g8ed1b


From 8576f374ac9537674e3cccb0a9d43fa2b7ebbf5b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 19 Jul 2010 17:06:46 -0700
Subject: RDS: flush fmrs before allocating new ones

Flushing FMRs is somewhat expensive, and is currently kicked off when
the interrupt handler notices that we are getting low.  The result of
this is that FMR flushing only happens from the interrupt cpus.

This spreads the load more effectively by triggering flushes just before
we allocate a new FMR.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 net/rds/ib_rdma.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 3efdddc39d49..0017964f2fcf 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -38,6 +38,8 @@
 #include "ib.h"
 #include "xlist.h"
 
+struct workqueue_struct *rds_ib_fmr_wq;
+
 static DEFINE_PER_CPU(unsigned long, clean_list_grace);
 #define CLEAN_LIST_BUSY_BIT 0
 
@@ -304,6 +306,9 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
 	struct rds_ib_mr *ibmr = NULL;
 	int err = 0, iter = 0;
 
+	if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+		queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
+
 	while (1) {
 		ibmr = rds_ib_reuse_fmr(pool);
 		if (ibmr)
@@ -691,8 +696,6 @@ out_nolock:
 	return ret;
 }
 
-struct workqueue_struct *rds_ib_fmr_wq;
-
 int rds_ib_fmr_init(void)
 {
 	rds_ib_fmr_wq = create_workqueue("rds_fmr_flushd");
-- 
cgit v1.2.3-59-g8ed1b


From 1bde04a63d532c2540d6fdee0a661530a62b1686 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Wed, 14 Jul 2010 14:01:21 -0700
Subject: RDS/IB: print IB event strings as well as their number

It's nice to not have to go digging in the code to see which event
occurred.  It's easy to throw together a quick array that maps the ib
event enums to their strings.  I didn't see anything in the stack that
does this translation for us, but I also didn't look very hard.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---
 net/rds/ib_cm.c | 43 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 39 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 123c7d33b54e..0e2fea893a76 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -38,6 +38,38 @@
 #include "rds.h"
 #include "ib.h"
 
+static char *rds_ib_event_type_strings[] = {
+#define RDS_IB_EVENT_STRING(foo) [IB_EVENT_##foo] = __stringify(foo)
+	RDS_IB_EVENT_STRING(CQ_ERR),
+	RDS_IB_EVENT_STRING(QP_FATAL),
+	RDS_IB_EVENT_STRING(QP_REQ_ERR),
+	RDS_IB_EVENT_STRING(QP_ACCESS_ERR),
+	RDS_IB_EVENT_STRING(COMM_EST),
+	RDS_IB_EVENT_STRING(SQ_DRAINED),
+	RDS_IB_EVENT_STRING(PATH_MIG),
+	RDS_IB_EVENT_STRING(PATH_MIG_ERR),
+	RDS_IB_EVENT_STRING(DEVICE_FATAL),
+	RDS_IB_EVENT_STRING(PORT_ACTIVE),
+	RDS_IB_EVENT_STRING(PORT_ERR),
+	RDS_IB_EVENT_STRING(LID_CHANGE),
+	RDS_IB_EVENT_STRING(PKEY_CHANGE),
+	RDS_IB_EVENT_STRING(SM_CHANGE),
+	RDS_IB_EVENT_STRING(SRQ_ERR),
+	RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED),
+	RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED),
+	RDS_IB_EVENT_STRING(CLIENT_REREGISTER),
+#undef RDS_IB_EVENT_STRING
+};
+
+static char *rds_ib_event_str(enum ib_event_type type)
+{
+	if (type < ARRAY_SIZE(rds_ib_event_type_strings) &&
+	    rds_ib_event_type_strings[type])
+		return rds_ib_event_type_strings[type];
+	else
+		return "unknown";
+};
+
 /*
  * Set the selected protocol version
  */
@@ -202,7 +234,8 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
 
 static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
 {
-	rdsdebug("event %u data %p\n", event->event, data);
+	rdsdebug("event %u (%s) data %p\n",
+		 event->event, rds_ib_event_str(event->event), data);
 }
 
 static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
@@ -210,16 +243,18 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
 	struct rds_connection *conn = data;
 	struct rds_ib_connection *ic = conn->c_transport_data;
 
-	rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
+	rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event,
+		 rds_ib_event_str(event->event));
 
 	switch (event->event) {
 	case IB_EVENT_COMM_EST:
 		rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
 		break;
 	default:
-		rdsdebug("Fatal QP Event %u "
+		rdsdebug("Fatal QP Event %u (%s) "
 			"- connection %pI4->%pI4, reconnecting\n",
-			event->event, &conn->c_laddr, &conn->c_faddr);
+			event->event, rds_ib_event_str(event->event),
+			&conn->c_laddr, &conn->c_faddr);
 		rds_conn_drop(conn);
 		break;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From ea819867b788728aca60717e4fdacb3df771f670 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Thu, 15 Jul 2010 12:34:33 -0700
Subject: RDS/IB: protect the list of IB devices

The RDS IB device list wasn't protected by any locking.  Traversal in
both the get_mr and FMR flushing paths could race with additon and
removal.

List manipulation is done with RCU primatives and is protected by the
write side of a rwsem.  The list traversal in the get_mr fast path is
protected by a rcu read critical section.  The FMR list traversal is
more problematic because it can block while traversing the list.  We
protect this with the read side of the rwsem.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---
 net/rds/ib.c      | 27 ++++++++++++++++++++-------
 net/rds/ib.h      |  1 +
 net/rds/ib_rdma.c |  8 +++++---
 3 files changed, 26 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.c b/net/rds/ib.c
index 3eb5617649c6..b12a3951167d 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -53,6 +53,12 @@ MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
 module_param(rds_ib_retry_count, int, 0444);
 MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
 
+/*
+ * we have a clumsy combination of RCU and a rwsem protecting this list
+ * because it is used both in the get_mr fast path and while blocking in
+ * the FMR flushing path.
+ */
+DECLARE_RWSEM(rds_ib_devices_lock);
 struct list_head rds_ib_devices;
 
 /* NOTE: if also grabbing ibdev lock, grab this first */
@@ -171,7 +177,10 @@ void rds_ib_add_one(struct ib_device *device)
 
 	INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
 	INIT_LIST_HEAD(&rds_ibdev->conn_list);
-	list_add_tail(&rds_ibdev->list, &rds_ib_devices);
+
+	down_write(&rds_ib_devices_lock);
+	list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices);
+	up_write(&rds_ib_devices_lock);
 	atomic_inc(&rds_ibdev->refcount);
 
 	ib_set_client_data(device, &rds_ib_client, rds_ibdev);
@@ -230,16 +239,20 @@ void rds_ib_remove_one(struct ib_device *device)
 
 	rds_ib_dev_shutdown(rds_ibdev);
 
+	/* stop connection attempts from getting a reference to this device. */
+	ib_set_client_data(device, &rds_ib_client, NULL);
+
+	down_write(&rds_ib_devices_lock);
+	list_del_rcu(&rds_ibdev->list);
+	up_write(&rds_ib_devices_lock);
+
 	/*
-	 * prevent future connection attempts from getting a reference to this
-	 * device and wait for currently racing connection attempts to finish
-	 * getting their reference
+	 * This synchronize rcu is waiting for readers of both the ib
+	 * client data and the devices list to finish before we drop
+	 * both of those references.
 	 */
-	ib_set_client_data(device, &rds_ib_client, NULL);
 	synchronize_rcu();
 	rds_ib_dev_put(rds_ibdev);
-
-	list_del(&rds_ibdev->list);
 	rds_ib_dev_put(rds_ibdev);
 }
 
diff --git a/net/rds/ib.h b/net/rds/ib.h
index a13ced504145..2189fd47a738 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -23,6 +23,7 @@
 
 #define RDS_IB_RECYCLE_BATCH_COUNT	32
 
+extern struct rw_semaphore rds_ib_devices_lock;
 extern struct list_head rds_ib_devices;
 
 /*
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 0017964f2fcf..8f6e221c9f78 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -94,8 +94,8 @@ static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
 	struct rds_ib_device *rds_ibdev;
 	struct rds_ib_ipaddr *i_ipaddr;
 
-	list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
-		rcu_read_lock();
+	rcu_read_lock();
+	list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) {
 		list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
 			if (i_ipaddr->ipaddr == ipaddr) {
 				atomic_inc(&rds_ibdev->refcount);
@@ -103,8 +103,8 @@ static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
 				return rds_ibdev;
 			}
 		}
-		rcu_read_unlock();
 	}
+	rcu_read_unlock();
 
 	return NULL;
 }
@@ -761,12 +761,14 @@ void rds_ib_flush_mrs(void)
 {
 	struct rds_ib_device *rds_ibdev;
 
+	down_read(&rds_ib_devices_lock);
 	list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
 		struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
 
 		if (pool)
 			rds_ib_flush_mr_pool(pool, 0, NULL);
 	}
+	up_read(&rds_ib_devices_lock);
 }
 
 void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
-- 
cgit v1.2.3-59-g8ed1b


From fe8ff6b58f040dd52d2db45972db8e0301847f1c Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Fri, 23 Jul 2010 10:30:45 -0700
Subject: RDS: lock rds_conn_count decrement in rds_conn_destroy()

rds_conn_destroy() can race with all other modifications of the
rds_conn_count but it was modifying the count without locking.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---
 net/rds/connection.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/rds/connection.c b/net/rds/connection.c
index 0de40d9563ca..75a1a37d64d3 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -313,6 +313,7 @@ void rds_conn_shutdown(struct rds_connection *conn)
 void rds_conn_destroy(struct rds_connection *conn)
 {
 	struct rds_message *rm, *rtmp;
+	unsigned long flags;
 
 	rdsdebug("freeing conn %p for %pI4 -> "
 		 "%pI4\n", conn, &conn->c_laddr,
@@ -350,7 +351,9 @@ void rds_conn_destroy(struct rds_connection *conn)
 	BUG_ON(!list_empty(&conn->c_retrans));
 	kmem_cache_free(rds_conn_slab, conn);
 
+	spin_lock_irqsave(&rds_conn_lock, flags);
 	rds_conn_count--;
+	spin_unlock_irqrestore(&rds_conn_lock, flags);
 }
 EXPORT_SYMBOL_GPL(rds_conn_destroy);
 
-- 
cgit v1.2.3-59-g8ed1b


From 77510481c0c3980c8979ed236d63e59221fb8ce5 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Wed, 21 Jul 2010 15:13:25 -0700
Subject: RDS: remove old rs_transport comment

rs_transport is now also used by the rdma paths once the socket is
bound.  We don't need this stale comment to tell us what cscope can.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---
 net/rds/rds.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'net')

diff --git a/net/rds/rds.h b/net/rds/rds.h
index 8a8a4822d02a..2ff7fc9f0539 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -457,12 +457,6 @@ struct rds_sock {
 	__be32			rs_conn_addr;
 	__be16			rs_bound_port;
 	__be16			rs_conn_port;
-
-	/*
-	 * This is only used to communicate the transport between bind and
-	 * initiating connections.  All other trans use is referenced through
-	 * the connection.
-	 */
 	struct rds_transport    *rs_transport;
 
 	/*
-- 
cgit v1.2.3-59-g8ed1b


From 5adb5bc65f93e52341c3fc9d03d4030dd375e256 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Fri, 23 Jul 2010 10:32:31 -0700
Subject: RDS: have sockets get transport module references

Right now there's nothing to stop the various paths that use
rs->rs_transport from racing with rmmod and executing freed transport
code.  The simple fix is to have binding to a transport also hold a
reference to the transport's module, removing this class of races.

We already had an unused t_owner field which was set for the modular
transports and which wasn't set for the built-in loop transport.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---
 net/rds/af_rds.c     |  2 ++
 net/rds/connection.c |  5 ++++-
 net/rds/rds.h        |  1 +
 net/rds/transport.c  | 19 ++++++++++++++-----
 4 files changed, 21 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 57ef0ec4f03d..8e3886d353b5 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -90,6 +90,8 @@ static int rds_release(struct socket *sock)
 	rds_sock_count--;
 	spin_unlock_irqrestore(&rds_sock_lock, flags);
 
+	rds_trans_put(rs->rs_transport);
+
 	sock->sk = NULL;
 	sock_put(sk);
 out:
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 75a1a37d64d3..968b7a798398 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -117,6 +117,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 {
 	struct rds_connection *conn, *parent = NULL;
 	struct hlist_head *head = rds_conn_bucket(laddr, faddr);
+	struct rds_transport *loop_trans;
 	unsigned long flags;
 	int ret;
 
@@ -163,7 +164,9 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 	 * can bind to the destination address then we'd rather the messages
 	 * flow through loopback rather than either transport.
 	 */
-	if (rds_trans_get_preferred(faddr)) {
+	loop_trans = rds_trans_get_preferred(faddr);
+	if (loop_trans) {
+		rds_trans_put(loop_trans);
 		conn->c_loopback = 1;
 		if (is_outgoing && trans->t_prefer_loopback) {
 			/* "outgoing" connection - and the transport
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 2ff7fc9f0539..aab5e949fa93 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -798,6 +798,7 @@ void rds_connect_complete(struct rds_connection *conn);
 int rds_trans_register(struct rds_transport *trans);
 void rds_trans_unregister(struct rds_transport *trans);
 struct rds_transport *rds_trans_get_preferred(__be32 addr);
+void rds_trans_put(struct rds_transport *trans);
 unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
 				       unsigned int avail);
 int rds_trans_init(void);
diff --git a/net/rds/transport.c b/net/rds/transport.c
index 7e1067901353..7f2ac4fec367 100644
--- a/net/rds/transport.c
+++ b/net/rds/transport.c
@@ -71,19 +71,28 @@ void rds_trans_unregister(struct rds_transport *trans)
 }
 EXPORT_SYMBOL_GPL(rds_trans_unregister);
 
+void rds_trans_put(struct rds_transport *trans)
+{
+	if (trans && trans->t_owner)
+		module_put(trans->t_owner);
+}
+
 struct rds_transport *rds_trans_get_preferred(__be32 addr)
 {
 	struct rds_transport *ret = NULL;
-	int i;
+	struct rds_transport *trans;
+	unsigned int i;
 
 	if (IN_LOOPBACK(ntohl(addr)))
 		return &rds_loop_transport;
 
 	down_read(&rds_trans_sem);
-	for (i = 0; i < RDS_TRANS_COUNT; i++)
-	{
-		if (transports[i] && (transports[i]->laddr_check(addr) == 0)) {
-			ret = transports[i];
+	for (i = 0; i < RDS_TRANS_COUNT; i++) {
+		trans = transports[i];
+
+		if (trans && (trans->laddr_check(addr) == 0) &&
+		    (!trans->t_owner || try_module_get(trans->t_owner))) {
+			ret = trans;
 			break;
 		}
 	}
-- 
cgit v1.2.3-59-g8ed1b


From ffcec0e110c198717eb0f6ac000c1e5397db9451 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Fri, 23 Jul 2010 10:36:58 -0700
Subject: RDS: don't call rds_conn_shutdown() from rds_conn_destroy()

rds_conn_shutdown() can return before the connection is shut down when
it encounters an existing state that it doesn't understand.  This lets
rds_conn_destroy() then start tearing down the conn from under paths
that are still using it.

It's more reliable the shutdown work and wait for krdsd to complete the
shutdown callback.  This stopped some hangs I was seeing where krdsd was
trying to shut down a freed conn.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---
 net/rds/connection.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rds/connection.c b/net/rds/connection.c
index 968b7a798398..519b4fe962b6 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -312,6 +312,10 @@ void rds_conn_shutdown(struct rds_connection *conn)
 
 /*
  * Stop and free a connection.
+ *
+ * This can only be used in very limited circumstances.  It assumes that once
+ * the conn has been shutdown that no one else is referencing the connection.
+ * We can only ensure this in the rmmod path in the current code.
  */
 void rds_conn_destroy(struct rds_connection *conn)
 {
@@ -326,10 +330,11 @@ void rds_conn_destroy(struct rds_connection *conn)
 	spin_lock_irq(&rds_conn_lock);
 	hlist_del_init_rcu(&conn->c_hash_node);
 	spin_unlock_irq(&rds_conn_lock);
-
 	synchronize_rcu();
 
-	rds_conn_shutdown(conn);
+	/* shut the connection down */
+	rds_conn_drop(conn);
+	flush_work(&conn->c_down_w);
 
 	/* tear down queued messages */
 	list_for_each_entry_safe(rm, rtmp,
-- 
cgit v1.2.3-59-g8ed1b


From 4518071ac1bcb76c64a55a3fddb39fb3d39add41 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Fri, 23 Jul 2010 10:37:33 -0700
Subject: RDS: cancel connection work structs as we shut down

Nothing was canceling the send and receive work that might have been
queued as a conn was being destroyed.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---
 net/rds/connection.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/rds/connection.c b/net/rds/connection.c
index 519b4fe962b6..870992e08cae 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -336,6 +336,10 @@ void rds_conn_destroy(struct rds_connection *conn)
 	rds_conn_drop(conn);
 	flush_work(&conn->c_down_w);
 
+	/* make sure lingering queued work won't try to ref the conn */
+	cancel_delayed_work_sync(&conn->c_send_w);
+	cancel_delayed_work_sync(&conn->c_recv_w);
+
 	/* tear down queued messages */
 	list_for_each_entry_safe(rm, rtmp,
 				 &conn->c_send_queue,
-- 
cgit v1.2.3-59-g8ed1b


From 59f740a6aeb2cde2f79fe0df38262d4c1ef35cd8 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Tue, 3 Aug 2010 13:52:47 -0700
Subject: RDS/IB: print string constants in more places

This prints the constant identifier for work completion status and rdma
cm event types, like we already do for IB event types.

A core string array helper is added that each string type uses.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---
 net/rds/af_rds.c         |  9 +++++++++
 net/rds/ib.h             |  1 +
 net/rds/ib_cm.c          | 10 ++++------
 net/rds/ib_recv.c        | 12 +++++++-----
 net/rds/ib_send.c        | 47 +++++++++++++++++++++++++++++++++++++++++------
 net/rds/rdma_transport.c | 38 ++++++++++++++++++++++++++++++++++----
 net/rds/rds.h            |  1 +
 7 files changed, 97 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 8e3886d353b5..bb6ad81b671d 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -40,6 +40,15 @@
 
 #include "rds.h"
 
+char *rds_str_array(char **array, size_t elements, size_t index)
+{
+	if ((index < elements) && array[index])
+		return array[index];
+	else
+		return "unknown";
+}
+EXPORT_SYMBOL(rds_str_array);
+
 /* this is just used for stats gathering :/ */
 static DEFINE_SPINLOCK(rds_sock_lock);
 static unsigned long rds_sock_count;
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 2189fd47a738..7ad3d57e06a5 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -345,6 +345,7 @@ u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest);
 extern wait_queue_head_t rds_ib_ring_empty_wait;
 
 /* ib_send.c */
+char *rds_ib_wc_status_str(enum ib_wc_status status);
 void rds_ib_xmit_complete(struct rds_connection *conn);
 int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 		unsigned int hdr_off, unsigned int sg, unsigned int off);
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 0e2fea893a76..bc3dbc1ba61f 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -39,7 +39,8 @@
 #include "ib.h"
 
 static char *rds_ib_event_type_strings[] = {
-#define RDS_IB_EVENT_STRING(foo) [IB_EVENT_##foo] = __stringify(foo)
+#define RDS_IB_EVENT_STRING(foo) \
+		[IB_EVENT_##foo] = __stringify(IB_EVENT_##foo)
 	RDS_IB_EVENT_STRING(CQ_ERR),
 	RDS_IB_EVENT_STRING(QP_FATAL),
 	RDS_IB_EVENT_STRING(QP_REQ_ERR),
@@ -63,11 +64,8 @@ static char *rds_ib_event_type_strings[] = {
 
 static char *rds_ib_event_str(enum ib_event_type type)
 {
-	if (type < ARRAY_SIZE(rds_ib_event_type_strings) &&
-	    rds_ib_event_type_strings[type])
-		return rds_ib_event_type_strings[type];
-	else
-		return "unknown";
+	return rds_str_array(rds_ib_event_type_strings,
+			     ARRAY_SIZE(rds_ib_event_type_strings), type);
 };
 
 /*
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index a2f5f6faf35c..e29e0ca32f74 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -966,8 +966,9 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic,
 	struct rds_ib_recv_work *recv;
 
 	while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
-		rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
-			 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
+		rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
+			 (unsigned long long)wc.wr_id, wc.status,
+			 rds_ib_wc_status_str(wc.status), wc.byte_len,
 			 be32_to_cpu(wc.ex.imm_data));
 		rds_ib_stats_inc(s_ib_rx_cq_event);
 
@@ -985,10 +986,11 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic,
 		} else {
 			/* We expect errors as the qp is drained during shutdown */
 			if (rds_conn_up(conn) || rds_conn_connecting(conn))
-				rds_ib_conn_error(conn, "recv completion on "
-						  "%pI4 had status %u, disconnecting and "
+				rds_ib_conn_error(conn, "recv completion on %pI4 had "
+						  "status %u (%s), disconnecting and "
 						  "reconnecting\n", &conn->c_faddr,
-						  wc.status);
+						  wc.status,
+						  rds_ib_wc_status_str(wc.status));
 		}
 
 		/*
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 15f75692574c..808544aebb70 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -38,6 +38,40 @@
 #include "rds.h"
 #include "ib.h"
 
+static char *rds_ib_wc_status_strings[] = {
+#define RDS_IB_WC_STATUS_STR(foo) \
+		[IB_WC_##foo] = __stringify(IB_WC_##foo)
+	RDS_IB_WC_STATUS_STR(SUCCESS),
+	RDS_IB_WC_STATUS_STR(LOC_LEN_ERR),
+	RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR),
+	RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR),
+	RDS_IB_WC_STATUS_STR(LOC_PROT_ERR),
+	RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR),
+	RDS_IB_WC_STATUS_STR(MW_BIND_ERR),
+	RDS_IB_WC_STATUS_STR(BAD_RESP_ERR),
+	RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR),
+	RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR),
+	RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR),
+	RDS_IB_WC_STATUS_STR(REM_OP_ERR),
+	RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR),
+	RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR),
+	RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR),
+	RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR),
+	RDS_IB_WC_STATUS_STR(REM_ABORT_ERR),
+	RDS_IB_WC_STATUS_STR(INV_EECN_ERR),
+	RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR),
+	RDS_IB_WC_STATUS_STR(FATAL_ERR),
+	RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR),
+	RDS_IB_WC_STATUS_STR(GENERAL_ERR),
+#undef RDS_IB_WC_STATUS_STR
+};
+
+char *rds_ib_wc_status_str(enum ib_wc_status status)
+{
+	return rds_str_array(rds_ib_wc_status_strings,
+			     ARRAY_SIZE(rds_ib_wc_status_strings), status);
+}
+
 /*
  * Convert IB-specific error message to RDS error message and call core
  * completion handler.
@@ -257,8 +291,9 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
 		rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
 
 	while (ib_poll_cq(cq, 1, &wc) > 0) {
-		rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
-			 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
+		rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
+			 (unsigned long long)wc.wr_id, wc.status,
+			 rds_ib_wc_status_str(wc.status), wc.byte_len,
 			 be32_to_cpu(wc.ex.imm_data));
 		rds_ib_stats_inc(s_ib_tx_cq_event);
 
@@ -306,10 +341,10 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
 
 		/* We expect errors as the qp is drained during shutdown */
 		if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
-			rds_ib_conn_error(conn,
-				"send completion on %pI4 "
-				"had status %u, disconnecting and reconnecting\n",
-				&conn->c_faddr, wc.status);
+			rds_ib_conn_error(conn, "send completion on %pI4 had status "
+					  "%u (%s), disconnecting and reconnecting\n",
+					  &conn->c_faddr, wc.status,
+					  rds_ib_wc_status_str(wc.status));
 		}
 	}
 }
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index 550d34837fe7..e6ed10aee190 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -36,6 +36,34 @@
 
 static struct rdma_cm_id *rds_rdma_listen_id;
 
+static char *rds_cm_event_strings[] = {
+#define RDS_CM_EVENT_STRING(foo) \
+		[RDMA_CM_EVENT_##foo] = __stringify(RDMA_CM_EVENT_##foo)
+	RDS_CM_EVENT_STRING(ADDR_RESOLVED),
+	RDS_CM_EVENT_STRING(ADDR_ERROR),
+	RDS_CM_EVENT_STRING(ROUTE_RESOLVED),
+	RDS_CM_EVENT_STRING(ROUTE_ERROR),
+	RDS_CM_EVENT_STRING(CONNECT_REQUEST),
+	RDS_CM_EVENT_STRING(CONNECT_RESPONSE),
+	RDS_CM_EVENT_STRING(CONNECT_ERROR),
+	RDS_CM_EVENT_STRING(UNREACHABLE),
+	RDS_CM_EVENT_STRING(REJECTED),
+	RDS_CM_EVENT_STRING(ESTABLISHED),
+	RDS_CM_EVENT_STRING(DISCONNECTED),
+	RDS_CM_EVENT_STRING(DEVICE_REMOVAL),
+	RDS_CM_EVENT_STRING(MULTICAST_JOIN),
+	RDS_CM_EVENT_STRING(MULTICAST_ERROR),
+	RDS_CM_EVENT_STRING(ADDR_CHANGE),
+	RDS_CM_EVENT_STRING(TIMEWAIT_EXIT),
+#undef RDS_CM_EVENT_STRING
+};
+
+static char *rds_cm_event_str(enum rdma_cm_event_type type)
+{
+	return rds_str_array(rds_cm_event_strings,
+			     ARRAY_SIZE(rds_cm_event_strings), type);
+};
+
 int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
 			      struct rdma_cm_event *event)
 {
@@ -44,8 +72,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
 	struct rds_transport *trans;
 	int ret = 0;
 
-	rdsdebug("conn %p id %p handling event %u\n", conn, cm_id,
-		 event->event);
+	rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id,
+		 event->event, rds_cm_event_str(event->event));
 
 	if (cm_id->device->node_type == RDMA_NODE_RNIC)
 		trans = &rds_iw_transport;
@@ -109,7 +137,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
 
 	default:
 		/* things like device disconnect? */
-		printk(KERN_ERR "RDS: unknown event %u!\n", event->event);
+		printk(KERN_ERR "RDS: unknown event %u (%s)!\n",
+		       event->event, rds_cm_event_str(event->event));
 		break;
 	}
 
@@ -117,7 +146,8 @@ out:
 	if (conn)
 		mutex_unlock(&conn->c_cm_lock);
 
-	rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret);
+	rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event,
+		 rds_cm_event_str(event->event), ret);
 
 	return ret;
 }
diff --git a/net/rds/rds.h b/net/rds/rds.h
index aab5e949fa93..aadaddba88a7 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -566,6 +566,7 @@ struct rds_statistics {
 };
 
 /* af_rds.c */
+char *rds_str_array(char **array, size_t elements, size_t index);
 void rds_sock_addref(struct rds_sock *rs);
 void rds_sock_put(struct rds_sock *rs);
 void rds_wake_sk_sleep(struct rds_sock *rs);
-- 
cgit v1.2.3-59-g8ed1b


From 20c72bd5f5f902e5a8745d51573699605bf8d21c Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Wed, 25 Aug 2010 05:51:28 -0700
Subject: RDS: Implement masked atomic operations

Add two CMSGs for masked versions of cswp and fadd. args
struct modified to use a union for different atomic op type's
arguments. Change IB to do masked atomic ops. Atomic op type
in rds_message similarly unionized.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
 include/linux/rds.h | 12 ++++++++++++
 net/rds/ib_send.c   | 14 +++++++++-----
 net/rds/rdma.c      | 33 +++++++++++++++++++++++++++------
 net/rds/rds.h       | 14 ++++++++++++--
 net/rds/send.c      |  4 ++++
 5 files changed, 64 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/include/linux/rds.h b/include/linux/rds.h
index 109f1d343318..a2a5edb4a276 100644
--- a/include/linux/rds.h
+++ b/include/linux/rds.h
@@ -75,6 +75,8 @@
 #define RDS_CMSG_CONG_UPDATE		5
 #define RDS_CMSG_ATOMIC_FADD		6
 #define RDS_CMSG_ATOMIC_CSWP		7
+#define RDS_CMSG_MASKED_ATOMIC_FADD	8
+#define RDS_CMSG_MASKED_ATOMIC_CSWP	9
 
 #define RDS_INFO_FIRST			10000
 #define RDS_INFO_COUNTERS		10000
@@ -251,6 +253,16 @@ struct rds_atomic_args {
 		struct {
 			uint64_t	add;
 		} fadd;
+		struct {
+			uint64_t	compare;
+			uint64_t	swap;
+			uint64_t	compare_mask;
+			uint64_t	swap_mask;
+		} m_cswp;
+		struct {
+			uint64_t	add;
+			uint64_t	nocarry_mask;
+		} m_fadd;
 	};
 	uint64_t	flags;
 	uint64_t	user_token;
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 808544aebb70..71f373c421bc 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -807,13 +807,17 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
 	send->s_queued = jiffies;
 
 	if (op->op_type == RDS_ATOMIC_TYPE_CSWP) {
-		send->s_wr.opcode = IB_WR_ATOMIC_CMP_AND_SWP;
-		send->s_wr.wr.atomic.compare_add = op->op_compare;
-		send->s_wr.wr.atomic.swap = op->op_swap_add;
+		send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP;
+		send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare;
+		send->s_wr.wr.atomic.swap = op->op_m_cswp.swap;
+		send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask;
+		send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask;
 	} else { /* FADD */
-		send->s_wr.opcode = IB_WR_ATOMIC_FETCH_AND_ADD;
-		send->s_wr.wr.atomic.compare_add = op->op_swap_add;
+		send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD;
+		send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add;
 		send->s_wr.wr.atomic.swap = 0;
+		send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask;
+		send->s_wr.wr.atomic.swap_mask = 0;
 	}
 	nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify);
 	send->s_wr.num_sge = 1;
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 48781fe4431c..48064673fc76 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -738,13 +738,34 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
 
 	args = CMSG_DATA(cmsg);
 
-	if (cmsg->cmsg_type == RDS_CMSG_ATOMIC_CSWP) {
-		rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
-		rm->atomic.op_swap_add = args->cswp.swap;
-		rm->atomic.op_compare = args->cswp.compare;
-	} else {
+	/* Nonmasked & masked cmsg ops converted to masked hw ops */
+	switch (cmsg->cmsg_type) {
+	case RDS_CMSG_ATOMIC_FADD:
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
+		rm->atomic.op_m_fadd.add = args->fadd.add;
+		rm->atomic.op_m_fadd.nocarry_mask = 0;
+		break;
+	case RDS_CMSG_MASKED_ATOMIC_FADD:
 		rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
-		rm->atomic.op_swap_add = args->fadd.add;
+		rm->atomic.op_m_fadd.add = args->m_fadd.add;
+		rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask;
+		break;
+	case RDS_CMSG_ATOMIC_CSWP:
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
+		rm->atomic.op_m_cswp.compare = args->cswp.compare;
+		rm->atomic.op_m_cswp.swap = args->cswp.swap;
+		rm->atomic.op_m_cswp.compare_mask = ~0;
+		rm->atomic.op_m_cswp.swap_mask = ~0;
+		break;
+	case RDS_CMSG_MASKED_ATOMIC_CSWP:
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
+		rm->atomic.op_m_cswp.compare = args->m_cswp.compare;
+		rm->atomic.op_m_cswp.swap = args->m_cswp.swap;
+		rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask;
+		rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask;
+		break;
+	default:
+		BUG(); /* should never happen */
 	}
 
 	rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
diff --git a/net/rds/rds.h b/net/rds/rds.h
index aadaddba88a7..8103dcf8b976 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -316,8 +316,18 @@ struct rds_message {
 	struct {
 		struct rm_atomic_op {
 			int			op_type;
-			uint64_t		op_swap_add;
-			uint64_t		op_compare;
+			union {
+				struct {
+					uint64_t	compare;
+					uint64_t	swap;
+					uint64_t	compare_mask;
+					uint64_t	swap_mask;
+				} op_m_cswp;
+				struct {
+					uint64_t	add;
+					uint64_t	nocarry_mask;
+				} op_m_fadd;
+			};
 
 			u32			op_rkey;
 			u64			op_remote_addr;
diff --git a/net/rds/send.c b/net/rds/send.c
index 81471b25373b..9b951a0ab6b7 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -843,6 +843,8 @@ static int rds_rm_size(struct msghdr *msg, int data_len)
 
 		case RDS_CMSG_ATOMIC_CSWP:
 		case RDS_CMSG_ATOMIC_FADD:
+		case RDS_CMSG_MASKED_ATOMIC_CSWP:
+		case RDS_CMSG_MASKED_ATOMIC_FADD:
 			cmsg_groups |= 1;
 			size += sizeof(struct scatterlist);
 			break;
@@ -894,6 +896,8 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
 			break;
 		case RDS_CMSG_ATOMIC_CSWP:
 		case RDS_CMSG_ATOMIC_FADD:
+		case RDS_CMSG_MASKED_ATOMIC_CSWP:
+		case RDS_CMSG_MASKED_ATOMIC_FADD:
 			ret = rds_cmsg_atomic(rs, rm, cmsg);
 			break;
 
-- 
cgit v1.2.3-59-g8ed1b