aboutsummaryrefslogtreecommitdiffstats
path: root/net/rds/ib_cm.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/rds/ib_cm.c')
-rw-r--r--net/rds/ib_cm.c184
1 files changed, 126 insertions, 58 deletions
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index f68832798db2..ee369d201a65 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -38,6 +38,36 @@
#include "rds.h"
#include "ib.h"
+static char *rds_ib_event_type_strings[] = {
+#define RDS_IB_EVENT_STRING(foo) \
+ [IB_EVENT_##foo] = __stringify(IB_EVENT_##foo)
+ RDS_IB_EVENT_STRING(CQ_ERR),
+ RDS_IB_EVENT_STRING(QP_FATAL),
+ RDS_IB_EVENT_STRING(QP_REQ_ERR),
+ RDS_IB_EVENT_STRING(QP_ACCESS_ERR),
+ RDS_IB_EVENT_STRING(COMM_EST),
+ RDS_IB_EVENT_STRING(SQ_DRAINED),
+ RDS_IB_EVENT_STRING(PATH_MIG),
+ RDS_IB_EVENT_STRING(PATH_MIG_ERR),
+ RDS_IB_EVENT_STRING(DEVICE_FATAL),
+ RDS_IB_EVENT_STRING(PORT_ACTIVE),
+ RDS_IB_EVENT_STRING(PORT_ERR),
+ RDS_IB_EVENT_STRING(LID_CHANGE),
+ RDS_IB_EVENT_STRING(PKEY_CHANGE),
+ RDS_IB_EVENT_STRING(SM_CHANGE),
+ RDS_IB_EVENT_STRING(SRQ_ERR),
+ RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED),
+ RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED),
+ RDS_IB_EVENT_STRING(CLIENT_REREGISTER),
+#undef RDS_IB_EVENT_STRING
+};
+
+static char *rds_ib_event_str(enum ib_event_type type)
+{
+ return rds_str_array(rds_ib_event_type_strings,
+ ARRAY_SIZE(rds_ib_event_type_strings), type);
+};
+
/*
* Set the selected protocol version
*/
@@ -95,7 +125,6 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
{
const struct rds_ib_connect_private *dp = NULL;
struct rds_ib_connection *ic = conn->c_transport_data;
- struct rds_ib_device *rds_ibdev;
struct ib_qp_attr qp_attr;
int err;
@@ -111,11 +140,21 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
}
}
- printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
- &conn->c_faddr,
- RDS_PROTOCOL_MAJOR(conn->c_version),
- RDS_PROTOCOL_MINOR(conn->c_version),
- ic->i_flowctl ? ", flow control" : "");
+ if (conn->c_version < RDS_PROTOCOL(3,1)) {
+ printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed,"
+ " no longer supported\n",
+ &conn->c_faddr,
+ RDS_PROTOCOL_MAJOR(conn->c_version),
+ RDS_PROTOCOL_MINOR(conn->c_version));
+ rds_conn_destroy(conn);
+ return;
+ } else {
+ printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
+ &conn->c_faddr,
+ RDS_PROTOCOL_MAJOR(conn->c_version),
+ RDS_PROTOCOL_MINOR(conn->c_version),
+ ic->i_flowctl ? ", flow control" : "");
+ }
/*
* Init rings and fill recv. this needs to wait until protocol negotiation
@@ -125,7 +164,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
rds_ib_recv_init_ring(ic);
/* Post receive buffers - as a side effect, this will update
* the posted credit count. */
- rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
+ rds_ib_recv_refill(conn, 1);
/* Tune RNR behavior */
rds_ib_tune_rnr(ic, &qp_attr);
@@ -135,12 +174,11 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
if (err)
printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
- /* update ib_device with this local ipaddr & conn */
- rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
- err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
+ /* update ib_device with this local ipaddr */
+ err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr);
if (err)
- printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err);
- rds_ib_add_conn(rds_ibdev, conn);
+ printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
+ err);
/* If the peer gave us the last packet it saw, process this as if
* we had received a regular ACK. */
@@ -153,18 +191,23 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
struct rdma_conn_param *conn_param,
struct rds_ib_connect_private *dp,
- u32 protocol_version)
+ u32 protocol_version,
+ u32 max_responder_resources,
+ u32 max_initiator_depth)
{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
+
memset(conn_param, 0, sizeof(struct rdma_conn_param));
- /* XXX tune these? */
- conn_param->responder_resources = 1;
- conn_param->initiator_depth = 1;
+
+ conn_param->responder_resources =
+ min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources);
+ conn_param->initiator_depth =
+ min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth);
conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
conn_param->rnr_retry_count = 7;
if (dp) {
- struct rds_ib_connection *ic = conn->c_transport_data;
-
memset(dp, 0, sizeof(*dp));
dp->dp_saddr = conn->c_laddr;
dp->dp_daddr = conn->c_faddr;
@@ -189,7 +232,8 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
{
- rdsdebug("event %u data %p\n", event->event, data);
+ rdsdebug("event %u (%s) data %p\n",
+ event->event, rds_ib_event_str(event->event), data);
}
static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
@@ -197,16 +241,18 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
struct rds_connection *conn = data;
struct rds_ib_connection *ic = conn->c_transport_data;
- rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
+ rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event,
+ rds_ib_event_str(event->event));
switch (event->event) {
case IB_EVENT_COMM_EST:
rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
break;
default:
- rdsdebug("Fatal QP Event %u "
+ rdsdebug("Fatal QP Event %u (%s) "
"- connection %pI4->%pI4, reconnecting\n",
- event->event, &conn->c_laddr, &conn->c_faddr);
+ event->event, rds_ib_event_str(event->event),
+ &conn->c_laddr, &conn->c_faddr);
rds_conn_drop(conn);
break;
}
@@ -224,18 +270,16 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
struct rds_ib_device *rds_ibdev;
int ret;
- /* rds_ib_add_one creates a rds_ib_device object per IB device,
- * and allocates a protection domain, memory range and FMR pool
- * for each. If that fails for any reason, it will not register
- * the rds_ibdev at all.
+ /*
+ * It's normal to see a null device if an incoming connection races
+ * with device removal, so we don't print a warning.
*/
- rds_ibdev = ib_get_client_data(dev, &rds_ib_client);
- if (rds_ibdev == NULL) {
- if (printk_ratelimit())
- printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n",
- dev->name);
+ rds_ibdev = rds_ib_get_client_data(dev);
+ if (!rds_ibdev)
return -EOPNOTSUPP;
- }
+
+ /* add the conn now so that connection establishment has the dev */
+ rds_ib_add_conn(rds_ibdev, conn);
if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
@@ -306,7 +350,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
ic->i_send_ring.w_nr *
sizeof(struct rds_header),
&ic->i_send_hdrs_dma, GFP_KERNEL);
- if (ic->i_send_hdrs == NULL) {
+ if (!ic->i_send_hdrs) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent send failed\n");
goto out;
@@ -316,7 +360,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
ic->i_recv_ring.w_nr *
sizeof(struct rds_header),
&ic->i_recv_hdrs_dma, GFP_KERNEL);
- if (ic->i_recv_hdrs == NULL) {
+ if (!ic->i_recv_hdrs) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent recv failed\n");
goto out;
@@ -324,22 +368,24 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
&ic->i_ack_dma, GFP_KERNEL);
- if (ic->i_ack == NULL) {
+ if (!ic->i_ack) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent ack failed\n");
goto out;
}
- ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
- if (ic->i_sends == NULL) {
+ ic->i_sends = vmalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work),
+ ibdev_to_node(dev));
+ if (!ic->i_sends) {
ret = -ENOMEM;
rdsdebug("send allocation failed\n");
goto out;
}
memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
- ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work));
- if (ic->i_recvs == NULL) {
+ ic->i_recvs = vmalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work),
+ ibdev_to_node(dev));
+ if (!ic->i_recvs) {
ret = -ENOMEM;
rdsdebug("recv allocation failed\n");
goto out;
@@ -352,6 +398,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
ic->i_send_cq, ic->i_recv_cq);
out:
+ rds_ib_dev_put(rds_ibdev);
return ret;
}
@@ -409,7 +456,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
struct rds_ib_connection *ic = NULL;
struct rdma_conn_param conn_param;
u32 version;
- int err, destroy = 1;
+ int err = 1, destroy = 1;
/* Check whether the remote protocol version matches ours. */
version = rds_ib_protocol_compatible(event);
@@ -448,7 +495,6 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
/* Wait and see - our connect may still be succeeding */
rds_ib_stats_inc(s_ib_connect_raced);
}
- mutex_unlock(&conn->c_cm_lock);
goto out;
}
@@ -475,24 +521,23 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
err = rds_ib_setup_qp(conn);
if (err) {
rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err);
- mutex_unlock(&conn->c_cm_lock);
goto out;
}
- rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
+ rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
+ event->param.conn.responder_resources,
+ event->param.conn.initiator_depth);
/* rdma_accept() calls rdma_reject() internally if it fails */
err = rdma_accept(cm_id, &conn_param);
- mutex_unlock(&conn->c_cm_lock);
- if (err) {
+ if (err)
rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err);
- goto out;
- }
-
- return 0;
out:
- rdma_reject(cm_id, NULL, 0);
+ if (conn)
+ mutex_unlock(&conn->c_cm_lock);
+ if (err)
+ rdma_reject(cm_id, NULL, 0);
return destroy;
}
@@ -516,8 +561,8 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
goto out;
}
- rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
-
+ rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION,
+ UINT_MAX, UINT_MAX);
ret = rdma_connect(cm_id, &conn_param);
if (ret)
rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
@@ -601,9 +646,19 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
ic->i_cm_id, err);
}
+ /*
+ * We want to wait for tx and rx completion to finish
+ * before we tear down the connection, but we have to be
+ * careful not to get stuck waiting on a send ring that
+ * only has unsignaled sends in it. We've shutdown new
+ * sends before getting here so by waiting for signaled
+ * sends to complete we're ensured that there will be no
+ * more tx processing.
+ */
wait_event(rds_ib_ring_empty_wait,
- rds_ib_ring_empty(&ic->i_send_ring) &&
- rds_ib_ring_empty(&ic->i_recv_ring));
+ rds_ib_ring_empty(&ic->i_recv_ring) &&
+ (atomic_read(&ic->i_signaled_sends) == 0));
+ tasklet_kill(&ic->i_recv_tasklet);
if (ic->i_send_hdrs)
ib_dma_free_coherent(dev,
@@ -654,9 +709,12 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
BUG_ON(ic->rds_ibdev);
/* Clear pending transmit */
- if (ic->i_rm) {
- rds_message_put(ic->i_rm);
- ic->i_rm = NULL;
+ if (ic->i_data_op) {
+ struct rds_message *rm;
+
+ rm = container_of(ic->i_data_op, struct rds_message, data);
+ rds_message_put(rm);
+ ic->i_data_op = NULL;
}
/* Clear the ACK state */
@@ -690,12 +748,19 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
{
struct rds_ib_connection *ic;
unsigned long flags;
+ int ret;
/* XXX too lazy? */
ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL);
- if (ic == NULL)
+ if (!ic)
return -ENOMEM;
+ ret = rds_ib_recv_alloc_caches(ic);
+ if (ret) {
+ kfree(ic);
+ return ret;
+ }
+
INIT_LIST_HEAD(&ic->ib_node);
tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn,
(unsigned long) ic);
@@ -703,6 +768,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
#ifndef KERNEL_HAS_ATOMIC64
spin_lock_init(&ic->i_ack_lock);
#endif
+ atomic_set(&ic->i_signaled_sends, 0);
/*
* rds_ib_conn_shutdown() waits for these to be emptied so they
@@ -744,6 +810,8 @@ void rds_ib_conn_free(void *arg)
list_del(&ic->ib_node);
spin_unlock_irq(lock_ptr);
+ rds_ib_recv_free_caches(ic);
+
kfree(ic);
}