diff options
Diffstat (limited to 'drivers/infiniband/core/ucma.c')
-rw-r--r-- | drivers/infiniband/core/ucma.c | 740 |
1 files changed, 389 insertions, 351 deletions
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 0274e9b704be..bf42650f125b 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -52,6 +52,7 @@ #include <rdma/rdma_cm_ib.h> #include <rdma/ib_addr.h> #include <rdma/ib.h> +#include <rdma/ib_cm.h> #include <rdma/rdma_netlink.h> #include "core_priv.h" @@ -79,28 +80,22 @@ struct ucma_file { struct list_head ctx_list; struct list_head event_list; wait_queue_head_t poll_wait; - struct workqueue_struct *close_wq; }; struct ucma_context { u32 id; struct completion comp; - atomic_t ref; + refcount_t ref; int events_reported; - int backlog; + atomic_t backlog; struct ucma_file *file; struct rdma_cm_id *cm_id; + struct mutex mutex; u64 uid; struct list_head list; struct list_head mc_list; - /* mark that device is in process of destroying the internal HW - * resources, protected by the ctx_table lock - */ - int closing; - /* sync between removal event and id destroy, protected by file mut */ - int destroying; struct work_struct close_work; }; @@ -117,17 +112,17 @@ struct ucma_multicast { struct ucma_event { struct ucma_context *ctx; + struct ucma_context *conn_req_ctx; struct ucma_multicast *mc; struct list_head list; - struct rdma_cm_id *cm_id; struct rdma_ucm_event_resp resp; - struct work_struct close_work; }; static DEFINE_XARRAY_ALLOC(ctx_table); static DEFINE_XARRAY_ALLOC(multicast_table); static const struct file_operations ucma_fops; +static int ucma_destroy_private_ctx(struct ucma_context *ctx); static inline struct ucma_context *_ucma_find_context(int id, struct ucma_file *file) @@ -137,7 +132,7 @@ static inline struct ucma_context *_ucma_find_context(int id, ctx = xa_load(&ctx_table, id); if (!ctx) ctx = ERR_PTR(-ENOENT); - else if (ctx->file != file || !ctx->cm_id) + else if (ctx->file != file) ctx = ERR_PTR(-EINVAL); return ctx; } @@ -148,19 +143,16 @@ static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id) xa_lock(&ctx_table); ctx = _ucma_find_context(id, file); - if (!IS_ERR(ctx)) { - if (ctx->closing) - ctx = ERR_PTR(-EIO); - else - atomic_inc(&ctx->ref); - } + if (!IS_ERR(ctx)) + if (!refcount_inc_not_zero(&ctx->ref)) + ctx = ERR_PTR(-ENXIO); xa_unlock(&ctx_table); return ctx; } static void ucma_put_ctx(struct ucma_context *ctx) { - if (atomic_dec_and_test(&ctx->ref)) + if (refcount_dec_and_test(&ctx->ref)) complete(&ctx->comp); } @@ -181,26 +173,21 @@ static struct ucma_context *ucma_get_ctx_dev(struct ucma_file *file, int id) return ctx; } -static void ucma_close_event_id(struct work_struct *work) -{ - struct ucma_event *uevent_close = container_of(work, struct ucma_event, close_work); - - rdma_destroy_id(uevent_close->cm_id); - kfree(uevent_close); -} - static void ucma_close_id(struct work_struct *work) { struct ucma_context *ctx = container_of(work, struct ucma_context, close_work); /* once all inflight tasks are finished, we close all underlying * resources. The context is still alive till its explicit destryoing - * by its creator. + * by its creator. This puts back the xarray's reference. */ ucma_put_ctx(ctx); wait_for_completion(&ctx->comp); /* No new events will be generated after destroying the id. */ rdma_destroy_id(ctx->cm_id); + + /* Reading the cm_id without holding a positive ref is not allowed */ + ctx->cm_id = NULL; } static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) @@ -212,40 +199,32 @@ static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) return NULL; INIT_WORK(&ctx->close_work, ucma_close_id); - atomic_set(&ctx->ref, 1); init_completion(&ctx->comp); INIT_LIST_HEAD(&ctx->mc_list); + /* So list_del() will work if we don't do ucma_finish_ctx() */ + INIT_LIST_HEAD(&ctx->list); ctx->file = file; + mutex_init(&ctx->mutex); - if (xa_alloc(&ctx_table, &ctx->id, ctx, xa_limit_32b, GFP_KERNEL)) - goto error; - - list_add_tail(&ctx->list, &file->ctx_list); + if (xa_alloc(&ctx_table, &ctx->id, NULL, xa_limit_32b, GFP_KERNEL)) { + kfree(ctx); + return NULL; + } return ctx; - -error: - kfree(ctx); - return NULL; } -static struct ucma_multicast* ucma_alloc_multicast(struct ucma_context *ctx) +static void ucma_set_ctx_cm_id(struct ucma_context *ctx, + struct rdma_cm_id *cm_id) { - struct ucma_multicast *mc; - - mc = kzalloc(sizeof(*mc), GFP_KERNEL); - if (!mc) - return NULL; - - mc->ctx = ctx; - if (xa_alloc(&multicast_table, &mc->id, NULL, xa_limit_32b, GFP_KERNEL)) - goto error; - - list_add_tail(&mc->list, &ctx->mc_list); - return mc; + refcount_set(&ctx->ref, 1); + ctx->cm_id = cm_id; +} -error: - kfree(mc); - return NULL; +static void ucma_finish_ctx(struct ucma_context *ctx) +{ + lockdep_assert_held(&ctx->file->mut); + list_add_tail(&ctx->list, &ctx->file->ctx_list); + xa_store(&ctx_table, ctx->id, ctx, GFP_KERNEL); } static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst, @@ -255,7 +234,7 @@ static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst, memcpy(dst->private_data, src->private_data, src->private_data_len); dst->private_data_len = src->private_data_len; - dst->responder_resources =src->responder_resources; + dst->responder_resources = src->responder_resources; dst->initiator_depth = src->initiator_depth; dst->flow_control = src->flow_control; dst->retry_count = src->retry_count; @@ -277,10 +256,15 @@ static void ucma_copy_ud_event(struct ib_device *device, dst->qkey = src->qkey; } -static void ucma_set_event_context(struct ucma_context *ctx, - struct rdma_cm_event *event, - struct ucma_event *uevent) +static struct ucma_event *ucma_create_uevent(struct ucma_context *ctx, + struct rdma_cm_event *event) { + struct ucma_event *uevent; + + uevent = kzalloc(sizeof(*uevent), GFP_KERNEL); + if (!uevent) + return NULL; + uevent->ctx = ctx; switch (event->event) { case RDMA_CM_EVENT_MULTICAST_JOIN: @@ -295,44 +279,55 @@ static void ucma_set_event_context(struct ucma_context *ctx, uevent->resp.id = ctx->id; break; } + uevent->resp.event = event->event; + uevent->resp.status = event->status; + if (ctx->cm_id->qp_type == IB_QPT_UD) + ucma_copy_ud_event(ctx->cm_id->device, &uevent->resp.param.ud, + &event->param.ud); + else + ucma_copy_conn_event(&uevent->resp.param.conn, + &event->param.conn); + + uevent->resp.ece.vendor_id = event->ece.vendor_id; + uevent->resp.ece.attr_mod = event->ece.attr_mod; + return uevent; } -/* Called with file->mut locked for the relevant context. */ -static void ucma_removal_event_handler(struct rdma_cm_id *cm_id) +static int ucma_connect_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) { - struct ucma_context *ctx = cm_id->context; - struct ucma_event *con_req_eve; - int event_found = 0; + struct ucma_context *listen_ctx = cm_id->context; + struct ucma_context *ctx; + struct ucma_event *uevent; - if (ctx->destroying) - return; + if (!atomic_add_unless(&listen_ctx->backlog, -1, 0)) + return -ENOMEM; + ctx = ucma_alloc_ctx(listen_ctx->file); + if (!ctx) + goto err_backlog; + ucma_set_ctx_cm_id(ctx, cm_id); - /* only if context is pointing to cm_id that it owns it and can be - * queued to be closed, otherwise that cm_id is an inflight one that - * is part of that context event list pending to be detached and - * reattached to its new context as part of ucma_get_event, - * handled separately below. - */ - if (ctx->cm_id == cm_id) { - xa_lock(&ctx_table); - ctx->closing = 1; - xa_unlock(&ctx_table); - queue_work(ctx->file->close_wq, &ctx->close_work); - return; - } + uevent = ucma_create_uevent(listen_ctx, event); + if (!uevent) + goto err_alloc; + uevent->conn_req_ctx = ctx; + uevent->resp.id = ctx->id; - list_for_each_entry(con_req_eve, &ctx->file->event_list, list) { - if (con_req_eve->cm_id == cm_id && - con_req_eve->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) { - list_del(&con_req_eve->list); - INIT_WORK(&con_req_eve->close_work, ucma_close_event_id); - queue_work(ctx->file->close_wq, &con_req_eve->close_work); - event_found = 1; - break; - } - } - if (!event_found) - pr_err("ucma_removal_event_handler: warning: connect request event wasn't found\n"); + ctx->cm_id->context = ctx; + + mutex_lock(&ctx->file->mut); + ucma_finish_ctx(ctx); + list_add_tail(&uevent->list, &ctx->file->event_list); + mutex_unlock(&ctx->file->mut); + wake_up_interruptible(&ctx->file->poll_wait); + return 0; + +err_alloc: + ucma_destroy_private_ctx(ctx); +err_backlog: + atomic_inc(&listen_ctx->backlog); + /* Returning error causes the new ID to be destroyed */ + return -ENOMEM; } static int ucma_event_handler(struct rdma_cm_id *cm_id, @@ -340,69 +335,49 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id, { struct ucma_event *uevent; struct ucma_context *ctx = cm_id->context; - int ret = 0; - uevent = kzalloc(sizeof(*uevent), GFP_KERNEL); - if (!uevent) - return event->event == RDMA_CM_EVENT_CONNECT_REQUEST; + if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) + return ucma_connect_event_handler(cm_id, event); - mutex_lock(&ctx->file->mut); - uevent->cm_id = cm_id; - ucma_set_event_context(ctx, event, uevent); - uevent->resp.event = event->event; - uevent->resp.status = event->status; - if (cm_id->qp_type == IB_QPT_UD) - ucma_copy_ud_event(cm_id->device, &uevent->resp.param.ud, - &event->param.ud); - else - ucma_copy_conn_event(&uevent->resp.param.conn, - &event->param.conn); - - if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) { - if (!ctx->backlog) { - ret = -ENOMEM; - kfree(uevent); - goto out; - } - ctx->backlog--; - } else if (!ctx->uid || ctx->cm_id != cm_id) { - /* - * We ignore events for new connections until userspace has set - * their context. This can only happen if an error occurs on a - * new connection before the user accepts it. This is okay, - * since the accept will just fail later. However, we do need - * to release the underlying HW resources in case of a device - * removal event. - */ - if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) - ucma_removal_event_handler(cm_id); - - kfree(uevent); - goto out; + /* + * We ignore events for new connections until userspace has set their + * context. This can only happen if an error occurs on a new connection + * before the user accepts it. This is okay, since the accept will just + * fail later. However, we do need to release the underlying HW + * resources in case of a device removal event. + */ + if (ctx->uid) { + uevent = ucma_create_uevent(ctx, event); + if (!uevent) + return 0; + + mutex_lock(&ctx->file->mut); + list_add_tail(&uevent->list, &ctx->file->event_list); + mutex_unlock(&ctx->file->mut); + wake_up_interruptible(&ctx->file->poll_wait); } - list_add_tail(&uevent->list, &ctx->file->event_list); - wake_up_interruptible(&ctx->file->poll_wait); - if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) - ucma_removal_event_handler(cm_id); -out: - mutex_unlock(&ctx->file->mut); - return ret; + if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) { + xa_lock(&ctx_table); + if (xa_load(&ctx_table, ctx->id) == ctx) + queue_work(system_unbound_wq, &ctx->close_work); + xa_unlock(&ctx_table); + } + return 0; } static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { - struct ucma_context *ctx; struct rdma_ucm_get_event cmd; struct ucma_event *uevent; - int ret = 0; /* * Old 32 bit user space does not send the 4 byte padding in the * reserved field. We don't care, allow it to keep working. */ - if (out_len < sizeof(uevent->resp) - sizeof(uevent->resp.reserved)) + if (out_len < sizeof(uevent->resp) - sizeof(uevent->resp.reserved) - + sizeof(uevent->resp.ece)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) @@ -422,35 +397,25 @@ static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf, mutex_lock(&file->mut); } - uevent = list_entry(file->event_list.next, struct ucma_event, list); - - if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) { - ctx = ucma_alloc_ctx(file); - if (!ctx) { - ret = -ENOMEM; - goto done; - } - uevent->ctx->backlog++; - ctx->cm_id = uevent->cm_id; - ctx->cm_id->context = ctx; - uevent->resp.id = ctx->id; - } + uevent = list_first_entry(&file->event_list, struct ucma_event, list); if (copy_to_user(u64_to_user_ptr(cmd.response), &uevent->resp, min_t(size_t, out_len, sizeof(uevent->resp)))) { - ret = -EFAULT; - goto done; + mutex_unlock(&file->mut); + return -EFAULT; } list_del(&uevent->list); uevent->ctx->events_reported++; if (uevent->mc) uevent->mc->events_reported++; - kfree(uevent); -done: + if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) + atomic_inc(&uevent->ctx->backlog); mutex_unlock(&file->mut); - return ret; + + kfree(uevent); + return 0; } static int ucma_get_qp_type(struct rdma_ucm_create_id *cmd, enum ib_qp_type *qp_type) @@ -491,38 +456,32 @@ static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, if (ret) return ret; - mutex_lock(&file->mut); ctx = ucma_alloc_ctx(file); - mutex_unlock(&file->mut); if (!ctx) return -ENOMEM; ctx->uid = cmd.uid; - cm_id = __rdma_create_id(current->nsproxy->net_ns, - ucma_event_handler, ctx, cmd.ps, qp_type, NULL); + cm_id = rdma_create_user_id(ucma_event_handler, ctx, cmd.ps, qp_type); if (IS_ERR(cm_id)) { ret = PTR_ERR(cm_id); goto err1; } + ucma_set_ctx_cm_id(ctx, cm_id); resp.id = ctx->id; if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) { ret = -EFAULT; - goto err2; + goto err1; } - ctx->cm_id = cm_id; + mutex_lock(&file->mut); + ucma_finish_ctx(ctx); + mutex_unlock(&file->mut); return 0; -err2: - rdma_destroy_id(cm_id); err1: - xa_erase(&ctx_table, ctx->id); - mutex_lock(&file->mut); - list_del(&ctx->list); - mutex_unlock(&file->mut); - kfree(ctx); + ucma_destroy_private_ctx(ctx); return ret; } @@ -530,19 +489,25 @@ static void ucma_cleanup_multicast(struct ucma_context *ctx) { struct ucma_multicast *mc, *tmp; - mutex_lock(&ctx->file->mut); + xa_lock(&multicast_table); list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) { list_del(&mc->list); - xa_erase(&multicast_table, mc->id); + /* + * At this point mc->ctx->ref is 0 so the mc cannot leave the + * lock on the reader and this is enough serialization + */ + __xa_erase(&multicast_table, mc->id); kfree(mc); } - mutex_unlock(&ctx->file->mut); + xa_unlock(&multicast_table); } static void ucma_cleanup_mc_events(struct ucma_multicast *mc) { struct ucma_event *uevent, *tmp; + rdma_lock_handler(mc->ctx->cm_id); + mutex_lock(&mc->ctx->file->mut); list_for_each_entry_safe(uevent, tmp, &mc->ctx->file->event_list, list) { if (uevent->mc != mc) continue; @@ -550,45 +515,75 @@ static void ucma_cleanup_mc_events(struct ucma_multicast *mc) list_del(&uevent->list); kfree(uevent); } + mutex_unlock(&mc->ctx->file->mut); + rdma_unlock_handler(mc->ctx->cm_id); } -/* - * ucma_free_ctx is called after the underlying rdma CM-ID is destroyed. At - * this point, no new events will be reported from the hardware. However, we - * still need to cleanup the UCMA context for this ID. Specifically, there - * might be events that have not yet been consumed by the user space software. - * These might include pending connect requests which we have not completed - * processing. We cannot call rdma_destroy_id while holding the lock of the - * context (file->mut), as it might cause a deadlock. We therefore extract all - * relevant events from the context pending events list while holding the - * mutex. After that we release them as needed. - */ -static int ucma_free_ctx(struct ucma_context *ctx) +static int ucma_cleanup_ctx_events(struct ucma_context *ctx) { int events_reported; struct ucma_event *uevent, *tmp; LIST_HEAD(list); - - ucma_cleanup_multicast(ctx); - - /* Cleanup events not yet reported to the user. */ + /* Cleanup events not yet reported to the user.*/ mutex_lock(&ctx->file->mut); list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) { - if (uevent->ctx == ctx) + if (uevent->ctx != ctx) + continue; + + if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST && + xa_cmpxchg(&ctx_table, uevent->conn_req_ctx->id, + uevent->conn_req_ctx, XA_ZERO_ENTRY, + GFP_KERNEL) == uevent->conn_req_ctx) { list_move_tail(&uevent->list, &list); + continue; + } + list_del(&uevent->list); + kfree(uevent); } list_del(&ctx->list); + events_reported = ctx->events_reported; mutex_unlock(&ctx->file->mut); + /* + * If this was a listening ID then any connections spawned from it that + * have not been delivered to userspace are cleaned up too. Must be done + * outside any locks. + */ list_for_each_entry_safe(uevent, tmp, &list, list) { - list_del(&uevent->list); - if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) - rdma_destroy_id(uevent->cm_id); + ucma_destroy_private_ctx(uevent->conn_req_ctx); kfree(uevent); } + return events_reported; +} - events_reported = ctx->events_reported; +/* + * When this is called the xarray must have a XA_ZERO_ENTRY in the ctx->id (ie + * the ctx is not public to the user). This either because: + * - ucma_finish_ctx() hasn't been called + * - xa_cmpxchg() succeed to remove the entry (only one thread can succeed) + */ +static int ucma_destroy_private_ctx(struct ucma_context *ctx) +{ + int events_reported; + + /* + * Destroy the underlying cm_id. New work queuing is prevented now by + * the removal from the xarray. Once the work is cancled ref will either + * be 0 because the work ran to completion and consumed the ref from the + * xarray, or it will be positive because we still have the ref from the + * xarray. This can also be 0 in cases where cm_id was never set + */ + cancel_work_sync(&ctx->close_work); + if (refcount_read(&ctx->ref)) + ucma_close_id(&ctx->close_work); + + events_reported = ucma_cleanup_ctx_events(ctx); + ucma_cleanup_multicast(ctx); + + WARN_ON(xa_cmpxchg(&ctx_table, ctx->id, XA_ZERO_ENTRY, NULL, + GFP_KERNEL) != NULL); + mutex_destroy(&ctx->mutex); kfree(ctx); return events_reported; } @@ -609,31 +604,17 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf, xa_lock(&ctx_table); ctx = _ucma_find_context(cmd.id, file); - if (!IS_ERR(ctx)) - __xa_erase(&ctx_table, ctx->id); + if (!IS_ERR(ctx)) { + if (__xa_cmpxchg(&ctx_table, ctx->id, ctx, XA_ZERO_ENTRY, + GFP_KERNEL) != ctx) + ctx = ERR_PTR(-ENOENT); + } xa_unlock(&ctx_table); if (IS_ERR(ctx)) return PTR_ERR(ctx); - mutex_lock(&ctx->file->mut); - ctx->destroying = 1; - mutex_unlock(&ctx->file->mut); - - flush_workqueue(ctx->file->close_wq); - /* At this point it's guaranteed that there is no inflight - * closing task */ - xa_lock(&ctx_table); - if (!ctx->closing) { - xa_unlock(&ctx_table); - ucma_put_ctx(ctx); - wait_for_completion(&ctx->comp); - rdma_destroy_id(ctx->cm_id); - } else { - xa_unlock(&ctx_table); - } - - resp.events_reported = ucma_free_ctx(ctx); + resp.events_reported = ucma_destroy_private_ctx(ctx); if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) ret = -EFAULT; @@ -658,7 +639,10 @@ static ssize_t ucma_bind_ip(struct ucma_file *file, const char __user *inbuf, if (IS_ERR(ctx)) return PTR_ERR(ctx); + mutex_lock(&ctx->mutex); ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr); + mutex_unlock(&ctx->mutex); + ucma_put_ctx(ctx); return ret; } @@ -681,7 +665,9 @@ static ssize_t ucma_bind(struct ucma_file *file, const char __user *inbuf, if (IS_ERR(ctx)) return PTR_ERR(ctx); + mutex_lock(&ctx->mutex); ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr); + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } @@ -705,8 +691,10 @@ static ssize_t ucma_resolve_ip(struct ucma_file *file, if (IS_ERR(ctx)) return PTR_ERR(ctx); + mutex_lock(&ctx->mutex); ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr, (struct sockaddr *) &cmd.dst_addr, cmd.timeout_ms); + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } @@ -731,8 +719,10 @@ static ssize_t ucma_resolve_addr(struct ucma_file *file, if (IS_ERR(ctx)) return PTR_ERR(ctx); + mutex_lock(&ctx->mutex); ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr, (struct sockaddr *) &cmd.dst_addr, cmd.timeout_ms); + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } @@ -752,7 +742,9 @@ static ssize_t ucma_resolve_route(struct ucma_file *file, if (IS_ERR(ctx)) return PTR_ERR(ctx); + mutex_lock(&ctx->mutex); ret = rdma_resolve_route(ctx->cm_id, cmd.timeout_ms); + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } @@ -762,8 +754,8 @@ static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp, { struct rdma_dev_addr *dev_addr; - resp->num_paths = route->num_paths; - switch (route->num_paths) { + resp->num_paths = route->num_pri_alt_paths; + switch (route->num_pri_alt_paths) { case 0: dev_addr = &route->addr.dev_addr; rdma_addr_get_dgid(dev_addr, @@ -775,7 +767,7 @@ static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp, case 2: ib_copy_path_rec_to_user(&resp->ib_route[1], &route->path_rec[1]); - /* fall through */ + fallthrough; case 1: ib_copy_path_rec_to_user(&resp->ib_route[0], &route->path_rec[0]); @@ -789,8 +781,8 @@ static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp, struct rdma_route *route) { - resp->num_paths = route->num_paths; - switch (route->num_paths) { + resp->num_paths = route->num_pri_alt_paths; + switch (route->num_pri_alt_paths) { case 0: rdma_ip2gid((struct sockaddr *)&route->addr.dst_addr, (union ib_gid *)&resp->ib_route[0].dgid); @@ -801,7 +793,7 @@ static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp, case 2: ib_copy_path_rec_to_user(&resp->ib_route[1], &route->path_rec[1]); - /* fall through */ + fallthrough; case 1: ib_copy_path_rec_to_user(&resp->ib_route[0], &route->path_rec[0]); @@ -831,7 +823,7 @@ static ssize_t ucma_query_route(struct ucma_file *file, struct sockaddr *addr; int ret = 0; - if (out_len < sizeof(resp)) + if (out_len < offsetof(struct rdma_ucm_query_route_resp, ibdev_index)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) @@ -841,6 +833,7 @@ static ssize_t ucma_query_route(struct ucma_file *file, if (IS_ERR(ctx)) return PTR_ERR(ctx); + mutex_lock(&ctx->mutex); memset(&resp, 0, sizeof resp); addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr; memcpy(&resp.src_addr, addr, addr->sa_family == AF_INET ? @@ -854,6 +847,7 @@ static ssize_t ucma_query_route(struct ucma_file *file, goto out; resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid; + resp.ibdev_index = ctx->cm_id->device->index; resp.port_num = ctx->cm_id->port_num; if (rdma_cap_ib_sa(ctx->cm_id->device, ctx->cm_id->port_num)) @@ -864,8 +858,9 @@ static ssize_t ucma_query_route(struct ucma_file *file, ucma_copy_iw_route(&resp, &ctx->cm_id->route); out: - if (copy_to_user(u64_to_user_ptr(cmd.response), - &resp, sizeof(resp))) + mutex_unlock(&ctx->mutex); + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, + min_t(size_t, out_len, sizeof(resp)))) ret = -EFAULT; ucma_put_ctx(ctx); @@ -879,6 +874,7 @@ static void ucma_query_device_addr(struct rdma_cm_id *cm_id, return; resp->node_guid = (__force __u64) cm_id->device->node_guid; + resp->ibdev_index = cm_id->device->index; resp->port_num = cm_id->port_num; resp->pkey = (__force __u16) cpu_to_be16( ib_addr_get_pkey(&cm_id->route.addr.dev_addr)); @@ -891,7 +887,7 @@ static ssize_t ucma_query_addr(struct ucma_context *ctx, struct sockaddr *addr; int ret = 0; - if (out_len < sizeof(resp)) + if (out_len < offsetof(struct rdma_ucm_query_addr_resp, ibdev_index)) return -ENOSPC; memset(&resp, 0, sizeof resp); @@ -906,7 +902,7 @@ static ssize_t ucma_query_addr(struct ucma_context *ctx, ucma_query_device_addr(ctx->cm_id, &resp); - if (copy_to_user(response, &resp, sizeof(resp))) + if (copy_to_user(response, &resp, min_t(size_t, out_len, sizeof(resp)))) ret = -EFAULT; return ret; @@ -925,7 +921,7 @@ static ssize_t ucma_query_path(struct ucma_context *ctx, if (!resp) return -ENOMEM; - resp->num_paths = ctx->cm_id->route.num_paths; + resp->num_paths = ctx->cm_id->route.num_pri_alt_paths; for (i = 0, out_len -= sizeof(*resp); i < resp->num_paths && out_len > sizeof(struct ib_path_rec_data); i++, out_len -= sizeof(struct ib_path_rec_data)) { @@ -958,7 +954,7 @@ static ssize_t ucma_query_gid(struct ucma_context *ctx, struct sockaddr_ib *addr; int ret = 0; - if (out_len < sizeof(resp)) + if (out_len < offsetof(struct rdma_ucm_query_addr_resp, ibdev_index)) return -ENOSPC; memset(&resp, 0, sizeof resp); @@ -991,7 +987,7 @@ static ssize_t ucma_query_gid(struct ucma_context *ctx, &ctx->cm_id->route.addr.dst_addr); } - if (copy_to_user(response, &resp, sizeof(resp))) + if (copy_to_user(response, &resp, min_t(size_t, out_len, sizeof(resp)))) ret = -EFAULT; return ret; @@ -1014,6 +1010,7 @@ static ssize_t ucma_query(struct ucma_file *file, if (IS_ERR(ctx)) return PTR_ERR(ctx); + mutex_lock(&ctx->mutex); switch (cmd.option) { case RDMA_USER_CM_QUERY_ADDR: ret = ucma_query_addr(ctx, response, out_len); @@ -1028,6 +1025,7 @@ static ssize_t ucma_query(struct ucma_file *file, ret = -ENOSYS; break; } + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; @@ -1039,25 +1037,30 @@ static void ucma_copy_conn_param(struct rdma_cm_id *id, { dst->private_data = src->private_data; dst->private_data_len = src->private_data_len; - dst->responder_resources =src->responder_resources; + dst->responder_resources = src->responder_resources; dst->initiator_depth = src->initiator_depth; dst->flow_control = src->flow_control; dst->retry_count = src->retry_count; dst->rnr_retry_count = src->rnr_retry_count; dst->srq = src->srq; - dst->qp_num = src->qp_num; + dst->qp_num = src->qp_num & 0xFFFFFF; dst->qkey = (id->route.addr.src_addr.ss_family == AF_IB) ? src->qkey : 0; } static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { - struct rdma_ucm_connect cmd; struct rdma_conn_param conn_param; + struct rdma_ucm_ece ece = {}; + struct rdma_ucm_connect cmd; struct ucma_context *ctx; + size_t in_size; int ret; - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + if (in_len < offsetofend(typeof(cmd), reserved)) + return -EINVAL; + in_size = min_t(size_t, in_len, sizeof(cmd)); + if (copy_from_user(&cmd, inbuf, in_size)) return -EFAULT; if (!cmd.conn_param.valid) @@ -1068,7 +1071,14 @@ static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf, return PTR_ERR(ctx); ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); - ret = rdma_connect(ctx->cm_id, &conn_param); + if (offsetofend(typeof(cmd), ece) <= in_size) { + ece.vendor_id = cmd.ece.vendor_id; + ece.attr_mod = cmd.ece.attr_mod; + } + + mutex_lock(&ctx->mutex); + ret = rdma_connect_ece(ctx->cm_id, &conn_param, &ece); + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } @@ -1087,9 +1097,13 @@ static ssize_t ucma_listen(struct ucma_file *file, const char __user *inbuf, if (IS_ERR(ctx)) return PTR_ERR(ctx); - ctx->backlog = cmd.backlog > 0 && cmd.backlog < max_backlog ? - cmd.backlog : max_backlog; - ret = rdma_listen(ctx->cm_id, ctx->backlog); + if (cmd.backlog <= 0 || cmd.backlog > max_backlog) + cmd.backlog = max_backlog; + atomic_set(&ctx->backlog, cmd.backlog); + + mutex_lock(&ctx->mutex); + ret = rdma_listen(ctx->cm_id, cmd.backlog); + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } @@ -1099,26 +1113,44 @@ static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf, { struct rdma_ucm_accept cmd; struct rdma_conn_param conn_param; + struct rdma_ucm_ece ece = {}; struct ucma_context *ctx; + size_t in_size; int ret; - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + if (in_len < offsetofend(typeof(cmd), reserved)) + return -EINVAL; + in_size = min_t(size_t, in_len, sizeof(cmd)); + if (copy_from_user(&cmd, inbuf, in_size)) return -EFAULT; ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); + if (offsetofend(typeof(cmd), ece) <= in_size) { + ece.vendor_id = cmd.ece.vendor_id; + ece.attr_mod = cmd.ece.attr_mod; + } + if (cmd.conn_param.valid) { ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); - mutex_lock(&file->mut); - ret = __rdma_accept(ctx->cm_id, &conn_param, NULL); - if (!ret) + mutex_lock(&ctx->mutex); + rdma_lock_handler(ctx->cm_id); + ret = rdma_accept_ece(ctx->cm_id, &conn_param, &ece); + if (!ret) { + /* The uid must be set atomically with the handler */ ctx->uid = cmd.uid; - mutex_unlock(&file->mut); - } else - ret = __rdma_accept(ctx->cm_id, NULL, NULL); - + } + rdma_unlock_handler(ctx->cm_id); + mutex_unlock(&ctx->mutex); + } else { + mutex_lock(&ctx->mutex); + rdma_lock_handler(ctx->cm_id); + ret = rdma_accept_ece(ctx->cm_id, NULL, &ece); + rdma_unlock_handler(ctx->cm_id); + mutex_unlock(&ctx->mutex); + } ucma_put_ctx(ctx); return ret; } @@ -1133,11 +1165,25 @@ static ssize_t ucma_reject(struct ucma_file *file, const char __user *inbuf, if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; + if (!cmd.reason) + cmd.reason = IB_CM_REJ_CONSUMER_DEFINED; + + switch (cmd.reason) { + case IB_CM_REJ_CONSUMER_DEFINED: + case IB_CM_REJ_VENDOR_OPTION_NOT_SUPPORTED: + break; + default: + return -EINVAL; + } + ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); - ret = rdma_reject(ctx->cm_id, cmd.private_data, cmd.private_data_len); + mutex_lock(&ctx->mutex); + ret = rdma_reject(ctx->cm_id, cmd.private_data, cmd.private_data_len, + cmd.reason); + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } @@ -1156,7 +1202,9 @@ static ssize_t ucma_disconnect(struct ucma_file *file, const char __user *inbuf, if (IS_ERR(ctx)) return PTR_ERR(ctx); + mutex_lock(&ctx->mutex); ret = rdma_disconnect(ctx->cm_id); + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } @@ -1187,7 +1235,9 @@ static ssize_t ucma_init_qp_attr(struct ucma_file *file, resp.qp_attr_mask = 0; memset(&qp_attr, 0, sizeof qp_attr); qp_attr.qp_state = cmd.qp_state; + mutex_lock(&ctx->mutex); ret = rdma_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask); + mutex_unlock(&ctx->mutex); if (ret) goto out; @@ -1273,9 +1323,13 @@ static int ucma_set_ib_path(struct ucma_context *ctx, struct sa_path_rec opa; sa_convert_path_ib_to_opa(&opa, &sa_path); + mutex_lock(&ctx->mutex); ret = rdma_set_ib_path(ctx->cm_id, &opa); + mutex_unlock(&ctx->mutex); } else { + mutex_lock(&ctx->mutex); ret = rdma_set_ib_path(ctx->cm_id, &sa_path); + mutex_unlock(&ctx->mutex); } if (ret) return ret; @@ -1308,7 +1362,9 @@ static int ucma_set_option_level(struct ucma_context *ctx, int level, switch (level) { case RDMA_OPTION_ID: + mutex_lock(&ctx->mutex); ret = ucma_set_option_id(ctx, optname, optval, optlen); + mutex_unlock(&ctx->mutex); break; case RDMA_OPTION_IB: ret = ucma_set_option_ib(ctx, optname, optval, optlen); @@ -1368,8 +1424,10 @@ static ssize_t ucma_notify(struct ucma_file *file, const char __user *inbuf, if (IS_ERR(ctx)) return PTR_ERR(ctx); + mutex_lock(&ctx->mutex); if (ctx->cm_id->device) ret = rdma_notify(ctx->cm_id, (enum ib_event_type)cmd.event); + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; @@ -1403,42 +1461,59 @@ static ssize_t ucma_process_join(struct ucma_file *file, if (IS_ERR(ctx)) return PTR_ERR(ctx); - mutex_lock(&file->mut); - mc = ucma_alloc_multicast(ctx); + mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) { ret = -ENOMEM; - goto err1; + goto err_put_ctx; } + + mc->ctx = ctx; mc->join_state = join_state; mc->uid = cmd->uid; memcpy(&mc->addr, addr, cmd->addr_size); + + xa_lock(&multicast_table); + if (__xa_alloc(&multicast_table, &mc->id, NULL, xa_limit_32b, + GFP_KERNEL)) { + ret = -ENOMEM; + goto err_free_mc; + } + + list_add_tail(&mc->list, &ctx->mc_list); + xa_unlock(&multicast_table); + + mutex_lock(&ctx->mutex); ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *)&mc->addr, join_state, mc); + mutex_unlock(&ctx->mutex); if (ret) - goto err2; + goto err_xa_erase; resp.id = mc->id; if (copy_to_user(u64_to_user_ptr(cmd->response), &resp, sizeof(resp))) { ret = -EFAULT; - goto err3; + goto err_leave_multicast; } xa_store(&multicast_table, mc->id, mc, 0); - mutex_unlock(&file->mut); ucma_put_ctx(ctx); return 0; -err3: +err_leave_multicast: + mutex_lock(&ctx->mutex); rdma_leave_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr); + mutex_unlock(&ctx->mutex); ucma_cleanup_mc_events(mc); -err2: - xa_erase(&multicast_table, mc->id); +err_xa_erase: + xa_lock(&multicast_table); list_del(&mc->list); + __xa_erase(&multicast_table, mc->id); +err_free_mc: + xa_unlock(&multicast_table); kfree(mc); -err1: - mutex_unlock(&file->mut); +err_put_ctx: ucma_put_ctx(ctx); return ret; } @@ -1500,24 +1575,26 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file, mc = xa_load(&multicast_table, cmd.id); if (!mc) mc = ERR_PTR(-ENOENT); - else if (mc->ctx->file != file) + else if (READ_ONCE(mc->ctx->file) != file) mc = ERR_PTR(-EINVAL); - else if (!atomic_inc_not_zero(&mc->ctx->ref)) + else if (!refcount_inc_not_zero(&mc->ctx->ref)) mc = ERR_PTR(-ENXIO); - else - __xa_erase(&multicast_table, mc->id); - xa_unlock(&multicast_table); if (IS_ERR(mc)) { + xa_unlock(&multicast_table); ret = PTR_ERR(mc); goto out; } + list_del(&mc->list); + __xa_erase(&multicast_table, mc->id); + xa_unlock(&multicast_table); + + mutex_lock(&mc->ctx->mutex); rdma_leave_multicast(mc->ctx->cm_id, (struct sockaddr *) &mc->addr); - mutex_lock(&mc->ctx->file->mut); + mutex_unlock(&mc->ctx->mutex); + ucma_cleanup_mc_events(mc); - list_del(&mc->list); - mutex_unlock(&mc->ctx->file->mut); ucma_put_ctx(mc->ctx); resp.events_reported = mc->events_reported; @@ -1530,45 +1607,15 @@ out: return ret; } -static void ucma_lock_files(struct ucma_file *file1, struct ucma_file *file2) -{ - /* Acquire mutex's based on pointer comparison to prevent deadlock. */ - if (file1 < file2) { - mutex_lock(&file1->mut); - mutex_lock_nested(&file2->mut, SINGLE_DEPTH_NESTING); - } else { - mutex_lock(&file2->mut); - mutex_lock_nested(&file1->mut, SINGLE_DEPTH_NESTING); - } -} - -static void ucma_unlock_files(struct ucma_file *file1, struct ucma_file *file2) -{ - if (file1 < file2) { - mutex_unlock(&file2->mut); - mutex_unlock(&file1->mut); - } else { - mutex_unlock(&file1->mut); - mutex_unlock(&file2->mut); - } -} - -static void ucma_move_events(struct ucma_context *ctx, struct ucma_file *file) -{ - struct ucma_event *uevent, *tmp; - - list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) - if (uevent->ctx == ctx) - list_move_tail(&uevent->list, &file->event_list); -} - static ssize_t ucma_migrate_id(struct ucma_file *new_file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_migrate_id cmd; struct rdma_ucm_migrate_resp resp; + struct ucma_event *uevent, *tmp; struct ucma_context *ctx; + LIST_HEAD(event_list); struct fd f; struct ucma_file *cur_file; int ret = 0; @@ -1584,40 +1631,53 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file, ret = -EINVAL; goto file_put; } + cur_file = f.file->private_data; /* Validate current fd and prevent destruction of id. */ - ctx = ucma_get_ctx(f.file->private_data, cmd.id); + ctx = ucma_get_ctx(cur_file, cmd.id); if (IS_ERR(ctx)) { ret = PTR_ERR(ctx); goto file_put; } - cur_file = ctx->file; - if (cur_file == new_file) { - resp.events_reported = ctx->events_reported; - goto response; - } - + rdma_lock_handler(ctx->cm_id); /* - * Migrate events between fd's, maintaining order, and avoiding new - * events being added before existing events. + * ctx->file can only be changed under the handler & xa_lock. xa_load() + * must be checked again to ensure the ctx hasn't begun destruction + * since the ucma_get_ctx(). */ - ucma_lock_files(cur_file, new_file); xa_lock(&ctx_table); - - list_move_tail(&ctx->list, &new_file->ctx_list); - ucma_move_events(ctx, new_file); + if (_ucma_find_context(cmd.id, cur_file) != ctx) { + xa_unlock(&ctx_table); + ret = -ENOENT; + goto err_unlock; + } ctx->file = new_file; + xa_unlock(&ctx_table); + + mutex_lock(&cur_file->mut); + list_del(&ctx->list); + /* + * At this point lock_handler() prevents addition of new uevents for + * this ctx. + */ + list_for_each_entry_safe(uevent, tmp, &cur_file->event_list, list) + if (uevent->ctx == ctx) + list_move_tail(&uevent->list, &event_list); resp.events_reported = ctx->events_reported; + mutex_unlock(&cur_file->mut); - xa_unlock(&ctx_table); - ucma_unlock_files(cur_file, new_file); + mutex_lock(&new_file->mut); + list_add_tail(&ctx->list, &new_file->ctx_list); + list_splice_tail(&event_list, &new_file->event_list); + mutex_unlock(&new_file->mut); -response: if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) ret = -EFAULT; +err_unlock: + rdma_unlock_handler(ctx->cm_id); ucma_put_ctx(ctx); file_put: fdput(f); @@ -1660,8 +1720,8 @@ static ssize_t ucma_write(struct file *filp, const char __user *buf, ssize_t ret; if (!ib_safe_file_access(filp)) { - pr_err_once("ucma_write: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n", - task_tgid_vnr(current), current->comm); + pr_err_once("%s: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n", + __func__, task_tgid_vnr(current), current->comm); return -EACCES; } @@ -1717,13 +1777,6 @@ static int ucma_open(struct inode *inode, struct file *filp) if (!file) return -ENOMEM; - file->close_wq = alloc_ordered_workqueue("ucma_close_id", - WQ_MEM_RECLAIM); - if (!file->close_wq) { - kfree(file); - return -ENOMEM; - } - INIT_LIST_HEAD(&file->event_list); INIT_LIST_HEAD(&file->ctx_list); init_waitqueue_head(&file->poll_wait); @@ -1738,37 +1791,23 @@ static int ucma_open(struct inode *inode, struct file *filp) static int ucma_close(struct inode *inode, struct file *filp) { struct ucma_file *file = filp->private_data; - struct ucma_context *ctx, *tmp; - - mutex_lock(&file->mut); - list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) { - ctx->destroying = 1; - mutex_unlock(&file->mut); - xa_erase(&ctx_table, ctx->id); - flush_workqueue(file->close_wq); - /* At that step once ctx was marked as destroying and workqueue - * was flushed we are safe from any inflights handlers that - * might put other closing task. - */ - xa_lock(&ctx_table); - if (!ctx->closing) { - xa_unlock(&ctx_table); - ucma_put_ctx(ctx); - wait_for_completion(&ctx->comp); - /* rdma_destroy_id ensures that no event handlers are - * inflight for that id before releasing it. - */ - rdma_destroy_id(ctx->cm_id); - } else { - xa_unlock(&ctx_table); - } + /* + * All paths that touch ctx_list or ctx_list starting from write() are + * prevented by this being a FD release function. The list_add_tail() in + * ucma_connect_event_handler() can run concurrently, however it only + * adds to the list *after* a listening ID. By only reading the first of + * the list, and relying on ucma_destroy_private_ctx() to block + * ucma_connect_event_handler(), no additional locking is needed. + */ + while (!list_empty(&file->ctx_list)) { + struct ucma_context *ctx = list_first_entry( + &file->ctx_list, struct ucma_context, list); - ucma_free_ctx(ctx); - mutex_lock(&file->mut); + WARN_ON(xa_cmpxchg(&ctx_table, ctx->id, ctx, XA_ZERO_ENTRY, + GFP_KERNEL) != ctx); + ucma_destroy_private_ctx(ctx); } - mutex_unlock(&file->mut); - destroy_workqueue(file->close_wq); kfree(file); return 0; } @@ -1803,13 +1842,12 @@ static struct ib_client rdma_cma_client = { }; MODULE_ALIAS_RDMA_CLIENT("rdma_cm"); -static ssize_t show_abi_version(struct device *dev, - struct device_attribute *attr, - char *buf) +static ssize_t abi_version_show(struct device *dev, + struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", RDMA_USER_CM_ABI_VERSION); + return sysfs_emit(buf, "%d\n", RDMA_USER_CM_ABI_VERSION); } -static DEVICE_ATTR(abi_version, S_IRUGO, show_abi_version, NULL); +static DEVICE_ATTR_RO(abi_version); static int __init ucma_init(void) { |