diff options
Diffstat (limited to 'drivers/infiniband/core')
34 files changed, 2224 insertions, 1141 deletions
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 69dee36e0e89..313f2349b518 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -15,8 +15,6 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ nldev.o restrack.o ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o -ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o -ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o ib_cm-y := cm.o @@ -39,3 +37,5 @@ ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ uverbs_std_types_flow_action.o uverbs_std_types_dm.o \ uverbs_std_types_mr.o uverbs_std_types_counters.o \ uverbs_uapi.o uverbs_std_types_device.o +ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o +ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 7b04590f307f..43c67e5f43c6 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -185,7 +185,7 @@ EXPORT_SYMBOL(ib_cache_gid_parse_type_str); static struct ib_gid_table *rdma_gid_table(struct ib_device *device, u8 port) { - return device->cache.ports[port - rdma_start_port(device)].gid; + return device->port_data[port].cache.gid; } static bool is_gid_entry_free(const struct ib_gid_table_entry *entry) @@ -547,21 +547,19 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, unsigned long mask; int ret; - if (ib_dev->ops.get_netdev) { - idev = ib_dev->ops.get_netdev(ib_dev, port); - if (idev && attr->ndev != idev) { - union ib_gid default_gid; + idev = ib_device_get_netdev(ib_dev, port); + if (idev && attr->ndev != idev) { + union ib_gid default_gid; - /* Adding default GIDs in not permitted */ - make_default_gid(idev, &default_gid); - if (!memcmp(gid, &default_gid, sizeof(*gid))) { - dev_put(idev); - return -EPERM; - } - } - if (idev) + /* Adding default GIDs is not permitted */ + make_default_gid(idev, &default_gid); + if (!memcmp(gid, &default_gid, sizeof(*gid))) { dev_put(idev); + return -EPERM; + } } + if (idev) + dev_put(idev); mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE | @@ -765,7 +763,7 @@ err_free_table: return NULL; } -static void release_gid_table(struct ib_device *device, u8 port, +static void release_gid_table(struct ib_device *device, struct ib_gid_table *table) { bool leak = false; @@ -863,31 +861,27 @@ static void gid_table_reserve_default(struct ib_device *ib_dev, u8 port, static void gid_table_release_one(struct ib_device *ib_dev) { - struct ib_gid_table *table; - u8 port; + unsigned int p; - for (port = 0; port < ib_dev->phys_port_cnt; port++) { - table = ib_dev->cache.ports[port].gid; - release_gid_table(ib_dev, port, table); - ib_dev->cache.ports[port].gid = NULL; + rdma_for_each_port (ib_dev, p) { + release_gid_table(ib_dev, ib_dev->port_data[p].cache.gid); + ib_dev->port_data[p].cache.gid = NULL; } } static int _gid_table_setup_one(struct ib_device *ib_dev) { - u8 port; struct ib_gid_table *table; + unsigned int rdma_port; - for (port = 0; port < ib_dev->phys_port_cnt; port++) { - u8 rdma_port = port + rdma_start_port(ib_dev); - - table = alloc_gid_table( - ib_dev->port_immutable[rdma_port].gid_tbl_len); + rdma_for_each_port (ib_dev, rdma_port) { + table = alloc_gid_table( + ib_dev->port_data[rdma_port].immutable.gid_tbl_len); if (!table) goto rollback_table_setup; gid_table_reserve_default(ib_dev, rdma_port, table); - ib_dev->cache.ports[port].gid = table; + ib_dev->port_data[rdma_port].cache.gid = table; } return 0; @@ -898,14 +892,11 @@ rollback_table_setup: static void gid_table_cleanup_one(struct ib_device *ib_dev) { - struct ib_gid_table *table; - u8 port; + unsigned int p; - for (port = 0; port < ib_dev->phys_port_cnt; port++) { - table = ib_dev->cache.ports[port].gid; - cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev), - table); - } + rdma_for_each_port (ib_dev, p) + cleanup_gid_table_port(ib_dev, p, + ib_dev->port_data[p].cache.gid); } static int gid_table_setup_one(struct ib_device *ib_dev) @@ -983,17 +974,17 @@ const struct ib_gid_attr *rdma_find_gid(struct ib_device *device, unsigned long mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE; struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type}; - u8 p; + unsigned int p; if (ndev) mask |= GID_ATTR_FIND_MASK_NETDEV; - for (p = 0; p < device->phys_port_cnt; p++) { + rdma_for_each_port(device, p) { struct ib_gid_table *table; unsigned long flags; int index; - table = device->cache.ports[p].gid; + table = device->port_data[p].cache.gid; read_lock_irqsave(&table->rwlock, flags); index = find_gid(table, gid, &gid_attr_val, false, mask, NULL); if (index >= 0) { @@ -1025,7 +1016,7 @@ int ib_get_cached_pkey(struct ib_device *device, read_lock_irqsave(&device->cache.lock, flags); - cache = device->cache.ports[port_num - rdma_start_port(device)].pkey; + cache = device->port_data[port_num].cache.pkey; if (index < 0 || index >= cache->table_len) ret = -EINVAL; @@ -1043,14 +1034,12 @@ int ib_get_cached_subnet_prefix(struct ib_device *device, u64 *sn_pfx) { unsigned long flags; - int p; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; - p = port_num - rdma_start_port(device); read_lock_irqsave(&device->cache.lock, flags); - *sn_pfx = device->cache.ports[p].subnet_prefix; + *sn_pfx = device->port_data[port_num].cache.subnet_prefix; read_unlock_irqrestore(&device->cache.lock, flags); return 0; @@ -1073,7 +1062,7 @@ int ib_find_cached_pkey(struct ib_device *device, read_lock_irqsave(&device->cache.lock, flags); - cache = device->cache.ports[port_num - rdma_start_port(device)].pkey; + cache = device->port_data[port_num].cache.pkey; *index = -1; @@ -1113,7 +1102,7 @@ int ib_find_exact_cached_pkey(struct ib_device *device, read_lock_irqsave(&device->cache.lock, flags); - cache = device->cache.ports[port_num - rdma_start_port(device)].pkey; + cache = device->port_data[port_num].cache.pkey; *index = -1; @@ -1141,7 +1130,7 @@ int ib_get_cached_lmc(struct ib_device *device, return -EINVAL; read_lock_irqsave(&device->cache.lock, flags); - *lmc = device->cache.ports[port_num - rdma_start_port(device)].lmc; + *lmc = device->port_data[port_num].cache.lmc; read_unlock_irqrestore(&device->cache.lock, flags); return ret; @@ -1159,8 +1148,7 @@ int ib_get_cached_port_state(struct ib_device *device, return -EINVAL; read_lock_irqsave(&device->cache.lock, flags); - *port_state = device->cache.ports[port_num - - rdma_start_port(device)].port_state; + *port_state = device->port_data[port_num].cache.port_state; read_unlock_irqrestore(&device->cache.lock, flags); return ret; @@ -1361,16 +1349,13 @@ static void ib_cache_update(struct ib_device *device, write_lock_irq(&device->cache.lock); - old_pkey_cache = device->cache.ports[port - - rdma_start_port(device)].pkey; + old_pkey_cache = device->port_data[port].cache.pkey; - device->cache.ports[port - rdma_start_port(device)].pkey = pkey_cache; - device->cache.ports[port - rdma_start_port(device)].lmc = tprops->lmc; - device->cache.ports[port - rdma_start_port(device)].port_state = - tprops->state; + device->port_data[port].cache.pkey = pkey_cache; + device->port_data[port].cache.lmc = tprops->lmc; + device->port_data[port].cache.port_state = tprops->state; - device->cache.ports[port - rdma_start_port(device)].subnet_prefix = - tprops->subnet_prefix; + device->port_data[port].cache.subnet_prefix = tprops->subnet_prefix; write_unlock_irq(&device->cache.lock); if (enforce_security) @@ -1428,27 +1413,17 @@ static void ib_cache_event(struct ib_event_handler *handler, int ib_cache_setup_one(struct ib_device *device) { - int p; + unsigned int p; int err; rwlock_init(&device->cache.lock); - device->cache.ports = - kcalloc(rdma_end_port(device) - rdma_start_port(device) + 1, - sizeof(*device->cache.ports), - GFP_KERNEL); - if (!device->cache.ports) - return -ENOMEM; - err = gid_table_setup_one(device); - if (err) { - kfree(device->cache.ports); - device->cache.ports = NULL; + if (err) return err; - } - for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) - ib_cache_update(device, p + rdma_start_port(device), true); + rdma_for_each_port (device, p) + ib_cache_update(device, p, true); INIT_IB_EVENT_HANDLER(&device->cache.event_handler, device, ib_cache_event); @@ -1458,7 +1433,7 @@ int ib_cache_setup_one(struct ib_device *device) void ib_cache_release_one(struct ib_device *device) { - int p; + unsigned int p; /* * The release function frees all the cache elements. @@ -1466,11 +1441,10 @@ void ib_cache_release_one(struct ib_device *device) * all the device's resources when the cache could no * longer be accessed. */ - for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) - kfree(device->cache.ports[p].pkey); + rdma_for_each_port (device, p) + kfree(device->port_data[p].cache.pkey); gid_table_release_one(device); - kfree(device->cache.ports); } void ib_cache_cleanup_one(struct ib_device *device) diff --git a/drivers/infiniband/core/cgroup.c b/drivers/infiniband/core/cgroup.c index 126ac5f99db7..388fd04e5f63 100644 --- a/drivers/infiniband/core/cgroup.c +++ b/drivers/infiniband/core/cgroup.c @@ -21,12 +21,11 @@ * Register with the rdma cgroup. Should be called before * exposing rdma device to user space applications to avoid * resource accounting leak. - * Returns 0 on success or otherwise failure code. */ -int ib_device_register_rdmacg(struct ib_device *device) +void ib_device_register_rdmacg(struct ib_device *device) { device->cg_device.name = device->name; - return rdmacg_register_device(&device->cg_device); + rdmacg_register_device(&device->cg_device); } /** diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 37980c7564c0..b9416a6fca36 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -4052,8 +4052,7 @@ static void cm_recv_handler(struct ib_mad_agent *mad_agent, atomic_long_inc(&port->counter_group[CM_RECV]. counter[attr_id - CM_ATTR_ID_OFFSET]); - work = kmalloc(sizeof(*work) + sizeof(struct sa_path_rec) * paths, - GFP_KERNEL); + work = kmalloc(struct_size(work, path, paths), GFP_KERNEL); if (!work) { ib_free_recv_mad(mad_recv_wc); return; diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 63a7cc00bae0..68c997be2429 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -494,7 +494,10 @@ static void _cma_attach_to_dev(struct rdma_id_private *id_priv, id_priv->id.route.addr.dev_addr.transport = rdma_node_get_transport(cma_dev->device->node_type); list_add_tail(&id_priv->list, &cma_dev->id_list); - rdma_restrack_kadd(&id_priv->res); + if (id_priv->res.kern_name) + rdma_restrack_kadd(&id_priv->res); + else + rdma_restrack_uadd(&id_priv->res); } static void cma_attach_to_dev(struct rdma_id_private *id_priv, @@ -656,7 +659,7 @@ static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv) struct cma_device *cma_dev; enum ib_gid_type gid_type; int ret = -ENODEV; - u8 port; + unsigned int port; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) @@ -670,8 +673,7 @@ static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv) mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) { - for (port = rdma_start_port(cma_dev->device); - port <= rdma_end_port(cma_dev->device); port++) { + rdma_for_each_port (cma_dev->device, port) { gidp = rdma_protocol_roce(cma_dev->device, port) ? &iboe_gid : &gid; gid_type = cma_dev->default_gid_type[port - 1]; @@ -885,6 +887,7 @@ struct rdma_cm_id *__rdma_create_id(struct net *net, id_priv->id.ps = ps; id_priv->id.qp_type = qp_type; id_priv->tos_set = false; + id_priv->timeout_set = false; id_priv->gid_type = IB_GID_TYPE_IB; spin_lock_init(&id_priv->lock); mutex_init(&id_priv->qp_mutex); @@ -1127,6 +1130,9 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, } else ret = -ENOSYS; + if ((*qp_attr_mask & IB_QP_TIMEOUT) && id_priv->timeout_set) + qp_attr->timeout = id_priv->timeout; + return ret; } EXPORT_SYMBOL(rdma_init_qp_attr); @@ -2407,6 +2413,7 @@ static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) return PTR_ERR(id); id->tos = id_priv->tos; + id->tos_set = id_priv->tos_set; id_priv->cm_id.iw = id; memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv), @@ -2459,6 +2466,8 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, atomic_inc(&id_priv->refcount); dev_id_priv->internal_id = 1; dev_id_priv->afonly = id_priv->afonly; + dev_id_priv->tos_set = id_priv->tos_set; + dev_id_priv->tos = id_priv->tos; ret = rdma_listen(id, id_priv->backlog); if (ret) @@ -2487,6 +2496,34 @@ void rdma_set_service_type(struct rdma_cm_id *id, int tos) } EXPORT_SYMBOL(rdma_set_service_type); +/** + * rdma_set_ack_timeout() - Set the ack timeout of QP associated + * with a connection identifier. + * @id: Communication identifier to associated with service type. + * @timeout: Ack timeout to set a QP, expressed as 4.096 * 2^(timeout) usec. + * + * This function should be called before rdma_connect() on active side, + * and on passive side before rdma_accept(). It is applicable to primary + * path only. The timeout will affect the local side of the QP, it is not + * negotiated with remote side and zero disables the timer. + * + * Return: 0 for success + */ +int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout) +{ + struct rdma_id_private *id_priv; + + if (id->qp_type != IB_QPT_RC) + return -EINVAL; + + id_priv = container_of(id, struct rdma_id_private, id); + id_priv->timeout = timeout; + id_priv->timeout_set = true; + + return 0; +} +EXPORT_SYMBOL(rdma_set_ack_timeout); + static void cma_query_handler(int status, struct sa_path_rec *path_rec, void *context) { @@ -2963,13 +3000,22 @@ static void addr_handler(int status, struct sockaddr *src_addr, { struct rdma_id_private *id_priv = context; struct rdma_cm_event event = {}; + struct sockaddr *addr; + struct sockaddr_storage old_addr; mutex_lock(&id_priv->handler_mutex); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_RESOLVED)) goto out; - memcpy(cma_src_addr(id_priv), src_addr, rdma_addr_size(src_addr)); + /* + * Store the previous src address, so that if we fail to acquire + * matching rdma device, old address can be restored back, which helps + * to cancel the cma listen operation correctly. + */ + addr = cma_src_addr(id_priv); + memcpy(&old_addr, addr, rdma_addr_size(addr)); + memcpy(addr, src_addr, rdma_addr_size(src_addr)); if (!status && !id_priv->cma_dev) { status = cma_acquire_dev_by_src_ip(id_priv); if (status) @@ -2980,6 +3026,8 @@ static void addr_handler(int status, struct sockaddr *src_addr, } if (status) { + memcpy(addr, &old_addr, + rdma_addr_size((struct sockaddr *)&old_addr)); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ADDR_BOUND)) goto out; @@ -3795,6 +3843,7 @@ static int cma_connect_iw(struct rdma_id_private *id_priv, return PTR_ERR(cm_id); cm_id->tos = id_priv->tos; + cm_id->tos_set = id_priv->tos_set; id_priv->cm_id.iw = cm_id; memcpy(&cm_id->local_addr, cma_src_addr(id_priv), @@ -4498,7 +4547,7 @@ static void cma_add_one(struct ib_device *device) if (!cma_dev->default_roce_tos) goto free_gid_type; - for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { + rdma_for_each_port (device, i) { supported_gids = roce_gid_type_mask_support(device, i); WARN_ON(!supported_gids); if (supported_gids & (1 << CMA_PREFERRED_ROCE_GID_TYPE)) @@ -4602,85 +4651,6 @@ static void cma_remove_one(struct ib_device *device, void *client_data) kfree(cma_dev); } -static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct nlmsghdr *nlh; - struct rdma_cm_id_stats *id_stats; - struct rdma_id_private *id_priv; - struct rdma_cm_id *id = NULL; - struct cma_device *cma_dev; - int i_dev = 0, i_id = 0; - - /* - * We export all of the IDs as a sequence of messages. Each - * ID gets its own netlink message. - */ - mutex_lock(&lock); - - list_for_each_entry(cma_dev, &dev_list, list) { - if (i_dev < cb->args[0]) { - i_dev++; - continue; - } - - i_id = 0; - list_for_each_entry(id_priv, &cma_dev->id_list, list) { - if (i_id < cb->args[1]) { - i_id++; - continue; - } - - id_stats = ibnl_put_msg(skb, &nlh, cb->nlh->nlmsg_seq, - sizeof *id_stats, RDMA_NL_RDMA_CM, - RDMA_NL_RDMA_CM_ID_STATS, - NLM_F_MULTI); - if (!id_stats) - goto out; - - memset(id_stats, 0, sizeof *id_stats); - id = &id_priv->id; - id_stats->node_type = id->route.addr.dev_addr.dev_type; - id_stats->port_num = id->port_num; - id_stats->bound_dev_if = - id->route.addr.dev_addr.bound_dev_if; - - if (ibnl_put_attr(skb, nlh, - rdma_addr_size(cma_src_addr(id_priv)), - cma_src_addr(id_priv), - RDMA_NL_RDMA_CM_ATTR_SRC_ADDR)) - goto out; - if (ibnl_put_attr(skb, nlh, - rdma_addr_size(cma_dst_addr(id_priv)), - cma_dst_addr(id_priv), - RDMA_NL_RDMA_CM_ATTR_DST_ADDR)) - goto out; - - id_stats->pid = task_pid_vnr(id_priv->res.task); - id_stats->port_space = id->ps; - id_stats->cm_state = id_priv->state; - id_stats->qp_num = id_priv->qp_num; - id_stats->qp_type = id->qp_type; - - i_id++; - nlmsg_end(skb, nlh); - } - - cb->args[1] = 0; - i_dev++; - } - -out: - mutex_unlock(&lock); - cb->args[0] = i_dev; - cb->args[1] = i_id; - - return skb->len; -} - -static const struct rdma_nl_cbs cma_cb_table[RDMA_NL_RDMA_CM_NUM_OPS] = { - [RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats}, -}; - static int cma_init_net(struct net *net) { struct cma_pernet *pernet = cma_pernet(net); @@ -4729,7 +4699,6 @@ static int __init cma_init(void) if (ret) goto err; - rdma_nl_register(RDMA_NL_RDMA_CM, cma_cb_table); cma_configfs_init(); return 0; @@ -4745,7 +4714,6 @@ err_wq: static void __exit cma_cleanup(void) { cma_configfs_exit(); - rdma_nl_unregister(RDMA_NL_RDMA_CM); ib_unregister_client(&cma_client); unregister_netdevice_notifier(&cma_nb); ib_sa_unregister_client(&sa_client); @@ -4753,7 +4721,5 @@ static void __exit cma_cleanup(void) destroy_workqueue(cma_wq); } -MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_RDMA_CM, 1); - module_init(cma_init); module_exit(cma_cleanup); diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h index cf47c69436a7..ca7307277518 100644 --- a/drivers/infiniband/core/cma_priv.h +++ b/drivers/infiniband/core/cma_priv.h @@ -84,9 +84,11 @@ struct rdma_id_private { u32 options; u8 srq; u8 tos; - bool tos_set; + u8 tos_set:1; + u8 timeout_set:1; u8 reuseaddr; u8 afonly; + u8 timeout; enum ib_gid_type gid_type; /* diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 3cd830d52967..08c690249594 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -54,9 +54,9 @@ struct pkey_index_qp_list { struct list_head qp_list; }; -int ib_device_register_sysfs(struct ib_device *device, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)); +extern const struct attribute_group ib_dev_attr_group; + +int ib_device_register_sysfs(struct ib_device *device); void ib_device_unregister_sysfs(struct ib_device *device); int ib_device_rename(struct ib_device *ibdev, const char *name); @@ -66,6 +66,9 @@ typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port, typedef bool (*roce_netdev_filter)(struct ib_device *device, u8 port, struct net_device *idev, void *cookie); +struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, + unsigned int port); + void ib_enum_roce_netdev(struct ib_device *ib_dev, roce_netdev_filter filter, void *filter_cookie, @@ -117,7 +120,7 @@ void ib_cache_cleanup_one(struct ib_device *device); void ib_cache_release_one(struct ib_device *device); #ifdef CONFIG_CGROUP_RDMA -int ib_device_register_rdmacg(struct ib_device *device); +void ib_device_register_rdmacg(struct ib_device *device); void ib_device_unregister_rdmacg(struct ib_device *device); int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, @@ -128,21 +131,26 @@ void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, struct ib_device *device, enum rdmacg_resource_type resource_index); #else -static inline int ib_device_register_rdmacg(struct ib_device *device) -{ return 0; } +static inline void ib_device_register_rdmacg(struct ib_device *device) +{ +} static inline void ib_device_unregister_rdmacg(struct ib_device *device) -{ } +{ +} static inline int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, struct ib_device *device, enum rdmacg_resource_type resource_index) -{ return 0; } +{ + return 0; +} static inline void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, struct ib_device *device, enum rdmacg_resource_type resource_index) -{ } +{ +} #endif static inline bool rdma_is_upper_dev_rcu(struct net_device *dev, @@ -178,7 +186,7 @@ int ib_get_cached_subnet_prefix(struct ib_device *device, u64 *sn_pfx); #ifdef CONFIG_SECURITY_INFINIBAND -void ib_security_destroy_port_pkey_list(struct ib_device *device); +void ib_security_release_port_pkey_list(struct ib_device *device); void ib_security_cache_change(struct ib_device *device, u8 port_num, @@ -199,8 +207,9 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent, enum ib_qp_type qp_type); void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent); int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index); +void ib_mad_agent_security_change(void); #else -static inline void ib_security_destroy_port_pkey_list(struct ib_device *device) +static inline void ib_security_release_port_pkey_list(struct ib_device *device) { } @@ -264,10 +273,13 @@ static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map, { return 0; } + +static inline void ib_mad_agent_security_change(void) +{ +} #endif struct ib_device *ib_device_get_by_index(u32 ifindex); -void ib_device_put(struct ib_device *device); /* RDMA device netlink */ void nldev_init(void); void nldev_exit(void); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 8872453e26c0..7421ec4883fb 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -37,54 +37,111 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/init.h> -#include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/security.h> #include <linux/notifier.h> +#include <linux/hashtable.h> #include <rdma/rdma_netlink.h> #include <rdma/ib_addr.h> #include <rdma/ib_cache.h> #include "core_priv.h" +#include "restrack.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("core kernel InfiniBand API"); MODULE_LICENSE("Dual BSD/GPL"); -struct ib_client_data { - struct list_head list; - struct ib_client *client; - void * data; - /* The device or client is going down. Do not call client or device - * callbacks other than remove(). */ - bool going_down; -}; - struct workqueue_struct *ib_comp_wq; struct workqueue_struct *ib_comp_unbound_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); -/* The device_list and client_list contain devices and clients after their - * registration has completed, and the devices and clients are removed - * during unregistration. */ -static LIST_HEAD(device_list); -static LIST_HEAD(client_list); +/* + * Each of the three rwsem locks (devices, clients, client_data) protects the + * xarray of the same name. Specifically it allows the caller to assert that + * the MARK will/will not be changing under the lock, and for devices and + * clients, that the value in the xarray is still a valid pointer. Change of + * the MARK is linked to the object state, so holding the lock and testing the + * MARK also asserts that the contained object is in a certain state. + * + * This is used to build a two stage register/unregister flow where objects + * can continue to be in the xarray even though they are still in progress to + * register/unregister. + * + * The xarray itself provides additional locking, and restartable iteration, + * which is also relied on. + * + * Locks should not be nested, with the exception of client_data, which is + * allowed to nest under the read side of the other two locks. + * + * The devices_rwsem also protects the device name list, any change or + * assignment of device name must also hold the write side to guarantee unique + * names. + */ /* - * device_mutex and lists_rwsem protect access to both device_list and - * client_list. device_mutex protects writer access by device and client - * registration / de-registration. lists_rwsem protects reader access to - * these lists. Iterators of these lists must lock it for read, while updates - * to the lists must be done with a write lock. A special case is when the - * device_mutex is locked. In this case locking the lists for read access is - * not necessary as the device_mutex implies it. + * devices contains devices that have had their names assigned. The + * devices may not be registered. Users that care about the registration + * status need to call ib_device_try_get() on the device to ensure it is + * registered, and keep it registered, for the required duration. * - * lists_rwsem also protects access to the client data list. */ -static DEFINE_MUTEX(device_mutex); -static DECLARE_RWSEM(lists_rwsem); +static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC); +static DECLARE_RWSEM(devices_rwsem); +#define DEVICE_REGISTERED XA_MARK_1 +static LIST_HEAD(client_list); +#define CLIENT_REGISTERED XA_MARK_1 +static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC); +static DECLARE_RWSEM(clients_rwsem); + +/* + * If client_data is registered then the corresponding client must also still + * be registered. + */ +#define CLIENT_DATA_REGISTERED XA_MARK_1 +/* + * xarray has this behavior where it won't iterate over NULL values stored in + * allocated arrays. So we need our own iterator to see all values stored in + * the array. This does the same thing as xa_for_each except that it also + * returns NULL valued entries if the array is allocating. Simplified to only + * work on simple xarrays. + */ +static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, + xa_mark_t filter) +{ + XA_STATE(xas, xa, *indexp); + void *entry; + + rcu_read_lock(); + do { + entry = xas_find_marked(&xas, ULONG_MAX, filter); + if (xa_is_zero(entry)) + break; + } while (xas_retry(&xas, entry)); + rcu_read_unlock(); + + if (entry) { + *indexp = xas.xa_index; + if (xa_is_zero(entry)) + return NULL; + return entry; + } + return XA_ERROR(-ENOENT); +} +#define xan_for_each_marked(xa, index, entry, filter) \ + for (index = 0, entry = xan_find_marked(xa, &(index), filter); \ + !xa_is_err(entry); \ + (index)++, entry = xan_find_marked(xa, &(index), filter)) + +/* RCU hash table mapping netdevice pointers to struct ib_port_data */ +static DEFINE_SPINLOCK(ndev_hash_lock); +static DECLARE_HASHTABLE(ndev_hash, 5); + +static void free_netdevs(struct ib_device *ib_dev); +static void ib_unregister_work(struct work_struct *work); +static void __ib_unregister_device(struct ib_device *device); static int ib_security_change(struct notifier_block *nb, unsigned long event, void *lsm_data); static void ib_policy_change_task(struct work_struct *work); @@ -94,6 +151,12 @@ static struct notifier_block ibdev_lsm_nb = { .notifier_call = ib_security_change, }; +/* Pointer to the RCU head at the start of the ib_port_data array */ +struct ib_port_data_rcu { + struct rcu_head rcu_head; + struct ib_port_data pdata[]; +}; + static int ib_device_check_mandatory(struct ib_device *device) { #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } @@ -121,30 +184,18 @@ static int ib_device_check_mandatory(struct ib_device *device) }; int i; + device->kverbs_provider = true; for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { if (!*(void **) ((void *) &device->ops + mandatory_table[i].offset)) { - dev_warn(&device->dev, - "Device is missing mandatory function %s\n", - mandatory_table[i].name); - return -EINVAL; + device->kverbs_provider = false; + break; } } return 0; } -static struct ib_device *__ib_device_get_by_index(u32 index) -{ - struct ib_device *device; - - list_for_each_entry(device, &device_list, core_list) - if (device->index == index) - return device; - - return NULL; -} - /* * Caller must perform ib_device_put() to return the device reference count * when ib_device_get_by_index() returns valid device pointer. @@ -153,48 +204,83 @@ struct ib_device *ib_device_get_by_index(u32 index) { struct ib_device *device; - down_read(&lists_rwsem); - device = __ib_device_get_by_index(index); + down_read(&devices_rwsem); + device = xa_load(&devices, index); if (device) { - /* Do not return a device if unregistration has started. */ - if (!refcount_inc_not_zero(&device->refcount)) + if (!ib_device_try_get(device)) device = NULL; } - up_read(&lists_rwsem); + up_read(&devices_rwsem); return device; } +/** + * ib_device_put - Release IB device reference + * @device: device whose reference to be released + * + * ib_device_put() releases reference to the IB device to allow it to be + * unregistered and eventually free. + */ void ib_device_put(struct ib_device *device) { if (refcount_dec_and_test(&device->refcount)) complete(&device->unreg_completion); } +EXPORT_SYMBOL(ib_device_put); static struct ib_device *__ib_device_get_by_name(const char *name) { struct ib_device *device; + unsigned long index; - list_for_each_entry(device, &device_list, core_list) + xa_for_each (&devices, index, device) if (!strcmp(name, dev_name(&device->dev))) return device; return NULL; } -int ib_device_rename(struct ib_device *ibdev, const char *name) +/** + * ib_device_get_by_name - Find an IB device by name + * @name: The name to look for + * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) + * + * Find and hold an ib_device by its name. The caller must call + * ib_device_put() on the returned pointer. + */ +struct ib_device *ib_device_get_by_name(const char *name, + enum rdma_driver_id driver_id) { struct ib_device *device; - int ret = 0; - if (!strcmp(name, dev_name(&ibdev->dev))) - return ret; + down_read(&devices_rwsem); + device = __ib_device_get_by_name(name); + if (device && driver_id != RDMA_DRIVER_UNKNOWN && + device->driver_id != driver_id) + device = NULL; - mutex_lock(&device_mutex); - list_for_each_entry(device, &device_list, core_list) { - if (!strcmp(name, dev_name(&device->dev))) { - ret = -EEXIST; - goto out; - } + if (device) { + if (!ib_device_try_get(device)) + device = NULL; + } + up_read(&devices_rwsem); + return device; +} +EXPORT_SYMBOL(ib_device_get_by_name); + +int ib_device_rename(struct ib_device *ibdev, const char *name) +{ + int ret; + + down_write(&devices_rwsem); + if (!strcmp(name, dev_name(&ibdev->dev))) { + ret = 0; + goto out; + } + + if (__ib_device_get_by_name(name)) { + ret = -EEXIST; + goto out; } ret = device_rename(&ibdev->dev, name); @@ -202,53 +288,60 @@ int ib_device_rename(struct ib_device *ibdev, const char *name) goto out; strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX); out: - mutex_unlock(&device_mutex); + up_write(&devices_rwsem); return ret; } static int alloc_name(struct ib_device *ibdev, const char *name) { - unsigned long *inuse; struct ib_device *device; + unsigned long index; + struct ida inuse; + int rc; int i; - inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL); - if (!inuse) - return -ENOMEM; - - list_for_each_entry(device, &device_list, core_list) { + lockdep_assert_held_exclusive(&devices_rwsem); + ida_init(&inuse); + xa_for_each (&devices, index, device) { char buf[IB_DEVICE_NAME_MAX]; if (sscanf(dev_name(&device->dev), name, &i) != 1) continue; - if (i < 0 || i >= PAGE_SIZE * 8) + if (i < 0 || i >= INT_MAX) continue; snprintf(buf, sizeof buf, name, i); - if (!strcmp(buf, dev_name(&device->dev))) - set_bit(i, inuse); + if (strcmp(buf, dev_name(&device->dev)) != 0) + continue; + + rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL); + if (rc < 0) + goto out; } - i = find_first_zero_bit(inuse, PAGE_SIZE * 8); - free_page((unsigned long) inuse); + rc = ida_alloc(&inuse, GFP_KERNEL); + if (rc < 0) + goto out; - return dev_set_name(&ibdev->dev, name, i); + rc = dev_set_name(&ibdev->dev, name, rc); +out: + ida_destroy(&inuse); + return rc; } static void ib_device_release(struct device *device) { struct ib_device *dev = container_of(device, struct ib_device, dev); - WARN_ON(dev->reg_state == IB_DEV_REGISTERED); - if (dev->reg_state == IB_DEV_UNREGISTERED) { - /* - * In IB_DEV_UNINITIALIZED state, cache or port table - * is not even created. Free cache and port table only when - * device reaches UNREGISTERED state. - */ - ib_cache_release_one(dev); - kfree(dev->port_immutable); - } - kfree(dev); + free_netdevs(dev); + WARN_ON(refcount_read(&dev->refcount)); + ib_cache_release_one(dev); + ib_security_release_port_pkey_list(dev); + xa_destroy(&dev->client_data); + if (dev->port_data) + kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, + pdata[0]), + rcu_head); + kfree_rcu(dev, rcu_head); } static int ib_device_uevent(struct device *device, @@ -271,7 +364,7 @@ static struct class ib_class = { }; /** - * ib_alloc_device - allocate an IB device struct + * _ib_alloc_device - allocate an IB device struct * @size:size of structure to allocate * * Low-level drivers should use ib_alloc_device() to allocate &struct @@ -280,7 +373,7 @@ static struct class ib_class = { * ib_dealloc_device() must be used to free structures allocated with * ib_alloc_device(). */ -struct ib_device *ib_alloc_device(size_t size) +struct ib_device *_ib_alloc_device(size_t size) { struct ib_device *device; @@ -291,24 +384,32 @@ struct ib_device *ib_alloc_device(size_t size) if (!device) return NULL; - rdma_restrack_init(&device->res); + if (rdma_restrack_init(device)) { + kfree(device); + return NULL; + } device->dev.class = &ib_class; + device->groups[0] = &ib_dev_attr_group; + device->dev.groups = device->groups; device_initialize(&device->dev); - dev_set_drvdata(&device->dev, device); - INIT_LIST_HEAD(&device->event_handler_list); spin_lock_init(&device->event_handler_lock); - rwlock_init(&device->client_data_lock); - INIT_LIST_HEAD(&device->client_data_list); + mutex_init(&device->unregistration_lock); + /* + * client_data needs to be alloc because we don't want our mark to be + * destroyed if the user stores NULL in the client data. + */ + xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); + init_rwsem(&device->client_data_rwsem); INIT_LIST_HEAD(&device->port_list); - refcount_set(&device->refcount, 1); init_completion(&device->unreg_completion); + INIT_WORK(&device->unregistration_work, ib_unregister_work); return device; } -EXPORT_SYMBOL(ib_alloc_device); +EXPORT_SYMBOL(_ib_alloc_device); /** * ib_dealloc_device - free an IB device struct @@ -318,32 +419,153 @@ EXPORT_SYMBOL(ib_alloc_device); */ void ib_dealloc_device(struct ib_device *device) { - WARN_ON(!list_empty(&device->client_data_list)); - WARN_ON(device->reg_state != IB_DEV_UNREGISTERED && - device->reg_state != IB_DEV_UNINITIALIZED); - rdma_restrack_clean(&device->res); + if (device->ops.dealloc_driver) + device->ops.dealloc_driver(device); + + /* + * ib_unregister_driver() requires all devices to remain in the xarray + * while their ops are callable. The last op we call is dealloc_driver + * above. This is needed to create a fence on op callbacks prior to + * allowing the driver module to unload. + */ + down_write(&devices_rwsem); + if (xa_load(&devices, device->index) == device) + xa_erase(&devices, device->index); + up_write(&devices_rwsem); + + /* Expedite releasing netdev references */ + free_netdevs(device); + + WARN_ON(!xa_empty(&device->client_data)); + WARN_ON(refcount_read(&device->refcount)); + rdma_restrack_clean(device); + /* Balances with device_initialize */ put_device(&device->dev); } EXPORT_SYMBOL(ib_dealloc_device); -static int add_client_context(struct ib_device *device, struct ib_client *client) +/* + * add_client_context() and remove_client_context() must be safe against + * parallel calls on the same device - registration/unregistration of both the + * device and client can be occurring in parallel. + * + * The routines need to be a fence, any caller must not return until the add + * or remove is fully completed. + */ +static int add_client_context(struct ib_device *device, + struct ib_client *client) { - struct ib_client_data *context; + int ret = 0; - context = kmalloc(sizeof(*context), GFP_KERNEL); - if (!context) - return -ENOMEM; + if (!device->kverbs_provider && !client->no_kverbs_req) + return 0; + + down_write(&device->client_data_rwsem); + /* + * Another caller to add_client_context got here first and has already + * completely initialized context. + */ + if (xa_get_mark(&device->client_data, client->client_id, + CLIENT_DATA_REGISTERED)) + goto out; + + ret = xa_err(xa_store(&device->client_data, client->client_id, NULL, + GFP_KERNEL)); + if (ret) + goto out; + downgrade_write(&device->client_data_rwsem); + if (client->add) + client->add(device); + + /* Readers shall not see a client until add has been completed */ + xa_set_mark(&device->client_data, client->client_id, + CLIENT_DATA_REGISTERED); + up_read(&device->client_data_rwsem); + return 0; + +out: + up_write(&device->client_data_rwsem); + return ret; +} + +static void remove_client_context(struct ib_device *device, + unsigned int client_id) +{ + struct ib_client *client; + void *client_data; + + down_write(&device->client_data_rwsem); + if (!xa_get_mark(&device->client_data, client_id, + CLIENT_DATA_REGISTERED)) { + up_write(&device->client_data_rwsem); + return; + } + client_data = xa_load(&device->client_data, client_id); + xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED); + client = xa_load(&clients, client_id); + downgrade_write(&device->client_data_rwsem); + + /* + * Notice we cannot be holding any exclusive locks when calling the + * remove callback as the remove callback can recurse back into any + * public functions in this module and thus try for any locks those + * functions take. + * + * For this reason clients and drivers should not call the + * unregistration functions will holdling any locks. + * + * It tempting to drop the client_data_rwsem too, but this is required + * to ensure that unregister_client does not return until all clients + * are completely unregistered, which is required to avoid module + * unloading races. + */ + if (client->remove) + client->remove(device, client_data); + + xa_erase(&device->client_data, client_id); + up_read(&device->client_data_rwsem); +} + +static int alloc_port_data(struct ib_device *device) +{ + struct ib_port_data_rcu *pdata_rcu; + unsigned int port; + + if (device->port_data) + return 0; + + /* This can only be called once the physical port range is defined */ + if (WARN_ON(!device->phys_port_cnt)) + return -EINVAL; - context->client = client; - context->data = NULL; - context->going_down = false; + /* + * device->port_data is indexed directly by the port number to make + * access to this data as efficient as possible. + * + * Therefore port_data is declared as a 1 based array with potential + * empty slots at the beginning. + */ + pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata, + rdma_end_port(device) + 1), + GFP_KERNEL); + if (!pdata_rcu) + return -ENOMEM; + /* + * The rcu_head is put in front of the port data array and the stored + * pointer is adjusted since we never need to see that member until + * kfree_rcu. + */ + device->port_data = pdata_rcu->pdata; - down_write(&lists_rwsem); - write_lock_irq(&device->client_data_lock); - list_add(&context->list, &device->client_data_list); - write_unlock_irq(&device->client_data_lock); - up_write(&lists_rwsem); + rdma_for_each_port (device, port) { + struct ib_port_data *pdata = &device->port_data[port]; + pdata->ib_dev = device; + spin_lock_init(&pdata->pkey_list_lock); + INIT_LIST_HEAD(&pdata->pkey_list); + spin_lock_init(&pdata->netdev_lock); + INIT_HLIST_NODE(&pdata->ndev_hash_link); + } return 0; } @@ -353,29 +575,20 @@ static int verify_immutable(const struct ib_device *dev, u8 port) rdma_max_mad_size(dev, port) != 0); } -static int read_port_immutable(struct ib_device *device) +static int setup_port_data(struct ib_device *device) { + unsigned int port; int ret; - u8 start_port = rdma_start_port(device); - u8 end_port = rdma_end_port(device); - u8 port; - /** - * device->port_immutable is indexed directly by the port number to make - * access to this data as efficient as possible. - * - * Therefore port_immutable is declared as a 1 based array with - * potential empty slots at the beginning. - */ - device->port_immutable = kcalloc(end_port + 1, - sizeof(*device->port_immutable), - GFP_KERNEL); - if (!device->port_immutable) - return -ENOMEM; + ret = alloc_port_data(device); + if (ret) + return ret; - for (port = start_port; port <= end_port; ++port) { - ret = device->ops.get_port_immutable( - device, port, &device->port_immutable[port]); + rdma_for_each_port (device, port) { + struct ib_port_data *pdata = &device->port_data[port]; + + ret = device->ops.get_port_immutable(device, port, + &pdata->immutable); if (ret) return ret; @@ -394,39 +607,16 @@ void ib_get_device_fw_str(struct ib_device *dev, char *str) } EXPORT_SYMBOL(ib_get_device_fw_str); -static int setup_port_pkey_list(struct ib_device *device) -{ - int i; - - /** - * device->port_pkey_list is indexed directly by the port number, - * Therefore it is declared as a 1 based array with potential empty - * slots at the beginning. - */ - device->port_pkey_list = kcalloc(rdma_end_port(device) + 1, - sizeof(*device->port_pkey_list), - GFP_KERNEL); - - if (!device->port_pkey_list) - return -ENOMEM; - - for (i = 0; i < (rdma_end_port(device) + 1); i++) { - spin_lock_init(&device->port_pkey_list[i].list_lock); - INIT_LIST_HEAD(&device->port_pkey_list[i].pkey_list); - } - - return 0; -} - static void ib_policy_change_task(struct work_struct *work) { struct ib_device *dev; + unsigned long index; - down_read(&lists_rwsem); - list_for_each_entry(dev, &device_list, core_list) { - int i; + down_read(&devices_rwsem); + xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { + unsigned int i; - for (i = rdma_start_port(dev); i <= rdma_end_port(dev); i++) { + rdma_for_each_port (dev, i) { u64 sp; int ret = ib_get_cached_subnet_prefix(dev, i, @@ -439,7 +629,7 @@ static void ib_policy_change_task(struct work_struct *work) ib_security_cache_change(dev, i, sp); } } - up_read(&lists_rwsem); + up_read(&devices_rwsem); } static int ib_security_change(struct notifier_block *nb, unsigned long event, @@ -449,32 +639,43 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event, return NOTIFY_DONE; schedule_work(&ib_policy_change_work); + ib_mad_agent_security_change(); return NOTIFY_OK; } -/** - * __dev_new_index - allocate an device index - * - * Returns a suitable unique value for a new device interface - * number. It assumes that there are less than 2^32-1 ib devices - * will be present in the system. +/* + * Assign the unique string device name and the unique device index. This is + * undone by ib_dealloc_device. */ -static u32 __dev_new_index(void) +static int assign_name(struct ib_device *device, const char *name) { - /* - * The device index to allow stable naming. - * Similar to struct net -> ifindex. - */ - static u32 index; + static u32 last_id; + int ret; - for (;;) { - if (!(++index)) - index = 1; + down_write(&devices_rwsem); + /* Assign a unique name to the device */ + if (strchr(name, '%')) + ret = alloc_name(device, name); + else + ret = dev_set_name(&device->dev, name); + if (ret) + goto out; - if (!__ib_device_get_by_index(index)) - return index; + if (__ib_device_get_by_name(dev_name(&device->dev))) { + ret = -ENFILE; + goto out; } + strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); + + ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b, + &last_id, GFP_KERNEL); + if (ret > 0) + ret = 0; + +out: + up_write(&devices_rwsem); + return ret; } static void setup_dma_device(struct ib_device *device) @@ -512,27 +713,25 @@ static void setup_dma_device(struct ib_device *device) } } -static void cleanup_device(struct ib_device *device) -{ - ib_cache_cleanup_one(device); - ib_cache_release_one(device); - kfree(device->port_pkey_list); - kfree(device->port_immutable); -} - +/* + * setup_device() allocates memory and sets up data that requires calling the + * device ops, this is the only reason these actions are not done during + * ib_alloc_device. It is undone by ib_dealloc_device(). + */ static int setup_device(struct ib_device *device) { struct ib_udata uhw = {.outlen = 0, .inlen = 0}; int ret; + setup_dma_device(device); + ret = ib_device_check_mandatory(device); if (ret) return ret; - ret = read_port_immutable(device); + ret = setup_port_data(device); if (ret) { - dev_warn(&device->dev, - "Couldn't create per port immutable data\n"); + dev_warn(&device->dev, "Couldn't create per-port data\n"); return ret; } @@ -541,27 +740,76 @@ static int setup_device(struct ib_device *device) if (ret) { dev_warn(&device->dev, "Couldn't query the device attributes\n"); - goto port_cleanup; + return ret; } - ret = setup_port_pkey_list(device); - if (ret) { - dev_warn(&device->dev, "Couldn't create per port_pkey_list\n"); - goto port_cleanup; + return 0; +} + +static void disable_device(struct ib_device *device) +{ + struct ib_client *client; + + WARN_ON(!refcount_read(&device->refcount)); + + down_write(&devices_rwsem); + xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); + up_write(&devices_rwsem); + + down_read(&clients_rwsem); + list_for_each_entry_reverse(client, &client_list, list) + remove_client_context(device, client->client_id); + up_read(&clients_rwsem); + + /* Pairs with refcount_set in enable_device */ + ib_device_put(device); + wait_for_completion(&device->unreg_completion); + + /* Expedite removing unregistered pointers from the hash table */ + free_netdevs(device); +} + +/* + * An enabled device is visible to all clients and to all the public facing + * APIs that return a device pointer. This always returns with a new get, even + * if it fails. + */ +static int enable_device_and_get(struct ib_device *device) +{ + struct ib_client *client; + unsigned long index; + int ret = 0; + + /* + * One ref belongs to the xa and the other belongs to this + * thread. This is needed to guard against parallel unregistration. + */ + refcount_set(&device->refcount, 2); + down_write(&devices_rwsem); + xa_set_mark(&devices, device->index, DEVICE_REGISTERED); + + /* + * By using downgrade_write() we ensure that no other thread can clear + * DEVICE_REGISTERED while we are completing the client setup. + */ + downgrade_write(&devices_rwsem); + + if (device->ops.enable_driver) { + ret = device->ops.enable_driver(device); + if (ret) + goto out; } - ret = ib_cache_setup_one(device); - if (ret) { - dev_warn(&device->dev, - "Couldn't set up InfiniBand P_Key/GID cache\n"); - goto pkey_cleanup; + down_read(&clients_rwsem); + xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { + ret = add_client_context(device, client); + if (ret) + break; } - return 0; + up_read(&clients_rwsem); -pkey_cleanup: - kfree(device->port_pkey_list); -port_cleanup: - kfree(device->port_immutable); +out: + up_read(&devices_rwsem); return ret; } @@ -573,132 +821,254 @@ port_cleanup: * devices with the IB core. All registered clients will receive a * callback for each device that is added. @device must be allocated * with ib_alloc_device(). + * + * If the driver uses ops.dealloc_driver and calls any ib_unregister_device() + * asynchronously then the device pointer may become freed as soon as this + * function returns. */ -int ib_register_device(struct ib_device *device, const char *name, - int (*port_callback)(struct ib_device *, u8, - struct kobject *)) +int ib_register_device(struct ib_device *device, const char *name) { int ret; - struct ib_client *client; - - setup_dma_device(device); - mutex_lock(&device_mutex); - - if (strchr(name, '%')) { - ret = alloc_name(device, name); - if (ret) - goto out; - } else { - ret = dev_set_name(&device->dev, name); - if (ret) - goto out; - } - if (__ib_device_get_by_name(dev_name(&device->dev))) { - ret = -ENFILE; - goto out; - } - strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); + ret = assign_name(device, name); + if (ret) + return ret; ret = setup_device(device); if (ret) - goto out; - - device->index = __dev_new_index(); + return ret; - ret = ib_device_register_rdmacg(device); + ret = ib_cache_setup_one(device); if (ret) { dev_warn(&device->dev, - "Couldn't register device with rdma cgroup\n"); - goto dev_cleanup; + "Couldn't set up InfiniBand P_Key/GID cache\n"); + return ret; } - ret = ib_device_register_sysfs(device, port_callback); + ib_device_register_rdmacg(device); + + ret = device_add(&device->dev); + if (ret) + goto cg_cleanup; + + ret = ib_device_register_sysfs(device); if (ret) { dev_warn(&device->dev, "Couldn't register device with driver model\n"); - goto cg_cleanup; + goto dev_cleanup; } - device->reg_state = IB_DEV_REGISTERED; + ret = enable_device_and_get(device); + if (ret) { + void (*dealloc_fn)(struct ib_device *); - list_for_each_entry(client, &client_list, list) - if (!add_client_context(device, client) && client->add) - client->add(device); + /* + * If we hit this error flow then we don't want to + * automatically dealloc the device since the caller is + * expected to call ib_dealloc_device() after + * ib_register_device() fails. This is tricky due to the + * possibility for a parallel unregistration along with this + * error flow. Since we have a refcount here we know any + * parallel flow is stopped in disable_device and will see the + * NULL pointers, causing the responsibility to + * ib_dealloc_device() to revert back to this thread. + */ + dealloc_fn = device->ops.dealloc_driver; + device->ops.dealloc_driver = NULL; + ib_device_put(device); + __ib_unregister_device(device); + device->ops.dealloc_driver = dealloc_fn; + return ret; + } + ib_device_put(device); - down_write(&lists_rwsem); - list_add_tail(&device->core_list, &device_list); - up_write(&lists_rwsem); - mutex_unlock(&device_mutex); return 0; +dev_cleanup: + device_del(&device->dev); cg_cleanup: ib_device_unregister_rdmacg(device); -dev_cleanup: - cleanup_device(device); -out: - mutex_unlock(&device_mutex); + ib_cache_cleanup_one(device); return ret; } EXPORT_SYMBOL(ib_register_device); +/* Callers must hold a get on the device. */ +static void __ib_unregister_device(struct ib_device *ib_dev) +{ + /* + * We have a registration lock so that all the calls to unregister are + * fully fenced, once any unregister returns the device is truely + * unregistered even if multiple callers are unregistering it at the + * same time. This also interacts with the registration flow and + * provides sane semantics if register and unregister are racing. + */ + mutex_lock(&ib_dev->unregistration_lock); + if (!refcount_read(&ib_dev->refcount)) + goto out; + + disable_device(ib_dev); + ib_device_unregister_sysfs(ib_dev); + device_del(&ib_dev->dev); + ib_device_unregister_rdmacg(ib_dev); + ib_cache_cleanup_one(ib_dev); + + /* + * Drivers using the new flow may not call ib_dealloc_device except + * in error unwind prior to registration success. + */ + if (ib_dev->ops.dealloc_driver) { + WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1); + ib_dealloc_device(ib_dev); + } +out: + mutex_unlock(&ib_dev->unregistration_lock); +} + /** * ib_unregister_device - Unregister an IB device - * @device:Device to unregister + * @device: The device to unregister * * Unregister an IB device. All clients will receive a remove callback. + * + * Callers should call this routine only once, and protect against races with + * registration. Typically it should only be called as part of a remove + * callback in an implementation of driver core's struct device_driver and + * related. + * + * If ops.dealloc_driver is used then ib_dev will be freed upon return from + * this function. */ -void ib_unregister_device(struct ib_device *device) +void ib_unregister_device(struct ib_device *ib_dev) { - struct ib_client_data *context, *tmp; - unsigned long flags; + get_device(&ib_dev->dev); + __ib_unregister_device(ib_dev); + put_device(&ib_dev->dev); +} +EXPORT_SYMBOL(ib_unregister_device); - /* - * Wait for all netlink command callers to finish working on the - * device. - */ - ib_device_put(device); - wait_for_completion(&device->unreg_completion); +/** + * ib_unregister_device_and_put - Unregister a device while holding a 'get' + * device: The device to unregister + * + * This is the same as ib_unregister_device(), except it includes an internal + * ib_device_put() that should match a 'get' obtained by the caller. + * + * It is safe to call this routine concurrently from multiple threads while + * holding the 'get'. When the function returns the device is fully + * unregistered. + * + * Drivers using this flow MUST use the driver_unregister callback to clean up + * their resources associated with the device and dealloc it. + */ +void ib_unregister_device_and_put(struct ib_device *ib_dev) +{ + WARN_ON(!ib_dev->ops.dealloc_driver); + get_device(&ib_dev->dev); + ib_device_put(ib_dev); + __ib_unregister_device(ib_dev); + put_device(&ib_dev->dev); +} +EXPORT_SYMBOL(ib_unregister_device_and_put); - mutex_lock(&device_mutex); +/** + * ib_unregister_driver - Unregister all IB devices for a driver + * @driver_id: The driver to unregister + * + * This implements a fence for device unregistration. It only returns once all + * devices associated with the driver_id have fully completed their + * unregistration and returned from ib_unregister_device*(). + * + * If device's are not yet unregistered it goes ahead and starts unregistering + * them. + * + * This does not block creation of new devices with the given driver_id, that + * is the responsibility of the caller. + */ +void ib_unregister_driver(enum rdma_driver_id driver_id) +{ + struct ib_device *ib_dev; + unsigned long index; + + down_read(&devices_rwsem); + xa_for_each (&devices, index, ib_dev) { + if (ib_dev->driver_id != driver_id) + continue; - down_write(&lists_rwsem); - list_del(&device->core_list); - write_lock_irq(&device->client_data_lock); - list_for_each_entry(context, &device->client_data_list, list) - context->going_down = true; - write_unlock_irq(&device->client_data_lock); - downgrade_write(&lists_rwsem); + get_device(&ib_dev->dev); + up_read(&devices_rwsem); - list_for_each_entry(context, &device->client_data_list, list) { - if (context->client->remove) - context->client->remove(device, context->data); + WARN_ON(!ib_dev->ops.dealloc_driver); + __ib_unregister_device(ib_dev); + + put_device(&ib_dev->dev); + down_read(&devices_rwsem); } - up_read(&lists_rwsem); + up_read(&devices_rwsem); +} +EXPORT_SYMBOL(ib_unregister_driver); - ib_device_unregister_sysfs(device); - ib_device_unregister_rdmacg(device); +static void ib_unregister_work(struct work_struct *work) +{ + struct ib_device *ib_dev = + container_of(work, struct ib_device, unregistration_work); - mutex_unlock(&device_mutex); + __ib_unregister_device(ib_dev); + put_device(&ib_dev->dev); +} - ib_cache_cleanup_one(device); +/** + * ib_unregister_device_queued - Unregister a device using a work queue + * device: The device to unregister + * + * This schedules an asynchronous unregistration using a WQ for the device. A + * driver should use this to avoid holding locks while doing unregistration, + * such as holding the RTNL lock. + * + * Drivers using this API must use ib_unregister_driver before module unload + * to ensure that all scheduled unregistrations have completed. + */ +void ib_unregister_device_queued(struct ib_device *ib_dev) +{ + WARN_ON(!refcount_read(&ib_dev->refcount)); + WARN_ON(!ib_dev->ops.dealloc_driver); + get_device(&ib_dev->dev); + if (!queue_work(system_unbound_wq, &ib_dev->unregistration_work)) + put_device(&ib_dev->dev); +} +EXPORT_SYMBOL(ib_unregister_device_queued); - ib_security_destroy_port_pkey_list(device); - kfree(device->port_pkey_list); +static int assign_client_id(struct ib_client *client) +{ + int ret; - down_write(&lists_rwsem); - write_lock_irqsave(&device->client_data_lock, flags); - list_for_each_entry_safe(context, tmp, &device->client_data_list, - list) { - list_del(&context->list); - kfree(context); + down_write(&clients_rwsem); + /* + * The add/remove callbacks must be called in FIFO/LIFO order. To + * achieve this we assign client_ids so they are sorted in + * registration order, and retain a linked list we can reverse iterate + * to get the LIFO order. The extra linked list can go away if xarray + * learns to reverse iterate. + */ + if (list_empty(&client_list)) { + client->client_id = 0; + } else { + struct ib_client *last; + + last = list_last_entry(&client_list, struct ib_client, list); + client->client_id = last->client_id + 1; } - write_unlock_irqrestore(&device->client_data_lock, flags); - up_write(&lists_rwsem); + ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL); + if (ret) + goto out; + + xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED); + list_add_tail(&client->list, &client_list); - device->reg_state = IB_DEV_UNREGISTERED; +out: + up_write(&clients_rwsem); + return ret; } -EXPORT_SYMBOL(ib_unregister_device); /** * ib_register_client - Register an IB client @@ -716,19 +1086,23 @@ EXPORT_SYMBOL(ib_unregister_device); int ib_register_client(struct ib_client *client) { struct ib_device *device; + unsigned long index; + int ret; - mutex_lock(&device_mutex); - - list_for_each_entry(device, &device_list, core_list) - if (!add_client_context(device, client) && client->add) - client->add(device); - - down_write(&lists_rwsem); - list_add_tail(&client->list, &client_list); - up_write(&lists_rwsem); - - mutex_unlock(&device_mutex); + ret = assign_client_id(client); + if (ret) + return ret; + down_read(&devices_rwsem); + xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) { + ret = add_client_context(device, client); + if (ret) { + up_read(&devices_rwsem); + ib_unregister_client(client); + return ret; + } + } + up_read(&devices_rwsem); return 0; } EXPORT_SYMBOL(ib_register_client); @@ -740,108 +1114,56 @@ EXPORT_SYMBOL(ib_register_client); * Upper level users use ib_unregister_client() to remove their client * registration. When ib_unregister_client() is called, the client * will receive a remove callback for each IB device still registered. + * + * This is a full fence, once it returns no client callbacks will be called, + * or are running in another thread. */ void ib_unregister_client(struct ib_client *client) { - struct ib_client_data *context; struct ib_device *device; + unsigned long index; - mutex_lock(&device_mutex); + down_write(&clients_rwsem); + xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); + up_write(&clients_rwsem); + /* + * Every device still known must be serialized to make sure we are + * done with the client callbacks before we return. + */ + down_read(&devices_rwsem); + xa_for_each (&devices, index, device) + remove_client_context(device, client->client_id); + up_read(&devices_rwsem); - down_write(&lists_rwsem); + down_write(&clients_rwsem); list_del(&client->list); - up_write(&lists_rwsem); - - list_for_each_entry(device, &device_list, core_list) { - struct ib_client_data *found_context = NULL; - - down_write(&lists_rwsem); - write_lock_irq(&device->client_data_lock); - list_for_each_entry(context, &device->client_data_list, list) - if (context->client == client) { - context->going_down = true; - found_context = context; - break; - } - write_unlock_irq(&device->client_data_lock); - up_write(&lists_rwsem); - - if (client->remove) - client->remove(device, found_context ? - found_context->data : NULL); - - if (!found_context) { - dev_warn(&device->dev, - "No client context found for %s\n", - client->name); - continue; - } - - down_write(&lists_rwsem); - write_lock_irq(&device->client_data_lock); - list_del(&found_context->list); - write_unlock_irq(&device->client_data_lock); - up_write(&lists_rwsem); - kfree(found_context); - } - - mutex_unlock(&device_mutex); + xa_erase(&clients, client->client_id); + up_write(&clients_rwsem); } EXPORT_SYMBOL(ib_unregister_client); /** - * ib_get_client_data - Get IB client context - * @device:Device to get context for - * @client:Client to get context for - * - * ib_get_client_data() returns client context set with - * ib_set_client_data(). - */ -void *ib_get_client_data(struct ib_device *device, struct ib_client *client) -{ - struct ib_client_data *context; - void *ret = NULL; - unsigned long flags; - - read_lock_irqsave(&device->client_data_lock, flags); - list_for_each_entry(context, &device->client_data_list, list) - if (context->client == client) { - ret = context->data; - break; - } - read_unlock_irqrestore(&device->client_data_lock, flags); - - return ret; -} -EXPORT_SYMBOL(ib_get_client_data); - -/** * ib_set_client_data - Set IB client context * @device:Device to set context for * @client:Client to set context for * @data:Context to set * - * ib_set_client_data() sets client context that can be retrieved with - * ib_get_client_data(). + * ib_set_client_data() sets client context data that can be retrieved with + * ib_get_client_data(). This can only be called while the client is + * registered to the device, once the ib_client remove() callback returns this + * cannot be called. */ void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data) { - struct ib_client_data *context; - unsigned long flags; - - write_lock_irqsave(&device->client_data_lock, flags); - list_for_each_entry(context, &device->client_data_list, list) - if (context->client == client) { - context->data = data; - goto out; - } + void *rc; - dev_warn(&device->dev, "No client context found for %s\n", - client->name); + if (WARN_ON(IS_ERR(data))) + data = NULL; -out: - write_unlock_irqrestore(&device->client_data_lock, flags); + rc = xa_store(&device->client_data, client->client_id, data, + GFP_KERNEL); + WARN_ON(xa_is_err(rc)); } EXPORT_SYMBOL(ib_set_client_data); @@ -940,6 +1262,185 @@ int ib_query_port(struct ib_device *device, } EXPORT_SYMBOL(ib_query_port); +static void add_ndev_hash(struct ib_port_data *pdata) +{ + unsigned long flags; + + might_sleep(); + + spin_lock_irqsave(&ndev_hash_lock, flags); + if (hash_hashed(&pdata->ndev_hash_link)) { + hash_del_rcu(&pdata->ndev_hash_link); + spin_unlock_irqrestore(&ndev_hash_lock, flags); + /* + * We cannot do hash_add_rcu after a hash_del_rcu until the + * grace period + */ + synchronize_rcu(); + spin_lock_irqsave(&ndev_hash_lock, flags); + } + if (pdata->netdev) + hash_add_rcu(ndev_hash, &pdata->ndev_hash_link, + (uintptr_t)pdata->netdev); + spin_unlock_irqrestore(&ndev_hash_lock, flags); +} + +/** + * ib_device_set_netdev - Associate the ib_dev with an underlying net_device + * @ib_dev: Device to modify + * @ndev: net_device to affiliate, may be NULL + * @port: IB port the net_device is connected to + * + * Drivers should use this to link the ib_device to a netdev so the netdev + * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be + * affiliated with any port. + * + * The caller must ensure that the given ndev is not unregistered or + * unregistering, and that either the ib_device is unregistered or + * ib_device_set_netdev() is called with NULL when the ndev sends a + * NETDEV_UNREGISTER event. + */ +int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, + unsigned int port) +{ + struct net_device *old_ndev; + struct ib_port_data *pdata; + unsigned long flags; + int ret; + + /* + * Drivers wish to call this before ib_register_driver, so we have to + * setup the port data early. + */ + ret = alloc_port_data(ib_dev); + if (ret) + return ret; + + if (!rdma_is_port_valid(ib_dev, port)) + return -EINVAL; + + pdata = &ib_dev->port_data[port]; + spin_lock_irqsave(&pdata->netdev_lock, flags); + old_ndev = rcu_dereference_protected( + pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); + if (old_ndev == ndev) { + spin_unlock_irqrestore(&pdata->netdev_lock, flags); + return 0; + } + + if (ndev) + dev_hold(ndev); + rcu_assign_pointer(pdata->netdev, ndev); + spin_unlock_irqrestore(&pdata->netdev_lock, flags); + + add_ndev_hash(pdata); + if (old_ndev) + dev_put(old_ndev); + + return 0; +} +EXPORT_SYMBOL(ib_device_set_netdev); + +static void free_netdevs(struct ib_device *ib_dev) +{ + unsigned long flags; + unsigned int port; + + rdma_for_each_port (ib_dev, port) { + struct ib_port_data *pdata = &ib_dev->port_data[port]; + struct net_device *ndev; + + spin_lock_irqsave(&pdata->netdev_lock, flags); + ndev = rcu_dereference_protected( + pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); + if (ndev) { + spin_lock(&ndev_hash_lock); + hash_del_rcu(&pdata->ndev_hash_link); + spin_unlock(&ndev_hash_lock); + + /* + * If this is the last dev_put there is still a + * synchronize_rcu before the netdev is kfreed, so we + * can continue to rely on unlocked pointer + * comparisons after the put + */ + rcu_assign_pointer(pdata->netdev, NULL); + dev_put(ndev); + } + spin_unlock_irqrestore(&pdata->netdev_lock, flags); + } +} + +struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, + unsigned int port) +{ + struct ib_port_data *pdata; + struct net_device *res; + + if (!rdma_is_port_valid(ib_dev, port)) + return NULL; + + pdata = &ib_dev->port_data[port]; + + /* + * New drivers should use ib_device_set_netdev() not the legacy + * get_netdev(). + */ + if (ib_dev->ops.get_netdev) + res = ib_dev->ops.get_netdev(ib_dev, port); + else { + spin_lock(&pdata->netdev_lock); + res = rcu_dereference_protected( + pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); + if (res) + dev_hold(res); + spin_unlock(&pdata->netdev_lock); + } + + /* + * If we are starting to unregister expedite things by preventing + * propagation of an unregistering netdev. + */ + if (res && res->reg_state != NETREG_REGISTERED) { + dev_put(res); + return NULL; + } + + return res; +} + +/** + * ib_device_get_by_netdev - Find an IB device associated with a netdev + * @ndev: netdev to locate + * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) + * + * Find and hold an ib_device that is associated with a netdev via + * ib_device_set_netdev(). The caller must call ib_device_put() on the + * returned pointer. + */ +struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, + enum rdma_driver_id driver_id) +{ + struct ib_device *res = NULL; + struct ib_port_data *cur; + + rcu_read_lock(); + hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link, + (uintptr_t)ndev) { + if (rcu_access_pointer(cur->netdev) == ndev && + (driver_id == RDMA_DRIVER_UNKNOWN || + cur->ib_dev->driver_id == driver_id) && + ib_device_try_get(cur->ib_dev)) { + res = cur->ib_dev; + break; + } + } + rcu_read_unlock(); + + return res; +} +EXPORT_SYMBOL(ib_device_get_by_netdev); + /** * ib_enum_roce_netdev - enumerate all RoCE ports * @ib_dev : IB device we want to query @@ -958,21 +1459,12 @@ void ib_enum_roce_netdev(struct ib_device *ib_dev, roce_netdev_callback cb, void *cookie) { - u8 port; + unsigned int port; - for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev); - port++) + rdma_for_each_port (ib_dev, port) if (rdma_protocol_roce(ib_dev, port)) { - struct net_device *idev = NULL; - - if (ib_dev->ops.get_netdev) - idev = ib_dev->ops.get_netdev(ib_dev, port); - - if (idev && - idev->reg_state >= NETREG_UNREGISTERED) { - dev_put(idev); - idev = NULL; - } + struct net_device *idev = + ib_device_get_netdev(ib_dev, port); if (filter(ib_dev, port, idev, filter_cookie)) cb(ib_dev, port, idev, cookie); @@ -999,11 +1491,12 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, void *cookie) { struct ib_device *dev; + unsigned long index; - down_read(&lists_rwsem); - list_for_each_entry(dev, &device_list, core_list) + down_read(&devices_rwsem); + xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); - up_read(&lists_rwsem); + up_read(&devices_rwsem); } /** @@ -1015,19 +1508,19 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, struct netlink_callback *cb) { + unsigned long index; struct ib_device *dev; unsigned int idx = 0; int ret = 0; - down_read(&lists_rwsem); - list_for_each_entry(dev, &device_list, core_list) { + down_read(&devices_rwsem); + xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { ret = nldev_cb(dev, skb, cb, idx); if (ret) break; idx++; } - - up_read(&lists_rwsem); + up_read(&devices_rwsem); return ret; } @@ -1114,13 +1607,15 @@ int ib_find_gid(struct ib_device *device, union ib_gid *gid, u8 *port_num, u16 *index) { union ib_gid tmp_gid; - int ret, port, i; + unsigned int port; + int ret, i; - for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) { + rdma_for_each_port (device, port) { if (!rdma_protocol_ib(device, port)) continue; - for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) { + for (i = 0; i < device->port_data[port].immutable.gid_tbl_len; + ++i) { ret = rdma_query_gid(device, port, i, &tmp_gid); if (ret) return ret; @@ -1152,7 +1647,8 @@ int ib_find_pkey(struct ib_device *device, u16 tmp_pkey; int partial_ix = -1; - for (i = 0; i < device->port_immutable[port_num].pkey_tbl_len; ++i) { + for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len; + ++i) { ret = ib_query_pkey(device, port_num, i, &tmp_pkey); if (ret) return ret; @@ -1185,6 +1681,7 @@ EXPORT_SYMBOL(ib_find_pkey); * @gid: A GID that the net_dev uses to communicate. * @addr: Contains the IP address that the request specified as its * destination. + * */ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port, @@ -1193,29 +1690,30 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, const struct sockaddr *addr) { struct net_device *net_dev = NULL; - struct ib_client_data *context; + unsigned long index; + void *client_data; if (!rdma_protocol_ib(dev, port)) return NULL; - down_read(&lists_rwsem); - - list_for_each_entry(context, &dev->client_data_list, list) { - struct ib_client *client = context->client; + /* + * Holding the read side guarantees that the client will not become + * unregistered while we are calling get_net_dev_by_params() + */ + down_read(&dev->client_data_rwsem); + xan_for_each_marked (&dev->client_data, index, client_data, + CLIENT_DATA_REGISTERED) { + struct ib_client *client = xa_load(&clients, index); - if (context->going_down) + if (!client || !client->get_net_dev_by_params) continue; - if (client->get_net_dev_by_params) { - net_dev = client->get_net_dev_by_params(dev, port, pkey, - gid, addr, - context->data); - if (net_dev) - break; - } + net_dev = client->get_net_dev_by_params(dev, port, pkey, gid, + addr, client_data); + if (net_dev) + break; } - - up_read(&lists_rwsem); + up_read(&dev->client_data_rwsem); return net_dev; } @@ -1231,6 +1729,8 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) (ptr)->name = ops->name; \ } while (0) +#define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name) + SET_DEVICE_OP(dev_ops, add_gid); SET_DEVICE_OP(dev_ops, advise_mr); SET_DEVICE_OP(dev_ops, alloc_dm); @@ -1254,6 +1754,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, create_srq); SET_DEVICE_OP(dev_ops, create_wq); SET_DEVICE_OP(dev_ops, dealloc_dm); + SET_DEVICE_OP(dev_ops, dealloc_driver); SET_DEVICE_OP(dev_ops, dealloc_fmr); SET_DEVICE_OP(dev_ops, dealloc_mw); SET_DEVICE_OP(dev_ops, dealloc_pd); @@ -1274,6 +1775,8 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, disassociate_ucontext); SET_DEVICE_OP(dev_ops, drain_rq); SET_DEVICE_OP(dev_ops, drain_sq); + SET_DEVICE_OP(dev_ops, enable_driver); + SET_DEVICE_OP(dev_ops, fill_res_entry); SET_DEVICE_OP(dev_ops, get_dev_fw_str); SET_DEVICE_OP(dev_ops, get_dma_mr); SET_DEVICE_OP(dev_ops, get_hw_stats); @@ -1283,6 +1786,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, get_vector_affinity); SET_DEVICE_OP(dev_ops, get_vf_config); SET_DEVICE_OP(dev_ops, get_vf_stats); + SET_DEVICE_OP(dev_ops, init_port); SET_DEVICE_OP(dev_ops, map_mr_sg); SET_DEVICE_OP(dev_ops, map_phys_fmr); SET_DEVICE_OP(dev_ops, mmap); @@ -1318,6 +1822,9 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, set_vf_guid); SET_DEVICE_OP(dev_ops, set_vf_link_state); SET_DEVICE_OP(dev_ops, unmap_fmr); + + SET_OBJ_SIZE(dev_ops, ib_pd); + SET_OBJ_SIZE(dev_ops, ib_ucontext); } EXPORT_SYMBOL(ib_set_device_ops); @@ -1436,6 +1943,9 @@ static void __exit ib_core_cleanup(void) destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. */ destroy_workqueue(ib_wq); + flush_workqueue(system_unbound_wq); + WARN_ON(!xa_empty(&clients)); + WARN_ON(!xa_empty(&devices)); } MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 476abc74178e..732637c913d9 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -87,7 +87,8 @@ static struct rdma_nl_cbs iwcm_nl_cb_table[RDMA_NL_IWPM_NUM_OPS] = { [RDMA_NL_IWPM_REMOTE_INFO] = {.dump = iwpm_remote_info_cb}, [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb}, [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb}, - [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb} + [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb}, + [RDMA_NL_IWPM_HELLO] = {.dump = iwpm_hello_cb} }; static struct workqueue_struct *iwcm_wq; @@ -504,7 +505,7 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active) { const char *devname = dev_name(&cm_id->device->dev); const char *ifname = cm_id->device->iwcm->ifname; - struct iwpm_dev_data pm_reg_msg; + struct iwpm_dev_data pm_reg_msg = {}; struct iwpm_sa_data pm_msg; int status; @@ -515,8 +516,8 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active) cm_id->m_local_addr = cm_id->local_addr; cm_id->m_remote_addr = cm_id->remote_addr; - strncpy(pm_reg_msg.dev_name, devname, sizeof(pm_reg_msg.dev_name)); - strncpy(pm_reg_msg.if_name, ifname, sizeof(pm_reg_msg.if_name)); + strcpy(pm_reg_msg.dev_name, devname); + strcpy(pm_reg_msg.if_name, ifname); if (iwpm_register_pid(&pm_reg_msg, RDMA_NL_IWCM) || !iwpm_valid_pid()) @@ -525,6 +526,8 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active) cm_id->mapped = true; pm_msg.loc_addr = cm_id->local_addr; pm_msg.rem_addr = cm_id->remote_addr; + pm_msg.flags = (cm_id->device->iwcm->driver_flags & IW_F_NO_PORT_MAP) ? + IWPM_FLAGS_NO_PORT_MAP : 0; if (active) status = iwpm_add_and_query_mapping(&pm_msg, RDMA_NL_IWCM); @@ -543,7 +546,7 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active) return iwpm_create_mapinfo(&cm_id->local_addr, &cm_id->m_local_addr, - RDMA_NL_IWCM); + RDMA_NL_IWCM, pm_msg.flags); } /* diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c index 8861c052155a..2452b0ddcf0d 100644 --- a/drivers/infiniband/core/iwpm_msg.c +++ b/drivers/infiniband/core/iwpm_msg.c @@ -34,18 +34,25 @@ #include "iwpm_util.h" static const char iwpm_ulib_name[IWPM_ULIBNAME_SIZE] = "iWarpPortMapperUser"; -static int iwpm_ulib_version = 3; +u16 iwpm_ulib_version = IWPM_UABI_VERSION_MIN; static int iwpm_user_pid = IWPM_PID_UNDEFINED; static atomic_t echo_nlmsg_seq; +/** + * iwpm_valid_pid - Check if the userspace iwarp port mapper pid is valid + * + * Returns true if the pid is greater than zero, otherwise returns false + */ int iwpm_valid_pid(void) { return iwpm_user_pid > 0; } -/* - * iwpm_register_pid - Send a netlink query to user space - * for the iwarp port mapper pid +/** + * iwpm_register_pid - Send a netlink query to userspace + * to get the iwarp port mapper pid + * @pm_msg: Contains driver info to send to the userspace port mapper + * @nl_client: The index of the netlink client * * nlmsg attributes: * [IWPM_NLA_REG_PID_SEQ] @@ -124,12 +131,19 @@ pid_query_error: return ret; } -/* - * iwpm_add_mapping - Send a netlink add mapping message - * to the port mapper +/** + * iwpm_add_mapping - Send a netlink add mapping request to + * the userspace port mapper + * @pm_msg: Contains the local ip/tcp address info to send + * @nl_client: The index of the netlink client + * * nlmsg attributes: * [IWPM_NLA_MANAGE_MAPPING_SEQ] * [IWPM_NLA_MANAGE_ADDR] + * [IWPM_NLA_MANAGE_FLAGS] + * + * If the request is successful, the pm_msg stores + * the port mapper response (mapped address info) */ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) { @@ -173,6 +187,18 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) if (ret) goto add_mapping_error; + /* If flags are required and we're not V4, then return a quiet error */ + if (pm_msg->flags && iwpm_ulib_version == IWPM_UABI_VERSION_MIN) { + ret = -EINVAL; + goto add_mapping_error_nowarn; + } + if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) { + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &pm_msg->flags, + IWPM_NLA_MANAGE_FLAGS); + if (ret) + goto add_mapping_error; + } + nlmsg_end(skb, nlh); nlmsg_request->req_buffer = pm_msg; @@ -187,6 +213,7 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) return ret; add_mapping_error: pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); +add_mapping_error_nowarn: if (skb) dev_kfree_skb(skb); if (nlmsg_request) @@ -194,13 +221,17 @@ add_mapping_error: return ret; } -/* - * iwpm_add_and_query_mapping - Send a netlink add and query - * mapping message to the port mapper +/** + * iwpm_add_and_query_mapping - Process the port mapper response to + * iwpm_add_and_query_mapping request + * @pm_msg: Contains the local ip/tcp address info to send + * @nl_client: The index of the netlink client + * * nlmsg attributes: * [IWPM_NLA_QUERY_MAPPING_SEQ] * [IWPM_NLA_QUERY_LOCAL_ADDR] * [IWPM_NLA_QUERY_REMOTE_ADDR] + * [IWPM_NLA_QUERY_FLAGS] */ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) { @@ -251,6 +282,18 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) if (ret) goto query_mapping_error; + /* If flags are required and we're not V4, then return a quite error */ + if (pm_msg->flags && iwpm_ulib_version == IWPM_UABI_VERSION_MIN) { + ret = -EINVAL; + goto query_mapping_error_nowarn; + } + if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) { + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &pm_msg->flags, + IWPM_NLA_QUERY_FLAGS); + if (ret) + goto query_mapping_error; + } + nlmsg_end(skb, nlh); nlmsg_request->req_buffer = pm_msg; @@ -264,6 +307,7 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) return ret; query_mapping_error: pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); +query_mapping_error_nowarn: if (skb) dev_kfree_skb(skb); if (nlmsg_request) @@ -271,9 +315,13 @@ query_mapping_error: return ret; } -/* - * iwpm_remove_mapping - Send a netlink remove mapping message - * to the port mapper +/** + * iwpm_remove_mapping - Send a netlink remove mapping request + * to the userspace port mapper + * + * @local_addr: Local ip/tcp address to remove + * @nl_client: The index of the netlink client + * * nlmsg attributes: * [IWPM_NLA_MANAGE_MAPPING_SEQ] * [IWPM_NLA_MANAGE_ADDR] @@ -344,9 +392,14 @@ static const struct nla_policy resp_reg_policy[IWPM_NLA_RREG_PID_MAX] = { [IWPM_NLA_RREG_PID_ERR] = { .type = NLA_U16 } }; -/* - * iwpm_register_pid_cb - Process a port mapper response to - * iwpm_register_pid() +/** + * iwpm_register_pid_cb - Process the port mapper response to + * iwpm_register_pid query + * @skb: + * @cb: Contains the received message (payload and netlink header) + * + * If successful, the function receives the userspace port mapper pid + * which is used in future communication with the port mapper */ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb) { @@ -379,7 +432,7 @@ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb) /* check device name, ulib name and version */ if (strcmp(pm_msg->dev_name, dev_name) || strcmp(iwpm_ulib_name, iwpm_name) || - iwpm_version != iwpm_ulib_version) { + iwpm_version < IWPM_UABI_VERSION_MIN) { pr_info("%s: Incorrect info (dev = %s name = %s version = %d)\n", __func__, dev_name, iwpm_name, iwpm_version); @@ -387,6 +440,10 @@ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb) goto register_pid_response_exit; } iwpm_user_pid = cb->nlh->nlmsg_pid; + iwpm_ulib_version = iwpm_version; + if (iwpm_ulib_version < IWPM_UABI_VERSION) + pr_warn_once("%s: Down level iwpmd/pid %u. Continuing...", + __func__, iwpm_user_pid); atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", __func__, iwpm_user_pid); @@ -403,15 +460,19 @@ register_pid_response_exit: /* netlink attribute policy for the received response to add mapping request */ static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = { - [IWPM_NLA_MANAGE_MAPPING_SEQ] = { .type = NLA_U32 }, - [IWPM_NLA_MANAGE_ADDR] = { .len = sizeof(struct sockaddr_storage) }, - [IWPM_NLA_MANAGE_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) }, - [IWPM_NLA_RMANAGE_MAPPING_ERR] = { .type = NLA_U16 } + [IWPM_NLA_RMANAGE_MAPPING_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_RMANAGE_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RMANAGE_MAPPING_ERR] = { .type = NLA_U16 } }; -/* - * iwpm_add_mapping_cb - Process a port mapper response to - * iwpm_add_mapping() +/** + * iwpm_add_mapping_cb - Process the port mapper response to + * iwpm_add_mapping request + * @skb: + * @cb: Contains the received message (payload and netlink header) */ int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb) { @@ -430,7 +491,7 @@ int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb) atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); - msg_seq = nla_get_u32(nltb[IWPM_NLA_MANAGE_MAPPING_SEQ]); + msg_seq = nla_get_u32(nltb[IWPM_NLA_RMANAGE_MAPPING_SEQ]); nlmsg_request = iwpm_find_nlmsg_request(msg_seq); if (!nlmsg_request) { pr_info("%s: Could not find a matching request (seq = %u)\n", @@ -439,9 +500,9 @@ int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb) } pm_msg = nlmsg_request->req_buffer; local_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_MANAGE_ADDR]); + nla_data(nltb[IWPM_NLA_RMANAGE_ADDR]); mapped_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_MANAGE_MAPPED_LOC_ADDR]); + nla_data(nltb[IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR]); if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr)) { nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; @@ -472,17 +533,23 @@ add_mapping_response_exit: /* netlink attribute policy for the response to add and query mapping request * and response with remote address info */ static const struct nla_policy resp_query_policy[IWPM_NLA_RQUERY_MAPPING_MAX] = { - [IWPM_NLA_QUERY_MAPPING_SEQ] = { .type = NLA_U32 }, - [IWPM_NLA_QUERY_LOCAL_ADDR] = { .len = sizeof(struct sockaddr_storage) }, - [IWPM_NLA_QUERY_REMOTE_ADDR] = { .len = sizeof(struct sockaddr_storage) }, - [IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) }, - [IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPING_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_RQUERY_LOCAL_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_REMOTE_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, [IWPM_NLA_RQUERY_MAPPING_ERR] = { .type = NLA_U16 } }; -/* - * iwpm_add_and_query_mapping_cb - Process a port mapper response to - * iwpm_add_and_query_mapping() +/** + * iwpm_add_and_query_mapping_cb - Process the port mapper response to + * iwpm_add_and_query_mapping request + * @skb: + * @cb: Contains the received message (payload and netlink header) */ int iwpm_add_and_query_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb) @@ -502,7 +569,7 @@ int iwpm_add_and_query_mapping_cb(struct sk_buff *skb, return -EINVAL; atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); - msg_seq = nla_get_u32(nltb[IWPM_NLA_QUERY_MAPPING_SEQ]); + msg_seq = nla_get_u32(nltb[IWPM_NLA_RQUERY_MAPPING_SEQ]); nlmsg_request = iwpm_find_nlmsg_request(msg_seq); if (!nlmsg_request) { pr_info("%s: Could not find a matching request (seq = %u)\n", @@ -511,9 +578,9 @@ int iwpm_add_and_query_mapping_cb(struct sk_buff *skb, } pm_msg = nlmsg_request->req_buffer; local_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]); + nla_data(nltb[IWPM_NLA_RQUERY_LOCAL_ADDR]); remote_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]); + nla_data(nltb[IWPM_NLA_RQUERY_REMOTE_ADDR]); mapped_loc_sockaddr = (struct sockaddr_storage *) nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]); mapped_rem_sockaddr = (struct sockaddr_storage *) @@ -560,9 +627,13 @@ query_mapping_response_exit: return 0; } -/* - * iwpm_remote_info_cb - Process a port mapper message, containing - * the remote connecting peer address info +/** + * iwpm_remote_info_cb - Process remote connecting peer address info, which + * the port mapper has received from the connecting peer + * @skb: + * @cb: Contains the received message (payload and netlink header) + * + * Stores the IPv4/IPv6 address info in a hash table */ int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb) { @@ -588,9 +659,9 @@ int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb) atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); local_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]); + nla_data(nltb[IWPM_NLA_RQUERY_LOCAL_ADDR]); remote_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]); + nla_data(nltb[IWPM_NLA_RQUERY_REMOTE_ADDR]); mapped_loc_sockaddr = (struct sockaddr_storage *) nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]); mapped_rem_sockaddr = (struct sockaddr_storage *) @@ -635,8 +706,14 @@ static const struct nla_policy resp_mapinfo_policy[IWPM_NLA_MAPINFO_REQ_MAX] = { [IWPM_NLA_MAPINFO_ULIB_VER] = { .type = NLA_U16 } }; -/* - * iwpm_mapping_info_cb - Process a port mapper request for mapping info +/** + * iwpm_mapping_info_cb - Process a notification that the userspace + * port mapper daemon is started + * @skb: + * @cb: Contains the received message (payload and netlink header) + * + * Using the received port mapper pid, send all the local mapping + * info records to the userspace port mapper */ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) { @@ -655,7 +732,7 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) iwpm_name = (char *)nla_data(nltb[IWPM_NLA_MAPINFO_ULIB_NAME]); iwpm_version = nla_get_u16(nltb[IWPM_NLA_MAPINFO_ULIB_VER]); if (strcmp(iwpm_ulib_name, iwpm_name) || - iwpm_version != iwpm_ulib_version) { + iwpm_version < IWPM_UABI_VERSION_MIN) { pr_info("%s: Invalid port mapper name = %s version = %d\n", __func__, iwpm_name, iwpm_version); return ret; @@ -669,6 +746,11 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) iwpm_set_registration(nl_client, IWPM_REG_INCOMPL); atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); iwpm_user_pid = cb->nlh->nlmsg_pid; + + if (iwpm_ulib_version < IWPM_UABI_VERSION) + pr_warn_once("%s: Down level iwpmd/pid %u. Continuing...", + __func__, iwpm_user_pid); + if (!iwpm_mapinfo_available()) return 0; pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", @@ -684,9 +766,11 @@ static const struct nla_policy ack_mapinfo_policy[IWPM_NLA_MAPINFO_NUM_MAX] = { [IWPM_NLA_MAPINFO_ACK_NUM] = { .type = NLA_U32 } }; -/* - * iwpm_ack_mapping_info_cb - Process a port mapper ack for - * the provided mapping info records +/** + * iwpm_ack_mapping_info_cb - Process the port mapper ack for + * the provided local mapping info records + * @skb: + * @cb: Contains the received message (payload and netlink header) */ int iwpm_ack_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) { @@ -712,8 +796,11 @@ static const struct nla_policy map_error_policy[IWPM_NLA_ERR_MAX] = { [IWPM_NLA_ERR_CODE] = { .type = NLA_U16 }, }; -/* - * iwpm_mapping_error_cb - Process a port mapper error message +/** + * iwpm_mapping_error_cb - Process port mapper notification for error + * + * @skb: + * @cb: Contains the received message (payload and netlink header) */ int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb) { @@ -748,3 +835,46 @@ int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb) up(&nlmsg_request->sem); return 0; } + +/* netlink attribute policy for the received hello request */ +static const struct nla_policy hello_policy[IWPM_NLA_HELLO_MAX] = { + [IWPM_NLA_HELLO_ABI_VERSION] = { .type = NLA_U16 } +}; + +/** + * iwpm_hello_cb - Process a hello message from iwpmd + * + * @skb: + * @cb: Contains the received message (payload and netlink header) + * + * Using the received port mapper pid, send the kernel's abi_version + * after adjusting it to support the iwpmd version. + */ +int iwpm_hello_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *nltb[IWPM_NLA_HELLO_MAX]; + const char *msg_type = "Hello request"; + u8 nl_client; + u16 abi_version; + int ret = -EINVAL; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_HELLO_MAX, hello_policy, nltb, + msg_type)) { + pr_info("%s: Unable to parse nlmsg\n", __func__); + return ret; + } + abi_version = nla_get_u16(nltb[IWPM_NLA_HELLO_ABI_VERSION]); + nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); + if (!iwpm_valid_client(nl_client)) { + pr_info("%s: Invalid port mapper client = %d\n", + __func__, nl_client); + return ret; + } + iwpm_set_registration(nl_client, IWPM_REG_INCOMPL); + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + iwpm_ulib_version = min_t(u16, IWPM_UABI_VERSION, abi_version); + pr_debug("Using ABI version %u\n", iwpm_ulib_version); + iwpm_user_pid = cb->nlh->nlmsg_pid; + ret = iwpm_send_hello(nl_client, iwpm_user_pid, iwpm_ulib_version); + return ret; +} diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index cdb63f3f4de7..a5d2a20ee697 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -51,6 +51,12 @@ static DEFINE_SPINLOCK(iwpm_reminfo_lock); static DEFINE_MUTEX(iwpm_admin_lock); static struct iwpm_admin_data iwpm_admin; +/** + * iwpm_init - Allocate resources for the iwarp port mapper + * @nl_client: The index of the netlink client + * + * Should be called when network interface goes up. + */ int iwpm_init(u8 nl_client) { int ret = 0; @@ -87,6 +93,12 @@ init_exit: static void free_hash_bucket(void); static void free_reminfo_bucket(void); +/** + * iwpm_exit - Deallocate resources for the iwarp port mapper + * @nl_client: The index of the netlink client + * + * Should be called when network interface goes down. + */ int iwpm_exit(u8 nl_client) { @@ -112,9 +124,17 @@ int iwpm_exit(u8 nl_client) static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *, struct sockaddr_storage *); +/** + * iwpm_create_mapinfo - Store local and mapped IPv4/IPv6 address + * info in a hash table + * @local_addr: Local ip/tcp address + * @mapped_addr: Mapped local ip/tcp address + * @nl_client: The index of the netlink client + * @map_flags: IWPM mapping flags + */ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, struct sockaddr_storage *mapped_sockaddr, - u8 nl_client) + u8 nl_client, u32 map_flags) { struct hlist_head *hash_bucket_head = NULL; struct iwpm_mapping_info *map_info; @@ -132,6 +152,7 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, memcpy(&map_info->mapped_sockaddr, mapped_sockaddr, sizeof(struct sockaddr_storage)); map_info->nl_client = nl_client; + map_info->map_flags = map_flags; spin_lock_irqsave(&iwpm_mapinfo_lock, flags); if (iwpm_hash_bucket) { @@ -150,6 +171,15 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, return ret; } +/** + * iwpm_remove_mapinfo - Remove local and mapped IPv4/IPv6 address + * info from the hash table + * @local_addr: Local ip/tcp address + * @mapped_local_addr: Mapped local ip/tcp address + * + * Returns err code if mapping info is not found in the hash table, + * otherwise returns 0 + */ int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr, struct sockaddr_storage *mapped_local_addr) { @@ -250,6 +280,17 @@ void iwpm_add_remote_info(struct iwpm_remote_info *rem_info) spin_unlock_irqrestore(&iwpm_reminfo_lock, flags); } +/** + * iwpm_get_remote_info - Get the remote connecting peer address info + * + * @mapped_loc_addr: Mapped local address of the listening peer + * @mapped_rem_addr: Mapped remote address of the connecting peer + * @remote_addr: To store the remote address of the connecting peer + * @nl_client: The index of the netlink client + * + * The remote address info is retrieved and provided to the client in + * the remote_addr. After that it is removed from the hash table + */ int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr, struct sockaddr_storage *mapped_rem_addr, struct sockaddr_storage *remote_addr, @@ -686,6 +727,14 @@ int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid) if (ret) goto send_mapping_info_unlock; + if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) { + ret = ibnl_put_attr(skb, nlh, sizeof(u32), + &map_info->map_flags, + IWPM_NLA_MAPINFO_FLAGS); + if (ret) + goto send_mapping_info_unlock; + } + nlmsg_end(skb, nlh); iwpm_print_sockaddr(&map_info->local_sockaddr, @@ -754,3 +803,38 @@ int iwpm_mapinfo_available(void) spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); return full_bucket; } + +int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version) +{ + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + const char *err_str = ""; + int ret = -EINVAL; + + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_HELLO, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto hello_num_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + err_str = "Unable to put attribute of abi_version into nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u16), &abi_version, + IWPM_NLA_HELLO_ABI_VERSION); + if (ret) + goto hello_num_error; + nlmsg_end(skb, nlh); + + ret = rdma_nl_unicast(skb, iwpm_pid); + if (ret) { + skb = NULL; + err_str = "Unable to send a nlmsg"; + goto hello_num_error; + } + pr_debug("%s: Sent hello abi_version = %u\n", __func__, abi_version); + return 0; +hello_num_error: + pr_info("%s: %s\n", __func__, err_str); + if (skb) + dev_kfree_skb(skb); + return ret; +} diff --git a/drivers/infiniband/core/iwpm_util.h b/drivers/infiniband/core/iwpm_util.h index af1fc14a0d3d..7e2bcc72f66c 100644 --- a/drivers/infiniband/core/iwpm_util.h +++ b/drivers/infiniband/core/iwpm_util.h @@ -78,6 +78,7 @@ struct iwpm_mapping_info { struct sockaddr_storage local_sockaddr; struct sockaddr_storage mapped_sockaddr; u8 nl_client; + u32 map_flags; }; struct iwpm_remote_info { @@ -266,4 +267,15 @@ int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max, * @msg: Message to print */ void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg); + +/** + * iwpm_send_hello - Send hello response to iwpmd + * + * @nl_client: The index of the netlink client + * @abi_version: The kernel's abi_version + * + * Returns 0 on success or a negative error code + */ +int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version); +extern u16 iwpm_ulib_version; #endif diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 7870823bac47..e742a6a2c138 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -3326,9 +3326,9 @@ error: static void ib_mad_remove_device(struct ib_device *device, void *client_data) { - int i; + unsigned int i; - for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { + rdma_for_each_port (device, i) { if (!rdma_cap_ib_mad(device, i)) continue; diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 724f5a62e82f..eecfc0b377c9 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -56,7 +56,6 @@ EXPORT_SYMBOL(rdma_nl_chk_listeners); static bool is_nl_msg_valid(unsigned int type, unsigned int op) { static const unsigned int max_num_ops[RDMA_NL_NUM_CLIENTS] = { - [RDMA_NL_RDMA_CM] = RDMA_NL_RDMA_CM_NUM_OPS, [RDMA_NL_IWCM] = RDMA_NL_IWPM_NUM_OPS, [RDMA_NL_LS] = RDMA_NL_LS_NUM_OPS, [RDMA_NL_NLDEV] = RDMA_NLDEV_NUM_OPS, @@ -181,8 +180,7 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; } /* FIXME: Convert IWCM to properly handle doit callbacks */ - if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_RDMA_CM || - index == RDMA_NL_IWCM) { + if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_IWCM) { struct netlink_dump_control c = { .dump = cb_table[op].dump, }; diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index e600fc23ae62..11ed58d3fce5 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -33,12 +33,14 @@ #include <linux/module.h> #include <linux/pid.h> #include <linux/pid_namespace.h> +#include <linux/mutex.h> #include <net/netlink.h> #include <rdma/rdma_cm.h> #include <rdma/rdma_netlink.h> #include "core_priv.h" #include "cma_priv.h" +#include "restrack.h" static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 }, @@ -107,6 +109,13 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_DRIVER_U32] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_DRIVER_S64] = { .type = NLA_S64 }, [RDMA_NLDEV_ATTR_DRIVER_U64] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_PDN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_CQN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_MRN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_CM_IDN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_CTXN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_LINK_TYPE] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -262,9 +271,7 @@ static int fill_port_info(struct sk_buff *msg, if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_PHYS_STATE, attr.phys_state)) return -EMSGSIZE; - if (device->ops.get_netdev) - netdev = device->ops.get_netdev(device, port); - + netdev = ib_device_get_netdev(device, port); if (netdev && net_eq(dev_net(netdev), net)) { ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex); @@ -314,7 +321,6 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device *device) [RDMA_RESTRACK_CTX] = "ctx", }; - struct rdma_restrack_root *res = &device->res; struct nlattr *table_attr; int ret, i, curr; @@ -328,7 +334,8 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device *device) for (i = 0; i < RDMA_RESTRACK_MAX; i++) { if (!names[i]) continue; - curr = rdma_restrack_count(res, i, task_active_pid_ns(current)); + curr = rdma_restrack_count(device, i, + task_active_pid_ns(current)); ret = fill_res_info_entry(msg, names[i], curr); if (ret) goto err; @@ -361,13 +368,20 @@ static int fill_res_name_pid(struct sk_buff *msg, return 0; } -static int fill_res_qp_entry(struct sk_buff *msg, struct netlink_callback *cb, +static bool fill_res_entry(struct ib_device *dev, struct sk_buff *msg, + struct rdma_restrack_entry *res) +{ + if (!dev->ops.fill_res_entry) + return false; + return dev->ops.fill_res_entry(msg, res); +} + +static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_qp *qp = container_of(res, struct ib_qp, res); - struct rdma_restrack_root *resroot = &qp->device->res; + struct ib_device *dev = qp->device; struct ib_qp_init_attr qp_init_attr; - struct nlattr *entry_attr; struct ib_qp_attr qp_attr; int ret; @@ -376,11 +390,7 @@ static int fill_res_qp_entry(struct sk_buff *msg, struct netlink_callback *cb, return ret; if (port && port != qp_attr.port_num) - return 0; - - entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY); - if (!entry_attr) - goto out; + return -EAGAIN; /* In create_qp() port is not set yet */ if (qp_attr.port_num && @@ -412,38 +422,32 @@ static int fill_res_qp_entry(struct sk_buff *msg, struct netlink_callback *cb, if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, qp_attr.qp_state)) goto err; + if (!rdma_is_kernel_res(res) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, qp->pd->res.id)) + goto err; + if (fill_res_name_pid(msg, res)) goto err; - if (resroot->fill_res_entry(msg, res)) + if (fill_res_entry(dev, msg, res)) goto err; - nla_nest_end(msg, entry_attr); return 0; -err: - nla_nest_cancel(msg, entry_attr); -out: - return -EMSGSIZE; +err: return -EMSGSIZE; } -static int fill_res_cm_id_entry(struct sk_buff *msg, - struct netlink_callback *cb, +static int fill_res_cm_id_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct rdma_id_private *id_priv = container_of(res, struct rdma_id_private, res); - struct rdma_restrack_root *resroot = &id_priv->id.device->res; + struct ib_device *dev = id_priv->id.device; struct rdma_cm_id *cm_id = &id_priv->id; - struct nlattr *entry_attr; if (port && port != cm_id->port_num) return 0; - entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY); - if (!entry_attr) - goto out; - if (cm_id->port_num && nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, cm_id->port_num)) goto err; @@ -472,31 +476,25 @@ static int fill_res_cm_id_entry(struct sk_buff *msg, &cm_id->route.addr.dst_addr)) goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CM_IDN, res->id)) + goto err; + if (fill_res_name_pid(msg, res)) goto err; - if (resroot->fill_res_entry(msg, res)) + if (fill_res_entry(dev, msg, res)) goto err; - nla_nest_end(msg, entry_attr); return 0; -err: - nla_nest_cancel(msg, entry_attr); -out: - return -EMSGSIZE; +err: return -EMSGSIZE; } -static int fill_res_cq_entry(struct sk_buff *msg, struct netlink_callback *cb, +static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_cq *cq = container_of(res, struct ib_cq, res); - struct rdma_restrack_root *resroot = &cq->device->res; - struct nlattr *entry_attr; - - entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_CQ_ENTRY); - if (!entry_attr) - goto out; + struct ib_device *dev = cq->device; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQE, cq->cqe)) goto err; @@ -509,33 +507,31 @@ static int fill_res_cq_entry(struct sk_buff *msg, struct netlink_callback *cb, nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx)) goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id)) + goto err; + if (!rdma_is_kernel_res(res) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, + cq->uobject->context->res.id)) + goto err; + if (fill_res_name_pid(msg, res)) goto err; - if (resroot->fill_res_entry(msg, res)) + if (fill_res_entry(dev, msg, res)) goto err; - nla_nest_end(msg, entry_attr); return 0; -err: - nla_nest_cancel(msg, entry_attr); -out: - return -EMSGSIZE; +err: return -EMSGSIZE; } -static int fill_res_mr_entry(struct sk_buff *msg, struct netlink_callback *cb, +static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_mr *mr = container_of(res, struct ib_mr, res); - struct rdma_restrack_root *resroot = &mr->pd->device->res; - struct nlattr *entry_attr; - - entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_MR_ENTRY); - if (!entry_attr) - goto out; + struct ib_device *dev = mr->pd->device; - if (netlink_capable(cb->skb, CAP_NET_ADMIN)) { + if (has_cap_net_admin) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RKEY, mr->rkey)) goto err; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LKEY, mr->lkey)) @@ -546,33 +542,31 @@ static int fill_res_mr_entry(struct sk_buff *msg, struct netlink_callback *cb, RDMA_NLDEV_ATTR_PAD)) goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id)) + goto err; + + if (!rdma_is_kernel_res(res) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, mr->pd->res.id)) + goto err; + if (fill_res_name_pid(msg, res)) goto err; - if (resroot->fill_res_entry(msg, res)) + if (fill_res_entry(dev, msg, res)) goto err; - nla_nest_end(msg, entry_attr); return 0; -err: - nla_nest_cancel(msg, entry_attr); -out: - return -EMSGSIZE; +err: return -EMSGSIZE; } -static int fill_res_pd_entry(struct sk_buff *msg, struct netlink_callback *cb, +static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_pd *pd = container_of(res, struct ib_pd, res); - struct rdma_restrack_root *resroot = &pd->device->res; - struct nlattr *entry_attr; + struct ib_device *dev = pd->device; - entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_PD_ENTRY); - if (!entry_attr) - goto out; - - if (netlink_capable(cb->skb, CAP_NET_ADMIN)) { + if (has_cap_net_admin) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY, pd->local_dma_lkey)) goto err; @@ -584,24 +578,24 @@ static int fill_res_pd_entry(struct sk_buff *msg, struct netlink_callback *cb, if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_USECNT, atomic_read(&pd->usecnt), RDMA_NLDEV_ATTR_PAD)) goto err; - if ((pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) && - nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY, - pd->unsafe_global_rkey)) + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, res->id)) + goto err; + + if (!rdma_is_kernel_res(res) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, + pd->uobject->context->res.id)) goto err; if (fill_res_name_pid(msg, res)) goto err; - if (resroot->fill_res_entry(msg, res)) + if (fill_res_entry(dev, msg, res)) goto err; - nla_nest_end(msg, entry_attr); return 0; -err: - nla_nest_cancel(msg, entry_attr); -out: - return -EMSGSIZE; +err: return -EMSGSIZE; } static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -781,7 +775,7 @@ static int nldev_port_get_dumpit(struct sk_buff *skb, u32 idx = 0; u32 ifindex; int err; - u32 p; + unsigned int p; err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NULL); @@ -793,7 +787,7 @@ static int nldev_port_get_dumpit(struct sk_buff *skb, if (!device) return -EINVAL; - for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) { + rdma_for_each_port (device, p) { /* * The dumpit function returns all information from specific * index. This specific index is taken from the netlink @@ -909,10 +903,17 @@ static int nldev_res_get_dumpit(struct sk_buff *skb, } struct nldev_fill_res_entry { - int (*fill_res_func)(struct sk_buff *msg, struct netlink_callback *cb, + int (*fill_res_func)(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, u32 port); enum rdma_nldev_attr nldev_attr; enum rdma_nldev_command nldev_cmd; + u8 flags; + u32 entry; + u32 id; +}; + +enum nldev_res_flags { + NLDEV_PER_DEV = 1 << 0, }; static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { @@ -920,29 +921,136 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { .fill_res_func = fill_res_qp_entry, .nldev_cmd = RDMA_NLDEV_CMD_RES_QP_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_QP, + .entry = RDMA_NLDEV_ATTR_RES_QP_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_LQPN, }, [RDMA_RESTRACK_CM_ID] = { .fill_res_func = fill_res_cm_id_entry, .nldev_cmd = RDMA_NLDEV_CMD_RES_CM_ID_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_CM_ID, + .entry = RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_CM_IDN, }, [RDMA_RESTRACK_CQ] = { .fill_res_func = fill_res_cq_entry, .nldev_cmd = RDMA_NLDEV_CMD_RES_CQ_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_CQ, + .flags = NLDEV_PER_DEV, + .entry = RDMA_NLDEV_ATTR_RES_CQ_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_CQN, }, [RDMA_RESTRACK_MR] = { .fill_res_func = fill_res_mr_entry, .nldev_cmd = RDMA_NLDEV_CMD_RES_MR_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_MR, + .flags = NLDEV_PER_DEV, + .entry = RDMA_NLDEV_ATTR_RES_MR_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_MRN, }, [RDMA_RESTRACK_PD] = { .fill_res_func = fill_res_pd_entry, .nldev_cmd = RDMA_NLDEV_CMD_RES_PD_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_PD, + .flags = NLDEV_PER_DEV, + .entry = RDMA_NLDEV_ATTR_RES_PD_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_PDN, }, }; +static bool is_visible_in_pid_ns(struct rdma_restrack_entry *res) +{ + /* + * 1. Kern resources should be visible in init name space only + * 2. Present only resources visible in the current namespace + */ + if (rdma_is_kernel_res(res)) + return task_active_pid_ns(current) == &init_pid_ns; + return task_active_pid_ns(current) == task_active_pid_ns(res->task); +} + +static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + enum rdma_restrack_type res_type) +{ + const struct nldev_fill_res_entry *fe = &fill_entries[res_type]; + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct rdma_restrack_entry *res; + struct ib_device *device; + u32 index, id, port = 0; + bool has_cap_net_admin; + struct sk_buff *msg; + int ret; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !fe->id || !tb[fe->id]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err; + } + } + + if ((port && fe->flags & NLDEV_PER_DEV) || + (!port && ~fe->flags & NLDEV_PER_DEV)) { + ret = -EINVAL; + goto err; + } + + id = nla_get_u32(tb[fe->id]); + res = rdma_restrack_get_byid(device, res_type, id); + if (IS_ERR(res)) { + ret = PTR_ERR(res); + goto err; + } + + if (!is_visible_in_pid_ns(res)) { + ret = -ENOENT; + goto err_get; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, fe->nldev_cmd), + 0, 0); + + if (fill_nldev_handle(msg, device)) { + ret = -EMSGSIZE; + goto err_free; + } + + has_cap_net_admin = netlink_capable(skb, CAP_NET_ADMIN); + ret = fe->fill_res_func(msg, has_cap_net_admin, res, port); + rdma_restrack_put(res); + if (ret) + goto err_free; + + nlmsg_end(msg, nlh); + ib_device_put(device); + return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + +err_free: + nlmsg_free(msg); +err_get: + rdma_restrack_put(res); +err: + ib_device_put(device); + return ret; +} + static int res_get_common_dumpit(struct sk_buff *skb, struct netlink_callback *cb, enum rdma_restrack_type res_type) @@ -950,11 +1058,15 @@ static int res_get_common_dumpit(struct sk_buff *skb, const struct nldev_fill_res_entry *fe = &fill_entries[res_type]; struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct rdma_restrack_entry *res; + struct rdma_restrack_root *rt; int err, ret = 0, idx = 0; struct nlattr *table_attr; + struct nlattr *entry_attr; struct ib_device *device; int start = cb->args[0]; + bool has_cap_net_admin; struct nlmsghdr *nlh; + unsigned long id; u32 index, port = 0; bool filled = false; @@ -1002,55 +1114,51 @@ static int res_get_common_dumpit(struct sk_buff *skb, goto err; } - down_read(&device->res.rwsem); - hash_for_each_possible(device->res.hash, res, node, res_type) { - if (idx < start) - goto next; + has_cap_net_admin = netlink_capable(cb->skb, CAP_NET_ADMIN); - if ((rdma_is_kernel_res(res) && - task_active_pid_ns(current) != &init_pid_ns) || - (!rdma_is_kernel_res(res) && task_active_pid_ns(current) != - task_active_pid_ns(res->task))) - /* - * 1. Kern resources should be visible in init - * namspace only - * 2. Present only resources visible in the current - * namespace - */ - goto next; + rt = &device->res[res_type]; + xa_lock(&rt->xa); + /* + * FIXME: if the skip ahead is something common this loop should + * use xas_for_each & xas_pause to optimize, we can have a lot of + * objects. + */ + xa_for_each(&rt->xa, id, res) { + if (!is_visible_in_pid_ns(res)) + continue; - if (!rdma_restrack_get(res)) - /* - * Resource is under release now, but we are not - * relesing lock now, so it will be released in - * our next pass, once we will get ->next pointer. - */ + if (idx < start || !rdma_restrack_get(res)) goto next; + xa_unlock(&rt->xa); + filled = true; - up_read(&device->res.rwsem); - ret = fe->fill_res_func(skb, cb, res, port); - down_read(&device->res.rwsem); - /* - * Return resource back, but it won't be released till - * the &device->res.rwsem will be released for write. - */ + entry_attr = nla_nest_start(skb, fe->entry); + if (!entry_attr) { + ret = -EMSGSIZE; + rdma_restrack_put(res); + goto msg_full; + } + + ret = fe->fill_res_func(skb, has_cap_net_admin, res, port); rdma_restrack_put(res); - if (ret == -EMSGSIZE) - /* - * There is a chance to optimize here. - * It can be done by using list_prepare_entry - * and list_for_each_entry_continue afterwards. - */ - break; - if (ret) + if (ret) { + nla_nest_cancel(skb, entry_attr); + if (ret == -EMSGSIZE) + goto msg_full; + if (ret == -EAGAIN) + goto again; goto res_err; + } + nla_nest_end(skb, entry_attr); +again: xa_lock(&rt->xa); next: idx++; } - up_read(&device->res.rwsem); + xa_unlock(&rt->xa); +msg_full: nla_nest_end(skb, table_attr); nlmsg_end(skb, nlh); cb->args[0] = idx; @@ -1067,7 +1175,6 @@ next: idx++; res_err: nla_nest_cancel(skb, table_attr); - up_read(&device->res.rwsem); err: nlmsg_cancel(skb, nlh); @@ -1077,34 +1184,132 @@ err_index: return ret; } -static int nldev_res_get_qp_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) +#define RES_GET_FUNCS(name, type) \ + static int nldev_res_get_##name##_dumpit(struct sk_buff *skb, \ + struct netlink_callback *cb) \ + { \ + return res_get_common_dumpit(skb, cb, type); \ + } \ + static int nldev_res_get_##name##_doit(struct sk_buff *skb, \ + struct nlmsghdr *nlh, \ + struct netlink_ext_ack *extack) \ + { \ + return res_get_common_doit(skb, nlh, extack, type); \ + } + +RES_GET_FUNCS(qp, RDMA_RESTRACK_QP); +RES_GET_FUNCS(cm_id, RDMA_RESTRACK_CM_ID); +RES_GET_FUNCS(cq, RDMA_RESTRACK_CQ); +RES_GET_FUNCS(pd, RDMA_RESTRACK_PD); +RES_GET_FUNCS(mr, RDMA_RESTRACK_MR); + +static LIST_HEAD(link_ops); +static DECLARE_RWSEM(link_ops_rwsem); + +static const struct rdma_link_ops *link_ops_get(const char *type) { - return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_QP); + const struct rdma_link_ops *ops; + + list_for_each_entry(ops, &link_ops, list) { + if (!strcmp(ops->type, type)) + goto out; + } + ops = NULL; +out: + return ops; } -static int nldev_res_get_cm_id_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) +void rdma_link_register(struct rdma_link_ops *ops) { - return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_CM_ID); + down_write(&link_ops_rwsem); + if (WARN_ON_ONCE(link_ops_get(ops->type))) + goto out; + list_add(&ops->list, &link_ops); +out: + up_write(&link_ops_rwsem); } +EXPORT_SYMBOL(rdma_link_register); -static int nldev_res_get_cq_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) +void rdma_link_unregister(struct rdma_link_ops *ops) { - return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_CQ); + down_write(&link_ops_rwsem); + list_del(&ops->list); + up_write(&link_ops_rwsem); } +EXPORT_SYMBOL(rdma_link_unregister); -static int nldev_res_get_mr_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) +static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { - return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_MR); + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + char ibdev_name[IB_DEVICE_NAME_MAX]; + const struct rdma_link_ops *ops; + char ndev_name[IFNAMSIZ]; + struct net_device *ndev; + char type[IFNAMSIZ]; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] || + !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME]) + return -EINVAL; + + nla_strlcpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME], + sizeof(ibdev_name)); + if (strchr(ibdev_name, '%')) + return -EINVAL; + + nla_strlcpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type)); + nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME], + sizeof(ndev_name)); + + ndev = dev_get_by_name(&init_net, ndev_name); + if (!ndev) + return -ENODEV; + + down_read(&link_ops_rwsem); + ops = link_ops_get(type); +#ifdef CONFIG_MODULES + if (!ops) { + up_read(&link_ops_rwsem); + request_module("rdma-link-%s", type); + down_read(&link_ops_rwsem); + ops = link_ops_get(type); + } +#endif + err = ops ? ops->newlink(ibdev_name, ndev) : -EINVAL; + up_read(&link_ops_rwsem); + dev_put(ndev); + + return err; } -static int nldev_res_get_pd_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) +static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { - return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_PD); + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + u32 index; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + if (!(device->attrs.device_cap_flags & IB_DEVICE_ALLOW_USER_UNREG)) { + ib_device_put(device); + return -EINVAL; + } + + ib_unregister_device_and_put(device); + return 0; } static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { @@ -1116,6 +1321,14 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .doit = nldev_set_doit, .flags = RDMA_NL_ADMIN_PERM, }, + [RDMA_NLDEV_CMD_NEWLINK] = { + .doit = nldev_newlink, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_DELLINK] = { + .doit = nldev_dellink, + .flags = RDMA_NL_ADMIN_PERM, + }, [RDMA_NLDEV_CMD_PORT_GET] = { .doit = nldev_port_get_doit, .dump = nldev_port_get_dumpit, @@ -1125,28 +1338,23 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .dump = nldev_res_get_dumpit, }, [RDMA_NLDEV_CMD_RES_QP_GET] = { + .doit = nldev_res_get_qp_doit, .dump = nldev_res_get_qp_dumpit, - /* - * .doit is not implemented yet for two reasons: - * 1. It is not needed yet. - * 2. There is a need to provide identifier, while it is easy - * for the QPs (device index + port index + LQPN), it is not - * the case for the rest of resources (PD and CQ). Because it - * is better to provide similar interface for all resources, - * let's wait till we will have other resources implemented - * too. - */ }, [RDMA_NLDEV_CMD_RES_CM_ID_GET] = { + .doit = nldev_res_get_cm_id_doit, .dump = nldev_res_get_cm_id_dumpit, }, [RDMA_NLDEV_CMD_RES_CQ_GET] = { + .doit = nldev_res_get_cq_doit, .dump = nldev_res_get_cq_dumpit, }, [RDMA_NLDEV_CMD_RES_MR_GET] = { + .doit = nldev_res_get_mr_doit, .dump = nldev_res_get_mr_dumpit, }, [RDMA_NLDEV_CMD_RES_PD_GET] = { + .doit = nldev_res_get_pd_doit, .dump = nldev_res_get_pd_dumpit, }, }; diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 6c4747e61d2b..778375ff664e 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -438,6 +438,38 @@ free: uverbs_uobject_put(uobj); return ERR_PTR(ret); } +struct ib_uobject *_uobj_get_read(enum uverbs_default_objects type, + u32 object_id, + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj; + + uobj = rdma_lookup_get_uobject(uobj_get_type(attrs, type), attrs->ufile, + object_id, UVERBS_LOOKUP_READ); + if (IS_ERR(uobj)) + return uobj; + + attrs->context = uobj->context; + + return uobj; +} + +struct ib_uobject *_uobj_get_write(enum uverbs_default_objects type, + u32 object_id, + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj; + + uobj = rdma_lookup_get_uobject(uobj_get_type(attrs, type), attrs->ufile, + object_id, UVERBS_LOOKUP_WRITE); + + if (IS_ERR(uobj)) + return uobj; + + attrs->context = uobj->context; + + return uobj; +} static struct ib_uobject * alloc_begin_idr_uobject(const struct uverbs_api_object *obj, @@ -801,6 +833,7 @@ void uverbs_close_fd(struct file *f) /* Pairs with filp->private_data in alloc_begin_fd_uobject */ uverbs_uobject_put(uobj); } +EXPORT_SYMBOL(uverbs_close_fd); /* * Drop the ucontext off the ufile and completely disconnect it from the @@ -811,7 +844,6 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile, { struct ib_ucontext *ucontext = ufile->ucontext; struct ib_device *ib_dev = ucontext->device; - int ret; /* * If we are closing the FD then the user mmap VMAs must have @@ -829,12 +861,8 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile, rdma_restrack_del(&ucontext->res); - /* - * FIXME: Drivers are not permitted to fail dealloc_ucontext, remove - * the error return. - */ - ret = ib_dev->ops.dealloc_ucontext(ucontext); - WARN_ON(ret); + ib_dev->ops.dealloc_ucontext(ucontext); + kfree(ucontext); ufile->ucontext = NULL; } diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index be6b8e1257d0..69f8db66925e 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -106,6 +106,8 @@ int uverbs_finalize_object(struct ib_uobject *uobj, enum uverbs_obj_access access, bool commit); +int uverbs_output_written(const struct uverbs_attr_bundle *bundle, size_t idx); + void setup_ufile_idr_uobject(struct ib_uverbs_file *ufile); void release_ufile_idr_uobject(struct ib_uverbs_file *ufile); diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 46a5c553c624..3b5ff2f7b5f8 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -11,17 +11,29 @@ #include <linux/pid_namespace.h> #include "cma_priv.h" +#include "restrack.h" -static int fill_res_noop(struct sk_buff *msg, - struct rdma_restrack_entry *entry) +/** + * rdma_restrack_init() - initialize and allocate resource tracking + * @dev: IB device + * + * Return: 0 on success + */ +int rdma_restrack_init(struct ib_device *dev) { - return 0; -} + struct rdma_restrack_root *rt; + int i; -void rdma_restrack_init(struct rdma_restrack_root *res) -{ - init_rwsem(&res->rwsem); - res->fill_res_entry = fill_res_noop; + dev->res = kcalloc(RDMA_RESTRACK_MAX, sizeof(*rt), GFP_KERNEL); + if (!dev->res) + return -ENOMEM; + + rt = dev->res; + + for (i = 0; i < RDMA_RESTRACK_MAX; i++) + xa_init_flags(&rt[i].xa, XA_FLAGS_ALLOC); + + return 0; } static const char *type2str(enum rdma_restrack_type type) @@ -38,55 +50,79 @@ static const char *type2str(enum rdma_restrack_type type) return names[type]; }; -void rdma_restrack_clean(struct rdma_restrack_root *res) +/** + * rdma_restrack_clean() - clean resource tracking + * @dev: IB device + */ +void rdma_restrack_clean(struct ib_device *dev) { + struct rdma_restrack_root *rt = dev->res; struct rdma_restrack_entry *e; char buf[TASK_COMM_LEN]; - struct ib_device *dev; + bool found = false; const char *owner; - int bkt; - - if (hash_empty(res->hash)) - return; - - dev = container_of(res, struct ib_device, res); - pr_err("restrack: %s", CUT_HERE); - dev_err(&dev->dev, "BUG: RESTRACK detected leak of resources\n"); - hash_for_each(res->hash, bkt, e, node) { - if (rdma_is_kernel_res(e)) { - owner = e->kern_name; - } else { - /* - * There is no need to call get_task_struct here, - * because we can be here only if there are more - * get_task_struct() call than put_task_struct(). - */ - get_task_comm(buf, e->task); - owner = buf; + int i; + + for (i = 0 ; i < RDMA_RESTRACK_MAX; i++) { + struct xarray *xa = &dev->res[i].xa; + + if (!xa_empty(xa)) { + unsigned long index; + + if (!found) { + pr_err("restrack: %s", CUT_HERE); + dev_err(&dev->dev, "BUG: RESTRACK detected leak of resources\n"); + } + xa_for_each(xa, index, e) { + if (rdma_is_kernel_res(e)) { + owner = e->kern_name; + } else { + /* + * There is no need to call get_task_struct here, + * because we can be here only if there are more + * get_task_struct() call than put_task_struct(). + */ + get_task_comm(buf, e->task); + owner = buf; + } + + pr_err("restrack: %s %s object allocated by %s is not freed\n", + rdma_is_kernel_res(e) ? "Kernel" : + "User", + type2str(e->type), owner); + } + found = true; } - - pr_err("restrack: %s %s object allocated by %s is not freed\n", - rdma_is_kernel_res(e) ? "Kernel" : "User", - type2str(e->type), owner); + xa_destroy(xa); } - pr_err("restrack: %s", CUT_HERE); + if (found) + pr_err("restrack: %s", CUT_HERE); + + kfree(rt); } -int rdma_restrack_count(struct rdma_restrack_root *res, - enum rdma_restrack_type type, +/** + * rdma_restrack_count() - the current usage of specific object + * @dev: IB device + * @type: actual type of object to operate + * @ns: PID namespace + */ +int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type, struct pid_namespace *ns) { + struct rdma_restrack_root *rt = &dev->res[type]; struct rdma_restrack_entry *e; + XA_STATE(xas, &rt->xa, 0); u32 cnt = 0; - down_read(&res->rwsem); - hash_for_each_possible(res->hash, e, node, type) { + xa_lock(&rt->xa); + xas_for_each(&xas, e, U32_MAX) { if (ns == &init_pid_ns || (!rdma_is_kernel_res(e) && ns == task_active_pid_ns(e->task))) cnt++; } - up_read(&res->rwsem); + xa_unlock(&rt->xa); return cnt; } EXPORT_SYMBOL(rdma_restrack_count); @@ -157,28 +193,29 @@ EXPORT_SYMBOL(rdma_restrack_set_task); static void rdma_restrack_add(struct rdma_restrack_entry *res) { struct ib_device *dev = res_to_dev(res); + struct rdma_restrack_root *rt; + int ret; if (!dev) return; - if (res->type != RDMA_RESTRACK_CM_ID || rdma_is_kernel_res(res)) - res->task = NULL; - - if (!rdma_is_kernel_res(res)) { - if (!res->task) - rdma_restrack_set_task(res, NULL); - res->kern_name = NULL; - } else { - set_kern_name(res); - } + rt = &dev->res[res->type]; kref_init(&res->kref); init_completion(&res->comp); - res->valid = true; + if (res->type != RDMA_RESTRACK_QP) + ret = xa_alloc_cyclic(&rt->xa, &res->id, res, xa_limit_32b, + &rt->next_id, GFP_KERNEL); + else { + /* Special case to ensure that LQPN points to right QP */ + struct ib_qp *qp = container_of(res, struct ib_qp, res); + + ret = xa_insert(&rt->xa, qp->qp_num, res, GFP_KERNEL); + res->id = ret ? 0 : qp->qp_num; + } - down_write(&dev->res.rwsem); - hash_add(dev->res.hash, &res->node, res->type); - up_write(&dev->res.rwsem); + if (!ret) + res->valid = true; } /** @@ -187,6 +224,8 @@ static void rdma_restrack_add(struct rdma_restrack_entry *res) */ void rdma_restrack_kadd(struct rdma_restrack_entry *res) { + res->task = NULL; + set_kern_name(res); res->user = false; rdma_restrack_add(res); } @@ -198,6 +237,13 @@ EXPORT_SYMBOL(rdma_restrack_kadd); */ void rdma_restrack_uadd(struct rdma_restrack_entry *res) { + if (res->type != RDMA_RESTRACK_CM_ID) + res->task = NULL; + + if (!res->task) + rdma_restrack_set_task(res, NULL); + res->kern_name = NULL; + res->user = true; rdma_restrack_add(res); } @@ -209,6 +255,31 @@ int __must_check rdma_restrack_get(struct rdma_restrack_entry *res) } EXPORT_SYMBOL(rdma_restrack_get); +/** + * rdma_restrack_get_byid() - translate from ID to restrack object + * @dev: IB device + * @type: resource track type + * @id: ID to take a look + * + * Return: Pointer to restrack entry or -ENOENT in case of error. + */ +struct rdma_restrack_entry * +rdma_restrack_get_byid(struct ib_device *dev, + enum rdma_restrack_type type, u32 id) +{ + struct rdma_restrack_root *rt = &dev->res[type]; + struct rdma_restrack_entry *res; + + xa_lock(&rt->xa); + res = xa_load(&rt->xa, id); + if (!res || !rdma_restrack_get(res)) + res = ERR_PTR(-ENOENT); + xa_unlock(&rt->xa); + + return res; +} +EXPORT_SYMBOL(rdma_restrack_get_byid); + static void restrack_release(struct kref *kref) { struct rdma_restrack_entry *res; @@ -225,23 +296,25 @@ EXPORT_SYMBOL(rdma_restrack_put); void rdma_restrack_del(struct rdma_restrack_entry *res) { + struct rdma_restrack_entry *old; + struct rdma_restrack_root *rt; struct ib_device *dev; if (!res->valid) goto out; dev = res_to_dev(res); - if (!dev) + if (WARN_ON(!dev)) return; - rdma_restrack_put(res); - - wait_for_completion(&res->comp); + rt = &dev->res[res->type]; - down_write(&dev->res.rwsem); - hash_del(&res->node); + old = xa_erase(&rt->xa, res->id); + WARN_ON(old != res); res->valid = false; - up_write(&dev->res.rwsem); + + rdma_restrack_put(res); + wait_for_completion(&res->comp); out: if (res->task) { diff --git a/drivers/infiniband/core/restrack.h b/drivers/infiniband/core/restrack.h new file mode 100644 index 000000000000..09a1fbdf578e --- /dev/null +++ b/drivers/infiniband/core/restrack.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2017-2019 Mellanox Technologies. All rights reserved. + */ + +#ifndef _RDMA_CORE_RESTRACK_H_ +#define _RDMA_CORE_RESTRACK_H_ + +#include <linux/mutex.h> + +/** + * struct rdma_restrack_root - main resource tracking management + * entity, per-device + */ +struct rdma_restrack_root { + /** + * @xa: Array of XArray structure to hold restrack entries. + */ + struct xarray xa; + /** + * @next_id: Next ID to support cyclic allocation + */ + u32 next_id; +}; + +int rdma_restrack_init(struct ib_device *dev); +void rdma_restrack_clean(struct ib_device *dev); +#endif /* _RDMA_CORE_RESTRACK_H_ */ diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index d22c4a2ebac6..89a5be3a2f97 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -179,7 +179,6 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, struct scatterlist *sg, u32 sg_cnt, u32 offset, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { - struct ib_device *dev = qp->pd->device; u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge : qp->max_read_sge; struct ib_sge *sge; @@ -209,8 +208,8 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, rdma_wr->wr.sg_list = sge; for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) { - sge->addr = ib_sg_dma_address(dev, sg) + offset; - sge->length = ib_sg_dma_len(dev, sg) - offset; + sge->addr = sg_dma_address(sg) + offset; + sge->length = sg_dma_len(sg) - offset; sge->lkey = qp->pd->local_dma_lkey; total_len += sge->length; @@ -236,14 +235,13 @@ static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp, struct scatterlist *sg, u32 offset, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { - struct ib_device *dev = qp->pd->device; struct ib_rdma_wr *rdma_wr = &ctx->single.wr; ctx->nr_ops = 1; ctx->single.sge.lkey = qp->pd->local_dma_lkey; - ctx->single.sge.addr = ib_sg_dma_address(dev, sg) + offset; - ctx->single.sge.length = ib_sg_dma_len(dev, sg) - offset; + ctx->single.sge.addr = sg_dma_address(sg) + offset; + ctx->single.sge.length = sg_dma_len(sg) - offset; memset(rdma_wr, 0, sizeof(*rdma_wr)); if (dir == DMA_TO_DEVICE) @@ -294,7 +292,7 @@ int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, * Skip to the S/G entry that sg_offset falls into: */ for (;;) { - u32 len = ib_sg_dma_len(dev, sg); + u32 len = sg_dma_len(sg); if (sg_offset < len) break; diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 97e6d7b69abf..7925e45ea88a 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -2342,9 +2342,7 @@ static void ib_sa_add_one(struct ib_device *device) s = rdma_start_port(device); e = rdma_end_port(device); - sa_dev = kzalloc(sizeof *sa_dev + - (e - s + 1) * sizeof (struct ib_sa_port), - GFP_KERNEL); + sa_dev = kzalloc(struct_size(sa_dev, port, e - s + 1), GFP_KERNEL); if (!sa_dev) return; diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c index 1efadbccf394..1ab423b19f77 100644 --- a/drivers/infiniband/core/security.c +++ b/drivers/infiniband/core/security.c @@ -39,22 +39,25 @@ #include "core_priv.h" #include "mad_priv.h" +static LIST_HEAD(mad_agent_list); +/* Lock to protect mad_agent_list */ +static DEFINE_SPINLOCK(mad_agent_list_lock); + static struct pkey_index_qp_list *get_pkey_idx_qp_list(struct ib_port_pkey *pp) { struct pkey_index_qp_list *pkey = NULL; struct pkey_index_qp_list *tmp_pkey; struct ib_device *dev = pp->sec->dev; - spin_lock(&dev->port_pkey_list[pp->port_num].list_lock); - list_for_each_entry(tmp_pkey, - &dev->port_pkey_list[pp->port_num].pkey_list, - pkey_index_list) { + spin_lock(&dev->port_data[pp->port_num].pkey_list_lock); + list_for_each_entry (tmp_pkey, &dev->port_data[pp->port_num].pkey_list, + pkey_index_list) { if (tmp_pkey->pkey_index == pp->pkey_index) { pkey = tmp_pkey; break; } } - spin_unlock(&dev->port_pkey_list[pp->port_num].list_lock); + spin_unlock(&dev->port_data[pp->port_num].pkey_list_lock); return pkey; } @@ -259,12 +262,12 @@ static int port_pkey_list_insert(struct ib_port_pkey *pp) if (!pkey) return -ENOMEM; - spin_lock(&dev->port_pkey_list[port_num].list_lock); + spin_lock(&dev->port_data[port_num].pkey_list_lock); /* Check for the PKey again. A racing process may * have created it. */ list_for_each_entry(tmp_pkey, - &dev->port_pkey_list[port_num].pkey_list, + &dev->port_data[port_num].pkey_list, pkey_index_list) { if (tmp_pkey->pkey_index == pp->pkey_index) { kfree(pkey); @@ -279,9 +282,9 @@ static int port_pkey_list_insert(struct ib_port_pkey *pp) spin_lock_init(&pkey->qp_list_lock); INIT_LIST_HEAD(&pkey->qp_list); list_add(&pkey->pkey_index_list, - &dev->port_pkey_list[port_num].pkey_list); + &dev->port_data[port_num].pkey_list); } - spin_unlock(&dev->port_pkey_list[port_num].list_lock); + spin_unlock(&dev->port_data[port_num].pkey_list_lock); } spin_lock(&pkey->qp_list_lock); @@ -418,12 +421,15 @@ void ib_close_shared_qp_security(struct ib_qp_security *sec) int ib_create_qp_security(struct ib_qp *qp, struct ib_device *dev) { - u8 i = rdma_start_port(dev); + unsigned int i; bool is_ib = false; int ret; - while (i <= rdma_end_port(dev) && !is_ib) + rdma_for_each_port (dev, i) { is_ib = rdma_protocol_ib(dev, i++); + if (is_ib) + break; + } /* If this isn't an IB device don't create the security context */ if (!is_ib) @@ -544,9 +550,8 @@ void ib_security_cache_change(struct ib_device *device, { struct pkey_index_qp_list *pkey; - list_for_each_entry(pkey, - &device->port_pkey_list[port_num].pkey_list, - pkey_index_list) { + list_for_each_entry (pkey, &device->port_data[port_num].pkey_list, + pkey_index_list) { check_pkey_qps(pkey, device, port_num, @@ -554,21 +559,19 @@ void ib_security_cache_change(struct ib_device *device, } } -void ib_security_destroy_port_pkey_list(struct ib_device *device) +void ib_security_release_port_pkey_list(struct ib_device *device) { struct pkey_index_qp_list *pkey, *tmp_pkey; - int i; + unsigned int i; - for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { - spin_lock(&device->port_pkey_list[i].list_lock); + rdma_for_each_port (device, i) { list_for_each_entry_safe(pkey, tmp_pkey, - &device->port_pkey_list[i].pkey_list, + &device->port_data[i].pkey_list, pkey_index_list) { list_del(&pkey->pkey_index_list); kfree(pkey); } - spin_unlock(&device->port_pkey_list[i].list_lock); } } @@ -676,19 +679,18 @@ static int ib_security_pkey_access(struct ib_device *dev, return security_ib_pkey_access(sec, subnet_prefix, pkey); } -static int ib_mad_agent_security_change(struct notifier_block *nb, - unsigned long event, - void *data) +void ib_mad_agent_security_change(void) { - struct ib_mad_agent *ag = container_of(nb, struct ib_mad_agent, lsm_nb); - - if (event != LSM_POLICY_CHANGE) - return NOTIFY_DONE; - - ag->smp_allowed = !security_ib_endport_manage_subnet( - ag->security, dev_name(&ag->device->dev), ag->port_num); - - return NOTIFY_OK; + struct ib_mad_agent *ag; + + spin_lock(&mad_agent_list_lock); + list_for_each_entry(ag, + &mad_agent_list, + mad_agent_sec_list) + WRITE_ONCE(ag->smp_allowed, + !security_ib_endport_manage_subnet(ag->security, + dev_name(&ag->device->dev), ag->port_num)); + spin_unlock(&mad_agent_list_lock); } int ib_mad_agent_security_setup(struct ib_mad_agent *agent, @@ -699,6 +701,8 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent, if (!rdma_protocol_ib(agent->device, agent->port_num)) return 0; + INIT_LIST_HEAD(&agent->mad_agent_sec_list); + ret = security_ib_alloc_security(&agent->security); if (ret) return ret; @@ -706,20 +710,22 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent, if (qp_type != IB_QPT_SMI) return 0; + spin_lock(&mad_agent_list_lock); ret = security_ib_endport_manage_subnet(agent->security, dev_name(&agent->device->dev), agent->port_num); if (ret) - return ret; + goto free_security; - agent->lsm_nb.notifier_call = ib_mad_agent_security_change; - ret = register_lsm_notifier(&agent->lsm_nb); - if (ret) - return ret; - - agent->smp_allowed = true; - agent->lsm_nb_reg = true; + WRITE_ONCE(agent->smp_allowed, true); + list_add(&agent->mad_agent_sec_list, &mad_agent_list); + spin_unlock(&mad_agent_list_lock); return 0; + +free_security: + spin_unlock(&mad_agent_list_lock); + security_ib_free_security(agent->security); + return ret; } void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent) @@ -727,9 +733,13 @@ void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent) if (!rdma_protocol_ib(agent->device, agent->port_num)) return; + if (agent->qp->qp_type == IB_QPT_SMI) { + spin_lock(&mad_agent_list_lock); + list_del(&agent->mad_agent_sec_list); + spin_unlock(&mad_agent_list_lock); + } + security_ib_free_security(agent->security); - if (agent->lsm_nb_reg) - unregister_lsm_notifier(&agent->lsm_nb); } int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index) @@ -738,7 +748,7 @@ int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index) return 0; if (map->agent.qp->qp_type == IB_QPT_SMI) { - if (!map->agent.smp_allowed) + if (!READ_ONCE(map->agent.smp_allowed)) return -EACCES; return 0; } diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 80f68eb0ba5c..9b6a065bdfa5 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1015,9 +1015,7 @@ err_free_stats: return; } -static int add_port(struct ib_device *device, int port_num, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)) +static int add_port(struct ib_device *device, int port_num) { struct ib_port *p; struct ib_port_attr attr; @@ -1113,8 +1111,8 @@ static int add_port(struct ib_device *device, int port_num, if (ret) goto err_free_pkey; - if (port_callback) { - ret = port_callback(device, port_num, &p->kobj); + if (device->ops.init_port) { + ret = device->ops.init_port(device, port_num, &p->kobj); if (ret) goto err_remove_pkey; } @@ -1189,7 +1187,7 @@ err_put: static ssize_t node_type_show(struct device *device, struct device_attribute *attr, char *buf) { - struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device *dev = rdma_device_to_ibdev(device); switch (dev->node_type) { case RDMA_NODE_IB_CA: return sprintf(buf, "%d: CA\n", dev->node_type); @@ -1206,7 +1204,7 @@ static DEVICE_ATTR_RO(node_type); static ssize_t sys_image_guid_show(struct device *device, struct device_attribute *dev_attr, char *buf) { - struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device *dev = rdma_device_to_ibdev(device); return sprintf(buf, "%04x:%04x:%04x:%04x\n", be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[0]), @@ -1219,7 +1217,7 @@ static DEVICE_ATTR_RO(sys_image_guid); static ssize_t node_guid_show(struct device *device, struct device_attribute *attr, char *buf) { - struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device *dev = rdma_device_to_ibdev(device); return sprintf(buf, "%04x:%04x:%04x:%04x\n", be16_to_cpu(((__be16 *) &dev->node_guid)[0]), @@ -1232,7 +1230,7 @@ static DEVICE_ATTR_RO(node_guid); static ssize_t node_desc_show(struct device *device, struct device_attribute *attr, char *buf) { - struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device *dev = rdma_device_to_ibdev(device); return sprintf(buf, "%.64s\n", dev->node_desc); } @@ -1241,7 +1239,7 @@ static ssize_t node_desc_store(struct device *device, struct device_attribute *attr, const char *buf, size_t count) { - struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device *dev = rdma_device_to_ibdev(device); struct ib_device_modify desc = {}; int ret; @@ -1260,7 +1258,7 @@ static DEVICE_ATTR_RW(node_desc); static ssize_t fw_ver_show(struct device *device, struct device_attribute *attr, char *buf) { - struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device *dev = rdma_device_to_ibdev(device); ib_get_device_fw_str(dev, buf); strlcat(buf, "\n", IB_FW_VERSION_NAME_MAX); @@ -1277,21 +1275,21 @@ static struct attribute *ib_dev_attrs[] = { NULL, }; -static const struct attribute_group dev_attr_group = { +const struct attribute_group ib_dev_attr_group = { .attrs = ib_dev_attrs, }; -static void free_port_list_attributes(struct ib_device *device) +static void ib_free_port_attrs(struct ib_device *device) { struct kobject *p, *t; list_for_each_entry_safe(p, t, &device->port_list, entry) { struct ib_port *port = container_of(p, struct ib_port, kobj); + list_del(&p->entry); - if (port->hw_stats) { - kfree(port->hw_stats); + if (port->hw_stats_ag) free_hsag(&port->kobj, port->hw_stats_ag); - } + kfree(port->hw_stats); if (port->pma_table) sysfs_remove_group(p, port->pma_table); @@ -1308,62 +1306,47 @@ static void free_port_list_attributes(struct ib_device *device) kobject_put(device->ports_kobj); } -int ib_device_register_sysfs(struct ib_device *device, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)) +static int ib_setup_port_attrs(struct ib_device *device) { - struct device *class_dev = &device->dev; + unsigned int port; int ret; - int i; - - device->groups[0] = &dev_attr_group; - class_dev->groups = device->groups; - ret = device_add(class_dev); - if (ret) - goto err; - - device->ports_kobj = kobject_create_and_add("ports", &class_dev->kobj); - if (!device->ports_kobj) { - ret = -ENOMEM; - goto err_put; - } + device->ports_kobj = kobject_create_and_add("ports", &device->dev.kobj); + if (!device->ports_kobj) + return -ENOMEM; - if (rdma_cap_ib_switch(device)) { - ret = add_port(device, 0, port_callback); + rdma_for_each_port (device, port) { + ret = add_port(device, port); if (ret) goto err_put; - } else { - for (i = 1; i <= device->phys_port_cnt; ++i) { - ret = add_port(device, i, port_callback); - if (ret) - goto err_put; - } } - if (device->ops.alloc_hw_stats) - setup_hw_stats(device, NULL, 0); - return 0; err_put: - free_port_list_attributes(device); - device_del(class_dev); -err: + ib_free_port_attrs(device); return ret; } -void ib_device_unregister_sysfs(struct ib_device *device) +int ib_device_register_sysfs(struct ib_device *device) { - /* Hold device until ib_dealloc_device() */ - get_device(&device->dev); + int ret; + + ret = ib_setup_port_attrs(device); + if (ret) + return ret; + + if (device->ops.alloc_hw_stats) + setup_hw_stats(device, NULL, 0); - free_port_list_attributes(device); + return 0; +} - if (device->hw_stats) { - kfree(device->hw_stats); +void ib_device_unregister_sysfs(struct ib_device *device) +{ + if (device->hw_stats_ag) free_hsag(&device->dev.kobj, device->hw_stats_ag); - } + kfree(device->hw_stats); - device_unregister(&device->dev); + ib_free_port_attrs(device); } diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 01d68ed46c1b..7468b26b8a01 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -1236,6 +1236,13 @@ static int ucma_set_option_id(struct ucma_context *ctx, int optname, } ret = rdma_set_afonly(ctx->cm_id, *((int *) optval) ? 1 : 0); break; + case RDMA_OPTION_ID_ACK_TIMEOUT: + if (optlen != sizeof(u8)) { + ret = -EINVAL; + break; + } + ret = rdma_set_ack_timeout(ctx->cm_id, *((u8 *)optval)); + break; default: ret = -ENOSYS; } diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index c6144df47ea4..fe5551562dbc 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -72,15 +72,16 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d * If access flags indicate ODP memory, avoid pinning. Instead, stores * the mm for future page fault handling in conjunction with MMU notifiers. * - * @context: userspace context to pin memory for + * @udata: userspace context to pin memory for * @addr: userspace virtual address to start at * @size: length of region to pin * @access: IB_ACCESS_xxx flags for memory being pinned * @dmasync: flush in-flight DMA when the memory region is written */ -struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, +struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, size_t size, int access, int dmasync) { + struct ib_ucontext *context; struct ib_umem *umem; struct page **page_list; struct vm_area_struct **vma_list; @@ -95,6 +96,14 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, struct scatterlist *sg, *sg_list_start; unsigned int gup_flags = FOLL_WRITE; + if (!udata) + return ERR_PTR(-EIO); + + context = container_of(udata, struct uverbs_attr_bundle, driver_udata) + ->context; + if (!context) + return ERR_PTR(-EIO); + if (dmasync) dma_attrs |= DMA_ATTR_WRITE_BARRIER; @@ -160,15 +169,12 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - down_write(&mm->mmap_sem); - if (check_add_overflow(mm->pinned_vm, npages, &new_pinned) || - (new_pinned > lock_limit && !capable(CAP_IPC_LOCK))) { - up_write(&mm->mmap_sem); + new_pinned = atomic64_add_return(npages, &mm->pinned_vm); + if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) { + atomic64_sub(npages, &mm->pinned_vm); ret = -ENOMEM; goto out; } - mm->pinned_vm = new_pinned; - up_write(&mm->mmap_sem); cur_base = addr & PAGE_MASK; @@ -228,9 +234,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, umem_release: __ib_umem_release(context->device, umem, 0); vma: - down_write(&mm->mmap_sem); - mm->pinned_vm -= ib_umem_num_pages(umem); - up_write(&mm->mmap_sem); + atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm); out: if (vma_list) free_page((unsigned long) vma_list); @@ -253,25 +257,12 @@ static void __ib_umem_release_tail(struct ib_umem *umem) kfree(umem); } -static void ib_umem_release_defer(struct work_struct *work) -{ - struct ib_umem *umem = container_of(work, struct ib_umem, work); - - down_write(&umem->owning_mm->mmap_sem); - umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem); - up_write(&umem->owning_mm->mmap_sem); - - __ib_umem_release_tail(umem); -} - /** * ib_umem_release - release memory pinned with ib_umem_get * @umem: umem struct to release */ void ib_umem_release(struct ib_umem *umem) { - struct ib_ucontext *context = umem->context; - if (umem->is_odp) { ib_umem_odp_release(to_ib_umem_odp(umem)); __ib_umem_release_tail(umem); @@ -280,26 +271,7 @@ void ib_umem_release(struct ib_umem *umem) __ib_umem_release(umem->context->device, umem, 1); - /* - * We may be called with the mm's mmap_sem already held. This - * can happen when a userspace munmap() is the call that drops - * the last reference to our file and calls our release - * method. If there are memory regions to destroy, we'll end - * up here and not be able to take the mmap_sem. In that case - * we defer the vm_locked accounting a workqueue. - */ - if (context->closing) { - if (!down_write_trylock(&umem->owning_mm->mmap_sem)) { - INIT_WORK(&umem->work, ib_umem_release_defer); - queue_work(ib_wq, &umem->work); - return; - } - } else { - down_write(&umem->owning_mm->mmap_sem); - } - umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem); - up_write(&umem->owning_mm->mmap_sem); - + atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm); __ib_umem_release_tail(umem); } EXPORT_SYMBOL(ib_umem_release); diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index a4ec43093cb3..e6ec79ad9cc8 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -40,6 +40,7 @@ #include <linux/vmalloc.h> #include <linux/hugetlb.h> #include <linux/interval_tree_generic.h> +#include <linux/pagemap.h> #include <rdma/ib_verbs.h> #include <rdma/ib_umem.h> @@ -299,7 +300,7 @@ static void free_per_mm(struct rcu_head *rcu) kfree(container_of(rcu, struct ib_ucontext_per_mm, rcu)); } -void put_per_mm(struct ib_umem_odp *umem_odp) +static void put_per_mm(struct ib_umem_odp *umem_odp) { struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; struct ib_ucontext *ctx = umem_odp->umem.context; @@ -332,9 +333,10 @@ void put_per_mm(struct ib_umem_odp *umem_odp) mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm); } -struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm, +struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root, unsigned long addr, size_t size) { + struct ib_ucontext_per_mm *per_mm = root->per_mm; struct ib_ucontext *ctx = per_mm->context; struct ib_umem_odp *odp_data; struct ib_umem *umem; @@ -349,9 +351,11 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm, umem->length = size; umem->address = addr; umem->page_shift = PAGE_SHIFT; - umem->writable = 1; + umem->writable = root->umem.writable; umem->is_odp = 1; odp_data->per_mm = per_mm; + umem->owning_mm = per_mm->mm; + mmgrab(umem->owning_mm); mutex_init(&odp_data->umem_mutex); init_completion(&odp_data->notifier_completion); @@ -384,6 +388,7 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm, out_page_list: vfree(odp_data->page_list); out_odp_data: + mmdrop(umem->owning_mm); kfree(odp_data); return ERR_PTR(ret); } @@ -614,7 +619,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, * mmget_not_zero will fail in this case. */ owning_process = get_pid_task(umem_odp->per_mm->tgid, PIDTYPE_PID); - if (WARN_ON(!mmget_not_zero(umem_odp->umem.owning_mm))) { + if (!owning_process || !mmget_not_zero(owning_mm)) { ret = -EINVAL; goto out_put_task; } @@ -681,9 +686,14 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, mutex_unlock(&umem_odp->umem_mutex); if (ret < 0) { - /* Release left over pages when handling errors. */ - for (++j; j < npages; ++j) - put_page(local_page_list[j]); + /* + * Release pages, remembering that the first page + * to hit an error was already released by + * ib_umem_odp_map_dma_single_page(). + */ + if (npages - (j + 1) > 0) + release_pages(&local_page_list[j+1], + npages - (j + 1)); break; } } diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index de8d31ab8945..02b7947ab215 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -957,19 +957,22 @@ static int ib_umad_open(struct inode *inode, struct file *filp) { struct ib_umad_port *port; struct ib_umad_file *file; - int ret = -ENXIO; + int ret = 0; port = container_of(inode->i_cdev, struct ib_umad_port, cdev); mutex_lock(&port->file_mutex); - if (!port->ib_dev) + if (!port->ib_dev) { + ret = -ENXIO; goto out; + } - ret = -ENOMEM; - file = kzalloc(sizeof *file, GFP_KERNEL); - if (!file) + file = kzalloc(sizeof(*file), GFP_KERNEL); + if (!file) { + ret = -ENOMEM; goto out; + } mutex_init(&file->mutex); spin_lock_init(&file->send_lock); @@ -982,14 +985,7 @@ static int ib_umad_open(struct inode *inode, struct file *filp) list_add_tail(&file->port_list, &port->file_list); - ret = nonseekable_open(inode, filp); - if (ret) { - list_del(&file->port_list); - kfree(file); - goto out; - } - - ib_umad_dev_get(port->umad_dev); + nonseekable_open(inode, filp); out: mutex_unlock(&port->file_mutex); return ret; @@ -998,7 +994,6 @@ out: static int ib_umad_close(struct inode *inode, struct file *filp) { struct ib_umad_file *file = filp->private_data; - struct ib_umad_device *dev = file->port->umad_dev; struct ib_umad_packet *packet, *tmp; int already_dead; int i; @@ -1027,7 +1022,6 @@ static int ib_umad_close(struct inode *inode, struct file *filp) mutex_unlock(&file->port->file_mutex); kfree(file); - ib_umad_dev_put(dev); return 0; } @@ -1073,17 +1067,9 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp) filp->private_data = port; - ret = nonseekable_open(inode, filp); - if (ret) - goto err_clr_sm_cap; - - ib_umad_dev_get(port->umad_dev); + nonseekable_open(inode, filp); return 0; -err_clr_sm_cap: - swap(props.set_port_cap_mask, props.clr_port_cap_mask); - ib_modify_port(port->ib_dev, port->port_num, 0, &props); - err_up_sem: up(&port->sm_sem); @@ -1106,7 +1092,6 @@ static int ib_umad_sm_close(struct inode *inode, struct file *filp) up(&port->sm_sem); - ib_umad_dev_put(port->umad_dev); return ret; } @@ -1283,10 +1268,12 @@ static void ib_umad_kill_port(struct ib_umad_port *port) mutex_unlock(&port->file_mutex); cdev_device_del(&port->sm_cdev, &port->sm_dev); - put_device(&port->sm_dev); cdev_device_del(&port->cdev, &port->dev); - put_device(&port->dev); ida_free(&umad_ida, port->dev_num); + + /* balances device_initialize() */ + put_device(&port->sm_dev); + put_device(&port->dev); } static void ib_umad_add_one(struct ib_device *device) @@ -1329,21 +1316,24 @@ err: ib_umad_kill_port(&umad_dev->ports[i - s]); } free: + /* balances kref_init */ ib_umad_dev_put(umad_dev); } static void ib_umad_remove_one(struct ib_device *device, void *client_data) { struct ib_umad_device *umad_dev = client_data; - int i; + unsigned int i; if (!umad_dev) return; - for (i = 0; i <= rdma_end_port(device) - rdma_start_port(device); ++i) { - if (rdma_cap_ib_mad(device, i + rdma_start_port(device))) - ib_umad_kill_port(&umad_dev->ports[i]); + rdma_for_each_port (device, i) { + if (rdma_cap_ib_mad(device, i)) + ib_umad_kill_port( + &umad_dev->ports[i - rdma_start_port(device)]); } + /* balances kref_init() */ ib_umad_dev_put(umad_dev); } diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 6b12cc5f97b2..062a86c04123 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -60,6 +60,10 @@ static int uverbs_response(struct uverbs_attr_bundle *attrs, const void *resp, { int ret; + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CORE_OUT)) + return uverbs_copy_to_struct_or_zero( + attrs, UVERBS_ATTR_CORE_OUT, resp, resp_len); + if (copy_to_user(attrs->ucore.outbuf, resp, min(attrs->ucore.outlen, resp_len))) return -EFAULT; @@ -220,12 +224,13 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs) if (ret) goto err; - ucontext = ib_dev->ops.alloc_ucontext(ib_dev, &attrs->driver_udata); - if (IS_ERR(ucontext)) { - ret = PTR_ERR(ucontext); + ucontext = rdma_zalloc_drv_obj(ib_dev, ib_ucontext); + if (!ucontext) { + ret = -ENOMEM; goto err_alloc; } + ucontext->res.type = RDMA_RESTRACK_CTX; ucontext->device = ib_dev; ucontext->cg_obj = cg_obj; /* ufile is required when some objects are released */ @@ -234,15 +239,8 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs) ucontext->closing = false; ucontext->cleanup_retryable = false; -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING mutex_init(&ucontext->per_mm_list_lock); INIT_LIST_HEAD(&ucontext->per_mm_list); - if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) - ucontext->invalidate_range = NULL; - -#endif - - resp.num_comp_vectors = file->device->num_comp_vectors; ret = get_unused_fd_flags(O_CLOEXEC); if (ret < 0) @@ -255,15 +253,22 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs) goto err_fd; } + resp.num_comp_vectors = file->device->num_comp_vectors; + ret = uverbs_response(attrs, &resp, sizeof(resp)); if (ret) goto err_file; - fd_install(resp.async_fd, filp); + ret = ib_dev->ops.alloc_ucontext(ucontext, &attrs->driver_udata); + if (ret) + goto err_file; + if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) + ucontext->invalidate_range = NULL; - ucontext->res.type = RDMA_RESTRACK_CTX; rdma_restrack_uadd(&ucontext->res); + fd_install(resp.async_fd, filp); + /* * Make sure that ib_uverbs_get_ucontext() sees the pointer update * only after all writes to setup the ucontext have completed @@ -282,7 +287,7 @@ err_fd: put_unused_fd(resp.async_fd); err_free: - ib_dev->ops.dealloc_ucontext(ucontext); + kfree(ucontext); err_alloc: ib_rdmacg_uncharge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE); @@ -406,9 +411,9 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs) if (IS_ERR(uobj)) return PTR_ERR(uobj); - pd = ib_dev->ops.alloc_pd(ib_dev, uobj->context, &attrs->driver_udata); - if (IS_ERR(pd)) { - ret = PTR_ERR(pd); + pd = rdma_zalloc_drv_obj(ib_dev, ib_pd); + if (!pd) { + ret = -ENOMEM; goto err; } @@ -416,11 +421,15 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs) pd->uobject = uobj; pd->__internal_mr = NULL; atomic_set(&pd->usecnt, 0); + pd->res.type = RDMA_RESTRACK_PD; + + ret = ib_dev->ops.alloc_pd(pd, uobj->context, &attrs->driver_udata); + if (ret) + goto err_alloc; uobj->object = pd; memset(&resp, 0, sizeof resp); resp.pd_handle = uobj->id; - pd->res.type = RDMA_RESTRACK_PD; rdma_restrack_uadd(&pd->res); ret = uverbs_response(attrs, &resp, sizeof(resp)); @@ -431,7 +440,9 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs) err_copy: ib_dealloc_pd(pd); - + pd = NULL; +err_alloc: + kfree(pd); err: uobj_alloc_abort(uobj); return ret; @@ -818,14 +829,13 @@ static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs) cmd.length, cmd.hca_va, cmd.access_flags, pd, &attrs->driver_udata); - if (!ret) { - if (cmd.flags & IB_MR_REREG_PD) { - atomic_inc(&pd->usecnt); - mr->pd = pd; - atomic_dec(&old_pd->usecnt); - } - } else { + if (ret) goto put_uobj_pd; + + if (cmd.flags & IB_MR_REREG_PD) { + atomic_inc(&pd->usecnt); + mr->pd = pd; + atomic_dec(&old_pd->usecnt); } memset(&resp, 0, sizeof(resp)); @@ -880,6 +890,11 @@ static int ib_uverbs_alloc_mw(struct uverbs_attr_bundle *attrs) goto err_free; } + if (cmd.mw_type != IB_MW_TYPE_1 && cmd.mw_type != IB_MW_TYPE_2) { + ret = -EINVAL; + goto err_put; + } + mw = pd->device->ops.alloc_mw(pd, cmd.mw_type, &attrs->driver_udata); if (IS_ERR(mw)) { ret = PTR_ERR(mw); @@ -1180,9 +1195,11 @@ static int ib_uverbs_poll_cq(struct uverbs_attr_bundle *attrs) ret = -EFAULT; goto out_put; } - ret = 0; + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CORE_OUT)) + ret = uverbs_output_written(attrs, UVERBS_ATTR_CORE_OUT); + out_put: uobj_put_obj_read(cq); return ret; @@ -2012,8 +2029,10 @@ static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs) return -ENOMEM; qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); - if (!qp) + if (!qp) { + ret = -EINVAL; goto out; + } is_ud = qp->qp_type == IB_QPT_UD; sg_ind = 0; @@ -2623,7 +2642,7 @@ void flow_resources_add(struct ib_uflow_resources *uflow_res, } EXPORT_SYMBOL(flow_resources_add); -static int kern_spec_to_ib_spec_action(const struct uverbs_attr_bundle *attrs, +static int kern_spec_to_ib_spec_action(struct uverbs_attr_bundle *attrs, struct ib_uverbs_flow_spec *kern_spec, union ib_flow_spec *ib_spec, struct ib_uflow_resources *uflow_res) @@ -3609,7 +3628,6 @@ static int ib_uverbs_ex_query_device(struct uverbs_attr_bundle *attrs) copy_query_dev_fields(ucontext, &resp.base, &attr); -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING resp.odp_caps.general_caps = attr.odp_caps.general_caps; resp.odp_caps.per_transport_caps.rc_odp_caps = attr.odp_caps.per_transport_caps.rc_odp_caps; @@ -3617,7 +3635,7 @@ static int ib_uverbs_ex_query_device(struct uverbs_attr_bundle *attrs) attr.odp_caps.per_transport_caps.uc_odp_caps; resp.odp_caps.per_transport_caps.ud_odp_caps = attr.odp_caps.per_transport_caps.ud_odp_caps; -#endif + resp.xrc_odp_caps = attr.odp_caps.per_transport_caps.xrc_odp_caps; resp.timestamp_mask = attr.timestamp_mask; resp.hca_core_clock = attr.hca_core_clock; diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 8c81ff698052..e1379949e663 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -144,6 +144,21 @@ static bool uverbs_is_attr_cleared(const struct ib_uverbs_attr *uattr, 0, uattr->len - len); } +static int uverbs_set_output(const struct uverbs_attr_bundle *bundle, + const struct uverbs_attr *attr) +{ + struct bundle_priv *pbundle = + container_of(bundle, struct bundle_priv, bundle); + u16 flags; + + flags = pbundle->uattrs[attr->ptr_attr.uattr_idx].flags | + UVERBS_ATTR_F_VALID_OUTPUT; + if (put_user(flags, + &pbundle->user_attrs[attr->ptr_attr.uattr_idx].flags)) + return -EFAULT; + return 0; +} + static int uverbs_process_idrs_array(struct bundle_priv *pbundle, const struct uverbs_api_attr *attr_uapi, struct uverbs_objs_arr_attr *attr, @@ -198,6 +213,7 @@ static int uverbs_process_idrs_array(struct bundle_priv *pbundle, ret = PTR_ERR(attr->uobjects[i]); break; } + pbundle->bundle.context = attr->uobjects[i]->context; } attr->len = i; @@ -315,6 +331,7 @@ static int uverbs_process_attr(struct bundle_priv *pbundle, uattr->data_s64); if (IS_ERR(o_attr->uobject)) return PTR_ERR(o_attr->uobject); + pbundle->bundle.context = o_attr->uobject->context; __set_bit(attr_bkey, pbundle->uobj_finalize); if (spec->u.obj.access == UVERBS_ACCESS_NEW) { @@ -456,6 +473,19 @@ static int ib_uverbs_run_method(struct bundle_priv *pbundle, } /* + * Until the drivers are revised to use the bundle directly we have to + * assume that the driver wrote to its UHW_OUT and flag userspace + * appropriately. + */ + if (!ret && pbundle->method_elm->has_udata) { + const struct uverbs_attr *attr = + uverbs_attr_get(&pbundle->bundle, UVERBS_ATTR_UHW_OUT); + + if (!IS_ERR(attr)) + ret = uverbs_set_output(&pbundle->bundle, attr); + } + + /* * EPROTONOSUPPORT is ONLY to be returned if the ioctl framework can * not invoke the method because the request is not supported. No * other cases should return this code. @@ -564,6 +594,7 @@ static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile, pbundle->method_elm = method_elm; pbundle->method_key = attrs_iter.index; pbundle->bundle.ufile = ufile; + pbundle->bundle.context = NULL; /* only valid if bundle has uobject */ pbundle->radix = &uapi->radix; pbundle->radix_slots = slot; pbundle->radix_slots_len = radix_tree_chunk_size(&attrs_iter); @@ -706,10 +737,7 @@ void uverbs_fill_udata(struct uverbs_attr_bundle *bundle, int uverbs_copy_to(const struct uverbs_attr_bundle *bundle, size_t idx, const void *from, size_t size) { - struct bundle_priv *pbundle = - container_of(bundle, struct bundle_priv, bundle); const struct uverbs_attr *attr = uverbs_attr_get(bundle, idx); - u16 flags; size_t min_size; if (IS_ERR(attr)) @@ -719,16 +747,25 @@ int uverbs_copy_to(const struct uverbs_attr_bundle *bundle, size_t idx, if (copy_to_user(u64_to_user_ptr(attr->ptr_attr.data), from, min_size)) return -EFAULT; - flags = pbundle->uattrs[attr->ptr_attr.uattr_idx].flags | - UVERBS_ATTR_F_VALID_OUTPUT; - if (put_user(flags, - &pbundle->user_attrs[attr->ptr_attr.uattr_idx].flags)) - return -EFAULT; - - return 0; + return uverbs_set_output(bundle, attr); } EXPORT_SYMBOL(uverbs_copy_to); + +/* + * This is only used if the caller has directly used copy_to_use to write the + * data. It signals to user space that the buffer is filled in. + */ +int uverbs_output_written(const struct uverbs_attr_bundle *bundle, size_t idx) +{ + const struct uverbs_attr *attr = uverbs_attr_get(bundle, idx); + + if (IS_ERR(attr)) + return PTR_ERR(attr); + + return uverbs_set_output(bundle, attr); +} + int _uverbs_get_const(s64 *to, const struct uverbs_attr_bundle *attrs_bundle, size_t idx, s64 lower_bound, u64 upper_bound, s64 *def_val) @@ -757,8 +794,10 @@ int uverbs_copy_to_struct_or_zero(const struct uverbs_attr_bundle *bundle, { const struct uverbs_attr *attr = uverbs_attr_get(bundle, idx); - if (clear_user(u64_to_user_ptr(attr->ptr_attr.data), - attr->ptr_attr.len)) - return -EFAULT; + if (size < attr->ptr_attr.len) { + if (clear_user(u64_to_user_ptr(attr->ptr_attr.data) + size, + attr->ptr_attr.len - size)) + return -EFAULT; + } return uverbs_copy_to(bundle, idx, from, size); } diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index fb0007aa0c27..70b7d80431a9 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -204,6 +204,9 @@ void ib_uverbs_release_file(struct kref *ref) if (atomic_dec_and_test(&file->device->refcount)) ib_uverbs_comp_dev(file->device); + if (file->async_file) + kref_put(&file->async_file->ref, + ib_uverbs_release_async_event_file); put_device(&file->device->dev); kfree(file); } @@ -690,7 +693,9 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, buf += sizeof(hdr); + memset(bundle.attr_present, 0, sizeof(bundle.attr_present)); bundle.ufile = file; + bundle.context = NULL; /* only valid if bundle has uobject */ if (!method_elm->is_ex) { size_t in_len = hdr.in_words * 4 - sizeof(hdr); size_t out_len = hdr.out_words * 4; @@ -963,11 +968,19 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) /* Get an arbitrary mm pointer that hasn't been cleaned yet */ mutex_lock(&ufile->umap_lock); - if (!list_empty(&ufile->umaps)) { - mm = list_first_entry(&ufile->umaps, - struct rdma_umap_priv, list) - ->vma->vm_mm; - mmget(mm); + while (!list_empty(&ufile->umaps)) { + int ret; + + priv = list_first_entry(&ufile->umaps, + struct rdma_umap_priv, list); + mm = priv->vma->vm_mm; + ret = mmget_not_zero(mm); + if (!ret) { + list_del_init(&priv->list); + mm = NULL; + continue; + } + break; } mutex_unlock(&ufile->umap_lock); if (!mm) @@ -1095,10 +1108,6 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp) list_del_init(&file->list); mutex_unlock(&file->device->lists_mutex); - if (file->async_file) - kref_put(&file->async_file->ref, - ib_uverbs_release_async_event_file); - kref_put(&file->ref, ib_uverbs_release_file); return 0; @@ -1127,6 +1136,7 @@ static const struct file_operations uverbs_mmap_fops = { static struct ib_client uverbs_client = { .name = "uverbs", + .no_kverbs_req = true, .add = ib_uverbs_add_one, .remove = ib_uverbs_remove_one }; diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index cbc72312eb41..f224cb727224 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -188,7 +188,7 @@ static int uverbs_free_pd(struct ib_uobject *uobject, if (ret) return ret; - ib_dealloc_pd((struct ib_pd *)uobject->object); + ib_dealloc_pd(pd); return 0; } diff --git a/drivers/infiniband/core/uverbs_std_types_device.c b/drivers/infiniband/core/uverbs_std_types_device.c index 5030ec480370..2a3f2f01028d 100644 --- a/drivers/infiniband/core/uverbs_std_types_device.c +++ b/drivers/infiniband/core/uverbs_std_types_device.c @@ -168,12 +168,18 @@ void copy_port_attr_to_resp(struct ib_port_attr *attr, static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_PORT)( struct uverbs_attr_bundle *attrs) { - struct ib_device *ib_dev = attrs->ufile->device->ib_dev; + struct ib_device *ib_dev; struct ib_port_attr attr = {}; struct ib_uverbs_query_port_resp_ex resp = {}; + struct ib_ucontext *ucontext; int ret; u8 port_num; + ucontext = ib_uverbs_get_ucontext(attrs); + if (IS_ERR(ucontext)) + return PTR_ERR(ucontext); + ib_dev = ucontext->device; + /* FIXME: Extend the UAPI_DEF_OBJ_NEEDS_FN stuff.. */ if (!ib_dev->ops.query_port) return -EOPNOTSUPP; diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c index 9ae08e4b78a3..7a987acf0c0b 100644 --- a/drivers/infiniband/core/uverbs_uapi.c +++ b/drivers/infiniband/core/uverbs_uapi.c @@ -188,13 +188,18 @@ static int uapi_merge_obj_tree(struct uverbs_api *uapi, obj_elm->type_attrs = obj->type_attrs; obj_elm->type_class = obj->type_attrs->type_class; /* - * Today drivers are only permitted to use idr_class - * types. They cannot use FD types because we currently have - * no way to revoke the fops pointer after device - * disassociation. + * Today drivers are only permitted to use idr_class and + * fd_class types. We can revoke the IDR types during + * disassociation, and the FD types require the driver to use + * struct file_operations.owner to prevent the driver module + * code from unloading while the file is open. This provides + * enough safety that uverbs_close_fd() will continue to work. + * Drivers using FD are responsible to handle disassociation of + * the device on their own. */ if (WARN_ON(is_driver && - obj->type_attrs->type_class != &uverbs_idr_class)) + obj->type_attrs->type_class != &uverbs_idr_class && + obj->type_attrs->type_class != &uverbs_fd_class)) return -EINVAL; } diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index ac011836bb54..5a5e83f5f0fc 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -254,10 +254,11 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, { struct ib_pd *pd; int mr_access_flags = 0; + int ret; - pd = device->ops.alloc_pd(device, NULL, NULL); - if (IS_ERR(pd)) - return pd; + pd = rdma_zalloc_drv_obj(device, ib_pd); + if (!pd) + return ERR_PTR(-ENOMEM); pd->device = device; pd->uobject = NULL; @@ -265,6 +266,16 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, atomic_set(&pd->usecnt, 0); pd->flags = flags; + pd->res.type = RDMA_RESTRACK_PD; + rdma_restrack_set_task(&pd->res, caller); + + ret = device->ops.alloc_pd(pd, NULL, NULL); + if (ret) { + kfree(pd); + return ERR_PTR(ret); + } + rdma_restrack_kadd(&pd->res); + if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) pd->local_dma_lkey = device->local_dma_lkey; else @@ -275,10 +286,6 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, mr_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE; } - pd->res.type = RDMA_RESTRACK_PD; - rdma_restrack_set_task(&pd->res, caller); - rdma_restrack_kadd(&pd->res); - if (mr_access_flags) { struct ib_mr *mr; @@ -329,10 +336,8 @@ void ib_dealloc_pd(struct ib_pd *pd) WARN_ON(atomic_read(&pd->usecnt)); rdma_restrack_del(&pd->res); - /* Making delalloc_pd a void return is a WIP, no driver should return - an error here. */ - ret = pd->device->ops.dealloc_pd(pd); - WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd"); + pd->device->ops.dealloc_pd(pd); + kfree(pd); } EXPORT_SYMBOL(ib_dealloc_pd); @@ -1106,8 +1111,8 @@ struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, } EXPORT_SYMBOL(ib_open_qp); -static struct ib_qp *ib_create_xrc_qp(struct ib_qp *qp, - struct ib_qp_init_attr *qp_init_attr) +static struct ib_qp *create_xrc_qp(struct ib_qp *qp, + struct ib_qp_init_attr *qp_init_attr) { struct ib_qp *real_qp = qp; @@ -1122,10 +1127,10 @@ static struct ib_qp *ib_create_xrc_qp(struct ib_qp *qp, qp = __ib_open_qp(real_qp, qp_init_attr->event_handler, qp_init_attr->qp_context); - if (!IS_ERR(qp)) - __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp); - else - real_qp->device->ops.destroy_qp(real_qp); + if (IS_ERR(qp)) + return qp; + + __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp); return qp; } @@ -1156,10 +1161,8 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, return qp; ret = ib_create_qp_security(qp, device); - if (ret) { - ib_destroy_qp(qp); - return ERR_PTR(ret); - } + if (ret) + goto err; qp->real_qp = qp; qp->qp_type = qp_init_attr->qp_type; @@ -1172,8 +1175,15 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, INIT_LIST_HEAD(&qp->sig_mrs); qp->port = 0; - if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) - return ib_create_xrc_qp(qp, qp_init_attr); + if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) { + struct ib_qp *xrc_qp = create_xrc_qp(qp, qp_init_attr); + + if (IS_ERR(xrc_qp)) { + ret = PTR_ERR(xrc_qp); + goto err; + } + return xrc_qp; + } qp->event_handler = qp_init_attr->event_handler; qp->qp_context = qp_init_attr->qp_context; @@ -1200,11 +1210,8 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, if (qp_init_attr->cap.max_rdma_ctxs) { ret = rdma_rw_init_mrs(qp, qp_init_attr); - if (ret) { - pr_err("failed to init MR pool ret= %d\n", ret); - ib_destroy_qp(qp); - return ERR_PTR(ret); - } + if (ret) + goto err; } /* @@ -1217,6 +1224,11 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, device->attrs.max_sge_rd); return qp; + +err: + ib_destroy_qp(qp); + return ERR_PTR(ret); + } EXPORT_SYMBOL(ib_create_qp); @@ -1711,10 +1723,7 @@ int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u8 *speed, u8 *width) if (rdma_port_get_link_layer(dev, port_num) != IB_LINK_LAYER_ETHERNET) return -EINVAL; - if (!dev->ops.get_netdev) - return -EOPNOTSUPP; - - netdev = dev->ops.get_netdev(dev, port_num); + netdev = ib_device_get_netdev(dev, port_num); if (!netdev) return -ENODEV; |