Diffstat (limited to 'drivers/infiniband/core')
66 files changed, 8891 insertions, 6728 deletions
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index d1b14887960e..8ab4eea5a0a5 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -8,16 +8,16 @@ obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o $(user_access-y) ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ - device.o fmr_pool.o cache.o netlink.o \ + device.o cache.o netlink.o \ roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \ multicast.o mad.o smi.o agent.o mad_rmpp.o \ nldev.o restrack.o counters.o ib_core_uverbs.o \ - trace.o + trace.o lag.o ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o -ib_cm-y := cm.o +ib_cm-y := cm.o cm_trace.o iw_cm-y := iwcm.o iwpm_util.o iwpm_msg.o @@ -36,6 +36,9 @@ ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ uverbs_std_types_flow_action.o uverbs_std_types_dm.o \ uverbs_std_types_mr.o uverbs_std_types_counters.o \ uverbs_uapi.o uverbs_std_types_device.o \ - uverbs_std_types_async_fd.o -ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o + uverbs_std_types_async_fd.o \ + uverbs_std_types_srq.o \ + uverbs_std_types_wq.o \ + uverbs_std_types_qp.o +ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o umem_dmabuf.o ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 1753a9801b70..f253295795f0 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -37,7 +37,6 @@ #include <linux/inetdevice.h> #include <linux/slab.h> #include <linux/workqueue.h> -#include <linux/module.h> #include <net/arp.h> #include <net/neighbour.h> #include <net/route.h> @@ -76,7 +75,9 @@ static struct workqueue_struct *addr_wq; static const struct nla_policy ib_nl_addr_policy[LS_NLA_TYPE_MAX] = { [LS_NLA_TYPE_DGID] = {.type = NLA_BINARY, - .len = sizeof(struct rdma_nla_ls_gid)}, + .len = sizeof(struct rdma_nla_ls_gid), + .validation_type = NLA_VALIDATE_MIN, + .min = sizeof(struct rdma_nla_ls_gid)}, }; static inline bool ib_nl_is_good_ip_resp(const struct nlmsghdr *nlh) @@ -371,6 +372,8 @@ static int fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr, (const void *)&dst_in6->sin6_addr; sa_family_t family = dst_in->sa_family; + might_sleep(); + /* If we have a gateway in IB mode then it must be an IB network */ if (has_gateway(dst, family) && dev_addr->network == RDMA_NETWORK_IB) return ib_nl_fetch_ha(dev_addr, daddr, seq, family); @@ -645,13 +648,12 @@ static void process_one_req(struct work_struct *_work) req->callback = NULL; spin_lock_bh(&lock); + /* + * Although the work will normally have been canceled by the workqueue, + * it can still be requeued as long as it is on the req_list. + */ + cancel_delayed_work(&req->work); if (!list_empty(&req->list)) { - /* - * Although the work will normally have been canceled by the - * workqueue, it can still be requeued as long as it is on the - * req_list. - */ - cancel_delayed_work(&req->work); list_del_init(&req->list); kfree(req); } @@ -727,6 +729,8 @@ int roce_resolve_route_from_path(struct sa_path_rec *rec, struct rdma_dev_addr dev_addr = {}; int ret; + might_sleep(); + if (rec->roce.route_resolved) return 0; diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 17bfedd24cc3..4084d05a4510 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -33,7 +33,7 @@ * SOFTWARE. 
*/ -#include <linux/module.h> +#include <linux/if_vlan.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/workqueue.h> @@ -46,7 +46,7 @@ struct ib_pkey_cache { int table_len; - u16 table[0]; + u16 table[]; }; struct ib_update_work { @@ -121,7 +121,7 @@ struct ib_gid_table { u32 default_gid_indices; }; -static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port) +static void dispatch_gid_change_event(struct ib_device *ib_dev, u32 port) { struct ib_event event; @@ -133,7 +133,11 @@ static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port) } static const char * const gid_type_str[] = { + /* IB/RoCE v1 value is set for IB_GID_TYPE_IB and IB_GID_TYPE_ROCE for + * user space compatibility reasons. + */ [IB_GID_TYPE_IB] = "IB/RoCE v1", + [IB_GID_TYPE_ROCE] = "IB/RoCE v1", [IB_GID_TYPE_ROCE_UDP_ENCAP] = "RoCE v2", }; @@ -193,7 +197,7 @@ int ib_cache_gid_parse_type_str(const char *buf) } EXPORT_SYMBOL(ib_cache_gid_parse_type_str); -static struct ib_gid_table *rdma_gid_table(struct ib_device *device, u8 port) +static struct ib_gid_table *rdma_gid_table(struct ib_device *device, u32 port) { return device->port_data[port].cache.gid; } @@ -233,10 +237,10 @@ static void put_gid_ndev(struct rcu_head *head) static void free_gid_entry_locked(struct ib_gid_table_entry *entry) { struct ib_device *device = entry->attr.device; - u8 port_num = entry->attr.port_num; + u32 port_num = entry->attr.port_num; struct ib_gid_table *table = rdma_gid_table(device, port_num); - dev_dbg(&device->dev, "%s port=%d index=%d gid %pI6\n", __func__, + dev_dbg(&device->dev, "%s port=%u index=%u gid %pI6\n", __func__, port_num, entry->attr.index, entry->attr.gid.raw); write_lock_irq(&table->rwlock); @@ -278,7 +282,7 @@ static void free_gid_work(struct work_struct *work) struct ib_gid_table_entry *entry = container_of(work, struct ib_gid_table_entry, del_work); struct ib_device *device = entry->attr.device; - u8 port_num = entry->attr.port_num; + u32 port_num = entry->attr.port_num; struct ib_gid_table *table = rdma_gid_table(device, port_num); mutex_lock(&table->lock); @@ -319,7 +323,7 @@ static void store_gid_entry(struct ib_gid_table *table, { entry->state = GID_TABLE_ENTRY_VALID; - dev_dbg(&entry->attr.device->dev, "%s port=%d index=%d gid %pI6\n", + dev_dbg(&entry->attr.device->dev, "%s port=%u index=%u gid %pI6\n", __func__, entry->attr.port_num, entry->attr.index, entry->attr.gid.raw); @@ -350,7 +354,7 @@ static int add_roce_gid(struct ib_gid_table_entry *entry) int ret; if (!attr->ndev) { - dev_err(&attr->device->dev, "%s NULL netdev port=%d index=%d\n", + dev_err(&attr->device->dev, "%s NULL netdev port=%u index=%u\n", __func__, attr->port_num, attr->index); return -EINVAL; } @@ -358,7 +362,7 @@ static int add_roce_gid(struct ib_gid_table_entry *entry) ret = attr->device->ops.add_gid(attr, &entry->context); if (ret) { dev_err(&attr->device->dev, - "%s GID add failed port=%d index=%d\n", + "%s GID add failed port=%u index=%u\n", __func__, attr->port_num, attr->index); return ret; } @@ -375,7 +379,7 @@ static int add_roce_gid(struct ib_gid_table_entry *entry) * @ix: GID entry index to delete * */ -static void del_gid(struct ib_device *ib_dev, u8 port, +static void del_gid(struct ib_device *ib_dev, u32 port, struct ib_gid_table *table, int ix) { struct roce_gid_ndev_storage *ndev_storage; @@ -383,7 +387,7 @@ static void del_gid(struct ib_device *ib_dev, u8 port, lockdep_assert_held(&table->lock); - dev_dbg(&ib_dev->dev, "%s port=%d index=%d gid %pI6\n", __func__, port, + 
dev_dbg(&ib_dev->dev, "%s port=%u index=%d gid %pI6\n", __func__, port, ix, table->data_vec[ix]->attr.gid.raw); write_lock_irq(&table->rwlock); @@ -539,7 +543,7 @@ static void make_default_gid(struct net_device *dev, union ib_gid *gid) addrconf_ifid_eui48(&gid->raw[8], dev); } -static int __ib_cache_gid_add(struct ib_device *ib_dev, u8 port, +static int __ib_cache_gid_add(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr, unsigned long mask, bool default_gid) { @@ -583,7 +587,7 @@ out_unlock: return ret; } -int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, +int ib_cache_gid_add(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr) { unsigned long mask = GID_ATTR_FIND_MASK_GID | @@ -594,7 +598,7 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, } static int -_ib_cache_gid_del(struct ib_device *ib_dev, u8 port, +_ib_cache_gid_del(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr, unsigned long mask, bool default_gid) { @@ -623,7 +627,7 @@ out_unlock: return ret; } -int ib_cache_gid_del(struct ib_device *ib_dev, u8 port, +int ib_cache_gid_del(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr) { unsigned long mask = GID_ATTR_FIND_MASK_GID | @@ -634,7 +638,7 @@ int ib_cache_gid_del(struct ib_device *ib_dev, u8 port, return _ib_cache_gid_del(ib_dev, port, gid, attr, mask, false); } -int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, +int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { struct ib_gid_table *table; @@ -665,11 +669,10 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, * rdma_find_gid_by_port - Returns the GID entry attributes when it finds * a valid GID entry for given search parameters. It searches for the specified * GID value in the local software cache. - * @device: The device to query. + * @ib_dev: The device to query. * @gid: The GID value to search for. * @gid_type: The GID type to search for. - * @port_num: The port number of the device where the GID value should be - * searched. + * @port: The port number of the device where the GID value should be searched. * @ndev: In RoCE, the net device of the device. NULL means ignore. * * Returns sgid attributes if the GID is found with valid reference or @@ -680,7 +683,7 @@ const struct ib_gid_attr * rdma_find_gid_by_port(struct ib_device *ib_dev, const union ib_gid *gid, enum ib_gid_type gid_type, - u8 port, struct net_device *ndev) + u32 port, struct net_device *ndev) { int local_index; struct ib_gid_table *table; @@ -715,7 +718,7 @@ EXPORT_SYMBOL(rdma_find_gid_by_port); /** * rdma_find_gid_by_filter - Returns the GID table attribute where a * specified GID value occurs - * @device: The device to query. + * @ib_dev: The device to query. * @gid: The GID value to search for. * @port: The port number of the device where the GID value could be * searched. @@ -724,13 +727,14 @@ EXPORT_SYMBOL(rdma_find_gid_by_port); * otherwise, we continue searching the GID table. It's guaranteed that * while filter is executed, ndev field is valid and the structure won't * change. filter is executed in an atomic context. filter must not be NULL. + * @context: Private data to pass into the call-back. * * rdma_find_gid_by_filter() searches for the specified GID value * of which the filter function returns true in the port's GID table. 
* */ const struct ib_gid_attr *rdma_find_gid_by_filter( - struct ib_device *ib_dev, const union ib_gid *gid, u8 port, + struct ib_device *ib_dev, const union ib_gid *gid, u32 port, bool (*filter)(const union ib_gid *gid, const struct ib_gid_attr *, void *), void *context) @@ -801,7 +805,7 @@ static void release_gid_table(struct ib_device *device, continue; if (kref_read(&table->data_vec[i]->kref) > 1) { dev_err(&device->dev, - "GID entry ref leak for index %d ref=%d\n", i, + "GID entry ref leak for index %d ref=%u\n", i, kref_read(&table->data_vec[i]->kref)); leak = true; } @@ -814,7 +818,7 @@ static void release_gid_table(struct ib_device *device, kfree(table); } -static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port, +static void cleanup_gid_table_port(struct ib_device *ib_dev, u32 port, struct ib_gid_table *table) { int i; @@ -830,7 +834,7 @@ static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port, mutex_unlock(&table->lock); } -void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, +void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u32 port, struct net_device *ndev, unsigned long gid_type_mask, enum ib_cache_gid_default_mode mode) @@ -863,7 +867,7 @@ void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, } } -static void gid_table_reserve_default(struct ib_device *ib_dev, u8 port, +static void gid_table_reserve_default(struct ib_device *ib_dev, u32 port, struct ib_gid_table *table) { unsigned int i; @@ -880,7 +884,7 @@ static void gid_table_reserve_default(struct ib_device *ib_dev, u8 port, static void gid_table_release_one(struct ib_device *ib_dev) { - unsigned int p; + u32 p; rdma_for_each_port (ib_dev, p) { release_gid_table(ib_dev, ib_dev->port_data[p].cache.gid); @@ -891,7 +895,7 @@ static void gid_table_release_one(struct ib_device *ib_dev) static int _gid_table_setup_one(struct ib_device *ib_dev) { struct ib_gid_table *table; - unsigned int rdma_port; + u32 rdma_port; rdma_for_each_port (ib_dev, rdma_port) { table = alloc_gid_table( @@ -911,7 +915,7 @@ rollback_table_setup: static void gid_table_cleanup_one(struct ib_device *ib_dev) { - unsigned int p; + u32 p; rdma_for_each_port (ib_dev, p) cleanup_gid_table_port(ib_dev, p, @@ -946,12 +950,12 @@ static int gid_table_setup_one(struct ib_device *ib_dev) * Returns 0 on success or appropriate error code. * */ -int rdma_query_gid(struct ib_device *device, u8 port_num, +int rdma_query_gid(struct ib_device *device, u32 port_num, int index, union ib_gid *gid) { struct ib_gid_table *table; unsigned long flags; - int res = -EINVAL; + int res; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; @@ -959,9 +963,15 @@ int rdma_query_gid(struct ib_device *device, u8 port_num, table = rdma_gid_table(device, port_num); read_lock_irqsave(&table->rwlock, flags); - if (index < 0 || index >= table->sz || - !is_gid_entry_valid(table->data_vec[index])) + if (index < 0 || index >= table->sz) { + res = -EINVAL; goto done; + } + + if (!is_gid_entry_valid(table->data_vec[index])) { + res = -ENOENT; + goto done; + } memcpy(gid, &table->data_vec[index]->attr.gid, sizeof(*gid)); res = 0; @@ -973,6 +983,23 @@ done: EXPORT_SYMBOL(rdma_query_gid); /** + * rdma_read_gid_hw_context - Read the HW GID context from GID attribute + * @attr: Potinter to the GID attribute + * + * rdma_read_gid_hw_context() reads the drivers GID HW context corresponding + * to the SGID attr. Callers are required to already be holding the reference + * to an existing GID entry. 
+ * + * Returns the HW GID context + * + */ +void *rdma_read_gid_hw_context(const struct ib_gid_attr *attr) +{ + return container_of(attr, struct ib_gid_table_entry, attr)->context; +} +EXPORT_SYMBOL(rdma_read_gid_hw_context); + +/** * rdma_find_gid - Returns SGID attributes if the matching GID is found. * @device: The device to query. * @gid: The GID value to search for. @@ -993,7 +1020,7 @@ const struct ib_gid_attr *rdma_find_gid(struct ib_device *device, unsigned long mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE; struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type}; - unsigned int p; + u32 p; if (ndev) mask |= GID_ATTR_FIND_MASK_NETDEV; @@ -1022,7 +1049,7 @@ const struct ib_gid_attr *rdma_find_gid(struct ib_device *device, EXPORT_SYMBOL(rdma_find_gid); int ib_get_cached_pkey(struct ib_device *device, - u8 port_num, + u32 port_num, int index, u16 *pkey) { @@ -1037,7 +1064,7 @@ int ib_get_cached_pkey(struct ib_device *device, cache = device->port_data[port_num].cache.pkey; - if (index < 0 || index >= cache->table_len) + if (!cache || index < 0 || index >= cache->table_len) ret = -EINVAL; else *pkey = cache->table[index]; @@ -1048,27 +1075,19 @@ int ib_get_cached_pkey(struct ib_device *device, } EXPORT_SYMBOL(ib_get_cached_pkey); -int ib_get_cached_subnet_prefix(struct ib_device *device, - u8 port_num, - u64 *sn_pfx) +void ib_get_cached_subnet_prefix(struct ib_device *device, u32 port_num, + u64 *sn_pfx) { unsigned long flags; - if (!rdma_is_port_valid(device, port_num)) - return -EINVAL; - read_lock_irqsave(&device->cache_lock, flags); *sn_pfx = device->port_data[port_num].cache.subnet_prefix; read_unlock_irqrestore(&device->cache_lock, flags); - - return 0; } EXPORT_SYMBOL(ib_get_cached_subnet_prefix); -int ib_find_cached_pkey(struct ib_device *device, - u8 port_num, - u16 pkey, - u16 *index) +int ib_find_cached_pkey(struct ib_device *device, u32 port_num, + u16 pkey, u16 *index) { struct ib_pkey_cache *cache; unsigned long flags; @@ -1082,6 +1101,10 @@ int ib_find_cached_pkey(struct ib_device *device, read_lock_irqsave(&device->cache_lock, flags); cache = device->port_data[port_num].cache.pkey; + if (!cache) { + ret = -EINVAL; + goto err; + } *index = -1; @@ -1091,8 +1114,9 @@ int ib_find_cached_pkey(struct ib_device *device, *index = i; ret = 0; break; - } else + } else { partial_ix = i; + } } if (ret && partial_ix >= 0) { @@ -1100,16 +1124,15 @@ int ib_find_cached_pkey(struct ib_device *device, ret = 0; } +err: read_unlock_irqrestore(&device->cache_lock, flags); return ret; } EXPORT_SYMBOL(ib_find_cached_pkey); -int ib_find_exact_cached_pkey(struct ib_device *device, - u8 port_num, - u16 pkey, - u16 *index) +int ib_find_exact_cached_pkey(struct ib_device *device, u32 port_num, + u16 pkey, u16 *index) { struct ib_pkey_cache *cache; unsigned long flags; @@ -1122,6 +1145,10 @@ int ib_find_exact_cached_pkey(struct ib_device *device, read_lock_irqsave(&device->cache_lock, flags); cache = device->port_data[port_num].cache.pkey; + if (!cache) { + ret = -EINVAL; + goto err; + } *index = -1; @@ -1132,15 +1159,14 @@ int ib_find_exact_cached_pkey(struct ib_device *device, break; } +err: read_unlock_irqrestore(&device->cache_lock, flags); return ret; } EXPORT_SYMBOL(ib_find_exact_cached_pkey); -int ib_get_cached_lmc(struct ib_device *device, - u8 port_num, - u8 *lmc) +int ib_get_cached_lmc(struct ib_device *device, u32 port_num, u8 *lmc) { unsigned long flags; int ret = 0; @@ -1156,8 +1182,7 @@ int ib_get_cached_lmc(struct ib_device *device, } 
EXPORT_SYMBOL(ib_get_cached_lmc); -int ib_get_cached_port_state(struct ib_device *device, - u8 port_num, +int ib_get_cached_port_state(struct ib_device *device, u32 port_num, enum ib_port_state *port_state) { unsigned long flags; @@ -1191,9 +1216,9 @@ EXPORT_SYMBOL(ib_get_cached_port_state); * code. */ const struct ib_gid_attr * -rdma_get_gid_attr(struct ib_device *device, u8 port_num, int index) +rdma_get_gid_attr(struct ib_device *device, u32 port_num, int index) { - const struct ib_gid_attr *attr = ERR_PTR(-EINVAL); + const struct ib_gid_attr *attr = ERR_PTR(-ENODATA); struct ib_gid_table *table; unsigned long flags; @@ -1217,6 +1242,63 @@ done: EXPORT_SYMBOL(rdma_get_gid_attr); /** + * rdma_query_gid_table - Reads GID table entries of all the ports of a device up to max_entries. + * @device: The device to query. + * @entries: Entries where GID entries are returned. + * @max_entries: Maximum number of entries that can be returned. + * Entries array must be allocated to hold max_entries number of entries. + * + * Returns number of entries on success or appropriate error code. + */ +ssize_t rdma_query_gid_table(struct ib_device *device, + struct ib_uverbs_gid_entry *entries, + size_t max_entries) +{ + const struct ib_gid_attr *gid_attr; + ssize_t num_entries = 0, ret; + struct ib_gid_table *table; + u32 port_num, i; + struct net_device *ndev; + unsigned long flags; + + rdma_for_each_port(device, port_num) { + table = rdma_gid_table(device, port_num); + read_lock_irqsave(&table->rwlock, flags); + for (i = 0; i < table->sz; i++) { + if (!is_gid_entry_valid(table->data_vec[i])) + continue; + if (num_entries >= max_entries) { + ret = -EINVAL; + goto err; + } + + gid_attr = &table->data_vec[i]->attr; + + memcpy(&entries->gid, &gid_attr->gid, + sizeof(gid_attr->gid)); + entries->gid_index = gid_attr->index; + entries->port_num = gid_attr->port_num; + entries->gid_type = gid_attr->gid_type; + ndev = rcu_dereference_protected( + gid_attr->ndev, + lockdep_is_held(&table->rwlock)); + if (ndev) + entries->netdev_ifindex = ndev->ifindex; + + num_entries++; + entries++; + } + read_unlock_irqrestore(&table->rwlock, flags); + } + + return num_entries; +err: + read_unlock_irqrestore(&table->rwlock, flags); + return ret; +} +EXPORT_SYMBOL(rdma_query_gid_table); + +/** * rdma_put_gid_attr - Release reference to the GID attribute * @attr: Pointer to the GID attribute whose reference * needs to be released. 
@@ -1272,8 +1354,8 @@ struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr) struct ib_gid_table_entry *entry = container_of(attr, struct ib_gid_table_entry, attr); struct ib_device *device = entry->attr.device; - struct net_device *ndev = ERR_PTR(-ENODEV); - u8 port_num = entry->attr.port_num; + struct net_device *ndev = ERR_PTR(-EINVAL); + u32 port_num = entry->attr.port_num; struct ib_gid_table *table; unsigned long flags; bool valid; @@ -1284,8 +1366,7 @@ struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr) valid = is_gid_entry_valid(table->data_vec[attr->index]); if (valid) { ndev = rcu_dereference(attr->ndev); - if (!ndev || - (ndev && ((READ_ONCE(ndev->flags) & IFF_UP) == 0))) + if (!ndev) ndev = ERR_PTR(-ENODEV); } read_unlock_irqrestore(&table->rwlock, flags); @@ -1293,9 +1374,10 @@ struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr) } EXPORT_SYMBOL(rdma_read_gid_attr_ndev_rcu); -static int get_lower_dev_vlan(struct net_device *lower_dev, void *data) +static int get_lower_dev_vlan(struct net_device *lower_dev, + struct netdev_nested_priv *priv) { - u16 *vlan_id = data; + u16 *vlan_id = (u16 *)priv->data; if (is_vlan_dev(lower_dev)) *vlan_id = vlan_dev_vlan_id(lower_dev); @@ -1321,6 +1403,9 @@ static int get_lower_dev_vlan(struct net_device *lower_dev, void *data) int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr, u16 *vlan_id, u8 *smac) { + struct netdev_nested_priv priv = { + .data = (void *)vlan_id, + }; struct net_device *ndev; rcu_read_lock(); @@ -1341,7 +1426,7 @@ int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr, * the lower vlan device for this gid entry. */ netdev_walk_all_lower_dev_rcu(attr->ndev, - get_lower_dev_vlan, vlan_id); + get_lower_dev_vlan, &priv); } } rcu_read_unlock(); @@ -1350,7 +1435,7 @@ int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr, EXPORT_SYMBOL(rdma_read_gid_l2_fields); static int config_non_roce_gid_cache(struct ib_device *device, - u8 port, int gid_tbl_len) + u32 port, struct ib_port_attr *tprops) { struct ib_gid_attr gid_attr = {}; struct ib_gid_table *table; @@ -1362,7 +1447,7 @@ static int config_non_roce_gid_cache(struct ib_device *device, table = rdma_gid_table(device, port); mutex_lock(&table->lock); - for (i = 0; i < gid_tbl_len; ++i) { + for (i = 0; i < tprops->gid_tbl_len; ++i) { if (!device->ops.query_gid) continue; ret = device->ops.query_gid(device, port, i, &gid_attr.gid); @@ -1373,6 +1458,8 @@ static int config_non_roce_gid_cache(struct ib_device *device, goto err; } gid_attr.index = i; + tprops->subnet_prefix = + be64_to_cpu(gid_attr.gid.global.subnet_prefix); add_modify_gid(table, &gid_attr); } err: @@ -1381,10 +1468,12 @@ err: } static int -ib_cache_update(struct ib_device *device, u8 port, bool enforce_security) +ib_cache_update(struct ib_device *device, u32 port, bool update_gids, + bool update_pkeys, bool enforce_security) { struct ib_port_attr *tprops = NULL; - struct ib_pkey_cache *pkey_cache = NULL, *old_pkey_cache; + struct ib_pkey_cache *pkey_cache = NULL; + struct ib_pkey_cache *old_pkey_cache = NULL; int i; int ret; @@ -1401,38 +1490,44 @@ ib_cache_update(struct ib_device *device, u8 port, bool enforce_security) goto err; } - if (!rdma_protocol_roce(device, port)) { + if (!rdma_protocol_roce(device, port) && update_gids) { ret = config_non_roce_gid_cache(device, port, - tprops->gid_tbl_len); + tprops); if (ret) goto err; } - pkey_cache = kmalloc(struct_size(pkey_cache, table, - tprops->pkey_tbl_len), - GFP_KERNEL); - if 
(!pkey_cache) { - ret = -ENOMEM; - goto err; - } - - pkey_cache->table_len = tprops->pkey_tbl_len; + update_pkeys &= !!tprops->pkey_tbl_len; - for (i = 0; i < pkey_cache->table_len; ++i) { - ret = ib_query_pkey(device, port, i, pkey_cache->table + i); - if (ret) { - dev_warn(&device->dev, - "ib_query_pkey failed (%d) for index %d\n", - ret, i); + if (update_pkeys) { + pkey_cache = kmalloc(struct_size(pkey_cache, table, + tprops->pkey_tbl_len), + GFP_KERNEL); + if (!pkey_cache) { + ret = -ENOMEM; goto err; } + + pkey_cache->table_len = tprops->pkey_tbl_len; + + for (i = 0; i < pkey_cache->table_len; ++i) { + ret = ib_query_pkey(device, port, i, + pkey_cache->table + i); + if (ret) { + dev_warn(&device->dev, + "ib_query_pkey failed (%d) for index %d\n", + ret, i); + goto err; + } + } } write_lock_irq(&device->cache_lock); - old_pkey_cache = device->port_data[port].cache.pkey; - - device->port_data[port].cache.pkey = pkey_cache; + if (update_pkeys) { + old_pkey_cache = device->port_data[port].cache.pkey; + device->port_data[port].cache.pkey = pkey_cache; + } device->port_data[port].cache.lmc = tprops->lmc; device->port_data[port].cache.port_state = tprops->state; @@ -1464,6 +1559,8 @@ static void ib_cache_event_task(struct work_struct *_work) * the cache. */ ret = ib_cache_update(work->event.device, work->event.element.port_num, + work->event.event == IB_EVENT_GID_CHANGE, + work->event.event == IB_EVENT_PKEY_CHANGE, work->enforce_security); /* GID event is notified already for individual GID entries by @@ -1527,24 +1624,25 @@ EXPORT_SYMBOL(ib_dispatch_event); int ib_cache_setup_one(struct ib_device *device) { - unsigned int p; + u32 p; int err; - rwlock_init(&device->cache_lock); - err = gid_table_setup_one(device); if (err) return err; - rdma_for_each_port (device, p) - ib_cache_update(device, p, true); + rdma_for_each_port (device, p) { + err = ib_cache_update(device, p, true, true, true); + if (err) + return err; + } return 0; } void ib_cache_release_one(struct ib_device *device) { - unsigned int p; + u32 p; /* * The release function frees all the cache elements. 
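The cache.c changes above add a new exported helper, rdma_query_gid_table(), which walks every port's GID table and copies the valid entries into a caller-supplied array of struct ib_uverbs_gid_entry. Below is a minimal sketch of an in-kernel caller, assuming a made-up dump_gid_table() wrapper and a caller-chosen max_entries; only rdma_query_gid_table() itself and the entry fields it fills come from this diff.

#include <linux/slab.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>

/*
 * Hypothetical helper (not part of this series): print every valid GID of a
 * device using the rdma_query_gid_table() API added above.  A real caller
 * would size the array from the per-port gid_tbl_len reported by
 * ib_query_port() instead of taking max_entries as an argument.
 */
static void dump_gid_table(struct ib_device *device, size_t max_entries)
{
	struct ib_uverbs_gid_entry *entries;
	ssize_t num, i;

	entries = kcalloc(max_entries, sizeof(*entries), GFP_KERNEL);
	if (!entries)
		return;

	/* Returns the number of filled entries, or a negative errno. */
	num = rdma_query_gid_table(device, entries, max_entries);
	if (num < 0) {
		kfree(entries);
		return;
	}

	for (i = 0; i < num; i++)
		dev_info(&device->dev,
			 "port %u index %u type %u ifindex %u gid %pI6\n",
			 entries[i].port_num, entries[i].gid_index,
			 entries[i].gid_type, entries[i].netdev_ifindex,
			 &entries[i].gid);

	kfree(entries);
}

Note that, per the hunk above, the helper returns -EINVAL when there are more valid entries than max_entries, so a careful caller sizes the array generously or retries with a larger buffer.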
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 15e99a888427..1f9938a2c475 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -25,8 +25,10 @@ #include <rdma/ib_cache.h> #include <rdma/ib_cm.h> +#include <rdma/ib_sysfs.h> #include "cm_msgs.h" #include "core_priv.h" +#include "cm_trace.h" MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("InfiniBand CM"); @@ -66,6 +68,8 @@ static const char * const ibcm_rej_reason_strs[] = { [IB_CM_REJ_INVALID_CLASS_VERSION] = "invalid class version", [IB_CM_REJ_INVALID_FLOW_LABEL] = "invalid flow label", [IB_CM_REJ_INVALID_ALT_FLOW_LABEL] = "invalid alt flow label", + [IB_CM_REJ_VENDOR_OPTION_NOT_SUPPORTED] = + "vendor option is not supported", }; const char *__attribute_const__ ibcm_reject_msg(int reason) @@ -80,8 +84,22 @@ const char *__attribute_const__ ibcm_reject_msg(int reason) } EXPORT_SYMBOL(ibcm_reject_msg); -static void cm_add_one(struct ib_device *device); +struct cm_id_private; +struct cm_work; +static int cm_add_one(struct ib_device *device); static void cm_remove_one(struct ib_device *device, void *client_data); +static void cm_process_work(struct cm_id_private *cm_id_priv, + struct cm_work *work); +static int cm_send_sidr_rep_locked(struct cm_id_private *cm_id_priv, + struct ib_cm_sidr_rep_param *param); +static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv, + const void *private_data, u8 private_data_len); +static int cm_send_drep_locked(struct cm_id_private *cm_id_priv, + void *private_data, u8 private_data_len); +static int cm_send_rej_locked(struct cm_id_private *cm_id_priv, + enum ib_cm_rej_reason reason, void *ari, + u8 ari_length, const void *private_data, + u8 private_data_len); static struct ib_client cm_client = { .name = "cm", @@ -104,8 +122,6 @@ static struct ib_cm { __be32 random_id_operand; struct list_head timewait_list; struct workqueue_struct *wq; - /* Sync on cm change port state */ - spinlock_t state_lock; } cm; /* Counter indexes ordered by attribute ID */ @@ -133,77 +149,33 @@ enum { CM_COUNTER_GROUPS }; -static char const counter_group_names[CM_COUNTER_GROUPS] - [sizeof("cm_rx_duplicates")] = { - "cm_tx_msgs", "cm_tx_retries", - "cm_rx_msgs", "cm_rx_duplicates" -}; - -struct cm_counter_group { - struct kobject obj; - atomic_long_t counter[CM_ATTR_COUNT]; -}; - struct cm_counter_attribute { - struct attribute attr; - int index; -}; - -#define CM_COUNTER_ATTR(_name, _index) \ -struct cm_counter_attribute cm_##_name##_counter_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .index = _index \ -} - -static CM_COUNTER_ATTR(req, CM_REQ_COUNTER); -static CM_COUNTER_ATTR(mra, CM_MRA_COUNTER); -static CM_COUNTER_ATTR(rej, CM_REJ_COUNTER); -static CM_COUNTER_ATTR(rep, CM_REP_COUNTER); -static CM_COUNTER_ATTR(rtu, CM_RTU_COUNTER); -static CM_COUNTER_ATTR(dreq, CM_DREQ_COUNTER); -static CM_COUNTER_ATTR(drep, CM_DREP_COUNTER); -static CM_COUNTER_ATTR(sidr_req, CM_SIDR_REQ_COUNTER); -static CM_COUNTER_ATTR(sidr_rep, CM_SIDR_REP_COUNTER); -static CM_COUNTER_ATTR(lap, CM_LAP_COUNTER); -static CM_COUNTER_ATTR(apr, CM_APR_COUNTER); - -static struct attribute *cm_counter_default_attrs[] = { - &cm_req_counter_attr.attr, - &cm_mra_counter_attr.attr, - &cm_rej_counter_attr.attr, - &cm_rep_counter_attr.attr, - &cm_rtu_counter_attr.attr, - &cm_dreq_counter_attr.attr, - &cm_drep_counter_attr.attr, - &cm_sidr_req_counter_attr.attr, - &cm_sidr_rep_counter_attr.attr, - &cm_lap_counter_attr.attr, - &cm_apr_counter_attr.attr, - NULL + struct ib_port_attribute 
attr; + unsigned short group; + unsigned short index; }; struct cm_port { struct cm_device *cm_dev; struct ib_mad_agent *mad_agent; - struct kobject port_obj; - u8 port_num; - struct list_head cm_priv_prim_list; - struct list_head cm_priv_altr_list; - struct cm_counter_group counter_group[CM_COUNTER_GROUPS]; + u32 port_num; + atomic_long_t counters[CM_COUNTER_GROUPS][CM_ATTR_COUNT]; }; struct cm_device { + struct kref kref; struct list_head list; + spinlock_t mad_agent_lock; struct ib_device *ib_device; u8 ack_delay; int going_down; - struct cm_port *port[0]; + struct cm_port *port[]; }; struct cm_av { struct cm_port *port; - union ib_gid dgid; struct rdma_ah_attr ah_attr; + u16 dlid_datapath; u16 pkey_index; u8 timeout; }; @@ -216,7 +188,7 @@ struct cm_work { __be32 local_id; /* Established / timewait */ __be32 remote_id; struct ib_cm_event cm_event; - struct sa_path_rec path[0]; + struct sa_path_rec path[]; }; struct cm_timewait_info { @@ -235,11 +207,13 @@ struct cm_id_private { struct rb_node service_node; struct rb_node sidr_id_node; + u32 sidr_slid; spinlock_t lock; /* Do not acquire inside cm.lock */ struct completion comp; refcount_t refcount; /* Number of clients sharing this ib_cm_id. Only valid for listeners. - * Protected by the cm.lock spinlock. */ + * Protected by the cm.lock spinlock. + */ int listen_sharecount; struct rcu_head rcu; @@ -261,7 +235,6 @@ struct cm_id_private { __be16 pkey; u8 private_data_len; u8 max_cm_retries; - u8 peer_to_peer; u8 responder_resources; u8 initiator_depth; u8 retry_count; @@ -269,16 +242,28 @@ struct cm_id_private { u8 service_timeout; u8 target_ack_delay; - struct list_head prim_list; - struct list_head altr_list; - /* Indicates that the send port mad is registered and av is set */ - int prim_send_port_not_ready; - int altr_send_port_not_ready; - struct list_head work_list; atomic_t work_count; + + struct rdma_ucm_ece ece; }; +static void cm_dev_release(struct kref *kref) +{ + struct cm_device *cm_dev = container_of(kref, struct cm_device, kref); + u32 i; + + rdma_for_each_port(cm_dev->ib_device, i) + kfree(cm_dev->port[i - 1]); + + kfree(cm_dev); +} + +static void cm_device_put(struct cm_device *cm_dev) +{ + kref_put(&cm_dev->kref, cm_dev_release); +} + static void cm_work_handler(struct work_struct *work); static inline void cm_deref_id(struct cm_id_private *cm_id_priv) @@ -287,52 +272,37 @@ static inline void cm_deref_id(struct cm_id_private *cm_id_priv) complete(&cm_id_priv->comp); } -static int cm_alloc_msg(struct cm_id_private *cm_id_priv, - struct ib_mad_send_buf **msg) +static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv) { struct ib_mad_agent *mad_agent; struct ib_mad_send_buf *m; struct ib_ah *ah; - struct cm_av *av; - unsigned long flags, flags2; - int ret = 0; - /* don't let the port to be released till the agent is down */ - spin_lock_irqsave(&cm.state_lock, flags2); - spin_lock_irqsave(&cm.lock, flags); - if (!cm_id_priv->prim_send_port_not_ready) - av = &cm_id_priv->av; - else if (!cm_id_priv->altr_send_port_not_ready && - (cm_id_priv->alt_av.port)) - av = &cm_id_priv->alt_av; - else { - pr_info("%s: not valid CM id\n", __func__); - ret = -ENODEV; - spin_unlock_irqrestore(&cm.lock, flags); - goto out; - } - spin_unlock_irqrestore(&cm.lock, flags); - /* Make sure the port haven't released the mad yet */ + lockdep_assert_held(&cm_id_priv->lock); + + if (!cm_id_priv->av.port) + return ERR_PTR(-EINVAL); + + spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); mad_agent = cm_id_priv->av.port->mad_agent; 
if (!mad_agent) { - pr_info("%s: not a valid MAD agent\n", __func__); - ret = -ENODEV; + m = ERR_PTR(-EINVAL); goto out; } - ah = rdma_create_ah(mad_agent->qp->pd, &av->ah_attr, 0); + + ah = rdma_create_ah(mad_agent->qp->pd, &cm_id_priv->av.ah_attr, 0); if (IS_ERR(ah)) { - ret = PTR_ERR(ah); + m = ERR_CAST(ah); goto out; } m = ib_create_send_mad(mad_agent, cm_id_priv->id.remote_cm_qpn, - av->pkey_index, + cm_id_priv->av.pkey_index, 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, GFP_ATOMIC, IB_MGMT_BASE_VERSION); if (IS_ERR(m)) { rdma_destroy_ah(ah, 0); - ret = PTR_ERR(m); goto out; } @@ -342,11 +312,49 @@ static int cm_alloc_msg(struct cm_id_private *cm_id_priv, refcount_inc(&cm_id_priv->refcount); m->context[0] = cm_id_priv; - *msg = m; out: - spin_unlock_irqrestore(&cm.state_lock, flags2); - return ret; + spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); + return m; +} + +static void cm_free_msg(struct ib_mad_send_buf *msg) +{ + struct cm_id_private *cm_id_priv = msg->context[0]; + + if (msg->ah) + rdma_destroy_ah(msg->ah, 0); + cm_deref_id(cm_id_priv); + ib_free_send_mad(msg); +} + +static struct ib_mad_send_buf * +cm_alloc_priv_msg(struct cm_id_private *cm_id_priv) +{ + struct ib_mad_send_buf *msg; + + lockdep_assert_held(&cm_id_priv->lock); + + msg = cm_alloc_msg(cm_id_priv); + if (IS_ERR(msg)) + return msg; + cm_id_priv->msg = msg; + return msg; +} + +static void cm_free_priv_msg(struct ib_mad_send_buf *msg) +{ + struct cm_id_private *cm_id_priv = msg->context[0]; + + lockdep_assert_held(&cm_id_priv->lock); + + if (!WARN_ON(cm_id_priv->msg != msg)) + cm_id_priv->msg = NULL; + + if (msg->ah) + rdma_destroy_ah(msg->ah, 0); + cm_deref_id(cm_id_priv); + ib_free_send_mad(msg); } static struct ib_mad_send_buf *cm_alloc_response_msg_no_ah(struct cm_port *port, @@ -373,15 +381,6 @@ static int cm_create_response_msg_ah(struct cm_port *port, return 0; } -static void cm_free_msg(struct ib_mad_send_buf *msg) -{ - if (msg->ah) - rdma_destroy_ah(msg->ah, 0); - if (msg->context[0]) - cm_deref_id(msg->context[0]); - ib_free_send_mad(msg); -} - static int cm_alloc_response_msg(struct cm_port *port, struct ib_mad_recv_wc *mad_recv_wc, struct ib_mad_send_buf **msg) @@ -395,7 +394,7 @@ static int cm_alloc_response_msg(struct cm_port *port, ret = cm_create_response_msg_ah(port, mad_recv_wc, m); if (ret) { - cm_free_msg(m); + ib_free_send_mad(m); return ret; } @@ -403,8 +402,14 @@ static int cm_alloc_response_msg(struct cm_port *port, return 0; } -static void * cm_copy_private_data(const void *private_data, - u8 private_data_len) +static void cm_free_response_msg(struct ib_mad_send_buf *msg) +{ + if (msg->ah) + rdma_destroy_ah(msg->ah, 0); + ib_free_send_mad(msg); +} + +static void *cm_copy_private_data(const void *private_data, u8 private_data_len) { void *data; @@ -428,62 +433,38 @@ static void cm_set_private_data(struct cm_id_private *cm_id_priv, cm_id_priv->private_data_len = private_data_len; } -static int cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc, - struct ib_grh *grh, struct cm_av *av) +static void cm_set_av_port(struct cm_av *av, struct cm_port *port) { - struct rdma_ah_attr new_ah_attr; - int ret; + struct cm_port *old_port = av->port; - av->port = port; - av->pkey_index = wc->pkey_index; + if (old_port == port) + return; - /* - * av->ah_attr might be initialized based on past wc during incoming - * connect request or while sending out connect request. So initialize - * a new ah_attr on stack. If initialization fails, old ah_attr is - * used for sending any responses. 
If initialization is successful, - * than new ah_attr is used by overwriting old one. - */ - ret = ib_init_ah_attr_from_wc(port->cm_dev->ib_device, - port->port_num, wc, - grh, &new_ah_attr); - if (ret) - return ret; + av->port = port; + if (old_port) + cm_device_put(old_port->cm_dev); + if (port) + kref_get(&port->cm_dev->kref); +} - rdma_move_ah_attr(&av->ah_attr, &new_ah_attr); - return 0; +static void cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc, + struct rdma_ah_attr *ah_attr, struct cm_av *av) +{ + cm_set_av_port(av, port); + av->pkey_index = wc->pkey_index; + rdma_move_ah_attr(&av->ah_attr, ah_attr); } static int cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc, struct ib_grh *grh, struct cm_av *av) { - av->port = port; + cm_set_av_port(av, port); av->pkey_index = wc->pkey_index; return ib_init_ah_attr_from_wc(port->cm_dev->ib_device, port->port_num, wc, grh, &av->ah_attr); } -static int add_cm_id_to_port_list(struct cm_id_private *cm_id_priv, - struct cm_av *av, - struct cm_port *port) -{ - unsigned long flags; - int ret = 0; - - spin_lock_irqsave(&cm.lock, flags); - - if (&cm_id_priv->av == av) - list_add_tail(&cm_id_priv->prim_list, &port->cm_priv_prim_list); - else if (&cm_id_priv->alt_av == av) - list_add_tail(&cm_id_priv->altr_list, &port->cm_priv_altr_list); - else - ret = -EINVAL; - - spin_unlock_irqrestore(&cm.lock, flags); - return ret; -} - static struct cm_port * get_cm_port_from_path(struct sa_path_rec *path, const struct ib_gid_attr *attr) { @@ -527,8 +508,7 @@ get_cm_port_from_path(struct sa_path_rec *path, const struct ib_gid_attr *attr) static int cm_init_av_by_path(struct sa_path_rec *path, const struct ib_gid_attr *sgid_attr, - struct cm_av *av, - struct cm_id_private *cm_id_priv) + struct cm_av *av) { struct rdma_ah_attr new_ah_attr; struct cm_device *cm_dev; @@ -545,7 +525,7 @@ static int cm_init_av_by_path(struct sa_path_rec *path, if (ret) return ret; - av->port = port; + cm_set_av_port(av, port); /* * av->ah_attr might be initialized based on wc or during @@ -562,36 +542,29 @@ static int cm_init_av_by_path(struct sa_path_rec *path, return ret; av->timeout = path->packet_life_time + 1; - - ret = add_cm_id_to_port_list(cm_id_priv, av, port); - if (ret) { - rdma_destroy_ah_attr(&new_ah_attr); - return ret; - } rdma_move_ah_attr(&av->ah_attr, &new_ah_attr); return 0; } -static int cm_alloc_id(struct cm_id_private *cm_id_priv) +/* Move av created by cm_init_av_by_path(), so av.dgid is not moved */ +static void cm_move_av_from_path(struct cm_av *dest, struct cm_av *src) { - int err; - u32 id; - - err = xa_alloc_cyclic_irq(&cm.local_id_table, &id, cm_id_priv, - xa_limit_32b, &cm.local_id_next, GFP_KERNEL); - - cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand; - return err; + cm_set_av_port(dest, src->port); + cm_set_av_port(src, NULL); + dest->pkey_index = src->pkey_index; + rdma_move_ah_attr(&dest->ah_attr, &src->ah_attr); + dest->timeout = src->timeout; } -static u32 cm_local_id(__be32 local_id) +static void cm_destroy_av(struct cm_av *av) { - return (__force u32) (local_id ^ cm.random_id_operand); + rdma_destroy_ah_attr(&av->ah_attr); + cm_set_av_port(av, NULL); } -static void cm_free_id(__be32 local_id) +static u32 cm_local_id(__be32 local_id) { - xa_erase_irq(&cm.local_id_table, cm_local_id(local_id)); + return (__force u32) (local_id ^ cm.random_id_operand); } static struct cm_id_private *cm_acquire_id(__be32 local_id, __be32 remote_id) @@ -633,22 +606,25 @@ static int be64_gt(__be64 a, __be64 b) return (__force 
u64) a > (__force u64) b; } -static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv) +/* + * Inserts a new cm_id_priv into the listen_service_table. Returns cm_id_priv + * if the new ID was inserted, NULL if it could not be inserted due to a + * collision, or the existing cm_id_priv ready for shared usage. + */ +static struct cm_id_private *cm_insert_listen(struct cm_id_private *cm_id_priv, + ib_cm_handler shared_handler) { struct rb_node **link = &cm.listen_service_table.rb_node; struct rb_node *parent = NULL; struct cm_id_private *cur_cm_id_priv; __be64 service_id = cm_id_priv->id.service_id; - __be64 service_mask = cm_id_priv->id.service_mask; + unsigned long flags; + spin_lock_irqsave(&cm.lock, flags); while (*link) { parent = *link; cur_cm_id_priv = rb_entry(parent, struct cm_id_private, service_node); - if ((cur_cm_id_priv->id.service_mask & service_id) == - (service_mask & cur_cm_id_priv->id.service_id) && - (cm_id_priv->id.device == cur_cm_id_priv->id.device)) - return cur_cm_id_priv; if (cm_id_priv->id.device < cur_cm_id_priv->id.device) link = &(*link)->rb_left; @@ -658,26 +634,38 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv) link = &(*link)->rb_left; else if (be64_gt(service_id, cur_cm_id_priv->id.service_id)) link = &(*link)->rb_right; - else - link = &(*link)->rb_right; + else { + /* + * Sharing an ib_cm_id with different handlers is not + * supported + */ + if (cur_cm_id_priv->id.cm_handler != shared_handler || + cur_cm_id_priv->id.context || + WARN_ON(!cur_cm_id_priv->id.cm_handler)) { + spin_unlock_irqrestore(&cm.lock, flags); + return NULL; + } + refcount_inc(&cur_cm_id_priv->refcount); + cur_cm_id_priv->listen_sharecount++; + spin_unlock_irqrestore(&cm.lock, flags); + return cur_cm_id_priv; + } } + cm_id_priv->listen_sharecount++; rb_link_node(&cm_id_priv->service_node, parent, link); rb_insert_color(&cm_id_priv->service_node, &cm.listen_service_table); - return NULL; + spin_unlock_irqrestore(&cm.lock, flags); + return cm_id_priv; } -static struct cm_id_private * cm_find_listen(struct ib_device *device, - __be64 service_id) +static struct cm_id_private *cm_find_listen(struct ib_device *device, + __be64 service_id) { struct rb_node *node = cm.listen_service_table.rb_node; struct cm_id_private *cm_id_priv; while (node) { cm_id_priv = rb_entry(node, struct cm_id_private, service_node); - if ((cm_id_priv->id.service_mask & service_id) == - cm_id_priv->id.service_id && - (cm_id_priv->id.device == device)) - return cm_id_priv; if (device < cm_id_priv->id.device) node = node->rb_left; @@ -687,14 +675,16 @@ static struct cm_id_private * cm_find_listen(struct ib_device *device, node = node->rb_left; else if (be64_gt(service_id, cm_id_priv->id.service_id)) node = node->rb_right; - else - node = node->rb_right; + else { + refcount_inc(&cm_id_priv->refcount); + return cm_id_priv; + } } return NULL; } -static struct cm_timewait_info * cm_insert_remote_id(struct cm_timewait_info - *timewait_info) +static struct cm_timewait_info * +cm_insert_remote_id(struct cm_timewait_info *timewait_info) { struct rb_node **link = &cm.remote_id_table.rb_node; struct rb_node *parent = NULL; @@ -723,12 +713,14 @@ static struct cm_timewait_info * cm_insert_remote_id(struct cm_timewait_info return NULL; } -static struct cm_timewait_info * cm_find_remote_id(__be64 remote_ca_guid, - __be32 remote_id) +static struct cm_id_private *cm_find_remote_id(__be64 remote_ca_guid, + __be32 remote_id) { struct rb_node *node = cm.remote_id_table.rb_node; 
struct cm_timewait_info *timewait_info; + struct cm_id_private *res = NULL; + spin_lock_irq(&cm.lock); while (node) { timewait_info = rb_entry(node, struct cm_timewait_info, remote_id_node); @@ -740,14 +732,18 @@ static struct cm_timewait_info * cm_find_remote_id(__be64 remote_ca_guid, node = node->rb_left; else if (be64_gt(remote_ca_guid, timewait_info->remote_ca_guid)) node = node->rb_right; - else - return timewait_info; + else { + res = cm_acquire_id(timewait_info->work.local_id, + timewait_info->work.remote_id); + break; + } } - return NULL; + spin_unlock_irq(&cm.lock); + return res; } -static struct cm_timewait_info * cm_insert_remote_qpn(struct cm_timewait_info - *timewait_info) +static struct cm_timewait_info * +cm_insert_remote_qpn(struct cm_timewait_info *timewait_info) { struct rb_node **link = &cm.remote_qp_table.rb_node; struct rb_node *parent = NULL; @@ -776,13 +772,12 @@ static struct cm_timewait_info * cm_insert_remote_qpn(struct cm_timewait_info return NULL; } -static struct cm_id_private * cm_insert_remote_sidr(struct cm_id_private - *cm_id_priv) +static struct cm_id_private * +cm_insert_remote_sidr(struct cm_id_private *cm_id_priv) { struct rb_node **link = &cm.remote_sidr_table.rb_node; struct rb_node *parent = NULL; struct cm_id_private *cur_cm_id_priv; - union ib_gid *port_gid = &cm_id_priv->av.dgid; __be32 remote_id = cm_id_priv->id.remote_id; while (*link) { @@ -794,12 +789,9 @@ static struct cm_id_private * cm_insert_remote_sidr(struct cm_id_private else if (be32_gt(remote_id, cur_cm_id_priv->id.remote_id)) link = &(*link)->rb_right; else { - int cmp; - cmp = memcmp(port_gid, &cur_cm_id_priv->av.dgid, - sizeof *port_gid); - if (cmp < 0) + if (cur_cm_id_priv->sidr_slid < cm_id_priv->sidr_slid) link = &(*link)->rb_left; - else if (cmp > 0) + else if (cur_cm_id_priv->sidr_slid > cm_id_priv->sidr_slid) link = &(*link)->rb_right; else return cur_cm_id_priv; @@ -810,21 +802,12 @@ static struct cm_id_private * cm_insert_remote_sidr(struct cm_id_private return NULL; } -static void cm_reject_sidr_req(struct cm_id_private *cm_id_priv, - enum ib_cm_sidr_status status) -{ - struct ib_cm_sidr_rep_param param; - - memset(¶m, 0, sizeof param); - param.status = status; - ib_send_cm_sidr_rep(&cm_id_priv->id, ¶m); -} - -struct ib_cm_id *ib_create_cm_id(struct ib_device *device, - ib_cm_handler cm_handler, - void *context) +static struct cm_id_private *cm_alloc_id_priv(struct ib_device *device, + ib_cm_handler cm_handler, + void *context) { struct cm_id_private *cm_id_priv; + u32 id; int ret; cm_id_priv = kzalloc(sizeof *cm_id_priv, GFP_KERNEL); @@ -836,26 +819,54 @@ struct ib_cm_id *ib_create_cm_id(struct ib_device *device, cm_id_priv->id.cm_handler = cm_handler; cm_id_priv->id.context = context; cm_id_priv->id.remote_cm_qpn = 1; - ret = cm_alloc_id(cm_id_priv); - if (ret) - goto error; + RB_CLEAR_NODE(&cm_id_priv->service_node); + RB_CLEAR_NODE(&cm_id_priv->sidr_id_node); spin_lock_init(&cm_id_priv->lock); init_completion(&cm_id_priv->comp); INIT_LIST_HEAD(&cm_id_priv->work_list); - INIT_LIST_HEAD(&cm_id_priv->prim_list); - INIT_LIST_HEAD(&cm_id_priv->altr_list); atomic_set(&cm_id_priv->work_count, -1); refcount_set(&cm_id_priv->refcount, 1); - return &cm_id_priv->id; + + ret = xa_alloc_cyclic(&cm.local_id_table, &id, NULL, xa_limit_32b, + &cm.local_id_next, GFP_KERNEL); + if (ret < 0) + goto error; + cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand; + + return cm_id_priv; error: kfree(cm_id_priv); - return ERR_PTR(-ENOMEM); + return ERR_PTR(ret); +} + +/* + * 
Make the ID visible to the MAD handlers and other threads that use the + * xarray. + */ +static void cm_finalize_id(struct cm_id_private *cm_id_priv) +{ + xa_store(&cm.local_id_table, cm_local_id(cm_id_priv->id.local_id), + cm_id_priv, GFP_ATOMIC); +} + +struct ib_cm_id *ib_create_cm_id(struct ib_device *device, + ib_cm_handler cm_handler, + void *context) +{ + struct cm_id_private *cm_id_priv; + + cm_id_priv = cm_alloc_id_priv(device, cm_handler, context); + if (IS_ERR(cm_id_priv)) + return ERR_CAST(cm_id_priv); + + cm_finalize_id(cm_id_priv); + return &cm_id_priv->id; } EXPORT_SYMBOL(ib_create_cm_id); -static struct cm_work * cm_dequeue_work(struct cm_id_private *cm_id_priv) +static struct cm_work *cm_dequeue_work(struct cm_id_private *cm_id_priv) { struct cm_work *work; @@ -874,6 +885,36 @@ static void cm_free_work(struct cm_work *work) kfree(work); } +static void cm_queue_work_unlock(struct cm_id_private *cm_id_priv, + struct cm_work *work) + __releases(&cm_id_priv->lock) +{ + bool immediate; + + /* + * To deliver the event to the user callback we have the drop the + * spinlock, however, we need to ensure that the user callback is single + * threaded and receives events in the temporal order. If there are + * already events being processed then thread new events onto a list, + * the thread currently processing will pick them up. + */ + immediate = atomic_inc_and_test(&cm_id_priv->work_count); + if (!immediate) { + list_add_tail(&work->list, &cm_id_priv->work_list); + /* + * This routine always consumes incoming reference. Once queued + * to the work_list then a reference is held by the thread + * currently running cm_process_work() and this reference is not + * needed. + */ + cm_deref_id(cm_id_priv); + } + spin_unlock_irq(&cm_id_priv->lock); + + if (immediate) + cm_process_work(cm_id_priv, work); +} + static inline int cm_convert_to_ms(int iba_time) { /* approximate conversion to ms from 4.096us x 2^iba_time */ @@ -899,8 +940,10 @@ static u8 cm_ack_timeout(u8 ca_ack_delay, u8 packet_life_time) return min(31, ack_timeout); } -static void cm_cleanup_timewait(struct cm_timewait_info *timewait_info) +static void cm_remove_remote(struct cm_id_private *cm_id_priv) { + struct cm_timewait_info *timewait_info = cm_id_priv->timewait_info; + if (timewait_info->inserted_remote_id) { rb_erase(&timewait_info->remote_id_node, &cm.remote_id_table); timewait_info->inserted_remote_id = 0; @@ -912,7 +955,7 @@ static void cm_cleanup_timewait(struct cm_timewait_info *timewait_info) } } -static struct cm_timewait_info * cm_create_timewait_info(__be32 local_id) +static struct cm_timewait_info *cm_create_timewait_info(__be32 local_id) { struct cm_timewait_info *timewait_info; @@ -932,12 +975,14 @@ static void cm_enter_timewait(struct cm_id_private *cm_id_priv) unsigned long flags; struct cm_device *cm_dev; + lockdep_assert_held(&cm_id_priv->lock); + cm_dev = ib_get_client_data(cm_id_priv->id.device, &cm_client); if (!cm_dev) return; spin_lock_irqsave(&cm.lock, flags); - cm_cleanup_timewait(cm_id_priv->timewait_info); + cm_remove_remote(cm_id_priv); list_add_tail(&cm_id_priv->timewait_info->list, &cm.timewait_list); spin_unlock_irqrestore(&cm.lock, flags); @@ -956,6 +1001,11 @@ static void cm_enter_timewait(struct cm_id_private *cm_id_priv) msecs_to_jiffies(wait_time)); spin_unlock_irqrestore(&cm.lock, flags); + /* + * The timewait_info is converted into a work and gets freed during + * cm_free_work() in cm_timewait_handler(). 
+ */ + BUILD_BUG_ON(offsetof(struct cm_timewait_info, work) != 0); cm_id_priv->timewait_info = NULL; } @@ -963,10 +1013,12 @@ static void cm_reset_to_idle(struct cm_id_private *cm_id_priv) { unsigned long flags; + lockdep_assert_held(&cm_id_priv->lock); + cm_id_priv->id.state = IB_CM_IDLE; if (cm_id_priv->timewait_info) { spin_lock_irqsave(&cm.lock, flags); - cm_cleanup_timewait(cm_id_priv->timewait_info); + cm_remove_remote(cm_id_priv); spin_unlock_irqrestore(&cm.lock, flags); kfree(cm_id_priv->timewait_info); cm_id_priv->timewait_info = NULL; @@ -979,104 +1031,116 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err) struct cm_work *work; cm_id_priv = container_of(cm_id, struct cm_id_private, id); -retest: spin_lock_irq(&cm_id_priv->lock); +retest: switch (cm_id->state) { case IB_CM_LISTEN: - spin_unlock_irq(&cm_id_priv->lock); - - spin_lock_irq(&cm.lock); + spin_lock(&cm.lock); if (--cm_id_priv->listen_sharecount > 0) { /* The id is still shared. */ + WARN_ON(refcount_read(&cm_id_priv->refcount) == 1); + spin_unlock(&cm.lock); + spin_unlock_irq(&cm_id_priv->lock); cm_deref_id(cm_id_priv); - spin_unlock_irq(&cm.lock); return; } + cm_id->state = IB_CM_IDLE; rb_erase(&cm_id_priv->service_node, &cm.listen_service_table); - spin_unlock_irq(&cm.lock); + RB_CLEAR_NODE(&cm_id_priv->service_node); + spin_unlock(&cm.lock); break; case IB_CM_SIDR_REQ_SENT: cm_id->state = IB_CM_IDLE; - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - spin_unlock_irq(&cm_id_priv->lock); + ib_cancel_mad(cm_id_priv->msg); break; case IB_CM_SIDR_REQ_RCVD: - spin_unlock_irq(&cm_id_priv->lock); - cm_reject_sidr_req(cm_id_priv, IB_SIDR_REJECT); - spin_lock_irq(&cm.lock); - if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) - rb_erase(&cm_id_priv->sidr_id_node, - &cm.remote_sidr_table); - spin_unlock_irq(&cm.lock); + cm_send_sidr_rep_locked(cm_id_priv, + &(struct ib_cm_sidr_rep_param){ + .status = IB_SIDR_REJECT }); + /* cm_send_sidr_rep_locked will not move to IDLE if it fails */ + cm_id->state = IB_CM_IDLE; break; case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - spin_unlock_irq(&cm_id_priv->lock); - ib_send_cm_rej(cm_id, IB_CM_REJ_TIMEOUT, - &cm_id_priv->id.device->node_guid, - sizeof cm_id_priv->id.device->node_guid, - NULL, 0); + ib_cancel_mad(cm_id_priv->msg); + cm_send_rej_locked(cm_id_priv, IB_CM_REJ_TIMEOUT, + &cm_id_priv->id.device->node_guid, + sizeof(cm_id_priv->id.device->node_guid), + NULL, 0); break; case IB_CM_REQ_RCVD: if (err == -ENOMEM) { /* Do not reject to allow future retries. 
*/ cm_reset_to_idle(cm_id_priv); - spin_unlock_irq(&cm_id_priv->lock); } else { - spin_unlock_irq(&cm_id_priv->lock); - ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, - NULL, 0, NULL, 0); + cm_send_rej_locked(cm_id_priv, + IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, + NULL, 0); } break; case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - /* Fall through */ + ib_cancel_mad(cm_id_priv->msg); + cm_send_rej_locked(cm_id_priv, IB_CM_REJ_CONSUMER_DEFINED, NULL, + 0, NULL, 0); + goto retest; case IB_CM_MRA_REQ_SENT: case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: - spin_unlock_irq(&cm_id_priv->lock); - ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, - NULL, 0, NULL, 0); + cm_send_rej_locked(cm_id_priv, IB_CM_REJ_CONSUMER_DEFINED, NULL, + 0, NULL, 0); break; case IB_CM_ESTABLISHED: - spin_unlock_irq(&cm_id_priv->lock); - if (cm_id_priv->qp_type == IB_QPT_XRC_TGT) + if (cm_id_priv->qp_type == IB_QPT_XRC_TGT) { + cm_id->state = IB_CM_IDLE; break; - ib_send_cm_dreq(cm_id, NULL, 0); + } + cm_send_dreq_locked(cm_id_priv, NULL, 0); goto retest; case IB_CM_DREQ_SENT: - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + ib_cancel_mad(cm_id_priv->msg); cm_enter_timewait(cm_id_priv); - spin_unlock_irq(&cm_id_priv->lock); - break; + goto retest; case IB_CM_DREQ_RCVD: - spin_unlock_irq(&cm_id_priv->lock); - ib_send_cm_drep(cm_id, NULL, 0); + cm_send_drep_locked(cm_id_priv, NULL, 0); + WARN_ON(cm_id->state != IB_CM_TIMEWAIT); + goto retest; + case IB_CM_TIMEWAIT: + /* + * The cm_acquire_id in cm_timewait_handler will stop working + * once we do xa_erase below, so just move to idle here for + * consistency. + */ + cm_id->state = IB_CM_IDLE; break; - default: - spin_unlock_irq(&cm_id_priv->lock); + case IB_CM_IDLE: break; } + WARN_ON(cm_id->state != IB_CM_IDLE); - spin_lock_irq(&cm.lock); - if (!list_empty(&cm_id_priv->altr_list) && - (!cm_id_priv->altr_send_port_not_ready)) - list_del(&cm_id_priv->altr_list); - if (!list_empty(&cm_id_priv->prim_list) && - (!cm_id_priv->prim_send_port_not_ready)) - list_del(&cm_id_priv->prim_list); - spin_unlock_irq(&cm.lock); + spin_lock(&cm.lock); + /* Required for cleanup paths related cm_req_handler() */ + if (cm_id_priv->timewait_info) { + cm_remove_remote(cm_id_priv); + kfree(cm_id_priv->timewait_info); + cm_id_priv->timewait_info = NULL; + } - cm_free_id(cm_id->local_id); + WARN_ON(cm_id_priv->listen_sharecount); + WARN_ON(!RB_EMPTY_NODE(&cm_id_priv->service_node)); + if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) + rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); + spin_unlock(&cm.lock); + spin_unlock_irq(&cm_id_priv->lock); + + xa_erase(&cm.local_id_table, cm_local_id(cm_id->local_id)); cm_deref_id(cm_id_priv); wait_for_completion(&cm_id_priv->comp); while ((work = cm_dequeue_work(cm_id_priv)) != NULL) cm_free_work(work); - rdma_destroy_ah_attr(&cm_id_priv->av.ah_attr); - rdma_destroy_ah_attr(&cm_id_priv->alt_av.ah_attr); + cm_destroy_av(&cm_id_priv->av); + cm_destroy_av(&cm_id_priv->alt_av); kfree(cm_id_priv->private_data); kfree_rcu(cm_id_priv, rcu); } @@ -1087,70 +1151,63 @@ void ib_destroy_cm_id(struct ib_cm_id *cm_id) } EXPORT_SYMBOL(ib_destroy_cm_id); +static int cm_init_listen(struct cm_id_private *cm_id_priv, __be64 service_id) +{ + if ((service_id & IB_SERVICE_ID_AGN_MASK) == IB_CM_ASSIGN_SERVICE_ID && + (service_id != IB_CM_ASSIGN_SERVICE_ID)) + return -EINVAL; + + if (service_id == IB_CM_ASSIGN_SERVICE_ID) + cm_id_priv->id.service_id = cpu_to_be64(cm.listen_service_id++); + 
else + cm_id_priv->id.service_id = service_id; + + return 0; +} + /** - * __ib_cm_listen - Initiates listening on the specified service ID for + * ib_cm_listen - Initiates listening on the specified service ID for * connection and service ID resolution requests. * @cm_id: Connection identifier associated with the listen request. * @service_id: Service identifier matched against incoming connection * and service ID resolution requests. The service ID should be specified * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will * assign a service ID to the caller. - * @service_mask: Mask applied to service ID used to listen across a - * range of service IDs. If set to 0, the service ID is matched - * exactly. This parameter is ignored if %service_id is set to - * IB_CM_ASSIGN_SERVICE_ID. */ -static int __ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, - __be64 service_mask) +int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id) { - struct cm_id_private *cm_id_priv, *cur_cm_id_priv; - int ret = 0; - - service_mask = service_mask ? service_mask : ~cpu_to_be64(0); - service_id &= service_mask; - if ((service_id & IB_SERVICE_ID_AGN_MASK) == IB_CM_ASSIGN_SERVICE_ID && - (service_id != IB_CM_ASSIGN_SERVICE_ID)) - return -EINVAL; - - cm_id_priv = container_of(cm_id, struct cm_id_private, id); - if (cm_id->state != IB_CM_IDLE) - return -EINVAL; - - cm_id->state = IB_CM_LISTEN; - ++cm_id_priv->listen_sharecount; + struct cm_id_private *cm_id_priv = + container_of(cm_id, struct cm_id_private, id); + unsigned long flags; + int ret; - if (service_id == IB_CM_ASSIGN_SERVICE_ID) { - cm_id->service_id = cpu_to_be64(cm.listen_service_id++); - cm_id->service_mask = ~cpu_to_be64(0); - } else { - cm_id->service_id = service_id; - cm_id->service_mask = service_mask; + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id_priv->id.state != IB_CM_IDLE) { + ret = -EINVAL; + goto out; } - cur_cm_id_priv = cm_insert_listen(cm_id_priv); - if (cur_cm_id_priv) { - cm_id->state = IB_CM_IDLE; - --cm_id_priv->listen_sharecount; + ret = cm_init_listen(cm_id_priv, service_id); + if (ret) + goto out; + + if (!cm_insert_listen(cm_id_priv, NULL)) { ret = -EBUSY; + goto out; } - return ret; -} - -int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask) -{ - unsigned long flags; - int ret; - spin_lock_irqsave(&cm.lock, flags); - ret = __ib_cm_listen(cm_id, service_id, service_mask); - spin_unlock_irqrestore(&cm.lock, flags); + cm_id_priv->id.state = IB_CM_LISTEN; + ret = 0; +out: + spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_cm_listen); /** - * Create a new listening ib_cm_id and listen on the given service ID. + * ib_cm_insert_listen - Create a new listening ib_cm_id and listen on + * the given service ID. * * If there's an existing ID listening on that same device and service ID, * return it. 
@@ -1169,60 +1226,57 @@ struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device, ib_cm_handler cm_handler, __be64 service_id) { + struct cm_id_private *listen_id_priv; struct cm_id_private *cm_id_priv; - struct ib_cm_id *cm_id; - unsigned long flags; int err = 0; /* Create an ID in advance, since the creation may sleep */ - cm_id = ib_create_cm_id(device, cm_handler, NULL); - if (IS_ERR(cm_id)) - return cm_id; + cm_id_priv = cm_alloc_id_priv(device, cm_handler, NULL); + if (IS_ERR(cm_id_priv)) + return ERR_CAST(cm_id_priv); - spin_lock_irqsave(&cm.lock, flags); + err = cm_init_listen(cm_id_priv, service_id); + if (err) { + ib_destroy_cm_id(&cm_id_priv->id); + return ERR_PTR(err); + } - if (service_id == IB_CM_ASSIGN_SERVICE_ID) - goto new_id; - - /* Find an existing ID */ - cm_id_priv = cm_find_listen(device, service_id); - if (cm_id_priv) { - if (cm_id->cm_handler != cm_handler || cm_id->context) { - /* Sharing an ib_cm_id with different handlers is not - * supported */ - spin_unlock_irqrestore(&cm.lock, flags); - ib_destroy_cm_id(cm_id); + spin_lock_irq(&cm_id_priv->lock); + listen_id_priv = cm_insert_listen(cm_id_priv, cm_handler); + if (listen_id_priv != cm_id_priv) { + spin_unlock_irq(&cm_id_priv->lock); + ib_destroy_cm_id(&cm_id_priv->id); + if (!listen_id_priv) return ERR_PTR(-EINVAL); - } - refcount_inc(&cm_id_priv->refcount); - ++cm_id_priv->listen_sharecount; - spin_unlock_irqrestore(&cm.lock, flags); - - ib_destroy_cm_id(cm_id); - cm_id = &cm_id_priv->id; - return cm_id; + return &listen_id_priv->id; } + cm_id_priv->id.state = IB_CM_LISTEN; + spin_unlock_irq(&cm_id_priv->lock); -new_id: - /* Use newly created ID */ - err = __ib_cm_listen(cm_id, service_id, 0); - - spin_unlock_irqrestore(&cm.lock, flags); + /* + * A listen ID does not need to be in the xarray since it does not + * receive mads, is not placed in the remote_id or remote_qpn rbtree, + * and does not enter timewait. 
+ */ - if (err) { - ib_destroy_cm_id(cm_id); - return ERR_PTR(err); - } - return cm_id; + return &cm_id_priv->id; } EXPORT_SYMBOL(ib_cm_insert_listen); static __be64 cm_form_tid(struct cm_id_private *cm_id_priv) { - u64 hi_tid, low_tid; + u64 hi_tid = 0, low_tid; - hi_tid = ((u64) cm_id_priv->av.port->mad_agent->hi_tid) << 32; - low_tid = (u64)cm_id_priv->id.local_id; + lockdep_assert_held(&cm_id_priv->lock); + + low_tid = (u64)cm_id_priv->id.local_id; + if (!cm_id_priv->av.port) + return cpu_to_be64(low_tid); + + spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); + if (cm_id_priv->av.port->mad_agent) + hi_tid = ((u64)cm_id_priv->av.port->mad_agent->hi_tid) << 32; + spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); return cpu_to_be64(hi_tid | low_tid); } @@ -1237,6 +1291,13 @@ static void cm_format_mad_hdr(struct ib_mad_hdr *hdr, hdr->tid = tid; } +static void cm_format_mad_ece_hdr(struct ib_mad_hdr *hdr, __be16 attr_id, + __be64 tid, u32 attr_mod) +{ + cm_format_mad_hdr(hdr, attr_id, tid); + hdr->attr_mod = cpu_to_be32(attr_mod); +} + static void cm_format_req(struct cm_req_msg *req_msg, struct cm_id_private *cm_id_priv, struct ib_cm_req_param *param) @@ -1244,13 +1305,14 @@ static void cm_format_req(struct cm_req_msg *req_msg, struct sa_path_rec *pri_path = param->primary_path; struct sa_path_rec *alt_path = param->alternate_path; bool pri_ext = false; + __be16 lid; if (pri_path->rec_type == SA_PATH_REC_TYPE_OPA) pri_ext = opa_is_extended_lid(pri_path->opa.dlid, pri_path->opa.slid); - cm_format_mad_hdr(&req_msg->hdr, CM_REQ_ATTR_ID, - cm_form_tid(cm_id_priv)); + cm_format_mad_ece_hdr(&req_msg->hdr, CM_REQ_ATTR_ID, + cm_form_tid(cm_id_priv), param->ece.attr_mod); IBA_SET(CM_REQ_LOCAL_COMM_ID, req_msg, be32_to_cpu(cm_id_priv->id.local_id)); @@ -1303,9 +1365,16 @@ static void cm_format_req(struct cm_req_msg *req_msg, htons(ntohl(sa_path_get_dlid( pri_path))))); } else { + + if (param->primary_path_inbound) { + lid = param->primary_path_inbound->ib.dlid; + IBA_SET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg, + be16_to_cpu(lid)); + } else + IBA_SET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg, + be16_to_cpu(IB_LID_PERMISSIVE)); + /* Work-around until there's a way to obtain remote LID info */ - IBA_SET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg, - be16_to_cpu(IB_LID_PERMISSIVE)); IBA_SET(CM_REQ_PRIMARY_REMOTE_PORT_LID, req_msg, be16_to_cpu(IB_LID_PERMISSIVE)); } @@ -1373,6 +1442,7 @@ static void cm_format_req(struct cm_req_msg *req_msg, cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay, alt_path->packet_life_time)); } + IBA_SET(CM_REQ_VENDOR_ID, req_msg, param->ece.vendor_id); if (param->private_data && param->private_data_len) IBA_SET_MEM(CM_REQ_PRIVATE_DATA, req_msg, param->private_data, @@ -1381,10 +1451,6 @@ static void cm_format_req(struct cm_req_msg *req_msg, static int cm_validate_req_param(struct ib_cm_req_param *param) { - /* peer-to-peer not supported */ - if (param->peer_to_peer) - return -EINVAL; - if (!param->primary_path) return -EINVAL; @@ -1407,7 +1473,9 @@ static int cm_validate_req_param(struct ib_cm_req_param *param) int ib_send_cm_req(struct ib_cm_id *cm_id, struct ib_cm_req_param *param) { + struct cm_av av = {}, alt_av = {}; struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; struct cm_req_msg *req_msg; unsigned long flags; int ret; @@ -1419,10 +1487,9 @@ int ib_send_cm_req(struct ib_cm_id *cm_id, /* Verify that we're not in timewait. 
*/ cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); - if (cm_id->state != IB_CM_IDLE) { + if (cm_id->state != IB_CM_IDLE || WARN_ON(cm_id_priv->timewait_info)) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); - ret = -EINVAL; - goto out; + return -EINVAL; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); @@ -1430,22 +1497,23 @@ int ib_send_cm_req(struct ib_cm_id *cm_id, id.local_id); if (IS_ERR(cm_id_priv->timewait_info)) { ret = PTR_ERR(cm_id_priv->timewait_info); - goto out; + cm_id_priv->timewait_info = NULL; + return ret; } ret = cm_init_av_by_path(param->primary_path, - param->ppath_sgid_attr, &cm_id_priv->av, - cm_id_priv); + param->ppath_sgid_attr, &av); if (ret) - goto error1; + return ret; if (param->alternate_path) { ret = cm_init_av_by_path(param->alternate_path, NULL, - &cm_id_priv->alt_av, cm_id_priv); - if (ret) - goto error1; + &alt_av); + if (ret) { + cm_destroy_av(&av); + return ret; + } } cm_id->service_id = param->service_id; - cm_id->service_mask = ~cpu_to_be64(0); cm_id_priv->timeout_ms = cm_convert_to_ms( param->primary_path->packet_life_time) * 2 + cm_convert_to_ms( @@ -1458,33 +1526,44 @@ int ib_send_cm_req(struct ib_cm_id *cm_id, cm_id_priv->pkey = param->primary_path->pkey; cm_id_priv->qp_type = param->qp_type; - ret = cm_alloc_msg(cm_id_priv, &cm_id_priv->msg); - if (ret) - goto error1; + spin_lock_irqsave(&cm_id_priv->lock, flags); + + cm_move_av_from_path(&cm_id_priv->av, &av); + if (param->primary_path_outbound) + cm_id_priv->av.dlid_datapath = + be16_to_cpu(param->primary_path_outbound->ib.dlid); + + if (param->alternate_path) + cm_move_av_from_path(&cm_id_priv->alt_av, &alt_av); - req_msg = (struct cm_req_msg *) cm_id_priv->msg->mad; + msg = cm_alloc_priv_msg(cm_id_priv); + if (IS_ERR(msg)) { + ret = PTR_ERR(msg); + goto out_unlock; + } + + req_msg = (struct cm_req_msg *)msg->mad; cm_format_req(req_msg, cm_id_priv, param); cm_id_priv->tid = req_msg->hdr.tid; - cm_id_priv->msg->timeout_ms = cm_id_priv->timeout_ms; - cm_id_priv->msg->context[1] = (void *) (unsigned long) IB_CM_REQ_SENT; + msg->timeout_ms = cm_id_priv->timeout_ms; + msg->context[1] = (void *)(unsigned long)IB_CM_REQ_SENT; cm_id_priv->local_qpn = cpu_to_be32(IBA_GET(CM_REQ_LOCAL_QPN, req_msg)); cm_id_priv->rq_psn = cpu_to_be32(IBA_GET(CM_REQ_STARTING_PSN, req_msg)); - spin_lock_irqsave(&cm_id_priv->lock, flags); - ret = ib_post_send_mad(cm_id_priv->msg, NULL); - if (ret) { - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - goto error2; - } + trace_icm_send_req(&cm_id_priv->id); + ret = ib_post_send_mad(msg, NULL); + if (ret) + goto out_free; BUG_ON(cm_id->state != IB_CM_IDLE); cm_id->state = IB_CM_REQ_SENT; spin_unlock_irqrestore(&cm_id_priv->lock, flags); return 0; - -error2: cm_free_msg(cm_id_priv->msg); -error1: kfree(cm_id_priv->timewait_info); -out: return ret; +out_free: + cm_free_priv_msg(msg); +out_unlock: + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; } EXPORT_SYMBOL(ib_send_cm_req); @@ -1519,9 +1598,12 @@ static int cm_issue_rej(struct cm_port *port, IBA_SET_MEM(CM_REJ_ARI, rej_msg, ari, ari_length); } + trace_icm_issue_rej( + IBA_GET(CM_REJ_LOCAL_COMM_ID, rcv_msg), + IBA_GET(CM_REJ_REMOTE_COMM_ID, rcv_msg)); ret = ib_post_send_mad(msg, NULL); if (ret) - cm_free_msg(msg); + cm_free_response_msg(msg); return ret; } @@ -1534,7 +1616,7 @@ static bool cm_req_has_alt_path(struct cm_req_msg *req_msg) req_msg)))); } -static void cm_path_set_rec_type(struct ib_device *ib_device, u8 port_num, +static void 
cm_path_set_rec_type(struct ib_device *ib_device, u32 port_num, struct sa_path_rec *path, union ib_gid *gid) { if (ib_is_opa_gid(gid) && rdma_cap_opa_ah(ib_device, port_num)) @@ -1545,14 +1627,13 @@ static void cm_path_set_rec_type(struct ib_device *ib_device, u8 port_num, static void cm_format_path_lid_from_req(struct cm_req_msg *req_msg, struct sa_path_rec *primary_path, - struct sa_path_rec *alt_path) + struct sa_path_rec *alt_path, + struct ib_wc *wc) { u32 lid; if (primary_path->rec_type != SA_PATH_REC_TYPE_OPA) { - sa_path_set_dlid(primary_path, - IBA_GET(CM_REQ_PRIMARY_LOCAL_PORT_LID, - req_msg)); + sa_path_set_dlid(primary_path, wc->slid); sa_path_set_slid(primary_path, IBA_GET(CM_REQ_PRIMARY_REMOTE_PORT_LID, req_msg)); @@ -1589,7 +1670,8 @@ static void cm_format_path_lid_from_req(struct cm_req_msg *req_msg, static void cm_format_paths_from_req(struct cm_req_msg *req_msg, struct sa_path_rec *primary_path, - struct sa_path_rec *alt_path) + struct sa_path_rec *alt_path, + struct ib_wc *wc) { primary_path->dgid = *IBA_GET_MEM_PTR(CM_REQ_PRIMARY_LOCAL_PORT_GID, req_msg); @@ -1647,20 +1729,20 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg, if (sa_path_is_roce(alt_path)) alt_path->roce.route_resolved = false; } - cm_format_path_lid_from_req(req_msg, primary_path, alt_path); + cm_format_path_lid_from_req(req_msg, primary_path, alt_path, wc); } static u16 cm_get_bth_pkey(struct cm_work *work) { struct ib_device *ib_dev = work->port->cm_dev->ib_device; - u8 port_num = work->port->port_num; + u32 port_num = work->port->port_num; u16 pkey_index = work->mad_recv_wc->wc->pkey_index; u16 pkey; int ret; ret = ib_get_cached_pkey(ib_dev, port_num, pkey_index, &pkey); if (ret) { - dev_warn_ratelimited(&ib_dev->dev, "ib_cm: Couldn't retrieve pkey for incoming request (port %d, pkey index %d). %d\n", + dev_warn_ratelimited(&ib_dev->dev, "ib_cm: Couldn't retrieve pkey for incoming request (port %u, pkey index %u). %d\n", port_num, pkey_index, ret); return 0; } @@ -1669,7 +1751,7 @@ static u16 cm_get_bth_pkey(struct cm_work *work) } /** - * Convert OPA SGID to IB SGID + * cm_opa_to_ib_sgid - Convert OPA SGID to IB SGID * ULPs (such as IPoIB) do not understand OPA GIDs and will * reject them as the local_gid will not match the sgid. Therefore, * change the pathrec's SGID to an IB SGID. 
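As an aside on the path handling above: cm_path_set_rec_type() (now taking a u32 port number) only selects an OPA path record when the GID is an OPA GID and the port advertises OPA address handles, and cm_format_path_lid_from_req() now takes the passive side's DLID from the work completion's SLID instead of the LID field carried in the REQ. A stand-alone sketch of that record-type decision, using only the helpers visible in this hunk (the example_ name is a placeholder):

        #include <rdma/ib_verbs.h>
        #include <rdma/ib_sa.h>
        #include <rdma/opa_addr.h>

        static enum sa_path_rec_type example_pick_rec_type(struct ib_device *dev,
                                                           u32 port_num,
                                                           union ib_gid *gid)
        {
                /* OPA only when both the GID and the port say so; IB otherwise. */
                if (ib_is_opa_gid(gid) && rdma_cap_opa_ah(dev, port_num))
                        return SA_PATH_REC_TYPE_OPA;
                return SA_PATH_REC_TYPE_IB;
        }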
@@ -1681,7 +1763,7 @@ static void cm_opa_to_ib_sgid(struct cm_work *work, struct sa_path_rec *path) { struct ib_device *dev = work->port->cm_dev->ib_device; - u8 port_num = work->port->port_num; + u32 port_num = work->port->port_num; if (rdma_cap_opa_ah(dev, port_num) && (ib_is_opa_gid(&path->sgid))) { @@ -1734,6 +1816,9 @@ static void cm_format_req_event(struct cm_work *work, param->rnr_retry_count = IBA_GET(CM_REQ_RNR_RETRY_COUNT, req_msg); param->srq = IBA_GET(CM_REQ_SRQ, req_msg); param->ppath_sgid_attr = cm_id_priv->av.ah_attr.grh.sgid_attr; + param->ece.vendor_id = IBA_GET(CM_REQ_VENDOR_ID, req_msg); + param->ece.attr_mod = be32_to_cpu(req_msg->hdr.attr_mod); + work->cm_event.private_data = IBA_GET_MEM_PTR(CM_REQ_PRIVATE_DATA, req_msg); } @@ -1783,17 +1868,17 @@ static void cm_format_mra(struct cm_mra_msg *mra_msg, static void cm_format_rej(struct cm_rej_msg *rej_msg, struct cm_id_private *cm_id_priv, - enum ib_cm_rej_reason reason, - void *ari, - u8 ari_length, - const void *private_data, - u8 private_data_len) + enum ib_cm_rej_reason reason, void *ari, + u8 ari_length, const void *private_data, + u8 private_data_len, enum ib_cm_state state) { + lockdep_assert_held(&cm_id_priv->lock); + cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, cm_id_priv->tid); IBA_SET(CM_REJ_REMOTE_COMM_ID, rej_msg, be32_to_cpu(cm_id_priv->id.remote_id)); - switch(cm_id_priv->id.state) { + switch (state) { case IB_CM_REQ_RCVD: IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg, be32_to_cpu(0)); IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg, CM_MSG_RESPONSE_REQ); @@ -1834,12 +1919,16 @@ static void cm_dup_req_handler(struct cm_work *work, struct ib_mad_send_buf *msg = NULL; int ret; - atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. - counter[CM_REQ_COUNTER]); + atomic_long_inc( + &work->port->counters[CM_RECV_DUPLICATES][CM_REQ_COUNTER]); /* Quick state check to discard duplicate REQs. */ - if (cm_id_priv->id.state == IB_CM_REQ_RCVD) + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state == IB_CM_REQ_RCVD) { + spin_unlock_irq(&cm_id_priv->lock); return; + } + spin_unlock_irq(&cm_id_priv->lock); ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg); if (ret) @@ -1854,30 +1943,31 @@ static void cm_dup_req_handler(struct cm_work *work, cm_id_priv->private_data_len); break; case IB_CM_TIMEWAIT: - cm_format_rej((struct cm_rej_msg *) msg->mad, cm_id_priv, - IB_CM_REJ_STALE_CONN, NULL, 0, NULL, 0); + cm_format_rej((struct cm_rej_msg *)msg->mad, cm_id_priv, + IB_CM_REJ_STALE_CONN, NULL, 0, NULL, 0, + IB_CM_TIMEWAIT); break; default: goto unlock; } spin_unlock_irq(&cm_id_priv->lock); + trace_icm_send_dup_req(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); if (ret) goto free; return; unlock: spin_unlock_irq(&cm_id_priv->lock); -free: cm_free_msg(msg); +free: cm_free_response_msg(msg); } -static struct cm_id_private * cm_match_req(struct cm_work *work, - struct cm_id_private *cm_id_priv) +static struct cm_id_private *cm_match_req(struct cm_work *work, + struct cm_id_private *cm_id_priv) { struct cm_id_private *listen_cm_id_priv, *cur_cm_id_priv; struct cm_timewait_info *timewait_info; struct cm_req_msg *req_msg; - struct ib_cm_id *cm_id; req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; @@ -1898,7 +1988,7 @@ static struct cm_id_private * cm_match_req(struct cm_work *work, /* Check for stale connections. 
*/ timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info); if (timewait_info) { - cm_cleanup_timewait(cm_id_priv->timewait_info); + cm_remove_remote(cm_id_priv); cur_cm_id_priv = cm_acquire_id(timewait_info->work.local_id, timewait_info->work.remote_id); @@ -1907,8 +1997,7 @@ static struct cm_id_private * cm_match_req(struct cm_work *work, IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REQ, NULL, 0); if (cur_cm_id_priv) { - cm_id = &cur_cm_id_priv->id; - ib_send_cm_dreq(cm_id, NULL, 0); + ib_send_cm_dreq(&cur_cm_id_priv->id, NULL, 0); cm_deref_id(cur_cm_id_priv); } return NULL; @@ -1919,19 +2008,14 @@ static struct cm_id_private * cm_match_req(struct cm_work *work, cm_id_priv->id.device, cpu_to_be64(IBA_GET(CM_REQ_SERVICE_ID, req_msg))); if (!listen_cm_id_priv) { - cm_cleanup_timewait(cm_id_priv->timewait_info); + cm_remove_remote(cm_id_priv); spin_unlock_irq(&cm.lock); cm_issue_rej(work->port, work->mad_recv_wc, IB_CM_REJ_INVALID_SERVICE_ID, CM_MSG_RESPONSE_REQ, NULL, 0); - goto out; + return NULL; } - refcount_inc(&listen_cm_id_priv->refcount); - refcount_inc(&cm_id_priv->refcount); - cm_id_priv->id.state = IB_CM_REQ_RCVD; - atomic_inc(&cm_id_priv->work_count); spin_unlock_irq(&cm.lock); -out: return listen_cm_id_priv; } @@ -1973,7 +2057,6 @@ static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc) static int cm_req_handler(struct cm_work *work) { - struct ib_cm_id *cm_id; struct cm_id_private *cm_id_priv, *listen_cm_id_priv; struct cm_req_msg *req_msg; const struct ib_global_route *grh; @@ -1982,13 +2065,32 @@ static int cm_req_handler(struct cm_work *work) req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; - cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL); - if (IS_ERR(cm_id)) - return PTR_ERR(cm_id); + cm_id_priv = + cm_alloc_id_priv(work->port->cm_dev->ib_device, NULL, NULL); + if (IS_ERR(cm_id_priv)) + return PTR_ERR(cm_id_priv); - cm_id_priv = container_of(cm_id, struct cm_id_private, id); cm_id_priv->id.remote_id = cpu_to_be32(IBA_GET(CM_REQ_LOCAL_COMM_ID, req_msg)); + cm_id_priv->id.service_id = + cpu_to_be64(IBA_GET(CM_REQ_SERVICE_ID, req_msg)); + cm_id_priv->tid = req_msg->hdr.tid; + cm_id_priv->timeout_ms = cm_convert_to_ms( + IBA_GET(CM_REQ_LOCAL_CM_RESPONSE_TIMEOUT, req_msg)); + cm_id_priv->max_cm_retries = IBA_GET(CM_REQ_MAX_CM_RETRIES, req_msg); + cm_id_priv->remote_qpn = + cpu_to_be32(IBA_GET(CM_REQ_LOCAL_QPN, req_msg)); + cm_id_priv->initiator_depth = + IBA_GET(CM_REQ_RESPONDER_RESOURCES, req_msg); + cm_id_priv->responder_resources = + IBA_GET(CM_REQ_INITIATOR_DEPTH, req_msg); + cm_id_priv->path_mtu = IBA_GET(CM_REQ_PATH_PACKET_PAYLOAD_MTU, req_msg); + cm_id_priv->pkey = cpu_to_be16(IBA_GET(CM_REQ_PARTITION_KEY, req_msg)); + cm_id_priv->sq_psn = cpu_to_be32(IBA_GET(CM_REQ_STARTING_PSN, req_msg)); + cm_id_priv->retry_count = IBA_GET(CM_REQ_RETRY_COUNT, req_msg); + cm_id_priv->rnr_retry_count = IBA_GET(CM_REQ_RNR_RETRY_COUNT, req_msg); + cm_id_priv->qp_type = cm_req_get_qp_type(req_msg); + ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, work->mad_recv_wc->recv_buf.grh, &cm_id_priv->av); @@ -1998,43 +2100,39 @@ static int cm_req_handler(struct cm_work *work) id.local_id); if (IS_ERR(cm_id_priv->timewait_info)) { ret = PTR_ERR(cm_id_priv->timewait_info); + cm_id_priv->timewait_info = NULL; goto destroy; } - cm_id_priv->timewait_info->work.remote_id = - cpu_to_be32(IBA_GET(CM_REQ_LOCAL_COMM_ID, req_msg)); + cm_id_priv->timewait_info->work.remote_id = cm_id_priv->id.remote_id; 
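The REQ handler in this hunk copies every field it needs out of the MAD into the new cm_id_priv before the ID is made findable; cm_finalize_id(), a little further down, is the single point where concurrent MAD handlers can see it in cm.local_id_table (an xarray, as the destroy path's xa_erase() earlier in this section shows). A generic sketch of that publish-after-init idiom, independent of the CM's actual structures and with illustrative example_ names:

        #include <linux/gfp.h>
        #include <linux/xarray.h>

        struct example_obj {
                unsigned long id;
                int state;
                /* ... everything readers may touch ... */
        };

        /* Fill in all fields first; xa_store() is the publication point. */
        static int example_publish(struct xarray *xa, struct example_obj *obj)
        {
                obj->state = 1;

                return xa_err(xa_store(xa, obj->id, obj, GFP_KERNEL));
        }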
cm_id_priv->timewait_info->remote_ca_guid = cpu_to_be64(IBA_GET(CM_REQ_LOCAL_CA_GUID, req_msg)); - cm_id_priv->timewait_info->remote_qpn = - cpu_to_be32(IBA_GET(CM_REQ_LOCAL_QPN, req_msg)); + cm_id_priv->timewait_info->remote_qpn = cm_id_priv->remote_qpn; + + /* + * Note that the ID pointer is not in the xarray at this point, + * so this set is only visible to the local thread. + */ + cm_id_priv->id.state = IB_CM_REQ_RCVD; listen_cm_id_priv = cm_match_req(work, cm_id_priv); if (!listen_cm_id_priv) { - pr_debug("%s: local_id %d, no listen_cm_id_priv\n", __func__, - be32_to_cpu(cm_id->local_id)); + trace_icm_no_listener_err(&cm_id_priv->id); + cm_id_priv->id.state = IB_CM_IDLE; ret = -EINVAL; - goto free_timeinfo; + goto destroy; } - cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler; - cm_id_priv->id.context = listen_cm_id_priv->id.context; - cm_id_priv->id.service_id = - cpu_to_be64(IBA_GET(CM_REQ_SERVICE_ID, req_msg)); - cm_id_priv->id.service_mask = ~cpu_to_be64(0); - - cm_process_routed_req(req_msg, work->mad_recv_wc->wc); - memset(&work->path[0], 0, sizeof(work->path[0])); if (cm_req_has_alt_path(req_msg)) memset(&work->path[1], 0, sizeof(work->path[1])); grh = rdma_ah_read_grh(&cm_id_priv->av.ah_attr); gid_attr = grh->sgid_attr; - if (gid_attr && - rdma_protocol_roce(work->port->cm_dev->ib_device, - work->port->port_num)) { + if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) { work->path[0].rec_type = sa_conv_gid_to_pathrec_type(gid_attr->gid_type); } else { + cm_process_routed_req(req_msg, work->mad_recv_wc->wc); cm_path_set_rec_type( work->port->cm_dev->ib_device, work->port->port_num, &work->path[0], @@ -2044,13 +2142,15 @@ static int cm_req_handler(struct cm_work *work) if (cm_req_has_alt_path(req_msg)) work->path[1].rec_type = work->path[0].rec_type; cm_format_paths_from_req(req_msg, &work->path[0], - &work->path[1]); + &work->path[1], work->mad_recv_wc->wc); if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) sa_path_set_dmac(&work->path[0], cm_id_priv->av.ah_attr.roce.dmac); work->path[0].hop_limit = grh->hop_limit; - ret = cm_init_av_by_path(&work->path[0], gid_attr, &cm_id_priv->av, - cm_id_priv); + + /* This destroy call is needed to pair with cm_init_av_for_response */ + cm_destroy_av(&cm_id_priv->av); + ret = cm_init_av_by_path(&work->path[0], gid_attr, &cm_id_priv->av); if (ret) { int err; @@ -2058,54 +2158,55 @@ static int cm_req_handler(struct cm_work *work) work->port->port_num, 0, &work->path[0].sgid); if (err) - ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, + ib_send_cm_rej(&cm_id_priv->id, IB_CM_REJ_INVALID_GID, NULL, 0, NULL, 0); else - ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, + ib_send_cm_rej(&cm_id_priv->id, IB_CM_REJ_INVALID_GID, &work->path[0].sgid, sizeof(work->path[0].sgid), NULL, 0); goto rejected; } + if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_IB) + cm_id_priv->av.dlid_datapath = + IBA_GET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg); + if (cm_req_has_alt_path(req_msg)) { ret = cm_init_av_by_path(&work->path[1], NULL, - &cm_id_priv->alt_av, cm_id_priv); + &cm_id_priv->alt_av); if (ret) { - ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_ALT_GID, + ib_send_cm_rej(&cm_id_priv->id, + IB_CM_REJ_INVALID_ALT_GID, &work->path[0].sgid, sizeof(work->path[0].sgid), NULL, 0); goto rejected; } } - cm_id_priv->tid = req_msg->hdr.tid; - cm_id_priv->timeout_ms = cm_convert_to_ms( - IBA_GET(CM_REQ_LOCAL_CM_RESPONSE_TIMEOUT, req_msg)); - cm_id_priv->max_cm_retries = IBA_GET(CM_REQ_MAX_CM_RETRIES, req_msg); - cm_id_priv->remote_qpn = - 
cpu_to_be32(IBA_GET(CM_REQ_LOCAL_QPN, req_msg)); - cm_id_priv->initiator_depth = - IBA_GET(CM_REQ_RESPONDER_RESOURCES, req_msg); - cm_id_priv->responder_resources = - IBA_GET(CM_REQ_INITIATOR_DEPTH, req_msg); - cm_id_priv->path_mtu = IBA_GET(CM_REQ_PATH_PACKET_PAYLOAD_MTU, req_msg); - cm_id_priv->pkey = cpu_to_be16(IBA_GET(CM_REQ_PARTITION_KEY, req_msg)); - cm_id_priv->sq_psn = cpu_to_be32(IBA_GET(CM_REQ_STARTING_PSN, req_msg)); - cm_id_priv->retry_count = IBA_GET(CM_REQ_RETRY_COUNT, req_msg); - cm_id_priv->rnr_retry_count = IBA_GET(CM_REQ_RNR_RETRY_COUNT, req_msg); - cm_id_priv->qp_type = cm_req_get_qp_type(req_msg); + cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler; + cm_id_priv->id.context = listen_cm_id_priv->id.context; cm_format_req_event(work, cm_id_priv, &listen_cm_id_priv->id); - cm_process_work(cm_id_priv, work); + + /* Now MAD handlers can see the new ID */ + spin_lock_irq(&cm_id_priv->lock); + cm_finalize_id(cm_id_priv); + + /* Refcount belongs to the event, pairs with cm_process_work() */ + refcount_inc(&cm_id_priv->refcount); + cm_queue_work_unlock(cm_id_priv, work); + /* + * Since this ID was just created and was not made visible to other MAD + * handlers until the cm_finalize_id() above we know that the + * cm_process_work() will deliver the event and the listen_cm_id + * embedded in the event can be derefed here. + */ cm_deref_id(listen_cm_id_priv); return 0; rejected: - refcount_dec(&cm_id_priv->refcount); cm_deref_id(listen_cm_id_priv); -free_timeinfo: - kfree(cm_id_priv->timewait_info); destroy: - ib_destroy_cm_id(cm_id); + ib_destroy_cm_id(&cm_id_priv->id); return ret; } @@ -2113,7 +2214,8 @@ static void cm_format_rep(struct cm_rep_msg *rep_msg, struct cm_id_private *cm_id_priv, struct ib_cm_rep_param *param) { - cm_format_mad_hdr(&rep_msg->hdr, CM_REP_ATTR_ID, cm_id_priv->tid); + cm_format_mad_ece_hdr(&rep_msg->hdr, CM_REP_ATTR_ID, cm_id_priv->tid, + param->ece.attr_mod); IBA_SET(CM_REP_LOCAL_COMM_ID, rep_msg, be32_to_cpu(cm_id_priv->id.local_id)); IBA_SET(CM_REP_REMOTE_COMM_ID, rep_msg, @@ -2140,6 +2242,10 @@ static void cm_format_rep(struct cm_rep_msg *rep_msg, IBA_SET(CM_REP_LOCAL_EE_CONTEXT_NUMBER, rep_msg, param->qp_num); } + IBA_SET(CM_REP_VENDOR_ID_L, rep_msg, param->ece.vendor_id); + IBA_SET(CM_REP_VENDOR_ID_M, rep_msg, param->ece.vendor_id >> 8); + IBA_SET(CM_REP_VENDOR_ID_H, rep_msg, param->ece.vendor_id >> 16); + if (param->private_data && param->private_data_len) IBA_SET_MEM(CM_REP_PRIVATE_DATA, rep_msg, param->private_data, param->private_data_len); @@ -2162,36 +2268,42 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id, spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_REQ_RCVD && cm_id->state != IB_CM_MRA_REQ_SENT) { - pr_debug("%s: local_comm_id %d, cm_id->state: %d\n", __func__, - be32_to_cpu(cm_id_priv->id.local_id), cm_id->state); + trace_icm_send_rep_err(cm_id_priv->id.local_id, cm_id->state); ret = -EINVAL; goto out; } - ret = cm_alloc_msg(cm_id_priv, &msg); - if (ret) + msg = cm_alloc_priv_msg(cm_id_priv); + if (IS_ERR(msg)) { + ret = PTR_ERR(msg); goto out; + } rep_msg = (struct cm_rep_msg *) msg->mad; cm_format_rep(rep_msg, cm_id_priv, param); msg->timeout_ms = cm_id_priv->timeout_ms; msg->context[1] = (void *) (unsigned long) IB_CM_REP_SENT; + trace_icm_send_rep(cm_id); ret = ib_post_send_mad(msg, NULL); - if (ret) { - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - cm_free_msg(msg); - return ret; - } + if (ret) + goto out_free; cm_id->state = IB_CM_REP_SENT; - cm_id_priv->msg = msg; cm_id_priv->initiator_depth 
= param->initiator_depth; cm_id_priv->responder_resources = param->responder_resources; cm_id_priv->rq_psn = cpu_to_be32(IBA_GET(CM_REP_STARTING_PSN, rep_msg)); + WARN_ONCE(param->qp_num & 0xFF000000, + "IBTA declares QPN to be 24 bits, but it is 0x%X\n", + param->qp_num); cm_id_priv->local_qpn = cpu_to_be32(param->qp_num & 0xFFFFFF); + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return 0; -out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); +out_free: + cm_free_priv_msg(msg); +out: + spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_rep); @@ -2233,19 +2345,21 @@ int ib_send_cm_rtu(struct ib_cm_id *cm_id, spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_REP_RCVD && cm_id->state != IB_CM_MRA_REP_SENT) { - pr_debug("%s: local_id %d, cm_id->state %d\n", __func__, - be32_to_cpu(cm_id->local_id), cm_id->state); + trace_icm_send_cm_rtu_err(cm_id); ret = -EINVAL; goto error; } - ret = cm_alloc_msg(cm_id_priv, &msg); - if (ret) + msg = cm_alloc_msg(cm_id_priv); + if (IS_ERR(msg)) { + ret = PTR_ERR(msg); goto error; + } cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv, private_data, private_data_len); + trace_icm_send_rtu(cm_id); ret = ib_post_send_mad(msg, NULL); if (ret) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); @@ -2284,6 +2398,11 @@ static void cm_format_rep_event(struct cm_work *work, enum ib_qp_type qp_type) param->flow_control = IBA_GET(CM_REP_END_TO_END_FLOW_CONTROL, rep_msg); param->rnr_retry_count = IBA_GET(CM_REP_RNR_RETRY_COUNT, rep_msg); param->srq = IBA_GET(CM_REP_SRQ, rep_msg); + param->ece.vendor_id = IBA_GET(CM_REP_VENDOR_ID_H, rep_msg) << 16; + param->ece.vendor_id |= IBA_GET(CM_REP_VENDOR_ID_M, rep_msg) << 8; + param->ece.vendor_id |= IBA_GET(CM_REP_VENDOR_ID_L, rep_msg); + param->ece.attr_mod = be32_to_cpu(rep_msg->hdr.attr_mod); + work->cm_event.private_data = IBA_GET_MEM_PTR(CM_REP_PRIVATE_DATA, rep_msg); } @@ -2302,8 +2421,8 @@ static void cm_dup_rep_handler(struct cm_work *work) if (!cm_id_priv) return; - atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. 
- counter[CM_REP_COUNTER]); + atomic_long_inc( + &work->port->counters[CM_RECV_DUPLICATES][CM_REP_COUNTER]); ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg); if (ret) goto deref; @@ -2322,13 +2441,14 @@ static void cm_dup_rep_handler(struct cm_work *work) goto unlock; spin_unlock_irq(&cm_id_priv->lock); + trace_icm_send_dup_rep(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); if (ret) goto free; goto deref; unlock: spin_unlock_irq(&cm_id_priv->lock); -free: cm_free_msg(msg); +free: cm_free_response_msg(msg); deref: cm_deref_id(cm_id_priv); } @@ -2338,7 +2458,6 @@ static int cm_rep_handler(struct cm_work *work) struct cm_rep_msg *rep_msg; int ret; struct cm_id_private *cur_cm_id_priv; - struct ib_cm_id *cm_id; struct cm_timewait_info *timewait_info; rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad; @@ -2346,7 +2465,7 @@ static int cm_rep_handler(struct cm_work *work) cpu_to_be32(IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)), 0); if (!cm_id_priv) { cm_dup_rep_handler(work); - pr_debug("%s: remote_comm_id %d, no cm_id_priv\n", __func__, + trace_icm_remote_no_priv_err( IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); return -EINVAL; } @@ -2359,13 +2478,12 @@ static int cm_rep_handler(struct cm_work *work) case IB_CM_MRA_REQ_RCVD: break; default: - spin_unlock_irq(&cm_id_priv->lock); ret = -EINVAL; - pr_debug( - "%s: cm_id_priv->id.state: %d, local_comm_id %d, remote_comm_id %d\n", - __func__, cm_id_priv->id.state, + trace_icm_rep_unknown_err( IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg), - IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); + IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg), + cm_id_priv->id.state); + spin_unlock_irq(&cm_id_priv->lock); goto error; } @@ -2381,16 +2499,14 @@ static int cm_rep_handler(struct cm_work *work) spin_unlock(&cm.lock); spin_unlock_irq(&cm_id_priv->lock); ret = -EINVAL; - pr_debug("%s: Failed to insert remote id %d\n", __func__, + trace_icm_insert_failed_err( IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); goto error; } /* Check for a stale connection. */ timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info); if (timewait_info) { - rb_erase(&cm_id_priv->timewait_info->remote_id_node, - &cm.remote_id_table); - cm_id_priv->timewait_info->inserted_remote_id = 0; + cm_remove_remote(cm_id_priv); cur_cm_id_priv = cm_acquire_id(timewait_info->work.local_id, timewait_info->work.remote_id); @@ -2400,14 +2516,12 @@ static int cm_rep_handler(struct cm_work *work) IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP, NULL, 0); ret = -EINVAL; - pr_debug( - "%s: Stale connection. 
local_comm_id %d, remote_comm_id %d\n", - __func__, IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg), + trace_icm_staleconn_err( + IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg), IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); if (cur_cm_id_priv) { - cm_id = &cur_cm_id_priv->id; - ib_send_cm_dreq(cm_id, NULL, 0); + ib_send_cm_dreq(&cur_cm_id_priv->id, NULL, 0); cm_deref_id(cur_cm_id_priv); } @@ -2434,18 +2548,8 @@ static int cm_rep_handler(struct cm_work *work) cm_ack_timeout(cm_id_priv->target_ack_delay, cm_id_priv->alt_av.timeout - 1); - /* todo: handle peer_to_peer */ - - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + ib_cancel_mad(cm_id_priv->msg); + cm_queue_work_unlock(cm_id_priv, work); return 0; error: @@ -2456,7 +2560,6 @@ error: static int cm_establish_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; - int ret; /* See comment in cm_establish about lookup. */ cm_id_priv = cm_acquire_id(work->local_id, work->remote_id); @@ -2469,16 +2572,8 @@ static int cm_establish_handler(struct cm_work *work) goto out; } - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + ib_cancel_mad(cm_id_priv->msg); + cm_queue_work_unlock(cm_id_priv, work); return 0; out: cm_deref_id(cm_id_priv); @@ -2489,7 +2584,6 @@ static int cm_rtu_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_rtu_msg *rtu_msg; - int ret; rtu_msg = (struct cm_rtu_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id( @@ -2505,22 +2599,14 @@ static int cm_rtu_handler(struct cm_work *work) if (cm_id_priv->id.state != IB_CM_REP_SENT && cm_id_priv->id.state != IB_CM_MRA_REP_RCVD) { spin_unlock_irq(&cm_id_priv->lock); - atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. 
- counter[CM_RTU_COUNTER]); + atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] + [CM_RTU_COUNTER]); goto out; } cm_id_priv->id.state = IB_CM_ESTABLISHED; - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + ib_cancel_mad(cm_id_priv->msg); + cm_queue_work_unlock(cm_id_priv, work); return 0; out: cm_deref_id(cm_id_priv); @@ -2546,35 +2632,30 @@ static void cm_format_dreq(struct cm_dreq_msg *dreq_msg, private_data_len); } -int ib_send_cm_dreq(struct ib_cm_id *cm_id, - const void *private_data, - u8 private_data_len) +static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv, + const void *private_data, u8 private_data_len) { - struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; - unsigned long flags; int ret; + lockdep_assert_held(&cm_id_priv->lock); + if (private_data && private_data_len > IB_CM_DREQ_PRIVATE_DATA_SIZE) return -EINVAL; - cm_id_priv = container_of(cm_id, struct cm_id_private, id); - spin_lock_irqsave(&cm_id_priv->lock, flags); - if (cm_id->state != IB_CM_ESTABLISHED) { - pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__, - be32_to_cpu(cm_id->local_id), cm_id->state); - ret = -EINVAL; - goto out; + if (cm_id_priv->id.state != IB_CM_ESTABLISHED) { + trace_icm_dreq_skipped(&cm_id_priv->id); + return -EINVAL; } - if (cm_id->lap_state == IB_CM_LAP_SENT || - cm_id->lap_state == IB_CM_MRA_LAP_RCVD) - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT || + cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD) + ib_cancel_mad(cm_id_priv->msg); - ret = cm_alloc_msg(cm_id_priv, &msg); - if (ret) { + msg = cm_alloc_priv_msg(cm_id_priv); + if (IS_ERR(msg)) { cm_enter_timewait(cm_id_priv); - goto out; + return PTR_ERR(msg); } cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv, @@ -2582,17 +2663,29 @@ int ib_send_cm_dreq(struct ib_cm_id *cm_id, msg->timeout_ms = cm_id_priv->timeout_ms; msg->context[1] = (void *) (unsigned long) IB_CM_DREQ_SENT; + trace_icm_send_dreq(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); if (ret) { cm_enter_timewait(cm_id_priv); - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - cm_free_msg(msg); + cm_free_priv_msg(msg); return ret; } - cm_id->state = IB_CM_DREQ_SENT; - cm_id_priv->msg = msg; -out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + cm_id_priv->id.state = IB_CM_DREQ_SENT; + return 0; +} + +int ib_send_cm_dreq(struct ib_cm_id *cm_id, const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv = + container_of(cm_id, struct cm_id_private, id); + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + ret = cm_send_dreq_locked(cm_id_priv, private_data, private_data_len); + spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_dreq); @@ -2613,51 +2706,58 @@ static void cm_format_drep(struct cm_drep_msg *drep_msg, private_data_len); } -int ib_send_cm_drep(struct ib_cm_id *cm_id, - const void *private_data, - u8 private_data_len) +static int cm_send_drep_locked(struct cm_id_private *cm_id_priv, + void *private_data, u8 private_data_len) { - struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; - unsigned long flags; - void *data; int ret; + lockdep_assert_held(&cm_id_priv->lock); + if (private_data && 
private_data_len > IB_CM_DREP_PRIVATE_DATA_SIZE) return -EINVAL; - data = cm_copy_private_data(private_data, private_data_len); - if (IS_ERR(data)) - return PTR_ERR(data); - - cm_id_priv = container_of(cm_id, struct cm_id_private, id); - spin_lock_irqsave(&cm_id_priv->lock, flags); - if (cm_id->state != IB_CM_DREQ_RCVD) { - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - kfree(data); - pr_debug("%s: local_id %d, cm_idcm_id->state(%d) != IB_CM_DREQ_RCVD\n", - __func__, be32_to_cpu(cm_id->local_id), cm_id->state); + if (cm_id_priv->id.state != IB_CM_DREQ_RCVD) { + trace_icm_send_drep_err(&cm_id_priv->id); + kfree(private_data); return -EINVAL; } - cm_set_private_data(cm_id_priv, data, private_data_len); + cm_set_private_data(cm_id_priv, private_data, private_data_len); cm_enter_timewait(cm_id_priv); - ret = cm_alloc_msg(cm_id_priv, &msg); - if (ret) - goto out; + msg = cm_alloc_msg(cm_id_priv); + if (IS_ERR(msg)) + return PTR_ERR(msg); cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv, private_data, private_data_len); + trace_icm_send_drep(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); if (ret) { - spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_free_msg(msg); return ret; } + return 0; +} -out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); +int ib_send_cm_drep(struct ib_cm_id *cm_id, const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv = + container_of(cm_id, struct cm_id_private, id); + unsigned long flags; + void *data; + int ret; + + data = cm_copy_private_data(private_data, private_data_len); + if (IS_ERR(data)) + return PTR_ERR(data); + + spin_lock_irqsave(&cm_id_priv->lock, flags); + ret = cm_send_drep_locked(cm_id_priv, data, private_data_len); + spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_drep); @@ -2683,9 +2783,12 @@ static int cm_issue_drep(struct cm_port *port, IBA_SET(CM_DREP_LOCAL_COMM_ID, drep_msg, IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)); + trace_icm_issue_drep( + IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg), + IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)); ret = ib_post_send_mad(msg, NULL); if (ret) - cm_free_msg(msg); + cm_free_response_msg(msg); return ret; } @@ -2695,19 +2798,17 @@ static int cm_dreq_handler(struct cm_work *work) struct cm_id_private *cm_id_priv; struct cm_dreq_msg *dreq_msg; struct ib_mad_send_buf *msg = NULL; - int ret; dreq_msg = (struct cm_dreq_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id( cpu_to_be32(IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)), cpu_to_be32(IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg))); if (!cm_id_priv) { - atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. 
- counter[CM_DREQ_COUNTER]); + atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] + [CM_DREQ_COUNTER]); cm_issue_drep(work->port, work->mad_recv_wc); - pr_debug( - "%s: no cm_id_priv, local_comm_id %d, remote_comm_id %d\n", - __func__, IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg), + trace_icm_no_priv_err( + IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg), IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)); return -EINVAL; } @@ -2723,18 +2824,17 @@ static int cm_dreq_handler(struct cm_work *work) switch (cm_id_priv->id.state) { case IB_CM_REP_SENT: case IB_CM_DREQ_SENT: - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + case IB_CM_MRA_REP_RCVD: + ib_cancel_mad(cm_id_priv->msg); break; case IB_CM_ESTABLISHED: if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT || cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD) - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - break; - case IB_CM_MRA_REP_RCVD: + ib_cancel_mad(cm_id_priv->msg); break; case IB_CM_TIMEWAIT: - atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. - counter[CM_DREQ_COUNTER]); + atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] + [CM_DREQ_COUNTER]); msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc); if (IS_ERR(msg)) goto unlock; @@ -2746,29 +2846,19 @@ static int cm_dreq_handler(struct cm_work *work) if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) || ib_post_send_mad(msg, NULL)) - cm_free_msg(msg); + cm_free_response_msg(msg); goto deref; case IB_CM_DREQ_RCVD: - atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. - counter[CM_DREQ_COUNTER]); + atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] + [CM_DREQ_COUNTER]); goto unlock; default: - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_icm_dreq_unknown_err(&cm_id_priv->id); goto unlock; } cm_id_priv->id.state = IB_CM_DREQ_RCVD; cm_id_priv->tid = dreq_msg->hdr.tid; - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + cm_queue_work_unlock(cm_id_priv, work); return 0; unlock: spin_unlock_irq(&cm_id_priv->lock); @@ -2780,7 +2870,6 @@ static int cm_drep_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_drep_msg *drep_msg; - int ret; drep_msg = (struct cm_drep_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id( @@ -2800,81 +2889,82 @@ static int cm_drep_handler(struct cm_work *work) } cm_enter_timewait(cm_id_priv); - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + ib_cancel_mad(cm_id_priv->msg); + cm_queue_work_unlock(cm_id_priv, work); return 0; out: cm_deref_id(cm_id_priv); return -EINVAL; } -int ib_send_cm_rej(struct ib_cm_id *cm_id, - enum ib_cm_rej_reason reason, - void *ari, - u8 ari_length, - const void *private_data, - u8 private_data_len) +static int cm_send_rej_locked(struct cm_id_private *cm_id_priv, + enum ib_cm_rej_reason reason, void *ari, + u8 ari_length, const void *private_data, + u8 private_data_len) { - struct cm_id_private *cm_id_priv; + enum ib_cm_state state = cm_id_priv->id.state; struct ib_mad_send_buf 
*msg; - unsigned long flags; int ret; + lockdep_assert_held(&cm_id_priv->lock); + if ((private_data && private_data_len > IB_CM_REJ_PRIVATE_DATA_SIZE) || (ari && ari_length > IB_CM_REJ_ARI_LENGTH)) return -EINVAL; - cm_id_priv = container_of(cm_id, struct cm_id_private, id); - - spin_lock_irqsave(&cm_id_priv->lock, flags); - switch (cm_id->state) { + switch (state) { case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: case IB_CM_REQ_RCVD: case IB_CM_MRA_REQ_SENT: case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: - ret = cm_alloc_msg(cm_id_priv, &msg); - if (!ret) - cm_format_rej((struct cm_rej_msg *) msg->mad, - cm_id_priv, reason, ari, ari_length, - private_data, private_data_len); - cm_reset_to_idle(cm_id_priv); + msg = cm_alloc_msg(cm_id_priv); + if (IS_ERR(msg)) + return PTR_ERR(msg); + cm_format_rej((struct cm_rej_msg *)msg->mad, cm_id_priv, reason, + ari, ari_length, private_data, private_data_len, + state); break; case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: - ret = cm_alloc_msg(cm_id_priv, &msg); - if (!ret) - cm_format_rej((struct cm_rej_msg *) msg->mad, - cm_id_priv, reason, ari, ari_length, - private_data, private_data_len); - cm_enter_timewait(cm_id_priv); + msg = cm_alloc_msg(cm_id_priv); + if (IS_ERR(msg)) + return PTR_ERR(msg); + cm_format_rej((struct cm_rej_msg *)msg->mad, cm_id_priv, reason, + ari, ari_length, private_data, private_data_len, + state); break; default: - pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__, - be32_to_cpu(cm_id_priv->id.local_id), cm_id->state); - ret = -EINVAL; - goto out; + trace_icm_send_unknown_rej_err(&cm_id_priv->id); + return -EINVAL; } - if (ret) - goto out; - + trace_icm_send_rej(&cm_id_priv->id, reason); ret = ib_post_send_mad(msg, NULL); - if (ret) + if (ret) { cm_free_msg(msg); + return ret; + } + + return 0; +} + +int ib_send_cm_rej(struct ib_cm_id *cm_id, enum ib_cm_rej_reason reason, + void *ari, u8 ari_length, const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv = + container_of(cm_id, struct cm_id_private, id); + unsigned long flags; + int ret; -out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + spin_lock_irqsave(&cm_id_priv->lock, flags); + ret = cm_send_rej_locked(cm_id_priv, reason, ari, ari_length, + private_data, private_data_len); + spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_rej); @@ -2893,26 +2983,17 @@ static void cm_format_rej_event(struct cm_work *work) IBA_GET_MEM_PTR(CM_REJ_PRIVATE_DATA, rej_msg); } -static struct cm_id_private * cm_acquire_rejected_id(struct cm_rej_msg *rej_msg) +static struct cm_id_private *cm_acquire_rejected_id(struct cm_rej_msg *rej_msg) { - struct cm_timewait_info *timewait_info; struct cm_id_private *cm_id_priv; __be32 remote_id; remote_id = cpu_to_be32(IBA_GET(CM_REJ_LOCAL_COMM_ID, rej_msg)); if (IBA_GET(CM_REJ_REASON, rej_msg) == IB_CM_REJ_TIMEOUT) { - spin_lock_irq(&cm.lock); - timewait_info = cm_find_remote_id( + cm_id_priv = cm_find_remote_id( *((__be64 *)IBA_GET_MEM_PTR(CM_REJ_ARI, rej_msg)), remote_id); - if (!timewait_info) { - spin_unlock_irq(&cm.lock); - return NULL; - } - cm_id_priv = - cm_acquire_id(timewait_info->work.local_id, remote_id); - spin_unlock_irq(&cm.lock); } else if (IBA_GET(CM_REJ_MESSAGE_REJECTED, rej_msg) == CM_MSG_RESPONSE_REQ) cm_id_priv = cm_acquire_id( @@ -2930,7 +3011,6 @@ static int cm_rej_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_rej_msg *rej_msg; - int ret; rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = 
cm_acquire_rejected_id(rej_msg); @@ -2945,8 +3025,8 @@ static int cm_rej_handler(struct cm_work *work) case IB_CM_MRA_REQ_RCVD: case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - /* fall through */ + ib_cancel_mad(cm_id_priv->msg); + fallthrough; case IB_CM_REQ_RCVD: case IB_CM_MRA_REQ_SENT: if (IBA_GET(CM_REJ_REASON, rej_msg) == IB_CM_REJ_STALE_CONN) @@ -2955,8 +3035,8 @@ static int cm_rej_handler(struct cm_work *work) cm_reset_to_idle(cm_id_priv); break; case IB_CM_DREQ_SENT: - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - /* fall through */ + ib_cancel_mad(cm_id_priv->msg); + fallthrough; case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: cm_enter_timewait(cm_id_priv); @@ -2965,30 +3045,18 @@ static int cm_rej_handler(struct cm_work *work) if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT || cm_id_priv->id.lap_state == IB_CM_LAP_SENT) { if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT) - ib_cancel_mad(cm_id_priv->av.port->mad_agent, - cm_id_priv->msg); + ib_cancel_mad(cm_id_priv->msg); cm_enter_timewait(cm_id_priv); break; } - /* fall through */ + fallthrough; default: + trace_icm_rej_unknown_err(&cm_id_priv->id); spin_unlock_irq(&cm_id_priv->lock); - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); - ret = -EINVAL; goto out; } - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + cm_queue_work_unlock(cm_id_priv, work); return 0; out: cm_deref_id(cm_id_priv); @@ -3019,7 +3087,7 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id, cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); - switch(cm_id_priv->id.state) { + switch (cm_id_priv->id.state) { case IB_CM_REQ_RCVD: cm_state = IB_CM_MRA_REQ_SENT; lap_state = cm_id->lap_state; @@ -3037,26 +3105,27 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id, msg_response = CM_MSG_RESPONSE_OTHER; break; } - /* fall through */ + fallthrough; default: - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_icm_send_mra_unknown_err(&cm_id_priv->id); ret = -EINVAL; - goto error1; + goto error_unlock; } if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) { - ret = cm_alloc_msg(cm_id_priv, &msg); - if (ret) - goto error1; + msg = cm_alloc_msg(cm_id_priv); + if (IS_ERR(msg)) { + ret = PTR_ERR(msg); + goto error_unlock; + } cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, msg_response, service_timeout, private_data, private_data_len); + trace_icm_send_mra(cm_id); ret = ib_post_send_mad(msg, NULL); if (ret) - goto error2; + goto error_free_msg; } cm_id->state = cm_state; @@ -3066,18 +3135,16 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id, spin_unlock_irqrestore(&cm_id_priv->lock, flags); return 0; -error1: spin_unlock_irqrestore(&cm_id_priv->lock, flags); - kfree(data); - return ret; - -error2: spin_unlock_irqrestore(&cm_id_priv->lock, flags); - kfree(data); +error_free_msg: cm_free_msg(msg); +error_unlock: + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + kfree(data); return ret; } EXPORT_SYMBOL(ib_send_cm_mra); -static struct cm_id_private * cm_acquire_mraed_id(struct cm_mra_msg *mra_msg) +static struct cm_id_private *cm_acquire_mraed_id(struct cm_mra_msg *mra_msg) { switch 
(IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg)) { case CM_MSG_RESPONSE_REQ: @@ -3098,7 +3165,7 @@ static int cm_mra_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_mra_msg *mra_msg; - int timeout, ret; + int timeout; mra_msg = (struct cm_mra_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_mraed_id(mra_msg); @@ -3117,16 +3184,14 @@ static int cm_mra_handler(struct cm_work *work) case IB_CM_REQ_SENT: if (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg) != CM_MSG_RESPONSE_REQ || - ib_modify_mad(cm_id_priv->av.port->mad_agent, - cm_id_priv->msg, timeout)) + ib_modify_mad(cm_id_priv->msg, timeout)) goto out; cm_id_priv->id.state = IB_CM_MRA_REQ_RCVD; break; case IB_CM_REP_SENT: if (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg) != CM_MSG_RESPONSE_REP || - ib_modify_mad(cm_id_priv->av.port->mad_agent, - cm_id_priv->msg, timeout)) + ib_modify_mad(cm_id_priv->msg, timeout)) goto out; cm_id_priv->id.state = IB_CM_MRA_REP_RCVD; break; @@ -3134,39 +3199,28 @@ static int cm_mra_handler(struct cm_work *work) if (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg) != CM_MSG_RESPONSE_OTHER || cm_id_priv->id.lap_state != IB_CM_LAP_SENT || - ib_modify_mad(cm_id_priv->av.port->mad_agent, - cm_id_priv->msg, timeout)) { + ib_modify_mad(cm_id_priv->msg, timeout)) { if (cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD) - atomic_long_inc(&work->port-> - counter_group[CM_RECV_DUPLICATES]. - counter[CM_MRA_COUNTER]); + atomic_long_inc( + &work->port->counters[CM_RECV_DUPLICATES] + [CM_MRA_COUNTER]); goto out; } cm_id_priv->id.lap_state = IB_CM_MRA_LAP_RCVD; break; case IB_CM_MRA_REQ_RCVD: case IB_CM_MRA_REP_RCVD: - atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. - counter[CM_MRA_COUNTER]); - /* fall through */ + atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] + [CM_MRA_COUNTER]); + fallthrough; default: - pr_debug("%s local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_icm_mra_unknown_err(&cm_id_priv->id); goto out; } cm_id_priv->msg->context[1] = (void *) (unsigned long) cm_id_priv->id.state; - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + cm_queue_work_unlock(cm_id_priv, work); return 0; out: spin_unlock_irq(&cm_id_priv->lock); @@ -3226,6 +3280,8 @@ static int cm_lap_handler(struct cm_work *work) struct cm_lap_msg *lap_msg; struct ib_cm_lap_event_param *param; struct ib_mad_send_buf *msg = NULL; + struct rdma_ah_attr ah_attr; + struct cm_av alt_av = {}; int ret; /* Currently Alternate path messages are not supported for @@ -3254,7 +3310,25 @@ static int cm_lap_handler(struct cm_work *work) work->cm_event.private_data = IBA_GET_MEM_PTR(CM_LAP_PRIVATE_DATA, lap_msg); + ret = ib_init_ah_attr_from_wc(work->port->cm_dev->ib_device, + work->port->port_num, + work->mad_recv_wc->wc, + work->mad_recv_wc->recv_buf.grh, + &ah_attr); + if (ret) + goto deref; + + ret = cm_init_av_by_path(param->alternate_path, NULL, &alt_av); + if (ret) { + rdma_destroy_ah_attr(&ah_attr); + goto deref; + } + spin_lock_irq(&cm_id_priv->lock); + cm_init_av_for_lap(work->port, work->mad_recv_wc->wc, + &ah_attr, &cm_id_priv->av); + cm_move_av_from_path(&cm_id_priv->alt_av, &alt_av); + if (cm_id_priv->id.state != IB_CM_ESTABLISHED) goto unlock; @@ -3263,8 +3337,8 @@ static int cm_lap_handler(struct cm_work *work) case IB_CM_LAP_IDLE: break; case 
IB_CM_MRA_LAP_SENT: - atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. - counter[CM_LAP_COUNTER]); + atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] + [CM_LAP_COUNTER]); msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc); if (IS_ERR(msg)) goto unlock; @@ -3278,38 +3352,19 @@ static int cm_lap_handler(struct cm_work *work) if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) || ib_post_send_mad(msg, NULL)) - cm_free_msg(msg); + cm_free_response_msg(msg); goto deref; case IB_CM_LAP_RCVD: - atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. - counter[CM_LAP_COUNTER]); + atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] + [CM_LAP_COUNTER]); goto unlock; default: goto unlock; } - ret = cm_init_av_for_lap(work->port, work->mad_recv_wc->wc, - work->mad_recv_wc->recv_buf.grh, - &cm_id_priv->av); - if (ret) - goto unlock; - - ret = cm_init_av_by_path(param->alternate_path, NULL, - &cm_id_priv->alt_av, cm_id_priv); - if (ret) - goto unlock; - cm_id_priv->id.lap_state = IB_CM_LAP_RCVD; cm_id_priv->tid = lap_msg->hdr.tid; - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + cm_queue_work_unlock(cm_id_priv, work); return 0; unlock: spin_unlock_irq(&cm_id_priv->lock); @@ -3321,7 +3376,6 @@ static int cm_apr_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_apr_msg *apr_msg; - int ret; /* Currently Alternate path messages are not supported for * RoCE link layer. @@ -3354,18 +3408,8 @@ static int cm_apr_handler(struct cm_work *work) goto out; } cm_id_priv->id.lap_state = IB_CM_LAP_IDLE; - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - cm_id_priv->msg = NULL; - - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + ib_cancel_mad(cm_id_priv->msg); + cm_queue_work_unlock(cm_id_priv, work); return 0; out: cm_deref_id(cm_id_priv); @@ -3376,7 +3420,6 @@ static int cm_timewait_handler(struct cm_work *work) { struct cm_timewait_info *timewait_info; struct cm_id_private *cm_id_priv; - int ret; timewait_info = container_of(work, struct cm_timewait_info, work); spin_lock_irq(&cm.lock); @@ -3395,15 +3438,7 @@ static int cm_timewait_handler(struct cm_work *work) goto out; } cm_id_priv->id.state = IB_CM_IDLE; - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + cm_queue_work_unlock(cm_id_priv, work); return 0; out: cm_deref_id(cm_id_priv); @@ -3433,6 +3468,7 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, { struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; + struct cm_av av = {}; unsigned long flags; int ret; @@ -3441,40 +3477,42 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); - ret = cm_init_av_by_path(param->path, param->sgid_attr, - &cm_id_priv->av, - cm_id_priv); + ret = cm_init_av_by_path(param->path, param->sgid_attr, &av); if (ret) - goto out; + return ret; + spin_lock_irqsave(&cm_id_priv->lock, flags); + cm_move_av_from_path(&cm_id_priv->av, 
&av); cm_id->service_id = param->service_id; - cm_id->service_mask = ~cpu_to_be64(0); cm_id_priv->timeout_ms = param->timeout_ms; cm_id_priv->max_cm_retries = param->max_cm_retries; - ret = cm_alloc_msg(cm_id_priv, &msg); - if (ret) - goto out; + if (cm_id->state != IB_CM_IDLE) { + ret = -EINVAL; + goto out_unlock; + } + + msg = cm_alloc_priv_msg(cm_id_priv); + if (IS_ERR(msg)) { + ret = PTR_ERR(msg); + goto out_unlock; + } - cm_format_sidr_req((struct cm_sidr_req_msg *) msg->mad, cm_id_priv, + cm_format_sidr_req((struct cm_sidr_req_msg *)msg->mad, cm_id_priv, param); msg->timeout_ms = cm_id_priv->timeout_ms; - msg->context[1] = (void *) (unsigned long) IB_CM_SIDR_REQ_SENT; + msg->context[1] = (void *)(unsigned long)IB_CM_SIDR_REQ_SENT; - spin_lock_irqsave(&cm_id_priv->lock, flags); - if (cm_id->state == IB_CM_IDLE) - ret = ib_post_send_mad(msg, NULL); - else - ret = -EINVAL; - - if (ret) { - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - cm_free_msg(msg); - goto out; - } + trace_icm_send_sidr_req(&cm_id_priv->id); + ret = ib_post_send_mad(msg, NULL); + if (ret) + goto out_free; cm_id->state = IB_CM_SIDR_REQ_SENT; - cm_id_priv->msg = msg; spin_unlock_irqrestore(&cm_id_priv->lock, flags); -out: + return 0; +out_free: + cm_free_priv_msg(msg); +out_unlock: + spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_sidr_req); @@ -3502,64 +3540,73 @@ static void cm_format_sidr_req_event(struct cm_work *work, static int cm_sidr_req_handler(struct cm_work *work) { - struct ib_cm_id *cm_id; - struct cm_id_private *cm_id_priv, *cur_cm_id_priv; + struct cm_id_private *cm_id_priv, *listen_cm_id_priv; struct cm_sidr_req_msg *sidr_req_msg; struct ib_wc *wc; int ret; - cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL); - if (IS_ERR(cm_id)) - return PTR_ERR(cm_id); - cm_id_priv = container_of(cm_id, struct cm_id_private, id); + cm_id_priv = + cm_alloc_id_priv(work->port->cm_dev->ib_device, NULL, NULL); + if (IS_ERR(cm_id_priv)) + return PTR_ERR(cm_id_priv); /* Record SGID/SLID and request ID for lookup. */ sidr_req_msg = (struct cm_sidr_req_msg *) work->mad_recv_wc->recv_buf.mad; + + cm_id_priv->id.remote_id = + cpu_to_be32(IBA_GET(CM_SIDR_REQ_REQUESTID, sidr_req_msg)); + cm_id_priv->id.service_id = + cpu_to_be64(IBA_GET(CM_SIDR_REQ_SERVICEID, sidr_req_msg)); + cm_id_priv->tid = sidr_req_msg->hdr.tid; + wc = work->mad_recv_wc->wc; - cm_id_priv->av.dgid.global.subnet_prefix = cpu_to_be64(wc->slid); - cm_id_priv->av.dgid.global.interface_id = 0; + cm_id_priv->sidr_slid = wc->slid; ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, work->mad_recv_wc->recv_buf.grh, &cm_id_priv->av); if (ret) goto out; - cm_id_priv->id.remote_id = - cpu_to_be32(IBA_GET(CM_SIDR_REQ_REQUESTID, sidr_req_msg)); - cm_id_priv->tid = sidr_req_msg->hdr.tid; - atomic_inc(&cm_id_priv->work_count); - spin_lock_irq(&cm.lock); - cur_cm_id_priv = cm_insert_remote_sidr(cm_id_priv); - if (cur_cm_id_priv) { + listen_cm_id_priv = cm_insert_remote_sidr(cm_id_priv); + if (listen_cm_id_priv) { spin_unlock_irq(&cm.lock); - atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. - counter[CM_SIDR_REQ_COUNTER]); + atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] + [CM_SIDR_REQ_COUNTER]); goto out; /* Duplicate message. 
*/ } cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD; - cur_cm_id_priv = cm_find_listen( - cm_id->device, - cpu_to_be64(IBA_GET(CM_SIDR_REQ_SERVICEID, sidr_req_msg))); - if (!cur_cm_id_priv) { + listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device, + cm_id_priv->id.service_id); + if (!listen_cm_id_priv) { spin_unlock_irq(&cm.lock); - cm_reject_sidr_req(cm_id_priv, IB_SIDR_UNSUPPORTED); + ib_send_cm_sidr_rep(&cm_id_priv->id, + &(struct ib_cm_sidr_rep_param){ + .status = IB_SIDR_UNSUPPORTED }); goto out; /* No match. */ } - refcount_inc(&cur_cm_id_priv->refcount); - refcount_inc(&cm_id_priv->refcount); spin_unlock_irq(&cm.lock); - cm_id_priv->id.cm_handler = cur_cm_id_priv->id.cm_handler; - cm_id_priv->id.context = cur_cm_id_priv->id.context; - cm_id_priv->id.service_id = - cpu_to_be64(IBA_GET(CM_SIDR_REQ_SERVICEID, sidr_req_msg)); - cm_id_priv->id.service_mask = ~cpu_to_be64(0); + cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler; + cm_id_priv->id.context = listen_cm_id_priv->id.context; - cm_format_sidr_req_event(work, cm_id_priv, &cur_cm_id_priv->id); - cm_process_work(cm_id_priv, work); - cm_deref_id(cur_cm_id_priv); + /* + * A SIDR ID does not need to be in the xarray since it does not receive + * mads, is not placed in the remote_id or remote_qpn rbtree, and does + * not enter timewait. + */ + + cm_format_sidr_req_event(work, cm_id_priv, &listen_cm_id_priv->id); + ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->cm_event); + cm_free_work(work); + /* + * A pointer to the listen_cm_id is held in the event, so this deref + * must be after the event is delivered above. + */ + cm_deref_id(listen_cm_id_priv); + if (ret) + cm_destroy_id(&cm_id_priv->id, ret); return 0; out: ib_destroy_cm_id(&cm_id_priv->id); @@ -3570,8 +3617,8 @@ static void cm_format_sidr_rep(struct cm_sidr_rep_msg *sidr_rep_msg, struct cm_id_private *cm_id_priv, struct ib_cm_sidr_rep_param *param) { - cm_format_mad_hdr(&sidr_rep_msg->hdr, CM_SIDR_REP_ATTR_ID, - cm_id_priv->tid); + cm_format_mad_ece_hdr(&sidr_rep_msg->hdr, CM_SIDR_REP_ATTR_ID, + cm_id_priv->tid, param->ece.attr_mod); IBA_SET(CM_SIDR_REP_REQUESTID, sidr_rep_msg, be32_to_cpu(cm_id_priv->id.remote_id)); IBA_SET(CM_SIDR_REP_STATUS, sidr_rep_msg, param->status); @@ -3579,6 +3626,10 @@ static void cm_format_sidr_rep(struct cm_sidr_rep_msg *sidr_rep_msg, IBA_SET(CM_SIDR_REP_SERVICEID, sidr_rep_msg, be64_to_cpu(cm_id_priv->id.service_id)); IBA_SET(CM_SIDR_REP_Q_KEY, sidr_rep_msg, param->qkey); + IBA_SET(CM_SIDR_REP_VENDOR_ID_L, sidr_rep_msg, + param->ece.vendor_id & 0xFF); + IBA_SET(CM_SIDR_REP_VENDOR_ID_H, sidr_rep_msg, + (param->ece.vendor_id >> 8) & 0xFF); if (param->info && param->info_length) IBA_SET_MEM(CM_SIDR_REP_ADDITIONAL_INFORMATION, sidr_rep_msg, @@ -3589,41 +3640,36 @@ static void cm_format_sidr_rep(struct cm_sidr_rep_msg *sidr_rep_msg, param->private_data, param->private_data_len); } -int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id, - struct ib_cm_sidr_rep_param *param) +static int cm_send_sidr_rep_locked(struct cm_id_private *cm_id_priv, + struct ib_cm_sidr_rep_param *param) { - struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; unsigned long flags; int ret; + lockdep_assert_held(&cm_id_priv->lock); + if ((param->info && param->info_length > IB_CM_SIDR_REP_INFO_LENGTH) || (param->private_data && param->private_data_len > IB_CM_SIDR_REP_PRIVATE_DATA_SIZE)) return -EINVAL; - cm_id_priv = container_of(cm_id, struct cm_id_private, id); - spin_lock_irqsave(&cm_id_priv->lock, flags); - if (cm_id->state != IB_CM_SIDR_REQ_RCVD) { - 
ret = -EINVAL; - goto error; - } + if (cm_id_priv->id.state != IB_CM_SIDR_REQ_RCVD) + return -EINVAL; - ret = cm_alloc_msg(cm_id_priv, &msg); - if (ret) - goto error; + msg = cm_alloc_msg(cm_id_priv); + if (IS_ERR(msg)) + return PTR_ERR(msg); cm_format_sidr_rep((struct cm_sidr_rep_msg *) msg->mad, cm_id_priv, param); + trace_icm_send_sidr_rep(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); if (ret) { - spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_free_msg(msg); return ret; } - cm_id->state = IB_CM_IDLE; - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - + cm_id_priv->id.state = IB_CM_IDLE; spin_lock_irqsave(&cm.lock, flags); if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) { rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); @@ -3631,8 +3677,19 @@ int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id, } spin_unlock_irqrestore(&cm.lock, flags); return 0; +} -error: spin_unlock_irqrestore(&cm_id_priv->lock, flags); +int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id, + struct ib_cm_sidr_rep_param *param) +{ + struct cm_id_private *cm_id_priv = + container_of(cm_id, struct cm_id_private, id); + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + ret = cm_send_sidr_rep_locked(cm_id_priv, param); + spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_sidr_rep); @@ -3676,7 +3733,7 @@ static int cm_sidr_rep_handler(struct cm_work *work) goto out; } cm_id_priv->id.state = IB_CM_IDLE; - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + ib_cancel_mad(cm_id_priv->msg); spin_unlock_irq(&cm_id_priv->lock); cm_format_sidr_rep_event(work, cm_id_priv); @@ -3687,25 +3744,28 @@ out: return -EINVAL; } -static void cm_process_send_error(struct ib_mad_send_buf *msg, +static void cm_process_send_error(struct cm_id_private *cm_id_priv, + struct ib_mad_send_buf *msg, + enum ib_cm_state state, enum ib_wc_status wc_status) { - struct cm_id_private *cm_id_priv; - struct ib_cm_event cm_event; - enum ib_cm_state state; + struct ib_cm_event cm_event = {}; int ret; - memset(&cm_event, 0, sizeof cm_event); - cm_id_priv = msg->context[0]; - /* Discard old sends or ones without a response. */ spin_lock_irq(&cm_id_priv->lock); - state = (enum ib_cm_state) (unsigned long) msg->context[1]; - if (msg != cm_id_priv->msg || state != cm_id_priv->id.state) - goto discard; + if (msg != cm_id_priv->msg) { + spin_unlock_irq(&cm_id_priv->lock); + cm_free_msg(msg); + return; + } + cm_free_priv_msg(msg); + + if (state != cm_id_priv->id.state || wc_status == IB_WC_SUCCESS || + wc_status == IB_WC_WR_FLUSH_ERR) + goto out_unlock; - pr_debug_ratelimited("CM: failed sending MAD in state %d. (%s)\n", - state, ib_wc_status_msg(wc_status)); + trace_icm_mad_send_err(state, wc_status); switch (state) { case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: @@ -3726,26 +3786,27 @@ static void cm_process_send_error(struct ib_mad_send_buf *msg, cm_event.event = IB_CM_SIDR_REQ_ERROR; break; default: - goto discard; + goto out_unlock; } spin_unlock_irq(&cm_id_priv->lock); cm_event.param.send_status = wc_status; /* No other events can occur on the cm_id at this point. 
*/ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &cm_event); - cm_free_msg(msg); if (ret) ib_destroy_cm_id(&cm_id_priv->id); return; -discard: +out_unlock: spin_unlock_irq(&cm_id_priv->lock); - cm_free_msg(msg); } static void cm_send_handler(struct ib_mad_agent *mad_agent, struct ib_mad_send_wc *mad_send_wc) { struct ib_mad_send_buf *msg = mad_send_wc->send_buf; + struct cm_id_private *cm_id_priv = msg->context[0]; + enum ib_cm_state state = + (enum ib_cm_state)(unsigned long)msg->context[1]; struct cm_port *port; u16 attr_index; @@ -3758,28 +3819,19 @@ static void cm_send_handler(struct ib_mad_agent *mad_agent, * set to a cm_id), and is not a REJ, then it is a send that was * manually retried. */ - if (!msg->context[0] && (attr_index != CM_REJ_COUNTER)) + if (!cm_id_priv && (attr_index != CM_REJ_COUNTER)) msg->retries = 1; - atomic_long_add(1 + msg->retries, - &port->counter_group[CM_XMIT].counter[attr_index]); + atomic_long_add(1 + msg->retries, &port->counters[CM_XMIT][attr_index]); if (msg->retries) atomic_long_add(msg->retries, - &port->counter_group[CM_XMIT_RETRIES]. - counter[attr_index]); + &port->counters[CM_XMIT_RETRIES][attr_index]); - switch (mad_send_wc->status) { - case IB_WC_SUCCESS: - case IB_WC_WR_FLUSH_ERR: - cm_free_msg(msg); - break; - default: - if (msg->context[0] && msg->context[1]) - cm_process_send_error(msg, mad_send_wc->status); - else - cm_free_msg(msg); - break; - } + if (cm_id_priv) + cm_process_send_error(cm_id_priv, msg, state, + mad_send_wc->status); + else + cm_free_response_msg(msg); } static void cm_work_handler(struct work_struct *_work) @@ -3828,7 +3880,7 @@ static void cm_work_handler(struct work_struct *_work) ret = cm_timewait_handler(work); break; default: - pr_debug("cm_event.event: 0x%x\n", work->cm_event.event); + trace_icm_handler_err(work->cm_event.event); ret = -EINVAL; break; } @@ -3854,8 +3906,7 @@ static int cm_establish(struct ib_cm_id *cm_id) cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); - switch (cm_id->state) - { + switch (cm_id->state) { case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: cm_id->state = IB_CM_ESTABLISHED; @@ -3864,8 +3915,7 @@ static int cm_establish(struct ib_cm_id *cm_id) ret = -EISCONN; break; default: - pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__, - be32_to_cpu(cm_id->local_id), cm_id->state); + trace_icm_establish_err(cm_id); ret = -EINVAL; break; } @@ -3905,9 +3955,7 @@ out: static int cm_migrate(struct ib_cm_id *cm_id) { struct cm_id_private *cm_id_priv; - struct cm_av tmp_av; unsigned long flags; - int tmp_send_port_not_ready; int ret = 0; cm_id_priv = container_of(cm_id, struct cm_id_private, id); @@ -3916,14 +3964,7 @@ static int cm_migrate(struct ib_cm_id *cm_id) (cm_id->lap_state == IB_CM_LAP_UNINIT || cm_id->lap_state == IB_CM_LAP_IDLE)) { cm_id->lap_state = IB_CM_LAP_IDLE; - /* Swap address vector */ - tmp_av = cm_id_priv->av; cm_id_priv->av = cm_id_priv->alt_av; - cm_id_priv->alt_av = tmp_av; - /* Swap port send ready state */ - tmp_send_port_not_ready = cm_id_priv->prim_send_port_not_ready; - cm_id_priv->prim_send_port_not_ready = cm_id_priv->altr_send_port_not_ready; - cm_id_priv->altr_send_port_not_ready = tmp_send_port_not_ready; } else ret = -EINVAL; spin_unlock_irqrestore(&cm_id_priv->lock, flags); @@ -4005,8 +4046,7 @@ static void cm_recv_handler(struct ib_mad_agent *mad_agent, } attr_id = be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.attr_id); - atomic_long_inc(&port->counter_group[CM_RECV]. 
- counter[attr_id - CM_ATTR_ID_OFFSET]); + atomic_long_inc(&port->counters[CM_RECV][attr_id - CM_ATTR_ID_OFFSET]); work = kmalloc(struct_size(work, path, paths), GFP_KERNEL); if (!work) { @@ -4058,13 +4098,12 @@ static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv, qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_ATOMIC; qp_attr->pkey_index = cm_id_priv->av.pkey_index; - qp_attr->port_num = cm_id_priv->av.port->port_num; + if (cm_id_priv->av.port) + qp_attr->port_num = cm_id_priv->av.port->port_num; ret = 0; break; default: - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_icm_qp_init_err(&cm_id_priv->id); ret = -EINVAL; break; } @@ -4091,6 +4130,10 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv, *qp_attr_mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | IB_QP_RQ_PSN; qp_attr->ah_attr = cm_id_priv->av.ah_attr; + if ((qp_attr->ah_attr.type == RDMA_AH_ATTR_TYPE_IB) && + cm_id_priv->av.dlid_datapath && + (cm_id_priv->av.dlid_datapath != 0xffff)) + qp_attr->ah_attr.ib.dlid = cm_id_priv->av.dlid_datapath; qp_attr->path_mtu = cm_id_priv->path_mtu; qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn); qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn); @@ -4102,7 +4145,8 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv, cm_id_priv->responder_resources; qp_attr->min_rnr_timer = 0; } - if (rdma_ah_get_dlid(&cm_id_priv->alt_av.ah_attr)) { + if (rdma_ah_get_dlid(&cm_id_priv->alt_av.ah_attr) && + cm_id_priv->alt_av.port) { *qp_attr_mask |= IB_QP_ALT_PATH; qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num; qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index; @@ -4112,9 +4156,7 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv, ret = 0; break; default: - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_icm_qp_rtr_err(&cm_id_priv->id); ret = -EINVAL; break; } @@ -4151,7 +4193,7 @@ static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv, qp_attr->retry_cnt = cm_id_priv->retry_count; qp_attr->rnr_retry = cm_id_priv->rnr_retry_count; qp_attr->max_rd_atomic = cm_id_priv->initiator_depth; - /* fall through */ + fallthrough; case IB_QPT_XRC_TGT: *qp_attr_mask |= IB_QP_TIMEOUT; qp_attr->timeout = cm_id_priv->av.timeout; @@ -4165,7 +4207,9 @@ static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv, } } else { *qp_attr_mask = IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE; - qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num; + if (cm_id_priv->alt_av.port) + qp_attr->alt_port_num = + cm_id_priv->alt_av.port->port_num; qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index; qp_attr->alt_timeout = cm_id_priv->alt_av.timeout; qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr; @@ -4174,9 +4218,7 @@ static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv, ret = 0; break; default: - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_icm_qp_rts_err(&cm_id_priv->id); ret = -EINVAL; break; } @@ -4210,75 +4252,76 @@ int ib_cm_init_qp_attr(struct ib_cm_id *cm_id, } EXPORT_SYMBOL(ib_cm_init_qp_attr); -static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr, - char *buf) +static ssize_t cm_show_counter(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *attr, char *buf) { - struct 
cm_counter_group *group; - struct cm_counter_attribute *cm_attr; - - group = container_of(obj, struct cm_counter_group, obj); - cm_attr = container_of(attr, struct cm_counter_attribute, attr); - - return sprintf(buf, "%ld\n", - atomic_long_read(&group->counter[cm_attr->index])); -} - -static const struct sysfs_ops cm_counter_ops = { - .show = cm_show_counter -}; + struct cm_counter_attribute *cm_attr = + container_of(attr, struct cm_counter_attribute, attr); + struct cm_device *cm_dev = ib_get_client_data(ibdev, &cm_client); -static struct kobj_type cm_counter_obj_type = { - .sysfs_ops = &cm_counter_ops, - .default_attrs = cm_counter_default_attrs -}; + if (WARN_ON(!cm_dev)) + return -EINVAL; -static char *cm_devnode(struct device *dev, umode_t *mode) -{ - if (mode) - *mode = 0666; - return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); + return sysfs_emit( + buf, "%ld\n", + atomic_long_read( + &cm_dev->port[port_num - 1] + ->counters[cm_attr->group][cm_attr->index])); } -struct class cm_class = { - .owner = THIS_MODULE, - .name = "infiniband_cm", - .devnode = cm_devnode, -}; -EXPORT_SYMBOL(cm_class); - -static int cm_create_port_fs(struct cm_port *port) -{ - int i, ret; - - for (i = 0; i < CM_COUNTER_GROUPS; i++) { - ret = ib_port_register_module_stat(port->cm_dev->ib_device, - port->port_num, - &port->counter_group[i].obj, - &cm_counter_obj_type, - counter_group_names[i]); - if (ret) - goto error; +#define CM_COUNTER_ATTR(_name, _group, _index) \ + { \ + .attr = __ATTR(_name, 0444, cm_show_counter, NULL), \ + .group = _group, .index = _index \ } - return 0; - -error: - while (i--) - ib_port_unregister_module_stat(&port->counter_group[i].obj); - return ret; - -} - -static void cm_remove_port_fs(struct cm_port *port) -{ - int i; - - for (i = 0; i < CM_COUNTER_GROUPS; i++) - ib_port_unregister_module_stat(&port->counter_group[i].obj); +#define CM_COUNTER_GROUP(_group, _name) \ + static struct cm_counter_attribute cm_counter_attr_##_group[] = { \ + CM_COUNTER_ATTR(req, _group, CM_REQ_COUNTER), \ + CM_COUNTER_ATTR(mra, _group, CM_MRA_COUNTER), \ + CM_COUNTER_ATTR(rej, _group, CM_REJ_COUNTER), \ + CM_COUNTER_ATTR(rep, _group, CM_REP_COUNTER), \ + CM_COUNTER_ATTR(rtu, _group, CM_RTU_COUNTER), \ + CM_COUNTER_ATTR(dreq, _group, CM_DREQ_COUNTER), \ + CM_COUNTER_ATTR(drep, _group, CM_DREP_COUNTER), \ + CM_COUNTER_ATTR(sidr_req, _group, CM_SIDR_REQ_COUNTER), \ + CM_COUNTER_ATTR(sidr_rep, _group, CM_SIDR_REP_COUNTER), \ + CM_COUNTER_ATTR(lap, _group, CM_LAP_COUNTER), \ + CM_COUNTER_ATTR(apr, _group, CM_APR_COUNTER), \ + }; \ + static struct attribute *cm_counter_attrs_##_group[] = { \ + &cm_counter_attr_##_group[0].attr.attr, \ + &cm_counter_attr_##_group[1].attr.attr, \ + &cm_counter_attr_##_group[2].attr.attr, \ + &cm_counter_attr_##_group[3].attr.attr, \ + &cm_counter_attr_##_group[4].attr.attr, \ + &cm_counter_attr_##_group[5].attr.attr, \ + &cm_counter_attr_##_group[6].attr.attr, \ + &cm_counter_attr_##_group[7].attr.attr, \ + &cm_counter_attr_##_group[8].attr.attr, \ + &cm_counter_attr_##_group[9].attr.attr, \ + &cm_counter_attr_##_group[10].attr.attr, \ + NULL, \ + }; \ + static const struct attribute_group cm_counter_group_##_group = { \ + .name = _name, \ + .attrs = cm_counter_attrs_##_group, \ + }; -} +CM_COUNTER_GROUP(CM_XMIT, "cm_tx_msgs") +CM_COUNTER_GROUP(CM_XMIT_RETRIES, "cm_tx_retries") +CM_COUNTER_GROUP(CM_RECV, "cm_rx_msgs") +CM_COUNTER_GROUP(CM_RECV_DUPLICATES, "cm_rx_duplicates") + +static const struct attribute_group *cm_counter_groups[] = { + &cm_counter_group_CM_XMIT, + 
&cm_counter_group_CM_XMIT_RETRIES, + &cm_counter_group_CM_RECV, + &cm_counter_group_CM_RECV_DUPLICATES, + NULL, +}; -static void cm_add_one(struct ib_device *ib_device) +static int cm_add_one(struct ib_device *ib_device) { struct cm_device *cm_dev; struct cm_port *port; @@ -4292,34 +4335,38 @@ static void cm_add_one(struct ib_device *ib_device) unsigned long flags; int ret; int count = 0; - u8 i; + u32 i; cm_dev = kzalloc(struct_size(cm_dev, port, ib_device->phys_port_cnt), GFP_KERNEL); if (!cm_dev) - return; + return -ENOMEM; + kref_init(&cm_dev->kref); + spin_lock_init(&cm_dev->mad_agent_lock); cm_dev->ib_device = ib_device; cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay; cm_dev->going_down = 0; + ib_set_client_data(ib_device, &cm_client, cm_dev); + set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask); - for (i = 1; i <= ib_device->phys_port_cnt; i++) { + rdma_for_each_port (ib_device, i) { if (!rdma_cap_ib_cm(ib_device, i)) continue; port = kzalloc(sizeof *port, GFP_KERNEL); - if (!port) + if (!port) { + ret = -ENOMEM; goto error1; + } cm_dev->port[i-1] = port; port->cm_dev = cm_dev; port->port_num = i; - INIT_LIST_HEAD(&port->cm_priv_prim_list); - INIT_LIST_HEAD(&port->cm_priv_altr_list); - - ret = cm_create_port_fs(port); + ret = ib_port_register_client_groups(ib_device, i, + cm_counter_groups); if (ret) goto error1; @@ -4331,8 +4378,10 @@ static void cm_add_one(struct ib_device *ib_device) cm_recv_handler, port, 0); - if (IS_ERR(port->mad_agent)) + if (IS_ERR(port->mad_agent)) { + ret = PTR_ERR(port->mad_agent); goto error2; + } ret = ib_modify_port(ib_device, i, 0, &port_modify); if (ret) @@ -4341,24 +4390,23 @@ static void cm_add_one(struct ib_device *ib_device) count++; } - if (!count) + if (!count) { + ret = -EOPNOTSUPP; goto free; - - ib_set_client_data(ib_device, &cm_client, cm_dev); + } write_lock_irqsave(&cm.device_lock, flags); list_add_tail(&cm_dev->list, &cm.device_list); write_unlock_irqrestore(&cm.device_lock, flags); - return; + return 0; error3: ib_unregister_mad_agent(port->mad_agent); error2: - cm_remove_port_fs(port); + ib_port_unregister_client_groups(ib_device, i, cm_counter_groups); error1: port_modify.set_port_cap_mask = 0; port_modify.clr_port_cap_mask = IB_PORT_CM_SUP; - kfree(port); while (--i) { if (!rdma_cap_ib_cm(ib_device, i)) continue; @@ -4366,27 +4414,23 @@ error1: port = cm_dev->port[i-1]; ib_modify_port(ib_device, port->port_num, 0, &port_modify); ib_unregister_mad_agent(port->mad_agent); - cm_remove_port_fs(port); - kfree(port); + ib_port_unregister_client_groups(ib_device, i, + cm_counter_groups); } free: - kfree(cm_dev); + cm_device_put(cm_dev); + return ret; } static void cm_remove_one(struct ib_device *ib_device, void *client_data) { struct cm_device *cm_dev = client_data; struct cm_port *port; - struct cm_id_private *cm_id_priv; - struct ib_mad_agent *cur_mad_agent; struct ib_port_modify port_modify = { .clr_port_cap_mask = IB_PORT_CM_SUP }; unsigned long flags; - int i; - - if (!cm_dev) - return; + u32 i; write_lock_irqsave(&cm.device_lock, flags); list_del(&cm_dev->list); @@ -4396,35 +4440,34 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data) cm_dev->going_down = 1; spin_unlock_irq(&cm.lock); - for (i = 1; i <= ib_device->phys_port_cnt; i++) { + rdma_for_each_port (ib_device, i) { + struct ib_mad_agent *mad_agent; + if (!rdma_cap_ib_cm(ib_device, i)) continue; port = cm_dev->port[i-1]; + mad_agent = port->mad_agent; ib_modify_port(ib_device, port->port_num, 0, &port_modify); - /* Mark all the cm_id's as not 
valid */ - spin_lock_irq(&cm.lock); - list_for_each_entry(cm_id_priv, &port->cm_priv_altr_list, altr_list) - cm_id_priv->altr_send_port_not_ready = 1; - list_for_each_entry(cm_id_priv, &port->cm_priv_prim_list, prim_list) - cm_id_priv->prim_send_port_not_ready = 1; - spin_unlock_irq(&cm.lock); /* * We flush the queue here after the going_down set, this * verify that no new works will be queued in the recv handler, * after that we can call the unregister_mad_agent */ flush_workqueue(cm.wq); - spin_lock_irq(&cm.state_lock); - cur_mad_agent = port->mad_agent; + /* + * The above ensures no call paths from the work are running, + * the remaining paths all take the mad_agent_lock. + */ + spin_lock(&cm_dev->mad_agent_lock); port->mad_agent = NULL; - spin_unlock_irq(&cm.state_lock); - ib_unregister_mad_agent(cur_mad_agent); - cm_remove_port_fs(port); - kfree(port); + spin_unlock(&cm_dev->mad_agent_lock); + ib_unregister_mad_agent(mad_agent); + ib_port_unregister_client_groups(ib_device, i, + cm_counter_groups); } - kfree(cm_dev); + cm_device_put(cm_dev); } static int __init ib_cm_init(void) @@ -4434,22 +4477,15 @@ static int __init ib_cm_init(void) INIT_LIST_HEAD(&cm.device_list); rwlock_init(&cm.device_lock); spin_lock_init(&cm.lock); - spin_lock_init(&cm.state_lock); cm.listen_service_table = RB_ROOT; cm.listen_service_id = be64_to_cpu(IB_CM_ASSIGN_SERVICE_ID); cm.remote_id_table = RB_ROOT; cm.remote_qp_table = RB_ROOT; cm.remote_sidr_table = RB_ROOT; - xa_init_flags(&cm.local_id_table, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ); + xa_init_flags(&cm.local_id_table, XA_FLAGS_ALLOC); get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand); INIT_LIST_HEAD(&cm.timewait_list); - ret = class_register(&cm_class); - if (ret) { - ret = -ENOMEM; - goto error1; - } - cm.wq = alloc_workqueue("ib_cm", 0, 1); if (!cm.wq) { ret = -ENOMEM; @@ -4464,8 +4500,6 @@ static int __init ib_cm_init(void) error3: destroy_workqueue(cm.wq); error2: - class_unregister(&cm_class); -error1: return ret; } @@ -4486,7 +4520,6 @@ static void __exit ib_cm_cleanup(void) kfree(timewait_info); } - class_unregister(&cm_class); WARN_ON(!xa_empty(&cm.local_id_table)); } diff --git a/drivers/infiniband/core/cm_msgs.h b/drivers/infiniband/core/cm_msgs.h index 0cc40656b5c5..8462de7ca26e 100644 --- a/drivers/infiniband/core/cm_msgs.h +++ b/drivers/infiniband/core/cm_msgs.h @@ -22,7 +22,7 @@ static inline enum ib_qp_type cm_req_get_qp_type(struct cm_req_msg *req_msg) { u8 transport_type = IBA_GET(CM_REQ_TRANSPORT_SERVICE_TYPE, req_msg); - switch(transport_type) { + switch (transport_type) { case 0: return IB_QPT_RC; case 1: return IB_QPT_UC; case 3: @@ -37,7 +37,7 @@ static inline enum ib_qp_type cm_req_get_qp_type(struct cm_req_msg *req_msg) static inline void cm_req_set_qp_type(struct cm_req_msg *req_msg, enum ib_qp_type qp_type) { - switch(qp_type) { + switch (qp_type) { case IB_QPT_UC: IBA_SET(CM_REQ_TRANSPORT_SERVICE_TYPE, req_msg, 1); break; diff --git a/drivers/infiniband/core/cm_trace.c b/drivers/infiniband/core/cm_trace.c new file mode 100644 index 000000000000..8f3482f66338 --- /dev/null +++ b/drivers/infiniband/core/cm_trace.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Trace points for the IB Connection Manager. + * + * Author: Chuck Lever <chuck.lever@oracle.com> + * + * Copyright (c) 2020, Oracle and/or its affiliates. 
+ */ + +#include <rdma/rdma_cm.h> +#include "cma_priv.h" + +#define CREATE_TRACE_POINTS + +#include "cm_trace.h" diff --git a/drivers/infiniband/core/cm_trace.h b/drivers/infiniband/core/cm_trace.h new file mode 100644 index 000000000000..e9d282679ef1 --- /dev/null +++ b/drivers/infiniband/core/cm_trace.h @@ -0,0 +1,414 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Trace point definitions for the RDMA Connect Manager. + * + * Author: Chuck Lever <chuck.lever@oracle.com> + * + * Copyright (c) 2020 Oracle and/or its affiliates. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ib_cma + +#if !defined(_TRACE_IB_CMA_H) || defined(TRACE_HEADER_MULTI_READ) + +#define _TRACE_IB_CMA_H + +#include <linux/tracepoint.h> +#include <rdma/ib_cm.h> +#include <trace/events/rdma.h> + +/* + * enum ib_cm_state, from include/rdma/ib_cm.h + */ +#define IB_CM_STATE_LIST \ + ib_cm_state(IDLE) \ + ib_cm_state(LISTEN) \ + ib_cm_state(REQ_SENT) \ + ib_cm_state(REQ_RCVD) \ + ib_cm_state(MRA_REQ_SENT) \ + ib_cm_state(MRA_REQ_RCVD) \ + ib_cm_state(REP_SENT) \ + ib_cm_state(REP_RCVD) \ + ib_cm_state(MRA_REP_SENT) \ + ib_cm_state(MRA_REP_RCVD) \ + ib_cm_state(ESTABLISHED) \ + ib_cm_state(DREQ_SENT) \ + ib_cm_state(DREQ_RCVD) \ + ib_cm_state(TIMEWAIT) \ + ib_cm_state(SIDR_REQ_SENT) \ + ib_cm_state_end(SIDR_REQ_RCVD) + +#undef ib_cm_state +#undef ib_cm_state_end +#define ib_cm_state(x) TRACE_DEFINE_ENUM(IB_CM_##x); +#define ib_cm_state_end(x) TRACE_DEFINE_ENUM(IB_CM_##x); + +IB_CM_STATE_LIST + +#undef ib_cm_state +#undef ib_cm_state_end +#define ib_cm_state(x) { IB_CM_##x, #x }, +#define ib_cm_state_end(x) { IB_CM_##x, #x } + +#define show_ib_cm_state(x) \ + __print_symbolic(x, IB_CM_STATE_LIST) + +/* + * enum ib_cm_lap_state, from include/rdma/ib_cm.h + */ +#define IB_CM_LAP_STATE_LIST \ + ib_cm_lap_state(LAP_UNINIT) \ + ib_cm_lap_state(LAP_IDLE) \ + ib_cm_lap_state(LAP_SENT) \ + ib_cm_lap_state(LAP_RCVD) \ + ib_cm_lap_state(MRA_LAP_SENT) \ + ib_cm_lap_state_end(MRA_LAP_RCVD) + +#undef ib_cm_lap_state +#undef ib_cm_lap_state_end +#define ib_cm_lap_state(x) TRACE_DEFINE_ENUM(IB_CM_##x); +#define ib_cm_lap_state_end(x) TRACE_DEFINE_ENUM(IB_CM_##x); + +IB_CM_LAP_STATE_LIST + +#undef ib_cm_lap_state +#undef ib_cm_lap_state_end +#define ib_cm_lap_state(x) { IB_CM_##x, #x }, +#define ib_cm_lap_state_end(x) { IB_CM_##x, #x } + +#define show_ib_cm_lap_state(x) \ + __print_symbolic(x, IB_CM_LAP_STATE_LIST) + +/* + * enum ib_cm_rej_reason, from include/rdma/ib_cm.h + */ +#define IB_CM_REJ_REASON_LIST \ + ib_cm_rej_reason(REJ_NO_QP) \ + ib_cm_rej_reason(REJ_NO_EEC) \ + ib_cm_rej_reason(REJ_NO_RESOURCES) \ + ib_cm_rej_reason(REJ_TIMEOUT) \ + ib_cm_rej_reason(REJ_UNSUPPORTED) \ + ib_cm_rej_reason(REJ_INVALID_COMM_ID) \ + ib_cm_rej_reason(REJ_INVALID_COMM_INSTANCE) \ + ib_cm_rej_reason(REJ_INVALID_SERVICE_ID) \ + ib_cm_rej_reason(REJ_INVALID_TRANSPORT_TYPE) \ + ib_cm_rej_reason(REJ_STALE_CONN) \ + ib_cm_rej_reason(REJ_RDC_NOT_EXIST) \ + ib_cm_rej_reason(REJ_INVALID_GID) \ + ib_cm_rej_reason(REJ_INVALID_LID) \ + ib_cm_rej_reason(REJ_INVALID_SL) \ + ib_cm_rej_reason(REJ_INVALID_TRAFFIC_CLASS) \ + ib_cm_rej_reason(REJ_INVALID_HOP_LIMIT) \ + ib_cm_rej_reason(REJ_INVALID_PACKET_RATE) \ + ib_cm_rej_reason(REJ_INVALID_ALT_GID) \ + ib_cm_rej_reason(REJ_INVALID_ALT_LID) \ + ib_cm_rej_reason(REJ_INVALID_ALT_SL) \ + ib_cm_rej_reason(REJ_INVALID_ALT_TRAFFIC_CLASS) \ + ib_cm_rej_reason(REJ_INVALID_ALT_HOP_LIMIT) \ + ib_cm_rej_reason(REJ_INVALID_ALT_PACKET_RATE) \ + ib_cm_rej_reason(REJ_PORT_CM_REDIRECT) \ + 
ib_cm_rej_reason(REJ_PORT_REDIRECT) \ + ib_cm_rej_reason(REJ_INVALID_MTU) \ + ib_cm_rej_reason(REJ_INSUFFICIENT_RESP_RESOURCES) \ + ib_cm_rej_reason(REJ_CONSUMER_DEFINED) \ + ib_cm_rej_reason(REJ_INVALID_RNR_RETRY) \ + ib_cm_rej_reason(REJ_DUPLICATE_LOCAL_COMM_ID) \ + ib_cm_rej_reason(REJ_INVALID_CLASS_VERSION) \ + ib_cm_rej_reason(REJ_INVALID_FLOW_LABEL) \ + ib_cm_rej_reason(REJ_INVALID_ALT_FLOW_LABEL) \ + ib_cm_rej_reason_end(REJ_VENDOR_OPTION_NOT_SUPPORTED) + +#undef ib_cm_rej_reason +#undef ib_cm_rej_reason_end +#define ib_cm_rej_reason(x) TRACE_DEFINE_ENUM(IB_CM_##x); +#define ib_cm_rej_reason_end(x) TRACE_DEFINE_ENUM(IB_CM_##x); + +IB_CM_REJ_REASON_LIST + +#undef ib_cm_rej_reason +#undef ib_cm_rej_reason_end +#define ib_cm_rej_reason(x) { IB_CM_##x, #x }, +#define ib_cm_rej_reason_end(x) { IB_CM_##x, #x } + +#define show_ib_cm_rej_reason(x) \ + __print_symbolic(x, IB_CM_REJ_REASON_LIST) + +DECLARE_EVENT_CLASS(icm_id_class, + TP_PROTO( + const struct ib_cm_id *cm_id + ), + + TP_ARGS(cm_id), + + TP_STRUCT__entry( + __field(const void *, cm_id) /* for eBPF scripts */ + __field(unsigned int, local_id) + __field(unsigned int, remote_id) + __field(unsigned long, state) + __field(unsigned long, lap_state) + ), + + TP_fast_assign( + __entry->cm_id = cm_id; + __entry->local_id = be32_to_cpu(cm_id->local_id); + __entry->remote_id = be32_to_cpu(cm_id->remote_id); + __entry->state = cm_id->state; + __entry->lap_state = cm_id->lap_state; + ), + + TP_printk("local_id=%u remote_id=%u state=%s lap_state=%s", + __entry->local_id, __entry->remote_id, + show_ib_cm_state(__entry->state), + show_ib_cm_lap_state(__entry->lap_state) + ) +); + +#define DEFINE_CM_SEND_EVENT(name) \ + DEFINE_EVENT(icm_id_class, \ + icm_send_##name, \ + TP_PROTO( \ + const struct ib_cm_id *cm_id \ + ), \ + TP_ARGS(cm_id)) + +DEFINE_CM_SEND_EVENT(req); +DEFINE_CM_SEND_EVENT(rep); +DEFINE_CM_SEND_EVENT(dup_req); +DEFINE_CM_SEND_EVENT(dup_rep); +DEFINE_CM_SEND_EVENT(rtu); +DEFINE_CM_SEND_EVENT(mra); +DEFINE_CM_SEND_EVENT(sidr_req); +DEFINE_CM_SEND_EVENT(sidr_rep); +DEFINE_CM_SEND_EVENT(dreq); +DEFINE_CM_SEND_EVENT(drep); + +TRACE_EVENT(icm_send_rej, + TP_PROTO( + const struct ib_cm_id *cm_id, + enum ib_cm_rej_reason reason + ), + + TP_ARGS(cm_id, reason), + + TP_STRUCT__entry( + __field(const void *, cm_id) + __field(u32, local_id) + __field(u32, remote_id) + __field(unsigned long, state) + __field(unsigned long, reason) + ), + + TP_fast_assign( + __entry->cm_id = cm_id; + __entry->local_id = be32_to_cpu(cm_id->local_id); + __entry->remote_id = be32_to_cpu(cm_id->remote_id); + __entry->state = cm_id->state; + __entry->reason = reason; + ), + + TP_printk("local_id=%u remote_id=%u state=%s reason=%s", + __entry->local_id, __entry->remote_id, + show_ib_cm_state(__entry->state), + show_ib_cm_rej_reason(__entry->reason) + ) +); + +#define DEFINE_CM_ERR_EVENT(name) \ + DEFINE_EVENT(icm_id_class, \ + icm_##name##_err, \ + TP_PROTO( \ + const struct ib_cm_id *cm_id \ + ), \ + TP_ARGS(cm_id)) + +DEFINE_CM_ERR_EVENT(send_cm_rtu); +DEFINE_CM_ERR_EVENT(establish); +DEFINE_CM_ERR_EVENT(no_listener); +DEFINE_CM_ERR_EVENT(send_drep); +DEFINE_CM_ERR_EVENT(dreq_unknown); +DEFINE_CM_ERR_EVENT(send_unknown_rej); +DEFINE_CM_ERR_EVENT(rej_unknown); +DEFINE_CM_ERR_EVENT(send_mra_unknown); +DEFINE_CM_ERR_EVENT(mra_unknown); +DEFINE_CM_ERR_EVENT(qp_init); +DEFINE_CM_ERR_EVENT(qp_rtr); +DEFINE_CM_ERR_EVENT(qp_rts); + +DEFINE_EVENT(icm_id_class, \ + icm_dreq_skipped, \ + TP_PROTO( \ + const struct ib_cm_id *cm_id \ + ), \ + TP_ARGS(cm_id) \ +); + 
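The DEFINE_CM_SEND_EVENT() and DEFINE_CM_ERR_EVENT() wrappers above stamp out one tracepoint per CM message or error path from the shared icm_id_class, so every such event records the same local_id, remote_id, state and lap_state fields. As an illustrative sketch only (not part of the patch), DEFINE_CM_SEND_EVENT(sidr_req) expands to roughly the DEFINE_EVENT() below, which in turn generates the trace_icm_send_sidr_req() helper that cm.c calls just before posting the SIDR REQ MAD earlier in this series; because TRACE_SYSTEM is ib_cma, the events are grouped under that system name in tracefs.

/*
 * Illustrative expansion, not part of the patch:
 * DEFINE_CM_SEND_EVENT(sidr_req) produces approximately the following.
 */
DEFINE_EVENT(icm_id_class, icm_send_sidr_req,
	TP_PROTO(const struct ib_cm_id *cm_id),
	TP_ARGS(cm_id));

/* cm.c then fires it with the connection's ib_cm_id: */
trace_icm_send_sidr_req(&cm_id_priv->id);
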
+DECLARE_EVENT_CLASS(icm_local_class, + TP_PROTO( + unsigned int local_id, + unsigned int remote_id + ), + + TP_ARGS(local_id, remote_id), + + TP_STRUCT__entry( + __field(unsigned int, local_id) + __field(unsigned int, remote_id) + ), + + TP_fast_assign( + __entry->local_id = local_id; + __entry->remote_id = remote_id; + ), + + TP_printk("local_id=%u remote_id=%u", + __entry->local_id, __entry->remote_id + ) +); + +#define DEFINE_CM_LOCAL_EVENT(name) \ + DEFINE_EVENT(icm_local_class, \ + icm_##name, \ + TP_PROTO( \ + unsigned int local_id, \ + unsigned int remote_id \ + ), \ + TP_ARGS(local_id, remote_id)) + +DEFINE_CM_LOCAL_EVENT(issue_rej); +DEFINE_CM_LOCAL_EVENT(issue_drep); +DEFINE_CM_LOCAL_EVENT(staleconn_err); +DEFINE_CM_LOCAL_EVENT(no_priv_err); + +DECLARE_EVENT_CLASS(icm_remote_class, + TP_PROTO( + u32 remote_id + ), + + TP_ARGS(remote_id), + + TP_STRUCT__entry( + __field(u32, remote_id) + ), + + TP_fast_assign( + __entry->remote_id = remote_id; + ), + + TP_printk("remote_id=%u", + __entry->remote_id + ) +); + +#define DEFINE_CM_REMOTE_EVENT(name) \ + DEFINE_EVENT(icm_remote_class, \ + icm_##name, \ + TP_PROTO( \ + u32 remote_id \ + ), \ + TP_ARGS(remote_id)) + +DEFINE_CM_REMOTE_EVENT(remote_no_priv_err); +DEFINE_CM_REMOTE_EVENT(insert_failed_err); + +TRACE_EVENT(icm_send_rep_err, + TP_PROTO( + __be32 local_id, + enum ib_cm_state state + ), + + TP_ARGS(local_id, state), + + TP_STRUCT__entry( + __field(unsigned int, local_id) + __field(unsigned long, state) + ), + + TP_fast_assign( + __entry->local_id = be32_to_cpu(local_id); + __entry->state = state; + ), + + TP_printk("local_id=%u state=%s", + __entry->local_id, show_ib_cm_state(__entry->state) + ) +); + +TRACE_EVENT(icm_rep_unknown_err, + TP_PROTO( + unsigned int local_id, + unsigned int remote_id, + enum ib_cm_state state + ), + + TP_ARGS(local_id, remote_id, state), + + TP_STRUCT__entry( + __field(unsigned int, local_id) + __field(unsigned int, remote_id) + __field(unsigned long, state) + ), + + TP_fast_assign( + __entry->local_id = local_id; + __entry->remote_id = remote_id; + __entry->state = state; + ), + + TP_printk("local_id=%u remote_id=%u state=%s", + __entry->local_id, __entry->remote_id, + show_ib_cm_state(__entry->state) + ) +); + +TRACE_EVENT(icm_handler_err, + TP_PROTO( + enum ib_cm_event_type event + ), + + TP_ARGS(event), + + TP_STRUCT__entry( + __field(unsigned long, event) + ), + + TP_fast_assign( + __entry->event = event; + ), + + TP_printk("unhandled event=%s", + rdma_show_ib_cm_event(__entry->event) + ) +); + +TRACE_EVENT(icm_mad_send_err, + TP_PROTO( + enum ib_cm_state state, + enum ib_wc_status wc_status + ), + + TP_ARGS(state, wc_status), + + TP_STRUCT__entry( + __field(unsigned long, state) + __field(unsigned long, wc_status) + ), + + TP_fast_assign( + __entry->state = state; + __entry->wc_status = wc_status; + ), + + TP_printk("state=%s completion status=%s", + show_ib_cm_state(__entry->state), + rdma_show_wc_status(__entry->wc_status) + ) +); + +#endif /* _TRACE_IB_CMA_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../drivers/infiniband/core +#define TRACE_INCLUDE_FILE cm_trace + +#include <trace/define_trace.h> diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 2dec3a02ab9f..26d1772179b8 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -11,6 +11,7 @@ #include <linux/in6.h> #include <linux/mutex.h> #include <linux/random.h> +#include <linux/rbtree.h> #include <linux/igmp.h> #include <linux/xarray.h> #include 
<linux/inetdevice.h> @@ -20,6 +21,7 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/netevent.h> #include <net/tcp.h> #include <net/ipv6.h> #include <net/ip_fib.h> @@ -43,7 +45,6 @@ MODULE_DESCRIPTION("Generic RDMA CM Agent"); MODULE_LICENSE("Dual BSD/GPL"); #define CMA_CM_RESPONSE_TIMEOUT 20 -#define CMA_QUERY_CLASSPORT_INFO_TIMEOUT 3000 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) #define CMA_IBOE_PACKET_LIFETIME 18 @@ -68,6 +69,9 @@ static const char * const cma_events[] = { [RDMA_CM_EVENT_TIMEWAIT_EXIT] = "timewait exit", }; +static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, + enum ib_gid_type gid_type); + const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event) { size_t index = event; @@ -91,7 +95,13 @@ const char *__attribute_const__ rdma_reject_msg(struct rdma_cm_id *id, } EXPORT_SYMBOL(rdma_reject_msg); -bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason) +/** + * rdma_is_consumer_reject - return true if the consumer rejected the connect + * request. + * @id: Communication identifier that received the REJECT event. + * @reason: Value returned in the REJECT event status field. + */ +static bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason) { if (rdma_ib_or_roce(id->device, id->port_num)) return reason == IB_CM_REJ_CONSUMER_DEFINED; @@ -102,7 +112,6 @@ bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason) WARN_ON_ONCE(1); return false; } -EXPORT_SYMBOL(rdma_is_consumer_reject); const void *rdma_consumer_reject_data(struct rdma_cm_id *id, struct rdma_cm_event *ev, u8 *data_len) @@ -148,7 +157,7 @@ struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res) } EXPORT_SYMBOL(rdma_res_to_id); -static void cma_add_one(struct ib_device *device); +static int cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device, void *client_data); static struct ib_client cma_client = { @@ -161,6 +170,9 @@ static struct ib_sa_client sa_client; static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); +static struct rb_root id_table = RB_ROOT; +/* Serialize operations of id_table tree */ +static DEFINE_SPINLOCK(id_table_lock); static struct workqueue_struct *cma_wq; static unsigned int cma_pernet_id; @@ -195,11 +207,16 @@ struct xarray *cma_pernet_xa(struct net *net, enum rdma_ucm_port_space ps) } } +struct id_table_entry { + struct list_head id_list; + struct rb_node rb_node; +}; + struct cma_device { struct list_head list; struct ib_device *device; struct completion comp; - atomic_t refcount; + refcount_t refcount; struct list_head id_list; enum ib_gid_type *default_gid_type; u8 *default_roce_tos; @@ -211,14 +228,6 @@ struct rdma_bind_list { unsigned short port; }; -struct class_port_info_context { - struct ib_class_port_info *class_port_info; - struct ib_device *device; - struct completion done; - struct ib_sa_query *sa_query; - u8 port_num; -}; - static int cma_ps_alloc(struct net *net, enum rdma_ucm_port_space ps, struct rdma_bind_list *bind_list, int snum) { @@ -247,9 +256,15 @@ enum { CMA_OPTION_AFONLY, }; -void cma_ref_dev(struct cma_device *cma_dev) +void cma_dev_get(struct cma_device *cma_dev) +{ + refcount_inc(&cma_dev->refcount); +} + +void cma_dev_put(struct cma_device *cma_dev) { - atomic_inc(&cma_dev->refcount); + if (refcount_dec_and_test(&cma_dev->refcount)) + complete(&cma_dev->comp); } struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, @@ -267,13 
+282,13 @@ struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, } if (found_cma_dev) - cma_ref_dev(found_cma_dev); + cma_dev_get(found_cma_dev); mutex_unlock(&lock); return found_cma_dev; } int cma_get_default_gid_type(struct cma_device *cma_dev, - unsigned int port) + u32 port) { if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; @@ -282,7 +297,7 @@ int cma_get_default_gid_type(struct cma_device *cma_dev, } int cma_set_default_gid_type(struct cma_device *cma_dev, - unsigned int port, + u32 port, enum ib_gid_type default_gid_type) { unsigned long supported_gids; @@ -290,6 +305,10 @@ int cma_set_default_gid_type(struct cma_device *cma_dev, if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; + if (default_gid_type == IB_GID_TYPE_IB && + rdma_protocol_roce_eth_encap(cma_dev->device, port)) + default_gid_type = IB_GID_TYPE_ROCE; + supported_gids = roce_gid_type_mask_support(cma_dev->device, port); if (!(supported_gids & 1 << default_gid_type)) @@ -301,7 +320,7 @@ int cma_set_default_gid_type(struct cma_device *cma_dev, return 0; } -int cma_get_default_roce_tos(struct cma_device *cma_dev, unsigned int port) +int cma_get_default_roce_tos(struct cma_device *cma_dev, u32 port) { if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; @@ -309,7 +328,7 @@ int cma_get_default_roce_tos(struct cma_device *cma_dev, unsigned int port) return cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)]; } -int cma_set_default_roce_tos(struct cma_device *cma_dev, unsigned int port, +int cma_set_default_roce_tos(struct cma_device *cma_dev, u32 port, u8 default_roce_tos) { if (!rdma_is_port_valid(cma_dev->device, port)) @@ -335,12 +354,15 @@ struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev) struct cma_multicast { struct rdma_id_private *id_priv; union { - struct ib_sa_multicast *ib; - } multicast; + struct ib_sa_multicast *sa_mc; + struct { + struct work_struct work; + struct rdma_cm_event event; + } iboe_join; + }; struct list_head list; void *context; struct sockaddr_storage addr; - struct kref mcref; u8 join_state; }; @@ -352,18 +374,6 @@ struct cma_work { struct rdma_cm_event event; }; -struct cma_ndev_work { - struct work_struct work; - struct rdma_id_private *id; - struct rdma_cm_event event; -}; - -struct iboe_mcast_work { - struct work_struct work; - struct rdma_id_private *id; - struct cma_multicast *mc; -}; - union cma_ip_addr { struct in6_addr ip6; struct { @@ -393,23 +403,21 @@ struct cma_req_info { u16 pkey; }; -static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp) -{ - unsigned long flags; - int ret; - - spin_lock_irqsave(&id_priv->lock, flags); - ret = (id_priv->state == comp); - spin_unlock_irqrestore(&id_priv->lock, flags); - return ret; -} - static int cma_comp_exch(struct rdma_id_private *id_priv, enum rdma_cm_state comp, enum rdma_cm_state exch) { unsigned long flags; int ret; + /* + * The FSM uses a funny double locking where state is protected by both + * the handler_mutex and the spinlock. State is not allowed to change + * to/from a handler_mutex protected value without also holding + * handler_mutex. 
+ */ + if (comp == RDMA_CM_CONNECT || exch == RDMA_CM_CONNECT) + lockdep_assert_held(&id_priv->handler_mutex); + spin_lock_irqsave(&id_priv->lock, flags); if ((ret = (id_priv->state == comp))) id_priv->state = exch; @@ -417,27 +425,24 @@ static int cma_comp_exch(struct rdma_id_private *id_priv, return ret; } -static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv, - enum rdma_cm_state exch) +static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr) { - unsigned long flags; - enum rdma_cm_state old; + return hdr->ip_version >> 4; +} - spin_lock_irqsave(&id_priv->lock, flags); - old = id_priv->state; - id_priv->state = exch; - spin_unlock_irqrestore(&id_priv->lock, flags); - return old; +static void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) +{ + hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); } -static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr) +static struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv) { - return hdr->ip_version >> 4; + return (struct sockaddr *)&id_priv->id.route.addr.src_addr; } -static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) +static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv) { - hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); + return (struct sockaddr *)&id_priv->id.route.addr.dst_addr; } static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join) @@ -460,19 +465,128 @@ static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join) return (in_dev) ? 0 : -ENODEV; } +static int compare_netdev_and_ip(int ifindex_a, struct sockaddr *sa, + struct id_table_entry *entry_b) +{ + struct rdma_id_private *id_priv = list_first_entry( + &entry_b->id_list, struct rdma_id_private, id_list_entry); + int ifindex_b = id_priv->id.route.addr.dev_addr.bound_dev_if; + struct sockaddr *sb = cma_dst_addr(id_priv); + + if (ifindex_a != ifindex_b) + return (ifindex_a > ifindex_b) ? 
1 : -1; + + if (sa->sa_family != sb->sa_family) + return sa->sa_family - sb->sa_family; + + if (sa->sa_family == AF_INET) + return memcmp((char *)&((struct sockaddr_in *)sa)->sin_addr, + (char *)&((struct sockaddr_in *)sb)->sin_addr, + sizeof(((struct sockaddr_in *)sa)->sin_addr)); + + return ipv6_addr_cmp(&((struct sockaddr_in6 *)sa)->sin6_addr, + &((struct sockaddr_in6 *)sb)->sin6_addr); +} + +static int cma_add_id_to_tree(struct rdma_id_private *node_id_priv) +{ + struct rb_node **new, *parent = NULL; + struct id_table_entry *this, *node; + unsigned long flags; + int result; + + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + + spin_lock_irqsave(&id_table_lock, flags); + new = &id_table.rb_node; + while (*new) { + this = container_of(*new, struct id_table_entry, rb_node); + result = compare_netdev_and_ip( + node_id_priv->id.route.addr.dev_addr.bound_dev_if, + cma_dst_addr(node_id_priv), this); + + parent = *new; + if (result < 0) + new = &((*new)->rb_left); + else if (result > 0) + new = &((*new)->rb_right); + else { + list_add_tail(&node_id_priv->id_list_entry, + &this->id_list); + kfree(node); + goto unlock; + } + } + + INIT_LIST_HEAD(&node->id_list); + list_add_tail(&node_id_priv->id_list_entry, &node->id_list); + + rb_link_node(&node->rb_node, parent, new); + rb_insert_color(&node->rb_node, &id_table); + +unlock: + spin_unlock_irqrestore(&id_table_lock, flags); + return 0; +} + +static struct id_table_entry * +node_from_ndev_ip(struct rb_root *root, int ifindex, struct sockaddr *sa) +{ + struct rb_node *node = root->rb_node; + struct id_table_entry *data; + int result; + + while (node) { + data = container_of(node, struct id_table_entry, rb_node); + result = compare_netdev_and_ip(ifindex, sa, data); + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return data; + } + + return NULL; +} + +static void cma_remove_id_from_tree(struct rdma_id_private *id_priv) +{ + struct id_table_entry *data; + unsigned long flags; + + spin_lock_irqsave(&id_table_lock, flags); + if (list_empty(&id_priv->id_list_entry)) + goto out; + + data = node_from_ndev_ip(&id_table, + id_priv->id.route.addr.dev_addr.bound_dev_if, + cma_dst_addr(id_priv)); + if (!data) + goto out; + + list_del_init(&id_priv->id_list_entry); + if (list_empty(&data->id_list)) { + rb_erase(&data->rb_node, &id_table); + kfree(data); + } +out: + spin_unlock_irqrestore(&id_table_lock, flags); +} + static void _cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { - cma_ref_dev(cma_dev); + cma_dev_get(cma_dev); id_priv->cma_dev = cma_dev; id_priv->id.device = cma_dev->device; id_priv->id.route.addr.dev_addr.transport = rdma_node_get_transport(cma_dev->device->node_type); - list_add_tail(&id_priv->list, &cma_dev->id_list); - if (id_priv->res.kern_name) - rdma_restrack_kadd(&id_priv->res); - else - rdma_restrack_uadd(&id_priv->res); + list_add_tail(&id_priv->device_item, &cma_dev->id_list); + + trace_cm_id_attach(id_priv, cma_dev->device); } static void cma_attach_to_dev(struct rdma_id_private *id_priv, @@ -484,39 +598,20 @@ static void cma_attach_to_dev(struct rdma_id_private *id_priv, rdma_start_port(cma_dev->device)]; } -void cma_deref_dev(struct cma_device *cma_dev) -{ - if (atomic_dec_and_test(&cma_dev->refcount)) - complete(&cma_dev->comp); -} - -static inline void release_mc(struct kref *kref) -{ - struct cma_multicast *mc = container_of(kref, struct cma_multicast, mcref); - - kfree(mc->multicast.ib); - kfree(mc); -} - static void 
cma_release_dev(struct rdma_id_private *id_priv) { mutex_lock(&lock); - list_del(&id_priv->list); - cma_deref_dev(id_priv->cma_dev); + list_del_init(&id_priv->device_item); + cma_dev_put(id_priv->cma_dev); id_priv->cma_dev = NULL; + id_priv->id.device = NULL; + if (id_priv->id.route.addr.dev_addr.sgid_attr) { + rdma_put_gid_attr(id_priv->id.route.addr.dev_addr.sgid_attr); + id_priv->id.route.addr.dev_addr.sgid_attr = NULL; + } mutex_unlock(&lock); } -static inline struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv) -{ - return (struct sockaddr *) &id_priv->id.route.addr.src_addr; -} - -static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv) -{ - return (struct sockaddr *) &id_priv->id.route.addr.dst_addr; -} - static inline unsigned short cma_family(struct rdma_id_private *id_priv) { return id_priv->id.route.addr.src_addr.ss_family; @@ -579,7 +674,7 @@ static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_a } static const struct ib_gid_attr * -cma_validate_port(struct ib_device *device, u8 port, +cma_validate_port(struct ib_device *device, u32 port, enum ib_gid_type gid_type, union ib_gid *gid, struct rdma_id_private *id_priv) @@ -637,7 +732,7 @@ static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv) struct cma_device *cma_dev; enum ib_gid_type gid_type; int ret = -ENODEV; - unsigned int port; + u32 port; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) @@ -717,6 +812,7 @@ static int cma_ib_acquire_dev(struct rdma_id_private *id_priv, mutex_lock(&lock); cma_attach_to_dev(id_priv, listen_id_priv->cma_dev); mutex_unlock(&lock); + rdma_restrack_add(&id_priv->res); return 0; } @@ -729,7 +825,7 @@ static int cma_iw_acquire_dev(struct rdma_id_private *id_priv, enum ib_gid_type gid_type; int ret = -ENODEV; union ib_gid gid; - u8 port; + u32 port; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) @@ -753,7 +849,7 @@ static int cma_iw_acquire_dev(struct rdma_id_private *id_priv, } list_for_each_entry(cma_dev, &dev_list, list) { - for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) { + rdma_for_each_port (cma_dev->device, port) { if (listen_id_priv->cma_dev == cma_dev && listen_id_priv->id.port_num == port) continue; @@ -771,8 +867,10 @@ static int cma_iw_acquire_dev(struct rdma_id_private *id_priv, } out: - if (!ret) + if (!ret) { cma_attach_to_dev(id_priv, cma_dev); + rdma_restrack_add(&id_priv->res); + } mutex_unlock(&lock); return ret; @@ -786,9 +884,10 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) struct cma_device *cma_dev, *cur_dev; struct sockaddr_ib *addr; union ib_gid gid, sgid, *dgid; + unsigned int p; u16 pkey, index; - u8 p; enum ib_port_state port_state; + int ret; int i; cma_dev = NULL; @@ -798,7 +897,7 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) mutex_lock(&lock); list_for_each_entry(cur_dev, &dev_list, list) { - for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) { + rdma_for_each_port (cur_dev->device, p) { if (!rdma_cap_af_ib(cur_dev->device, p)) continue; @@ -807,9 +906,14 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) if (ib_get_cached_port_state(cur_dev->device, p, &port_state)) continue; - for (i = 0; !rdma_query_gid(cur_dev->device, - p, i, &gid); - i++) { + + for (i = 0; i < cur_dev->device->port_data[p].immutable.gid_tbl_len; + ++i) { + ret = rdma_query_gid(cur_dev->device, p, i, + &gid); + if (ret) + continue; + if (!memcmp(&gid, dgid, sizeof(gid))) { cma_dev = cur_dev; 
sgid = gid; @@ -833,6 +937,7 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) found: cma_attach_to_dev(id_priv, cma_dev); + rdma_restrack_add(&id_priv->res); mutex_unlock(&lock); addr = (struct sockaddr_ib *)cma_src_addr(id_priv); memcpy(&addr->sib_addr, &sgid, sizeof(sgid)); @@ -840,16 +945,21 @@ found: return 0; } -static void cma_deref_id(struct rdma_id_private *id_priv) +static void cma_id_get(struct rdma_id_private *id_priv) { - if (atomic_dec_and_test(&id_priv->refcount)) + refcount_inc(&id_priv->refcount); +} + +static void cma_id_put(struct rdma_id_private *id_priv) +{ + if (refcount_dec_and_test(&id_priv->refcount)) complete(&id_priv->comp); } -struct rdma_cm_id *__rdma_create_id(struct net *net, - rdma_cm_event_handler event_handler, - void *context, enum rdma_ucm_port_space ps, - enum ib_qp_type qp_type, const char *caller) +static struct rdma_id_private * +__rdma_create_id(struct net *net, rdma_cm_event_handler event_handler, + void *context, enum rdma_ucm_port_space ps, + enum ib_qp_type qp_type, const struct rdma_id_private *parent) { struct rdma_id_private *id_priv; @@ -857,8 +967,6 @@ struct rdma_cm_id *__rdma_create_id(struct net *net, if (!id_priv) return ERR_PTR(-ENOMEM); - rdma_restrack_set_task(&id_priv->res, caller); - id_priv->res.type = RDMA_RESTRACK_CM_ID; id_priv->state = RDMA_CM_IDLE; id_priv->id.context = context; id_priv->id.event_handler = event_handler; @@ -866,22 +974,60 @@ struct rdma_cm_id *__rdma_create_id(struct net *net, id_priv->id.qp_type = qp_type; id_priv->tos_set = false; id_priv->timeout_set = false; + id_priv->min_rnr_timer_set = false; id_priv->gid_type = IB_GID_TYPE_IB; spin_lock_init(&id_priv->lock); mutex_init(&id_priv->qp_mutex); init_completion(&id_priv->comp); - atomic_set(&id_priv->refcount, 1); + refcount_set(&id_priv->refcount, 1); mutex_init(&id_priv->handler_mutex); + INIT_LIST_HEAD(&id_priv->device_item); + INIT_LIST_HEAD(&id_priv->id_list_entry); INIT_LIST_HEAD(&id_priv->listen_list); INIT_LIST_HEAD(&id_priv->mc_list); get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num); id_priv->id.route.addr.dev_addr.net = get_net(net); id_priv->seq_num &= 0x00ffffff; - trace_cm_id_create(id_priv); - return &id_priv->id; + rdma_restrack_new(&id_priv->res, RDMA_RESTRACK_CM_ID); + if (parent) + rdma_restrack_parent_name(&id_priv->res, &parent->res); + + return id_priv; +} + +struct rdma_cm_id * +__rdma_create_kernel_id(struct net *net, rdma_cm_event_handler event_handler, + void *context, enum rdma_ucm_port_space ps, + enum ib_qp_type qp_type, const char *caller) +{ + struct rdma_id_private *ret; + + ret = __rdma_create_id(net, event_handler, context, ps, qp_type, NULL); + if (IS_ERR(ret)) + return ERR_CAST(ret); + + rdma_restrack_set_name(&ret->res, caller); + return &ret->id; } -EXPORT_SYMBOL(__rdma_create_id); +EXPORT_SYMBOL(__rdma_create_kernel_id); + +struct rdma_cm_id *rdma_create_user_id(rdma_cm_event_handler event_handler, + void *context, + enum rdma_ucm_port_space ps, + enum ib_qp_type qp_type) +{ + struct rdma_id_private *ret; + + ret = __rdma_create_id(current->nsproxy->net_ns, event_handler, context, + ps, qp_type, NULL); + if (IS_ERR(ret)) + return ERR_CAST(ret); + + rdma_restrack_set_name(&ret->res, NULL); + return &ret->id; +} +EXPORT_SYMBOL(rdma_create_user_id); static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { @@ -1114,12 +1260,16 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, qp_attr_mask); qp_attr->port_num = id_priv->id.port_num; *qp_attr_mask |= 
IB_QP_PORT; - } else + } else { ret = -ENOSYS; + } if ((*qp_attr_mask & IB_QP_TIMEOUT) && id_priv->timeout_set) qp_attr->timeout = id_priv->timeout; + if ((*qp_attr_mask & IB_QP_MIN_RNR_TIMER) && id_priv->min_rnr_timer_set) + qp_attr->min_rnr_timer = id_priv->min_rnr_timer; + return ret; } EXPORT_SYMBOL(rdma_init_qp_attr); @@ -1406,7 +1556,7 @@ static bool validate_ipv4_net_dev(struct net_device *net_dev, return false; memset(&fl4, 0, sizeof(fl4)); - fl4.flowi4_iif = net_dev->ifindex; + fl4.flowi4_oif = net_dev->ifindex; fl4.daddr = daddr; fl4.saddr = saddr; @@ -1560,7 +1710,7 @@ static bool cma_match_private_data(struct rdma_id_private *id_priv, static bool cma_protocol_roce(const struct rdma_cm_id *id) { struct ib_device *device = id->device; - const int port_num = id->port_num ?: rdma_start_port(device); + const u32 port_num = id->port_num ?: rdma_start_port(device); return rdma_protocol_roce(device, port_num); } @@ -1614,6 +1764,8 @@ static struct rdma_id_private *cma_find_listener( { struct rdma_id_private *id_priv, *id_priv_dev; + lockdep_assert_held(&lock); + if (!bind_list) return ERR_PTR(-EINVAL); @@ -1624,7 +1776,7 @@ static struct rdma_id_private *cma_find_listener( return id_priv; list_for_each_entry(id_priv_dev, &id_priv->listen_list, - listen_list) { + listen_item) { if (id_priv_dev->id.device == cm_id->device && cma_match_net_dev(&id_priv_dev->id, net_dev, req)) @@ -1660,6 +1812,7 @@ cma_ib_id_from_event(struct ib_cm_id *cm_id, } } + mutex_lock(&lock); /* * Net namespace might be getting deleted while route lookup, * cm_id lookup is in progress. Therefore, perform netdevice @@ -1688,8 +1841,8 @@ cma_ib_id_from_event(struct ib_cm_id *cm_id, } if (!validate_net_dev(*net_dev, - (struct sockaddr *)&req->listen_addr_storage, - (struct sockaddr *)&req->src_addr_storage)) { + (struct sockaddr *)&req->src_addr_storage, + (struct sockaddr *)&req->listen_addr_storage)) { id_priv = ERR_PTR(-EHOSTUNREACH); goto err; } @@ -1701,6 +1854,7 @@ cma_ib_id_from_event(struct ib_cm_id *cm_id, id_priv = cma_find_listener(bind_list, cm_id, ib_event, req, *net_dev); err: rcu_read_unlock(); + mutex_unlock(&lock); if (IS_ERR(id_priv) && *net_dev) { dev_put(*net_dev); *net_dev = NULL; @@ -1721,28 +1875,36 @@ static void cma_cancel_route(struct rdma_id_private *id_priv) } } -static void cma_cancel_listens(struct rdma_id_private *id_priv) +static void _cma_cancel_listens(struct rdma_id_private *id_priv) { struct rdma_id_private *dev_id_priv; + lockdep_assert_held(&lock); + /* * Remove from listen_any_list to prevent added devices from spawning * additional listen requests. 
*/ - mutex_lock(&lock); - list_del(&id_priv->list); + list_del_init(&id_priv->listen_any_item); while (!list_empty(&id_priv->listen_list)) { - dev_id_priv = list_entry(id_priv->listen_list.next, - struct rdma_id_private, listen_list); + dev_id_priv = + list_first_entry(&id_priv->listen_list, + struct rdma_id_private, listen_item); /* sync with device removal to avoid duplicate destruction */ - list_del_init(&dev_id_priv->list); - list_del(&dev_id_priv->listen_list); + list_del_init(&dev_id_priv->device_item); + list_del_init(&dev_id_priv->listen_item); mutex_unlock(&lock); rdma_destroy_id(&dev_id_priv->id); mutex_lock(&lock); } +} + +static void cma_cancel_listens(struct rdma_id_private *id_priv) +{ + mutex_lock(&lock); + _cma_cancel_listens(id_priv); mutex_unlock(&lock); } @@ -1751,6 +1913,14 @@ static void cma_cancel_operation(struct rdma_id_private *id_priv, { switch (state) { case RDMA_CM_ADDR_QUERY: + /* + * We can avoid doing the rdma_addr_cancel() based on state, + * only RDMA_CM_ADDR_QUERY has a work that could still execute. + * Notice that the addr_handler work could still be exiting + * outside this state, however due to the interaction with the + * handler_mutex the work is guaranteed not to touch id_priv + * during exit. + */ rdma_addr_cancel(&id_priv->id.route.addr.dev_addr); break; case RDMA_CM_ROUTE_QUERY: @@ -1782,19 +1952,39 @@ static void cma_release_port(struct rdma_id_private *id_priv) mutex_unlock(&lock); } -static void cma_leave_roce_mc_group(struct rdma_id_private *id_priv, - struct cma_multicast *mc) +static void destroy_mc(struct rdma_id_private *id_priv, + struct cma_multicast *mc) { - struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; - struct net_device *ndev = NULL; + bool send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); - if (dev_addr->bound_dev_if) - ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); - if (ndev) { - cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid, false); + if (rdma_cap_ib_mcast(id_priv->id.device, id_priv->id.port_num)) + ib_sa_free_multicast(mc->sa_mc); + + if (rdma_protocol_roce(id_priv->id.device, id_priv->id.port_num)) { + struct rdma_dev_addr *dev_addr = + &id_priv->id.route.addr.dev_addr; + struct net_device *ndev = NULL; + + if (dev_addr->bound_dev_if) + ndev = dev_get_by_index(dev_addr->net, + dev_addr->bound_dev_if); + if (ndev && !send_only) { + enum ib_gid_type gid_type; + union ib_gid mgid; + + gid_type = id_priv->cma_dev->default_gid_type + [id_priv->id.port_num - + rdma_start_port( + id_priv->cma_dev->device)]; + cma_iboe_set_mgid((struct sockaddr *)&mc->addr, &mgid, + gid_type); + cma_igmp_send(ndev, &mgid, false); + } dev_put(ndev); + + cancel_work_sync(&mc->iboe_join.work); } - kref_put(&mc->mcref, release_mc); + kfree(mc); } static void cma_leave_mc_groups(struct rdma_id_private *id_priv) @@ -1802,37 +1992,20 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv) struct cma_multicast *mc; while (!list_empty(&id_priv->mc_list)) { - mc = container_of(id_priv->mc_list.next, - struct cma_multicast, list); + mc = list_first_entry(&id_priv->mc_list, struct cma_multicast, + list); list_del(&mc->list); - if (rdma_cap_ib_mcast(id_priv->cma_dev->device, - id_priv->id.port_num)) { - ib_sa_free_multicast(mc->multicast.ib); - kfree(mc); - } else { - cma_leave_roce_mc_group(id_priv, mc); - } + destroy_mc(id_priv, mc); } } -void rdma_destroy_id(struct rdma_cm_id *id) +static void _destroy_id(struct rdma_id_private *id_priv, + enum rdma_cm_state state) { - struct rdma_id_private 
*id_priv; - enum rdma_cm_state state; - - id_priv = container_of(id, struct rdma_id_private, id); - trace_cm_id_destroy(id_priv); - state = cma_exch(id_priv, RDMA_CM_DESTROYING); cma_cancel_operation(id_priv, state); - /* - * Wait for any active callback to finish. New callbacks will find - * the id_priv state set to destroying and abort. - */ - mutex_lock(&id_priv->handler_mutex); - mutex_unlock(&id_priv->handler_mutex); - rdma_restrack_del(&id_priv->res); + cma_remove_id_from_tree(id_priv); if (id_priv->cma_dev) { if (rdma_cap_ib_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.ib) @@ -1846,20 +2019,55 @@ void rdma_destroy_id(struct rdma_cm_id *id) } cma_release_port(id_priv); - cma_deref_id(id_priv); + cma_id_put(id_priv); wait_for_completion(&id_priv->comp); if (id_priv->internal_id) - cma_deref_id(id_priv->id.context); + cma_id_put(id_priv->id.context); kfree(id_priv->id.route.path_rec); - - if (id_priv->id.route.addr.dev_addr.sgid_attr) - rdma_put_gid_attr(id_priv->id.route.addr.dev_addr.sgid_attr); + kfree(id_priv->id.route.path_rec_inbound); + kfree(id_priv->id.route.path_rec_outbound); put_net(id_priv->id.route.addr.dev_addr.net); kfree(id_priv); } + +/* + * destroy an ID from within the handler_mutex. This ensures that no other + * handlers can start running concurrently. + */ +static void destroy_id_handler_unlock(struct rdma_id_private *id_priv) + __releases(&idprv->handler_mutex) +{ + enum rdma_cm_state state; + unsigned long flags; + + trace_cm_id_destroy(id_priv); + + /* + * Setting the state to destroyed under the handler mutex provides a + * fence against calling handler callbacks. If this is invoked due to + * the failure of a handler callback then it guarentees that no future + * handlers will be called. + */ + lockdep_assert_held(&id_priv->handler_mutex); + spin_lock_irqsave(&id_priv->lock, flags); + state = id_priv->state; + id_priv->state = RDMA_CM_DESTROYING; + spin_unlock_irqrestore(&id_priv->lock, flags); + mutex_unlock(&id_priv->handler_mutex); + _destroy_id(id_priv, state); +} + +void rdma_destroy_id(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + mutex_lock(&id_priv->handler_mutex); + destroy_id_handler_unlock(id_priv); +} EXPORT_SYMBOL(rdma_destroy_id); static int cma_rep_recv(struct rdma_id_private *id_priv) @@ -1901,6 +2109,9 @@ static void cma_set_rep_event_data(struct rdma_cm_event *event, event->param.conn.rnr_retry_count = rep_data->rnr_retry_count; event->param.conn.srq = rep_data->srq; event->param.conn.qp_num = rep_data->remote_qpn; + + event->ece.vendor_id = rep_data->ece.vendor_id; + event->ece.attr_mod = rep_data->ece.attr_mod; } static int cma_cm_event_handler(struct rdma_id_private *id_priv, @@ -1908,6 +2119,8 @@ static int cma_cm_event_handler(struct rdma_id_private *id_priv, { int ret; + lockdep_assert_held(&id_priv->handler_mutex); + trace_cm_event_handler(id_priv, event); ret = id_priv->id.event_handler(&id_priv->id, event); trace_cm_event_done(id_priv, event, ret); @@ -1919,13 +2132,15 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, { struct rdma_id_private *id_priv = cm_id->context; struct rdma_cm_event event = {}; - int ret = 0; + enum rdma_cm_state state; + int ret; mutex_lock(&id_priv->handler_mutex); + state = READ_ONCE(id_priv->state); if ((ib_event->event != IB_CM_TIMEWAIT_EXIT && - id_priv->state != RDMA_CM_CONNECT) || + state != RDMA_CM_CONNECT) || (ib_event->event == IB_CM_TIMEWAIT_EXIT && - id_priv->state != RDMA_CM_DISCONNECT)) + state != RDMA_CM_DISCONNECT)) 
goto out; switch (ib_event->event) { @@ -1935,7 +2150,7 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, event.status = -ETIMEDOUT; break; case IB_CM_REP_RECEIVED: - if (cma_comp(id_priv, RDMA_CM_CONNECT) && + if (state == RDMA_CM_CONNECT && (id_priv->id.qp_type != IB_QPT_UD)) { trace_cm_send_mra(id_priv); ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); @@ -1955,7 +2170,8 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, event.event = RDMA_CM_EVENT_ESTABLISHED; break; case IB_CM_DREQ_ERROR: - event.status = -ETIMEDOUT; /* fall through */ + event.status = -ETIMEDOUT; + fallthrough; case IB_CM_DREQ_RECEIVED: case IB_CM_DREP_RECEIVED: if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT, @@ -1988,14 +2204,12 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; - cma_exch(id_priv, RDMA_CM_DESTROYING); - mutex_unlock(&id_priv->handler_mutex); - rdma_destroy_id(&id_priv->id); + destroy_id_handler_unlock(id_priv); return ret; } out: mutex_unlock(&id_priv->handler_mutex); - return ret; + return 0; } static struct rdma_id_private * @@ -2014,28 +2228,29 @@ cma_ib_new_conn_id(const struct rdma_cm_id *listen_id, int ret; listen_id_priv = container_of(listen_id, struct rdma_id_private, id); - id = __rdma_create_id(listen_id->route.addr.dev_addr.net, - listen_id->event_handler, listen_id->context, - listen_id->ps, ib_event->param.req_rcvd.qp_type, - listen_id_priv->res.kern_name); - if (IS_ERR(id)) + id_priv = __rdma_create_id(listen_id->route.addr.dev_addr.net, + listen_id->event_handler, listen_id->context, + listen_id->ps, + ib_event->param.req_rcvd.qp_type, + listen_id_priv); + if (IS_ERR(id_priv)) return NULL; - id_priv = container_of(id, struct rdma_id_private, id); + id = &id_priv->id; if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, (struct sockaddr *)&id->route.addr.dst_addr, listen_id, ib_event, ss_family, service_id)) goto err; rt = &id->route; - rt->num_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1; - rt->path_rec = kmalloc_array(rt->num_paths, sizeof(*rt->path_rec), - GFP_KERNEL); + rt->num_pri_alt_paths = ib_event->param.req_rcvd.alternate_path ? 
2 : 1; + rt->path_rec = kmalloc_array(rt->num_pri_alt_paths, + sizeof(*rt->path_rec), GFP_KERNEL); if (!rt->path_rec) goto err; rt->path_rec[0] = *path; - if (rt->num_paths == 2) + if (rt->num_pri_alt_paths == 2) rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; if (net_dev) { @@ -2075,13 +2290,13 @@ cma_ib_new_udp_id(const struct rdma_cm_id *listen_id, int ret; listen_id_priv = container_of(listen_id, struct rdma_id_private, id); - id = __rdma_create_id(net, listen_id->event_handler, listen_id->context, - listen_id->ps, IB_QPT_UD, - listen_id_priv->res.kern_name); - if (IS_ERR(id)) + id_priv = __rdma_create_id(net, listen_id->event_handler, + listen_id->context, listen_id->ps, IB_QPT_UD, + listen_id_priv); + if (IS_ERR(id_priv)) return NULL; - id_priv = container_of(id, struct rdma_id_private, id); + id = &id_priv->id; if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, (struct sockaddr *)&id->route.addr.dst_addr, listen_id, ib_event, ss_family, @@ -2119,6 +2334,9 @@ static void cma_set_req_event_data(struct rdma_cm_event *event, event->param.conn.rnr_retry_count = req_data->rnr_retry_count; event->param.conn.srq = req_data->srq; event->param.conn.qp_num = req_data->remote_qpn; + + event->ece.vendor_id = req_data->ece.vendor_id; + event->ece.attr_mod = req_data->ece.attr_mod; } static int cma_ib_check_req_qp_type(const struct rdma_cm_id *id, @@ -2152,9 +2370,9 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id, } mutex_lock(&listen_id->handler_mutex); - if (listen_id->state != RDMA_CM_LISTEN) { + if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN) { ret = -ECONNABORTED; - goto err1; + goto err_unlock; } offset = cma_user_data_offset(listen_id); @@ -2171,55 +2389,38 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id, } if (!conn_id) { ret = -ENOMEM; - goto err1; + goto err_unlock; } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); ret = cma_ib_acquire_dev(conn_id, listen_id, &req); - if (ret) - goto err2; + if (ret) { + destroy_id_handler_unlock(conn_id); + goto err_unlock; + } conn_id->cm_id.ib = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_ib_handler; - /* - * Protect against the user destroying conn_id from another thread - * until we're done accessing it. - */ - atomic_inc(&conn_id->refcount); ret = cma_cm_event_handler(conn_id, &event); - if (ret) - goto err3; - /* - * Acquire mutex to prevent user executing rdma_destroy_id() - * while we're accessing the cm_id. - */ - mutex_lock(&lock); - if (cma_comp(conn_id, RDMA_CM_CONNECT) && - (conn_id->id.qp_type != IB_QPT_UD)) { + if (ret) { + /* Destroy the CM ID by returning a non-zero value. */ + conn_id->cm_id.ib = NULL; + mutex_unlock(&listen_id->handler_mutex); + destroy_id_handler_unlock(conn_id); + goto net_dev_put; + } + + if (READ_ONCE(conn_id->state) == RDMA_CM_CONNECT && + conn_id->id.qp_type != IB_QPT_UD) { trace_cm_send_mra(cm_id->context); ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); } - mutex_unlock(&lock); mutex_unlock(&conn_id->handler_mutex); - mutex_unlock(&listen_id->handler_mutex); - cma_deref_id(conn_id); - if (net_dev) - dev_put(net_dev); - return 0; -err3: - cma_deref_id(conn_id); - /* Destroy the CM ID by returning a non-zero value. 
*/ - conn_id->cm_id.ib = NULL; -err2: - cma_exch(conn_id, RDMA_CM_DESTROYING); - mutex_unlock(&conn_id->handler_mutex); -err1: +err_unlock: mutex_unlock(&listen_id->handler_mutex); - if (conn_id) - rdma_destroy_id(&conn_id->id); net_dev_put: if (net_dev) @@ -2273,7 +2474,7 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; mutex_lock(&id_priv->handler_mutex); - if (id_priv->state != RDMA_CM_CONNECT) + if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) goto out; switch (iw_event->event) { @@ -2319,9 +2520,7 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.iw = NULL; - cma_exch(id_priv, RDMA_CM_DESTROYING); - mutex_unlock(&id_priv->handler_mutex); - rdma_destroy_id(&id_priv->id); + destroy_id_handler_unlock(id_priv); return ret; } @@ -2333,7 +2532,6 @@ out: static int iw_conn_req_handler(struct iw_cm_id *cm_id, struct iw_cm_event *iw_event) { - struct rdma_cm_id *new_cm_id; struct rdma_id_private *listen_id, *conn_id; struct rdma_cm_event event = {}; int ret = -ECONNABORTED; @@ -2349,35 +2547,33 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, listen_id = cm_id->context; mutex_lock(&listen_id->handler_mutex); - if (listen_id->state != RDMA_CM_LISTEN) + if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN) goto out; /* Create a new RDMA id for the new IW CM ID */ - new_cm_id = __rdma_create_id(listen_id->id.route.addr.dev_addr.net, - listen_id->id.event_handler, - listen_id->id.context, - RDMA_PS_TCP, IB_QPT_RC, - listen_id->res.kern_name); - if (IS_ERR(new_cm_id)) { + conn_id = __rdma_create_id(listen_id->id.route.addr.dev_addr.net, + listen_id->id.event_handler, + listen_id->id.context, RDMA_PS_TCP, + IB_QPT_RC, listen_id); + if (IS_ERR(conn_id)) { ret = -ENOMEM; goto out; } - conn_id = container_of(new_cm_id, struct rdma_id_private, id); mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); conn_id->state = RDMA_CM_CONNECT; ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr); if (ret) { - mutex_unlock(&conn_id->handler_mutex); - rdma_destroy_id(new_cm_id); - goto out; + mutex_unlock(&listen_id->handler_mutex); + destroy_id_handler_unlock(conn_id); + return ret; } ret = cma_iw_acquire_dev(conn_id, listen_id); if (ret) { - mutex_unlock(&conn_id->handler_mutex); - rdma_destroy_id(new_cm_id); - goto out; + mutex_unlock(&listen_id->handler_mutex); + destroy_id_handler_unlock(conn_id); + return ret; } conn_id->cm_id.iw = cm_id; @@ -2387,25 +2583,16 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr)); memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr)); - /* - * Protect against the user destroying conn_id from another thread - * until we're done accessing it. 
- */ - atomic_inc(&conn_id->refcount); ret = cma_cm_event_handler(conn_id, &event); if (ret) { /* User wants to destroy the CM ID */ conn_id->cm_id.iw = NULL; - cma_exch(conn_id, RDMA_CM_DESTROYING); - mutex_unlock(&conn_id->handler_mutex); mutex_unlock(&listen_id->handler_mutex); - cma_deref_id(conn_id); - rdma_destroy_id(&conn_id->id); + destroy_id_handler_unlock(conn_id); return ret; } mutex_unlock(&conn_id->handler_mutex); - cma_deref_id(conn_id); out: mutex_unlock(&listen_id->handler_mutex); @@ -2440,8 +2627,11 @@ static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) if (IS_ERR(id)) return PTR_ERR(id); + mutex_lock(&id_priv->qp_mutex); id->tos = id_priv->tos; id->tos_set = id_priv->tos_set; + mutex_unlock(&id_priv->qp_mutex); + id->afonly = id_priv->afonly; id_priv->cm_id.iw = id; memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv), @@ -2462,57 +2652,88 @@ static int cma_listen_handler(struct rdma_cm_id *id, { struct rdma_id_private *id_priv = id->context; + /* Listening IDs are always destroyed on removal */ + if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) + return -1; + id->context = id_priv->id.context; id->event_handler = id_priv->id.event_handler; trace_cm_event_handler(id_priv, event); return id_priv->id.event_handler(id, event); } -static void cma_listen_on_dev(struct rdma_id_private *id_priv, - struct cma_device *cma_dev) +static int cma_listen_on_dev(struct rdma_id_private *id_priv, + struct cma_device *cma_dev, + struct rdma_id_private **to_destroy) { struct rdma_id_private *dev_id_priv; - struct rdma_cm_id *id; struct net *net = id_priv->id.route.addr.dev_addr.net; int ret; - if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1)) - return; + lockdep_assert_held(&lock); - id = __rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps, - id_priv->id.qp_type, id_priv->res.kern_name); - if (IS_ERR(id)) - return; + *to_destroy = NULL; + if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1)) + return 0; - dev_id_priv = container_of(id, struct rdma_id_private, id); + dev_id_priv = + __rdma_create_id(net, cma_listen_handler, id_priv, + id_priv->id.ps, id_priv->id.qp_type, id_priv); + if (IS_ERR(dev_id_priv)) + return PTR_ERR(dev_id_priv); dev_id_priv->state = RDMA_CM_ADDR_BOUND; memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); _cma_attach_to_dev(dev_id_priv, cma_dev); - list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list); - atomic_inc(&id_priv->refcount); + rdma_restrack_add(&dev_id_priv->res); + cma_id_get(id_priv); dev_id_priv->internal_id = 1; dev_id_priv->afonly = id_priv->afonly; + mutex_lock(&id_priv->qp_mutex); dev_id_priv->tos_set = id_priv->tos_set; dev_id_priv->tos = id_priv->tos; + mutex_unlock(&id_priv->qp_mutex); - ret = rdma_listen(id, id_priv->backlog); + ret = rdma_listen(&dev_id_priv->id, id_priv->backlog); if (ret) - dev_warn(&cma_dev->device->dev, - "RDMA CMA: cma_listen_on_dev, error %d\n", ret); + goto err_listen; + list_add_tail(&dev_id_priv->listen_item, &id_priv->listen_list); + return 0; +err_listen: + /* Caller must destroy this after releasing lock */ + *to_destroy = dev_id_priv; + dev_warn(&cma_dev->device->dev, "RDMA CMA: %s, error %d\n", __func__, ret); + return ret; } -static void cma_listen_on_all(struct rdma_id_private *id_priv) +static int cma_listen_on_all(struct rdma_id_private *id_priv) { + struct rdma_id_private *to_destroy; struct cma_device *cma_dev; + int ret; mutex_lock(&lock); - list_add_tail(&id_priv->list, 
&listen_any_list); - list_for_each_entry(cma_dev, &dev_list, list) - cma_listen_on_dev(id_priv, cma_dev); + list_add_tail(&id_priv->listen_any_item, &listen_any_list); + list_for_each_entry(cma_dev, &dev_list, list) { + ret = cma_listen_on_dev(id_priv, cma_dev, &to_destroy); + if (ret) { + /* Prevent racing with cma_process_remove() */ + if (to_destroy) + list_del_init(&to_destroy->device_item); + goto err_listen; + } + } mutex_unlock(&lock); + return 0; + +err_listen: + _cma_cancel_listens(id_priv); + mutex_unlock(&lock); + if (to_destroy) + rdma_destroy_id(&to_destroy->id); + return ret; } void rdma_set_service_type(struct rdma_cm_id *id, int tos) @@ -2520,8 +2741,10 @@ void rdma_set_service_type(struct rdma_cm_id *id, int tos) struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); + mutex_lock(&id_priv->qp_mutex); id_priv->tos = (u8) tos; id_priv->tos_set = true; + mutex_unlock(&id_priv->qp_mutex); } EXPORT_SYMBOL(rdma_set_service_type); @@ -2544,37 +2767,124 @@ int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout) { struct rdma_id_private *id_priv; - if (id->qp_type != IB_QPT_RC) + if (id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_INI) return -EINVAL; id_priv = container_of(id, struct rdma_id_private, id); + mutex_lock(&id_priv->qp_mutex); id_priv->timeout = timeout; id_priv->timeout_set = true; + mutex_unlock(&id_priv->qp_mutex); return 0; } EXPORT_SYMBOL(rdma_set_ack_timeout); +/** + * rdma_set_min_rnr_timer() - Set the minimum RNR Retry timer of the + * QP associated with a connection identifier. + * @id: Communication identifier to associated with service type. + * @min_rnr_timer: 5-bit value encoded as Table 45: "Encoding for RNR NAK + * Timer Field" in the IBTA specification. + * + * This function should be called before rdma_connect() on active + * side, and on passive side before rdma_accept(). The timer value + * will be associated with the local QP. When it receives a send it is + * not read to handle, typically if the receive queue is empty, an RNR + * Retry NAK is returned to the requester with the min_rnr_timer + * encoded. The requester will then wait at least the time specified + * in the NAK before retrying. The default is zero, which translates + * to a minimum RNR Timer value of 655 ms. 
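As an aside on the two timer helpers touched here, a minimal sketch of the intended passive-side call order, assuming a hypothetical ULP that accepts from its connection-request callback (where the handler lock is already held), an RC QP, and <rdma/rdma_cm.h>; the ulp_ prefix and the chosen encodings are illustrative only, not part of this diff:

	/* Hypothetical ULP helper: tune RC timers, then accept. */
	static int ulp_tune_and_accept(struct rdma_cm_id *id,
				       struct rdma_conn_param *param)
	{
		int ret;

		/* 5-bit IBTA RNR NAK encoding; 0 keeps the 655 ms default. */
		ret = rdma_set_min_rnr_timer(id, 12);
		if (ret)
			return ret;

		/* Local ACK timeout exponent, also set before accept. */
		ret = rdma_set_ack_timeout(id, 14);
		if (ret)
			return ret;

		return rdma_accept(id, param);
	}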
+ * + * Return: 0 for success + */ +int rdma_set_min_rnr_timer(struct rdma_cm_id *id, u8 min_rnr_timer) +{ + struct rdma_id_private *id_priv; + + /* It is a five-bit value */ + if (min_rnr_timer & 0xe0) + return -EINVAL; + + if (WARN_ON(id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_TGT)) + return -EINVAL; + + id_priv = container_of(id, struct rdma_id_private, id); + mutex_lock(&id_priv->qp_mutex); + id_priv->min_rnr_timer = min_rnr_timer; + id_priv->min_rnr_timer_set = true; + mutex_unlock(&id_priv->qp_mutex); + + return 0; +} +EXPORT_SYMBOL(rdma_set_min_rnr_timer); + +static void route_set_path_rec_inbound(struct cma_work *work, + struct sa_path_rec *path_rec) +{ + struct rdma_route *route = &work->id->id.route; + + if (!route->path_rec_inbound) { + route->path_rec_inbound = + kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL); + if (!route->path_rec_inbound) + return; + } + + *route->path_rec_inbound = *path_rec; +} + +static void route_set_path_rec_outbound(struct cma_work *work, + struct sa_path_rec *path_rec) +{ + struct rdma_route *route = &work->id->id.route; + + if (!route->path_rec_outbound) { + route->path_rec_outbound = + kzalloc(sizeof(*route->path_rec_outbound), GFP_KERNEL); + if (!route->path_rec_outbound) + return; + } + + *route->path_rec_outbound = *path_rec; +} + static void cma_query_handler(int status, struct sa_path_rec *path_rec, - void *context) + int num_prs, void *context) { struct cma_work *work = context; struct rdma_route *route; + int i; route = &work->id->id.route; - if (!status) { - route->num_paths = 1; - *route->path_rec = *path_rec; - } else { - work->old_state = RDMA_CM_ROUTE_QUERY; - work->new_state = RDMA_CM_ADDR_RESOLVED; - work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; - work->event.status = status; - pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. status %d\n", - status); + if (status) + goto fail; + + for (i = 0; i < num_prs; i++) { + if (!path_rec[i].flags || (path_rec[i].flags & IB_PATH_GMP)) + *route->path_rec = path_rec[i]; + else if (path_rec[i].flags & IB_PATH_INBOUND) + route_set_path_rec_inbound(work, &path_rec[i]); + else if (path_rec[i].flags & IB_PATH_OUTBOUND) + route_set_path_rec_outbound(work, &path_rec[i]); + } + if (!route->path_rec) { + status = -EINVAL; + goto fail; } + route->num_pri_alt_paths = 1; + queue_work(cma_wq, &work->work); + return; + +fail: + work->old_state = RDMA_CM_ROUTE_QUERY; + work->new_state = RDMA_CM_ADDR_RESOLVED; + work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; + work->event.status = status; + pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. status %d\n", + status); queue_work(cma_wq, &work->work); } @@ -2631,49 +2941,54 @@ static int cma_query_ib_route(struct rdma_id_private *id_priv, return (id_priv->query_id < 0) ? 
id_priv->query_id : 0; } -static void cma_work_handler(struct work_struct *_work) +static void cma_iboe_join_work_handler(struct work_struct *work) { - struct cma_work *work = container_of(_work, struct cma_work, work); - struct rdma_id_private *id_priv = work->id; - int destroy = 0; + struct cma_multicast *mc = + container_of(work, struct cma_multicast, iboe_join.work); + struct rdma_cm_event *event = &mc->iboe_join.event; + struct rdma_id_private *id_priv = mc->id_priv; + int ret; mutex_lock(&id_priv->handler_mutex); - if (!cma_comp_exch(id_priv, work->old_state, work->new_state)) - goto out; + if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || + READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) + goto out_unlock; - if (cma_cm_event_handler(id_priv, &work->event)) { - cma_exch(id_priv, RDMA_CM_DESTROYING); - destroy = 1; - } -out: + ret = cma_cm_event_handler(id_priv, event); + WARN_ON(ret); + +out_unlock: mutex_unlock(&id_priv->handler_mutex); - cma_deref_id(id_priv); - if (destroy) - rdma_destroy_id(&id_priv->id); - kfree(work); + if (event->event == RDMA_CM_EVENT_MULTICAST_JOIN) + rdma_destroy_ah_attr(&event->param.ud.ah_attr); } -static void cma_ndev_work_handler(struct work_struct *_work) +static void cma_work_handler(struct work_struct *_work) { - struct cma_ndev_work *work = container_of(_work, struct cma_ndev_work, work); + struct cma_work *work = container_of(_work, struct cma_work, work); struct rdma_id_private *id_priv = work->id; - int destroy = 0; mutex_lock(&id_priv->handler_mutex); - if (id_priv->state == RDMA_CM_DESTROYING || - id_priv->state == RDMA_CM_DEVICE_REMOVAL) - goto out; + if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || + READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) + goto out_unlock; + if (work->old_state != 0 || work->new_state != 0) { + if (!cma_comp_exch(id_priv, work->old_state, work->new_state)) + goto out_unlock; + } if (cma_cm_event_handler(id_priv, &work->event)) { - cma_exch(id_priv, RDMA_CM_DESTROYING); - destroy = 1; + cma_id_put(id_priv); + destroy_id_handler_unlock(id_priv); + goto out_free; } -out: +out_unlock: mutex_unlock(&id_priv->handler_mutex); - cma_deref_id(id_priv); - if (destroy) - rdma_destroy_id(&id_priv->id); + cma_id_put(id_priv); +out_free: + if (work->event.event == RDMA_CM_EVENT_MULTICAST_JOIN) + rdma_destroy_ah_attr(&work->event.param.ud.ah_attr); kfree(work); } @@ -2687,14 +3002,19 @@ static void cma_init_resolve_route_work(struct cma_work *work, work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; } -static void cma_init_resolve_addr_work(struct cma_work *work, - struct rdma_id_private *id_priv) +static void enqueue_resolve_addr_work(struct cma_work *work, + struct rdma_id_private *id_priv) { + /* Balances with cma_id_put() in cma_work_handler */ + cma_id_get(id_priv); + work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ADDR_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; + + queue_work(cma_wq, &work->work); } static int cma_resolve_ib_route(struct rdma_id_private *id_priv, @@ -2710,7 +3030,8 @@ static int cma_resolve_ib_route(struct rdma_id_private *id_priv, cma_init_resolve_route_work(work, id_priv); - route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL); + if (!route->path_rec) + route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; @@ -2808,7 +3129,7 @@ int rdma_set_ib_path(struct rdma_cm_id *id, dev_put(ndev); } - id->route.num_paths = 1; + 
id->route.num_pri_alt_paths = 1; return 0; err_free: @@ -2851,9 +3172,10 @@ struct iboe_prio_tc_map { bool found; }; -static int get_lower_vlan_dev_tc(struct net_device *dev, void *data) +static int get_lower_vlan_dev_tc(struct net_device *dev, + struct netdev_nested_priv *priv) { - struct iboe_prio_tc_map *map = data; + struct iboe_prio_tc_map *map = (struct iboe_prio_tc_map *)priv->data; if (is_vlan_dev(dev)) map->output_tc = get_vlan_ndev_tc(dev, map->input_prio); @@ -2872,16 +3194,18 @@ static int iboe_tos_to_sl(struct net_device *ndev, int tos) { struct iboe_prio_tc_map prio_tc_map = {}; int prio = rt_tos2priority(tos); + struct netdev_nested_priv priv; /* If VLAN device, get it directly from the VLAN netdev */ if (is_vlan_dev(ndev)) return get_vlan_ndev_tc(ndev, prio); prio_tc_map.input_prio = prio; + priv.data = (void *)&prio_tc_map; rcu_read_lock(); netdev_walk_all_lower_dev_rcu(ndev, get_lower_vlan_dev_tc, - &prio_tc_map); + &priv); rcu_read_unlock(); /* If map is found from lower device, use it; Otherwise * continue with the current netdevice to get priority to tc map. @@ -2894,6 +3218,24 @@ static int iboe_tos_to_sl(struct net_device *ndev, int tos) return 0; } +static __be32 cma_get_roce_udp_flow_label(struct rdma_id_private *id_priv) +{ + struct sockaddr_in6 *addr6; + u16 dport, sport; + u32 hash, fl; + + addr6 = (struct sockaddr_in6 *)cma_src_addr(id_priv); + fl = be32_to_cpu(addr6->sin6_flowinfo) & IB_GRH_FLOWLABEL_MASK; + if ((cma_family(id_priv) != AF_INET6) || !fl) { + dport = be16_to_cpu(cma_port(cma_dst_addr(id_priv))); + sport = be16_to_cpu(cma_port(cma_src_addr(id_priv))); + hash = (u32)sport * 31 + dport; + fl = hash & IB_GRH_FLOWLABEL_MASK; + } + + return cpu_to_be32(fl); +} + static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; @@ -2904,8 +3246,11 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) u8 default_roce_tos = id_priv->cma_dev->default_roce_tos[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; - u8 tos = id_priv->tos_set ? id_priv->tos : default_roce_tos; + u8 tos; + mutex_lock(&id_priv->qp_mutex); + tos = id_priv->tos_set ? id_priv->tos : default_roce_tos; + mutex_unlock(&id_priv->qp_mutex); work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) @@ -2917,7 +3262,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) goto err1; } - route->num_paths = 1; + route->num_pri_alt_paths = 1; ndev = cma_iboe_set_path_rec_l2_fields(id_priv); if (!ndev) { @@ -2952,14 +3297,23 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) * PacketLifeTime = local ACK timeout/2 * as a reasonable approximation for RoCE networks. */ - route->path_rec->packet_life_time = id_priv->timeout_set ? 
- id_priv->timeout - 1 : CMA_IBOE_PACKET_LIFETIME; + mutex_lock(&id_priv->qp_mutex); + if (id_priv->timeout_set && id_priv->timeout) + route->path_rec->packet_life_time = id_priv->timeout - 1; + else + route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME; + mutex_unlock(&id_priv->qp_mutex); if (!route->path_rec->mtu) { ret = -EINVAL; goto err2; } + if (rdma_protocol_roce_udp_encap(id_priv->id.device, + id_priv->id.port_num)) + route->path_rec->flow_label = + cma_get_roce_udp_flow_label(id_priv); + cma_init_resolve_route_work(work, id_priv); queue_work(cma_wq, &work->work); @@ -2968,6 +3322,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) err2: kfree(route->path_rec); route->path_rec = NULL; + route->num_pri_alt_paths = 0; err1: kfree(work); return ret; @@ -2978,15 +3333,21 @@ int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms) struct rdma_id_private *id_priv; int ret; + if (!timeout_ms) + return -EINVAL; + id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY)) return -EINVAL; - atomic_inc(&id_priv->refcount); + cma_id_get(id_priv); if (rdma_cap_ib_sa(id->device, id->port_num)) ret = cma_resolve_ib_route(id_priv, timeout_ms); - else if (rdma_protocol_roce(id->device, id->port_num)) + else if (rdma_protocol_roce(id->device, id->port_num)) { ret = cma_resolve_iboe_route(id_priv); + if (!ret) + cma_add_id_to_tree(id_priv); + } else if (rdma_protocol_iwarp(id->device, id->port_num)) ret = cma_resolve_iw_route(id_priv); else @@ -2998,7 +3359,7 @@ int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms) return 0; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED); - cma_deref_id(id_priv); + cma_id_put(id_priv); return ret; } EXPORT_SYMBOL(rdma_resolve_route); @@ -3025,9 +3386,9 @@ static int cma_bind_loopback(struct rdma_id_private *id_priv) struct cma_device *cma_dev, *cur_dev; union ib_gid gid; enum ib_port_state port_state; + unsigned int p; u16 pkey; int ret; - u8 p; cma_dev = NULL; mutex_lock(&lock); @@ -3039,7 +3400,7 @@ static int cma_bind_loopback(struct rdma_id_private *id_priv) if (!cma_dev) cma_dev = cur_dev; - for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) { + rdma_for_each_port (cur_dev->device, p) { if (!ib_get_cached_port_state(cur_dev->device, p, &port_state) && port_state == IB_PORT_ACTIVE) { cma_dev = cur_dev; @@ -3072,6 +3433,7 @@ port_found: ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey); id_priv->id.port_num = p; cma_attach_to_dev(id_priv, cma_dev); + rdma_restrack_add(&id_priv->res); cma_set_loopback(cma_src_addr(id_priv)); out: mutex_unlock(&lock); @@ -3104,6 +3466,7 @@ static void addr_handler(int status, struct sockaddr *src_addr, if (status) pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to acquire device. status %d\n", status); + rdma_restrack_add(&id_priv->res); } else if (status) { pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to resolve IP. 
status %d\n", status); } @@ -3120,9 +3483,7 @@ static void addr_handler(int status, struct sockaddr *src_addr, event.event = RDMA_CM_EVENT_ADDR_RESOLVED; if (cma_cm_event_handler(id_priv, &event)) { - cma_exch(id_priv, RDMA_CM_DESTROYING); - mutex_unlock(&id_priv->handler_mutex); - rdma_destroy_id(&id_priv->id); + destroy_id_handler_unlock(id_priv); return; } out: @@ -3148,9 +3509,7 @@ static int cma_resolve_loopback(struct rdma_id_private *id_priv) rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); - atomic_inc(&id_priv->refcount); - cma_init_resolve_addr_work(work, id_priv); - queue_work(cma_wq, &work->work); + enqueue_resolve_addr_work(work, id_priv); return 0; err: kfree(work); @@ -3175,9 +3534,7 @@ static int cma_resolve_ib_addr(struct rdma_id_private *id_priv) rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *) &(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr)); - atomic_inc(&id_priv->refcount); - cma_init_resolve_addr_work(work, id_priv); - queue_work(cma_wq, &work->work); + enqueue_resolve_addr_work(work, id_priv); return 0; err: kfree(work); @@ -3187,50 +3544,80 @@ err: static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, const struct sockaddr *dst_addr) { - if (!src_addr || !src_addr->sa_family) { - src_addr = (struct sockaddr *) &id->route.addr.src_addr; - src_addr->sa_family = dst_addr->sa_family; - if (IS_ENABLED(CONFIG_IPV6) && - dst_addr->sa_family == AF_INET6) { - struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *) src_addr; - struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *) dst_addr; - src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id; - if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL) - id->route.addr.dev_addr.bound_dev_if = dst_addr6->sin6_scope_id; - } else if (dst_addr->sa_family == AF_IB) { - ((struct sockaddr_ib *) src_addr)->sib_pkey = - ((struct sockaddr_ib *) dst_addr)->sib_pkey; - } + struct sockaddr_storage zero_sock = {}; + + if (src_addr && src_addr->sa_family) + return rdma_bind_addr(id, src_addr); + + /* + * When the src_addr is not specified, automatically supply an any addr + */ + zero_sock.ss_family = dst_addr->sa_family; + if (IS_ENABLED(CONFIG_IPV6) && dst_addr->sa_family == AF_INET6) { + struct sockaddr_in6 *src_addr6 = + (struct sockaddr_in6 *)&zero_sock; + struct sockaddr_in6 *dst_addr6 = + (struct sockaddr_in6 *)dst_addr; + + src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id; + if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + id->route.addr.dev_addr.bound_dev_if = + dst_addr6->sin6_scope_id; + } else if (dst_addr->sa_family == AF_IB) { + ((struct sockaddr_ib *)&zero_sock)->sib_pkey = + ((struct sockaddr_ib *)dst_addr)->sib_pkey; } - return rdma_bind_addr(id, src_addr); + return rdma_bind_addr(id, (struct sockaddr *)&zero_sock); } -int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, - const struct sockaddr *dst_addr, unsigned long timeout_ms) +/* + * If required, resolve the source address for bind and leave the id_priv in + * state RDMA_CM_ADDR_BOUND. This oddly uses the state to determine the prior + * calls made by ULP, a previously bound ID will not be re-bound and src_addr is + * ignored. 
+ */ +static int resolve_prepare_src(struct rdma_id_private *id_priv, + struct sockaddr *src_addr, + const struct sockaddr *dst_addr) { - struct rdma_id_private *id_priv; int ret; - id_priv = container_of(id, struct rdma_id_private, id); memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); - if (id_priv->state == RDMA_CM_IDLE) { - ret = cma_bind_addr(id, src_addr, dst_addr); - if (ret) { - memset(cma_dst_addr(id_priv), 0, - rdma_addr_size(dst_addr)); - return ret; + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) { + /* For a well behaved ULP state will be RDMA_CM_IDLE */ + ret = cma_bind_addr(&id_priv->id, src_addr, dst_addr); + if (ret) + goto err_dst; + if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, + RDMA_CM_ADDR_QUERY))) { + ret = -EINVAL; + goto err_dst; } } if (cma_family(id_priv) != dst_addr->sa_family) { - memset(cma_dst_addr(id_priv), 0, rdma_addr_size(dst_addr)); - return -EINVAL; + ret = -EINVAL; + goto err_state; } + return 0; - if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) { - memset(cma_dst_addr(id_priv), 0, rdma_addr_size(dst_addr)); - return -EINVAL; - } +err_state: + cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); +err_dst: + memset(cma_dst_addr(id_priv), 0, rdma_addr_size(dst_addr)); + return ret; +} + +int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + const struct sockaddr *dst_addr, unsigned long timeout_ms) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + int ret; + + ret = resolve_prepare_src(id_priv, src_addr, dst_addr); + if (ret) + return ret; if (cma_any_addr(dst_addr)) { ret = cma_resolve_loopback(id_priv); @@ -3238,6 +3625,21 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, if (dst_addr->sa_family == AF_IB) { ret = cma_resolve_ib_addr(id_priv); } else { + /* + * The FSM can return back to RDMA_CM_ADDR_BOUND after + * rdma_resolve_ip() is called, eg through the error + * path in addr_handler(). If this happens the existing + * request must be canceled before issuing a new one. + * Since canceling a request is a bit slow and this + * oddball path is rare, keep track once a request has + * been issued. The track turns out to be a permanent + * state since this is the only cancel as it is + * immediately before rdma_resolve_ip(). 
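For context, a sketch of the usual active-side sequence that reaches this code, with src_addr left NULL so the bind falls through to the wildcard-address path described above; ulp_start_connect(), ulp_cm_handler() and ulp_ctx are assumed names for illustration, not part of this diff:

	/* Hypothetical ULP fragment: active-side address resolution. */
	static int ulp_cm_handler(struct rdma_cm_id *id,
				  struct rdma_cm_event *event);

	static int ulp_start_connect(struct sockaddr *dst, void *ulp_ctx)
	{
		struct rdma_cm_id *id;
		int ret;

		id = rdma_create_id(&init_net, ulp_cm_handler, ulp_ctx,
				    RDMA_PS_TCP, IB_QPT_RC);
		if (IS_ERR(id))
			return PTR_ERR(id);

		/* NULL src_addr: the CM binds to an any-address itself. */
		ret = rdma_resolve_addr(id, NULL, dst, 2000 /* ms */);
		if (ret)
			rdma_destroy_id(id);
		/*
		 * Otherwise completion is asynchronous: the handler sees
		 * RDMA_CM_EVENT_ADDR_RESOLVED (or ADDR_ERROR) and normally
		 * calls rdma_resolve_route() next.
		 */
		return ret;
	}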
+ */ + if (id_priv->used_resolve_ip) + rdma_addr_cancel(&id->route.addr.dev_addr); + else + id_priv->used_resolve_ip = 1; ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr, &id->route.addr.dev_addr, timeout_ms, addr_handler, @@ -3262,7 +3664,8 @@ int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse) id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); - if (reuse || id_priv->state == RDMA_CM_IDLE) { + if ((reuse && id_priv->state != RDMA_CM_LISTEN) || + id_priv->state == RDMA_CM_IDLE) { id_priv->reuseaddr = reuse; ret = 0; } else { @@ -3301,6 +3704,8 @@ static void cma_bind_port(struct rdma_bind_list *bind_list, u64 sid, mask; __be16 port; + lockdep_assert_held(&lock); + addr = cma_src_addr(id_priv); port = htons(bind_list->port); @@ -3329,6 +3734,8 @@ static int cma_alloc_port(enum rdma_ucm_port_space ps, struct rdma_bind_list *bind_list; int ret; + lockdep_assert_held(&lock); + bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); if (!bind_list) return -ENOMEM; @@ -3355,6 +3762,8 @@ static int cma_port_is_unique(struct rdma_bind_list *bind_list, struct sockaddr *saddr = cma_src_addr(id_priv); __be16 dport = cma_port(daddr); + lockdep_assert_held(&lock); + hlist_for_each_entry(cur_id, &bind_list->owners, node) { struct sockaddr *cur_daddr = cma_dst_addr(cur_id); struct sockaddr *cur_saddr = cma_src_addr(cur_id); @@ -3394,9 +3803,11 @@ static int cma_alloc_any_port(enum rdma_ucm_port_space ps, unsigned int rover; struct net *net = id_priv->id.route.addr.dev_addr.net; + lockdep_assert_held(&lock); + inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; - rover = prandom_u32() % remaining + low; + rover = prandom_u32_max(remaining) + low; retry: if (last_used_port != rover) { struct rdma_bind_list *bind_list; @@ -3441,13 +3852,14 @@ static int cma_check_port(struct rdma_bind_list *bind_list, struct rdma_id_private *cur_id; struct sockaddr *addr, *cur_addr; + lockdep_assert_held(&lock); + addr = cma_src_addr(id_priv); hlist_for_each_entry(cur_id, &bind_list->owners, node) { if (id_priv == cur_id) continue; - if ((cur_id->state != RDMA_CM_LISTEN) && reuseaddr && - cur_id->reuseaddr) + if (reuseaddr && cur_id->reuseaddr) continue; cur_addr = cma_src_addr(cur_id); @@ -3471,6 +3883,8 @@ static int cma_use_port(enum rdma_ucm_port_space ps, unsigned short snum; int ret; + lockdep_assert_held(&lock); + snum = ntohs(cma_port(cma_src_addr(id_priv))); if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) return -EACCES; @@ -3486,18 +3900,6 @@ static int cma_use_port(enum rdma_ucm_port_space ps, return ret; } -static int cma_bind_listen(struct rdma_id_private *id_priv) -{ - struct rdma_bind_list *bind_list = id_priv->bind_list; - int ret = 0; - - mutex_lock(&lock); - if (bind_list->owners.first->next) - ret = cma_check_port(bind_list, id_priv, 0); - mutex_unlock(&lock); - return ret; -} - static enum rdma_ucm_port_space cma_select_inet_ps(struct rdma_id_private *id_priv) { @@ -3591,28 +3993,41 @@ static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, int rdma_listen(struct rdma_cm_id *id, int backlog) { - struct rdma_id_private *id_priv; + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); int ret; - id_priv = container_of(id, struct rdma_id_private, id); - if (id_priv->state == RDMA_CM_IDLE) { - id->route.addr.src_addr.ss_family = AF_INET; - ret = rdma_bind_addr(id, cma_src_addr(id_priv)); + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) { + struct sockaddr_in any_in = { + 
.sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_ANY), + }; + + /* For a well behaved ULP state will be RDMA_CM_IDLE */ + ret = rdma_bind_addr(id, (struct sockaddr *)&any_in); if (ret) return ret; + if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, + RDMA_CM_LISTEN))) + return -EINVAL; } - if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) - return -EINVAL; - + /* + * Once the ID reaches RDMA_CM_LISTEN it is not allowed to be reusable + * any more, and has to be unique in the bind list. + */ if (id_priv->reuseaddr) { - ret = cma_bind_listen(id_priv); + mutex_lock(&lock); + ret = cma_check_port(id_priv->bind_list, id_priv, 0); + if (!ret) + id_priv->reuseaddr = 0; + mutex_unlock(&lock); if (ret) goto err; } id_priv->backlog = backlog; - if (id->device) { + if (id_priv->cma_dev) { if (rdma_cap_ib_cm(id->device, 1)) { ret = cma_ib_listen(id_priv); if (ret) @@ -3625,12 +4040,19 @@ int rdma_listen(struct rdma_cm_id *id, int backlog) ret = -ENOSYS; goto err; } - } else - cma_listen_on_all(id_priv); + } else { + ret = cma_listen_on_all(id_priv); + if (ret) + goto err; + } return 0; err: id_priv->backlog = 0; + /* + * All the failure paths that lead here will not allow the req_handler's + * to have run. + */ cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND); return ret; } @@ -3683,9 +4105,10 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) if (ret) goto err2; + if (!cma_any_addr(addr)) + rdma_restrack_add(&id_priv->res); return 0; err2: - rdma_restrack_del(&id_priv->res); if (id_priv->cma_dev) cma_release_dev(id_priv); err1: @@ -3731,10 +4154,10 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, struct rdma_cm_event event = {}; const struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd; - int ret = 0; + int ret; mutex_lock(&id_priv->handler_mutex); - if (id_priv->state != RDMA_CM_CONNECT) + if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) goto out; switch (ib_event->event) { @@ -3781,14 +4204,12 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, if (ret) { /* Destroy the CM ID by returning a non-zero value. 
*/ id_priv->cm_id.ib = NULL; - cma_exch(id_priv, RDMA_CM_DESTROYING); - mutex_unlock(&id_priv->handler_mutex); - rdma_destroy_id(&id_priv->id); + destroy_id_handler_unlock(id_priv); return ret; } out: mutex_unlock(&id_priv->handler_mutex); - return ret; + return 0; } static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, @@ -3802,8 +4223,7 @@ static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, memset(&req, 0, sizeof req); offset = cma_user_data_offset(id_priv); - req.private_data_len = offset + conn_param->private_data_len; - if (req.private_data_len < conn_param->private_data_len) + if (check_add_overflow(offset, conn_param->private_data_len, &req.private_data_len)) return -EINVAL; if (req.private_data_len) { @@ -3862,8 +4282,7 @@ static int cma_connect_ib(struct rdma_id_private *id_priv, memset(&req, 0, sizeof req); offset = cma_user_data_offset(id_priv); - req.private_data_len = offset + conn_param->private_data_len; - if (req.private_data_len < conn_param->private_data_len) + if (check_add_overflow(offset, conn_param->private_data_len, &req.private_data_len)) return -EINVAL; if (req.private_data_len) { @@ -3894,7 +4313,9 @@ static int cma_connect_ib(struct rdma_id_private *id_priv, } req.primary_path = &route->path_rec[0]; - if (route->num_paths == 2) + req.primary_path_inbound = route->path_rec_inbound; + req.primary_path_outbound = route->path_rec_outbound; + if (route->num_pri_alt_paths == 2) req.alternate_path = &route->path_rec[1]; req.ppath_sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr; @@ -3912,6 +4333,8 @@ static int cma_connect_ib(struct rdma_id_private *id_priv, req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; req.max_cm_retries = CMA_MAX_CM_RETRIES; req.srq = id_priv->srq ? 1 : 0; + req.ece.vendor_id = id_priv->ece.vendor_id; + req.ece.attr_mod = id_priv->ece.attr_mod; trace_cm_send_req(id_priv); ret = ib_send_cm_req(id_priv->cm_id.ib, &req); @@ -3936,8 +4359,11 @@ static int cma_connect_iw(struct rdma_id_private *id_priv, if (IS_ERR(cm_id)) return PTR_ERR(cm_id); + mutex_lock(&id_priv->qp_mutex); cm_id->tos = id_priv->tos; cm_id->tos_set = id_priv->tos_set; + mutex_unlock(&id_priv->qp_mutex); + id_priv->cm_id.iw = cm_id; memcpy(&cm_id->local_addr, cma_src_addr(id_priv), @@ -3968,12 +4394,21 @@ out: return ret; } -int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) +/** + * rdma_connect_locked - Initiate an active connection request. + * @id: Connection identifier to connect. + * @conn_param: Connection information used for connected QPs. + * + * Same as rdma_connect() but can only be called from the + * RDMA_CM_EVENT_ROUTE_RESOLVED handler callback. 
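To show the distinction this kdoc draws, a sketch of the hypothetical ulp_cm_handler() referenced above connecting straight from the RDMA_CM_EVENT_ROUTE_RESOLVED callback; QP setup is omitted and the conn_param values are illustrative:

	/* The active-side arm of the hypothetical ulp_cm_handler(). */
	static int ulp_cm_handler(struct rdma_cm_id *id,
				  struct rdma_cm_event *event)
	{
		struct rdma_conn_param param = {
			.responder_resources = 1,
			.initiator_depth = 1,
			.retry_count = 7,
			.rnr_retry_count = 7,
		};

		switch (event->event) {
		case RDMA_CM_EVENT_ADDR_RESOLVED:
			return rdma_resolve_route(id, 2000 /* ms */);
		case RDMA_CM_EVENT_ROUTE_RESOLVED:
			/*
			 * handler_mutex is already held inside the callback,
			 * so the _locked variant is required; plain
			 * rdma_connect() would try to re-take it.  A non-zero
			 * return asks the core to destroy the id.
			 */
			return rdma_connect_locked(id, &param);
		default:
			return 0;
		}
	}

Outside the callback, from ordinary process context, the plain rdma_connect() wrapper defined just below is the one to use.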
+ */ +int rdma_connect_locked(struct rdma_cm_id *id, + struct rdma_conn_param *conn_param) { - struct rdma_id_private *id_priv; + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); int ret; - id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT)) return -EINVAL; @@ -3987,20 +4422,66 @@ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) ret = cma_resolve_ib_udp(id_priv, conn_param); else ret = cma_connect_ib(id_priv, conn_param); - } else if (rdma_cap_iw_cm(id->device, id->port_num)) + } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = cma_connect_iw(id_priv, conn_param); - else + } else { ret = -ENOSYS; + } if (ret) - goto err; - + goto err_state; return 0; -err: +err_state: cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED); return ret; } +EXPORT_SYMBOL(rdma_connect_locked); + +/** + * rdma_connect - Initiate an active connection request. + * @id: Connection identifier to connect. + * @conn_param: Connection information used for connected QPs. + * + * Users must have resolved a route for the rdma_cm_id to connect with by having + * called rdma_resolve_route before calling this routine. + * + * This call will either connect to a remote QP or obtain remote QP information + * for unconnected rdma_cm_id's. The actual operation is based on the + * rdma_cm_id's port space. + */ +int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + int ret; + + mutex_lock(&id_priv->handler_mutex); + ret = rdma_connect_locked(id, conn_param); + mutex_unlock(&id_priv->handler_mutex); + return ret; +} EXPORT_SYMBOL(rdma_connect); +/** + * rdma_connect_ece - Initiate an active connection request with ECE data. + * @id: Connection identifier to connect. + * @conn_param: Connection information used for connected QPs. + * @ece: ECE parameters + * + * See rdma_connect() explanation. + */ +int rdma_connect_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, + struct rdma_ucm_ece *ece) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + id_priv->ece.vendor_id = ece->vendor_id; + id_priv->ece.attr_mod = ece->attr_mod; + + return rdma_connect(id, conn_param); +} +EXPORT_SYMBOL(rdma_connect_ece); + static int cma_accept_ib(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { @@ -4026,6 +4507,8 @@ static int cma_accept_ib(struct rdma_id_private *id_priv, rep.flow_control = conn_param->flow_control; rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); rep.srq = id_priv->srq ? 
1 : 0; + rep.ece.vendor_id = id_priv->ece.vendor_id; + rep.ece.attr_mod = id_priv->ece.attr_mod; trace_cm_send_rep(id_priv); ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep); @@ -4050,9 +4533,9 @@ static int cma_accept_iw(struct rdma_id_private *id_priv, iw_param.ird = conn_param->responder_resources; iw_param.private_data = conn_param->private_data; iw_param.private_data_len = conn_param->private_data_len; - if (id_priv->id.qp) { + if (id_priv->id.qp) iw_param.qpn = id_priv->qp_num; - } else + else iw_param.qpn = conn_param->qp_num; return iw_cm_accept(id_priv->cm_id.iw, &iw_param); @@ -4073,7 +4556,11 @@ static int cma_send_sidr_rep(struct rdma_id_private *id_priv, return ret; rep.qp_num = id_priv->qp_num; rep.qkey = id_priv->qkey; + + rep.ece.vendor_id = id_priv->ece.vendor_id; + rep.ece.attr_mod = id_priv->ece.attr_mod; } + rep.private_data = private_data; rep.private_data_len = private_data_len; @@ -4081,17 +4568,33 @@ static int cma_send_sidr_rep(struct rdma_id_private *id_priv, return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); } -int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, - const char *caller) +/** + * rdma_accept - Called to accept a connection request or response. + * @id: Connection identifier associated with the request. + * @conn_param: Information needed to establish the connection. This must be + * provided if accepting a connection request. If accepting a connection + * response, this parameter must be NULL. + * + * Typically, this routine is only called by the listener to accept a connection + * request. It must also be called on the active side of a connection if the + * user is performing their own QP transitions. + * + * In the case of error, a reject message is sent to the remote side and the + * state of the qp associated with the id is modified to error, such that any + * previously posted receive buffers would be flushed. + * + * This function is for use by kernel ULPs and must be called from under the + * handler callback. 
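A sketch of the two usage patterns this comment allows for a hypothetical listener ULP: accepting directly inside the RDMA_CM_EVENT_CONNECT_REQUEST callback, or deferring the accept to process context under rdma_lock_handler()/rdma_unlock_handler() (added further down in this hunk); the ulp_ names are assumptions:

	/* Hypothetical listener: accept inside the request callback... */
	static int ulp_listen_handler(struct rdma_cm_id *id,
				      struct rdma_cm_event *event)
	{
		struct rdma_conn_param param = {
			.responder_resources =
				event->param.conn.responder_resources,
			.initiator_depth = event->param.conn.initiator_depth,
		};

		if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST)
			return rdma_accept(id, &param); /* handler lock held */
		return 0;
	}

	/* ...or defer it and re-take the handler lock from process context. */
	static int ulp_deferred_accept(struct rdma_cm_id *id,
				       struct rdma_conn_param *param)
	{
		int ret;

		rdma_lock_handler(id);
		ret = rdma_accept(id, param);
		rdma_unlock_handler(id);
		return ret;
	}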
+ */ +int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { - struct rdma_id_private *id_priv; + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); int ret; - id_priv = container_of(id, struct rdma_id_private, id); - - rdma_restrack_set_task(&id_priv->res, caller); + lockdep_assert_held(&id_priv->handler_mutex); - if (!cma_comp(id_priv, RDMA_CM_CONNECT)) + if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) return -EINVAL; if (!id->qp && conn_param) { @@ -4115,21 +4618,52 @@ int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, else ret = cma_rep_recv(id_priv); } - } else if (rdma_cap_iw_cm(id->device, id->port_num)) + } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = cma_accept_iw(id_priv, conn_param); - else + } else { ret = -ENOSYS; - + } if (ret) goto reject; return 0; reject: cma_modify_qp_err(id_priv); - rdma_reject(id, NULL, 0); + rdma_reject(id, NULL, 0, IB_CM_REJ_CONSUMER_DEFINED); return ret; } -EXPORT_SYMBOL(__rdma_accept); +EXPORT_SYMBOL(rdma_accept); + +int rdma_accept_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, + struct rdma_ucm_ece *ece) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + id_priv->ece.vendor_id = ece->vendor_id; + id_priv->ece.attr_mod = ece->attr_mod; + + return rdma_accept(id, conn_param); +} +EXPORT_SYMBOL(rdma_accept_ece); + +void rdma_lock_handler(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + mutex_lock(&id_priv->handler_mutex); +} +EXPORT_SYMBOL(rdma_lock_handler); + +void rdma_unlock_handler(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + mutex_unlock(&id_priv->handler_mutex); +} +EXPORT_SYMBOL(rdma_unlock_handler); int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) { @@ -4153,7 +4687,7 @@ int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) EXPORT_SYMBOL(rdma_notify); int rdma_reject(struct rdma_cm_id *id, const void *private_data, - u8 private_data_len) + u8 private_data_len, u8 reason) { struct rdma_id_private *id_priv; int ret; @@ -4168,15 +4702,15 @@ int rdma_reject(struct rdma_cm_id *id, const void *private_data, private_data, private_data_len); } else { trace_cm_send_rej(id_priv); - ret = ib_send_cm_rej(id_priv->cm_id.ib, - IB_CM_REJ_CONSUMER_DEFINED, NULL, - 0, private_data, private_data_len); + ret = ib_send_cm_rej(id_priv->cm_id.ib, reason, NULL, 0, + private_data, private_data_len); } } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_reject(id_priv->cm_id.iw, private_data, private_data_len); - } else + } else { ret = -ENOSYS; + } return ret; } @@ -4213,70 +4747,68 @@ out: } EXPORT_SYMBOL(rdma_disconnect); -static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) +static void cma_make_mc_event(int status, struct rdma_id_private *id_priv, + struct ib_sa_multicast *multicast, + struct rdma_cm_event *event, + struct cma_multicast *mc) { - struct rdma_id_private *id_priv; - struct cma_multicast *mc = multicast->context; - struct rdma_cm_event event = {}; - int ret = 0; - - id_priv = mc->id_priv; - mutex_lock(&id_priv->handler_mutex); - if (id_priv->state != RDMA_CM_ADDR_BOUND && - id_priv->state != RDMA_CM_ADDR_RESOLVED) - goto out; + struct rdma_dev_addr *dev_addr; + enum ib_gid_type gid_type; + struct net_device *ndev; if (!status) status = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey)); else 
pr_debug_ratelimited("RDMA CM: MULTICAST_ERROR: failed to join multicast. status %d\n", status); - mutex_lock(&id_priv->qp_mutex); - if (!status && id_priv->id.qp) { - status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid, - be16_to_cpu(multicast->rec.mlid)); - if (status) - pr_debug_ratelimited("RDMA CM: MULTICAST_ERROR: failed to attach QP. status %d\n", - status); + + event->status = status; + event->param.ud.private_data = mc->context; + if (status) { + event->event = RDMA_CM_EVENT_MULTICAST_ERROR; + return; } - mutex_unlock(&id_priv->qp_mutex); - event.status = status; - event.param.ud.private_data = mc->context; - if (!status) { - struct rdma_dev_addr *dev_addr = - &id_priv->id.route.addr.dev_addr; - struct net_device *ndev = - dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); - enum ib_gid_type gid_type = - id_priv->cma_dev->default_gid_type[id_priv->id.port_num - - rdma_start_port(id_priv->cma_dev->device)]; - - event.event = RDMA_CM_EVENT_MULTICAST_JOIN; - ret = ib_init_ah_from_mcmember(id_priv->id.device, - id_priv->id.port_num, - &multicast->rec, - ndev, gid_type, - &event.param.ud.ah_attr); - if (ret) - event.event = RDMA_CM_EVENT_MULTICAST_ERROR; + dev_addr = &id_priv->id.route.addr.dev_addr; + ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); + gid_type = + id_priv->cma_dev + ->default_gid_type[id_priv->id.port_num - + rdma_start_port( + id_priv->cma_dev->device)]; + + event->event = RDMA_CM_EVENT_MULTICAST_JOIN; + if (ib_init_ah_from_mcmember(id_priv->id.device, id_priv->id.port_num, + &multicast->rec, ndev, gid_type, + &event->param.ud.ah_attr)) { + event->event = RDMA_CM_EVENT_MULTICAST_ERROR; + goto out; + } - event.param.ud.qp_num = 0xFFFFFF; - event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey); - if (ndev) - dev_put(ndev); - } else - event.event = RDMA_CM_EVENT_MULTICAST_ERROR; + event->param.ud.qp_num = 0xFFFFFF; + event->param.ud.qkey = be32_to_cpu(multicast->rec.qkey); - ret = cma_cm_event_handler(id_priv, &event); +out: + if (ndev) + dev_put(ndev); +} +static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) +{ + struct cma_multicast *mc = multicast->context; + struct rdma_id_private *id_priv = mc->id_priv; + struct rdma_cm_event event = {}; + int ret = 0; + + mutex_lock(&id_priv->handler_mutex); + if (READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL || + READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING) + goto out; + + cma_make_mc_event(status, id_priv, multicast, &event, mc); + ret = cma_cm_event_handler(id_priv, &event); rdma_destroy_ah_attr(&event.param.ud.ah_attr); - if (ret) { - cma_exch(id_priv, RDMA_CM_DESTROYING); - mutex_unlock(&id_priv->handler_mutex); - rdma_destroy_id(&id_priv->id); - return 0; - } + WARN_ON(ret); out: mutex_unlock(&id_priv->handler_mutex); @@ -4337,17 +4869,6 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); rec.join_state = mc->join_state; - if ((rec.join_state == BIT(SENDONLY_FULLMEMBER_JOIN)) && - (!ib_sa_sendonly_fullmem_support(&sa_client, - id_priv->id.device, - id_priv->id.port_num))) { - dev_warn( - &id_priv->id.device->dev, - "RDMA CM: port %u Unable to multicast join: SM doesn't support Send Only Full Member option\n", - id_priv->id.port_num); - return -EOPNOTSUPP; - } - comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE | IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL | @@ -4361,23 +4882,10 @@ static int 
cma_join_ib_multicast(struct rdma_id_private *id_priv, IB_SA_MCMEMBER_REC_MTU | IB_SA_MCMEMBER_REC_HOP_LIMIT; - mc->multicast.ib = ib_sa_join_multicast(&sa_client, id_priv->id.device, - id_priv->id.port_num, &rec, - comp_mask, GFP_KERNEL, - cma_ib_mc_handler, mc); - return PTR_ERR_OR_ZERO(mc->multicast.ib); -} - -static void iboe_mcast_work_handler(struct work_struct *work) -{ - struct iboe_mcast_work *mw = container_of(work, struct iboe_mcast_work, work); - struct cma_multicast *mc = mw->mc; - struct ib_sa_multicast *m = mc->multicast.ib; - - mc->multicast.ib->context = mc; - cma_ib_mc_handler(0, m); - kref_put(&mc->mcref, release_mc); - kfree(mw); + mc->sa_mc = ib_sa_join_multicast(&sa_client, id_priv->id.device, + id_priv->id.port_num, &rec, comp_mask, + GFP_KERNEL, cma_ib_mc_handler, mc); + return PTR_ERR_OR_ZERO(mc->sa_mc); } static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, @@ -4412,52 +4920,41 @@ static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { - struct iboe_mcast_work *work; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int err = 0; struct sockaddr *addr = (struct sockaddr *)&mc->addr; struct net_device *ndev = NULL; + struct ib_sa_multicast ib; enum ib_gid_type gid_type; bool send_only; send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); - if (cma_zero_addr((struct sockaddr *)&mc->addr)) + if (cma_zero_addr(addr)) return -EINVAL; - work = kzalloc(sizeof *work, GFP_KERNEL); - if (!work) - return -ENOMEM; - - mc->multicast.ib = kzalloc(sizeof(struct ib_sa_multicast), GFP_KERNEL); - if (!mc->multicast.ib) { - err = -ENOMEM; - goto out1; - } - gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; - cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid, gid_type); + cma_iboe_set_mgid(addr, &ib.rec.mgid, gid_type); - mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff); + ib.rec.pkey = cpu_to_be16(0xffff); if (id_priv->id.ps == RDMA_PS_UDP) - mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); + ib.rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); if (dev_addr->bound_dev_if) ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); - if (!ndev) { - err = -ENODEV; - goto out2; - } - mc->multicast.ib->rec.rate = iboe_get_rate(ndev); - mc->multicast.ib->rec.hop_limit = 1; - mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu); + if (!ndev) + return -ENODEV; + + ib.rec.rate = iboe_get_rate(ndev); + ib.rec.hop_limit = 1; + ib.rec.mtu = iboe_get_mtu(ndev->mtu); if (addr->sa_family == AF_INET) { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { - mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; + ib.rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; if (!send_only) { - err = cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid, + err = cma_igmp_send(ndev, &ib.rec.mgid, true); } } @@ -4466,44 +4963,35 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, err = -ENOTSUPP; } dev_put(ndev); - if (err || !mc->multicast.ib->rec.mtu) { - if (!err) - err = -EINVAL; - goto out2; - } - rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, - &mc->multicast.ib->rec.port_gid); - work->id = id_priv; - work->mc = mc; - INIT_WORK(&work->work, iboe_mcast_work_handler); - kref_get(&mc->mcref); - queue_work(cma_wq, &work->work); + if (err || !ib.rec.mtu) + return err ?: -EINVAL; + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &ib.rec.port_gid); + 
INIT_WORK(&mc->iboe_join.work, cma_iboe_join_work_handler); + cma_make_mc_event(0, id_priv, &ib, &mc->iboe_join.event, mc); + queue_work(cma_wq, &mc->iboe_join.work); return 0; - -out2: - kfree(mc->multicast.ib); -out1: - kfree(work); - return err; } int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, u8 join_state, void *context) { - struct rdma_id_private *id_priv; + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); struct cma_multicast *mc; int ret; - if (!id->device) + /* Not supported for kernel QPs */ + if (WARN_ON(id->qp)) return -EINVAL; - id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_comp(id_priv, RDMA_CM_ADDR_BOUND) && - !cma_comp(id_priv, RDMA_CM_ADDR_RESOLVED)) + /* ULP is calling this wrong. */ + if (!id->device || (READ_ONCE(id_priv->state) != RDMA_CM_ADDR_BOUND && + READ_ONCE(id_priv->state) != RDMA_CM_ADDR_RESOLVED)) return -EINVAL; - mc = kmalloc(sizeof *mc, GFP_KERNEL); + mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) return -ENOMEM; @@ -4513,7 +5001,6 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, mc->join_state = join_state; if (rdma_protocol_roce(id->device, id->port_num)) { - kref_init(&mc->mcref); ret = cma_iboe_join_multicast(id_priv, mc); if (ret) goto out_err; @@ -4545,25 +5032,14 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irq(&id_priv->lock); list_for_each_entry(mc, &id_priv->mc_list, list) { - if (!memcmp(&mc->addr, addr, rdma_addr_size(addr))) { - list_del(&mc->list); - spin_unlock_irq(&id_priv->lock); - - if (id->qp) - ib_detach_mcast(id->qp, - &mc->multicast.ib->rec.mgid, - be16_to_cpu(mc->multicast.ib->rec.mlid)); - - BUG_ON(id_priv->cma_dev->device != id->device); - - if (rdma_cap_ib_mcast(id->device, id->port_num)) { - ib_sa_free_multicast(mc->multicast.ib); - kfree(mc); - } else if (rdma_protocol_roce(id->device, id->port_num)) { - cma_leave_roce_mc_group(id_priv, mc); - } - return; - } + if (memcmp(&mc->addr, addr, rdma_addr_size(addr)) != 0) + continue; + list_del(&mc->list); + spin_unlock_irq(&id_priv->lock); + + WARN_ON(id_priv->cma_dev->device != id->device); + destroy_mc(id_priv, mc); + return; } spin_unlock_irq(&id_priv->lock); } @@ -4572,7 +5048,7 @@ EXPORT_SYMBOL(rdma_leave_multicast); static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr; - struct cma_ndev_work *work; + struct cma_work *work; dev_addr = &id_priv->id.route.addr.dev_addr; @@ -4585,10 +5061,10 @@ static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id if (!work) return -ENOMEM; - INIT_WORK(&work->work, cma_ndev_work_handler); + INIT_WORK(&work->work, cma_work_handler); work->id = id_priv; work->event.event = RDMA_CM_EVENT_ADDR_CHANGE; - atomic_inc(&id_priv->refcount); + cma_id_get(id_priv); queue_work(cma_wq, &work->work); } @@ -4611,7 +5087,7 @@ static int cma_netdev_callback(struct notifier_block *self, unsigned long event, mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) - list_for_each_entry(id_priv, &cma_dev->id_list, list) { + list_for_each_entry(id_priv, &cma_dev->id_list, device_item) { ret = cma_netdev_change(ndev, id_priv); if (ret) goto out; @@ -4622,33 +5098,192 @@ out: return ret; } +static void cma_netevent_work_handler(struct work_struct *_work) +{ + struct rdma_id_private *id_priv = + container_of(_work, struct rdma_id_private, id.net_work); + struct rdma_cm_event 
event = {}; + + mutex_lock(&id_priv->handler_mutex); + + if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || + READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) + goto out_unlock; + + event.event = RDMA_CM_EVENT_UNREACHABLE; + event.status = -ETIMEDOUT; + + if (cma_cm_event_handler(id_priv, &event)) { + __acquire(&id_priv->handler_mutex); + id_priv->cm_id.ib = NULL; + cma_id_put(id_priv); + destroy_id_handler_unlock(id_priv); + return; + } + +out_unlock: + mutex_unlock(&id_priv->handler_mutex); + cma_id_put(id_priv); +} + +static int cma_netevent_callback(struct notifier_block *self, + unsigned long event, void *ctx) +{ + struct id_table_entry *ips_node = NULL; + struct rdma_id_private *current_id; + struct neighbour *neigh = ctx; + unsigned long flags; + + if (event != NETEVENT_NEIGH_UPDATE) + return NOTIFY_DONE; + + spin_lock_irqsave(&id_table_lock, flags); + if (neigh->tbl->family == AF_INET6) { + struct sockaddr_in6 neigh_sock_6; + + neigh_sock_6.sin6_family = AF_INET6; + neigh_sock_6.sin6_addr = *(struct in6_addr *)neigh->primary_key; + ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex, + (struct sockaddr *)&neigh_sock_6); + } else if (neigh->tbl->family == AF_INET) { + struct sockaddr_in neigh_sock_4; + + neigh_sock_4.sin_family = AF_INET; + neigh_sock_4.sin_addr.s_addr = *(__be32 *)(neigh->primary_key); + ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex, + (struct sockaddr *)&neigh_sock_4); + } else + goto out; + + if (!ips_node) + goto out; + + list_for_each_entry(current_id, &ips_node->id_list, id_list_entry) { + if (!memcmp(current_id->id.route.addr.dev_addr.dst_dev_addr, + neigh->ha, ETH_ALEN)) + continue; + INIT_WORK(¤t_id->id.net_work, cma_netevent_work_handler); + cma_id_get(current_id); + queue_work(cma_wq, ¤t_id->id.net_work); + } +out: + spin_unlock_irqrestore(&id_table_lock, flags); + return NOTIFY_DONE; +} + static struct notifier_block cma_nb = { .notifier_call = cma_netdev_callback }; -static void cma_add_one(struct ib_device *device) +static struct notifier_block cma_netevent_cb = { + .notifier_call = cma_netevent_callback +}; + +static void cma_send_device_removal_put(struct rdma_id_private *id_priv) +{ + struct rdma_cm_event event = { .event = RDMA_CM_EVENT_DEVICE_REMOVAL }; + enum rdma_cm_state state; + unsigned long flags; + + mutex_lock(&id_priv->handler_mutex); + /* Record that we want to remove the device */ + spin_lock_irqsave(&id_priv->lock, flags); + state = id_priv->state; + if (state == RDMA_CM_DESTROYING || state == RDMA_CM_DEVICE_REMOVAL) { + spin_unlock_irqrestore(&id_priv->lock, flags); + mutex_unlock(&id_priv->handler_mutex); + cma_id_put(id_priv); + return; + } + id_priv->state = RDMA_CM_DEVICE_REMOVAL; + spin_unlock_irqrestore(&id_priv->lock, flags); + + if (cma_cm_event_handler(id_priv, &event)) { + /* + * At this point the ULP promises it won't call + * rdma_destroy_id() concurrently + */ + cma_id_put(id_priv); + mutex_unlock(&id_priv->handler_mutex); + trace_cm_id_destroy(id_priv); + _destroy_id(id_priv, state); + return; + } + mutex_unlock(&id_priv->handler_mutex); + + /* + * If this races with destroy then the thread that first assigns state + * to a destroying does the cancel. 
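/*
 * A minimal sketch, not taken from the patch: how a ULP's rdma_cm event
 * handler might observe the RDMA_CM_EVENT_UNREACHABLE events that
 * cma_netevent_work_handler() above queues on neighbour changes. The
 * struct hypo_conn and hypo_teardown() names are hypothetical.
 */
#include <rdma/rdma_cm.h>

struct hypo_conn { struct rdma_cm_id *id; };

static void hypo_teardown(struct hypo_conn *conn) { /* ULP cleanup */ }

static int hypo_cm_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct hypo_conn *conn = id->context;

	switch (event->event) {
	case RDMA_CM_EVENT_UNREACHABLE:	   /* e.g. a stale neighbour entry */
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		hypo_teardown(conn);
		break;
	default:
		break;
	}
	/* A non-zero return asks the core to destroy the cm_id itself. */
	return 0;
}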
+ */ + cma_cancel_operation(id_priv, state); + cma_id_put(id_priv); +} + +static void cma_process_remove(struct cma_device *cma_dev) +{ + mutex_lock(&lock); + while (!list_empty(&cma_dev->id_list)) { + struct rdma_id_private *id_priv = list_first_entry( + &cma_dev->id_list, struct rdma_id_private, device_item); + + list_del_init(&id_priv->listen_item); + list_del_init(&id_priv->device_item); + cma_id_get(id_priv); + mutex_unlock(&lock); + + cma_send_device_removal_put(id_priv); + + mutex_lock(&lock); + } + mutex_unlock(&lock); + + cma_dev_put(cma_dev); + wait_for_completion(&cma_dev->comp); +} + +static bool cma_supported(struct ib_device *device) +{ + u32 i; + + rdma_for_each_port(device, i) { + if (rdma_cap_ib_cm(device, i) || rdma_cap_iw_cm(device, i)) + return true; + } + return false; +} + +static int cma_add_one(struct ib_device *device) { + struct rdma_id_private *to_destroy; struct cma_device *cma_dev; struct rdma_id_private *id_priv; - unsigned int i; unsigned long supported_gids = 0; + int ret; + u32 i; + + if (!cma_supported(device)) + return -EOPNOTSUPP; - cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL); + cma_dev = kmalloc(sizeof(*cma_dev), GFP_KERNEL); if (!cma_dev) - return; + return -ENOMEM; cma_dev->device = device; cma_dev->default_gid_type = kcalloc(device->phys_port_cnt, sizeof(*cma_dev->default_gid_type), GFP_KERNEL); - if (!cma_dev->default_gid_type) + if (!cma_dev->default_gid_type) { + ret = -ENOMEM; goto free_cma_dev; + } cma_dev->default_roce_tos = kcalloc(device->phys_port_cnt, sizeof(*cma_dev->default_roce_tos), GFP_KERNEL); - if (!cma_dev->default_roce_tos) + if (!cma_dev->default_roce_tos) { + ret = -ENOMEM; goto free_gid_type; + } rdma_for_each_port (device, i) { supported_gids = roce_gid_type_mask_support(device, i); @@ -4663,90 +5298,43 @@ static void cma_add_one(struct ib_device *device) } init_completion(&cma_dev->comp); - atomic_set(&cma_dev->refcount, 1); + refcount_set(&cma_dev->refcount, 1); INIT_LIST_HEAD(&cma_dev->id_list); ib_set_client_data(device, &cma_client, cma_dev); mutex_lock(&lock); list_add_tail(&cma_dev->list, &dev_list); - list_for_each_entry(id_priv, &listen_any_list, list) - cma_listen_on_dev(id_priv, cma_dev); + list_for_each_entry(id_priv, &listen_any_list, listen_any_item) { + ret = cma_listen_on_dev(id_priv, cma_dev, &to_destroy); + if (ret) + goto free_listen; + } mutex_unlock(&lock); trace_cm_add_one(device); - return; + return 0; + +free_listen: + list_del(&cma_dev->list); + mutex_unlock(&lock); + /* cma_process_remove() will delete to_destroy */ + cma_process_remove(cma_dev); + kfree(cma_dev->default_roce_tos); free_gid_type: kfree(cma_dev->default_gid_type); free_cma_dev: kfree(cma_dev); - - return; -} - -static int cma_remove_id_dev(struct rdma_id_private *id_priv) -{ - struct rdma_cm_event event = {}; - enum rdma_cm_state state; - int ret = 0; - - /* Record that we want to remove the device */ - state = cma_exch(id_priv, RDMA_CM_DEVICE_REMOVAL); - if (state == RDMA_CM_DESTROYING) - return 0; - - cma_cancel_operation(id_priv, state); - mutex_lock(&id_priv->handler_mutex); - - /* Check for destruction from another callback. 
*/ - if (!cma_comp(id_priv, RDMA_CM_DEVICE_REMOVAL)) - goto out; - - event.event = RDMA_CM_EVENT_DEVICE_REMOVAL; - ret = cma_cm_event_handler(id_priv, &event); -out: - mutex_unlock(&id_priv->handler_mutex); return ret; } -static void cma_process_remove(struct cma_device *cma_dev) -{ - struct rdma_id_private *id_priv; - int ret; - - mutex_lock(&lock); - while (!list_empty(&cma_dev->id_list)) { - id_priv = list_entry(cma_dev->id_list.next, - struct rdma_id_private, list); - - list_del(&id_priv->listen_list); - list_del_init(&id_priv->list); - atomic_inc(&id_priv->refcount); - mutex_unlock(&lock); - - ret = id_priv->internal_id ? 1 : cma_remove_id_dev(id_priv); - cma_deref_id(id_priv); - if (ret) - rdma_destroy_id(&id_priv->id); - - mutex_lock(&lock); - } - mutex_unlock(&lock); - - cma_deref_dev(cma_dev); - wait_for_completion(&cma_dev->comp); -} - static void cma_remove_one(struct ib_device *device, void *client_data) { struct cma_device *cma_dev = client_data; trace_cm_remove_one(device); - if (!cma_dev) - return; - mutex_lock(&lock); list_del(&cma_dev->list); mutex_unlock(&lock); @@ -4790,6 +5378,19 @@ static int __init cma_init(void) { int ret; + /* + * There is a rare lock ordering dependency in cma_netdev_callback() + * that only happens when bonding is enabled. Teach lockdep that rtnl + * must never be nested under lock so it can find these without having + * to test with bonding. + */ + if (IS_ENABLED(CONFIG_LOCKDEP)) { + rtnl_lock(); + mutex_lock(&lock); + mutex_unlock(&lock); + rtnl_unlock(); + } + cma_wq = alloc_ordered_workqueue("rdma_cm", WQ_MEM_RECLAIM); if (!cma_wq) return -ENOMEM; @@ -4800,6 +5401,7 @@ static int __init cma_init(void) ib_sa_register_client(&sa_client); register_netdevice_notifier(&cma_nb); + register_netevent_notifier(&cma_netevent_cb); ret = ib_register_client(&cma_client); if (ret) @@ -4814,6 +5416,7 @@ static int __init cma_init(void) err_ib: ib_unregister_client(&cma_client); err: + unregister_netevent_notifier(&cma_netevent_cb); unregister_netdevice_notifier(&cma_nb); ib_sa_unregister_client(&sa_client); unregister_pernet_subsys(&cma_pernet_operations); @@ -4826,6 +5429,7 @@ static void __exit cma_cleanup(void) { cma_configfs_exit(); ib_unregister_client(&cma_client); + unregister_netevent_notifier(&cma_netevent_cb); unregister_netdevice_notifier(&cma_nb); ib_sa_unregister_client(&sa_client); unregister_pernet_subsys(&cma_pernet_operations); diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c index 8b0b5ae22e4c..7b68b3ea979f 100644 --- a/drivers/infiniband/core/cma_configfs.c +++ b/drivers/infiniband/core/cma_configfs.c @@ -30,7 +30,6 @@ * SOFTWARE. 
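/*
 * A minimal sketch, not taken from the patch: the lockdep priming done in
 * cma_init() above generalizes to any pair of locks whose required nesting
 * order is hard to exercise in testing - take them once in that order at
 * init time so lockdep records the dependency up front. 'hypo_lock' is a
 * hypothetical subsystem mutex.
 */
#include <linux/mutex.h>
#include <linux/rtnetlink.h>

static DEFINE_MUTEX(hypo_lock);

static void hypo_prime_lockdep(void)
{
	if (IS_ENABLED(CONFIG_LOCKDEP)) {
		rtnl_lock();
		mutex_lock(&hypo_lock);
		mutex_unlock(&hypo_lock);
		rtnl_unlock();
	}
}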
*/ -#include <linux/module.h> #include <linux/configfs.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> @@ -43,7 +42,7 @@ struct cma_device; struct cma_dev_group; struct cma_dev_port_group { - unsigned int port_num; + u32 port_num; struct cma_dev_group *cma_dev_group; struct config_group group; }; @@ -94,7 +93,7 @@ static int cma_configfs_params_get(struct config_item *item, static void cma_configfs_params_put(struct cma_device *cma_dev) { - cma_deref_dev(cma_dev); + cma_dev_put(cma_dev); } static ssize_t default_roce_mode_show(struct config_item *item, @@ -115,7 +114,7 @@ static ssize_t default_roce_mode_show(struct config_item *item, if (gid_type < 0) return gid_type; - return sprintf(buf, "%s\n", ib_cache_gid_type_str(gid_type)); + return sysfs_emit(buf, "%s\n", ib_cache_gid_type_str(gid_type)); } static ssize_t default_roce_mode_store(struct config_item *item, @@ -123,16 +122,19 @@ static ssize_t default_roce_mode_store(struct config_item *item, { struct cma_device *cma_dev; struct cma_dev_port_group *group; - int gid_type = ib_cache_gid_parse_type_str(buf); + int gid_type; ssize_t ret; - if (gid_type < 0) - return -EINVAL; - ret = cma_configfs_params_get(item, &cma_dev, &group); if (ret) return ret; + gid_type = ib_cache_gid_parse_type_str(buf); + if (gid_type < 0) { + cma_configfs_params_put(cma_dev); + return -EINVAL; + } + ret = cma_set_default_gid_type(cma_dev, group->port_num, gid_type); cma_configfs_params_put(cma_dev); @@ -156,7 +158,7 @@ static ssize_t default_roce_tos_show(struct config_item *item, char *buf) tos = cma_get_default_roce_tos(cma_dev, group->port_num); cma_configfs_params_put(cma_dev); - return sprintf(buf, "%u\n", tos); + return sysfs_emit(buf, "%u\n", tos); } static ssize_t default_roce_tos_store(struct config_item *item, @@ -197,11 +199,10 @@ static const struct config_item_type cma_port_group_type = { static int make_cma_ports(struct cma_dev_group *cma_dev_group, struct cma_device *cma_dev) { - struct ib_device *ibdev; - unsigned int i; - unsigned int ports_num; struct cma_dev_port_group *ports; - int err; + struct ib_device *ibdev; + u32 ports_num; + u32 i; ibdev = cma_get_ib_dev(cma_dev); @@ -212,10 +213,8 @@ static int make_cma_ports(struct cma_dev_group *cma_dev_group, ports = kcalloc(ports_num, sizeof(*cma_dev_group->ports), GFP_KERNEL); - if (!ports) { - err = -ENOMEM; - goto free; - } + if (!ports) + return -ENOMEM; for (i = 0; i < ports_num; i++) { char port_str[10]; @@ -231,12 +230,7 @@ static int make_cma_ports(struct cma_dev_group *cma_dev_group, } cma_dev_group->ports = ports; - return 0; -free: - kfree(ports); - cma_dev_group->ports = NULL; - return err; } static void release_cma_dev(struct config_item *item) @@ -298,7 +292,7 @@ static struct config_group *make_cma_dev(struct config_group *group, goto fail; } - strlcpy(cma_dev_group->name, name, sizeof(cma_dev_group->name)); + strscpy(cma_dev_group->name, name, sizeof(cma_dev_group->name)); config_group_init_type_name(&cma_dev_group->ports_group, "ports", &cma_ports_group_type); @@ -312,18 +306,31 @@ static struct config_group *make_cma_dev(struct config_group *group, configfs_add_default_group(&cma_dev_group->ports_group, &cma_dev_group->device_group); - cma_deref_dev(cma_dev); + cma_dev_put(cma_dev); return &cma_dev_group->device_group; fail: if (cma_dev) - cma_deref_dev(cma_dev); + cma_dev_put(cma_dev); kfree(cma_dev_group); return ERR_PTR(err); } +static void drop_cma_dev(struct config_group *cgroup, struct config_item *item) +{ + struct config_group *group = + container_of(item, 
struct config_group, cg_item); + struct cma_dev_group *cma_dev_group = + container_of(group, struct cma_dev_group, device_group); + + configfs_remove_default_groups(&cma_dev_group->ports_group); + configfs_remove_default_groups(&cma_dev_group->device_group); + config_item_put(item); +} + static struct configfs_group_operations cma_subsys_group_ops = { .make_group = make_cma_dev, + .drop_item = drop_cma_dev, }; static const struct config_item_type cma_subsys_type = { diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h index ca7307277518..b7354c94cf1b 100644 --- a/drivers/infiniband/core/cma_priv.h +++ b/drivers/infiniband/core/cma_priv.h @@ -55,8 +55,16 @@ struct rdma_id_private { struct rdma_bind_list *bind_list; struct hlist_node node; - struct list_head list; /* listen_any_list or cma_device.list */ - struct list_head listen_list; /* per device listens */ + union { + struct list_head device_item; /* On cma_device->id_list */ + struct list_head listen_any_item; /* On listen_any_list */ + }; + union { + /* On rdma_id_private->listen_list */ + struct list_head listen_item; + struct list_head listen_list; + }; + struct list_head id_list_entry; struct cma_device *cma_dev; struct list_head mc_list; @@ -66,7 +74,7 @@ struct rdma_id_private { struct mutex qp_mutex; struct completion comp; - atomic_t refcount; + refcount_t refcount; struct mutex handler_mutex; int backlog; @@ -86,15 +94,19 @@ struct rdma_id_private { u8 tos; u8 tos_set:1; u8 timeout_set:1; + u8 min_rnr_timer_set:1; u8 reuseaddr; u8 afonly; u8 timeout; + u8 min_rnr_timer; + u8 used_resolve_ip; enum ib_gid_type gid_type; /* * Internal to RDMA/core, don't use in the drivers */ struct rdma_restrack_entry res; + struct rdma_ucm_ece ece; }; #if IS_ENABLED(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) @@ -111,16 +123,16 @@ static inline void cma_configfs_exit(void) } #endif -void cma_ref_dev(struct cma_device *dev); -void cma_deref_dev(struct cma_device *dev); +void cma_dev_get(struct cma_device *dev); +void cma_dev_put(struct cma_device *dev); typedef bool (*cma_device_filter)(struct ib_device *, void *); struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, void *cookie); -int cma_get_default_gid_type(struct cma_device *dev, unsigned int port); -int cma_set_default_gid_type(struct cma_device *dev, unsigned int port, +int cma_get_default_gid_type(struct cma_device *dev, u32 port); +int cma_set_default_gid_type(struct cma_device *dev, u32 port, enum ib_gid_type default_gid_type); -int cma_get_default_roce_tos(struct cma_device *dev, unsigned int port); -int cma_set_default_roce_tos(struct cma_device *dev, unsigned int port, +int cma_get_default_roce_tos(struct cma_device *dev, u32 port); +int cma_set_default_roce_tos(struct cma_device *dev, u32 port, u8 default_roce_tos); struct ib_device *cma_get_ib_dev(struct cma_device *dev); diff --git a/drivers/infiniband/core/cma_trace.h b/drivers/infiniband/core/cma_trace.h index 81e36bf13159..e45264267bcc 100644 --- a/drivers/infiniband/core/cma_trace.h +++ b/drivers/infiniband/core/cma_trace.h @@ -17,46 +17,6 @@ #include <linux/tracepoint.h> #include <trace/events/rdma.h> -/* - * enum ib_cm_event_type, from include/rdma/ib_cm.h - */ -#define IB_CM_EVENT_LIST \ - ib_cm_event(REQ_ERROR) \ - ib_cm_event(REQ_RECEIVED) \ - ib_cm_event(REP_ERROR) \ - ib_cm_event(REP_RECEIVED) \ - ib_cm_event(RTU_RECEIVED) \ - ib_cm_event(USER_ESTABLISHED) \ - ib_cm_event(DREQ_ERROR) \ - ib_cm_event(DREQ_RECEIVED) \ - ib_cm_event(DREP_RECEIVED) \ - ib_cm_event(TIMEWAIT_EXIT) 
\ - ib_cm_event(MRA_RECEIVED) \ - ib_cm_event(REJ_RECEIVED) \ - ib_cm_event(LAP_ERROR) \ - ib_cm_event(LAP_RECEIVED) \ - ib_cm_event(APR_RECEIVED) \ - ib_cm_event(SIDR_REQ_ERROR) \ - ib_cm_event(SIDR_REQ_RECEIVED) \ - ib_cm_event_end(SIDR_REP_RECEIVED) - -#undef ib_cm_event -#undef ib_cm_event_end - -#define ib_cm_event(x) TRACE_DEFINE_ENUM(IB_CM_##x); -#define ib_cm_event_end(x) TRACE_DEFINE_ENUM(IB_CM_##x); - -IB_CM_EVENT_LIST - -#undef ib_cm_event -#undef ib_cm_event_end - -#define ib_cm_event(x) { IB_CM_##x, #x }, -#define ib_cm_event_end(x) { IB_CM_##x, #x } - -#define rdma_show_ib_cm_event(x) \ - __print_symbolic(x, IB_CM_EVENT_LIST) - DECLARE_EVENT_CLASS(cma_fsm_class, TP_PROTO( @@ -103,23 +63,33 @@ DEFINE_CMA_FSM_EVENT(sent_drep); DEFINE_CMA_FSM_EVENT(sent_dreq); DEFINE_CMA_FSM_EVENT(id_destroy); -TRACE_EVENT(cm_id_create, +TRACE_EVENT(cm_id_attach, TP_PROTO( - const struct rdma_id_private *id_priv + const struct rdma_id_private *id_priv, + const struct ib_device *device ), - TP_ARGS(id_priv), + TP_ARGS(id_priv, device), TP_STRUCT__entry( __field(u32, cm_id) + __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) + __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) + __string(devname, device->name) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; + memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, + sizeof(struct sockaddr_in6)); + memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, + sizeof(struct sockaddr_in6)); + __assign_str(devname, device->name); ), - TP_printk("cm.id=%u", - __entry->cm_id + TP_printk("cm.id=%u src=%pISpc dst=%pISpc device=%s", + __entry->cm_id, __entry->srcaddr, __entry->dstaddr, + __get_str(devname) ) ); diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index cf42acca4a3a..f66f48d860ec 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -44,6 +44,7 @@ #include <rdma/ib_mad.h> #include <rdma/restrack.h> #include "mad_priv.h" +#include "restrack.h" /* Total number of ports combined across all struct ib_devices's */ #define RDMA_MAX_PORTS 8192 @@ -77,19 +78,17 @@ static inline struct rdma_dev_net *rdma_net_to_dev_net(struct net *net) return net_generic(net, rdma_dev_net_id); } -int ib_device_register_sysfs(struct ib_device *device); -void ib_device_unregister_sysfs(struct ib_device *device); int ib_device_rename(struct ib_device *ibdev, const char *name); int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim); -typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port, +typedef void (*roce_netdev_callback)(struct ib_device *device, u32 port, struct net_device *idev, void *cookie); -typedef bool (*roce_netdev_filter)(struct ib_device *device, u8 port, +typedef bool (*roce_netdev_filter)(struct ib_device *device, u32 port, struct net_device *idev, void *cookie); struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, - unsigned int port); + u32 port); void ib_enum_roce_netdev(struct ib_device *ib_dev, roce_netdev_filter filter, @@ -112,7 +111,7 @@ int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, struct ib_client_nl_info { struct sk_buff *nl_msg; struct device *cdev; - unsigned int port; + u32 port; u64 abi; }; int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, @@ -127,24 +126,24 @@ int ib_cache_gid_parse_type_str(const char *buf); const char *ib_cache_gid_type_str(enum ib_gid_type gid_type); -void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, +void 
ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u32 port, struct net_device *ndev, unsigned long gid_type_mask, enum ib_cache_gid_default_mode mode); -int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, +int ib_cache_gid_add(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr); -int ib_cache_gid_del(struct ib_device *ib_dev, u8 port, +int ib_cache_gid_del(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr); -int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, +int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, struct net_device *ndev); int roce_gid_mgmt_init(void); void roce_gid_mgmt_cleanup(void); -unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port); +unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u32 port); int ib_cache_setup_one(struct ib_device *device); void ib_cache_cleanup_one(struct ib_device *device); @@ -213,15 +212,15 @@ int ib_nl_handle_ip_res_resp(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); -int ib_get_cached_subnet_prefix(struct ib_device *device, - u8 port_num, - u64 *sn_pfx); +void ib_get_cached_subnet_prefix(struct ib_device *device, + u32 port_num, + u64 *sn_pfx); #ifdef CONFIG_SECURITY_INFINIBAND void ib_security_release_port_pkey_list(struct ib_device *device); void ib_security_cache_change(struct ib_device *device, - u8 port_num, + u32 port_num, u64 subnet_prefix); int ib_security_modify_qp(struct ib_qp *qp, @@ -246,7 +245,7 @@ static inline void ib_security_release_port_pkey_list(struct ib_device *device) } static inline void ib_security_cache_change(struct ib_device *device, - u8 port_num, + u32 port_num, u64 subnet_prefix) { } @@ -317,58 +316,13 @@ struct ib_device *ib_device_get_by_index(const struct net *net, u32 index); void nldev_init(void); void nldev_exit(void); -static inline struct ib_qp *_ib_create_qp(struct ib_device *dev, - struct ib_pd *pd, - struct ib_qp_init_attr *attr, - struct ib_udata *udata, - struct ib_uqp_object *uobj) -{ - enum ib_qp_type qp_type = attr->qp_type; - struct ib_qp *qp; - bool is_xrc; - - if (!dev->ops.create_qp) - return ERR_PTR(-EOPNOTSUPP); - - qp = dev->ops.create_qp(pd, attr, udata); - if (IS_ERR(qp)) - return qp; - - qp->device = dev; - qp->pd = pd; - qp->uobject = uobj; - qp->real_qp = qp; - - qp->qp_type = attr->qp_type; - qp->rwq_ind_tbl = attr->rwq_ind_tbl; - qp->send_cq = attr->send_cq; - qp->recv_cq = attr->recv_cq; - qp->srq = attr->srq; - qp->rwq_ind_tbl = attr->rwq_ind_tbl; - qp->event_handler = attr->event_handler; - - atomic_set(&qp->usecnt, 0); - spin_lock_init(&qp->mr_lock); - INIT_LIST_HEAD(&qp->rdma_mrs); - INIT_LIST_HEAD(&qp->sig_mrs); - - /* - * We don't track XRC QPs for now, because they don't have PD - * and more importantly they are created internaly by driver, - * see mlx5 create_dev_resources() as an example. 
- */ - is_xrc = qp_type == IB_QPT_XRC_INI || qp_type == IB_QPT_XRC_TGT; - if ((qp_type < IB_QPT_MAX && !is_xrc) || qp_type == IB_QPT_DRIVER) { - qp->res.type = RDMA_RESTRACK_QP; - if (uobj) - rdma_restrack_uadd(&qp->res); - else - rdma_restrack_kadd(&qp->res); - } else - qp->res.valid = false; - - return qp; -} +struct ib_qp *ib_create_qp_user(struct ib_device *dev, struct ib_pd *pd, + struct ib_qp_init_attr *attr, + struct ib_udata *udata, + struct ib_uqp_object *uobj, const char *caller); + +void ib_qp_usecnt_inc(struct ib_qp *qp); +void ib_qp_usecnt_dec(struct ib_qp *qp); struct rdma_dev_addr; int rdma_resolve_ip_route(struct sockaddr *src_addr, @@ -390,13 +344,16 @@ struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr); void ib_free_port_attrs(struct ib_core_device *coredev); int ib_setup_port_attrs(struct ib_core_device *coredev); +struct rdma_hw_stats *ib_get_hw_stats_port(struct ib_device *ibdev, u32 port_num); +void ib_device_release_hw_stats(struct hw_stats_device_data *data); +int ib_setup_device_attrs(struct ib_device *ibdev); int rdma_compatdev_set(u8 enable); -int ib_port_register_module_stat(struct ib_device *device, u8 port_num, - struct kobject *kobj, struct kobj_type *ktype, - const char *name); -void ib_port_unregister_module_stat(struct kobject *kobj); +int ib_port_register_client_groups(struct ib_device *ibdev, u32 port_num, + const struct attribute_group **groups); +void ib_port_unregister_client_groups(struct ib_device *ibdev, u32 port_num, + const struct attribute_group **groups); int ib_device_set_netns_put(struct sk_buff *skb, struct ib_device *dev, u32 ns_fd); @@ -414,4 +371,6 @@ void rdma_umap_priv_init(struct rdma_umap_priv *priv, struct vm_area_struct *vma, struct rdma_user_mmap_entry *entry); +void ib_cq_pool_cleanup(struct ib_device *dev); + #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c index 2257d7f7810f..af59486fe418 100644 --- a/drivers/infiniband/core/counters.c +++ b/drivers/infiniband/core/counters.c @@ -8,33 +8,43 @@ #include "core_priv.h" #include "restrack.h" -#define ALL_AUTO_MODE_MASKS (RDMA_COUNTER_MASK_QP_TYPE) +#define ALL_AUTO_MODE_MASKS (RDMA_COUNTER_MASK_QP_TYPE | RDMA_COUNTER_MASK_PID) -static int __counter_set_mode(struct rdma_counter_mode *curr, +static int __counter_set_mode(struct rdma_port_counter *port_counter, enum rdma_nl_counter_mode new_mode, enum rdma_nl_counter_mask new_mask) { - if ((new_mode == RDMA_COUNTER_MODE_AUTO) && - ((new_mask & (~ALL_AUTO_MODE_MASKS)) || - (curr->mode != RDMA_COUNTER_MODE_NONE))) - return -EINVAL; + if (new_mode == RDMA_COUNTER_MODE_AUTO) { + if (new_mask & (~ALL_AUTO_MODE_MASKS)) + return -EINVAL; + if (port_counter->num_counters) + return -EBUSY; + } - curr->mode = new_mode; - curr->mask = new_mask; + port_counter->mode.mode = new_mode; + port_counter->mode.mask = new_mask; return 0; } -/** +/* * rdma_counter_set_auto_mode() - Turn on/off per-port auto mode * - * When @on is true, the @mask must be set; When @on is false, it goes - * into manual mode if there's any counter, so that the user is able to - * manually access them. + * @dev: Device to operate + * @port: Port to use + * @mask: Mask to configure + * @extack: Message to the user + * + * Return 0 on success. If counter mode wasn't changed then it is considered + * as success as well. + * Return -EBUSY when changing to auto mode while there are bounded counters. 
+ * */ -int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port, - bool on, enum rdma_nl_counter_mask mask) +int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port, + enum rdma_nl_counter_mask mask, + struct netlink_ext_ack *extack) { struct rdma_port_counter *port_counter; + enum rdma_nl_counter_mode mode; int ret; port_counter = &dev->port_data[port].port_counter; @@ -42,30 +52,95 @@ int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port, return -EOPNOTSUPP; mutex_lock(&port_counter->lock); - if (on) { - ret = __counter_set_mode(&port_counter->mode, - RDMA_COUNTER_MODE_AUTO, mask); - } else { - if (port_counter->mode.mode != RDMA_COUNTER_MODE_AUTO) { - ret = -EINVAL; - goto out; - } - - if (port_counter->num_counters) - ret = __counter_set_mode(&port_counter->mode, - RDMA_COUNTER_MODE_MANUAL, 0); - else - ret = __counter_set_mode(&port_counter->mode, - RDMA_COUNTER_MODE_NONE, 0); + if (mask) + mode = RDMA_COUNTER_MODE_AUTO; + else + mode = (port_counter->num_counters) ? RDMA_COUNTER_MODE_MANUAL : + RDMA_COUNTER_MODE_NONE; + + if (port_counter->mode.mode == mode && + port_counter->mode.mask == mask) { + ret = 0; + goto out; } + ret = __counter_set_mode(port_counter, mode, mask); + out: mutex_unlock(&port_counter->lock); + if (ret == -EBUSY) + NL_SET_ERR_MSG( + extack, + "Modifying auto mode is not allowed when there is a bound QP"); + return ret; +} + +static void auto_mode_init_counter(struct rdma_counter *counter, + const struct ib_qp *qp, + enum rdma_nl_counter_mask new_mask) +{ + struct auto_mode_param *param = &counter->mode.param; + + counter->mode.mode = RDMA_COUNTER_MODE_AUTO; + counter->mode.mask = new_mask; + + if (new_mask & RDMA_COUNTER_MASK_QP_TYPE) + param->qp_type = qp->qp_type; +} + +static int __rdma_counter_bind_qp(struct rdma_counter *counter, + struct ib_qp *qp) +{ + int ret; + + if (qp->counter) + return -EINVAL; + + if (!qp->device->ops.counter_bind_qp) + return -EOPNOTSUPP; + + mutex_lock(&counter->lock); + ret = qp->device->ops.counter_bind_qp(counter, qp); + mutex_unlock(&counter->lock); + + return ret; +} + +int rdma_counter_modify(struct ib_device *dev, u32 port, + unsigned int index, bool enable) +{ + struct rdma_hw_stats *stats; + int ret = 0; + + if (!dev->ops.modify_hw_stat) + return -EOPNOTSUPP; + + stats = ib_get_hw_stats_port(dev, port); + if (!stats || index >= stats->num_counters || + !(stats->descs[index].flags & IB_STAT_FLAG_OPTIONAL)) + return -EINVAL; + + mutex_lock(&stats->lock); + + if (enable != test_bit(index, stats->is_disabled)) + goto out; + + ret = dev->ops.modify_hw_stat(dev, port, index, enable); + if (ret) + goto out; + + if (enable) + clear_bit(index, stats->is_disabled); + else + set_bit(index, stats->is_disabled); +out: + mutex_unlock(&stats->lock); return ret; } -static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port, - enum rdma_nl_counter_mode mode) +static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port, + struct ib_qp *qp, + enum rdma_nl_counter_mode mode) { struct rdma_port_counter *port_counter; struct rdma_counter *counter; @@ -80,18 +155,30 @@ static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port, counter->device = dev; counter->port = port; - counter->res.type = RDMA_RESTRACK_COUNTER; - counter->stats = dev->ops.counter_alloc_stats(counter); + + rdma_restrack_new(&counter->res, RDMA_RESTRACK_COUNTER); + counter->stats = dev->ops.counter_alloc_stats(counter); if (!counter->stats) goto err_stats; port_counter = 
&dev->port_data[port].port_counter; mutex_lock(&port_counter->lock); - if (mode == RDMA_COUNTER_MODE_MANUAL) { - ret = __counter_set_mode(&port_counter->mode, - RDMA_COUNTER_MODE_MANUAL, 0); - if (ret) + switch (mode) { + case RDMA_COUNTER_MODE_MANUAL: + ret = __counter_set_mode(port_counter, RDMA_COUNTER_MODE_MANUAL, + 0); + if (ret) { + mutex_unlock(&port_counter->lock); goto err_mode; + } + break; + case RDMA_COUNTER_MODE_AUTO: + auto_mode_init_counter(counter, qp, port_counter->mode.mask); + break; + default: + ret = -EOPNOTSUPP; + mutex_unlock(&port_counter->lock); + goto err_mode; } port_counter->num_counters++; @@ -101,12 +188,18 @@ static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port, kref_init(&counter->kref); mutex_init(&counter->lock); + ret = __rdma_counter_bind_qp(counter, qp); + if (ret) + goto err_mode; + + rdma_restrack_parent_name(&counter->res, &qp->res); + rdma_restrack_add(&counter->res); return counter; err_mode: - mutex_unlock(&port_counter->lock); - kfree(counter->stats); + rdma_free_hw_stats_struct(counter->stats); err_stats: + rdma_restrack_put(&counter->res); kfree(counter); return NULL; } @@ -120,71 +213,29 @@ static void rdma_counter_free(struct rdma_counter *counter) port_counter->num_counters--; if (!port_counter->num_counters && (port_counter->mode.mode == RDMA_COUNTER_MODE_MANUAL)) - __counter_set_mode(&port_counter->mode, RDMA_COUNTER_MODE_NONE, - 0); + __counter_set_mode(port_counter, RDMA_COUNTER_MODE_NONE, 0); mutex_unlock(&port_counter->lock); rdma_restrack_del(&counter->res); - kfree(counter->stats); + rdma_free_hw_stats_struct(counter->stats); kfree(counter); } -static void auto_mode_init_counter(struct rdma_counter *counter, - const struct ib_qp *qp, - enum rdma_nl_counter_mask new_mask) -{ - struct auto_mode_param *param = &counter->mode.param; - - counter->mode.mode = RDMA_COUNTER_MODE_AUTO; - counter->mode.mask = new_mask; - - if (new_mask & RDMA_COUNTER_MASK_QP_TYPE) - param->qp_type = qp->qp_type; -} - static bool auto_mode_match(struct ib_qp *qp, struct rdma_counter *counter, enum rdma_nl_counter_mask auto_mask) { struct auto_mode_param *param = &counter->mode.param; bool match = true; - /* - * Ensure that counter belongs to the right PID. This operation can - * race with user space which kills the process and leaves QP and - * counters orphans. - * - * It is not a big deal because exitted task will leave both QP and - * counter in the same bucket of zombie process. Just ensure that - * process is still alive before procedding. 
- * - */ - if (task_pid_nr(counter->res.task) != task_pid_nr(qp->res.task) || - !task_pid_nr(qp->res.task)) - return false; - if (auto_mask & RDMA_COUNTER_MASK_QP_TYPE) match &= (param->qp_type == qp->qp_type); - return match; -} - -static int __rdma_counter_bind_qp(struct rdma_counter *counter, - struct ib_qp *qp) -{ - int ret; - - if (qp->counter) - return -EINVAL; + if (auto_mask & RDMA_COUNTER_MASK_PID) + match &= (task_pid_nr(counter->res.task) == + task_pid_nr(qp->res.task)); - if (!qp->device->ops.counter_bind_qp) - return -EOPNOTSUPP; - - mutex_lock(&counter->lock); - ret = qp->device->ops.counter_bind_qp(counter, qp); - mutex_unlock(&counter->lock); - - return ret; + return match; } static int __rdma_counter_unbind_qp(struct ib_qp *qp) @@ -202,7 +253,7 @@ static int __rdma_counter_unbind_qp(struct ib_qp *qp) return ret; } -static void counter_history_stat_update(const struct rdma_counter *counter) +static void counter_history_stat_update(struct rdma_counter *counter) { struct ib_device *dev = counter->device; struct rdma_port_counter *port_counter; @@ -212,18 +263,20 @@ static void counter_history_stat_update(const struct rdma_counter *counter) if (!port_counter->hstats) return; + rdma_counter_query_stats(counter); + for (i = 0; i < counter->stats->num_counters; i++) port_counter->hstats->value[i] += counter->stats->value[i]; } -/** +/* * rdma_get_counter_auto_mode - Find the counter that @qp should be bound * with in auto mode * * Return: The counter (with ref-count increased) if found */ static struct rdma_counter *rdma_get_counter_auto_mode(struct ib_qp *qp, - u8 port) + u32 port) { struct rdma_port_counter *port_counter; struct rdma_counter *counter = NULL; @@ -253,18 +306,6 @@ next: return counter; } -static void rdma_counter_res_add(struct rdma_counter *counter, - struct ib_qp *qp) -{ - if (rdma_is_kernel_res(&qp->res)) { - rdma_restrack_set_task(&counter->res, qp->res.kern_name); - rdma_restrack_kadd(&counter->res); - } else { - rdma_restrack_attach_task(&counter->res, qp->res.task); - rdma_restrack_uadd(&counter->res); - } -} - static void counter_release(struct kref *kref) { struct rdma_counter *counter; @@ -275,18 +316,18 @@ static void counter_release(struct kref *kref) rdma_counter_free(counter); } -/** +/* * rdma_counter_bind_qp_auto - Check and bind the QP to a counter base on * the auto-mode rule */ -int rdma_counter_bind_qp_auto(struct ib_qp *qp, u8 port) +int rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port) { struct rdma_port_counter *port_counter; struct ib_device *dev = qp->device; struct rdma_counter *counter; int ret; - if (!qp->res.valid) + if (!rdma_restrack_is_tracked(&qp->res) || rdma_is_kernel_res(&qp->res)) return 0; if (!rdma_is_port_valid(dev, port)) @@ -304,25 +345,15 @@ int rdma_counter_bind_qp_auto(struct ib_qp *qp, u8 port) return ret; } } else { - counter = rdma_counter_alloc(dev, port, RDMA_COUNTER_MODE_AUTO); + counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_AUTO); if (!counter) return -ENOMEM; - - auto_mode_init_counter(counter, qp, port_counter->mode.mask); - - ret = __rdma_counter_bind_qp(counter, qp); - if (ret) { - rdma_counter_free(counter); - return ret; - } - - rdma_counter_res_add(counter, qp); } return 0; } -/** +/* * rdma_counter_unbind_qp - Unbind a qp from a counter * @force: * true - Decrease the counter ref-count anyway (e.g., qp destroy) @@ -359,7 +390,7 @@ int rdma_counter_query_stats(struct rdma_counter *counter) } static u64 get_running_counters_hwstat_sum(struct ib_device *dev, - u8 port, u32 index) + u32 port, 
u32 index) { struct rdma_restrack_entry *res; struct rdma_restrack_root *rt; @@ -391,11 +422,11 @@ next: return sum; } -/** +/* * rdma_counter_get_hwstat_value() - Get the sum value of all counters on a * specific port, including the running ones and history data */ -u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u8 port, u32 index) +u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u32 port, u32 index) { struct rdma_port_counter *port_counter; u64 sum; @@ -430,15 +461,6 @@ err: return NULL; } -static int rdma_counter_bind_qp_manual(struct rdma_counter *counter, - struct ib_qp *qp) -{ - if ((counter->device != qp->device) || (counter->port != qp->port)) - return -EINVAL; - - return __rdma_counter_bind_qp(counter, qp); -} - static struct rdma_counter *rdma_get_counter_by_id(struct ib_device *dev, u32 counter_id) { @@ -456,10 +478,10 @@ static struct rdma_counter *rdma_get_counter_by_id(struct ib_device *dev, return counter; } -/** +/* * rdma_counter_bind_qpn() - Bind QP @qp_num to counter @counter_id */ -int rdma_counter_bind_qpn(struct ib_device *dev, u8 port, +int rdma_counter_bind_qpn(struct ib_device *dev, u32 port, u32 qp_num, u32 counter_id) { struct rdma_port_counter *port_counter; @@ -481,12 +503,17 @@ int rdma_counter_bind_qpn(struct ib_device *dev, u8 port, goto err; } - if (counter->res.task != qp->res.task) { + if (rdma_is_kernel_res(&counter->res) != rdma_is_kernel_res(&qp->res)) { + ret = -EINVAL; + goto err_task; + } + + if ((counter->device != qp->device) || (counter->port != qp->port)) { ret = -EINVAL; goto err_task; } - ret = rdma_counter_bind_qp_manual(counter, qp); + ret = __rdma_counter_bind_qp(counter, qp); if (ret) goto err_task; @@ -500,11 +527,11 @@ err: return ret; } -/** +/* * rdma_counter_bind_qpn_alloc() - Alloc a counter and bind QP @qp_num to it * The id of new counter is returned in @counter_id */ -int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u8 port, +int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u32 port, u32 qp_num, u32 *counter_id) { struct rdma_port_counter *port_counter; @@ -531,35 +558,27 @@ int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u8 port, goto err; } - counter = rdma_counter_alloc(dev, port, RDMA_COUNTER_MODE_MANUAL); + counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_MANUAL); if (!counter) { ret = -ENOMEM; goto err; } - ret = rdma_counter_bind_qp_manual(counter, qp); - if (ret) - goto err_bind; - if (counter_id) *counter_id = counter->id; - rdma_counter_res_add(counter, qp); - rdma_restrack_put(&qp->res); - return ret; + return 0; -err_bind: - rdma_counter_free(counter); err: rdma_restrack_put(&qp->res); return ret; } -/** +/* * rdma_counter_unbind_qpn() - Unbind QP @qp_num from a counter */ -int rdma_counter_unbind_qpn(struct ib_device *dev, u8 port, +int rdma_counter_unbind_qpn(struct ib_device *dev, u32 port, u32 qp_num, u32 counter_id) { struct rdma_port_counter *port_counter; @@ -592,7 +611,7 @@ out: return ret; } -int rdma_counter_get_mode(struct ib_device *dev, u8 port, +int rdma_counter_get_mode(struct ib_device *dev, u32 port, enum rdma_nl_counter_mode *mode, enum rdma_nl_counter_mask *mask) { @@ -618,10 +637,10 @@ void rdma_counter_init(struct ib_device *dev) port_counter->mode.mode = RDMA_COUNTER_MODE_NONE; mutex_init(&port_counter->lock); - if (!dev->ops.alloc_hw_stats) + if (!dev->ops.alloc_hw_port_stats) continue; - port_counter->hstats = dev->ops.alloc_hw_stats(dev, port); + port_counter->hstats = dev->ops.alloc_hw_port_stats(dev, port); if (!port_counter->hstats) goto fail; } 
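/*
 * A minimal sketch, not taken from the patch: the driver-side hooks that
 * rdma_counter_init() and alloc_and_bind() above rely on. The hypo_* names
 * are hypothetical; the descriptor-based rdma_alloc_hw_stats_struct()
 * signature is assumed to match this kernel generation.
 */
#include <rdma/ib_verbs.h>

static const struct rdma_stat_desc hypo_descs[] = {
	{ .name = "hypo_rx_packets" },
	{ .name = "hypo_tx_packets" },
};

static struct rdma_hw_stats *hypo_alloc_hw_port_stats(struct ib_device *ibdev,
						      u32 port_num)
{
	return rdma_alloc_hw_stats_struct(hypo_descs, ARRAY_SIZE(hypo_descs),
					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
}

static const struct ib_device_ops hypo_stat_ops = {
	.alloc_hw_port_stats = hypo_alloc_hw_port_stats,
	/* .counter_bind_qp, .counter_unbind_qp, .counter_alloc_stats, ... */
};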
@@ -631,7 +650,7 @@ void rdma_counter_init(struct ib_device *dev) fail: for (i = port; i >= rdma_start_port(dev); i--) { port_counter = &dev->port_data[port].port_counter; - kfree(port_counter->hstats); + rdma_free_hw_stats_struct(port_counter->hstats); port_counter->hstats = NULL; mutex_destroy(&port_counter->lock); } @@ -644,7 +663,7 @@ void rdma_counter_release(struct ib_device *dev) rdma_for_each_port(dev, port) { port_counter = &dev->port_data[port].port_counter; - kfree(port_counter->hstats); + rdma_free_hw_stats_struct(port_counter->hstats); mutex_destroy(&port_counter->lock); } } diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index 4f25b2400694..a70876a0a231 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -2,12 +2,15 @@ /* * Copyright (c) 2015 HGST, a Western Digital Company. */ -#include <linux/module.h> #include <linux/err.h> #include <linux/slab.h> #include <rdma/ib_verbs.h> +#include "core_priv.h" + #include <trace/events/rdma_core.h> +/* Max size for shared CQ, may require tuning */ +#define IB_MAX_SHARED_CQ_SZ 4096U /* # of WCs to poll for with a single call to ib_poll_cq */ #define IB_POLL_BATCH 16 @@ -68,6 +71,15 @@ static void rdma_dim_init(struct ib_cq *cq) INIT_WORK(&dim->work, ib_cq_rdma_dim_work); } +static void rdma_dim_destroy(struct ib_cq *cq) +{ + if (!cq->dim) + return; + + cancel_work_sync(&cq->dim->work); + kfree(cq->dim); +} + static int __poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc) { int rc; @@ -110,7 +122,7 @@ static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs, } /** - * ib_process_direct_cq - process a CQ in caller context + * ib_process_cq_direct - process a CQ in caller context * @cq: CQ to process * @budget: number of CQEs to poll for * @@ -184,24 +196,22 @@ static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) } /** - * __ib_alloc_cq_user - allocate a completion queue + * __ib_alloc_cq - allocate a completion queue * @dev: device to allocate the CQ for * @private: driver private data, accessible from cq->cq_context * @nr_cqe: number of CQEs to allocate * @comp_vector: HCA completion vectors for this CQ * @poll_ctx: context to poll the CQ from. * @caller: module owner name. - * @udata: Valid user data or NULL for kernel object * * This is the proper interface to allocate a CQ for in-kernel users. A * CQ allocated with this interface will automatically be polled from the * specified context. The ULP must use wr->wr_cqe instead of wr->wr_id * to use this CQ abstraction. 
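/*
 * A minimal sketch, not taken from the patch: the wr_cqe convention the
 * comment above refers to. The ULP embeds an ib_cqe in its own request and
 * recovers the request in the completion handler instead of stashing a
 * pointer in wr_id. 'struct hypo_req' and hypo_send_done() are hypothetical.
 */
#include <rdma/ib_verbs.h>

struct hypo_req {
	struct ib_cqe cqe;
	/* ULP-private state ... */
};

static void hypo_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct hypo_req *req = container_of(wc->wr_cqe, struct hypo_req, cqe);

	if (wc->status != IB_WC_SUCCESS)
		pr_debug("hypo req %p failed: %s\n", req,
			 ib_wc_status_msg(wc->status));
	/* ... complete 'req' ... */
}

/*
 * Posting side (for a send WR):
 *	req->cqe.done = hypo_send_done;
 *	wr.wr_cqe = &req->cqe;
 */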
*/ -struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private, - int nr_cqe, int comp_vector, - enum ib_poll_context poll_ctx, - const char *caller, struct ib_udata *udata) +struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, int nr_cqe, + int comp_vector, enum ib_poll_context poll_ctx, + const char *caller) { struct ib_cq_init_attr cq_attr = { .cqe = nr_cqe, @@ -218,20 +228,19 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private, cq->cq_context = private; cq->poll_ctx = poll_ctx; atomic_set(&cq->usecnt, 0); + cq->comp_vector = comp_vector; cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL); if (!cq->wc) goto out_free_cq; - cq->res.type = RDMA_RESTRACK_CQ; - rdma_restrack_set_task(&cq->res, caller); + rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); + rdma_restrack_set_name(&cq->res, caller); ret = dev->ops.create_cq(cq, &cq_attr, NULL); if (ret) goto out_free_wc; - rdma_restrack_kadd(&cq->res); - rdma_dim_init(cq); switch (cq->poll_ctx) { @@ -257,20 +266,22 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private, goto out_destroy_cq; } + rdma_restrack_add(&cq->res); trace_cq_alloc(cq, nr_cqe, comp_vector, poll_ctx); return cq; out_destroy_cq: - rdma_restrack_del(&cq->res); - cq->device->ops.destroy_cq(cq, udata); + rdma_dim_destroy(cq); + cq->device->ops.destroy_cq(cq, NULL); out_free_wc: + rdma_restrack_put(&cq->res); kfree(cq->wc); out_free_cq: kfree(cq); trace_cq_alloc_error(nr_cqe, comp_vector, poll_ctx, ret); return ERR_PTR(ret); } -EXPORT_SYMBOL(__ib_alloc_cq_user); +EXPORT_SYMBOL(__ib_alloc_cq); /** * __ib_alloc_cq_any - allocate a completion queue @@ -295,20 +306,23 @@ struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private, atomic_inc_return(&counter) % min_t(int, dev->num_comp_vectors, num_online_cpus()); - return __ib_alloc_cq_user(dev, private, nr_cqe, comp_vector, poll_ctx, - caller, NULL); + return __ib_alloc_cq(dev, private, nr_cqe, comp_vector, poll_ctx, + caller); } EXPORT_SYMBOL(__ib_alloc_cq_any); /** - * ib_free_cq_user - free a completion queue + * ib_free_cq - free a completion queue * @cq: completion queue to free. 
- * @udata: User data or NULL for kernel object */ -void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata) +void ib_free_cq(struct ib_cq *cq) { + int ret; + if (WARN_ON_ONCE(atomic_read(&cq->usecnt))) return; + if (WARN_ON_ONCE(cq->cqe_used)) + return; switch (cq->poll_ctx) { case IB_POLL_DIRECT: @@ -324,13 +338,170 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata) WARN_ON_ONCE(1); } + rdma_dim_destroy(cq); trace_cq_free(cq); + ret = cq->device->ops.destroy_cq(cq, NULL); + WARN_ONCE(ret, "Destroy of kernel CQ shouldn't fail"); rdma_restrack_del(&cq->res); - cq->device->ops.destroy_cq(cq, udata); - if (cq->dim) - cancel_work_sync(&cq->dim->work); - kfree(cq->dim); kfree(cq->wc); kfree(cq); } -EXPORT_SYMBOL(ib_free_cq_user); +EXPORT_SYMBOL(ib_free_cq); + +void ib_cq_pool_cleanup(struct ib_device *dev) +{ + struct ib_cq *cq, *n; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++) { + list_for_each_entry_safe(cq, n, &dev->cq_pools[i], + pool_entry) { + WARN_ON(cq->cqe_used); + list_del(&cq->pool_entry); + cq->shared = false; + ib_free_cq(cq); + } + } +} + +static int ib_alloc_cqs(struct ib_device *dev, unsigned int nr_cqes, + enum ib_poll_context poll_ctx) +{ + LIST_HEAD(tmp_list); + unsigned int nr_cqs, i; + struct ib_cq *cq, *n; + int ret; + + if (poll_ctx > IB_POLL_LAST_POOL_TYPE) { + WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE); + return -EINVAL; + } + + /* + * Allocate at least as many CQEs as requested, and otherwise + * a reasonable batch size so that we can share CQs between + * multiple users instead of allocating a larger number of CQs. + */ + nr_cqes = min_t(unsigned int, dev->attrs.max_cqe, + max(nr_cqes, IB_MAX_SHARED_CQ_SZ)); + nr_cqs = min_t(unsigned int, dev->num_comp_vectors, num_online_cpus()); + for (i = 0; i < nr_cqs; i++) { + cq = ib_alloc_cq(dev, NULL, nr_cqes, i, poll_ctx); + if (IS_ERR(cq)) { + ret = PTR_ERR(cq); + goto out_free_cqs; + } + cq->shared = true; + list_add_tail(&cq->pool_entry, &tmp_list); + } + + spin_lock_irq(&dev->cq_pools_lock); + list_splice(&tmp_list, &dev->cq_pools[poll_ctx]); + spin_unlock_irq(&dev->cq_pools_lock); + + return 0; + +out_free_cqs: + list_for_each_entry_safe(cq, n, &tmp_list, pool_entry) { + cq->shared = false; + ib_free_cq(cq); + } + return ret; +} + +/** + * ib_cq_pool_get() - Find the least used completion queue that matches + * a given cpu hint (or least used for wild card affinity) and fits + * nr_cqe. + * @dev: rdma device + * @nr_cqe: number of needed cqe entries + * @comp_vector_hint: completion vector hint (-1) for the driver to assign + * a comp vector based on internal counter + * @poll_ctx: cq polling context + * + * Finds a cq that satisfies @comp_vector_hint and @nr_cqe requirements and + * claim entries in it for us. In case there is no available cq, allocate + * a new cq with the requirements and add it to the device pool. + * IB_POLL_DIRECT cannot be used for shared cqs so it is not a valid value + * for @poll_ctx. 
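/*
 * A minimal sketch, not taken from the patch: typical ULP use of the shared
 * CQ pool introduced above - take a CQ sized for one queue, remember the
 * size, and return exactly that many entries on teardown. 'struct
 * hypo_queue' is hypothetical.
 */
#include <rdma/ib_verbs.h>

struct hypo_queue {
	struct ib_cq *cq;
	unsigned int cq_size;
};

static int hypo_queue_init(struct hypo_queue *q, struct ib_device *dev,
			   unsigned int depth)
{
	/* -1: let the core pick a completion vector round-robin */
	q->cq = ib_cq_pool_get(dev, depth, -1, IB_POLL_SOFTIRQ);
	if (IS_ERR(q->cq))
		return PTR_ERR(q->cq);
	q->cq_size = depth;
	return 0;
}

static void hypo_queue_destroy(struct hypo_queue *q)
{
	ib_cq_pool_put(q->cq, q->cq_size);
}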
+ */ +struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe, + int comp_vector_hint, + enum ib_poll_context poll_ctx) +{ + static unsigned int default_comp_vector; + unsigned int vector, num_comp_vectors; + struct ib_cq *cq, *found = NULL; + int ret; + + if (poll_ctx > IB_POLL_LAST_POOL_TYPE) { + WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE); + return ERR_PTR(-EINVAL); + } + + num_comp_vectors = + min_t(unsigned int, dev->num_comp_vectors, num_online_cpus()); + /* Project the affinty to the device completion vector range */ + if (comp_vector_hint < 0) { + comp_vector_hint = + (READ_ONCE(default_comp_vector) + 1) % num_comp_vectors; + WRITE_ONCE(default_comp_vector, comp_vector_hint); + } + vector = comp_vector_hint % num_comp_vectors; + + /* + * Find the least used CQ with correct affinity and + * enough free CQ entries + */ + while (!found) { + spin_lock_irq(&dev->cq_pools_lock); + list_for_each_entry(cq, &dev->cq_pools[poll_ctx], + pool_entry) { + /* + * Check to see if we have found a CQ with the + * correct completion vector + */ + if (vector != cq->comp_vector) + continue; + if (cq->cqe_used + nr_cqe > cq->cqe) + continue; + found = cq; + break; + } + + if (found) { + found->cqe_used += nr_cqe; + spin_unlock_irq(&dev->cq_pools_lock); + + return found; + } + spin_unlock_irq(&dev->cq_pools_lock); + + /* + * Didn't find a match or ran out of CQs in the device + * pool, allocate a new array of CQs. + */ + ret = ib_alloc_cqs(dev, nr_cqe, poll_ctx); + if (ret) + return ERR_PTR(ret); + } + + return found; +} +EXPORT_SYMBOL(ib_cq_pool_get); + +/** + * ib_cq_pool_put - Return a CQ taken from a shared pool. + * @cq: The CQ to return. + * @nr_cqe: The max number of cqes that the user had requested. + */ +void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe) +{ + if (WARN_ON_ONCE(nr_cqe > cq->cqe_used)) + return; + + spin_lock_irq(&cq->device->cq_pools_lock); + cq->cqe_used -= nr_cqe; + spin_unlock_irq(&cq->device->cq_pools_lock); +} +EXPORT_SYMBOL(ib_cq_pool_put); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index f6c255202d7f..b69e2c4e4d2a 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -58,6 +58,7 @@ struct workqueue_struct *ib_comp_wq; struct workqueue_struct *ib_comp_unbound_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); +static struct workqueue_struct *ib_unreg_wq; /* * Each of the three rwsem locks (devices, clients, client_data) protects the @@ -272,7 +273,6 @@ static void ib_device_check_mandatory(struct ib_device *device) } mandatory_table[] = { IB_MANDATORY_FUNC(query_device), IB_MANDATORY_FUNC(query_port), - IB_MANDATORY_FUNC(query_pkey), IB_MANDATORY_FUNC(alloc_pd), IB_MANDATORY_FUNC(dealloc_pd), IB_MANDATORY_FUNC(create_qp), @@ -285,6 +285,7 @@ static void ib_device_check_mandatory(struct ib_device *device) IB_MANDATORY_FUNC(poll_cq), IB_MANDATORY_FUNC(req_notify_cq), IB_MANDATORY_FUNC(get_dma_mr), + IB_MANDATORY_FUNC(reg_user_mr), IB_MANDATORY_FUNC(dereg_mr), IB_MANDATORY_FUNC(get_port_immutable) }; @@ -421,7 +422,7 @@ int ib_device_rename(struct ib_device *ibdev, const char *name) return ret; } - strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX); + strscpy(ibdev->name, name, IB_DEVICE_NAME_MAX); ret = rename_compat_devs(ibdev); downgrade_write(&devices_rwsem); @@ -491,6 +492,8 @@ static void ib_device_release(struct device *device) free_netdevs(dev); WARN_ON(refcount_read(&dev->refcount)); + if (dev->hw_stats_data) + ib_device_release_hw_stats(dev->hw_stats_data); if 
(dev->port_data) { ib_cache_release_one(dev); ib_security_release_port_pkey_list(dev); @@ -570,6 +573,7 @@ static void rdma_init_coredev(struct ib_core_device *coredev, struct ib_device *_ib_alloc_device(size_t size) { struct ib_device *device; + unsigned int i; if (WARN_ON(size < sizeof(struct ib_device))) return NULL; @@ -583,7 +587,6 @@ struct ib_device *_ib_alloc_device(size_t size) return NULL; } - device->groups[0] = &ib_dev_attr_group; rdma_init_coredev(&device->coredev, device, &init_net); INIT_LIST_HEAD(&device->event_handler_list); @@ -601,6 +604,43 @@ struct ib_device *_ib_alloc_device(size_t size) init_completion(&device->unreg_completion); INIT_WORK(&device->unregistration_work, ib_unregister_work); + spin_lock_init(&device->cq_pools_lock); + for (i = 0; i < ARRAY_SIZE(device->cq_pools); i++) + INIT_LIST_HEAD(&device->cq_pools[i]); + + rwlock_init(&device->cache_lock); + + device->uverbs_cmd_mask = + BIT_ULL(IB_USER_VERBS_CMD_ALLOC_MW) | + BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) | + BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST) | + BIT_ULL(IB_USER_VERBS_CMD_CLOSE_XRCD) | + BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH) | + BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ) | + BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP) | + BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ) | + BIT_ULL(IB_USER_VERBS_CMD_CREATE_XSRQ) | + BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_MW) | + BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD) | + BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR) | + BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH) | + BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ) | + BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP) | + BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ) | + BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST) | + BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) | + BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP) | + BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ) | + BIT_ULL(IB_USER_VERBS_CMD_OPEN_QP) | + BIT_ULL(IB_USER_VERBS_CMD_OPEN_XRCD) | + BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) | + BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT) | + BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP) | + BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ) | + BIT_ULL(IB_USER_VERBS_CMD_REG_MR) | + BIT_ULL(IB_USER_VERBS_CMD_REREG_MR) | + BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ); return device; } EXPORT_SYMBOL(_ib_alloc_device); @@ -677,8 +717,20 @@ static int add_client_context(struct ib_device *device, if (ret) goto out; downgrade_write(&device->client_data_rwsem); - if (client->add) - client->add(device); + if (client->add) { + if (client->add(device)) { + /* + * If a client fails to add then the error code is + * ignored, but we won't call any more ops on this + * client. + */ + xa_erase(&device->client_data, client->client_id); + up_read(&device->client_data_rwsem); + ib_device_put(device); + ib_client_put(client); + return 0; + } + } /* Readers shall not see a client until add has been completed */ xa_set_mark(&device->client_data, client->client_id, @@ -731,7 +783,7 @@ static void remove_client_context(struct ib_device *device, static int alloc_port_data(struct ib_device *device) { struct ib_port_data_rcu *pdata_rcu; - unsigned int port; + u32 port; if (device->port_data) return 0; @@ -740,6 +792,10 @@ static int alloc_port_data(struct ib_device *device) if (WARN_ON(!device->phys_port_cnt)) return -EINVAL; + /* Reserve U32_MAX so the logic to go over all the ports is sane */ + if (WARN_ON(device->phys_port_cnt == U32_MAX)) + return -EINVAL; + /* * device->port_data is indexed directly by the port number to make * access to this data as efficient as possible. 
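/*
 * A minimal sketch, not taken from the patch: with the change above a
 * client's add() callback returns an int, and a failure simply means the
 * core never calls further ops for that device. The 'hypo' names are
 * hypothetical.
 */
#include <linux/slab.h>
#include <rdma/ib_verbs.h>

struct hypo_data { int dummy; };

static struct ib_client hypo_client;

static int hypo_add_one(struct ib_device *device)
{
	struct hypo_data *data = kzalloc(sizeof(*data), GFP_KERNEL);

	if (!data)
		return -ENOMEM;	/* value is ignored, but this device is skipped */
	ib_set_client_data(device, &hypo_client, data);
	return 0;
}

static void hypo_remove_one(struct ib_device *device, void *client_data)
{
	kfree(client_data);
}

static struct ib_client hypo_client = {
	.name = "hypo",
	.add = hypo_add_one,
	.remove = hypo_remove_one,
};
/* registered once with ib_register_client(&hypo_client) at module init */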
@@ -771,7 +827,7 @@ static int alloc_port_data(struct ib_device *device) return 0; } -static int verify_immutable(const struct ib_device *dev, u8 port) +static int verify_immutable(const struct ib_device *dev, u32 port) { return WARN_ON(!rdma_cap_ib_mad(dev, port) && rdma_max_mad_size(dev, port) != 0); @@ -779,7 +835,7 @@ static int verify_immutable(const struct ib_device *dev, u8 port) static int setup_port_data(struct ib_device *device) { - unsigned int port; + u32 port; int ret; ret = alloc_port_data(device); @@ -800,6 +856,20 @@ static int setup_port_data(struct ib_device *device) return 0; } +/** + * ib_port_immutable_read() - Read rdma port's immutable data + * @dev: IB device + * @port: port number whose immutable data to read. It starts with index 1 and + * valid upto including rdma_end_port(). + */ +const struct ib_port_immutable* +ib_port_immutable_read(struct ib_device *dev, unsigned int port) +{ + WARN_ON(!rdma_is_port_valid(dev, port)); + return &dev->port_data[port].immutable; +} +EXPORT_SYMBOL(ib_port_immutable_read); + void ib_get_device_fw_str(struct ib_device *dev, char *str) { if (dev->ops.get_dev_fw_str) @@ -820,15 +890,8 @@ static void ib_policy_change_task(struct work_struct *work) rdma_for_each_port (dev, i) { u64 sp; - int ret = ib_get_cached_subnet_prefix(dev, - i, - &sp); - - WARN_ONCE(ret, - "ib_get_cached_subnet_prefix err: %d, this should never happen here\n", - ret); - if (!ret) - ib_security_cache_change(dev, i, sp); + ib_get_cached_subnet_prefix(dev, i, &sp); + ib_security_cache_change(dev, i, sp); } } up_read(&devices_rwsem); @@ -896,7 +959,9 @@ static int add_one_compat_dev(struct ib_device *device, cdev->dev.parent = device->dev.parent; rdma_init_coredev(cdev, device, read_pnet(&rnet->net)); cdev->dev.release = compatdev_release; - dev_set_name(&cdev->dev, "%s", dev_name(&device->dev)); + ret = dev_set_name(&cdev->dev, "%s", dev_name(&device->dev)); + if (ret) + goto add_err; ret = device_add(&cdev->dev); if (ret) @@ -1152,7 +1217,7 @@ static int assign_name(struct ib_device *device, const char *name) ret = -ENFILE; goto out; } - strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); + strscpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b, &last_id, GFP_KERNEL); @@ -1164,56 +1229,6 @@ out: return ret; } -static void setup_dma_device(struct ib_device *device) -{ - struct device *parent = device->dev.parent; - - WARN_ON_ONCE(device->dma_device); - if (device->dev.dma_ops) { - /* - * The caller provided custom DMA operations. Copy the - * DMA-related fields that are used by e.g. dma_alloc_coherent() - * into device->dev. - */ - device->dma_device = &device->dev; - if (!device->dev.dma_mask) { - if (parent) - device->dev.dma_mask = parent->dma_mask; - else - WARN_ON_ONCE(true); - } - if (!device->dev.coherent_dma_mask) { - if (parent) - device->dev.coherent_dma_mask = - parent->coherent_dma_mask; - else - WARN_ON_ONCE(true); - } - } else { - /* - * The caller did not provide custom DMA operations. Use the - * DMA mapping operations of the parent device. - */ - WARN_ON_ONCE(!parent); - device->dma_device = parent; - } - - if (!device->dev.dma_parms) { - if (parent) { - /* - * The caller did not provide DMA parameters, so - * 'parent' probably represents a PCI device. The PCI - * core sets the maximum segment size to 64 - * KB. Increase this parameter to 2 GB. 
- */ - device->dev.dma_parms = parent->dma_parms; - dma_set_max_seg_size(device->dma_device, SZ_2G); - } else { - WARN_ON_ONCE(true); - } - } -} - /* * setup_device() allocates memory and sets up data that requires calling the * device ops, this is the only reason these actions are not done during @@ -1224,7 +1239,6 @@ static int setup_device(struct ib_device *device) struct ib_udata uhw = {.outlen = 0, .inlen = 0}; int ret; - setup_dma_device(device); ib_device_check_mandatory(device); ret = setup_port_data(device); @@ -1268,6 +1282,8 @@ static void disable_device(struct ib_device *device) remove_client_context(device, cid); } + ib_cq_pool_cleanup(device); + /* Pairs with refcount_set in enable_device */ ib_device_put(device); wait_for_completion(&device->unreg_completion); @@ -1325,11 +1341,18 @@ out: return ret; } +static void prevent_dealloc_device(struct ib_device *ib_dev) +{ +} + /** * ib_register_device - Register an IB device with IB core * @device: Device to register * @name: unique string device name. This may include a '%' which will - * cause a unique index to be added to the passed device name. + * cause a unique index to be added to the passed device name. + * @dma_device: pointer to a DMA-capable device. If %NULL, then the IB + * device will be used. In this case the caller should fully + * setup the ibdev for DMA. This usually means using dma_virt_ops. * * Low-level drivers use ib_register_device() to register their * devices with the IB core. All registered clients will receive a @@ -1340,7 +1363,8 @@ out: * asynchronously then the device pointer may become freed as soon as this * function returns. */ -int ib_register_device(struct ib_device *device, const char *name) +int ib_register_device(struct ib_device *device, const char *name, + struct device *dma_device) { int ret; @@ -1348,6 +1372,14 @@ int ib_register_device(struct ib_device *device, const char *name) if (ret) return ret; + /* + * If the caller does not provide a DMA capable device then the IB core + * will set up ib_sge and scatterlist structures that stash the kernel + * virtual address into the address field. + */ + WARN_ON(dma_device && !dma_device->dma_parms); + device->dma_device = dma_device; + ret = setup_device(device); if (ret) return ret; @@ -1359,6 +1391,12 @@ int ib_register_device(struct ib_device *device, const char *name) return ret; } + device->groups[0] = &ib_dev_attr_group; + device->groups[1] = device->ops.device_group; + ret = ib_setup_device_attrs(device); + if (ret) + goto cache_cleanup; + ib_device_register_rdmacg(device); rdma_counter_init(device); @@ -1372,7 +1410,7 @@ int ib_register_device(struct ib_device *device, const char *name) if (ret) goto cg_cleanup; - ret = ib_device_register_sysfs(device); + ret = ib_setup_port_attrs(&device->coredev); if (ret) { dev_warn(&device->dev, "Couldn't register device with driver model\n"); @@ -1380,9 +1418,6 @@ int ib_register_device(struct ib_device *device, const char *name) } ret = enable_device_and_get(device); - dev_set_uevent_suppress(&device->dev, false); - /* Mark for userspace that device is ready */ - kobject_uevent(&device->dev.kobj, KOBJ_ADD); if (ret) { void (*dealloc_fn)(struct ib_device *); @@ -1394,16 +1429,20 @@ int ib_register_device(struct ib_device *device, const char *name) * possibility for a parallel unregistration along with this * error flow. 
Since we have a refcount here we know any * parallel flow is stopped in disable_device and will see the - * NULL pointers, causing the responsibility to + * special dealloc_driver pointer, causing the responsibility to * ib_dealloc_device() to revert back to this thread. */ dealloc_fn = device->ops.dealloc_driver; - device->ops.dealloc_driver = NULL; + device->ops.dealloc_driver = prevent_dealloc_device; ib_device_put(device); __ib_unregister_device(device); device->ops.dealloc_driver = dealloc_fn; + dev_set_uevent_suppress(&device->dev, false); return ret; } + dev_set_uevent_suppress(&device->dev, false); + /* Mark for userspace that device is ready */ + kobject_uevent(&device->dev.kobj, KOBJ_ADD); ib_device_put(device); return 0; @@ -1413,6 +1452,7 @@ dev_cleanup: cg_cleanup: dev_set_uevent_suppress(&device->dev, false); ib_device_unregister_rdmacg(device); +cache_cleanup: ib_cache_cleanup_one(device); return ret; } @@ -1437,7 +1477,7 @@ static void __ib_unregister_device(struct ib_device *ib_dev) /* Expedite removing unregistered pointers from the hash table */ free_netdevs(ib_dev); - ib_device_unregister_sysfs(ib_dev); + ib_free_port_attrs(&ib_dev->coredev); device_del(&ib_dev->dev); ib_device_unregister_rdmacg(ib_dev); ib_cache_cleanup_one(ib_dev); @@ -1446,7 +1486,8 @@ static void __ib_unregister_device(struct ib_device *ib_dev) * Drivers using the new flow may not call ib_dealloc_device except * in error unwind prior to registration success. */ - if (ib_dev->ops.dealloc_driver) { + if (ib_dev->ops.dealloc_driver && + ib_dev->ops.dealloc_driver != prevent_dealloc_device) { WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1); ib_dealloc_device(ib_dev); } @@ -1562,7 +1603,7 @@ void ib_unregister_device_queued(struct ib_device *ib_dev) WARN_ON(!refcount_read(&ib_dev->refcount)); WARN_ON(!ib_dev->ops.dealloc_driver); get_device(&ib_dev->dev); - if (!queue_work(system_unbound_wq, &ib_dev->unregistration_work)) + if (!queue_work(ib_unreg_wq, &ib_dev->unregistration_work)) put_device(&ib_dev->dev); } EXPORT_SYMBOL(ib_unregister_device_queued); @@ -1654,13 +1695,11 @@ int ib_device_set_netns_put(struct sk_buff *skb, } /* - * Currently supported only for those providers which support - * disassociation and don't do port specific sysfs init. Once a - * port_cleanup infrastructure is implemented, this limitation will be - * removed. + * All the ib_clients, including uverbs, are reset when the namespace is + * changed and this cannot be blocked waiting for userspace to do + * something, so disassociation is mandatory. 
*/ - if (!dev->ops.disassociate_ucontext || dev->ops.init_port || - ib_devices_shared_netns) { + if (!dev->ops.disassociate_ucontext || ib_devices_shared_netns) { ret = -EOPNOTSUPP; goto ns_err; } @@ -1868,9 +1907,9 @@ static int __ib_get_client_nl_info(struct ib_device *ibdev, /** * ib_get_client_nl_info - Fetch the nl_info from a client - * @device - IB device - * @client_name - Name of the client - * @res - Result of the query + * @ibdev: IB device + * @client_name: Name of the client + * @res: Result of the query */ int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, struct ib_client_nl_info *res) @@ -1972,7 +2011,7 @@ void ib_dispatch_event_clients(struct ib_event *event) } static int iw_query_port(struct ib_device *device, - u8 port_num, + u32 port_num, struct ib_port_attr *port_attr) { struct in_device *inetdev; @@ -2011,10 +2050,9 @@ static int iw_query_port(struct ib_device *device, } static int __ib_query_port(struct ib_device *device, - u8 port_num, + u32 port_num, struct ib_port_attr *port_attr) { - union ib_gid gid = {}; int err; memset(port_attr, 0, sizeof(*port_attr)); @@ -2027,11 +2065,8 @@ static int __ib_query_port(struct ib_device *device, IB_LINK_LAYER_INFINIBAND) return 0; - err = device->ops.query_gid(device, port_num, 0, &gid); - if (err) - return err; - - port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix); + ib_get_cached_subnet_prefix(device, port_num, + &port_attr->subnet_prefix); return 0; } @@ -2045,7 +2080,7 @@ static int __ib_query_port(struct ib_device *device, * @port_attr pointer. */ int ib_query_port(struct ib_device *device, - u8 port_num, + u32 port_num, struct ib_port_attr *port_attr) { if (!rdma_is_port_valid(device, port_num)) @@ -2097,7 +2132,7 @@ static void add_ndev_hash(struct ib_port_data *pdata) * NETDEV_UNREGISTER event. */ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, - unsigned int port) + u32 port) { struct net_device *old_ndev; struct ib_port_data *pdata; @@ -2140,7 +2175,7 @@ EXPORT_SYMBOL(ib_device_set_netdev); static void free_netdevs(struct ib_device *ib_dev) { unsigned long flags; - unsigned int port; + u32 port; if (!ib_dev->port_data) return; @@ -2171,7 +2206,7 @@ static void free_netdevs(struct ib_device *ib_dev) } struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, - unsigned int port) + u32 port) { struct ib_port_data *pdata; struct net_device *res; @@ -2258,7 +2293,7 @@ void ib_enum_roce_netdev(struct ib_device *ib_dev, roce_netdev_callback cb, void *cookie) { - unsigned int port; + u32 port; rdma_for_each_port (ib_dev, port) if (rdma_protocol_roce(ib_dev, port)) { @@ -2298,7 +2333,7 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, up_read(&devices_rwsem); } -/** +/* * ib_enum_all_devs - enumerate all ib_devices * @cb: Callback to call for each found ib_device * @@ -2336,11 +2371,14 @@ int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, * ib_query_pkey() fetches the specified P_Key table entry. */ int ib_query_pkey(struct ib_device *device, - u8 port_num, u16 index, u16 *pkey) + u32 port_num, u16 index, u16 *pkey) { if (!rdma_is_port_valid(device, port_num)) return -EINVAL; + if (!device->ops.query_pkey) + return -EOPNOTSUPP; + return device->ops.query_pkey(device, port_num, index, pkey); } EXPORT_SYMBOL(ib_query_pkey); @@ -2378,7 +2416,7 @@ EXPORT_SYMBOL(ib_modify_device); * @port_modify_mask and @port_modify structure. 
*/ int ib_modify_port(struct ib_device *device, - u8 port_num, int port_modify_mask, + u32 port_num, int port_modify_mask, struct ib_port_modify *port_modify) { int rc; @@ -2410,10 +2448,10 @@ EXPORT_SYMBOL(ib_modify_port); * parameter may be NULL. */ int ib_find_gid(struct ib_device *device, union ib_gid *gid, - u8 *port_num, u16 *index) + u32 *port_num, u16 *index) { union ib_gid tmp_gid; - unsigned int port; + u32 port; int ret, i; rdma_for_each_port (device, port) { @@ -2424,7 +2462,8 @@ int ib_find_gid(struct ib_device *device, union ib_gid *gid, ++i) { ret = rdma_query_gid(device, port, i, &tmp_gid); if (ret) - return ret; + continue; + if (!memcmp(&tmp_gid, gid, sizeof *gid)) { *port_num = port; if (index) @@ -2447,7 +2486,7 @@ EXPORT_SYMBOL(ib_find_gid); * @index: The index into the PKey table where the PKey was found. */ int ib_find_pkey(struct ib_device *device, - u8 port_num, u16 pkey, u16 *index) + u32 port_num, u16 pkey, u16 *index) { int ret, i; u16 tmp_pkey; @@ -2490,7 +2529,7 @@ EXPORT_SYMBOL(ib_find_pkey); * */ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, - u8 port, + u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr) @@ -2555,8 +2594,8 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, add_gid); SET_DEVICE_OP(dev_ops, advise_mr); SET_DEVICE_OP(dev_ops, alloc_dm); - SET_DEVICE_OP(dev_ops, alloc_fmr); - SET_DEVICE_OP(dev_ops, alloc_hw_stats); + SET_DEVICE_OP(dev_ops, alloc_hw_device_stats); + SET_DEVICE_OP(dev_ops, alloc_hw_port_stats); SET_DEVICE_OP(dev_ops, alloc_mr); SET_DEVICE_OP(dev_ops, alloc_mr_integrity); SET_DEVICE_OP(dev_ops, alloc_mw); @@ -2575,14 +2614,13 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, create_counters); SET_DEVICE_OP(dev_ops, create_cq); SET_DEVICE_OP(dev_ops, create_flow); - SET_DEVICE_OP(dev_ops, create_flow_action_esp); SET_DEVICE_OP(dev_ops, create_qp); SET_DEVICE_OP(dev_ops, create_rwq_ind_table); SET_DEVICE_OP(dev_ops, create_srq); + SET_DEVICE_OP(dev_ops, create_user_ah); SET_DEVICE_OP(dev_ops, create_wq); SET_DEVICE_OP(dev_ops, dealloc_dm); SET_DEVICE_OP(dev_ops, dealloc_driver); - SET_DEVICE_OP(dev_ops, dealloc_fmr); SET_DEVICE_OP(dev_ops, dealloc_mw); SET_DEVICE_OP(dev_ops, dealloc_pd); SET_DEVICE_OP(dev_ops, dealloc_ucontext); @@ -2598,24 +2636,31 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table); SET_DEVICE_OP(dev_ops, destroy_srq); SET_DEVICE_OP(dev_ops, destroy_wq); + SET_DEVICE_OP(dev_ops, device_group); SET_DEVICE_OP(dev_ops, detach_mcast); SET_DEVICE_OP(dev_ops, disassociate_ucontext); SET_DEVICE_OP(dev_ops, drain_rq); SET_DEVICE_OP(dev_ops, drain_sq); SET_DEVICE_OP(dev_ops, enable_driver); - SET_DEVICE_OP(dev_ops, fill_res_entry); - SET_DEVICE_OP(dev_ops, fill_stat_entry); + SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry); + SET_DEVICE_OP(dev_ops, fill_res_cq_entry); + SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw); + SET_DEVICE_OP(dev_ops, fill_res_mr_entry); + SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw); + SET_DEVICE_OP(dev_ops, fill_res_qp_entry); + SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw); + SET_DEVICE_OP(dev_ops, fill_stat_mr_entry); SET_DEVICE_OP(dev_ops, get_dev_fw_str); SET_DEVICE_OP(dev_ops, get_dma_mr); SET_DEVICE_OP(dev_ops, get_hw_stats); SET_DEVICE_OP(dev_ops, get_link_layer); SET_DEVICE_OP(dev_ops, get_netdev); + SET_DEVICE_OP(dev_ops, get_numa_node); SET_DEVICE_OP(dev_ops, 
get_port_immutable); SET_DEVICE_OP(dev_ops, get_vector_affinity); SET_DEVICE_OP(dev_ops, get_vf_config); SET_DEVICE_OP(dev_ops, get_vf_guid); SET_DEVICE_OP(dev_ops, get_vf_stats); - SET_DEVICE_OP(dev_ops, init_port); SET_DEVICE_OP(dev_ops, iw_accept); SET_DEVICE_OP(dev_ops, iw_add_ref); SET_DEVICE_OP(dev_ops, iw_connect); @@ -2626,19 +2671,19 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, iw_rem_ref); SET_DEVICE_OP(dev_ops, map_mr_sg); SET_DEVICE_OP(dev_ops, map_mr_sg_pi); - SET_DEVICE_OP(dev_ops, map_phys_fmr); SET_DEVICE_OP(dev_ops, mmap); SET_DEVICE_OP(dev_ops, mmap_free); SET_DEVICE_OP(dev_ops, modify_ah); SET_DEVICE_OP(dev_ops, modify_cq); SET_DEVICE_OP(dev_ops, modify_device); - SET_DEVICE_OP(dev_ops, modify_flow_action_esp); + SET_DEVICE_OP(dev_ops, modify_hw_stat); SET_DEVICE_OP(dev_ops, modify_port); SET_DEVICE_OP(dev_ops, modify_qp); SET_DEVICE_OP(dev_ops, modify_srq); SET_DEVICE_OP(dev_ops, modify_wq); SET_DEVICE_OP(dev_ops, peek_cq); SET_DEVICE_OP(dev_ops, poll_cq); + SET_DEVICE_OP(dev_ops, port_groups); SET_DEVICE_OP(dev_ops, post_recv); SET_DEVICE_OP(dev_ops, post_send); SET_DEVICE_OP(dev_ops, post_srq_recv); @@ -2650,26 +2695,46 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, query_port); SET_DEVICE_OP(dev_ops, query_qp); SET_DEVICE_OP(dev_ops, query_srq); + SET_DEVICE_OP(dev_ops, query_ucontext); SET_DEVICE_OP(dev_ops, rdma_netdev_get_params); SET_DEVICE_OP(dev_ops, read_counters); SET_DEVICE_OP(dev_ops, reg_dm_mr); SET_DEVICE_OP(dev_ops, reg_user_mr); - SET_DEVICE_OP(dev_ops, req_ncomp_notif); + SET_DEVICE_OP(dev_ops, reg_user_mr_dmabuf); SET_DEVICE_OP(dev_ops, req_notify_cq); SET_DEVICE_OP(dev_ops, rereg_user_mr); SET_DEVICE_OP(dev_ops, resize_cq); SET_DEVICE_OP(dev_ops, set_vf_guid); SET_DEVICE_OP(dev_ops, set_vf_link_state); - SET_DEVICE_OP(dev_ops, unmap_fmr); SET_OBJ_SIZE(dev_ops, ib_ah); + SET_OBJ_SIZE(dev_ops, ib_counters); SET_OBJ_SIZE(dev_ops, ib_cq); + SET_OBJ_SIZE(dev_ops, ib_mw); SET_OBJ_SIZE(dev_ops, ib_pd); + SET_OBJ_SIZE(dev_ops, ib_qp); + SET_OBJ_SIZE(dev_ops, ib_rwq_ind_table); SET_OBJ_SIZE(dev_ops, ib_srq); SET_OBJ_SIZE(dev_ops, ib_ucontext); + SET_OBJ_SIZE(dev_ops, ib_xrcd); } EXPORT_SYMBOL(ib_set_device_ops); +#ifdef CONFIG_INFINIBAND_VIRT_DMA +int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents) +{ + struct scatterlist *s; + int i; + + for_each_sg(sg, s, nents, i) { + sg_dma_address(s) = (uintptr_t)sg_virt(s); + sg_dma_len(s) = s->length; + } + return nents; +} +EXPORT_SYMBOL(ib_dma_virt_map_sg); +#endif /* CONFIG_INFINIBAND_VIRT_DMA */ + static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { [RDMA_NL_LS_OP_RESOLVE] = { .doit = ib_nl_handle_resolve_resp, @@ -2687,27 +2752,28 @@ static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { static int __init ib_core_init(void) { - int ret; + int ret = -ENOMEM; ib_wq = alloc_workqueue("infiniband", 0, 0); if (!ib_wq) return -ENOMEM; + ib_unreg_wq = alloc_workqueue("ib-unreg-wq", WQ_UNBOUND, + WQ_UNBOUND_MAX_ACTIVE); + if (!ib_unreg_wq) + goto err; + ib_comp_wq = alloc_workqueue("ib-comp-wq", WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0); - if (!ib_comp_wq) { - ret = -ENOMEM; - goto err; - } + if (!ib_comp_wq) + goto err_unbound; ib_comp_unbound_wq = alloc_workqueue("ib-comp-unb-wq", WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); - if (!ib_comp_unbound_wq) { - ret = -ENOMEM; + if (!ib_comp_unbound_wq) goto 
err_comp; - } ret = class_register(&ib_class); if (ret) { @@ -2719,7 +2785,7 @@ static int __init ib_core_init(void) ret = addr_init(); if (ret) { - pr_warn("Could't init IB address resolution\n"); + pr_warn("Couldn't init IB address resolution\n"); goto err_ibnl; } @@ -2749,10 +2815,18 @@ static int __init ib_core_init(void) nldev_init(); rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); - roce_gid_mgmt_init(); + ret = roce_gid_mgmt_init(); + if (ret) { + pr_warn("Couldn't init RoCE GID management\n"); + goto err_parent; + } return 0; +err_parent: + rdma_nl_unregister(RDMA_NL_LS); + nldev_exit(); + unregister_pernet_device(&rdma_dev_net_ops); err_compat: unregister_blocking_lsm_notifier(&ibdev_lsm_nb); err_sa: @@ -2767,6 +2841,8 @@ err_comp_unbound: destroy_workqueue(ib_comp_unbound_wq); err_comp: destroy_workqueue(ib_comp_wq); +err_unbound: + destroy_workqueue(ib_unreg_wq); err: destroy_workqueue(ib_wq); return ret; @@ -2788,7 +2864,7 @@ static void __exit ib_core_cleanup(void) destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. */ destroy_workqueue(ib_wq); - flush_workqueue(system_unbound_wq); + destroy_workqueue(ib_unreg_wq); WARN_ON(!xa_empty(&clients)); WARN_ON(!xa_empty(&devices)); } diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c deleted file mode 100644 index e08aec427027..000000000000 --- a/drivers/infiniband/core/fmr_pool.c +++ /dev/null @@ -1,494 +0,0 @@ -/* - * Copyright (c) 2004 Topspin Communications. All rights reserved. - * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include <linux/errno.h> -#include <linux/spinlock.h> -#include <linux/export.h> -#include <linux/slab.h> -#include <linux/jhash.h> -#include <linux/kthread.h> - -#include <rdma/ib_fmr_pool.h> - -#include "core_priv.h" - -#define PFX "fmr_pool: " - -enum { - IB_FMR_MAX_REMAPS = 32, - - IB_FMR_HASH_BITS = 8, - IB_FMR_HASH_SIZE = 1 << IB_FMR_HASH_BITS, - IB_FMR_HASH_MASK = IB_FMR_HASH_SIZE - 1 -}; - -/* - * If an FMR is not in use, then the list member will point to either - * its pool's free_list (if the FMR can be mapped again; that is, - * remap_count < pool->max_remaps) or its pool's dirty_list (if the - * FMR needs to be unmapped before being remapped). In either of - * these cases it is a bug if the ref_count is not 0. In other words, - * if ref_count is > 0, then the list member must not be linked into - * either free_list or dirty_list. - * - * The cache_node member is used to link the FMR into a cache bucket - * (if caching is enabled). This is independent of the reference - * count of the FMR. When a valid FMR is released, its ref_count is - * decremented, and if ref_count reaches 0, the FMR is placed in - * either free_list or dirty_list as appropriate. However, it is not - * removed from the cache and may be "revived" if a call to - * ib_fmr_register_physical() occurs before the FMR is remapped. In - * this case we just increment the ref_count and remove the FMR from - * free_list/dirty_list. - * - * Before we remap an FMR from free_list, we remove it from the cache - * (to prevent another user from obtaining a stale FMR). When an FMR - * is released, we add it to the tail of the free list, so that our - * cache eviction policy is "least recently used." - * - * All manipulation of ref_count, list and cache_node is protected by - * pool_lock to maintain consistency. 
- */ - -struct ib_fmr_pool { - spinlock_t pool_lock; - - int pool_size; - int max_pages; - int max_remaps; - int dirty_watermark; - int dirty_len; - struct list_head free_list; - struct list_head dirty_list; - struct hlist_head *cache_bucket; - - void (*flush_function)(struct ib_fmr_pool *pool, - void * arg); - void *flush_arg; - - struct kthread_worker *worker; - struct kthread_work work; - - atomic_t req_ser; - atomic_t flush_ser; - - wait_queue_head_t force_wait; -}; - -static inline u32 ib_fmr_hash(u64 first_page) -{ - return jhash_2words((u32) first_page, (u32) (first_page >> 32), 0) & - (IB_FMR_HASH_SIZE - 1); -} - -/* Caller must hold pool_lock */ -static inline struct ib_pool_fmr *ib_fmr_cache_lookup(struct ib_fmr_pool *pool, - u64 *page_list, - int page_list_len, - u64 io_virtual_address) -{ - struct hlist_head *bucket; - struct ib_pool_fmr *fmr; - - if (!pool->cache_bucket) - return NULL; - - bucket = pool->cache_bucket + ib_fmr_hash(*page_list); - - hlist_for_each_entry(fmr, bucket, cache_node) - if (io_virtual_address == fmr->io_virtual_address && - page_list_len == fmr->page_list_len && - !memcmp(page_list, fmr->page_list, - page_list_len * sizeof *page_list)) - return fmr; - - return NULL; -} - -static void ib_fmr_batch_release(struct ib_fmr_pool *pool) -{ - int ret; - struct ib_pool_fmr *fmr; - LIST_HEAD(unmap_list); - LIST_HEAD(fmr_list); - - spin_lock_irq(&pool->pool_lock); - - list_for_each_entry(fmr, &pool->dirty_list, list) { - hlist_del_init(&fmr->cache_node); - fmr->remap_count = 0; - list_add_tail(&fmr->fmr->list, &fmr_list); - } - - list_splice_init(&pool->dirty_list, &unmap_list); - pool->dirty_len = 0; - - spin_unlock_irq(&pool->pool_lock); - - if (list_empty(&unmap_list)) { - return; - } - - ret = ib_unmap_fmr(&fmr_list); - if (ret) - pr_warn(PFX "ib_unmap_fmr returned %d\n", ret); - - spin_lock_irq(&pool->pool_lock); - list_splice(&unmap_list, &pool->free_list); - spin_unlock_irq(&pool->pool_lock); -} - -static void ib_fmr_cleanup_func(struct kthread_work *work) -{ - struct ib_fmr_pool *pool = container_of(work, struct ib_fmr_pool, work); - - ib_fmr_batch_release(pool); - atomic_inc(&pool->flush_ser); - wake_up_interruptible(&pool->force_wait); - - if (pool->flush_function) - pool->flush_function(pool, pool->flush_arg); - - if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) < 0) - kthread_queue_work(pool->worker, &pool->work); -} - -/** - * ib_create_fmr_pool - Create an FMR pool - * @pd:Protection domain for FMRs - * @params:FMR pool parameters - * - * Create a pool of FMRs. Return value is pointer to new pool or - * error code if creation failed. 
- */ -struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, - struct ib_fmr_pool_param *params) -{ - struct ib_device *device; - struct ib_fmr_pool *pool; - int i; - int ret; - int max_remaps; - - if (!params) - return ERR_PTR(-EINVAL); - - device = pd->device; - if (!device->ops.alloc_fmr || !device->ops.dealloc_fmr || - !device->ops.map_phys_fmr || !device->ops.unmap_fmr) { - dev_info(&device->dev, "Device does not support FMRs\n"); - return ERR_PTR(-ENOSYS); - } - - if (!device->attrs.max_map_per_fmr) - max_remaps = IB_FMR_MAX_REMAPS; - else - max_remaps = device->attrs.max_map_per_fmr; - - pool = kmalloc(sizeof *pool, GFP_KERNEL); - if (!pool) - return ERR_PTR(-ENOMEM); - - pool->cache_bucket = NULL; - pool->flush_function = params->flush_function; - pool->flush_arg = params->flush_arg; - - INIT_LIST_HEAD(&pool->free_list); - INIT_LIST_HEAD(&pool->dirty_list); - - if (params->cache) { - pool->cache_bucket = - kmalloc_array(IB_FMR_HASH_SIZE, - sizeof(*pool->cache_bucket), - GFP_KERNEL); - if (!pool->cache_bucket) { - ret = -ENOMEM; - goto out_free_pool; - } - - for (i = 0; i < IB_FMR_HASH_SIZE; ++i) - INIT_HLIST_HEAD(pool->cache_bucket + i); - } - - pool->pool_size = 0; - pool->max_pages = params->max_pages_per_fmr; - pool->max_remaps = max_remaps; - pool->dirty_watermark = params->dirty_watermark; - pool->dirty_len = 0; - spin_lock_init(&pool->pool_lock); - atomic_set(&pool->req_ser, 0); - atomic_set(&pool->flush_ser, 0); - init_waitqueue_head(&pool->force_wait); - - pool->worker = - kthread_create_worker(0, "ib_fmr(%s)", dev_name(&device->dev)); - if (IS_ERR(pool->worker)) { - pr_warn(PFX "couldn't start cleanup kthread worker\n"); - ret = PTR_ERR(pool->worker); - goto out_free_pool; - } - kthread_init_work(&pool->work, ib_fmr_cleanup_func); - - { - struct ib_pool_fmr *fmr; - struct ib_fmr_attr fmr_attr = { - .max_pages = params->max_pages_per_fmr, - .max_maps = pool->max_remaps, - .page_shift = params->page_shift - }; - int bytes_per_fmr = sizeof *fmr; - - if (pool->cache_bucket) - bytes_per_fmr += params->max_pages_per_fmr * sizeof (u64); - - for (i = 0; i < params->pool_size; ++i) { - fmr = kmalloc(bytes_per_fmr, GFP_KERNEL); - if (!fmr) - goto out_fail; - - fmr->pool = pool; - fmr->remap_count = 0; - fmr->ref_count = 0; - INIT_HLIST_NODE(&fmr->cache_node); - - fmr->fmr = ib_alloc_fmr(pd, params->access, &fmr_attr); - if (IS_ERR(fmr->fmr)) { - pr_warn(PFX "fmr_create failed for FMR %d\n", - i); - kfree(fmr); - goto out_fail; - } - - list_add_tail(&fmr->list, &pool->free_list); - ++pool->pool_size; - } - } - - return pool; - - out_free_pool: - kfree(pool->cache_bucket); - kfree(pool); - - return ERR_PTR(ret); - - out_fail: - ib_destroy_fmr_pool(pool); - - return ERR_PTR(-ENOMEM); -} -EXPORT_SYMBOL(ib_create_fmr_pool); - -/** - * ib_destroy_fmr_pool - Free FMR pool - * @pool:FMR pool to free - * - * Destroy an FMR pool and free all associated resources. 
- */ -void ib_destroy_fmr_pool(struct ib_fmr_pool *pool) -{ - struct ib_pool_fmr *fmr; - struct ib_pool_fmr *tmp; - LIST_HEAD(fmr_list); - int i; - - kthread_destroy_worker(pool->worker); - ib_fmr_batch_release(pool); - - i = 0; - list_for_each_entry_safe(fmr, tmp, &pool->free_list, list) { - if (fmr->remap_count) { - INIT_LIST_HEAD(&fmr_list); - list_add_tail(&fmr->fmr->list, &fmr_list); - ib_unmap_fmr(&fmr_list); - } - ib_dealloc_fmr(fmr->fmr); - list_del(&fmr->list); - kfree(fmr); - ++i; - } - - if (i < pool->pool_size) - pr_warn(PFX "pool still has %d regions registered\n", - pool->pool_size - i); - - kfree(pool->cache_bucket); - kfree(pool); -} -EXPORT_SYMBOL(ib_destroy_fmr_pool); - -/** - * ib_flush_fmr_pool - Invalidate all unmapped FMRs - * @pool:FMR pool to flush - * - * Ensure that all unmapped FMRs are fully invalidated. - */ -int ib_flush_fmr_pool(struct ib_fmr_pool *pool) -{ - int serial; - struct ib_pool_fmr *fmr, *next; - - /* - * The free_list holds FMRs that may have been used - * but have not been remapped enough times to be dirty. - * Put them on the dirty list now so that the cleanup - * thread will reap them too. - */ - spin_lock_irq(&pool->pool_lock); - list_for_each_entry_safe(fmr, next, &pool->free_list, list) { - if (fmr->remap_count > 0) - list_move(&fmr->list, &pool->dirty_list); - } - spin_unlock_irq(&pool->pool_lock); - - serial = atomic_inc_return(&pool->req_ser); - kthread_queue_work(pool->worker, &pool->work); - - if (wait_event_interruptible(pool->force_wait, - atomic_read(&pool->flush_ser) - serial >= 0)) - return -EINTR; - - return 0; -} -EXPORT_SYMBOL(ib_flush_fmr_pool); - -/** - * ib_fmr_pool_map_phys - Map an FMR from an FMR pool. - * @pool_handle: FMR pool to allocate FMR from - * @page_list: List of pages to map - * @list_len: Number of pages in @page_list - * @io_virtual_address: I/O virtual address for new FMR - */ -struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle, - u64 *page_list, - int list_len, - u64 io_virtual_address) -{ - struct ib_fmr_pool *pool = pool_handle; - struct ib_pool_fmr *fmr; - unsigned long flags; - int result; - - if (list_len < 1 || list_len > pool->max_pages) - return ERR_PTR(-EINVAL); - - spin_lock_irqsave(&pool->pool_lock, flags); - fmr = ib_fmr_cache_lookup(pool, - page_list, - list_len, - io_virtual_address); - if (fmr) { - /* found in cache */ - ++fmr->ref_count; - if (fmr->ref_count == 1) { - list_del(&fmr->list); - } - - spin_unlock_irqrestore(&pool->pool_lock, flags); - - return fmr; - } - - if (list_empty(&pool->free_list)) { - spin_unlock_irqrestore(&pool->pool_lock, flags); - return ERR_PTR(-EAGAIN); - } - - fmr = list_entry(pool->free_list.next, struct ib_pool_fmr, list); - list_del(&fmr->list); - hlist_del_init(&fmr->cache_node); - spin_unlock_irqrestore(&pool->pool_lock, flags); - - result = ib_map_phys_fmr(fmr->fmr, page_list, list_len, - io_virtual_address); - - if (result) { - spin_lock_irqsave(&pool->pool_lock, flags); - list_add(&fmr->list, &pool->free_list); - spin_unlock_irqrestore(&pool->pool_lock, flags); - - pr_warn(PFX "fmr_map returns %d\n", result); - - return ERR_PTR(result); - } - - ++fmr->remap_count; - fmr->ref_count = 1; - - if (pool->cache_bucket) { - fmr->io_virtual_address = io_virtual_address; - fmr->page_list_len = list_len; - memcpy(fmr->page_list, page_list, list_len * sizeof(*page_list)); - - spin_lock_irqsave(&pool->pool_lock, flags); - hlist_add_head(&fmr->cache_node, - pool->cache_bucket + ib_fmr_hash(fmr->page_list[0])); - 
spin_unlock_irqrestore(&pool->pool_lock, flags); - } - - return fmr; -} -EXPORT_SYMBOL(ib_fmr_pool_map_phys); - -/** - * ib_fmr_pool_unmap - Unmap FMR - * @fmr:FMR to unmap - * - * Unmap an FMR. The FMR mapping may remain valid until the FMR is - * reused (or until ib_flush_fmr_pool() is called). - */ -void ib_fmr_pool_unmap(struct ib_pool_fmr *fmr) -{ - struct ib_fmr_pool *pool; - unsigned long flags; - - pool = fmr->pool; - - spin_lock_irqsave(&pool->pool_lock, flags); - - --fmr->ref_count; - if (!fmr->ref_count) { - if (fmr->remap_count < pool->max_remaps) { - list_add_tail(&fmr->list, &pool->free_list); - } else { - list_add_tail(&fmr->list, &pool->dirty_list); - if (++pool->dirty_len >= pool->dirty_watermark) { - atomic_inc(&pool->req_ser); - kthread_queue_work(pool->worker, &pool->work); - } - } - } - - spin_unlock_irqrestore(&pool->pool_lock, flags); -} -EXPORT_SYMBOL(ib_fmr_pool_unmap); diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index da8adadf4755..2b47073c61a6 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -211,8 +211,7 @@ static void free_cm_id(struct iwcm_id_private *cm_id_priv) */ static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv) { - BUG_ON(atomic_read(&cm_id_priv->refcount)==0); - if (atomic_dec_and_test(&cm_id_priv->refcount)) { + if (refcount_dec_and_test(&cm_id_priv->refcount)) { BUG_ON(!list_empty(&cm_id_priv->work_list)); free_cm_id(cm_id_priv); return 1; @@ -225,7 +224,7 @@ static void add_ref(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); - atomic_inc(&cm_id_priv->refcount); + refcount_inc(&cm_id_priv->refcount); } static void rem_ref(struct iw_cm_id *cm_id) @@ -257,7 +256,7 @@ struct iw_cm_id *iw_create_cm_id(struct ib_device *device, cm_id_priv->id.add_ref = add_ref; cm_id_priv->id.rem_ref = rem_ref; spin_lock_init(&cm_id_priv->lock); - atomic_set(&cm_id_priv->refcount, 1); + refcount_set(&cm_id_priv->refcount, 1); init_waitqueue_head(&cm_id_priv->connect_wait); init_completion(&cm_id_priv->destroy_comp); INIT_LIST_HEAD(&cm_id_priv->work_list); @@ -1094,7 +1093,7 @@ static int cm_event_handler(struct iw_cm_id *cm_id, } } - atomic_inc(&cm_id_priv->refcount); + refcount_inc(&cm_id_priv->refcount); if (list_empty(&cm_id_priv->work_list)) { list_add_tail(&work->list, &cm_id_priv->work_list); queue_work(iwcm_wq, &work->work); @@ -1187,29 +1186,34 @@ static int __init iw_cm_init(void) ret = iwpm_init(RDMA_NL_IWCM); if (ret) - pr_err("iw_cm: couldn't init iwpm\n"); - else - rdma_nl_register(RDMA_NL_IWCM, iwcm_nl_cb_table); + return ret; + iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", 0); if (!iwcm_wq) - return -ENOMEM; + goto err_alloc; iwcm_ctl_table_hdr = register_net_sysctl(&init_net, "net/iw_cm", iwcm_ctl_table); if (!iwcm_ctl_table_hdr) { pr_err("iw_cm: couldn't register sysctl paths\n"); - destroy_workqueue(iwcm_wq); - return -ENOMEM; + goto err_sysctl; } + rdma_nl_register(RDMA_NL_IWCM, iwcm_nl_cb_table); return 0; + +err_sysctl: + destroy_workqueue(iwcm_wq); +err_alloc: + iwpm_exit(RDMA_NL_IWCM); + return -ENOMEM; } static void __exit iw_cm_cleanup(void) { + rdma_nl_unregister(RDMA_NL_IWCM); unregister_net_sysctl_table(iwcm_ctl_table_hdr); destroy_workqueue(iwcm_wq); - rdma_nl_unregister(RDMA_NL_IWCM); iwpm_exit(RDMA_NL_IWCM); } diff --git a/drivers/infiniband/core/iwcm.h b/drivers/infiniband/core/iwcm.h index 82c2cd1b0a80..bf74639be128 100644 --- a/drivers/infiniband/core/iwcm.h +++ 
b/drivers/infiniband/core/iwcm.h @@ -52,7 +52,7 @@ struct iwcm_id_private { wait_queue_head_t connect_wait; struct list_head work_list; spinlock_t lock; - atomic_t refcount; + refcount_t refcount; struct list_head work_free_list; }; diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c index 46686990a827..3c9a9869212b 100644 --- a/drivers/infiniband/core/iwpm_msg.c +++ b/drivers/infiniband/core/iwpm_msg.c @@ -69,10 +69,6 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client) const char *err_str = ""; int ret = -EINVAL; - if (!iwpm_valid_client(nl_client)) { - err_str = "Invalid port mapper client"; - goto pid_query_error; - } if (iwpm_check_registration(nl_client, IWPM_REG_VALID) || iwpm_user_pid == IWPM_PID_UNAVAILABLE) return 0; @@ -123,7 +119,7 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client) ret = iwpm_wait_complete_req(nlmsg_request); return ret; pid_query_error: - pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); + pr_info("%s: %s (client = %u)\n", __func__, err_str, nl_client); dev_kfree_skb(skb); if (nlmsg_request) iwpm_free_nlmsg_request(&nlmsg_request->kref); @@ -153,10 +149,6 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) const char *err_str = ""; int ret = -EINVAL; - if (!iwpm_valid_client(nl_client)) { - err_str = "Invalid port mapper client"; - goto add_mapping_error; - } if (!iwpm_valid_pid()) return 0; if (!iwpm_check_registration(nl_client, IWPM_REG_VALID)) { @@ -211,7 +203,7 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) ret = iwpm_wait_complete_req(nlmsg_request); return ret; add_mapping_error: - pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); + pr_info("%s: %s (client = %u)\n", __func__, err_str, nl_client); add_mapping_error_nowarn: dev_kfree_skb(skb); if (nlmsg_request) @@ -240,10 +232,6 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) const char *err_str = ""; int ret = -EINVAL; - if (!iwpm_valid_client(nl_client)) { - err_str = "Invalid port mapper client"; - goto query_mapping_error; - } if (!iwpm_valid_pid()) return 0; if (!iwpm_check_registration(nl_client, IWPM_REG_VALID)) { @@ -304,7 +292,7 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) ret = iwpm_wait_complete_req(nlmsg_request); return ret; query_mapping_error: - pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); + pr_info("%s: %s (client = %u)\n", __func__, err_str, nl_client); query_mapping_error_nowarn: dev_kfree_skb(skb); if (nlmsg_request) @@ -331,10 +319,6 @@ int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client) const char *err_str = ""; int ret = -EINVAL; - if (!iwpm_valid_client(nl_client)) { - err_str = "Invalid port mapper client"; - goto remove_mapping_error; - } if (!iwpm_valid_pid()) return 0; if (iwpm_check_registration(nl_client, IWPM_REG_UNDEF)) { @@ -372,7 +356,7 @@ int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client) "remove_mapping: Local sockaddr:"); return 0; remove_mapping_error: - pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); + pr_info("%s: %s (client = %u)\n", __func__, err_str, nl_client); if (skb) dev_kfree_skb_any(skb); return ret; @@ -392,7 +376,7 @@ static const struct nla_policy resp_reg_policy[IWPM_NLA_RREG_PID_MAX] = { /** * iwpm_register_pid_cb - Process the port mapper response to * iwpm_register_pid query - * @skb: + * @skb: The socket buffer * @cb: Contains the received message (payload and netlink header) * 
* If successful, the function receives the userspace port mapper pid @@ -431,7 +415,7 @@ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb) strcmp(iwpm_ulib_name, iwpm_name) || iwpm_version < IWPM_UABI_VERSION_MIN) { - pr_info("%s: Incorrect info (dev = %s name = %s version = %d)\n", + pr_info("%s: Incorrect info (dev = %s name = %s version = %u)\n", __func__, dev_name, iwpm_name, iwpm_version); nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; goto register_pid_response_exit; @@ -439,13 +423,12 @@ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb) iwpm_user_pid = cb->nlh->nlmsg_pid; iwpm_ulib_version = iwpm_version; if (iwpm_ulib_version < IWPM_UABI_VERSION) - pr_warn_once("%s: Down level iwpmd/pid %u. Continuing...", + pr_warn_once("%s: Down level iwpmd/pid %d. Continuing...", __func__, iwpm_user_pid); atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", __func__, iwpm_user_pid); - if (iwpm_valid_client(nl_client)) - iwpm_set_registration(nl_client, IWPM_REG_VALID); + iwpm_set_registration(nl_client, IWPM_REG_VALID); register_pid_response_exit: nlmsg_request->request_done = 1; /* always for found nlmsg_request */ @@ -468,7 +451,7 @@ static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = { /** * iwpm_add_mapping_cb - Process the port mapper response to * iwpm_add_mapping request - * @skb: + * @skb: The socket buffer * @cb: Contains the received message (payload and netlink header) */ int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb) @@ -528,7 +511,8 @@ add_mapping_response_exit: } /* netlink attribute policy for the response to add and query mapping request - * and response with remote address info */ + * and response with remote address info + */ static const struct nla_policy resp_query_policy[IWPM_NLA_RQUERY_MAPPING_MAX] = { [IWPM_NLA_RQUERY_MAPPING_SEQ] = { .type = NLA_U32 }, [IWPM_NLA_RQUERY_LOCAL_ADDR] = { @@ -545,7 +529,7 @@ static const struct nla_policy resp_query_policy[IWPM_NLA_RQUERY_MAPPING_MAX] = /** * iwpm_add_and_query_mapping_cb - Process the port mapper response to * iwpm_add_and_query_mapping request - * @skb: + * @skb: The socket buffer * @cb: Contains the received message (payload and netlink header) */ int iwpm_add_and_query_mapping_cb(struct sk_buff *skb, @@ -627,7 +611,7 @@ query_mapping_response_exit: /** * iwpm_remote_info_cb - Process remote connecting peer address info, which * the port mapper has received from the connecting peer - * @skb: + * @skb: The socket buffer * @cb: Contains the received message (payload and netlink header) * * Stores the IPv4/IPv6 address info in a hash table @@ -648,11 +632,6 @@ int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb) return ret; nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); - if (!iwpm_valid_client(nl_client)) { - pr_info("%s: Invalid port mapper client = %d\n", - __func__, nl_client); - return ret; - } atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); local_sockaddr = (struct sockaddr_storage *) @@ -706,7 +685,7 @@ static const struct nla_policy resp_mapinfo_policy[IWPM_NLA_MAPINFO_REQ_MAX] = { /** * iwpm_mapping_info_cb - Process a notification that the userspace * port mapper daemon is started - * @skb: + * @skb: The socket buffer * @cb: Contains the received message (payload and netlink header) * * Using the received port mapper pid, send all the local mapping @@ -730,22 +709,17 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct 
netlink_callback *cb) iwpm_version = nla_get_u16(nltb[IWPM_NLA_MAPINFO_ULIB_VER]); if (strcmp(iwpm_ulib_name, iwpm_name) || iwpm_version < IWPM_UABI_VERSION_MIN) { - pr_info("%s: Invalid port mapper name = %s version = %d\n", + pr_info("%s: Invalid port mapper name = %s version = %u\n", __func__, iwpm_name, iwpm_version); return ret; } nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); - if (!iwpm_valid_client(nl_client)) { - pr_info("%s: Invalid port mapper client = %d\n", - __func__, nl_client); - return ret; - } iwpm_set_registration(nl_client, IWPM_REG_INCOMPL); atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); iwpm_user_pid = cb->nlh->nlmsg_pid; if (iwpm_ulib_version < IWPM_UABI_VERSION) - pr_warn_once("%s: Down level iwpmd/pid %u. Continuing...", + pr_warn_once("%s: Down level iwpmd/pid %d. Continuing...", __func__, iwpm_user_pid); if (!iwpm_mapinfo_available()) @@ -766,7 +740,7 @@ static const struct nla_policy ack_mapinfo_policy[IWPM_NLA_MAPINFO_NUM_MAX] = { /** * iwpm_ack_mapping_info_cb - Process the port mapper ack for * the provided local mapping info records - * @skb: + * @skb: The socket buffer * @cb: Contains the received message (payload and netlink header) */ int iwpm_ack_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) @@ -796,7 +770,7 @@ static const struct nla_policy map_error_policy[IWPM_NLA_ERR_MAX] = { /** * iwpm_mapping_error_cb - Process port mapper notification for error * - * @skb: + * @skb: The socket buffer * @cb: Contains the received message (payload and netlink header) */ int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb) @@ -841,7 +815,7 @@ static const struct nla_policy hello_policy[IWPM_NLA_HELLO_MAX] = { /** * iwpm_hello_cb - Process a hello message from iwpmd * - * @skb: + * @skb: The socket buffer * @cb: Contains the received message (payload and netlink header) * * Using the received port mapper pid, send the kernel's abi_version @@ -862,11 +836,6 @@ int iwpm_hello_cb(struct sk_buff *skb, struct netlink_callback *cb) } abi_version = nla_get_u16(nltb[IWPM_NLA_HELLO_ABI_VERSION]); nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); - if (!iwpm_valid_client(nl_client)) { - pr_info("%s: Invalid port mapper client = %d\n", - __func__, nl_client); - return ret; - } iwpm_set_registration(nl_client, IWPM_REG_INCOMPL); atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); iwpm_ulib_version = min_t(u16, IWPM_UABI_VERSION, abi_version); diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index 13495b43dbc1..358a2db38d23 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -48,7 +48,6 @@ static DEFINE_SPINLOCK(iwpm_mapinfo_lock); static struct hlist_head *iwpm_reminfo_bucket; static DEFINE_SPINLOCK(iwpm_reminfo_lock); -static DEFINE_MUTEX(iwpm_admin_lock); static struct iwpm_admin_data iwpm_admin; /** @@ -59,35 +58,21 @@ static struct iwpm_admin_data iwpm_admin; */ int iwpm_init(u8 nl_client) { - int ret = 0; - mutex_lock(&iwpm_admin_lock); - if (atomic_read(&iwpm_admin.refcount) == 0) { - iwpm_hash_bucket = kcalloc(IWPM_MAPINFO_HASH_SIZE, - sizeof(struct hlist_head), - GFP_KERNEL); - if (!iwpm_hash_bucket) { - ret = -ENOMEM; - goto init_exit; - } - iwpm_reminfo_bucket = kcalloc(IWPM_REMINFO_HASH_SIZE, - sizeof(struct hlist_head), - GFP_KERNEL); - if (!iwpm_reminfo_bucket) { - kfree(iwpm_hash_bucket); - ret = -ENOMEM; - goto init_exit; - } - } - atomic_inc(&iwpm_admin.refcount); -init_exit: - mutex_unlock(&iwpm_admin_lock); - if (!ret) { - 
iwpm_set_valid(nl_client, 1); - iwpm_set_registration(nl_client, IWPM_REG_UNDEF); - pr_debug("%s: Mapinfo and reminfo tables are created\n", - __func__); + iwpm_hash_bucket = kcalloc(IWPM_MAPINFO_HASH_SIZE, + sizeof(struct hlist_head), GFP_KERNEL); + if (!iwpm_hash_bucket) + return -ENOMEM; + + iwpm_reminfo_bucket = kcalloc(IWPM_REMINFO_HASH_SIZE, + sizeof(struct hlist_head), GFP_KERNEL); + if (!iwpm_reminfo_bucket) { + kfree(iwpm_hash_bucket); + return -ENOMEM; } - return ret; + + iwpm_set_registration(nl_client, IWPM_REG_UNDEF); + pr_debug("%s: Mapinfo and reminfo tables are created\n", __func__); + return 0; } static void free_hash_bucket(void); @@ -101,22 +86,9 @@ static void free_reminfo_bucket(void); */ int iwpm_exit(u8 nl_client) { - - if (!iwpm_valid_client(nl_client)) - return -EINVAL; - mutex_lock(&iwpm_admin_lock); - if (atomic_read(&iwpm_admin.refcount) == 0) { - mutex_unlock(&iwpm_admin_lock); - pr_err("%s Incorrect usage - negative refcount\n", __func__); - return -EINVAL; - } - if (atomic_dec_and_test(&iwpm_admin.refcount)) { - free_hash_bucket(); - free_reminfo_bucket(); - pr_debug("%s: Resources are destroyed\n", __func__); - } - mutex_unlock(&iwpm_admin_lock); - iwpm_set_valid(nl_client, 0); + free_hash_bucket(); + free_reminfo_bucket(); + pr_debug("%s: Resources are destroyed\n", __func__); iwpm_set_registration(nl_client, IWPM_REG_UNDEF); return 0; } @@ -127,8 +99,8 @@ static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *, /** * iwpm_create_mapinfo - Store local and mapped IPv4/IPv6 address * info in a hash table - * @local_addr: Local ip/tcp address - * @mapped_addr: Mapped local ip/tcp address + * @local_sockaddr: Local ip/tcp address + * @mapped_sockaddr: Mapped local ip/tcp address * @nl_client: The index of the netlink client * @map_flags: IWPM mapping flags */ @@ -141,8 +113,6 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, unsigned long flags; int ret = -EINVAL; - if (!iwpm_valid_client(nl_client)) - return ret; map_info = kzalloc(sizeof(struct iwpm_mapping_info), GFP_KERNEL); if (!map_info) return -ENOMEM; @@ -174,7 +144,7 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, /** * iwpm_remove_mapinfo - Remove local and mapped IPv4/IPv6 address * info from the hash table - * @local_addr: Local ip/tcp address + * @local_sockaddr: Local ip/tcp address * @mapped_local_addr: Mapped local ip/tcp address * * Returns err code if mapping info is not found in the hash table, @@ -302,10 +272,6 @@ int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr, unsigned long flags; int ret = -EINVAL; - if (!iwpm_valid_client(nl_client)) { - pr_info("%s: Invalid client = %d\n", __func__, nl_client); - return ret; - } spin_lock_irqsave(&iwpm_reminfo_lock, flags); if (iwpm_reminfo_bucket) { hash_bucket_head = get_reminfo_hash_bucket( @@ -420,16 +386,6 @@ int iwpm_get_nlmsg_seq(void) return atomic_inc_return(&iwpm_admin.nlmsg_seq); } -int iwpm_valid_client(u8 nl_client) -{ - return iwpm_admin.client_list[nl_client]; -} - -void iwpm_set_valid(u8 nl_client, int valid) -{ - iwpm_admin.client_list[nl_client] = valid; -} - /* valid client */ u32 iwpm_get_registration(u8 nl_client) { @@ -651,7 +607,7 @@ static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid) err_str = "Unable to send a nlmsg"; goto mapinfo_num_error; } - pr_debug("%s: Sent mapping number = %d\n", __func__, mapping_num); + pr_debug("%s: Sent mapping number = %u\n", __func__, mapping_num); return 0; mapinfo_num_error: pr_info("%s: %s\n", 
__func__, err_str); @@ -806,7 +762,7 @@ int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version) { struct sk_buff *skb = NULL; struct nlmsghdr *nlh; - const char *err_str = ""; + const char *err_str; int ret = -EINVAL; skb = iwpm_create_nlmsg(RDMA_NL_IWPM_HELLO, &nlh, nl_client); diff --git a/drivers/infiniband/core/iwpm_util.h b/drivers/infiniband/core/iwpm_util.h index 1bf87d9fd0bd..d6fc8402158a 100644 --- a/drivers/infiniband/core/iwpm_util.h +++ b/drivers/infiniband/core/iwpm_util.h @@ -33,7 +33,6 @@ #ifndef _IWPM_UTIL_H #define _IWPM_UTIL_H -#include <linux/module.h> #include <linux/io.h> #include <linux/in.h> #include <linux/in6.h> @@ -90,9 +89,7 @@ struct iwpm_remote_info { }; struct iwpm_admin_data { - atomic_t refcount; atomic_t nlmsg_seq; - int client_list[RDMA_NL_NUM_CLIENTS]; u32 reg_list[RDMA_NL_NUM_CLIENTS]; }; @@ -141,29 +138,13 @@ int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request); int iwpm_get_nlmsg_seq(void); /** - * iwpm_add_reminfo - Add remote address info of the connecting peer + * iwpm_add_remote_info - Add remote address info of the connecting peer * to the remote info hash table * @reminfo: The remote info to be added */ void iwpm_add_remote_info(struct iwpm_remote_info *reminfo); /** - * iwpm_valid_client - Check if the port mapper client is valid - * @nl_client: The index of the netlink client - * - * Valid clients need to call iwpm_init() before using - * the port mapper - */ -int iwpm_valid_client(u8 nl_client); - -/** - * iwpm_set_valid - Set the port mapper client to valid or not - * @nl_client: The index of the netlink client - * @valid: 1 if valid or 0 if invalid - */ -void iwpm_set_valid(u8 nl_client, int valid); - -/** * iwpm_check_registration - Check if the client registration * matches the given one * @nl_client: The index of the netlink client @@ -183,7 +164,7 @@ u32 iwpm_check_registration(u8 nl_client, u32 reg); void iwpm_set_registration(u8 nl_client, u32 reg); /** - * iwpm_get_registration + * iwpm_get_registration - Get the client registration * @nl_client: The index of the netlink client * * Returns the client registration type diff --git a/drivers/infiniband/core/lag.c b/drivers/infiniband/core/lag.c new file mode 100644 index 000000000000..c77d7d2559a1 --- /dev/null +++ b/drivers/infiniband/core/lag.c @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2020 Mellanox Technologies. All rights reserved. + */ + +#include <rdma/ib_verbs.h> +#include <rdma/ib_cache.h> +#include <rdma/lag.h> + +static struct sk_buff *rdma_build_skb(struct net_device *netdev, + struct rdma_ah_attr *ah_attr, + gfp_t flags) +{ + struct ipv6hdr *ip6h; + struct sk_buff *skb; + struct ethhdr *eth; + struct iphdr *iph; + struct udphdr *uh; + u8 smac[ETH_ALEN]; + bool is_ipv4; + int hdr_len; + + is_ipv4 = ipv6_addr_v4mapped((struct in6_addr *)ah_attr->grh.dgid.raw); + hdr_len = ETH_HLEN + sizeof(struct udphdr) + LL_RESERVED_SPACE(netdev); + hdr_len += is_ipv4 ? 
sizeof(struct iphdr) : sizeof(struct ipv6hdr); + + skb = alloc_skb(hdr_len, flags); + if (!skb) + return NULL; + + skb->dev = netdev; + skb_reserve(skb, hdr_len); + skb_push(skb, sizeof(struct udphdr)); + skb_reset_transport_header(skb); + uh = udp_hdr(skb); + uh->source = + htons(rdma_flow_label_to_udp_sport(ah_attr->grh.flow_label)); + uh->dest = htons(ROCE_V2_UDP_DPORT); + uh->len = htons(sizeof(struct udphdr)); + + if (is_ipv4) { + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + iph->frag_off = 0; + iph->version = 4; + iph->protocol = IPPROTO_UDP; + iph->ihl = 0x5; + iph->tot_len = htons(sizeof(struct udphdr) + sizeof(struct + iphdr)); + memcpy(&iph->saddr, ah_attr->grh.sgid_attr->gid.raw + 12, + sizeof(struct in_addr)); + memcpy(&iph->daddr, ah_attr->grh.dgid.raw + 12, + sizeof(struct in_addr)); + } else { + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + ip6h = ipv6_hdr(skb); + ip6h->version = 6; + ip6h->nexthdr = IPPROTO_UDP; + memcpy(&ip6h->flow_lbl, &ah_attr->grh.flow_label, + sizeof(*ip6h->flow_lbl)); + memcpy(&ip6h->saddr, ah_attr->grh.sgid_attr->gid.raw, + sizeof(struct in6_addr)); + memcpy(&ip6h->daddr, ah_attr->grh.dgid.raw, + sizeof(struct in6_addr)); + } + + skb_push(skb, sizeof(struct ethhdr)); + skb_reset_mac_header(skb); + eth = eth_hdr(skb); + skb->protocol = eth->h_proto = htons(is_ipv4 ? ETH_P_IP : ETH_P_IPV6); + rdma_read_gid_l2_fields(ah_attr->grh.sgid_attr, NULL, smac); + memcpy(eth->h_source, smac, ETH_ALEN); + memcpy(eth->h_dest, ah_attr->roce.dmac, ETH_ALEN); + + return skb; +} + +static struct net_device *rdma_get_xmit_slave_udp(struct ib_device *device, + struct net_device *master, + struct rdma_ah_attr *ah_attr, + gfp_t flags) +{ + struct net_device *slave; + struct sk_buff *skb; + + skb = rdma_build_skb(master, ah_attr, flags); + if (!skb) + return ERR_PTR(-ENOMEM); + + rcu_read_lock(); + slave = netdev_get_xmit_slave(master, skb, + !!(device->lag_flags & + RDMA_LAG_FLAGS_HASH_ALL_SLAVES)); + if (slave) + dev_hold(slave); + rcu_read_unlock(); + kfree_skb(skb); + return slave; +} + +void rdma_lag_put_ah_roce_slave(struct net_device *xmit_slave) +{ + if (xmit_slave) + dev_put(xmit_slave); +} + +struct net_device *rdma_lag_get_ah_roce_slave(struct ib_device *device, + struct rdma_ah_attr *ah_attr, + gfp_t flags) +{ + struct net_device *slave = NULL; + struct net_device *master; + + if (!(ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE && + ah_attr->grh.sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP && + ah_attr->grh.flow_label)) + return NULL; + + rcu_read_lock(); + master = rdma_read_gid_attr_ndev_rcu(ah_attr->grh.sgid_attr); + if (IS_ERR(master)) { + rcu_read_unlock(); + return master; + } + dev_hold(master); + rcu_read_unlock(); + + if (!netif_is_bond_master(master)) + goto put; + + slave = rdma_get_xmit_slave_udp(device, master, ah_attr, flags); +put: + dev_put(master); + return slave; +} diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index c54db13fa9b0..1893aa613ad7 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -61,7 +61,7 @@ static void create_mad_addr_info(struct ib_mad_send_wr_private *mad_send_wr, { u16 pkey; struct ib_device *dev = qp_info->port_priv->device; - u8 pnum = qp_info->port_priv->port_num; + u32 pnum = qp_info->port_priv->port_num; struct ib_ud_wr *wr = &mad_send_wr->send_wr; struct rdma_ah_attr attr = {}; @@ -85,7 +85,6 @@ MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests 
module_param_named(recv_queue_size, mad_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests"); -/* Client ID 0 is used for snoop-only clients */ static DEFINE_XARRAY_ALLOC1(ib_mad_clients); static u32 ib_mad_client_next; static struct list_head ib_mad_port_list; @@ -119,7 +118,7 @@ static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc); * Assumes ib_mad_port_list_lock is being held */ static inline struct ib_mad_port_private * -__ib_get_mad_port(struct ib_device *device, int port_num) +__ib_get_mad_port(struct ib_device *device, u32 port_num) { struct ib_mad_port_private *entry; @@ -135,7 +134,7 @@ __ib_get_mad_port(struct ib_device *device, int port_num) * for a device/port */ static inline struct ib_mad_port_private * -ib_get_mad_port(struct ib_device *device, int port_num) +ib_get_mad_port(struct ib_device *device, u32 port_num) { struct ib_mad_port_private *entry; unsigned long flags; @@ -156,8 +155,7 @@ static inline u8 convert_mgmt_class(u8 mgmt_class) static int get_spl_qp_index(enum ib_qp_type qp_type) { - switch (qp_type) - { + switch (qp_type) { case IB_QPT_SMI: return 0; case IB_QPT_GSI: @@ -223,7 +221,7 @@ EXPORT_SYMBOL(ib_response_mad); * Context: Process context. */ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, - u8 port_num, + u32 port_num, enum ib_qp_type qp_type, struct ib_mad_reg_req *mad_reg_req, u8 rmpp_version, @@ -353,7 +351,7 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, /* Validate device and port */ port_priv = ib_get_mad_port(device, port_num); if (!port_priv) { - dev_dbg_ratelimited(&device->dev, "%s: Invalid port %d\n", + dev_dbg_ratelimited(&device->dev, "%s: Invalid port %u\n", __func__, port_num); ret = ERR_PTR(-ENODEV); goto error1; @@ -403,7 +401,7 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends); INIT_LIST_HEAD(&mad_agent_priv->local_list); INIT_WORK(&mad_agent_priv->local_work, local_completions); - atomic_set(&mad_agent_priv->refcount, 1); + refcount_set(&mad_agent_priv->refcount, 1); init_completion(&mad_agent_priv->comp); ret2 = ib_mad_agent_security_setup(&mad_agent_priv->agent, qp_type); @@ -483,141 +481,12 @@ error1: } EXPORT_SYMBOL(ib_register_mad_agent); -static inline int is_snooping_sends(int mad_snoop_flags) -{ - return (mad_snoop_flags & - (/*IB_MAD_SNOOP_POSTED_SENDS | - IB_MAD_SNOOP_RMPP_SENDS |*/ - IB_MAD_SNOOP_SEND_COMPLETIONS /*| - IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS*/)); -} - -static inline int is_snooping_recvs(int mad_snoop_flags) -{ - return (mad_snoop_flags & - (IB_MAD_SNOOP_RECVS /*| - IB_MAD_SNOOP_RMPP_RECVS*/)); -} - -static int register_snoop_agent(struct ib_mad_qp_info *qp_info, - struct ib_mad_snoop_private *mad_snoop_priv) -{ - struct ib_mad_snoop_private **new_snoop_table; - unsigned long flags; - int i; - - spin_lock_irqsave(&qp_info->snoop_lock, flags); - /* Check for empty slot in array. */ - for (i = 0; i < qp_info->snoop_table_size; i++) - if (!qp_info->snoop_table[i]) - break; - - if (i == qp_info->snoop_table_size) { - /* Grow table. 
*/ - new_snoop_table = krealloc(qp_info->snoop_table, - sizeof mad_snoop_priv * - (qp_info->snoop_table_size + 1), - GFP_ATOMIC); - if (!new_snoop_table) { - i = -ENOMEM; - goto out; - } - - qp_info->snoop_table = new_snoop_table; - qp_info->snoop_table_size++; - } - qp_info->snoop_table[i] = mad_snoop_priv; - atomic_inc(&qp_info->snoop_count); -out: - spin_unlock_irqrestore(&qp_info->snoop_lock, flags); - return i; -} - -struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device, - u8 port_num, - enum ib_qp_type qp_type, - int mad_snoop_flags, - ib_mad_snoop_handler snoop_handler, - ib_mad_recv_handler recv_handler, - void *context) -{ - struct ib_mad_port_private *port_priv; - struct ib_mad_agent *ret; - struct ib_mad_snoop_private *mad_snoop_priv; - int qpn; - int err; - - /* Validate parameters */ - if ((is_snooping_sends(mad_snoop_flags) && !snoop_handler) || - (is_snooping_recvs(mad_snoop_flags) && !recv_handler)) { - ret = ERR_PTR(-EINVAL); - goto error1; - } - qpn = get_spl_qp_index(qp_type); - if (qpn == -1) { - ret = ERR_PTR(-EINVAL); - goto error1; - } - port_priv = ib_get_mad_port(device, port_num); - if (!port_priv) { - ret = ERR_PTR(-ENODEV); - goto error1; - } - /* Allocate structures */ - mad_snoop_priv = kzalloc(sizeof *mad_snoop_priv, GFP_KERNEL); - if (!mad_snoop_priv) { - ret = ERR_PTR(-ENOMEM); - goto error1; - } - - /* Now, fill in the various structures */ - mad_snoop_priv->qp_info = &port_priv->qp_info[qpn]; - mad_snoop_priv->agent.device = device; - mad_snoop_priv->agent.recv_handler = recv_handler; - mad_snoop_priv->agent.snoop_handler = snoop_handler; - mad_snoop_priv->agent.context = context; - mad_snoop_priv->agent.qp = port_priv->qp_info[qpn].qp; - mad_snoop_priv->agent.port_num = port_num; - mad_snoop_priv->mad_snoop_flags = mad_snoop_flags; - init_completion(&mad_snoop_priv->comp); - - err = ib_mad_agent_security_setup(&mad_snoop_priv->agent, qp_type); - if (err) { - ret = ERR_PTR(err); - goto error2; - } - - mad_snoop_priv->snoop_index = register_snoop_agent( - &port_priv->qp_info[qpn], - mad_snoop_priv); - if (mad_snoop_priv->snoop_index < 0) { - ret = ERR_PTR(mad_snoop_priv->snoop_index); - goto error3; - } - - atomic_set(&mad_snoop_priv->refcount, 1); - return &mad_snoop_priv->agent; -error3: - ib_mad_agent_security_cleanup(&mad_snoop_priv->agent); -error2: - kfree(mad_snoop_priv); -error1: - return ret; -} -EXPORT_SYMBOL(ib_register_mad_snoop); - static inline void deref_mad_agent(struct ib_mad_agent_private *mad_agent_priv) { - if (atomic_dec_and_test(&mad_agent_priv->refcount)) + if (refcount_dec_and_test(&mad_agent_priv->refcount)) complete(&mad_agent_priv->comp); } -static inline void deref_snoop_agent(struct ib_mad_snoop_private *mad_snoop_priv) -{ - if (atomic_dec_and_test(&mad_snoop_priv->refcount)) - complete(&mad_snoop_priv->comp); -} - static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv) { struct ib_mad_port_private *port_priv; @@ -639,10 +508,10 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv) xa_erase(&ib_mad_clients, mad_agent_priv->agent.hi_tid); flush_workqueue(port_priv->wq); - ib_cancel_rmpp_recvs(mad_agent_priv); deref_mad_agent(mad_agent_priv); wait_for_completion(&mad_agent_priv->comp); + ib_cancel_rmpp_recvs(mad_agent_priv); ib_mad_agent_security_cleanup(&mad_agent_priv->agent); @@ -650,25 +519,6 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv) kfree_rcu(mad_agent_priv, rcu); } -static void unregister_mad_snoop(struct 
ib_mad_snoop_private *mad_snoop_priv) -{ - struct ib_mad_qp_info *qp_info; - unsigned long flags; - - qp_info = mad_snoop_priv->qp_info; - spin_lock_irqsave(&qp_info->snoop_lock, flags); - qp_info->snoop_table[mad_snoop_priv->snoop_index] = NULL; - atomic_dec(&qp_info->snoop_count); - spin_unlock_irqrestore(&qp_info->snoop_lock, flags); - - deref_snoop_agent(mad_snoop_priv); - wait_for_completion(&mad_snoop_priv->comp); - - ib_mad_agent_security_cleanup(&mad_snoop_priv->agent); - - kfree(mad_snoop_priv); -} - /* * ib_unregister_mad_agent - Unregisters a client from using MAD services * @@ -677,20 +527,11 @@ static void unregister_mad_snoop(struct ib_mad_snoop_private *mad_snoop_priv) void ib_unregister_mad_agent(struct ib_mad_agent *mad_agent) { struct ib_mad_agent_private *mad_agent_priv; - struct ib_mad_snoop_private *mad_snoop_priv; - - /* If the TID is zero, the agent can only snoop. */ - if (mad_agent->hi_tid) { - mad_agent_priv = container_of(mad_agent, - struct ib_mad_agent_private, - agent); - unregister_mad_agent(mad_agent_priv); - } else { - mad_snoop_priv = container_of(mad_agent, - struct ib_mad_snoop_private, - agent); - unregister_mad_snoop(mad_snoop_priv); - } + + mad_agent_priv = container_of(mad_agent, + struct ib_mad_agent_private, + agent); + unregister_mad_agent(mad_agent_priv); } EXPORT_SYMBOL(ib_unregister_mad_agent); @@ -706,59 +547,8 @@ static void dequeue_mad(struct ib_mad_list_head *mad_list) spin_unlock_irqrestore(&mad_queue->lock, flags); } -static void snoop_send(struct ib_mad_qp_info *qp_info, - struct ib_mad_send_buf *send_buf, - struct ib_mad_send_wc *mad_send_wc, - int mad_snoop_flags) -{ - struct ib_mad_snoop_private *mad_snoop_priv; - unsigned long flags; - int i; - - spin_lock_irqsave(&qp_info->snoop_lock, flags); - for (i = 0; i < qp_info->snoop_table_size; i++) { - mad_snoop_priv = qp_info->snoop_table[i]; - if (!mad_snoop_priv || - !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags)) - continue; - - atomic_inc(&mad_snoop_priv->refcount); - spin_unlock_irqrestore(&qp_info->snoop_lock, flags); - mad_snoop_priv->agent.snoop_handler(&mad_snoop_priv->agent, - send_buf, mad_send_wc); - deref_snoop_agent(mad_snoop_priv); - spin_lock_irqsave(&qp_info->snoop_lock, flags); - } - spin_unlock_irqrestore(&qp_info->snoop_lock, flags); -} - -static void snoop_recv(struct ib_mad_qp_info *qp_info, - struct ib_mad_recv_wc *mad_recv_wc, - int mad_snoop_flags) -{ - struct ib_mad_snoop_private *mad_snoop_priv; - unsigned long flags; - int i; - - spin_lock_irqsave(&qp_info->snoop_lock, flags); - for (i = 0; i < qp_info->snoop_table_size; i++) { - mad_snoop_priv = qp_info->snoop_table[i]; - if (!mad_snoop_priv || - !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags)) - continue; - - atomic_inc(&mad_snoop_priv->refcount); - spin_unlock_irqrestore(&qp_info->snoop_lock, flags); - mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent, NULL, - mad_recv_wc); - deref_snoop_agent(mad_snoop_priv); - spin_lock_irqsave(&qp_info->snoop_lock, flags); - } - spin_unlock_irqrestore(&qp_info->snoop_lock, flags); -} - static void build_smp_wc(struct ib_qp *qp, struct ib_cqe *cqe, u16 slid, - u16 pkey_index, u8 port_num, struct ib_wc *wc) + u16 pkey_index, u32 port_num, struct ib_wc *wc) { memset(wc, 0, sizeof *wc); wc->wr_cqe = cqe; @@ -817,7 +607,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, struct ib_mad_port_private *port_priv; struct ib_mad_agent_private *recv_mad_agent = NULL; struct ib_device *device = mad_agent_priv->agent.device; - u8 
port_num; + u32 port_num; struct ib_wc mad_wc; struct ib_ud_wr *send_wr = &mad_send_wr->send_wr; size_t mad_size = port_mad_size(mad_agent_priv->qp_info->port_priv); @@ -916,8 +706,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, (const struct ib_mad *)smp, (struct ib_mad *)mad_priv->mad, &mad_size, &out_mad_pkey_index); - switch (ret) - { + switch (ret) { case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY: if (ib_response_mad((const struct ib_mad_hdr *)mad_priv->mad) && mad_agent_priv->agent.recv_handler) { @@ -927,7 +716,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, * Reference MAD agent until receive * side of local completion handled */ - atomic_inc(&mad_agent_priv->refcount); + refcount_inc(&mad_agent_priv->refcount); } else kfree(mad_priv); break; @@ -967,7 +756,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, local->return_wc_byte_len = mad_size; } /* Reference MAD agent until send side of local completion handled */ - atomic_inc(&mad_agent_priv->refcount); + refcount_inc(&mad_agent_priv->refcount); /* Queue local completion to local list */ spin_lock_irqsave(&mad_agent_priv->lock, flags); list_add_tail(&local->completion_list, &mad_agent_priv->local_list); @@ -1016,7 +805,7 @@ static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr, /* Allocate data segments. */ for (left = send_buf->data_len + pad; left > 0; left -= seg_size) { - seg = kmalloc(sizeof (*seg) + seg_size, gfp_mask); + seg = kmalloc(sizeof(*seg) + seg_size, gfp_mask); if (!seg) { free_send_rmpp_list(send_wr); return -ENOMEM; @@ -1046,12 +835,11 @@ int ib_mad_kernel_rmpp_agent(const struct ib_mad_agent *agent) } EXPORT_SYMBOL(ib_mad_kernel_rmpp_agent); -struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, - u32 remote_qpn, u16 pkey_index, - int rmpp_active, - int hdr_len, int data_len, - gfp_t gfp_mask, - u8 base_version) +struct ib_mad_send_buf *ib_create_send_mad(struct ib_mad_agent *mad_agent, + u32 remote_qpn, u16 pkey_index, + int rmpp_active, int hdr_len, + int data_len, gfp_t gfp_mask, + u8 base_version) { struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_wr_private *mad_send_wr; @@ -1125,7 +913,7 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, } mad_send_wr->send_buf.mad_agent = mad_agent; - atomic_inc(&mad_agent_priv->refcount); + refcount_inc(&mad_agent_priv->refcount); return &mad_send_wr->send_buf; } EXPORT_SYMBOL(ib_create_send_mad); @@ -1340,7 +1128,7 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf, mad_send_wr->status = IB_WC_SUCCESS; /* Reference MAD agent until send completes */ - atomic_inc(&mad_agent_priv->refcount); + refcount_inc(&mad_agent_priv->refcount); spin_lock_irqsave(&mad_agent_priv->lock, flags); list_add_tail(&mad_send_wr->agent_list, &mad_agent_priv->send_list); @@ -1357,7 +1145,7 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf, spin_lock_irqsave(&mad_agent_priv->lock, flags); list_del(&mad_send_wr->agent_list); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - atomic_dec(&mad_agent_priv->refcount); + deref_mad_agent(mad_agent_priv); goto error; } } @@ -1484,11 +1272,9 @@ static void remove_methods_mad_agent(struct ib_mad_mgmt_method_table *method, int i; /* Remove any methods for this mad agent */ - for (i = 0; i < IB_MGMT_MAX_METHODS; i++) { - if (method->agent[i] == agent) { + for (i = 0; i < IB_MGMT_MAX_METHODS; i++) + if (method->agent[i] == agent) method->agent[i] = NULL; 
- } - } } static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req, @@ -1663,9 +1449,8 @@ static void remove_mad_reg_req(struct ib_mad_agent_private *agent_priv) * Was MAD registration request supplied * with original registration ? */ - if (!agent_priv->reg_req) { + if (!agent_priv->reg_req) goto out; - } port_priv = agent_priv->qp_info->port_priv; mgmt_class = convert_mgmt_class(agent_priv->reg_req->mgmt_class); @@ -1763,7 +1548,7 @@ find_mad_agent(struct ib_mad_port_private *port_priv, hi_tid = be64_to_cpu(mad_hdr->tid) >> 32; rcu_read_lock(); mad_agent = xa_load(&ib_mad_clients, hi_tid); - if (mad_agent && !atomic_inc_not_zero(&mad_agent->refcount)) + if (mad_agent && !refcount_inc_not_zero(&mad_agent->refcount)) mad_agent = NULL; rcu_read_unlock(); } else { @@ -1815,14 +1600,14 @@ find_mad_agent(struct ib_mad_port_private *port_priv, } } if (mad_agent) - atomic_inc(&mad_agent->refcount); + refcount_inc(&mad_agent->refcount); out: spin_unlock_irqrestore(&port_priv->reg_lock, flags); } if (mad_agent && !mad_agent->agent.recv_handler) { dev_notice(&port_priv->device->dev, - "No receive handler for client %p on port %d\n", + "No receive handler for client %p on port %u\n", &mad_agent->agent, port_priv->port_num); deref_mad_agent(mad_agent); mad_agent = NULL; @@ -1841,7 +1626,7 @@ static int validate_mad(const struct ib_mad_hdr *mad_hdr, /* Make sure MAD base version is understood */ if (mad_hdr->base_version != IB_MGMT_BASE_VERSION && (!opa || mad_hdr->base_version != OPA_MGMT_BASE_VERSION)) { - pr_err("MAD received with unsupported base version %d %s\n", + pr_err("MAD received with unsupported base version %u %s\n", mad_hdr->base_version, opa ? "(opa)" : ""); goto out; } @@ -1886,15 +1671,16 @@ static inline int rcv_has_same_class(const struct ib_mad_send_wr_private *wr, rwc->recv_buf.mad->mad_hdr.mgmt_class; } -static inline int rcv_has_same_gid(const struct ib_mad_agent_private *mad_agent_priv, - const struct ib_mad_send_wr_private *wr, - const struct ib_mad_recv_wc *rwc ) +static inline int +rcv_has_same_gid(const struct ib_mad_agent_private *mad_agent_priv, + const struct ib_mad_send_wr_private *wr, + const struct ib_mad_recv_wc *rwc) { struct rdma_ah_attr attr; u8 send_resp, rcv_resp; union ib_gid sgid; struct ib_device *device = mad_agent_priv->agent.device; - u8 port_num = mad_agent_priv->agent.port_num; + u32 port_num = mad_agent_priv->agent.port_num; u8 lmc; bool has_grh; @@ -2040,10 +1826,11 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, mad_agent_priv->agent.recv_handler( &mad_agent_priv->agent, NULL, mad_recv_wc); - atomic_dec(&mad_agent_priv->refcount); + deref_mad_agent(mad_agent_priv); } else { /* not user rmpp, revert to normal behavior and - * drop the mad */ + * drop the mad + */ ib_free_recv_mad(mad_recv_wc); deref_mad_agent(mad_agent_priv); return; @@ -2057,7 +1844,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, &mad_agent_priv->agent, &mad_send_wr->send_buf, mad_recv_wc); - atomic_dec(&mad_agent_priv->refcount); + deref_mad_agent(mad_agent_priv); mad_send_wc.status = IB_WC_SUCCESS; mad_send_wc.vendor_err = 0; @@ -2069,14 +1856,12 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, mad_recv_wc); deref_mad_agent(mad_agent_priv); } - - return; } static enum smi_action handle_ib_smi(const struct ib_mad_port_private *port_priv, const struct ib_mad_qp_info *qp_info, const struct ib_wc *wc, - int port_num, + u32 port_num, struct ib_mad_private *recv, struct 
ib_mad_private *response) { @@ -2163,7 +1948,7 @@ static enum smi_action handle_opa_smi(struct ib_mad_port_private *port_priv, struct ib_mad_qp_info *qp_info, struct ib_wc *wc, - int port_num, + u32 port_num, struct ib_mad_private *recv, struct ib_mad_private *response) { @@ -2219,7 +2004,7 @@ static enum smi_action handle_smi(struct ib_mad_port_private *port_priv, struct ib_mad_qp_info *qp_info, struct ib_wc *wc, - int port_num, + u32 port_num, struct ib_mad_private *recv, struct ib_mad_private *response, bool opa) @@ -2243,7 +2028,7 @@ static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc) struct ib_mad_private_header *mad_priv_hdr; struct ib_mad_private *recv, *response = NULL; struct ib_mad_agent_private *mad_agent; - int port_num; + u32 port_num; int ret = IB_MAD_RESULT_SUCCESS; size_t mad_size; u16 resp_mad_pkey_index = 0; @@ -2289,9 +2074,6 @@ static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc) recv->header.recv_wc.recv_buf.mad = (struct ib_mad *)recv->mad; recv->header.recv_wc.recv_buf.grh = &recv->grh; - if (atomic_read(&qp_info->snoop_count)) - snoop_recv(qp_info, &recv->header.recv_wc, IB_MAD_SNOOP_RECVS); - /* Validate MAD */ if (!validate_mad((const struct ib_mad_hdr *)recv->mad, qp_info, opa)) goto out; @@ -2414,9 +2196,10 @@ static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr) temp_mad_send_wr->timeout)) break; } - } - else + } else { list_item = &mad_agent_priv->wait_list; + } + list_add(&mad_send_wr->agent_list, list_item); /* Reschedule a work item if we have a shorter timeout */ @@ -2470,7 +2253,7 @@ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr, adjust_timeout(mad_agent_priv); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - if (mad_send_wr->status != IB_WC_SUCCESS ) + if (mad_send_wr->status != IB_WC_SUCCESS) mad_send_wc->status = mad_send_wr->status; if (ret == IB_RMPP_RESULT_INTERNAL) ib_rmpp_send_handler(mad_send_wc); @@ -2538,9 +2321,6 @@ retry: mad_send_wc.send_buf = &mad_send_wr->send_buf; mad_send_wc.status = wc->status; mad_send_wc.vendor_err = wc->vendor_err; - if (atomic_read(&qp_info->snoop_count)) - snoop_send(qp_info, &mad_send_wr->send_buf, &mad_send_wc, - IB_MAD_SNOOP_SEND_COMPLETIONS); ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); if (queued_send_wr) { @@ -2653,7 +2433,7 @@ static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv) list_del(&mad_send_wr->agent_list); mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc); - atomic_dec(&mad_agent_priv->refcount); + deref_mad_agent(mad_agent_priv); } } @@ -2679,16 +2459,18 @@ find_send_wr(struct ib_mad_agent_private *mad_agent_priv, return NULL; } -int ib_modify_mad(struct ib_mad_agent *mad_agent, - struct ib_mad_send_buf *send_buf, u32 timeout_ms) +int ib_modify_mad(struct ib_mad_send_buf *send_buf, u32 timeout_ms) { struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_wr_private *mad_send_wr; unsigned long flags; int active; - mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private, - agent); + if (!send_buf) + return -EINVAL; + + mad_agent_priv = container_of(send_buf->mad_agent, + struct ib_mad_agent_private, agent); spin_lock_irqsave(&mad_agent_priv->lock, flags); mad_send_wr = find_send_wr(mad_agent_priv, send_buf); if (!mad_send_wr || mad_send_wr->status != IB_WC_SUCCESS) { @@ -2713,13 +2495,6 @@ int ib_modify_mad(struct ib_mad_agent *mad_agent, } EXPORT_SYMBOL(ib_modify_mad); -void ib_cancel_mad(struct ib_mad_agent *mad_agent, - struct ib_mad_send_buf *send_buf) -{ 
- ib_modify_mad(mad_agent, send_buf, 0); -} -EXPORT_SYMBOL(ib_cancel_mad); - static void local_completions(struct work_struct *work) { struct ib_mad_agent_private *mad_agent_priv; @@ -2782,16 +2557,12 @@ static void local_completions(struct work_struct *work) local->mad_priv->header.recv_wc.recv_buf.grh = NULL; local->mad_priv->header.recv_wc.recv_buf.mad = (struct ib_mad *)local->mad_priv->mad; - if (atomic_read(&recv_mad_agent->qp_info->snoop_count)) - snoop_recv(recv_mad_agent->qp_info, - &local->mad_priv->header.recv_wc, - IB_MAD_SNOOP_RECVS); recv_mad_agent->agent.recv_handler( &recv_mad_agent->agent, &local->mad_send_wr->send_buf, &local->mad_priv->header.recv_wc); spin_lock_irqsave(&recv_mad_agent->lock, flags); - atomic_dec(&recv_mad_agent->refcount); + deref_mad_agent(recv_mad_agent); spin_unlock_irqrestore(&recv_mad_agent->lock, flags); } @@ -2800,15 +2571,11 @@ local_send_completion: mad_send_wc.status = IB_WC_SUCCESS; mad_send_wc.vendor_err = 0; mad_send_wc.send_buf = &local->mad_send_wr->send_buf; - if (atomic_read(&mad_agent_priv->qp_info->snoop_count)) - snoop_send(mad_agent_priv->qp_info, - &local->mad_send_wr->send_buf, - &mad_send_wc, IB_MAD_SNOOP_SEND_COMPLETIONS); mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc); spin_lock_irqsave(&mad_agent_priv->lock, flags); - atomic_dec(&mad_agent_priv->refcount); + deref_mad_agent(mad_agent_priv); if (free_mad) kfree(local->mad_priv); kfree(local); @@ -2894,7 +2661,7 @@ static void timeout_sends(struct work_struct *work) mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc); - atomic_dec(&mad_agent_priv->refcount); + deref_mad_agent(mad_agent_priv); spin_lock_irqsave(&mad_agent_priv->lock, flags); } spin_unlock_irqrestore(&mad_agent_priv->lock, flags); @@ -2941,6 +2708,7 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, DMA_FROM_DEVICE); if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device, sg_list.addr))) { + kfree(mad_priv); ret = -ENOMEM; break; } @@ -3099,7 +2867,7 @@ static void qp_event_handler(struct ib_event *event, void *qp_context) /* It's worse than that! He's dead, Jim! 
*/ dev_err(&qp_info->port_priv->device->dev, - "Fatal error (%d) on MAD QP (%d)\n", + "Fatal error (%d) on MAD QP (%u)\n", event->event, qp_info->qp->qp_num); } @@ -3119,10 +2887,6 @@ static void init_mad_qp(struct ib_mad_port_private *port_priv, init_mad_queue(qp_info, &qp_info->send_queue); init_mad_queue(qp_info, &qp_info->recv_queue); INIT_LIST_HEAD(&qp_info->overflow_list); - spin_lock_init(&qp_info->snoop_lock); - qp_info->snoop_table = NULL; - qp_info->snoop_table_size = 0; - atomic_set(&qp_info->snoop_count, 0); } static int create_mad_qp(struct ib_mad_qp_info *qp_info, @@ -3166,7 +2930,6 @@ static void destroy_mad_qp(struct ib_mad_qp_info *qp_info) return; ib_destroy_qp(qp_info->qp); - kfree(qp_info->snoop_table); } /* @@ -3174,7 +2937,7 @@ static void destroy_mad_qp(struct ib_mad_qp_info *qp_info) * Create the QP, PD, MR, and CQ if needed */ static int ib_mad_port_open(struct ib_device *device, - int port_num) + u32 port_num) { int ret, cq_size; struct ib_mad_port_private *port_priv; @@ -3229,7 +2992,7 @@ static int ib_mad_port_open(struct ib_device *device, if (ret) goto error7; - snprintf(name, sizeof name, "ib_mad%d", port_num); + snprintf(name, sizeof(name), "ib_mad%u", port_num); port_priv->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); if (!port_priv->wq) { ret = -ENOMEM; @@ -3275,7 +3038,7 @@ error3: * If there are no classes using the port, free the port * resources (CQ, MR, PD, QP) and remove the port's info structure */ -static int ib_mad_port_close(struct ib_device *device, int port_num) +static int ib_mad_port_close(struct ib_device *device, u32 port_num) { struct ib_mad_port_private *port_priv; unsigned long flags; @@ -3284,7 +3047,7 @@ static int ib_mad_port_close(struct ib_device *device, int port_num) port_priv = __ib_get_mad_port(device, port_num); if (port_priv == NULL) { spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); - dev_err(&device->dev, "Port %d not found\n", port_num); + dev_err(&device->dev, "Port %u not found\n", port_num); return -ENODEV; } list_del_init(&port_priv->port_list); @@ -3304,9 +3067,11 @@ static int ib_mad_port_close(struct ib_device *device, int port_num) return 0; } -static void ib_mad_init_device(struct ib_device *device) +static int ib_mad_init_device(struct ib_device *device) { int start, i; + unsigned int count = 0; + int ret; start = rdma_start_port(device); @@ -3314,17 +3079,23 @@ static void ib_mad_init_device(struct ib_device *device) if (!rdma_cap_ib_mad(device, i)) continue; - if (ib_mad_port_open(device, i)) { + ret = ib_mad_port_open(device, i); + if (ret) { dev_err(&device->dev, "Couldn't open port %d\n", i); goto error; } - if (ib_agent_port_open(device, i)) { + ret = ib_agent_port_open(device, i); + if (ret) { dev_err(&device->dev, "Couldn't open port %d for agents\n", i); goto error_agent; } + count++; } - return; + if (!count) + return -EOPNOTSUPP; + + return 0; error_agent: if (ib_mad_port_close(device, i)) @@ -3341,6 +3112,7 @@ error: if (ib_mad_port_close(device, i)) dev_err(&device->dev, "Couldn't close port %d\n", i); } + return ret; } static void ib_mad_remove_device(struct ib_device *device, void *client_data) @@ -3353,9 +3125,9 @@ static void ib_mad_remove_device(struct ib_device *device, void *client_data) if (ib_agent_port_close(device, i)) dev_err(&device->dev, - "Couldn't close port %d for agents\n", i); + "Couldn't close port %u for agents\n", i); if (ib_mad_port_close(device, i)) - dev_err(&device->dev, "Couldn't close port %d\n", i); + dev_err(&device->dev, "Couldn't close port %u\n", i); } } 
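[Editor's note: the mad.c hunks above replace the agent's atomic_t reference count with refcount_t while keeping the existing completion-based teardown (deref_mad_agent() followed by wait_for_completion() in unregister_mad_agent()). A minimal sketch of that pattern, under the assumption of illustrative names (my_agent, my_agent_get/put/destroy) that are not part of this patch:

	#include <linux/refcount.h>
	#include <linux/completion.h>
	#include <linux/slab.h>

	struct my_agent {
		refcount_t refcount;
		struct completion comp;
	};

	/* Take a reference; refcount_inc() warns if the count is already zero. */
	static void my_agent_get(struct my_agent *a)
	{
		refcount_inc(&a->refcount);
	}

	/* Drop a reference; the final put wakes whoever waits in teardown. */
	static void my_agent_put(struct my_agent *a)
	{
		if (refcount_dec_and_test(&a->refcount))
			complete(&a->comp);
	}

	/* Teardown drops its own reference, then waits for outstanding users. */
	static void my_agent_destroy(struct my_agent *a)
	{
		my_agent_put(a);
		wait_for_completion(&a->comp);
		kfree(a);
	}

The same idea appears in the lookup path of the patch, where refcount_inc_not_zero() is used under RCU so a concurrent teardown that has already reached zero is not resurrected.]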
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h index 956b3a7dfed7..1b7445a6f671 100644 --- a/drivers/infiniband/core/mad_priv.h +++ b/drivers/infiniband/core/mad_priv.h @@ -79,13 +79,13 @@ struct ib_mad_private { struct ib_mad_private_header header; size_t mad_size; struct ib_grh grh; - u8 mad[0]; + u8 mad[]; } __packed; struct ib_rmpp_segment { struct list_head list; u32 num; - u8 data[0]; + u8 data[]; }; struct ib_mad_agent_private { @@ -103,7 +103,7 @@ struct ib_mad_agent_private { struct work_struct local_work; struct list_head rmpp_list; - atomic_t refcount; + refcount_t refcount; union { struct completion comp; struct rcu_head rcu; @@ -115,7 +115,6 @@ struct ib_mad_snoop_private { struct ib_mad_qp_info *qp_info; int snoop_index; int mad_snoop_flags; - atomic_t refcount; struct completion comp; }; diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c index 5ec57abc0849..8af0619a39cd 100644 --- a/drivers/infiniband/core/mad_rmpp.c +++ b/drivers/infiniband/core/mad_rmpp.c @@ -40,8 +40,7 @@ enum rmpp_state { RMPP_STATE_ACTIVE, RMPP_STATE_TIMEOUT, - RMPP_STATE_COMPLETE, - RMPP_STATE_CANCELING + RMPP_STATE_COMPLETE }; struct mad_rmpp_recv { @@ -52,7 +51,7 @@ struct mad_rmpp_recv { struct completion comp; enum rmpp_state state; spinlock_t lock; - atomic_t refcount; + refcount_t refcount; struct ib_ah *ah; struct ib_mad_recv_wc *rmpp_wc; @@ -73,7 +72,7 @@ struct mad_rmpp_recv { static inline void deref_rmpp_recv(struct mad_rmpp_recv *rmpp_recv) { - if (atomic_dec_and_test(&rmpp_recv->refcount)) + if (refcount_dec_and_test(&rmpp_recv->refcount)) complete(&rmpp_recv->comp); } @@ -92,22 +91,18 @@ void ib_cancel_rmpp_recvs(struct ib_mad_agent_private *agent) spin_lock_irqsave(&agent->lock, flags); list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) { - if (rmpp_recv->state != RMPP_STATE_COMPLETE) - ib_free_recv_mad(rmpp_recv->rmpp_wc); - rmpp_recv->state = RMPP_STATE_CANCELING; - } - spin_unlock_irqrestore(&agent->lock, flags); - - list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) { cancel_delayed_work(&rmpp_recv->timeout_work); cancel_delayed_work(&rmpp_recv->cleanup_work); } + spin_unlock_irqrestore(&agent->lock, flags); flush_workqueue(agent->qp_info->port_priv->wq); list_for_each_entry_safe(rmpp_recv, temp_rmpp_recv, &agent->rmpp_list, list) { list_del(&rmpp_recv->list); + if (rmpp_recv->state != RMPP_STATE_COMPLETE) + ib_free_recv_mad(rmpp_recv->rmpp_wc); destroy_rmpp_recv(rmpp_recv); } } @@ -272,10 +267,6 @@ static void recv_cleanup_handler(struct work_struct *work) unsigned long flags; spin_lock_irqsave(&rmpp_recv->agent->lock, flags); - if (rmpp_recv->state == RMPP_STATE_CANCELING) { - spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags); - return; - } list_del(&rmpp_recv->list); spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags); destroy_rmpp_recv(rmpp_recv); @@ -305,7 +296,7 @@ create_rmpp_recv(struct ib_mad_agent_private *agent, INIT_DELAYED_WORK(&rmpp_recv->cleanup_work, recv_cleanup_handler); spin_lock_init(&rmpp_recv->lock); rmpp_recv->state = RMPP_STATE_ACTIVE; - atomic_set(&rmpp_recv->refcount, 1); + refcount_set(&rmpp_recv->refcount, 1); rmpp_recv->rmpp_wc = mad_recv_wc; rmpp_recv->cur_seg_buf = &mad_recv_wc->recv_buf; @@ -357,7 +348,7 @@ acquire_rmpp_recv(struct ib_mad_agent_private *agent, spin_lock_irqsave(&agent->lock, flags); rmpp_recv = find_rmpp_recv(agent, mad_recv_wc); if (rmpp_recv) - atomic_inc(&rmpp_recv->refcount); + refcount_inc(&rmpp_recv->refcount); 
spin_unlock_irqrestore(&agent->lock, flags); return rmpp_recv; } @@ -391,8 +382,8 @@ static inline int get_seg_num(struct ib_mad_recv_buf *seg) return be32_to_cpu(rmpp_mad->rmpp_hdr.seg_num); } -static inline struct ib_mad_recv_buf * get_next_seg(struct list_head *rmpp_list, - struct ib_mad_recv_buf *seg) +static inline struct ib_mad_recv_buf *get_next_seg(struct list_head *rmpp_list, + struct ib_mad_recv_buf *seg) { if (seg->list.next == rmpp_list) return NULL; @@ -405,8 +396,8 @@ static inline int window_size(struct ib_mad_agent_private *agent) return max(agent->qp_info->recv_queue.max_active >> 3, 1); } -static struct ib_mad_recv_buf * find_seg_location(struct list_head *rmpp_list, - int seg_num) +static struct ib_mad_recv_buf *find_seg_location(struct list_head *rmpp_list, + int seg_num) { struct ib_mad_recv_buf *seg_buf; int cur_seg_num; @@ -458,7 +449,7 @@ static inline int get_mad_len(struct mad_rmpp_recv *rmpp_recv) return hdr_size + rmpp_recv->seg_num * data_size - pad; } -static struct ib_mad_recv_wc * complete_rmpp(struct mad_rmpp_recv *rmpp_recv) +static struct ib_mad_recv_wc *complete_rmpp(struct mad_rmpp_recv *rmpp_recv) { struct ib_mad_recv_wc *rmpp_wc; @@ -553,7 +544,7 @@ start_rmpp(struct ib_mad_agent_private *agent, destroy_rmpp_recv(rmpp_recv); return continue_rmpp(agent, mad_recv_wc); } - atomic_inc(&rmpp_recv->refcount); + refcount_inc(&rmpp_recv->refcount); if (get_last_flag(&mad_recv_wc->recv_buf)) { rmpp_recv->state = RMPP_STATE_COMPLETE; diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index cd338ddc4a39..a236532a9026 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -42,7 +42,7 @@ #include <rdma/ib_cache.h> #include "sa.h" -static void mcast_add_one(struct ib_device *device); +static int mcast_add_one(struct ib_device *device); static void mcast_remove_one(struct ib_device *device, void *client_data); static struct ib_client mcast_client = { @@ -61,9 +61,9 @@ struct mcast_port { struct mcast_device *dev; spinlock_t lock; struct rb_root table; - atomic_t refcount; + refcount_t refcount; struct completion comp; - u8 port_num; + u32 port_num; }; struct mcast_device { @@ -71,7 +71,7 @@ struct mcast_device { struct ib_event_handler event_handler; int start_port; int end_port; - struct mcast_port port[0]; + struct mcast_port port[]; }; enum mcast_state { @@ -117,7 +117,7 @@ struct mcast_member { struct mcast_group *group; struct list_head list; enum mcast_state state; - atomic_t refcount; + refcount_t refcount; struct completion comp; }; @@ -178,7 +178,7 @@ static struct mcast_group *mcast_insert(struct mcast_port *port, static void deref_port(struct mcast_port *port) { - if (atomic_dec_and_test(&port->refcount)) + if (refcount_dec_and_test(&port->refcount)) complete(&port->comp); } @@ -199,7 +199,7 @@ static void release_group(struct mcast_group *group) static void deref_member(struct mcast_member *member) { - if (atomic_dec_and_test(&member->refcount)) + if (refcount_dec_and_test(&member->refcount)) complete(&member->comp); } @@ -401,7 +401,7 @@ static void process_group_error(struct mcast_group *group) while (!list_empty(&group->active_list)) { member = list_entry(group->active_list.next, struct mcast_member, list); - atomic_inc(&member->refcount); + refcount_inc(&member->refcount); list_del_init(&member->list); adjust_membership(group, member->multicast.rec.join_state, -1); member->state = MCAST_ERROR; @@ -445,7 +445,7 @@ retest: struct mcast_member, list); multicast = 
&member->multicast; join_state = multicast->rec.join_state; - atomic_inc(&member->refcount); + refcount_inc(&member->refcount); if (join_state == (group->rec.join_state & join_state)) { status = cmp_rec(&group->rec, &multicast->rec, @@ -497,7 +497,7 @@ static void process_join_error(struct mcast_group *group, int status) member = list_entry(group->pending_list.next, struct mcast_member, list); if (group->last_join == member) { - atomic_inc(&member->refcount); + refcount_inc(&member->refcount); list_del_init(&member->list); spin_unlock_irq(&group->lock); ret = member->multicast.callback(status, &member->multicast); @@ -589,7 +589,7 @@ static struct mcast_group *acquire_group(struct mcast_port *port, kfree(group); group = cur_group; } else - atomic_inc(&port->refcount); + refcount_inc(&port->refcount); found: atomic_inc(&group->refcount); spin_unlock_irqrestore(&port->lock, flags); @@ -605,7 +605,7 @@ found: */ struct ib_sa_multicast * ib_sa_join_multicast(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, + struct ib_device *device, u32 port_num, struct ib_sa_mcmember_rec *rec, ib_sa_comp_mask comp_mask, gfp_t gfp_mask, int (*callback)(int status, @@ -632,7 +632,7 @@ ib_sa_join_multicast(struct ib_sa_client *client, member->multicast.callback = callback; member->multicast.context = context; init_completion(&member->comp); - atomic_set(&member->refcount, 1); + refcount_set(&member->refcount, 1); member->state = MCAST_JOINING; member->group = acquire_group(&dev->port[port_num - dev->start_port], @@ -690,7 +690,7 @@ void ib_sa_free_multicast(struct ib_sa_multicast *multicast) } EXPORT_SYMBOL(ib_sa_free_multicast); -int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num, +int ib_sa_get_mcmember_rec(struct ib_device *device, u32 port_num, union ib_gid *mgid, struct ib_sa_mcmember_rec *rec) { struct mcast_device *dev; @@ -721,6 +721,7 @@ EXPORT_SYMBOL(ib_sa_get_mcmember_rec); * member record and gid of the device. * @device: RDMA device * @port_num: Port of the rdma device to consider + * @rec: Multicast member record to use * @ndev: Optional netdevice, applicable only for RoCE * @gid_type: GID type to consider * @ah_attr: AH attribute to fillup on successful completion @@ -731,7 +732,7 @@ EXPORT_SYMBOL(ib_sa_get_mcmember_rec); * success or appropriate error code. 
* */ -int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, +int ib_init_ah_from_mcmember(struct ib_device *device, u32 port_num, struct ib_sa_mcmember_rec *rec, struct net_device *ndev, enum ib_gid_type gid_type, @@ -815,7 +816,7 @@ static void mcast_event_handler(struct ib_event_handler *handler, } } -static void mcast_add_one(struct ib_device *device) +static int mcast_add_one(struct ib_device *device) { struct mcast_device *dev; struct mcast_port *port; @@ -825,7 +826,7 @@ static void mcast_add_one(struct ib_device *device) dev = kmalloc(struct_size(dev, port, device->phys_port_cnt), GFP_KERNEL); if (!dev) - return; + return -ENOMEM; dev->start_port = rdma_start_port(device); dev->end_port = rdma_end_port(device); @@ -839,13 +840,13 @@ static void mcast_add_one(struct ib_device *device) spin_lock_init(&port->lock); port->table = RB_ROOT; init_completion(&port->comp); - atomic_set(&port->refcount, 1); + refcount_set(&port->refcount, 1); ++count; } if (!count) { kfree(dev); - return; + return -EOPNOTSUPP; } dev->device = device; @@ -853,6 +854,7 @@ static void mcast_add_one(struct ib_device *device) INIT_IB_EVENT_HANDLER(&dev->event_handler, device, mcast_event_handler); ib_register_event_handler(&dev->event_handler); + return 0; } static void mcast_remove_one(struct ib_device *device, void *client_data) @@ -861,9 +863,6 @@ static void mcast_remove_one(struct ib_device *device, void *client_data) struct mcast_port *port; int i; - if (!dev) - return; - ib_unregister_event_handler(&dev->event_handler); flush_workqueue(mcast_wq); diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 8cd31ef25eff..1b2cc9e45ade 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -98,7 +98,7 @@ get_cb_table(const struct sk_buff *skb, unsigned int type, unsigned int op) */ up_read(&rdma_nl_types[type].sem); - request_module("rdma-netlink-subsys-%d", type); + request_module("rdma-netlink-subsys-%u", type); down_read(&rdma_nl_types[type].sem); cb_table = READ_ONCE(rdma_nl_types[type].cb_table); diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index e0b0a91da696..12dc97067ed2 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -92,7 +92,9 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_RES_CQE] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_CQN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_CQ_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_CTX] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_CTXN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_CTX_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_DST_ADDR] = { .len = sizeof(struct __kernel_sockaddr_storage) }, [RDMA_NLDEV_ATTR_RES_IOVA] = { .type = NLA_U64 }, @@ -114,6 +116,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_RES_PS] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_QP] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_QP_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_RAW] = { .type = NLA_BINARY }, [RDMA_NLDEV_ATTR_RES_RKEY] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_RQPN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_RQ_PSN] = { .type = NLA_U32 }, @@ -129,6 +132,11 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_RES_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY]= { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_USECNT] = { .type = NLA_U64 }, + 
[RDMA_NLDEV_ATTR_RES_SRQ] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_SRQN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_SRQ_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_MIN_RANGE] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_MAX_RANGE] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_SM_LID] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK] = { .type = NLA_U32 }, @@ -145,6 +153,9 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID] = { .type = NLA_U32 }, [RDMA_NLDEV_NET_NS_FD] = { .type = NLA_U32 }, [RDMA_NLDEV_SYS_ATTR_NETNS_MODE] = { .type = NLA_U8 }, + [RDMA_NLDEV_SYS_ATTR_COPY_ON_FORK] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_DYNAMIC] = { .type = NLA_U8 }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -241,7 +252,7 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) { char fw[IB_FW_VERSION_NAME_MAX]; int ret = 0; - u8 port; + u32 port; if (fill_nldev_handle(msg, device)) return -EMSGSIZE; @@ -384,6 +395,7 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device *device) [RDMA_RESTRACK_CM_ID] = "cm_id", [RDMA_RESTRACK_MR] = "mr", [RDMA_RESTRACK_CTX] = "ctx", + [RDMA_RESTRACK_SRQ] = "srq", }; struct nlattr *table_attr; @@ -446,27 +458,11 @@ static int fill_res_name_pid(struct sk_buff *msg, return err ? -EMSGSIZE : 0; } -static bool fill_res_entry(struct ib_device *dev, struct sk_buff *msg, - struct rdma_restrack_entry *res) -{ - if (!dev->ops.fill_res_entry) - return false; - return dev->ops.fill_res_entry(msg, res); -} - -static bool fill_stat_entry(struct ib_device *dev, struct sk_buff *msg, - struct rdma_restrack_entry *res) -{ - if (!dev->ops.fill_stat_entry) - return false; - return dev->ops.fill_stat_entry(msg, res); -} - -static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin, - struct rdma_restrack_entry *res, uint32_t port) +static int fill_res_qp_entry_query(struct sk_buff *msg, + struct rdma_restrack_entry *res, + struct ib_device *dev, + struct ib_qp *qp) { - struct ib_qp *qp = container_of(res, struct ib_qp, res); - struct ib_device *dev = qp->device; struct ib_qp_init_attr qp_init_attr; struct ib_qp_attr qp_attr; int ret; @@ -475,16 +471,6 @@ static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin, if (ret) return ret; - if (port && port != qp_attr.port_num) - return -EAGAIN; - - /* In create_qp() port is not set yet */ - if (qp_attr.port_num && - nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, qp_attr.port_num)) - goto err; - - if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qp->qp_num)) - goto err; if (qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_UC) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RQPN, qp_attr.dest_qp_num)) @@ -508,19 +494,53 @@ static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin, if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, qp_attr.qp_state)) goto err; + if (dev->ops.fill_res_qp_entry) + return dev->ops.fill_res_qp_entry(msg, qp); + return 0; + +err: return -EMSGSIZE; +} + +static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_qp *qp = container_of(res, struct ib_qp, res); + struct ib_device *dev = qp->device; + int ret; + + if (port && port != qp->port) + return -EAGAIN; + + /* In create_qp() port is not set yet */ + if (qp->port && nla_put_u32(msg, 
RDMA_NLDEV_ATTR_PORT_INDEX, qp->port)) + return -EINVAL; + + ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qp->qp_num); + if (ret) + return -EMSGSIZE; + if (!rdma_is_kernel_res(res) && nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, qp->pd->res.id)) - goto err; + return -EMSGSIZE; - if (fill_res_name_pid(msg, res)) - goto err; + ret = fill_res_name_pid(msg, res); + if (ret) + return -EMSGSIZE; - if (fill_res_entry(dev, msg, res)) - goto err; + return fill_res_qp_entry_query(msg, res, dev, qp); +} - return 0; +static int fill_res_qp_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_qp *qp = container_of(res, struct ib_qp, res); + struct ib_device *dev = qp->device; -err: return -EMSGSIZE; + if (port && port != qp->port) + return -EAGAIN; + if (!dev->ops.fill_res_qp_entry_raw) + return -EINVAL; + return dev->ops.fill_res_qp_entry_raw(msg, qp); } static int fill_res_cm_id_entry(struct sk_buff *msg, bool has_cap_net_admin, @@ -568,9 +588,8 @@ static int fill_res_cm_id_entry(struct sk_buff *msg, bool has_cap_net_admin, if (fill_res_name_pid(msg, res)) goto err; - if (fill_res_entry(dev, msg, res)) - goto err; - + if (dev->ops.fill_res_cm_id_entry) + return dev->ops.fill_res_cm_id_entry(msg, cm_id); return 0; err: return -EMSGSIZE; @@ -583,35 +602,42 @@ static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin, struct ib_device *dev = cq->device; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQE, cq->cqe)) - goto err; + return -EMSGSIZE; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_USECNT, atomic_read(&cq->usecnt), RDMA_NLDEV_ATTR_PAD)) - goto err; + return -EMSGSIZE; /* Poll context is only valid for kernel CQs */ if (rdma_is_kernel_res(res) && nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx)) - goto err; + return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, (cq->dim != NULL))) - goto err; + return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id)) - goto err; + return -EMSGSIZE; if (!rdma_is_kernel_res(res) && nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, cq->uobject->uevent.uobject.context->res.id)) - goto err; + return -EMSGSIZE; if (fill_res_name_pid(msg, res)) - goto err; + return -EMSGSIZE; - if (fill_res_entry(dev, msg, res)) - goto err; + return (dev->ops.fill_res_cq_entry) ? 
+ dev->ops.fill_res_cq_entry(msg, cq) : 0; +} - return 0; +static int fill_res_cq_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_cq *cq = container_of(res, struct ib_cq, res); + struct ib_device *dev = cq->device; -err: return -EMSGSIZE; + if (!dev->ops.fill_res_cq_entry_raw) + return -EINVAL; + return dev->ops.fill_res_cq_entry_raw(msg, cq); } static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin, @@ -622,38 +648,45 @@ static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin, if (has_cap_net_admin) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RKEY, mr->rkey)) - goto err; + return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LKEY, mr->lkey)) - goto err; + return -EMSGSIZE; } if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_MRLEN, mr->length, RDMA_NLDEV_ATTR_PAD)) - goto err; + return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id)) - goto err; + return -EMSGSIZE; if (!rdma_is_kernel_res(res) && nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, mr->pd->res.id)) - goto err; + return -EMSGSIZE; if (fill_res_name_pid(msg, res)) - goto err; + return -EMSGSIZE; - if (fill_res_entry(dev, msg, res)) - goto err; + return (dev->ops.fill_res_mr_entry) ? + dev->ops.fill_res_mr_entry(msg, mr) : + 0; +} - return 0; +static int fill_res_mr_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_mr *mr = container_of(res, struct ib_mr, res); + struct ib_device *dev = mr->pd->device; -err: return -EMSGSIZE; + if (!dev->ops.fill_res_mr_entry_raw) + return -EINVAL; + return dev->ops.fill_res_mr_entry_raw(msg, mr); } static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_pd *pd = container_of(res, struct ib_pd, res); - struct ib_device *dev = pd->device; if (has_cap_net_admin) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY, @@ -676,15 +709,138 @@ static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin, pd->uobject->context->res.id)) goto err; - if (fill_res_name_pid(msg, res)) - goto err; + return fill_res_name_pid(msg, res); + +err: return -EMSGSIZE; +} + +static int fill_res_ctx_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_ucontext *ctx = container_of(res, struct ib_ucontext, res); + + if (rdma_is_kernel_res(res)) + return 0; - if (fill_res_entry(dev, msg, res)) + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, ctx->res.id)) + return -EMSGSIZE; + + return fill_res_name_pid(msg, res); +} + +static int fill_res_range_qp_entry(struct sk_buff *msg, uint32_t min_range, + uint32_t max_range) +{ + struct nlattr *entry_attr; + + if (!min_range) + return 0; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY); + if (!entry_attr) + return -EMSGSIZE; + + if (min_range == max_range) { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, min_range)) + goto err; + } else { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_MIN_RANGE, min_range)) + goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_MAX_RANGE, max_range)) + goto err; + } + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); + return -EMSGSIZE; +} + +static int fill_res_srq_qps(struct sk_buff *msg, struct ib_srq *srq) +{ + uint32_t min_range = 0, prev = 0; + struct rdma_restrack_entry *res; + struct rdma_restrack_root *rt; + struct nlattr *table_attr; + struct ib_qp *qp = 
NULL; + unsigned long id = 0; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP); + if (!table_attr) + return -EMSGSIZE; + + rt = &srq->device->res[RDMA_RESTRACK_QP]; + xa_lock(&rt->xa); + xa_for_each(&rt->xa, id, res) { + if (!rdma_restrack_get(res)) + continue; + + qp = container_of(res, struct ib_qp, res); + if (!qp->srq || (qp->srq->res.id != srq->res.id)) { + rdma_restrack_put(res); + continue; + } + + if (qp->qp_num < prev) + /* qp_num should be ascending */ + goto err_loop; + + if (min_range == 0) { + min_range = qp->qp_num; + } else if (qp->qp_num > (prev + 1)) { + if (fill_res_range_qp_entry(msg, min_range, prev)) + goto err_loop; + + min_range = qp->qp_num; + } + prev = qp->qp_num; + rdma_restrack_put(res); + } + + xa_unlock(&rt->xa); + + if (fill_res_range_qp_entry(msg, min_range, prev)) goto err; + nla_nest_end(msg, table_attr); return 0; -err: return -EMSGSIZE; +err_loop: + rdma_restrack_put(res); + xa_unlock(&rt->xa); +err: + nla_nest_cancel(msg, table_attr); + return -EMSGSIZE; +} + +static int fill_res_srq_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_srq *srq = container_of(res, struct ib_srq, res); + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_SRQN, srq->res.id)) + goto err; + + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, srq->srq_type)) + goto err; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, srq->pd->res.id)) + goto err; + + if (ib_srq_has_cq(srq->srq_type)) { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, + srq->ext.cq->res.id)) + goto err; + } + + if (fill_res_srq_qps(msg, srq)) + goto err; + + return fill_res_name_pid(msg, res); + +err: + return -EMSGSIZE; } static int fill_stat_counter_mode(struct sk_buff *msg, @@ -695,11 +851,16 @@ static int fill_stat_counter_mode(struct sk_buff *msg, if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, m->mode)) return -EMSGSIZE; - if (m->mode == RDMA_COUNTER_MODE_AUTO) + if (m->mode == RDMA_COUNTER_MODE_AUTO) { if ((m->mask & RDMA_COUNTER_MASK_QP_TYPE) && nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, m->param.qp_type)) return -EMSGSIZE; + if ((m->mask & RDMA_COUNTER_MASK_PID) && + fill_res_name_pid(msg, &counter->res)) + return -EMSGSIZE; + } + return 0; } @@ -738,9 +899,6 @@ static int fill_stat_counter_qps(struct sk_buff *msg, xa_lock(&rt->xa); xa_for_each(&rt->xa, id, res) { qp = container_of(res, struct ib_qp, res); - if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW)) - continue; - if (!qp->counter || (qp->counter->id != counter->id)) continue; @@ -793,9 +951,8 @@ static int fill_stat_mr_entry(struct sk_buff *msg, bool has_cap_net_admin, if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id)) goto err; - if (fill_stat_entry(dev, msg, res)) - goto err; - + if (dev->ops.fill_stat_mr_entry) + return dev->ops.fill_stat_mr_entry(msg, mr); return 0; err: @@ -813,14 +970,21 @@ static int fill_stat_counter_hwcounters(struct sk_buff *msg, if (!table_attr) return -EMSGSIZE; - for (i = 0; i < st->num_counters; i++) - if (rdma_nl_stat_hwcounter_entry(msg, st->names[i], st->value[i])) + mutex_lock(&st->lock); + for (i = 0; i < st->num_counters; i++) { + if (test_bit(i, st->is_disabled)) + continue; + if (rdma_nl_stat_hwcounter_entry(msg, st->descs[i].name, + st->value[i])) goto err; + } + mutex_unlock(&st->lock); nla_nest_end(msg, table_attr); return 0; err: + mutex_unlock(&st->lock); nla_nest_cancel(msg, table_attr); return -EMSGSIZE; } @@ -840,7 +1004,6 @@ static int fill_res_counter_entry(struct sk_buff *msg, bool has_cap_net_admin, if 
(nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, counter->port) || nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, counter->id) || - fill_res_name_pid(msg, &counter->res) || fill_stat_counter_mode(msg, counter) || fill_stat_counter_qps(msg, counter) || fill_stat_counter_hwcounters(msg, counter)) @@ -916,8 +1079,12 @@ static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[RDMA_NLDEV_ATTR_DEV_NAME]) { char name[IB_DEVICE_NAME_MAX] = {}; - nla_strlcpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME], + nla_strscpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME], IB_DEVICE_NAME_MAX); + if (strlen(name) == 0) { + err = -EINVAL; + goto done; + } err = ib_device_rename(device, name); goto done; } @@ -1173,7 +1340,6 @@ static int nldev_res_get_dumpit(struct sk_buff *skb, struct nldev_fill_res_entry { enum rdma_nldev_attr nldev_attr; - enum rdma_nldev_command nldev_cmd; u8 flags; u32 entry; u32 id; @@ -1185,44 +1351,51 @@ enum nldev_res_flags { static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { [RDMA_RESTRACK_QP] = { - .nldev_cmd = RDMA_NLDEV_CMD_RES_QP_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_QP, .entry = RDMA_NLDEV_ATTR_RES_QP_ENTRY, .id = RDMA_NLDEV_ATTR_RES_LQPN, }, [RDMA_RESTRACK_CM_ID] = { - .nldev_cmd = RDMA_NLDEV_CMD_RES_CM_ID_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_CM_ID, .entry = RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY, .id = RDMA_NLDEV_ATTR_RES_CM_IDN, }, [RDMA_RESTRACK_CQ] = { - .nldev_cmd = RDMA_NLDEV_CMD_RES_CQ_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_CQ, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_CQ_ENTRY, .id = RDMA_NLDEV_ATTR_RES_CQN, }, [RDMA_RESTRACK_MR] = { - .nldev_cmd = RDMA_NLDEV_CMD_RES_MR_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_MR, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_MR_ENTRY, .id = RDMA_NLDEV_ATTR_RES_MRN, }, [RDMA_RESTRACK_PD] = { - .nldev_cmd = RDMA_NLDEV_CMD_RES_PD_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_PD, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_PD_ENTRY, .id = RDMA_NLDEV_ATTR_RES_PDN, }, [RDMA_RESTRACK_COUNTER] = { - .nldev_cmd = RDMA_NLDEV_CMD_STAT_GET, .nldev_attr = RDMA_NLDEV_ATTR_STAT_COUNTER, .entry = RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY, .id = RDMA_NLDEV_ATTR_STAT_COUNTER_ID, }, + [RDMA_RESTRACK_CTX] = { + .nldev_attr = RDMA_NLDEV_ATTR_RES_CTX, + .flags = NLDEV_PER_DEV, + .entry = RDMA_NLDEV_ATTR_RES_CTX_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_CTXN, + }, + [RDMA_RESTRACK_SRQ] = { + .nldev_attr = RDMA_NLDEV_ATTR_RES_SRQ, + .flags = NLDEV_PER_DEV, + .entry = RDMA_NLDEV_ATTR_RES_SRQ_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_SRQN, + }, + }; static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -1277,7 +1450,8 @@ static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, - RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, fe->nldev_cmd), + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NL_GET_OP(nlh->nlmsg_type)), 0, 0); if (fill_nldev_handle(msg, device)) { @@ -1288,11 +1462,10 @@ static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, has_cap_net_admin = netlink_capable(skb, CAP_NET_ADMIN); ret = fill_func(msg, has_cap_net_admin, res, port); - - rdma_restrack_put(res); if (ret) goto err_free; + rdma_restrack_put(res); nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); @@ -1356,7 +1529,8 @@ static int res_get_common_dumpit(struct sk_buff *skb, } nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, fe->nldev_cmd), + 
RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NL_GET_OP(cb->nlh->nlmsg_type)), 0, NLM_F_MULTI); if (fill_nldev_handle(skb, device)) { @@ -1438,27 +1612,32 @@ err_index: return ret; } -#define RES_GET_FUNCS(name, type) \ - static int nldev_res_get_##name##_dumpit(struct sk_buff *skb, \ +#define RES_GET_FUNCS(name, type) \ + static int nldev_res_get_##name##_dumpit(struct sk_buff *skb, \ struct netlink_callback *cb) \ - { \ - return res_get_common_dumpit(skb, cb, type, \ - fill_res_##name##_entry); \ - } \ - static int nldev_res_get_##name##_doit(struct sk_buff *skb, \ - struct nlmsghdr *nlh, \ + { \ + return res_get_common_dumpit(skb, cb, type, \ + fill_res_##name##_entry); \ + } \ + static int nldev_res_get_##name##_doit(struct sk_buff *skb, \ + struct nlmsghdr *nlh, \ struct netlink_ext_ack *extack) \ - { \ - return res_get_common_doit(skb, nlh, extack, type, \ - fill_res_##name##_entry); \ + { \ + return res_get_common_doit(skb, nlh, extack, type, \ + fill_res_##name##_entry); \ } RES_GET_FUNCS(qp, RDMA_RESTRACK_QP); +RES_GET_FUNCS(qp_raw, RDMA_RESTRACK_QP); RES_GET_FUNCS(cm_id, RDMA_RESTRACK_CM_ID); RES_GET_FUNCS(cq, RDMA_RESTRACK_CQ); +RES_GET_FUNCS(cq_raw, RDMA_RESTRACK_CQ); RES_GET_FUNCS(pd, RDMA_RESTRACK_PD); RES_GET_FUNCS(mr, RDMA_RESTRACK_MR); +RES_GET_FUNCS(mr_raw, RDMA_RESTRACK_MR); RES_GET_FUNCS(counter, RDMA_RESTRACK_COUNTER); +RES_GET_FUNCS(ctx, RDMA_RESTRACK_CTX); +RES_GET_FUNCS(srq, RDMA_RESTRACK_SRQ); static LIST_HEAD(link_ops); static DECLARE_RWSEM(link_ops_rwsem); @@ -1512,13 +1691,13 @@ static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME]) return -EINVAL; - nla_strlcpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME], + nla_strscpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME], sizeof(ibdev_name)); - if (strchr(ibdev_name, '%')) + if (strchr(ibdev_name, '%') || strlen(ibdev_name) == 0) return -EINVAL; - nla_strlcpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type)); - nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME], + nla_strscpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type)); + nla_strscpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME], sizeof(ndev_name)); ndev = dev_get_by_name(sock_net(skb->sk), ndev_name); @@ -1560,7 +1739,7 @@ static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, if (!device) return -EINVAL; - if (!(device->attrs.device_cap_flags & IB_DEVICE_ALLOW_USER_UNREG)) { + if (!(device->attrs.kernel_cap_flags & IBK_ALLOW_USER_UNREG)) { ib_device_put(device); return -EINVAL; } @@ -1585,7 +1764,7 @@ static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh, if (err || !tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE]) return -EINVAL; - nla_strlcpy(client_name, tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE], + nla_strscpy(client_name, tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE], sizeof(client_name)); if (tb[RDMA_NLDEV_ATTR_DEV_INDEX]) { @@ -1680,6 +1859,19 @@ static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nlmsg_free(msg); return err; } + + /* + * Copy-on-fork is supported. + * See commits: + * 70e806e4e645 ("mm: Do early cow for pinned pages during fork() for ptes") + * 4eae4efa2c29 ("hugetlb: do early cow when page pinned on src mm") + * for more details. Don't backport this without them. + * + * Return value ignored on purpose, assume copy-on-fork is not + * supported in case of failure. 
+ */ + nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_COPY_ON_FORK, 1); + nlmsg_end(msg, nlh); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); } @@ -1705,24 +1897,113 @@ static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, return err; } +static int nldev_stat_set_mode_doit(struct sk_buff *msg, + struct netlink_ext_ack *extack, + struct nlattr *tb[], + struct ib_device *device, u32 port) +{ + u32 mode, mask = 0, qpn, cntn = 0; + int ret; + + /* Currently only counter for QP is supported */ + if (!tb[RDMA_NLDEV_ATTR_STAT_RES] || + nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP) + return -EINVAL; + + mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]); + if (mode == RDMA_COUNTER_MODE_AUTO) { + if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]) + mask = nla_get_u32( + tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]); + return rdma_counter_set_auto_mode(device, port, mask, extack); + } + + if (!tb[RDMA_NLDEV_ATTR_RES_LQPN]) + return -EINVAL; + + qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]); + if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) { + cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]); + ret = rdma_counter_bind_qpn(device, port, qpn, cntn); + if (ret) + return ret; + } else { + ret = rdma_counter_bind_qpn_alloc(device, port, qpn, &cntn); + if (ret) + return ret; + } + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) { + ret = -EMSGSIZE; + goto err_fill; + } + + return 0; + +err_fill: + rdma_counter_unbind_qpn(device, port, qpn, cntn); + return ret; +} + +static int nldev_stat_set_counter_dynamic_doit(struct nlattr *tb[], + struct ib_device *device, + u32 port) +{ + struct rdma_hw_stats *stats; + struct nlattr *entry_attr; + unsigned long *target; + int rem, i, ret = 0; + u32 index; + + stats = ib_get_hw_stats_port(device, port); + if (!stats) + return -EINVAL; + + target = kcalloc(BITS_TO_LONGS(stats->num_counters), + sizeof(*stats->is_disabled), GFP_KERNEL); + if (!target) + return -ENOMEM; + + nla_for_each_nested(entry_attr, tb[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS], + rem) { + index = nla_get_u32(entry_attr); + if ((index >= stats->num_counters) || + !(stats->descs[index].flags & IB_STAT_FLAG_OPTIONAL)) { + ret = -EINVAL; + goto out; + } + + set_bit(index, target); + } + + for (i = 0; i < stats->num_counters; i++) { + if (!(stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL)) + continue; + + ret = rdma_counter_modify(device, port, i, test_bit(i, target)); + if (ret) + goto out; + } + +out: + kfree(target); + return ret; +} + static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { - u32 index, port, mode, mask = 0, qpn, cntn = 0; struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct ib_device *device; struct sk_buff *msg; + u32 index, port; int ret; - ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); - /* Currently only counter for QP is supported */ - if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES] || - !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || - !tb[RDMA_NLDEV_ATTR_PORT_INDEX] || !tb[RDMA_NLDEV_ATTR_STAT_MODE]) - return -EINVAL; - - if (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP) + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, + extack); + if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || + !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); @@ -1733,61 +2014,49 @@ static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr 
*nlh, port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); if (!rdma_is_port_valid(device, port)) { ret = -EINVAL; - goto err; + goto err_put_device; + } + + if (!tb[RDMA_NLDEV_ATTR_STAT_MODE] && + !tb[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS]) { + ret = -EINVAL; + goto err_put_device; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; - goto err; + goto err_put_device; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_SET), 0, 0); + if (fill_nldev_handle(msg, device) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) { + ret = -EMSGSIZE; + goto err_free_msg; + } - mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]); - if (mode == RDMA_COUNTER_MODE_AUTO) { - if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]) - mask = nla_get_u32( - tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]); - - ret = rdma_counter_set_auto_mode(device, port, - mask ? true : false, mask); - if (ret) - goto err_msg; - } else { - if (!tb[RDMA_NLDEV_ATTR_RES_LQPN]) - goto err_msg; - qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]); - if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) { - cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]); - ret = rdma_counter_bind_qpn(device, port, qpn, cntn); - } else { - ret = rdma_counter_bind_qpn_alloc(device, port, - qpn, &cntn); - } + if (tb[RDMA_NLDEV_ATTR_STAT_MODE]) { + ret = nldev_stat_set_mode_doit(msg, extack, tb, device, port); if (ret) - goto err_msg; + goto err_free_msg; + } - if (fill_nldev_handle(msg, device) || - nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) || - nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) || - nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) { - ret = -EMSGSIZE; - goto err_fill; - } + if (tb[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS]) { + ret = nldev_stat_set_counter_dynamic_doit(tb, device, port); + if (ret) + goto err_free_msg; } nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); -err_fill: - rdma_counter_unbind_qpn(device, port, qpn, cntn); -err_msg: +err_free_msg: nlmsg_free(msg); -err: +err_put_device: ib_device_put(device); return ret; } @@ -1879,13 +2148,14 @@ static int stat_get_doit_default_counter(struct sk_buff *skb, if (!device) return -EINVAL; - if (!device->ops.alloc_hw_stats || !device->ops.get_hw_stats) { + if (!device->ops.alloc_hw_port_stats || !device->ops.get_hw_stats) { ret = -EINVAL; goto err; } port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); - if (!rdma_is_port_valid(device, port)) { + stats = ib_get_hw_stats_port(device, port); + if (!stats) { ret = -EINVAL; goto err; } @@ -1907,11 +2177,6 @@ static int stat_get_doit_default_counter(struct sk_buff *skb, goto err_msg; } - stats = device->port_data ? 
device->port_data[port].hw_stats : NULL; - if (stats == NULL) { - ret = -EINVAL; - goto err_msg; - } mutex_lock(&stats->lock); num_cnts = device->ops.get_hw_stats(device, stats, port, 0); @@ -1926,9 +2191,13 @@ static int stat_get_doit_default_counter(struct sk_buff *skb, goto err_stats; } for (i = 0; i < num_cnts; i++) { + if (test_bit(i, stats->is_disabled)) + continue; + v = stats->value[i] + rdma_counter_get_hwstat_value(device, port, i); - if (rdma_nl_stat_hwcounter_entry(msg, stats->names[i], v)) { + if (rdma_nl_stat_hwcounter_entry(msg, + stats->descs[i].name, v)) { ret = -EMSGSIZE; goto err_table; } @@ -2076,6 +2345,99 @@ static int nldev_stat_get_dumpit(struct sk_buff *skb, return ret; } +static int nldev_stat_get_counter_status_doit(struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX], *table, *entry; + struct rdma_hw_stats *stats; + struct ib_device *device; + struct sk_buff *msg; + u32 devid, port; + int ret, i; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || + !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) + return -EINVAL; + + devid = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), devid); + if (!device) + return -EINVAL; + + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err; + } + + stats = ib_get_hw_stats_port(device, port); + if (!stats) { + ret = -EINVAL; + goto err; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + + nlh = nlmsg_put( + msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_GET_STATUS), + 0, 0); + + ret = -EMSGSIZE; + if (fill_nldev_handle(msg, device) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) + goto err_msg; + + table = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS); + if (!table) + goto err_msg; + + mutex_lock(&stats->lock); + for (i = 0; i < stats->num_counters; i++) { + entry = nla_nest_start(msg, + RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY); + if (!entry) + goto err_msg_table; + + if (nla_put_string(msg, + RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME, + stats->descs[i].name) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_INDEX, i)) + goto err_msg_entry; + + if ((stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL) && + (nla_put_u8(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_DYNAMIC, + !test_bit(i, stats->is_disabled)))) + goto err_msg_entry; + + nla_nest_end(msg, entry); + } + mutex_unlock(&stats->lock); + + nla_nest_end(msg, table); + nlmsg_end(msg, nlh); + ib_device_put(device); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); + +err_msg_entry: + nla_nest_cancel(msg, entry); +err_msg_table: + mutex_unlock(&stats->lock); + nla_nest_cancel(msg, table); +err_msg: + nlmsg_free(msg); +err: + ib_device_put(device); + return ret; +} + static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, @@ -2124,6 +2486,14 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .doit = nldev_res_get_pd_doit, .dump = nldev_res_get_pd_dumpit, }, + [RDMA_NLDEV_CMD_RES_CTX_GET] = { + .doit = nldev_res_get_ctx_doit, + .dump = nldev_res_get_ctx_dumpit, + }, + [RDMA_NLDEV_CMD_RES_SRQ_GET] = { + .doit = nldev_res_get_srq_doit, + .dump = nldev_res_get_srq_dumpit, + }, [RDMA_NLDEV_CMD_SYS_GET] = { .doit = 
nldev_sys_get_doit, }, @@ -2142,6 +2512,24 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .doit = nldev_stat_del_doit, .flags = RDMA_NL_ADMIN_PERM, }, + [RDMA_NLDEV_CMD_RES_QP_GET_RAW] = { + .doit = nldev_res_get_qp_raw_doit, + .dump = nldev_res_get_qp_raw_dumpit, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_RES_CQ_GET_RAW] = { + .doit = nldev_res_get_cq_raw_doit, + .dump = nldev_res_get_cq_raw_dumpit, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_RES_MR_GET_RAW] = { + .doit = nldev_res_get_mr_raw_doit, + .dump = nldev_res_get_mr_raw_dumpit, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_STAT_GET_STATUS] = { + .doit = nldev_stat_get_counter_status_doit, + }, }; void __init nldev_init(void) @@ -2149,7 +2537,7 @@ void __init nldev_init(void) rdma_nl_register(RDMA_NL_NLDEV, nldev_cb_table); } -void __exit nldev_exit(void) +void nldev_exit(void) { rdma_nl_unregister(RDMA_NL_NLDEV); } diff --git a/drivers/infiniband/core/opa_smi.h b/drivers/infiniband/core/opa_smi.h index af4879bdf3d6..64e2822af70f 100644 --- a/drivers/infiniband/core/opa_smi.h +++ b/drivers/infiniband/core/opa_smi.h @@ -40,11 +40,11 @@ #include "smi.h" enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch, - int port_num, int phys_port_cnt); + u32 port_num, int phys_port_cnt); int opa_smi_get_fwd_port(struct opa_smp *smp); extern enum smi_forward_action opa_smi_check_forward_dr_smp(struct opa_smp *smp); extern enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp, - bool is_switch, int port_num); + bool is_switch, u32 port_num); /* * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 5128cb16bb48..29b1ab1d5f93 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -68,7 +68,7 @@ static int uverbs_try_lock_object(struct ib_uobject *uobj, * In exclusive access mode, we check that the counter is zero (nobody * claimed this object) and we set it to -1. Releasing a shared access * lock is done simply by decreasing the counter. As for exclusive - * access locks, since only a single one of them is is allowed + * access locks, since only a single one of them is allowed * concurrently, setting the counter to zero is enough for releasing * this lock. */ @@ -112,7 +112,7 @@ static void assert_uverbs_usecnt(struct ib_uobject *uobj, * however the type's allocat_commit function cannot have been called and the * uobject cannot be on the uobjects_lists * - * For RDMA_REMOVE_DESTROY the caller shold be holding a kref (eg via + * For RDMA_REMOVE_DESTROY the caller should be holding a kref (eg via * rdma_lookup_get_uobject) and the object is left in a state where the caller * needs to call rdma_lookup_put_uobject. 
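/*
 * A self-contained model of the usecnt scheme documented above for
 * uverbs_try_lock_object(): readers bump the counter while it is
 * non-negative, a single exclusive user flips 0 to -1, shared release
 * decrements and exclusive release stores 0.  This is a simplified
 * user-space illustration using C11 atomics, not the kernel code.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int usecnt;

static bool try_lock_shared(void)
{
	int cur = atomic_load(&usecnt);

	while (cur >= 0) {
		/* Only succeed if no exclusive holder sneaks in meanwhile. */
		if (atomic_compare_exchange_weak(&usecnt, &cur, cur + 1))
			return true;
	}
	return false;
}

static bool try_lock_exclusive(void)
{
	int expected = 0;

	/* Claim the object only when nobody else holds it at all. */
	return atomic_compare_exchange_strong(&usecnt, &expected, -1);
}

static void unlock_shared(void)
{
	atomic_fetch_sub(&usecnt, 1);
}

static void unlock_exclusive(void)
{
	atomic_store(&usecnt, 0);
}

int main(void)
{
	if (try_lock_shared()) {
		printf("shared held, exclusive attempt returns %d\n",
		       try_lock_exclusive());
		unlock_shared();
	}
	if (try_lock_exclusive()) {
		printf("exclusive acquired\n");
		unlock_exclusive();
	}
	return 0;
}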
* @@ -137,15 +137,9 @@ static int uverbs_destroy_uobject(struct ib_uobject *uobj, } else if (uobj->object) { ret = uobj->uapi_object->type_class->destroy_hw(uobj, reason, attrs); - if (ret) { - if (ib_is_destroy_retryable(ret, reason, uobj)) - return ret; - - /* Nothing to be done, dangle the memory and move on */ - WARN(true, - "ib_uverbs: failed to remove uobject id %d, driver err=%d", - uobj->id, ret); - } + if (ret) + /* Nothing to be done, wait till ucontext will clean it */ + return ret; uobj->object = NULL; } @@ -153,9 +147,9 @@ static int uverbs_destroy_uobject(struct ib_uobject *uobj, uobj->context = NULL; /* - * For DESTROY the usecnt is held write locked, the caller is expected - * to put it unlock and put the object when done with it. Only DESTROY - * can remove the IDR handle. + * For DESTROY the usecnt is not changed, the caller is expected to + * manage it via uobj_put_destroy(). Only DESTROY can remove the IDR + * handle. */ if (reason != RDMA_REMOVE_DESTROY) atomic_set(&uobj->usecnt, 0); @@ -187,7 +181,7 @@ static int uverbs_destroy_uobject(struct ib_uobject *uobj, /* * This calls uverbs_destroy_uobject() using the RDMA_REMOVE_DESTROY * sequence. It should only be used from command callbacks. On success the - * caller must pair this with rdma_lookup_put_uobject(LOOKUP_WRITE). This + * caller must pair this with uobj_put_destroy(). This * version requires the caller to have already obtained an * LOOKUP_DESTROY uobject kref. */ @@ -198,6 +192,13 @@ int uobj_destroy(struct ib_uobject *uobj, struct uverbs_attr_bundle *attrs) down_read(&ufile->hw_destroy_rwsem); + /* + * Once the uobject is destroyed by RDMA_REMOVE_DESTROY then it is left + * write locked as the callers put it back with UVERBS_LOOKUP_DESTROY. + * This is because any other concurrent thread can still see the object + * in the xarray due to RCU. Leaving it locked ensures nothing else will + * touch it. + */ ret = uverbs_try_lock_object(uobj, UVERBS_LOOKUP_WRITE); if (ret) goto out_unlock; @@ -216,7 +217,7 @@ out_unlock: /* * uobj_get_destroy destroys the HW object and returns a handle to the uobj * with a NULL object pointer. The caller must pair this with - * uverbs_put_destroy. + * uobj_put_destroy(). */ struct ib_uobject *__uobj_get_destroy(const struct uverbs_api_object *obj, u32 id, struct uverbs_attr_bundle *attrs) @@ -250,8 +251,7 @@ int __uobj_perform_destroy(const struct uverbs_api_object *obj, u32 id, uobj = __uobj_get_destroy(obj, id, attrs); if (IS_ERR(uobj)) return PTR_ERR(uobj); - - rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_WRITE); + uobj_put_destroy(uobj); return 0; } @@ -360,7 +360,7 @@ lookup_get_fd_uobject(const struct uverbs_api_object *obj, * uverbs_uobject_fd_release(), and the caller is expected to ensure * that release is never done while a call to lookup is possible. 
*/ - if (f->f_op != fd_type->fops) { + if (f->f_op != fd_type->fops || uobject->ufile != ufile) { fput(f); return ERR_PTR(-EBADF); } @@ -453,40 +453,46 @@ static struct ib_uobject * alloc_begin_fd_uobject(const struct uverbs_api_object *obj, struct uverbs_attr_bundle *attrs) { - const struct uverbs_obj_fd_type *fd_type = - container_of(obj->type_attrs, struct uverbs_obj_fd_type, type); + const struct uverbs_obj_fd_type *fd_type; int new_fd; - struct ib_uobject *uobj; + struct ib_uobject *uobj, *ret; struct file *filp; - if (WARN_ON(fd_type->fops->release != &uverbs_uobject_fd_release)) - return ERR_PTR(-EINVAL); - - new_fd = get_unused_fd_flags(O_CLOEXEC); - if (new_fd < 0) - return ERR_PTR(new_fd); - uobj = alloc_uobj(attrs, obj); if (IS_ERR(uobj)) + return uobj; + + fd_type = + container_of(obj->type_attrs, struct uverbs_obj_fd_type, type); + if (WARN_ON(fd_type->fops->release != &uverbs_uobject_fd_release && + fd_type->fops->release != &uverbs_async_event_release)) { + ret = ERR_PTR(-EINVAL); goto err_fd; + } + + new_fd = get_unused_fd_flags(O_CLOEXEC); + if (new_fd < 0) { + ret = ERR_PTR(new_fd); + goto err_fd; + } /* Note that uverbs_uobject_fd_release() is called during abort */ filp = anon_inode_getfile(fd_type->name, fd_type->fops, NULL, fd_type->flags); if (IS_ERR(filp)) { - uobj = ERR_CAST(filp); - goto err_uobj; + ret = ERR_CAST(filp); + goto err_getfile; } uobj->object = filp; uobj->id = new_fd; return uobj; -err_uobj: - uverbs_uobject_put(uobj); -err_fd: +err_getfile: put_unused_fd(new_fd); - return uobj; +err_fd: + uverbs_uobject_put(uobj); + return ret; } struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_api_object *obj, @@ -531,12 +537,7 @@ static int __must_check destroy_hw_idr_uobject(struct ib_uobject *uobj, struct uverbs_obj_idr_type, type); int ret = idr_type->destroy_object(uobj, why, attrs); - /* - * We can only fail gracefully if the user requested to destroy the - * object or when a retry may be called upon an error. - * In the rest of the cases, just remove whatever you can. - */ - if (ib_is_destroy_retryable(ret, why, uobj)) + if (ret) return ret; if (why == RDMA_REMOVE_ABORT) @@ -569,11 +570,8 @@ static int __must_check destroy_hw_fd_uobject(struct ib_uobject *uobj, { const struct uverbs_obj_fd_type *fd_type = container_of( uobj->uapi_object->type_attrs, struct uverbs_obj_fd_type, type); - int ret = fd_type->destroy_object(uobj, why); - - if (ib_is_destroy_retryable(ret, why, uobj)) - return ret; + fd_type->destroy_object(uobj, why); return 0; } @@ -597,6 +595,27 @@ static void alloc_commit_idr_uobject(struct ib_uobject *uobj) WARN_ON(old != NULL); } +static void swap_idr_uobjects(struct ib_uobject *obj_old, + struct ib_uobject *obj_new) +{ + struct ib_uverbs_file *ufile = obj_old->ufile; + void *old; + + /* + * New must be an object that been allocated but not yet committed, this + * moves the pre-committed state to obj_old, new still must be comitted. 
+ */ + old = xa_cmpxchg(&ufile->idr, obj_old->id, obj_old, XA_ZERO_ENTRY, + GFP_KERNEL); + if (WARN_ON(old != obj_old)) + return; + + swap(obj_old->id, obj_new->id); + + old = xa_cmpxchg(&ufile->idr, obj_old->id, NULL, obj_old, GFP_KERNEL); + WARN_ON(old != NULL); +} + static void alloc_commit_fd_uobject(struct ib_uobject *uobj) { int fd = uobj->id; @@ -626,9 +645,6 @@ void rdma_alloc_commit_uobject(struct ib_uobject *uobj, { struct ib_uverbs_file *ufile = attrs->ufile; - /* alloc_commit consumes the uobj kref */ - uobj->uapi_object->type_class->alloc_commit(uobj); - /* kref is held so long as the uobj is on the uobj list. */ uverbs_uobject_get(uobj); spin_lock_irq(&ufile->uobjects_lock); @@ -638,18 +654,65 @@ void rdma_alloc_commit_uobject(struct ib_uobject *uobj, /* matches atomic_set(-1) in alloc_uobj */ atomic_set(&uobj->usecnt, 0); + /* alloc_commit consumes the uobj kref */ + uobj->uapi_object->type_class->alloc_commit(uobj); + /* Matches the down_read in rdma_alloc_begin_uobject */ up_read(&ufile->hw_destroy_rwsem); } /* + * new_uobj will be assigned to the handle currently used by to_uobj, and + * to_uobj will be destroyed. + * + * Upon return the caller must do: + * rdma_alloc_commit_uobject(new_uobj) + * uobj_put_destroy(to_uobj) + * + * to_uobj must have a write get but the put mode switches to destroy once + * this is called. + */ +void rdma_assign_uobject(struct ib_uobject *to_uobj, struct ib_uobject *new_uobj, + struct uverbs_attr_bundle *attrs) +{ + assert_uverbs_usecnt(new_uobj, UVERBS_LOOKUP_WRITE); + + if (WARN_ON(to_uobj->uapi_object != new_uobj->uapi_object || + !to_uobj->uapi_object->type_class->swap_uobjects)) + return; + + to_uobj->uapi_object->type_class->swap_uobjects(to_uobj, new_uobj); + + /* + * If this fails then the uobject is still completely valid (though with + * a new ID) and we leak it until context close. + */ + uverbs_destroy_uobject(to_uobj, RDMA_REMOVE_DESTROY, attrs); +} + +/* * This consumes the kref for uobj. It is up to the caller to unwind the HW * object and anything else connected to uobj before calling this. */ void rdma_alloc_abort_uobject(struct ib_uobject *uobj, - struct uverbs_attr_bundle *attrs) + struct uverbs_attr_bundle *attrs, + bool hw_obj_valid) { struct ib_uverbs_file *ufile = uobj->ufile; + int ret; + + if (hw_obj_valid) { + ret = uobj->uapi_object->type_class->destroy_hw( + uobj, RDMA_REMOVE_ABORT, attrs); + /* + * If the driver couldn't destroy the object then go ahead and + * commit it. Leaking objects that can't be destroyed is only + * done during FD close after the driver has a few more tries to + * destroy it. + */ + if (WARN_ON(ret)) + return rdma_alloc_commit_uobject(uobj, attrs); + } uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT, attrs); @@ -679,7 +742,6 @@ void rdma_lookup_put_uobject(struct ib_uobject *uobj, enum rdma_lookup_mode mode) { assert_uverbs_usecnt(uobj, mode); - uobj->uapi_object->type_class->lookup_put(uobj, mode); /* * In order to unlock an object, either decrease its usecnt for * read access or zero it in case of exclusive access. 
See @@ -696,6 +758,7 @@ void rdma_lookup_put_uobject(struct ib_uobject *uobj, break; } + uobj->uapi_object->type_class->lookup_put(uobj, mode); /* Pairs with the kref obtained by type->lookup_get */ uverbs_uobject_put(uobj); } @@ -734,6 +797,7 @@ const struct uverbs_obj_type_class uverbs_idr_class = { .lookup_put = lookup_put_idr_uobject, .destroy_hw = destroy_hw_idr_uobject, .remove_handle = remove_handle_idr_uobject, + .swap_uobjects = swap_idr_uobjects, }; EXPORT_SYMBOL(uverbs_idr_class); @@ -836,16 +900,23 @@ static int __uverbs_cleanup_ufile(struct ib_uverbs_file *ufile, * racing with a lookup_get. */ WARN_ON(uverbs_try_lock_object(obj, UVERBS_LOOKUP_WRITE)); + if (reason == RDMA_REMOVE_DRIVER_FAILURE) + obj->object = NULL; if (!uverbs_destroy_uobject(obj, reason, &attrs)) ret = 0; else atomic_set(&obj->usecnt, 0); } + + if (reason == RDMA_REMOVE_DRIVER_FAILURE) { + WARN_ON(!list_empty(&ufile->uobjects)); + return 0; + } return ret; } /* - * Destroy the uncontext and every uobject associated with it. + * Destroy the ucontext and every uobject associated with it. * * This is internally locked and can be called in parallel from multiple * contexts. @@ -862,21 +933,12 @@ void uverbs_destroy_ufile_hw(struct ib_uverbs_file *ufile, if (!ufile->ucontext) goto done; - ufile->ucontext->closing = true; - ufile->ucontext->cleanup_retryable = true; - while (!list_empty(&ufile->uobjects)) - if (__uverbs_cleanup_ufile(ufile, reason)) { - /* - * No entry was cleaned-up successfully during this - * iteration - */ - break; - } - - ufile->ucontext->cleanup_retryable = false; - if (!list_empty(&ufile->uobjects)) - __uverbs_cleanup_ufile(ufile, reason); + while (!list_empty(&ufile->uobjects) && + !__uverbs_cleanup_ufile(ufile, reason)) { + } + if (WARN_ON(!list_empty(&ufile->uobjects))) + __uverbs_cleanup_ufile(ufile, RDMA_REMOVE_DRIVER_FAILURE); ufile_destroy_ucontext(ufile, reason); done: @@ -921,8 +983,8 @@ uverbs_get_uobject_from_file(u16 object_id, enum uverbs_obj_access access, } void uverbs_finalize_object(struct ib_uobject *uobj, - enum uverbs_obj_access access, bool commit, - struct uverbs_attr_bundle *attrs) + enum uverbs_obj_access access, bool hw_obj_valid, + bool commit, struct uverbs_attr_bundle *attrs) { /* * refcounts should be handled at the object level and not at the @@ -945,7 +1007,7 @@ void uverbs_finalize_object(struct ib_uobject *uobj, if (commit) rdma_alloc_commit_uobject(uobj, attrs); else - rdma_alloc_abort_uobject(uobj, attrs); + rdma_alloc_abort_uobject(uobj, attrs, hw_obj_valid); break; default: WARN_ON(true); diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index 33978e0f1262..33706dad6c0f 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -64,8 +64,8 @@ uverbs_get_uobject_from_file(u16 object_id, enum uverbs_obj_access access, s64 id, struct uverbs_attr_bundle *attrs); void uverbs_finalize_object(struct ib_uobject *uobj, - enum uverbs_obj_access access, bool commit, - struct uverbs_attr_bundle *attrs); + enum uverbs_obj_access access, bool hw_obj_valid, + bool commit, struct uverbs_attr_bundle *attrs); int uverbs_output_written(const struct uverbs_attr_bundle *bundle, size_t idx); @@ -159,6 +159,9 @@ extern const struct uapi_definition uverbs_def_obj_dm[]; extern const struct uapi_definition uverbs_def_obj_flow_action[]; extern const struct uapi_definition uverbs_def_obj_intf[]; extern const struct uapi_definition uverbs_def_obj_mr[]; +extern const struct uapi_definition uverbs_def_obj_qp[]; 
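/*
 * A small user-space model of the cleanup strategy in the
 * uverbs_destroy_ufile_hw() hunk above: sweep the object list repeatedly
 * as long as at least one object could be destroyed, then fall back to a
 * forced pass that drops whatever is left (the analogue of
 * RDMA_REMOVE_DRIVER_FAILURE).  struct obj, obj_destroy() and
 * cleanup_pass() are invented stand-ins for the uobject machinery.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	struct obj *next;
	int fails_left;			/* destroy fails this many more times */
};

static bool obj_destroy(struct obj *o, bool force)
{
	if (o->fails_left > 0 && !force) {
		o->fails_left--;
		return false;		/* "driver" refused, keep the object */
	}
	free(o);
	return true;
}

/* One pass over the list; 0 if at least one object went away, -1 otherwise. */
static int cleanup_pass(struct obj **head, bool force)
{
	struct obj **pp = head;
	int progress = -1;

	while (*pp) {
		struct obj *o = *pp;
		struct obj *next = o->next;

		if (obj_destroy(o, force)) {
			*pp = next;
			progress = 0;
		} else {
			pp = &o->next;
		}
	}
	return progress;
}

int main(void)
{
	struct obj *head = NULL;

	for (int i = 0; i < 3; i++) {
		struct obj *o = calloc(1, sizeof(*o));

		if (!o)
			return 1;
		o->fails_left = i;	/* some objects resist early passes */
		o->next = head;
		head = o;
	}

	/* Sweep while anything is left and the last pass made progress. */
	while (head && !cleanup_pass(&head, false))
		;
	/* Forced pass for whatever still refuses to die. */
	if (head)
		cleanup_pass(&head, true);

	printf("all objects reclaimed: %s\n", head ? "no" : "yes");
	return 0;
}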
+extern const struct uapi_definition uverbs_def_obj_srq[]; +extern const struct uapi_definition uverbs_def_obj_wq[]; extern const struct uapi_definition uverbs_def_write_intf[]; static inline const struct uverbs_api_write_method * diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 62fbb0ae9cb4..1f935d9f6178 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -47,6 +47,7 @@ static const char *type2str(enum rdma_restrack_type type) [RDMA_RESTRACK_MR] = "MR", [RDMA_RESTRACK_CTX] = "CTX", [RDMA_RESTRACK_COUNTER] = "COUNTER", + [RDMA_RESTRACK_SRQ] = "SRQ", }; return names[type]; @@ -123,32 +124,6 @@ int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type) } EXPORT_SYMBOL(rdma_restrack_count); -static void set_kern_name(struct rdma_restrack_entry *res) -{ - struct ib_pd *pd; - - switch (res->type) { - case RDMA_RESTRACK_QP: - pd = container_of(res, struct ib_qp, res)->pd; - if (!pd) { - WARN_ONCE(true, "XRC QPs are not supported\n"); - /* Survive, despite the programmer's error */ - res->kern_name = " "; - } - break; - case RDMA_RESTRACK_MR: - pd = container_of(res, struct ib_mr, res)->pd; - break; - default: - /* Other types set kern_name directly */ - pd = NULL; - break; - } - - if (pd) - res->kern_name = pd->res.kern_name; -} - static struct ib_device *res_to_dev(struct rdma_restrack_entry *res) { switch (res->type) { @@ -167,60 +142,111 @@ static struct ib_device *res_to_dev(struct rdma_restrack_entry *res) return container_of(res, struct ib_ucontext, res)->device; case RDMA_RESTRACK_COUNTER: return container_of(res, struct rdma_counter, res)->device; + case RDMA_RESTRACK_SRQ: + return container_of(res, struct ib_srq, res)->device; default: WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type); return NULL; } } -void rdma_restrack_set_task(struct rdma_restrack_entry *res, - const char *caller) +/** + * rdma_restrack_attach_task() - attach the task onto this resource, + * valid for user space restrack entries. + * @res: resource entry + * @task: the task to attach + */ +static void rdma_restrack_attach_task(struct rdma_restrack_entry *res, + struct task_struct *task) { - if (caller) { - res->kern_name = caller; + if (WARN_ON_ONCE(!task)) return; - } if (res->task) put_task_struct(res->task); - get_task_struct(current); - res->task = current; + get_task_struct(task); + res->task = task; + res->user = true; } -EXPORT_SYMBOL(rdma_restrack_set_task); /** - * rdma_restrack_attach_task() - attach the task onto this resource + * rdma_restrack_set_name() - set the task for this resource * @res: resource entry - * @task: the task to attach, the current task will be used if it is NULL. + * @caller: kernel name, the current task will be used if the caller is NULL. 
*/ -void rdma_restrack_attach_task(struct rdma_restrack_entry *res, - struct task_struct *task) +void rdma_restrack_set_name(struct rdma_restrack_entry *res, const char *caller) { - if (res->task) - put_task_struct(res->task); - get_task_struct(task); - res->task = task; + if (caller) { + res->kern_name = caller; + return; + } + + rdma_restrack_attach_task(res, current); } +EXPORT_SYMBOL(rdma_restrack_set_name); -static void rdma_restrack_add(struct rdma_restrack_entry *res) +/** + * rdma_restrack_parent_name() - set the restrack name properties based + * on parent restrack + * @dst: destination resource entry + * @parent: parent resource entry + */ +void rdma_restrack_parent_name(struct rdma_restrack_entry *dst, + const struct rdma_restrack_entry *parent) +{ + if (rdma_is_kernel_res(parent)) + dst->kern_name = parent->kern_name; + else + rdma_restrack_attach_task(dst, parent->task); +} +EXPORT_SYMBOL(rdma_restrack_parent_name); + +/** + * rdma_restrack_new() - Initializes new restrack entry to allow _put() interface + * to release memory in fully automatic way. + * @res: Entry to initialize + * @type: REstrack type + */ +void rdma_restrack_new(struct rdma_restrack_entry *res, + enum rdma_restrack_type type) +{ + kref_init(&res->kref); + init_completion(&res->comp); + res->type = type; +} +EXPORT_SYMBOL(rdma_restrack_new); + +/** + * rdma_restrack_add() - add object to the reource tracking database + * @res: resource entry + */ +void rdma_restrack_add(struct rdma_restrack_entry *res) { struct ib_device *dev = res_to_dev(res); struct rdma_restrack_root *rt; - int ret; + int ret = 0; if (!dev) return; + if (res->no_track) + goto out; + rt = &dev->res[res->type]; - kref_init(&res->kref); - init_completion(&res->comp); if (res->type == RDMA_RESTRACK_QP) { /* Special case to ensure that LQPN points to right QP */ struct ib_qp *qp = container_of(res, struct ib_qp, res); - ret = xa_insert(&rt->xa, qp->qp_num, res, GFP_KERNEL); - res->id = ret ? 0 : qp->qp_num; + WARN_ONCE(qp->qp_num >> 24 || qp->port >> 8, + "QP number 0x%0X and port 0x%0X", qp->qp_num, + qp->port); + res->id = qp->qp_num; + if (qp->qp_type == IB_QPT_SMI || qp->qp_type == IB_QPT_GSI) + res->id |= qp->port << 24; + ret = xa_insert(&rt->xa, res->id, res, GFP_KERNEL); + if (ret) + res->id = 0; } else if (res->type == RDMA_RESTRACK_COUNTER) { /* Special case to ensure that cntn points to right counter */ struct rdma_counter *counter; @@ -231,43 +257,14 @@ static void rdma_restrack_add(struct rdma_restrack_entry *res) } else { ret = xa_alloc_cyclic(&rt->xa, &res->id, res, xa_limit_32b, &rt->next_id, GFP_KERNEL); + ret = (ret < 0) ? 
ret : 0; } +out: if (!ret) res->valid = true; } - -/** - * rdma_restrack_kadd() - add kernel object to the reource tracking database - * @res: resource entry - */ -void rdma_restrack_kadd(struct rdma_restrack_entry *res) -{ - res->task = NULL; - set_kern_name(res); - res->user = false; - rdma_restrack_add(res); -} -EXPORT_SYMBOL(rdma_restrack_kadd); - -/** - * rdma_restrack_uadd() - add user object to the reource tracking database - * @res: resource entry - */ -void rdma_restrack_uadd(struct rdma_restrack_entry *res) -{ - if ((res->type != RDMA_RESTRACK_CM_ID) && - (res->type != RDMA_RESTRACK_COUNTER)) - res->task = NULL; - - if (!res->task) - rdma_restrack_set_task(res, NULL); - res->kern_name = NULL; - - res->user = true; - rdma_restrack_add(res); -} -EXPORT_SYMBOL(rdma_restrack_uadd); +EXPORT_SYMBOL(rdma_restrack_add); int __must_check rdma_restrack_get(struct rdma_restrack_entry *res) { @@ -305,6 +302,10 @@ static void restrack_release(struct kref *kref) struct rdma_restrack_entry *res; res = container_of(kref, struct rdma_restrack_entry, kref); + if (res->task) { + put_task_struct(res->task); + res->task = NULL; + } complete(&res->comp); } @@ -314,13 +315,25 @@ int rdma_restrack_put(struct rdma_restrack_entry *res) } EXPORT_SYMBOL(rdma_restrack_put); +/** + * rdma_restrack_del() - delete object from the reource tracking database + * @res: resource entry + */ void rdma_restrack_del(struct rdma_restrack_entry *res) { struct rdma_restrack_entry *old; struct rdma_restrack_root *rt; struct ib_device *dev; - if (!res->valid) + if (!res->valid) { + if (res->task) { + put_task_struct(res->task); + res->task = NULL; + } + return; + } + + if (res->no_track) goto out; dev = res_to_dev(res); @@ -330,16 +343,13 @@ void rdma_restrack_del(struct rdma_restrack_entry *res) rt = &dev->res[res->type]; old = xa_erase(&rt->xa, res->id); + if (res->type == RDMA_RESTRACK_MR) + return; WARN_ON(old != res); - res->valid = false; +out: + res->valid = false; rdma_restrack_put(res); wait_for_completion(&res->comp); - -out: - if (res->task) { - put_task_struct(res->task); - res->task = NULL; - } } EXPORT_SYMBOL(rdma_restrack_del); diff --git a/drivers/infiniband/core/restrack.h b/drivers/infiniband/core/restrack.h index d084e5f89849..6a04fc41f738 100644 --- a/drivers/infiniband/core/restrack.h +++ b/drivers/infiniband/core/restrack.h @@ -25,6 +25,12 @@ struct rdma_restrack_root { int rdma_restrack_init(struct ib_device *dev); void rdma_restrack_clean(struct ib_device *dev); -void rdma_restrack_attach_task(struct rdma_restrack_entry *res, - struct task_struct *task); +void rdma_restrack_add(struct rdma_restrack_entry *res); +void rdma_restrack_del(struct rdma_restrack_entry *res); +void rdma_restrack_new(struct rdma_restrack_entry *res, + enum rdma_restrack_type type); +void rdma_restrack_set_name(struct rdma_restrack_entry *res, + const char *caller); +void rdma_restrack_parent_name(struct rdma_restrack_entry *dst, + const struct rdma_restrack_entry *parent); #endif /* _RDMA_CORE_RESTRACK_H_ */ diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index 2860def84f4d..e958c43dd28f 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -70,7 +70,7 @@ struct netdev_event_work { }; static const struct { - bool (*is_supported)(const struct ib_device *device, u8 port_num); + bool (*is_supported)(const struct ib_device *device, u32 port_num); enum ib_gid_type gid_type; } PORT_CAP_TO_GID_TYPE[] = { {rdma_protocol_roce_eth_encap, 
IB_GID_TYPE_ROCE}, @@ -79,7 +79,7 @@ static const struct { #define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE) -unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port) +unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u32 port) { int i; unsigned int ret_flags = 0; @@ -96,7 +96,7 @@ unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port) EXPORT_SYMBOL(roce_gid_type_mask_support); static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev, - u8 port, union ib_gid *gid, + u32 port, union ib_gid *gid, struct ib_gid_attr *gid_attr) { int i; @@ -144,7 +144,7 @@ static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_de #define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \ BONDING_SLAVE_STATE_NA) static bool -is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u8 port, +is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *real_dev; @@ -168,7 +168,7 @@ is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u8 port, } static bool -is_eth_port_inactive_slave_filter(struct ib_device *ib_dev, u8 port, +is_eth_port_inactive_slave_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_dev; @@ -186,18 +186,19 @@ is_eth_port_inactive_slave_filter(struct ib_device *ib_dev, u8 port, return res; } -/** is_ndev_for_default_gid_filter - Check if a given netdevice +/** + * is_ndev_for_default_gid_filter - Check if a given netdevice * can be considered for default GIDs or not. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: rdma netdevice pointer - * @cookie_ndev: Netdevice to consider to form a default GID + * @cookie: Netdevice to consider to form a default GID * * is_ndev_for_default_gid_filter() returns true if a given netdevice can be * considered for deriving default RoCE GID, returns false otherwise. */ static bool -is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u8 port, +is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; @@ -223,13 +224,13 @@ is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u8 port, return res; } -static bool pass_all_filter(struct ib_device *ib_dev, u8 port, +static bool pass_all_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { return true; } -static bool upper_device_filter(struct ib_device *ib_dev, u8 port, +static bool upper_device_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { bool res; @@ -249,7 +250,7 @@ static bool upper_device_filter(struct ib_device *ib_dev, u8 port, /** * is_upper_ndev_bond_master_filter - Check if a given netdevice - * is bond master device of netdevice of the the RDMA device of port. + * is bond master device of netdevice of the RDMA device of port. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: Pointer to rdma netdevice @@ -260,7 +261,7 @@ static bool upper_device_filter(struct ib_device *ib_dev, u8 port, * not have been established as slave device yet. 
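/*
 * A standalone sketch of the predicate table used by
 * roce_gid_type_mask_support() above: each entry pairs an is_supported()
 * callback with a GID type, and the returned mask is built by walking the
 * table.  The port_caps bitfield, CAP_* flags and demo predicates are
 * invented for illustration; the kernel checks the real port protocol.
 */
#include <stdbool.h>
#include <stdio.h>

enum gid_type { GID_ROCE, GID_ROCE_UDP_ENCAP };

struct port { unsigned int caps; };
#define CAP_ETH_ENCAP	0x1
#define CAP_UDP_ENCAP	0x2

static bool supports_eth_encap(const struct port *p)
{
	return p->caps & CAP_ETH_ENCAP;
}

static bool supports_udp_encap(const struct port *p)
{
	return p->caps & CAP_UDP_ENCAP;
}

static const struct {
	bool (*is_supported)(const struct port *port);
	enum gid_type gid_type;
} cap_to_gid_type[] = {
	{ supports_eth_encap, GID_ROCE },
	{ supports_udp_encap, GID_ROCE_UDP_ENCAP },
};

static unsigned long gid_type_mask(const struct port *port)
{
	unsigned long mask = 0;

	for (unsigned int i = 0;
	     i < sizeof(cap_to_gid_type) / sizeof(cap_to_gid_type[0]); i++)
		if (cap_to_gid_type[i].is_supported(port))
			mask |= 1UL << cap_to_gid_type[i].gid_type;
	return mask;
}

int main(void)
{
	struct port p = { .caps = CAP_ETH_ENCAP | CAP_UDP_ENCAP };

	printf("gid type mask: 0x%lx\n", gid_type_mask(&p));
	return 0;
}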
*/ static bool -is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u8 port, +is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { @@ -280,7 +281,7 @@ is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u8 port, static void update_gid_ip(enum gid_op_type gid_op, struct ib_device *ib_dev, - u8 port, struct net_device *ndev, + u32 port, struct net_device *ndev, struct sockaddr *addr) { union ib_gid gid; @@ -294,7 +295,7 @@ static void update_gid_ip(enum gid_op_type gid_op, } static void bond_delete_netdev_default_gids(struct ib_device *ib_dev, - u8 port, + u32 port, struct net_device *rdma_ndev, struct net_device *event_ndev) { @@ -328,7 +329,7 @@ static void bond_delete_netdev_default_gids(struct ib_device *ib_dev, } static void enum_netdev_ipv4_ips(struct ib_device *ib_dev, - u8 port, struct net_device *ndev) + u32 port, struct net_device *ndev) { const struct in_ifaddr *ifa; struct in_device *in_dev; @@ -372,7 +373,7 @@ static void enum_netdev_ipv4_ips(struct ib_device *ib_dev, } static void enum_netdev_ipv6_ips(struct ib_device *ib_dev, - u8 port, struct net_device *ndev) + u32 port, struct net_device *ndev) { struct inet6_ifaddr *ifp; struct inet6_dev *in6_dev; @@ -417,7 +418,7 @@ static void enum_netdev_ipv6_ips(struct ib_device *ib_dev, } } -static void _add_netdev_ips(struct ib_device *ib_dev, u8 port, +static void _add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { enum_netdev_ipv4_ips(ib_dev, port, ndev); @@ -425,13 +426,13 @@ static void _add_netdev_ips(struct ib_device *ib_dev, u8 port, enum_netdev_ipv6_ips(ib_dev, port, ndev); } -static void add_netdev_ips(struct ib_device *ib_dev, u8 port, +static void add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { _add_netdev_ips(ib_dev, port, cookie); } -static void del_netdev_ips(struct ib_device *ib_dev, u8 port, +static void del_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, cookie); @@ -446,7 +447,7 @@ static void del_netdev_ips(struct ib_device *ib_dev, u8 port, * * del_default_gids() deletes the default GIDs of the event/cookie netdevice. */ -static void del_default_gids(struct ib_device *ib_dev, u8 port, +static void del_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; @@ -458,7 +459,7 @@ static void del_default_gids(struct ib_device *ib_dev, u8 port, IB_CACHE_GID_DEFAULT_MODE_DELETE); } -static void add_default_gids(struct ib_device *ib_dev, u8 port, +static void add_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *event_ndev = cookie; @@ -470,7 +471,7 @@ static void add_default_gids(struct ib_device *ib_dev, u8 port, } static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, - u8 port, + u32 port, struct net_device *rdma_ndev, void *cookie) { @@ -505,7 +506,7 @@ static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. 
* - * @device: the rdma device + * @ib_dev: the rdma device */ void rdma_roce_rescan_device(struct ib_device *ib_dev) { @@ -515,7 +516,7 @@ void rdma_roce_rescan_device(struct ib_device *ib_dev) EXPORT_SYMBOL(rdma_roce_rescan_device); static void callback_for_addr_gid_device_scan(struct ib_device *device, - u8 port, + u32 port, struct net_device *rdma_ndev, void *cookie) { @@ -531,10 +532,11 @@ struct upper_list { struct net_device *upper; }; -static int netdev_upper_walk(struct net_device *upper, void *data) +static int netdev_upper_walk(struct net_device *upper, + struct netdev_nested_priv *priv) { struct upper_list *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); - struct list_head *upper_list = data; + struct list_head *upper_list = (struct list_head *)priv->data; if (!entry) return 0; @@ -546,19 +548,21 @@ static int netdev_upper_walk(struct net_device *upper, void *data) return 0; } -static void handle_netdev_upper(struct ib_device *ib_dev, u8 port, +static void handle_netdev_upper(struct ib_device *ib_dev, u32 port, void *cookie, void (*handle_netdev)(struct ib_device *ib_dev, - u8 port, + u32 port, struct net_device *ndev)) { struct net_device *ndev = cookie; + struct netdev_nested_priv priv; struct upper_list *upper_iter; struct upper_list *upper_temp; LIST_HEAD(upper_list); + priv.data = &upper_list; rcu_read_lock(); - netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &upper_list); + netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &priv); rcu_read_unlock(); handle_netdev(ib_dev, port, ndev); @@ -571,25 +575,25 @@ static void handle_netdev_upper(struct ib_device *ib_dev, u8 port, } } -static void _roce_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, +static void _roce_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, struct net_device *event_ndev) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev); } -static void del_netdev_upper_ips(struct ib_device *ib_dev, u8 port, +static void del_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, _roce_del_all_netdev_gids); } -static void add_netdev_upper_ips(struct ib_device *ib_dev, u8 port, +static void add_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips); } -static void del_netdev_default_ips_join(struct ib_device *ib_dev, u8 port, +static void del_netdev_default_ips_join(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index 06e5b6787443..8367974b7998 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -2,6 +2,7 @@ /* * Copyright (c) 2016 HGST, a Western Digital Company. */ +#include <linux/memremap.h> #include <linux/moduleparam.h> #include <linux/slab.h> #include <linux/pci-p2pdma.h> @@ -25,7 +26,7 @@ MODULE_PARM_DESC(force_mr, "Force usage of MRs for RDMA READ/WRITE operations"); * registration is also enabled if registering memory might yield better * performance than using multiple SGE entries, see rdma_rw_io_needs_mr() */ -static inline bool rdma_rw_can_use_mr(struct ib_device *dev, u8 port_num) +static inline bool rdma_rw_can_use_mr(struct ib_device *dev, u32 port_num) { if (rdma_protocol_iwarp(dev, port_num)) return true; @@ -42,7 +43,7 @@ static inline bool rdma_rw_can_use_mr(struct ib_device *dev, u8 port_num) * optimization otherwise. 
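/*
 * A minimal model of the callback signature change in netdev_upper_walk()
 * above: instead of a bare void *data, the walker hands the callback a
 * small priv wrapper whose .data field carries the caller's context (here,
 * an accumulator instead of the upper_list).  nested_priv, walk_items and
 * sum_ctx are illustrative stand-ins for netdev_nested_priv and
 * netdev_walk_all_upper_dev_rcu(), not the real networking API.
 */
#include <stdio.h>

struct nested_priv {
	void *data;
};

static int walk_items(const int *items, int n,
		      int (*fn)(int item, struct nested_priv *priv),
		      struct nested_priv *priv)
{
	for (int i = 0; i < n; i++) {
		int ret = fn(items[i], priv);

		if (ret)
			return ret;	/* non-zero stops the walk */
	}
	return 0;
}

struct sum_ctx { long total; };

static int add_item(int item, struct nested_priv *priv)
{
	struct sum_ctx *ctx = priv->data;

	ctx->total += item;
	return 0;			/* 0 == keep walking */
}

int main(void)
{
	const int upper_devs[] = { 3, 5, 8 };
	struct sum_ctx ctx = { 0 };
	struct nested_priv priv = { .data = &ctx };

	walk_items(upper_devs, 3, add_item, &priv);
	printf("total: %ld\n", ctx.total);
	return 0;
}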
Additionally we have a debug option to force usage * of MRs to help testing this code path. */ -static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u8 port_num, +static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u32 port_num, enum dma_data_direction dir, int dma_nents) { if (dir == DMA_FROM_DEVICE) { @@ -87,7 +88,7 @@ static inline int rdma_rw_inv_key(struct rdma_rw_reg_ctx *reg) } /* Caller must have zero-initialized *reg. */ -static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num, +static int rdma_rw_init_one_mr(struct ib_qp *qp, u32 port_num, struct rdma_rw_reg_ctx *reg, struct scatterlist *sg, u32 sg_cnt, u32 offset) { @@ -121,7 +122,7 @@ static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num, } static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, - u8 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset, + u32 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { struct rdma_rw_reg_ctx *prev = NULL; @@ -129,7 +130,7 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, qp->integrity_en); int i, j, ret = 0, count = 0; - ctx->nr_ops = (sg_cnt + pages_per_mr - 1) / pages_per_mr; + ctx->nr_ops = DIV_ROUND_UP(sg_cnt, pages_per_mr); ctx->reg = kcalloc(ctx->nr_ops, sizeof(*ctx->reg), GFP_KERNEL); if (!ctx->reg) { ret = -ENOMEM; @@ -273,23 +274,6 @@ static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp, return 1; } -static void rdma_rw_unmap_sg(struct ib_device *dev, struct scatterlist *sg, - u32 sg_cnt, enum dma_data_direction dir) -{ - if (is_pci_p2pdma_page(sg_page(sg))) - pci_p2pdma_unmap_sg(dev->dma_device, sg, sg_cnt, dir); - else - ib_dma_unmap_sg(dev, sg, sg_cnt, dir); -} - -static int rdma_rw_map_sg(struct ib_device *dev, struct scatterlist *sg, - u32 sg_cnt, enum dma_data_direction dir) -{ - if (is_pci_p2pdma_page(sg_page(sg))) - return pci_p2pdma_map_sg(dev->dma_device, sg, sg_cnt, dir); - return ib_dma_map_sg(dev, sg, sg_cnt, dir); -} - /** * rdma_rw_ctx_init - initialize a RDMA READ/WRITE context * @ctx: context to initialize @@ -305,17 +289,21 @@ static int rdma_rw_map_sg(struct ib_device *dev, struct scatterlist *sg, * Returns the number of WQEs that will be needed on the workqueue if * successful, or a negative error code. */ -int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, +int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num, struct scatterlist *sg, u32 sg_cnt, u32 sg_offset, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { struct ib_device *dev = qp->pd->device; + struct sg_table sgt = { + .sgl = sg, + .orig_nents = sg_cnt, + }; int ret; - ret = rdma_rw_map_sg(dev, sg, sg_cnt, dir); - if (!ret) - return -ENOMEM; - sg_cnt = ret; + ret = ib_dma_map_sgtable_attrs(dev, &sgt, dir, 0); + if (ret) + return ret; + sg_cnt = sgt.nents; /* * Skip to the S/G entry that sg_offset falls into: @@ -351,7 +339,7 @@ int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, return ret; out_unmap_sg: - rdma_rw_unmap_sg(dev, sg, sg_cnt, dir); + ib_dma_unmap_sgtable_attrs(dev, &sgt, dir, 0); return ret; } EXPORT_SYMBOL(rdma_rw_ctx_init); @@ -374,7 +362,7 @@ EXPORT_SYMBOL(rdma_rw_ctx_init); * successful, or a negative error code. 
*/ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, - u8 port_num, struct scatterlist *sg, u32 sg_cnt, + u32 port_num, struct scatterlist *sg, u32 sg_cnt, struct scatterlist *prot_sg, u32 prot_sg_cnt, struct ib_sig_attrs *sig_attrs, u64 remote_addr, u32 rkey, enum dma_data_direction dir) @@ -382,32 +370,36 @@ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, struct ib_device *dev = qp->pd->device; u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device, qp->integrity_en); + struct sg_table sgt = { + .sgl = sg, + .orig_nents = sg_cnt, + }; + struct sg_table prot_sgt = { + .sgl = prot_sg, + .orig_nents = prot_sg_cnt, + }; struct ib_rdma_wr *rdma_wr; int count = 0, ret; if (sg_cnt > pages_per_mr || prot_sg_cnt > pages_per_mr) { - pr_err("SG count too large: sg_cnt=%d, prot_sg_cnt=%d, pages_per_mr=%d\n", + pr_err("SG count too large: sg_cnt=%u, prot_sg_cnt=%u, pages_per_mr=%u\n", sg_cnt, prot_sg_cnt, pages_per_mr); return -EINVAL; } - ret = ib_dma_map_sg(dev, sg, sg_cnt, dir); - if (!ret) - return -ENOMEM; - sg_cnt = ret; + ret = ib_dma_map_sgtable_attrs(dev, &sgt, dir, 0); + if (ret) + return ret; if (prot_sg_cnt) { - ret = ib_dma_map_sg(dev, prot_sg, prot_sg_cnt, dir); - if (!ret) { - ret = -ENOMEM; + ret = ib_dma_map_sgtable_attrs(dev, &prot_sgt, dir, 0); + if (ret) goto out_unmap_sg; - } - prot_sg_cnt = ret; } ctx->type = RDMA_RW_SIG_MR; ctx->nr_ops = 1; - ctx->reg = kcalloc(1, sizeof(*ctx->reg), GFP_KERNEL); + ctx->reg = kzalloc(sizeof(*ctx->reg), GFP_KERNEL); if (!ctx->reg) { ret = -ENOMEM; goto out_unmap_prot_sg; @@ -423,10 +415,11 @@ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, memcpy(ctx->reg->mr->sig_attrs, sig_attrs, sizeof(struct ib_sig_attrs)); - ret = ib_map_mr_sg_pi(ctx->reg->mr, sg, sg_cnt, NULL, prot_sg, - prot_sg_cnt, NULL, SZ_4K); + ret = ib_map_mr_sg_pi(ctx->reg->mr, sg, sgt.nents, NULL, prot_sg, + prot_sgt.nents, NULL, SZ_4K); if (unlikely(ret)) { - pr_err("failed to map PI sg (%d)\n", sg_cnt + prot_sg_cnt); + pr_err("failed to map PI sg (%u)\n", + sgt.nents + prot_sgt.nents); goto out_destroy_sig_mr; } @@ -465,10 +458,10 @@ out_destroy_sig_mr: out_free_ctx: kfree(ctx->reg); out_unmap_prot_sg: - if (prot_sg_cnt) - ib_dma_unmap_sg(dev, prot_sg, prot_sg_cnt, dir); + if (prot_sgt.nents) + ib_dma_unmap_sgtable_attrs(dev, &prot_sgt, dir, 0); out_unmap_sg: - ib_dma_unmap_sg(dev, sg, sg_cnt, dir); + ib_dma_unmap_sgtable_attrs(dev, &sgt, dir, 0); return ret; } EXPORT_SYMBOL(rdma_rw_ctx_signature_init); @@ -502,7 +495,7 @@ static void rdma_rw_update_lkey(struct rdma_rw_reg_ctx *reg, bool need_inval) * completion notification. */ struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, - u8 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr) + u32 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr) { struct ib_send_wr *first_wr, *last_wr; int i; @@ -510,7 +503,6 @@ struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, switch (ctx->type) { case RDMA_RW_SIG_MR: case RDMA_RW_MR: - /* fallthrough */ for (i = 0; i < ctx->nr_ops; i++) { rdma_rw_update_lkey(&ctx->reg[i], ctx->reg[i].wr.wr.opcode != @@ -560,7 +552,7 @@ EXPORT_SYMBOL(rdma_rw_ctx_wrs); * is not set @cqe must be set so that the caller gets a completion * notification. 
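/*
 * A sketch of the error-convention shift visible in rdma_rw_ctx_init()
 * above: the old map_sg style returns a mapped-entry count (0 meaning
 * failure), while the sg_table style returns 0 or a negative errno and
 * reports the mapped count through the table.  demo_map_sg,
 * demo_map_sgtable and demo_sg_table are invented stand-ins, not the
 * ib_dma_* API.
 */
#include <errno.h>
#include <stdio.h>

struct demo_sg_table {
	unsigned int orig_nents;	/* entries handed in */
	unsigned int nents;		/* entries actually mapped */
};

/* Old style: count on success, 0 on failure. */
static int demo_map_sg(unsigned int nents)
{
	return nents ? (int)nents : 0;
}

/* New style: 0 on success (nents filled in), -errno on failure. */
static int demo_map_sgtable(struct demo_sg_table *sgt)
{
	if (!sgt->orig_nents)
		return -EINVAL;
	sgt->nents = sgt->orig_nents;	/* pretend everything mapped */
	return 0;
}

int main(void)
{
	struct demo_sg_table sgt = { .orig_nents = 4 };
	int ret;

	ret = demo_map_sg(4);
	if (!ret)
		return 1;		/* old-style failure check */

	ret = demo_map_sgtable(&sgt);
	if (ret)
		return 1;		/* new-style failure check */

	printf("mapped %u of %u entries\n", sgt.nents, sgt.orig_nents);
	return 0;
}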
*/ -int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, +int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr) { struct ib_send_wr *first_wr; @@ -579,8 +571,9 @@ EXPORT_SYMBOL(rdma_rw_ctx_post); * @sg_cnt: number of entries in @sg * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ */ -void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, - struct scatterlist *sg, u32 sg_cnt, enum dma_data_direction dir) +void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u32 port_num, struct scatterlist *sg, u32 sg_cnt, + enum dma_data_direction dir) { int i; @@ -601,7 +594,7 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, break; } - rdma_rw_unmap_sg(qp->pd->device, sg, sg_cnt, dir); + ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); } EXPORT_SYMBOL(rdma_rw_ctx_destroy); @@ -618,7 +611,7 @@ EXPORT_SYMBOL(rdma_rw_ctx_destroy); * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ */ void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp, - u8 port_num, struct scatterlist *sg, u32 sg_cnt, + u32 port_num, struct scatterlist *sg, u32 sg_cnt, struct scatterlist *prot_sg, u32 prot_sg_cnt, enum dma_data_direction dir) { @@ -628,9 +621,9 @@ void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp, ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr); kfree(ctx->reg); - ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); if (prot_sg_cnt) ib_dma_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir); + ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); } EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature); @@ -645,7 +638,7 @@ EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature); * compute max_rdma_ctxts and the size of the transport's Send and * Send Completion Queues. */ -unsigned int rdma_rw_mr_factor(struct ib_device *device, u8 port_num, +unsigned int rdma_rw_mr_factor(struct ib_device *device, u32 port_num, unsigned int maxpages) { unsigned int mr_pages; @@ -711,7 +704,7 @@ int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr) IB_MR_TYPE_MEM_REG, max_num_sg, 0); if (ret) { - pr_err("%s: failed to allocated %d MRs\n", + pr_err("%s: failed to allocated %u MRs\n", __func__, nr_mrs); return ret; } @@ -721,7 +714,7 @@ int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr) ret = ib_mr_pool_init(qp, &qp->sig_mrs, nr_sig_mrs, IB_MR_TYPE_INTEGRITY, max_num_sg, max_num_sg); if (ret) { - pr_err("%s: failed to allocated %d SIG MRs\n", + pr_err("%s: failed to allocated %u SIG MRs\n", __func__, nr_sig_mrs); goto out_free_rdma_mrs; } diff --git a/drivers/infiniband/core/sa.h b/drivers/infiniband/core/sa.h index cbaaaa92fff3..143de37ae598 100644 --- a/drivers/infiniband/core/sa.h +++ b/drivers/infiniband/core/sa.h @@ -49,7 +49,7 @@ static inline void ib_sa_client_put(struct ib_sa_client *client) } int ib_sa_mcmember_rec_query(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, u8 method, + struct ib_device *device, u32 port_num, u8 method, struct ib_sa_mcmember_rec *rec, ib_sa_comp_mask comp_mask, unsigned long timeout_ms, gfp_t gfp_mask, diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 30d4c126a2db..0de83d9a4985 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -32,7 +32,6 @@ * SOFTWARE. 
*/ -#include <linux/module.h> #include <linux/init.h> #include <linux/err.h> #include <linux/random.h> @@ -51,6 +50,7 @@ #include <rdma/ib_marshall.h> #include <rdma/ib_addr.h> #include <rdma/opa_addr.h> +#include <rdma/rdma_cm.h> #include "sa.h" #include "core_priv.h" @@ -95,17 +95,18 @@ struct ib_sa_port { struct delayed_work ib_cpi_work; spinlock_t classport_lock; /* protects class port info set */ spinlock_t ah_lock; - u8 port_num; + u32 port_num; }; struct ib_sa_device { int start_port, end_port; struct ib_event_handler event_handler; - struct ib_sa_port port[0]; + struct ib_sa_port port[]; }; struct ib_sa_query { - void (*callback)(struct ib_sa_query *, int, struct ib_sa_mad *); + void (*callback)(struct ib_sa_query *sa_query, int status, + int num_prs, struct ib_sa_mad *mad); void (*release)(struct ib_sa_query *); struct ib_sa_client *client; struct ib_sa_port *port; @@ -117,20 +118,21 @@ struct ib_sa_query { u32 seq; /* Local svc request sequence number */ unsigned long timeout; /* Local svc timeout */ u8 path_use; /* How will the pathrecord be used */ + + /* A separate buffer to save pathrecords of a response, as in cases + * like IB/netlink, mulptiple pathrecords are supported, so that + * mad->data is not large enough to hold them + */ + void *resp_pr_data; }; #define IB_SA_ENABLE_LOCAL_SERVICE 0x00000001 #define IB_SA_CANCEL 0x00000002 #define IB_SA_QUERY_OPA 0x00000004 -struct ib_sa_service_query { - void (*callback)(int, struct ib_sa_service_rec *, void *); - void *context; - struct ib_sa_query sa_query; -}; - struct ib_sa_path_query { - void (*callback)(int, struct sa_path_rec *, void *); + void (*callback)(int status, struct sa_path_rec *rec, + int num_paths, void *context); void *context; struct ib_sa_query sa_query; struct sa_path_rec *conv_pr; @@ -174,7 +176,7 @@ static const struct nla_policy ib_nl_policy[LS_NLA_TYPE_MAX] = { }; -static void ib_sa_add_one(struct ib_device *device); +static int ib_sa_add_one(struct ib_device *device); static void ib_sa_remove_one(struct ib_device *device, void *client_data); static struct ib_client sa_client = { @@ -190,7 +192,7 @@ static u32 tid; #define PATH_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct sa_path_rec, field), \ - .struct_size_bytes = sizeof((struct sa_path_rec *)0)->field, \ + .struct_size_bytes = sizeof_field(struct sa_path_rec, field), \ .field_name = "sa_path_rec:" #field static const struct ib_field path_rec_table[] = { @@ -292,7 +294,7 @@ static const struct ib_field path_rec_table[] = { .struct_offset_bytes = \ offsetof(struct sa_path_rec, field), \ .struct_size_bytes = \ - sizeof((struct sa_path_rec *)0)->field, \ + sizeof_field(struct sa_path_rec, field), \ .field_name = "sa_path_rec:" #field static const struct ib_field opa_path_rec_table[] = { @@ -420,7 +422,7 @@ static const struct ib_field opa_path_rec_table[] = { #define MCMEMBER_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_sa_mcmember_rec, field), \ - .struct_size_bytes = sizeof ((struct ib_sa_mcmember_rec *) 0)->field, \ + .struct_size_bytes = sizeof_field(struct ib_sa_mcmember_rec, field), \ .field_name = "sa_mcmember_rec:" #field static const struct ib_field mcmember_rec_table[] = { @@ -502,57 +504,9 @@ static const struct ib_field mcmember_rec_table[] = { .size_bits = 23 }, }; -#define SERVICE_REC_FIELD(field) \ - .struct_offset_bytes = offsetof(struct ib_sa_service_rec, field), \ - .struct_size_bytes = sizeof ((struct ib_sa_service_rec *) 0)->field, \ - .field_name = "sa_service_rec:" #field - -static const struct ib_field 
service_rec_table[] = { - { SERVICE_REC_FIELD(id), - .offset_words = 0, - .offset_bits = 0, - .size_bits = 64 }, - { SERVICE_REC_FIELD(gid), - .offset_words = 2, - .offset_bits = 0, - .size_bits = 128 }, - { SERVICE_REC_FIELD(pkey), - .offset_words = 6, - .offset_bits = 0, - .size_bits = 16 }, - { SERVICE_REC_FIELD(lease), - .offset_words = 7, - .offset_bits = 0, - .size_bits = 32 }, - { SERVICE_REC_FIELD(key), - .offset_words = 8, - .offset_bits = 0, - .size_bits = 128 }, - { SERVICE_REC_FIELD(name), - .offset_words = 12, - .offset_bits = 0, - .size_bits = 64*8 }, - { SERVICE_REC_FIELD(data8), - .offset_words = 28, - .offset_bits = 0, - .size_bits = 16*8 }, - { SERVICE_REC_FIELD(data16), - .offset_words = 32, - .offset_bits = 0, - .size_bits = 8*16 }, - { SERVICE_REC_FIELD(data32), - .offset_words = 36, - .offset_bits = 0, - .size_bits = 4*32 }, - { SERVICE_REC_FIELD(data64), - .offset_words = 40, - .offset_bits = 0, - .size_bits = 2*64 }, -}; - #define CLASSPORTINFO_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_class_port_info, field), \ - .struct_size_bytes = sizeof((struct ib_class_port_info *)0)->field, \ + .struct_size_bytes = sizeof_field(struct ib_class_port_info, field), \ .field_name = "ib_class_port_info:" #field static const struct ib_field ib_classport_info_rec_table[] = { @@ -630,7 +584,7 @@ static const struct ib_field ib_classport_info_rec_table[] = { .struct_offset_bytes =\ offsetof(struct opa_class_port_info, field), \ .struct_size_bytes = \ - sizeof((struct opa_class_port_info *)0)->field, \ + sizeof_field(struct opa_class_port_info, field), \ .field_name = "opa_class_port_info:" #field static const struct ib_field opa_classport_info_rec_table[] = { @@ -710,7 +664,7 @@ static const struct ib_field opa_classport_info_rec_table[] = { #define GUIDINFO_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_sa_guidinfo_rec, field), \ - .struct_size_bytes = sizeof((struct ib_sa_guidinfo_rec *) 0)->field, \ + .struct_size_bytes = sizeof_field(struct ib_sa_guidinfo_rec, field), \ .field_name = "sa_guidinfo_rec:" #field static const struct ib_field guidinfo_rec_table[] = { @@ -760,13 +714,14 @@ static void ib_nl_set_path_rec_attrs(struct sk_buff *skb, /* Construct the family header first */ header = skb_put(skb, NLMSG_ALIGN(sizeof(*header))); - memcpy(header->device_name, dev_name(&query->port->agent->device->dev), - LS_DEVICE_NAME_MAX); + strscpy_pad(header->device_name, + dev_name(&query->port->agent->device->dev), + LS_DEVICE_NAME_MAX); header->port_num = query->port->port_num; if ((comp_mask & IB_SA_PATH_REC_REVERSIBLE) && sa_rec->reversible != 0) - query->path_use = LS_RESOLVE_PATH_USE_GMP; + query->path_use = LS_RESOLVE_PATH_USE_ALL; else query->path_use = LS_RESOLVE_PATH_USE_UNIDIRECTIONAL; header->path_use = query->path_use; @@ -829,13 +784,20 @@ static int ib_nl_get_path_rec_attrs_len(ib_sa_comp_mask comp_mask) return len; } -static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask) +static int ib_nl_make_request(struct ib_sa_query *query, gfp_t gfp_mask) { struct sk_buff *skb = NULL; struct nlmsghdr *nlh; void *data; struct ib_sa_mad *mad; int len; + unsigned long flags; + unsigned long delay; + gfp_t gfp_flag; + int ret; + + INIT_LIST_HEAD(&query->list); + query->seq = (u32)atomic_inc_return(&ib_nl_sa_request_seq); mad = query->mad_buf->mad; len = ib_nl_get_path_rec_attrs_len(mad->sa_hdr.comp_mask); @@ -860,36 +822,25 @@ static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask) /* Repair the nlmsg header length */ 
nlmsg_end(skb, nlh); - return rdma_nl_multicast(&init_net, skb, RDMA_NL_GROUP_LS, gfp_mask); -} + gfp_flag = ((gfp_mask & GFP_ATOMIC) == GFP_ATOMIC) ? GFP_ATOMIC : + GFP_NOWAIT; -static int ib_nl_make_request(struct ib_sa_query *query, gfp_t gfp_mask) -{ - unsigned long flags; - unsigned long delay; - int ret; + spin_lock_irqsave(&ib_nl_request_lock, flags); + ret = rdma_nl_multicast(&init_net, skb, RDMA_NL_GROUP_LS, gfp_flag); - INIT_LIST_HEAD(&query->list); - query->seq = (u32)atomic_inc_return(&ib_nl_sa_request_seq); + if (ret) + goto out; - /* Put the request on the list first.*/ - spin_lock_irqsave(&ib_nl_request_lock, flags); + /* Put the request on the list.*/ delay = msecs_to_jiffies(sa_local_svc_timeout_ms); query->timeout = delay + jiffies; list_add_tail(&query->list, &ib_nl_request_list); /* Start the timeout if this is the only request */ if (ib_nl_request_list.next == &query->list) queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay); - spin_unlock_irqrestore(&ib_nl_request_lock, flags); - ret = ib_nl_send_msg(query, gfp_mask); - if (ret) { - ret = -EIO; - /* Remove the request */ - spin_lock_irqsave(&ib_nl_request_lock, flags); - list_del(&query->list); - spin_unlock_irqrestore(&ib_nl_request_lock, flags); - } +out: + spin_unlock_irqrestore(&ib_nl_request_lock, flags); return ret; } @@ -923,50 +874,81 @@ static void send_handler(struct ib_mad_agent *agent, static void ib_nl_process_good_resolve_rsp(struct ib_sa_query *query, const struct nlmsghdr *nlh) { + struct ib_path_rec_data *srec, *drec; + struct ib_sa_path_query *path_query; struct ib_mad_send_wc mad_send_wc; - struct ib_sa_mad *mad = NULL; const struct nlattr *head, *curr; - struct ib_path_rec_data *rec; - int len, rem; + struct ib_sa_mad *mad = NULL; + int len, rem, num_prs = 0; u32 mask = 0; int status = -EIO; - if (query->callback) { - head = (const struct nlattr *) nlmsg_data(nlh); - len = nlmsg_len(nlh); - switch (query->path_use) { - case LS_RESOLVE_PATH_USE_UNIDIRECTIONAL: - mask = IB_PATH_PRIMARY | IB_PATH_OUTBOUND; - break; + if (!query->callback) + goto out; - case LS_RESOLVE_PATH_USE_ALL: - case LS_RESOLVE_PATH_USE_GMP: - default: - mask = IB_PATH_PRIMARY | IB_PATH_GMP | - IB_PATH_BIDIRECTIONAL; - break; + path_query = container_of(query, struct ib_sa_path_query, sa_query); + mad = query->mad_buf->mad; + if (!path_query->conv_pr && + (be16_to_cpu(mad->mad_hdr.attr_id) == IB_SA_ATTR_PATH_REC)) { + /* Need a larger buffer for possible multiple PRs */ + query->resp_pr_data = kvcalloc(RDMA_PRIMARY_PATH_MAX_REC_NUM, + sizeof(*drec), GFP_KERNEL); + if (!query->resp_pr_data) { + query->callback(query, -ENOMEM, 0, NULL); + return; } - nla_for_each_attr(curr, head, len, rem) { - if (curr->nla_type == LS_NLA_TYPE_PATH_RECORD) { - rec = nla_data(curr); - /* - * Get the first one. In the future, we may - * need to get up to 6 pathrecords. 
- */ - if ((rec->flags & mask) == mask) { - mad = query->mad_buf->mad; - mad->mad_hdr.method |= - IB_MGMT_METHOD_RESP; - memcpy(mad->data, rec->path_rec, - sizeof(rec->path_rec)); - status = 0; - break; - } - } + } + + head = (const struct nlattr *) nlmsg_data(nlh); + len = nlmsg_len(nlh); + switch (query->path_use) { + case LS_RESOLVE_PATH_USE_UNIDIRECTIONAL: + mask = IB_PATH_PRIMARY | IB_PATH_OUTBOUND; + break; + + case LS_RESOLVE_PATH_USE_ALL: + mask = IB_PATH_PRIMARY; + break; + + case LS_RESOLVE_PATH_USE_GMP: + default: + mask = IB_PATH_PRIMARY | IB_PATH_GMP | + IB_PATH_BIDIRECTIONAL; + break; + } + + drec = (struct ib_path_rec_data *)query->resp_pr_data; + nla_for_each_attr(curr, head, len, rem) { + if (curr->nla_type != LS_NLA_TYPE_PATH_RECORD) + continue; + + srec = nla_data(curr); + if ((srec->flags & mask) != mask) + continue; + + status = 0; + if (!drec) { + memcpy(mad->data, srec->path_rec, + sizeof(srec->path_rec)); + num_prs = 1; + break; } - query->callback(query, status, mad); + + memcpy(drec, srec, sizeof(*drec)); + drec++; + num_prs++; + if (num_prs >= RDMA_PRIMARY_PATH_MAX_REC_NUM) + break; } + if (!status) + mad->mad_hdr.method |= IB_MGMT_METHOD_RESP; + + query->callback(query, status, num_prs, mad); + kvfree(query->resp_pr_data); + query->resp_pr_data = NULL; + +out: mad_send_wc.send_buf = query->mad_buf; mad_send_wc.status = IB_WC_SUCCESS; send_handler(query->mad_buf->mad_agent, &mad_send_wc); @@ -1092,10 +1074,9 @@ int ib_nl_handle_resolve_resp(struct sk_buff *skb, struct netlink_ext_ack *extack) { unsigned long flags; - struct ib_sa_query *query; + struct ib_sa_query *query = NULL, *iter; struct ib_mad_send_buf *send_buf; struct ib_mad_send_wc mad_send_wc; - int found = 0; int ret; if ((nlh->nlmsg_flags & NLM_F_REQUEST) || @@ -1103,20 +1084,21 @@ int ib_nl_handle_resolve_resp(struct sk_buff *skb, return -EPERM; spin_lock_irqsave(&ib_nl_request_lock, flags); - list_for_each_entry(query, &ib_nl_request_list, list) { + list_for_each_entry(iter, &ib_nl_request_list, list) { /* * If the query is cancelled, let the timeout routine * take care of it. */ - if (nlh->nlmsg_seq == query->seq) { - found = !ib_sa_query_cancelled(query); - if (found) - list_del(&query->list); + if (nlh->nlmsg_seq == iter->seq) { + if (!ib_sa_query_cancelled(iter)) { + list_del(&iter->list); + query = iter; + } break; } } - if (!found) { + if (!query) { spin_unlock_irqrestore(&ib_nl_request_lock, flags); goto resp_out; } @@ -1176,7 +1158,6 @@ EXPORT_SYMBOL(ib_sa_unregister_client); void ib_sa_cancel_query(int id, struct ib_sa_query *query) { unsigned long flags; - struct ib_mad_agent *agent; struct ib_mad_send_buf *mad_buf; xa_lock_irqsave(&queries, flags); @@ -1184,7 +1165,6 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query) xa_unlock_irqrestore(&queries, flags); return; } - agent = query->port->agent; mad_buf = query->mad_buf; xa_unlock_irqrestore(&queries, flags); @@ -1194,11 +1174,11 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query) * sent to the MAD layer and has to be cancelled from there. 
*/ if (!ib_nl_cancel_request(query)) - ib_cancel_mad(agent, mad_buf); + ib_cancel_mad(mad_buf); } EXPORT_SYMBOL(ib_sa_cancel_query); -static u8 get_src_path_mask(struct ib_device *device, u8 port_num) +static u8 get_src_path_mask(struct ib_device *device, u32 port_num) { struct ib_sa_device *sa_dev; struct ib_sa_port *port; @@ -1217,7 +1197,7 @@ static u8 get_src_path_mask(struct ib_device *device, u8 port_num) return src_path_mask; } -static int init_ah_attr_grh_fields(struct ib_device *device, u8 port_num, +static int init_ah_attr_grh_fields(struct ib_device *device, u32 port_num, struct sa_path_rec *rec, struct rdma_ah_attr *ah_attr, const struct ib_gid_attr *gid_attr) @@ -1255,7 +1235,7 @@ static int init_ah_attr_grh_fields(struct ib_device *device, u8 port_num, * User must invoke rdma_destroy_ah_attr() to release reference to SGID * attributes which are initialized using ib_init_ah_attr_from_path(). */ -int ib_init_ah_attr_from_path(struct ib_device *device, u8 port_num, +int ib_init_ah_attr_from_path(struct ib_device *device, u32 port_num, struct sa_path_rec *rec, struct rdma_ah_attr *ah_attr, const struct ib_gid_attr *gid_attr) @@ -1364,6 +1344,7 @@ static int send_mad(struct ib_sa_query *query, unsigned long timeout_ms, { unsigned long flags; int ret, id; + const int nmbr_sa_query_retries = 10; xa_lock_irqsave(&queries, flags); ret = __xa_alloc(&queries, &id, query, xa_limit_32b, gfp_mask); @@ -1371,7 +1352,13 @@ static int send_mad(struct ib_sa_query *query, unsigned long timeout_ms, if (ret < 0) return ret; - query->mad_buf->timeout_ms = timeout_ms; + query->mad_buf->timeout_ms = timeout_ms / nmbr_sa_query_retries; + query->mad_buf->retries = nmbr_sa_query_retries; + if (!query->mad_buf->timeout_ms) { + /* Special case, very small timeout_ms */ + query->mad_buf->timeout_ms = 1; + query->mad_buf->retries = timeout_ms; + } query->mad_buf->context[0] = query; query->id = id; @@ -1412,17 +1399,13 @@ void ib_sa_pack_path(struct sa_path_rec *rec, void *attribute) EXPORT_SYMBOL(ib_sa_pack_path); static bool ib_sa_opa_pathrecord_support(struct ib_sa_client *client, - struct ib_device *device, - u8 port_num) + struct ib_sa_device *sa_dev, + u32 port_num) { - struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); struct ib_sa_port *port; unsigned long flags; bool ret = false; - if (!sa_dev) - return ret; - port = &sa_dev->port[port_num - sa_dev->start_port]; spin_lock_irqsave(&port->classport_lock, flags); if (!port->classport_info.valid) @@ -1442,24 +1425,24 @@ enum opa_pr_supported { PR_IB_SUPPORTED }; -/** - * Check if current PR query can be an OPA query. +/* + * opa_pr_query_possible - Check if current PR query can be an OPA query. + * * Retuns PR_NOT_SUPPORTED if a path record query is not * possible, PR_OPA_SUPPORTED if an OPA path record query * is possible and PR_IB_SUPPORTED if an IB path record * query is possible. 
*/ static int opa_pr_query_possible(struct ib_sa_client *client, - struct ib_device *device, - u8 port_num, - struct sa_path_rec *rec) + struct ib_sa_device *sa_dev, + struct ib_device *device, u32 port_num) { struct ib_port_attr port_attr; if (ib_query_port(device, port_num, &port_attr)) return PR_NOT_SUPPORTED; - if (ib_sa_opa_pathrecord_support(client, device, port_num)) + if (ib_sa_opa_pathrecord_support(client, sa_dev, port_num)) return PR_OPA_SUPPORTED; if (port_attr.lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) @@ -1468,41 +1451,90 @@ static int opa_pr_query_possible(struct ib_sa_client *client, return PR_IB_SUPPORTED; } +static void ib_sa_pr_callback_single(struct ib_sa_path_query *query, + int status, struct ib_sa_mad *mad) +{ + struct sa_path_rec rec = {}; + + ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), + mad->data, &rec); + rec.rec_type = SA_PATH_REC_TYPE_IB; + sa_path_set_dmac_zero(&rec); + + if (query->conv_pr) { + struct sa_path_rec opa; + + memset(&opa, 0, sizeof(struct sa_path_rec)); + sa_convert_path_ib_to_opa(&opa, &rec); + query->callback(status, &opa, 1, query->context); + } else { + query->callback(status, &rec, 1, query->context); + } +} + +/** + * ib_sa_pr_callback_multiple() - Parse path records then do callback. + * + * In a multiple-PR case the PRs are saved in "query->resp_pr_data" + * (instead of"mad->data") and with "ib_path_rec_data" structure format, + * so that rec->flags can be set to indicate the type of PR. + * This is valid only in IB fabric. + */ +static void ib_sa_pr_callback_multiple(struct ib_sa_path_query *query, + int status, int num_prs, + struct ib_path_rec_data *rec_data) +{ + struct sa_path_rec *rec; + int i; + + rec = kvcalloc(num_prs, sizeof(*rec), GFP_KERNEL); + if (!rec) { + query->callback(-ENOMEM, NULL, 0, query->context); + return; + } + + for (i = 0; i < num_prs; i++) { + ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), + rec_data[i].path_rec, rec + i); + rec[i].rec_type = SA_PATH_REC_TYPE_IB; + sa_path_set_dmac_zero(rec + i); + rec[i].flags = rec_data[i].flags; + } + + query->callback(status, rec, num_prs, query->context); + kvfree(rec); +} + static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, - int status, + int status, int num_prs, struct ib_sa_mad *mad) { struct ib_sa_path_query *query = container_of(sa_query, struct ib_sa_path_query, sa_query); + struct sa_path_rec rec; - if (mad) { - struct sa_path_rec rec; - - if (sa_query->flags & IB_SA_QUERY_OPA) { - ib_unpack(opa_path_rec_table, - ARRAY_SIZE(opa_path_rec_table), - mad->data, &rec); - rec.rec_type = SA_PATH_REC_TYPE_OPA; - query->callback(status, &rec, query->context); - } else { - ib_unpack(path_rec_table, - ARRAY_SIZE(path_rec_table), - mad->data, &rec); - rec.rec_type = SA_PATH_REC_TYPE_IB; - sa_path_set_dmac_zero(&rec); - - if (query->conv_pr) { - struct sa_path_rec opa; + if (!mad || !num_prs) { + query->callback(status, NULL, 0, query->context); + return; + } - memset(&opa, 0, sizeof(struct sa_path_rec)); - sa_convert_path_ib_to_opa(&opa, &rec); - query->callback(status, &opa, query->context); - } else { - query->callback(status, &rec, query->context); - } + if (sa_query->flags & IB_SA_QUERY_OPA) { + if (num_prs != 1) { + query->callback(-EINVAL, NULL, 0, query->context); + return; } - } else - query->callback(status, NULL, query->context); + + ib_unpack(opa_path_rec_table, ARRAY_SIZE(opa_path_rec_table), + mad->data, &rec); + rec.rec_type = SA_PATH_REC_TYPE_OPA; + query->callback(status, &rec, num_prs, query->context); + } else { + if 
(!sa_query->resp_pr_data) + ib_sa_pr_callback_single(query, status, mad); + else + ib_sa_pr_callback_multiple(query, status, num_prs, + sa_query->resp_pr_data); + } } static void ib_sa_path_rec_release(struct ib_sa_query *sa_query) @@ -1540,13 +1572,13 @@ static void ib_sa_path_rec_release(struct ib_sa_query *sa_query) * the query. */ int ib_sa_path_rec_get(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, + struct ib_device *device, u32 port_num, struct sa_path_rec *rec, ib_sa_comp_mask comp_mask, unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct sa_path_rec *resp, - void *context), + int num_paths, void *context), void *context, struct ib_sa_query **sa_query) { @@ -1574,7 +1606,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, query->sa_query.port = port; if (rec->rec_type == SA_PATH_REC_TYPE_OPA) { - status = opa_pr_query_possible(client, device, port_num, rec); + status = opa_pr_query_possible(client, sa_dev, device, port_num); if (status == PR_NOT_SUPPORTED) { ret = -EINVAL; goto err1; @@ -1644,131 +1676,8 @@ err1: } EXPORT_SYMBOL(ib_sa_path_rec_get); -static void ib_sa_service_rec_callback(struct ib_sa_query *sa_query, - int status, - struct ib_sa_mad *mad) -{ - struct ib_sa_service_query *query = - container_of(sa_query, struct ib_sa_service_query, sa_query); - - if (mad) { - struct ib_sa_service_rec rec; - - ib_unpack(service_rec_table, ARRAY_SIZE(service_rec_table), - mad->data, &rec); - query->callback(status, &rec, query->context); - } else - query->callback(status, NULL, query->context); -} - -static void ib_sa_service_rec_release(struct ib_sa_query *sa_query) -{ - kfree(container_of(sa_query, struct ib_sa_service_query, sa_query)); -} - -/** - * ib_sa_service_rec_query - Start Service Record operation - * @client:SA client - * @device:device to send request on - * @port_num: port number to send request on - * @method:SA method - should be get, set, or delete - * @rec:Service Record to send in request - * @comp_mask:component mask to send in request - * @timeout_ms:time to wait for response - * @gfp_mask:GFP mask to use for internal allocations - * @callback:function called when request completes, times out or is - * canceled - * @context:opaque user context passed to callback - * @sa_query:request context, used to cancel request - * - * Send a Service Record set/get/delete to the SA to register, - * unregister or query a service record. - * The callback function will be called when the request completes (or - * fails); status is 0 for a successful response, -EINTR if the query - * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error - * occurred sending the query. The resp parameter of the callback is - * only valid if status is 0. - * - * If the return value of ib_sa_service_rec_query() is negative, it is an - * error code. Otherwise it is a request ID that can be used to cancel - * the query. 
- */ -int ib_sa_service_rec_query(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, u8 method, - struct ib_sa_service_rec *rec, - ib_sa_comp_mask comp_mask, - unsigned long timeout_ms, gfp_t gfp_mask, - void (*callback)(int status, - struct ib_sa_service_rec *resp, - void *context), - void *context, - struct ib_sa_query **sa_query) -{ - struct ib_sa_service_query *query; - struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); - struct ib_sa_port *port; - struct ib_mad_agent *agent; - struct ib_sa_mad *mad; - int ret; - - if (!sa_dev) - return -ENODEV; - - port = &sa_dev->port[port_num - sa_dev->start_port]; - agent = port->agent; - - if (method != IB_MGMT_METHOD_GET && - method != IB_MGMT_METHOD_SET && - method != IB_SA_METHOD_DELETE) - return -EINVAL; - - query = kzalloc(sizeof(*query), gfp_mask); - if (!query) - return -ENOMEM; - - query->sa_query.port = port; - ret = alloc_mad(&query->sa_query, gfp_mask); - if (ret) - goto err1; - - ib_sa_client_get(client); - query->sa_query.client = client; - query->callback = callback; - query->context = context; - - mad = query->sa_query.mad_buf->mad; - init_mad(&query->sa_query, agent); - - query->sa_query.callback = callback ? ib_sa_service_rec_callback : NULL; - query->sa_query.release = ib_sa_service_rec_release; - mad->mad_hdr.method = method; - mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_SERVICE_REC); - mad->sa_hdr.comp_mask = comp_mask; - - ib_pack(service_rec_table, ARRAY_SIZE(service_rec_table), - rec, mad->data); - - *sa_query = &query->sa_query; - - ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); - if (ret < 0) - goto err2; - - return ret; - -err2: - *sa_query = NULL; - ib_sa_client_put(query->sa_query.client); - free_mad(&query->sa_query); - -err1: - kfree(query); - return ret; -} -EXPORT_SYMBOL(ib_sa_service_rec_query); - static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query, - int status, + int status, int num_prs, struct ib_sa_mad *mad) { struct ib_sa_mcmember_query *query = @@ -1790,7 +1699,7 @@ static void ib_sa_mcmember_rec_release(struct ib_sa_query *sa_query) } int ib_sa_mcmember_rec_query(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, + struct ib_device *device, u32 port_num, u8 method, struct ib_sa_mcmember_rec *rec, ib_sa_comp_mask comp_mask, @@ -1860,7 +1769,7 @@ err1: /* Support GuidInfoRecord */ static void ib_sa_guidinfo_rec_callback(struct ib_sa_query *sa_query, - int status, + int status, int num_paths, struct ib_sa_mad *mad) { struct ib_sa_guidinfo_query *query = @@ -1882,7 +1791,7 @@ static void ib_sa_guidinfo_rec_release(struct ib_sa_query *sa_query) } int ib_sa_guid_info_rec_query(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, + struct ib_device *device, u32 port_num, struct ib_sa_guidinfo_rec *rec, ib_sa_comp_mask comp_mask, u8 method, unsigned long timeout_ms, gfp_t gfp_mask, @@ -1957,30 +1866,6 @@ err1: } EXPORT_SYMBOL(ib_sa_guid_info_rec_query); -bool ib_sa_sendonly_fullmem_support(struct ib_sa_client *client, - struct ib_device *device, - u8 port_num) -{ - struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); - struct ib_sa_port *port; - bool ret = false; - unsigned long flags; - - if (!sa_dev) - return ret; - - port = &sa_dev->port[port_num - sa_dev->start_port]; - - spin_lock_irqsave(&port->classport_lock, flags); - if ((port->classport_info.valid) && - (port->classport_info.data.type == RDMA_CLASS_PORT_INFO_IB)) - ret = ib_get_cpi_capmask2(&port->classport_info.data.ib) - & 
IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT; - spin_unlock_irqrestore(&port->classport_lock, flags); - return ret; -} -EXPORT_SYMBOL(ib_sa_sendonly_fullmem_support); - struct ib_classport_info_context { struct completion done; struct ib_sa_query *sa_query; @@ -1994,7 +1879,7 @@ static void ib_classportinfo_cb(void *context) } static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query, - int status, + int status, int num_prs, struct ib_sa_mad *mad) { unsigned long flags; @@ -2170,13 +2055,13 @@ static void send_handler(struct ib_mad_agent *agent, /* No callback -- already got recv */ break; case IB_WC_RESP_TIMEOUT_ERR: - query->callback(query, -ETIMEDOUT, NULL); + query->callback(query, -ETIMEDOUT, 0, NULL); break; case IB_WC_WR_FLUSH_ERR: - query->callback(query, -EINTR, NULL); + query->callback(query, -EINTR, 0, NULL); break; default: - query->callback(query, -EIO, NULL); + query->callback(query, -EIO, 0, NULL); break; } @@ -2204,10 +2089,10 @@ static void recv_handler(struct ib_mad_agent *mad_agent, if (mad_recv_wc->wc->status == IB_WC_SUCCESS) query->callback(query, mad_recv_wc->recv_buf.mad->mad_hdr.status ? - -EINVAL : 0, + -EINVAL : 0, 1, (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad); else - query->callback(query, -EIO, NULL); + query->callback(query, -EIO, 0, NULL); } ib_free_recv_mad(mad_recv_wc); @@ -2295,7 +2180,7 @@ static void ib_sa_event(struct ib_event_handler *handler, unsigned long flags; struct ib_sa_device *sa_dev = container_of(handler, typeof(*sa_dev), event_handler); - u8 port_num = event->element.port_num - sa_dev->start_port; + u32 port_num = event->element.port_num - sa_dev->start_port; struct ib_sa_port *port = &sa_dev->port[port_num]; if (!rdma_cap_ib_sa(handler->device, port->port_num)) @@ -2325,18 +2210,19 @@ static void ib_sa_event(struct ib_event_handler *handler, } } -static void ib_sa_add_one(struct ib_device *device) +static int ib_sa_add_one(struct ib_device *device) { struct ib_sa_device *sa_dev; int s, e, i; int count = 0; + int ret; s = rdma_start_port(device); e = rdma_end_port(device); sa_dev = kzalloc(struct_size(sa_dev, port, e - s + 1), GFP_KERNEL); if (!sa_dev) - return; + return -ENOMEM; sa_dev->start_port = s; sa_dev->end_port = e; @@ -2356,8 +2242,10 @@ static void ib_sa_add_one(struct ib_device *device) ib_register_mad_agent(device, i + s, IB_QPT_GSI, NULL, 0, send_handler, recv_handler, sa_dev, 0); - if (IS_ERR(sa_dev->port[i].agent)) + if (IS_ERR(sa_dev->port[i].agent)) { + ret = PTR_ERR(sa_dev->port[i].agent); goto err; + } INIT_WORK(&sa_dev->port[i].update_task, update_sm_ah); INIT_DELAYED_WORK(&sa_dev->port[i].ib_cpi_work, @@ -2366,8 +2254,10 @@ static void ib_sa_add_one(struct ib_device *device) count++; } - if (!count) + if (!count) { + ret = -EOPNOTSUPP; goto free; + } ib_set_client_data(device, &sa_client, sa_dev); @@ -2386,7 +2276,7 @@ static void ib_sa_add_one(struct ib_device *device) update_sm_ah(&sa_dev->port[i].update_task); } - return; + return 0; err: while (--i >= 0) { @@ -2395,7 +2285,7 @@ err: } free: kfree(sa_dev); - return; + return ret; } static void ib_sa_remove_one(struct ib_device *device, void *client_data) @@ -2403,9 +2293,6 @@ static void ib_sa_remove_one(struct ib_device *device, void *client_data) struct ib_sa_device *sa_dev = client_data; int i; - if (!sa_dev) - return; - ib_unregister_event_handler(&sa_dev->event_handler); flush_workqueue(ib_wq); @@ -2463,7 +2350,6 @@ err1: void ib_sa_cleanup(void) { cancel_delayed_work(&ib_nl_timed_work); - flush_workqueue(ib_nl_wq); 
destroy_workqueue(ib_nl_wq); mcast_cleanup(); ib_unregister_client(&sa_client); diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c index 2d5608315dc8..3512c2e54efc 100644 --- a/drivers/infiniband/core/security.c +++ b/drivers/infiniband/core/security.c @@ -72,7 +72,7 @@ static int get_pkey_and_subnet_prefix(struct ib_port_pkey *pp, if (ret) return ret; - ret = ib_get_cached_subnet_prefix(dev, pp->port_num, subnet_prefix); + ib_get_cached_subnet_prefix(dev, pp->port_num, subnet_prefix); return ret; } @@ -193,7 +193,7 @@ static void qp_to_error(struct ib_qp_security *sec) static inline void check_pkey_qps(struct pkey_index_qp_list *pkey, struct ib_device *device, - u8 port_num, + u32 port_num, u64 subnet_prefix) { struct ib_port_pkey *pp, *tmp_pp; @@ -245,7 +245,7 @@ static int port_pkey_list_insert(struct ib_port_pkey *pp) struct pkey_index_qp_list *tmp_pkey; struct pkey_index_qp_list *pkey; struct ib_device *dev; - u8 port_num = pp->port_num; + u32 port_num = pp->port_num; int ret = 0; if (pp->state != IB_PORT_PKEY_VALID) @@ -349,16 +349,11 @@ static struct ib_ports_pkeys *get_new_pps(const struct ib_qp *qp, else if (qp_pps) new_pps->main.pkey_index = qp_pps->main.pkey_index; - if ((qp_attr_mask & IB_QP_PKEY_INDEX) && (qp_attr_mask & IB_QP_PORT)) + if (((qp_attr_mask & IB_QP_PKEY_INDEX) && + (qp_attr_mask & IB_QP_PORT)) || + (qp_pps && qp_pps->main.state != IB_PORT_PKEY_NOT_VALID)) new_pps->main.state = IB_PORT_PKEY_VALID; - if (!(qp_attr_mask & (IB_QP_PKEY_INDEX | IB_QP_PORT)) && qp_pps) { - new_pps->main.port_num = qp_pps->main.port_num; - new_pps->main.pkey_index = qp_pps->main.pkey_index; - if (qp_pps->main.state != IB_PORT_PKEY_NOT_VALID) - new_pps->main.state = IB_PORT_PKEY_VALID; - } - if (qp_attr_mask & IB_QP_ALT_PATH) { new_pps->alt.port_num = qp_attr->alt_port_num; new_pps->alt.pkey_index = qp_attr->alt_pkey_index; @@ -543,7 +538,7 @@ void ib_destroy_qp_security_end(struct ib_qp_security *sec) } void ib_security_cache_change(struct ib_device *device, - u8 port_num, + u32 port_num, u64 subnet_prefix) { struct pkey_index_qp_list *pkey; @@ -591,7 +586,7 @@ int ib_security_modify_qp(struct ib_qp *qp, WARN_ONCE((qp_attr_mask & IB_QP_PORT && rdma_protocol_ib(real_qp->device, qp_attr->port_num) && !real_qp->qp_sec), - "%s: QP security is not initialized for IB QP: %d\n", + "%s: QP security is not initialized for IB QP: %u\n", __func__, real_qp->qp_num); /* The port/pkey settings are maintained only for the real QP. 
Open @@ -654,7 +649,7 @@ int ib_security_modify_qp(struct ib_qp *qp, } static int ib_security_pkey_access(struct ib_device *dev, - u8 port_num, + u32 port_num, u16 pkey_index, void *sec) { @@ -669,10 +664,7 @@ static int ib_security_pkey_access(struct ib_device *dev, if (ret) return ret; - ret = ib_get_cached_subnet_prefix(dev, port_num, &subnet_prefix); - - if (ret) - return ret; + ib_get_cached_subnet_prefix(dev, port_num, &subnet_prefix); return security_ib_pkey_access(sec, subnet_prefix, pkey); } diff --git a/drivers/infiniband/core/smi.c b/drivers/infiniband/core/smi.c index f19b23817c2b..45f09b75c893 100644 --- a/drivers/infiniband/core/smi.c +++ b/drivers/infiniband/core/smi.c @@ -41,7 +41,7 @@ #include "smi.h" #include "opa_smi.h" -static enum smi_action __smi_handle_dr_smp_send(bool is_switch, int port_num, +static enum smi_action __smi_handle_dr_smp_send(bool is_switch, u32 port_num, u8 *hop_ptr, u8 hop_cnt, const u8 *initial_path, const u8 *return_path, @@ -127,7 +127,7 @@ static enum smi_action __smi_handle_dr_smp_send(bool is_switch, int port_num, * Return IB_SMI_DISCARD if the SMP should be discarded */ enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, - bool is_switch, int port_num) + bool is_switch, u32 port_num) { return __smi_handle_dr_smp_send(is_switch, port_num, &smp->hop_ptr, smp->hop_cnt, @@ -139,7 +139,7 @@ enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, } enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp, - bool is_switch, int port_num) + bool is_switch, u32 port_num) { return __smi_handle_dr_smp_send(is_switch, port_num, &smp->hop_ptr, smp->hop_cnt, @@ -152,7 +152,7 @@ enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp, OPA_LID_PERMISSIVE); } -static enum smi_action __smi_handle_dr_smp_recv(bool is_switch, int port_num, +static enum smi_action __smi_handle_dr_smp_recv(bool is_switch, u32 port_num, int phys_port_cnt, u8 *hop_ptr, u8 hop_cnt, const u8 *initial_path, @@ -238,7 +238,7 @@ static enum smi_action __smi_handle_dr_smp_recv(bool is_switch, int port_num, * Return IB_SMI_DISCARD if the SMP should be dropped */ enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch, - int port_num, int phys_port_cnt) + u32 port_num, int phys_port_cnt) { return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt, &smp->hop_ptr, smp->hop_cnt, @@ -254,7 +254,7 @@ enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch, * Return IB_SMI_DISCARD if the SMP should be dropped */ enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch, - int port_num, int phys_port_cnt) + u32 port_num, int phys_port_cnt) { return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt, &smp->hop_ptr, smp->hop_cnt, diff --git a/drivers/infiniband/core/smi.h b/drivers/infiniband/core/smi.h index 91d9b353ab85..e350ed623c45 100644 --- a/drivers/infiniband/core/smi.h +++ b/drivers/infiniband/core/smi.h @@ -52,11 +52,11 @@ enum smi_forward_action { }; enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch, - int port_num, int phys_port_cnt); + u32 port_num, int phys_port_cnt); int smi_get_fwd_port(struct ib_smp *smp); extern enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp); extern enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, - bool is_switch, int port_num); + bool is_switch, u32 port_num); /* * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c 
index 087682e6969e..84c53bd2a52d 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -44,110 +44,174 @@ #include <rdma/ib_pma.h> #include <rdma/ib_cache.h> #include <rdma/rdma_counter.h> +#include <rdma/ib_sysfs.h> -struct ib_port; +struct port_table_attribute { + struct ib_port_attribute attr; + char name[8]; + int index; + __be16 attr_id; +}; struct gid_attr_group { - struct ib_port *port; - struct kobject kobj; - struct attribute_group ndev; - struct attribute_group type; + struct ib_port *port; + struct kobject kobj; + struct attribute_group groups[2]; + const struct attribute_group *groups_list[3]; + struct port_table_attribute attrs_list[]; }; + struct ib_port { - struct kobject kobj; - struct ib_device *ibdev; + struct kobject kobj; + struct ib_device *ibdev; struct gid_attr_group *gid_attr_group; - struct attribute_group gid_group; - struct attribute_group pkey_group; - struct attribute_group *pma_table; - struct attribute_group *hw_stats_ag; - struct rdma_hw_stats *hw_stats; - u8 port_num; + struct hw_stats_port_data *hw_stats_data; + + struct attribute_group groups[3]; + const struct attribute_group *groups_list[5]; + u32 port_num; + struct port_table_attribute attrs_list[]; }; -struct port_attribute { - struct attribute attr; - ssize_t (*show)(struct ib_port *, struct port_attribute *, char *buf); - ssize_t (*store)(struct ib_port *, struct port_attribute *, +struct hw_stats_device_attribute { + struct device_attribute attr; + ssize_t (*show)(struct ib_device *ibdev, struct rdma_hw_stats *stats, + unsigned int index, unsigned int port_num, char *buf); + ssize_t (*store)(struct ib_device *ibdev, struct rdma_hw_stats *stats, + unsigned int index, unsigned int port_num, const char *buf, size_t count); }; -#define PORT_ATTR(_name, _mode, _show, _store) \ -struct port_attribute port_attr_##_name = __ATTR(_name, _mode, _show, _store) - -#define PORT_ATTR_RO(_name) \ -struct port_attribute port_attr_##_name = __ATTR_RO(_name) +struct hw_stats_port_attribute { + struct ib_port_attribute attr; + ssize_t (*show)(struct ib_device *ibdev, struct rdma_hw_stats *stats, + unsigned int index, unsigned int port_num, char *buf); + ssize_t (*store)(struct ib_device *ibdev, struct rdma_hw_stats *stats, + unsigned int index, unsigned int port_num, + const char *buf, size_t count); +}; -struct port_table_attribute { - struct port_attribute attr; - char name[8]; - int index; - __be16 attr_id; +struct hw_stats_device_data { + struct attribute_group group; + struct rdma_hw_stats *stats; + struct hw_stats_device_attribute attrs[]; }; -struct hw_stats_attribute { - struct attribute attr; - ssize_t (*show)(struct kobject *kobj, - struct attribute *attr, char *buf); - ssize_t (*store)(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t count); - int index; - u8 port_num; +struct hw_stats_port_data { + struct rdma_hw_stats *stats; + struct hw_stats_port_attribute attrs[]; }; static ssize_t port_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { - struct port_attribute *port_attr = - container_of(attr, struct port_attribute, attr); + struct ib_port_attribute *port_attr = + container_of(attr, struct ib_port_attribute, attr); struct ib_port *p = container_of(kobj, struct ib_port, kobj); if (!port_attr->show) return -EIO; - return port_attr->show(p, port_attr, buf); + return port_attr->show(p->ibdev, p->port_num, port_attr, buf); } static ssize_t port_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) 
{ - struct port_attribute *port_attr = - container_of(attr, struct port_attribute, attr); + struct ib_port_attribute *port_attr = + container_of(attr, struct ib_port_attribute, attr); struct ib_port *p = container_of(kobj, struct ib_port, kobj); if (!port_attr->store) return -EIO; - return port_attr->store(p, port_attr, buf, count); + return port_attr->store(p->ibdev, p->port_num, port_attr, buf, count); } +struct ib_device *ib_port_sysfs_get_ibdev_kobj(struct kobject *kobj, + u32 *port_num) +{ + struct ib_port *port = container_of(kobj, struct ib_port, kobj); + + *port_num = port->port_num; + return port->ibdev; +} +EXPORT_SYMBOL(ib_port_sysfs_get_ibdev_kobj); + static const struct sysfs_ops port_sysfs_ops = { .show = port_attr_show, .store = port_attr_store }; +static ssize_t hw_stat_device_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hw_stats_device_attribute *stat_attr = + container_of(attr, struct hw_stats_device_attribute, attr); + struct ib_device *ibdev = container_of(dev, struct ib_device, dev); + + return stat_attr->show(ibdev, ibdev->hw_stats_data->stats, + stat_attr - ibdev->hw_stats_data->attrs, 0, buf); +} + +static ssize_t hw_stat_device_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct hw_stats_device_attribute *stat_attr = + container_of(attr, struct hw_stats_device_attribute, attr); + struct ib_device *ibdev = container_of(dev, struct ib_device, dev); + + return stat_attr->store(ibdev, ibdev->hw_stats_data->stats, + stat_attr - ibdev->hw_stats_data->attrs, 0, buf, + count); +} + +static ssize_t hw_stat_port_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *attr, char *buf) +{ + struct hw_stats_port_attribute *stat_attr = + container_of(attr, struct hw_stats_port_attribute, attr); + struct ib_port *port = ibdev->port_data[port_num].sysfs; + + return stat_attr->show(ibdev, port->hw_stats_data->stats, + stat_attr - port->hw_stats_data->attrs, + port->port_num, buf); +} + +static ssize_t hw_stat_port_store(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *attr, + const char *buf, size_t count) +{ + struct hw_stats_port_attribute *stat_attr = + container_of(attr, struct hw_stats_port_attribute, attr); + struct ib_port *port = ibdev->port_data[port_num].sysfs; + + return stat_attr->store(ibdev, port->hw_stats_data->stats, + stat_attr - port->hw_stats_data->attrs, + port->port_num, buf, count); +} + static ssize_t gid_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { - struct port_attribute *port_attr = - container_of(attr, struct port_attribute, attr); + struct ib_port_attribute *port_attr = + container_of(attr, struct ib_port_attribute, attr); struct ib_port *p = container_of(kobj, struct gid_attr_group, kobj)->port; if (!port_attr->show) return -EIO; - return port_attr->show(p, port_attr, buf); + return port_attr->show(p->ibdev, p->port_num, port_attr, buf); } static const struct sysfs_ops gid_attr_sysfs_ops = { .show = gid_attr_show }; -static ssize_t state_show(struct ib_port *p, struct port_attribute *unused, - char *buf) +static ssize_t state_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) { struct ib_port_attr attr; ssize_t ret; @@ -161,90 +225,91 @@ static ssize_t state_show(struct ib_port *p, struct port_attribute *unused, [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER" }; - ret = ib_query_port(p->ibdev, p->port_num, &attr); + ret = ib_query_port(ibdev, port_num, &attr); if (ret) return 
ret; - return sprintf(buf, "%d: %s\n", attr.state, - attr.state >= 0 && attr.state < ARRAY_SIZE(state_name) ? - state_name[attr.state] : "UNKNOWN"); + return sysfs_emit(buf, "%d: %s\n", attr.state, + attr.state >= 0 && + attr.state < ARRAY_SIZE(state_name) ? + state_name[attr.state] : + "UNKNOWN"); } -static ssize_t lid_show(struct ib_port *p, struct port_attribute *unused, - char *buf) +static ssize_t lid_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) { struct ib_port_attr attr; ssize_t ret; - ret = ib_query_port(p->ibdev, p->port_num, &attr); + ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; - return sprintf(buf, "0x%x\n", attr.lid); + return sysfs_emit(buf, "0x%x\n", attr.lid); } -static ssize_t lid_mask_count_show(struct ib_port *p, - struct port_attribute *unused, - char *buf) +static ssize_t lid_mask_count_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) { struct ib_port_attr attr; ssize_t ret; - ret = ib_query_port(p->ibdev, p->port_num, &attr); + ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; - return sprintf(buf, "%d\n", attr.lmc); + return sysfs_emit(buf, "%u\n", attr.lmc); } -static ssize_t sm_lid_show(struct ib_port *p, struct port_attribute *unused, - char *buf) +static ssize_t sm_lid_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) { struct ib_port_attr attr; ssize_t ret; - ret = ib_query_port(p->ibdev, p->port_num, &attr); + ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; - return sprintf(buf, "0x%x\n", attr.sm_lid); + return sysfs_emit(buf, "0x%x\n", attr.sm_lid); } -static ssize_t sm_sl_show(struct ib_port *p, struct port_attribute *unused, - char *buf) +static ssize_t sm_sl_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) { struct ib_port_attr attr; ssize_t ret; - ret = ib_query_port(p->ibdev, p->port_num, &attr); + ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; - return sprintf(buf, "%d\n", attr.sm_sl); + return sysfs_emit(buf, "%u\n", attr.sm_sl); } -static ssize_t cap_mask_show(struct ib_port *p, struct port_attribute *unused, - char *buf) +static ssize_t cap_mask_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) { struct ib_port_attr attr; ssize_t ret; - ret = ib_query_port(p->ibdev, p->port_num, &attr); + ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; - return sprintf(buf, "0x%08x\n", attr.port_cap_flags); + return sysfs_emit(buf, "0x%08x\n", attr.port_cap_flags); } -static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused, - char *buf) +static ssize_t rate_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) { struct ib_port_attr attr; char *speed = ""; int rate; /* in deci-Gb/sec */ ssize_t ret; - ret = ib_query_port(p->ibdev, p->port_num, &attr); + ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; @@ -273,6 +338,10 @@ static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused, speed = " HDR"; rate = 500; break; + case IB_SPEED_NDR: + speed = " NDR"; + rate = 1000; + break; case IB_SPEED_SDR: default: /* default to SDR for invalid rates */ speed = " SDR"; @@ -284,14 +353,14 @@ static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused, if (rate < 0) return -EINVAL; - return sprintf(buf, "%d%s Gb/sec (%dX%s)\n", - rate / 10, rate % 10 ? 
".5" : "", - ib_width_enum_to_int(attr.active_width), speed); + return sysfs_emit(buf, "%d%s Gb/sec (%dX%s)\n", rate / 10, + rate % 10 ? ".5" : "", + ib_width_enum_to_int(attr.active_width), speed); } static const char *phys_state_to_str(enum ib_port_phys_state phys_state) { - static const char * phys_state_str[] = { + static const char *phys_state_str[] = { "<unknown>", "Sleep", "Polling", @@ -307,102 +376,113 @@ static const char *phys_state_to_str(enum ib_port_phys_state phys_state) return "<unknown>"; } -static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused, - char *buf) +static ssize_t phys_state_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) { struct ib_port_attr attr; ssize_t ret; - ret = ib_query_port(p->ibdev, p->port_num, &attr); + ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; - return sprintf(buf, "%d: %s\n", attr.phys_state, - phys_state_to_str(attr.phys_state)); + return sysfs_emit(buf, "%u: %s\n", attr.phys_state, + phys_state_to_str(attr.phys_state)); } -static ssize_t link_layer_show(struct ib_port *p, struct port_attribute *unused, - char *buf) +static ssize_t link_layer_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) { - switch (rdma_port_get_link_layer(p->ibdev, p->port_num)) { + const char *output; + + switch (rdma_port_get_link_layer(ibdev, port_num)) { case IB_LINK_LAYER_INFINIBAND: - return sprintf(buf, "%s\n", "InfiniBand"); + output = "InfiniBand"; + break; case IB_LINK_LAYER_ETHERNET: - return sprintf(buf, "%s\n", "Ethernet"); + output = "Ethernet"; + break; default: - return sprintf(buf, "%s\n", "Unknown"); + output = "Unknown"; + break; } + + return sysfs_emit(buf, "%s\n", output); } -static PORT_ATTR_RO(state); -static PORT_ATTR_RO(lid); -static PORT_ATTR_RO(lid_mask_count); -static PORT_ATTR_RO(sm_lid); -static PORT_ATTR_RO(sm_sl); -static PORT_ATTR_RO(cap_mask); -static PORT_ATTR_RO(rate); -static PORT_ATTR_RO(phys_state); -static PORT_ATTR_RO(link_layer); +static IB_PORT_ATTR_RO(state); +static IB_PORT_ATTR_RO(lid); +static IB_PORT_ATTR_RO(lid_mask_count); +static IB_PORT_ATTR_RO(sm_lid); +static IB_PORT_ATTR_RO(sm_sl); +static IB_PORT_ATTR_RO(cap_mask); +static IB_PORT_ATTR_RO(rate); +static IB_PORT_ATTR_RO(phys_state); +static IB_PORT_ATTR_RO(link_layer); static struct attribute *port_default_attrs[] = { - &port_attr_state.attr, - &port_attr_lid.attr, - &port_attr_lid_mask_count.attr, - &port_attr_sm_lid.attr, - &port_attr_sm_sl.attr, - &port_attr_cap_mask.attr, - &port_attr_rate.attr, - &port_attr_phys_state.attr, - &port_attr_link_layer.attr, + &ib_port_attr_state.attr, + &ib_port_attr_lid.attr, + &ib_port_attr_lid_mask_count.attr, + &ib_port_attr_sm_lid.attr, + &ib_port_attr_sm_sl.attr, + &ib_port_attr_cap_mask.attr, + &ib_port_attr_rate.attr, + &ib_port_attr_phys_state.attr, + &ib_port_attr_link_layer.attr, NULL }; +ATTRIBUTE_GROUPS(port_default); -static size_t print_ndev(const struct ib_gid_attr *gid_attr, char *buf) +static ssize_t print_ndev(const struct ib_gid_attr *gid_attr, char *buf) { struct net_device *ndev; - size_t ret = -EINVAL; + int ret = -EINVAL; rcu_read_lock(); ndev = rcu_dereference(gid_attr->ndev); if (ndev) - ret = sprintf(buf, "%s\n", ndev->name); + ret = sysfs_emit(buf, "%s\n", ndev->name); rcu_read_unlock(); return ret; } -static size_t print_gid_type(const struct ib_gid_attr *gid_attr, char *buf) +static ssize_t print_gid_type(const struct ib_gid_attr *gid_attr, char *buf) { - return sprintf(buf, 
"%s\n", ib_cache_gid_type_str(gid_attr->gid_type)); + return sysfs_emit(buf, "%s\n", + ib_cache_gid_type_str(gid_attr->gid_type)); } static ssize_t _show_port_gid_attr( - struct ib_port *p, struct port_attribute *attr, char *buf, - size_t (*print)(const struct ib_gid_attr *gid_attr, char *buf)) + struct ib_device *ibdev, u32 port_num, struct ib_port_attribute *attr, + char *buf, + ssize_t (*print)(const struct ib_gid_attr *gid_attr, char *buf)) { struct port_table_attribute *tab_attr = container_of(attr, struct port_table_attribute, attr); const struct ib_gid_attr *gid_attr; ssize_t ret; - gid_attr = rdma_get_gid_attr(p->ibdev, p->port_num, tab_attr->index); + gid_attr = rdma_get_gid_attr(ibdev, port_num, tab_attr->index); if (IS_ERR(gid_attr)) - return PTR_ERR(gid_attr); + /* -EINVAL is returned for user space compatibility reasons. */ + return -EINVAL; ret = print(gid_attr, buf); rdma_put_gid_attr(gid_attr); return ret; } -static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr, - char *buf) +static ssize_t show_port_gid(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *attr, char *buf) { struct port_table_attribute *tab_attr = container_of(attr, struct port_table_attribute, attr); const struct ib_gid_attr *gid_attr; - ssize_t ret; + int len; - gid_attr = rdma_get_gid_attr(p->ibdev, p->port_num, tab_attr->index); + gid_attr = rdma_get_gid_attr(ibdev, port_num, tab_attr->index); if (IS_ERR(gid_attr)) { const union ib_gid zgid = {}; @@ -415,54 +495,56 @@ static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr, * space throwing such error on fail to read gid, return zero * GID as before. This maintains backward compatibility. */ - return sprintf(buf, "%pI6\n", zgid.raw); + return sysfs_emit(buf, "%pI6\n", zgid.raw); } - ret = sprintf(buf, "%pI6\n", gid_attr->gid.raw); + len = sysfs_emit(buf, "%pI6\n", gid_attr->gid.raw); rdma_put_gid_attr(gid_attr); - return ret; + return len; } -static ssize_t show_port_gid_attr_ndev(struct ib_port *p, - struct port_attribute *attr, char *buf) +static ssize_t show_port_gid_attr_ndev(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *attr, + char *buf) { - return _show_port_gid_attr(p, attr, buf, print_ndev); + return _show_port_gid_attr(ibdev, port_num, attr, buf, print_ndev); } -static ssize_t show_port_gid_attr_gid_type(struct ib_port *p, - struct port_attribute *attr, +static ssize_t show_port_gid_attr_gid_type(struct ib_device *ibdev, + u32 port_num, + struct ib_port_attribute *attr, char *buf) { - return _show_port_gid_attr(p, attr, buf, print_gid_type); + return _show_port_gid_attr(ibdev, port_num, attr, buf, print_gid_type); } -static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr, - char *buf) +static ssize_t show_port_pkey(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *attr, char *buf) { struct port_table_attribute *tab_attr = container_of(attr, struct port_table_attribute, attr); u16 pkey; - ssize_t ret; + int ret; - ret = ib_query_pkey(p->ibdev, p->port_num, tab_attr->index, &pkey); + ret = ib_query_pkey(ibdev, port_num, tab_attr->index, &pkey); if (ret) return ret; - return sprintf(buf, "0x%04x\n", pkey); + return sysfs_emit(buf, "0x%04x\n", pkey); } #define PORT_PMA_ATTR(_name, _counter, _width, _offset) \ struct port_table_attribute port_pma_attr_##_name = { \ .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ .index = (_offset) | ((_width) << 16) | ((_counter) << 24), \ - .attr_id = IB_PMA_PORT_COUNTERS , \ + .attr_id = 
IB_PMA_PORT_COUNTERS, \ } #define PORT_PMA_ATTR_EXT(_name, _width, _offset) \ struct port_table_attribute port_pma_attr_ext_##_name = { \ .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ .index = (_offset) | ((_width) << 16), \ - .attr_id = IB_PMA_PORT_COUNTERS_EXT , \ + .attr_id = IB_PMA_PORT_COUNTERS_EXT, \ } /* @@ -513,47 +595,45 @@ out: return ret; } -static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, - char *buf) +static ssize_t show_pma_counter(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *attr, char *buf) { struct port_table_attribute *tab_attr = container_of(attr, struct port_table_attribute, attr); int offset = tab_attr->index & 0xffff; int width = (tab_attr->index >> 16) & 0xff; - ssize_t ret; + int ret; u8 data[8]; + int len; - ret = get_perf_mad(p->ibdev, p->port_num, tab_attr->attr_id, &data, + ret = get_perf_mad(ibdev, port_num, tab_attr->attr_id, &data, 40 + offset / 8, sizeof(data)); if (ret < 0) return ret; switch (width) { case 4: - ret = sprintf(buf, "%u\n", (*data >> - (4 - (offset % 8))) & 0xf); + len = sysfs_emit(buf, "%d\n", + (*data >> (4 - (offset % 8))) & 0xf); break; case 8: - ret = sprintf(buf, "%u\n", *data); + len = sysfs_emit(buf, "%u\n", *data); break; case 16: - ret = sprintf(buf, "%u\n", - be16_to_cpup((__be16 *)data)); + len = sysfs_emit(buf, "%u\n", be16_to_cpup((__be16 *)data)); break; case 32: - ret = sprintf(buf, "%u\n", - be32_to_cpup((__be32 *)data)); + len = sysfs_emit(buf, "%u\n", be32_to_cpup((__be32 *)data)); break; case 64: - ret = sprintf(buf, "%llu\n", - be64_to_cpup((__be64 *)data)); + len = sysfs_emit(buf, "%llu\n", be64_to_cpup((__be64 *)data)); break; - default: - ret = 0; + len = 0; + break; } - return ret; + return len; } static PORT_PMA_ATTR(symbol_error , 0, 16, 32); @@ -653,72 +733,49 @@ static struct attribute *pma_attrs_noietf[] = { NULL }; -static struct attribute_group pma_group = { +static const struct attribute_group pma_group = { .name = "counters", .attrs = pma_attrs }; -static struct attribute_group pma_group_ext = { +static const struct attribute_group pma_group_ext = { .name = "counters", .attrs = pma_attrs_ext }; -static struct attribute_group pma_group_noietf = { +static const struct attribute_group pma_group_noietf = { .name = "counters", .attrs = pma_attrs_noietf }; static void ib_port_release(struct kobject *kobj) { - struct ib_port *p = container_of(kobj, struct ib_port, kobj); - struct attribute *a; + struct ib_port *port = container_of(kobj, struct ib_port, kobj); int i; - if (p->gid_group.attrs) { - for (i = 0; (a = p->gid_group.attrs[i]); ++i) - kfree(a); - - kfree(p->gid_group.attrs); - } - - if (p->pkey_group.attrs) { - for (i = 0; (a = p->pkey_group.attrs[i]); ++i) - kfree(a); - - kfree(p->pkey_group.attrs); - } - - kfree(p); + for (i = 0; i != ARRAY_SIZE(port->groups); i++) + kfree(port->groups[i].attrs); + if (port->hw_stats_data) + rdma_free_hw_stats_struct(port->hw_stats_data->stats); + kfree(port->hw_stats_data); + kvfree(port); } static void ib_port_gid_attr_release(struct kobject *kobj) { - struct gid_attr_group *g = container_of(kobj, struct gid_attr_group, - kobj); - struct attribute *a; + struct gid_attr_group *gid_attr_group = + container_of(kobj, struct gid_attr_group, kobj); int i; - if (g->ndev.attrs) { - for (i = 0; (a = g->ndev.attrs[i]); ++i) - kfree(a); - - kfree(g->ndev.attrs); - } - - if (g->type.attrs) { - for (i = 0; (a = g->type.attrs[i]); ++i) - kfree(a); - - kfree(g->type.attrs); - } - - kfree(g); + for (i = 0; i != 
ARRAY_SIZE(gid_attr_group->groups); i++) + kfree(gid_attr_group->groups[i].attrs); + kfree(gid_attr_group); } static struct kobj_type port_type = { .release = ib_port_release, .sysfs_ops = &port_sysfs_ops, - .default_attrs = port_default_attrs + .default_groups = port_default_groups, }; static struct kobj_type gid_attr_type = { @@ -726,55 +783,12 @@ static struct kobj_type gid_attr_type = { .release = ib_port_gid_attr_release }; -static struct attribute ** -alloc_group_attrs(ssize_t (*show)(struct ib_port *, - struct port_attribute *, char *buf), - int len) -{ - struct attribute **tab_attr; - struct port_table_attribute *element; - int i; - - tab_attr = kcalloc(1 + len, sizeof(struct attribute *), GFP_KERNEL); - if (!tab_attr) - return NULL; - - for (i = 0; i < len; i++) { - element = kzalloc(sizeof(struct port_table_attribute), - GFP_KERNEL); - if (!element) - goto err; - - if (snprintf(element->name, sizeof(element->name), - "%d", i) >= sizeof(element->name)) { - kfree(element); - goto err; - } - - element->attr.attr.name = element->name; - element->attr.attr.mode = S_IRUGO; - element->attr.show = show; - element->index = i; - sysfs_attr_init(&element->attr.attr); - - tab_attr[i] = &element->attr.attr; - } - - return tab_attr; - -err: - while (--i >= 0) - kfree(tab_attr[i]); - kfree(tab_attr); - return NULL; -} - /* * Figure out which counter table to use depending on * the device capabilities. */ -static struct attribute_group *get_counter_table(struct ib_device *dev, - int port_num) +static const struct attribute_group *get_counter_table(struct ib_device *dev, + int port_num) { struct ib_class_port_info cpi; @@ -794,7 +808,7 @@ static struct attribute_group *get_counter_table(struct ib_device *dev, } static int update_hw_stats(struct ib_device *dev, struct rdma_hw_stats *stats, - u8 port_num, int index) + u32 port_num, int index) { int ret; @@ -809,77 +823,50 @@ static int update_hw_stats(struct ib_device *dev, struct rdma_hw_stats *stats, return 0; } -static ssize_t print_hw_stat(struct ib_device *dev, int port_num, - struct rdma_hw_stats *stats, int index, char *buf) +static int print_hw_stat(struct ib_device *dev, int port_num, + struct rdma_hw_stats *stats, int index, char *buf) { u64 v = rdma_counter_get_hwstat_value(dev, port_num, index); - return sprintf(buf, "%llu\n", stats->value[index] + v); + return sysfs_emit(buf, "%llu\n", stats->value[index] + v); } -static ssize_t show_hw_stats(struct kobject *kobj, struct attribute *attr, - char *buf) +static ssize_t show_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *stats, unsigned int index, + unsigned int port_num, char *buf) { - struct ib_device *dev; - struct ib_port *port; - struct hw_stats_attribute *hsa; - struct rdma_hw_stats *stats; int ret; - hsa = container_of(attr, struct hw_stats_attribute, attr); - if (!hsa->port_num) { - dev = container_of((struct device *)kobj, - struct ib_device, dev); - stats = dev->hw_stats; - } else { - port = container_of(kobj, struct ib_port, kobj); - dev = port->ibdev; - stats = port->hw_stats; - } mutex_lock(&stats->lock); - ret = update_hw_stats(dev, stats, hsa->port_num, hsa->index); + ret = update_hw_stats(ibdev, stats, port_num, index); if (ret) goto unlock; - ret = print_hw_stat(dev, hsa->port_num, stats, hsa->index, buf); + ret = print_hw_stat(ibdev, port_num, stats, index, buf); unlock: mutex_unlock(&stats->lock); return ret; } -static ssize_t show_stats_lifespan(struct kobject *kobj, - struct attribute *attr, +static ssize_t show_stats_lifespan(struct ib_device *ibdev, + 
struct rdma_hw_stats *stats, + unsigned int index, unsigned int port_num, char *buf) { - struct hw_stats_attribute *hsa; - struct rdma_hw_stats *stats; int msecs; - hsa = container_of(attr, struct hw_stats_attribute, attr); - if (!hsa->port_num) { - struct ib_device *dev = container_of((struct device *)kobj, - struct ib_device, dev); - - stats = dev->hw_stats; - } else { - struct ib_port *p = container_of(kobj, struct ib_port, kobj); - - stats = p->hw_stats; - } - mutex_lock(&stats->lock); msecs = jiffies_to_msecs(stats->lifespan); mutex_unlock(&stats->lock); - return sprintf(buf, "%d\n", msecs); + return sysfs_emit(buf, "%d\n", msecs); } -static ssize_t set_stats_lifespan(struct kobject *kobj, - struct attribute *attr, - const char *buf, size_t count) +static ssize_t set_stats_lifespan(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + unsigned int index, unsigned int port_num, + const char *buf, size_t count) { - struct hw_stats_attribute *hsa; - struct rdma_hw_stats *stats; int msecs; int jiffies; int ret; @@ -890,17 +877,6 @@ static ssize_t set_stats_lifespan(struct kobject *kobj, if (msecs < 0 || msecs > 10000) return -EINVAL; jiffies = msecs_to_jiffies(msecs); - hsa = container_of(attr, struct hw_stats_attribute, attr); - if (!hsa->port_num) { - struct ib_device *dev = container_of((struct device *)kobj, - struct ib_device, dev); - - stats = dev->hw_stats; - } else { - struct ib_port *p = container_of(kobj, struct ib_port, kobj); - - stats = p->hw_stats; - } mutex_lock(&stats->lock); stats->lifespan = jiffies; @@ -909,300 +885,427 @@ static ssize_t set_stats_lifespan(struct kobject *kobj, return count; } -static void free_hsag(struct kobject *kobj, struct attribute_group *attr_group) +static struct hw_stats_device_data * +alloc_hw_stats_device(struct ib_device *ibdev) { - struct attribute **attr; - - sysfs_remove_group(kobj, attr_group); + struct hw_stats_device_data *data; + struct rdma_hw_stats *stats; - for (attr = attr_group->attrs; *attr; attr++) - kfree(*attr); - kfree(attr_group); -} + if (!ibdev->ops.alloc_hw_device_stats) + return ERR_PTR(-EOPNOTSUPP); + stats = ibdev->ops.alloc_hw_device_stats(ibdev); + if (!stats) + return ERR_PTR(-ENOMEM); + if (!stats->descs || stats->num_counters <= 0) + goto err_free_stats; -static struct attribute *alloc_hsa(int index, u8 port_num, const char *name) -{ - struct hw_stats_attribute *hsa; + /* + * Two extra attribue elements here, one for the lifespan entry and + * one to NULL terminate the list for the sysfs core code + */ + data = kzalloc(struct_size(data, attrs, stats->num_counters + 1), + GFP_KERNEL); + if (!data) + goto err_free_stats; + data->group.attrs = kcalloc(stats->num_counters + 2, + sizeof(*data->group.attrs), GFP_KERNEL); + if (!data->group.attrs) + goto err_free_data; - hsa = kmalloc(sizeof(*hsa), GFP_KERNEL); - if (!hsa) - return NULL; + data->group.name = "hw_counters"; + data->stats = stats; + return data; - hsa->attr.name = (char *)name; - hsa->attr.mode = S_IRUGO; - hsa->show = show_hw_stats; - hsa->store = NULL; - hsa->index = index; - hsa->port_num = port_num; +err_free_data: + kfree(data); +err_free_stats: + rdma_free_hw_stats_struct(stats); + return ERR_PTR(-ENOMEM); +} - return &hsa->attr; +void ib_device_release_hw_stats(struct hw_stats_device_data *data) +{ + kfree(data->group.attrs); + rdma_free_hw_stats_struct(data->stats); + kfree(data); } -static struct attribute *alloc_hsa_lifespan(char *name, u8 port_num) +int ib_setup_device_attrs(struct ib_device *ibdev) { - struct hw_stats_attribute *hsa; + 
struct hw_stats_device_attribute *attr; + struct hw_stats_device_data *data; + bool opstat_skipped = false; + int i, ret, pos = 0; + + data = alloc_hw_stats_device(ibdev); + if (IS_ERR(data)) { + if (PTR_ERR(data) == -EOPNOTSUPP) + return 0; + return PTR_ERR(data); + } + ibdev->hw_stats_data = data; - hsa = kmalloc(sizeof(*hsa), GFP_KERNEL); - if (!hsa) - return NULL; + ret = ibdev->ops.get_hw_stats(ibdev, data->stats, 0, + data->stats->num_counters); + if (ret != data->stats->num_counters) { + if (WARN_ON(ret >= 0)) + return -EINVAL; + return ret; + } - hsa->attr.name = name; - hsa->attr.mode = S_IWUSR | S_IRUGO; - hsa->show = show_stats_lifespan; - hsa->store = set_stats_lifespan; - hsa->index = 0; - hsa->port_num = port_num; + data->stats->timestamp = jiffies; - return &hsa->attr; + for (i = 0; i < data->stats->num_counters; i++) { + if (data->stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL) { + opstat_skipped = true; + continue; + } + + WARN_ON(opstat_skipped); + attr = &data->attrs[pos]; + sysfs_attr_init(&attr->attr.attr); + attr->attr.attr.name = data->stats->descs[i].name; + attr->attr.attr.mode = 0444; + attr->attr.show = hw_stat_device_show; + attr->show = show_hw_stats; + data->group.attrs[pos] = &attr->attr.attr; + pos++; + } + + attr = &data->attrs[pos]; + sysfs_attr_init(&attr->attr.attr); + attr->attr.attr.name = "lifespan"; + attr->attr.attr.mode = 0644; + attr->attr.show = hw_stat_device_show; + attr->show = show_stats_lifespan; + attr->attr.store = hw_stat_device_store; + attr->store = set_stats_lifespan; + data->group.attrs[pos] = &attr->attr.attr; + for (i = 0; i != ARRAY_SIZE(ibdev->groups); i++) + if (!ibdev->groups[i]) { + ibdev->groups[i] = &data->group; + return 0; + } + WARN(true, "struct ib_device->groups is too small"); + return -EINVAL; } -static void setup_hw_stats(struct ib_device *device, struct ib_port *port, - u8 port_num) +static struct hw_stats_port_data * +alloc_hw_stats_port(struct ib_port *port, struct attribute_group *group) { - struct attribute_group *hsag; + struct ib_device *ibdev = port->ibdev; + struct hw_stats_port_data *data; struct rdma_hw_stats *stats; - int i, ret; - - stats = device->ops.alloc_hw_stats(device, port_num); + if (!ibdev->ops.alloc_hw_port_stats) + return ERR_PTR(-EOPNOTSUPP); + stats = ibdev->ops.alloc_hw_port_stats(port->ibdev, port->port_num); if (!stats) - return; - - if (!stats->names || stats->num_counters <= 0) + return ERR_PTR(-ENOMEM); + if (!stats->descs || stats->num_counters <= 0) goto err_free_stats; /* * Two extra attribue elements here, one for the lifespan entry and * one to NULL terminate the list for the sysfs core code */ - hsag = kzalloc(sizeof(*hsag) + - sizeof(void *) * (stats->num_counters + 2), + data = kzalloc(struct_size(data, attrs, stats->num_counters + 1), GFP_KERNEL); - if (!hsag) + if (!data) goto err_free_stats; + group->attrs = kcalloc(stats->num_counters + 2, + sizeof(*group->attrs), GFP_KERNEL); + if (!group->attrs) + goto err_free_data; - ret = device->ops.get_hw_stats(device, stats, port_num, - stats->num_counters); - if (ret != stats->num_counters) - goto err_free_hsag; + group->name = "hw_counters"; + data->stats = stats; + return data; - stats->timestamp = jiffies; - - hsag->name = "hw_counters"; - hsag->attrs = (void *)hsag + sizeof(*hsag); +err_free_data: + kfree(data); +err_free_stats: + rdma_free_hw_stats_struct(stats); + return ERR_PTR(-ENOMEM); +} - for (i = 0; i < stats->num_counters; i++) { - hsag->attrs[i] = alloc_hsa(i, port_num, stats->names[i]); - if (!hsag->attrs[i]) - goto 
err; - sysfs_attr_init(hsag->attrs[i]); +static int setup_hw_port_stats(struct ib_port *port, + struct attribute_group *group) +{ + struct hw_stats_port_attribute *attr; + struct hw_stats_port_data *data; + bool opstat_skipped = false; + int i, ret, pos = 0; + + data = alloc_hw_stats_port(port, group); + if (IS_ERR(data)) + return PTR_ERR(data); + + ret = port->ibdev->ops.get_hw_stats(port->ibdev, data->stats, + port->port_num, + data->stats->num_counters); + if (ret != data->stats->num_counters) { + if (WARN_ON(ret >= 0)) + return -EINVAL; + return ret; } - mutex_init(&stats->lock); - /* treat an error here as non-fatal */ - hsag->attrs[i] = alloc_hsa_lifespan("lifespan", port_num); - if (hsag->attrs[i]) - sysfs_attr_init(hsag->attrs[i]); + data->stats->timestamp = jiffies; - if (port) { - struct kobject *kobj = &port->kobj; - ret = sysfs_create_group(kobj, hsag); - if (ret) - goto err; - port->hw_stats_ag = hsag; - port->hw_stats = stats; - if (device->port_data) - device->port_data[port_num].hw_stats = stats; - } else { - struct kobject *kobj = &device->dev.kobj; - ret = sysfs_create_group(kobj, hsag); - if (ret) - goto err; - device->hw_stats_ag = hsag; - device->hw_stats = stats; + for (i = 0; i < data->stats->num_counters; i++) { + if (data->stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL) { + opstat_skipped = true; + continue; + } + + WARN_ON(opstat_skipped); + attr = &data->attrs[pos]; + sysfs_attr_init(&attr->attr.attr); + attr->attr.attr.name = data->stats->descs[i].name; + attr->attr.attr.mode = 0444; + attr->attr.show = hw_stat_port_show; + attr->show = show_hw_stats; + group->attrs[pos] = &attr->attr.attr; + pos++; } - return; + attr = &data->attrs[pos]; + sysfs_attr_init(&attr->attr.attr); + attr->attr.attr.name = "lifespan"; + attr->attr.attr.mode = 0644; + attr->attr.show = hw_stat_port_show; + attr->show = show_stats_lifespan; + attr->attr.store = hw_stat_port_store; + attr->store = set_stats_lifespan; + group->attrs[pos] = &attr->attr.attr; + + port->hw_stats_data = data; + return 0; +} -err: - for (; i >= 0; i--) - kfree(hsag->attrs[i]); -err_free_hsag: - kfree(hsag); -err_free_stats: - kfree(stats); - return; +struct rdma_hw_stats *ib_get_hw_stats_port(struct ib_device *ibdev, + u32 port_num) +{ + if (!ibdev->port_data || !rdma_is_port_valid(ibdev, port_num) || + !ibdev->port_data[port_num].sysfs->hw_stats_data) + return NULL; + return ibdev->port_data[port_num].sysfs->hw_stats_data->stats; } -static int add_port(struct ib_core_device *coredev, int port_num) +static int +alloc_port_table_group(const char *name, struct attribute_group *group, + struct port_table_attribute *attrs, size_t num, + ssize_t (*show)(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *, char *buf)) { - struct ib_device *device = rdma_device_to_ibdev(&coredev->dev); - bool is_full_dev = &device->coredev == coredev; - struct ib_port *p; - struct ib_port_attr attr; + struct attribute **attr_list; int i; - int ret; - ret = ib_query_port(device, port_num, &attr); - if (ret) - return ret; - - p = kzalloc(sizeof *p, GFP_KERNEL); - if (!p) + attr_list = kcalloc(num + 1, sizeof(*attr_list), GFP_KERNEL); + if (!attr_list) return -ENOMEM; - p->ibdev = device; - p->port_num = port_num; + for (i = 0; i < num; i++) { + struct port_table_attribute *element = &attrs[i]; - ret = kobject_init_and_add(&p->kobj, &port_type, - coredev->ports_kobj, - "%d", port_num); - if (ret) { - kfree(p); - return ret; - } + if (snprintf(element->name, sizeof(element->name), "%d", i) >= + sizeof(element->name)) + goto 
err; - p->gid_attr_group = kzalloc(sizeof(*p->gid_attr_group), GFP_KERNEL); - if (!p->gid_attr_group) { - ret = -ENOMEM; - goto err_put; - } + sysfs_attr_init(&element->attr.attr); + element->attr.attr.name = element->name; + element->attr.attr.mode = 0444; + element->attr.show = show; + element->index = i; - p->gid_attr_group->port = p; - ret = kobject_init_and_add(&p->gid_attr_group->kobj, &gid_attr_type, - &p->kobj, "gid_attrs"); - if (ret) { - kfree(p->gid_attr_group); - goto err_put; + attr_list[i] = &element->attr.attr; } + group->name = name; + group->attrs = attr_list; + return 0; +err: + kfree(attr_list); + return -EINVAL; +} - if (device->ops.process_mad && is_full_dev) { - p->pma_table = get_counter_table(device, port_num); - ret = sysfs_create_group(&p->kobj, p->pma_table); - if (ret) - goto err_put_gid_attrs; - } +/* + * Create the sysfs: + * ibp0s9/ports/XX/gid_attrs/{ndevs,types}/YYY + * YYY is the gid table index in decimal + */ +static int setup_gid_attrs(struct ib_port *port, + const struct ib_port_attr *attr) +{ + struct gid_attr_group *gid_attr_group; + int ret; - p->gid_group.name = "gids"; - p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len); - if (!p->gid_group.attrs) { - ret = -ENOMEM; - goto err_remove_pma; - } + gid_attr_group = kzalloc(struct_size(gid_attr_group, attrs_list, + attr->gid_tbl_len * 2), + GFP_KERNEL); + if (!gid_attr_group) + return -ENOMEM; + gid_attr_group->port = port; + kobject_init(&gid_attr_group->kobj, &gid_attr_type); - ret = sysfs_create_group(&p->kobj, &p->gid_group); + ret = alloc_port_table_group("ndevs", &gid_attr_group->groups[0], + gid_attr_group->attrs_list, + attr->gid_tbl_len, + show_port_gid_attr_ndev); if (ret) - goto err_free_gid; + goto err_put; + gid_attr_group->groups_list[0] = &gid_attr_group->groups[0]; - p->gid_attr_group->ndev.name = "ndevs"; - p->gid_attr_group->ndev.attrs = alloc_group_attrs(show_port_gid_attr_ndev, - attr.gid_tbl_len); - if (!p->gid_attr_group->ndev.attrs) { - ret = -ENOMEM; - goto err_remove_gid; - } + ret = alloc_port_table_group( + "types", &gid_attr_group->groups[1], + gid_attr_group->attrs_list + attr->gid_tbl_len, + attr->gid_tbl_len, show_port_gid_attr_gid_type); + if (ret) + goto err_put; + gid_attr_group->groups_list[1] = &gid_attr_group->groups[1]; - ret = sysfs_create_group(&p->gid_attr_group->kobj, - &p->gid_attr_group->ndev); + ret = kobject_add(&gid_attr_group->kobj, &port->kobj, "gid_attrs"); if (ret) - goto err_free_gid_ndev; + goto err_put; + ret = sysfs_create_groups(&gid_attr_group->kobj, + gid_attr_group->groups_list); + if (ret) + goto err_del; + port->gid_attr_group = gid_attr_group; + return 0; - p->gid_attr_group->type.name = "types"; - p->gid_attr_group->type.attrs = alloc_group_attrs(show_port_gid_attr_gid_type, - attr.gid_tbl_len); - if (!p->gid_attr_group->type.attrs) { - ret = -ENOMEM; - goto err_remove_gid_ndev; - } +err_del: + kobject_del(&gid_attr_group->kobj); +err_put: + kobject_put(&gid_attr_group->kobj); + return ret; +} - ret = sysfs_create_group(&p->gid_attr_group->kobj, - &p->gid_attr_group->type); - if (ret) - goto err_free_gid_type; +static void destroy_gid_attrs(struct ib_port *port) +{ + struct gid_attr_group *gid_attr_group = port->gid_attr_group; - p->pkey_group.name = "pkeys"; - p->pkey_group.attrs = alloc_group_attrs(show_port_pkey, - attr.pkey_tbl_len); - if (!p->pkey_group.attrs) { - ret = -ENOMEM; - goto err_remove_gid_type; - } + if (!gid_attr_group) + return; + sysfs_remove_groups(&gid_attr_group->kobj, 
gid_attr_group->groups_list); + kobject_del(&gid_attr_group->kobj); + kobject_put(&gid_attr_group->kobj); +} - ret = sysfs_create_group(&p->kobj, &p->pkey_group); +/* + * Create the sysfs: + * ibp0s9/ports/XX/{gids,pkeys,counters}/YYY + */ +static struct ib_port *setup_port(struct ib_core_device *coredev, int port_num, + const struct ib_port_attr *attr) +{ + struct ib_device *device = rdma_device_to_ibdev(&coredev->dev); + bool is_full_dev = &device->coredev == coredev; + const struct attribute_group **cur_group; + struct ib_port *p; + int ret; + + p = kvzalloc(struct_size(p, attrs_list, + attr->gid_tbl_len + attr->pkey_tbl_len), + GFP_KERNEL); + if (!p) + return ERR_PTR(-ENOMEM); + p->ibdev = device; + p->port_num = port_num; + kobject_init(&p->kobj, &port_type); + + cur_group = p->groups_list; + ret = alloc_port_table_group("gids", &p->groups[0], p->attrs_list, + attr->gid_tbl_len, show_port_gid); if (ret) - goto err_free_pkey; + goto err_put; + *cur_group++ = &p->groups[0]; - if (device->ops.init_port && is_full_dev) { - ret = device->ops.init_port(device, port_num, &p->kobj); + if (attr->pkey_tbl_len) { + ret = alloc_port_table_group("pkeys", &p->groups[1], + p->attrs_list + attr->gid_tbl_len, + attr->pkey_tbl_len, show_port_pkey); if (ret) - goto err_remove_pkey; + goto err_put; + *cur_group++ = &p->groups[1]; } /* * If port == 0, it means hw_counters are per device and not per - * port, so holder should be device. Therefore skip per port conunter - * initialization. + * port, so holder should be device. Therefore skip per port + * counter initialization. */ - if (device->ops.alloc_hw_stats && port_num && is_full_dev) - setup_hw_stats(device, p, port_num); - - list_add_tail(&p->kobj.entry, &coredev->port_list); - - kobject_uevent(&p->kobj, KOBJ_ADD); - return 0; - -err_remove_pkey: - sysfs_remove_group(&p->kobj, &p->pkey_group); - -err_free_pkey: - for (i = 0; i < attr.pkey_tbl_len; ++i) - kfree(p->pkey_group.attrs[i]); - - kfree(p->pkey_group.attrs); - p->pkey_group.attrs = NULL; - -err_remove_gid_type: - sysfs_remove_group(&p->gid_attr_group->kobj, - &p->gid_attr_group->type); - -err_free_gid_type: - for (i = 0; i < attr.gid_tbl_len; ++i) - kfree(p->gid_attr_group->type.attrs[i]); - - kfree(p->gid_attr_group->type.attrs); - p->gid_attr_group->type.attrs = NULL; - -err_remove_gid_ndev: - sysfs_remove_group(&p->gid_attr_group->kobj, - &p->gid_attr_group->ndev); - -err_free_gid_ndev: - for (i = 0; i < attr.gid_tbl_len; ++i) - kfree(p->gid_attr_group->ndev.attrs[i]); - - kfree(p->gid_attr_group->ndev.attrs); - p->gid_attr_group->ndev.attrs = NULL; - -err_remove_gid: - sysfs_remove_group(&p->kobj, &p->gid_group); + if (port_num && is_full_dev) { + ret = setup_hw_port_stats(p, &p->groups[2]); + if (ret && ret != -EOPNOTSUPP) + goto err_put; + if (!ret) + *cur_group++ = &p->groups[2]; + } -err_free_gid: - for (i = 0; i < attr.gid_tbl_len; ++i) - kfree(p->gid_group.attrs[i]); + if (device->ops.process_mad && is_full_dev) + *cur_group++ = get_counter_table(device, port_num); - kfree(p->gid_group.attrs); - p->gid_group.attrs = NULL; + ret = kobject_add(&p->kobj, coredev->ports_kobj, "%d", port_num); + if (ret) + goto err_put; + ret = sysfs_create_groups(&p->kobj, p->groups_list); + if (ret) + goto err_del; + if (is_full_dev) { + ret = sysfs_create_groups(&p->kobj, device->ops.port_groups); + if (ret) + goto err_groups; + } -err_remove_pma: - if (p->pma_table) - sysfs_remove_group(&p->kobj, p->pma_table); + list_add_tail(&p->kobj.entry, &coredev->port_list); + if (device->port_data && 
is_full_dev) + device->port_data[port_num].sysfs = p; -err_put_gid_attrs: - kobject_put(&p->gid_attr_group->kobj); + return p; +err_groups: + sysfs_remove_groups(&p->kobj, p->groups_list); +err_del: + kobject_del(&p->kobj); err_put: kobject_put(&p->kobj); - return ret; + return ERR_PTR(ret); +} + +static void destroy_port(struct ib_core_device *coredev, struct ib_port *port) +{ + bool is_full_dev = &port->ibdev->coredev == coredev; + + if (port->ibdev->port_data && + port->ibdev->port_data[port->port_num].sysfs == port) + port->ibdev->port_data[port->port_num].sysfs = NULL; + list_del(&port->kobj.entry); + if (is_full_dev) + sysfs_remove_groups(&port->kobj, port->ibdev->ops.port_groups); + sysfs_remove_groups(&port->kobj, port->groups_list); + kobject_del(&port->kobj); + kobject_put(&port->kobj); +} + +static const char *node_type_string(int node_type) +{ + switch (node_type) { + case RDMA_NODE_IB_CA: + return "CA"; + case RDMA_NODE_IB_SWITCH: + return "switch"; + case RDMA_NODE_IB_ROUTER: + return "router"; + case RDMA_NODE_RNIC: + return "RNIC"; + case RDMA_NODE_USNIC: + return "usNIC"; + case RDMA_NODE_USNIC_UDP: + return "usNIC UDP"; + case RDMA_NODE_UNSPECIFIED: + return "unspecified"; + } + return "<unknown>"; } static ssize_t node_type_show(struct device *device, @@ -1210,16 +1313,8 @@ static ssize_t node_type_show(struct device *device, { struct ib_device *dev = rdma_device_to_ibdev(device); - switch (dev->node_type) { - case RDMA_NODE_IB_CA: return sprintf(buf, "%d: CA\n", dev->node_type); - case RDMA_NODE_RNIC: return sprintf(buf, "%d: RNIC\n", dev->node_type); - case RDMA_NODE_USNIC: return sprintf(buf, "%d: usNIC\n", dev->node_type); - case RDMA_NODE_USNIC_UDP: return sprintf(buf, "%d: usNIC UDP\n", dev->node_type); - case RDMA_NODE_UNSPECIFIED: return sprintf(buf, "%d: unspecified\n", dev->node_type); - case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type); - case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type); - default: return sprintf(buf, "%d: <unknown>\n", dev->node_type); - } + return sysfs_emit(buf, "%u: %s\n", dev->node_type, + node_type_string(dev->node_type)); } static DEVICE_ATTR_RO(node_type); @@ -1227,12 +1322,13 @@ static ssize_t sys_image_guid_show(struct device *device, struct device_attribute *dev_attr, char *buf) { struct ib_device *dev = rdma_device_to_ibdev(device); + __be16 *guid = (__be16 *)&dev->attrs.sys_image_guid; - return sprintf(buf, "%04x:%04x:%04x:%04x\n", - be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[0]), - be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[1]), - be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[2]), - be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[3])); + return sysfs_emit(buf, "%04x:%04x:%04x:%04x\n", + be16_to_cpu(guid[0]), + be16_to_cpu(guid[1]), + be16_to_cpu(guid[2]), + be16_to_cpu(guid[3])); } static DEVICE_ATTR_RO(sys_image_guid); @@ -1240,12 +1336,13 @@ static ssize_t node_guid_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_device *dev = rdma_device_to_ibdev(device); + __be16 *node_guid = (__be16 *)&dev->node_guid; - return sprintf(buf, "%04x:%04x:%04x:%04x\n", - be16_to_cpu(((__be16 *) &dev->node_guid)[0]), - be16_to_cpu(((__be16 *) &dev->node_guid)[1]), - be16_to_cpu(((__be16 *) &dev->node_guid)[2]), - be16_to_cpu(((__be16 *) &dev->node_guid)[3])); + return sysfs_emit(buf, "%04x:%04x:%04x:%04x\n", + be16_to_cpu(node_guid[0]), + be16_to_cpu(node_guid[1]), + be16_to_cpu(node_guid[2]), + be16_to_cpu(node_guid[3])); } 
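
The sysfs.c hunks above repeatedly swap sprintf() for sysfs_emit() in show() callbacks. As a point of reference only (not part of this series), a minimal read-only device attribute following that pattern might look like the sketch below; the attribute name "demo" and the field it reports are hypothetical.

#include <linux/device.h>
#include <linux/sysfs.h>
#include <rdma/ib_verbs.h>

/* Hypothetical attribute, for illustration only -- not in the patch above. */
static ssize_t demo_show(struct device *device,
			 struct device_attribute *attr, char *buf)
{
	struct ib_device *dev = rdma_device_to_ibdev(device);

	/* sysfs_emit() bounds the output to PAGE_SIZE and returns its length */
	return sysfs_emit(buf, "%u\n", dev->phys_port_cnt);
}
static DEVICE_ATTR_RO(demo);
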
static DEVICE_ATTR_RO(node_guid); @@ -1254,7 +1351,7 @@ static ssize_t node_desc_show(struct device *device, { struct ib_device *dev = rdma_device_to_ibdev(device); - return sprintf(buf, "%.64s\n", dev->node_desc); + return sysfs_emit(buf, "%.64s\n", dev->node_desc); } static ssize_t node_desc_store(struct device *device, @@ -1281,10 +1378,11 @@ static ssize_t fw_ver_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_device *dev = rdma_device_to_ibdev(device); + char version[IB_FW_VERSION_NAME_MAX] = {}; - ib_get_device_fw_str(dev, buf); - strlcat(buf, "\n", IB_FW_VERSION_NAME_MAX); - return strlen(buf); + ib_get_device_fw_str(dev, version); + + return sysfs_emit(buf, "%s\n", version); } static DEVICE_ATTR_RO(fw_ver); @@ -1303,30 +1401,13 @@ const struct attribute_group ib_dev_attr_group = { void ib_free_port_attrs(struct ib_core_device *coredev) { - struct ib_device *device = rdma_device_to_ibdev(&coredev->dev); - bool is_full_dev = &device->coredev == coredev; struct kobject *p, *t; list_for_each_entry_safe(p, t, &coredev->port_list, entry) { struct ib_port *port = container_of(p, struct ib_port, kobj); - list_del(&p->entry); - if (port->hw_stats_ag) - free_hsag(&port->kobj, port->hw_stats_ag); - kfree(port->hw_stats); - if (device->port_data && is_full_dev) - device->port_data[port->port_num].hw_stats = NULL; - - if (port->pma_table) - sysfs_remove_group(p, port->pma_table); - sysfs_remove_group(p, &port->pkey_group); - sysfs_remove_group(p, &port->gid_group); - sysfs_remove_group(&port->gid_attr_group->kobj, - &port->gid_attr_group->ndev); - sysfs_remove_group(&port->gid_attr_group->kobj, - &port->gid_attr_group->type); - kobject_put(&port->gid_attr_group->kobj); - kobject_put(p); + destroy_gid_attrs(port); + destroy_port(coredev, port); } kobject_put(coredev->ports_kobj); @@ -1335,7 +1416,7 @@ void ib_free_port_attrs(struct ib_core_device *coredev) int ib_setup_port_attrs(struct ib_core_device *coredev) { struct ib_device *device = rdma_device_to_ibdev(&coredev->dev); - unsigned int port; + u32 port_num; int ret; coredev->ports_kobj = kobject_create_and_add("ports", @@ -1343,12 +1424,24 @@ int ib_setup_port_attrs(struct ib_core_device *coredev) if (!coredev->ports_kobj) return -ENOMEM; - rdma_for_each_port (device, port) { - ret = add_port(coredev, port); + rdma_for_each_port (device, port_num) { + struct ib_port_attr attr; + struct ib_port *port; + + ret = ib_query_port(device, port_num, &attr); if (ret) goto err_put; - } + port = setup_port(coredev, port_num, &attr); + if (IS_ERR(port)) { + ret = PTR_ERR(port); + goto err_put; + } + + ret = setup_gid_attrs(port, &attr); + if (ret) + goto err_put; + } return 0; err_put: @@ -1356,68 +1449,27 @@ err_put: return ret; } -int ib_device_register_sysfs(struct ib_device *device) -{ - int ret; - - ret = ib_setup_port_attrs(&device->coredev); - if (ret) - return ret; - - if (device->ops.alloc_hw_stats) - setup_hw_stats(device, NULL, 0); - - return 0; -} - -void ib_device_unregister_sysfs(struct ib_device *device) -{ - if (device->hw_stats_ag) - free_hsag(&device->dev.kobj, device->hw_stats_ag); - kfree(device->hw_stats); - - ib_free_port_attrs(&device->coredev); -} - /** - * ib_port_register_module_stat - add module counters under relevant port - * of IB device. 
+ * ib_port_register_client_groups - Add an ib_client's attributes to the port * - * @device: IB device to add counters + * @ibdev: IB device to add counters * @port_num: valid port number - * @kobj: pointer to the kobject to initialize - * @ktype: pointer to the ktype for this kobject. - * @name: the name of the kobject + * @groups: Group list of attributes + * + * Do not use. Only for legacy sysfs compatibility. */ -int ib_port_register_module_stat(struct ib_device *device, u8 port_num, - struct kobject *kobj, struct kobj_type *ktype, - const char *name) +int ib_port_register_client_groups(struct ib_device *ibdev, u32 port_num, + const struct attribute_group **groups) { - struct kobject *p, *t; - int ret; - - list_for_each_entry_safe(p, t, &device->coredev.port_list, entry) { - struct ib_port *port = container_of(p, struct ib_port, kobj); - - if (port->port_num != port_num) - continue; - - ret = kobject_init_and_add(kobj, ktype, &port->kobj, "%s", - name); - if (ret) - return ret; - } - - return 0; + return sysfs_create_groups(&ibdev->port_data[port_num].sysfs->kobj, + groups); } -EXPORT_SYMBOL(ib_port_register_module_stat); +EXPORT_SYMBOL(ib_port_register_client_groups); -/** - * ib_port_unregister_module_stat - release module counters - * @kobj: pointer to the kobject to release - */ -void ib_port_unregister_module_stat(struct kobject *kobj) +void ib_port_unregister_client_groups(struct ib_device *ibdev, u32 port_num, + const struct attribute_group **groups) { - kobject_put(kobj); + return sysfs_remove_groups(&ibdev->port_data[port_num].sysfs->kobj, + groups); } -EXPORT_SYMBOL(ib_port_unregister_module_stat); +EXPORT_SYMBOL(ib_port_unregister_client_groups); diff --git a/drivers/infiniband/core/trace.c b/drivers/infiniband/core/trace.c index 6c3514beac4d..31e7860d35bf 100644 --- a/drivers/infiniband/core/trace.c +++ b/drivers/infiniband/core/trace.c @@ -9,6 +9,4 @@ #define CREATE_TRACE_POINTS -#include <rdma/ib_verbs.h> - #include <trace/events/rdma_core.h> diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 0274e9b704be..bf42650f125b 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -52,6 +52,7 @@ #include <rdma/rdma_cm_ib.h> #include <rdma/ib_addr.h> #include <rdma/ib.h> +#include <rdma/ib_cm.h> #include <rdma/rdma_netlink.h> #include "core_priv.h" @@ -79,28 +80,22 @@ struct ucma_file { struct list_head ctx_list; struct list_head event_list; wait_queue_head_t poll_wait; - struct workqueue_struct *close_wq; }; struct ucma_context { u32 id; struct completion comp; - atomic_t ref; + refcount_t ref; int events_reported; - int backlog; + atomic_t backlog; struct ucma_file *file; struct rdma_cm_id *cm_id; + struct mutex mutex; u64 uid; struct list_head list; struct list_head mc_list; - /* mark that device is in process of destroying the internal HW - * resources, protected by the ctx_table lock - */ - int closing; - /* sync between removal event and id destroy, protected by file mut */ - int destroying; struct work_struct close_work; }; @@ -117,17 +112,17 @@ struct ucma_multicast { struct ucma_event { struct ucma_context *ctx; + struct ucma_context *conn_req_ctx; struct ucma_multicast *mc; struct list_head list; - struct rdma_cm_id *cm_id; struct rdma_ucm_event_resp resp; - struct work_struct close_work; }; static DEFINE_XARRAY_ALLOC(ctx_table); static DEFINE_XARRAY_ALLOC(multicast_table); static const struct file_operations ucma_fops; +static int ucma_destroy_private_ctx(struct ucma_context *ctx); static inline struct 
ucma_context *_ucma_find_context(int id, struct ucma_file *file) @@ -137,7 +132,7 @@ static inline struct ucma_context *_ucma_find_context(int id, ctx = xa_load(&ctx_table, id); if (!ctx) ctx = ERR_PTR(-ENOENT); - else if (ctx->file != file || !ctx->cm_id) + else if (ctx->file != file) ctx = ERR_PTR(-EINVAL); return ctx; } @@ -148,19 +143,16 @@ static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id) xa_lock(&ctx_table); ctx = _ucma_find_context(id, file); - if (!IS_ERR(ctx)) { - if (ctx->closing) - ctx = ERR_PTR(-EIO); - else - atomic_inc(&ctx->ref); - } + if (!IS_ERR(ctx)) + if (!refcount_inc_not_zero(&ctx->ref)) + ctx = ERR_PTR(-ENXIO); xa_unlock(&ctx_table); return ctx; } static void ucma_put_ctx(struct ucma_context *ctx) { - if (atomic_dec_and_test(&ctx->ref)) + if (refcount_dec_and_test(&ctx->ref)) complete(&ctx->comp); } @@ -181,26 +173,21 @@ static struct ucma_context *ucma_get_ctx_dev(struct ucma_file *file, int id) return ctx; } -static void ucma_close_event_id(struct work_struct *work) -{ - struct ucma_event *uevent_close = container_of(work, struct ucma_event, close_work); - - rdma_destroy_id(uevent_close->cm_id); - kfree(uevent_close); -} - static void ucma_close_id(struct work_struct *work) { struct ucma_context *ctx = container_of(work, struct ucma_context, close_work); /* once all inflight tasks are finished, we close all underlying * resources. The context is still alive till its explicit destryoing - * by its creator. + * by its creator. This puts back the xarray's reference. */ ucma_put_ctx(ctx); wait_for_completion(&ctx->comp); /* No new events will be generated after destroying the id. */ rdma_destroy_id(ctx->cm_id); + + /* Reading the cm_id without holding a positive ref is not allowed */ + ctx->cm_id = NULL; } static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) @@ -212,40 +199,32 @@ static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) return NULL; INIT_WORK(&ctx->close_work, ucma_close_id); - atomic_set(&ctx->ref, 1); init_completion(&ctx->comp); INIT_LIST_HEAD(&ctx->mc_list); + /* So list_del() will work if we don't do ucma_finish_ctx() */ + INIT_LIST_HEAD(&ctx->list); ctx->file = file; + mutex_init(&ctx->mutex); - if (xa_alloc(&ctx_table, &ctx->id, ctx, xa_limit_32b, GFP_KERNEL)) - goto error; - - list_add_tail(&ctx->list, &file->ctx_list); + if (xa_alloc(&ctx_table, &ctx->id, NULL, xa_limit_32b, GFP_KERNEL)) { + kfree(ctx); + return NULL; + } return ctx; - -error: - kfree(ctx); - return NULL; } -static struct ucma_multicast* ucma_alloc_multicast(struct ucma_context *ctx) +static void ucma_set_ctx_cm_id(struct ucma_context *ctx, + struct rdma_cm_id *cm_id) { - struct ucma_multicast *mc; - - mc = kzalloc(sizeof(*mc), GFP_KERNEL); - if (!mc) - return NULL; - - mc->ctx = ctx; - if (xa_alloc(&multicast_table, &mc->id, NULL, xa_limit_32b, GFP_KERNEL)) - goto error; - - list_add_tail(&mc->list, &ctx->mc_list); - return mc; + refcount_set(&ctx->ref, 1); + ctx->cm_id = cm_id; +} -error: - kfree(mc); - return NULL; +static void ucma_finish_ctx(struct ucma_context *ctx) +{ + lockdep_assert_held(&ctx->file->mut); + list_add_tail(&ctx->list, &ctx->file->ctx_list); + xa_store(&ctx_table, ctx->id, ctx, GFP_KERNEL); } static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst, @@ -255,7 +234,7 @@ static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst, memcpy(dst->private_data, src->private_data, src->private_data_len); dst->private_data_len = src->private_data_len; - dst->responder_resources =src->responder_resources; + 
dst->responder_resources = src->responder_resources; dst->initiator_depth = src->initiator_depth; dst->flow_control = src->flow_control; dst->retry_count = src->retry_count; @@ -277,10 +256,15 @@ static void ucma_copy_ud_event(struct ib_device *device, dst->qkey = src->qkey; } -static void ucma_set_event_context(struct ucma_context *ctx, - struct rdma_cm_event *event, - struct ucma_event *uevent) +static struct ucma_event *ucma_create_uevent(struct ucma_context *ctx, + struct rdma_cm_event *event) { + struct ucma_event *uevent; + + uevent = kzalloc(sizeof(*uevent), GFP_KERNEL); + if (!uevent) + return NULL; + uevent->ctx = ctx; switch (event->event) { case RDMA_CM_EVENT_MULTICAST_JOIN: @@ -295,44 +279,55 @@ static void ucma_set_event_context(struct ucma_context *ctx, uevent->resp.id = ctx->id; break; } + uevent->resp.event = event->event; + uevent->resp.status = event->status; + if (ctx->cm_id->qp_type == IB_QPT_UD) + ucma_copy_ud_event(ctx->cm_id->device, &uevent->resp.param.ud, + &event->param.ud); + else + ucma_copy_conn_event(&uevent->resp.param.conn, + &event->param.conn); + + uevent->resp.ece.vendor_id = event->ece.vendor_id; + uevent->resp.ece.attr_mod = event->ece.attr_mod; + return uevent; } -/* Called with file->mut locked for the relevant context. */ -static void ucma_removal_event_handler(struct rdma_cm_id *cm_id) +static int ucma_connect_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) { - struct ucma_context *ctx = cm_id->context; - struct ucma_event *con_req_eve; - int event_found = 0; + struct ucma_context *listen_ctx = cm_id->context; + struct ucma_context *ctx; + struct ucma_event *uevent; - if (ctx->destroying) - return; + if (!atomic_add_unless(&listen_ctx->backlog, -1, 0)) + return -ENOMEM; + ctx = ucma_alloc_ctx(listen_ctx->file); + if (!ctx) + goto err_backlog; + ucma_set_ctx_cm_id(ctx, cm_id); - /* only if context is pointing to cm_id that it owns it and can be - * queued to be closed, otherwise that cm_id is an inflight one that - * is part of that context event list pending to be detached and - * reattached to its new context as part of ucma_get_event, - * handled separately below. 
- */ - if (ctx->cm_id == cm_id) { - xa_lock(&ctx_table); - ctx->closing = 1; - xa_unlock(&ctx_table); - queue_work(ctx->file->close_wq, &ctx->close_work); - return; - } + uevent = ucma_create_uevent(listen_ctx, event); + if (!uevent) + goto err_alloc; + uevent->conn_req_ctx = ctx; + uevent->resp.id = ctx->id; - list_for_each_entry(con_req_eve, &ctx->file->event_list, list) { - if (con_req_eve->cm_id == cm_id && - con_req_eve->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) { - list_del(&con_req_eve->list); - INIT_WORK(&con_req_eve->close_work, ucma_close_event_id); - queue_work(ctx->file->close_wq, &con_req_eve->close_work); - event_found = 1; - break; - } - } - if (!event_found) - pr_err("ucma_removal_event_handler: warning: connect request event wasn't found\n"); + ctx->cm_id->context = ctx; + + mutex_lock(&ctx->file->mut); + ucma_finish_ctx(ctx); + list_add_tail(&uevent->list, &ctx->file->event_list); + mutex_unlock(&ctx->file->mut); + wake_up_interruptible(&ctx->file->poll_wait); + return 0; + +err_alloc: + ucma_destroy_private_ctx(ctx); +err_backlog: + atomic_inc(&listen_ctx->backlog); + /* Returning error causes the new ID to be destroyed */ + return -ENOMEM; } static int ucma_event_handler(struct rdma_cm_id *cm_id, @@ -340,69 +335,49 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id, { struct ucma_event *uevent; struct ucma_context *ctx = cm_id->context; - int ret = 0; - uevent = kzalloc(sizeof(*uevent), GFP_KERNEL); - if (!uevent) - return event->event == RDMA_CM_EVENT_CONNECT_REQUEST; + if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) + return ucma_connect_event_handler(cm_id, event); - mutex_lock(&ctx->file->mut); - uevent->cm_id = cm_id; - ucma_set_event_context(ctx, event, uevent); - uevent->resp.event = event->event; - uevent->resp.status = event->status; - if (cm_id->qp_type == IB_QPT_UD) - ucma_copy_ud_event(cm_id->device, &uevent->resp.param.ud, - &event->param.ud); - else - ucma_copy_conn_event(&uevent->resp.param.conn, - &event->param.conn); - - if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) { - if (!ctx->backlog) { - ret = -ENOMEM; - kfree(uevent); - goto out; - } - ctx->backlog--; - } else if (!ctx->uid || ctx->cm_id != cm_id) { - /* - * We ignore events for new connections until userspace has set - * their context. This can only happen if an error occurs on a - * new connection before the user accepts it. This is okay, - * since the accept will just fail later. However, we do need - * to release the underlying HW resources in case of a device - * removal event. - */ - if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) - ucma_removal_event_handler(cm_id); - - kfree(uevent); - goto out; + /* + * We ignore events for new connections until userspace has set their + * context. This can only happen if an error occurs on a new connection + * before the user accepts it. This is okay, since the accept will just + * fail later. However, we do need to release the underlying HW + * resources in case of a device removal event. 
+ */ + if (ctx->uid) { + uevent = ucma_create_uevent(ctx, event); + if (!uevent) + return 0; + + mutex_lock(&ctx->file->mut); + list_add_tail(&uevent->list, &ctx->file->event_list); + mutex_unlock(&ctx->file->mut); + wake_up_interruptible(&ctx->file->poll_wait); } - list_add_tail(&uevent->list, &ctx->file->event_list); - wake_up_interruptible(&ctx->file->poll_wait); - if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) - ucma_removal_event_handler(cm_id); -out: - mutex_unlock(&ctx->file->mut); - return ret; + if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) { + xa_lock(&ctx_table); + if (xa_load(&ctx_table, ctx->id) == ctx) + queue_work(system_unbound_wq, &ctx->close_work); + xa_unlock(&ctx_table); + } + return 0; } static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { - struct ucma_context *ctx; struct rdma_ucm_get_event cmd; struct ucma_event *uevent; - int ret = 0; /* * Old 32 bit user space does not send the 4 byte padding in the * reserved field. We don't care, allow it to keep working. */ - if (out_len < sizeof(uevent->resp) - sizeof(uevent->resp.reserved)) + if (out_len < sizeof(uevent->resp) - sizeof(uevent->resp.reserved) - + sizeof(uevent->resp.ece)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) @@ -422,35 +397,25 @@ static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf, mutex_lock(&file->mut); } - uevent = list_entry(file->event_list.next, struct ucma_event, list); - - if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) { - ctx = ucma_alloc_ctx(file); - if (!ctx) { - ret = -ENOMEM; - goto done; - } - uevent->ctx->backlog++; - ctx->cm_id = uevent->cm_id; - ctx->cm_id->context = ctx; - uevent->resp.id = ctx->id; - } + uevent = list_first_entry(&file->event_list, struct ucma_event, list); if (copy_to_user(u64_to_user_ptr(cmd.response), &uevent->resp, min_t(size_t, out_len, sizeof(uevent->resp)))) { - ret = -EFAULT; - goto done; + mutex_unlock(&file->mut); + return -EFAULT; } list_del(&uevent->list); uevent->ctx->events_reported++; if (uevent->mc) uevent->mc->events_reported++; - kfree(uevent); -done: + if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) + atomic_inc(&uevent->ctx->backlog); mutex_unlock(&file->mut); - return ret; + + kfree(uevent); + return 0; } static int ucma_get_qp_type(struct rdma_ucm_create_id *cmd, enum ib_qp_type *qp_type) @@ -491,38 +456,32 @@ static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, if (ret) return ret; - mutex_lock(&file->mut); ctx = ucma_alloc_ctx(file); - mutex_unlock(&file->mut); if (!ctx) return -ENOMEM; ctx->uid = cmd.uid; - cm_id = __rdma_create_id(current->nsproxy->net_ns, - ucma_event_handler, ctx, cmd.ps, qp_type, NULL); + cm_id = rdma_create_user_id(ucma_event_handler, ctx, cmd.ps, qp_type); if (IS_ERR(cm_id)) { ret = PTR_ERR(cm_id); goto err1; } + ucma_set_ctx_cm_id(ctx, cm_id); resp.id = ctx->id; if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) { ret = -EFAULT; - goto err2; + goto err1; } - ctx->cm_id = cm_id; + mutex_lock(&file->mut); + ucma_finish_ctx(ctx); + mutex_unlock(&file->mut); return 0; -err2: - rdma_destroy_id(cm_id); err1: - xa_erase(&ctx_table, ctx->id); - mutex_lock(&file->mut); - list_del(&ctx->list); - mutex_unlock(&file->mut); - kfree(ctx); + ucma_destroy_private_ctx(ctx); return ret; } @@ -530,19 +489,25 @@ static void ucma_cleanup_multicast(struct ucma_context *ctx) { struct ucma_multicast *mc, *tmp; - mutex_lock(&ctx->file->mut); + xa_lock(&multicast_table); 
list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) { list_del(&mc->list); - xa_erase(&multicast_table, mc->id); + /* + * At this point mc->ctx->ref is 0 so the mc cannot leave the + * lock on the reader and this is enough serialization + */ + __xa_erase(&multicast_table, mc->id); kfree(mc); } - mutex_unlock(&ctx->file->mut); + xa_unlock(&multicast_table); } static void ucma_cleanup_mc_events(struct ucma_multicast *mc) { struct ucma_event *uevent, *tmp; + rdma_lock_handler(mc->ctx->cm_id); + mutex_lock(&mc->ctx->file->mut); list_for_each_entry_safe(uevent, tmp, &mc->ctx->file->event_list, list) { if (uevent->mc != mc) continue; @@ -550,45 +515,75 @@ static void ucma_cleanup_mc_events(struct ucma_multicast *mc) list_del(&uevent->list); kfree(uevent); } + mutex_unlock(&mc->ctx->file->mut); + rdma_unlock_handler(mc->ctx->cm_id); } -/* - * ucma_free_ctx is called after the underlying rdma CM-ID is destroyed. At - * this point, no new events will be reported from the hardware. However, we - * still need to cleanup the UCMA context for this ID. Specifically, there - * might be events that have not yet been consumed by the user space software. - * These might include pending connect requests which we have not completed - * processing. We cannot call rdma_destroy_id while holding the lock of the - * context (file->mut), as it might cause a deadlock. We therefore extract all - * relevant events from the context pending events list while holding the - * mutex. After that we release them as needed. - */ -static int ucma_free_ctx(struct ucma_context *ctx) +static int ucma_cleanup_ctx_events(struct ucma_context *ctx) { int events_reported; struct ucma_event *uevent, *tmp; LIST_HEAD(list); - - ucma_cleanup_multicast(ctx); - - /* Cleanup events not yet reported to the user. */ + /* Cleanup events not yet reported to the user.*/ mutex_lock(&ctx->file->mut); list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) { - if (uevent->ctx == ctx) + if (uevent->ctx != ctx) + continue; + + if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST && + xa_cmpxchg(&ctx_table, uevent->conn_req_ctx->id, + uevent->conn_req_ctx, XA_ZERO_ENTRY, + GFP_KERNEL) == uevent->conn_req_ctx) { list_move_tail(&uevent->list, &list); + continue; + } + list_del(&uevent->list); + kfree(uevent); } list_del(&ctx->list); + events_reported = ctx->events_reported; mutex_unlock(&ctx->file->mut); + /* + * If this was a listening ID then any connections spawned from it that + * have not been delivered to userspace are cleaned up too. Must be done + * outside any locks. + */ list_for_each_entry_safe(uevent, tmp, &list, list) { - list_del(&uevent->list); - if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) - rdma_destroy_id(uevent->cm_id); + ucma_destroy_private_ctx(uevent->conn_req_ctx); kfree(uevent); } + return events_reported; +} - events_reported = ctx->events_reported; +/* + * When this is called the xarray must have a XA_ZERO_ENTRY in the ctx->id (ie + * the ctx is not public to the user). This either because: + * - ucma_finish_ctx() hasn't been called + * - xa_cmpxchg() succeed to remove the entry (only one thread can succeed) + */ +static int ucma_destroy_private_ctx(struct ucma_context *ctx) +{ + int events_reported; + + /* + * Destroy the underlying cm_id. New work queuing is prevented now by + * the removal from the xarray. 
Once the work is cancled ref will either + * be 0 because the work ran to completion and consumed the ref from the + * xarray, or it will be positive because we still have the ref from the + * xarray. This can also be 0 in cases where cm_id was never set + */ + cancel_work_sync(&ctx->close_work); + if (refcount_read(&ctx->ref)) + ucma_close_id(&ctx->close_work); + + events_reported = ucma_cleanup_ctx_events(ctx); + ucma_cleanup_multicast(ctx); + + WARN_ON(xa_cmpxchg(&ctx_table, ctx->id, XA_ZERO_ENTRY, NULL, + GFP_KERNEL) != NULL); + mutex_destroy(&ctx->mutex); kfree(ctx); return events_reported; } @@ -609,31 +604,17 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf, xa_lock(&ctx_table); ctx = _ucma_find_context(cmd.id, file); - if (!IS_ERR(ctx)) - __xa_erase(&ctx_table, ctx->id); + if (!IS_ERR(ctx)) { + if (__xa_cmpxchg(&ctx_table, ctx->id, ctx, XA_ZERO_ENTRY, + GFP_KERNEL) != ctx) + ctx = ERR_PTR(-ENOENT); + } xa_unlock(&ctx_table); if (IS_ERR(ctx)) return PTR_ERR(ctx); - mutex_lock(&ctx->file->mut); - ctx->destroying = 1; - mutex_unlock(&ctx->file->mut); - - flush_workqueue(ctx->file->close_wq); - /* At this point it's guaranteed that there is no inflight - * closing task */ - xa_lock(&ctx_table); - if (!ctx->closing) { - xa_unlock(&ctx_table); - ucma_put_ctx(ctx); - wait_for_completion(&ctx->comp); - rdma_destroy_id(ctx->cm_id); - } else { - xa_unlock(&ctx_table); - } - - resp.events_reported = ucma_free_ctx(ctx); + resp.events_reported = ucma_destroy_private_ctx(ctx); if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) ret = -EFAULT; @@ -658,7 +639,10 @@ static ssize_t ucma_bind_ip(struct ucma_file *file, const char __user *inbuf, if (IS_ERR(ctx)) return PTR_ERR(ctx); + mutex_lock(&ctx->mutex); ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr); + mutex_unlock(&ctx->mutex); + ucma_put_ctx(ctx); return ret; } @@ -681,7 +665,9 @@ static ssize_t ucma_bind(struct ucma_file *file, const char __user *inbuf, if (IS_ERR(ctx)) return PTR_ERR(ctx); + mutex_lock(&ctx->mutex); ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr); + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } @@ -705,8 +691,10 @@ static ssize_t ucma_resolve_ip(struct ucma_file *file, if (IS_ERR(ctx)) return PTR_ERR(ctx); + mutex_lock(&ctx->mutex); ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr, (struct sockaddr *) &cmd.dst_addr, cmd.timeout_ms); + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } @@ -731,8 +719,10 @@ static ssize_t ucma_resolve_addr(struct ucma_file *file, if (IS_ERR(ctx)) return PTR_ERR(ctx); + mutex_lock(&ctx->mutex); ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr, (struct sockaddr *) &cmd.dst_addr, cmd.timeout_ms); + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } @@ -752,7 +742,9 @@ static ssize_t ucma_resolve_route(struct ucma_file *file, if (IS_ERR(ctx)) return PTR_ERR(ctx); + mutex_lock(&ctx->mutex); ret = rdma_resolve_route(ctx->cm_id, cmd.timeout_ms); + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } @@ -762,8 +754,8 @@ static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp, { struct rdma_dev_addr *dev_addr; - resp->num_paths = route->num_paths; - switch (route->num_paths) { + resp->num_paths = route->num_pri_alt_paths; + switch (route->num_pri_alt_paths) { case 0: dev_addr = &route->addr.dev_addr; rdma_addr_get_dgid(dev_addr, @@ -775,7 +767,7 @@ static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp, case 2: 
ib_copy_path_rec_to_user(&resp->ib_route[1], &route->path_rec[1]); - /* fall through */ + fallthrough; case 1: ib_copy_path_rec_to_user(&resp->ib_route[0], &route->path_rec[0]); @@ -789,8 +781,8 @@ static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp, struct rdma_route *route) { - resp->num_paths = route->num_paths; - switch (route->num_paths) { + resp->num_paths = route->num_pri_alt_paths; + switch (route->num_pri_alt_paths) { case 0: rdma_ip2gid((struct sockaddr *)&route->addr.dst_addr, (union ib_gid *)&resp->ib_route[0].dgid); @@ -801,7 +793,7 @@ static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp, case 2: ib_copy_path_rec_to_user(&resp->ib_route[1], &route->path_rec[1]); - /* fall through */ + fallthrough; case 1: ib_copy_path_rec_to_user(&resp->ib_route[0], &route->path_rec[0]); @@ -831,7 +823,7 @@ static ssize_t ucma_query_route(struct ucma_file *file, struct sockaddr *addr; int ret = 0; - if (out_len < sizeof(resp)) + if (out_len < offsetof(struct rdma_ucm_query_route_resp, ibdev_index)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) @@ -841,6 +833,7 @@ static ssize_t ucma_query_route(struct ucma_file *file, if (IS_ERR(ctx)) return PTR_ERR(ctx); + mutex_lock(&ctx->mutex); memset(&resp, 0, sizeof resp); addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr; memcpy(&resp.src_addr, addr, addr->sa_family == AF_INET ? @@ -854,6 +847,7 @@ static ssize_t ucma_query_route(struct ucma_file *file, goto out; resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid; + resp.ibdev_index = ctx->cm_id->device->index; resp.port_num = ctx->cm_id->port_num; if (rdma_cap_ib_sa(ctx->cm_id->device, ctx->cm_id->port_num)) @@ -864,8 +858,9 @@ static ssize_t ucma_query_route(struct ucma_file *file, ucma_copy_iw_route(&resp, &ctx->cm_id->route); out: - if (copy_to_user(u64_to_user_ptr(cmd.response), - &resp, sizeof(resp))) + mutex_unlock(&ctx->mutex); + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, + min_t(size_t, out_len, sizeof(resp)))) ret = -EFAULT; ucma_put_ctx(ctx); @@ -879,6 +874,7 @@ static void ucma_query_device_addr(struct rdma_cm_id *cm_id, return; resp->node_guid = (__force __u64) cm_id->device->node_guid; + resp->ibdev_index = cm_id->device->index; resp->port_num = cm_id->port_num; resp->pkey = (__force __u16) cpu_to_be16( ib_addr_get_pkey(&cm_id->route.addr.dev_addr)); @@ -891,7 +887,7 @@ static ssize_t ucma_query_addr(struct ucma_context *ctx, struct sockaddr *addr; int ret = 0; - if (out_len < sizeof(resp)) + if (out_len < offsetof(struct rdma_ucm_query_addr_resp, ibdev_index)) return -ENOSPC; memset(&resp, 0, sizeof resp); @@ -906,7 +902,7 @@ static ssize_t ucma_query_addr(struct ucma_context *ctx, ucma_query_device_addr(ctx->cm_id, &resp); - if (copy_to_user(response, &resp, sizeof(resp))) + if (copy_to_user(response, &resp, min_t(size_t, out_len, sizeof(resp)))) ret = -EFAULT; return ret; @@ -925,7 +921,7 @@ static ssize_t ucma_query_path(struct ucma_context *ctx, if (!resp) return -ENOMEM; - resp->num_paths = ctx->cm_id->route.num_paths; + resp->num_paths = ctx->cm_id->route.num_pri_alt_paths; for (i = 0, out_len -= sizeof(*resp); i < resp->num_paths && out_len > sizeof(struct ib_path_rec_data); i++, out_len -= sizeof(struct ib_path_rec_data)) { @@ -958,7 +954,7 @@ static ssize_t ucma_query_gid(struct ucma_context *ctx, struct sockaddr_ib *addr; int ret = 0; - if (out_len < sizeof(resp)) + if (out_len < offsetof(struct rdma_ucm_query_addr_resp, ibdev_index)) return -ENOSPC; memset(&resp, 0, sizeof resp); @@ -991,7 
+987,7 @@ static ssize_t ucma_query_gid(struct ucma_context *ctx, &ctx->cm_id->route.addr.dst_addr); } - if (copy_to_user(response, &resp, sizeof(resp))) + if (copy_to_user(response, &resp, min_t(size_t, out_len, sizeof(resp)))) ret = -EFAULT; return ret; @@ -1014,6 +1010,7 @@ static ssize_t ucma_query(struct ucma_file *file, if (IS_ERR(ctx)) return PTR_ERR(ctx); + mutex_lock(&ctx->mutex); switch (cmd.option) { case RDMA_USER_CM_QUERY_ADDR: ret = ucma_query_addr(ctx, response, out_len); @@ -1028,6 +1025,7 @@ static ssize_t ucma_query(struct ucma_file *file, ret = -ENOSYS; break; } + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; @@ -1039,25 +1037,30 @@ static void ucma_copy_conn_param(struct rdma_cm_id *id, { dst->private_data = src->private_data; dst->private_data_len = src->private_data_len; - dst->responder_resources =src->responder_resources; + dst->responder_resources = src->responder_resources; dst->initiator_depth = src->initiator_depth; dst->flow_control = src->flow_control; dst->retry_count = src->retry_count; dst->rnr_retry_count = src->rnr_retry_count; dst->srq = src->srq; - dst->qp_num = src->qp_num; + dst->qp_num = src->qp_num & 0xFFFFFF; dst->qkey = (id->route.addr.src_addr.ss_family == AF_IB) ? src->qkey : 0; } static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { - struct rdma_ucm_connect cmd; struct rdma_conn_param conn_param; + struct rdma_ucm_ece ece = {}; + struct rdma_ucm_connect cmd; struct ucma_context *ctx; + size_t in_size; int ret; - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + if (in_len < offsetofend(typeof(cmd), reserved)) + return -EINVAL; + in_size = min_t(size_t, in_len, sizeof(cmd)); + if (copy_from_user(&cmd, inbuf, in_size)) return -EFAULT; if (!cmd.conn_param.valid) @@ -1068,7 +1071,14 @@ static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf, return PTR_ERR(ctx); ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); - ret = rdma_connect(ctx->cm_id, &conn_param); + if (offsetofend(typeof(cmd), ece) <= in_size) { + ece.vendor_id = cmd.ece.vendor_id; + ece.attr_mod = cmd.ece.attr_mod; + } + + mutex_lock(&ctx->mutex); + ret = rdma_connect_ece(ctx->cm_id, &conn_param, &ece); + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } @@ -1087,9 +1097,13 @@ static ssize_t ucma_listen(struct ucma_file *file, const char __user *inbuf, if (IS_ERR(ctx)) return PTR_ERR(ctx); - ctx->backlog = cmd.backlog > 0 && cmd.backlog < max_backlog ? 
- cmd.backlog : max_backlog; - ret = rdma_listen(ctx->cm_id, ctx->backlog); + if (cmd.backlog <= 0 || cmd.backlog > max_backlog) + cmd.backlog = max_backlog; + atomic_set(&ctx->backlog, cmd.backlog); + + mutex_lock(&ctx->mutex); + ret = rdma_listen(ctx->cm_id, cmd.backlog); + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } @@ -1099,26 +1113,44 @@ static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf, { struct rdma_ucm_accept cmd; struct rdma_conn_param conn_param; + struct rdma_ucm_ece ece = {}; struct ucma_context *ctx; + size_t in_size; int ret; - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + if (in_len < offsetofend(typeof(cmd), reserved)) + return -EINVAL; + in_size = min_t(size_t, in_len, sizeof(cmd)); + if (copy_from_user(&cmd, inbuf, in_size)) return -EFAULT; ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); + if (offsetofend(typeof(cmd), ece) <= in_size) { + ece.vendor_id = cmd.ece.vendor_id; + ece.attr_mod = cmd.ece.attr_mod; + } + if (cmd.conn_param.valid) { ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); - mutex_lock(&file->mut); - ret = __rdma_accept(ctx->cm_id, &conn_param, NULL); - if (!ret) + mutex_lock(&ctx->mutex); + rdma_lock_handler(ctx->cm_id); + ret = rdma_accept_ece(ctx->cm_id, &conn_param, &ece); + if (!ret) { + /* The uid must be set atomically with the handler */ ctx->uid = cmd.uid; - mutex_unlock(&file->mut); - } else - ret = __rdma_accept(ctx->cm_id, NULL, NULL); - + } + rdma_unlock_handler(ctx->cm_id); + mutex_unlock(&ctx->mutex); + } else { + mutex_lock(&ctx->mutex); + rdma_lock_handler(ctx->cm_id); + ret = rdma_accept_ece(ctx->cm_id, NULL, &ece); + rdma_unlock_handler(ctx->cm_id); + mutex_unlock(&ctx->mutex); + } ucma_put_ctx(ctx); return ret; } @@ -1133,11 +1165,25 @@ static ssize_t ucma_reject(struct ucma_file *file, const char __user *inbuf, if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; + if (!cmd.reason) + cmd.reason = IB_CM_REJ_CONSUMER_DEFINED; + + switch (cmd.reason) { + case IB_CM_REJ_CONSUMER_DEFINED: + case IB_CM_REJ_VENDOR_OPTION_NOT_SUPPORTED: + break; + default: + return -EINVAL; + } + ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); - ret = rdma_reject(ctx->cm_id, cmd.private_data, cmd.private_data_len); + mutex_lock(&ctx->mutex); + ret = rdma_reject(ctx->cm_id, cmd.private_data, cmd.private_data_len, + cmd.reason); + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } @@ -1156,7 +1202,9 @@ static ssize_t ucma_disconnect(struct ucma_file *file, const char __user *inbuf, if (IS_ERR(ctx)) return PTR_ERR(ctx); + mutex_lock(&ctx->mutex); ret = rdma_disconnect(ctx->cm_id); + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } @@ -1187,7 +1235,9 @@ static ssize_t ucma_init_qp_attr(struct ucma_file *file, resp.qp_attr_mask = 0; memset(&qp_attr, 0, sizeof qp_attr); qp_attr.qp_state = cmd.qp_state; + mutex_lock(&ctx->mutex); ret = rdma_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask); + mutex_unlock(&ctx->mutex); if (ret) goto out; @@ -1273,9 +1323,13 @@ static int ucma_set_ib_path(struct ucma_context *ctx, struct sa_path_rec opa; sa_convert_path_ib_to_opa(&opa, &sa_path); + mutex_lock(&ctx->mutex); ret = rdma_set_ib_path(ctx->cm_id, &opa); + mutex_unlock(&ctx->mutex); } else { + mutex_lock(&ctx->mutex); ret = rdma_set_ib_path(ctx->cm_id, &sa_path); + mutex_unlock(&ctx->mutex); } if (ret) return ret; @@ -1308,7 +1362,9 @@ static int ucma_set_option_level(struct ucma_context *ctx, int level, switch (level) { case 
RDMA_OPTION_ID: + mutex_lock(&ctx->mutex); ret = ucma_set_option_id(ctx, optname, optval, optlen); + mutex_unlock(&ctx->mutex); break; case RDMA_OPTION_IB: ret = ucma_set_option_ib(ctx, optname, optval, optlen); @@ -1368,8 +1424,10 @@ static ssize_t ucma_notify(struct ucma_file *file, const char __user *inbuf, if (IS_ERR(ctx)) return PTR_ERR(ctx); + mutex_lock(&ctx->mutex); if (ctx->cm_id->device) ret = rdma_notify(ctx->cm_id, (enum ib_event_type)cmd.event); + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; @@ -1403,42 +1461,59 @@ static ssize_t ucma_process_join(struct ucma_file *file, if (IS_ERR(ctx)) return PTR_ERR(ctx); - mutex_lock(&file->mut); - mc = ucma_alloc_multicast(ctx); + mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) { ret = -ENOMEM; - goto err1; + goto err_put_ctx; } + + mc->ctx = ctx; mc->join_state = join_state; mc->uid = cmd->uid; memcpy(&mc->addr, addr, cmd->addr_size); + + xa_lock(&multicast_table); + if (__xa_alloc(&multicast_table, &mc->id, NULL, xa_limit_32b, + GFP_KERNEL)) { + ret = -ENOMEM; + goto err_free_mc; + } + + list_add_tail(&mc->list, &ctx->mc_list); + xa_unlock(&multicast_table); + + mutex_lock(&ctx->mutex); ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *)&mc->addr, join_state, mc); + mutex_unlock(&ctx->mutex); if (ret) - goto err2; + goto err_xa_erase; resp.id = mc->id; if (copy_to_user(u64_to_user_ptr(cmd->response), &resp, sizeof(resp))) { ret = -EFAULT; - goto err3; + goto err_leave_multicast; } xa_store(&multicast_table, mc->id, mc, 0); - mutex_unlock(&file->mut); ucma_put_ctx(ctx); return 0; -err3: +err_leave_multicast: + mutex_lock(&ctx->mutex); rdma_leave_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr); + mutex_unlock(&ctx->mutex); ucma_cleanup_mc_events(mc); -err2: - xa_erase(&multicast_table, mc->id); +err_xa_erase: + xa_lock(&multicast_table); list_del(&mc->list); + __xa_erase(&multicast_table, mc->id); +err_free_mc: + xa_unlock(&multicast_table); kfree(mc); -err1: - mutex_unlock(&file->mut); +err_put_ctx: ucma_put_ctx(ctx); return ret; } @@ -1500,24 +1575,26 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file, mc = xa_load(&multicast_table, cmd.id); if (!mc) mc = ERR_PTR(-ENOENT); - else if (mc->ctx->file != file) + else if (READ_ONCE(mc->ctx->file) != file) mc = ERR_PTR(-EINVAL); - else if (!atomic_inc_not_zero(&mc->ctx->ref)) + else if (!refcount_inc_not_zero(&mc->ctx->ref)) mc = ERR_PTR(-ENXIO); - else - __xa_erase(&multicast_table, mc->id); - xa_unlock(&multicast_table); if (IS_ERR(mc)) { + xa_unlock(&multicast_table); ret = PTR_ERR(mc); goto out; } + list_del(&mc->list); + __xa_erase(&multicast_table, mc->id); + xa_unlock(&multicast_table); + + mutex_lock(&mc->ctx->mutex); rdma_leave_multicast(mc->ctx->cm_id, (struct sockaddr *) &mc->addr); - mutex_lock(&mc->ctx->file->mut); + mutex_unlock(&mc->ctx->mutex); + ucma_cleanup_mc_events(mc); - list_del(&mc->list); - mutex_unlock(&mc->ctx->file->mut); ucma_put_ctx(mc->ctx); resp.events_reported = mc->events_reported; @@ -1530,45 +1607,15 @@ out: return ret; } -static void ucma_lock_files(struct ucma_file *file1, struct ucma_file *file2) -{ - /* Acquire mutex's based on pointer comparison to prevent deadlock. 
*/ - if (file1 < file2) { - mutex_lock(&file1->mut); - mutex_lock_nested(&file2->mut, SINGLE_DEPTH_NESTING); - } else { - mutex_lock(&file2->mut); - mutex_lock_nested(&file1->mut, SINGLE_DEPTH_NESTING); - } -} - -static void ucma_unlock_files(struct ucma_file *file1, struct ucma_file *file2) -{ - if (file1 < file2) { - mutex_unlock(&file2->mut); - mutex_unlock(&file1->mut); - } else { - mutex_unlock(&file1->mut); - mutex_unlock(&file2->mut); - } -} - -static void ucma_move_events(struct ucma_context *ctx, struct ucma_file *file) -{ - struct ucma_event *uevent, *tmp; - - list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) - if (uevent->ctx == ctx) - list_move_tail(&uevent->list, &file->event_list); -} - static ssize_t ucma_migrate_id(struct ucma_file *new_file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_migrate_id cmd; struct rdma_ucm_migrate_resp resp; + struct ucma_event *uevent, *tmp; struct ucma_context *ctx; + LIST_HEAD(event_list); struct fd f; struct ucma_file *cur_file; int ret = 0; @@ -1584,40 +1631,53 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file, ret = -EINVAL; goto file_put; } + cur_file = f.file->private_data; /* Validate current fd and prevent destruction of id. */ - ctx = ucma_get_ctx(f.file->private_data, cmd.id); + ctx = ucma_get_ctx(cur_file, cmd.id); if (IS_ERR(ctx)) { ret = PTR_ERR(ctx); goto file_put; } - cur_file = ctx->file; - if (cur_file == new_file) { - resp.events_reported = ctx->events_reported; - goto response; - } - + rdma_lock_handler(ctx->cm_id); /* - * Migrate events between fd's, maintaining order, and avoiding new - * events being added before existing events. + * ctx->file can only be changed under the handler & xa_lock. xa_load() + * must be checked again to ensure the ctx hasn't begun destruction + * since the ucma_get_ctx(). */ - ucma_lock_files(cur_file, new_file); xa_lock(&ctx_table); - - list_move_tail(&ctx->list, &new_file->ctx_list); - ucma_move_events(ctx, new_file); + if (_ucma_find_context(cmd.id, cur_file) != ctx) { + xa_unlock(&ctx_table); + ret = -ENOENT; + goto err_unlock; + } ctx->file = new_file; + xa_unlock(&ctx_table); + + mutex_lock(&cur_file->mut); + list_del(&ctx->list); + /* + * At this point lock_handler() prevents addition of new uevents for + * this ctx. 
+ */ + list_for_each_entry_safe(uevent, tmp, &cur_file->event_list, list) + if (uevent->ctx == ctx) + list_move_tail(&uevent->list, &event_list); resp.events_reported = ctx->events_reported; + mutex_unlock(&cur_file->mut); - xa_unlock(&ctx_table); - ucma_unlock_files(cur_file, new_file); + mutex_lock(&new_file->mut); + list_add_tail(&ctx->list, &new_file->ctx_list); + list_splice_tail(&event_list, &new_file->event_list); + mutex_unlock(&new_file->mut); -response: if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) ret = -EFAULT; +err_unlock: + rdma_unlock_handler(ctx->cm_id); ucma_put_ctx(ctx); file_put: fdput(f); @@ -1660,8 +1720,8 @@ static ssize_t ucma_write(struct file *filp, const char __user *buf, ssize_t ret; if (!ib_safe_file_access(filp)) { - pr_err_once("ucma_write: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n", - task_tgid_vnr(current), current->comm); + pr_err_once("%s: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n", + __func__, task_tgid_vnr(current), current->comm); return -EACCES; } @@ -1717,13 +1777,6 @@ static int ucma_open(struct inode *inode, struct file *filp) if (!file) return -ENOMEM; - file->close_wq = alloc_ordered_workqueue("ucma_close_id", - WQ_MEM_RECLAIM); - if (!file->close_wq) { - kfree(file); - return -ENOMEM; - } - INIT_LIST_HEAD(&file->event_list); INIT_LIST_HEAD(&file->ctx_list); init_waitqueue_head(&file->poll_wait); @@ -1738,37 +1791,23 @@ static int ucma_open(struct inode *inode, struct file *filp) static int ucma_close(struct inode *inode, struct file *filp) { struct ucma_file *file = filp->private_data; - struct ucma_context *ctx, *tmp; - - mutex_lock(&file->mut); - list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) { - ctx->destroying = 1; - mutex_unlock(&file->mut); - xa_erase(&ctx_table, ctx->id); - flush_workqueue(file->close_wq); - /* At that step once ctx was marked as destroying and workqueue - * was flushed we are safe from any inflights handlers that - * might put other closing task. - */ - xa_lock(&ctx_table); - if (!ctx->closing) { - xa_unlock(&ctx_table); - ucma_put_ctx(ctx); - wait_for_completion(&ctx->comp); - /* rdma_destroy_id ensures that no event handlers are - * inflight for that id before releasing it. - */ - rdma_destroy_id(ctx->cm_id); - } else { - xa_unlock(&ctx_table); - } + /* + * All paths that touch ctx_list or ctx_list starting from write() are + * prevented by this being a FD release function. The list_add_tail() in + * ucma_connect_event_handler() can run concurrently, however it only + * adds to the list *after* a listening ID. By only reading the first of + * the list, and relying on ucma_destroy_private_ctx() to block + * ucma_connect_event_handler(), no additional locking is needed. 
+ */ + while (!list_empty(&file->ctx_list)) { + struct ucma_context *ctx = list_first_entry( + &file->ctx_list, struct ucma_context, list); - ucma_free_ctx(ctx); - mutex_lock(&file->mut); + WARN_ON(xa_cmpxchg(&ctx_table, ctx->id, ctx, XA_ZERO_ENTRY, + GFP_KERNEL) != ctx); + ucma_destroy_private_ctx(ctx); } - mutex_unlock(&file->mut); - destroy_workqueue(file->close_wq); kfree(file); return 0; } @@ -1803,13 +1842,12 @@ static struct ib_client rdma_cma_client = { }; MODULE_ALIAS_RDMA_CLIENT("rdma_cm"); -static ssize_t show_abi_version(struct device *dev, - struct device_attribute *attr, - char *buf) +static ssize_t abi_version_show(struct device *dev, + struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", RDMA_USER_CM_ABI_VERSION); + return sysfs_emit(buf, "%d\n", RDMA_USER_CM_ABI_VERSION); } -static DEVICE_ATTR(abi_version, S_IRUGO, show_abi_version, NULL); +static DEVICE_ATTR_RO(abi_version); static int __init ucma_init(void) { diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c index 29a45d2f8898..64d9c492de64 100644 --- a/drivers/infiniband/core/ud_header.c +++ b/drivers/infiniband/core/ud_header.c @@ -41,7 +41,7 @@ #define STRUCT_FIELD(header, field) \ .struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field), \ - .struct_size_bytes = sizeof ((struct ib_unpacked_ ## header *) 0)->field, \ + .struct_size_bytes = sizeof_field(struct ib_unpacked_ ## header, field), \ .field_name = #header ":" #field static const struct ib_field lrh_table[] = { @@ -479,7 +479,7 @@ int ib_ud_header_unpack(void *buf, buf += IB_LRH_BYTES; if (header->lrh.link_version != 0) { - pr_warn("Invalid LRH.link_version %d\n", + pr_warn("Invalid LRH.link_version %u\n", header->lrh.link_version); return -EINVAL; } @@ -496,7 +496,7 @@ int ib_ud_header_unpack(void *buf, buf += IB_GRH_BYTES; if (header->grh.ip_version != 6) { - pr_warn("Invalid GRH.ip_version %d\n", + pr_warn("Invalid GRH.ip_version %u\n", header->grh.ip_version); return -EINVAL; } @@ -508,7 +508,7 @@ int ib_ud_header_unpack(void *buf, break; default: - pr_warn("Invalid LRH.link_next_header %d\n", + pr_warn("Invalid LRH.link_next_header %u\n", header->lrh.link_next_header); return -EINVAL; } @@ -530,7 +530,7 @@ int ib_ud_header_unpack(void *buf, } if (header->bth.transport_header_version != 0) { - pr_warn("Invalid BTH.transport_header_version %d\n", + pr_warn("Invalid BTH.transport_header_version %u\n", header->bth.transport_header_version); return -EINVAL; } diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 06b6125b5ae1..86d479772fbc 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -2,6 +2,7 @@ * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Cisco Systems. All rights reserved. * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2020 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -39,92 +40,26 @@ #include <linux/export.h> #include <linux/slab.h> #include <linux/pagemap.h> +#include <linux/count_zeros.h> #include <rdma/ib_umem_odp.h> #include "uverbs.h" static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) { - struct sg_page_iter sg_iter; - struct page *page; - - if (umem->nmap > 0) - ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents, - DMA_BIDIRECTIONAL); - - for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) { - page = sg_page_iter_page(&sg_iter); - unpin_user_pages_dirty_lock(&page, 1, umem->writable && dirty); - } - - sg_free_table(&umem->sg_head); -} - -/* ib_umem_add_sg_table - Add N contiguous pages to scatter table - * - * sg: current scatterlist entry - * page_list: array of npage struct page pointers - * npages: number of pages in page_list - * max_seg_sz: maximum segment size in bytes - * nents: [out] number of entries in the scatterlist - * - * Return new end of scatterlist - */ -static struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg, - struct page **page_list, - unsigned long npages, - unsigned int max_seg_sz, - int *nents) -{ - unsigned long first_pfn; - unsigned long i = 0; - bool update_cur_sg = false; - bool first = !sg_page(sg); - - /* Check if new page_list is contiguous with end of previous page_list. - * sg->length here is a multiple of PAGE_SIZE and sg->offset is 0. - */ - if (!first && (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) == - page_to_pfn(page_list[0]))) - update_cur_sg = true; - - while (i != npages) { - unsigned long len; - struct page *first_page = page_list[i]; - - first_pfn = page_to_pfn(first_page); - - /* Compute the number of contiguous pages we have starting - * at i - */ - for (len = 0; i != npages && - first_pfn + len == page_to_pfn(page_list[i]) && - len < (max_seg_sz >> PAGE_SHIFT); - len++) - i++; - - /* Squash N contiguous pages from page_list into current sge */ - if (update_cur_sg) { - if ((max_seg_sz - sg->length) >= (len << PAGE_SHIFT)) { - sg_set_page(sg, sg_page(sg), - sg->length + (len << PAGE_SHIFT), - 0); - update_cur_sg = false; - continue; - } - update_cur_sg = false; - } + bool make_dirty = umem->writable && dirty; + struct scatterlist *sg; + unsigned int i; - /* Squash N contiguous pages into next sge or first sge */ - if (!first) - sg = sg_next(sg); + if (dirty) + ib_dma_unmap_sgtable_attrs(dev, &umem->sgt_append.sgt, + DMA_BIDIRECTIONAL, 0); - (*nents)++; - sg_set_page(sg, first_page, len << PAGE_SHIFT, 0); - first = false; - } + for_each_sgtable_sg(&umem->sgt_append.sgt, sg, i) + unpin_user_page_range_dirty_lock(sg_page(sg), + DIV_ROUND_UP(sg->length, PAGE_SIZE), make_dirty); - return sg; + sg_free_append_table(&umem->sgt_append); } /** @@ -146,22 +81,37 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem, unsigned long virt) { struct scatterlist *sg; - unsigned int best_pg_bit; unsigned long va, pgoff; dma_addr_t mask; int i; - /* At minimum, drivers must support PAGE_SIZE or smaller */ - if (WARN_ON(!(pgsz_bitmap & GENMASK(PAGE_SHIFT, 0)))) - return 0; + if (umem->is_odp) { + unsigned int page_size = BIT(to_ib_umem_odp(umem)->page_shift); + + /* ODP must always be self consistent. */ + if (!(pgsz_bitmap & page_size)) + return 0; + return page_size; + } + + /* rdma_for_each_block() has a bug if the page size is smaller than the + * page size used to build the umem. For now prevent smaller page sizes + * from being returned. 
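Editor's sketch: the removed ib_umem_add_sg_table() above implemented the squashing of runs of physically contiguous pages into single scatterlist entries, capped at the maximum segment size; the replacement, sg_alloc_append_table_from_pages() further down in ib_umem_get(), performs the same coalescing internally. A standalone model of that step over plain pfn arrays rather than struct page (illustrative only):

#include <stdio.h>
#include <stddef.h>

struct extent { unsigned long first_pfn; unsigned long npages; };

/* Collapse runs of contiguous pfns into (start, length) extents. */
static size_t coalesce(const unsigned long *pfns, size_t npfns,
                       unsigned long max_seg_pages,
                       struct extent *out, size_t out_cap)
{
    size_t nents = 0;

    for (size_t i = 0; i < npfns; ) {
        unsigned long run = 1;

        while (i + run < npfns &&
               pfns[i + run] == pfns[i] + run &&
               run < max_seg_pages)
            run++;

        if (nents == out_cap)
            break;                  /* out of output slots */
        out[nents].first_pfn = pfns[i];
        out[nents].npages = run;
        nents++;
        i += run;
    }
    return nents;
}

int main(void)
{
    /* Two contiguous runs: {100,101,102} and {200,201}. */
    unsigned long pfns[] = { 100, 101, 102, 200, 201 };
    struct extent ext[5];
    size_t n = coalesce(pfns, 5, 512, ext, 5);

    for (size_t i = 0; i < n; i++)
        printf("entry %zu: pfn %lu, %lu pages\n",
               i, ext[i].first_pfn, ext[i].npages);
    return 0;
}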
+ */ + pgsz_bitmap &= GENMASK(BITS_PER_LONG - 1, PAGE_SHIFT); - va = virt; - /* max page size not to exceed MR length */ - mask = roundup_pow_of_two(umem->length); + umem->iova = va = virt; + /* The best result is the smallest page size that results in the minimum + * number of required pages. Compute the largest page size that could + * work based on VA address bits that don't change. + */ + mask = pgsz_bitmap & + GENMASK(BITS_PER_LONG - 1, + bits_per((umem->length - 1 + virt) ^ virt)); /* offset into first SGL */ pgoff = umem->address & ~PAGE_MASK; - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) { + for_each_sgtable_dma_sg(&umem->sgt_append.sgt, sg, i) { /* Walk SGL and reduce max page size if VA/PA bits differ * for any address. */ @@ -171,13 +121,18 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem, * the maximum possible page size as the low bits of the iova * must be zero when starting the next chunk. */ - if (i != (umem->nmap - 1)) + if (i != (umem->sgt_append.sgt.nents - 1)) mask |= va; pgoff = 0; } - best_pg_bit = rdma_find_pg_bit(mask, pgsz_bitmap); - return BIT_ULL(best_pg_bit); + /* The mask accumulates 1's in each position where the VA and physical + * address differ, thus the length of trailing 0 is the largest page + * size that can pass the VA through to the physical. + */ + if (mask) + pgsz_bitmap &= GENMASK(count_trailing_zeros(mask), 0); + return pgsz_bitmap ? rounddown_pow_of_two(pgsz_bitmap) : 0; } EXPORT_SYMBOL(ib_umem_find_best_pgsz); @@ -197,10 +152,10 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, unsigned long lock_limit; unsigned long new_pinned; unsigned long cur_base; + unsigned long dma_attr = 0; struct mm_struct *mm; unsigned long npages; - int ret; - struct scatterlist *sg; + int pinned, ret; unsigned int gup_flags = FOLL_WRITE; /* @@ -223,6 +178,11 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, umem->ibdev = device; umem->length = size; umem->address = addr; + /* + * Drivers should call ib_umem_find_best_pgsz() to set the iova + * correctly. 
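Editor's sketch: the reworked ib_umem_find_best_pgsz() above reduces the choice to a bit trick: OR together every bit position in which the IOVA and the DMA addresses disagree, plus every interior chunk boundary, and the lowest set bit of that mask caps the usable page size (count_trailing_zeros() and rounddown_pow_of_two() in the kernel). A userspace toy model with a worked example; it assumes page-aligned IOVA and DMA chunks and skips the real function's pgoff handling, and genmask()/the GCC builtins are stand-ins for the kernel helpers:

#include <stdio.h>
#include <stdint.h>

static uint64_t genmask(unsigned int hi, unsigned int lo)
{
    return ((~0ULL) >> (63 - hi)) & ~((1ULL << lo) - 1);
}

static unsigned int bits_per(uint64_t x)
{
    return x ? 64 - __builtin_clzll(x) : 1;
}

static uint64_t best_page_size(uint64_t iova, uint64_t length,
                               const uint64_t dma_addr[], const uint64_t dma_len[],
                               unsigned int nents, uint64_t pgsz_bitmap)
{
    /* Fold in supported sizes larger than the VA span so an oversized
     * page cannot be picked (roughly mirrors the kernel's mask seeding). */
    uint64_t mask = pgsz_bitmap &
                    genmask(63, bits_per((length - 1 + iova) ^ iova));
    uint64_t va = iova;
    unsigned int i;

    for (i = 0; i < nents; i++) {
        mask |= dma_addr[i] ^ va;     /* IOVA/DMA bits that differ */
        va += dma_len[i];
        if (i != nents - 1)
            mask |= va;               /* interior boundary must stay aligned */
    }
    if (mask)                         /* keep sizes up to the lowest set bit */
        pgsz_bitmap &= genmask(__builtin_ctzll(mask), 0);
    return pgsz_bitmap ?
           1ULL << (63 - __builtin_clzll(pgsz_bitmap)) : 0;
}

int main(void)
{
    /* One 2MiB chunk, IOVA and DMA both 2MiB aligned: expect 0x200000. */
    uint64_t dma[] = { 0x8ff200000ULL };
    uint64_t len[] = { 0x200000ULL };
    uint64_t supported = (1ULL << 12) | (1ULL << 21) | (1ULL << 30);

    printf("best page size: 0x%llx\n", (unsigned long long)
           best_page_size(0x7f43ff200000ULL, 0x200000ULL, dma, len, 1,
                          supported));
    return 0;
}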
+ */ + umem->iova = addr; umem->writable = ib_access_writable(access); umem->owning_mm = mm = current->mm; mmgrab(mm); @@ -250,50 +210,44 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, cur_base = addr & PAGE_MASK; - ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL); - if (ret) - goto vma; - if (!umem->writable) gup_flags |= FOLL_FORCE; - sg = umem->sg_head.sgl; - while (npages) { - ret = pin_user_pages_fast(cur_base, + cond_resched(); + pinned = pin_user_pages_fast(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof(struct page *)), gup_flags | FOLL_LONGTERM, page_list); - if (ret < 0) + if (pinned < 0) { + ret = pinned; goto umem_release; + } - cur_base += ret * PAGE_SIZE; - npages -= ret; - - sg = ib_umem_add_sg_table(sg, page_list, ret, - dma_get_max_seg_size(device->dma_device), - &umem->sg_nents); + cur_base += pinned * PAGE_SIZE; + npages -= pinned; + ret = sg_alloc_append_table_from_pages( + &umem->sgt_append, page_list, pinned, 0, + pinned << PAGE_SHIFT, ib_dma_max_seg_size(device), + npages, GFP_KERNEL); + if (ret) { + unpin_user_pages_dirty_lock(page_list, pinned, 0); + goto umem_release; + } } - sg_mark_end(sg); - - umem->nmap = ib_dma_map_sg(device, - umem->sg_head.sgl, - umem->sg_nents, - DMA_BIDIRECTIONAL); + if (access & IB_ACCESS_RELAXED_ORDERING) + dma_attr |= DMA_ATTR_WEAK_ORDERING; - if (!umem->nmap) { - ret = -ENOMEM; + ret = ib_dma_map_sgtable_attrs(device, &umem->sgt_append.sgt, + DMA_BIDIRECTIONAL, dma_attr); + if (ret) goto umem_release; - } - - ret = 0; goto out; umem_release: __ib_umem_release(device, umem, 0); -vma: atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm); out: free_page((unsigned long) page_list); @@ -314,6 +268,8 @@ void ib_umem_release(struct ib_umem *umem) { if (!umem) return; + if (umem->is_dmabuf) + return ib_umem_dmabuf_release(to_ib_umem_dmabuf(umem)); if (umem->is_odp) return ib_umem_odp_release(to_ib_umem_odp(umem)); @@ -325,18 +281,6 @@ void ib_umem_release(struct ib_umem *umem) } EXPORT_SYMBOL(ib_umem_release); -int ib_umem_page_count(struct ib_umem *umem) -{ - int i, n = 0; - struct scatterlist *sg; - - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) - n += sg_dma_len(sg) >> PAGE_SHIFT; - - return n; -} -EXPORT_SYMBOL(ib_umem_page_count); - /* * Copy from the given ib_umem's pages to the given buffer. * @@ -354,12 +298,13 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, int ret; if (offset > umem->length || length > umem->length - offset) { - pr_err("ib_umem_copy_from not in range. offset: %zd umem length: %zd end: %zd\n", - offset, umem->length, end); + pr_err("%s not in range. offset: %zd umem length: %zd end: %zd\n", + __func__, offset, umem->length, end); return -EINVAL; } - ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->sg_nents, dst, length, + ret = sg_pcopy_to_buffer(umem->sgt_append.sgt.sgl, + umem->sgt_append.sgt.orig_nents, dst, length, offset + ib_umem_offset(umem)); if (ret < 0) diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c new file mode 100644 index 000000000000..04c04e6d24c3 --- /dev/null +++ b/drivers/infiniband/core/umem_dmabuf.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright (c) 2020 Intel Corporation. All rights reserved. 
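Editor's sketch: ib_umem_dmabuf_map_pages() in the new file below narrows the exporter's scatterlist to the page-aligned window covering the umem by advancing the first overlapping entry and trimming the last one, remembering both adjustments so the unmap path can restore them. A toy model of that windowing arithmetic over plain (address, length) segments (hypothetical types, not the kernel structures):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096UL

struct seg { uint64_t dma_addr; uint64_t len; };

/* Clamp the segment list to [start, end) derived from the umem window. */
static unsigned int clamp_segments(struct seg *segs, unsigned int n,
                                   uint64_t umem_addr, uint64_t umem_len)
{
    uint64_t start = umem_addr & ~(PAGE_SIZE - 1);
    uint64_t end = (umem_addr + umem_len + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
    uint64_t cur = 0;           /* byte offset within the exporter's buffer */
    unsigned int nmap = 0;

    for (unsigned int i = 0; i < n; i++) {
        uint64_t seg_len = segs[i].len;

        if (start < cur + seg_len && cur < end)
            nmap++;                       /* segment carries payload */
        if (cur <= start && start < cur + seg_len) {
            uint64_t off = start - cur;   /* skip leading bytes */

            segs[i].dma_addr += off;
            segs[i].len -= off;
            cur += off;
        }
        if (cur < end && end <= cur + segs[i].len) {
            segs[i].len -= cur + segs[i].len - end;   /* trim the tail */
            break;
        }
        cur += segs[i].len;
    }
    return nmap;
}

int main(void)
{
    struct seg segs[] = { { 0x10000, 0x10000 }, { 0x40000, 0x10000 } };
    unsigned int n = clamp_segments(segs, 2, 0x2000, 0x1a000);

    printf("%u segments: first at 0x%llx len 0x%llx, last len 0x%llx\n",
           n, (unsigned long long)segs[0].dma_addr,
           (unsigned long long)segs[0].len,
           (unsigned long long)segs[1].len);
    return 0;
}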
+ */ + +#include <linux/dma-buf.h> +#include <linux/dma-resv.h> +#include <linux/dma-mapping.h> +#include <linux/module.h> + +#include "uverbs.h" + +MODULE_IMPORT_NS(DMA_BUF); + +int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf) +{ + struct sg_table *sgt; + struct scatterlist *sg; + unsigned long start, end, cur = 0; + unsigned int nmap = 0; + long ret; + int i; + + dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv); + + if (umem_dmabuf->sgt) + goto wait_fence; + + sgt = dma_buf_map_attachment(umem_dmabuf->attach, DMA_BIDIRECTIONAL); + if (IS_ERR(sgt)) + return PTR_ERR(sgt); + + /* modify the sg list in-place to match umem address and length */ + + start = ALIGN_DOWN(umem_dmabuf->umem.address, PAGE_SIZE); + end = ALIGN(umem_dmabuf->umem.address + umem_dmabuf->umem.length, + PAGE_SIZE); + for_each_sgtable_dma_sg(sgt, sg, i) { + if (start < cur + sg_dma_len(sg) && cur < end) + nmap++; + if (cur <= start && start < cur + sg_dma_len(sg)) { + unsigned long offset = start - cur; + + umem_dmabuf->first_sg = sg; + umem_dmabuf->first_sg_offset = offset; + sg_dma_address(sg) += offset; + sg_dma_len(sg) -= offset; + cur += offset; + } + if (cur < end && end <= cur + sg_dma_len(sg)) { + unsigned long trim = cur + sg_dma_len(sg) - end; + + umem_dmabuf->last_sg = sg; + umem_dmabuf->last_sg_trim = trim; + sg_dma_len(sg) -= trim; + break; + } + cur += sg_dma_len(sg); + } + + umem_dmabuf->umem.sgt_append.sgt.sgl = umem_dmabuf->first_sg; + umem_dmabuf->umem.sgt_append.sgt.nents = nmap; + umem_dmabuf->sgt = sgt; + +wait_fence: + /* + * Although the sg list is valid now, the content of the pages + * may be not up-to-date. Wait for the exporter to finish + * the migration. + */ + ret = dma_resv_wait_timeout(umem_dmabuf->attach->dmabuf->resv, + DMA_RESV_USAGE_KERNEL, + false, MAX_SCHEDULE_TIMEOUT); + if (ret < 0) + return ret; + if (ret == 0) + return -ETIMEDOUT; + return 0; +} +EXPORT_SYMBOL(ib_umem_dmabuf_map_pages); + +void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf) +{ + dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv); + + if (!umem_dmabuf->sgt) + return; + + /* retore the original sg list */ + if (umem_dmabuf->first_sg) { + sg_dma_address(umem_dmabuf->first_sg) -= + umem_dmabuf->first_sg_offset; + sg_dma_len(umem_dmabuf->first_sg) += + umem_dmabuf->first_sg_offset; + umem_dmabuf->first_sg = NULL; + umem_dmabuf->first_sg_offset = 0; + } + if (umem_dmabuf->last_sg) { + sg_dma_len(umem_dmabuf->last_sg) += + umem_dmabuf->last_sg_trim; + umem_dmabuf->last_sg = NULL; + umem_dmabuf->last_sg_trim = 0; + } + + dma_buf_unmap_attachment(umem_dmabuf->attach, umem_dmabuf->sgt, + DMA_BIDIRECTIONAL); + + umem_dmabuf->sgt = NULL; +} +EXPORT_SYMBOL(ib_umem_dmabuf_unmap_pages); + +struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device, + unsigned long offset, size_t size, + int fd, int access, + const struct dma_buf_attach_ops *ops) +{ + struct dma_buf *dmabuf; + struct ib_umem_dmabuf *umem_dmabuf; + struct ib_umem *umem; + unsigned long end; + struct ib_umem_dmabuf *ret = ERR_PTR(-EINVAL); + + if (check_add_overflow(offset, (unsigned long)size, &end)) + return ret; + + if (unlikely(!ops || !ops->move_notify)) + return ret; + + dmabuf = dma_buf_get(fd); + if (IS_ERR(dmabuf)) + return ERR_CAST(dmabuf); + + if (dmabuf->size < end) + goto out_release_dmabuf; + + umem_dmabuf = kzalloc(sizeof(*umem_dmabuf), GFP_KERNEL); + if (!umem_dmabuf) { + ret = ERR_PTR(-ENOMEM); + goto out_release_dmabuf; + } + + umem = &umem_dmabuf->umem; + umem->ibdev = device; + umem->length = 
size; + umem->address = offset; + umem->writable = ib_access_writable(access); + umem->is_dmabuf = 1; + + if (!ib_umem_num_pages(umem)) + goto out_free_umem; + + umem_dmabuf->attach = dma_buf_dynamic_attach( + dmabuf, + device->dma_device, + ops, + umem_dmabuf); + if (IS_ERR(umem_dmabuf->attach)) { + ret = ERR_CAST(umem_dmabuf->attach); + goto out_free_umem; + } + return umem_dmabuf; + +out_free_umem: + kfree(umem_dmabuf); + +out_release_dmabuf: + dma_buf_put(dmabuf); + return ret; +} +EXPORT_SYMBOL(ib_umem_dmabuf_get); + +static void +ib_umem_dmabuf_unsupported_move_notify(struct dma_buf_attachment *attach) +{ + struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv; + + ibdev_warn_ratelimited(umem_dmabuf->umem.ibdev, + "Invalidate callback should not be called when memory is pinned\n"); +} + +static struct dma_buf_attach_ops ib_umem_dmabuf_attach_pinned_ops = { + .allow_peer2peer = true, + .move_notify = ib_umem_dmabuf_unsupported_move_notify, +}; + +struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device, + unsigned long offset, + size_t size, int fd, + int access) +{ + struct ib_umem_dmabuf *umem_dmabuf; + int err; + + umem_dmabuf = ib_umem_dmabuf_get(device, offset, size, fd, access, + &ib_umem_dmabuf_attach_pinned_ops); + if (IS_ERR(umem_dmabuf)) + return umem_dmabuf; + + dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL); + err = dma_buf_pin(umem_dmabuf->attach); + if (err) + goto err_release; + umem_dmabuf->pinned = 1; + + err = ib_umem_dmabuf_map_pages(umem_dmabuf); + if (err) + goto err_unpin; + dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); + + return umem_dmabuf; + +err_unpin: + dma_buf_unpin(umem_dmabuf->attach); +err_release: + dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); + ib_umem_release(&umem_dmabuf->umem); + return ERR_PTR(err); +} +EXPORT_SYMBOL(ib_umem_dmabuf_get_pinned); + +void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf) +{ + struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf; + + dma_resv_lock(dmabuf->resv, NULL); + ib_umem_dmabuf_unmap_pages(umem_dmabuf); + if (umem_dmabuf->pinned) + dma_buf_unpin(umem_dmabuf->attach); + dma_resv_unlock(dmabuf->resv); + + dma_buf_detach(dmabuf, umem_dmabuf->attach); + dma_buf_put(dmabuf); + kfree(umem_dmabuf); +} diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index cd656ad4953b..e9fa22d31c23 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -40,10 +40,9 @@ #include <linux/vmalloc.h> #include <linux/hugetlb.h> #include <linux/interval_tree.h> +#include <linux/hmm.h> #include <linux/pagemap.h> -#include <rdma/ib_verbs.h> -#include <rdma/ib_umem.h> #include <rdma/ib_umem_odp.h> #include "uverbs.h" @@ -60,7 +59,7 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, size_t page_size = 1UL << umem_odp->page_shift; unsigned long start; unsigned long end; - size_t pages; + size_t ndmas, npfns; start = ALIGN_DOWN(umem_odp->umem.address, page_size); if (check_add_overflow(umem_odp->umem.address, @@ -71,20 +70,21 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, if (unlikely(end < page_size)) return -EOVERFLOW; - pages = (end - start) >> umem_odp->page_shift; - if (!pages) + ndmas = (end - start) >> umem_odp->page_shift; + if (!ndmas) return -EINVAL; - umem_odp->page_list = kvcalloc( - pages, sizeof(*umem_odp->page_list), GFP_KERNEL); - if (!umem_odp->page_list) + npfns = (end - start) >> PAGE_SHIFT; + umem_odp->pfn_list = kvcalloc( + npfns, sizeof(*umem_odp->pfn_list), 
GFP_KERNEL); + if (!umem_odp->pfn_list) return -ENOMEM; umem_odp->dma_list = kvcalloc( - pages, sizeof(*umem_odp->dma_list), GFP_KERNEL); + ndmas, sizeof(*umem_odp->dma_list), GFP_KERNEL); if (!umem_odp->dma_list) { ret = -ENOMEM; - goto out_page_list; + goto out_pfn_list; } ret = mmu_interval_notifier_insert(&umem_odp->notifier, @@ -98,8 +98,8 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, out_dma_list: kvfree(umem_odp->dma_list); -out_page_list: - kvfree(umem_odp->page_list); +out_pfn_list: + kvfree(umem_odp->pfn_list); return ret; } @@ -152,6 +152,7 @@ EXPORT_SYMBOL(ib_umem_odp_alloc_implicit); * ib_alloc_implicit_odp_umem() * @addr: The starting userspace VA * @size: The length of the userspace VA + * @ops: MMU interval ops, currently only @invalidate */ struct ib_umem_odp * ib_umem_odp_alloc_child(struct ib_umem_odp *root, unsigned long addr, @@ -213,6 +214,7 @@ EXPORT_SYMBOL(ib_umem_odp_alloc_child); * @addr: userspace virtual address to start at * @size: length of region to pin * @access: IB_ACCESS_xxx flags for memory being pinned + * @ops: MMU interval ops, currently only @invalidate * * The driver should use when the access flags indicate ODP memory. It avoids * pinning, instead, stores the mm for future page fault handling in @@ -223,7 +225,6 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device, const struct mmu_interval_notifier_ops *ops) { struct ib_umem_odp *umem_odp; - struct mm_struct *mm; int ret; if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND))) @@ -237,7 +238,7 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device, umem_odp->umem.length = size; umem_odp->umem.address = addr; umem_odp->umem.writable = ib_access_writable(access); - umem_odp->umem.owning_mm = mm = current->mm; + umem_odp->umem.owning_mm = current->mm; umem_odp->notifier.ops = ops; umem_odp->page_shift = PAGE_SHIFT; @@ -274,9 +275,9 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) mutex_unlock(&umem_odp->umem_mutex); mmu_interval_notifier_remove(&umem_odp->notifier); kvfree(umem_odp->dma_list); - kvfree(umem_odp->page_list); - put_pid(umem_odp->tgid); + kvfree(umem_odp->pfn_list); } + put_pid(umem_odp->tgid); kfree(umem_odp); } EXPORT_SYMBOL(ib_umem_odp_release); @@ -285,87 +286,53 @@ EXPORT_SYMBOL(ib_umem_odp_release); * Map for DMA and insert a single page into the on-demand paging page tables. * * @umem: the umem to insert the page to. - * @page_index: index in the umem to add the page to. + * @dma_index: index in the umem to add the dma to. * @page: the page struct to map and add. * @access_mask: access permissions needed for this page. - * @current_seq: sequence number for synchronization with invalidations. - * the sequence number is taken from - * umem_odp->notifiers_seq. * - * The function returns -EFAULT if the DMA mapping operation fails. It returns - * -EAGAIN if a concurrent invalidation prevents us from updating the page. + * The function returns -EFAULT if the DMA mapping operation fails. * - * The page is released via put_page even if the operation failed. For on-demand - * pinning, the page is released whenever it isn't stored in the umem. 
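Editor's note: ib_init_umem_odp() above now sizes its two arrays at different granularities: pfn_list gets one slot per CPU page (PAGE_SHIFT) while dma_list keeps one slot per device page (page_shift, which may be larger). A quick arithmetic check under assumed values:

#include <stdio.h>

#define CPU_PAGE_SHIFT 12UL        /* 4 KiB CPU pages, assumed */

int main(void)
{
    unsigned long page_shift = 21;                  /* 2 MiB device pages */
    unsigned long start = 0x40000000UL;
    unsigned long end = start + (8UL << 20);        /* 8 MiB aligned range */
    unsigned long ndmas = (end - start) >> page_shift;
    unsigned long npfns = (end - start) >> CPU_PAGE_SHIFT;

    /* Expect ndmas=4, npfns=2048, i.e. 512 pfns per DMA entry. */
    printf("ndmas=%lu npfns=%lu (ratio %lu)\n",
           ndmas, npfns, 1UL << (page_shift - CPU_PAGE_SHIFT));
    return 0;
}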
*/ static int ib_umem_odp_map_dma_single_page( struct ib_umem_odp *umem_odp, - unsigned int page_index, + unsigned int dma_index, struct page *page, - u64 access_mask, - unsigned long current_seq) + u64 access_mask) { struct ib_device *dev = umem_odp->umem.ibdev; - dma_addr_t dma_addr; - int ret = 0; + dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index]; - if (mmu_interval_check_retry(&umem_odp->notifier, current_seq)) { - ret = -EAGAIN; - goto out; - } - if (!(umem_odp->dma_list[page_index])) { - dma_addr = - ib_dma_map_page(dev, page, 0, BIT(umem_odp->page_shift), - DMA_BIDIRECTIONAL); - if (ib_dma_mapping_error(dev, dma_addr)) { - ret = -EFAULT; - goto out; - } - umem_odp->dma_list[page_index] = dma_addr | access_mask; - umem_odp->page_list[page_index] = page; - umem_odp->npages++; - } else if (umem_odp->page_list[page_index] == page) { - umem_odp->dma_list[page_index] |= access_mask; - } else { + if (*dma_addr) { /* - * This is a race here where we could have done: - * - * CPU0 CPU1 - * get_user_pages() - * invalidate() - * page_fault() - * mutex_lock(umem_mutex) - * page from GUP != page in ODP - * - * It should be prevented by the retry test above as reading - * the seq number should be reliable under the - * umem_mutex. Thus something is really not working right if - * things get here. + * If the page is already dma mapped it means it went through + * a non-invalidating trasition, like read-only to writable. + * Resync the flags. */ - WARN(true, - "Got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n", - umem_odp->page_list[page_index], page); - ret = -EAGAIN; + *dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask; + return 0; } -out: - put_page(page); - return ret; + *dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift, + DMA_BIDIRECTIONAL); + if (ib_dma_mapping_error(dev, *dma_addr)) { + *dma_addr = 0; + return -EFAULT; + } + umem_odp->npages++; + *dma_addr |= access_mask; + return 0; } /** - * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR. + * ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it. * - * Pins the range of pages passed in the argument, and maps them to - * DMA addresses. The DMA addresses of the mapped pages is updated in - * umem_odp->dma_list. + * Maps the range passed in the argument to DMA addresses. + * The DMA addresses of the mapped pages is updated in umem_odp->dma_list. + * Upon success the ODP MR will be locked to let caller complete its device + * page table update. * * Returns the number of pages mapped in success, negative error code * for failure. - * An -EAGAIN error code is returned when a concurrent mmu notifier prevents - * the function from completing its task. - * An -ENOENT error code indicates that userspace process is being terminated - * and mm was already destroyed. * @umem_odp: the umem to map and pin * @user_virt: the address from which we need to map. * @bcnt: the minimal number of bytes to pin and map. The mapping might be @@ -374,21 +341,19 @@ out: * the return value. * @access_mask: bit mask of the requested access permissions for the given * range. - * @current_seq: the MMU notifiers sequance value for synchronization with - * invalidations. 
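Editor's sketch: the new ib_umem_odp_map_dma_single_page() above stores the access bits in the low, always-zero bits of the page-aligned DMA address (ODP_DMA_ADDR_MASK and the ODP_*_ALLOWED_BITs in the kernel), so a non-invalidating transition such as read-only to writable only has to resync the flags without remapping. A toy model with made-up bit positions:

#include <stdio.h>
#include <stdint.h>

#define DMA_ADDR_MASK   (~0x3ULL)   /* assumed: low two bits hold the flags */
#define READ_ALLOWED    0x1ULL
#define WRITE_ALLOWED   0x2ULL

static uint64_t set_flags(uint64_t entry, uint64_t flags)
{
    /* keep the address, replace the permission bits */
    return (entry & DMA_ADDR_MASK) | flags;
}

int main(void)
{
    uint64_t entry;

    /* first fault: map read-only */
    entry = 0x12345000ULL | READ_ALLOWED;
    printf("ro entry: 0x%llx\n", (unsigned long long)entry);

    /* later write fault on the same page: resync flags, keep the mapping */
    entry = set_flags(entry, READ_ALLOWED | WRITE_ALLOWED);
    printf("rw entry: 0x%llx addr 0x%llx\n",
           (unsigned long long)entry,
           (unsigned long long)(entry & DMA_ADDR_MASK));
    return 0;
}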
the sequance number is read from - * umem_odp->notifiers_seq before calling this function + * @fault: is faulting required for the given range */ -int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, - u64 bcnt, u64 access_mask, - unsigned long current_seq) +int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt, + u64 bcnt, u64 access_mask, bool fault) + __acquires(&umem_odp->umem_mutex) { struct task_struct *owning_process = NULL; struct mm_struct *owning_mm = umem_odp->umem.owning_mm; - struct page **local_page_list = NULL; - u64 page_mask, off; - int j, k, ret = 0, start_idx, npages = 0; - unsigned int flags = 0, page_shift; - phys_addr_t p = 0; + int pfn_index, dma_index, ret = 0, start_idx; + unsigned int page_shift, hmm_order, pfn_start_idx; + unsigned long num_pfns, current_seq; + struct hmm_range range = {}; + unsigned long timeout; if (access_mask == 0) return -EINVAL; @@ -397,15 +362,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, user_virt + bcnt > ib_umem_end(umem_odp)) return -EFAULT; - local_page_list = (struct page **)__get_free_page(GFP_KERNEL); - if (!local_page_list) - return -ENOMEM; - page_shift = umem_odp->page_shift; - page_mask = ~(BIT(page_shift) - 1); - off = user_virt & (~page_mask); - user_virt = user_virt & page_mask; - bcnt += off; /* Charge for the first page offset as well. */ /* * owning_process is allowed to be NULL, this means somehow the mm is @@ -418,99 +375,104 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, goto out_put_task; } - if (access_mask & ODP_WRITE_ALLOWED_BIT) - flags |= FOLL_WRITE; + range.notifier = &umem_odp->notifier; + range.start = ALIGN_DOWN(user_virt, 1UL << page_shift); + range.end = ALIGN(user_virt + bcnt, 1UL << page_shift); + pfn_start_idx = (range.start - ib_umem_start(umem_odp)) >> PAGE_SHIFT; + num_pfns = (range.end - range.start) >> PAGE_SHIFT; + if (fault) { + range.default_flags = HMM_PFN_REQ_FAULT; - start_idx = (user_virt - ib_umem_start(umem_odp)) >> page_shift; - k = start_idx; + if (access_mask & ODP_WRITE_ALLOWED_BIT) + range.default_flags |= HMM_PFN_REQ_WRITE; + } - while (bcnt > 0) { - const size_t gup_num_pages = min_t(size_t, - ALIGN(bcnt, PAGE_SIZE) / PAGE_SIZE, - PAGE_SIZE / sizeof(struct page *)); + range.hmm_pfns = &(umem_odp->pfn_list[pfn_start_idx]); + timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); - down_read(&owning_mm->mmap_sem); - /* - * Note: this might result in redundent page getting. We can - * avoid this by checking dma_list to be 0 before calling - * get_user_pages. However, this make the code much more - * complex (and doesn't gain us much performance in most use - * cases). 
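Editor's sketch: the replacement fault path in ib_umem_odp_map_dma_and_lock(), in the hunk that follows, drops get_user_pages_remote() in favour of hmm_range_fault() bracketed by mmu_interval_read_begin()/mmu_interval_read_retry(): do the expensive walk unlocked, then take umem_mutex and start over if an invalidation ran in between, keeping the mutex held on success. A userspace sketch of that shape (illustrative only, not the kernel API):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_uint inval_seq;               /* bumped by invalidations */
static pthread_mutex_t umem_mutex = PTHREAD_MUTEX_INITIALIZER;

static unsigned int read_begin(void)
{
    return atomic_load_explicit(&inval_seq, memory_order_acquire);
}

static bool read_retry(unsigned int seq)
{
    return atomic_load_explicit(&inval_seq, memory_order_acquire) != seq;
}

/* Sample a sequence, fault unlocked, validate under the lock; on success the
 * lock is intentionally left held for the caller to finish its update. */
static int map_and_lock(int max_tries)
{
    unsigned int seq;

retry:
    if (max_tries-- <= 0)
        return -1;                          /* mirrors the jiffies timeout */
    seq = read_begin();

    /* ... hmm_range_fault()-style work happens here, unlocked ... */

    pthread_mutex_lock(&umem_mutex);
    if (read_retry(seq)) {
        pthread_mutex_unlock(&umem_mutex);
        goto retry;                         /* raced with an invalidation */
    }
    /* ... fill the DMA list; return with umem_mutex still held ... */
    return 0;
}

int main(void)
{
    int ret = map_and_lock(10);

    if (!ret)
        pthread_mutex_unlock(&umem_mutex);
    printf("%d\n", ret);
    return 0;
}

The real function additionally bounds its retries with a jiffies timeout around hmm_range_fault()'s -EBUSY case, which the max_tries counter stands in for here.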
- */ - npages = get_user_pages_remote(owning_process, owning_mm, - user_virt, gup_num_pages, - flags, local_page_list, NULL, NULL); - up_read(&owning_mm->mmap_sem); - - if (npages < 0) { - if (npages != -EAGAIN) - pr_warn("fail to get %zu user pages with error %d\n", gup_num_pages, npages); - else - pr_debug("fail to get %zu user pages with error %d\n", gup_num_pages, npages); - break; - } +retry: + current_seq = range.notifier_seq = + mmu_interval_read_begin(&umem_odp->notifier); - bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt); - mutex_lock(&umem_odp->umem_mutex); - for (j = 0; j < npages; j++, user_virt += PAGE_SIZE) { - if (user_virt & ~page_mask) { - p += PAGE_SIZE; - if (page_to_phys(local_page_list[j]) != p) { - ret = -EFAULT; - break; - } - put_page(local_page_list[j]); - continue; - } + mmap_read_lock(owning_mm); + ret = hmm_range_fault(&range); + mmap_read_unlock(owning_mm); + if (unlikely(ret)) { + if (ret == -EBUSY && !time_after(jiffies, timeout)) + goto retry; + goto out_put_mm; + } - ret = ib_umem_odp_map_dma_single_page( - umem_odp, k, local_page_list[j], - access_mask, current_seq); - if (ret < 0) { - if (ret != -EAGAIN) - pr_warn("ib_umem_odp_map_dma_single_page failed with error %d\n", ret); - else - pr_debug("ib_umem_odp_map_dma_single_page failed with error %d\n", ret); - break; - } + start_idx = (range.start - ib_umem_start(umem_odp)) >> page_shift; + dma_index = start_idx; - p = page_to_phys(local_page_list[j]); - k++; - } + mutex_lock(&umem_odp->umem_mutex); + if (mmu_interval_read_retry(&umem_odp->notifier, current_seq)) { mutex_unlock(&umem_odp->umem_mutex); + goto retry; + } - if (ret < 0) { + for (pfn_index = 0; pfn_index < num_pfns; + pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) { + + if (fault) { /* - * Release pages, remembering that the first page - * to hit an error was already released by - * ib_umem_odp_map_dma_single_page(). + * Since we asked for hmm_range_fault() to populate + * pages it shouldn't return an error entry on success. */ - if (npages - (j + 1) > 0) - release_pages(&local_page_list[j+1], - npages - (j + 1)); + WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR); + WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)); + } else { + if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) { + WARN_ON(umem_odp->dma_list[dma_index]); + continue; + } + access_mask = ODP_READ_ALLOWED_BIT; + if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE) + access_mask |= ODP_WRITE_ALLOWED_BIT; + } + + hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]); + /* If a hugepage was detected and ODP wasn't set for, the umem + * page_shift will be used, the opposite case is an error. 
+ */ + if (hmm_order + PAGE_SHIFT < page_shift) { + ret = -EINVAL; + ibdev_dbg(umem_odp->umem.ibdev, + "%s: un-expected hmm_order %u, page_shift %u\n", + __func__, hmm_order, page_shift); break; } - } - if (ret >= 0) { - if (npages < 0 && k == start_idx) - ret = npages; - else - ret = k - start_idx; + ret = ib_umem_odp_map_dma_single_page( + umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]), + access_mask); + if (ret < 0) { + ibdev_dbg(umem_odp->umem.ibdev, + "ib_umem_odp_map_dma_single_page failed with error %d\n", ret); + break; + } } + /* upon success lock should stay on hold for the callee */ + if (!ret) + ret = dma_index - start_idx; + else + mutex_unlock(&umem_odp->umem_mutex); - mmput(owning_mm); +out_put_mm: + mmput_async(owning_mm); out_put_task: if (owning_process) put_task_struct(owning_process); - free_page((unsigned long)local_page_list); return ret; } -EXPORT_SYMBOL(ib_umem_odp_map_dma_pages); +EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock); void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, u64 bound) { + dma_addr_t dma_addr; + dma_addr_t dma; int idx; u64 addr; struct ib_device *dev = umem_odp->umem.ibdev; @@ -519,20 +481,16 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, virt = max_t(u64, virt, ib_umem_start(umem_odp)); bound = min_t(u64, bound, ib_umem_end(umem_odp)); - /* Note that during the run of this function, the - * notifiers_count of the MR is > 0, preventing any racing - * faults from completion. We might be racing with other - * invalidations, so we must make sure we free each page only - * once. */ for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) { idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; - if (umem_odp->page_list[idx]) { - struct page *page = umem_odp->page_list[idx]; - dma_addr_t dma = umem_odp->dma_list[idx]; - dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK; + dma = umem_odp->dma_list[idx]; - WARN_ON(!dma_addr); + /* The access flags guaranteed a valid DMA address in case was NULL */ + if (dma) { + unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT; + struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]); + dma_addr = dma & ODP_DMA_ADDR_MASK; ib_dma_unmap_page(dev, dma_addr, BIT(umem_odp->page_shift), DMA_BIDIRECTIONAL); @@ -549,7 +507,6 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, */ set_page_dirty(head_page); } - umem_odp->page_list[idx] = NULL; umem_odp->dma_list[idx] = 0; umem_odp->npages--; } diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 1235ffb2389b..98cb594cd9a6 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -101,7 +101,7 @@ struct ib_umad_port { struct ib_device *ib_dev; struct ib_umad_device *umad_dev; int dev_num; - u8 port_num; + u32 port_num; }; struct ib_umad_device { @@ -142,7 +142,7 @@ static dev_t dynamic_issm_dev; static DEFINE_IDA(umad_ida); -static void ib_umad_add_one(struct ib_device *device); +static int ib_umad_add_one(struct ib_device *device); static void ib_umad_remove_one(struct ib_device *device, void *client_data); static void ib_umad_dev_free(struct kref *kref) @@ -165,8 +165,8 @@ static void ib_umad_dev_put(struct ib_umad_device *dev) static int hdr_size(struct ib_umad_file *file) { - return file->use_pkey_index ? sizeof (struct ib_user_mad_hdr) : - sizeof (struct ib_user_mad_hdr_old); + return file->use_pkey_index ? 
sizeof(struct ib_user_mad_hdr) : + sizeof(struct ib_user_mad_hdr_old); } /* caller must hold file->mutex */ @@ -379,6 +379,11 @@ static ssize_t ib_umad_read(struct file *filp, char __user *buf, mutex_lock(&file->mutex); + if (file->agents_dead) { + mutex_unlock(&file->mutex); + return -EIO; + } + while (list_empty(&file->recv_list)) { mutex_unlock(&file->mutex); @@ -392,6 +397,11 @@ static ssize_t ib_umad_read(struct file *filp, char __user *buf, mutex_lock(&file->mutex); } + if (file->agents_dead) { + mutex_unlock(&file->mutex); + return -EIO; + } + packet = list_entry(file->recv_list.next, struct ib_umad_packet, list); list_del(&packet->list); @@ -524,7 +534,7 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, agent = __get_agent(file, packet->mad.hdr.id); if (!agent) { - ret = -EINVAL; + ret = -EIO; goto err_up; } @@ -653,10 +663,14 @@ static __poll_t ib_umad_poll(struct file *filp, struct poll_table_struct *wait) /* we will always be able to post a MAD send */ __poll_t mask = EPOLLOUT | EPOLLWRNORM; + mutex_lock(&file->mutex); poll_wait(filp, &file->recv_wait, wait); if (!list_empty(&file->recv_list)) mask |= EPOLLIN | EPOLLRDNORM; + if (file->agents_dead) + mask = EPOLLERR; + mutex_unlock(&file->mutex); return mask; } @@ -674,8 +688,7 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg, mutex_lock(&file->mutex); if (!file->port->ib_dev) { - dev_notice(&file->port->dev, - "ib_umad_reg_agent: invalid device\n"); + dev_notice(&file->port->dev, "%s: invalid device\n", __func__); ret = -EPIPE; goto out; } @@ -687,7 +700,7 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg, if (ureq.qpn != 0 && ureq.qpn != 1) { dev_notice(&file->port->dev, - "ib_umad_reg_agent: invalid QPN %d specified\n", + "%s: invalid QPN %u specified\n", __func__, ureq.qpn); ret = -EINVAL; goto out; @@ -697,9 +710,9 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg, if (!__get_agent(file, agent_id)) goto found; - dev_notice(&file->port->dev, - "ib_umad_reg_agent: Max Agents (%u) reached\n", + dev_notice(&file->port->dev, "%s: Max Agents (%u) reached\n", __func__, IB_UMAD_MAX_AGENTS); + ret = -ENOMEM; goto out; @@ -776,8 +789,7 @@ static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg) mutex_lock(&file->mutex); if (!file->port->ib_dev) { - dev_notice(&file->port->dev, - "ib_umad_reg_agent2: invalid device\n"); + dev_notice(&file->port->dev, "%s: invalid device\n", __func__); ret = -EPIPE; goto out; } @@ -788,17 +800,16 @@ static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg) } if (ureq.qpn != 0 && ureq.qpn != 1) { - dev_notice(&file->port->dev, - "ib_umad_reg_agent2: invalid QPN %d specified\n", - ureq.qpn); + dev_notice(&file->port->dev, "%s: invalid QPN %u specified\n", + __func__, ureq.qpn); ret = -EINVAL; goto out; } if (ureq.flags & ~IB_USER_MAD_REG_FLAGS_CAP) { dev_notice(&file->port->dev, - "ib_umad_reg_agent2 failed: invalid registration flags specified 0x%x; supported 0x%x\n", - ureq.flags, IB_USER_MAD_REG_FLAGS_CAP); + "%s failed: invalid registration flags specified 0x%x; supported 0x%x\n", + __func__, ureq.flags, IB_USER_MAD_REG_FLAGS_CAP); ret = -EINVAL; if (put_user((u32)IB_USER_MAD_REG_FLAGS_CAP, @@ -813,8 +824,7 @@ static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg) if (!__get_agent(file, agent_id)) goto found; - dev_notice(&file->port->dev, - "ib_umad_reg_agent2: Max Agents (%u) reached\n", + dev_notice(&file->port->dev, "%s: Max Agents (%u) 
reached\n", __func__, IB_UMAD_MAX_AGENTS); ret = -ENOMEM; goto out; @@ -826,7 +836,7 @@ found: req.mgmt_class_version = ureq.mgmt_class_version; if (ureq.oui & 0xff000000) { dev_notice(&file->port->dev, - "ib_umad_reg_agent2 failed: oui invalid 0x%08x\n", + "%s failed: oui invalid 0x%08x\n", __func__, ureq.oui); ret = -EINVAL; goto out; @@ -1129,17 +1139,30 @@ static const struct file_operations umad_sm_fops = { .llseek = no_llseek, }; +static struct ib_umad_port *get_port(struct ib_device *ibdev, + struct ib_umad_device *umad_dev, + u32 port) +{ + if (!umad_dev) + return ERR_PTR(-EOPNOTSUPP); + if (!rdma_is_port_valid(ibdev, port)) + return ERR_PTR(-EINVAL); + if (!rdma_cap_ib_mad(ibdev, port)) + return ERR_PTR(-EOPNOTSUPP); + + return &umad_dev->ports[port - rdma_start_port(ibdev)]; +} + static int ib_umad_get_nl_info(struct ib_device *ibdev, void *client_data, struct ib_client_nl_info *res) { - struct ib_umad_device *umad_dev = client_data; + struct ib_umad_port *port = get_port(ibdev, client_data, res->port); - if (!rdma_is_port_valid(ibdev, res->port)) - return -EINVAL; + if (IS_ERR(port)) + return PTR_ERR(port); res->abi = IB_USER_MAD_ABI_VERSION; - res->cdev = &umad_dev->ports[res->port - rdma_start_port(ibdev)].dev; - + res->cdev = &port->dev; return 0; } @@ -1154,15 +1177,13 @@ MODULE_ALIAS_RDMA_CLIENT("umad"); static int ib_issm_get_nl_info(struct ib_device *ibdev, void *client_data, struct ib_client_nl_info *res) { - struct ib_umad_device *umad_dev = - ib_get_client_data(ibdev, &umad_client); + struct ib_umad_port *port = get_port(ibdev, client_data, res->port); - if (!rdma_is_port_valid(ibdev, res->port)) - return -EINVAL; + if (IS_ERR(port)) + return PTR_ERR(port); res->abi = IB_USER_MAD_ABI_VERSION; - res->cdev = &umad_dev->ports[res->port - rdma_start_port(ibdev)].sm_dev; - + res->cdev = &port->sm_dev; return 0; } @@ -1180,7 +1201,7 @@ static ssize_t ibdev_show(struct device *dev, struct device_attribute *attr, if (!port) return -ENODEV; - return sprintf(buf, "%s\n", dev_name(&port->ib_dev->dev)); + return sysfs_emit(buf, "%s\n", dev_name(&port->ib_dev->dev)); } static DEVICE_ATTR_RO(ibdev); @@ -1192,7 +1213,7 @@ static ssize_t port_show(struct device *dev, struct device_attribute *attr, if (!port) return -ENODEV; - return sprintf(buf, "%d\n", port->port_num); + return sysfs_emit(buf, "%d\n", port->port_num); } static DEVICE_ATTR_RO(port); @@ -1211,7 +1232,7 @@ static char *umad_devnode(struct device *dev, umode_t *mode) static ssize_t abi_version_show(struct class *class, struct class_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", IB_USER_MAD_ABI_VERSION); + return sysfs_emit(buf, "%d\n", IB_USER_MAD_ABI_VERSION); } static CLASS_ATTR_RO(abi_version); @@ -1325,6 +1346,7 @@ static void ib_umad_kill_port(struct ib_umad_port *port) list_for_each_entry(file, &port->file_list, port_list) { mutex_lock(&file->mutex); file->agents_dead = 1; + wake_up_interruptible(&file->recv_wait); mutex_unlock(&file->mutex); for (id = 0; id < IB_UMAD_MAX_AGENTS; ++id) @@ -1341,37 +1363,41 @@ static void ib_umad_kill_port(struct ib_umad_port *port) put_device(&port->dev); } -static void ib_umad_add_one(struct ib_device *device) +static int ib_umad_add_one(struct ib_device *device) { struct ib_umad_device *umad_dev; int s, e, i; int count = 0; + int ret; s = rdma_start_port(device); e = rdma_end_port(device); umad_dev = kzalloc(struct_size(umad_dev, ports, e - s + 1), GFP_KERNEL); if (!umad_dev) - return; + return -ENOMEM; kref_init(&umad_dev->kref); for (i = s; i <= e; ++i) { if 
(!rdma_cap_ib_mad(device, i)) continue; - if (ib_umad_init_port(device, i, umad_dev, - &umad_dev->ports[i - s])) + ret = ib_umad_init_port(device, i, umad_dev, + &umad_dev->ports[i - s]); + if (ret) goto err; count++; } - if (!count) + if (!count) { + ret = -EOPNOTSUPP; goto free; + } ib_set_client_data(device, &umad_client, umad_dev); - return; + return 0; err: while (--i >= s) { @@ -1383,6 +1409,7 @@ err: free: /* balances kref_init */ ib_umad_dev_put(umad_dev); + return ret; } static void ib_umad_remove_one(struct ib_device *device, void *client_data) @@ -1390,9 +1417,6 @@ static void ib_umad_remove_one(struct ib_device *device, void *client_data) struct ib_umad_device *umad_dev = client_data; unsigned int i; - if (!umad_dev) - return; - rdma_for_each_port (device, i) { if (rdma_cap_ib_mad(device, i)) ib_umad_kill_port( diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 7df71983212d..821d93c8f712 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -97,7 +97,7 @@ ib_uverbs_init_udata_buf_or_null(struct ib_udata *udata, */ struct ib_uverbs_device { - atomic_t refcount; + refcount_t refcount; u32 num_comp_vectors; struct completion comp; struct device dev; @@ -142,7 +142,7 @@ struct ib_uverbs_file { * ucontext_lock held */ struct ib_ucontext *ucontext; - struct ib_uverbs_async_event_file *async_file; + struct ib_uverbs_async_event_file *default_async_file; struct list_head list; /* @@ -180,6 +180,7 @@ struct ib_uverbs_mcast_entry { struct ib_uevent_object { struct ib_uobject uobject; + struct ib_uverbs_async_event_file *event_file; /* List member for ib_uverbs_async_event_file list */ struct list_head event_list; u32 events_reported; @@ -219,6 +220,7 @@ void ib_uverbs_init_event_queue(struct ib_uverbs_event_queue *ev_queue); void ib_uverbs_init_async_event_file(struct ib_uverbs_async_event_file *ev_file); void ib_uverbs_free_event_queue(struct ib_uverbs_event_queue *event_queue); void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res); +int uverbs_async_event_release(struct inode *inode, struct file *filp); int ib_alloc_ucontext(struct uverbs_attr_bundle *attrs); int ib_init_ucontext(struct uverbs_attr_bundle *attrs); @@ -227,6 +229,9 @@ void ib_uverbs_release_ucq(struct ib_uverbs_completion_event_file *ev_file, struct ib_ucq_object *uobj); void ib_uverbs_release_uevent(struct ib_uevent_object *uobj); void ib_uverbs_release_file(struct kref *ref); +void ib_uverbs_async_handler(struct ib_uverbs_async_event_file *async_file, + __u64 element, __u64 event, + struct list_head *obj_list, u32 *counter); void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context); void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr); @@ -292,6 +297,24 @@ static inline u32 make_port_cap_flags(const struct ib_port_attr *attr) return res; } +static inline struct ib_uverbs_async_event_file * +ib_uverbs_get_async_event(struct uverbs_attr_bundle *attrs, + u16 id) +{ + struct ib_uobject *async_ev_file_uobj; + struct ib_uverbs_async_event_file *async_ev_file; + + async_ev_file_uobj = uverbs_attr_get_uobject(attrs, id); + if (IS_ERR(async_ev_file_uobj)) + async_ev_file = READ_ONCE(attrs->ufile->default_async_file); + else + async_ev_file = container_of(async_ev_file_uobj, + struct ib_uverbs_async_event_file, + uobj); + if (async_ev_file) + uverbs_uobject_get(&async_ev_file->uobj); + return async_ev_file; +} void copy_port_attr_to_resp(struct ib_port_attr *attr, struct ib_uverbs_query_port_resp *resp, diff 
--git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 060b4ebbd2ba..4796f6a8828c 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -218,10 +218,12 @@ int ib_alloc_ucontext(struct uverbs_attr_bundle *attrs) if (!ucontext) return -ENOMEM; - ucontext->res.type = RDMA_RESTRACK_CTX; ucontext->device = ib_dev; ucontext->ufile = ufile; xa_init_flags(&ucontext->mmap_xa, XA_FLAGS_ALLOC); + + rdma_restrack_new(&ucontext->res, RDMA_RESTRACK_CTX); + rdma_restrack_set_name(&ucontext->res, NULL); attrs->context = ucontext; return 0; } @@ -250,7 +252,7 @@ int ib_init_ucontext(struct uverbs_attr_bundle *attrs) if (ret) goto err_uncharge; - rdma_restrack_uadd(&ucontext->res); + rdma_restrack_add(&ucontext->res); /* * Make sure that ib_uverbs_get_ucontext() sees the pointer update @@ -311,8 +313,9 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs) return 0; err_uobj: - rdma_alloc_abort_uobject(uobj, attrs); + rdma_alloc_abort_uobject(uobj, attrs, false); err_ucontext: + rdma_restrack_put(&attrs->context->res); kfree(attrs->context); attrs->context = NULL; return ret; @@ -334,7 +337,7 @@ static void copy_query_dev_fields(struct ib_ucontext *ucontext, resp->hw_ver = attr->hw_ver; resp->max_qp = attr->max_qp; resp->max_qp_wr = attr->max_qp_wr; - resp->device_cap_flags = lower_32_bits(attr->device_cap_flags); + resp->device_cap_flags = lower_32_bits(attr->device_cap_flags); resp->max_sge = min(attr->max_send_sge, attr->max_recv_sge); resp->max_sge_rd = attr->max_sge_rd; resp->max_cq = attr->max_cq; @@ -356,14 +359,12 @@ static void copy_query_dev_fields(struct ib_ucontext *ucontext, resp->max_mcast_qp_attach = attr->max_mcast_qp_attach; resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach; resp->max_ah = attr->max_ah; - resp->max_fmr = attr->max_fmr; - resp->max_map_per_fmr = attr->max_map_per_fmr; resp->max_srq = attr->max_srq; resp->max_srq_wr = attr->max_srq_wr; resp->max_srq_sge = attr->max_srq_sge; resp->max_pkeys = attr->max_pkeys; resp->local_ca_ack_delay = attr->local_ca_ack_delay; - resp->phys_port_cnt = ib_dev->phys_port_cnt; + resp->phys_port_cnt = min_t(u32, ib_dev->phys_port_cnt, U8_MAX); } static int ib_uverbs_query_device(struct uverbs_attr_bundle *attrs) @@ -417,8 +418,8 @@ static int ib_uverbs_query_port(struct uverbs_attr_bundle *attrs) static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs) { + struct ib_uverbs_alloc_pd_resp resp = {}; struct ib_uverbs_alloc_pd cmd; - struct ib_uverbs_alloc_pd_resp resp; struct ib_uobject *uobj; struct ib_pd *pd; int ret; @@ -440,30 +441,24 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs) pd->device = ib_dev; pd->uobject = uobj; - pd->__internal_mr = NULL; atomic_set(&pd->usecnt, 0); - pd->res.type = RDMA_RESTRACK_PD; + + rdma_restrack_new(&pd->res, RDMA_RESTRACK_PD); + rdma_restrack_set_name(&pd->res, NULL); ret = ib_dev->ops.alloc_pd(pd, &attrs->driver_udata); if (ret) goto err_alloc; + rdma_restrack_add(&pd->res); uobj->object = pd; - memset(&resp, 0, sizeof resp); - resp.pd_handle = uobj->id; - rdma_restrack_uadd(&pd->res); + uobj_finalize_uobj_create(uobj, attrs); - ret = uverbs_response(attrs, &resp, sizeof(resp)); - if (ret) - goto err_copy; - - rdma_alloc_commit_uobject(uobj, attrs); - return 0; + resp.pd_handle = uobj->id; + return uverbs_response(attrs, &resp, sizeof(resp)); -err_copy: - ib_dealloc_pd_user(pd, uverbs_get_cleared_udata(attrs)); - pd = NULL; err_alloc: + rdma_restrack_put(&pd->res); kfree(pd); 
err: uobj_alloc_abort(uobj, attrs); @@ -570,15 +565,15 @@ static void xrcd_table_delete(struct ib_uverbs_device *dev, static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs) { struct ib_uverbs_device *ibudev = attrs->ufile->device; + struct ib_uverbs_open_xrcd_resp resp = {}; struct ib_uverbs_open_xrcd cmd; - struct ib_uverbs_open_xrcd_resp resp; struct ib_uxrcd_object *obj; struct ib_xrcd *xrcd = NULL; - struct fd f = {NULL, 0}; struct inode *inode = NULL; - int ret = 0; int new_xrcd = 0; struct ib_device *ib_dev; + struct fd f = {}; + int ret; ret = uverbs_request(attrs, &cmd, sizeof(cmd)); if (ret) @@ -616,24 +611,16 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs) } if (!xrcd) { - xrcd = ib_dev->ops.alloc_xrcd(ib_dev, &attrs->driver_udata); + xrcd = ib_alloc_xrcd_user(ib_dev, inode, &attrs->driver_udata); if (IS_ERR(xrcd)) { ret = PTR_ERR(xrcd); goto err; } - - xrcd->inode = inode; - xrcd->device = ib_dev; - atomic_set(&xrcd->usecnt, 0); - mutex_init(&xrcd->tgt_qp_mutex); - INIT_LIST_HEAD(&xrcd->tgt_qp_list); new_xrcd = 1; } atomic_set(&obj->refcnt, 0); obj->uobject.object = xrcd; - memset(&resp, 0, sizeof resp); - resp.xrcd_handle = obj->uobject.id; if (inode) { if (new_xrcd) { @@ -645,27 +632,17 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs) atomic_inc(&xrcd->usecnt); } - ret = uverbs_response(attrs, &resp, sizeof(resp)); - if (ret) - goto err_copy; - if (f.file) fdput(f); mutex_unlock(&ibudev->xrcd_tree_mutex); + uobj_finalize_uobj_create(&obj->uobject, attrs); - rdma_alloc_commit_uobject(&obj->uobject, attrs); - return 0; - -err_copy: - if (inode) { - if (new_xrcd) - xrcd_table_delete(ibudev, inode); - atomic_dec(&xrcd->usecnt); - } + resp.xrcd_handle = obj->uobject.id; + return uverbs_response(attrs, &resp, sizeof(resp)); err_dealloc_xrcd: - ib_dealloc_xrcd(xrcd, uverbs_get_cleared_udata(attrs)); + ib_dealloc_xrcd_user(xrcd, uverbs_get_cleared_udata(attrs)); err: uobj_alloc_abort(&obj->uobject, attrs); @@ -703,9 +680,8 @@ int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject, struct ib_xrcd *xrcd, if (inode && !atomic_dec_and_test(&xrcd->usecnt)) return 0; - ret = ib_dealloc_xrcd(xrcd, &attrs->driver_udata); - - if (ib_is_destroy_retryable(ret, why, uobject)) { + ret = ib_dealloc_xrcd_user(xrcd, &attrs->driver_udata); + if (ret) { atomic_inc(&xrcd->usecnt); return ret; } @@ -713,13 +689,13 @@ int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject, struct ib_xrcd *xrcd, if (inode) xrcd_table_delete(dev, inode); - return ret; + return 0; } static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs) { + struct ib_uverbs_reg_mr_resp resp = {}; struct ib_uverbs_reg_mr cmd; - struct ib_uverbs_reg_mr_resp resp; struct ib_uobject *uobj; struct ib_pd *pd; struct ib_mr *mr; @@ -733,29 +709,20 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs) if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)) return -EINVAL; - ret = ib_check_mr_access(cmd.access_flags); - if (ret) - return ret; - uobj = uobj_alloc(UVERBS_OBJECT_MR, attrs, &ib_dev); if (IS_ERR(uobj)) return PTR_ERR(uobj); + ret = ib_check_mr_access(ib_dev, cmd.access_flags); + if (ret) + goto err_free; + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs); if (!pd) { ret = -EINVAL; goto err_free; } - if (cmd.access_flags & IB_ACCESS_ON_DEMAND) { - if (!(pd->device->attrs.device_cap_flags & - IB_DEVICE_ON_DEMAND_PAGING)) { - pr_debug("ODP support not available\n"); - ret = -EINVAL; - goto err_put; - } - } - mr = pd->device->ops.reg_user_mr(pd, cmd.start, 
cmd.length, cmd.hca_va, cmd.access_flags, &attrs->driver_udata); @@ -771,31 +738,24 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs) mr->sig_attrs = NULL; mr->uobject = uobj; atomic_inc(&pd->usecnt); - mr->res.type = RDMA_RESTRACK_MR; - rdma_restrack_uadd(&mr->res); - - uobj->object = mr; - - memset(&resp, 0, sizeof resp); - resp.lkey = mr->lkey; - resp.rkey = mr->rkey; - resp.mr_handle = uobj->id; + mr->iova = cmd.hca_va; + mr->length = cmd.length; - ret = uverbs_response(attrs, &resp, sizeof(resp)); - if (ret) - goto err_copy; + rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); + rdma_restrack_set_name(&mr->res, NULL); + rdma_restrack_add(&mr->res); + uobj->object = mr; uobj_put_obj_read(pd); + uobj_finalize_uobj_create(uobj, attrs); - rdma_alloc_commit_uobject(uobj, attrs); - return 0; - -err_copy: - ib_dereg_mr_user(mr, uverbs_get_cleared_udata(attrs)); + resp.lkey = mr->lkey; + resp.rkey = mr->rkey; + resp.mr_handle = uobj->id; + return uverbs_response(attrs, &resp, sizeof(resp)); err_put: uobj_put_obj_read(pd); - err_free: uobj_alloc_abort(uobj, attrs); return ret; @@ -805,23 +765,28 @@ static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs) { struct ib_uverbs_rereg_mr cmd; struct ib_uverbs_rereg_mr_resp resp; - struct ib_pd *pd = NULL; struct ib_mr *mr; - struct ib_pd *old_pd; int ret; struct ib_uobject *uobj; + struct ib_uobject *new_uobj; + struct ib_device *ib_dev; + struct ib_pd *orig_pd; + struct ib_pd *new_pd; + struct ib_mr *new_mr; ret = uverbs_request(attrs, &cmd, sizeof(cmd)); if (ret) return ret; - if (cmd.flags & ~IB_MR_REREG_SUPPORTED || !cmd.flags) + if (!cmd.flags) return -EINVAL; + if (cmd.flags & ~IB_MR_REREG_SUPPORTED) + return -EOPNOTSUPP; + if ((cmd.flags & IB_MR_REREG_TRANS) && - (!cmd.start || !cmd.hca_va || 0 >= cmd.length || - (cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))) - return -EINVAL; + (cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)) + return -EINVAL; uobj = uobj_get_write(UVERBS_OBJECT_MR, cmd.mr_handle, attrs); if (IS_ERR(uobj)) @@ -835,32 +800,72 @@ static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs) } if (cmd.flags & IB_MR_REREG_ACCESS) { - ret = ib_check_mr_access(cmd.access_flags); + ret = ib_check_mr_access(mr->device, cmd.access_flags); if (ret) goto put_uobjs; } + orig_pd = mr->pd; if (cmd.flags & IB_MR_REREG_PD) { - pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, - attrs); - if (!pd) { + new_pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, + attrs); + if (!new_pd) { ret = -EINVAL; goto put_uobjs; } + } else { + new_pd = mr->pd; } - old_pd = mr->pd; - ret = mr->device->ops.rereg_user_mr(mr, cmd.flags, cmd.start, - cmd.length, cmd.hca_va, - cmd.access_flags, pd, - &attrs->driver_udata); - if (ret) + /* + * The driver might create a new HW object as part of the rereg, we need + * to have a uobject ready to hold it. 
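Editor's sketch: in the rereg path that follows, the driver may hand back a completely new MR; the handle the user already holds is then re-pointed at the new object while the old uobject and MR are destroyed (rdma_assign_uobject() plus committing the new uobject and uobj_put_destroy() on the old one). A toy handle table showing just that swap (purely illustrative, nothing like the real uobject machinery):

#include <stdio.h>
#include <stdlib.h>

struct handle_table { void *slots[16]; };

static void *table_replace(struct handle_table *t, unsigned int handle,
                           void *new_obj)
{
    void *old = t->slots[handle];

    t->slots[handle] = new_obj;     /* user-visible handle is unchanged */
    return old;                     /* caller destroys the old object */
}

int main(void)
{
    struct handle_table t = { { 0 } };
    int *old_mr = malloc(sizeof(*old_mr));
    int *new_mr = malloc(sizeof(*new_mr));

    t.slots[3] = old_mr;                        /* handle 3 -> old MR */
    free(table_replace(&t, 3, new_mr));         /* same handle, new MR */
    printf("handle 3 now maps to %p\n", t.slots[3]);
    free(new_mr);
    return 0;
}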
+ */ + new_uobj = uobj_alloc(UVERBS_OBJECT_MR, attrs, &ib_dev); + if (IS_ERR(new_uobj)) { + ret = PTR_ERR(new_uobj); goto put_uobj_pd; + } - if (cmd.flags & IB_MR_REREG_PD) { - atomic_inc(&pd->usecnt); - mr->pd = pd; - atomic_dec(&old_pd->usecnt); + new_mr = ib_dev->ops.rereg_user_mr(mr, cmd.flags, cmd.start, cmd.length, + cmd.hca_va, cmd.access_flags, new_pd, + &attrs->driver_udata); + if (IS_ERR(new_mr)) { + ret = PTR_ERR(new_mr); + goto put_new_uobj; + } + if (new_mr) { + new_mr->device = new_pd->device; + new_mr->pd = new_pd; + new_mr->type = IB_MR_TYPE_USER; + new_mr->uobject = uobj; + atomic_inc(&new_pd->usecnt); + new_uobj->object = new_mr; + + rdma_restrack_new(&new_mr->res, RDMA_RESTRACK_MR); + rdma_restrack_set_name(&new_mr->res, NULL); + rdma_restrack_add(&new_mr->res); + + /* + * The new uobj for the new HW object is put into the same spot + * in the IDR and the old uobj & HW object is deleted. + */ + rdma_assign_uobject(uobj, new_uobj, attrs); + rdma_alloc_commit_uobject(new_uobj, attrs); + uobj_put_destroy(uobj); + new_uobj = NULL; + uobj = NULL; + mr = new_mr; + } else { + if (cmd.flags & IB_MR_REREG_PD) { + atomic_dec(&orig_pd->usecnt); + mr->pd = new_pd; + atomic_inc(&new_pd->usecnt); + } + if (cmd.flags & IB_MR_REREG_TRANS) { + mr->iova = cmd.hca_va; + mr->length = cmd.length; + } } memset(&resp, 0, sizeof(resp)); @@ -869,12 +874,16 @@ static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs) ret = uverbs_response(attrs, &resp, sizeof(resp)); +put_new_uobj: + if (new_uobj) + uobj_alloc_abort(new_uobj, attrs); put_uobj_pd: if (cmd.flags & IB_MR_REREG_PD) - uobj_put_obj_read(pd); + uobj_put_obj_read(new_pd); put_uobjs: - uobj_put_write(uobj); + if (uobj) + uobj_put_write(uobj); return ret; } @@ -894,7 +903,7 @@ static int ib_uverbs_dereg_mr(struct uverbs_attr_bundle *attrs) static int ib_uverbs_alloc_mw(struct uverbs_attr_bundle *attrs) { struct ib_uverbs_alloc_mw cmd; - struct ib_uverbs_alloc_mw_resp resp; + struct ib_uverbs_alloc_mw_resp resp = {}; struct ib_uobject *uobj; struct ib_pd *pd; struct ib_mw *mw; @@ -920,33 +929,33 @@ static int ib_uverbs_alloc_mw(struct uverbs_attr_bundle *attrs) goto err_put; } - mw = pd->device->ops.alloc_mw(pd, cmd.mw_type, &attrs->driver_udata); - if (IS_ERR(mw)) { - ret = PTR_ERR(mw); + mw = rdma_zalloc_drv_obj(ib_dev, ib_mw); + if (!mw) { + ret = -ENOMEM; goto err_put; } - mw->device = pd->device; - mw->pd = pd; + mw->device = ib_dev; + mw->pd = pd; mw->uobject = uobj; + mw->type = cmd.mw_type; + + ret = pd->device->ops.alloc_mw(mw, &attrs->driver_udata); + if (ret) + goto err_alloc; + atomic_inc(&pd->usecnt); uobj->object = mw; + uobj_put_obj_read(pd); + uobj_finalize_uobj_create(uobj, attrs); - memset(&resp, 0, sizeof(resp)); - resp.rkey = mw->rkey; + resp.rkey = mw->rkey; resp.mw_handle = uobj->id; + return uverbs_response(attrs, &resp, sizeof(resp)); - ret = uverbs_response(attrs, &resp, sizeof(resp)); - if (ret) - goto err_copy; - - uobj_put_obj_read(pd); - rdma_alloc_commit_uobject(uobj, attrs); - return 0; - -err_copy: - uverbs_dealloc_mw(mw); +err_alloc: + kfree(mw); err_put: uobj_put_obj_read(pd); err_free: @@ -983,40 +992,33 @@ static int ib_uverbs_create_comp_channel(struct uverbs_attr_bundle *attrs) if (IS_ERR(uobj)) return PTR_ERR(uobj); - resp.fd = uobj->id; - ev_file = container_of(uobj, struct ib_uverbs_completion_event_file, uobj); ib_uverbs_init_event_queue(&ev_file->ev_queue); + uobj_finalize_uobj_create(uobj, attrs); - ret = uverbs_response(attrs, &resp, sizeof(resp)); - if (ret) { - uobj_alloc_abort(uobj, 
attrs); - return ret; - } - - rdma_alloc_commit_uobject(uobj, attrs); - return 0; + resp.fd = uobj->id; + return uverbs_response(attrs, &resp, sizeof(resp)); } -static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs, - struct ib_uverbs_ex_create_cq *cmd) +static int create_cq(struct uverbs_attr_bundle *attrs, + struct ib_uverbs_ex_create_cq *cmd) { struct ib_ucq_object *obj; struct ib_uverbs_completion_event_file *ev_file = NULL; struct ib_cq *cq; int ret; - struct ib_uverbs_ex_create_cq_resp resp; + struct ib_uverbs_ex_create_cq_resp resp = {}; struct ib_cq_init_attr attr = {}; struct ib_device *ib_dev; if (cmd->comp_vector >= attrs->ufile->device->num_comp_vectors) - return ERR_PTR(-EINVAL); + return -EINVAL; obj = (struct ib_ucq_object *)uobj_alloc(UVERBS_OBJECT_CQ, attrs, &ib_dev); if (IS_ERR(obj)) - return obj; + return PTR_ERR(obj); if (cmd->comp_channel >= 0) { ev_file = ib_uverbs_lookup_comp_file(cmd->comp_channel, attrs); @@ -1046,46 +1048,40 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs, cq->cq_context = ev_file ? &ev_file->ev_queue : NULL; atomic_set(&cq->usecnt, 0); + rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); + rdma_restrack_set_name(&cq->res, NULL); + ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata); if (ret) goto err_free; + rdma_restrack_add(&cq->res); obj->uevent.uobject.object = cq; - memset(&resp, 0, sizeof resp); + obj->uevent.event_file = READ_ONCE(attrs->ufile->default_async_file); + if (obj->uevent.event_file) + uverbs_uobject_get(&obj->uevent.event_file->uobj); + uobj_finalize_uobj_create(&obj->uevent.uobject, attrs); + resp.base.cq_handle = obj->uevent.uobject.id; - resp.base.cqe = cq->cqe; + resp.base.cqe = cq->cqe; resp.response_length = uverbs_response_length(attrs, sizeof(resp)); + return uverbs_response(attrs, &resp, sizeof(resp)); - cq->res.type = RDMA_RESTRACK_CQ; - rdma_restrack_uadd(&cq->res); - - ret = uverbs_response(attrs, &resp, sizeof(resp)); - if (ret) - goto err_cb; - - rdma_alloc_commit_uobject(&obj->uevent.uobject, attrs); - return obj; - -err_cb: - ib_destroy_cq_user(cq, uverbs_get_cleared_udata(attrs)); - cq = NULL; err_free: + rdma_restrack_put(&cq->res); kfree(cq); err_file: if (ev_file) ib_uverbs_release_ucq(ev_file, obj); - err: uobj_alloc_abort(&obj->uevent.uobject, attrs); - - return ERR_PTR(ret); + return ret; } static int ib_uverbs_create_cq(struct uverbs_attr_bundle *attrs) { struct ib_uverbs_create_cq cmd; struct ib_uverbs_ex_create_cq cmd_ex; - struct ib_ucq_object *obj; int ret; ret = uverbs_request(attrs, &cmd, sizeof(cmd)); @@ -1098,14 +1094,12 @@ static int ib_uverbs_create_cq(struct uverbs_attr_bundle *attrs) cmd_ex.comp_vector = cmd.comp_vector; cmd_ex.comp_channel = cmd.comp_channel; - obj = create_cq(attrs, &cmd_ex); - return PTR_ERR_OR_ZERO(obj); + return create_cq(attrs, &cmd_ex); } static int ib_uverbs_ex_create_cq(struct uverbs_attr_bundle *attrs) { struct ib_uverbs_ex_create_cq cmd; - struct ib_ucq_object *obj; int ret; ret = uverbs_request(attrs, &cmd, sizeof(cmd)); @@ -1118,8 +1112,7 @@ static int ib_uverbs_ex_create_cq(struct uverbs_attr_bundle *attrs) if (cmd.reserved) return -EINVAL; - obj = create_cq(attrs, &cmd); - return PTR_ERR_OR_ZERO(obj); + return create_cq(attrs, &cmd); } static int ib_uverbs_resize_cq(struct uverbs_attr_bundle *attrs) @@ -1127,7 +1120,7 @@ static int ib_uverbs_resize_cq(struct uverbs_attr_bundle *attrs) struct ib_uverbs_resize_cq cmd; struct ib_uverbs_resize_cq_resp resp = {}; struct ib_cq *cq; - int ret = -EINVAL; + int ret; ret 
= uverbs_request(attrs, &cmd, sizeof(cmd)); if (ret) @@ -1294,14 +1287,27 @@ static int create_qp(struct uverbs_attr_bundle *attrs, struct ib_srq *srq = NULL; struct ib_qp *qp; struct ib_qp_init_attr attr = {}; - struct ib_uverbs_ex_create_qp_resp resp; + struct ib_uverbs_ex_create_qp_resp resp = {}; int ret; struct ib_rwq_ind_table *ind_tbl = NULL; bool has_sq = true; struct ib_device *ib_dev; - if (cmd->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW)) - return -EPERM; + switch (cmd->qp_type) { + case IB_QPT_RAW_PACKET: + if (!capable(CAP_NET_RAW)) + return -EPERM; + break; + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + case IB_QPT_XRC_INI: + case IB_QPT_XRC_TGT: + case IB_QPT_DRIVER: + break; + default: + return -EINVAL; + } obj = (struct ib_uqp_object *)uobj_alloc(UVERBS_OBJECT_QP, attrs, &ib_dev); @@ -1376,7 +1382,7 @@ static int create_qp(struct uverbs_attr_bundle *attrs, if (has_sq) scq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd->send_cq_handle, attrs); - if (!ind_tbl) + if (!ind_tbl && cmd->qp_type != IB_QPT_XRC_INI) rcq = rcq ?: scq; pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, attrs); @@ -1396,7 +1402,6 @@ static int create_qp(struct uverbs_attr_bundle *attrs, attr.sq_sig_type = cmd->sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; attr.qp_type = cmd->qp_type; - attr.create_flags = 0; attr.cap.max_send_wr = cmd->max_send_wr; attr.cap.max_recv_wr = cmd->max_recv_wr; @@ -1429,51 +1434,18 @@ static int create_qp(struct uverbs_attr_bundle *attrs, attr.source_qpn = cmd->source_qpn; } - if (cmd->qp_type == IB_QPT_XRC_TGT) - qp = ib_create_qp(pd, &attr); - else - qp = _ib_create_qp(device, pd, &attr, &attrs->driver_udata, - obj); - + qp = ib_create_qp_user(device, pd, &attr, &attrs->driver_udata, obj, + KBUILD_MODNAME); if (IS_ERR(qp)) { ret = PTR_ERR(qp); goto err_put; } - - if (cmd->qp_type != IB_QPT_XRC_TGT) { - ret = ib_create_qp_security(qp, device); - if (ret) - goto err_cb; - - atomic_inc(&pd->usecnt); - if (attr.send_cq) - atomic_inc(&attr.send_cq->usecnt); - if (attr.recv_cq) - atomic_inc(&attr.recv_cq->usecnt); - if (attr.srq) - atomic_inc(&attr.srq->usecnt); - if (ind_tbl) - atomic_inc(&ind_tbl->usecnt); - } else { - /* It is done in _ib_create_qp for other QP types */ - qp->uobject = obj; - } + ib_qp_usecnt_inc(qp); obj->uevent.uobject.object = qp; - - memset(&resp, 0, sizeof resp); - resp.base.qpn = qp->qp_num; - resp.base.qp_handle = obj->uevent.uobject.id; - resp.base.max_recv_sge = attr.cap.max_recv_sge; - resp.base.max_send_sge = attr.cap.max_send_sge; - resp.base.max_recv_wr = attr.cap.max_recv_wr; - resp.base.max_send_wr = attr.cap.max_send_wr; - resp.base.max_inline_data = attr.cap.max_inline_data; - resp.response_length = uverbs_response_length(attrs, sizeof(resp)); - - ret = uverbs_response(attrs, &resp, sizeof(resp)); - if (ret) - goto err_cb; + obj->uevent.event_file = READ_ONCE(attrs->ufile->default_async_file); + if (obj->uevent.event_file) + uverbs_uobject_get(&obj->uevent.event_file->uobj); if (xrcd) { obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, @@ -1495,11 +1467,17 @@ static int create_qp(struct uverbs_attr_bundle *attrs, UVERBS_LOOKUP_READ); if (ind_tbl) uobj_put_obj_read(ind_tbl); + uobj_finalize_uobj_create(&obj->uevent.uobject, attrs); - rdma_alloc_commit_uobject(&obj->uevent.uobject, attrs); - return 0; -err_cb: - ib_destroy_qp_user(qp, uverbs_get_cleared_udata(attrs)); + resp.base.qpn = qp->qp_num; + resp.base.qp_handle = obj->uevent.uobject.id; + resp.base.max_recv_sge = attr.cap.max_recv_sge; + 
resp.base.max_send_sge = attr.cap.max_send_sge; + resp.base.max_recv_wr = attr.cap.max_recv_wr; + resp.base.max_send_wr = attr.cap.max_send_wr; + resp.base.max_inline_data = attr.cap.max_inline_data; + resp.response_length = uverbs_response_length(attrs, sizeof(resp)); + return uverbs_response(attrs, &resp, sizeof(resp)); err_put: if (!IS_ERR(xrcd_uobj)) @@ -1570,14 +1548,14 @@ static int ib_uverbs_ex_create_qp(struct uverbs_attr_bundle *attrs) static int ib_uverbs_open_qp(struct uverbs_attr_bundle *attrs) { + struct ib_uverbs_create_qp_resp resp = {}; struct ib_uverbs_open_qp cmd; - struct ib_uverbs_create_qp_resp resp; struct ib_uqp_object *obj; struct ib_xrcd *xrcd; - struct ib_uobject *uninitialized_var(xrcd_uobj); struct ib_qp *qp; struct ib_qp_open_attr attr = {}; int ret; + struct ib_uobject *xrcd_uobj; struct ib_device *ib_dev; ret = uverbs_request(attrs, &cmd, sizeof(cmd)); @@ -1617,24 +1595,16 @@ static int ib_uverbs_open_qp(struct uverbs_attr_bundle *attrs) obj->uevent.uobject.object = qp; obj->uevent.uobject.user_handle = cmd.user_handle; - memset(&resp, 0, sizeof resp); - resp.qpn = qp->qp_num; - resp.qp_handle = obj->uevent.uobject.id; - - ret = uverbs_response(attrs, &resp, sizeof(resp)); - if (ret) - goto err_destroy; - obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); atomic_inc(&obj->uxrcd->refcnt); qp->uobject = obj; uobj_put_read(xrcd_uobj); + uobj_finalize_uobj_create(&obj->uevent.uobject, attrs); - rdma_alloc_commit_uobject(&obj->uevent.uobject, attrs); - return 0; + resp.qpn = qp->qp_num; + resp.qp_handle = obj->uevent.uobject.id; + return uverbs_response(attrs, &resp, sizeof(resp)); -err_destroy: - ib_destroy_qp_user(qp, uverbs_get_cleared_udata(attrs)); err_xrcd: uobj_put_read(xrcd_uobj); err_put: @@ -1947,8 +1917,7 @@ static int ib_uverbs_modify_qp(struct uverbs_attr_bundle *attrs) if (ret) return ret; - if (cmd.base.attr_mask & - ~((IB_USER_LEGACY_LAST_QP_ATTR_MASK << 1) - 1)) + if (cmd.base.attr_mask & ~IB_QP_ATTR_STANDARD_BITS) return -EOPNOTSUPP; return modify_qp(attrs, &cmd); @@ -1970,10 +1939,7 @@ static int ib_uverbs_ex_modify_qp(struct uverbs_attr_bundle *attrs) * Last bit is reserved for extending the attr_mask by * using another field. 
*/ - BUILD_BUG_ON(IB_USER_LAST_QP_ATTR_MASK == (1 << 31)); - - if (cmd.base.attr_mask & - ~((IB_USER_LAST_QP_ATTR_MASK << 1) - 1)) + if (cmd.base.attr_mask & ~(IB_QP_ATTR_STANDARD_BITS | IB_QP_RATE_LIMIT)) return -EOPNOTSUPP; ret = modify_qp(attrs, &cmd); @@ -2010,12 +1976,13 @@ static int ib_uverbs_destroy_qp(struct uverbs_attr_bundle *attrs) static void *alloc_wr(size_t wr_size, __u32 num_sge) { - if (num_sge >= (U32_MAX - ALIGN(wr_size, sizeof (struct ib_sge))) / - sizeof (struct ib_sge)) + if (num_sge >= (U32_MAX - ALIGN(wr_size, sizeof(struct ib_sge))) / + sizeof(struct ib_sge)) return NULL; - return kmalloc(ALIGN(wr_size, sizeof (struct ib_sge)) + - num_sge * sizeof (struct ib_sge), GFP_KERNEL); + return kmalloc(ALIGN(wr_size, sizeof(struct ib_sge)) + + num_sge * sizeof(struct ib_sge), + GFP_KERNEL); } static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs) @@ -2224,7 +2191,7 @@ ib_uverbs_unmarshall_recv(struct uverbs_req_iter *iter, u32 wr_count, const struct ib_sge __user *sgls; const void __user *wqes; - if (wqe_size < sizeof (struct ib_uverbs_recv_wr)) + if (wqe_size < sizeof(struct ib_uverbs_recv_wr)) return ERR_PTR(-EINVAL); wqes = uverbs_request_next_ptr(iter, wqe_size * wr_count); @@ -2257,14 +2224,14 @@ ib_uverbs_unmarshall_recv(struct uverbs_req_iter *iter, u32 wr_count, } if (user_wr->num_sge >= - (U32_MAX - ALIGN(sizeof *next, sizeof (struct ib_sge))) / - sizeof (struct ib_sge)) { + (U32_MAX - ALIGN(sizeof(*next), sizeof(struct ib_sge))) / + sizeof(struct ib_sge)) { ret = -EINVAL; goto err; } - next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) + - user_wr->num_sge * sizeof (struct ib_sge), + next = kmalloc(ALIGN(sizeof(*next), sizeof(struct ib_sge)) + + user_wr->num_sge * sizeof(struct ib_sge), GFP_KERNEL); if (!next) { ret = -ENOMEM; @@ -2282,8 +2249,8 @@ ib_uverbs_unmarshall_recv(struct uverbs_req_iter *iter, u32 wr_count, next->num_sge = user_wr->num_sge; if (next->num_sge) { - next->sg_list = (void *) next + - ALIGN(sizeof *next, sizeof (struct ib_sge)); + next->sg_list = (void *)next + + ALIGN(sizeof(*next), sizeof(struct ib_sge)); if (copy_from_user(next->sg_list, sgls + sg_ind, next->num_sge * sizeof(struct ib_sge))) { @@ -2470,24 +2437,14 @@ static int ib_uverbs_create_ah(struct uverbs_attr_bundle *attrs) ah->uobject = uobj; uobj->user_handle = cmd.user_handle; uobj->object = ah; - - resp.ah_handle = uobj->id; - - ret = uverbs_response(attrs, &resp, sizeof(resp)); - if (ret) - goto err_copy; - uobj_put_obj_read(pd); - rdma_alloc_commit_uobject(uobj, attrs); - return 0; + uobj_finalize_uobj_create(uobj, attrs); -err_copy: - rdma_destroy_ah_user(ah, RDMA_DESTROY_AH_SLEEPABLE, - uverbs_get_cleared_udata(attrs)); + resp.ah_handle = uobj->id; + return uverbs_response(attrs, &resp, sizeof(resp)); err_put: uobj_put_obj_read(pd); - err: uobj_alloc_abort(uobj, attrs); return ret; @@ -2954,11 +2911,11 @@ static int ib_uverbs_ex_create_wq(struct uverbs_attr_bundle *attrs) wq_init_attr.cq = cq; wq_init_attr.max_sge = cmd.max_sge; wq_init_attr.max_wr = cmd.max_wr; - wq_init_attr.wq_context = attrs->ufile; wq_init_attr.wq_type = cmd.wq_type; wq_init_attr.event_handler = ib_uverbs_wq_event_handler; wq_init_attr.create_flags = cmd.create_flags; INIT_LIST_HEAD(&obj->uevent.event_list); + obj->uevent.uobject.user_handle = cmd.user_handle; wq = pd->device->ops.create_wq(pd, &wq_init_attr, &attrs->driver_udata); if (IS_ERR(wq)) { @@ -2972,31 +2929,25 @@ static int ib_uverbs_ex_create_wq(struct uverbs_attr_bundle *attrs) wq->cq = cq; wq->pd = pd; wq->device = 
pd->device; - wq->wq_context = wq_init_attr.wq_context; atomic_set(&wq->usecnt, 0); atomic_inc(&pd->usecnt); atomic_inc(&cq->usecnt); - wq->uobject = obj; - obj->uevent.uobject.object = wq; + obj->uevent.event_file = READ_ONCE(attrs->ufile->default_async_file); + if (obj->uevent.event_file) + uverbs_uobject_get(&obj->uevent.event_file->uobj); + + uobj_put_obj_read(pd); + rdma_lookup_put_uobject(&cq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + uobj_finalize_uobj_create(&obj->uevent.uobject, attrs); - memset(&resp, 0, sizeof(resp)); resp.wq_handle = obj->uevent.uobject.id; resp.max_sge = wq_init_attr.max_sge; resp.max_wr = wq_init_attr.max_wr; resp.wqn = wq->wq_num; resp.response_length = uverbs_response_length(attrs, sizeof(resp)); - err = uverbs_response(attrs, &resp, sizeof(resp)); - if (err) - goto err_copy; - - uobj_put_obj_read(pd); - rdma_lookup_put_uobject(&cq->uobject->uevent.uobject, - UVERBS_LOOKUP_READ); - rdma_alloc_commit_uobject(&obj->uevent.uobject, attrs); - return 0; + return uverbs_response(attrs, &resp, sizeof(resp)); -err_copy: - ib_destroy_wq(wq, uverbs_get_cleared_udata(attrs)); err_put_cq: rdma_lookup_put_uobject(&cq->uobject->uevent.uobject, UVERBS_LOOKUP_READ); @@ -3057,12 +3008,29 @@ static int ib_uverbs_ex_modify_wq(struct uverbs_attr_bundle *attrs) if (!wq) return -EINVAL; - wq_attr.curr_wq_state = cmd.curr_wq_state; - wq_attr.wq_state = cmd.wq_state; if (cmd.attr_mask & IB_WQ_FLAGS) { wq_attr.flags = cmd.flags; wq_attr.flags_mask = cmd.flags_mask; } + + if (cmd.attr_mask & IB_WQ_CUR_STATE) { + if (cmd.curr_wq_state > IB_WQS_ERR) + return -EINVAL; + + wq_attr.curr_wq_state = cmd.curr_wq_state; + } else { + wq_attr.curr_wq_state = wq->state; + } + + if (cmd.attr_mask & IB_WQ_STATE) { + if (cmd.wq_state > IB_WQS_ERR) + return -EINVAL; + + wq_attr.wq_state = cmd.wq_state; + } else { + wq_attr.wq_state = wq_attr.curr_wq_state; + } + ret = wq->device->ops.modify_wq(wq, &wq_attr, cmd.attr_mask, &attrs->driver_udata); rdma_lookup_put_uobject(&wq->uobject->uevent.uobject, @@ -3074,14 +3042,14 @@ static int ib_uverbs_ex_create_rwq_ind_table(struct uverbs_attr_bundle *attrs) { struct ib_uverbs_ex_create_rwq_ind_table cmd; struct ib_uverbs_ex_create_rwq_ind_table_resp resp = {}; - struct ib_uobject *uobj; + struct ib_uobject *uobj; int err; struct ib_rwq_ind_table_init_attr init_attr = {}; struct ib_rwq_ind_table *rwq_ind_tbl; - struct ib_wq **wqs = NULL; + struct ib_wq **wqs = NULL; u32 *wqs_handles = NULL; struct ib_wq *wq = NULL; - int i, j, num_read_wqs; + int i, num_read_wqs; u32 num_wq_handles; struct uverbs_req_iter iter; struct ib_device *ib_dev; @@ -3127,6 +3095,7 @@ static int ib_uverbs_ex_create_rwq_ind_table(struct uverbs_attr_bundle *attrs) } wqs[num_read_wqs] = wq; + atomic_inc(&wqs[num_read_wqs]->usecnt); } uobj = uobj_alloc(UVERBS_OBJECT_RWQ_IND_TBL, attrs, &ib_dev); @@ -3135,17 +3104,15 @@ static int ib_uverbs_ex_create_rwq_ind_table(struct uverbs_attr_bundle *attrs) goto put_wqs; } - init_attr.log_ind_tbl_size = cmd.log_ind_tbl_size; - init_attr.ind_tbl = wqs; - - rwq_ind_tbl = ib_dev->ops.create_rwq_ind_table(ib_dev, &init_attr, - &attrs->driver_udata); - - if (IS_ERR(rwq_ind_tbl)) { - err = PTR_ERR(rwq_ind_tbl); + rwq_ind_tbl = rdma_zalloc_drv_obj(ib_dev, ib_rwq_ind_table); + if (!rwq_ind_tbl) { + err = -ENOMEM; goto err_uobj; } + init_attr.log_ind_tbl_size = cmd.log_ind_tbl_size; + init_attr.ind_tbl = wqs; + rwq_ind_tbl->ind_tbl = wqs; rwq_ind_tbl->log_ind_tbl_size = init_attr.log_ind_tbl_size; rwq_ind_tbl->uobject = uobj; @@ -3153,34 +3120,32 @@ 
static int ib_uverbs_ex_create_rwq_ind_table(struct uverbs_attr_bundle *attrs) rwq_ind_tbl->device = ib_dev; atomic_set(&rwq_ind_tbl->usecnt, 0); + err = ib_dev->ops.create_rwq_ind_table(rwq_ind_tbl, &init_attr, + &attrs->driver_udata); + if (err) + goto err_create; + for (i = 0; i < num_wq_handles; i++) - atomic_inc(&wqs[i]->usecnt); + rdma_lookup_put_uobject(&wqs[i]->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + kfree(wqs_handles); + uobj_finalize_uobj_create(uobj, attrs); resp.ind_tbl_handle = uobj->id; resp.ind_tbl_num = rwq_ind_tbl->ind_tbl_num; resp.response_length = uverbs_response_length(attrs, sizeof(resp)); + return uverbs_response(attrs, &resp, sizeof(resp)); - err = uverbs_response(attrs, &resp, sizeof(resp)); - if (err) - goto err_copy; - - kfree(wqs_handles); - - for (j = 0; j < num_read_wqs; j++) - rdma_lookup_put_uobject(&wqs[j]->uobject->uevent.uobject, - UVERBS_LOOKUP_READ); - - rdma_alloc_commit_uobject(uobj, attrs); - return 0; - -err_copy: - ib_destroy_rwq_ind_table(rwq_ind_tbl); +err_create: + kfree(rwq_ind_tbl); err_uobj: uobj_alloc_abort(uobj, attrs); put_wqs: - for (j = 0; j < num_read_wqs; j++) - rdma_lookup_put_uobject(&wqs[j]->uobject->uevent.uobject, + for (i = 0; i < num_read_wqs; i++) { + rdma_lookup_put_uobject(&wqs[i]->uobject->uevent.uobject, UVERBS_LOOKUP_READ); + atomic_dec(&wqs[i]->usecnt); + } err_free: kfree(wqs_handles); kfree(wqs); @@ -3206,7 +3171,7 @@ static int ib_uverbs_ex_destroy_rwq_ind_table(struct uverbs_attr_bundle *attrs) static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs) { struct ib_uverbs_create_flow cmd; - struct ib_uverbs_create_flow_resp resp; + struct ib_uverbs_create_flow_resp resp = {}; struct ib_uobject *uobj; struct ib_flow *flow_id; struct ib_uverbs_flow_attr *kern_flow_attr; @@ -3274,6 +3239,11 @@ static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs) goto err_free_attr; } + if (!rdma_is_port_valid(uobj->context->device, cmd.flow_attr.port)) { + err = -EINVAL; + goto err_uobj; + } + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); if (!qp) { err = -EINVAL; @@ -3323,14 +3293,14 @@ static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs) ib_spec += ((union ib_flow_spec *) ib_spec)->size; } if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) { - pr_warn("create flow failed, flow %d: %d bytes left from uverb cmd\n", + pr_warn("create flow failed, flow %d: %u bytes left from uverb cmd\n", i, cmd.flow_attr.size); err = -EINVAL; goto err_free; } - flow_id = qp->device->ops.create_flow( - qp, flow_attr, IB_FLOW_DOMAIN_USER, &attrs->driver_udata); + flow_id = qp->device->ops.create_flow(qp, flow_attr, + &attrs->driver_udata); if (IS_ERR(flow_id)) { err = PTR_ERR(flow_id); @@ -3339,23 +3309,17 @@ static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs) ib_set_flow(uobj, flow_id, qp, qp->device, uflow_res); - memset(&resp, 0, sizeof(resp)); - resp.flow_handle = uobj->id; - - err = uverbs_response(attrs, &resp, sizeof(resp)); - if (err) - goto err_copy; - rdma_lookup_put_uobject(&qp->uobject->uevent.uobject, UVERBS_LOOKUP_READ); kfree(flow_attr); + if (cmd.flow_attr.num_of_specs) kfree(kern_flow_attr); - rdma_alloc_commit_uobject(uobj, attrs); - return 0; -err_copy: - if (!qp->device->ops.destroy_flow(flow_id)) - atomic_dec(&qp->usecnt); + uobj_finalize_uobj_create(uobj, attrs); + + resp.flow_handle = uobj->id; + return uverbs_response(attrs, &resp, sizeof(resp)); + err_free: ib_uverbs_flow_resources_free(uflow_res); err_free_flow_attr: @@ -3390,13 
+3354,13 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs, struct ib_uverbs_create_xsrq *cmd, struct ib_udata *udata) { - struct ib_uverbs_create_srq_resp resp; + struct ib_uverbs_create_srq_resp resp = {}; struct ib_usrq_object *obj; struct ib_pd *pd; struct ib_srq *srq; - struct ib_uobject *uninitialized_var(xrcd_uobj); struct ib_srq_init_attr attr; int ret; + struct ib_uobject *xrcd_uobj; struct ib_device *ib_dev; obj = (struct ib_usrq_object *)uobj_alloc(UVERBS_OBJECT_SRQ, attrs, @@ -3441,58 +3405,29 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs, } attr.event_handler = ib_uverbs_srq_event_handler; - attr.srq_context = attrs->ufile; attr.srq_type = cmd->srq_type; attr.attr.max_wr = cmd->max_wr; attr.attr.max_sge = cmd->max_sge; attr.attr.srq_limit = cmd->srq_limit; INIT_LIST_HEAD(&obj->uevent.event_list); + obj->uevent.uobject.user_handle = cmd->user_handle; - srq = rdma_zalloc_drv_obj(ib_dev, ib_srq); - if (!srq) { - ret = -ENOMEM; - goto err_put; - } - - srq->device = pd->device; - srq->pd = pd; - srq->srq_type = cmd->srq_type; - srq->uobject = obj; - srq->event_handler = attr.event_handler; - srq->srq_context = attr.srq_context; - - ret = pd->device->ops.create_srq(srq, &attr, udata); - if (ret) - goto err_free; - - if (ib_srq_has_cq(cmd->srq_type)) { - srq->ext.cq = attr.ext.cq; - atomic_inc(&attr.ext.cq->usecnt); - } - - if (cmd->srq_type == IB_SRQT_XRC) { - srq->ext.xrc.xrcd = attr.ext.xrc.xrcd; - atomic_inc(&attr.ext.xrc.xrcd->usecnt); + srq = ib_create_srq_user(pd, &attr, obj, udata); + if (IS_ERR(srq)) { + ret = PTR_ERR(srq); + goto err_put_pd; } - atomic_inc(&pd->usecnt); - atomic_set(&srq->usecnt, 0); - obj->uevent.uobject.object = srq; obj->uevent.uobject.user_handle = cmd->user_handle; + obj->uevent.event_file = READ_ONCE(attrs->ufile->default_async_file); + if (obj->uevent.event_file) + uverbs_uobject_get(&obj->uevent.event_file->uobj); - memset(&resp, 0, sizeof resp); - resp.srq_handle = obj->uevent.uobject.id; - resp.max_wr = attr.attr.max_wr; - resp.max_sge = attr.attr.max_sge; if (cmd->srq_type == IB_SRQT_XRC) resp.srqn = srq->ext.xrc.srq_num; - ret = uverbs_response(attrs, &resp, sizeof(resp)); - if (ret) - goto err_copy; - if (cmd->srq_type == IB_SRQT_XRC) uobj_put_read(xrcd_uobj); @@ -3501,18 +3436,15 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs, UVERBS_LOOKUP_READ); uobj_put_obj_read(pd); - rdma_alloc_commit_uobject(&obj->uevent.uobject, attrs); - return 0; + uobj_finalize_uobj_create(&obj->uevent.uobject, attrs); -err_copy: - ib_destroy_srq_user(srq, uverbs_get_cleared_udata(attrs)); - /* It was released in ib_destroy_srq_user */ - srq = NULL; -err_free: - kfree(srq); -err_put: - uobj_put_obj_read(pd); + resp.srq_handle = obj->uevent.uobject.id; + resp.max_wr = attr.attr.max_wr; + resp.max_sge = attr.attr.max_sge; + return uverbs_response(attrs, &resp, sizeof(resp)); +err_put_pd: + uobj_put_obj_read(pd); err_put_cq: if (ib_srq_has_cq(cmd->srq_type)) rdma_lookup_put_uobject(&attr.ext.cq->uobject->uevent.uobject, @@ -3751,7 +3683,7 @@ static int ib_uverbs_ex_modify_cq(struct uverbs_attr_bundle *attrs) #define UAPI_DEF_WRITE_IO(req, resp) \ .write.has_resp = 1 + \ BUILD_BUG_ON_ZERO(offsetof(req, response) != 0) + \ - BUILD_BUG_ON_ZERO(sizeof(((req *)0)->response) != \ + BUILD_BUG_ON_ZERO(sizeof_field(req, response) != \ sizeof(u64)), \ .write.req_size = sizeof(req), .write.resp_size = sizeof(resp) @@ -3791,13 +3723,13 @@ const struct uapi_definition uverbs_def_write_intf[] = { ib_uverbs_create_ah, 
UAPI_DEF_WRITE_UDATA_IO( struct ib_uverbs_create_ah, - struct ib_uverbs_create_ah_resp), - UAPI_DEF_METHOD_NEEDS_FN(create_ah)), + struct ib_uverbs_create_ah_resp)), DECLARE_UVERBS_WRITE( IB_USER_VERBS_CMD_DESTROY_AH, ib_uverbs_destroy_ah, - UAPI_DEF_WRITE_I(struct ib_uverbs_destroy_ah), - UAPI_DEF_METHOD_NEEDS_FN(destroy_ah))), + UAPI_DEF_WRITE_I(struct ib_uverbs_destroy_ah)), + UAPI_DEF_OBJ_NEEDS_FN(create_user_ah), + UAPI_DEF_OBJ_NEEDS_FN(destroy_ah)), DECLARE_UVERBS_OBJECT( UVERBS_OBJECT_COMP_CHANNEL, @@ -3851,7 +3783,7 @@ const struct uapi_definition uverbs_def_write_intf[] = { IB_USER_VERBS_EX_CMD_MODIFY_CQ, ib_uverbs_ex_modify_cq, UAPI_DEF_WRITE_I(struct ib_uverbs_ex_modify_cq), - UAPI_DEF_METHOD_NEEDS_FN(create_cq))), + UAPI_DEF_METHOD_NEEDS_FN(modify_cq))), DECLARE_UVERBS_OBJECT( UVERBS_OBJECT_DEVICE, @@ -4097,8 +4029,7 @@ const struct uapi_definition uverbs_def_write_intf[] = { DECLARE_UVERBS_WRITE( IB_USER_VERBS_CMD_CLOSE_XRCD, ib_uverbs_close_xrcd, - UAPI_DEF_WRITE_I(struct ib_uverbs_close_xrcd), - UAPI_DEF_METHOD_NEEDS_FN(dealloc_xrcd)), + UAPI_DEF_WRITE_I(struct ib_uverbs_close_xrcd)), DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_OPEN_QP, ib_uverbs_open_qp, UAPI_DEF_WRITE_UDATA_IO( @@ -4108,8 +4039,9 @@ const struct uapi_definition uverbs_def_write_intf[] = { ib_uverbs_open_xrcd, UAPI_DEF_WRITE_UDATA_IO( struct ib_uverbs_open_xrcd, - struct ib_uverbs_open_xrcd_resp), - UAPI_DEF_METHOD_NEEDS_FN(alloc_xrcd))), + struct ib_uverbs_open_xrcd_resp)), + UAPI_DEF_OBJ_NEEDS_FN(alloc_xrcd), + UAPI_DEF_OBJ_NEEDS_FN(dealloc_xrcd)), {}, }; diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 538affbc517e..d9799706c58e 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -58,6 +58,7 @@ struct bundle_priv { DECLARE_BITMAP(uobj_finalize, UVERBS_API_ATTR_BKEY_LEN); DECLARE_BITMAP(spec_finalize, UVERBS_API_ATTR_BKEY_LEN); + DECLARE_BITMAP(uobj_hw_obj_valid, UVERBS_API_ATTR_BKEY_LEN); /* * Must be last. bundle ends in a flex array which overlaps @@ -90,7 +91,7 @@ void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm, } /** - * uverbs_alloc() - Quickly allocate memory for use with a bundle + * _uverbs_alloc() - Quickly allocate memory for use with a bundle * @bundle: The bundle * @size: Number of bytes to allocate * @flags: Allocator flags @@ -136,7 +137,7 @@ EXPORT_SYMBOL(_uverbs_alloc); static bool uverbs_is_attr_cleared(const struct ib_uverbs_attr *uattr, u16 len) { - if (uattr->len > sizeof(((struct ib_uverbs_attr *)0)->data)) + if (uattr->len > sizeof_field(struct ib_uverbs_attr, data)) return ib_is_buffer_cleared(u64_to_user_ptr(uattr->data) + len, uattr->len - len); @@ -230,7 +231,8 @@ static void uverbs_free_idrs_array(const struct uverbs_api_attr *attr_uapi, for (i = 0; i != attr->len; i++) uverbs_finalize_object(attr->uobjects[i], - spec->u2.objs_arr.access, commit, attrs); + spec->u2.objs_arr.access, false, commit, + attrs); } static int uverbs_process_attr(struct bundle_priv *pbundle, @@ -257,7 +259,7 @@ static int uverbs_process_attr(struct bundle_priv *pbundle, return -EOPNOTSUPP; e->ptr_attr.enum_id = uattr->attr_data.enum_data.elem_id; - /* fall through */ + fallthrough; case UVERBS_ATTR_TYPE_PTR_IN: /* Ensure that any data provided by userspace beyond the known * struct is zero. 
Userspace that knows how to use some future @@ -269,7 +271,7 @@ static int uverbs_process_attr(struct bundle_priv *pbundle, !uverbs_is_attr_cleared(uattr, val_spec->u.ptr.len)) return -EOPNOTSUPP; - /* fall through */ + fallthrough; case UVERBS_ATTR_TYPE_PTR_OUT: if (uattr->len < val_spec->u.ptr.min_len || (!val_spec->zero_trailing && @@ -335,6 +337,14 @@ static int uverbs_process_attr(struct bundle_priv *pbundle, break; + case UVERBS_ATTR_TYPE_RAW_FD: + if (uattr->attr_data.reserved || uattr->len != 0 || + uattr->data_s64 < INT_MIN || uattr->data_s64 > INT_MAX) + return -EINVAL; + /* _uverbs_get_const_signed() is the accessor */ + e->ptr_attr.data = uattr->data_s64; + break; + case UVERBS_ATTR_TYPE_IDRS_ARRAY: return uverbs_process_idrs_array(pbundle, attr_uapi, &e->objs_arr_attr, uattr, @@ -502,7 +512,9 @@ static void bundle_destroy(struct bundle_priv *pbundle, bool commit) uverbs_finalize_object( attr->obj_attr.uobject, - attr->obj_attr.attr_elm->spec.u.obj.access, commit, + attr->obj_attr.attr_elm->spec.u.obj.access, + test_bit(i, pbundle->uobj_hw_obj_valid), + commit, &pbundle->bundle); } @@ -590,6 +602,8 @@ static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile, sizeof(pbundle->bundle.attr_present)); memset(pbundle->uobj_finalize, 0, sizeof(pbundle->uobj_finalize)); memset(pbundle->spec_finalize, 0, sizeof(pbundle->spec_finalize)); + memset(pbundle->uobj_hw_obj_valid, 0, + sizeof(pbundle->uobj_hw_obj_valid)); ret = ib_uverbs_run_method(pbundle, hdr->num_attrs); bundle_destroy(pbundle, ret == 0); @@ -746,9 +760,10 @@ int uverbs_output_written(const struct uverbs_attr_bundle *bundle, size_t idx) return uverbs_set_output(bundle, attr); } -int _uverbs_get_const(s64 *to, const struct uverbs_attr_bundle *attrs_bundle, - size_t idx, s64 lower_bound, u64 upper_bound, - s64 *def_val) +int _uverbs_get_const_signed(s64 *to, + const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, s64 lower_bound, u64 upper_bound, + s64 *def_val) { const struct uverbs_attr *attr; @@ -767,7 +782,30 @@ int _uverbs_get_const(s64 *to, const struct uverbs_attr_bundle *attrs_bundle, return 0; } -EXPORT_SYMBOL(_uverbs_get_const); +EXPORT_SYMBOL(_uverbs_get_const_signed); + +int _uverbs_get_const_unsigned(u64 *to, + const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, u64 upper_bound, u64 *def_val) +{ + const struct uverbs_attr *attr; + + attr = uverbs_attr_get(attrs_bundle, idx); + if (IS_ERR(attr)) { + if ((PTR_ERR(attr) != -ENOENT) || !def_val) + return PTR_ERR(attr); + + *to = *def_val; + } else { + *to = attr->ptr_attr.data; + } + + if (*to > upper_bound) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL(_uverbs_get_const_unsigned); int uverbs_copy_to_struct_or_zero(const struct uverbs_attr_bundle *bundle, size_t idx, const void *from, size_t size) @@ -784,3 +822,16 @@ int uverbs_copy_to_struct_or_zero(const struct uverbs_attr_bundle *bundle, } return uverbs_copy_to(bundle, idx, from, size); } +EXPORT_SYMBOL(uverbs_copy_to_struct_or_zero); + +/* Once called an abort will call through to the type's destroy_hw() */ +void uverbs_finalize_uobj_create(const struct uverbs_attr_bundle *bundle, + u16 idx) +{ + struct bundle_priv *pbundle = + container_of(bundle, struct bundle_priv, bundle); + + __set_bit(uapi_bkey_attr(uapi_key_attr(idx)), + pbundle->uobj_hw_obj_valid); +} +EXPORT_SYMBOL(uverbs_finalize_uobj_create); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 2d4083bf4a04..d54434088727 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ 
b/drivers/infiniband/core/uverbs_main.c @@ -75,7 +75,7 @@ static dev_t dynamic_uverbs_dev; static struct class *uverbs_class; static DEFINE_IDA(uverbs_ida); -static void ib_uverbs_add_one(struct ib_device *device); +static int ib_uverbs_add_one(struct ib_device *device); static void ib_uverbs_remove_one(struct ib_device *device, void *client_data); /* @@ -108,8 +108,11 @@ int uverbs_dealloc_mw(struct ib_mw *mw) int ret; ret = mw->device->ops.dealloc_mw(mw); - if (!ret) - atomic_dec(&pd->usecnt); + if (ret) + return ret; + + atomic_dec(&pd->usecnt); + kfree(mw); return ret; } @@ -146,8 +149,7 @@ void ib_uverbs_release_ucq(struct ib_uverbs_completion_event_file *ev_file, void ib_uverbs_release_uevent(struct ib_uevent_object *uobj) { - struct ib_uverbs_async_event_file *async_file = - READ_ONCE(uobj->uobject.ufile->async_file); + struct ib_uverbs_async_event_file *async_file = uobj->event_file; struct ib_uverbs_event *evt, *tmp; if (!async_file) @@ -159,6 +161,7 @@ void ib_uverbs_release_uevent(struct ib_uevent_object *uobj) kfree(evt); } spin_unlock_irq(&async_file->ev_queue.lock); + uverbs_uobject_put(&async_file->uobj); } void ib_uverbs_detach_umcast(struct ib_qp *qp, @@ -194,11 +197,11 @@ void ib_uverbs_release_file(struct kref *ref) module_put(ib_dev->ops.owner); srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); - if (atomic_dec_and_test(&file->device->refcount)) + if (refcount_dec_and_test(&file->device->refcount)) ib_uverbs_comp_dev(file->device); - if (file->async_file) - uverbs_uobject_put(&file->async_file->uobj); + if (file->default_async_file) + uverbs_uobject_put(&file->default_async_file->uobj); put_device(&file->device->dev); if (file->disassociate_page) @@ -296,6 +299,8 @@ static __poll_t ib_uverbs_event_poll(struct ib_uverbs_event_queue *ev_queue, spin_lock_irq(&ev_queue->lock); if (!list_empty(&ev_queue->event_list)) pollflags = EPOLLIN | EPOLLRDNORM; + else if (ev_queue->is_closed) + pollflags = EPOLLERR; spin_unlock_irq(&ev_queue->lock); return pollflags; @@ -346,7 +351,7 @@ const struct file_operations uverbs_async_event_fops = { .owner = THIS_MODULE, .read = ib_uverbs_async_event_read, .poll = ib_uverbs_async_event_poll, - .release = uverbs_uobject_fd_release, + .release = uverbs_async_event_release, .fasync = ib_uverbs_async_event_fasync, .llseek = no_llseek, }; @@ -386,10 +391,9 @@ void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context) kill_fasync(&ev_queue->async_queue, SIGIO, POLL_IN); } -static void -ib_uverbs_async_handler(struct ib_uverbs_async_event_file *async_file, - __u64 element, __u64 event, struct list_head *obj_list, - u32 *counter) +void ib_uverbs_async_handler(struct ib_uverbs_async_event_file *async_file, + __u64 element, __u64 event, + struct list_head *obj_list, u32 *counter) { struct ib_uverbs_event *entry; unsigned long flags; @@ -426,7 +430,7 @@ ib_uverbs_async_handler(struct ib_uverbs_async_event_file *async_file, static void uverbs_uobj_event(struct ib_uevent_object *eobj, struct ib_event *event) { - ib_uverbs_async_handler(READ_ONCE(eobj->uobject.ufile->async_file), + ib_uverbs_async_handler(eobj->event_file, eobj->uobject.user_handle, event->event, &eobj->event_list, &eobj->events_reported); } @@ -483,10 +487,10 @@ void ib_uverbs_init_async_event_file( /* The first async_event_file becomes the default one for the file. 
*/ mutex_lock(&uverbs_file->ucontext_lock); - if (!uverbs_file->async_file) { + if (!uverbs_file->default_async_file) { /* Pairs with the put in ib_uverbs_release_file */ uverbs_uobject_get(&async_file->uobj); - smp_store_release(&uverbs_file->async_file, async_file); + smp_store_release(&uverbs_file->default_async_file, async_file); } mutex_unlock(&uverbs_file->ucontext_lock); @@ -600,6 +604,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, memset(bundle.attr_present, 0, sizeof(bundle.attr_present)); bundle.ufile = file; bundle.context = NULL; /* only valid if bundle has uobject */ + bundle.uobject = NULL; if (!method_elm->is_ex) { size_t in_len = hdr.in_words * 4 - sizeof(hdr); size_t out_len = hdr.out_words * 4; @@ -663,6 +668,9 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, } ret = method_elm->handler(&bundle); + if (bundle.uobject) + uverbs_finalize_object(bundle.uobject, UVERBS_ACCESS_NEW, true, + !ret, &bundle); out_unlock: srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); return (ret) ? : count; @@ -820,6 +828,10 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) ret = mmget_not_zero(mm); if (!ret) { list_del_init(&priv->list); + if (priv->entry) { + rdma_user_mmap_entry_put(priv->entry); + priv->entry = NULL; + } mm = NULL; continue; } @@ -830,14 +842,12 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) return; /* - * The umap_lock is nested under mmap_sem since it used within + * The umap_lock is nested under mmap_lock since it used within * the vma_ops callbacks, so we have to clean the list one mm * at a time to get the lock ordering right. Typically there * will only be one mm, so no big deal. */ - down_read(&mm->mmap_sem); - if (!mmget_still_valid(mm)) - goto skip_mm; + mmap_read_lock(mm); mutex_lock(&ufile->umap_lock); list_for_each_entry_safe (priv, next_priv, &ufile->umaps, list) { @@ -856,8 +866,7 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) } } mutex_unlock(&ufile->umap_lock); - skip_mm: - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); mmput(mm); } } @@ -882,7 +891,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) int srcu_key; dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev); - if (!atomic_inc_not_zero(&dev->refcount)) + if (!refcount_inc_not_zero(&dev->refcount)) return -ENXIO; get_device(&dev->dev); @@ -946,7 +955,7 @@ err_module: err: mutex_unlock(&dev->lists_mutex); srcu_read_unlock(&dev->disassociate_srcu, srcu_key); - if (atomic_dec_and_test(&dev->refcount)) + if (refcount_dec_and_test(&dev->refcount)) ib_uverbs_comp_dev(dev); put_device(&dev->dev); @@ -1037,7 +1046,7 @@ static ssize_t ibdev_show(struct device *device, struct device_attribute *attr, srcu_key = srcu_read_lock(&dev->disassociate_srcu); ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); if (ib_dev) - ret = sprintf(buf, "%s\n", dev_name(&ib_dev->dev)); + ret = sysfs_emit(buf, "%s\n", dev_name(&ib_dev->dev)); srcu_read_unlock(&dev->disassociate_srcu, srcu_key); return ret; @@ -1056,7 +1065,7 @@ static ssize_t abi_version_show(struct device *device, srcu_key = srcu_read_lock(&dev->disassociate_srcu); ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); if (ib_dev) - ret = sprintf(buf, "%u\n", ib_dev->ops.uverbs_abi_ver); + ret = sysfs_emit(buf, "%u\n", ib_dev->ops.uverbs_abi_ver); srcu_read_unlock(&dev->disassociate_srcu, srcu_key); return ret; @@ -1089,7 +1098,7 @@ static int ib_uverbs_create_uapi(struct 
ib_device *device, return 0; } -static void ib_uverbs_add_one(struct ib_device *device) +static int ib_uverbs_add_one(struct ib_device *device) { int devnum; dev_t base; @@ -1097,16 +1106,16 @@ static void ib_uverbs_add_one(struct ib_device *device) int ret; if (!device->ops.alloc_ucontext) - return; + return -EOPNOTSUPP; uverbs_dev = kzalloc(sizeof(*uverbs_dev), GFP_KERNEL); if (!uverbs_dev) - return; + return -ENOMEM; ret = init_srcu_struct(&uverbs_dev->disassociate_srcu); if (ret) { kfree(uverbs_dev); - return; + return -ENOMEM; } device_initialize(&uverbs_dev->dev); @@ -1115,7 +1124,7 @@ static void ib_uverbs_add_one(struct ib_device *device) uverbs_dev->dev.release = ib_uverbs_release_dev; uverbs_dev->groups[0] = &dev_attr_group; uverbs_dev->dev.groups = uverbs_dev->groups; - atomic_set(&uverbs_dev->refcount, 1); + refcount_set(&uverbs_dev->refcount, 1); init_completion(&uverbs_dev->comp); uverbs_dev->xrcd_tree = RB_ROOT; mutex_init(&uverbs_dev->xrcd_tree_mutex); @@ -1126,15 +1135,18 @@ static void ib_uverbs_add_one(struct ib_device *device) devnum = ida_alloc_max(&uverbs_ida, IB_UVERBS_MAX_DEVICES - 1, GFP_KERNEL); - if (devnum < 0) + if (devnum < 0) { + ret = -ENOMEM; goto err; + } uverbs_dev->devnum = devnum; if (devnum >= IB_UVERBS_NUM_FIXED_MINOR) base = dynamic_uverbs_dev + devnum - IB_UVERBS_NUM_FIXED_MINOR; else base = IB_UVERBS_BASE_DEV + devnum; - if (ib_uverbs_create_uapi(device, uverbs_dev)) + ret = ib_uverbs_create_uapi(device, uverbs_dev); + if (ret) goto err_uapi; uverbs_dev->dev.devt = base; @@ -1149,16 +1161,16 @@ static void ib_uverbs_add_one(struct ib_device *device) goto err_uapi; ib_set_client_data(device, &uverbs_client, uverbs_dev); - return; + return 0; err_uapi: ida_free(&uverbs_ida, devnum); err: - if (atomic_dec_and_test(&uverbs_dev->refcount)) + if (refcount_dec_and_test(&uverbs_dev->refcount)) ib_uverbs_comp_dev(uverbs_dev); wait_for_completion(&uverbs_dev->comp); put_device(&uverbs_dev->dev); - return; + return ret; } static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, @@ -1183,9 +1195,6 @@ static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, */ mutex_unlock(&uverbs_dev->lists_mutex); - ib_uverbs_async_handler(READ_ONCE(file->async_file), 0, - IB_EVENT_DEVICE_FATAL, NULL, NULL); - uverbs_destroy_ufile_hw(file, RDMA_REMOVE_DRIVER_REMOVE); kref_put(&file->ref, ib_uverbs_release_file); @@ -1201,9 +1210,6 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) struct ib_uverbs_device *uverbs_dev = client_data; int wait_clients = 1; - if (!uverbs_dev) - return; - cdev_device_del(&uverbs_dev->cdev, &uverbs_dev->dev); ida_free(&uverbs_ida, uverbs_dev->devnum); @@ -1223,7 +1229,7 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) wait_clients = 0; } - if (atomic_dec_and_test(&uverbs_dev->refcount)) + if (refcount_dec_and_test(&uverbs_dev->refcount)) ib_uverbs_comp_dev(uverbs_dev); if (wait_clients) wait_for_completion(&uverbs_dev->comp); diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c index b8d715c68ca4..11a080646916 100644 --- a/drivers/infiniband/core/uverbs_marshall.c +++ b/drivers/infiniband/core/uverbs_marshall.c @@ -66,7 +66,7 @@ void ib_copy_ah_attr_to_user(struct ib_device *device, struct rdma_ah_attr *src = ah_attr; struct rdma_ah_attr conv_ah; - memset(&dst->grh.reserved, 0, sizeof(dst->grh.reserved)); + memset(&dst->grh, 0, sizeof(dst->grh)); if ((ah_attr->type == RDMA_AH_ATTR_TYPE_OPA) && 
(rdma_ah_get_dlid(ah_attr) > be16_to_cpu(IB_LID_PERMISSIVE)) && diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index 3abfc63225cb..13776a66e2e4 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -75,96 +75,28 @@ static int uverbs_free_mw(struct ib_uobject *uobject, return uverbs_dealloc_mw((struct ib_mw *)uobject->object); } -static int uverbs_free_qp(struct ib_uobject *uobject, - enum rdma_remove_reason why, - struct uverbs_attr_bundle *attrs) -{ - struct ib_qp *qp = uobject->object; - struct ib_uqp_object *uqp = - container_of(uobject, struct ib_uqp_object, uevent.uobject); - int ret; - - /* - * If this is a user triggered destroy then do not allow destruction - * until the user cleans up all the mcast bindings. Unlike in other - * places we forcibly clean up the mcast attachments for !DESTROY - * because the mcast attaches are not ubojects and will not be - * destroyed by anything else during cleanup processing. - */ - if (why == RDMA_REMOVE_DESTROY) { - if (!list_empty(&uqp->mcast_list)) - return -EBUSY; - } else if (qp == qp->real_qp) { - ib_uverbs_detach_umcast(qp, uqp); - } - - ret = ib_destroy_qp_user(qp, &attrs->driver_udata); - if (ib_is_destroy_retryable(ret, why, uobject)) - return ret; - - if (uqp->uxrcd) - atomic_dec(&uqp->uxrcd->refcnt); - - ib_uverbs_release_uevent(&uqp->uevent); - return ret; -} - static int uverbs_free_rwq_ind_tbl(struct ib_uobject *uobject, enum rdma_remove_reason why, struct uverbs_attr_bundle *attrs) { struct ib_rwq_ind_table *rwq_ind_tbl = uobject->object; struct ib_wq **ind_tbl = rwq_ind_tbl->ind_tbl; - int ret; - - ret = ib_destroy_rwq_ind_table(rwq_ind_tbl); - if (ib_is_destroy_retryable(ret, why, uobject)) - return ret; + u32 table_size = (1 << rwq_ind_tbl->log_ind_tbl_size); + int ret, i; - kfree(ind_tbl); - return ret; -} + if (atomic_read(&rwq_ind_tbl->usecnt)) + return -EBUSY; -static int uverbs_free_wq(struct ib_uobject *uobject, - enum rdma_remove_reason why, - struct uverbs_attr_bundle *attrs) -{ - struct ib_wq *wq = uobject->object; - struct ib_uwq_object *uwq = - container_of(uobject, struct ib_uwq_object, uevent.uobject); - int ret; - - ret = ib_destroy_wq(wq, &attrs->driver_udata); - if (ib_is_destroy_retryable(ret, why, uobject)) - return ret; - - ib_uverbs_release_uevent(&uwq->uevent); - return ret; -} - -static int uverbs_free_srq(struct ib_uobject *uobject, - enum rdma_remove_reason why, - struct uverbs_attr_bundle *attrs) -{ - struct ib_srq *srq = uobject->object; - struct ib_uevent_object *uevent = - container_of(uobject, struct ib_uevent_object, uobject); - enum ib_srq_type srq_type = srq->srq_type; - int ret; - - ret = ib_destroy_srq_user(srq, &attrs->driver_udata); - if (ib_is_destroy_retryable(ret, why, uobject)) + ret = rwq_ind_tbl->device->ops.destroy_rwq_ind_table(rwq_ind_tbl); + if (ret) return ret; - if (srq_type == IB_SRQT_XRC) { - struct ib_usrq_object *us = - container_of(uevent, struct ib_usrq_object, uevent); - - atomic_dec(&us->uxrcd->refcnt); - } + for (i = 0; i < table_size; i++) + atomic_dec(&ind_tbl[i]->usecnt); - ib_uverbs_release_uevent(uevent); - return ret; + kfree(rwq_ind_tbl); + kfree(ind_tbl); + return 0; } static int uverbs_free_xrcd(struct ib_uobject *uobject, @@ -176,9 +108,8 @@ static int uverbs_free_xrcd(struct ib_uobject *uobject, container_of(uobject, struct ib_uxrcd_object, uobject); int ret; - ret = ib_destroy_usecnt(&uxrcd->refcnt, why, uobject); - if (ret) - return ret; + if 
(atomic_read(&uxrcd->refcnt)) + return -EBUSY; mutex_lock(&attrs->ufile->device->xrcd_tree_mutex); ret = ib_uverbs_dealloc_xrcd(uobject, xrcd, why, attrs); @@ -192,14 +123,11 @@ static int uverbs_free_pd(struct ib_uobject *uobject, struct uverbs_attr_bundle *attrs) { struct ib_pd *pd = uobject->object; - int ret; - ret = ib_destroy_usecnt(&pd->usecnt, why, uobject); - if (ret) - return ret; + if (atomic_read(&pd->usecnt)) + return -EBUSY; - ib_dealloc_pd_user(pd, &attrs->driver_udata); - return 0; + return ib_dealloc_pd_user(pd, &attrs->driver_udata); } void ib_uverbs_free_event_queue(struct ib_uverbs_event_queue *event_queue) @@ -226,7 +154,7 @@ void ib_uverbs_free_event_queue(struct ib_uverbs_event_queue *event_queue) spin_unlock_irq(&event_queue->lock); } -static int +static void uverbs_completion_event_file_destroy_uobj(struct ib_uobject *uobj, enum rdma_remove_reason why) { @@ -235,7 +163,6 @@ uverbs_completion_event_file_destroy_uobj(struct ib_uobject *uobj, uobj); ib_uverbs_free_event_queue(&file->ev_queue); - return 0; } int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs) @@ -252,10 +179,6 @@ DECLARE_UVERBS_NAMED_OBJECT( "[infinibandevent]", O_RDONLY)); -DECLARE_UVERBS_NAMED_OBJECT( - UVERBS_OBJECT_QP, - UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), uverbs_free_qp)); - DECLARE_UVERBS_NAMED_METHOD_DESTROY( UVERBS_METHOD_MW_DESTROY, UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_MW_HANDLE, @@ -267,11 +190,6 @@ DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_MW, UVERBS_TYPE_ALLOC_IDR(uverbs_free_mw), &UVERBS_METHOD(UVERBS_METHOD_MW_DESTROY)); -DECLARE_UVERBS_NAMED_OBJECT( - UVERBS_OBJECT_SRQ, - UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object), - uverbs_free_srq)); - DECLARE_UVERBS_NAMED_METHOD_DESTROY( UVERBS_METHOD_AH_DESTROY, UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_AH_HANDLE, @@ -296,10 +214,6 @@ DECLARE_UVERBS_NAMED_OBJECT( uverbs_free_flow), &UVERBS_METHOD(UVERBS_METHOD_FLOW_DESTROY)); -DECLARE_UVERBS_NAMED_OBJECT( - UVERBS_OBJECT_WQ, - UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), uverbs_free_wq)); - DECLARE_UVERBS_NAMED_METHOD_DESTROY( UVERBS_METHOD_RWQ_IND_TBL_DESTROY, UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_RWQ_IND_TBL_HANDLE, @@ -340,18 +254,12 @@ const struct uapi_definition uverbs_def_obj_intf[] = { UAPI_DEF_OBJ_NEEDS_FN(dealloc_pd)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_COMP_CHANNEL, UAPI_DEF_OBJ_NEEDS_FN(dealloc_pd)), - UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_QP, - UAPI_DEF_OBJ_NEEDS_FN(destroy_qp)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_AH, UAPI_DEF_OBJ_NEEDS_FN(destroy_ah)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MW, UAPI_DEF_OBJ_NEEDS_FN(dealloc_mw)), - UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_SRQ, - UAPI_DEF_OBJ_NEEDS_FN(destroy_srq)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_FLOW, UAPI_DEF_OBJ_NEEDS_FN(destroy_flow)), - UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_WQ, - UAPI_DEF_OBJ_NEEDS_FN(destroy_wq)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED( UVERBS_OBJECT_RWQ_IND_TBL, UAPI_DEF_OBJ_NEEDS_FN(destroy_rwq_ind_table)), diff --git a/drivers/infiniband/core/uverbs_std_types_async_fd.c b/drivers/infiniband/core/uverbs_std_types_async_fd.c index 82ec0806b34b..cc24cfdf7aee 100644 --- a/drivers/infiniband/core/uverbs_std_types_async_fd.c +++ b/drivers/infiniband/core/uverbs_std_types_async_fd.c @@ -19,15 +19,42 @@ static int UVERBS_HANDLER(UVERBS_METHOD_ASYNC_EVENT_ALLOC)( return 0; } -static int uverbs_async_event_destroy_uobj(struct ib_uobject *uobj, - enum rdma_remove_reason why) +static void uverbs_async_event_destroy_uobj(struct ib_uobject 
*uobj, + enum rdma_remove_reason why) { struct ib_uverbs_async_event_file *event_file = container_of(uobj, struct ib_uverbs_async_event_file, uobj); ib_unregister_event_handler(&event_file->event_handler); + + if (why == RDMA_REMOVE_DRIVER_REMOVE) + ib_uverbs_async_handler(event_file, 0, IB_EVENT_DEVICE_FATAL, + NULL, NULL); +} + +int uverbs_async_event_release(struct inode *inode, struct file *filp) +{ + struct ib_uverbs_async_event_file *event_file; + struct ib_uobject *uobj = filp->private_data; + int ret; + + if (!uobj) + return uverbs_uobject_fd_release(inode, filp); + + event_file = + container_of(uobj, struct ib_uverbs_async_event_file, uobj); + + /* + * The async event FD has to deliver IB_EVENT_DEVICE_FATAL even after + * disassociation, so cleaning the event list must only happen after + * release. The user knows it has reached the end of the event stream + * when it sees IB_EVENT_DEVICE_FATAL. + */ + uverbs_uobject_get(uobj); + ret = uverbs_uobject_fd_release(inode, filp); ib_uverbs_free_event_queue(&event_file->ev_queue); - return 0; + uverbs_uobject_put(uobj); + return ret; } DECLARE_UVERBS_NAMED_METHOD( diff --git a/drivers/infiniband/core/uverbs_std_types_counters.c b/drivers/infiniband/core/uverbs_std_types_counters.c index 9f013304e677..999da9c79866 100644 --- a/drivers/infiniband/core/uverbs_std_types_counters.c +++ b/drivers/infiniband/core/uverbs_std_types_counters.c @@ -42,11 +42,14 @@ static int uverbs_free_counters(struct ib_uobject *uobject, struct ib_counters *counters = uobject->object; int ret; - ret = ib_destroy_usecnt(&counters->usecnt, why, uobject); + if (atomic_read(&counters->usecnt)) + return -EBUSY; + + ret = counters->device->ops.destroy_counters(counters); if (ret) return ret; - - return counters->device->ops.destroy_counters(counters); + kfree(counters); + return 0; } static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_CREATE)( @@ -66,20 +69,19 @@ static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_CREATE)( if (!ib_dev->ops.create_counters) return -EOPNOTSUPP; - counters = ib_dev->ops.create_counters(ib_dev, attrs); - if (IS_ERR(counters)) { - ret = PTR_ERR(counters); - goto err_create_counters; - } + counters = rdma_zalloc_drv_obj(ib_dev, ib_counters); + if (!counters) + return -ENOMEM; counters->device = ib_dev; counters->uobject = uobj; uobj->object = counters; atomic_set(&counters->usecnt, 0); - return 0; + ret = ib_dev->ops.create_counters(counters, attrs); + if (ret) + kfree(counters); -err_create_counters: return ret; } diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c index da4110a0eea2..370ad7c83f88 100644 --- a/drivers/infiniband/core/uverbs_std_types_cq.c +++ b/drivers/infiniband/core/uverbs_std_types_cq.c @@ -33,6 +33,7 @@ #include <rdma/uverbs_std_types.h> #include "rdma_core.h" #include "uverbs.h" +#include "restrack.h" static int uverbs_free_cq(struct ib_uobject *uobject, enum rdma_remove_reason why, @@ -45,7 +46,7 @@ static int uverbs_free_cq(struct ib_uobject *uobject, int ret; ret = ib_destroy_cq_user(cq, &attrs->driver_udata); - if (ib_is_destroy_retryable(ret, why, uobject)) + if (ret) return ret; ib_uverbs_release_ucq( @@ -54,7 +55,7 @@ static int uverbs_free_cq(struct ib_uobject *uobject, ev_queue) : NULL, ucq); - return ret; + return 0; } static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( @@ -100,6 +101,9 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( uverbs_uobject_get(ev_file_uobj); } + obj->uevent.event_file = ib_uverbs_get_async_event( + attrs, 
UVERBS_ATTR_CREATE_CQ_EVENT_FD); + if (attr.comp_vector >= attrs->ufile->device->num_comp_vectors) { ret = -EINVAL; goto err_event_file; @@ -120,7 +124,9 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( cq->event_handler = ib_uverbs_cq_event_handler; cq->cq_context = ev_file ? &ev_file->ev_queue : NULL; atomic_set(&cq->usecnt, 0); - cq->res.type = RDMA_RESTRACK_CQ; + + rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); + rdma_restrack_set_name(&cq->res, NULL); ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata); if (ret) @@ -128,20 +134,19 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( obj->uevent.uobject.object = cq; obj->uevent.uobject.user_handle = user_handle; - rdma_restrack_uadd(&cq->res); + rdma_restrack_add(&cq->res); + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_CREATE_CQ_HANDLE); ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_CQ_RESP_CQE, &cq->cqe, sizeof(cq->cqe)); - if (ret) - goto err_cq; + return ret; - return 0; -err_cq: - ib_destroy_cq_user(cq, uverbs_get_cleared_udata(attrs)); - cq = NULL; err_free: + rdma_restrack_put(&cq->res); kfree(cq); err_event_file: + if (obj->uevent.event_file) + uverbs_uobject_put(&obj->uevent.event_file->uobj); if (ev_file) uverbs_uobject_put(ev_file_uobj); return ret; @@ -171,6 +176,10 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_CQ_RESP_CQE, UVERBS_ATTR_TYPE(u32), UA_MANDATORY), + UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_CQ_EVENT_FD, + UVERBS_OBJECT_ASYNC_EVENT, + UVERBS_ACCESS_READ, + UA_OPTIONAL), UVERBS_ATTR_UHW()); static int UVERBS_HANDLER(UVERBS_METHOD_CQ_DESTROY)( @@ -202,11 +211,8 @@ DECLARE_UVERBS_NAMED_METHOD( DECLARE_UVERBS_NAMED_OBJECT( UVERBS_OBJECT_CQ, UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), uverbs_free_cq), - -#if IS_ENABLED(CONFIG_INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI) &UVERBS_METHOD(UVERBS_METHOD_CQ_CREATE), &UVERBS_METHOD(UVERBS_METHOD_CQ_DESTROY) -#endif ); const struct uapi_definition uverbs_def_obj_cq[] = { diff --git a/drivers/infiniband/core/uverbs_std_types_device.c b/drivers/infiniband/core/uverbs_std_types_device.c index ae4a59d6f9b1..049684880ae0 100644 --- a/drivers/infiniband/core/uverbs_std_types_device.c +++ b/drivers/infiniband/core/uverbs_std_types_device.c @@ -3,11 +3,13 @@ * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. 
*/ +#include <linux/overflow.h> #include <rdma/uverbs_std_types.h> #include "rdma_core.h" #include "uverbs.h" #include <rdma/uverbs_ioctl.h> #include <rdma/opa_addr.h> +#include <rdma/ib_cache.h> /* * This ioctl method allows calling any defined write or write_ex @@ -38,7 +40,12 @@ static int UVERBS_HANDLER(UVERBS_METHOD_INVOKE_WRITE)( attrs->ucore.outlen < method_elm->resp_size) return -ENOSPC; - return method_elm->handler(attrs); + attrs->uobject = NULL; + rc = method_elm->handler(attrs); + if (attrs->uobject) + uverbs_finalize_object(attrs->uobject, UVERBS_ACCESS_NEW, true, + !rc, attrs); + return rc; } DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_INVOKE_WRITE, @@ -110,8 +117,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_INFO_HANDLES)( return ret; uapi_object = uapi_get_object(attrs->ufile->device->uapi, object_id); - if (!uapi_object) - return -EINVAL; + if (IS_ERR(uapi_object)) + return PTR_ERR(uapi_object); handles = gather_objects_handle(attrs->ufile, uapi_object, attrs, out_len, &total); @@ -160,7 +167,8 @@ void copy_port_attr_to_resp(struct ib_port_attr *attr, resp->subnet_timeout = attr->subnet_timeout; resp->init_type_reply = attr->init_type_reply; resp->active_width = attr->active_width; - resp->active_speed = attr->active_speed; + /* This ABI needs to be extended to provide any speed more than IB_SPEED_NDR */ + resp->active_speed = min_t(u16, attr->active_speed, IB_SPEED_NDR); resp->phys_state = attr->phys_state; resp->link_layer = rdma_port_get_link_layer(ib_dev, port_num); } @@ -229,6 +237,199 @@ static int UVERBS_HANDLER(UVERBS_METHOD_GET_CONTEXT)( return 0; } +static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_CONTEXT)( + struct uverbs_attr_bundle *attrs) +{ + u64 core_support = IB_UVERBS_CORE_SUPPORT_OPTIONAL_MR_ACCESS; + struct ib_ucontext *ucontext; + struct ib_device *ib_dev; + u32 num_comp; + int ret; + + ucontext = ib_uverbs_get_ucontext(attrs); + if (IS_ERR(ucontext)) + return PTR_ERR(ucontext); + ib_dev = ucontext->device; + + if (!ib_dev->ops.query_ucontext) + return -EOPNOTSUPP; + + num_comp = attrs->ufile->device->num_comp_vectors; + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_CONTEXT_NUM_COMP_VECTORS, + &num_comp, sizeof(num_comp)); + if (IS_UVERBS_COPY_ERR(ret)) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_CONTEXT_CORE_SUPPORT, + &core_support, sizeof(core_support)); + if (IS_UVERBS_COPY_ERR(ret)) + return ret; + + return ucontext->device->ops.query_ucontext(ucontext, attrs); +} + +static int copy_gid_entries_to_user(struct uverbs_attr_bundle *attrs, + struct ib_uverbs_gid_entry *entries, + size_t num_entries, size_t user_entry_size) +{ + const struct uverbs_attr *attr; + void __user *user_entries; + size_t copy_len; + int ret; + int i; + + if (user_entry_size == sizeof(*entries)) { + ret = uverbs_copy_to(attrs, + UVERBS_ATTR_QUERY_GID_TABLE_RESP_ENTRIES, + entries, sizeof(*entries) * num_entries); + return ret; + } + + copy_len = min_t(size_t, user_entry_size, sizeof(*entries)); + attr = uverbs_attr_get(attrs, UVERBS_ATTR_QUERY_GID_TABLE_RESP_ENTRIES); + if (IS_ERR(attr)) + return PTR_ERR(attr); + + user_entries = u64_to_user_ptr(attr->ptr_attr.data); + for (i = 0; i < num_entries; i++) { + if (copy_to_user(user_entries, entries, copy_len)) + return -EFAULT; + + if (user_entry_size > sizeof(*entries)) { + if (clear_user(user_entries + sizeof(*entries), + user_entry_size - sizeof(*entries))) + return -EFAULT; + } + + entries++; + user_entries += user_entry_size; + } + + return uverbs_output_written(attrs, + 
UVERBS_ATTR_QUERY_GID_TABLE_RESP_ENTRIES); +} + +static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_GID_TABLE)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_gid_entry *entries; + struct ib_ucontext *ucontext; + struct ib_device *ib_dev; + size_t user_entry_size; + ssize_t num_entries; + int max_entries; + u32 flags; + int ret; + + ret = uverbs_get_flags32(&flags, attrs, + UVERBS_ATTR_QUERY_GID_TABLE_FLAGS, 0); + if (ret) + return ret; + + ret = uverbs_get_const(&user_entry_size, attrs, + UVERBS_ATTR_QUERY_GID_TABLE_ENTRY_SIZE); + if (ret) + return ret; + + if (!user_entry_size) + return -EINVAL; + + max_entries = uverbs_attr_ptr_get_array_size( + attrs, UVERBS_ATTR_QUERY_GID_TABLE_RESP_ENTRIES, + user_entry_size); + if (max_entries <= 0) + return max_entries ?: -EINVAL; + + ucontext = ib_uverbs_get_ucontext(attrs); + if (IS_ERR(ucontext)) + return PTR_ERR(ucontext); + ib_dev = ucontext->device; + + entries = uverbs_kcalloc(attrs, max_entries, sizeof(*entries)); + if (IS_ERR(entries)) + return PTR_ERR(entries); + + num_entries = rdma_query_gid_table(ib_dev, entries, max_entries); + if (num_entries < 0) + return -EINVAL; + + ret = copy_gid_entries_to_user(attrs, entries, num_entries, + user_entry_size); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, + UVERBS_ATTR_QUERY_GID_TABLE_RESP_NUM_ENTRIES, + &num_entries, sizeof(num_entries)); + return ret; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_GID_ENTRY)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_gid_entry entry = {}; + const struct ib_gid_attr *gid_attr; + struct ib_ucontext *ucontext; + struct ib_device *ib_dev; + struct net_device *ndev; + u32 gid_index; + u32 port_num; + u32 flags; + int ret; + + ret = uverbs_get_flags32(&flags, attrs, + UVERBS_ATTR_QUERY_GID_ENTRY_FLAGS, 0); + if (ret) + return ret; + + ret = uverbs_get_const(&port_num, attrs, + UVERBS_ATTR_QUERY_GID_ENTRY_PORT); + if (ret) + return ret; + + ret = uverbs_get_const(&gid_index, attrs, + UVERBS_ATTR_QUERY_GID_ENTRY_GID_INDEX); + if (ret) + return ret; + + ucontext = ib_uverbs_get_ucontext(attrs); + if (IS_ERR(ucontext)) + return PTR_ERR(ucontext); + ib_dev = ucontext->device; + + if (!rdma_is_port_valid(ib_dev, port_num)) + return -EINVAL; + + gid_attr = rdma_get_gid_attr(ib_dev, port_num, gid_index); + if (IS_ERR(gid_attr)) + return PTR_ERR(gid_attr); + + memcpy(&entry.gid, &gid_attr->gid, sizeof(gid_attr->gid)); + entry.gid_index = gid_attr->index; + entry.port_num = gid_attr->port_num; + entry.gid_type = gid_attr->gid_type; + + rcu_read_lock(); + ndev = rdma_read_gid_attr_ndev_rcu(gid_attr); + if (IS_ERR(ndev)) { + if (PTR_ERR(ndev) != -ENODEV) { + ret = PTR_ERR(ndev); + rcu_read_unlock(); + goto out; + } + } else { + entry.netdev_ifindex = ndev->ifindex; + } + rcu_read_unlock(); + + ret = uverbs_copy_to_struct_or_zero( + attrs, UVERBS_ATTR_QUERY_GID_ENTRY_RESP_ENTRY, &entry, + sizeof(entry)); +out: + rdma_put_gid_attr(gid_attr); + return ret; +} + DECLARE_UVERBS_NAMED_METHOD( UVERBS_METHOD_GET_CONTEXT, UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_GET_CONTEXT_NUM_COMP_VECTORS, @@ -238,6 +439,13 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ATTR_UHW()); DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QUERY_CONTEXT, + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_CONTEXT_NUM_COMP_VECTORS, + UVERBS_ATTR_TYPE(u32), UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_CONTEXT_CORE_SUPPORT, + UVERBS_ATTR_TYPE(u64), UA_OPTIONAL)); + +DECLARE_UVERBS_NAMED_METHOD( UVERBS_METHOD_INFO_HANDLES, /* Also includes any device specific object ids */ 
UVERBS_ATTR_CONST_IN(UVERBS_ATTR_INFO_OBJECT_ID, @@ -256,11 +464,38 @@ DECLARE_UVERBS_NAMED_METHOD( reserved), UA_MANDATORY)); +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QUERY_GID_TABLE, + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_GID_TABLE_ENTRY_SIZE, u64, + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_QUERY_GID_TABLE_FLAGS, u32, + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_GID_TABLE_RESP_ENTRIES, + UVERBS_ATTR_MIN_SIZE(0), UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_GID_TABLE_RESP_NUM_ENTRIES, + UVERBS_ATTR_TYPE(u64), UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QUERY_GID_ENTRY, + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_GID_ENTRY_PORT, u32, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_GID_ENTRY_GID_INDEX, u32, + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_QUERY_GID_ENTRY_FLAGS, u32, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_GID_ENTRY_RESP_ENTRY, + UVERBS_ATTR_STRUCT(struct ib_uverbs_gid_entry, + netdev_ifindex), + UA_MANDATORY)); + DECLARE_UVERBS_GLOBAL_METHODS(UVERBS_OBJECT_DEVICE, &UVERBS_METHOD(UVERBS_METHOD_GET_CONTEXT), &UVERBS_METHOD(UVERBS_METHOD_INVOKE_WRITE), &UVERBS_METHOD(UVERBS_METHOD_INFO_HANDLES), - &UVERBS_METHOD(UVERBS_METHOD_QUERY_PORT)); + &UVERBS_METHOD(UVERBS_METHOD_QUERY_PORT), + &UVERBS_METHOD(UVERBS_METHOD_QUERY_CONTEXT), + &UVERBS_METHOD(UVERBS_METHOD_QUERY_GID_TABLE), + &UVERBS_METHOD(UVERBS_METHOD_QUERY_GID_ENTRY)); const struct uapi_definition uverbs_def_obj_device[] = { UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DEVICE), diff --git a/drivers/infiniband/core/uverbs_std_types_dm.c b/drivers/infiniband/core/uverbs_std_types_dm.c index d5a1de33c2c9..98c522cf86d6 100644 --- a/drivers/infiniband/core/uverbs_std_types_dm.c +++ b/drivers/infiniband/core/uverbs_std_types_dm.c @@ -39,11 +39,9 @@ static int uverbs_free_dm(struct ib_uobject *uobject, struct uverbs_attr_bundle *attrs) { struct ib_dm *dm = uobject->object; - int ret; - ret = ib_destroy_usecnt(&dm->usecnt, why, uobject); - if (ret) - return ret; + if (atomic_read(&dm->usecnt)) + return -EBUSY; return dm->device->ops.dealloc_dm(dm, attrs); } diff --git a/drivers/infiniband/core/uverbs_std_types_flow_action.c b/drivers/infiniband/core/uverbs_std_types_flow_action.c index 459cf165b231..0ddcf6da66c4 100644 --- a/drivers/infiniband/core/uverbs_std_types_flow_action.c +++ b/drivers/infiniband/core/uverbs_std_types_flow_action.c @@ -39,394 +39,13 @@ static int uverbs_free_flow_action(struct ib_uobject *uobject, struct uverbs_attr_bundle *attrs) { struct ib_flow_action *action = uobject->object; - int ret; - ret = ib_destroy_usecnt(&action->usecnt, why, uobject); - if (ret) - return ret; + if (atomic_read(&action->usecnt)) + return -EBUSY; return action->device->ops.destroy_flow_action(action); } -static u64 esp_flags_uverbs_to_verbs(struct uverbs_attr_bundle *attrs, - u32 flags, bool is_modify) -{ - u64 verbs_flags = flags; - - if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_ESN)) - verbs_flags |= IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED; - - if (is_modify && uverbs_attr_is_valid(attrs, - UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS)) - verbs_flags |= IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS; - - return verbs_flags; -}; - -static int validate_flow_action_esp_keymat_aes_gcm(struct ib_flow_action_attrs_esp_keymats *keymat) -{ - struct ib_uverbs_flow_action_esp_keymat_aes_gcm *aes_gcm = - &keymat->keymat.aes_gcm; - - if (aes_gcm->iv_algo > IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ) - return -EOPNOTSUPP; - - if (aes_gcm->key_len != 32 && - 
aes_gcm->key_len != 24 && - aes_gcm->key_len != 16) - return -EINVAL; - - if (aes_gcm->icv_len != 16 && - aes_gcm->icv_len != 8 && - aes_gcm->icv_len != 12) - return -EINVAL; - - return 0; -} - -static int (* const flow_action_esp_keymat_validate[])(struct ib_flow_action_attrs_esp_keymats *keymat) = { - [IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM] = validate_flow_action_esp_keymat_aes_gcm, -}; - -static int flow_action_esp_replay_none(struct ib_flow_action_attrs_esp_replays *replay, - bool is_modify) -{ - /* This is used in order to modify an esp flow action with an enabled - * replay protection to a disabled one. This is only supported via - * modify, as in create verb we can simply drop the REPLAY attribute and - * achieve the same thing. - */ - return is_modify ? 0 : -EINVAL; -} - -static int flow_action_esp_replay_def_ok(struct ib_flow_action_attrs_esp_replays *replay, - bool is_modify) -{ - /* Some replay protections could always be enabled without validating - * anything. - */ - return 0; -} - -static int (* const flow_action_esp_replay_validate[])(struct ib_flow_action_attrs_esp_replays *replay, - bool is_modify) = { - [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_NONE] = flow_action_esp_replay_none, - [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_BMP] = flow_action_esp_replay_def_ok, -}; - -static int parse_esp_ip(enum ib_flow_spec_type proto, - const void __user *val_ptr, - size_t len, union ib_flow_spec *out) -{ - int ret; - const struct ib_uverbs_flow_ipv4_filter ipv4 = { - .src_ip = cpu_to_be32(0xffffffffUL), - .dst_ip = cpu_to_be32(0xffffffffUL), - .proto = 0xff, - .tos = 0xff, - .ttl = 0xff, - .flags = 0xff, - }; - const struct ib_uverbs_flow_ipv6_filter ipv6 = { - .src_ip = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, - .dst_ip = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, - .flow_label = cpu_to_be32(0xffffffffUL), - .next_hdr = 0xff, - .traffic_class = 0xff, - .hop_limit = 0xff, - }; - union { - struct ib_uverbs_flow_ipv4_filter ipv4; - struct ib_uverbs_flow_ipv6_filter ipv6; - } user_val = {}; - const void *user_pmask; - size_t val_len; - - /* If the flow IPv4/IPv6 flow specifications are extended, the mask - * should be changed as well. 
- */ - BUILD_BUG_ON(offsetof(struct ib_uverbs_flow_ipv4_filter, flags) + - sizeof(ipv4.flags) != sizeof(ipv4)); - BUILD_BUG_ON(offsetof(struct ib_uverbs_flow_ipv6_filter, reserved) + - sizeof(ipv6.reserved) != sizeof(ipv6)); - - switch (proto) { - case IB_FLOW_SPEC_IPV4: - if (len > sizeof(user_val.ipv4) && - !ib_is_buffer_cleared(val_ptr + sizeof(user_val.ipv4), - len - sizeof(user_val.ipv4))) - return -EOPNOTSUPP; - - val_len = min_t(size_t, len, sizeof(user_val.ipv4)); - ret = copy_from_user(&user_val.ipv4, val_ptr, - val_len); - if (ret) - return -EFAULT; - - user_pmask = &ipv4; - break; - case IB_FLOW_SPEC_IPV6: - if (len > sizeof(user_val.ipv6) && - !ib_is_buffer_cleared(val_ptr + sizeof(user_val.ipv6), - len - sizeof(user_val.ipv6))) - return -EOPNOTSUPP; - - val_len = min_t(size_t, len, sizeof(user_val.ipv6)); - ret = copy_from_user(&user_val.ipv6, val_ptr, - val_len); - if (ret) - return -EFAULT; - - user_pmask = &ipv6; - break; - default: - return -EOPNOTSUPP; - } - - return ib_uverbs_kern_spec_to_ib_spec_filter(proto, user_pmask, - &user_val, - val_len, out); -} - -static int flow_action_esp_get_encap(struct ib_flow_spec_list *out, - struct uverbs_attr_bundle *attrs) -{ - struct ib_uverbs_flow_action_esp_encap uverbs_encap; - int ret; - - ret = uverbs_copy_from(&uverbs_encap, attrs, - UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP); - if (ret) - return ret; - - /* We currently support only one encap */ - if (uverbs_encap.next_ptr) - return -EOPNOTSUPP; - - if (uverbs_encap.type != IB_FLOW_SPEC_IPV4 && - uverbs_encap.type != IB_FLOW_SPEC_IPV6) - return -EOPNOTSUPP; - - return parse_esp_ip(uverbs_encap.type, - u64_to_user_ptr(uverbs_encap.val_ptr), - uverbs_encap.len, - &out->spec); -} - -struct ib_flow_action_esp_attr { - struct ib_flow_action_attrs_esp hdr; - struct ib_flow_action_attrs_esp_keymats keymat; - struct ib_flow_action_attrs_esp_replays replay; - /* We currently support only one spec */ - struct ib_flow_spec_list encap; -}; - -#define ESP_LAST_SUPPORTED_FLAG IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW -static int parse_flow_action_esp(struct ib_device *ib_dev, - struct uverbs_attr_bundle *attrs, - struct ib_flow_action_esp_attr *esp_attr, - bool is_modify) -{ - struct ib_uverbs_flow_action_esp uverbs_esp = {}; - int ret; - - /* Optional param, if it doesn't exist, we get -ENOENT and skip it */ - ret = uverbs_copy_from(&esp_attr->hdr.esn, attrs, - UVERBS_ATTR_FLOW_ACTION_ESP_ESN); - if (IS_UVERBS_COPY_ERR(ret)) - return ret; - - /* This can be called from FLOW_ACTION_ESP_MODIFY where - * UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS is optional - */ - if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS)) { - ret = uverbs_copy_from_or_zero(&uverbs_esp, attrs, - UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS); - if (ret) - return ret; - - if (uverbs_esp.flags & ~((ESP_LAST_SUPPORTED_FLAG << 1) - 1)) - return -EOPNOTSUPP; - - esp_attr->hdr.spi = uverbs_esp.spi; - esp_attr->hdr.seq = uverbs_esp.seq; - esp_attr->hdr.tfc_pad = uverbs_esp.tfc_pad; - esp_attr->hdr.hard_limit_pkts = uverbs_esp.hard_limit_pkts; - } - esp_attr->hdr.flags = esp_flags_uverbs_to_verbs(attrs, uverbs_esp.flags, - is_modify); - - if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT)) { - esp_attr->keymat.protocol = - uverbs_attr_get_enum_id(attrs, - UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT); - ret = uverbs_copy_from_or_zero(&esp_attr->keymat.keymat, - attrs, - UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT); - if (ret) - return ret; - - ret = flow_action_esp_keymat_validate[esp_attr->keymat.protocol](&esp_attr->keymat); - if 
(ret) - return ret; - - esp_attr->hdr.keymat = &esp_attr->keymat; - } - - if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY)) { - esp_attr->replay.protocol = - uverbs_attr_get_enum_id(attrs, - UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY); - - ret = uverbs_copy_from_or_zero(&esp_attr->replay.replay, - attrs, - UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY); - if (ret) - return ret; - - ret = flow_action_esp_replay_validate[esp_attr->replay.protocol](&esp_attr->replay, - is_modify); - if (ret) - return ret; - - esp_attr->hdr.replay = &esp_attr->replay; - } - - if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP)) { - ret = flow_action_esp_get_encap(&esp_attr->encap, attrs); - if (ret) - return ret; - - esp_attr->hdr.encap = &esp_attr->encap; - } - - return 0; -} - -static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)( - struct uverbs_attr_bundle *attrs) -{ - struct ib_uobject *uobj = uverbs_attr_get_uobject( - attrs, UVERBS_ATTR_CREATE_FLOW_ACTION_ESP_HANDLE); - struct ib_device *ib_dev = attrs->context->device; - int ret; - struct ib_flow_action *action; - struct ib_flow_action_esp_attr esp_attr = {}; - - if (!ib_dev->ops.create_flow_action_esp) - return -EOPNOTSUPP; - - ret = parse_flow_action_esp(ib_dev, attrs, &esp_attr, false); - if (ret) - return ret; - - /* No need to check as this attribute is marked as MANDATORY */ - action = ib_dev->ops.create_flow_action_esp(ib_dev, &esp_attr.hdr, - attrs); - if (IS_ERR(action)) - return PTR_ERR(action); - - uverbs_flow_action_fill_action(action, uobj, ib_dev, - IB_FLOW_ACTION_ESP); - - return 0; -} - -static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY)( - struct uverbs_attr_bundle *attrs) -{ - struct ib_uobject *uobj = uverbs_attr_get_uobject( - attrs, UVERBS_ATTR_MODIFY_FLOW_ACTION_ESP_HANDLE); - struct ib_flow_action *action = uobj->object; - int ret; - struct ib_flow_action_esp_attr esp_attr = {}; - - if (!action->device->ops.modify_flow_action_esp) - return -EOPNOTSUPP; - - ret = parse_flow_action_esp(action->device, attrs, &esp_attr, true); - if (ret) - return ret; - - if (action->type != IB_FLOW_ACTION_ESP) - return -EINVAL; - - return action->device->ops.modify_flow_action_esp(action, - &esp_attr.hdr, - attrs); -} - -static const struct uverbs_attr_spec uverbs_flow_action_esp_keymat[] = { - [IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM] = { - .type = UVERBS_ATTR_TYPE_PTR_IN, - UVERBS_ATTR_STRUCT( - struct ib_uverbs_flow_action_esp_keymat_aes_gcm, - aes_key), - }, -}; - -static const struct uverbs_attr_spec uverbs_flow_action_esp_replay[] = { - [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_NONE] = { - .type = UVERBS_ATTR_TYPE_PTR_IN, - UVERBS_ATTR_NO_DATA(), - }, - [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_BMP] = { - .type = UVERBS_ATTR_TYPE_PTR_IN, - UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp_replay_bmp, - size), - }, -}; - -DECLARE_UVERBS_NAMED_METHOD( - UVERBS_METHOD_FLOW_ACTION_ESP_CREATE, - UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_FLOW_ACTION_ESP_HANDLE, - UVERBS_OBJECT_FLOW_ACTION, - UVERBS_ACCESS_NEW, - UA_MANDATORY), - UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS, - UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp, - hard_limit_pkts), - UA_MANDATORY), - UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ESN, - UVERBS_ATTR_TYPE(__u32), - UA_OPTIONAL), - UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT, - uverbs_flow_action_esp_keymat, - UA_MANDATORY), - UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY, - uverbs_flow_action_esp_replay, - UA_OPTIONAL), - UVERBS_ATTR_PTR_IN( - 
UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP, - UVERBS_ATTR_TYPE(struct ib_uverbs_flow_action_esp_encap), - UA_OPTIONAL)); - -DECLARE_UVERBS_NAMED_METHOD( - UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY, - UVERBS_ATTR_IDR(UVERBS_ATTR_MODIFY_FLOW_ACTION_ESP_HANDLE, - UVERBS_OBJECT_FLOW_ACTION, - UVERBS_ACCESS_WRITE, - UA_MANDATORY), - UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS, - UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp, - hard_limit_pkts), - UA_OPTIONAL), - UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ESN, - UVERBS_ATTR_TYPE(__u32), - UA_OPTIONAL), - UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT, - uverbs_flow_action_esp_keymat, - UA_OPTIONAL), - UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY, - uverbs_flow_action_esp_replay, - UA_OPTIONAL), - UVERBS_ATTR_PTR_IN( - UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP, - UVERBS_ATTR_TYPE(struct ib_uverbs_flow_action_esp_encap), - UA_OPTIONAL)); - DECLARE_UVERBS_NAMED_METHOD_DESTROY( UVERBS_METHOD_FLOW_ACTION_DESTROY, UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_FLOW_ACTION_HANDLE, @@ -437,9 +56,7 @@ DECLARE_UVERBS_NAMED_METHOD_DESTROY( DECLARE_UVERBS_NAMED_OBJECT( UVERBS_OBJECT_FLOW_ACTION, UVERBS_TYPE_ALLOC_IDR(uverbs_free_flow_action), - &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE), - &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_DESTROY), - &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY)); + &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_DESTROY)); const struct uapi_definition uverbs_def_obj_flow_action[] = { UAPI_DEF_CHAIN_OBJ_TREE_NAMED( diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c index c1286a52dc84..03e1db5d1e8c 100644 --- a/drivers/infiniband/core/uverbs_std_types_mr.c +++ b/drivers/infiniband/core/uverbs_std_types_mr.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + * Copyright (c) 2020, Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -33,6 +34,7 @@ #include "rdma_core.h" #include "uverbs.h" #include <rdma/uverbs_std_types.h> +#include "restrack.h" static int uverbs_free_mr(struct ib_uobject *uobject, enum rdma_remove_reason why, @@ -69,7 +71,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_ADVISE_MR)( num_sge = uverbs_attr_ptr_get_array_size( attrs, UVERBS_ATTR_ADVISE_MR_SGE_LIST, sizeof(struct ib_sge)); - if (num_sge < 0) + if (num_sge <= 0) return num_sge; sg_list = uverbs_attr_get_alloced_ptr(attrs, @@ -114,7 +116,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)( if (!(attr.access_flags & IB_ZERO_BASED)) return -EINVAL; - ret = ib_check_mr_access(attr.access_flags); + ret = ib_check_mr_access(ib_dev, attr.access_flags); if (ret) return ret; @@ -134,23 +136,133 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)( atomic_inc(&pd->usecnt); atomic_inc(&dm->usecnt); + rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); + rdma_restrack_set_name(&mr->res, NULL); + rdma_restrack_add(&mr->res); uobj->object = mr; + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_REG_DM_MR_HANDLE); + ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_DM_MR_RESP_LKEY, &mr->lkey, sizeof(mr->lkey)); if (ret) - goto err_dereg; + return ret; ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_DM_MR_RESP_RKEY, &mr->rkey, sizeof(mr->rkey)); + return ret; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_MR)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_mr *mr = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_QUERY_MR_HANDLE); + int ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_LKEY, &mr->lkey, + sizeof(mr->lkey)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_RKEY, + &mr->rkey, sizeof(mr->rkey)); + + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_LENGTH, + &mr->length, sizeof(mr->length)); + + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_IOVA, + &mr->iova, sizeof(mr->iova)); + + return IS_UVERBS_COPY_ERR(ret) ? 
ret : 0; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_REG_DMABUF_MR)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_REG_DMABUF_MR_HANDLE); + struct ib_pd *pd = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_REG_DMABUF_MR_PD_HANDLE); + struct ib_device *ib_dev = pd->device; + + u64 offset, length, iova; + u32 fd, access_flags; + struct ib_mr *mr; + int ret; + + if (!ib_dev->ops.reg_user_mr_dmabuf) + return -EOPNOTSUPP; + + ret = uverbs_copy_from(&offset, attrs, + UVERBS_ATTR_REG_DMABUF_MR_OFFSET); if (ret) - goto err_dereg; + return ret; + + ret = uverbs_copy_from(&length, attrs, + UVERBS_ATTR_REG_DMABUF_MR_LENGTH); + if (ret) + return ret; + + ret = uverbs_copy_from(&iova, attrs, + UVERBS_ATTR_REG_DMABUF_MR_IOVA); + if (ret) + return ret; - return 0; + if ((offset & ~PAGE_MASK) != (iova & ~PAGE_MASK)) + return -EINVAL; -err_dereg: - ib_dereg_mr_user(mr, uverbs_get_cleared_udata(attrs)); + ret = uverbs_copy_from(&fd, attrs, + UVERBS_ATTR_REG_DMABUF_MR_FD); + if (ret) + return ret; + ret = uverbs_get_flags32(&access_flags, attrs, + UVERBS_ATTR_REG_DMABUF_MR_ACCESS_FLAGS, + IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_ATOMIC | + IB_ACCESS_RELAXED_ORDERING); + if (ret) + return ret; + + ret = ib_check_mr_access(ib_dev, access_flags); + if (ret) + return ret; + + mr = pd->device->ops.reg_user_mr_dmabuf(pd, offset, length, iova, fd, + access_flags, + &attrs->driver_udata); + if (IS_ERR(mr)) + return PTR_ERR(mr); + + mr->device = pd->device; + mr->pd = pd; + mr->type = IB_MR_TYPE_USER; + mr->uobject = uobj; + atomic_inc(&pd->usecnt); + + rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); + rdma_restrack_set_name(&mr->res, NULL); + rdma_restrack_add(&mr->res); + uobj->object = mr; + + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_REG_DMABUF_MR_HANDLE); + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_DMABUF_MR_RESP_LKEY, + &mr->lkey, sizeof(mr->lkey)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_DMABUF_MR_RESP_RKEY, + &mr->rkey, sizeof(mr->rkey)); return ret; } @@ -172,6 +284,25 @@ DECLARE_UVERBS_NAMED_METHOD( UA_ALLOC_AND_COPY)); DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QUERY_MR, + UVERBS_ATTR_IDR(UVERBS_ATTR_QUERY_MR_HANDLE, + UVERBS_OBJECT_MR, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_RKEY, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_LKEY, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_LENGTH, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_IOVA, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL)); + +DECLARE_UVERBS_NAMED_METHOD( UVERBS_METHOD_DM_MR_REG, UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_HANDLE, UVERBS_OBJECT_MR, @@ -200,6 +331,37 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ATTR_TYPE(u32), UA_MANDATORY)); +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_REG_DMABUF_MR, + UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DMABUF_MR_HANDLE, + UVERBS_OBJECT_MR, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DMABUF_MR_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DMABUF_MR_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DMABUF_MR_LENGTH, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DMABUF_MR_IOVA, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + 
UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DMABUF_MR_FD, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_REG_DMABUF_MR_ACCESS_FLAGS, + enum ib_access_flags), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_DMABUF_MR_RESP_LKEY, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_DMABUF_MR_RESP_RKEY, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + DECLARE_UVERBS_NAMED_METHOD_DESTROY( UVERBS_METHOD_MR_DESTROY, UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_MR_HANDLE, @@ -210,9 +372,11 @@ DECLARE_UVERBS_NAMED_METHOD_DESTROY( DECLARE_UVERBS_NAMED_OBJECT( UVERBS_OBJECT_MR, UVERBS_TYPE_ALLOC_IDR(uverbs_free_mr), + &UVERBS_METHOD(UVERBS_METHOD_ADVISE_MR), &UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG), &UVERBS_METHOD(UVERBS_METHOD_MR_DESTROY), - &UVERBS_METHOD(UVERBS_METHOD_ADVISE_MR)); + &UVERBS_METHOD(UVERBS_METHOD_QUERY_MR), + &UVERBS_METHOD(UVERBS_METHOD_REG_DMABUF_MR)); const struct uapi_definition uverbs_def_obj_mr[] = { UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MR, diff --git a/drivers/infiniband/core/uverbs_std_types_qp.c b/drivers/infiniband/core/uverbs_std_types_qp.c new file mode 100644 index 000000000000..dd1075466f61 --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types_qp.c @@ -0,0 +1,380 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. + */ + +#include <rdma/uverbs_std_types.h> +#include "rdma_core.h" +#include "uverbs.h" +#include "core_priv.h" + +static int uverbs_free_qp(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct ib_qp *qp = uobject->object; + struct ib_uqp_object *uqp = + container_of(uobject, struct ib_uqp_object, uevent.uobject); + int ret; + + /* + * If this is a user triggered destroy then do not allow destruction + * until the user cleans up all the mcast bindings. Unlike in other + * places we forcibly clean up the mcast attachments for !DESTROY + * because the mcast attaches are not ubojects and will not be + * destroyed by anything else during cleanup processing. 
+ */ + if (why == RDMA_REMOVE_DESTROY) { + if (!list_empty(&uqp->mcast_list)) + return -EBUSY; + } else if (qp == qp->real_qp) { + ib_uverbs_detach_umcast(qp, uqp); + } + + ret = ib_destroy_qp_user(qp, &attrs->driver_udata); + if (ret) + return ret; + + if (uqp->uxrcd) + atomic_dec(&uqp->uxrcd->refcnt); + + ib_uverbs_release_uevent(&uqp->uevent); + return 0; +} + +static int check_creation_flags(enum ib_qp_type qp_type, + u32 create_flags) +{ + create_flags &= ~IB_UVERBS_QP_CREATE_SQ_SIG_ALL; + + if (!create_flags || qp_type == IB_QPT_DRIVER) + return 0; + + if (qp_type != IB_QPT_RAW_PACKET && qp_type != IB_QPT_UD) + return -EINVAL; + + if ((create_flags & IB_UVERBS_QP_CREATE_SCATTER_FCS || + create_flags & IB_UVERBS_QP_CREATE_CVLAN_STRIPPING) && + qp_type != IB_QPT_RAW_PACKET) + return -EINVAL; + + return 0; +} + +static void set_caps(struct ib_qp_init_attr *attr, + struct ib_uverbs_qp_cap *cap, bool req) +{ + if (req) { + attr->cap.max_send_wr = cap->max_send_wr; + attr->cap.max_recv_wr = cap->max_recv_wr; + attr->cap.max_send_sge = cap->max_send_sge; + attr->cap.max_recv_sge = cap->max_recv_sge; + attr->cap.max_inline_data = cap->max_inline_data; + } else { + cap->max_send_wr = attr->cap.max_send_wr; + cap->max_recv_wr = attr->cap.max_recv_wr; + cap->max_send_sge = attr->cap.max_send_sge; + cap->max_recv_sge = attr->cap.max_recv_sge; + cap->max_inline_data = attr->cap.max_inline_data; + } +} + +static int UVERBS_HANDLER(UVERBS_METHOD_QP_CREATE)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uqp_object *obj = container_of( + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_QP_HANDLE), + typeof(*obj), uevent.uobject); + struct ib_qp_init_attr attr = {}; + struct ib_uverbs_qp_cap cap = {}; + struct ib_rwq_ind_table *rwq_ind_tbl = NULL; + struct ib_qp *qp; + struct ib_pd *pd = NULL; + struct ib_srq *srq = NULL; + struct ib_cq *recv_cq = NULL; + struct ib_cq *send_cq = NULL; + struct ib_xrcd *xrcd = NULL; + struct ib_uobject *xrcd_uobj = NULL; + struct ib_device *device; + u64 user_handle; + int ret; + + ret = uverbs_copy_from_or_zero(&cap, attrs, + UVERBS_ATTR_CREATE_QP_CAP); + if (!ret) + ret = uverbs_copy_from(&user_handle, attrs, + UVERBS_ATTR_CREATE_QP_USER_HANDLE); + if (!ret) + ret = uverbs_get_const(&attr.qp_type, attrs, + UVERBS_ATTR_CREATE_QP_TYPE); + if (ret) + return ret; + + switch (attr.qp_type) { + case IB_QPT_XRC_TGT: + if (uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE) || + uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE) || + uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_PD_HANDLE) || + uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_IND_TABLE_HANDLE)) + return -EINVAL; + + xrcd_uobj = uverbs_attr_get_uobject(attrs, + UVERBS_ATTR_CREATE_QP_XRCD_HANDLE); + if (IS_ERR(xrcd_uobj)) + return PTR_ERR(xrcd_uobj); + + xrcd = (struct ib_xrcd *)xrcd_uobj->object; + if (!xrcd) + return -EINVAL; + device = xrcd->device; + break; + case IB_UVERBS_QPT_RAW_PACKET: + if (!capable(CAP_NET_RAW)) + return -EPERM; + fallthrough; + case IB_UVERBS_QPT_RC: + case IB_UVERBS_QPT_UC: + case IB_UVERBS_QPT_UD: + case IB_UVERBS_QPT_XRC_INI: + case IB_UVERBS_QPT_DRIVER: + if (uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_XRCD_HANDLE) || + (uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_SRQ_HANDLE) && + attr.qp_type == IB_QPT_XRC_INI)) + return -EINVAL; + + pd = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_QP_PD_HANDLE); + if (IS_ERR(pd)) + return PTR_ERR(pd); + + rwq_ind_tbl = uverbs_attr_get_obj(attrs, + 
UVERBS_ATTR_CREATE_QP_IND_TABLE_HANDLE); + if (!IS_ERR(rwq_ind_tbl)) { + if (cap.max_recv_wr || cap.max_recv_sge || + uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE) || + uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_SRQ_HANDLE)) + return -EINVAL; + + /* send_cq is optinal */ + if (cap.max_send_wr) { + send_cq = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE); + if (IS_ERR(send_cq)) + return PTR_ERR(send_cq); + } + attr.rwq_ind_tbl = rwq_ind_tbl; + } else { + send_cq = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE); + if (IS_ERR(send_cq)) + return PTR_ERR(send_cq); + + if (attr.qp_type != IB_QPT_XRC_INI) { + recv_cq = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE); + if (IS_ERR(recv_cq)) + return PTR_ERR(recv_cq); + } + } + + device = pd->device; + break; + default: + return -EINVAL; + } + + ret = uverbs_get_flags32(&attr.create_flags, attrs, + UVERBS_ATTR_CREATE_QP_FLAGS, + IB_UVERBS_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | + IB_UVERBS_QP_CREATE_SCATTER_FCS | + IB_UVERBS_QP_CREATE_CVLAN_STRIPPING | + IB_UVERBS_QP_CREATE_PCI_WRITE_END_PADDING | + IB_UVERBS_QP_CREATE_SQ_SIG_ALL); + if (ret) + return ret; + + ret = check_creation_flags(attr.qp_type, attr.create_flags); + if (ret) + return ret; + + if (uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_SOURCE_QPN)) { + ret = uverbs_copy_from(&attr.source_qpn, attrs, + UVERBS_ATTR_CREATE_QP_SOURCE_QPN); + if (ret) + return ret; + attr.create_flags |= IB_QP_CREATE_SOURCE_QPN; + } + + srq = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_QP_SRQ_HANDLE); + if (!IS_ERR(srq)) { + if ((srq->srq_type == IB_SRQT_XRC && + attr.qp_type != IB_QPT_XRC_TGT) || + (srq->srq_type != IB_SRQT_XRC && + attr.qp_type == IB_QPT_XRC_TGT)) + return -EINVAL; + attr.srq = srq; + } + + obj->uevent.event_file = ib_uverbs_get_async_event(attrs, + UVERBS_ATTR_CREATE_QP_EVENT_FD); + INIT_LIST_HEAD(&obj->uevent.event_list); + INIT_LIST_HEAD(&obj->mcast_list); + obj->uevent.uobject.user_handle = user_handle; + attr.event_handler = ib_uverbs_qp_event_handler; + attr.send_cq = send_cq; + attr.recv_cq = recv_cq; + attr.xrcd = xrcd; + if (attr.create_flags & IB_UVERBS_QP_CREATE_SQ_SIG_ALL) { + /* This creation bit is uverbs one, need to mask before + * calling drivers. It was added to prevent an extra user attr + * only for that when using ioctl. 
+ */ + attr.create_flags &= ~IB_UVERBS_QP_CREATE_SQ_SIG_ALL; + attr.sq_sig_type = IB_SIGNAL_ALL_WR; + } else { + attr.sq_sig_type = IB_SIGNAL_REQ_WR; + } + + set_caps(&attr, &cap, true); + mutex_init(&obj->mcast_lock); + + qp = ib_create_qp_user(device, pd, &attr, &attrs->driver_udata, obj, + KBUILD_MODNAME); + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); + goto err_put; + } + ib_qp_usecnt_inc(qp); + + if (attr.qp_type == IB_QPT_XRC_TGT) { + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, + uobject); + atomic_inc(&obj->uxrcd->refcnt); + } + + obj->uevent.uobject.object = qp; + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_CREATE_QP_HANDLE); + + set_caps(&attr, &cap, false); + ret = uverbs_copy_to_struct_or_zero(attrs, + UVERBS_ATTR_CREATE_QP_RESP_CAP, &cap, + sizeof(cap)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_QP_RESP_QP_NUM, + &qp->qp_num, + sizeof(qp->qp_num)); + + return ret; +err_put: + if (obj->uevent.event_file) + uverbs_uobject_put(&obj->uevent.event_file->uobj); + return ret; +}; + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QP_CREATE, + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_HANDLE, + UVERBS_OBJECT_QP, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_XRCD_HANDLE, + UVERBS_OBJECT_XRCD, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_SRQ_HANDLE, + UVERBS_OBJECT_SRQ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE, + UVERBS_OBJECT_CQ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE, + UVERBS_OBJECT_CQ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_IND_TABLE_HANDLE, + UVERBS_OBJECT_RWQ_IND_TBL, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_QP_USER_HANDLE, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_QP_CAP, + UVERBS_ATTR_STRUCT(struct ib_uverbs_qp_cap, + max_inline_data), + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_CREATE_QP_TYPE, + enum ib_uverbs_qp_type, + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_CREATE_QP_FLAGS, + enum ib_uverbs_qp_create_flags, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_QP_SOURCE_QPN, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL), + UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_QP_EVENT_FD, + UVERBS_OBJECT_ASYNC_EVENT, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_QP_RESP_CAP, + UVERBS_ATTR_STRUCT(struct ib_uverbs_qp_cap, + max_inline_data), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_QP_RESP_QP_NUM, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_UHW()); + +static int UVERBS_HANDLER(UVERBS_METHOD_QP_DESTROY)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_DESTROY_QP_HANDLE); + struct ib_uqp_object *obj = + container_of(uobj, struct ib_uqp_object, uevent.uobject); + struct ib_uverbs_destroy_qp_resp resp = { + .events_reported = obj->uevent.events_reported + }; + + return uverbs_copy_to(attrs, UVERBS_ATTR_DESTROY_QP_RESP, &resp, + sizeof(resp)); +} + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QP_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_QP_HANDLE, + UVERBS_OBJECT_QP, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_QP_RESP, + UVERBS_ATTR_TYPE(struct ib_uverbs_destroy_qp_resp), + UA_MANDATORY)); 
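The IB_UVERBS_QP_CREATE_SQ_SIG_ALL handling in UVERBS_METHOD_QP_CREATE above boils down to consuming a uverbs-only creation bit before the flags reach the driver: if the bit is set, it is masked out and translated into the "signal all work requests" mode, otherwise per-WR signaling is used. Below is a minimal, standalone C sketch of that mask-then-map pattern; the flag values, enum names and helper are illustrative stand-ins (not the kernel definitions), and only the logic itself mirrors the patch.

#include <stdio.h>

/* Illustrative bit values standing in for the uverbs QP create flags. */
#define QP_CREATE_SQ_SIG_ALL	(1u << 4)
#define QP_CREATE_SCATTER_FCS	(1u << 1)

enum sig_type { SIGNAL_ALL_WR, SIGNAL_REQ_WR };

/*
 * Consume the uverbs-only SQ_SIG_ALL bit: strip it from the create flags
 * so a driver never sees it, and return the signaling mode it implies.
 */
static enum sig_type consume_sig_all(unsigned int *create_flags)
{
	if (*create_flags & QP_CREATE_SQ_SIG_ALL) {
		*create_flags &= ~QP_CREATE_SQ_SIG_ALL;
		return SIGNAL_ALL_WR;
	}
	return SIGNAL_REQ_WR;
}

int main(void)
{
	unsigned int flags = QP_CREATE_SQ_SIG_ALL | QP_CREATE_SCATTER_FCS;
	enum sig_type sig = consume_sig_all(&flags);

	/* Expected output: sig_type=ALL_WR remaining_flags=0x2 */
	printf("sig_type=%s remaining_flags=0x%x\n",
	       sig == SIGNAL_ALL_WR ? "ALL_WR" : "REQ_WR", flags);
	return 0;
}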
+ +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_QP, + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), uverbs_free_qp), + &UVERBS_METHOD(UVERBS_METHOD_QP_CREATE), + &UVERBS_METHOD(UVERBS_METHOD_QP_DESTROY)); + +const struct uapi_definition uverbs_def_obj_qp[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_QP, + UAPI_DEF_OBJ_NEEDS_FN(destroy_qp)), + {} +}; diff --git a/drivers/infiniband/core/uverbs_std_types_srq.c b/drivers/infiniband/core/uverbs_std_types_srq.c new file mode 100644 index 000000000000..e5513f828bdc --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types_srq.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. + */ + +#include <rdma/uverbs_std_types.h> +#include "rdma_core.h" +#include "uverbs.h" + +static int uverbs_free_srq(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct ib_srq *srq = uobject->object; + struct ib_uevent_object *uevent = + container_of(uobject, struct ib_uevent_object, uobject); + enum ib_srq_type srq_type = srq->srq_type; + int ret; + + ret = ib_destroy_srq_user(srq, &attrs->driver_udata); + if (ret) + return ret; + + if (srq_type == IB_SRQT_XRC) { + struct ib_usrq_object *us = + container_of(uobject, struct ib_usrq_object, + uevent.uobject); + + atomic_dec(&us->uxrcd->refcnt); + } + + ib_uverbs_release_uevent(uevent); + return 0; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_SRQ_CREATE)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_usrq_object *obj = container_of( + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_SRQ_HANDLE), + typeof(*obj), uevent.uobject); + struct ib_pd *pd = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_CREATE_SRQ_PD_HANDLE); + struct ib_srq_init_attr attr = {}; + struct ib_uobject *xrcd_uobj; + struct ib_srq *srq; + u64 user_handle; + int ret; + + ret = uverbs_copy_from(&attr.attr.max_sge, attrs, + UVERBS_ATTR_CREATE_SRQ_MAX_SGE); + if (!ret) + ret = uverbs_copy_from(&attr.attr.max_wr, attrs, + UVERBS_ATTR_CREATE_SRQ_MAX_WR); + if (!ret) + ret = uverbs_copy_from(&attr.attr.srq_limit, attrs, + UVERBS_ATTR_CREATE_SRQ_LIMIT); + if (!ret) + ret = uverbs_copy_from(&user_handle, attrs, + UVERBS_ATTR_CREATE_SRQ_USER_HANDLE); + if (!ret) + ret = uverbs_get_const(&attr.srq_type, attrs, + UVERBS_ATTR_CREATE_SRQ_TYPE); + if (ret) + return ret; + + if (ib_srq_has_cq(attr.srq_type)) { + attr.ext.cq = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_SRQ_CQ_HANDLE); + if (IS_ERR(attr.ext.cq)) + return PTR_ERR(attr.ext.cq); + } + + switch (attr.srq_type) { + case IB_UVERBS_SRQT_XRC: + xrcd_uobj = uverbs_attr_get_uobject(attrs, + UVERBS_ATTR_CREATE_SRQ_XRCD_HANDLE); + if (IS_ERR(xrcd_uobj)) + return PTR_ERR(xrcd_uobj); + + attr.ext.xrc.xrcd = (struct ib_xrcd *)xrcd_uobj->object; + if (!attr.ext.xrc.xrcd) + return -EINVAL; + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, + uobject); + atomic_inc(&obj->uxrcd->refcnt); + break; + case IB_UVERBS_SRQT_TM: + ret = uverbs_copy_from(&attr.ext.tag_matching.max_num_tags, + attrs, + UVERBS_ATTR_CREATE_SRQ_MAX_NUM_TAGS); + if (ret) + return ret; + break; + case IB_UVERBS_SRQT_BASIC: + break; + default: + return -EINVAL; + } + + obj->uevent.event_file = ib_uverbs_get_async_event(attrs, + UVERBS_ATTR_CREATE_SRQ_EVENT_FD); + INIT_LIST_HEAD(&obj->uevent.event_list); + attr.event_handler = ib_uverbs_srq_event_handler; + obj->uevent.uobject.user_handle = user_handle; + + srq = ib_create_srq_user(pd, &attr, obj, &attrs->driver_udata); + 
if (IS_ERR(srq)) { + ret = PTR_ERR(srq); + goto err; + } + + obj->uevent.uobject.object = srq; + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_CREATE_SRQ_HANDLE); + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_SRQ_RESP_MAX_WR, + &attr.attr.max_wr, + sizeof(attr.attr.max_wr)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_SRQ_RESP_MAX_SGE, + &attr.attr.max_sge, + sizeof(attr.attr.max_sge)); + if (ret) + return ret; + + if (attr.srq_type == IB_SRQT_XRC) { + ret = uverbs_copy_to(attrs, + UVERBS_ATTR_CREATE_SRQ_RESP_SRQ_NUM, + &srq->ext.xrc.srq_num, + sizeof(srq->ext.xrc.srq_num)); + if (ret) + return ret; + } + + return 0; +err: + if (obj->uevent.event_file) + uverbs_uobject_put(&obj->uevent.event_file->uobj); + if (attr.srq_type == IB_SRQT_XRC) + atomic_dec(&obj->uxrcd->refcnt); + return ret; +}; + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_SRQ_CREATE, + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_SRQ_HANDLE, + UVERBS_OBJECT_SRQ, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_SRQ_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_CREATE_SRQ_TYPE, + enum ib_uverbs_srq_type, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_USER_HANDLE, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_MAX_WR, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_MAX_SGE, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_LIMIT, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_SRQ_XRCD_HANDLE, + UVERBS_OBJECT_XRCD, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_SRQ_CQ_HANDLE, + UVERBS_OBJECT_CQ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_MAX_NUM_TAGS, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL), + UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_SRQ_EVENT_FD, + UVERBS_OBJECT_ASYNC_EVENT, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_SRQ_RESP_MAX_WR, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_SRQ_RESP_MAX_SGE, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_SRQ_RESP_SRQ_NUM, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL), + UVERBS_ATTR_UHW()); + +static int UVERBS_HANDLER(UVERBS_METHOD_SRQ_DESTROY)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_DESTROY_SRQ_HANDLE); + struct ib_usrq_object *obj = + container_of(uobj, struct ib_usrq_object, uevent.uobject); + struct ib_uverbs_destroy_srq_resp resp = { + .events_reported = obj->uevent.events_reported + }; + + return uverbs_copy_to(attrs, UVERBS_ATTR_DESTROY_SRQ_RESP, &resp, + sizeof(resp)); +} + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_SRQ_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_SRQ_HANDLE, + UVERBS_OBJECT_SRQ, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_SRQ_RESP, + UVERBS_ATTR_TYPE(struct ib_uverbs_destroy_srq_resp), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_SRQ, + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object), + uverbs_free_srq), + &UVERBS_METHOD(UVERBS_METHOD_SRQ_CREATE), + &UVERBS_METHOD(UVERBS_METHOD_SRQ_DESTROY) +); + +const struct uapi_definition uverbs_def_obj_srq[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_SRQ, + UAPI_DEF_OBJ_NEEDS_FN(destroy_srq)), + {} +}; diff --git 
a/drivers/infiniband/core/uverbs_std_types_wq.c b/drivers/infiniband/core/uverbs_std_types_wq.c new file mode 100644 index 000000000000..7ded8339346f --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types_wq.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. + */ + +#include <rdma/uverbs_std_types.h> +#include "rdma_core.h" +#include "uverbs.h" + +static int uverbs_free_wq(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct ib_wq *wq = uobject->object; + struct ib_uwq_object *uwq = + container_of(uobject, struct ib_uwq_object, uevent.uobject); + int ret; + + ret = ib_destroy_wq_user(wq, &attrs->driver_udata); + if (ret) + return ret; + + ib_uverbs_release_uevent(&uwq->uevent); + return 0; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_WQ_CREATE)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uwq_object *obj = container_of( + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_WQ_HANDLE), + typeof(*obj), uevent.uobject); + struct ib_pd *pd = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_CREATE_WQ_PD_HANDLE); + struct ib_cq *cq = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_CREATE_WQ_CQ_HANDLE); + struct ib_wq_init_attr wq_init_attr = {}; + struct ib_wq *wq; + u64 user_handle; + int ret; + + ret = uverbs_get_flags32(&wq_init_attr.create_flags, attrs, + UVERBS_ATTR_CREATE_WQ_FLAGS, + IB_UVERBS_WQ_FLAGS_CVLAN_STRIPPING | + IB_UVERBS_WQ_FLAGS_SCATTER_FCS | + IB_UVERBS_WQ_FLAGS_DELAY_DROP | + IB_UVERBS_WQ_FLAGS_PCI_WRITE_END_PADDING); + if (!ret) + ret = uverbs_copy_from(&wq_init_attr.max_sge, attrs, + UVERBS_ATTR_CREATE_WQ_MAX_SGE); + if (!ret) + ret = uverbs_copy_from(&wq_init_attr.max_wr, attrs, + UVERBS_ATTR_CREATE_WQ_MAX_WR); + if (!ret) + ret = uverbs_copy_from(&user_handle, attrs, + UVERBS_ATTR_CREATE_WQ_USER_HANDLE); + if (!ret) + ret = uverbs_get_const(&wq_init_attr.wq_type, attrs, + UVERBS_ATTR_CREATE_WQ_TYPE); + if (ret) + return ret; + + if (wq_init_attr.wq_type != IB_WQT_RQ) + return -EINVAL; + + obj->uevent.event_file = ib_uverbs_get_async_event(attrs, + UVERBS_ATTR_CREATE_WQ_EVENT_FD); + obj->uevent.uobject.user_handle = user_handle; + INIT_LIST_HEAD(&obj->uevent.event_list); + wq_init_attr.event_handler = ib_uverbs_wq_event_handler; + wq_init_attr.wq_context = attrs->ufile; + wq_init_attr.cq = cq; + + wq = pd->device->ops.create_wq(pd, &wq_init_attr, &attrs->driver_udata); + if (IS_ERR(wq)) { + ret = PTR_ERR(wq); + goto err; + } + + obj->uevent.uobject.object = wq; + wq->wq_type = wq_init_attr.wq_type; + wq->cq = cq; + wq->pd = pd; + wq->device = pd->device; + wq->wq_context = wq_init_attr.wq_context; + atomic_set(&wq->usecnt, 0); + atomic_inc(&pd->usecnt); + atomic_inc(&cq->usecnt); + wq->uobject = obj; + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_CREATE_WQ_HANDLE); + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_WQ_RESP_MAX_WR, + &wq_init_attr.max_wr, + sizeof(wq_init_attr.max_wr)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_WQ_RESP_MAX_SGE, + &wq_init_attr.max_sge, + sizeof(wq_init_attr.max_sge)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_WQ_RESP_WQ_NUM, + &wq->wq_num, + sizeof(wq->wq_num)); + return ret; + +err: + if (obj->uevent.event_file) + uverbs_uobject_put(&obj->uevent.event_file->uobj); + return ret; +}; + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_WQ_CREATE, + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_WQ_HANDLE, + UVERBS_OBJECT_WQ, + 
UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_WQ_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_CREATE_WQ_TYPE, + enum ib_wq_type, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_WQ_USER_HANDLE, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_WQ_MAX_WR, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_WQ_MAX_SGE, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_CREATE_WQ_FLAGS, + enum ib_uverbs_wq_flags, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_WQ_CQ_HANDLE, + UVERBS_OBJECT_CQ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_WQ_EVENT_FD, + UVERBS_OBJECT_ASYNC_EVENT, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_WQ_RESP_MAX_WR, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_WQ_RESP_MAX_SGE, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_WQ_RESP_WQ_NUM, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL), + UVERBS_ATTR_UHW()); + +static int UVERBS_HANDLER(UVERBS_METHOD_WQ_DESTROY)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_DESTROY_WQ_HANDLE); + struct ib_uwq_object *obj = + container_of(uobj, struct ib_uwq_object, uevent.uobject); + + return uverbs_copy_to(attrs, UVERBS_ATTR_DESTROY_WQ_RESP, + &obj->uevent.events_reported, + sizeof(obj->uevent.events_reported)); +} + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_WQ_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_WQ_HANDLE, + UVERBS_OBJECT_WQ, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_WQ_RESP, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + + +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_WQ, + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), uverbs_free_wq), + &UVERBS_METHOD(UVERBS_METHOD_WQ_CREATE), + &UVERBS_METHOD(UVERBS_METHOD_WQ_DESTROY) +); + +const struct uapi_definition uverbs_def_obj_wq[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_WQ, + UAPI_DEF_OBJ_NEEDS_FN(destroy_wq)), + {} +}; diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c index 3f121ac31e0a..a02916a3a79c 100644 --- a/drivers/infiniband/core/uverbs_uapi.c +++ b/drivers/infiniband/core/uverbs_uapi.c @@ -79,10 +79,7 @@ static int uapi_create_write(struct uverbs_api *uapi, method_elm->is_ex = def->write.is_ex; method_elm->handler = def->func_write; - if (def->write.is_ex) - method_elm->disabled = !(ibdev->uverbs_ex_cmd_mask & - BIT_ULL(def->write.command_num)); - else + if (!def->write.is_ex) method_elm->disabled = !(ibdev->uverbs_cmd_mask & BIT_ULL(def->write.command_num)); @@ -450,6 +447,9 @@ static int uapi_finalize(struct uverbs_api *uapi) uapi->num_write_ex = max_write_ex + 1; data = kmalloc_array(uapi->num_write + uapi->num_write_ex, sizeof(*uapi->write_methods), GFP_KERNEL); + if (!data) + return -ENOMEM; + for (i = 0; i != uapi->num_write + uapi->num_write_ex; i++) data[i] = &uapi->notsupp_method; uapi->write_methods = data; @@ -520,7 +520,7 @@ static void uapi_key_okay(u32 key) count++; if (uapi_key_is_attr(key)) count++; - WARN(count != 1, "Bad count %d key=%x", count, key); + WARN(count != 1, "Bad count %u key=%x", count, key); } static void uapi_finalize_disable(struct uverbs_api *uapi) @@ -634,6 +634,9 @@ static const struct uapi_definition uverbs_core_api[] = { 
UAPI_DEF_CHAIN(uverbs_def_obj_flow_action), UAPI_DEF_CHAIN(uverbs_def_obj_intf), UAPI_DEF_CHAIN(uverbs_def_obj_mr), + UAPI_DEF_CHAIN(uverbs_def_obj_qp), + UAPI_DEF_CHAIN(uverbs_def_obj_srq), + UAPI_DEF_CHAIN(uverbs_def_obj_wq), UAPI_DEF_CHAIN(uverbs_def_write_intf), {}, }; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index e62c9dfc7837..26b021f43ba4 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -50,12 +50,11 @@ #include <rdma/ib_cache.h> #include <rdma/ib_addr.h> #include <rdma/rw.h> +#include <rdma/lag.h> #include "core_priv.h" #include <trace/events/rdma_core.h> -#include <trace/events/rdma_core.h> - static int ib_resolve_eth_dmac(struct ib_device *device, struct rdma_ah_attr *ah_attr); @@ -97,10 +96,10 @@ static const char * const wc_statuses[] = { [IB_WC_LOC_EEC_OP_ERR] = "local EE context operation error", [IB_WC_LOC_PROT_ERR] = "local protection error", [IB_WC_WR_FLUSH_ERR] = "WR flushed", - [IB_WC_MW_BIND_ERR] = "memory management operation error", + [IB_WC_MW_BIND_ERR] = "memory bind operation error", [IB_WC_BAD_RESP_ERR] = "bad response error", [IB_WC_LOC_ACCESS_ERR] = "local access error", - [IB_WC_REM_INV_REQ_ERR] = "invalid request error", + [IB_WC_REM_INV_REQ_ERR] = "remote invalid request error", [IB_WC_REM_ACCESS_ERR] = "remote access error", [IB_WC_REM_OP_ERR] = "remote operation error", [IB_WC_RETRY_EXC_ERR] = "transport retry counter exceeded", @@ -228,7 +227,8 @@ rdma_node_get_transport(unsigned int node_type) } EXPORT_SYMBOL(rdma_node_get_transport); -enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num) +enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, + u32 port_num) { enum rdma_transport_type lt; if (device->ops.get_link_layer) @@ -245,7 +245,7 @@ EXPORT_SYMBOL(rdma_port_get_link_layer); /* Protection domains */ /** - * ib_alloc_pd - Allocates an unused protection domain. + * __ib_alloc_pd - Allocates an unused protection domain. * @device: The device on which to allocate the protection domain. * @flags: protection domain flags * @caller: caller's build-time module name @@ -268,22 +268,20 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, return ERR_PTR(-ENOMEM); pd->device = device; - pd->uobject = NULL; - pd->__internal_mr = NULL; - atomic_set(&pd->usecnt, 0); pd->flags = flags; - pd->res.type = RDMA_RESTRACK_PD; - rdma_restrack_set_task(&pd->res, caller); + rdma_restrack_new(&pd->res, RDMA_RESTRACK_PD); + rdma_restrack_set_name(&pd->res, caller); ret = device->ops.alloc_pd(pd, NULL); if (ret) { + rdma_restrack_put(&pd->res); kfree(pd); return ERR_PTR(ret); } - rdma_restrack_kadd(&pd->res); + rdma_restrack_add(&pd->res); - if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) + if (device->attrs.kernel_cap_flags & IBK_LOCAL_DMA_LKEY) pd->local_dma_lkey = device->local_dma_lkey; else mr_access_flags |= IB_ACCESS_LOCAL_WRITE; @@ -310,7 +308,7 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, pd->__internal_mr = mr; - if (!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) + if (!(device->attrs.kernel_cap_flags & IBK_LOCAL_DMA_LKEY)) pd->local_dma_lkey = pd->__internal_mr->lkey; if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) @@ -330,7 +328,7 @@ EXPORT_SYMBOL(__ib_alloc_pd); * exist. The caller is responsible to synchronously destroy them and * guarantee no new allocations will happen. 
*/ -void ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata) +int ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata) { int ret; @@ -340,13 +338,13 @@ void ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata) pd->__internal_mr = NULL; } - /* uverbs manipulates usecnt with proper locking, while the kabi - requires the caller to guarantee we can't race here. */ - WARN_ON(atomic_read(&pd->usecnt)); + ret = pd->device->ops.dealloc_pd(pd, udata); + if (ret) + return ret; rdma_restrack_del(&pd->res); - pd->device->ops.dealloc_pd(pd, udata); kfree(pd); + return ret; } EXPORT_SYMBOL(ib_dealloc_pd_user); @@ -502,15 +500,17 @@ rdma_update_sgid_attr(struct rdma_ah_attr *ah_attr, static struct ib_ah *_rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, u32 flags, - struct ib_udata *udata) + struct ib_udata *udata, + struct net_device *xmit_slave) { + struct rdma_ah_init_attr init_attr = {}; struct ib_device *device = pd->device; struct ib_ah *ah; int ret; might_sleep_if(flags & RDMA_CREATE_AH_SLEEPABLE); - if (!device->ops.create_ah) + if (!udata && !device->ops.create_ah) return ERR_PTR(-EOPNOTSUPP); ah = rdma_zalloc_drv_obj_gfp( @@ -523,8 +523,14 @@ static struct ib_ah *_rdma_create_ah(struct ib_pd *pd, ah->pd = pd; ah->type = ah_attr->type; ah->sgid_attr = rdma_update_sgid_attr(ah_attr, NULL); + init_attr.ah_attr = ah_attr; + init_attr.flags = flags; + init_attr.xmit_slave = xmit_slave; - ret = device->ops.create_ah(ah, ah_attr, flags, udata); + if (udata) + ret = device->ops.create_user_ah(ah, &init_attr, udata); + else + ret = device->ops.create_ah(ah, &init_attr, NULL); if (ret) { kfree(ah); return ERR_PTR(ret); @@ -549,15 +555,22 @@ struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, u32 flags) { const struct ib_gid_attr *old_sgid_attr; + struct net_device *slave; struct ib_ah *ah; int ret; ret = rdma_fill_sgid_attr(pd->device, ah_attr, &old_sgid_attr); if (ret) return ERR_PTR(ret); - - ah = _rdma_create_ah(pd, ah_attr, flags, NULL); - + slave = rdma_lag_get_ah_roce_slave(pd->device, ah_attr, + (flags & RDMA_CREATE_AH_SLEEPABLE) ? 
+ GFP_KERNEL : GFP_ATOMIC); + if (IS_ERR(slave)) { + rdma_unfill_sgid_attr(ah_attr, old_sgid_attr); + return (void *)slave; + } + ah = _rdma_create_ah(pd, ah_attr, flags, NULL, slave); + rdma_lag_put_ah_roce_slave(slave); rdma_unfill_sgid_attr(ah_attr, old_sgid_attr); return ah; } @@ -596,7 +609,8 @@ struct ib_ah *rdma_create_user_ah(struct ib_pd *pd, } } - ah = _rdma_create_ah(pd, ah_attr, RDMA_CREATE_AH_SLEEPABLE, udata); + ah = _rdma_create_ah(pd, ah_attr, RDMA_CREATE_AH_SLEEPABLE, + udata, NULL); out: rdma_unfill_sgid_attr(ah_attr, old_sgid_attr); @@ -638,7 +652,7 @@ int ib_get_rdma_header_version(const union rdma_network_hdr *hdr) EXPORT_SYMBOL(ib_get_rdma_header_version); static enum rdma_network_type ib_get_net_type_by_grh(struct ib_device *device, - u8 port_num, + u32 port_num, const struct ib_grh *grh) { int grh_version; @@ -681,7 +695,7 @@ static bool find_gid_index(const union ib_gid *gid, } static const struct ib_gid_attr * -get_sgid_attr_from_eth(struct ib_device *device, u8 port_num, +get_sgid_attr_from_eth(struct ib_device *device, u32 port_num, u16 vlan_id, const union ib_gid *sgid, enum ib_gid_type gid_type) { @@ -716,7 +730,7 @@ int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr, (struct in6_addr *)dgid); return 0; } else if (net_type == RDMA_NETWORK_IPV6 || - net_type == RDMA_NETWORK_IB) { + net_type == RDMA_NETWORK_IB || RDMA_NETWORK_ROCE_V1) { *dgid = hdr->ibgrh.dgid; *sgid = hdr->ibgrh.sgid; return 0; @@ -768,7 +782,7 @@ static int ib_resolve_unicast_gid_dmac(struct ib_device *device, * On success the caller is responsible to call rdma_destroy_ah_attr on the * attr. */ -int ib_init_ah_attr_from_wc(struct ib_device *device, u8 port_num, +int ib_init_ah_attr_from_wc(struct ib_device *device, u32 port_num, const struct ib_wc *wc, const struct ib_grh *grh, struct rdma_ah_attr *ah_attr) { @@ -899,7 +913,7 @@ void rdma_destroy_ah_attr(struct rdma_ah_attr *ah_attr) EXPORT_SYMBOL(rdma_destroy_ah_attr); struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, - const struct ib_grh *grh, u8 port_num) + const struct ib_grh *grh, u32 port_num) { struct rdma_ah_attr ah_attr; struct ib_ah *ah; @@ -952,32 +966,50 @@ int rdma_destroy_ah_user(struct ib_ah *ah, u32 flags, struct ib_udata *udata) { const struct ib_gid_attr *sgid_attr = ah->sgid_attr; struct ib_pd *pd; + int ret; might_sleep_if(flags & RDMA_DESTROY_AH_SLEEPABLE); pd = ah->pd; - ah->device->ops.destroy_ah(ah, flags); + ret = ah->device->ops.destroy_ah(ah, flags); + if (ret) + return ret; + atomic_dec(&pd->usecnt); if (sgid_attr) rdma_put_gid_attr(sgid_attr); kfree(ah); - return 0; + return ret; } EXPORT_SYMBOL(rdma_destroy_ah_user); /* Shared receive queues */ -struct ib_srq *ib_create_srq(struct ib_pd *pd, - struct ib_srq_init_attr *srq_init_attr) +/** + * ib_create_srq_user - Creates a SRQ associated with the specified protection + * domain. + * @pd: The protection domain associated with the SRQ. + * @srq_init_attr: A list of initial attributes required to create the + * SRQ. If SRQ creation succeeds, then the attributes are updated to + * the actual capabilities of the created SRQ. + * @uobject: uobject pointer if this is not a kernel SRQ + * @udata: udata pointer if this is not a kernel SRQ + * + * srq_attr->max_wr and srq_attr->max_sge are read the determine the + * requested size of the SRQ, and set to the actual values allocated + * on return. If ib_create_srq() succeeds, then max_wr and max_sge + * will always be at least as large as the requested values. 
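A minimal sketch, not from this patch, of the kernel-side path into ib_create_srq_user(); it assumes the usual ib_create_srq() inline wrapper (NULL uobject/udata), and example_make_srq() is a made-up name.

#include <rdma/ib_verbs.h>

/* Hypothetical helper: create a basic SRQ; attr.attr is updated in place. */
static struct ib_srq *example_make_srq(struct ib_pd *pd)
{
	struct ib_srq_init_attr attr = {
		.srq_type = IB_SRQT_BASIC,
		.attr = {
			.max_wr  = 128,	/* may be rounded up by the driver */
			.max_sge = 2,
		},
	};

	return ib_create_srq(pd, &attr);
}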
+ */ +struct ib_srq *ib_create_srq_user(struct ib_pd *pd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_usrq_object *uobject, + struct ib_udata *udata) { struct ib_srq *srq; int ret; - if (!pd->device->ops.create_srq) - return ERR_PTR(-EOPNOTSUPP); - srq = rdma_zalloc_drv_obj(pd->device, ib_srq); if (!srq) return ERR_PTR(-ENOMEM); @@ -987,6 +1019,7 @@ struct ib_srq *ib_create_srq(struct ib_pd *pd, srq->event_handler = srq_init_attr->event_handler; srq->srq_context = srq_init_attr->srq_context; srq->srq_type = srq_init_attr->srq_type; + srq->uobject = uobject; if (ib_srq_has_cq(srq->srq_type)) { srq->ext.cq = srq_init_attr->ext.cq; @@ -994,14 +1027,19 @@ struct ib_srq *ib_create_srq(struct ib_pd *pd, } if (srq->srq_type == IB_SRQT_XRC) { srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd; - atomic_inc(&srq->ext.xrc.xrcd->usecnt); + if (srq->ext.xrc.xrcd) + atomic_inc(&srq->ext.xrc.xrcd->usecnt); } atomic_inc(&pd->usecnt); - ret = pd->device->ops.create_srq(srq, srq_init_attr, NULL); + rdma_restrack_new(&srq->res, RDMA_RESTRACK_SRQ); + rdma_restrack_parent_name(&srq->res, &pd->res); + + ret = pd->device->ops.create_srq(srq, srq_init_attr, udata); if (ret) { - atomic_dec(&srq->pd->usecnt); - if (srq->srq_type == IB_SRQT_XRC) + rdma_restrack_put(&srq->res); + atomic_dec(&pd->usecnt); + if (srq->srq_type == IB_SRQT_XRC && srq->ext.xrc.xrcd) atomic_dec(&srq->ext.xrc.xrcd->usecnt); if (ib_srq_has_cq(srq->srq_type)) atomic_dec(&srq->ext.cq->usecnt); @@ -1009,9 +1047,11 @@ struct ib_srq *ib_create_srq(struct ib_pd *pd, return ERR_PTR(ret); } + rdma_restrack_add(&srq->res); + return srq; } -EXPORT_SYMBOL(ib_create_srq); +EXPORT_SYMBOL(ib_create_srq_user); int ib_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr, @@ -1033,19 +1073,24 @@ EXPORT_SYMBOL(ib_query_srq); int ib_destroy_srq_user(struct ib_srq *srq, struct ib_udata *udata) { + int ret; + if (atomic_read(&srq->usecnt)) return -EBUSY; - srq->device->ops.destroy_srq(srq, udata); + ret = srq->device->ops.destroy_srq(srq, udata); + if (ret) + return ret; atomic_dec(&srq->pd->usecnt); - if (srq->srq_type == IB_SRQT_XRC) + if (srq->srq_type == IB_SRQT_XRC && srq->ext.xrc.xrcd) atomic_dec(&srq->ext.xrc.xrcd->usecnt); if (ib_srq_has_cq(srq->srq_type)) atomic_dec(&srq->ext.cq->usecnt); + rdma_restrack_del(&srq->res); kfree(srq); - return 0; + return ret; } EXPORT_SYMBOL(ib_destroy_srq_user); @@ -1063,13 +1108,6 @@ static void __ib_shared_qp_event_handler(struct ib_event *event, void *context) spin_unlock_irqrestore(&qp->device->qp_open_list_lock, flags); } -static void __ib_insert_xrcd_qp(struct ib_xrcd *xrcd, struct ib_qp *qp) -{ - mutex_lock(&xrcd->tgt_qp_mutex); - list_add(&qp->xrcd_list, &xrcd->tgt_qp_list); - mutex_unlock(&xrcd->tgt_qp_mutex); -} - static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp, void (*event_handler)(struct ib_event *, void *), void *qp_context) @@ -1112,25 +1150,24 @@ struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, if (qp_open_attr->qp_type != IB_QPT_XRC_TGT) return ERR_PTR(-EINVAL); - qp = ERR_PTR(-EINVAL); - mutex_lock(&xrcd->tgt_qp_mutex); - list_for_each_entry(real_qp, &xrcd->tgt_qp_list, xrcd_list) { - if (real_qp->qp_num == qp_open_attr->qp_num) { - qp = __ib_open_qp(real_qp, qp_open_attr->event_handler, - qp_open_attr->qp_context); - break; - } + down_read(&xrcd->tgt_qps_rwsem); + real_qp = xa_load(&xrcd->tgt_qps, qp_open_attr->qp_num); + if (!real_qp) { + up_read(&xrcd->tgt_qps_rwsem); + return ERR_PTR(-EINVAL); } - mutex_unlock(&xrcd->tgt_qp_mutex); + qp = __ib_open_qp(real_qp, 
qp_open_attr->event_handler, + qp_open_attr->qp_context); + up_read(&xrcd->tgt_qps_rwsem); return qp; } EXPORT_SYMBOL(ib_open_qp); static struct ib_qp *create_xrc_qp_user(struct ib_qp *qp, - struct ib_qp_init_attr *qp_init_attr, - struct ib_udata *udata) + struct ib_qp_init_attr *qp_init_attr) { struct ib_qp *real_qp = qp; + int err; qp->event_handler = __ib_shared_qp_event_handler; qp->qp_context = qp; @@ -1146,27 +1183,154 @@ static struct ib_qp *create_xrc_qp_user(struct ib_qp *qp, if (IS_ERR(qp)) return qp; - __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp); + err = xa_err(xa_store(&qp_init_attr->xrcd->tgt_qps, real_qp->qp_num, + real_qp, GFP_KERNEL)); + if (err) { + ib_close_qp(qp); + return ERR_PTR(err); + } return qp; } -struct ib_qp *ib_create_qp_user(struct ib_pd *pd, - struct ib_qp_init_attr *qp_init_attr, - struct ib_udata *udata) +static struct ib_qp *create_qp(struct ib_device *dev, struct ib_pd *pd, + struct ib_qp_init_attr *attr, + struct ib_udata *udata, + struct ib_uqp_object *uobj, const char *caller) { - struct ib_device *device = pd ? pd->device : qp_init_attr->xrcd->device; + struct ib_udata dummy = {}; struct ib_qp *qp; int ret; - if (qp_init_attr->rwq_ind_tbl && - (qp_init_attr->recv_cq || - qp_init_attr->srq || qp_init_attr->cap.max_recv_wr || - qp_init_attr->cap.max_recv_sge)) - return ERR_PTR(-EINVAL); + if (!dev->ops.create_qp) + return ERR_PTR(-EOPNOTSUPP); - if ((qp_init_attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) && - !(device->attrs.device_cap_flags & IB_DEVICE_INTEGRITY_HANDOVER)) - return ERR_PTR(-EINVAL); + qp = rdma_zalloc_drv_obj_numa(dev, ib_qp); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->device = dev; + qp->pd = pd; + qp->uobject = uobj; + qp->real_qp = qp; + + qp->qp_type = attr->qp_type; + qp->rwq_ind_tbl = attr->rwq_ind_tbl; + qp->srq = attr->srq; + qp->event_handler = attr->event_handler; + qp->port = attr->port_num; + qp->qp_context = attr->qp_context; + + spin_lock_init(&qp->mr_lock); + INIT_LIST_HEAD(&qp->rdma_mrs); + INIT_LIST_HEAD(&qp->sig_mrs); + + qp->send_cq = attr->send_cq; + qp->recv_cq = attr->recv_cq; + + rdma_restrack_new(&qp->res, RDMA_RESTRACK_QP); + WARN_ONCE(!udata && !caller, "Missing kernel QP owner"); + rdma_restrack_set_name(&qp->res, udata ? NULL : caller); + ret = dev->ops.create_qp(qp, attr, udata); + if (ret) + goto err_create; + + /* + * TODO: The mlx4 internally overwrites send_cq and recv_cq. + * Unfortunately, it is not an easy task to fix that driver. + */ + qp->send_cq = attr->send_cq; + qp->recv_cq = attr->recv_cq; + + ret = ib_create_qp_security(qp, dev); + if (ret) + goto err_security; + + rdma_restrack_add(&qp->res); + return qp; + +err_security: + qp->device->ops.destroy_qp(qp, udata ? &dummy : NULL); +err_create: + rdma_restrack_put(&qp->res); + kfree(qp); + return ERR_PTR(ret); + +} + +/** + * ib_create_qp_user - Creates a QP associated with the specified protection + * domain. + * @dev: IB device + * @pd: The protection domain associated with the QP. + * @attr: A list of initial attributes required to create the + * QP. If QP creation succeeds, then the attributes are updated to + * the actual capabilities of the created QP. 
+ * @udata: User data + * @uobj: uverbs obect + * @caller: caller's build-time module name + */ +struct ib_qp *ib_create_qp_user(struct ib_device *dev, struct ib_pd *pd, + struct ib_qp_init_attr *attr, + struct ib_udata *udata, + struct ib_uqp_object *uobj, const char *caller) +{ + struct ib_qp *qp, *xrc_qp; + + if (attr->qp_type == IB_QPT_XRC_TGT) + qp = create_qp(dev, pd, attr, NULL, NULL, caller); + else + qp = create_qp(dev, pd, attr, udata, uobj, NULL); + if (attr->qp_type != IB_QPT_XRC_TGT || IS_ERR(qp)) + return qp; + + xrc_qp = create_xrc_qp_user(qp, attr); + if (IS_ERR(xrc_qp)) { + ib_destroy_qp(qp); + return xrc_qp; + } + + xrc_qp->uobject = uobj; + return xrc_qp; +} +EXPORT_SYMBOL(ib_create_qp_user); + +void ib_qp_usecnt_inc(struct ib_qp *qp) +{ + if (qp->pd) + atomic_inc(&qp->pd->usecnt); + if (qp->send_cq) + atomic_inc(&qp->send_cq->usecnt); + if (qp->recv_cq) + atomic_inc(&qp->recv_cq->usecnt); + if (qp->srq) + atomic_inc(&qp->srq->usecnt); + if (qp->rwq_ind_tbl) + atomic_inc(&qp->rwq_ind_tbl->usecnt); +} +EXPORT_SYMBOL(ib_qp_usecnt_inc); + +void ib_qp_usecnt_dec(struct ib_qp *qp) +{ + if (qp->rwq_ind_tbl) + atomic_dec(&qp->rwq_ind_tbl->usecnt); + if (qp->srq) + atomic_dec(&qp->srq->usecnt); + if (qp->recv_cq) + atomic_dec(&qp->recv_cq->usecnt); + if (qp->send_cq) + atomic_dec(&qp->send_cq->usecnt); + if (qp->pd) + atomic_dec(&qp->pd->usecnt); +} +EXPORT_SYMBOL(ib_qp_usecnt_dec); + +struct ib_qp *ib_create_qp_kernel(struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr, + const char *caller) +{ + struct ib_device *device = pd->device; + struct ib_qp *qp; + int ret; /* * If the callers is using the RDMA API calculate the resources @@ -1177,47 +1341,11 @@ struct ib_qp *ib_create_qp_user(struct ib_pd *pd, if (qp_init_attr->cap.max_rdma_ctxs) rdma_rw_init_qp(device, qp_init_attr); - qp = _ib_create_qp(device, pd, qp_init_attr, NULL, NULL); + qp = create_qp(device, pd, qp_init_attr, NULL, NULL, caller); if (IS_ERR(qp)) return qp; - ret = ib_create_qp_security(qp, device); - if (ret) - goto err; - - if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) { - struct ib_qp *xrc_qp = - create_xrc_qp_user(qp, qp_init_attr, udata); - - if (IS_ERR(xrc_qp)) { - ret = PTR_ERR(xrc_qp); - goto err; - } - return xrc_qp; - } - - qp->event_handler = qp_init_attr->event_handler; - qp->qp_context = qp_init_attr->qp_context; - if (qp_init_attr->qp_type == IB_QPT_XRC_INI) { - qp->recv_cq = NULL; - qp->srq = NULL; - } else { - qp->recv_cq = qp_init_attr->recv_cq; - if (qp_init_attr->recv_cq) - atomic_inc(&qp_init_attr->recv_cq->usecnt); - qp->srq = qp_init_attr->srq; - if (qp->srq) - atomic_inc(&qp_init_attr->srq->usecnt); - } - - qp->send_cq = qp_init_attr->send_cq; - qp->xrcd = NULL; - - atomic_inc(&pd->usecnt); - if (qp_init_attr->send_cq) - atomic_inc(&qp_init_attr->send_cq->usecnt); - if (qp_init_attr->rwq_ind_tbl) - atomic_inc(&qp->rwq_ind_tbl->usecnt); + ib_qp_usecnt_inc(qp); if (qp_init_attr->cap.max_rdma_ctxs) { ret = rdma_rw_init_mrs(qp, qp_init_attr); @@ -1243,7 +1371,7 @@ err: return ERR_PTR(ret); } -EXPORT_SYMBOL(ib_create_qp_user); +EXPORT_SYMBOL(ib_create_qp_kernel); static const struct { int valid; @@ -1616,22 +1744,48 @@ static bool is_qp_type_connected(const struct ib_qp *qp) qp->qp_type == IB_QPT_XRC_TGT); } -/** +/* * IB core internal function to perform QP attributes modification. */ static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { - u8 port = attr_mask & IB_QP_PORT ? 
attr->port_num : qp->port; + u32 port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; const struct ib_gid_attr *old_sgid_attr_av; const struct ib_gid_attr *old_sgid_attr_alt_av; int ret; + attr->xmit_slave = NULL; if (attr_mask & IB_QP_AV) { ret = rdma_fill_sgid_attr(qp->device, &attr->ah_attr, &old_sgid_attr_av); if (ret) return ret; + + if (attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE && + is_qp_type_connected(qp)) { + struct net_device *slave; + + /* + * If the user provided the qp_attr then we have to + * resolve it. Kerne users have to provide already + * resolved rdma_ah_attr's. + */ + if (udata) { + ret = ib_resolve_eth_dmac(qp->device, + &attr->ah_attr); + if (ret) + goto out_av; + } + slave = rdma_lag_get_ah_roce_slave(qp->device, + &attr->ah_attr, + GFP_KERNEL); + if (IS_ERR(slave)) { + ret = PTR_ERR(slave); + goto out_av; + } + attr->xmit_slave = slave; + } } if (attr_mask & IB_QP_ALT_PATH) { /* @@ -1653,23 +1807,11 @@ static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, if (!(rdma_protocol_ib(qp->device, attr->alt_ah_attr.port_num) && rdma_protocol_ib(qp->device, port))) { - ret = EINVAL; + ret = -EINVAL; goto out; } } - /* - * If the user provided the qp_attr then we have to resolve it. Kernel - * users have to provide already resolved rdma_ah_attr's - */ - if (udata && (attr_mask & IB_QP_AV) && - attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE && - is_qp_type_connected(qp)) { - ret = ib_resolve_eth_dmac(qp->device, &attr->ah_attr); - if (ret) - goto out; - } - if (rdma_ib_or_roce(qp->device, port)) { if (attr_mask & IB_QP_RQ_PSN && attr->rq_psn & ~0xffffff) { dev_warn(&qp->device->dev, @@ -1711,8 +1853,10 @@ out: if (attr_mask & IB_QP_ALT_PATH) rdma_unfill_sgid_attr(&attr->alt_ah_attr, old_sgid_attr_alt_av); out_av: - if (attr_mask & IB_QP_AV) + if (attr_mask & IB_QP_AV) { + rdma_lag_put_ah_roce_slave(attr->xmit_slave); rdma_unfill_sgid_attr(&attr->ah_attr, old_sgid_attr_av); + } return ret; } @@ -1734,7 +1878,7 @@ int ib_modify_qp_with_udata(struct ib_qp *ib_qp, struct ib_qp_attr *attr, } EXPORT_SYMBOL(ib_modify_qp_with_udata); -int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u8 *speed, u8 *width) +int ib_get_eth_speed(struct ib_device *dev, u32 port_num, u16 *speed, u8 *width) { int rc; u32 netdev_speed; @@ -1754,11 +1898,11 @@ int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u8 *speed, u8 *width) dev_put(netdev); - if (!rc) { + if (!rc && lksettings.base.speed != (u32)SPEED_UNKNOWN) { netdev_speed = lksettings.base.speed; } else { netdev_speed = SPEED_1000; - pr_warn("%s speed is unknown, defaulting to %d\n", netdev->name, + pr_warn("%s speed is unknown, defaulting to %u\n", netdev->name, netdev_speed); } @@ -1838,21 +1982,18 @@ static int __ib_destroy_shared_qp(struct ib_qp *qp) real_qp = qp->real_qp; xrcd = real_qp->xrcd; - - mutex_lock(&xrcd->tgt_qp_mutex); + down_write(&xrcd->tgt_qps_rwsem); ib_close_qp(qp); if (atomic_read(&real_qp->usecnt) == 0) - list_del(&real_qp->xrcd_list); + xa_erase(&xrcd->tgt_qps, real_qp->qp_num); else real_qp = NULL; - mutex_unlock(&xrcd->tgt_qp_mutex); + up_write(&xrcd->tgt_qps_rwsem); if (real_qp) { ret = ib_destroy_qp(real_qp); if (!ret) atomic_dec(&xrcd->usecnt); - else - __ib_insert_xrcd_qp(xrcd, real_qp); } return 0; @@ -1862,10 +2003,6 @@ int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata) { const struct ib_gid_attr *alt_path_sgid_attr = qp->alt_path_sgid_attr; const struct ib_gid_attr *av_sgid_attr = qp->av_sgid_attr; - struct ib_pd *pd; - struct ib_cq *scq, *rcq; - struct ib_srq *srq; 
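For orientation, a minimal sketch, not part of this patch, of the kernel QP path that now lands in ib_create_qp_kernel(); it assumes the usual ib_create_qp() wrapper passes the module name as @caller, and example_make_rc_qp() is a made-up helper.

#include <rdma/ib_verbs.h>

/*
 * Hypothetical helper: create an RC QP; the reference counts on PD and
 * CQs are taken by ib_qp_usecnt_inc() inside ib_create_qp_kernel().
 */
static struct ib_qp *example_make_rc_qp(struct ib_pd *pd, struct ib_cq *cq)
{
	struct ib_qp_init_attr init_attr = {
		.send_cq     = cq,
		.recv_cq     = cq,
		.cap = {
			.max_send_wr  = 64,
			.max_recv_wr  = 64,
			.max_send_sge = 1,
			.max_recv_sge = 1,
		},
		.sq_sig_type = IB_SIGNAL_REQ_WR,
		.qp_type     = IB_QPT_RC,
	};

	return ib_create_qp(pd, &init_attr);
}

State transitions then go through ib_modify_qp(), which on RoCE resolves the LAG xmit_slave as shown above.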
- struct ib_rwq_ind_table *ind_tbl; struct ib_qp_security *sec; int ret; @@ -1877,11 +2014,6 @@ int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata) if (qp->real_qp != qp) return __ib_destroy_shared_qp(qp); - pd = qp->pd; - scq = qp->send_cq; - rcq = qp->recv_cq; - srq = qp->srq; - ind_tbl = qp->rwq_ind_tbl; sec = qp->qp_sec; if (sec) ib_destroy_qp_security_begin(sec); @@ -1890,30 +2022,24 @@ int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata) rdma_rw_cleanup_mrs(qp); rdma_counter_unbind_qp(qp, true); - rdma_restrack_del(&qp->res); ret = qp->device->ops.destroy_qp(qp, udata); - if (!ret) { - if (alt_path_sgid_attr) - rdma_put_gid_attr(alt_path_sgid_attr); - if (av_sgid_attr) - rdma_put_gid_attr(av_sgid_attr); - if (pd) - atomic_dec(&pd->usecnt); - if (scq) - atomic_dec(&scq->usecnt); - if (rcq) - atomic_dec(&rcq->usecnt); - if (srq) - atomic_dec(&srq->usecnt); - if (ind_tbl) - atomic_dec(&ind_tbl->usecnt); - if (sec) - ib_destroy_qp_security_end(sec); - } else { + if (ret) { if (sec) ib_destroy_qp_security_abort(sec); + return ret; } + if (alt_path_sgid_attr) + rdma_put_gid_attr(alt_path_sgid_attr); + if (av_sgid_attr) + rdma_put_gid_attr(av_sgid_attr); + + ib_qp_usecnt_dec(qp); + if (sec) + ib_destroy_qp_security_end(sec); + + rdma_restrack_del(&qp->res); + kfree(qp); return ret; } EXPORT_SYMBOL(ib_destroy_qp_user); @@ -1940,22 +2066,27 @@ struct ib_cq *__ib_create_cq(struct ib_device *device, cq->event_handler = event_handler; cq->cq_context = cq_context; atomic_set(&cq->usecnt, 0); - cq->res.type = RDMA_RESTRACK_CQ; - rdma_restrack_set_task(&cq->res, caller); + + rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); + rdma_restrack_set_name(&cq->res, caller); ret = device->ops.create_cq(cq, cq_attr, NULL); if (ret) { + rdma_restrack_put(&cq->res); kfree(cq); return ERR_PTR(ret); } - rdma_restrack_kadd(&cq->res); + rdma_restrack_add(&cq->res); return cq; } EXPORT_SYMBOL(__ib_create_cq); int rdma_set_cq_moderation(struct ib_cq *cq, u16 cq_count, u16 cq_period) { + if (cq->shared) + return -EOPNOTSUPP; + return cq->device->ops.modify_cq ? cq->device->ops.modify_cq(cq, cq_count, cq_period) : -EOPNOTSUPP; @@ -1964,18 +2095,29 @@ EXPORT_SYMBOL(rdma_set_cq_moderation); int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata) { + int ret; + + if (WARN_ON_ONCE(cq->shared)) + return -EOPNOTSUPP; + if (atomic_read(&cq->usecnt)) return -EBUSY; + ret = cq->device->ops.destroy_cq(cq, udata); + if (ret) + return ret; + rdma_restrack_del(&cq->res); - cq->device->ops.destroy_cq(cq, udata); kfree(cq); - return 0; + return ret; } EXPORT_SYMBOL(ib_destroy_cq_user); int ib_resize_cq(struct ib_cq *cq, int cqe) { + if (cq->shared) + return -EOPNOTSUPP; + return cq->device->ops.resize_cq ? 
cq->device->ops.resize_cq(cq, cqe, NULL) : -EOPNOTSUPP; } @@ -1989,8 +2131,8 @@ struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct ib_mr *mr; if (access_flags & IB_ACCESS_ON_DEMAND) { - if (!(pd->device->attrs.device_cap_flags & - IB_DEVICE_ON_DEMAND_PAGING)) { + if (!(pd->device->attrs.kernel_cap_flags & + IBK_ON_DEMAND_PAGING)) { pr_debug("ODP support not available\n"); return ERR_PTR(-EINVAL); } @@ -2003,11 +2145,16 @@ struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return mr; mr->device = pd->device; + mr->type = IB_MR_TYPE_USER; mr->pd = pd; mr->dm = NULL; atomic_inc(&pd->usecnt); - mr->res.type = RDMA_RESTRACK_MR; - rdma_restrack_kadd(&mr->res); + mr->iova = virt_addr; + mr->length = length; + + rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); + rdma_restrack_parent_name(&mr->res, &pd->res); + rdma_restrack_add(&mr->res); return mr; } @@ -2019,6 +2166,9 @@ int ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, if (!pd->device->ops.advise_mr) return -EOPNOTSUPP; + if (!num_sge) + return 0; + return pd->device->ops.advise_mr(pd, advice, flags, sg_list, num_sge, NULL); } @@ -2046,11 +2196,10 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata) EXPORT_SYMBOL(ib_dereg_mr_user); /** - * ib_alloc_mr_user() - Allocates a memory region + * ib_alloc_mr() - Allocates a memory region * @pd: protection domain associated with the region * @mr_type: memory region type * @max_num_sg: maximum sg entries available for registration. - * @udata: user data or null for kernel objects * * Notes: * Memory registeration page/sg lists must not exceed max_num_sg. @@ -2058,8 +2207,8 @@ EXPORT_SYMBOL(ib_dereg_mr_user); * max_num_sg * used_page_size. * */ -struct ib_mr *ib_alloc_mr_user(struct ib_pd *pd, enum ib_mr_type mr_type, - u32 max_num_sg, struct ib_udata *udata) +struct ib_mr *ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, + u32 max_num_sg) { struct ib_mr *mr; @@ -2074,25 +2223,27 @@ struct ib_mr *ib_alloc_mr_user(struct ib_pd *pd, enum ib_mr_type mr_type, goto out; } - mr = pd->device->ops.alloc_mr(pd, mr_type, max_num_sg, udata); - if (!IS_ERR(mr)) { - mr->device = pd->device; - mr->pd = pd; - mr->dm = NULL; - mr->uobject = NULL; - atomic_inc(&pd->usecnt); - mr->need_inval = false; - mr->res.type = RDMA_RESTRACK_MR; - rdma_restrack_kadd(&mr->res); - mr->type = mr_type; - mr->sig_attrs = NULL; - } + mr = pd->device->ops.alloc_mr(pd, mr_type, max_num_sg); + if (IS_ERR(mr)) + goto out; + + mr->device = pd->device; + mr->pd = pd; + mr->dm = NULL; + mr->uobject = NULL; + atomic_inc(&pd->usecnt); + mr->need_inval = false; + mr->type = mr_type; + mr->sig_attrs = NULL; + rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); + rdma_restrack_parent_name(&mr->res, &pd->res); + rdma_restrack_add(&mr->res); out: trace_mr_alloc(pd, mr_type, max_num_sg, mr); return mr; } -EXPORT_SYMBOL(ib_alloc_mr_user); +EXPORT_SYMBOL(ib_alloc_mr); /** * ib_alloc_mr_integrity() - Allocates an integrity memory region @@ -2143,65 +2294,18 @@ struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd, mr->uobject = NULL; atomic_inc(&pd->usecnt); mr->need_inval = false; - mr->res.type = RDMA_RESTRACK_MR; - rdma_restrack_kadd(&mr->res); mr->type = IB_MR_TYPE_INTEGRITY; mr->sig_attrs = sig_attrs; + rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); + rdma_restrack_parent_name(&mr->res, &pd->res); + rdma_restrack_add(&mr->res); out: trace_mr_integ_alloc(pd, max_num_data_sg, max_num_meta_sg, mr); return mr; } EXPORT_SYMBOL(ib_alloc_mr_integrity); -/* "Fast" memory regions 
*/ - -struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd, - int mr_access_flags, - struct ib_fmr_attr *fmr_attr) -{ - struct ib_fmr *fmr; - - if (!pd->device->ops.alloc_fmr) - return ERR_PTR(-EOPNOTSUPP); - - fmr = pd->device->ops.alloc_fmr(pd, mr_access_flags, fmr_attr); - if (!IS_ERR(fmr)) { - fmr->device = pd->device; - fmr->pd = pd; - atomic_inc(&pd->usecnt); - } - - return fmr; -} -EXPORT_SYMBOL(ib_alloc_fmr); - -int ib_unmap_fmr(struct list_head *fmr_list) -{ - struct ib_fmr *fmr; - - if (list_empty(fmr_list)) - return 0; - - fmr = list_entry(fmr_list->next, struct ib_fmr, list); - return fmr->device->ops.unmap_fmr(fmr_list); -} -EXPORT_SYMBOL(ib_unmap_fmr); - -int ib_dealloc_fmr(struct ib_fmr *fmr) -{ - struct ib_pd *pd; - int ret; - - pd = fmr->pd; - ret = fmr->device->ops.dealloc_fmr(fmr); - if (!ret) - atomic_dec(&pd->usecnt); - - return ret; -} -EXPORT_SYMBOL(ib_dealloc_fmr); - /* Multicast groups */ static bool is_valid_mcast_lid(struct ib_qp *qp, u16 lid) @@ -2209,7 +2313,7 @@ static bool is_valid_mcast_lid(struct ib_qp *qp, u16 lid) struct ib_qp_init_attr init_attr = {}; struct ib_qp_attr attr = {}; int num_eth_ports = 0; - int port; + unsigned int port; /* If QP state >= init, it is assigned to a port and we can check this * port only. @@ -2224,7 +2328,7 @@ static bool is_valid_mcast_lid(struct ib_qp *qp, u16 lid) } /* Can't get a quick answer, iterate over all ports */ - for (port = 0; port < qp->device->phys_port_cnt; port++) + rdma_for_each_port(qp->device, port) if (rdma_port_get_link_layer(qp->device, port) != IB_LINK_LAYER_INFINIBAND) num_eth_ports++; @@ -2278,45 +2382,61 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) } EXPORT_SYMBOL(ib_detach_mcast); -struct ib_xrcd *__ib_alloc_xrcd(struct ib_device *device, const char *caller) +/** + * ib_alloc_xrcd_user - Allocates an XRC domain. + * @device: The device on which to allocate the XRC domain. + * @inode: inode to connect XRCD + * @udata: Valid user data or NULL for kernel object + */ +struct ib_xrcd *ib_alloc_xrcd_user(struct ib_device *device, + struct inode *inode, struct ib_udata *udata) { struct ib_xrcd *xrcd; + int ret; if (!device->ops.alloc_xrcd) return ERR_PTR(-EOPNOTSUPP); - xrcd = device->ops.alloc_xrcd(device, NULL); - if (!IS_ERR(xrcd)) { - xrcd->device = device; - xrcd->inode = NULL; - atomic_set(&xrcd->usecnt, 0); - mutex_init(&xrcd->tgt_qp_mutex); - INIT_LIST_HEAD(&xrcd->tgt_qp_list); - } + xrcd = rdma_zalloc_drv_obj(device, ib_xrcd); + if (!xrcd) + return ERR_PTR(-ENOMEM); + + xrcd->device = device; + xrcd->inode = inode; + atomic_set(&xrcd->usecnt, 0); + init_rwsem(&xrcd->tgt_qps_rwsem); + xa_init(&xrcd->tgt_qps); + ret = device->ops.alloc_xrcd(xrcd, udata); + if (ret) + goto err; return xrcd; +err: + kfree(xrcd); + return ERR_PTR(ret); } -EXPORT_SYMBOL(__ib_alloc_xrcd); +EXPORT_SYMBOL(ib_alloc_xrcd_user); -int ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata) +/** + * ib_dealloc_xrcd_user - Deallocates an XRC domain. + * @xrcd: The XRC domain to deallocate. 
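A sketch, not from this patch, of an in-kernel XRCD round trip through the reworked helpers; example_xrcd_roundtrip() is a hypothetical name.

#include <rdma/ib_verbs.h>

/* Hypothetical helper: allocate and release a purely in-kernel XRCD. */
static int example_xrcd_roundtrip(struct ib_device *dev)
{
	struct ib_xrcd *xrcd;

	xrcd = ib_alloc_xrcd_user(dev, NULL, NULL);
	if (IS_ERR(xrcd))
		return PTR_ERR(xrcd);

	/*
	 * Target QPs opened against this XRCD are tracked in xrcd->tgt_qps;
	 * they must all be closed first, otherwise the usecnt check returns
	 * -EBUSY and the xa_empty() WARN fires.
	 */
	return ib_dealloc_xrcd_user(xrcd, NULL);
}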
+ * @udata: Valid user data or NULL for kernel object + */ +int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata) { - struct ib_qp *qp; int ret; if (atomic_read(&xrcd->usecnt)) return -EBUSY; - while (!list_empty(&xrcd->tgt_qp_list)) { - qp = list_entry(xrcd->tgt_qp_list.next, struct ib_qp, xrcd_list); - ret = ib_destroy_qp(qp); - if (ret) - return ret; - } - mutex_destroy(&xrcd->tgt_qp_mutex); - - return xrcd->device->ops.dealloc_xrcd(xrcd, udata); + WARN_ON(!xa_empty(&xrcd->tgt_qps)); + ret = xrcd->device->ops.dealloc_xrcd(xrcd, udata); + if (ret) + return ret; + kfree(xrcd); + return ret; } -EXPORT_SYMBOL(ib_dealloc_xrcd); +EXPORT_SYMBOL(ib_dealloc_xrcd_user); /** * ib_create_wq - Creates a WQ associated with the specified protection @@ -2358,108 +2478,28 @@ struct ib_wq *ib_create_wq(struct ib_pd *pd, EXPORT_SYMBOL(ib_create_wq); /** - * ib_destroy_wq - Destroys the specified user WQ. + * ib_destroy_wq_user - Destroys the specified user WQ. * @wq: The WQ to destroy. * @udata: Valid user data */ -int ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata) +int ib_destroy_wq_user(struct ib_wq *wq, struct ib_udata *udata) { struct ib_cq *cq = wq->cq; struct ib_pd *pd = wq->pd; + int ret; if (atomic_read(&wq->usecnt)) return -EBUSY; - wq->device->ops.destroy_wq(wq, udata); + ret = wq->device->ops.destroy_wq(wq, udata); + if (ret) + return ret; + atomic_dec(&pd->usecnt); atomic_dec(&cq->usecnt); - - return 0; -} -EXPORT_SYMBOL(ib_destroy_wq); - -/** - * ib_modify_wq - Modifies the specified WQ. - * @wq: The WQ to modify. - * @wq_attr: On input, specifies the WQ attributes to modify. - * @wq_attr_mask: A bit-mask used to specify which attributes of the WQ - * are being modified. - * On output, the current values of selected WQ attributes are returned. - */ -int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, - u32 wq_attr_mask) -{ - int err; - - if (!wq->device->ops.modify_wq) - return -EOPNOTSUPP; - - err = wq->device->ops.modify_wq(wq, wq_attr, wq_attr_mask, NULL); - return err; -} -EXPORT_SYMBOL(ib_modify_wq); - -/* - * ib_create_rwq_ind_table - Creates a RQ Indirection Table. - * @device: The device on which to create the rwq indirection table. - * @ib_rwq_ind_table_init_attr: A list of initial attributes required to - * create the Indirection Table. - * - * Note: The life time of ib_rwq_ind_table_init_attr->ind_tbl is not less - * than the created ib_rwq_ind_table object and the caller is responsible - * for its memory allocation/free. - */ -struct ib_rwq_ind_table *ib_create_rwq_ind_table(struct ib_device *device, - struct ib_rwq_ind_table_init_attr *init_attr) -{ - struct ib_rwq_ind_table *rwq_ind_table; - int i; - u32 table_size; - - if (!device->ops.create_rwq_ind_table) - return ERR_PTR(-EOPNOTSUPP); - - table_size = (1 << init_attr->log_ind_tbl_size); - rwq_ind_table = device->ops.create_rwq_ind_table(device, - init_attr, NULL); - if (IS_ERR(rwq_ind_table)) - return rwq_ind_table; - - rwq_ind_table->ind_tbl = init_attr->ind_tbl; - rwq_ind_table->log_ind_tbl_size = init_attr->log_ind_tbl_size; - rwq_ind_table->device = device; - rwq_ind_table->uobject = NULL; - atomic_set(&rwq_ind_table->usecnt, 0); - - for (i = 0; i < table_size; i++) - atomic_inc(&rwq_ind_table->ind_tbl[i]->usecnt); - - return rwq_ind_table; -} -EXPORT_SYMBOL(ib_create_rwq_ind_table); - -/* - * ib_destroy_rwq_ind_table - Destroys the specified Indirection Table. - * @wq_ind_table: The Indirection Table to destroy. 
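A minimal sketch, not part of this patch, of kernel-side WQ setup against the surviving ib_create_wq()/ib_destroy_wq_user() pair; example_make_rq_wq() is a made-up name.

#include <rdma/ib_verbs.h>

/* Hypothetical helper: create a receive WQ fed by @cq. */
static struct ib_wq *example_make_rq_wq(struct ib_pd *pd, struct ib_cq *cq)
{
	struct ib_wq_init_attr attr = {
		.wq_type = IB_WQT_RQ,
		.max_wr  = 128,
		.max_sge = 1,
		.cq      = cq,
	};

	return ib_create_wq(pd, &attr);
}

Teardown goes through ib_destroy_wq_user(wq, NULL) in kernel context, which now propagates the driver's destroy_wq() return code instead of ignoring it.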
-*/ -int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *rwq_ind_table) -{ - int err, i; - u32 table_size = (1 << rwq_ind_table->log_ind_tbl_size); - struct ib_wq **ind_tbl = rwq_ind_table->ind_tbl; - - if (atomic_read(&rwq_ind_table->usecnt)) - return -EBUSY; - - err = rwq_ind_table->device->ops.destroy_rwq_ind_table(rwq_ind_table); - if (!err) { - for (i = 0; i < table_size; i++) - atomic_dec(&ind_tbl[i]->usecnt); - } - - return err; + return ret; } -EXPORT_SYMBOL(ib_destroy_rwq_ind_table); +EXPORT_SYMBOL(ib_destroy_wq_user); int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status) @@ -2471,7 +2511,7 @@ int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, } EXPORT_SYMBOL(ib_check_mr_status); -int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port, +int ib_set_vf_link_state(struct ib_device *device, int vf, u32 port, int state) { if (!device->ops.set_vf_link_state) @@ -2481,7 +2521,7 @@ int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port, } EXPORT_SYMBOL(ib_set_vf_link_state); -int ib_get_vf_config(struct ib_device *device, int vf, u8 port, +int ib_get_vf_config(struct ib_device *device, int vf, u32 port, struct ifla_vf_info *info) { if (!device->ops.get_vf_config) @@ -2491,7 +2531,7 @@ int ib_get_vf_config(struct ib_device *device, int vf, u8 port, } EXPORT_SYMBOL(ib_get_vf_config); -int ib_get_vf_stats(struct ib_device *device, int vf, u8 port, +int ib_get_vf_stats(struct ib_device *device, int vf, u32 port, struct ifla_vf_stats *stats) { if (!device->ops.get_vf_stats) @@ -2501,7 +2541,7 @@ int ib_get_vf_stats(struct ib_device *device, int vf, u8 port, } EXPORT_SYMBOL(ib_get_vf_stats); -int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid, +int ib_set_vf_guid(struct ib_device *device, int vf, u32 port, u64 guid, int type) { if (!device->ops.set_vf_guid) @@ -2511,7 +2551,7 @@ int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid, } EXPORT_SYMBOL(ib_set_vf_guid); -int ib_get_vf_guid(struct ib_device *device, int vf, u8 port, +int ib_get_vf_guid(struct ib_device *device, int vf, u32 port, struct ifla_vf_guid *node_guid, struct ifla_vf_guid *port_guid) { @@ -2568,6 +2608,7 @@ EXPORT_SYMBOL(ib_map_mr_sg_pi); * @page_size: page vector desired page size * * Constraints: + * * - The first sg element is allowed to have an offset. * - Each sg element must either be aligned to page_size or virtually * contiguous to the previous element. In case an sg element has a @@ -2601,10 +2642,12 @@ EXPORT_SYMBOL(ib_map_mr_sg); * @mr: memory region * @sgl: dma mapped scatterlist * @sg_nents: number of entries in sg - * @sg_offset_p: IN: start offset in bytes into sg - * OUT: offset in bytes for element n of the sg of the first + * @sg_offset_p: ==== ======================================================= + * IN start offset in bytes into sg + * OUT offset in bytes for element n of the sg of the first * byte that has not been processed where n is the return * value of this function. 
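To illustrate the ib_map_mr_sg() contract, a hedged sketch of the usual fast-registration pattern; example_fast_reg() and its parameters are assumptions, and the DMA-mapped scatterlist, IB_MR_TYPE_MEM_REG MR and connected QP are presumed to already exist.

#include <rdma/ib_verbs.h>

/* Hypothetical helper: map a DMA-mapped sg list into @mr and post a REG WR. */
static int example_fast_reg(struct ib_qp *qp, struct ib_mr *mr,
			    struct scatterlist *sg, int sg_nents)
{
	struct ib_reg_wr reg_wr = {};
	int n;

	n = ib_map_mr_sg(mr, sg, sg_nents, NULL, PAGE_SIZE);
	if (n < sg_nents)
		return n < 0 ? n : -EINVAL;

	/* Bump the key so stale remote references fail (iSER-style idiom). */
	ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey));

	reg_wr.wr.opcode = IB_WR_REG_MR;
	reg_wr.mr        = mr;
	reg_wr.key       = mr->rkey;
	reg_wr.access    = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ;

	return ib_post_send(qp, &reg_wr.wr, NULL);
}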
+ * ==== ======================================================= * @set_page: driver page assignment function pointer * * Core service helper for drivers to convert the largest @@ -2850,7 +2893,7 @@ void ib_drain_qp(struct ib_qp *qp) } EXPORT_SYMBOL(ib_drain_qp); -struct net_device *rdma_alloc_netdev(struct ib_device *device, u8 port_num, +struct net_device *rdma_alloc_netdev(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *)) @@ -2876,7 +2919,7 @@ struct net_device *rdma_alloc_netdev(struct ib_device *device, u8 port_num, } EXPORT_SYMBOL(rdma_alloc_netdev); -int rdma_init_netdev(struct ib_device *device, u8 port_num, +int rdma_init_netdev(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *), @@ -2931,3 +2974,52 @@ bool __rdma_block_iter_next(struct ib_block_iter *biter) return true; } EXPORT_SYMBOL(__rdma_block_iter_next); + +/** + * rdma_alloc_hw_stats_struct - Helper function to allocate dynamic struct + * for the drivers. + * @descs: array of static descriptors + * @num_counters: number of elements in array + * @lifespan: milliseconds between updates + */ +struct rdma_hw_stats *rdma_alloc_hw_stats_struct( + const struct rdma_stat_desc *descs, int num_counters, + unsigned long lifespan) +{ + struct rdma_hw_stats *stats; + + stats = kzalloc(struct_size(stats, value, num_counters), GFP_KERNEL); + if (!stats) + return NULL; + + stats->is_disabled = kcalloc(BITS_TO_LONGS(num_counters), + sizeof(*stats->is_disabled), GFP_KERNEL); + if (!stats->is_disabled) + goto err; + + stats->descs = descs; + stats->num_counters = num_counters; + stats->lifespan = msecs_to_jiffies(lifespan); + mutex_init(&stats->lock); + + return stats; + +err: + kfree(stats); + return NULL; +} +EXPORT_SYMBOL(rdma_alloc_hw_stats_struct); + +/** + * rdma_free_hw_stats_struct - Helper function to release rdma_hw_stats + * @stats: statistics to release + */ +void rdma_free_hw_stats_struct(struct rdma_hw_stats *stats) +{ + if (!stats) + return; + + kfree(stats->is_disabled); + kfree(stats); +} +EXPORT_SYMBOL(rdma_free_hw_stats_struct); |
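Finally, a sketch, not taken from this patch, of how a provider driver would pair the new hw-stats helpers; the descriptor names, example_alloc_port_stats() and the 1000 ms lifespan are all invented for illustration.

#include <rdma/ib_verbs.h>

static const struct rdma_stat_desc example_descs[] = {
	{ .name = "example_rx_packets" },
	{ .name = "example_tx_packets" },
};

/* Hypothetical port-stats allocation callback for a driver. */
static struct rdma_hw_stats *example_alloc_port_stats(struct ib_device *ibdev,
						      u32 port_num)
{
	return rdma_alloc_hw_stats_struct(example_descs,
					  ARRAY_SIZE(example_descs),
					  1000 /* ms between refreshes */);
}

The matching rdma_free_hw_stats_struct() call belongs in the driver's teardown path, mirroring the kzalloc/kcalloc pair above.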